napistu 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
napistu/sbml_dfs_core.py CHANGED
@@ -7,8 +7,12 @@ from typing import Iterable
7
7
  from typing import Mapping
8
8
  from typing import MutableMapping
9
9
  from typing import TYPE_CHECKING
10
+ from typing import Optional
11
+ from typing import Union
10
12
 
13
+ from fs import open_fs
11
14
  import pandas as pd
15
+
12
16
  from napistu import identifiers
13
17
  from napistu import sbml_dfs_utils
14
18
  from napistu import source
@@ -17,25 +21,14 @@ from napistu.ingestion import sbml
17
21
  from napistu.constants import SBML_DFS
18
22
  from napistu.constants import SBML_DFS_SCHEMA
19
23
  from napistu.constants import IDENTIFIERS
20
- from napistu.constants import REQUIRED_REACTION_FROMEDGELIST_COLUMNS
21
- from napistu.constants import CPR_STANDARD_OUTPUTS
22
- from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
24
+ from napistu.constants import NAPISTU_STANDARD_OUTPUTS
23
25
  from napistu.constants import BQB_PRIORITIES
24
26
  from napistu.constants import ONTOLOGY_PRIORITIES
25
- from napistu.constants import BQB
26
- from napistu.constants import BQB_DEFINING_ATTRS
27
27
  from napistu.constants import MINI_SBO_FROM_NAME
28
28
  from napistu.constants import MINI_SBO_TO_NAME
29
- from napistu.constants import ONTOLOGIES
30
- from napistu.constants import SBO_NAME_TO_ROLE
31
29
  from napistu.constants import SBOTERM_NAMES
32
- from napistu.constants import SBO_ROLES_DEFS
33
30
  from napistu.constants import ENTITIES_W_DATA
34
31
  from napistu.constants import ENTITIES_TO_ENTITY_DATA
35
- from napistu.ingestion.constants import GENERIC_COMPARTMENT
36
- from napistu.ingestion.constants import COMPARTMENT_ALIASES
37
- from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
38
- from fs import open_fs
39
32
 
40
33
  logger = logging.getLogger(__name__)
41
34
 
@@ -65,26 +58,76 @@ class SBML_dfs:
65
58
  schema : dict
66
59
  Dictionary representing the structure of the other attributes and meaning of their variables
67
60
 
68
- Methods
69
- -------
70
- get_table(entity_type, required_attributes)
71
- Get a table from the SBML_dfs object with optional attribute validation
72
- search_by_ids(ids, entity_type, identifiers_df, ontologies)
73
- Find entities and identifiers matching a set of query IDs
74
- search_by_name(name, entity_type, partial_match)
75
- Find entities by exact or partial name match
61
+ Public Methods (alphabetical)
62
+ ----------------------------
63
+ add_reactions_data(label, data)
64
+ Add a new reactions data table to the model with validation.
65
+ add_species_data(label, data)
66
+ Add a new species data table to the model with validation.
67
+ export_sbml_dfs(model_prefix, outdir, overwrite=False, dogmatic=True)
68
+ Export the SBML_dfs model and its tables to files in a specified directory.
69
+ get_characteristic_species_ids(dogmatic=True)
70
+ Return characteristic systematic identifiers for molecular species, optionally using a strict or loose definition.
76
71
  get_cspecies_features()
77
- Get additional attributes of compartmentalized species
78
- get_species_features()
79
- Get additional attributes of species
72
+ Compute and return additional features for compartmentalized species, such as degree and type.
80
73
  get_identifiers(id_type)
81
- Get identifiers from a specified entity type
82
- get_uri_urls(entity_type, entity_ids)
83
- Get reference URLs for specified entities
74
+ Retrieve a table of identifiers for a specified entity type (e.g., species or reactions).
75
+ get_network_summary()
76
+ Return a dictionary of diagnostic statistics summarizing the network structure.
77
+ get_species_features()
78
+ Compute and return additional features for species, such as species type.
79
+ get_table(entity_type, required_attributes=None)
80
+ Retrieve a table for a given entity type, optionally validating required attributes.
81
+ get_uri_urls(entity_type, entity_ids=None, required_ontology=None)
82
+ Return reference URLs for specified entities, optionally filtered by ontology.
83
+ infer_sbo_terms()
84
+ Infer and fill in missing SBO terms for reaction species based on stoichiometry.
85
+ infer_uncompartmentalized_species_location()
86
+ Infer and assign compartments for compartmentalized species with missing compartment information.
87
+ name_compartmentalized_species()
88
+ Rename compartmentalized species to include compartment information if needed.
89
+ reaction_formulas(r_ids=None)
90
+ Generate human-readable reaction formulas for specified reactions.
91
+ reaction_summaries(r_ids=None)
92
+ Return a summary DataFrame for specified reactions, including names and formulas.
93
+ remove_compartmentalized_species(sc_ids)
94
+ Remove specified compartmentalized species and associated reactions from the model.
95
+ remove_reactions(r_ids, remove_species=False)
96
+ Remove specified reactions and optionally remove unused species.
97
+ remove_reactions_data(label)
98
+ Remove a reactions data table by label.
99
+ remove_species_data(label)
100
+ Remove a species data table by label.
101
+ search_by_ids(ids, entity_type, identifiers_df, ontologies=None)
102
+ Find entities and identifiers matching a set of query IDs.
103
+ search_by_name(name, entity_type, partial_match=True)
104
+ Find entities by exact or partial name match.
105
+ select_species_data(species_data_table)
106
+ Select a species data table from the SBML_dfs object by name.
107
+ species_status(s_id)
108
+ Return all reactions a species participates in, with stoichiometry and formula information.
84
109
  validate()
85
- Validate the SBML_dfs structure and relationships
110
+ Validate the SBML_dfs structure and relationships.
86
111
  validate_and_resolve()
87
- Validate and attempt to automatically fix common issues
112
+ Validate and attempt to automatically fix common issues.
113
+
114
+ Private/Hidden Methods (alphabetical, appear after public methods)
115
+ -----------------------------------------------------------------
116
+ _attempt_resolve(e)
117
+ _check_pk_fk_correspondence()
118
+ _find_underspecified_reactions_by_scids(sc_ids)
119
+ _get_unused_cspecies()
120
+ _get_unused_species()
121
+ _remove_compartmentalized_species(sc_ids)
122
+ _remove_entity_data(entity_type, label)
123
+ _remove_species(s_ids)
124
+ _remove_unused_cspecies()
125
+ _remove_unused_species()
126
+ _validate_r_ids(r_ids)
127
+ _validate_reaction_species()
128
+ _validate_reactions_data(reactions_data_table)
129
+ _validate_species_data(species_data_table)
130
+ _validate_table(table_name)
88
131
  """
89
132
 
90
133
  compartments: pd.DataFrame
@@ -162,193 +205,176 @@ class SBML_dfs:
162
205
  '"validate" = False so "resolve" will be ignored (eventhough it was True)'
163
206
  )
164
207
 
165
- def get_table(
166
- self, entity_type: str, required_attributes: None | set[str] = None
167
- ) -> pd.DataFrame:
208
+ # =============================================================================
209
+ # PUBLIC METHODS (ALPHABETICAL ORDER)
210
+ # =============================================================================
211
+
212
+ def add_reactions_data(self, label: str, data: pd.DataFrame):
168
213
  """
169
- Get a table from the SBML_dfs object with optional attribute validation.
214
+ Add additional reaction data with validation.
170
215
 
171
216
  Parameters
172
217
  ----------
173
- entity_type : str
174
- The type of entity table to retrieve (e.g., 'species', 'reactions')
175
- required_attributes : Optional[Set[str]], optional
176
- Set of attributes that must be present in the table, by default None.
177
- Must be passed as a set, e.g. {'id'}, not a string.
178
-
179
- Returns
180
- -------
181
- pd.DataFrame
182
- The requested table
218
+ label : str
219
+ Label for the new data
220
+ data : pd.DataFrame
221
+ Data to add, must be indexed by reaction_id
183
222
 
184
223
  Raises
185
224
  ------
186
225
  ValueError
187
- If entity_type is invalid or required attributes are missing
188
- TypeError
189
- If required_attributes is not a set
226
+ If the data is invalid or label already exists
190
227
  """
191
-
192
- schema = self.schema
193
-
194
- if entity_type not in schema.keys():
228
+ self._validate_reactions_data(data)
229
+ if label in self.reactions_data:
195
230
  raise ValueError(
196
- f"{entity_type} does not match a table in the SBML_dfs object. The tables "
197
- f"which are present are {', '.join(schema.keys())}"
198
- )
199
-
200
- if required_attributes is not None:
201
- if not isinstance(required_attributes, set):
202
- raise TypeError(
203
- f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
204
- "Did you pass a string instead of a set?"
205
- )
206
-
207
- # determine whether required_attributes are appropriate
208
- VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
209
- invalid_required_attributes = required_attributes.difference(
210
- VALID_REQUIRED_ATTRIBUTES
231
+ f"{label} already exists in reactions_data. " "Drop it first."
211
232
  )
233
+ self.reactions_data[label] = data
212
234
 
213
- if len(invalid_required_attributes) > 0:
214
- raise ValueError(
215
- f"The following required attributes are not valid: {', '.join(invalid_required_attributes)}. "
216
- f"Requiered attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
217
- )
235
+ def add_species_data(self, label: str, data: pd.DataFrame):
236
+ """
237
+ Add additional species data with validation.
218
238
 
219
- # determine if required_attributes are satisified
220
- invalid_attrs = [
221
- s for s in required_attributes if s not in schema[entity_type].keys()
222
- ]
223
- if len(invalid_attrs) > 0:
224
- raise ValueError(
225
- f"The following required attributes are not present for the {entity_type} table: "
226
- f"{', '.join(invalid_attrs)}."
227
- )
239
+ Parameters
240
+ ----------
241
+ label : str
242
+ Label for the new data
243
+ data : pd.DataFrame
244
+ Data to add, must be indexed by species_id
228
245
 
229
- return getattr(self, entity_type)
246
+ Raises
247
+ ------
248
+ ValueError
249
+ If the data is invalid or label already exists
250
+ """
251
+ self._validate_species_data(data)
252
+ if label in self.species_data:
253
+ raise ValueError(
254
+ f"{label} already exists in species_data. " "Drop it first."
255
+ )
256
+ self.species_data[label] = data
230
257
 
231
- def search_by_ids(
258
+ def export_sbml_dfs(
232
259
  self,
233
- ids: list[str],
234
- entity_type: str,
235
- identifiers_df: pd.DataFrame,
236
- ontologies: None | set[str] = None,
237
- ) -> tuple[pd.DataFrame, pd.DataFrame]:
260
+ model_prefix: str,
261
+ outdir: str,
262
+ overwrite: bool = False,
263
+ dogmatic: bool = True,
264
+ ) -> None:
238
265
  """
239
- Find entities and identifiers matching a set of query IDs.
266
+ Export SBML_dfs
240
267
 
241
- Parameters
242
- ----------
243
- ids : List[str]
244
- List of identifiers to search for
245
- entity_type : str
246
- Type of entity to search (e.g., 'species', 'reactions')
247
- identifiers_df : pd.DataFrame
248
- DataFrame containing identifier mappings
249
- ontologies : Optional[Set[str]], optional
250
- Set of ontologies to filter by, by default None
268
+ Export summaries of species identifiers and each table underlying
269
+ an SBML_dfs pathway model
270
+
271
+ Parameters
272
+ ----------
273
+ model_prefix: str
274
+ Label to prepend to all exported files
275
+ outdir: str
276
+ Path to an existing directory where results should be saved
277
+ overwrite: bool
278
+ Should the directory be overwritten if it already exists?
279
+ dogmatic: bool
280
+ If True then treat genes, transcript, and proteins as separate species. If False
281
+ then treat them interchangeably.
251
282
 
252
283
  Returns
253
284
  -------
254
- Tuple[pd.DataFrame, pd.DataFrame]
255
- - Matching entities
256
- - Matching identifiers
257
-
258
- Raises
259
- ------
260
- ValueError
261
- If entity_type is invalid or ontologies are invalid
262
- TypeError
263
- If ontologies is not a set
285
+ None
264
286
  """
265
- # validate inputs
266
- entity_table = self.get_table(entity_type, required_attributes={"id"})
267
- entity_pk = self.schema[entity_type]["pk"]
287
+ if not isinstance(model_prefix, str):
288
+ raise TypeError(
289
+ f"model_prefix was a {type(model_prefix)} " "and must be a str"
290
+ )
291
+ if not isinstance(self, SBML_dfs):
292
+ raise TypeError(
293
+ f"sbml_dfs was a {type(self)} and must" " be an sbml.SBML_dfs"
294
+ )
268
295
 
269
- utils.match_pd_vars(
270
- identifiers_df,
271
- req_vars={
272
- entity_pk,
273
- IDENTIFIERS.ONTOLOGY,
274
- IDENTIFIERS.IDENTIFIER,
275
- IDENTIFIERS.URL,
276
- IDENTIFIERS.BQB,
277
- },
278
- allow_series=False,
279
- ).assert_present()
296
+ # filter to identifiers which make sense when mapping from ids -> species
297
+ species_identifiers = self.get_characteristic_species_ids(dogmatic=dogmatic)
280
298
 
281
- if ontologies is not None:
282
- if not isinstance(ontologies, set):
283
- # for clarity this should not be reachable based on type hints
284
- raise TypeError(
285
- f"ontologies must be a set, but got {type(ontologies).__name__}"
286
- )
287
- ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
288
- invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
289
- if len(invalid_ontologies) > 0:
290
- raise ValueError(
291
- f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
292
- f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
299
+ try:
300
+ utils.initialize_dir(outdir, overwrite=overwrite)
301
+ except FileExistsError:
302
+ logger.warning(
303
+ f"Directory {outdir} already exists and overwrite is False. "
304
+ "Files will be added to the existing directory."
305
+ )
306
+ with open_fs(outdir, writeable=True) as fs:
307
+ species_identifiers_path = (
308
+ model_prefix + NAPISTU_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
309
+ )
310
+ with fs.openbin(species_identifiers_path, "w") as f:
311
+ species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
312
+ f, sep="\t", index=False
293
313
  )
294
314
 
295
- # fitler to just to identifiers matchign the ontologies of interest
296
- identifiers_df = identifiers_df.query("ontology in @ontologies")
315
+ # export jsons
316
+ species_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.SPECIES
317
+ reactions_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.REACTIONS
318
+ reation_species_path = (
319
+ model_prefix + NAPISTU_STANDARD_OUTPUTS.REACTION_SPECIES
320
+ )
321
+ compartments_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.COMPARTMENTS
322
+ compartmentalized_species_path = (
323
+ model_prefix + NAPISTU_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
324
+ )
325
+ with fs.openbin(species_path, "w") as f:
326
+ self.species[[SBML_DFS.S_NAME]].to_json(f)
297
327
 
298
- matching_identifiers = identifiers_df.loc[
299
- identifiers_df["identifier"].isin(ids)
300
- ]
301
- entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
328
+ with fs.openbin(reactions_path, "w") as f:
329
+ self.reactions[[SBML_DFS.R_NAME]].to_json(f)
302
330
 
303
- return entity_subset, matching_identifiers
331
+ with fs.openbin(reation_species_path, "w") as f:
332
+ self.reaction_species.to_json(f)
304
333
 
305
- def search_by_name(
306
- self, name: str, entity_type: str, partial_match: bool = True
307
- ) -> pd.DataFrame:
334
+ with fs.openbin(compartments_path, "w") as f:
335
+ self.compartments[[SBML_DFS.C_NAME]].to_json(f)
336
+
337
+ with fs.openbin(compartmentalized_species_path, "w") as f:
338
+ self.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
339
+ f
340
+ )
341
+
342
+ return None
343
+
344
+ def get_characteristic_species_ids(self, dogmatic: bool = True) -> pd.DataFrame:
308
345
  """
309
- Find entities by exact or partial name match.
346
+ Get Characteristic Species IDs
347
+
348
+ List the systematic identifiers which are characteristic of molecular species, e.g., excluding subcomponents, and optionally, treating proteins, transcripts, and genes equivalently.
310
349
 
311
350
  Parameters
312
351
  ----------
313
- name : str
314
- Name to search for
315
- entity_type : str
316
- Type of entity to search (e.g., 'species', 'reactions')
317
- partial_match : bool, optional
318
- Whether to allow partial string matches, by default True
352
+ sbml_dfs : sbml_dfs_core.SBML_dfs
353
+ The SBML_dfs object.
354
+ dogmatic : bool, default=True
355
+ Whether to use the dogmatic flag to determine which BQB attributes are valid.
319
356
 
320
357
  Returns
321
358
  -------
322
359
  pd.DataFrame
323
- Matching entities
360
+ A DataFrame containing the systematic identifiers which are characteristic of molecular species.
324
361
  """
325
- entity_table = self.get_table(entity_type, required_attributes={"label"})
326
- label_attr = self.schema[entity_type]["label"]
327
362
 
328
- if partial_match:
329
- matches = entity_table.loc[
330
- entity_table[label_attr].str.contains(name, case=False)
331
- ]
332
- else:
333
- matches = entity_table.loc[entity_table[label_attr].str.lower() == name]
334
- return matches
363
+ # select valid BQB attributes based on dogmatic flag
364
+ defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(
365
+ dogmatic
366
+ )
335
367
 
336
- def get_species_features(self) -> pd.DataFrame:
337
- """
338
- Get additional attributes of species.
368
+ # pre-summarize ontologies
369
+ species_identifiers = self.get_identifiers(SBML_DFS.SPECIES)
339
370
 
340
- Returns
341
- -------
342
- pd.DataFrame
343
- Species with additional features including:
344
- - species_type: Classification of the species (e.g., metabolite, protein)
345
- """
346
- species = self.species
347
- augmented_species = species.assign(
348
- **{"species_type": lambda d: d["s_Identifiers"].apply(species_type_types)}
371
+ # drop some BQB_HAS_PART annotations
372
+ species_identifiers = sbml_dfs_utils.filter_to_characteristic_species_ids(
373
+ species_identifiers,
374
+ defining_biological_qualifiers=defining_biological_qualifiers,
349
375
  )
350
376
 
351
- return augmented_species
377
+ return species_identifiers
352
378
 
353
379
  def get_cspecies_features(self) -> pd.DataFrame:
354
380
  """
@@ -445,113 +471,28 @@ class SBML_dfs:
445
471
 
446
472
  return named_identifiers
447
473
 
448
- def get_uri_urls(
449
- self,
450
- entity_type: str,
451
- entity_ids: Iterable[str] | None = None,
452
- required_ontology: str | None = None,
453
- ) -> pd.Series:
474
+ def get_network_summary(self) -> Mapping[str, Any]:
454
475
  """
455
- Get reference URLs for specified entities.
456
-
457
- Parameters
458
- ----------
459
- entity_type : str
460
- Type of entity to get URLs for (e.g., 'species', 'reactions')
461
- entity_ids : Optional[Iterable[str]], optional
462
- Specific entities to get URLs for, by default None (all entities)
463
- required_ontology : Optional[str], optional
464
- Specific ontology to get URLs from, by default None
476
+ Get diagnostic statistics about the network.
465
477
 
466
478
  Returns
467
479
  -------
468
- pd.Series
469
- Series mapping entity IDs to their reference URLs
470
-
471
- Raises
472
- ------
473
- ValueError
474
- If entity_type is invalid
475
- """
476
- schema = self.schema
477
-
478
- # valid entities and their identifier variables
479
- valid_entity_types = [
480
- SBML_DFS.COMPARTMENTS,
481
- SBML_DFS.SPECIES,
482
- SBML_DFS.REACTIONS,
483
- ]
484
-
485
- if entity_type not in valid_entity_types:
486
- raise ValueError(
487
- f"{entity_type} is an invalid entity_type; valid types "
488
- f"are {', '.join(valid_entity_types)}"
489
- )
490
-
491
- entity_table = getattr(self, entity_type)
492
-
493
- if entity_ids is not None:
494
- # ensure that entity_ids are unique and then convert back to list
495
- # to support pandas indexing
496
- entity_ids = list(set(entity_ids))
497
-
498
- # filter to a subset of identifiers if one is provided
499
- entity_table = entity_table.loc[entity_ids]
500
-
501
- # create a dataframe of all identifiers for the select entities
502
- all_ids = pd.concat(
503
- [
504
- sbml_dfs_utils._stub_ids(
505
- entity_table[schema[entity_type]["id"]].iloc[i].ids
506
- ).assign(id=entity_table.index[i])
507
- for i in range(0, entity_table.shape[0])
508
- ]
509
- ).rename(columns={"id": schema[entity_type]["pk"]})
510
-
511
- # set priorities for ontologies and bqb terms
512
-
513
- if required_ontology is None:
514
- all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
515
- ONTOLOGY_PRIORITIES, how="left"
516
- )
517
- else:
518
- ontology_priorities = pd.DataFrame(
519
- [{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
520
- )
521
- # if only a single ontology is sought then just return matching entries
522
- all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
523
- ontology_priorities, how="inner"
524
- )
525
-
526
- uri_urls = (
527
- all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
528
- .groupby(schema[entity_type]["pk"])
529
- .first()[IDENTIFIERS.URL]
530
- )
531
- return uri_urls
532
-
533
- def get_network_summary(self) -> Mapping[str, Any]:
534
- """
535
- Get diagnostic statistics about the network.
536
-
537
- Returns
538
- -------
539
- Mapping[str, Any]
540
- Dictionary of diagnostic statistics including:
541
- - n_species_types: Number of species types
542
- - dict_n_species_per_type: Number of species per type
543
- - n_species: Number of species
544
- - n_cspecies: Number of compartmentalized species
545
- - n_reaction_species: Number of reaction species
546
- - n_reactions: Number of reactions
547
- - n_compartments: Number of compartments
548
- - dict_n_species_per_compartment: Number of species per compartment
549
- - stats_species_per_reaction: Statistics on reactands per reaction
550
- - top10_species_per_reaction: Top 10 reactions by number of reactands
551
- - stats_degree: Statistics on species connectivity
552
- - top10_degree: Top 10 species by connectivity
553
- - stats_identifiers_per_species: Statistics on identifiers per species
554
- - top10_identifiers_per_species: Top 10 species by number of identifiers
480
+ Mapping[str, Any]
481
+ Dictionary of diagnostic statistics including:
482
+ - n_species_types: Number of species types
483
+ - dict_n_species_per_type: Number of species per type
484
+ - n_species: Number of species
485
+ - n_cspecies: Number of compartmentalized species
486
+ - n_reaction_species: Number of reaction species
487
+ - n_reactions: Number of reactions
488
+ - n_compartments: Number of compartments
489
+ - dict_n_species_per_compartment: Number of species per compartment
490
+ - stats_species_per_reaction: Statistics on reactants per reaction
491
+ - top10_species_per_reaction: Top 10 reactions by number of reactants
492
+ - stats_degree: Statistics on species connectivity
493
+ - top10_degree: Top 10 species by connectivity
494
+ - stats_identifiers_per_species: Statistics on identifiers per species
495
+ - top10_identifiers_per_species: Top 10 species by number of identifiers
555
496
  """
556
497
  stats: MutableMapping[str, Any] = {}
557
498
  species_features = self.get_species_features()
@@ -616,1986 +557,1359 @@ class SBML_dfs:
616
557
 
617
558
  return stats
618
559
 
619
- def add_species_data(self, label: str, data: pd.DataFrame):
560
+ def get_species_features(self) -> pd.DataFrame:
620
561
  """
621
- Add additional species data with validation.
622
-
623
- Parameters
624
- ----------
625
- label : str
626
- Label for the new data
627
- data : pd.DataFrame
628
- Data to add, must be indexed by species_id
562
+ Get additional attributes of species.
629
563
 
630
- Raises
631
- ------
632
- ValueError
633
- If the data is invalid or label already exists
564
+ Returns
565
+ -------
566
+ pd.DataFrame
567
+ Species with additional features including:
568
+ - species_type: Classification of the species (e.g., metabolite, protein)
634
569
  """
635
- self._validate_species_data(data)
636
- if label in self.species_data:
637
- raise ValueError(
638
- f"{label} already exists in species_data. " "Drop it first."
639
- )
640
- self.species_data[label] = data
570
+ species = self.species
571
+ augmented_species = species.assign(
572
+ **{
573
+ "species_type": lambda d: d["s_Identifiers"].apply(
574
+ sbml_dfs_utils.species_type_types
575
+ )
576
+ }
577
+ )
641
578
 
642
- def remove_species_data(self, label: str):
643
- """
644
- Remove species data by label.
645
- """
646
- self._remove_entity_data(SBML_DFS.SPECIES, label)
579
+ return augmented_species
647
580
 
648
- def add_reactions_data(self, label: str, data: pd.DataFrame):
581
+ def get_table(
582
+ self, entity_type: str, required_attributes: None | set[str] = None
583
+ ) -> pd.DataFrame:
649
584
  """
650
- Add additional reaction data with validation.
585
+ Get a table from the SBML_dfs object with optional attribute validation.
651
586
 
652
587
  Parameters
653
588
  ----------
654
- label : str
655
- Label for the new data
656
- data : pd.DataFrame
657
- Data to add, must be indexed by reaction_id
589
+ entity_type : str
590
+ The type of entity table to retrieve (e.g., 'species', 'reactions')
591
+ required_attributes : Optional[Set[str]], optional
592
+ Set of attributes that must be present in the table, by default None.
593
+ Must be passed as a set, e.g. {'id'}, not a string.
594
+
595
+ Returns
596
+ -------
597
+ pd.DataFrame
598
+ The requested table
658
599
 
659
600
  Raises
660
601
  ------
661
602
  ValueError
662
- If the data is invalid or label already exists
603
+ If entity_type is invalid or required attributes are missing
604
+ TypeError
605
+ If required_attributes is not a set
663
606
  """
664
- self._validate_reactions_data(data)
665
- if label in self.reactions_data:
666
- raise ValueError(
667
- f"{label} already exists in reactions_data. Drop it first."
668
- )
669
- self.reactions_data[label] = data
670
607
 
671
- def remove_reactions_data(self, label: str):
672
- """
673
- Remove reactions data by label.
674
- """
675
- self._remove_entity_data(SBML_DFS.REACTIONS, label)
608
+ schema = self.schema
676
609
 
677
- def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
678
- """
679
- Remove compartmentalized species and associated reactions.
610
+ if entity_type not in schema.keys():
611
+ raise ValueError(
612
+ f"{entity_type} does not match a table in the SBML_dfs object. The tables "
613
+ f"which are present are {', '.join(schema.keys())}"
614
+ )
680
615
 
681
- Starting with a set of compartmentalized species, determine which reactions
682
- should be removed based on their removal. Then remove these reactions,
683
- compartmentalized species, and species.
616
+ if required_attributes is not None:
617
+ if not isinstance(required_attributes, set):
618
+ raise TypeError(
619
+ f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
620
+ "Did you pass a string instead of a set?"
621
+ )
684
622
 
685
- Parameters
686
- ----------
687
- sc_ids : Iterable[str]
688
- IDs of compartmentalized species to remove
689
- """
623
+ # determine whether required_attributes are appropriate
624
+ VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
625
+ invalid_required_attributes = required_attributes.difference(
626
+ VALID_REQUIRED_ATTRIBUTES
627
+ )
690
628
 
691
- # find reactions which should be totally removed since they are losing critical species
692
- removed_reactions = _find_underspecified_reactions_by_scids(self, sc_ids)
693
- self.remove_reactions(removed_reactions)
629
+ if len(invalid_required_attributes) > 0:
630
+ raise ValueError(
631
+ f"The following required attributes are not valid: {', '.join(invalid_required_attributes)}. "
632
+ f"Requiered attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
633
+ )
694
634
 
695
- self._remove_compartmentalized_species(sc_ids)
635
+ # determine if required_attributes are satisfied
636
+ invalid_attrs = [
637
+ s for s in required_attributes if s not in schema[entity_type].keys()
638
+ ]
639
+ if len(invalid_attrs) > 0:
640
+ raise ValueError(
641
+ f"The following required attributes are not present for the {entity_type} table: "
642
+ f"{', '.join(invalid_attrs)}."
643
+ )
696
644
 
697
- # remove species (and their associated species data if all their cspecies have been lost)
698
- self._remove_unused_species()
645
+ return getattr(self, entity_type)
699
646
 
700
- def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
647
+ def get_uri_urls(
648
+ self,
649
+ entity_type: str,
650
+ entity_ids: Iterable[str] | None = None,
651
+ required_ontology: str | None = None,
652
+ ) -> pd.Series:
701
653
  """
702
- Remove reactions from the model.
654
+ Get reference URLs for specified entities.
703
655
 
704
656
  Parameters
705
657
  ----------
706
- r_ids : Iterable[str]
707
- IDs of reactions to remove
708
- remove_species : bool, optional
709
- Whether to remove species that are no longer part of any reactions,
710
- by default False
711
- """
712
- # remove corresponding reactions_species
713
- self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
714
- # remove reactions
715
- self.reactions = self.reactions.drop(index=list(r_ids))
716
- # remove reactions_data
717
- if hasattr(self, "reactions_data"):
718
- for k, data in self.reactions_data.items():
719
- self.reactions_data[k] = data.drop(index=list(r_ids))
720
- # remove species if requested
721
- if remove_species:
722
- self._remove_unused_cspecies()
723
- self._remove_unused_species()
724
-
725
- def validate(self):
726
- """
727
- Validate the SBML_dfs structure and relationships.
658
+ entity_type : str
659
+ Type of entity to get URLs for (e.g., 'species', 'reactions')
660
+ entity_ids : Optional[Iterable[str]], optional
661
+ Specific entities to get URLs for, by default None (all entities)
662
+ required_ontology : Optional[str], optional
663
+ Specific ontology to get URLs from, by default None
728
664
 
729
- Checks:
730
- - Schema existence
731
- - Required tables presence
732
- - Individual table structure
733
- - Primary key uniqueness
734
- - Foreign key relationships
735
- - Optional data table validity
736
- - Reaction species validity
665
+ Returns
666
+ -------
667
+ pd.Series
668
+ Series mapping entity IDs to their reference URLs
737
669
 
738
670
  Raises
739
671
  ------
740
672
  ValueError
741
- If any validation check fails
673
+ If entity_type is invalid
742
674
  """
675
+ schema = self.schema
743
676
 
744
- if not hasattr(self, "schema"):
745
- raise ValueError("No schema found")
746
-
747
- required_tables = self._required_entities
748
- schema_tables = set(self.schema.keys())
677
+ # valid entities and their identifier variables
678
+ valid_entity_types = [
679
+ SBML_DFS.COMPARTMENTS,
680
+ SBML_DFS.SPECIES,
681
+ SBML_DFS.REACTIONS,
682
+ ]
749
683
 
750
- extra_tables = schema_tables.difference(required_tables)
751
- if len(extra_tables) != 0:
752
- logger.debug(
753
- f"{len(extra_tables)} unexpected tables found: "
754
- f"{', '.join(extra_tables)}"
684
+ if entity_type not in valid_entity_types:
685
+ raise ValueError(
686
+ f"{entity_type} is an invalid entity_type; valid types "
687
+ f"are {', '.join(valid_entity_types)}"
755
688
  )
756
689
 
757
- missing_tables = required_tables.difference(schema_tables)
758
- if len(missing_tables) != 0:
759
- raise ValueError(
760
- f"Missing {len(missing_tables)} required tables: "
761
- f"{', '.join(missing_tables)}"
762
- )
763
-
764
- # check individual tables
765
- for table in required_tables:
766
- self._validate_table(table)
690
+ entity_table = getattr(self, entity_type)
767
691
 
768
- # check whether pks and fks agree
769
- pk_df = pd.DataFrame(
770
- [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
771
- )
692
+ if entity_ids is not None:
693
+ # ensure that entity_ids are unique and then convert back to list
694
+ # to support pandas indexing
695
+ entity_ids = list(set(entity_ids))
772
696
 
773
- fk_df = (
774
- pd.DataFrame(
775
- [
776
- {"fk_table": k, "fk": v["fk"]}
777
- for k, v in self.schema.items()
778
- if "fk" in v.keys()
779
- ]
780
- )
781
- .set_index("fk_table")["fk"]
782
- .apply(pd.Series)
783
- .reset_index()
784
- .melt(id_vars="fk_table")
785
- .drop(["variable"], axis=1)
786
- .rename(columns={"value": "key"})
787
- )
697
+ # filter to a subset of identifiers if one is provided
698
+ entity_table = entity_table.loc[entity_ids]
788
699
 
789
- pk_fk_correspondences = pk_df.merge(fk_df)
700
+ # create a dataframe of all identifiers for the select entities
701
+ all_ids = pd.concat(
702
+ [
703
+ sbml_dfs_utils._id_dict_to_df(
704
+ entity_table[schema[entity_type]["id"]].iloc[i].ids
705
+ ).assign(id=entity_table.index[i])
706
+ for i in range(0, entity_table.shape[0])
707
+ ]
708
+ ).rename(columns={"id": schema[entity_type]["pk"]})
790
709
 
791
- for i in range(0, pk_fk_correspondences.shape[0]):
792
- pk_table_keys = set(
793
- getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
794
- )
795
- if None in pk_table_keys:
796
- raise ValueError(
797
- f"{pk_fk_correspondences['pk_table'][i]} had "
798
- "missing values in its index"
799
- )
710
+ # set priorities for ontologies and bqb terms
800
711
 
801
- fk_table_keys = set(
802
- getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
803
- :, pk_fk_correspondences["key"][i]
804
- ]
712
+ if required_ontology is None:
713
+ all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
714
+ ONTOLOGY_PRIORITIES, how="left"
805
715
  )
806
- if None in fk_table_keys:
807
- raise ValueError(
808
- f"{pk_fk_correspondences['fk_table'][i]} included "
809
- f"missing {pk_fk_correspondences['key'][i]} values"
810
- )
811
-
812
- # all foreign keys need to match a primary key
813
- extra_fks = fk_table_keys.difference(pk_table_keys)
814
- if len(extra_fks) != 0:
815
- raise ValueError(
816
- f"{len(extra_fks)} distinct "
817
- f"{pk_fk_correspondences['key'][i]} values were"
818
- f" found in {pk_fk_correspondences['fk_table'][i]} "
819
- f"but missing from {pk_fk_correspondences['pk_table'][i]}."
820
- " All foreign keys must have a matching primary key.\n\n"
821
- f"Extra key are: {', '.join(extra_fks)}"
822
- )
823
-
824
- # check optional data tables:
825
- for k, v in self.species_data.items():
826
- try:
827
- self._validate_species_data(v)
828
- except ValueError as e:
829
- raise ValueError(f"species data {k} was invalid.") from e
830
-
831
- for k, v in self.reactions_data.items():
832
- try:
833
- self._validate_reactions_data(v)
834
- except ValueError as e:
835
- raise ValueError(f"reactions data {k} was invalid.") from e
836
-
837
- # validate reaction_species sbo_terms and stoi
838
- self._validate_reaction_species()
839
-
840
- def validate_and_resolve(self):
841
- """
842
- Validate and attempt to automatically fix common issues.
843
-
844
- This method iteratively:
845
- 1. Attempts validation
846
- 2. If validation fails, tries to resolve the issue
847
- 3. Repeats until validation passes or issue cannot be resolved
848
-
849
- Raises
850
- ------
851
- ValueError
852
- If validation fails and cannot be automatically resolved
853
- """
854
-
855
- current_exception = None
856
- validated = False
857
-
858
- while not validated:
859
- try:
860
- self.validate()
861
- validated = True
862
- except Exception as e:
863
- e_str = str(e)
864
- if e_str == current_exception:
865
- logger.warning(
866
- "Automated resolution of an Exception was attempted but failed"
867
- )
868
- raise e
869
-
870
- # try to resolve
871
- self._attempt_resolve(e)
872
-
873
- def select_species_data(self, species_data_table: str) -> pd.DataFrame:
874
- """
875
- Select a species data table from the SBML_dfs object.
876
-
877
- Parameters
878
- ----------
879
- species_data_table : str
880
- Name of the species data table to select
881
-
882
- Returns
883
- -------
884
- pd.DataFrame
885
- The selected species data table
886
-
887
- Raises
888
- ------
889
- ValueError
890
- If species_data_table is not found
891
- """
892
- # Check if species_data_table exists in sbml_dfs.species_data
893
- if species_data_table not in self.species_data:
894
- raise ValueError(
895
- f"species_data_table {species_data_table} not found in sbml_dfs.species_data. "
896
- f"Available tables: {self.species_data.keys()}"
716
+ else:
717
+ ontology_priorities = pd.DataFrame(
718
+ [{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
897
719
  )
898
-
899
- # Get the species data
900
- return self.species_data[species_data_table]
901
-
902
- def _validate_table(self, table: str) -> None:
903
- """
904
- Validate a table in this SBML_dfs object against its schema.
905
-
906
- This is an internal method that validates a table that is part of this SBML_dfs
907
- object against the schema stored in self.schema.
908
-
909
- Parameters
910
- ----------
911
- table : str
912
- Name of the table to validate
913
-
914
- Raises
915
- ------
916
- ValueError
917
- If the table does not conform to its schema
918
- """
919
- table_schema = self.schema[table]
920
- table_data = getattr(self, table)
921
- _perform_sbml_dfs_table_validation(table_data, table_schema, table)
922
-
923
- def _remove_entity_data(self, entity_type: str, label: str) -> None:
924
- """
925
- Remove data from species_data or reactions_data by table name and label.
926
-
927
- Parameters
928
- ----------
929
- entity_type : str
930
- Name of the table to remove data from ('species' or 'reactions')
931
- label : str
932
- Label of the data to remove
933
-
934
- Notes
935
- -----
936
- If the label does not exist, a warning will be logged that includes the existing labels.
937
- """
938
- if entity_type not in ENTITIES_W_DATA:
939
- raise ValueError("table_name must be either 'species' or 'reactions'")
940
-
941
- data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
942
- if label not in data_dict:
943
- existing_labels = list(data_dict.keys())
944
- logger.warning(
945
- f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
946
- f"Existing labels: {existing_labels}"
720
+ # if only a single ontology is sought then just return matching entries
721
+ all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
722
+ ontology_priorities, how="inner"
947
723
  )
948
- return
949
-
950
- del data_dict[label]
951
-
952
- def _remove_unused_cspecies(self):
953
- """Removes compartmentalized species that are no
954
- longer part of any reactions"""
955
- sc_ids = self._get_unused_cspecies()
956
- self._remove_compartmentalized_species(sc_ids)
957
-
958
- def _get_unused_cspecies(self) -> set[str]:
959
- """Returns a set of compartmentalized species
960
- that are not part of any reactions"""
961
- sc_ids = set(self.compartmentalized_species.index) - set(
962
- self.reaction_species[SBML_DFS.SC_ID]
963
- )
964
- return sc_ids # type: ignore
965
-
966
- def _remove_unused_species(self):
967
- """Removes species that are no longer part of any
968
- compartmentalized species"""
969
- s_ids = self._get_unused_species()
970
- self._remove_species(s_ids)
971
-
972
- def _get_unused_species(self) -> set[str]:
973
- """Returns a list of species that are not part of any reactions"""
974
- s_ids = set(self.species.index) - set(
975
- self.compartmentalized_species[SBML_DFS.S_ID]
976
- )
977
- return s_ids # type: ignore
978
-
979
- def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
980
- """Removes compartmentalized species from the model
981
-
982
- This should not be directly used by the user, as it can lead to
983
- invalid reactions when removing species without a logic to decide
984
- if the reaction needs to be removed as well.
985
724
 
986
- Args:
987
- sc_ids (Iterable[str]): the compartmentalized species to remove
988
- """
989
- # Remove compartmentalized species
990
- self.compartmentalized_species = self.compartmentalized_species.drop(
991
- index=list(sc_ids)
725
+ uri_urls = (
726
+ all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
727
+ .groupby(schema[entity_type]["pk"])
728
+ .first()[IDENTIFIERS.URL]
992
729
  )
993
- # remove corresponding reactions_species
994
- self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")
995
-
996
- def _remove_species(self, s_ids: Iterable[str]):
997
- """Removes species from the model
998
-
999
- This should not be directly used by the user, as it can lead to
1000
- invalid reactions when removing species without a logic to decide
1001
- if the reaction needs to be removed as well.
1002
-
1003
- This removes the species and corresponding compartmentalized species and
1004
- reactions_species.
1005
-
1006
- Args:
1007
- s_ids (Iterable[str]): the species to remove
1008
- """
1009
- sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
1010
- self._remove_compartmentalized_species(sc_ids)
1011
- # Remove species
1012
- self.species = self.species.drop(index=list(s_ids))
1013
- # remove data
1014
- for k, data in self.species_data.items():
1015
- self.species_data[k] = data.drop(index=list(s_ids))
1016
-
1017
- def _validate_species_data(self, species_data_table: pd.DataFrame):
1018
- """Validates species data attribute
1019
-
1020
- Args:
1021
- species_data_table (pd.DataFrame): a species data table
1022
-
1023
- Raises:
1024
- ValueError: s_id not index name
1025
- ValueError: s_id index contains duplicates
1026
- ValueError: s_id not in species table
1027
- """
1028
- _validate_matching_data(species_data_table, self.species)
1029
-
1030
- def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
1031
- """Validates reactions data attribute
1032
-
1033
- Args:
1034
- reactions_data_table (pd.DataFrame): a reactions data table
1035
-
1036
- Raises:
1037
- ValueError: r_id not index name
1038
- ValueError: r_id index contains duplicates
1039
- ValueError: r_id not in reactions table
1040
- """
1041
- _validate_matching_data(reactions_data_table, self.reactions)
1042
-
1043
- def _validate_reaction_species(self):
1044
- if not all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull()):
1045
- raise ValueError(
1046
- "All reaction_species[SBML_DFS.STOICHIOMETRY] must be not null"
1047
- )
1048
-
1049
- # test for null SBO terms
1050
- n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
1051
- if n_null_sbo_terms != 0:
1052
- raise ValueError(
1053
- f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
1054
- )
1055
-
1056
- # find invalid SBO terms
1057
- sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
1058
- invalid_sbo_term_counts = sbo_counts[
1059
- ~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
1060
- ]
1061
-
1062
- if invalid_sbo_term_counts.shape[0] != 0:
1063
- invalid_sbo_counts_str = ", ".join(
1064
- [f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
1065
- )
1066
- raise ValueError(
1067
- f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
1068
- f"defined {invalid_sbo_counts_str}"
1069
- )
1070
-
1071
- def _attempt_resolve(self, e):
1072
- str_e = str(e)
1073
- if str_e == "compartmentalized_species included missing c_id values":
1074
- logger.warning(str_e)
1075
- logger.warning(
1076
- "Attempting to resolve with infer_uncompartmentalized_species_location()"
1077
- )
1078
- self = infer_uncompartmentalized_species_location(self)
1079
- elif re.search("sbo_terms were not defined", str_e):
1080
- logger.warning(str_e)
1081
- logger.warning("Attempting to resolve with infer_sbo_terms()")
1082
- self = infer_sbo_terms(self)
1083
- else:
1084
- logger.warning(
1085
- "An error occurred which could not be automatically resolved"
1086
- )
1087
- raise e
1088
-
1089
-
1090
- def species_status(s_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
1091
- """
1092
- Species Status
1093
-
1094
- Return all of the reaction's a species particpates in.
1095
-
1096
- Parameters:
1097
- s_id: str
1098
- A species ID
1099
- sbml_dfs: SBML_dfs
1100
-
1101
- Returns:
1102
- pd.DataFrame, one row reaction
1103
- """
1104
-
1105
- matching_species = sbml_dfs.species.loc[s_id]
1106
-
1107
- if not isinstance(matching_species, pd.Series):
1108
- raise ValueError(f"{s_id} did not match a single species")
1109
-
1110
- # find all rxns species particpate in
1111
-
1112
- matching_compartmentalized_species = sbml_dfs.compartmentalized_species[
1113
- sbml_dfs.compartmentalized_species.s_id.isin([s_id])
1114
- ]
1115
-
1116
- rxns_participating = sbml_dfs.reaction_species[
1117
- sbml_dfs.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
1118
- ]
1119
-
1120
- # find all participants in these rxns
1121
-
1122
- full_rxns_participating = sbml_dfs.reaction_species[
1123
- sbml_dfs.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
1124
- ].merge(
1125
- sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
1126
- )
1127
-
1128
- reaction_descriptions = pd.concat(
1129
- [
1130
- reaction_summary(x, sbml_dfs)
1131
- for x in set(full_rxns_participating[SBML_DFS.R_ID].tolist())
1132
- ]
1133
- )
1134
-
1135
- status = (
1136
- full_rxns_participating.loc[
1137
- full_rxns_participating[SBML_DFS.SC_ID].isin(
1138
- matching_compartmentalized_species.index.values.tolist()
1139
- ),
1140
- [SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
1141
- ]
1142
- .merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
1143
- .reset_index(drop=True)
1144
- .drop(SBML_DFS.R_ID, axis=1)
1145
- )
1146
-
1147
- return status
1148
-
1149
-
1150
- def reaction_summary(r_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
1151
- """
1152
- Reaction Summary
1153
-
1154
- Return a reaction's name and a human-readable formula.
1155
-
1156
- Parameters:
1157
- r_id: str
1158
- A reaction ID
1159
- sbml_dfs: SBML_dfs
1160
-
1161
- Returns:
1162
- one row pd.DataFrame
1163
- """
1164
-
1165
- logger.warning(
1166
- "reaction_summary is deprecated and will be removed in a future version of rcpr; "
1167
- "please use reaction_summaries() instead"
1168
- )
1169
-
1170
- matching_reaction = sbml_dfs.reactions.loc[r_id]
730
+ return uri_urls
1171
731
 
1172
- if not isinstance(matching_reaction, pd.Series):
1173
- raise ValueError(f"{r_id} did not match a single reaction")
732
+ def infer_sbo_terms(self):
733
+ """
734
+ Infer SBO Terms
1174
735
 
1175
- matching_reaction = sbml_dfs.reactions.loc[r_id]
736
+ Define SBO terms based on stoichiometry for reaction_species with missing terms.
737
+ Modifies the SBML_dfs object in-place.
1176
738
 
1177
- matching_reaction_species = sbml_dfs.reaction_species[
1178
- sbml_dfs.reaction_species.r_id.isin([r_id])
1179
- ].merge(
1180
- sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
1181
- )
739
+ Returns
740
+ -------
741
+ None (modifies SBML_dfs object in-place)
742
+ """
743
+ valid_sbo_terms = self.reaction_species[
744
+ self.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
745
+ ]
1182
746
 
1183
- # collapse all reaction species to a formula string
747
+ invalid_sbo_terms = self.reaction_species[
748
+ ~self.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
749
+ ]
1184
750
 
1185
- if len(matching_reaction_species[SBML_DFS.C_ID].unique()) == 1:
1186
- augmented_matching_reaction_species = matching_reaction_species.merge(
1187
- sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True
1188
- ).merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
1189
- str_formula = (
1190
- construct_formula_string(
1191
- augmented_matching_reaction_species, sbml_dfs.reactions, SBML_DFS.S_NAME
1192
- )
1193
- + " ["
1194
- + augmented_matching_reaction_species[SBML_DFS.C_NAME].iloc[0]
1195
- + "]"
1196
- )
1197
- else:
1198
- str_formula = construct_formula_string(
1199
- matching_reaction_species, sbml_dfs.reactions, SBML_DFS.SC_NAME
1200
- )
751
+ if not all(self.reaction_species[SBML_DFS.SBO_TERM].notnull()):
752
+ raise ValueError("All reaction_species[SBML_DFS.SBO_TERM] must be not null")
753
+ if invalid_sbo_terms.shape[0] == 0:
754
+ logger.info("All sbo_terms were valid; nothing to update.")
755
+ return
1201
756
 
1202
- output = pd.DataFrame(
1203
- {
1204
- SBML_DFS.R_NAME: matching_reaction[SBML_DFS.R_NAME],
1205
- "r_formula_str": str_formula,
1206
- },
1207
- index=[r_id],
1208
- )
757
+ logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")
1209
758
 
1210
- output.index.name = SBML_DFS.R_ID
759
+ # add missing/invalid terms based on stoichiometry
760
+ invalid_sbo_terms.loc[
761
+ invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] < 0, SBML_DFS.SBO_TERM
762
+ ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
1211
763
 
1212
- return output
764
+ invalid_sbo_terms.loc[
765
+ invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
766
+ ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
1213
767
 
768
+ invalid_sbo_terms.loc[
769
+ invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] == 0, SBML_DFS.SBO_TERM
770
+ ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]
1214
771
 
1215
- def reaction_summaries(sbml_dfs: SBML_dfs, r_ids=None) -> pd.Series:
1216
- """
1217
- Reaction Summary
772
+ updated_reaction_species = pd.concat(
773
+ [valid_sbo_terms, invalid_sbo_terms]
774
+ ).sort_index()
1218
775
 
1219
- Return human-readable formulas for reactions.
776
+ if self.reaction_species.shape[0] != updated_reaction_species.shape[0]:
777
+ raise ValueError(
778
+ f"Trying to overwrite {self.reaction_species.shape[0]} reaction_species with {updated_reaction_species.shape[0]}"
779
+ )
780
+ self.reaction_species = updated_reaction_species
781
+ return
1220
782
 
1221
- Parameters:
1222
- ----------
1223
- sbml_dfs: sbml.SBML_dfs
1224
- A relational mechanistic model
1225
- r_ids: [str], str or None
1226
- Reaction IDs or None for all reactions
783
+ def infer_uncompartmentalized_species_location(self):
784
+ """
785
+ Infer Uncompartmentalized Species Location
1227
786
 
1228
- Returns:
1229
- ----------
1230
- formula_strs: pd.Series
1231
- """
787
+ If the compartment of a subset of compartmentalized species
788
+ was not specified, infer an appropriate compartment from
789
+ other members of reactions they participate in.
1232
790
 
1233
- if isinstance(r_ids, str):
1234
- r_ids = [r_ids]
791
+ This method modifies the SBML_dfs object in-place.
1235
792
 
1236
- if r_ids is None:
1237
- matching_reactions = sbml_dfs.reactions
1238
- else:
1239
- matching_reactions = sbml_dfs.reactions.loc[r_ids]
793
+ Returns
794
+ -------
795
+ None (modifies SBML_dfs object in-place)
796
+ """
797
+ default_compartment = (
798
+ self.compartmentalized_species.value_counts(SBML_DFS.C_ID)
799
+ .rename("N")
800
+ .reset_index()
801
+ .sort_values("N", ascending=False)[SBML_DFS.C_ID][0]
802
+ )
803
+ if not isinstance(default_compartment, str):
804
+ raise ValueError(
805
+ "No default compartment could be found - compartment "
806
+ "information may not be present"
807
+ )
1240
808
 
1241
- matching_reaction_species = sbml_dfs.reaction_species[
1242
- sbml_dfs.reaction_species.r_id.isin(matching_reactions.index)
1243
- ].merge(
1244
- sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
1245
- )
809
+ # infer the compartments of species missing compartments
810
+ missing_compartment_scids = self.compartmentalized_species[
811
+ self.compartmentalized_species[SBML_DFS.C_ID].isnull()
812
+ ].index.tolist()
813
+ if len(missing_compartment_scids) == 0:
814
+ logger.info(
815
+ "All compartmentalized species have compartments, "
816
+ "returning input SBML_dfs"
817
+ )
818
+ return self
819
+
820
+ participating_reactions = (
821
+ self.reaction_species[
822
+ self.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
823
+ ][SBML_DFS.R_ID]
824
+ .unique()
825
+ .tolist()
826
+ )
827
+ reaction_participants = self.reaction_species[
828
+ self.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
829
+ ].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
830
+ reaction_participants = reaction_participants.merge(
831
+ self.compartmentalized_species[SBML_DFS.C_ID],
832
+ left_on=SBML_DFS.SC_ID,
833
+ right_index=True,
834
+ )
1246
835
 
1247
- # split into within compartment and cross-compartment reactions
1248
- r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
1249
- SBML_DFS.C_ID
1250
- ].nunique()
1251
-
1252
- # identify reactions which work across compartments
1253
- r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
1254
- # there species must be labelled with the sc_name to specify where a species exists
1255
- if r_id_cross_compartment.shape[0] > 0:
1256
- rxn_eqtn_cross_compartment = (
1257
- matching_reaction_species[
1258
- matching_reaction_species[SBML_DFS.R_ID].isin(
1259
- r_id_cross_compartment.index
1260
- )
1261
- ]
1262
- .sort_values([SBML_DFS.SC_NAME])
836
+ # find a default compartment to fall back on if all compartmental information is missing
837
+ primary_reaction_compartment = (
838
+ reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
839
+ .rename("N")
840
+ .reset_index()
841
+ .sort_values("N", ascending=False)
1263
842
  .groupby(SBML_DFS.R_ID)
1264
- .apply(
1265
- lambda x: construct_formula_string(
1266
- x, sbml_dfs.reactions, SBML_DFS.SC_NAME
1267
- )
1268
- )
1269
- .rename("r_formula_str")
843
+ .first()[SBML_DFS.C_ID]
844
+ .reset_index()
1270
845
  )
1271
- else:
1272
- rxn_eqtn_cross_compartment = None
1273
-
1274
- # identify reactions which occur within a single compartment; for these the reaction
1275
- # can be labelled with the compartment and individual species can receive a more readable s_name
1276
- r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
1277
- if r_id_within_compartment.shape[0] > 0:
1278
- # add s_name
1279
- augmented_matching_reaction_species = (
1280
- matching_reaction_species[
1281
- matching_reaction_species[SBML_DFS.R_ID].isin(
1282
- r_id_within_compartment.index
1283
- )
846
+
847
+ inferred_compartmentalization = (
848
+ self.reaction_species[
849
+ self.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
1284
850
  ]
1285
- .merge(sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True)
1286
- .merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
1287
- .sort_values([SBML_DFS.S_NAME])
851
+ .merge(primary_reaction_compartment)
852
+ .value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
853
+ .rename("N")
854
+ .reset_index()
855
+ .sort_values("N", ascending=False)
856
+ .groupby(SBML_DFS.SC_ID)
857
+ .first()
858
+ .reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
1288
859
  )
1289
- # create formulas based on s_names of components
1290
- rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
1291
- [SBML_DFS.R_ID, SBML_DFS.C_NAME]
1292
- ).apply(
1293
- lambda x: construct_formula_string(x, sbml_dfs.reactions, SBML_DFS.S_NAME)
860
+ logger.info(
861
+ f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
1294
862
  )
1295
- # add compartment for each reaction
1296
- rxn_eqtn_within_compartment = pd.Series(
1297
- [
1298
- y + ": " + x
1299
- for x, y in zip(
1300
- rxn_eqtn_within_compartment,
1301
- rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.C_NAME),
1302
- )
1303
- ],
1304
- index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
1305
- ).rename("r_formula_str")
1306
- else:
1307
- rxn_eqtn_within_compartment = None
1308
-
1309
- formula_strs = pd.concat([rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment])
1310
-
1311
- return formula_strs
1312
-
1313
-
1314
- def construct_formula_string(
1315
- reaction_species_df: pd.DataFrame,
1316
- reactions_df: pd.DataFrame,
1317
- name_var: str,
1318
- ) -> str:
1319
- """
1320
- Construct Formula String
1321
-
1322
- Convert a table of reaction species into a formula string
1323
863
 
1324
- Parameters:
1325
- ----------
1326
- reaction_species_df: pd.DataFrame
1327
- Table containing a reactions' species
1328
- reactions_df: pd.DataFrame
1329
- smbl.reactions
1330
- name_var: str
1331
- Name used to label species
1332
-
1333
- Returns:
1334
- ----------
1335
- formula_str: str
1336
- String representation of a reactions substrates, products and
1337
- modifiers
864
+ # define where a reaction is most likely to occur based on the compartmentalization of its participants
865
+ species_with_unknown_compartmentalization = set(
866
+ missing_compartment_scids
867
+ ).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
868
+ if len(species_with_unknown_compartmentalization) != 0:
869
+ logger.warning(
870
+ f"{len(species_with_unknown_compartmentalization)} "
871
+ "species compartmentalization could not be inferred"
872
+ " from other reaction participants. Their compartmentalization "
873
+ f"will be set to the default of {default_compartment}"
874
+ )
1338
875
 
1339
- """
876
+ inferred_compartmentalization = pd.concat(
877
+ [
878
+ inferred_compartmentalization,
879
+ pd.DataFrame(
880
+ {
881
+ SBML_DFS.SC_ID: list(
882
+ species_with_unknown_compartmentalization
883
+ )
884
+ }
885
+ ).assign(c_id=default_compartment),
886
+ ]
887
+ )
1340
888
 
1341
- reaction_species_df["label"] = [
1342
- add_stoi_to_species_name(x, y)
1343
- for x, y in zip(
1344
- reaction_species_df[SBML_DFS.STOICHIOMETRY], reaction_species_df[name_var]
1345
- )
1346
- ]
889
+ if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
890
+ raise ValueError(
891
+ f"{inferred_compartmentalization.shape[0]} were inferred but {len(missing_compartment_scids)} are required"
892
+ )
1347
893
 
1348
- rxn_reversible = bool(
1349
- reactions_df.loc[
1350
- reaction_species_df[SBML_DFS.R_ID].iloc[0], SBML_DFS.R_ISREVERSIBLE
1351
- ]
1352
- ) # convert from a np.bool_ to bool if needed
1353
- if not isinstance(rxn_reversible, bool):
1354
- raise TypeError(
1355
- f"rxn_reversible must be a bool, but got {type(rxn_reversible).__name__}"
894
+ updated_compartmentalized_species = pd.concat(
895
+ [
896
+ self.compartmentalized_species[
897
+ ~self.compartmentalized_species[SBML_DFS.C_ID].isnull()
898
+ ],
899
+ self.compartmentalized_species[
900
+ self.compartmentalized_species[SBML_DFS.C_ID].isnull()
901
+ ]
902
+ .drop(SBML_DFS.C_ID, axis=1)
903
+ .merge(
904
+ inferred_compartmentalization,
905
+ left_index=True,
906
+ right_on=SBML_DFS.SC_ID,
907
+ )
908
+ .set_index(SBML_DFS.SC_ID),
909
+ ]
1356
910
  )
1357
911
 
1358
- if rxn_reversible:
1359
- arrow_type = " <-> "
1360
- else:
1361
- arrow_type = " -> "
1362
-
1363
- substrates = " + ".join(
1364
- reaction_species_df["label"][
1365
- reaction_species_df[SBML_DFS.STOICHIOMETRY] < 0
1366
- ].tolist()
1367
- )
1368
- products = " + ".join(
1369
- reaction_species_df["label"][
1370
- reaction_species_df[SBML_DFS.STOICHIOMETRY] > 0
1371
- ].tolist()
1372
- )
1373
- modifiers = " + ".join(
1374
- reaction_species_df["label"][
1375
- reaction_species_df[SBML_DFS.STOICHIOMETRY] == 0
1376
- ].tolist()
1377
- )
1378
- if modifiers != "":
1379
- modifiers = f" ---- modifiers: {modifiers}]"
1380
-
1381
- return f"{substrates}{arrow_type}{products}{modifiers}"
1382
-
912
+ if (
913
+ updated_compartmentalized_species.shape[0]
914
+ != self.compartmentalized_species.shape[0]
915
+ ):
916
+ raise ValueError(
917
+ f"Trying to overwrite {self.compartmentalized_species.shape[0]}"
918
+ " compartmentalized species with "
919
+ f"{updated_compartmentalized_species.shape[0]}"
920
+ )
1383
921
 
1384
- def add_stoi_to_species_name(stoi: float | int, name: str) -> str:
1385
- """
1386
- Add Stoi To Species Name
922
+ if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
923
+ raise ValueError("Some species compartments are still missing")
1387
924
 
1388
- Add # of molecules to a species name
925
+ self.compartmentalized_species = updated_compartmentalized_species
926
+ return
1389
927
 
1390
- Parameters:
1391
- ----------
1392
- stoi: float or int
1393
- Number of molecules
1394
- name: str
1395
- Name of species
928
+ def name_compartmentalized_species(self):
929
+ """
930
+ Name Compartmentalized Species
1396
931
 
1397
- Returns:
1398
- ----------
1399
- name: str
1400
- Name containing number of species
932
+ Rename compartmentalized species if they have the same
933
+ name as their species. Modifies the SBML_dfs object in-place.
1401
934
 
1402
- """
935
+ Returns
936
+ -------
937
+ None (modifies SBML_dfs object in-place)
938
+ """
939
+ augmented_cspecies = self.compartmentalized_species.merge(
940
+ self.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
941
+ ).merge(
942
+ self.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
943
+ )
944
+ augmented_cspecies[SBML_DFS.SC_NAME] = [
945
+ f"{s} [{c}]" if sc == s else sc
946
+ for sc, c, s in zip(
947
+ augmented_cspecies[SBML_DFS.SC_NAME],
948
+ augmented_cspecies[SBML_DFS.C_NAME],
949
+ augmented_cspecies[SBML_DFS.S_NAME],
950
+ )
951
+ ]
1403
952
 
1404
- if stoi in [-1, 0, 1]:
1405
- return name
1406
- else:
1407
- return str(abs(stoi)) + " " + name
953
+ self.compartmentalized_species = augmented_cspecies.loc[
954
+ :, self.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
955
+ ]
956
+ return
1408
957
 
958
+ def reaction_formulas(
959
+ self, r_ids: Optional[Union[str, list[str]]] = None
960
+ ) -> pd.Series:
961
+ """
962
+ Reaction Summary
1409
963
 
1410
- def filter_to_characteristic_species_ids(
1411
- species_ids: pd.DataFrame,
1412
- max_complex_size: int = 4,
1413
- max_promiscuity: int = 20,
1414
- defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
1415
- ) -> pd.DataFrame:
1416
- """
1417
- Filter to Characteristic Species IDs
964
+ Return human-readable formulas for reactions.
1418
965
 
1419
- Remove identifiers corresponding to one component within a large protein
1420
- complexes and non-characteristic annotations such as pubmed references and
1421
- homologues.
966
+ Parameters:
967
+ ----------
968
+ r_ids: [str], str or None
969
+ Reaction IDs or None for all reactions
1422
970
 
1423
- Parameters
971
+ Returns
1424
972
  ----------
1425
- species_ids: pd.DataFrame
1426
- A table of identifiers produced by sdbml_dfs.get_identifiers("species")
1427
- max_complex_size: int
1428
- The largest size of a complex, where BQB_HAS_PART terms will be retained.
1429
- In most cases, complexes are handled with specific formation and
1430
- dissolutation reactions,but these identifiers will be pulled in when
1431
- searching by identifiers or searching the identifiers associated with a
1432
- species against an external resource such as Open Targets.
1433
- max_promiscuity: int
1434
- Maximum number of species where a single molecule can act as a
1435
- BQB_HAS_PART component associated with a single identifier (and common ontology).
1436
- defining_biological_qualifiers (list[str]):
1437
- BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
1438
- permissive settings would include homologs, different forms of the same gene.
1439
-
1440
- Returns:
1441
- --------
1442
- species_id: pd.DataFrame
1443
- Input species filtered to characteristic identifiers
973
+ formula_strs: pd.Series
974
+ """
1444
975
 
1445
- """
976
+ validated_rids = self._validate_r_ids(r_ids)
1446
977
 
1447
- if not isinstance(species_ids, pd.DataFrame):
1448
- raise TypeError(
1449
- f"species_ids was a {type(species_ids)} but must be a pd.DataFrame"
978
+ matching_reaction_species = self.reaction_species[
979
+ self.reaction_species.r_id.isin(validated_rids)
980
+ ].merge(
981
+ self.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
1450
982
  )
1451
983
 
1452
- if not isinstance(max_complex_size, int):
1453
- raise TypeError(
1454
- f"max_complex_size was a {type(max_complex_size)} but must be an int"
1455
- )
984
+ # split into within compartment and cross-compartment reactions
985
+ r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
986
+ SBML_DFS.C_ID
987
+ ].nunique()
1456
988
 
1457
- if not isinstance(max_promiscuity, int):
1458
- raise TypeError(
1459
- f"max_promiscuity was a {type(max_promiscuity)} but must be an int"
1460
- )
989
+ # identify reactions which work across compartments
990
+ r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
991
+ # these species must be labelled with the sc_name to specify where a species exists
992
+ if r_id_cross_compartment.shape[0] > 0:
993
+ rxn_eqtn_cross_compartment = (
994
+ matching_reaction_species[
995
+ matching_reaction_species[SBML_DFS.R_ID].isin(
996
+ r_id_cross_compartment.index
997
+ )
998
+ ]
999
+ .sort_values([SBML_DFS.SC_NAME])
1000
+ .groupby(SBML_DFS.R_ID)
1001
+ .apply(
1002
+ lambda x: sbml_dfs_utils.construct_formula_string(
1003
+ x, self.reactions, SBML_DFS.SC_NAME
1004
+ )
1005
+ )
1006
+ .rename("r_formula_str")
1007
+ )
1008
+ else:
1009
+ rxn_eqtn_cross_compartment = None
1010
+
1011
+ # identify reactions which occur within a single compartment; for these the reaction
1012
+ # can be labelled with the compartment and individual species can receive a more readable s_name
1013
+ r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
1014
+ if r_id_within_compartment.shape[0] > 0:
1015
+ # add s_name
1016
+ augmented_matching_reaction_species = (
1017
+ matching_reaction_species[
1018
+ matching_reaction_species[SBML_DFS.R_ID].isin(
1019
+ r_id_within_compartment.index
1020
+ )
1021
+ ]
1022
+ .merge(self.compartments, left_on=SBML_DFS.C_ID, right_index=True)
1023
+ .merge(self.species, left_on=SBML_DFS.S_ID, right_index=True)
1024
+ .sort_values([SBML_DFS.S_NAME])
1025
+ )
1026
+ # create formulas based on s_names of components
1027
+ rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
1028
+ [SBML_DFS.R_ID, SBML_DFS.C_NAME]
1029
+ ).apply(
1030
+ lambda x: sbml_dfs_utils.construct_formula_string(
1031
+ x, self.reactions, SBML_DFS.S_NAME
1032
+ )
1033
+ )
1034
+ # add compartment for each reaction
1035
+ rxn_eqtn_within_compartment = pd.Series(
1036
+ [
1037
+ y + ": " + x
1038
+ for x, y in zip(
1039
+ rxn_eqtn_within_compartment,
1040
+ rxn_eqtn_within_compartment.index.get_level_values(
1041
+ SBML_DFS.C_NAME
1042
+ ),
1043
+ )
1044
+ ],
1045
+ index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
1046
+ ).rename("r_formula_str")
1047
+ else:
1048
+ rxn_eqtn_within_compartment = None
1461
1049
 
1462
- if not isinstance(defining_biological_qualifiers, list):
1463
- raise TypeError(
1464
- f"defining_biological_qualifiers was a {type(defining_biological_qualifiers)} but must be a list"
1050
+ formula_strs = pd.concat(
1051
+ [rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment]
1465
1052
  )
1466
1053
 
1467
- # primary annotations of a species
1468
- bqb_is_species = species_ids.query("bqb in @defining_biological_qualifiers")
1469
-
1470
- # add components within modestly sized protein complexes
1471
- # look at HAS_PART IDs
1472
- bqb_has_parts_species = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]
1054
+ return formula_strs
1473
1055
 
1474
- # number of species in a complex
1475
- n_species_components = bqb_has_parts_species.value_counts(
1476
- [IDENTIFIERS.ONTOLOGY, SBML_DFS.S_ID]
1477
- )
1478
- big_complex_sids = set(
1479
- n_species_components[
1480
- n_species_components > max_complex_size
1481
- ].index.get_level_values(SBML_DFS.S_ID)
1482
- )
1056
+ def reaction_summaries(
1057
+ self, r_ids: Optional[Union[str, list[str]]] = None
1058
+ ) -> pd.DataFrame:
1059
+ """
1060
+ Reaction Summary
1483
1061
 
1484
- filtered_bqb_has_parts = _filter_promiscuous_components(
1485
- bqb_has_parts_species, max_promiscuity
1486
- )
1062
+ Return a summary of reactions.
1487
1063
 
1488
- # drop species parts if there are many components
1489
- filtered_bqb_has_parts = filtered_bqb_has_parts[
1490
- ~filtered_bqb_has_parts[SBML_DFS.S_ID].isin(big_complex_sids)
1491
- ]
1064
+ Parameters:
1065
+ ----------
1066
+ r_ids: [str], str or None
1067
+ Reaction IDs or None for all reactions
1492
1068
 
1493
- # combine primary identifiers and rare components
1494
- characteristic_species_ids = pd.concat(
1495
- [
1496
- bqb_is_species,
1497
- filtered_bqb_has_parts,
1498
- ]
1499
- )
1069
+ Returns
1070
+ ----------
1071
+ reaction_summaries_df: pd.DataFrame
1072
+ A table with r_id as an index and columns:
1073
+ - r_name: str, name of the reaction
1074
+ - r_formula_str: str, human-readable formula of the reaction
1075
+ """
1500
1076
 
1501
- return characteristic_species_ids
1077
+ validated_rids = self._validate_r_ids(r_ids)
1502
1078
 
1079
+ participating_r_names = self.reactions.loc[validated_rids, SBML_DFS.R_NAME]
1080
+ participating_r_formulas = self.reaction_formulas(r_ids=validated_rids)
1081
+ reaction_summareis_df = pd.concat(
1082
+ [participating_r_names, participating_r_formulas], axis=1
1083
+ )
1503
1084
 
1504
- def infer_uncompartmentalized_species_location(sbml_dfs: SBML_dfs) -> SBML_dfs:
1505
- """
1506
- Infer Uncompartmentalized Species Location
1085
+ return reaction_summareis_df
1507
1086
 
1508
- If the compartment of a subset of compartmentalized species
1509
- was not specified, infer an appropriate compartment from
1510
- other members of reactions they particpate in
1087
+ def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
1088
+ """
1089
+ Remove compartmentalized species and associated reactions.
1511
1090
 
1512
- Parameters:
1513
- ----------
1514
- sbml_dfs: sbml.SBML_dfs
1515
- A relational pathway model
1091
+ Starting with a set of compartmentalized species, determine which reactions
1092
+ should be removed based on their removal. Then remove these reactions,
1093
+ compartmentalized species, and species.
1516
1094
 
1517
- Returns:
1518
- ----------
1519
- sbml_dfs: sbml.SBML_dfs
1520
- A relational pathway model (with filled in species compartments)
1095
+ Parameters
1096
+ ----------
1097
+ sc_ids : Iterable[str]
1098
+ IDs of compartmentalized species to remove
1099
+ """
1521
1100
 
1522
- """
1101
+ # find reactions which should be totally removed since they are losing critical species
1102
+ removed_reactions = self._find_underspecified_reactions_by_scids(sc_ids)
1103
+ self.remove_reactions(removed_reactions)
1523
1104
 
1524
- default_compartment = (
1525
- sbml_dfs.compartmentalized_species.value_counts(SBML_DFS.C_ID)
1526
- .rename("N")
1527
- .reset_index()
1528
- .sort_values("N", ascending=False)[SBML_DFS.C_ID][0]
1529
- )
1530
- if not isinstance(default_compartment, str):
1531
- raise ValueError(
1532
- "No default compartment could be found - compartment "
1533
- "information may not be present"
1534
- )
1105
+ self._remove_compartmentalized_species(sc_ids)
1535
1106
 
1536
- # infer the compartments of species missing compartments
1107
+ # remove species (and their associated species data) if all their cspecies have been lost
1108
+ self._remove_unused_species()
1537
1109
 
1538
- missing_compartment_scids = sbml_dfs.compartmentalized_species[
1539
- sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
1540
- ].index.tolist()
1541
- if len(missing_compartment_scids) == 0:
1542
- logger.info(
1543
- "All compartmentalized species have compartments, "
1544
- "returning input sbml_dfs"
1545
- )
1546
- return sbml_dfs
1547
-
1548
- participating_reactions = (
1549
- sbml_dfs.reaction_species[
1550
- sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
1551
- ][SBML_DFS.R_ID]
1552
- .unique()
1553
- .tolist()
1554
- )
1555
- reaction_participants = sbml_dfs.reaction_species[
1556
- sbml_dfs.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
1557
- ].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
1558
- reaction_participants = reaction_participants.merge(
1559
- sbml_dfs.compartmentalized_species[SBML_DFS.C_ID],
1560
- left_on=SBML_DFS.SC_ID,
1561
- right_index=True,
1562
- )
1110
+ def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
1111
+ """
1112
+ Remove reactions from the model.
1563
1113
 
1564
- # find a default compartment to fall back on if all compartmental information is missing
1114
+ Parameters
1115
+ ----------
1116
+ r_ids : Iterable[str]
1117
+ IDs of reactions to remove
1118
+ remove_species : bool, optional
1119
+ Whether to remove species that are no longer part of any reactions,
1120
+ by default False
1121
+ """
1122
+ # remove corresponding reactions_species
1123
+ self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
1124
+ # remove reactions
1125
+ self.reactions = self.reactions.drop(index=list(r_ids))
1126
+ # remove reactions_data
1127
+ if hasattr(self, "reactions_data"):
1128
+ for k, data in self.reactions_data.items():
1129
+ self.reactions_data[k] = data.drop(index=list(r_ids))
1130
+ # remove species if requested
1131
+ if remove_species:
1132
+ self._remove_unused_cspecies()
1133
+ self._remove_unused_species()
1565
1134
 
1566
- primary_reaction_compartment = (
1567
- reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
1568
- .rename("N")
1569
- .reset_index()
1570
- .sort_values("N", ascending=False)
1571
- .groupby(SBML_DFS.R_ID)
1572
- .first()[SBML_DFS.C_ID]
1573
- .reset_index()
1574
- )
1135
+ def remove_reactions_data(self, label: str):
1136
+ """
1137
+ Remove reactions data by label.
1138
+ """
1139
+ self._remove_entity_data(SBML_DFS.REACTIONS, label)
1575
1140
 
1576
- inferred_compartmentalization = (
1577
- sbml_dfs.reaction_species[
1578
- sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
1579
- ]
1580
- .merge(primary_reaction_compartment)
1581
- .value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
1582
- .rename("N")
1583
- .reset_index()
1584
- .sort_values("N", ascending=False)
1585
- .groupby(SBML_DFS.SC_ID)
1586
- .first()
1587
- .reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
1588
- )
1589
- logger.info(
1590
- f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
1591
- )
1141
+ def remove_species_data(self, label: str):
1142
+ """
1143
+ Remove species data by label.
1144
+ """
1145
+ self._remove_entity_data(SBML_DFS.SPECIES, label)
1592
1146
 
1593
- # define where a reaction is most likely to occur based on the compartmentalization of its particpants
1594
- species_with_unknown_compartmentalization = set(
1595
- missing_compartment_scids
1596
- ).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
1597
- if len(species_with_unknown_compartmentalization) != 0:
1598
- logger.warning(
1599
- f"{len(species_with_unknown_compartmentalization)} "
1600
- "species compartmentalization could not be inferred"
1601
- " from other reaction particpants. Their compartmentalization "
1602
- f"will be set to the default of {default_compartment}"
1603
- )
1147
+ def search_by_ids(
1148
+ self,
1149
+ ids: list[str],
1150
+ entity_type: str,
1151
+ identifiers_df: pd.DataFrame,
1152
+ ontologies: None | set[str] = None,
1153
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
1154
+ """
1155
+ Find entities and identifiers matching a set of query IDs.
1604
1156
 
1605
- inferred_compartmentalization = pd.concat(
1606
- [
1607
- inferred_compartmentalization,
1608
- pd.DataFrame(
1609
- {SBML_DFS.SC_ID: list(species_with_unknown_compartmentalization)}
1610
- ).assign(c_id=default_compartment),
1611
- ]
1612
- )
1157
+ Parameters
1158
+ ----------
1159
+ ids : List[str]
1160
+ List of identifiers to search for
1161
+ entity_type : str
1162
+ Type of entity to search (e.g., 'species', 'reactions')
1163
+ identifiers_df : pd.DataFrame
1164
+ DataFrame containing identifier mappings
1165
+ ontologies : Optional[Set[str]], optional
1166
+ Set of ontologies to filter by, by default None
1613
1167
 
1614
- if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
1615
- raise ValueError(
1616
- f"{inferred_compartmentalization.shape[0]} were inferred but {len(missing_compartment_scids)} are required"
1617
- )
1168
+ Returns
1169
+ -------
1170
+ Tuple[pd.DataFrame, pd.DataFrame]
1171
+ - Matching entities
1172
+ - Matching identifiers
1618
1173
 
1619
- updated_compartmentalized_species = pd.concat(
1620
- [
1621
- sbml_dfs.compartmentalized_species[
1622
- ~sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
1623
- ],
1624
- sbml_dfs.compartmentalized_species[
1625
- sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
1626
- ]
1627
- .drop(SBML_DFS.C_ID, axis=1)
1628
- .merge(
1629
- inferred_compartmentalization, left_index=True, right_on=SBML_DFS.SC_ID
1630
- )
1631
- .set_index(SBML_DFS.SC_ID),
1632
- ]
1633
- )
1174
+ Raises
1175
+ ------
1176
+ ValueError
1177
+ If entity_type is invalid or ontologies are invalid
1178
+ TypeError
1179
+ If ontologies is not a set
1180
+ """
1181
+ # validate inputs
1182
+ entity_table = self.get_table(entity_type, required_attributes={"id"})
1183
+ entity_pk = self.schema[entity_type]["pk"]
1634
1184
 
1635
- if (
1636
- updated_compartmentalized_species.shape[0]
1637
- != sbml_dfs.compartmentalized_species.shape[0]
1638
- ):
1639
- raise ValueError(
1640
- f"Trying to overwrite {sbml_dfs.compartmentalized_species.shape[0]}"
1641
- " compartmentalized species with "
1642
- f"{updated_compartmentalized_species.shape[0]}"
1643
- )
1185
+ utils.match_pd_vars(
1186
+ identifiers_df,
1187
+ req_vars={
1188
+ entity_pk,
1189
+ IDENTIFIERS.ONTOLOGY,
1190
+ IDENTIFIERS.IDENTIFIER,
1191
+ IDENTIFIERS.URL,
1192
+ IDENTIFIERS.BQB,
1193
+ },
1194
+ allow_series=False,
1195
+ ).assert_present()
1644
1196
 
1645
- if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
1646
- raise ValueError("Some species compartments are still missing")
1197
+ if ontologies is not None:
1198
+ if not isinstance(ontologies, set):
1199
+ # for clarity this should not be reachable based on type hints
1200
+ raise TypeError(
1201
+ f"ontologies must be a set, but got {type(ontologies).__name__}"
1202
+ )
1203
+ ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
1204
+ invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
1205
+ if len(invalid_ontologies) > 0:
1206
+ raise ValueError(
1207
+ f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
1208
+ f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
1209
+ )
1647
1210
 
1648
- sbml_dfs.compartmentalized_species = updated_compartmentalized_species
1211
+ # filter to just the identifiers matching the ontologies of interest
1212
+ identifiers_df = identifiers_df.query("ontology in @ontologies")
1649
1213
 
1650
- return sbml_dfs
1214
+ matching_identifiers = identifiers_df.loc[
1215
+ identifiers_df["identifier"].isin(ids)
1216
+ ]
1217
+ entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
1651
1218
 
1219
+ return entity_subset, matching_identifiers
1652
1220
 
1653
- def infer_sbo_terms(sbml_dfs: SBML_dfs) -> SBML_dfs:
1654
- """
1655
- Infer SBO Terms
1221
+ def search_by_name(
1222
+ self, name: str, entity_type: str, partial_match: bool = True
1223
+ ) -> pd.DataFrame:
1224
+ """
1225
+ Find entities by exact or partial name match.
1656
1226
 
1657
- Define SBO terms based on stoichiometry for reaction_species with missing terms
1227
+ Parameters
1228
+ ----------
1229
+ name : str
1230
+ Name to search for
1231
+ entity_type : str
1232
+ Type of entity to search (e.g., 'species', 'reactions')
1233
+ partial_match : bool, optional
1234
+ Whether to allow partial string matches, by default True
1658
1235
 
1659
- Parameters:
1660
- ----------
1661
- sbml_dfs: sbml.SBML_dfs
1662
- A relational pathway model
1236
+ Returns
1237
+ -------
1238
+ pd.DataFrame
1239
+ Matching entities
1240
+ """
1241
+ entity_table = self.get_table(entity_type, required_attributes={"label"})
1242
+ label_attr = self.schema[entity_type]["label"]
1663
1243
 
1664
- Returns:
1665
- ----------
1666
- sbml_dfs: sbml.SBML_dfs
1667
- A relational pathway model (with missing/invalid reaction species sbo_terms resolved)
1244
+ if partial_match:
1245
+ matches = entity_table.loc[
1246
+ entity_table[label_attr].str.contains(name, case=False)
1247
+ ]
1248
+ else:
1249
+ matches = entity_table.loc[entity_table[label_attr].str.lower() == name]
1250
+ return matches
1668
1251
 
1669
- """
1252
+ def select_species_data(self, species_data_table: str) -> pd.DataFrame:
1253
+ """
1254
+ Select a species data table from the SBML_dfs object.
1670
1255
 
1671
- valid_sbo_terms = sbml_dfs.reaction_species[
1672
- sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
1673
- ]
1256
+ Parameters
1257
+ ----------
1258
+ species_data_table : str
1259
+ Name of the species data table to select
1674
1260
 
1675
- invalid_sbo_terms = sbml_dfs.reaction_species[
1676
- ~sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
1677
- ]
1261
+ Returns
1262
+ -------
1263
+ pd.DataFrame
1264
+ The selected species data table
1678
1265
 
1679
- if not all(sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].notnull()):
1680
- raise ValueError(
1681
- "All sbml_dfs.reaction_species[SBML_DFS.SBO_TERM] must be not null"
1682
- )
1683
- if invalid_sbo_terms.shape[0] == 0:
1684
- logger.info("All sbo_terms were valid; returning input sbml_dfs")
1685
- return sbml_dfs
1266
+ Raises
1267
+ ------
1268
+ ValueError
1269
+ If species_data_table is not found
1270
+ """
1271
+ # Check if species_data_table exists in sbml_dfs.species_data
1272
+ if species_data_table not in self.species_data:
1273
+ raise ValueError(
1274
+ f"species_data_table {species_data_table} not found in sbml_dfs.species_data. "
1275
+ f"Available tables: {self.species_data.keys()}"
1276
+ )
1686
1277
 
1687
- logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")
1278
+ # Get the species data
1279
+ return self.species_data[species_data_table]
1688
1280
 
1689
- # add missing/invalid terms based on stoichiometry
1690
- invalid_sbo_terms.loc[
1691
- invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] < 0, SBML_DFS.SBO_TERM
1692
- ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
1281
+ def species_status(self, s_id: str) -> pd.DataFrame:
1282
+ """
1283
+ Species Status
1693
1284
 
1694
- invalid_sbo_terms.loc[
1695
- invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
1696
- ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
1285
+ Return all of the reactions a species participates in.
1697
1286
 
1698
- invalid_sbo_terms.loc[
1699
- invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] == 0, SBML_DFS.SBO_TERM
1700
- ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]
1287
+ Parameters:
1288
+ s_id: str
1289
+ A species ID
1701
1290
 
1702
- updated_reaction_species = pd.concat(
1703
- [valid_sbo_terms, invalid_sbo_terms]
1704
- ).sort_index()
1291
+ Returns:
1292
+ pd.DataFrame, one row per reaction the species participates in
1293
+ with columns:
1294
+ - sc_name: str, name of the compartmentalized species participating in the reaction
1295
+ - stoichiometry: float, stoichiometry of the species in the reaction
1296
+ - r_name: str, name of the reaction
1297
+ - r_formula_str: str, human-readable formula of the reaction
1298
+ """
1705
1299
 
1706
- if sbml_dfs.reaction_species.shape[0] != updated_reaction_species.shape[0]:
1707
- raise ValueError(
1708
- f"Trying to overwrite {sbml_dfs.reaction_species.shape[0]} reaction_species with {updated_reaction_species.shape[0]}"
1709
- )
1710
- sbml_dfs.reaction_species = updated_reaction_species
1300
+ if s_id not in self.species.index:
1301
+ raise ValueError(f"{s_id} not found in species table")
1711
1302
 
1712
- return sbml_dfs
1303
+ matching_species = self.species.loc[s_id]
1713
1304
 
1305
+ if not isinstance(matching_species, pd.Series):
1306
+ raise ValueError(f"{s_id} did not match a single species")
1714
1307
 
1715
- def name_compartmentalized_species(sbml_dfs):
1716
- """
1717
- Name Compartmentalized Species
1308
+ # find all rxns species participate in
1309
+ matching_compartmentalized_species = self.compartmentalized_species[
1310
+ self.compartmentalized_species.s_id.isin([s_id])
1311
+ ]
1718
1312
 
1719
- Rename compartmentalized species if they have the same
1720
- name as their species
1313
+ rxns_participating = self.reaction_species[
1314
+ self.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
1315
+ ]
1721
1316
 
1722
- Parameters
1723
- ----------
1724
- sbml_dfs : SBML_dfs
1725
- A model formed by aggregating pathways
1317
+ # find all participants in these rxns
1318
+ full_rxns_participating = self.reaction_species[
1319
+ self.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
1320
+ ].merge(
1321
+ self.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
1322
+ )
1726
1323
 
1727
- Returns:
1728
- ----------
1729
- sbml_dfs
1730
- """
1324
+ participating_rids = full_rxns_participating[SBML_DFS.R_ID].unique()
1325
+ reaction_descriptions = self.reaction_summaries(r_ids=participating_rids)
1731
1326
 
1732
- augmented_cspecies = sbml_dfs.compartmentalized_species.merge(
1733
- sbml_dfs.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
1734
- ).merge(
1735
- sbml_dfs.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
1736
- )
1737
- augmented_cspecies[SBML_DFS.SC_NAME] = [
1738
- f"{s} [{c}]" if sc == s else sc
1739
- for sc, c, s in zip(
1740
- augmented_cspecies[SBML_DFS.SC_NAME],
1741
- augmented_cspecies[SBML_DFS.C_NAME],
1742
- augmented_cspecies[SBML_DFS.S_NAME],
1327
+ status = (
1328
+ full_rxns_participating.loc[
1329
+ full_rxns_participating[SBML_DFS.SC_ID].isin(
1330
+ matching_compartmentalized_species.index.values.tolist()
1331
+ ),
1332
+ [SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
1333
+ ]
1334
+ .merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
1335
+ .reset_index(drop=True)
1336
+ .drop(SBML_DFS.R_ID, axis=1)
1743
1337
  )
1744
- ]
1745
1338
 
1746
- sbml_dfs.compartmentalized_species = augmented_cspecies.loc[
1747
- :, sbml_dfs.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
1748
- ]
1749
-
1750
- return sbml_dfs
1339
+ return status
1751
1340
 
1341
+ def validate(self):
1342
+ """
1343
+ Validate the SBML_dfs structure and relationships.
1752
1344
 
1753
- def export_sbml_dfs(
1754
- model_prefix: str,
1755
- sbml_dfs: SBML_dfs,
1756
- outdir: str,
1757
- overwrite: bool = False,
1758
- dogmatic: bool = True,
1759
- ) -> None:
1760
- """
1761
- Export SBML_dfs
1762
-
1763
- Export summaries of species identifiers and each table underlying
1764
- an SBML_dfs pathway model
1765
-
1766
- Params
1767
- ------
1768
- model_prefix: str
1769
- Label to prepend to all exported files
1770
- sbml_dfs: sbml.SBML_dfs
1771
- A pathway model
1772
- outdir: str
1773
- Path to an existing directory where results should be saved
1774
- overwrite: bool
1775
- Should the directory be overwritten if it already exists?
1776
- dogmatic: bool
1777
- If True then treat genes, transcript, and proteins as separate species. If False
1778
- then treat them interchangeably.
1345
+ Checks:
1346
+ - Schema existence
1347
+ - Required tables presence
1348
+ - Individual table structure
1349
+ - Primary key uniqueness
1350
+ - Foreign key relationships
1351
+ - Optional data table validity
1352
+ - Reaction species validity
1779
1353
 
1780
- Returns
1781
- -------
1782
- None
1354
+ Raises
1355
+ ------
1356
+ ValueError
1357
+ If any validation check fails
1358
+ """
1783
1359
 
1784
- """
1360
+ if not hasattr(self, "schema"):
1361
+ raise ValueError("No schema found")
1785
1362
 
1786
- if not isinstance(model_prefix, str):
1787
- raise TypeError(f"model_prefix was a {type(model_prefix)} " "and must be a str")
1788
- if not isinstance(sbml_dfs, SBML_dfs):
1789
- raise TypeError(
1790
- f"sbml_dfs was a {type(sbml_dfs)} and must" " be an sbml.SBML_dfs"
1791
- )
1363
+ required_tables = self._required_entities
1364
+ schema_tables = set(self.schema.keys())
1792
1365
 
1793
- # filter to identifiers which make sense when mapping from ids -> species
1794
- species_identifiers = sbml_dfs_utils.get_characteristic_species_ids(
1795
- sbml_dfs,
1796
- dogmatic=dogmatic,
1797
- )
1366
+ extra_tables = schema_tables.difference(required_tables)
1367
+ if len(extra_tables) != 0:
1368
+ logger.debug(
1369
+ f"{len(extra_tables)} unexpected tables found: "
1370
+ f"{', '.join(extra_tables)}"
1371
+ )
1798
1372
 
1799
- try:
1800
- utils.initialize_dir(outdir, overwrite=overwrite)
1801
- except FileExistsError:
1802
- logger.warning(
1803
- f"Directory {outdir} already exists and overwrite is False. "
1804
- "Files will be added to the existing directory."
1805
- )
1806
- with open_fs(outdir, writeable=True) as fs:
1807
- species_identifiers_path = (
1808
- model_prefix + CPR_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
1809
- )
1810
- with fs.openbin(species_identifiers_path, "w") as f:
1811
- species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
1812
- f, sep="\t", index=False
1373
+ missing_tables = required_tables.difference(schema_tables)
1374
+ if len(missing_tables) != 0:
1375
+ raise ValueError(
1376
+ f"Missing {len(missing_tables)} required tables: "
1377
+ f"{', '.join(missing_tables)}"
1813
1378
  )
1814
1379
 
1815
- # export jsons
1816
- species_path = model_prefix + CPR_STANDARD_OUTPUTS.SPECIES
1817
- reactions_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTIONS
1818
- reation_species_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTION_SPECIES
1819
- compartments_path = model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTS
1820
- compartmentalized_species_path = (
1821
- model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
1822
- )
1823
- with fs.openbin(species_path, "w") as f:
1824
- sbml_dfs.species[[SBML_DFS.S_NAME]].to_json(f)
1380
+ # check individual tables
1381
+ for table in required_tables:
1382
+ self._validate_table(table)
1825
1383
 
1826
- with fs.openbin(reactions_path, "w") as f:
1827
- sbml_dfs.reactions[[SBML_DFS.R_NAME]].to_json(f)
1384
+ # check whether pks and fks agree
1385
+ self._check_pk_fk_correspondence()
1828
1386
 
1829
- with fs.openbin(reation_species_path, "w") as f:
1830
- sbml_dfs.reaction_species.to_json(f)
1387
+ # check optional data tables:
1388
+ for k, v in self.species_data.items():
1389
+ try:
1390
+ self._validate_species_data(v)
1391
+ except ValueError as e:
1392
+ raise ValueError(f"species data {k} was invalid.") from e
1831
1393
 
1832
- with fs.openbin(compartments_path, "w") as f:
1833
- sbml_dfs.compartments[[SBML_DFS.C_NAME]].to_json(f)
1394
+ for k, v in self.reactions_data.items():
1395
+ try:
1396
+ self._validate_reactions_data(v)
1397
+ except ValueError as e:
1398
+ raise ValueError(f"reactions data {k} was invalid.") from e
1834
1399
 
1835
- with fs.openbin(compartmentalized_species_path, "w") as f:
1836
- sbml_dfs.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
1837
- f
1838
- )
1400
+ # validate reaction_species sbo_terms and stoichiometry
1401
+ self._validate_reaction_species()
1839
1402
 
1840
- return None
1403
+ def validate_and_resolve(self):
1404
+ """
1405
+ Validate and attempt to automatically fix common issues.
1841
1406
 
1407
+ This method iteratively:
1408
+ 1. Attempts validation
1409
+ 2. If validation fails, tries to resolve the issue
1410
+ 3. Repeats until validation passes or issue cannot be resolved
1842
1411
 
1843
- def sbml_dfs_from_edgelist(
1844
- interaction_edgelist: pd.DataFrame,
1845
- species_df: pd.DataFrame,
1846
- compartments_df: pd.DataFrame,
1847
- interaction_source: source.Source,
1848
- upstream_stoichiometry: int = 0,
1849
- downstream_stoichiometry: int = 1,
1850
- downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
1851
- keep_species_data: bool | str = False,
1852
- keep_reactions_data: bool | str = False,
1853
- ) -> SBML_dfs:
1854
- """
1855
- Create SBML_dfs from Edgelist
1856
-
1857
- Combine a set of interactions into an sbml.SBML_dfs mechanistic model
1858
-
1859
- Parameters:
1860
- interaction_edgelist (pd.DataFrame): A table containing interactions:
1861
- - upstream_name (str): matching "s_name" from "species_df"
1862
- - downstream_name (str): matching "s_name" from "species_df"
1863
- - upstream_compartment (str): compartment of "upstream_name"
1864
- with names matching "c_name" from "compartments_df"
1865
- - downstream_compartment (str): compartment of "downstream_name"
1866
- with names matching "c_name" from "compartments_df"
1867
- - r_name (str): a name for the interaction
1868
- - sbo_term (str): sbo term defining the type of
1869
- molecular interaction (see MINI_SBO_FROM_NAME)
1870
- - r_Identifiers (identifiers.Identifiers): identifiers
1871
- supporting the interaction (e.g., pubmed ids)
1872
- - r_isreversible (bool): Is this reaction reversible?
1873
- If True, the reaction is reversible
1874
- By default, the interactions of TRRUST networks are irreversible, and reversible for STRING networks
1875
- species_df (pd.DataFrame): A table defining unique molecular
1876
- species participating in "interaction_edgelist":
1877
- - s_name (str): name of molecular species
1878
- - s_Identifiers (identifiers.Identifiers): identifiers
1879
- defining the species
1880
- compartments_df (pd.DataFrame): A table defining compartments
1881
- where interactions are occurring "interaction_edgelist":
1882
- - c_name (str): name of compartment
1883
- - c_Identifiers (identifiers.Identifiers):
1884
- identifiers defining the compartment (see
1885
- bigg.annotate_recon() for a set of names > go categories)
1886
- interaction_source (source.Source): A source object
1887
- which will tie model entities to the interaction source
1888
- upstream_stoichiometry (int): stoichiometry of
1889
- upstream species in reaction
1890
- downstream_stoichiometry (int): stoichiometry of
1891
- downstream species in reaction
1892
- downstream_sbo_name (str): sbo term defining the
1893
- type of molecular interaction for the downstream reactand
1894
- (see MINI_SBO_FROM_NAME)
1895
- keep_species_data (bool | str): Should species data
1896
- be kept in the model? If True, all species data will be kept
1897
- and saved as "species_data" in the SBML_dfs. The label will be 'source'
1898
- If False, no species data will be kept.
1899
- If a string: label for the species data to be kept.
1900
- keep_reactions_data (bool | str): Should reaction data be kept in the model?
1901
- If True, all reaction data will be kept and saved
1902
- as "reactions_data" in the SBML_dfs. The label will be 'source'.
1903
- If False, no reaction data will be kept.
1904
- If a string: label for the reaction data to be kept.
1905
-
1906
- Returns:
1907
- sbml.SBML_dfs
1412
+ Raises
1413
+ ------
1414
+ ValueError
1415
+ If validation fails and cannot be automatically resolved
1416
+ """
1908
1417
 
1909
- """
1418
+ current_exception = None
1419
+ validated = False
1910
1420
 
1911
- # check input dfs for required variables
1912
- _sbml_dfs_from_edgelist_validate_inputs(
1913
- interaction_edgelist, species_df, compartments_df
1914
- )
1421
+ while not validated:
1422
+ try:
1423
+ self.validate()
1424
+ validated = True
1425
+ except Exception as e:
1426
+ e_str = str(e)
1427
+ if e_str == current_exception:
1428
+ logger.warning(
1429
+ "Automated resolution of an Exception was attempted but failed"
1430
+ )
1431
+ raise e
1915
1432
 
1916
- # Identify extra columns in the input data.
1917
- # if keep_reactions_data is True, this will be added
1918
- # as `reaction_data`
1919
- interaction_edgelist_required_vars = {
1920
- "upstream_name",
1921
- "downstream_name",
1922
- "upstream_compartment",
1923
- "downstream_compartment",
1924
- SBML_DFS.R_NAME,
1925
- SBML_DFS.SBO_TERM,
1926
- SBML_DFS.R_IDENTIFIERS,
1927
- SBML_DFS.R_ISREVERSIBLE,
1928
- }
1929
- if keep_reactions_data is not False:
1930
- extra_reactions_columns = [
1931
- c
1932
- for c in interaction_edgelist.columns
1933
- if c not in interaction_edgelist_required_vars
1934
- ]
1935
- else:
1936
- extra_reactions_columns = []
1937
- # Extra species columns
1938
- if keep_species_data is not False:
1939
- extra_species_columns = [
1940
- c
1941
- for c in species_df.columns
1942
- if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
1943
- ]
1944
- else:
1945
- extra_species_columns = []
1433
+ # try to resolve
1434
+ self._attempt_resolve(e)
1946
1435
 
1947
- # format compartments
1948
- compartments_df[SBML_DFS.C_SOURCE] = interaction_source
1949
- compartments_df[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
1950
- range(compartments_df.shape[0]), SBML_DFS.C_ID
1951
- )
1952
- compartments_df = compartments_df.set_index(SBML_DFS.C_ID)[
1953
- [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
1954
- ]
1955
-
1956
- # format species
1957
- species_df[SBML_DFS.S_SOURCE] = interaction_source
1958
- species_df[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
1959
- range(species_df.shape[0]), SBML_DFS.S_ID
1960
- )
1436
+ # =============================================================================
1437
+ # PRIVATE METHODS (ALPHABETICAL ORDER)
1438
+ # =============================================================================
1961
1439
 
1962
- required_cols = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]
1963
- species_df = species_df.set_index(SBML_DFS.S_ID)[
1964
- required_cols + extra_species_columns
1965
- ]
1966
- # Keep extra columns to save them as extra data
1967
- species_data = species_df[extra_species_columns]
1968
- # Remove extra columns
1969
- species_df = species_df[required_cols]
1970
-
1971
- # create compartmentalized species
1972
-
1973
- # define all distinct upstream and downstream compartmentalized species
1974
- comp_species = pd.concat(
1975
- [
1976
- interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
1977
- {
1978
- "upstream_name": SBML_DFS.S_NAME,
1979
- "upstream_compartment": SBML_DFS.C_NAME,
1980
- },
1981
- axis=1,
1982
- ),
1983
- interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
1984
- {
1985
- "downstream_name": SBML_DFS.S_NAME,
1986
- "downstream_compartment": SBML_DFS.C_NAME,
1987
- },
1988
- axis=1,
1989
- ),
1990
- ]
1991
- ).drop_duplicates()
1992
-
1993
- # merge to add species and compartments primary keys
1994
- comp_species_w_ids = comp_species.merge(
1995
- species_df[SBML_DFS.S_NAME].reset_index(),
1996
- how="left",
1997
- left_on=SBML_DFS.S_NAME,
1998
- right_on=SBML_DFS.S_NAME,
1999
- ).merge(
2000
- compartments_df[SBML_DFS.C_NAME].reset_index(),
2001
- how="left",
2002
- left_on=SBML_DFS.C_NAME,
2003
- right_on=SBML_DFS.C_NAME,
2004
- )
1440
+ def _attempt_resolve(self, e):
1441
+ str_e = str(e)
1442
+ if str_e == "compartmentalized_species included missing c_id values":
1443
+ logger.warning(str_e)
1444
+ logger.warning(
1445
+ "Attempting to resolve with infer_uncompartmentalized_species_location()"
1446
+ )
1447
+ self.infer_uncompartmentalized_species_location()
1448
+ elif re.search("sbo_terms were not defined", str_e):
1449
+ logger.warning(str_e)
1450
+ logger.warning("Attempting to resolve with infer_sbo_terms()")
1451
+ self.infer_sbo_terms()
1452
+ else:
1453
+ logger.warning(
1454
+ "An error occurred which could not be automatically resolved"
1455
+ )
1456
+ raise e
2005
1457
 
2006
- # check whether all species and compartments exist
2007
- _sbml_dfs_from_edgelist_check_cspecies_merge(comp_species_w_ids, comp_species)
1458
+ def _check_pk_fk_correspondence(self):
1459
+ """
1460
+ Check whether primary keys and foreign keys agree for all tables in the schema.
1461
+ Raises ValueError if any correspondence fails.
1462
+ """
2008
1463
 
2009
- # name compounds
2010
- comp_species_w_ids[SBML_DFS.SC_NAME] = [
2011
- f"{s} [{c}]"
2012
- for s, c in zip(
2013
- comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
1464
+ pk_df = pd.DataFrame(
1465
+ [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
2014
1466
  )
2015
- ]
2016
- # add source object
2017
- comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
2018
- # name index
2019
- comp_species_w_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
2020
- range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
2021
- )
2022
- comp_species_w_ids = comp_species_w_ids.set_index(SBML_DFS.SC_ID)[
2023
- [SBML_DFS.SC_NAME, SBML_DFS.S_ID, SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
2024
- ]
2025
-
2026
- # create reactions
2027
-
2028
- # create a from cs_species -> to cs_species edgelist
2029
- # interaction_edgelist
2030
- comp_species_w_names = (
2031
- comp_species_w_ids.reset_index()
2032
- .merge(species_df[SBML_DFS.S_NAME].reset_index())
2033
- .merge(compartments_df[SBML_DFS.C_NAME].reset_index())
2034
- )
2035
1467
 
2036
- interaction_edgelist_w_cspecies = interaction_edgelist.merge(
2037
- comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
2038
- {
2039
- SBML_DFS.SC_ID: "sc_id_up",
2040
- SBML_DFS.S_NAME: "upstream_name",
2041
- SBML_DFS.C_NAME: "upstream_compartment",
2042
- },
2043
- axis=1,
2044
- ),
2045
- how="left",
2046
- ).merge(
2047
- comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
2048
- {
2049
- SBML_DFS.SC_ID: "sc_id_down",
2050
- SBML_DFS.S_NAME: "downstream_name",
2051
- SBML_DFS.C_NAME: "downstream_compartment",
2052
- },
2053
- axis=1,
2054
- ),
2055
- how="left",
2056
- )[
2057
- REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
2058
- ]
2059
-
2060
- # some extra checks
2061
- if interaction_edgelist.shape[0] != interaction_edgelist_w_cspecies.shape[0]:
2062
- raise ValueError(
2063
- "Merging compartmentalized species to interaction_edgelist"
2064
- " resulted in an increase in the tables from "
2065
- f"{interaction_edgelist.shape[0]} to "
2066
- f"{interaction_edgelist_w_cspecies.shape[0]} indicating"
2067
- " a 1-many join which should have been 1-1"
1468
+ fk_df = (
1469
+ pd.DataFrame(
1470
+ [
1471
+ {"fk_table": k, "fk": v["fk"]}
1472
+ for k, v in self.schema.items()
1473
+ if "fk" in v.keys()
1474
+ ]
1475
+ )
1476
+ .set_index("fk_table")["fk"]
1477
+ .apply(pd.Series)
1478
+ .reset_index()
1479
+ .melt(id_vars="fk_table")
1480
+ .drop(["variable"], axis=1)
1481
+ .rename(columns={"value": "key"})
2068
1482
  )
2069
1483
 
2070
- # create one reaction per interaction
2071
- interaction_edgelist_w_cspecies[SBML_DFS.R_SOURCE] = interaction_source
2072
- interaction_edgelist_w_cspecies[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
2073
- range(interaction_edgelist_w_cspecies.shape[0]), SBML_DFS.R_ID
2074
- )
1484
+ pk_fk_correspondences = pk_df.merge(fk_df)
2075
1485
 
2076
- reactions_df_columns = [
2077
- SBML_DFS.R_NAME,
2078
- SBML_DFS.R_IDENTIFIERS,
2079
- SBML_DFS.R_SOURCE,
2080
- SBML_DFS.R_ISREVERSIBLE,
2081
- ]
2082
- reactions_df = interaction_edgelist_w_cspecies.copy().set_index(SBML_DFS.R_ID)[
2083
- reactions_df_columns + extra_reactions_columns
2084
- ]
2085
- # Keep extra columns to save them as extra data
2086
- reactions_data = reactions_df[extra_reactions_columns]
2087
- reactions_df = reactions_df[reactions_df_columns]
2088
-
2089
- # define upstream and downstream comp species as reaction species
2090
- reaction_species_df = pd.concat(
2091
- [
2092
- # upstream interactions are defined by sbo_term and should generally
2093
- # be modifiers/stimulator/inhibitor/interactor
2094
- interaction_edgelist_w_cspecies[["sc_id_up", "sbo_term", "r_id"]]
2095
- .assign(stoichiometry=upstream_stoichiometry)
2096
- .rename({"sc_id_up": "sc_id"}, axis=1),
2097
- # downstream interactions indicate some modification of the state
2098
- # of the species and hence are defined as product
2099
- interaction_edgelist_w_cspecies[["sc_id_down", "r_id"]]
2100
- .assign(
2101
- stoichiometry=downstream_stoichiometry,
2102
- sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
1486
+ for i in range(0, pk_fk_correspondences.shape[0]):
1487
+ pk_table_keys = set(
1488
+ getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
2103
1489
  )
2104
- .rename({"sc_id_down": "sc_id"}, axis=1),
2105
- ]
2106
- )
2107
- reaction_species_df["rsc_id"] = sbml_dfs_utils.id_formatter(
2108
- range(reaction_species_df.shape[0]), "rsc_id"
2109
- )
2110
- reaction_species_df = reaction_species_df.set_index("rsc_id")
2111
-
2112
- # form sbml_dfs object
2113
- sbml_tbl_dict: MutableMapping[str, pd.DataFrame | dict[str, pd.DataFrame]] = {
2114
- "compartments": compartments_df,
2115
- "species": species_df,
2116
- "compartmentalized_species": comp_species_w_ids,
2117
- "reactions": reactions_df,
2118
- "reaction_species": reaction_species_df,
2119
- }
2120
- if len(extra_reactions_columns) > 0:
2121
- if isinstance(keep_reactions_data, str):
2122
- reactions_data_label = keep_reactions_data
2123
- else:
2124
- reactions_data_label = "source"
2125
- sbml_tbl_dict["reactions_data"] = {reactions_data_label: reactions_data}
1490
+ if None in pk_table_keys:
1491
+ raise ValueError(
1492
+ f"{pk_fk_correspondences['pk_table'][i]} had "
1493
+ "missing values in its index"
1494
+ )
2126
1495
 
2127
- if len(extra_species_columns) > 0:
2128
- if isinstance(keep_species_data, str):
2129
- species_data_label = keep_species_data
2130
- else:
2131
- species_data_label = "source"
2132
- sbml_tbl_dict["species_data"] = {species_data_label: species_data}
1496
+ fk_table_keys = set(
1497
+ getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
1498
+ :, pk_fk_correspondences["key"][i]
1499
+ ]
1500
+ )
1501
+ if None in fk_table_keys:
1502
+ raise ValueError(
1503
+ f"{pk_fk_correspondences['fk_table'][i]} included "
1504
+ f"missing {pk_fk_correspondences['key'][i]} values"
1505
+ )
1506
+
1507
+ # all foreign keys need to match a primary key
1508
+ extra_fks = fk_table_keys.difference(pk_table_keys)
1509
+ if len(extra_fks) != 0:
1510
+ raise ValueError(
1511
+ f"{len(extra_fks)} distinct "
1512
+ f"{pk_fk_correspondences['key'][i]} values were"
1513
+ f" found in {pk_fk_correspondences['fk_table'][i]} "
1514
+ f"but missing from {pk_fk_correspondences['pk_table'][i]}."
1515
+ " All foreign keys must have a matching primary key.\n\n"
1516
+ f"Extra key are: {', '.join(extra_fks)}"
1517
+ )
2133
1518
 
2134
- sbml_model = SBML_dfs(sbml_tbl_dict)
2135
- sbml_model.validate()
1519
+ def _find_underspecified_reactions_by_scids(
1520
+ self, sc_ids: Iterable[str]
1521
+ ) -> set[str]:
1522
+ """
1523
+ Find Underspecified reactions
2136
1524
 
2137
- return sbml_model
1525
+ Identify reactions which should be removed if a set of molecular species are removed
1526
+ from the system.
2138
1527
 
1528
+ Parameters
1529
+ ----------
1530
+ sc_ids : Iterable[str]
1531
+ A list of compartmentalized species ids (sc_ids) which will be removed.
2139
1532
 
2140
- def _sbml_dfs_from_edgelist_validate_inputs(
2141
- interaction_edgelist: pd.DataFrame,
2142
- species_df: pd.DataFrame,
2143
- compartments_df: pd.DataFrame,
2144
- ) -> None:
2145
- """Check that the inputs for creating an SBML_dfs from an edgelist are appropriate."""
2146
-
2147
- # check compartments
2148
- compartments_df_expected_vars = {SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS}
2149
- compartments_df_columns = set(compartments_df.columns.tolist())
2150
- missing_required_fields = compartments_df_expected_vars.difference(
2151
- compartments_df_columns
2152
- )
2153
- if len(missing_required_fields) > 0:
2154
- raise ValueError(
2155
- f"{', '.join(missing_required_fields)} are required variables"
2156
- ' in "compartments_df" but were not present in the input file.'
1533
+ Returns
1534
+ -------
1535
+ underspecified_reactions : set[str]
1536
+ A set of reactions which should be removed because they will not occur once
1537
+ "sc_ids" are removed.
1538
+ """
1539
+ updated_reaction_species = self.reaction_species.copy()
1540
+ updated_reaction_species["new"] = ~updated_reaction_species[
1541
+ SBML_DFS.SC_ID
1542
+ ].isin(sc_ids)
1543
+ updated_reaction_species = sbml_dfs_utils.add_sbo_role(updated_reaction_species)
1544
+ underspecified_reactions = sbml_dfs_utils.find_underspecified_reactions(
1545
+ updated_reaction_species
2157
1546
  )
1547
+ return underspecified_reactions
2158
1548
 
2159
- # check species
2160
- species_df_expected_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
2161
- species_df_columns = set(species_df.columns.tolist())
2162
- missing_required_fields = species_df_expected_vars.difference(species_df_columns)
2163
- if len(missing_required_fields) > 0:
2164
- raise ValueError(
2165
- f"{', '.join(missing_required_fields)} are required"
2166
- ' variables in "species_df" but were not present '
2167
- "in the input file."
1549
+ def _get_unused_cspecies(self) -> set[str]:
1550
+ """Returns a set of compartmentalized species
1551
+ that are not part of any reactions"""
1552
+ sc_ids = set(self.compartmentalized_species.index) - set(
1553
+ self.reaction_species[SBML_DFS.SC_ID]
2168
1554
  )
1555
+ return sc_ids # type: ignore
2169
1556
 
2170
- # check interactions
2171
- interaction_edgelist_columns = set(interaction_edgelist.columns.tolist())
2172
- missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
2173
- interaction_edgelist_columns
2174
- )
2175
- if len(missing_required_fields) > 0:
2176
- raise ValueError(
2177
- f"{', '.join(missing_required_fields)} are required "
2178
- 'variables in "interaction_edgelist" but were not '
2179
- "present in the input file."
1557
+ def _get_unused_species(self) -> set[str]:
1558
+ """Returns a set of species that are not part of any compartmentalized species"""
1559
+ s_ids = set(self.species.index) - set(
1560
+ self.compartmentalized_species[SBML_DFS.S_ID]
2180
1561
  )
1562
+ return s_ids # type: ignore
2181
1563
 
2182
- return None
2183
-
1564
+ def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
1565
+ """Removes compartmentalized species from the model
2184
1566
 
2185
- def _sbml_dfs_from_edgelist_check_cspecies_merge(
2186
- merged_species: pd.DataFrame, original_species: pd.DataFrame
2187
- ) -> None:
2188
- """Check for a mismatch between the provided species data and species implied by the edgelist."""
1567
+ This should not be directly used by the user, as it can lead to
1568
+ invalid reactions when removing species without a logic to decide
1569
+ if the reaction needs to be removed as well.
2189
1570
 
2190
- # check for 1-many merge
2191
- if merged_species.shape[0] != original_species.shape[0]:
2192
- raise ValueError(
2193
- "Merging compartmentalized species to species_df"
2194
- " and compartments_df by names resulted in an "
2195
- f"increase in the tables from {original_species.shape[0]}"
2196
- f" to {merged_species.shape[0]} indicating that names were"
2197
- " not unique"
1571
+ Args:
1572
+ sc_ids (Iterable[str]): the compartmentalized species to remove
1573
+ """
1574
+ # Remove compartmentalized species
1575
+ self.compartmentalized_species = self.compartmentalized_species.drop(
1576
+ index=list(sc_ids)
2198
1577
  )
1578
+ # remove corresponding reaction_species
1579
+ self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")
2199
1580
 
2200
- # check for missing species and compartments
2201
- missing_compartments = merged_species[merged_species[SBML_DFS.C_ID].isna()][
2202
- SBML_DFS.C_NAME
2203
- ].unique()
2204
- if len(missing_compartments) >= 1:
2205
- raise ValueError(
2206
- f"{len(missing_compartments)} compartments were present in"
2207
- ' "interaction_edgelist" but not "compartments_df":'
2208
- f" {', '.join(missing_compartments)}"
2209
- )
1581
+ def _remove_entity_data(self, entity_type: str, label: str) -> None:
1582
+ """
1583
+ Remove data from species_data or reactions_data by table name and label.
2210
1584
 
2211
- missing_species = merged_species[merged_species[SBML_DFS.S_ID].isna()][
2212
- SBML_DFS.S_NAME
2213
- ].unique()
2214
- if len(missing_species) >= 1:
2215
- raise ValueError(
2216
- f"{len(missing_species)} species were present in "
2217
- '"interaction_edgelist" but not "species_df":'
2218
- f" {', '.join(missing_species)}"
2219
- )
1585
+ Parameters
1586
+ ----------
1587
+ entity_type : str
1588
+ Name of the table to remove data from ('species' or 'reactions')
1589
+ label : str
1590
+ Label of the data to remove
2220
1591
 
2221
- return None
1592
+ Notes
1593
+ -----
1594
+ If the label does not exist, a warning will be logged that includes the existing labels.
1595
+ """
1596
+ if entity_type not in ENTITIES_W_DATA:
1597
+ raise ValueError("table_name must be either 'species' or 'reactions'")
2222
1598
 
1599
+ data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
1600
+ if label not in data_dict:
1601
+ existing_labels = list(data_dict.keys())
1602
+ logger.warning(
1603
+ f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
1604
+ f"Existing labels: {existing_labels}"
1605
+ )
1606
+ return
2223
1607
 
2224
- def _stub_compartments(
2225
- stubbed_compartment: str = GENERIC_COMPARTMENT,
2226
- ) -> pd.DataFrame:
2227
- """Stub Compartments
1608
+ del data_dict[label]
2228
1609
 
2229
- Create a compartments table with only a single compartment
1610
+ def _remove_species(self, s_ids: Iterable[str]):
1611
+ """Removes species from the model
2230
1612
 
2231
- Args:
2232
- stubbed_compartment (str): the name of a compartment which should match the
2233
- keys in constants.COMPARTMENTS and constants.COMPARTMENTS_GO_TERMS
1613
+ This should not be directly used by the user, as it can lead to
1614
+ invalid reactions when removing species without a logic to decide
1615
+ if the reaction needs to be removed as well.
2234
1616
 
2235
- Returns:
2236
- compartments_df (pd.DataFrame): compartments dataframe
2237
- """
1617
+ This removes the species and corresponding compartmentalized species and
1618
+ reaction_species.
2238
1619
 
2239
- if stubbed_compartment not in COMPARTMENT_ALIASES.keys():
2240
- raise ValueError(
2241
- f"{stubbed_compartment} is not defined in constants.COMPARTMENTS"
2242
- )
1620
+ Args:
1621
+ s_ids (Iterable[str]): the species to remove
1622
+ """
1623
+ sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
1624
+ self._remove_compartmentalized_species(sc_ids)
1625
+ # Remove species
1626
+ self.species = self.species.drop(index=list(s_ids))
1627
+ # remove data
1628
+ for k, data in self.species_data.items():
1629
+ self.species_data[k] = data.drop(index=list(s_ids))
2243
1630
 
2244
- if stubbed_compartment not in COMPARTMENTS_GO_TERMS.keys():
2245
- raise ValueError(
2246
- f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
2247
- )
1631
+ def _remove_unused_cspecies(self):
1632
+ """Removes compartmentalized species that are no
1633
+ longer part of any reactions"""
1634
+ sc_ids = self._get_unused_cspecies()
1635
+ self._remove_compartmentalized_species(sc_ids)
2248
1636
 
2249
- stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]
1637
+ def _remove_unused_species(self):
1638
+ """Removes species that are no longer part of any
1639
+ compartmentalized species"""
1640
+ s_ids = self._get_unused_species()
1641
+ self._remove_species(s_ids)
2250
1642
 
2251
- formatted_uri = identifiers.format_uri(
2252
- uri=identifiers.create_uri_url(
2253
- ontology=ONTOLOGIES.GO,
2254
- identifier=stubbed_compartment_id,
2255
- ),
2256
- biological_qualifier_type=BQB.IS,
2257
- )
1643
+ def _validate_r_ids(self, r_ids: Optional[Union[str, list[str]]]) -> list[str]:
2258
1644
 
2259
- compartments_df = pd.DataFrame(
2260
- {
2261
- SBML_DFS.C_NAME: [stubbed_compartment],
2262
- SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
2263
- }
2264
- )
2265
- compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID) # type: ignore
2266
- compartments_df.index.name = SBML_DFS.C_ID
1645
+ if isinstance(r_ids, str):
1646
+ r_ids = [r_ids]
2267
1647
 
2268
- return compartments_df
1648
+ if r_ids is None:
1649
+ return self.reactions.index.tolist()
1650
+ else:
1651
+ if not all(r_id in self.reactions.index for r_id in r_ids):
1652
+ raise ValueError(f"Reaction IDs {r_ids} not found in reactions table")
2269
1653
 
1654
+ return r_ids
2270
1655
 
2271
- def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
2272
- """Validates a table against a reference
1656
+ def _validate_reaction_species(self):
1657
+ if not all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull()):
1658
+ raise ValueError(
1659
+ "All reaction_species[SBML_DFS.STOICHIOMETRY] must be not null"
1660
+ )
2273
1661
 
2274
- This check if the table has the same index, no duplicates in the index
2275
- and that all values in the index are in the reference table.
1662
+ # test for null SBO terms
1663
+ n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
1664
+ if n_null_sbo_terms != 0:
1665
+ raise ValueError(
1666
+ f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
1667
+ )
2276
1668
 
2277
- Args:
2278
- data_table (pd.DataFrame): a table with data that should
2279
- match the reference
2280
- ref_table (pd.DataFrame): a reference table
1669
+ # find invalid SBO terms
1670
+ sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
1671
+ invalid_sbo_term_counts = sbo_counts[
1672
+ ~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
1673
+ ]
2281
1674
 
2282
- Raises:
2283
- ValueError: not same index name
2284
- ValueError: index contains duplicates
2285
- ValueError: index not subset of index of reactions table
2286
- """
2287
- ref_index_name = ref_table.index.name
2288
- if data_table.index.name != ref_index_name:
2289
- raise ValueError(
2290
- "the index name for reaction data table was not"
2291
- f" {ref_index_name}: {data_table.index.name}"
2292
- )
2293
- ids = data_table.index
2294
- if any(ids.duplicated()):
2295
- raise ValueError(
2296
- "the index for reaction data table " "contained duplicate values"
2297
- )
2298
- if not all(ids.isin(ref_table.index)):
2299
- raise ValueError(
2300
- "the index for reaction data table contained values"
2301
- " not found in the reactions table"
2302
- )
2303
- if not isinstance(data_table, pd.DataFrame):
2304
- raise TypeError(
2305
- f"The data table was type {type(data_table).__name__}"
2306
- " but must be a pd.DataFrame"
2307
- )
1675
+ if invalid_sbo_term_counts.shape[0] != 0:
1676
+ invalid_sbo_counts_str = ", ".join(
1677
+ [f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
1678
+ )
1679
+ raise ValueError(
1680
+ f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
1681
+ f"defined {invalid_sbo_counts_str}"
1682
+ )
2308
1683
 
1684
+ def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
1685
+ """Validates reactions data attribute
2309
1686
 
2310
- def species_type_types(x):
2311
- """Assign a high-level molecule type to a molecular species"""
1687
+ Args:
1688
+ reactions_data_table (pd.DataFrame): a reactions data table
2312
1689
 
2313
- if isinstance(x, identifiers.Identifiers):
2314
- if x.filter(["chebi"]):
2315
- return "metabolite"
2316
- elif x.filter(["molodex"]):
2317
- return "drug"
2318
- else:
2319
- return "protein"
2320
- else:
2321
- return "unknown"
2322
-
2323
-
2324
- def stub_ids(ids):
2325
- if len(ids) == 0:
2326
- return pd.DataFrame(
2327
- {
2328
- IDENTIFIERS.ONTOLOGY: [None],
2329
- IDENTIFIERS.IDENTIFIER: [None],
2330
- IDENTIFIERS.URL: [None],
2331
- IDENTIFIERS.BQB: [None],
2332
- }
2333
- )
2334
- else:
2335
- return pd.DataFrame(ids)
1690
+ Raises:
1691
+ ValueError: r_id not index name
1692
+ ValueError: r_id index contains duplicates
1693
+ ValueError: r_id not in reactions table
1694
+ """
1695
+ sbml_dfs_utils._validate_matching_data(reactions_data_table, self.reactions)
2336
1696
 
1697
+ def _validate_species_data(self, species_data_table: pd.DataFrame):
1698
+ """Validates species data attribute
2337
1699
 
2338
- def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
2339
- """
2340
- Add an sbo_role column to the reaction_species table.
1700
+ Args:
1701
+ species_data_table (pd.DataFrame): a species data table
2341
1702
 
2342
- The sbo_role column is a string column that contains the SBO role of the reaction species.
2343
- The values in the sbo_role column are taken from the sbo_term column.
1703
+ Raises:
1704
+ ValueError: s_id not index name
1705
+ ValueError: s_id index contains duplicates
1706
+ ValueError: s_id not in species table
1707
+ """
1708
+ sbml_dfs_utils._validate_matching_data(species_data_table, self.species)
2344
1709
 
2345
- The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
2346
- """
1710
+ def _validate_table(self, table_name: str) -> None:
1711
+ """
1712
+ Validate a table in this SBML_dfs object against its schema.
2347
1713
 
2348
- validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
1714
+ This is an internal method that validates a table that is part of this SBML_dfs
1715
+ object against the schema stored in self.schema.
2349
1716
 
2350
- reaction_species = (
2351
- reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
2352
- .replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
2353
- .replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
2354
- )
1717
+ Parameters
1718
+ ----------
1719
+ table_name : str
1720
+ Name of the table to validate
2355
1721
 
2356
- undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
2357
- SBO_NAME_TO_ROLE.values()
2358
- )
2359
- if len(undefined_roles) > 0:
2360
- logger.warning(
2361
- f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
2362
- )
2363
- mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
2364
- reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
1722
+ Raises
1723
+ ------
1724
+ ValueError
1725
+ If the table does not conform to its schema
1726
+ """
1727
+ table_data = getattr(self, table_name)
2365
1728
 
2366
- return reaction_species
1729
+ sbml_dfs_utils.validate_sbml_dfs_table(table_data, table_name)
2367
1730
 
2368
1731
 
2369
- def find_underspecified_reactions(
2370
- reaction_species_w_roles: pd.DataFrame,
2371
- ) -> pd.DataFrame:
1732
+ def sbml_dfs_from_edgelist(
1733
+ interaction_edgelist: pd.DataFrame,
1734
+ species_df: pd.DataFrame,
1735
+ compartments_df: pd.DataFrame,
1736
+ interaction_source: source.Source,
1737
+ upstream_stoichiometry: int = 0,
1738
+ downstream_stoichiometry: int = 1,
1739
+ downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
1740
+ keep_species_data: bool | str = False,
1741
+ keep_reactions_data: bool | str = False,
1742
+ ) -> SBML_dfs:
1743
+ """
1744
+ Create SBML_dfs from interaction edgelist.
2372
1745
 
2373
- # check that both sbo_role and "new" are present
2374
- if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
2375
- raise ValueError(
2376
- "The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
2377
- )
2378
- if "new" not in reaction_species_w_roles.columns:
2379
- raise ValueError(
2380
- "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2381
- )
2382
- # check that new is a boolean column
2383
- if reaction_species_w_roles["new"].dtype != bool:
2384
- raise ValueError(
2385
- "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2386
- )
1746
+ Combines a set of molecular interactions into a mechanistic SBML_dfs model
1747
+ by processing interaction data, species information, and compartment definitions.
2387
1748
 
2388
- reactions_with_lost_defining_members = set(
2389
- reaction_species_w_roles.query("~new")
2390
- .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
2391
- .tolist()
1749
+ Parameters
1750
+ ----------
1751
+ interaction_edgelist : pd.DataFrame
1752
+ Table containing molecular interactions with columns:
1753
+ - upstream_name : str, matches "s_name" from species_df
1754
+ - downstream_name : str, matches "s_name" from species_df
1755
+ - upstream_compartment : str, matches "c_name" from compartments_df
1756
+ - downstream_compartment : str, matches "c_name" from compartments_df
1757
+ - r_name : str, name for the interaction
1758
+ - sbo_term : str, SBO term defining interaction type
1759
+ - r_Identifiers : identifiers.Identifiers, supporting identifiers
1760
+ - r_isreversible : bool, whether reaction is reversible
1761
+ species_df : pd.DataFrame
1762
+ Table defining molecular species with columns:
1763
+ - s_name : str, name of molecular species
1764
+ - s_Identifiers : identifiers.Identifiers, species identifiers
1765
+ compartments_df : pd.DataFrame
1766
+ Table defining compartments with columns:
1767
+ - c_name : str, name of compartment
1768
+ - c_Identifiers : identifiers.Identifiers, compartment identifiers
1769
+ interaction_source : source.Source
1770
+ Source object linking model entities to interaction source
1771
+ upstream_stoichiometry : int, default 0
1772
+ Stoichiometry of upstream species in reactions
1773
+ downstream_stoichiometry : int, default 1
1774
+ Stoichiometry of downstream species in reactions
1775
+ downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
1776
+ SBO term for downstream reactant type
1777
+ keep_species_data : bool or str, default False
1778
+ Whether to preserve extra species columns. If True, saves as 'source' label.
1779
+ If string, uses as custom label. If False, discards extra data.
1780
+ keep_reactions_data : bool or str, default False
1781
+ Whether to preserve extra reaction columns. If True, saves as 'source' label.
1782
+ If string, uses as custom label. If False, discards extra data.
1783
+
1784
+ Returns
1785
+ -------
1786
+ SBML_dfs
1787
+ Validated SBML data structure containing compartments, species,
1788
+ compartmentalized species, reactions, and reaction species tables.
1789
+ """
1790
+ # 1. Validate inputs
1791
+ sbml_dfs_utils._edgelist_validate_inputs(
1792
+ interaction_edgelist, species_df, compartments_df
2392
1793
  )
2393
1794
 
2394
- N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
2395
- if N_reactions_with_lost_defining_members > 0:
2396
- logger.info(
2397
- f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
2398
- )
2399
-
2400
- # find the cases where all "new" values for a given (r_id, sbo_term) are False
2401
- reactions_with_lost_requirements = set(
2402
- reaction_species_w_roles
2403
- # drop already filtered reactions
2404
- .query("r_id not in @reactions_with_lost_defining_members")
2405
- .query("sbo_role == 'REQUIRED'")
2406
- # which entries which have some required attribute have all False values for that attribute
2407
- .groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
2408
- .agg({"new": "any"})
2409
- .query("new == False")
2410
- .index.get_level_values(SBML_DFS.R_ID)
1795
+ # 2. Identify which extra columns to preserve
1796
+ extra_columns = sbml_dfs_utils._edgelist_identify_extra_columns(
1797
+ interaction_edgelist, species_df, keep_reactions_data, keep_species_data
2411
1798
  )
2412
1799
 
2413
- N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
2414
- if N_reactions_with_lost_requirements > 0:
2415
- logger.info(
2416
- f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
2417
- )
2418
-
2419
- underspecified_reactions = reactions_with_lost_defining_members.union(
2420
- reactions_with_lost_requirements
1800
+ # 3. Process compartments and species tables
1801
+ processed_compartments = sbml_dfs_utils._edgelist_process_compartments(
1802
+ compartments_df, interaction_source
1803
+ )
1804
+ processed_species, species_data = sbml_dfs_utils._edgelist_process_species(
1805
+ species_df, interaction_source, extra_columns["species"]
2421
1806
  )
2422
1807
 
2423
- return underspecified_reactions
2424
-
2425
-
2426
- def _find_underspecified_reactions_by_scids(
2427
- sbml_dfs: SBML_dfs, sc_ids: Iterable[str]
2428
- ) -> set[str]:
2429
- """
2430
- Find Underspecified reactions
2431
-
2432
- Identify reactions which should be removed if a set of molecular species are removed
2433
- from the system.
2434
-
2435
- Params:
2436
- sbml_dfs (SBML_dfs):
2437
- A pathway representation
2438
- sc_ids (list[str])
2439
- A list of compartmentalized species ids (sc_ids) which will be removed.
2440
-
2441
- Returns:
2442
- underspecified_reactions (set[str]):
2443
- A set of reactions which should be removed because they will not occur once
2444
- \"sc_ids\" are removed.
2445
-
2446
- """
1808
+ # 4. Create compartmentalized species
1809
+ comp_species = sbml_dfs_utils._edgelist_create_compartmentalized_species(
1810
+ interaction_edgelist,
1811
+ processed_species,
1812
+ processed_compartments,
1813
+ interaction_source,
1814
+ )
2447
1815
 
2448
- updated_reaction_species = sbml_dfs.reaction_species.copy()
2449
- updated_reaction_species["new"] = ~updated_reaction_species[SBML_DFS.SC_ID].isin(
2450
- sc_ids
1816
+ # 5. Create reactions and reaction species
1817
+ reactions, reaction_species, reactions_data = (
1818
+ sbml_dfs_utils._edgelist_create_reactions_and_species(
1819
+ interaction_edgelist,
1820
+ comp_species,
1821
+ processed_species,
1822
+ processed_compartments,
1823
+ interaction_source,
1824
+ upstream_stoichiometry,
1825
+ downstream_stoichiometry,
1826
+ downstream_sbo_name,
1827
+ extra_columns["reactions"],
1828
+ )
2451
1829
  )
2452
1830
 
2453
- updated_reaction_species = add_sbo_role(updated_reaction_species)
2454
- underspecified_reactions = find_underspecified_reactions(updated_reaction_species)
1831
+ # 6. Assemble final SBML_dfs object
1832
+ sbml_dfs = _edgelist_assemble_sbml_model(
1833
+ processed_compartments,
1834
+ processed_species,
1835
+ comp_species,
1836
+ reactions,
1837
+ reaction_species,
1838
+ species_data,
1839
+ reactions_data,
1840
+ keep_species_data,
1841
+ keep_reactions_data,
1842
+ extra_columns,
1843
+ )
2455
1844
 
2456
- return underspecified_reactions
1845
+ return sbml_dfs
2457
1846
 
2458
1847
 
2459
- def validate_sbml_dfs_table(table_data: pd.DataFrame, table_name: str) -> None:
1848
+ def _edgelist_assemble_sbml_model(
1849
+ compartments: pd.DataFrame,
1850
+ species: pd.DataFrame,
1851
+ comp_species: pd.DataFrame,
1852
+ reactions: pd.DataFrame,
1853
+ reaction_species: pd.DataFrame,
1854
+ species_data,
1855
+ reactions_data,
1856
+ keep_species_data,
1857
+ keep_reactions_data,
1858
+ extra_columns: dict[str, list[str]],
1859
+ ) -> SBML_dfs:
2460
1860
  """
2461
- Validate a standalone table against the SBML_dfs schema.
2462
-
2463
- This function validates a table against the schema defined in SBML_DFS_SCHEMA,
2464
- without requiring an SBML_dfs object. Useful for validating tables before
2465
- creating an SBML_dfs object.
1861
+ Assemble the final SBML_dfs object.
2466
1862
 
2467
1863
  Parameters
2468
1864
  ----------
2469
- table_data : pd.DataFrame
2470
- The table to validate
2471
- table_name : str
2472
- Name of the table in the SBML_dfs schema
2473
-
2474
- Raises
2475
- ------
2476
- ValueError
2477
- If table_name is not in schema or validation fails
2478
- """
2479
- if table_name not in SBML_DFS_SCHEMA.SCHEMA:
2480
- raise ValueError(
2481
- f"{table_name} is not a valid table name in SBML_DFS_SCHEMA. "
2482
- f"Valid tables are: {', '.join(SBML_DFS_SCHEMA.SCHEMA.keys())}"
2483
- )
2484
-
2485
- table_schema = SBML_DFS_SCHEMA.SCHEMA[table_name]
2486
- _perform_sbml_dfs_table_validation(table_data, table_schema, table_name)
2487
-
2488
-
2489
- def _perform_sbml_dfs_table_validation(
2490
- table_data: pd.DataFrame,
2491
- table_schema: dict,
2492
- table_name: str,
2493
- ) -> None:
2494
- """
2495
- Core validation logic for SBML_dfs tables.
2496
-
2497
- This function performs the actual validation checks for any table against its schema,
2498
- regardless of whether it's part of an SBML_dfs object or standalone.
2499
-
2500
- Parameters
2501
- ----------
2502
- table_data : pd.DataFrame
2503
- The table data to validate
2504
- table_schema : dict
2505
- Schema definition for the table
2506
- table_name : str
2507
- Name of the table (for error messages)
2508
-
2509
- Raises
2510
- ------
2511
- ValueError
2512
- If the table does not conform to its schema:
2513
- - Not a DataFrame
2514
- - Wrong index name
2515
- - Duplicate primary keys
2516
- - Missing required variables
2517
- - Empty table
1865
+ compartments : pd.DataFrame
1866
+ Processed compartments data
1867
+ species : pd.DataFrame
1868
+ Processed species data
1869
+ comp_species : pd.DataFrame
1870
+ Compartmentalized species data
1871
+ reactions : pd.DataFrame
1872
+ Reactions data
1873
+ reaction_species : pd.DataFrame
1874
+ Reaction species relationships
1875
+ species_data : pd.DataFrame
1876
+ Extra species data to include
1877
+ reactions_data : pd.DataFrame
1878
+ Extra reactions data to include
1879
+ keep_species_data : bool or str
1880
+ Label for species extra data; a string is used as the label, otherwise "source" is used
1881
+ keep_reactions_data : bool or str
1882
+ Label for reactions extra data; a string is used as the label, otherwise "source" is used
1883
+ extra_columns : dict
1884
+ Dictionary containing lists of extra column names
1885
+
1886
+ Returns
1887
+ -------
1888
+ SBML_dfs
1889
+ Validated SBML data structure
2518
1890
  """
2519
- if not isinstance(table_data, pd.DataFrame):
2520
- raise ValueError(
2521
- f"{table_name} must be a pd.DataFrame, but was a {type(table_data)}"
2522
- )
2523
-
2524
- # check index
2525
- expected_index_name = table_schema["pk"]
2526
- if table_data.index.name != expected_index_name:
2527
- raise ValueError(
2528
- f"the index name for {table_name} was not the pk: {expected_index_name}"
2529
- )
2530
-
2531
- # check that all entries in the index are unique
2532
- if len(set(table_data.index.tolist())) != table_data.shape[0]:
2533
- duplicated_pks = table_data.index.value_counts()
2534
- duplicated_pks = duplicated_pks[duplicated_pks > 1]
2535
-
2536
- example_duplicates = duplicated_pks.index[0 : min(duplicated_pks.shape[0], 5)]
2537
- raise ValueError(
2538
- f"{duplicated_pks.shape[0]} primary keys were duplicated "
2539
- f"including {', '.join(example_duplicates)}"
2540
- )
2541
-
2542
- # check variables
2543
- expected_vars = set(table_schema["vars"])
2544
- table_vars = set(list(table_data.columns))
1891
+ sbml_tbl_dict = {
1892
+ "compartments": compartments,
1893
+ "species": species,
1894
+ "compartmentalized_species": comp_species,
1895
+ "reactions": reactions,
1896
+ "reaction_species": reaction_species,
1897
+ }
2545
1898
 
2546
- extra_vars = table_vars.difference(expected_vars)
2547
- if len(extra_vars) != 0:
2548
- logger.debug(
2549
- f"{len(extra_vars)} extra variables were found for {table_name}: "
2550
- f"{', '.join(extra_vars)}"
1899
+ # Add extra data if requested
1900
+ if len(extra_columns["reactions"]) > 0:
1901
+ data_label = (
1902
+ keep_reactions_data if isinstance(keep_reactions_data, str) else "source"
2551
1903
  )
1904
+ sbml_tbl_dict["reactions_data"] = {data_label: reactions_data}
2552
1905
 
2553
- missing_vars = expected_vars.difference(table_vars)
2554
- if len(missing_vars) != 0:
2555
- raise ValueError(
2556
- f"Missing {len(missing_vars)} required variables for {table_name}: "
2557
- f"{', '.join(missing_vars)}"
1906
+ if len(extra_columns["species"]) > 0:
1907
+ data_label = (
1908
+ keep_species_data if isinstance(keep_species_data, str) else "source"
2558
1909
  )
1910
+ sbml_tbl_dict["species_data"] = {data_label: species_data}
2559
1911
 
2560
- # check for empty table
2561
- if table_data.shape[0] == 0:
2562
- raise ValueError(f"{table_name} contained no entries")
2563
-
2564
-
2565
- def _filter_promiscuous_components(
2566
- bqb_has_parts_species: pd.DataFrame, max_promiscuity: int
2567
- ) -> pd.DataFrame:
2568
-
2569
- # number of complexes a species is part of
2570
- n_complexes_involvedin = bqb_has_parts_species.value_counts(
2571
- [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
2572
- )
2573
- promiscuous_component_identifiers_index = n_complexes_involvedin[
2574
- n_complexes_involvedin > max_promiscuity
2575
- ].index
2576
- promiscuous_component_identifiers = pd.Series(
2577
- data=[True] * len(promiscuous_component_identifiers_index),
2578
- index=promiscuous_component_identifiers_index,
2579
- name="is_shared_component",
2580
- dtype=bool,
2581
- )
2582
-
2583
- if len(promiscuous_component_identifiers) == 0:
2584
- return bqb_has_parts_species
2585
-
2586
- filtered_bqb_has_parts = bqb_has_parts_species.merge(
2587
- promiscuous_component_identifiers,
2588
- left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER],
2589
- right_index=True,
2590
- how="left",
2591
- )
2592
-
2593
- filtered_bqb_has_parts["is_shared_component"] = (
2594
- filtered_bqb_has_parts["is_shared_component"].astype("boolean").fillna(False)
2595
- )
2596
- # drop identifiers shared as components across many species
2597
- filtered_bqb_has_parts = filtered_bqb_has_parts[
2598
- ~filtered_bqb_has_parts["is_shared_component"]
2599
- ].drop(["is_shared_component"], axis=1)
1912
+ sbml_model = SBML_dfs(sbml_tbl_dict)
1913
+ sbml_model.validate()
2600
1914
 
2601
- return filtered_bqb_has_parts
1915
+ return sbml_model