napistu 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
napistu/sbml_dfs_core.py CHANGED
@@ -7,8 +7,12 @@ from typing import Iterable
7
7
  from typing import Mapping
8
8
  from typing import MutableMapping
9
9
  from typing import TYPE_CHECKING
10
+ from typing import Optional
11
+ from typing import Union
10
12
 
13
+ from fs import open_fs
11
14
  import pandas as pd
15
+
12
16
  from napistu import identifiers
13
17
  from napistu import sbml_dfs_utils
14
18
  from napistu import source
@@ -17,25 +21,14 @@ from napistu.ingestion import sbml
17
21
  from napistu.constants import SBML_DFS
18
22
  from napistu.constants import SBML_DFS_SCHEMA
19
23
  from napistu.constants import IDENTIFIERS
20
- from napistu.constants import REQUIRED_REACTION_FROMEDGELIST_COLUMNS
21
- from napistu.constants import CPR_STANDARD_OUTPUTS
22
- from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
24
+ from napistu.constants import NAPISTU_STANDARD_OUTPUTS
23
25
  from napistu.constants import BQB_PRIORITIES
24
26
  from napistu.constants import ONTOLOGY_PRIORITIES
25
- from napistu.constants import BQB
26
- from napistu.constants import BQB_DEFINING_ATTRS
27
27
  from napistu.constants import MINI_SBO_FROM_NAME
28
28
  from napistu.constants import MINI_SBO_TO_NAME
29
- from napistu.constants import ONTOLOGIES
30
- from napistu.constants import SBO_NAME_TO_ROLE
31
29
  from napistu.constants import SBOTERM_NAMES
32
- from napistu.constants import SBO_ROLES_DEFS
33
30
  from napistu.constants import ENTITIES_W_DATA
34
31
  from napistu.constants import ENTITIES_TO_ENTITY_DATA
35
- from napistu.ingestion.constants import GENERIC_COMPARTMENT
36
- from napistu.ingestion.constants import COMPARTMENT_ALIASES
37
- from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
38
- from fs import open_fs
39
32
 
40
33
  logger = logging.getLogger(__name__)
41
34
 
@@ -65,26 +58,76 @@ class SBML_dfs:
65
58
  schema : dict
66
59
  Dictionary representing the structure of the other attributes and meaning of their variables
67
60
 
68
- Methods
69
- -------
70
- get_table(entity_type, required_attributes)
71
- Get a table from the SBML_dfs object with optional attribute validation
72
- search_by_ids(ids, entity_type, identifiers_df, ontologies)
73
- Find entities and identifiers matching a set of query IDs
74
- search_by_name(name, entity_type, partial_match)
75
- Find entities by exact or partial name match
61
+ Public Methods (alphabetical)
62
+ ----------------------------
63
+ add_reactions_data(label, data)
64
+ Add a new reactions data table to the model with validation.
65
+ add_species_data(label, data)
66
+ Add a new species data table to the model with validation.
67
+ export_sbml_dfs(model_prefix, outdir, overwrite=False, dogmatic=True)
68
+ Export the SBML_dfs model and its tables to files in a specified directory.
69
+ get_characteristic_species_ids(dogmatic=True)
70
+ Return characteristic systematic identifiers for molecular species, optionally using a strict or loose definition.
76
71
  get_cspecies_features()
77
- Get additional attributes of compartmentalized species
78
- get_species_features()
79
- Get additional attributes of species
72
+ Compute and return additional features for compartmentalized species, such as degree and type.
80
73
  get_identifiers(id_type)
81
- Get identifiers from a specified entity type
82
- get_uri_urls(entity_type, entity_ids)
83
- Get reference URLs for specified entities
74
+ Retrieve a table of identifiers for a specified entity type (e.g., species or reactions).
75
+ get_network_summary()
76
+ Return a dictionary of diagnostic statistics summarizing the network structure.
77
+ get_species_features()
78
+ Compute and return additional features for species, such as species type.
79
+ get_table(entity_type, required_attributes=None)
80
+ Retrieve a table for a given entity type, optionally validating required attributes.
81
+ get_uri_urls(entity_type, entity_ids=None, required_ontology=None)
82
+ Return reference URLs for specified entities, optionally filtered by ontology.
83
+ infer_sbo_terms()
84
+ Infer and fill in missing SBO terms for reaction species based on stoichiometry.
85
+ infer_uncompartmentalized_species_location()
86
+ Infer and assign compartments for compartmentalized species with missing compartment information.
87
+ name_compartmentalized_species()
88
+ Rename compartmentalized species to include compartment information if needed.
89
+ reaction_formulas(r_ids=None)
90
+ Generate human-readable reaction formulas for specified reactions.
91
+ reaction_summaries(r_ids=None)
92
+ Return a summary DataFrame for specified reactions, including names and formulas.
93
+ remove_compartmentalized_species(sc_ids)
94
+ Remove specified compartmentalized species and associated reactions from the model.
95
+ remove_reactions(r_ids, remove_species=False)
96
+ Remove specified reactions and optionally remove unused species.
97
+ remove_reactions_data(label)
98
+ Remove a reactions data table by label.
99
+ remove_species_data(label)
100
+ Remove a species data table by label.
101
+ search_by_ids(ids, entity_type, identifiers_df, ontologies=None)
102
+ Find entities and identifiers matching a set of query IDs.
103
+ search_by_name(name, entity_type, partial_match=True)
104
+ Find entities by exact or partial name match.
105
+ select_species_data(species_data_table)
106
+ Select a species data table from the SBML_dfs object by name.
107
+ species_status(s_id)
108
+ Return all reactions a species participates in, with stoichiometry and formula information.
84
109
  validate()
85
- Validate the SBML_dfs structure and relationships
110
+ Validate the SBML_dfs structure and relationships.
86
111
  validate_and_resolve()
87
- Validate and attempt to automatically fix common issues
112
+ Validate and attempt to automatically fix common issues.
113
+
114
+ Private/Hidden Methods (alphabetical, appear after public methods)
115
+ -----------------------------------------------------------------
116
+ _attempt_resolve(e)
117
+ _check_pk_fk_correspondence()
118
+ _find_underspecified_reactions_by_scids(sc_ids)
119
+ _get_unused_cspecies()
120
+ _get_unused_species()
121
+ _remove_compartmentalized_species(sc_ids)
122
+ _remove_entity_data(entity_type, label)
123
+ _remove_species(s_ids)
124
+ _remove_unused_cspecies()
125
+ _remove_unused_species()
126
+ _validate_r_ids(r_ids)
127
+ _validate_reaction_species()
128
+ _validate_reactions_data(reactions_data_table)
129
+ _validate_species_data(species_data_table)
130
+ _validate_table(table_name)
88
131
  """
89
132
 
90
133
  compartments: pd.DataFrame
@@ -162,193 +205,176 @@ class SBML_dfs:
162
205
  '"validate" = False so "resolve" will be ignored (eventhough it was True)'
163
206
  )
164
207
 
165
- def get_table(
166
- self, entity_type: str, required_attributes: None | set[str] = None
167
- ) -> pd.DataFrame:
208
+ # =============================================================================
209
+ # PUBLIC METHODS (ALPHABETICAL ORDER)
210
+ # =============================================================================
211
+
212
+ def add_reactions_data(self, label: str, data: pd.DataFrame):
168
213
  """
169
- Get a table from the SBML_dfs object with optional attribute validation.
214
+ Add additional reaction data with validation.
170
215
 
171
216
  Parameters
172
217
  ----------
173
- entity_type : str
174
- The type of entity table to retrieve (e.g., 'species', 'reactions')
175
- required_attributes : Optional[Set[str]], optional
176
- Set of attributes that must be present in the table, by default None.
177
- Must be passed as a set, e.g. {'id'}, not a string.
178
-
179
- Returns
180
- -------
181
- pd.DataFrame
182
- The requested table
218
+ label : str
219
+ Label for the new data
220
+ data : pd.DataFrame
221
+ Data to add, must be indexed by reaction_id
183
222
 
184
223
  Raises
185
224
  ------
186
225
  ValueError
187
- If entity_type is invalid or required attributes are missing
188
- TypeError
189
- If required_attributes is not a set
226
+ If the data is invalid or label already exists
190
227
  """
191
-
192
- schema = self.schema
193
-
194
- if entity_type not in schema.keys():
228
+ self._validate_reactions_data(data)
229
+ if label in self.reactions_data:
195
230
  raise ValueError(
196
- f"{entity_type} does not match a table in the SBML_dfs object. The tables "
197
- f"which are present are {', '.join(schema.keys())}"
198
- )
199
-
200
- if required_attributes is not None:
201
- if not isinstance(required_attributes, set):
202
- raise TypeError(
203
- f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
204
- "Did you pass a string instead of a set?"
205
- )
206
-
207
- # determine whether required_attributes are appropriate
208
- VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
209
- invalid_required_attributes = required_attributes.difference(
210
- VALID_REQUIRED_ATTRIBUTES
231
+ f"{label} already exists in reactions_data. " "Drop it first."
211
232
  )
233
+ self.reactions_data[label] = data
212
234
 
213
- if len(invalid_required_attributes) > 0:
214
- raise ValueError(
215
- f"The following required attributes are not valid: {', '.join(invalid_required_attributes)}. "
216
- f"Requiered attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
217
- )
235
+ def add_species_data(self, label: str, data: pd.DataFrame):
236
+ """
237
+ Add additional species data with validation.
218
238
 
219
- # determine if required_attributes are satisified
220
- invalid_attrs = [
221
- s for s in required_attributes if s not in schema[entity_type].keys()
222
- ]
223
- if len(invalid_attrs) > 0:
224
- raise ValueError(
225
- f"The following required attributes are not present for the {entity_type} table: "
226
- f"{', '.join(invalid_attrs)}."
227
- )
239
+ Parameters
240
+ ----------
241
+ label : str
242
+ Label for the new data
243
+ data : pd.DataFrame
244
+ Data to add, must be indexed by species_id
228
245
 
229
- return getattr(self, entity_type)
246
+ Raises
247
+ ------
248
+ ValueError
249
+ If the data is invalid or label already exists
250
+ """
251
+ self._validate_species_data(data)
252
+ if label in self.species_data:
253
+ raise ValueError(
254
+ f"{label} already exists in species_data. " "Drop it first."
255
+ )
256
+ self.species_data[label] = data
230
257
 
231
- def search_by_ids(
258
+ def export_sbml_dfs(
232
259
  self,
233
- ids: list[str],
234
- entity_type: str,
235
- identifiers_df: pd.DataFrame,
236
- ontologies: None | set[str] = None,
237
- ) -> tuple[pd.DataFrame, pd.DataFrame]:
260
+ model_prefix: str,
261
+ outdir: str,
262
+ overwrite: bool = False,
263
+ dogmatic: bool = True,
264
+ ) -> None:
238
265
  """
239
- Find entities and identifiers matching a set of query IDs.
266
+ Export SBML_dfs
240
267
 
241
- Parameters
242
- ----------
243
- ids : List[str]
244
- List of identifiers to search for
245
- entity_type : str
246
- Type of entity to search (e.g., 'species', 'reactions')
247
- identifiers_df : pd.DataFrame
248
- DataFrame containing identifier mappings
249
- ontologies : Optional[Set[str]], optional
250
- Set of ontologies to filter by, by default None
268
+ Export summaries of species identifiers and each table underlying
269
+ an SBML_dfs pathway model
270
+
271
+ Params
272
+ ------
273
+ model_prefix: str
274
+ Label to prepend to all exported files
275
+ outdir: str
276
+ Path to an existing directory where results should be saved
277
+ overwrite: bool
278
+ Should the directory be overwritten if it already exists?
279
+ dogmatic: bool
280
+ If True then treat genes, transcript, and proteins as separate species. If False
281
+ then treat them interchangeably.
251
282
 
252
283
  Returns
253
284
  -------
254
- Tuple[pd.DataFrame, pd.DataFrame]
255
- - Matching entities
256
- - Matching identifiers
257
-
258
- Raises
259
- ------
260
- ValueError
261
- If entity_type is invalid or ontologies are invalid
262
- TypeError
263
- If ontologies is not a set
285
+ None
264
286
  """
265
- # validate inputs
266
- entity_table = self.get_table(entity_type, required_attributes={"id"})
267
- entity_pk = self.schema[entity_type]["pk"]
287
+ if not isinstance(model_prefix, str):
288
+ raise TypeError(
289
+ f"model_prefix was a {type(model_prefix)} " "and must be a str"
290
+ )
291
+ if not isinstance(self, SBML_dfs):
292
+ raise TypeError(
293
+ f"sbml_dfs was a {type(self)} and must" " be an sbml.SBML_dfs"
294
+ )
268
295
 
269
- utils.match_pd_vars(
270
- identifiers_df,
271
- req_vars={
272
- entity_pk,
273
- IDENTIFIERS.ONTOLOGY,
274
- IDENTIFIERS.IDENTIFIER,
275
- IDENTIFIERS.URL,
276
- IDENTIFIERS.BQB,
277
- },
278
- allow_series=False,
279
- ).assert_present()
296
+ # filter to identifiers which make sense when mapping from ids -> species
297
+ species_identifiers = self.get_characteristic_species_ids(dogmatic=dogmatic)
280
298
 
281
- if ontologies is not None:
282
- if not isinstance(ontologies, set):
283
- # for clarity this should not be reachable based on type hints
284
- raise TypeError(
285
- f"ontologies must be a set, but got {type(ontologies).__name__}"
286
- )
287
- ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
288
- invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
289
- if len(invalid_ontologies) > 0:
290
- raise ValueError(
291
- f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
292
- f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
299
+ try:
300
+ utils.initialize_dir(outdir, overwrite=overwrite)
301
+ except FileExistsError:
302
+ logger.warning(
303
+ f"Directory {outdir} already exists and overwrite is False. "
304
+ "Files will be added to the existing directory."
305
+ )
306
+ with open_fs(outdir, writeable=True) as fs:
307
+ species_identifiers_path = (
308
+ model_prefix + NAPISTU_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
309
+ )
310
+ with fs.openbin(species_identifiers_path, "w") as f:
311
+ species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
312
+ f, sep="\t", index=False
293
313
  )
294
314
 
295
- # fitler to just to identifiers matchign the ontologies of interest
296
- identifiers_df = identifiers_df.query("ontology in @ontologies")
315
+ # export jsons
316
+ species_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.SPECIES
317
+ reactions_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.REACTIONS
318
+ reation_species_path = (
319
+ model_prefix + NAPISTU_STANDARD_OUTPUTS.REACTION_SPECIES
320
+ )
321
+ compartments_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.COMPARTMENTS
322
+ compartmentalized_species_path = (
323
+ model_prefix + NAPISTU_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
324
+ )
325
+ with fs.openbin(species_path, "w") as f:
326
+ self.species[[SBML_DFS.S_NAME]].to_json(f)
297
327
 
298
- matching_identifiers = identifiers_df.loc[
299
- identifiers_df["identifier"].isin(ids)
300
- ]
301
- entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
328
+ with fs.openbin(reactions_path, "w") as f:
329
+ self.reactions[[SBML_DFS.R_NAME]].to_json(f)
302
330
 
303
- return entity_subset, matching_identifiers
331
+ with fs.openbin(reation_species_path, "w") as f:
332
+ self.reaction_species.to_json(f)
304
333
 
305
- def search_by_name(
306
- self, name: str, entity_type: str, partial_match: bool = True
307
- ) -> pd.DataFrame:
334
+ with fs.openbin(compartments_path, "w") as f:
335
+ self.compartments[[SBML_DFS.C_NAME]].to_json(f)
336
+
337
+ with fs.openbin(compartmentalized_species_path, "w") as f:
338
+ self.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
339
+ f
340
+ )
341
+
342
+ return None
343
+
344
+ def get_characteristic_species_ids(self, dogmatic: bool = True) -> pd.DataFrame:
308
345
  """
309
- Find entities by exact or partial name match.
346
+ Get Characteristic Species IDs
347
+
348
+ List the systematic identifiers which are characteristic of molecular species, e.g., excluding subcomponents, and optionally, treating proteins, transcripts, and genes equiavlently.
310
349
 
311
350
  Parameters
312
351
  ----------
313
- name : str
314
- Name to search for
315
- entity_type : str
316
- Type of entity to search (e.g., 'species', 'reactions')
317
- partial_match : bool, optional
318
- Whether to allow partial string matches, by default True
352
+ sbml_dfs : sbml_dfs_core.SBML_dfs
353
+ The SBML_dfs object.
354
+ dogmatic : bool, default=True
355
+ Whether to use the dogmatic flag to determine which BQB attributes are valid.
319
356
 
320
357
  Returns
321
358
  -------
322
359
  pd.DataFrame
323
- Matching entities
360
+ A DataFrame containing the systematic identifiers which are characteristic of molecular species.
324
361
  """
325
- entity_table = self.get_table(entity_type, required_attributes={"label"})
326
- label_attr = self.schema[entity_type]["label"]
327
362
 
328
- if partial_match:
329
- matches = entity_table.loc[
330
- entity_table[label_attr].str.contains(name, case=False)
331
- ]
332
- else:
333
- matches = entity_table.loc[entity_table[label_attr].str.lower() == name]
334
- return matches
363
+ # select valid BQB attributes based on dogmatic flag
364
+ defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(
365
+ dogmatic
366
+ )
335
367
 
336
- def get_species_features(self) -> pd.DataFrame:
337
- """
338
- Get additional attributes of species.
368
+ # pre-summarize ontologies
369
+ species_identifiers = self.get_identifiers(SBML_DFS.SPECIES)
339
370
 
340
- Returns
341
- -------
342
- pd.DataFrame
343
- Species with additional features including:
344
- - species_type: Classification of the species (e.g., metabolite, protein)
345
- """
346
- species = self.species
347
- augmented_species = species.assign(
348
- **{"species_type": lambda d: d["s_Identifiers"].apply(species_type_types)}
371
+ # drop some BQB_HAS_PART annotations
372
+ species_identifiers = sbml_dfs_utils.filter_to_characteristic_species_ids(
373
+ species_identifiers,
374
+ defining_biological_qualifiers=defining_biological_qualifiers,
349
375
  )
350
376
 
351
- return augmented_species
377
+ return species_identifiers
352
378
 
353
379
  def get_cspecies_features(self) -> pd.DataFrame:
354
380
  """
@@ -445,113 +471,28 @@ class SBML_dfs:
445
471
 
446
472
  return named_identifiers
447
473
 
448
- def get_uri_urls(
449
- self,
450
- entity_type: str,
451
- entity_ids: Iterable[str] | None = None,
452
- required_ontology: str | None = None,
453
- ) -> pd.Series:
474
+ def get_network_summary(self) -> Mapping[str, Any]:
454
475
  """
455
- Get reference URLs for specified entities.
456
-
457
- Parameters
458
- ----------
459
- entity_type : str
460
- Type of entity to get URLs for (e.g., 'species', 'reactions')
461
- entity_ids : Optional[Iterable[str]], optional
462
- Specific entities to get URLs for, by default None (all entities)
463
- required_ontology : Optional[str], optional
464
- Specific ontology to get URLs from, by default None
476
+ Get diagnostic statistics about the network.
465
477
 
466
478
  Returns
467
479
  -------
468
- pd.Series
469
- Series mapping entity IDs to their reference URLs
470
-
471
- Raises
472
- ------
473
- ValueError
474
- If entity_type is invalid
475
- """
476
- schema = self.schema
477
-
478
- # valid entities and their identifier variables
479
- valid_entity_types = [
480
- SBML_DFS.COMPARTMENTS,
481
- SBML_DFS.SPECIES,
482
- SBML_DFS.REACTIONS,
483
- ]
484
-
485
- if entity_type not in valid_entity_types:
486
- raise ValueError(
487
- f"{entity_type} is an invalid entity_type; valid types "
488
- f"are {', '.join(valid_entity_types)}"
489
- )
490
-
491
- entity_table = getattr(self, entity_type)
492
-
493
- if entity_ids is not None:
494
- # ensure that entity_ids are unique and then convert back to list
495
- # to support pandas indexing
496
- entity_ids = list(set(entity_ids))
497
-
498
- # filter to a subset of identifiers if one is provided
499
- entity_table = entity_table.loc[entity_ids]
500
-
501
- # create a dataframe of all identifiers for the select entities
502
- all_ids = pd.concat(
503
- [
504
- sbml_dfs_utils._stub_ids(
505
- entity_table[schema[entity_type]["id"]].iloc[i].ids
506
- ).assign(id=entity_table.index[i])
507
- for i in range(0, entity_table.shape[0])
508
- ]
509
- ).rename(columns={"id": schema[entity_type]["pk"]})
510
-
511
- # set priorities for ontologies and bqb terms
512
-
513
- if required_ontology is None:
514
- all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
515
- ONTOLOGY_PRIORITIES, how="left"
516
- )
517
- else:
518
- ontology_priorities = pd.DataFrame(
519
- [{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
520
- )
521
- # if only a single ontology is sought then just return matching entries
522
- all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
523
- ontology_priorities, how="inner"
524
- )
525
-
526
- uri_urls = (
527
- all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
528
- .groupby(schema[entity_type]["pk"])
529
- .first()[IDENTIFIERS.URL]
530
- )
531
- return uri_urls
532
-
533
- def get_network_summary(self) -> Mapping[str, Any]:
534
- """
535
- Get diagnostic statistics about the network.
536
-
537
- Returns
538
- -------
539
- Mapping[str, Any]
540
- Dictionary of diagnostic statistics including:
541
- - n_species_types: Number of species types
542
- - dict_n_species_per_type: Number of species per type
543
- - n_species: Number of species
544
- - n_cspecies: Number of compartmentalized species
545
- - n_reaction_species: Number of reaction species
546
- - n_reactions: Number of reactions
547
- - n_compartments: Number of compartments
548
- - dict_n_species_per_compartment: Number of species per compartment
549
- - stats_species_per_reaction: Statistics on reactands per reaction
550
- - top10_species_per_reaction: Top 10 reactions by number of reactands
551
- - stats_degree: Statistics on species connectivity
552
- - top10_degree: Top 10 species by connectivity
553
- - stats_identifiers_per_species: Statistics on identifiers per species
554
- - top10_identifiers_per_species: Top 10 species by number of identifiers
480
+ Mapping[str, Any]
481
+ Dictionary of diagnostic statistics including:
482
+ - n_species_types: Number of species types
483
+ - dict_n_species_per_type: Number of species per type
484
+ - n_species: Number of species
485
+ - n_cspecies: Number of compartmentalized species
486
+ - n_reaction_species: Number of reaction species
487
+ - n_reactions: Number of reactions
488
+ - n_compartments: Number of compartments
489
+ - dict_n_species_per_compartment: Number of species per compartment
490
+ - stats_species_per_reaction: Statistics on reactands per reaction
491
+ - top10_species_per_reaction: Top 10 reactions by number of reactands
492
+ - stats_degree: Statistics on species connectivity
493
+ - top10_degree: Top 10 species by connectivity
494
+ - stats_identifiers_per_species: Statistics on identifiers per species
495
+ - top10_identifiers_per_species: Top 10 species by number of identifiers
555
496
  """
556
497
  stats: MutableMapping[str, Any] = {}
557
498
  species_features = self.get_species_features()
@@ -616,2009 +557,1306 @@ class SBML_dfs:
616
557
 
617
558
  return stats
618
559
 
619
- def add_species_data(self, label: str, data: pd.DataFrame):
560
+ def get_species_features(self) -> pd.DataFrame:
620
561
  """
621
- Add additional species data with validation.
622
-
623
- Parameters
624
- ----------
625
- label : str
626
- Label for the new data
627
- data : pd.DataFrame
628
- Data to add, must be indexed by species_id
562
+ Get additional attributes of species.
629
563
 
630
- Raises
631
- ------
632
- ValueError
633
- If the data is invalid or label already exists
564
+ Returns
565
+ -------
566
+ pd.DataFrame
567
+ Species with additional features including:
568
+ - species_type: Classification of the species (e.g., metabolite, protein)
634
569
  """
635
- self._validate_species_data(data)
636
- if label in self.species_data:
637
- raise ValueError(
638
- f"{label} already exists in species_data. " "Drop it first."
639
- )
640
- self.species_data[label] = data
570
+ species = self.species
571
+ augmented_species = species.assign(
572
+ **{
573
+ "species_type": lambda d: d["s_Identifiers"].apply(
574
+ sbml_dfs_utils.species_type_types
575
+ )
576
+ }
577
+ )
641
578
 
642
- def remove_species_data(self, label: str):
643
- """
644
- Remove species data by label.
645
- """
646
- self._remove_entity_data(SBML_DFS.SPECIES, label)
579
+ return augmented_species
647
580
 
648
- def add_reactions_data(self, label: str, data: pd.DataFrame):
581
+ def get_table(
582
+ self, entity_type: str, required_attributes: None | set[str] = None
583
+ ) -> pd.DataFrame:
649
584
  """
650
- Add additional reaction data with validation.
585
+ Get a table from the SBML_dfs object with optional attribute validation.
651
586
 
652
587
  Parameters
653
588
  ----------
654
- label : str
655
- Label for the new data
656
- data : pd.DataFrame
657
- Data to add, must be indexed by reaction_id
589
+ entity_type : str
590
+ The type of entity table to retrieve (e.g., 'species', 'reactions')
591
+ required_attributes : Optional[Set[str]], optional
592
+ Set of attributes that must be present in the table, by default None.
593
+ Must be passed as a set, e.g. {'id'}, not a string.
594
+
595
+ Returns
596
+ -------
597
+ pd.DataFrame
598
+ The requested table
658
599
 
659
600
  Raises
660
601
  ------
661
602
  ValueError
662
- If the data is invalid or label already exists
603
+ If entity_type is invalid or required attributes are missing
604
+ TypeError
605
+ If required_attributes is not a set
663
606
  """
664
- self._validate_reactions_data(data)
665
- if label in self.reactions_data:
666
- raise ValueError(
667
- f"{label} already exists in reactions_data. Drop it first."
668
- )
669
- self.reactions_data[label] = data
670
607
 
671
- def remove_reactions_data(self, label: str):
672
- """
673
- Remove reactions data by label.
674
- """
675
- self._remove_entity_data(SBML_DFS.REACTIONS, label)
608
+ schema = self.schema
676
609
 
677
- def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
678
- """
679
- Remove compartmentalized species and associated reactions.
610
+ if entity_type not in schema.keys():
611
+ raise ValueError(
612
+ f"{entity_type} does not match a table in the SBML_dfs object. The tables "
613
+ f"which are present are {', '.join(schema.keys())}"
614
+ )
680
615
 
681
- Starting with a set of compartmentalized species, determine which reactions
682
- should be removed based on their removal. Then remove these reactions,
683
- compartmentalized species, and species.
616
+ if required_attributes is not None:
617
+ if not isinstance(required_attributes, set):
618
+ raise TypeError(
619
+ f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
620
+ "Did you pass a string instead of a set?"
621
+ )
684
622
 
685
- Parameters
686
- ----------
687
- sc_ids : Iterable[str]
688
- IDs of compartmentalized species to remove
689
- """
623
+ # determine whether required_attributes are appropriate
624
+ VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
625
+ invalid_required_attributes = required_attributes.difference(
626
+ VALID_REQUIRED_ATTRIBUTES
627
+ )
690
628
 
691
- # find reactions which should be totally removed since they are losing critical species
692
- removed_reactions = _find_underspecified_reactions_by_scids(self, sc_ids)
693
- self.remove_reactions(removed_reactions)
629
+ if len(invalid_required_attributes) > 0:
630
+ raise ValueError(
631
+ f"The following required attributes are not valid: {', '.join(invalid_required_attributes)}. "
632
+ f"Requiered attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
633
+ )
694
634
 
695
- self._remove_compartmentalized_species(sc_ids)
635
+ # determine if required_attributes are satisified
636
+ invalid_attrs = [
637
+ s for s in required_attributes if s not in schema[entity_type].keys()
638
+ ]
639
+ if len(invalid_attrs) > 0:
640
+ raise ValueError(
641
+ f"The following required attributes are not present for the {entity_type} table: "
642
+ f"{', '.join(invalid_attrs)}."
643
+ )
696
644
 
697
- # remove species (and their associated species data if all their cspecies have been lost)
698
- self._remove_unused_species()
645
+ return getattr(self, entity_type)
699
646
 
700
- def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
647
+ def get_uri_urls(
648
+ self,
649
+ entity_type: str,
650
+ entity_ids: Iterable[str] | None = None,
651
+ required_ontology: str | None = None,
652
+ ) -> pd.Series:
701
653
  """
702
- Remove reactions from the model.
654
+ Get reference URLs for specified entities.
703
655
 
704
656
  Parameters
705
657
  ----------
706
- r_ids : Iterable[str]
707
- IDs of reactions to remove
708
- remove_species : bool, optional
709
- Whether to remove species that are no longer part of any reactions,
710
- by default False
711
- """
712
- # remove corresponding reactions_species
713
- self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
714
- # remove reactions
715
- self.reactions = self.reactions.drop(index=list(r_ids))
716
- # remove reactions_data
717
- if hasattr(self, "reactions_data"):
718
- for k, data in self.reactions_data.items():
719
- self.reactions_data[k] = data.drop(index=list(r_ids))
720
- # remove species if requested
721
- if remove_species:
722
- self._remove_unused_cspecies()
723
- self._remove_unused_species()
724
-
725
- def validate(self):
726
- """
727
- Validate the SBML_dfs structure and relationships.
658
+ entity_type : str
659
+ Type of entity to get URLs for (e.g., 'species', 'reactions')
660
+ entity_ids : Optional[Iterable[str]], optional
661
+ Specific entities to get URLs for, by default None (all entities)
662
+ required_ontology : Optional[str], optional
663
+ Specific ontology to get URLs from, by default None
728
664
 
729
- Checks:
730
- - Schema existence
731
- - Required tables presence
732
- - Individual table structure
733
- - Primary key uniqueness
734
- - Foreign key relationships
735
- - Optional data table validity
736
- - Reaction species validity
665
+ Returns
666
+ -------
667
+ pd.Series
668
+ Series mapping entity IDs to their reference URLs
737
669
 
738
670
  Raises
739
671
  ------
740
672
  ValueError
741
- If any validation check fails
673
+ If entity_type is invalid
742
674
  """
675
+ schema = self.schema
743
676
 
744
- if not hasattr(self, "schema"):
745
- raise ValueError("No schema found")
746
-
747
- required_tables = self._required_entities
748
- schema_tables = set(self.schema.keys())
677
+ # valid entities and their identifier variables
678
+ valid_entity_types = [
679
+ SBML_DFS.COMPARTMENTS,
680
+ SBML_DFS.SPECIES,
681
+ SBML_DFS.REACTIONS,
682
+ ]
749
683
 
750
- extra_tables = schema_tables.difference(required_tables)
751
- if len(extra_tables) != 0:
752
- logger.debug(
753
- f"{len(extra_tables)} unexpected tables found: "
754
- f"{', '.join(extra_tables)}"
684
+ if entity_type not in valid_entity_types:
685
+ raise ValueError(
686
+ f"{entity_type} is an invalid entity_type; valid types "
687
+ f"are {', '.join(valid_entity_types)}"
755
688
  )
756
689
 
757
- missing_tables = required_tables.difference(schema_tables)
758
- if len(missing_tables) != 0:
690
+ entity_table = getattr(self, entity_type)
691
+
692
+ if entity_ids is not None:
693
+ # ensure that entity_ids are unique and then convert back to list
694
+ # to support pandas indexing
695
+ entity_ids = list(set(entity_ids))
696
+
697
+ # filter to a subset of identifiers if one is provided
698
+ entity_table = entity_table.loc[entity_ids]
699
+
700
+ # create a dataframe of all identifiers for the select entities
701
+ all_ids = pd.concat(
702
+ [
703
+ sbml_dfs_utils._id_dict_to_df(
704
+ entity_table[schema[entity_type]["id"]].iloc[i].ids
705
+ ).assign(id=entity_table.index[i])
706
+ for i in range(0, entity_table.shape[0])
707
+ ]
708
+ ).rename(columns={"id": schema[entity_type]["pk"]})
709
+
710
+ # set priorities for ontologies and bqb terms
711
+
712
+ if required_ontology is None:
713
+ all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
714
+ ONTOLOGY_PRIORITIES, how="left"
715
+ )
716
+ else:
717
+ ontology_priorities = pd.DataFrame(
718
+ [{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
719
+ )
720
+ # if only a single ontology is sought then just return matching entries
721
+ all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
722
+ ontology_priorities, how="inner"
723
+ )
724
+
725
+ uri_urls = (
726
+ all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
727
+ .groupby(schema[entity_type]["pk"])
728
+ .first()[IDENTIFIERS.URL]
729
+ )
730
+ return uri_urls
731
+
732
+ def infer_sbo_terms(self):
733
+ """
734
+ Infer SBO Terms
735
+
736
+ Define SBO terms based on stoichiometry for reaction_species with missing terms.
737
+ Modifies the SBML_dfs object in-place.
738
+
739
+ Returns
740
+ -------
741
+ None (modifies SBML_dfs object in-place)
742
+ """
743
+ valid_sbo_terms = self.reaction_species[
744
+ self.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
745
+ ]
746
+
747
+ invalid_sbo_terms = self.reaction_species[
748
+ ~self.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
749
+ ]
750
+
751
+ if not all(self.reaction_species[SBML_DFS.SBO_TERM].notnull()):
752
+ raise ValueError("All reaction_species[SBML_DFS.SBO_TERM] must be not null")
753
+ if invalid_sbo_terms.shape[0] == 0:
754
+ logger.info("All sbo_terms were valid; nothing to update.")
755
+ return
756
+
757
+ logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")
758
+
759
+ # add missing/invalid terms based on stoichiometry
760
+ invalid_sbo_terms.loc[
761
+ invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] < 0, SBML_DFS.SBO_TERM
762
+ ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
763
+
764
+ invalid_sbo_terms.loc[
765
+ invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
766
+ ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
767
+
768
+ invalid_sbo_terms.loc[
769
+ invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] == 0, SBML_DFS.SBO_TERM
770
+ ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]
771
+
772
+ updated_reaction_species = pd.concat(
773
+ [valid_sbo_terms, invalid_sbo_terms]
774
+ ).sort_index()
775
+
776
+ if self.reaction_species.shape[0] != updated_reaction_species.shape[0]:
759
777
  raise ValueError(
760
- f"Missing {len(missing_tables)} required tables: "
761
- f"{', '.join(missing_tables)}"
778
+ f"Trying to overwrite {self.reaction_species.shape[0]} reaction_species with {updated_reaction_species.shape[0]}"
762
779
  )
780
+ self.reaction_species = updated_reaction_species
781
+ return
763
782
 
764
- # check individual tables
765
- for table in required_tables:
766
- self._validate_table(table)
783
+ def infer_uncompartmentalized_species_location(self):
784
+ """
785
+ Infer Uncompartmentalized Species Location
767
786
 
768
- # check whether pks and fks agree
769
- pk_df = pd.DataFrame(
770
- [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
787
+ If the compartment of a subset of compartmentalized species
788
+ was not specified, infer an appropriate compartment from
789
+ other members of reactions they participate in.
790
+
791
+ This method modifies the SBML_dfs object in-place.
792
+
793
+ Returns
794
+ -------
795
+ None (modifies SBML_dfs object in-place)
796
+ """
797
+ default_compartment = (
798
+ self.compartmentalized_species.value_counts(SBML_DFS.C_ID)
799
+ .rename("N")
800
+ .reset_index()
801
+ .sort_values("N", ascending=False)[SBML_DFS.C_ID][0]
771
802
  )
803
+ if not isinstance(default_compartment, str):
804
+ raise ValueError(
805
+ "No default compartment could be found - compartment "
806
+ "information may not be present"
807
+ )
772
808
 
773
- fk_df = (
774
- pd.DataFrame(
775
- [
776
- {"fk_table": k, "fk": v["fk"]}
777
- for k, v in self.schema.items()
778
- if "fk" in v.keys()
779
- ]
809
+ # infer the compartments of species missing compartments
810
+ missing_compartment_scids = self.compartmentalized_species[
811
+ self.compartmentalized_species[SBML_DFS.C_ID].isnull()
812
+ ].index.tolist()
813
+ if len(missing_compartment_scids) == 0:
814
+ logger.info(
815
+ "All compartmentalized species have compartments, "
816
+ "returning input SBML_dfs"
780
817
  )
781
- .set_index("fk_table")["fk"]
782
- .apply(pd.Series)
818
+ return self
819
+
820
+ participating_reactions = (
821
+ self.reaction_species[
822
+ self.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
823
+ ][SBML_DFS.R_ID]
824
+ .unique()
825
+ .tolist()
826
+ )
827
+ reaction_participants = self.reaction_species[
828
+ self.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
829
+ ].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
830
+ reaction_participants = reaction_participants.merge(
831
+ self.compartmentalized_species[SBML_DFS.C_ID],
832
+ left_on=SBML_DFS.SC_ID,
833
+ right_index=True,
834
+ )
835
+
836
+ # find a default compartment to fall back on if all compartmental information is missing
837
+ primary_reaction_compartment = (
838
+ reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
839
+ .rename("N")
840
+ .reset_index()
841
+ .sort_values("N", ascending=False)
842
+ .groupby(SBML_DFS.R_ID)
843
+ .first()[SBML_DFS.C_ID]
783
844
  .reset_index()
784
- .melt(id_vars="fk_table")
785
- .drop(["variable"], axis=1)
786
- .rename(columns={"value": "key"})
787
845
  )
788
846
 
789
- pk_fk_correspondences = pk_df.merge(fk_df)
847
+ inferred_compartmentalization = (
848
+ self.reaction_species[
849
+ self.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
850
+ ]
851
+ .merge(primary_reaction_compartment)
852
+ .value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
853
+ .rename("N")
854
+ .reset_index()
855
+ .sort_values("N", ascending=False)
856
+ .groupby(SBML_DFS.SC_ID)
857
+ .first()
858
+ .reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
859
+ )
860
+ logger.info(
861
+ f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
862
+ )
790
863
 
791
- for i in range(0, pk_fk_correspondences.shape[0]):
792
- pk_table_keys = set(
793
- getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
864
+ # define where a reaction is most likely to occur based on the compartmentalization of its participants
865
+ species_with_unknown_compartmentalization = set(
866
+ missing_compartment_scids
867
+ ).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
868
+ if len(species_with_unknown_compartmentalization) != 0:
869
+ logger.warning(
870
+ f"{len(species_with_unknown_compartmentalization)} "
871
+ "species compartmentalization could not be inferred"
872
+ " from other reaction participants. Their compartmentalization "
873
+ f"will be set to the default of {default_compartment}"
794
874
  )
795
- if None in pk_table_keys:
796
- raise ValueError(
797
- f"{pk_fk_correspondences['pk_table'][i]} had "
798
- "missing values in its index"
799
- )
800
875
 
801
- fk_table_keys = set(
802
- getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
803
- :, pk_fk_correspondences["key"][i]
876
+ inferred_compartmentalization = pd.concat(
877
+ [
878
+ inferred_compartmentalization,
879
+ pd.DataFrame(
880
+ {
881
+ SBML_DFS.SC_ID: list(
882
+ species_with_unknown_compartmentalization
883
+ )
884
+ }
885
+ ).assign(c_id=default_compartment),
804
886
  ]
805
887
  )
806
- if None in fk_table_keys:
807
- raise ValueError(
808
- f"{pk_fk_correspondences['fk_table'][i]} included "
809
- f"missing {pk_fk_correspondences['key'][i]} values"
810
- )
811
888
 
812
- # all foreign keys need to match a primary key
813
- extra_fks = fk_table_keys.difference(pk_table_keys)
814
- if len(extra_fks) != 0:
815
- raise ValueError(
816
- f"{len(extra_fks)} distinct "
817
- f"{pk_fk_correspondences['key'][i]} values were"
818
- f" found in {pk_fk_correspondences['fk_table'][i]} "
819
- f"but missing from {pk_fk_correspondences['pk_table'][i]}."
820
- " All foreign keys must have a matching primary key.\n\n"
821
- f"Extra key are: {', '.join(extra_fks)}"
889
+ if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
890
+ raise ValueError(
891
+ f"{inferred_compartmentalization.shape[0]} were inferred but {len(missing_compartment_scids)} are required"
892
+ )
893
+
894
+ updated_compartmentalized_species = pd.concat(
895
+ [
896
+ self.compartmentalized_species[
897
+ ~self.compartmentalized_species[SBML_DFS.C_ID].isnull()
898
+ ],
899
+ self.compartmentalized_species[
900
+ self.compartmentalized_species[SBML_DFS.C_ID].isnull()
901
+ ]
902
+ .drop(SBML_DFS.C_ID, axis=1)
903
+ .merge(
904
+ inferred_compartmentalization,
905
+ left_index=True,
906
+ right_on=SBML_DFS.SC_ID,
822
907
  )
908
+ .set_index(SBML_DFS.SC_ID),
909
+ ]
910
+ )
823
911
 
824
- # check optional data tables:
825
- for k, v in self.species_data.items():
826
- try:
827
- self._validate_species_data(v)
828
- except ValueError as e:
829
- raise ValueError(f"species data {k} was invalid.") from e
912
+ if (
913
+ updated_compartmentalized_species.shape[0]
914
+ != self.compartmentalized_species.shape[0]
915
+ ):
916
+ raise ValueError(
917
+ f"Trying to overwrite {self.compartmentalized_species.shape[0]}"
918
+ " compartmentalized species with "
919
+ f"{updated_compartmentalized_species.shape[0]}"
920
+ )
830
921
 
831
- for k, v in self.reactions_data.items():
832
- try:
833
- self._validate_reactions_data(v)
834
- except ValueError as e:
835
- raise ValueError(f"reactions data {k} was invalid.") from e
922
+ if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
923
+ raise ValueError("Some species compartments are still missing")
836
924
 
837
- # validate reaction_species sbo_terms and stoi
838
- self._validate_reaction_species()
925
+ self.compartmentalized_species = updated_compartmentalized_species
926
+ return
839
927
 
840
- def validate_and_resolve(self):
928
+ def name_compartmentalized_species(self):
841
929
  """
842
- Validate and attempt to automatically fix common issues.
930
+ Name Compartmentalized Species
843
931
 
844
- This method iteratively:
845
- 1. Attempts validation
846
- 2. If validation fails, tries to resolve the issue
847
- 3. Repeats until validation passes or issue cannot be resolved
932
+ Rename compartmentalized species if they have the same
933
+ name as their species. Modifies the SBML_dfs object in-place.
848
934
 
849
- Raises
850
- ------
851
- ValueError
852
- If validation fails and cannot be automatically resolved
935
+ Returns
936
+ -------
937
+ None (modifies SBML_dfs object in-place)
853
938
  """
939
+ augmented_cspecies = self.compartmentalized_species.merge(
940
+ self.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
941
+ ).merge(
942
+ self.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
943
+ )
944
+ augmented_cspecies[SBML_DFS.SC_NAME] = [
945
+ f"{s} [{c}]" if sc == s else sc
946
+ for sc, c, s in zip(
947
+ augmented_cspecies[SBML_DFS.SC_NAME],
948
+ augmented_cspecies[SBML_DFS.C_NAME],
949
+ augmented_cspecies[SBML_DFS.S_NAME],
950
+ )
951
+ ]
854
952
 
855
- current_exception = None
856
- validated = False
857
-
858
- while not validated:
859
- try:
860
- self.validate()
861
- validated = True
862
- except Exception as e:
863
- e_str = str(e)
864
- if e_str == current_exception:
865
- logger.warning(
866
- "Automated resolution of an Exception was attempted but failed"
867
- )
868
- raise e
869
-
870
- # try to resolve
871
- self._attempt_resolve(e)
953
+ self.compartmentalized_species = augmented_cspecies.loc[
954
+ :, self.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
955
+ ]
956
+ return
872
957
 
873
- def select_species_data(self, species_data_table: str) -> pd.DataFrame:
958
+ def reaction_formulas(
959
+ self, r_ids: Optional[Union[str, list[str]]] = None
960
+ ) -> pd.Series:
874
961
  """
875
- Select a species data table from the SBML_dfs object.
962
+ Reaction Summary
876
963
 
877
- Parameters
964
+ Return human-readable formulas for reactions.
965
+
966
+ Parameters:
878
967
  ----------
879
- species_data_table : str
880
- Name of the species data table to select
968
+ r_ids: [str], str or None
969
+ Reaction IDs or None for all reactions
881
970
 
882
971
  Returns
883
- -------
884
- pd.DataFrame
885
- The selected species data table
886
-
887
- Raises
888
- ------
889
- ValueError
890
- If species_data_table is not found
972
+ ----------
973
+ formula_strs: pd.Series
891
974
  """
892
- # Check if species_data_table exists in sbml_dfs.species_data
893
- if species_data_table not in self.species_data:
894
- raise ValueError(
895
- f"species_data_table {species_data_table} not found in sbml_dfs.species_data. "
896
- f"Available tables: {self.species_data.keys()}"
975
+
976
+ validated_rids = self._validate_r_ids(r_ids)
977
+
978
+ matching_reaction_species = self.reaction_species[
979
+ self.reaction_species.r_id.isin(validated_rids)
980
+ ].merge(
981
+ self.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
982
+ )
983
+
984
+ # split into within compartment and cross-compartment reactions
985
+ r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
986
+ SBML_DFS.C_ID
987
+ ].nunique()
988
+
989
+ # identify reactions which work across compartments
990
+ r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
991
+ # there species must be labelled with the sc_name to specify where a species exists
992
+ if r_id_cross_compartment.shape[0] > 0:
993
+ rxn_eqtn_cross_compartment = (
994
+ matching_reaction_species[
995
+ matching_reaction_species[SBML_DFS.R_ID].isin(
996
+ r_id_cross_compartment.index
997
+ )
998
+ ]
999
+ .sort_values([SBML_DFS.SC_NAME])
1000
+ .groupby(SBML_DFS.R_ID)
1001
+ .apply(
1002
+ lambda x: sbml_dfs_utils.construct_formula_string(
1003
+ x, self.reactions, SBML_DFS.SC_NAME
1004
+ )
1005
+ )
1006
+ .rename("r_formula_str")
1007
+ )
1008
+ else:
1009
+ rxn_eqtn_cross_compartment = None
1010
+
1011
+ # identify reactions which occur within a single compartment; for these the reaction
1012
+ # can be labelled with the compartment and individual species can receive a more readable s_name
1013
+ r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
1014
+ if r_id_within_compartment.shape[0] > 0:
1015
+ # add s_name
1016
+ augmented_matching_reaction_species = (
1017
+ matching_reaction_species[
1018
+ matching_reaction_species[SBML_DFS.R_ID].isin(
1019
+ r_id_within_compartment.index
1020
+ )
1021
+ ]
1022
+ .merge(self.compartments, left_on=SBML_DFS.C_ID, right_index=True)
1023
+ .merge(self.species, left_on=SBML_DFS.S_ID, right_index=True)
1024
+ .sort_values([SBML_DFS.S_NAME])
897
1025
  )
1026
+ # create formulas based on s_names of components
1027
+ rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
1028
+ [SBML_DFS.R_ID, SBML_DFS.C_NAME]
1029
+ ).apply(
1030
+ lambda x: sbml_dfs_utils.construct_formula_string(
1031
+ x, self.reactions, SBML_DFS.S_NAME
1032
+ )
1033
+ )
1034
+ # add compartment for each reaction
1035
+ rxn_eqtn_within_compartment = pd.Series(
1036
+ [
1037
+ y + ": " + x
1038
+ for x, y in zip(
1039
+ rxn_eqtn_within_compartment,
1040
+ rxn_eqtn_within_compartment.index.get_level_values(
1041
+ SBML_DFS.C_NAME
1042
+ ),
1043
+ )
1044
+ ],
1045
+ index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
1046
+ ).rename("r_formula_str")
1047
+ else:
1048
+ rxn_eqtn_within_compartment = None
898
1049
 
899
- # Get the species data
900
- return self.species_data[species_data_table]
1050
+ formula_strs = pd.concat(
1051
+ [rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment]
1052
+ )
901
1053
 
902
- def _validate_table(self, table: str) -> None:
1054
+ return formula_strs
1055
+
1056
+ def reaction_summaries(
1057
+ self, r_ids: Optional[Union[str, list[str]]] = None
1058
+ ) -> pd.DataFrame:
903
1059
  """
904
- Validate a table in this SBML_dfs object against its schema.
1060
+ Reaction Summary
905
1061
 
906
- This is an internal method that validates a table that is part of this SBML_dfs
907
- object against the schema stored in self.schema.
1062
+ Return a summary of reactions.
908
1063
 
909
- Parameters
1064
+ Parameters:
910
1065
  ----------
911
- table : str
912
- Name of the table to validate
1066
+ r_ids: [str], str or None
1067
+ Reaction IDs or None for all reactions
913
1068
 
914
- Raises
915
- ------
916
- ValueError
917
- If the table does not conform to its schema
1069
+ Returns
1070
+ ----------
1071
+ reaction_summaries_df: pd.DataFrame
1072
+ A table with r_id as an index and columns:
1073
+ - r_name: str, name of the reaction
1074
+ - r_formula_str: str, human-readable formula of the reaction
918
1075
  """
919
- table_schema = self.schema[table]
920
- table_data = getattr(self, table)
921
- _perform_sbml_dfs_table_validation(table_data, table_schema, table)
922
1076
 
923
- def _remove_entity_data(self, entity_type: str, label: str) -> None:
924
- """
925
- Remove data from species_data or reactions_data by table name and label.
1077
+ validated_rids = self._validate_r_ids(r_ids)
926
1078
 
927
- Parameters
928
- ----------
929
- entity_type : str
930
- Name of the table to remove data from ('species' or 'reactions')
931
- label : str
932
- Label of the data to remove
1079
+ participating_r_names = self.reactions.loc[validated_rids, SBML_DFS.R_NAME]
1080
+ participating_r_formulas = self.reaction_formulas(r_ids=validated_rids)
1081
+ reaction_summareis_df = pd.concat(
1082
+ [participating_r_names, participating_r_formulas], axis=1
1083
+ )
933
1084
 
934
- Notes
935
- -----
936
- If the label does not exist, a warning will be logged that includes the existing labels.
1085
+ return reaction_summareis_df
1086
+
1087
+ def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
937
1088
  """
938
- if entity_type not in ENTITIES_W_DATA:
939
- raise ValueError("table_name must be either 'species' or 'reactions'")
1089
+ Remove compartmentalized species and associated reactions.
940
1090
 
941
- data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
942
- if label not in data_dict:
943
- existing_labels = list(data_dict.keys())
944
- logger.warning(
945
- f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
946
- f"Existing labels: {existing_labels}"
947
- )
948
- return
949
-
950
- del data_dict[label]
951
-
952
- def _remove_unused_cspecies(self):
953
- """Removes compartmentalized species that are no
954
- longer part of any reactions"""
955
- sc_ids = self._get_unused_cspecies()
956
- self._remove_compartmentalized_species(sc_ids)
1091
+ Starting with a set of compartmentalized species, determine which reactions
1092
+ should be removed based on their removal. Then remove these reactions,
1093
+ compartmentalized species, and species.
957
1094
 
958
- def _get_unused_cspecies(self) -> set[str]:
959
- """Returns a set of compartmentalized species
960
- that are not part of any reactions"""
961
- sc_ids = set(self.compartmentalized_species.index) - set(
962
- self.reaction_species[SBML_DFS.SC_ID]
963
- )
964
- return sc_ids # type: ignore
1095
+ Parameters
1096
+ ----------
1097
+ sc_ids : Iterable[str]
1098
+ IDs of compartmentalized species to remove
1099
+ """
965
1100
 
966
- def _remove_unused_species(self):
967
- """Removes species that are no longer part of any
968
- compartmentalized species"""
969
- s_ids = self._get_unused_species()
970
- self._remove_species(s_ids)
1101
+ # find reactions which should be totally removed since they are losing critical species
1102
+ removed_reactions = self._find_underspecified_reactions_by_scids(sc_ids)
1103
+ self.remove_reactions(removed_reactions)
971
1104
 
972
- def _get_unused_species(self) -> set[str]:
973
- """Returns a list of species that are not part of any reactions"""
974
- s_ids = set(self.species.index) - set(
975
- self.compartmentalized_species[SBML_DFS.S_ID]
976
- )
977
- return s_ids # type: ignore
1105
+ self._remove_compartmentalized_species(sc_ids)
978
1106
 
979
- def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
980
- """Removes compartmentalized species from the model
1107
+ # remove species (and their associated species data if all their cspecies have been lost)
1108
+ self._remove_unused_species()
981
1109
 
982
- This should not be directly used by the user, as it can lead to
983
- invalid reactions when removing species without a logic to decide
984
- if the reaction needs to be removed as well.
1110
+ def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
1111
+ """
1112
+ Remove reactions from the model.
985
1113
 
986
- Args:
987
- sc_ids (Iterable[str]): the compartmentalized species to remove
1114
+ Parameters
1115
+ ----------
1116
+ r_ids : Iterable[str]
1117
+ IDs of reactions to remove
1118
+ remove_species : bool, optional
1119
+ Whether to remove species that are no longer part of any reactions,
1120
+ by default False
988
1121
  """
989
- # Remove compartmentalized species
990
- self.compartmentalized_species = self.compartmentalized_species.drop(
991
- index=list(sc_ids)
992
- )
993
1122
  # remove corresponding reactions_species
994
- self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")
995
-
996
- def _remove_species(self, s_ids: Iterable[str]):
997
- """Removes species from the model
998
-
999
- This should not be directly used by the user, as it can lead to
1000
- invalid reactions when removing species without a logic to decide
1001
- if the reaction needs to be removed as well.
1002
-
1003
- This removes the species and corresponding compartmentalized species and
1004
- reactions_species.
1123
+ self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
1124
+ # remove reactions
1125
+ self.reactions = self.reactions.drop(index=list(r_ids))
1126
+ # remove reactions_data
1127
+ if hasattr(self, "reactions_data"):
1128
+ for k, data in self.reactions_data.items():
1129
+ self.reactions_data[k] = data.drop(index=list(r_ids))
1130
+ # remove species if requested
1131
+ if remove_species:
1132
+ self._remove_unused_cspecies()
1133
+ self._remove_unused_species()
1005
1134
 
1006
- Args:
1007
- s_ids (Iterable[str]): the species to remove
1135
+ def remove_reactions_data(self, label: str):
1008
1136
  """
1009
- sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
1010
- self._remove_compartmentalized_species(sc_ids)
1011
- # Remove species
1012
- self.species = self.species.drop(index=list(s_ids))
1013
- # remove data
1014
- for k, data in self.species_data.items():
1015
- self.species_data[k] = data.drop(index=list(s_ids))
1016
-
1017
- def _validate_species_data(self, species_data_table: pd.DataFrame):
1018
- """Validates species data attribute
1137
+ Remove reactions data by label.
1138
+ """
1139
+ self._remove_entity_data(SBML_DFS.REACTIONS, label)
1019
1140
 
1020
- Args:
1021
- species_data_table (pd.DataFrame): a species data table
1141
+ def remove_species_data(self, label: str):
1142
+ """
1143
+ Remove species data by label.
1144
+ """
1145
+ self._remove_entity_data(SBML_DFS.SPECIES, label)
1022
1146
 
1023
- Raises:
1024
- ValueError: s_id not index name
1025
- ValueError: s_id index contains duplicates
1026
- ValueError: s_id not in species table
1147
+ def search_by_ids(
1148
+ self,
1149
+ ids: list[str],
1150
+ entity_type: str,
1151
+ identifiers_df: pd.DataFrame,
1152
+ ontologies: None | set[str] = None,
1153
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
1027
1154
  """
1028
- _validate_matching_data(species_data_table, self.species)
1155
+ Find entities and identifiers matching a set of query IDs.
1029
1156
 
1030
- def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
1031
- """Validates reactions data attribute
1157
+ Parameters
1158
+ ----------
1159
+ ids : List[str]
1160
+ List of identifiers to search for
1161
+ entity_type : str
1162
+ Type of entity to search (e.g., 'species', 'reactions')
1163
+ identifiers_df : pd.DataFrame
1164
+ DataFrame containing identifier mappings
1165
+ ontologies : Optional[Set[str]], optional
1166
+ Set of ontologies to filter by, by default None
1032
1167
 
1033
- Args:
1034
- reactions_data_table (pd.DataFrame): a reactions data table
1168
+ Returns
1169
+ -------
1170
+ Tuple[pd.DataFrame, pd.DataFrame]
1171
+ - Matching entities
1172
+ - Matching identifiers
1035
1173
 
1036
- Raises:
1037
- ValueError: r_id not index name
1038
- ValueError: r_id index contains duplicates
1039
- ValueError: r_id not in reactions table
1174
+ Raises
1175
+ ------
1176
+ ValueError
1177
+ If entity_type is invalid or ontologies are invalid
1178
+ TypeError
1179
+ If ontologies is not a set
1040
1180
  """
1041
- _validate_matching_data(reactions_data_table, self.reactions)
1181
+ # validate inputs
1182
+ entity_table = self.get_table(entity_type, required_attributes={"id"})
1183
+ entity_pk = self.schema[entity_type]["pk"]
1042
1184
 
1043
- def _validate_reaction_species(self):
1044
- if not all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull()):
1045
- raise ValueError(
1046
- "All reaction_species[SBML_DFS.STOICHIOMETRY] must be not null"
1047
- )
1185
+ utils.match_pd_vars(
1186
+ identifiers_df,
1187
+ req_vars={
1188
+ entity_pk,
1189
+ IDENTIFIERS.ONTOLOGY,
1190
+ IDENTIFIERS.IDENTIFIER,
1191
+ IDENTIFIERS.URL,
1192
+ IDENTIFIERS.BQB,
1193
+ },
1194
+ allow_series=False,
1195
+ ).assert_present()
1048
1196
 
1049
- # test for null SBO terms
1050
- n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
1051
- if n_null_sbo_terms != 0:
1052
- raise ValueError(
1053
- f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
1054
- )
1197
+ if ontologies is not None:
1198
+ if not isinstance(ontologies, set):
1199
+ # for clarity this should not be reachable based on type hints
1200
+ raise TypeError(
1201
+ f"ontologies must be a set, but got {type(ontologies).__name__}"
1202
+ )
1203
+ ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
1204
+ invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
1205
+ if len(invalid_ontologies) > 0:
1206
+ raise ValueError(
1207
+ f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
1208
+ f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
1209
+ )
1055
1210
 
1056
- # find invalid SBO terms
1057
- sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
1058
- invalid_sbo_term_counts = sbo_counts[
1059
- ~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
1060
- ]
1211
+ # fitler to just to identifiers matchign the ontologies of interest
1212
+ identifiers_df = identifiers_df.query("ontology in @ontologies")
1061
1213
 
1062
- if invalid_sbo_term_counts.shape[0] != 0:
1063
- invalid_sbo_counts_str = ", ".join(
1064
- [f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
1065
- )
1066
- raise ValueError(
1067
- f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
1068
- f"defined {invalid_sbo_counts_str}"
1069
- )
1214
+ matching_identifiers = identifiers_df.loc[
1215
+ identifiers_df["identifier"].isin(ids)
1216
+ ]
1217
+ entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
1070
1218
 
1071
- def _attempt_resolve(self, e):
1072
- str_e = str(e)
1073
- if str_e == "compartmentalized_species included missing c_id values":
1074
- logger.warning(str_e)
1075
- logger.warning(
1076
- "Attempting to resolve with infer_uncompartmentalized_species_location()"
1077
- )
1078
- self = infer_uncompartmentalized_species_location(self)
1079
- elif re.search("sbo_terms were not defined", str_e):
1080
- logger.warning(str_e)
1081
- logger.warning("Attempting to resolve with infer_sbo_terms()")
1082
- self = infer_sbo_terms(self)
1083
- else:
1084
- logger.warning(
1085
- "An error occurred which could not be automatically resolved"
1086
- )
1087
- raise e
1219
+ return entity_subset, matching_identifiers
1088
1220
 
1221
+ def search_by_name(
1222
+ self, name: str, entity_type: str, partial_match: bool = True
1223
+ ) -> pd.DataFrame:
1224
+ """
1225
+ Find entities by exact or partial name match.
1089
1226
 
1090
- def species_status(s_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
1091
- """
1092
- Species Status
1227
+ Parameters
1228
+ ----------
1229
+ name : str
1230
+ Name to search for
1231
+ entity_type : str
1232
+ Type of entity to search (e.g., 'species', 'reactions')
1233
+ partial_match : bool, optional
1234
+ Whether to allow partial string matches, by default True
1093
1235
 
1094
- Return all of the reaction's a species particpates in.
1236
+ Returns
1237
+ -------
1238
+ pd.DataFrame
1239
+ Matching entities
1240
+ """
1241
+ entity_table = self.get_table(entity_type, required_attributes={"label"})
1242
+ label_attr = self.schema[entity_type]["label"]
1095
1243
 
1096
- Parameters:
1097
- s_id: str
1098
- A species ID
1099
- sbml_dfs: SBML_dfs
1244
+ if partial_match:
1245
+ matches = entity_table.loc[
1246
+ entity_table[label_attr].str.contains(name, case=False)
1247
+ ]
1248
+ else:
1249
+ matches = entity_table.loc[entity_table[label_attr].str.lower() == name]
1250
+ return matches
1100
1251
 
1101
- Returns:
1102
- pd.DataFrame, one row reaction
1103
- """
1104
-
1105
- matching_species = sbml_dfs.species.loc[s_id]
1106
-
1107
- if not isinstance(matching_species, pd.Series):
1108
- raise ValueError(f"{s_id} did not match a single species")
1109
-
1110
- # find all rxns species particpate in
1111
-
1112
- matching_compartmentalized_species = sbml_dfs.compartmentalized_species[
1113
- sbml_dfs.compartmentalized_species.s_id.isin([s_id])
1114
- ]
1115
-
1116
- rxns_participating = sbml_dfs.reaction_species[
1117
- sbml_dfs.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
1118
- ]
1119
-
1120
- # find all participants in these rxns
1121
-
1122
- full_rxns_participating = sbml_dfs.reaction_species[
1123
- sbml_dfs.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
1124
- ].merge(
1125
- sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
1126
- )
1127
-
1128
- reaction_descriptions = pd.concat(
1129
- [
1130
- reaction_summary(x, sbml_dfs)
1131
- for x in set(full_rxns_participating[SBML_DFS.R_ID].tolist())
1132
- ]
1133
- )
1134
-
1135
- status = (
1136
- full_rxns_participating.loc[
1137
- full_rxns_participating[SBML_DFS.SC_ID].isin(
1138
- matching_compartmentalized_species.index.values.tolist()
1139
- ),
1140
- [SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
1141
- ]
1142
- .merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
1143
- .reset_index(drop=True)
1144
- .drop(SBML_DFS.R_ID, axis=1)
1145
- )
1146
-
1147
- return status
1148
-
1149
-
1150
- def reaction_summary(r_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
1151
- """
1152
- Reaction Summary
1153
-
1154
- Return a reaction's name and a human-readable formula.
1155
-
1156
- Parameters:
1157
- r_id: str
1158
- A reaction ID
1159
- sbml_dfs: SBML_dfs
1160
-
1161
- Returns:
1162
- one row pd.DataFrame
1163
- """
1164
-
1165
- logger.warning(
1166
- "reaction_summary is deprecated and will be removed in a future version of rcpr; "
1167
- "please use reaction_summaries() instead"
1168
- )
1169
-
1170
- matching_reaction = sbml_dfs.reactions.loc[r_id]
1171
-
1172
- if not isinstance(matching_reaction, pd.Series):
1173
- raise ValueError(f"{r_id} did not match a single reaction")
1174
-
1175
- matching_reaction = sbml_dfs.reactions.loc[r_id]
1176
-
1177
- matching_reaction_species = sbml_dfs.reaction_species[
1178
- sbml_dfs.reaction_species.r_id.isin([r_id])
1179
- ].merge(
1180
- sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
1181
- )
1182
-
1183
- # collapse all reaction species to a formula string
1184
-
1185
- if len(matching_reaction_species[SBML_DFS.C_ID].unique()) == 1:
1186
- augmented_matching_reaction_species = matching_reaction_species.merge(
1187
- sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True
1188
- ).merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
1189
- str_formula = (
1190
- construct_formula_string(
1191
- augmented_matching_reaction_species, sbml_dfs.reactions, SBML_DFS.S_NAME
1192
- )
1193
- + " ["
1194
- + augmented_matching_reaction_species[SBML_DFS.C_NAME].iloc[0]
1195
- + "]"
1196
- )
1197
- else:
1198
- str_formula = construct_formula_string(
1199
- matching_reaction_species, sbml_dfs.reactions, SBML_DFS.SC_NAME
1200
- )
1201
-
1202
- output = pd.DataFrame(
1203
- {
1204
- SBML_DFS.R_NAME: matching_reaction[SBML_DFS.R_NAME],
1205
- "r_formula_str": str_formula,
1206
- },
1207
- index=[r_id],
1208
- )
1209
-
1210
- output.index.name = SBML_DFS.R_ID
1211
-
1212
- return output
1213
-
1214
-
1215
- def reaction_summaries(sbml_dfs: SBML_dfs, r_ids=None) -> pd.Series:
1216
- """
1217
- Reaction Summary
1218
-
1219
- Return human-readable formulas for reactions.
1220
-
1221
- Parameters:
1222
- ----------
1223
- sbml_dfs: sbml.SBML_dfs
1224
- A relational mechanistic model
1225
- r_ids: [str], str or None
1226
- Reaction IDs or None for all reactions
1227
-
1228
- Returns:
1229
- ----------
1230
- formula_strs: pd.Series
1231
- """
1232
-
1233
- if isinstance(r_ids, str):
1234
- r_ids = [r_ids]
1235
-
1236
- if r_ids is None:
1237
- matching_reactions = sbml_dfs.reactions
1238
- else:
1239
- matching_reactions = sbml_dfs.reactions.loc[r_ids]
1240
-
1241
- matching_reaction_species = sbml_dfs.reaction_species[
1242
- sbml_dfs.reaction_species.r_id.isin(matching_reactions.index)
1243
- ].merge(
1244
- sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
1245
- )
1246
-
1247
- # split into within compartment and cross-compartment reactions
1248
- r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
1249
- SBML_DFS.C_ID
1250
- ].nunique()
1251
-
1252
- # identify reactions which work across compartments
1253
- r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
1254
- # there species must be labelled with the sc_name to specify where a species exists
1255
- if r_id_cross_compartment.shape[0] > 0:
1256
- rxn_eqtn_cross_compartment = (
1257
- matching_reaction_species[
1258
- matching_reaction_species[SBML_DFS.R_ID].isin(
1259
- r_id_cross_compartment.index
1260
- )
1261
- ]
1262
- .sort_values([SBML_DFS.SC_NAME])
1263
- .groupby(SBML_DFS.R_ID)
1264
- .apply(
1265
- lambda x: construct_formula_string(
1266
- x, sbml_dfs.reactions, SBML_DFS.SC_NAME
1267
- )
1268
- )
1269
- .rename("r_formula_str")
1270
- )
1271
- else:
1272
- rxn_eqtn_cross_compartment = None
1273
-
1274
- # identify reactions which occur within a single compartment; for these the reaction
1275
- # can be labelled with the compartment and individual species can receive a more readable s_name
1276
- r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
1277
- if r_id_within_compartment.shape[0] > 0:
1278
- # add s_name
1279
- augmented_matching_reaction_species = (
1280
- matching_reaction_species[
1281
- matching_reaction_species[SBML_DFS.R_ID].isin(
1282
- r_id_within_compartment.index
1283
- )
1284
- ]
1285
- .merge(sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True)
1286
- .merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
1287
- .sort_values([SBML_DFS.S_NAME])
1288
- )
1289
- # create formulas based on s_names of components
1290
- rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
1291
- [SBML_DFS.R_ID, SBML_DFS.C_NAME]
1292
- ).apply(
1293
- lambda x: construct_formula_string(x, sbml_dfs.reactions, SBML_DFS.S_NAME)
1294
- )
1295
- # add compartment for each reaction
1296
- rxn_eqtn_within_compartment = pd.Series(
1297
- [
1298
- y + ": " + x
1299
- for x, y in zip(
1300
- rxn_eqtn_within_compartment,
1301
- rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.C_NAME),
1302
- )
1303
- ],
1304
- index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
1305
- ).rename("r_formula_str")
1306
- else:
1307
- rxn_eqtn_within_compartment = None
1308
-
1309
- formula_strs = pd.concat([rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment])
1310
-
1311
- return formula_strs
1312
-
1313
-
1314
- def construct_formula_string(
1315
- reaction_species_df: pd.DataFrame,
1316
- reactions_df: pd.DataFrame,
1317
- name_var: str,
1318
- ) -> str:
1319
- """
1320
- Construct Formula String
1321
-
1322
- Convert a table of reaction species into a formula string
1323
-
1324
- Parameters:
1325
- ----------
1326
- reaction_species_df: pd.DataFrame
1327
- Table containing a reactions' species
1328
- reactions_df: pd.DataFrame
1329
- smbl.reactions
1330
- name_var: str
1331
- Name used to label species
1332
-
1333
- Returns:
1334
- ----------
1335
- formula_str: str
1336
- String representation of a reactions substrates, products and
1337
- modifiers
1338
-
1339
- """
1340
-
1341
- reaction_species_df["label"] = [
1342
- add_stoi_to_species_name(x, y)
1343
- for x, y in zip(
1344
- reaction_species_df[SBML_DFS.STOICHIOMETRY], reaction_species_df[name_var]
1345
- )
1346
- ]
1347
-
1348
- rxn_reversible = bool(
1349
- reactions_df.loc[
1350
- reaction_species_df[SBML_DFS.R_ID].iloc[0], SBML_DFS.R_ISREVERSIBLE
1351
- ]
1352
- ) # convert from a np.bool_ to bool if needed
1353
- if not isinstance(rxn_reversible, bool):
1354
- raise TypeError(
1355
- f"rxn_reversible must be a bool, but got {type(rxn_reversible).__name__}"
1356
- )
1357
-
1358
- if rxn_reversible:
1359
- arrow_type = " <-> "
1360
- else:
1361
- arrow_type = " -> "
1362
-
1363
- substrates = " + ".join(
1364
- reaction_species_df["label"][
1365
- reaction_species_df[SBML_DFS.STOICHIOMETRY] < 0
1366
- ].tolist()
1367
- )
1368
- products = " + ".join(
1369
- reaction_species_df["label"][
1370
- reaction_species_df[SBML_DFS.STOICHIOMETRY] > 0
1371
- ].tolist()
1372
- )
1373
- modifiers = " + ".join(
1374
- reaction_species_df["label"][
1375
- reaction_species_df[SBML_DFS.STOICHIOMETRY] == 0
1376
- ].tolist()
1377
- )
1378
- if modifiers != "":
1379
- modifiers = f" ---- modifiers: {modifiers}]"
1380
-
1381
- return f"{substrates}{arrow_type}{products}{modifiers}"
1382
-
1383
-
1384
- def add_stoi_to_species_name(stoi: float | int, name: str) -> str:
1385
- """
1386
- Add Stoi To Species Name
1387
-
1388
- Add # of molecules to a species name
1389
-
1390
- Parameters:
1391
- ----------
1392
- stoi: float or int
1393
- Number of molecules
1394
- name: str
1395
- Name of species
1396
-
1397
- Returns:
1398
- ----------
1399
- name: str
1400
- Name containing number of species
1401
-
1402
- """
1403
-
1404
- if stoi in [-1, 0, 1]:
1405
- return name
1406
- else:
1407
- return str(abs(stoi)) + " " + name
1408
-
1409
-
1410
- def filter_to_characteristic_species_ids(
1411
- species_ids: pd.DataFrame,
1412
- max_complex_size: int = 4,
1413
- max_promiscuity: int = 20,
1414
- defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
1415
- ) -> pd.DataFrame:
1416
- """
1417
- Filter to Characteristic Species IDs
1418
-
1419
- Remove identifiers corresponding to one component within a large protein
1420
- complexes and non-characteristic annotations such as pubmed references and
1421
- homologues.
1252
+ def select_species_data(self, species_data_table: str) -> pd.DataFrame:
1253
+ """
1254
+ Select a species data table from the SBML_dfs object.
1422
1255
 
1423
1256
  Parameters
1424
1257
  ----------
1425
- species_ids: pd.DataFrame
1426
- A table of identifiers produced by sdbml_dfs.get_identifiers("species")
1427
- max_complex_size: int
1428
- The largest size of a complex, where BQB_HAS_PART terms will be retained.
1429
- In most cases, complexes are handled with specific formation and
1430
- dissolutation reactions,but these identifiers will be pulled in when
1431
- searching by identifiers or searching the identifiers associated with a
1432
- species against an external resource such as Open Targets.
1433
- max_promiscuity: int
1434
- Maximum number of species where a single molecule can act as a
1435
- BQB_HAS_PART component associated with a single identifier (and common ontology).
1436
- defining_biological_qualifiers (list[str]):
1437
- BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
1438
- permissive settings would include homologs, different forms of the same gene.
1439
-
1440
- Returns:
1441
- --------
1442
- species_id: pd.DataFrame
1443
- Input species filtered to characteristic identifiers
1444
-
1445
- """
1446
-
1447
- if not isinstance(species_ids, pd.DataFrame):
1448
- raise TypeError(
1449
- f"species_ids was a {type(species_ids)} but must be a pd.DataFrame"
1450
- )
1451
-
1452
- if not isinstance(max_complex_size, int):
1453
- raise TypeError(
1454
- f"max_complex_size was a {type(max_complex_size)} but must be an int"
1455
- )
1456
-
1457
- if not isinstance(max_promiscuity, int):
1458
- raise TypeError(
1459
- f"max_promiscuity was a {type(max_promiscuity)} but must be an int"
1460
- )
1461
-
1462
- if not isinstance(defining_biological_qualifiers, list):
1463
- raise TypeError(
1464
- f"defining_biological_qualifiers was a {type(defining_biological_qualifiers)} but must be a list"
1465
- )
1466
-
1467
- # primary annotations of a species
1468
- bqb_is_species = species_ids.query("bqb in @defining_biological_qualifiers")
1469
-
1470
- # add components within modestly sized protein complexes
1471
- # look at HAS_PART IDs
1472
- bqb_has_parts_species = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]
1473
-
1474
- # number of species in a complex
1475
- n_species_components = bqb_has_parts_species.value_counts(
1476
- [IDENTIFIERS.ONTOLOGY, SBML_DFS.S_ID]
1477
- )
1478
- big_complex_sids = set(
1479
- n_species_components[
1480
- n_species_components > max_complex_size
1481
- ].index.get_level_values(SBML_DFS.S_ID)
1482
- )
1483
-
1484
- filtered_bqb_has_parts = _filter_promiscuous_components(
1485
- bqb_has_parts_species, max_promiscuity
1486
- )
1487
-
1488
- # drop species parts if there are many components
1489
- filtered_bqb_has_parts = filtered_bqb_has_parts[
1490
- ~filtered_bqb_has_parts[SBML_DFS.S_ID].isin(big_complex_sids)
1491
- ]
1492
-
1493
- # combine primary identifiers and rare components
1494
- characteristic_species_ids = pd.concat(
1495
- [
1496
- bqb_is_species,
1497
- filtered_bqb_has_parts,
1498
- ]
1499
- )
1500
-
1501
- return characteristic_species_ids
1502
-
1503
-
1504
- def infer_uncompartmentalized_species_location(sbml_dfs: SBML_dfs) -> SBML_dfs:
1505
- """
1506
- Infer Uncompartmentalized Species Location
1507
-
1508
- If the compartment of a subset of compartmentalized species
1509
- was not specified, infer an appropriate compartment from
1510
- other members of reactions they particpate in
1511
-
1512
- Parameters:
1513
- ----------
1514
- sbml_dfs: sbml.SBML_dfs
1515
- A relational pathway model
1516
-
1517
- Returns:
1518
- ----------
1519
- sbml_dfs: sbml.SBML_dfs
1520
- A relational pathway model (with filled in species compartments)
1521
-
1522
- """
1523
-
1524
- default_compartment = (
1525
- sbml_dfs.compartmentalized_species.value_counts(SBML_DFS.C_ID)
1526
- .rename("N")
1527
- .reset_index()
1528
- .sort_values("N", ascending=False)[SBML_DFS.C_ID][0]
1529
- )
1530
- if not isinstance(default_compartment, str):
1531
- raise ValueError(
1532
- "No default compartment could be found - compartment "
1533
- "information may not be present"
1534
- )
1535
-
1536
- # infer the compartments of species missing compartments
1537
-
1538
- missing_compartment_scids = sbml_dfs.compartmentalized_species[
1539
- sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
1540
- ].index.tolist()
1541
- if len(missing_compartment_scids) == 0:
1542
- logger.info(
1543
- "All compartmentalized species have compartments, "
1544
- "returning input sbml_dfs"
1545
- )
1546
- return sbml_dfs
1547
-
1548
- participating_reactions = (
1549
- sbml_dfs.reaction_species[
1550
- sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
1551
- ][SBML_DFS.R_ID]
1552
- .unique()
1553
- .tolist()
1554
- )
1555
- reaction_participants = sbml_dfs.reaction_species[
1556
- sbml_dfs.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
1557
- ].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
1558
- reaction_participants = reaction_participants.merge(
1559
- sbml_dfs.compartmentalized_species[SBML_DFS.C_ID],
1560
- left_on=SBML_DFS.SC_ID,
1561
- right_index=True,
1562
- )
1563
-
1564
- # find a default compartment to fall back on if all compartmental information is missing
1565
-
1566
- primary_reaction_compartment = (
1567
- reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
1568
- .rename("N")
1569
- .reset_index()
1570
- .sort_values("N", ascending=False)
1571
- .groupby(SBML_DFS.R_ID)
1572
- .first()[SBML_DFS.C_ID]
1573
- .reset_index()
1574
- )
1575
-
1576
- inferred_compartmentalization = (
1577
- sbml_dfs.reaction_species[
1578
- sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
1579
- ]
1580
- .merge(primary_reaction_compartment)
1581
- .value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
1582
- .rename("N")
1583
- .reset_index()
1584
- .sort_values("N", ascending=False)
1585
- .groupby(SBML_DFS.SC_ID)
1586
- .first()
1587
- .reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
1588
- )
1589
- logger.info(
1590
- f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
1591
- )
1592
-
1593
- # define where a reaction is most likely to occur based on the compartmentalization of its particpants
1594
- species_with_unknown_compartmentalization = set(
1595
- missing_compartment_scids
1596
- ).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
1597
- if len(species_with_unknown_compartmentalization) != 0:
1598
- logger.warning(
1599
- f"{len(species_with_unknown_compartmentalization)} "
1600
- "species compartmentalization could not be inferred"
1601
- " from other reaction particpants. Their compartmentalization "
1602
- f"will be set to the default of {default_compartment}"
1603
- )
1604
-
1605
- inferred_compartmentalization = pd.concat(
1606
- [
1607
- inferred_compartmentalization,
1608
- pd.DataFrame(
1609
- {SBML_DFS.SC_ID: list(species_with_unknown_compartmentalization)}
1610
- ).assign(c_id=default_compartment),
1611
- ]
1612
- )
1613
-
1614
- if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
1615
- raise ValueError(
1616
- f"{inferred_compartmentalization.shape[0]} were inferred but {len(missing_compartment_scids)} are required"
1617
- )
1618
-
1619
- updated_compartmentalized_species = pd.concat(
1620
- [
1621
- sbml_dfs.compartmentalized_species[
1622
- ~sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
1623
- ],
1624
- sbml_dfs.compartmentalized_species[
1625
- sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
1626
- ]
1627
- .drop(SBML_DFS.C_ID, axis=1)
1628
- .merge(
1629
- inferred_compartmentalization, left_index=True, right_on=SBML_DFS.SC_ID
1630
- )
1631
- .set_index(SBML_DFS.SC_ID),
1632
- ]
1633
- )
1634
-
1635
- if (
1636
- updated_compartmentalized_species.shape[0]
1637
- != sbml_dfs.compartmentalized_species.shape[0]
1638
- ):
1639
- raise ValueError(
1640
- f"Trying to overwrite {sbml_dfs.compartmentalized_species.shape[0]}"
1641
- " compartmentalized species with "
1642
- f"{updated_compartmentalized_species.shape[0]}"
1643
- )
1644
-
1645
- if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
1646
- raise ValueError("Some species compartments are still missing")
1647
-
1648
- sbml_dfs.compartmentalized_species = updated_compartmentalized_species
1649
-
1650
- return sbml_dfs
1651
-
1652
-
1653
- def infer_sbo_terms(sbml_dfs: SBML_dfs) -> SBML_dfs:
1654
- """
1655
- Infer SBO Terms
1656
-
1657
- Define SBO terms based on stoichiometry for reaction_species with missing terms
1658
-
1659
- Parameters:
1660
- ----------
1661
- sbml_dfs: sbml.SBML_dfs
1662
- A relational pathway model
1663
-
1664
- Returns:
1665
- ----------
1666
- sbml_dfs: sbml.SBML_dfs
1667
- A relational pathway model (with missing/invalid reaction species sbo_terms resolved)
1668
-
1669
- """
1670
-
1671
- valid_sbo_terms = sbml_dfs.reaction_species[
1672
- sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
1673
- ]
1674
-
1675
- invalid_sbo_terms = sbml_dfs.reaction_species[
1676
- ~sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
1677
- ]
1678
-
1679
- if not all(sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].notnull()):
1680
- raise ValueError(
1681
- "All sbml_dfs.reaction_species[SBML_DFS.SBO_TERM] must be not null"
1682
- )
1683
- if invalid_sbo_terms.shape[0] == 0:
1684
- logger.info("All sbo_terms were valid; returning input sbml_dfs")
1685
- return sbml_dfs
1686
-
1687
- logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")
1688
-
1689
- # add missing/invalid terms based on stoichiometry
1690
- invalid_sbo_terms.loc[
1691
- invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] < 0, SBML_DFS.SBO_TERM
1692
- ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
1693
-
1694
- invalid_sbo_terms.loc[
1695
- invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
1696
- ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
1697
-
1698
- invalid_sbo_terms.loc[
1699
- invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] == 0, SBML_DFS.SBO_TERM
1700
- ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]
1701
-
1702
- updated_reaction_species = pd.concat(
1703
- [valid_sbo_terms, invalid_sbo_terms]
1704
- ).sort_index()
1705
-
1706
- if sbml_dfs.reaction_species.shape[0] != updated_reaction_species.shape[0]:
1707
- raise ValueError(
1708
- f"Trying to overwrite {sbml_dfs.reaction_species.shape[0]} reaction_species with {updated_reaction_species.shape[0]}"
1709
- )
1710
- sbml_dfs.reaction_species = updated_reaction_species
1711
-
1712
- return sbml_dfs
1713
-
1714
-
1715
- def name_compartmentalized_species(sbml_dfs):
1716
- """
1717
- Name Compartmentalized Species
1718
-
1719
- Rename compartmentalized species if they have the same
1720
- name as their species
1721
-
1722
- Parameters
1723
- ----------
1724
- sbml_dfs : SBML_dfs
1725
- A model formed by aggregating pathways
1726
-
1727
- Returns:
1728
- ----------
1729
- sbml_dfs
1730
- """
1731
-
1732
- augmented_cspecies = sbml_dfs.compartmentalized_species.merge(
1733
- sbml_dfs.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
1734
- ).merge(
1735
- sbml_dfs.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
1736
- )
1737
- augmented_cspecies[SBML_DFS.SC_NAME] = [
1738
- f"{s} [{c}]" if sc == s else sc
1739
- for sc, c, s in zip(
1740
- augmented_cspecies[SBML_DFS.SC_NAME],
1741
- augmented_cspecies[SBML_DFS.C_NAME],
1742
- augmented_cspecies[SBML_DFS.S_NAME],
1743
- )
1744
- ]
1745
-
1746
- sbml_dfs.compartmentalized_species = augmented_cspecies.loc[
1747
- :, sbml_dfs.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
1748
- ]
1749
-
1750
- return sbml_dfs
1751
-
1752
-
1753
- def export_sbml_dfs(
1754
- model_prefix: str,
1755
- sbml_dfs: SBML_dfs,
1756
- outdir: str,
1757
- overwrite: bool = False,
1758
- dogmatic: bool = True,
1759
- ) -> None:
1760
- """
1761
- Export SBML_dfs
1762
-
1763
- Export summaries of species identifiers and each table underlying
1764
- an SBML_dfs pathway model
1765
-
1766
- Params
1767
- ------
1768
- model_prefix: str
1769
- Label to prepend to all exported files
1770
- sbml_dfs: sbml.SBML_dfs
1771
- A pathway model
1772
- outdir: str
1773
- Path to an existing directory where results should be saved
1774
- overwrite: bool
1775
- Should the directory be overwritten if it already exists?
1776
- dogmatic: bool
1777
- If True then treat genes, transcript, and proteins as separate species. If False
1778
- then treat them interchangeably.
1258
+ species_data_table : str
1259
+ Name of the species data table to select
1779
1260
 
1780
1261
  Returns
1781
1262
  -------
1782
- None
1783
-
1784
- """
1785
-
1786
- if not isinstance(model_prefix, str):
1787
- raise TypeError(f"model_prefix was a {type(model_prefix)} " "and must be a str")
1788
- if not isinstance(sbml_dfs, SBML_dfs):
1789
- raise TypeError(
1790
- f"sbml_dfs was a {type(sbml_dfs)} and must" " be an sbml.SBML_dfs"
1791
- )
1792
-
1793
- # filter to identifiers which make sense when mapping from ids -> species
1794
- species_identifiers = sbml_dfs_utils.get_characteristic_species_ids(
1795
- sbml_dfs,
1796
- dogmatic=dogmatic,
1797
- )
1798
-
1799
- try:
1800
- utils.initialize_dir(outdir, overwrite=overwrite)
1801
- except FileExistsError:
1802
- logger.warning(
1803
- f"Directory {outdir} already exists and overwrite is False. "
1804
- "Files will be added to the existing directory."
1805
- )
1806
- with open_fs(outdir, writeable=True) as fs:
1807
- species_identifiers_path = (
1808
- model_prefix + CPR_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
1809
- )
1810
- with fs.openbin(species_identifiers_path, "w") as f:
1811
- species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
1812
- f, sep="\t", index=False
1813
- )
1814
-
1815
- # export jsons
1816
- species_path = model_prefix + CPR_STANDARD_OUTPUTS.SPECIES
1817
- reactions_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTIONS
1818
- reation_species_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTION_SPECIES
1819
- compartments_path = model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTS
1820
- compartmentalized_species_path = (
1821
- model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
1822
- )
1823
- with fs.openbin(species_path, "w") as f:
1824
- sbml_dfs.species[[SBML_DFS.S_NAME]].to_json(f)
1825
-
1826
- with fs.openbin(reactions_path, "w") as f:
1827
- sbml_dfs.reactions[[SBML_DFS.R_NAME]].to_json(f)
1828
-
1829
- with fs.openbin(reation_species_path, "w") as f:
1830
- sbml_dfs.reaction_species.to_json(f)
1831
-
1832
- with fs.openbin(compartments_path, "w") as f:
1833
- sbml_dfs.compartments[[SBML_DFS.C_NAME]].to_json(f)
1834
-
1835
- with fs.openbin(compartmentalized_species_path, "w") as f:
1836
- sbml_dfs.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
1837
- f
1838
- )
1839
-
1840
- return None
1841
-
1842
-
1843
- def sbml_dfs_from_edgelist(
1844
- interaction_edgelist: pd.DataFrame,
1845
- species_df: pd.DataFrame,
1846
- compartments_df: pd.DataFrame,
1847
- interaction_source: source.Source,
1848
- upstream_stoichiometry: int = 0,
1849
- downstream_stoichiometry: int = 1,
1850
- downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
1851
- keep_species_data: bool | str = False,
1852
- keep_reactions_data: bool | str = False,
1853
- ) -> SBML_dfs:
1854
- """
1855
- Create SBML_dfs from interaction edgelist.
1856
-
1857
- Combines a set of molecular interactions into a mechanistic SBML_dfs model
1858
- by processing interaction data, species information, and compartment definitions.
1859
-
1860
- Parameters
1861
- ----------
1862
- interaction_edgelist : pd.DataFrame
1863
- Table containing molecular interactions with columns:
1864
- - upstream_name : str, matches "s_name" from species_df
1865
- - downstream_name : str, matches "s_name" from species_df
1866
- - upstream_compartment : str, matches "c_name" from compartments_df
1867
- - downstream_compartment : str, matches "c_name" from compartments_df
1868
- - r_name : str, name for the interaction
1869
- - sbo_term : str, SBO term defining interaction type
1870
- - r_Identifiers : identifiers.Identifiers, supporting identifiers
1871
- - r_isreversible : bool, whether reaction is reversible
1872
- species_df : pd.DataFrame
1873
- Table defining molecular species with columns:
1874
- - s_name : str, name of molecular species
1875
- - s_Identifiers : identifiers.Identifiers, species identifiers
1876
- compartments_df : pd.DataFrame
1877
- Table defining compartments with columns:
1878
- - c_name : str, name of compartment
1879
- - c_Identifiers : identifiers.Identifiers, compartment identifiers
1880
- interaction_source : source.Source
1881
- Source object linking model entities to interaction source
1882
- upstream_stoichiometry : int, default 0
1883
- Stoichiometry of upstream species in reactions
1884
- downstream_stoichiometry : int, default 1
1885
- Stoichiometry of downstream species in reactions
1886
- downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
1887
- SBO term for downstream reactant type
1888
- keep_species_data : bool or str, default False
1889
- Whether to preserve extra species columns. If True, saves as 'source' label.
1890
- If string, uses as custom label. If False, discards extra data.
1891
- keep_reactions_data : bool or str, default False
1892
- Whether to preserve extra reaction columns. If True, saves as 'source' label.
1893
- If string, uses as custom label. If False, discards extra data.
1894
-
1895
- Returns
1896
- -------
1897
- SBML_dfs
1898
- Validated SBML data structure containing compartments, species,
1899
- compartmentalized species, reactions, and reaction species tables.
1900
- """
1901
- # 1. Validate inputs
1902
- _edgelist_validate_inputs(interaction_edgelist, species_df, compartments_df)
1903
-
1904
- # 2. Identify which extra columns to preserve
1905
- extra_columns = _edgelist_identify_extra_columns(
1906
- interaction_edgelist, species_df, keep_reactions_data, keep_species_data
1907
- )
1908
-
1909
- # 3. Process compartments and species tables
1910
- processed_compartments = _edgelist_process_compartments(
1911
- compartments_df, interaction_source
1912
- )
1913
- processed_species, species_data = _edgelist_process_species(
1914
- species_df, interaction_source, extra_columns["species"]
1915
- )
1916
-
1917
- # 4. Create compartmentalized species
1918
- comp_species = _edgelist_create_compartmentalized_species(
1919
- interaction_edgelist,
1920
- processed_species,
1921
- processed_compartments,
1922
- interaction_source,
1923
- )
1924
-
1925
- # 5. Create reactions and reaction species
1926
- reactions, reaction_species, reactions_data = (
1927
- _edgelist_create_reactions_and_species(
1928
- interaction_edgelist,
1929
- comp_species,
1930
- processed_species,
1931
- processed_compartments,
1932
- interaction_source,
1933
- upstream_stoichiometry,
1934
- downstream_stoichiometry,
1935
- downstream_sbo_name,
1936
- extra_columns["reactions"],
1937
- )
1938
- )
1939
-
1940
- # 6. Assemble final SBML_dfs object
1941
- sbml_model = _edgelist_assemble_sbml_model(
1942
- processed_compartments,
1943
- processed_species,
1944
- comp_species,
1945
- reactions,
1946
- reaction_species,
1947
- species_data,
1948
- reactions_data,
1949
- keep_species_data,
1950
- keep_reactions_data,
1951
- extra_columns,
1952
- )
1953
-
1954
- return sbml_model
1955
-
1956
- return sbml_model
1957
-
1958
-
1959
- def species_type_types(x):
1960
- """Assign a high-level molecule type to a molecular species"""
1961
-
1962
- if isinstance(x, identifiers.Identifiers):
1963
- if x.filter(["chebi"]):
1964
- return "metabolite"
1965
- elif x.filter(["molodex"]):
1966
- return "drug"
1967
- else:
1968
- return "protein"
1969
- else:
1970
- return "unknown"
1971
-
1972
-
1973
- def stub_ids(ids):
1974
- if len(ids) == 0:
1975
- return pd.DataFrame(
1976
- {
1977
- IDENTIFIERS.ONTOLOGY: [None],
1978
- IDENTIFIERS.IDENTIFIER: [None],
1979
- IDENTIFIERS.URL: [None],
1980
- IDENTIFIERS.BQB: [None],
1981
- }
1982
- )
1983
- else:
1984
- return pd.DataFrame(ids)
1985
-
1263
+ pd.DataFrame
1264
+ The selected species data table
1986
1265
 
1987
- def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
1988
- """
1989
- Add an sbo_role column to the reaction_species table.
1266
+ Raises
1267
+ ------
1268
+ ValueError
1269
+ If species_data_table is not found
1270
+ """
1271
+ # Check if species_data_table exists in sbml_dfs.species_data
1272
+ if species_data_table not in self.species_data:
1273
+ raise ValueError(
1274
+ f"species_data_table {species_data_table} not found in sbml_dfs.species_data. "
1275
+ f"Available tables: {self.species_data.keys()}"
1276
+ )
1990
1277
 
1991
- The sbo_role column is a string column that contains the SBO role of the reaction species.
1992
- The values in the sbo_role column are taken from the sbo_term column.
1278
+ # Get the species data
1279
+ return self.species_data[species_data_table]
1993
1280
 
1994
- The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
1995
- """
1281
+ def species_status(self, s_id: str) -> pd.DataFrame:
1282
+ """
1283
+ Species Status
1996
1284
 
1997
- validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
1285
+ Return all of the reactions a species participates in.
1998
1286
 
1999
- reaction_species = (
2000
- reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
2001
- .replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
2002
- .replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
2003
- )
1287
+ Parameters:
1288
+ s_id: str
1289
+ A species ID
2004
1290
 
2005
- undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
2006
- SBO_NAME_TO_ROLE.values()
2007
- )
2008
- if len(undefined_roles) > 0:
2009
- logger.warning(
2010
- f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
2011
- )
2012
- mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
2013
- reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
1291
+ Returns:
1292
+ pd.DataFrame, one row per reaction the species participates in
1293
+ with columns:
1294
+ - sc_name: str, name of the compartment the species participates in
1295
+ - stoichiometry: float, stoichiometry of the species in the reaction
1296
+ - r_name: str, name of the reaction
1297
+ - r_formula_str: str, human-readable formula of the reaction
1298
+ """
2014
1299
 
2015
- return reaction_species
1300
+ if s_id not in self.species.index:
1301
+ raise ValueError(f"{s_id} not found in species table")
2016
1302
 
1303
+ matching_species = self.species.loc[s_id]
2017
1304
 
2018
- def find_underspecified_reactions(
2019
- reaction_species_w_roles: pd.DataFrame,
2020
- ) -> pd.DataFrame:
1305
+ if not isinstance(matching_species, pd.Series):
1306
+ raise ValueError(f"{s_id} did not match a single species")
2021
1307
 
2022
- # check that both sbo_role and "new" are present
2023
- if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
2024
- raise ValueError(
2025
- "The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
2026
- )
2027
- if "new" not in reaction_species_w_roles.columns:
2028
- raise ValueError(
2029
- "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2030
- )
2031
- # check that new is a boolean column
2032
- if reaction_species_w_roles["new"].dtype != bool:
2033
- raise ValueError(
2034
- "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2035
- )
1308
+ # find all rxns species participate in
1309
+ matching_compartmentalized_species = self.compartmentalized_species[
1310
+ self.compartmentalized_species.s_id.isin([s_id])
1311
+ ]
2036
1312
 
2037
- reactions_with_lost_defining_members = set(
2038
- reaction_species_w_roles.query("~new")
2039
- .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
2040
- .tolist()
2041
- )
1313
+ rxns_participating = self.reaction_species[
1314
+ self.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
1315
+ ]
2042
1316
 
2043
- N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
2044
- if N_reactions_with_lost_defining_members > 0:
2045
- logger.info(
2046
- f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
1317
+ # find all participants in these rxns
1318
+ full_rxns_participating = self.reaction_species[
1319
+ self.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
1320
+ ].merge(
1321
+ self.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
2047
1322
  )
2048
1323
 
2049
- # find the cases where all "new" values for a given (r_id, sbo_term) are False
2050
- reactions_with_lost_requirements = set(
2051
- reaction_species_w_roles
2052
- # drop already filtered reactions
2053
- .query("r_id not in @reactions_with_lost_defining_members")
2054
- .query("sbo_role == 'REQUIRED'")
2055
- # which entries which have some required attribute have all False values for that attribute
2056
- .groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
2057
- .agg({"new": "any"})
2058
- .query("new == False")
2059
- .index.get_level_values(SBML_DFS.R_ID)
2060
- )
1324
+ participating_rids = full_rxns_participating[SBML_DFS.R_ID].unique()
1325
+ reaction_descriptions = self.reaction_summaries(r_ids=participating_rids)
2061
1326
 
2062
- N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
2063
- if N_reactions_with_lost_requirements > 0:
2064
- logger.info(
2065
- f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
1327
+ status = (
1328
+ full_rxns_participating.loc[
1329
+ full_rxns_participating[SBML_DFS.SC_ID].isin(
1330
+ matching_compartmentalized_species.index.values.tolist()
1331
+ ),
1332
+ [SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
1333
+ ]
1334
+ .merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
1335
+ .reset_index(drop=True)
1336
+ .drop(SBML_DFS.R_ID, axis=1)
2066
1337
  )
2067
1338
 
2068
- underspecified_reactions = reactions_with_lost_defining_members.union(
2069
- reactions_with_lost_requirements
2070
- )
1339
+ return status
2071
1340
 
2072
- return underspecified_reactions
1341
+ def validate(self):
1342
+ """
1343
+ Validate the SBML_dfs structure and relationships.
2073
1344
 
1345
+ Checks:
1346
+ - Schema existence
1347
+ - Required tables presence
1348
+ - Individual table structure
1349
+ - Primary key uniqueness
1350
+ - Foreign key relationships
1351
+ - Optional data table validity
1352
+ - Reaction species validity
2074
1353
 
2075
- def _find_underspecified_reactions_by_scids(
2076
- sbml_dfs: SBML_dfs, sc_ids: Iterable[str]
2077
- ) -> set[str]:
2078
- """
2079
- Find Underspecified reactions
1354
+ Raises
1355
+ ------
1356
+ ValueError
1357
+ If any validation check fails
1358
+ """
2080
1359
 
2081
- Identity reactions which should be removed if a set of molecular species are removed
2082
- from the system.
1360
+ if not hasattr(self, "schema"):
1361
+ raise ValueError("No schema found")
2083
1362
 
2084
- Params:
2085
- sbml_dfs (SBML_dfs):
2086
- A pathway representation
2087
- sc_ids (list[str])
2088
- A list of compartmentalized species ids (sc_ids) which will be removed.
1363
+ required_tables = self._required_entities
1364
+ schema_tables = set(self.schema.keys())
2089
1365
 
2090
- Returns:
2091
- underspecified_reactions (set[str]):
2092
- A list of reactions which should be removed because they will not occur once
2093
- \"sc_ids\" are removed.
1366
+ extra_tables = schema_tables.difference(required_tables)
1367
+ if len(extra_tables) != 0:
1368
+ logger.debug(
1369
+ f"{len(extra_tables)} unexpected tables found: "
1370
+ f"{', '.join(extra_tables)}"
1371
+ )
2094
1372
 
2095
- """
1373
+ missing_tables = required_tables.difference(schema_tables)
1374
+ if len(missing_tables) != 0:
1375
+ raise ValueError(
1376
+ f"Missing {len(missing_tables)} required tables: "
1377
+ f"{', '.join(missing_tables)}"
1378
+ )
2096
1379
 
2097
- updated_reaction_species = sbml_dfs.reaction_species.copy()
2098
- updated_reaction_species["new"] = ~updated_reaction_species[SBML_DFS.SC_ID].isin(
2099
- sc_ids
2100
- )
1380
+ # check individual tables
1381
+ for table in required_tables:
1382
+ self._validate_table(table)
2101
1383
 
2102
- updated_reaction_species = add_sbo_role(updated_reaction_species)
2103
- underspecified_reactions = find_underspecified_reactions(updated_reaction_species)
1384
+ # check whether pks and fks agree
1385
+ self._check_pk_fk_correspondence()
2104
1386
 
2105
- return underspecified_reactions
1387
+ # check optional data tables:
1388
+ for k, v in self.species_data.items():
1389
+ try:
1390
+ self._validate_species_data(v)
1391
+ except ValueError as e:
1392
+ raise ValueError(f"species data {k} was invalid.") from e
2106
1393
 
1394
+ for k, v in self.reactions_data.items():
1395
+ try:
1396
+ self._validate_reactions_data(v)
1397
+ except ValueError as e:
1398
+ raise ValueError(f"reactions data {k} was invalid.") from e
2107
1399
 
2108
- def validate_sbml_dfs_table(table_data: pd.DataFrame, table_name: str) -> None:
2109
- """
2110
- Validate a standalone table against the SBML_dfs schema.
1400
+ # validate reaction_species sbo_terms and stoi
1401
+ self._validate_reaction_species()
2111
1402
 
2112
- This function validates a table against the schema defined in SBML_DFS_SCHEMA,
2113
- without requiring an SBML_dfs object. Useful for validating tables before
2114
- creating an SBML_dfs object.
1403
+ def validate_and_resolve(self):
1404
+ """
1405
+ Validate and attempt to automatically fix common issues.
2115
1406
 
2116
- Parameters
2117
- ----------
2118
- table_data : pd.DataFrame
2119
- The table to validate
2120
- table_name : str
2121
- Name of the table in the SBML_dfs schema
1407
+ This method iteratively:
1408
+ 1. Attempts validation
1409
+ 2. If validation fails, tries to resolve the issue
1410
+ 3. Repeats until validation passes or issue cannot be resolved
2122
1411
 
2123
1412
  Raises
2124
1413
  ------
2125
1414
  ValueError
2126
- If table_name is not in schema or validation fails
2127
- """
2128
- if table_name not in SBML_DFS_SCHEMA.SCHEMA:
2129
- raise ValueError(
2130
- f"{table_name} is not a valid table name in SBML_DFS_SCHEMA. "
2131
- f"Valid tables are: {', '.join(SBML_DFS_SCHEMA.SCHEMA.keys())}"
2132
- )
1415
+ If validation fails and cannot be automatically resolved
1416
+ """
2133
1417
 
2134
- table_schema = SBML_DFS_SCHEMA.SCHEMA[table_name]
2135
- _perform_sbml_dfs_table_validation(table_data, table_schema, table_name)
1418
+ current_exception = None
1419
+ validated = False
2136
1420
 
1421
+ while not validated:
1422
+ try:
1423
+ self.validate()
1424
+ validated = True
1425
+ except Exception as e:
1426
+ e_str = str(e)
1427
+ if e_str == current_exception:
1428
+ logger.warning(
1429
+ "Automated resolution of an Exception was attempted but failed"
1430
+ )
1431
+ raise e
2137
1432
 
2138
- def _perform_sbml_dfs_table_validation(
2139
- table_data: pd.DataFrame,
2140
- table_schema: dict,
2141
- table_name: str,
2142
- ) -> None:
2143
- """
2144
- Core validation logic for SBML_dfs tables.
1433
+ # try to resolve
1434
+ self._attempt_resolve(e)
2145
1435
 
2146
- This function performs the actual validation checks for any table against its schema,
2147
- regardless of whether it's part of an SBML_dfs object or standalone.
1436
+ # =============================================================================
1437
+ # PRIVATE METHODS (ALPHABETICAL ORDER)
1438
+ # =============================================================================
2148
1439
 
2149
- Parameters
2150
- ----------
2151
- table_data : pd.DataFrame
2152
- The table data to validate
2153
- table_schema : dict
2154
- Schema definition for the table
2155
- table_name : str
2156
- Name of the table (for error messages)
1440
+ def _attempt_resolve(self, e):
1441
+ str_e = str(e)
1442
+ if str_e == "compartmentalized_species included missing c_id values":
1443
+ logger.warning(str_e)
1444
+ logger.warning(
1445
+ "Attempting to resolve with infer_uncompartmentalized_species_location()"
1446
+ )
1447
+ self.infer_uncompartmentalized_species_location()
1448
+ elif re.search("sbo_terms were not defined", str_e):
1449
+ logger.warning(str_e)
1450
+ logger.warning("Attempting to resolve with infer_sbo_terms()")
1451
+ self.infer_sbo_terms()
1452
+ else:
1453
+ logger.warning(
1454
+ "An error occurred which could not be automatically resolved"
1455
+ )
1456
+ raise e
2157
1457
 
2158
- Raises
2159
- ------
2160
- ValueError
2161
- If the table does not conform to its schema:
2162
- - Not a DataFrame
2163
- - Wrong index name
2164
- - Duplicate primary keys
2165
- - Missing required variables
2166
- - Empty table
2167
- """
2168
- if not isinstance(table_data, pd.DataFrame):
2169
- raise ValueError(
2170
- f"{table_name} must be a pd.DataFrame, but was a {type(table_data)}"
1458
+ def _check_pk_fk_correspondence(self):
1459
+ """
1460
+ Check whether primary keys and foreign keys agree for all tables in the schema.
1461
+ Raises ValueError if any correspondence fails.
1462
+ """
1463
+
1464
+ pk_df = pd.DataFrame(
1465
+ [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
2171
1466
  )
2172
1467
 
2173
- # check index
2174
- expected_index_name = table_schema["pk"]
2175
- if table_data.index.name != expected_index_name:
2176
- raise ValueError(
2177
- f"the index name for {table_name} was not the pk: {expected_index_name}"
1468
+ fk_df = (
1469
+ pd.DataFrame(
1470
+ [
1471
+ {"fk_table": k, "fk": v["fk"]}
1472
+ for k, v in self.schema.items()
1473
+ if "fk" in v.keys()
1474
+ ]
1475
+ )
1476
+ .set_index("fk_table")["fk"]
1477
+ .apply(pd.Series)
1478
+ .reset_index()
1479
+ .melt(id_vars="fk_table")
1480
+ .drop(["variable"], axis=1)
1481
+ .rename(columns={"value": "key"})
2178
1482
  )
2179
1483
 
2180
- # check that all entries in the index are unique
2181
- if len(set(table_data.index.tolist())) != table_data.shape[0]:
2182
- duplicated_pks = table_data.index.value_counts()
2183
- duplicated_pks = duplicated_pks[duplicated_pks > 1]
1484
+ pk_fk_correspondences = pk_df.merge(fk_df)
2184
1485
 
2185
- example_duplicates = duplicated_pks.index[0 : min(duplicated_pks.shape[0], 5)]
2186
- raise ValueError(
2187
- f"{duplicated_pks.shape[0]} primary keys were duplicated "
2188
- f"including {', '.join(example_duplicates)}"
2189
- )
1486
+ for i in range(0, pk_fk_correspondences.shape[0]):
1487
+ pk_table_keys = set(
1488
+ getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
1489
+ )
1490
+ if None in pk_table_keys:
1491
+ raise ValueError(
1492
+ f"{pk_fk_correspondences['pk_table'][i]} had "
1493
+ "missing values in its index"
1494
+ )
2190
1495
 
2191
- # check variables
2192
- expected_vars = set(table_schema["vars"])
2193
- table_vars = set(list(table_data.columns))
1496
+ fk_table_keys = set(
1497
+ getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
1498
+ :, pk_fk_correspondences["key"][i]
1499
+ ]
1500
+ )
1501
+ if None in fk_table_keys:
1502
+ raise ValueError(
1503
+ f"{pk_fk_correspondences['fk_table'][i]} included "
1504
+ f"missing {pk_fk_correspondences['key'][i]} values"
1505
+ )
2194
1506
 
2195
- extra_vars = table_vars.difference(expected_vars)
2196
- if len(extra_vars) != 0:
2197
- logger.debug(
2198
- f"{len(extra_vars)} extra variables were found for {table_name}: "
2199
- f"{', '.join(extra_vars)}"
2200
- )
1507
+ # all foreign keys need to match a primary key
1508
+ extra_fks = fk_table_keys.difference(pk_table_keys)
1509
+ if len(extra_fks) != 0:
1510
+ raise ValueError(
1511
+ f"{len(extra_fks)} distinct "
1512
+ f"{pk_fk_correspondences['key'][i]} values were"
1513
+ f" found in {pk_fk_correspondences['fk_table'][i]} "
1514
+ f"but missing from {pk_fk_correspondences['pk_table'][i]}."
1515
+ " All foreign keys must have a matching primary key.\n\n"
1516
+ f"Extra key are: {', '.join(extra_fks)}"
1517
+ )
2201
1518
 
2202
- missing_vars = expected_vars.difference(table_vars)
2203
- if len(missing_vars) != 0:
2204
- raise ValueError(
2205
- f"Missing {len(missing_vars)} required variables for {table_name}: "
2206
- f"{', '.join(missing_vars)}"
2207
- )
1519
+ def _find_underspecified_reactions_by_scids(
1520
+ self, sc_ids: Iterable[str]
1521
+ ) -> set[str]:
1522
+ """
1523
+ Find Underspecified reactions
2208
1524
 
2209
- # check for empty table
2210
- if table_data.shape[0] == 0:
2211
- raise ValueError(f"{table_name} contained no entries")
1525
+ Identify reactions which should be removed if a set of molecular species are removed
1526
+ from the system.
2212
1527
 
1528
+ Parameters
1529
+ ----------
1530
+ sc_ids : list[str]
1531
+ A list of compartmentalized species ids (sc_ids) which will be removed.
2213
1532
 
2214
- def _filter_promiscuous_components(
2215
- bqb_has_parts_species: pd.DataFrame, max_promiscuity: int
2216
- ) -> pd.DataFrame:
1533
+ Returns
1534
+ -------
1535
+ underspecified_reactions : set[str]
1536
+ A set of reactions which should be removed because they will not occur once
1537
+ "sc_ids" are removed.
1538
+ """
1539
+ updated_reaction_species = self.reaction_species.copy()
1540
+ updated_reaction_species["new"] = ~updated_reaction_species[
1541
+ SBML_DFS.SC_ID
1542
+ ].isin(sc_ids)
1543
+ updated_reaction_species = sbml_dfs_utils.add_sbo_role(updated_reaction_species)
1544
+ underspecified_reactions = sbml_dfs_utils.find_underspecified_reactions(
1545
+ updated_reaction_species
1546
+ )
1547
+ return underspecified_reactions
2217
1548
 
2218
- # number of complexes a species is part of
2219
- n_complexes_involvedin = bqb_has_parts_species.value_counts(
2220
- [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
2221
- )
2222
- promiscuous_component_identifiers_index = n_complexes_involvedin[
2223
- n_complexes_involvedin > max_promiscuity
2224
- ].index
2225
- promiscuous_component_identifiers = pd.Series(
2226
- data=[True] * len(promiscuous_component_identifiers_index),
2227
- index=promiscuous_component_identifiers_index,
2228
- name="is_shared_component",
2229
- dtype=bool,
2230
- )
1549
+ def _get_unused_cspecies(self) -> set[str]:
1550
+ """Returns a set of compartmentalized species
1551
+ that are not part of any reactions"""
1552
+ sc_ids = set(self.compartmentalized_species.index) - set(
1553
+ self.reaction_species[SBML_DFS.SC_ID]
1554
+ )
1555
+ return sc_ids # type: ignore
2231
1556
 
2232
- if len(promiscuous_component_identifiers) == 0:
2233
- return bqb_has_parts_species
1557
+ def _get_unused_species(self) -> set[str]:
1558
+ """Returns a list of species that are not part of any reactions"""
1559
+ s_ids = set(self.species.index) - set(
1560
+ self.compartmentalized_species[SBML_DFS.S_ID]
1561
+ )
1562
+ return s_ids # type: ignore
2234
1563
 
2235
- filtered_bqb_has_parts = bqb_has_parts_species.merge(
2236
- promiscuous_component_identifiers,
2237
- left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER],
2238
- right_index=True,
2239
- how="left",
2240
- )
1564
+ def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
1565
+ """Removes compartmentalized species from the model
2241
1566
 
2242
- filtered_bqb_has_parts["is_shared_component"] = (
2243
- filtered_bqb_has_parts["is_shared_component"].astype("boolean").fillna(False)
2244
- )
2245
- # drop identifiers shared as components across many species
2246
- filtered_bqb_has_parts = filtered_bqb_has_parts[
2247
- ~filtered_bqb_has_parts["is_shared_component"]
2248
- ].drop(["is_shared_component"], axis=1)
1567
+ This should not be directly used by the user, as it can lead to
1568
+ invalid reactions when removing species without a logic to decide
1569
+ if the reaction needs to be removed as well.
2249
1570
 
2250
- return filtered_bqb_has_parts
1571
+ Args:
1572
+ sc_ids (Iterable[str]): the compartmentalized species to remove
1573
+ """
1574
+ # Remove compartmentalized species
1575
+ self.compartmentalized_species = self.compartmentalized_species.drop(
1576
+ index=list(sc_ids)
1577
+ )
1578
+ # remove corresponding reactions_species
1579
+ self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")
2251
1580
 
1581
+ def _remove_entity_data(self, entity_type: str, label: str) -> None:
1582
+ """
1583
+ Remove data from species_data or reactions_data by table name and label.
2252
1584
 
2253
- def _edgelist_validate_inputs(
2254
- interaction_edgelist: pd.DataFrame,
2255
- species_df: pd.DataFrame,
2256
- compartments_df: pd.DataFrame,
2257
- ) -> None:
2258
- """
2259
- Validate input DataFrames have required columns.
1585
+ Parameters
1586
+ ----------
1587
+ entity_type : str
1588
+ Name of the table to remove data from ('species' or 'reactions')
1589
+ label : str
1590
+ Label of the data to remove
2260
1591
 
2261
- Parameters
2262
- ----------
2263
- interaction_edgelist : pd.DataFrame
2264
- Interaction data to validate
2265
- species_df : pd.DataFrame
2266
- Species data to validate
2267
- compartments_df : pd.DataFrame
2268
- Compartments data to validate
2269
- """
1592
+ Notes
1593
+ -----
1594
+ If the label does not exist, a warning will be logged that includes the existing labels.
1595
+ """
1596
+ if entity_type not in ENTITIES_W_DATA:
1597
+ raise ValueError("table_name must be either 'species' or 'reactions'")
2270
1598
 
2271
- # check compartments
2272
- compartments_df_expected_vars = {SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS}
2273
- compartments_df_columns = set(compartments_df.columns.tolist())
2274
- missing_required_fields = compartments_df_expected_vars.difference(
2275
- compartments_df_columns
2276
- )
2277
- if len(missing_required_fields) > 0:
2278
- raise ValueError(
2279
- f"{', '.join(missing_required_fields)} are required variables"
2280
- ' in "compartments_df" but were not present in the input file.'
2281
- )
1599
+ data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
1600
+ if label not in data_dict:
1601
+ existing_labels = list(data_dict.keys())
1602
+ logger.warning(
1603
+ f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
1604
+ f"Existing labels: {existing_labels}"
1605
+ )
1606
+ return
2282
1607
 
2283
- # check species
2284
- species_df_expected_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
2285
- species_df_columns = set(species_df.columns.tolist())
2286
- missing_required_fields = species_df_expected_vars.difference(species_df_columns)
2287
- if len(missing_required_fields) > 0:
2288
- raise ValueError(
2289
- f"{', '.join(missing_required_fields)} are required"
2290
- ' variables in "species_df" but were not present '
2291
- "in the input file."
2292
- )
1608
+ del data_dict[label]
2293
1609
 
2294
- # check interactions
2295
- interaction_edgelist_columns = set(interaction_edgelist.columns.tolist())
2296
- missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
2297
- interaction_edgelist_columns
2298
- )
2299
- if len(missing_required_fields) > 0:
2300
- raise ValueError(
2301
- f"{', '.join(missing_required_fields)} are required "
2302
- 'variables in "interaction_edgelist" but were not '
2303
- "present in the input file."
2304
- )
1610
+ def _remove_species(self, s_ids: Iterable[str]):
1611
+ """Removes species from the model
2305
1612
 
2306
- return None
1613
+ This should not be directly used by the user, as it can lead to
1614
+ invalid reactions when removing species without a logic to decide
1615
+ if the reaction needs to be removed as well.
2307
1616
 
1617
+ This removes the species and corresponding compartmentalized species and
1618
+ reactions_species.
2308
1619
 
2309
- def _edgelist_identify_extra_columns(
2310
- interaction_edgelist, species_df, keep_reactions_data, keep_species_data
2311
- ):
2312
- """
2313
- Identify extra columns in input data that should be preserved.
1620
+ Args:
1621
+ s_ids (Iterable[str]): the species to remove
1622
+ """
1623
+ sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
1624
+ self._remove_compartmentalized_species(sc_ids)
1625
+ # Remove species
1626
+ self.species = self.species.drop(index=list(s_ids))
1627
+ # remove data
1628
+ for k, data in self.species_data.items():
1629
+ self.species_data[k] = data.drop(index=list(s_ids))
2314
1630
 
2315
- Parameters
2316
- ----------
2317
- interaction_edgelist : pd.DataFrame
2318
- Interaction data containing potential extra columns
2319
- species_df : pd.DataFrame
2320
- Species data containing potential extra columns
2321
- keep_reactions_data : bool or str
2322
- Whether to keep extra reaction columns
2323
- keep_species_data : bool or str
2324
- Whether to keep extra species columns
1631
+ def _remove_unused_cspecies(self):
1632
+ """Removes compartmentalized species that are no
1633
+ longer part of any reactions"""
1634
+ sc_ids = self._get_unused_cspecies()
1635
+ self._remove_compartmentalized_species(sc_ids)
2325
1636
 
2326
- Returns
2327
- -------
2328
- dict
2329
- Dictionary with 'reactions' and 'species' keys containing lists of extra column names
2330
- """
2331
- extra_reactions_columns = []
2332
- extra_species_columns = []
2333
-
2334
- if keep_reactions_data is not False:
2335
- extra_reactions_columns = [
2336
- c
2337
- for c in interaction_edgelist.columns
2338
- if c not in INTERACTION_EDGELIST_EXPECTED_VARS
2339
- ]
1637
+ def _remove_unused_species(self):
1638
+ """Removes species that are no longer part of any
1639
+ compartmentalized species"""
1640
+ s_ids = self._get_unused_species()
1641
+ self._remove_species(s_ids)
2340
1642
 
2341
- if keep_species_data is not False:
2342
- extra_species_columns = [
2343
- c
2344
- for c in species_df.columns
2345
- if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
2346
- ]
1643
+ def _validate_r_ids(self, r_ids: Optional[Union[str, list[str]]]) -> list[str]:
2347
1644
 
2348
- return {"reactions": extra_reactions_columns, "species": extra_species_columns}
1645
+ if isinstance(r_ids, str):
1646
+ r_ids = [r_ids]
2349
1647
 
1648
+ if r_ids is None:
1649
+ return self.reactions.index.tolist()
1650
+ else:
1651
+ if not all(r_id in self.reactions.index for r_id in r_ids):
1652
+ raise ValueError(f"Reaction IDs {r_ids} not found in reactions table")
2350
1653
 
2351
- def _edgelist_process_compartments(compartments_df, interaction_source):
2352
- """
2353
- Format compartments DataFrame with source and ID columns.
1654
+ return r_ids
2354
1655
 
2355
- Parameters
2356
- ----------
2357
- compartments_df : pd.DataFrame
2358
- Raw compartments data
2359
- interaction_source : source.Source
2360
- Source object to assign to compartments
1656
+ def _validate_reaction_species(self):
1657
+ if not all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull()):
1658
+ raise ValueError(
1659
+ "All reaction_species[SBML_DFS.STOICHIOMETRY] must be not null"
1660
+ )
2361
1661
 
2362
- Returns
2363
- -------
2364
- pd.DataFrame
2365
- Processed compartments with IDs, indexed by compartment ID
2366
- """
2367
- compartments = compartments_df.copy()
2368
- compartments[SBML_DFS.C_SOURCE] = interaction_source
2369
- compartments[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
2370
- range(compartments.shape[0]), SBML_DFS.C_ID
2371
- )
2372
- return compartments.set_index(SBML_DFS.C_ID)[
2373
- [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
2374
- ]
1662
+ # test for null SBO terms
1663
+ n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
1664
+ if n_null_sbo_terms != 0:
1665
+ raise ValueError(
1666
+ f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
1667
+ )
2375
1668
 
1669
+ # find invalid SBO terms
1670
+ sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
1671
+ invalid_sbo_term_counts = sbo_counts[
1672
+ ~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
1673
+ ]
2376
1674
 
2377
- def _edgelist_process_species(species_df, interaction_source, extra_species_columns):
2378
- """
2379
- Format species DataFrame and extract extra data.
1675
+ if invalid_sbo_term_counts.shape[0] != 0:
1676
+ invalid_sbo_counts_str = ", ".join(
1677
+ [f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
1678
+ )
1679
+ raise ValueError(
1680
+ f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
1681
+ f"defined {invalid_sbo_counts_str}"
1682
+ )
2380
1683
 
2381
- Parameters
2382
- ----------
2383
- species_df : pd.DataFrame
2384
- Raw species data
2385
- interaction_source : source.Source
2386
- Source object to assign to species
2387
- extra_species_columns : list
2388
- Names of extra columns to preserve separately
1684
+ def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
1685
+ """Validates reactions data attribute
2389
1686
 
2390
- Returns
2391
- -------
2392
- tuple of pd.DataFrame
2393
- Processed species DataFrame and species extra data DataFrame
2394
- """
2395
- species = species_df.copy()
2396
- species[SBML_DFS.S_SOURCE] = interaction_source
2397
- species[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
2398
- range(species.shape[0]), SBML_DFS.S_ID
2399
- )
1687
+ Args:
1688
+ reactions_data_table (pd.DataFrame): a reactions data table
2400
1689
 
2401
- required_cols = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]
2402
- species_indexed = species.set_index(SBML_DFS.S_ID)[
2403
- required_cols + extra_species_columns
2404
- ]
1690
+ Raises:
1691
+ ValueError: r_id not index name
1692
+ ValueError: r_id index contains duplicates
1693
+ ValueError: r_id not in reactions table
1694
+ """
1695
+ sbml_dfs_utils._validate_matching_data(reactions_data_table, self.reactions)
2405
1696
 
2406
- # Separate extra data from main species table
2407
- species_data = species_indexed[extra_species_columns]
2408
- processed_species = species_indexed[required_cols]
1697
+ def _validate_species_data(self, species_data_table: pd.DataFrame):
1698
+ """Validates species data attribute
2409
1699
 
2410
- return processed_species, species_data
1700
+ Args:
1701
+ species_data_table (pd.DataFrame): a species data table
2411
1702
 
1703
+ Raises:
1704
+ ValueError: s_id not index name
1705
+ ValueError: s_id index contains duplicates
1706
+ ValueError: s_id not in species table
1707
+ """
1708
+ sbml_dfs_utils._validate_matching_data(species_data_table, self.species)
2412
1709
 
2413
- def _edgelist_create_compartmentalized_species(
2414
- interaction_edgelist, species_df, compartments_df, interaction_source
2415
- ):
2416
- """
2417
- Create compartmentalized species from interactions.
1710
+ def _validate_table(self, table_name: str) -> None:
1711
+ """
1712
+ Validate a table in this SBML_dfs object against its schema.
2418
1713
 
2419
- Parameters
2420
- ----------
2421
- interaction_edgelist : pd.DataFrame
2422
- Interaction data containing species-compartment combinations
2423
- species_df : pd.DataFrame
2424
- Processed species data with IDs
2425
- compartments_df : pd.DataFrame
2426
- Processed compartments data with IDs
2427
- interaction_source : source.Source
2428
- Source object to assign to compartmentalized species
1714
+ This is an internal method that validates a table that is part of this SBML_dfs
1715
+ object against the schema stored in self.schema.
2429
1716
 
2430
- Returns
2431
- -------
2432
- pd.DataFrame
2433
- Compartmentalized species with formatted names and IDs
2434
- """
2435
- # Get all distinct upstream and downstream compartmentalized species
2436
- comp_species = pd.concat(
2437
- [
2438
- interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
2439
- {
2440
- "upstream_name": SBML_DFS.S_NAME,
2441
- "upstream_compartment": SBML_DFS.C_NAME,
2442
- },
2443
- axis=1,
2444
- ),
2445
- interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
2446
- {
2447
- "downstream_name": SBML_DFS.S_NAME,
2448
- "downstream_compartment": SBML_DFS.C_NAME,
2449
- },
2450
- axis=1,
2451
- ),
2452
- ]
2453
- ).drop_duplicates()
1717
+ Parameters
1718
+ ----------
1719
+ table : str
1720
+ Name of the table to validate
2454
1721
 
2455
- # Add species and compartment IDs
2456
- comp_species_w_ids = comp_species.merge(
2457
- species_df[SBML_DFS.S_NAME].reset_index(), how="left", on=SBML_DFS.S_NAME
2458
- ).merge(
2459
- compartments_df[SBML_DFS.C_NAME].reset_index(), how="left", on=SBML_DFS.C_NAME
2460
- )
1722
+ Raises
1723
+ ------
1724
+ ValueError
1725
+ If the table does not conform to its schema
1726
+ """
1727
+ table_data = getattr(self, table_name)
2461
1728
 
2462
- # Validate merge was successful
2463
- _sbml_dfs_from_edgelist_check_cspecies_merge(comp_species_w_ids, comp_species)
1729
+ sbml_dfs_utils.validate_sbml_dfs_table(table_data, table_name)
2464
1730
 
2465
- # Format compartmentalized species with names, source, and IDs
2466
- comp_species_w_ids[SBML_DFS.SC_NAME] = [
2467
- f"{s} [{c}]"
2468
- for s, c in zip(
2469
- comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
2470
- )
2471
- ]
2472
- comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
2473
- comp_species_w_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
2474
- range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
2475
- )
2476
1731
 
2477
- return comp_species_w_ids.set_index(SBML_DFS.SC_ID)[
2478
- [SBML_DFS.SC_NAME, SBML_DFS.S_ID, SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
2479
- ]
2480
-
2481
-
2482
- def _edgelist_create_reactions_and_species(
2483
- interaction_edgelist,
2484
- comp_species,
2485
def sbml_dfs_from_edgelist(
    interaction_edgelist: pd.DataFrame,
    species_df: pd.DataFrame,
    compartments_df: pd.DataFrame,
    interaction_source: source.Source,
    upstream_stoichiometry: int = 0,
    downstream_stoichiometry: int = 1,
    downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
    keep_species_data: bool | str = False,
    keep_reactions_data: bool | str = False,
) -> SBML_dfs:
    """
    Build an SBML_dfs mechanistic model from an interaction edgelist.

    Processes a table of pairwise molecular interactions together with
    species and compartment definitions and assembles the five core
    SBML_dfs tables (compartments, species, compartmentalized species,
    reactions, reaction species).

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Molecular interactions; expected columns:
        - upstream_name : str, matches "s_name" from species_df
        - downstream_name : str, matches "s_name" from species_df
        - upstream_compartment : str, matches "c_name" from compartments_df
        - downstream_compartment : str, matches "c_name" from compartments_df
        - r_name : str, name for the interaction
        - sbo_term : str, SBO term defining interaction type
        - r_Identifiers : identifiers.Identifiers, supporting identifiers
        - r_isreversible : bool, whether reaction is reversible
    species_df : pd.DataFrame
        Molecular species; expected columns:
        - s_name : str, name of molecular species
        - s_Identifiers : identifiers.Identifiers, species identifiers
    compartments_df : pd.DataFrame
        Compartments; expected columns:
        - c_name : str, name of compartment
        - c_Identifiers : identifiers.Identifiers, compartment identifiers
    interaction_source : source.Source
        Source object linking model entities to the interaction source.
    upstream_stoichiometry : int, default 0
        Stoichiometry assigned to upstream species in each reaction.
    downstream_stoichiometry : int, default 1
        Stoichiometry assigned to downstream species in each reaction.
    downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
        SBO term name applied to downstream reactants.
    keep_species_data : bool or str, default False
        Preserve extra species columns? True saves them under the 'source'
        label, a string supplies a custom label, False discards them.
    keep_reactions_data : bool or str, default False
        Preserve extra reaction columns? True saves them under the 'source'
        label, a string supplies a custom label, False discards them.

    Returns
    -------
    SBML_dfs
        Validated SBML data structure containing compartments, species,
        compartmentalized species, reactions, and reaction species tables.
    """
    # Fail fast if the three input tables are malformed or inconsistent.
    sbml_dfs_utils._edgelist_validate_inputs(
        interaction_edgelist, species_df, compartments_df
    )

    # Determine which non-required columns should be carried along as
    # species_data / reactions_data tables.
    extra_columns = sbml_dfs_utils._edgelist_identify_extra_columns(
        interaction_edgelist, species_df, keep_reactions_data, keep_species_data
    )

    # Attach sources and primary keys to the compartment and species tables.
    compartments = sbml_dfs_utils._edgelist_process_compartments(
        compartments_df, interaction_source
    )
    species, species_data = sbml_dfs_utils._edgelist_process_species(
        species_df, interaction_source, extra_columns["species"]
    )

    # Place each species into the compartment(s) the edgelist references.
    compartmentalized_species = (
        sbml_dfs_utils._edgelist_create_compartmentalized_species(
            interaction_edgelist,
            species,
            compartments,
            interaction_source,
        )
    )

    # Convert each edge into a reaction plus its participating reaction species.
    reactions, reaction_species, reactions_data = (
        sbml_dfs_utils._edgelist_create_reactions_and_species(
            interaction_edgelist,
            compartmentalized_species,
            species,
            compartments,
            interaction_source,
            upstream_stoichiometry,
            downstream_stoichiometry,
            downstream_sbo_name,
            extra_columns["reactions"],
        )
    )

    # Stitch the core tables (plus any retained data tables) into a
    # validated SBML_dfs object.
    return _edgelist_assemble_sbml_model(
        compartments,
        species,
        compartmentalized_species,
        reactions,
        reaction_species,
        species_data,
        reactions_data,
        keep_species_data,
        keep_reactions_data,
        extra_columns,
    )
2608
1846
 
2609
1847
 
2610
1848
  def _edgelist_assemble_sbml_model(
2611
- compartments,
2612
- species,
2613
- comp_species,
2614
- reactions,
2615
- reaction_species,
1849
+ compartments: pd.DataFrame,
1850
+ species: pd.DataFrame,
1851
+ comp_species: pd.DataFrame,
1852
+ reactions: pd.DataFrame,
1853
+ reaction_species: pd.DataFrame,
2616
1854
  species_data,
2617
1855
  reactions_data,
2618
1856
  keep_species_data,
2619
1857
  keep_reactions_data,
2620
- extra_columns,
2621
- ):
1858
+ extra_columns: dict[str, list[str]],
1859
+ ) -> SBML_dfs:
2622
1860
  """
2623
1861
  Assemble the final SBML_dfs object.
2624
1862
 
@@ -2675,128 +1913,3 @@ def _edgelist_assemble_sbml_model(
2675
1913
  sbml_model.validate()
2676
1914
 
2677
1915
  return sbml_model
2678
-
2679
-
2680
def _sbml_dfs_from_edgelist_check_cspecies_merge(
    merged_species: pd.DataFrame, original_species: pd.DataFrame
) -> None:
    """Check for a mismatch between the provided species data and species implied by the edgelist."""

    n_original = original_species.shape[0]
    n_merged = merged_species.shape[0]

    # A row-count change means the name-based joins were not 1:1.
    if n_merged != n_original:
        raise ValueError(
            "Merging compartmentalized species to species_df and "
            "compartments_df by names resulted in an increase in the tables "
            f"from {n_original} to {n_merged} indicating that names were not unique"
        )

    # Compartments named in the edgelist but absent from compartments_df
    # surface as null c_id values after the merge.
    unmatched_compartments = merged_species.loc[
        merged_species[SBML_DFS.C_ID].isna(), SBML_DFS.C_NAME
    ].unique()
    if len(unmatched_compartments) > 0:
        raise ValueError(
            f"{len(unmatched_compartments)} compartments were present in "
            f'"interaction_edgelist" but not "compartments_df": '
            f"{', '.join(unmatched_compartments)}"
        )

    # Likewise, species named in the edgelist but absent from species_df
    # surface as null s_id values.
    unmatched_species = merged_species.loc[
        merged_species[SBML_DFS.S_ID].isna(), SBML_DFS.S_NAME
    ].unique()
    if len(unmatched_species) > 0:
        raise ValueError(
            f"{len(unmatched_species)} species were present in "
            f'"interaction_edgelist" but not "species_df": '
            f"{', '.join(unmatched_species)}"
        )

    return None
2717
-
2718
-
2719
def _stub_compartments(
    stubbed_compartment: str = GENERIC_COMPARTMENT,
) -> pd.DataFrame:
    """Stub Compartments

    Create a compartments table with only a single compartment.

    Args:
        stubbed_compartment (str): the name of a compartment which should match the
            keys in constants.COMPARTMENT_ALIASES and constants.COMPARTMENTS_GO_TERMS

    Returns:
        compartments_df (pd.DataFrame): compartments dataframe with a single row

    Raises:
        ValueError: stubbed_compartment is not a recognized compartment name
    """

    # NOTE: membership is checked against COMPARTMENT_ALIASES, so the error
    # message must name that mapping (previously it pointed at
    # constants.COMPARTMENTS, misdirecting the user).
    if stubbed_compartment not in COMPARTMENT_ALIASES:
        raise ValueError(
            f"{stubbed_compartment} is not defined in constants.COMPARTMENT_ALIASES"
        )

    if stubbed_compartment not in COMPARTMENTS_GO_TERMS:
        raise ValueError(
            f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
        )

    stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]

    # build a GO URI for the compartment and wrap it as a BQB "is" identifier
    formatted_uri = identifiers.format_uri(
        uri=identifiers.create_uri_url(
            ontology=ONTOLOGIES.GO,
            identifier=stubbed_compartment_id,
        ),
        biological_qualifier_type=BQB.IS,
    )

    compartments_df = pd.DataFrame(
        {
            SBML_DFS.C_NAME: [stubbed_compartment],
            SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
        }
    )
    # the single compartment receives the first generated c_id
    compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID)  # type: ignore
    compartments_df.index.name = SBML_DFS.C_ID

    return compartments_df
2764
-
2765
-
2766
- def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
2767
- """Validates a table against a reference
2768
-
2769
- This check if the table has the same index, no duplicates in the index
2770
- and that all values in the index are in the reference table.
2771
-
2772
- Args:
2773
- data_table (pd.DataFrame): a table with data that should
2774
- match the reference
2775
- ref_table (pd.DataFrame): a reference table
2776
-
2777
- Raises:
2778
- ValueError: not same index name
2779
- ValueError: index contains duplicates
2780
- ValueError: index not subset of index of reactions table
2781
- """
2782
- ref_index_name = ref_table.index.name
2783
- if data_table.index.name != ref_index_name:
2784
- raise ValueError(
2785
- "the index name for reaction data table was not"
2786
- f" {ref_index_name}: {data_table.index.name}"
2787
- )
2788
- ids = data_table.index
2789
- if any(ids.duplicated()):
2790
- raise ValueError(
2791
- "the index for reaction data table " "contained duplicate values"
2792
- )
2793
- if not all(ids.isin(ref_table.index)):
2794
- raise ValueError(
2795
- "the index for reaction data table contained values"
2796
- " not found in the reactions table"
2797
- )
2798
- if not isinstance(data_table, pd.DataFrame):
2799
- raise TypeError(
2800
- f"The data table was type {type(data_table).__name__}"
2801
- " but must be a pd.DataFrame"
2802
- )