napistu 0.3.2.dev1__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
napistu/ingestion/sbml.py CHANGED
@@ -6,60 +6,46 @@ import re
6
6
 
7
7
  import libsbml
8
8
  import pandas as pd
9
+ from fs import open_fs
10
+ from pydantic import conlist, field_validator, RootModel
11
+
9
12
  from napistu import consensus
10
- from napistu import constants
11
13
  from napistu import identifiers
12
14
  from napistu import sbml_dfs_utils
13
15
  from napistu import source
14
16
  from napistu import utils
15
-
16
17
  from napistu.constants import BQB
17
-
18
- from napistu.ingestion.constants import SBML_COMPARTMENT_DICT_ID
19
- from napistu.ingestion.constants import SBML_COMPARTMENT_DICT_IDENTIFIERS
20
- from napistu.ingestion.constants import SBML_COMPARTMENT_DICT_NAME
21
- from napistu.ingestion.constants import SBML_COMPARTMENT_DICT_SOURCE
22
- from napistu.ingestion.constants import SBML_COMPARTMENTALIZED_SPECIES_DICT_NAME
23
- from napistu.ingestion.constants import SBML_COMPARTMENTALIZED_SPECIES_DICT_SOURCE
24
- from napistu.ingestion.constants import SBML_REACTION_ATTR_GET_GENE_PRODUCT
25
- from napistu.ingestion.constants import SBML_SPECIES_DICT_ID
26
- from napistu.ingestion.constants import SBML_SPECIES_DICT_IDENTIFIERS
27
- from napistu.ingestion.constants import SBML_SPECIES_DICT_NAME
28
- from napistu.ingestion.constants import SMBL_ERROR_CATEGORY
29
- from napistu.ingestion.constants import SMBL_ERROR_DESCRIPTION
30
- from napistu.ingestion.constants import SMBL_ERROR_MESSAGE
31
- from napistu.ingestion.constants import SMBL_ERROR_NUMBER
32
- from napistu.ingestion.constants import SMBL_ERROR_SEVERITY
33
- from napistu.ingestion.constants import SMBL_REACTION_DICT_ID
34
- from napistu.ingestion.constants import SMBL_REACTION_DICT_IDENTIFIERS
35
- from napistu.ingestion.constants import SMBL_REACTION_DICT_IS_REVERSIBLE
36
- from napistu.ingestion.constants import SMBL_REACTION_DICT_NAME
37
- from napistu.ingestion.constants import SMBL_REACTION_DICT_SOURCE
38
- from napistu.ingestion.constants import SMBL_REACTION_SPEC_RSC_ID
39
- from napistu.ingestion.constants import SMBL_REACTION_SPEC_SBO_TERM
40
- from napistu.ingestion.constants import SMBL_REACTION_SPEC_SC_ID
41
- from napistu.ingestion.constants import SMBL_REACTION_SPEC_STOICHIOMETRY
42
- from napistu.ingestion.constants import SMBL_SUMMARY_COMPARTMENTS
43
- from napistu.ingestion.constants import SMBL_SUMMARY_N_REACTIONS
44
- from napistu.ingestion.constants import SMBL_SUMMARY_N_SPECIES
45
- from napistu.ingestion.constants import SMBL_SUMMARY_PATHWAY_ID
46
- from napistu.ingestion.constants import SMBL_SUMMARY_PATHWAY_NAME
47
-
48
- from fs import open_fs
18
+ from napistu.constants import ONTOLOGIES
19
+ from napistu.constants import SBML_DFS
20
+ from napistu.ingestion.constants import SBML_DEFS
21
+ from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
22
+ from napistu.ingestion.constants import COMPARTMENT_ALIASES
23
+ from napistu.ingestion.constants import VALID_COMPARTMENTS
24
+ from napistu.ingestion.constants import GENERIC_COMPARTMENT
49
25
 
50
26
  logger = logging.getLogger(__name__)
51
27
 
28
+ NonEmptyStringList = conlist(str, min_length=1)
29
+
52
30
 
53
31
  class SBML:
54
- """
55
- System Biology Markup Language Connections.
32
+ """A class for handling Systems Biology Markup Language (SBML) files.
33
+
34
+ This class provides an interface to read and parse SBML files, offering
35
+ methods to access the model, summarize its contents, and report any errors
36
+ encountered during parsing.
37
+
38
+ Parameters
39
+ ----------
40
+ sbml_path : str
41
+ The file path to an SBML model. Supports local paths and GCS URIs.
56
42
 
57
43
  Attributes
58
44
  ----------
59
- document
60
- Connection to the SBML document
61
- model
62
- Connection to the SBML model
45
+ document : libsbml.SBMLDocument
46
+ The raw SBML document object from libsbml.
47
+ model : libsbml.Model
48
+ The parsed SBML model object from libsbml.
63
49
 
64
50
  Methods
65
51
  -------
@@ -68,25 +54,18 @@ class SBML:
68
54
  sbml_errors(reduced_log, return_df)
69
55
  Print a summary of all errors in the SBML file
70
56
 
57
+ Raises
58
+ ------
59
+ ValueError
60
+ If the SBML model is not Level 3, or if critical, unknown errors are
61
+ found during parsing.
71
62
  """
72
63
 
73
64
  def __init__(
74
65
  self,
75
66
  sbml_path: str,
76
67
  ) -> None:
77
- """
78
- Connects to an SBML file
79
-
80
- Parameters
81
- ----------
82
- sbml_path : str
83
- path to a .sbml file.
84
-
85
- Returns
86
- -------
87
- None.
88
- """
89
-
68
+ """Initializes the SBML object by reading and validating an SBML file."""
90
69
  reader = libsbml.SBMLReader()
91
70
  if os.path.exists(sbml_path):
92
71
  self.document = reader.readSBML(sbml_path)
@@ -105,8 +84,8 @@ class SBML:
105
84
  # check for critical sbml errors
106
85
  errors = self.sbml_errors(reduced_log=False, return_df=True)
107
86
  if errors is not None:
108
- critical_errors = errors[errors[SMBL_ERROR_SEVERITY] >= 2]
109
- critical_errors = set(critical_errors[SMBL_ERROR_DESCRIPTION].unique())
87
+ critical_errors = errors[errors[SBML_DEFS.ERROR_SEVERITY] >= 2]
88
+ critical_errors = set(critical_errors[SBML_DEFS.ERROR_DESCRIPTION].unique())
110
89
  known_errors = {"<layout> must have 'id' and may have 'name'"}
111
90
 
112
91
  found_known_errors = known_errors.intersection(critical_errors)
@@ -123,41 +102,50 @@ class SBML:
123
102
  )
124
103
 
125
104
  def summary(self) -> pd.DataFrame:
126
- """Returns a pd.DataFrame summary of an SBML model."""
105
+ """Generates a styled summary of the SBML model.
106
+
107
+ Returns
108
+ -------
109
+ pd.io.formats.style.Styler
110
+ A styled pandas DataFrame containing a summary of the model,
111
+ including pathway name, ID, and counts of species and reactions.
112
+ """
127
113
  model = self.model
128
114
 
129
115
  model_summaries = dict()
130
116
 
131
- model_summaries[SMBL_SUMMARY_PATHWAY_NAME] = model.getName()
132
- model_summaries[SMBL_SUMMARY_PATHWAY_ID] = model.getId()
117
+ model_summaries[SBML_DEFS.SUMMARY_PATHWAY_NAME] = model.getName()
118
+ model_summaries[SBML_DEFS.SUMMARY_PATHWAY_ID] = model.getId()
133
119
 
134
- model_summaries[SMBL_SUMMARY_N_SPECIES] = model.getNumSpecies()
135
- model_summaries[SMBL_SUMMARY_N_REACTIONS] = model.getNumReactions()
120
+ model_summaries[SBML_DEFS.SUMMARY_N_SPECIES] = model.getNumSpecies()
121
+ model_summaries[SBML_DEFS.SUMMARY_N_REACTIONS] = model.getNumReactions()
136
122
 
137
123
  compartments = [
138
124
  model.getCompartment(i).getName() for i in range(model.getNumCompartments())
139
125
  ]
140
126
  compartments.sort()
141
- model_summaries[SMBL_SUMMARY_COMPARTMENTS] = ",\n".join(compartments)
127
+ model_summaries[SBML_DEFS.SUMMARY_COMPARTMENTS] = ",\n".join(compartments)
142
128
 
143
129
  model_summaries_dat = pd.DataFrame(model_summaries, index=[0]).T
144
130
 
145
131
  return utils.style_df(model_summaries_dat) # type: ignore
146
132
 
147
133
  def sbml_errors(self, reduced_log: bool = True, return_df: bool = False):
148
- """
149
- Format and print all SBML errors
134
+ """Formats and reports all errors found in the SBML file.
150
135
 
151
136
  Parameters
152
137
  ----------
153
- reduced_log : bool
154
- Reduced log aggregates errors across categories an severity levels
155
- return_df: bool
156
- If False then print a log, if True then return a pd.DataFrame
138
+ reduced_log : bool, optional
139
+ If True, aggregates errors by category and severity. Defaults to True.
140
+ return_df : bool, optional
141
+ If True, returns a DataFrame of the errors. Otherwise, prints a
142
+ styled summary. Defaults to False.
157
143
 
158
144
  Returns
159
145
  -------
160
- None or pd.DataFrame.
146
+ pd.DataFrame or None
147
+ A DataFrame containing the error log if `return_df` is True and
148
+ errors are present, otherwise None.
161
149
  """
162
150
  n_errors = self.document.getNumErrors()
163
151
  if n_errors == 0:
@@ -168,11 +156,11 @@ class SBML:
168
156
  e = self.document.getError(i)
169
157
 
170
158
  error_entry = {
171
- SMBL_ERROR_NUMBER: i,
172
- SMBL_ERROR_CATEGORY: e.getCategoryAsString(),
173
- SMBL_ERROR_SEVERITY: e.getSeverity(),
174
- SMBL_ERROR_DESCRIPTION: e.getShortMessage(),
175
- SMBL_ERROR_MESSAGE: e.getMessage(),
159
+ SBML_DEFS.ERROR_NUMBER: i,
160
+ SBML_DEFS.ERROR_CATEGORY: e.getCategoryAsString(),
161
+ SBML_DEFS.ERROR_SEVERITY: e.getSeverity(),
162
+ SBML_DEFS.ERROR_DESCRIPTION: e.getShortMessage(),
163
+ SBML_DEFS.ERROR_MESSAGE: e.getMessage(),
176
164
  }
177
165
 
178
166
  error_log.append(error_entry)
@@ -181,9 +169,13 @@ class SBML:
181
169
  if reduced_log:
182
170
  error_log = (
183
171
  error_log[
184
- [SMBL_ERROR_CATEGORY, SMBL_ERROR_SEVERITY, SMBL_ERROR_MESSAGE]
172
+ [
173
+ SBML_DEFS.ERROR_CATEGORY,
174
+ SBML_DEFS.ERROR_SEVERITY,
175
+ SBML_DEFS.ERROR_MESSAGE,
176
+ ]
185
177
  ]
186
- .groupby([SMBL_ERROR_CATEGORY, SMBL_ERROR_SEVERITY])
178
+ .groupby([SBML_DEFS.ERROR_CATEGORY, SBML_DEFS.ERROR_SEVERITY])
187
179
  .count()
188
180
  )
189
181
 
@@ -191,12 +183,15 @@ class SBML:
191
183
  return error_log
192
184
  else:
193
185
  if reduced_log:
194
- headers = [f"{SMBL_ERROR_CATEGORY}, {SMBL_ERROR_SEVERITY}", "count"]
186
+ headers = [
187
+ f"{SBML_DEFS.ERROR_CATEGORY}, {SBML_DEFS.ERROR_SEVERITY}",
188
+ "count",
189
+ ]
195
190
  else:
196
191
  headers = [
197
- SMBL_ERROR_CATEGORY,
198
- SMBL_ERROR_SEVERITY,
199
- SMBL_ERROR_DESCRIPTION,
192
+ SBML_DEFS.ERROR_CATEGORY,
193
+ SBML_DEFS.ERROR_SEVERITY,
194
+ SBML_DEFS.ERROR_DESCRIPTION,
200
195
  ]
201
196
  error_log = error_log[headers]
202
197
 
@@ -205,34 +200,103 @@ class SBML:
205
200
  return None
206
201
 
207
202
 
208
- class SBML_reaction:
203
class CompartmentAliasesValidator(RootModel):
    """Pydantic root model that validates a compartment-alias mapping.

    The wrapped value maps a canonical compartment name to the list of
    aliases that should be treated as that compartment. Validation rejects:

    - empty keys,
    - keys that are not members of ``VALID_COMPARTMENTS``,
    - empty alias lists.

    Individual alias strings are not inspected beyond the ``list[str]``
    type check performed by pydantic.

    The class also exposes a small read-only, dict-like surface (``[]``,
    ``items()``, iteration over keys, ``len()``) so that a validated instance
    can be consumed like the raw mapping, e.g. via ``aliases.items()``.

    Attributes
    ----------
    root : dict[str, list[str]]
        The validated mapping of canonical compartment name -> aliases.
    """

    root: dict[str, list[str]]

    @field_validator("root")
    @classmethod
    def validate_aliases(
        cls, values: dict[str, list[str]]
    ) -> dict[str, list[str]]:
        """Check every key and alias list of the mapping.

        Parameters
        ----------
        values : dict[str, list[str]]
            The candidate mapping, already coerced to ``dict[str, list[str]]``.

        Returns
        -------
        dict[str, list[str]]
            ``values`` unchanged when every entry passes.

        Raises
        ------
        ValueError
            If a key is empty, a key is not in ``VALID_COMPARTMENTS``, or an
            alias list is empty. Pydantic reports this to the caller as a
            validation error.
        """
        for key, alias_list in values.items():
            if not key:
                raise ValueError("Compartment keys must be non-empty.")
            if key not in VALID_COMPARTMENTS:
                raise ValueError(
                    f"Invalid compartment key: {key}. "
                    f"Must be one of {VALID_COMPARTMENTS}"
                )
            if not alias_list:
                raise ValueError(f"Alias list for '{key}' cannot be empty.")
        return values

    @classmethod
    def from_dict(cls, data: dict[str, list[str]]) -> "CompartmentAliasesValidator":
        """Create a validated instance from a plain dictionary.

        Parameters
        ----------
        data : dict[str, list[str]]
            A dictionary mapping canonical compartment names to their aliases.

        Returns
        -------
        CompartmentAliasesValidator
            A validated instance of the model.
        """
        return cls.model_validate(data)

    def __getitem__(self, key: str) -> list[str]:
        """Return the alias list registered for ``key``."""
        return self.root[key]

    def items(self):
        """Return the ``(compartment, aliases)`` view of the wrapped dict."""
        return self.root.items()

    def __iter__(self):
        """Iterate over compartment names (the keys of the wrapped dict)."""
        # NOTE: this replaces the iteration behaviour inherited from the
        # pydantic base class with plain dict-key iteration.
        return iter(self.root)

    def __len__(self) -> int:
        """Return the number of compartments in the mapping."""
        return len(self.root)
264
+
265
+
266
+ class SBML_reaction:
267
+ """A convenience class for processing individual SBML reactions.
268
+
269
+ This class extracts and organizes key information about an SBML reaction,
270
+ including its attributes and participating species (substrates, products,
271
+ and modifiers).
272
+
273
+ Parameters
274
+ ----------
275
+ sbml_reaction : libsbml.Reaction
276
+ A libsbml Reaction object to be processed.
218
277
 
278
+ Attributes
279
+ ----------
280
+ reaction_dict : dict
281
+ A dictionary of reaction-level attributes, including its ID, name,
282
+ reversibility, identifiers, and source information.
283
+ species : pd.DataFrame
284
+ A DataFrame listing all species participating in the reaction,
285
+ including their roles (substrate, product, modifier), stoichiometry,
286
+ and SBO terms.
219
287
  """
220
288
 
221
289
  def __init__(
222
290
  self,
223
291
  sbml_reaction: libsbml.Reaction,
224
292
  ) -> None:
225
- """
226
- Convenience class for working with sbml reactions
227
- """
293
+ """Initializes the SBML_reaction object by parsing a libsbml Reaction."""
228
294
  reaction_dict = {
229
- SMBL_REACTION_DICT_ID: sbml_reaction.getId(),
230
- SMBL_REACTION_DICT_NAME: sbml_reaction.getName(),
231
- SMBL_REACTION_DICT_IDENTIFIERS: identifiers.cv_to_Identifiers(
232
- sbml_reaction
233
- ),
234
- SMBL_REACTION_DICT_SOURCE: source.Source(init=True),
235
- SMBL_REACTION_DICT_IS_REVERSIBLE: sbml_reaction.getReversible(),
295
+ SBML_DFS.R_ID: sbml_reaction.getId(),
296
+ SBML_DFS.R_NAME: sbml_reaction.getName(),
297
+ SBML_DFS.R_IDENTIFIERS: identifiers.cv_to_Identifiers(sbml_reaction),
298
+ SBML_DFS.R_SOURCE: source.Source(init=True),
299
+ SBML_DFS.R_ISREVERSIBLE: sbml_reaction.getReversible(),
236
300
  }
237
301
 
238
302
  self.reaction_dict = reaction_dict
@@ -243,80 +307,114 @@ class SBML_reaction:
243
307
  for i in range(sbml_reaction.getNumModifiers()):
244
308
  spec = sbml_reaction.getModifier(i)
245
309
  spec_dict = {
246
- SMBL_REACTION_SPEC_RSC_ID: spec.getId(),
247
- SMBL_REACTION_SPEC_SC_ID: spec.getSpecies(),
248
- SMBL_REACTION_SPEC_STOICHIOMETRY: 0,
249
- SMBL_REACTION_SPEC_SBO_TERM: spec.getSBOTermID(),
310
+ SBML_DFS.RSC_ID: spec.getId(),
311
+ SBML_DFS.SC_ID: spec.getSpecies(),
312
+ SBML_DFS.STOICHIOMETRY: 0,
313
+ SBML_DFS.SBO_TERM: spec.getSBOTermID(),
250
314
  }
251
315
  reaction_species.append(spec_dict)
252
316
 
317
+ # find gene products defined using the fbc plugin
253
318
  rxn_fbc = sbml_reaction.getPlugin("fbc")
254
- # check for gene products associated with the FBC L3 extension
255
- if rxn_fbc is not None:
256
- gene_products = list()
319
+ if rxn_fbc:
257
320
  gpa = rxn_fbc.getGeneProductAssociation()
258
- if gpa is not None:
259
- gpaa = gpa.getAssociation()
260
- if hasattr(gpaa, SBML_REACTION_ATTR_GET_GENE_PRODUCT):
261
- gene_products.append(_get_gene_product_dict(gpaa))
262
- else:
263
- for i in range(gpaa.getNumAssociations()):
264
- gpaaa = gpaa.getAssociation(i)
265
- if hasattr(gpaaa, SBML_REACTION_ATTR_GET_GENE_PRODUCT):
266
- gene_products.append(_get_gene_product_dict(gpaaa))
267
- else:
268
- for i in range(gpaaa.getNumAssociations()):
269
- gpaaaa = gpaaa.getAssociation(i)
270
- if hasattr(gpaaaa, SBML_REACTION_ATTR_GET_GENE_PRODUCT):
271
- gene_products.append(_get_gene_product_dict(gpaaaa))
272
- else:
273
- for i in range(gpaa.getNumAssociations()):
274
- gpaaaaa = gpaaaa.getAssociation(i)
275
- if hasattr(
276
- gpaaaaa, SBML_REACTION_ATTR_GET_GENE_PRODUCT
277
- ):
278
- gene_products.append(
279
- _get_gene_product_dict(gpaaaaa)
280
- )
281
- else:
282
- logger.warning(
283
- "gene annotations nested deeper than 4 levels, ignoring"
284
- )
285
- continue
286
- # de-duplicate
287
- gene_products = list(
288
- {d[SMBL_REACTION_SPEC_SC_ID]: d for d in gene_products}.values()
289
- )
290
- reaction_species = reaction_species + gene_products
321
+ if gpa:
322
+ gene_products = _extract_gene_products(gpa.getAssociation())
323
+ # de-duplicate
324
+ gene_products = list(
325
+ {d[SBML_DFS.SC_ID]: d for d in gene_products}.values()
326
+ )
327
+ reaction_species.extend(gene_products)
291
328
 
292
329
  # save reactants
293
330
  for i in range(sbml_reaction.getNumReactants()):
294
331
  spec = sbml_reaction.getReactant(i)
295
332
  spec_dict = {
296
- SMBL_REACTION_SPEC_RSC_ID: spec.getId(),
297
- SMBL_REACTION_SPEC_SC_ID: spec.getSpecies(),
298
- SMBL_REACTION_SPEC_STOICHIOMETRY: -1 * spec.getStoichiometry(),
299
- SMBL_REACTION_SPEC_SBO_TERM: spec.getSBOTermID(),
333
+ SBML_DFS.RSC_ID: spec.getId(),
334
+ SBML_DFS.SC_ID: spec.getSpecies(),
335
+ SBML_DFS.STOICHIOMETRY: -1 * spec.getStoichiometry(),
336
+ SBML_DFS.SBO_TERM: spec.getSBOTermID(),
300
337
  }
301
338
  reaction_species.append(spec_dict)
302
339
  # save products
303
340
  for i in range(sbml_reaction.getNumProducts()):
304
341
  spec = sbml_reaction.getProduct(i)
305
342
  spec_dict = {
306
- SMBL_REACTION_SPEC_RSC_ID: spec.getId(),
307
- SMBL_REACTION_SPEC_SC_ID: spec.getSpecies(),
308
- SMBL_REACTION_SPEC_STOICHIOMETRY: spec.getStoichiometry(),
309
- SMBL_REACTION_SPEC_SBO_TERM: spec.getSBOTermID(),
343
+ SBML_DFS.RSC_ID: spec.getId(),
344
+ SBML_DFS.SC_ID: spec.getSpecies(),
345
+ SBML_DFS.STOICHIOMETRY: spec.getStoichiometry(),
346
+ SBML_DFS.SBO_TERM: spec.getSBOTermID(),
310
347
  }
311
348
  reaction_species.append(spec_dict)
312
349
 
313
- self.species = pd.DataFrame(reaction_species).set_index(
314
- SMBL_REACTION_SPEC_RSC_ID
315
- )
350
+ self.species = pd.DataFrame(reaction_species).set_index(SBML_DFS.RSC_ID)
351
+
352
+
353
def sbml_dfs_from_sbml(self, sbml_model: SBML, compartment_aliases: dict | None = None):
    """Populate ``self`` with the tables parsed from an SBML model.

    The work is delegated to three module-level helpers, called in order:
    ``_define_compartments``, ``_define_species`` and ``_define_reactions``.
    Each result is stored on ``self`` as soon as it is available.

    Parameters
    ----------
    self : object
        The object to populate. It must expose a ``schema`` attribute, which
        is forwarded to ``_define_species``.
    sbml_model : SBML
        The SBML model to be parsed.
    compartment_aliases : dict, optional
        Mapping from canonical compartment names to their aliases. It is
        forwarded to ``_define_compartments``, which falls back to
        ``COMPARTMENT_ALIASES`` when this is None. Defaults to None.

    Returns
    -------
    object
        ``self``, with the attributes ``compartments``, ``species``,
        ``compartmentalized_species``, ``reactions`` and ``reaction_species``
        set.
    """
    # Step 1: compartments.
    compartments_table = _define_compartments(sbml_model, compartment_aliases)
    self.compartments = compartments_table

    # Step 2: species and compartmentalized species (returned as a pair).
    species_tables = _define_species(sbml_model, self.schema)
    self.species, self.compartmentalized_species = species_tables

    # Step 3: reactions and the species taking part in them (also a pair).
    reaction_tables = _define_reactions(sbml_model)
    self.reactions, self.reaction_species = reaction_tables

    return self
389
+
390
+
391
+ def _define_compartments(
392
+ sbml_model: SBML, compartment_aliases_dict: dict | None = None
393
+ ) -> pd.DataFrame:
394
+ """Extracts and defines compartments from the SBML model.
395
+
396
+ This function iterates through the compartments in the SBML model,
397
+ extracting their IDs, names, and identifiers. It also handles cases where
398
+ CVTerms are missing by mapping compartment names to known GO terms.
317
399
 
318
- def sbml_df_from_sbml(self, sbml_model: SBML):
319
- # specify compartments
400
+ Parameters
401
+ ----------
402
+ sbml_model : SBML
403
+ The SBML model to process.
404
+ compartment_aliases_dict : dict, optional
405
+ A dictionary to map custom compartment names. If None, the default
406
+ mapping from `COMPARTMENT_ALIASES` is used.
407
+
408
+ Returns
409
+ -------
410
+ pd.DataFrame
411
+ A DataFrame containing information about each compartment, indexed by
412
+ compartment ID.
413
+ """
414
+ if compartment_aliases_dict is None:
415
+ aliases = COMPARTMENT_ALIASES
416
+ else:
417
+ aliases = CompartmentAliasesValidator.from_dict(compartment_aliases_dict)
320
418
 
321
419
  compartments = list()
322
420
  for i in range(sbml_model.model.getNumCompartments()):
@@ -330,7 +428,7 @@ def sbml_df_from_sbml(self, sbml_model: SBML):
330
428
  comp_name = comp.getName()
331
429
  mapped_compartment_key = [
332
430
  compkey
333
- for compkey, mappednames in constants.COMPARTMENT_ALIASES.items()
431
+ for compkey, mappednames in aliases.items()
334
432
  if comp_name in mappednames
335
433
  ]
336
434
 
@@ -340,22 +438,22 @@ def sbml_df_from_sbml(self, sbml_model: SBML):
340
438
  )
341
439
  compartments.append(
342
440
  {
343
- SBML_COMPARTMENT_DICT_ID: comp.getId(),
344
- SBML_COMPARTMENT_DICT_NAME: comp.getName(),
345
- SBML_COMPARTMENT_DICT_IDENTIFIERS: identifiers.Identifiers(
441
+ SBML_DFS.C_ID: comp.getId(),
442
+ SBML_DFS.C_NAME: comp.getName(),
443
+ SBML_DFS.C_IDENTIFIERS: identifiers.Identifiers(
346
444
  [
347
445
  identifiers.format_uri(
348
446
  uri=identifiers.create_uri_url(
349
- ontology=constants.ONTOLOGIES.GO,
350
- identifier=constants.COMPARTMENTS_GO_TERMS[
351
- "CELLULAR_COMPONENT"
447
+ ontology=ONTOLOGIES.GO,
448
+ identifier=COMPARTMENTS_GO_TERMS[
449
+ GENERIC_COMPARTMENT
352
450
  ],
353
451
  ),
354
452
  biological_qualifier_type=BQB.BQB_IS,
355
453
  )
356
454
  ]
357
455
  ),
358
- SBML_COMPARTMENT_DICT_SOURCE: source.Source(init=True),
456
+ SBML_DFS.C_SOURCE: source.Source(init=True),
359
457
  }
360
458
  )
361
459
 
@@ -366,14 +464,14 @@ def sbml_df_from_sbml(self, sbml_model: SBML):
366
464
  )
367
465
  compartments.append(
368
466
  {
369
- SBML_COMPARTMENT_DICT_ID: comp.getId(),
370
- SBML_COMPARTMENT_DICT_NAME: comp.getName(),
371
- SBML_COMPARTMENT_DICT_IDENTIFIERS: identifiers.Identifiers(
467
+ SBML_DFS.C_ID: comp.getId(),
468
+ SBML_DFS.C_NAME: comp.getName(),
469
+ SBML_DFS.C_IDENTIFIERS: identifiers.Identifiers(
372
470
  [
373
471
  identifiers.format_uri(
374
472
  uri=identifiers.create_uri_url(
375
- ontology=constants.ONTOLOGIES.GO,
376
- identifier=constants.COMPARTMENTS_GO_TERMS[
473
+ ontology=ONTOLOGIES.GO,
474
+ identifier=COMPARTMENTS_GO_TERMS[
377
475
  mapped_compartment_key[0]
378
476
  ],
379
477
  ),
@@ -381,107 +479,156 @@ def sbml_df_from_sbml(self, sbml_model: SBML):
381
479
  )
382
480
  ]
383
481
  ),
384
- SBML_COMPARTMENT_DICT_SOURCE: source.Source(init=True),
482
+ SBML_DFS.C_SOURCE: source.Source(init=True),
385
483
  }
386
484
  )
387
485
 
388
486
  else:
389
487
  compartments.append(
390
488
  {
391
- SBML_COMPARTMENT_DICT_ID: comp.getId(),
392
- SBML_COMPARTMENT_DICT_NAME: comp.getName(),
393
- SBML_COMPARTMENT_DICT_IDENTIFIERS: identifiers.cv_to_Identifiers(
394
- comp
395
- ),
396
- SBML_COMPARTMENT_DICT_SOURCE: source.Source(init=True),
489
+ SBML_DFS.C_ID: comp.getId(),
490
+ SBML_DFS.C_NAME: comp.getName(),
491
+ SBML_DFS.C_IDENTIFIERS: identifiers.cv_to_Identifiers(comp),
492
+ SBML_DFS.C_SOURCE: source.Source(init=True),
397
493
  }
398
494
  )
399
495
 
400
- self.compartments = pd.DataFrame(compartments).set_index(SBML_COMPARTMENT_DICT_ID)
496
+ return pd.DataFrame(compartments).set_index(SBML_DFS.C_ID)
497
+
498
+
499
def _define_species(
    sbml_model: SBML, schema: dict
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Build the species and compartmentalized-species tables.

    Compartmentalized species are read with ``setup_cspecies`` and then
    collapsed to unique molecular species with
    ``consensus.reduce_to_consensus_ids``.

    Parameters
    ----------
    sbml_model : SBML
        The SBML model to process.
    schema : dict
        Must provide ``schema["species"]["vars"]`` and
        ``schema["compartmentalized_species"]["vars"]``; these select the
        columns of the two returned tables.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(species, compartmentalized_species)``.
    """
    # Look the column selections up first so that a malformed schema fails
    # before any model parsing happens.
    species_columns = schema["species"]["vars"]
    cspecies_columns = schema["compartmentalized_species"]["vars"]

    cspecies_table = setup_cspecies(sbml_model)

    # Work on a copy whose index is labelled as a species id so the consensus
    # step can use it as the primary key.
    consensus_input = cspecies_table.copy()
    consensus_input.index.names = [SBML_DFS.S_ID]
    id_spec = {"pk": SBML_DFS.S_ID, "id": SBML_DFS.S_IDENTIFIERS}
    unique_species, lookup = consensus.reduce_to_consensus_ids(
        consensus_input,
        id_spec,
    )

    # Derive the species name from the compartmentalized name by removing the
    # bracketed part (greedy: first "[" through last "]") and trimming.
    unique_species.index.name = SBML_DFS.S_ID
    bracketed = re.compile("\\[.+\\]")
    species_names = []
    for cspecies_name in unique_species[SBML_DFS.SC_NAME]:
        species_names.append(bracketed.sub("", cspecies_name).strip())
    unique_species[SBML_DFS.S_NAME] = species_names

    # Compartment-level columns do not belong in the species table.
    unique_species = unique_species.drop(
        columns=[SBML_DFS.SC_NAME, SBML_DFS.C_ID]
    )
    # One fresh, empty Source object per species row.
    unique_species["s_Source"] = [
        source.Source(init=True) for _ in range(unique_species.shape[0])
    ]

    species = unique_species[species_columns]

    # Attach the consensus lookup and expose its "new_id" column under the
    # species-id name (assumes the lookup carries a "new_id" column).
    joined = cspecies_table.join(lookup)
    joined = joined.rename(columns={"new_id": SBML_DFS.S_ID})
    compartmentalized_species = joined[cspecies_columns]

    return species, compartmentalized_species
427
555
 
428
- self.compartmentalized_species = comp_species_df.join(species_lookup).rename(
429
- columns={"new_id": SBML_SPECIES_DICT_ID}
430
- )[self.schema["compartmentalized_species"]["vars"]]
431
556
 
432
- # specify reactions
557
def _define_reactions(sbml_model: SBML) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Build the reactions and reaction-species tables.

    Every reaction in the model is wrapped in ``SBML_reaction``. Its
    ``reaction_dict`` becomes one row of the reactions table, and its
    ``species`` table (tagged with the reaction id) is stacked into the
    reaction-species table.

    Parameters
    ----------
    sbml_model : SBML
        The SBML model to process.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(reactions, reaction_species)``. ``reactions`` is indexed by
        ``SBML_DFS.R_ID``. ``reaction_species`` keeps the index of the
        per-reaction tables, except when every index value is the empty
        string, in which case generated ids are used (see below).
    """
    reactions_list = []
    reaction_species_list = []
    for i in range(sbml_model.model.getNumReactions()):
        rxn = SBML_reaction(sbml_model.model.getReaction(i))
        reactions_list.append(rxn.reaction_dict)

        # Tag each participant row with the id of its parent reaction.
        rxn_specs = rxn.species
        rxn_specs[SBML_DFS.R_ID] = rxn.reaction_dict[SBML_DFS.R_ID]
        reaction_species_list.append(rxn_specs)

    reactions = pd.DataFrame(reactions_list).set_index(SBML_DFS.R_ID)

    reaction_species_df = pd.concat(reaction_species_list)
    # add an index if reaction species didn't have IDs in the .sbml
    if all(v == "" for v in reaction_species_df.index):
        generated_ids = sbml_dfs_utils.id_formatter(
            range(reaction_species_df.shape[0]), SBML_DFS.RSC_ID
        )
        # Name the new column through the same constant that set_index() uses
        # so the two cannot drift apart.
        reaction_species_df = (
            reaction_species_df.reset_index(drop=True)
            .assign(**{SBML_DFS.RSC_ID: generated_ids})
            .set_index(SBML_DFS.RSC_ID)
        )

    return reactions, reaction_species_df
462
602
 
463
603
 
464
604
  def setup_cspecies(sbml_model: SBML) -> pd.DataFrame:
465
- """
466
- Setup Compartmentalized Species
467
-
468
- Read all compartmentalized species from a model
469
- and setup as a pd.DataFrame.
470
- This operation is functionalized to test the subsequent call of
471
- consensus.reduce_to_consensus_ids()
472
- which collapses compartmentalized_species -> species
473
- based on shared identifiers.
605
+ """Creates a DataFrame of compartmentalized species from an SBML model.
606
+
607
+ This function extracts all species from the model and creates a
608
+ standardized DataFrame that includes unique IDs for each compartmentalized
609
+ species (`sc_id`), along with species and compartment IDs, and their
610
+ corresponding identifiers.
611
+
612
+ Parameters
613
+ ----------
614
+ sbml_model : SBML
615
+ The SBML model to process.
616
+
617
+ Returns
618
+ -------
619
+ pd.DataFrame
620
+ A DataFrame containing information about each compartmentalized species.
474
621
  """
475
622
  comp_species = list()
476
623
  for i in range(sbml_model.model.getNumSpecies()):
477
624
  spec = sbml_model.model.getSpecies(i)
478
625
 
479
626
  spec_dict = {
480
- SMBL_REACTION_SPEC_SC_ID: spec.getId(),
481
- SBML_COMPARTMENTALIZED_SPECIES_DICT_NAME: spec.getName(),
482
- SBML_COMPARTMENT_DICT_ID: spec.getCompartment(),
483
- SBML_SPECIES_DICT_IDENTIFIERS: identifiers.cv_to_Identifiers(spec),
484
- SBML_COMPARTMENTALIZED_SPECIES_DICT_SOURCE: source.Source(init=True),
627
+ SBML_DFS.SC_ID: spec.getId(),
628
+ SBML_DFS.SC_NAME: spec.getName(),
629
+ SBML_DFS.C_ID: spec.getCompartment(),
630
+ SBML_DFS.S_IDENTIFIERS: identifiers.cv_to_Identifiers(spec),
631
+ SBML_DFS.SC_SOURCE: source.Source(init=True),
485
632
  }
486
633
 
487
634
  comp_species.append(spec_dict)
@@ -494,31 +641,55 @@ def setup_cspecies(sbml_model: SBML) -> pd.DataFrame:
494
641
  gene_product = mplugin.getGeneProduct(i)
495
642
 
496
643
  gene_dict = {
497
- SMBL_REACTION_SPEC_SC_ID: gene_product.getId(),
498
- SBML_COMPARTMENTALIZED_SPECIES_DICT_NAME: (
644
+ SBML_DFS.SC_ID: gene_product.getId(),
645
+ SBML_DFS.SC_NAME: (
499
646
  gene_product.getName()
500
647
  if gene_product.isSetName()
501
648
  else gene_product.getLabel()
502
649
  ),
503
650
  # use getLabel() to accommodate SBML models (e.g. HumanGEM.xml) with no fbc:name attribute
504
651
  # Recon3D.xml has both fbc:label and fbc:name attributes, with the gene name in fbc:name
505
- SBML_COMPARTMENT_DICT_ID: None,
506
- SBML_SPECIES_DICT_IDENTIFIERS: identifiers.cv_to_Identifiers(
507
- gene_product
508
- ),
509
- SBML_COMPARTMENTALIZED_SPECIES_DICT_SOURCE: source.Source(init=True),
652
+ SBML_DFS.C_ID: None,
653
+ SBML_DFS.S_IDENTIFIERS: identifiers.cv_to_Identifiers(gene_product),
654
+ SBML_DFS.SC_SOURCE: source.Source(init=True),
510
655
  }
511
656
 
512
657
  comp_species.append(gene_dict)
513
658
 
514
- return pd.DataFrame(comp_species).set_index(SMBL_REACTION_SPEC_SC_ID)
659
+ return pd.DataFrame(comp_species).set_index(SBML_DFS.SC_ID)
515
660
 
516
661
 
517
662
def _get_gene_product_dict(gp):
    """Build a reaction-species record for one gene-product association node.

    Parameters
    ----------
    gp
        An object exposing ``getId()``, ``getGeneProduct()`` and
        ``getSBOTermID()``. Presumably a gene-product reference from the
        libsbml "fbc" package -- TODO confirm the exact class.

    Returns
    -------
    dict
        A record with four entries:

        - ``SBML_DFS.RSC_ID``: ``gp.getId()``
        - ``SBML_DFS.SC_ID``: ``gp.getGeneProduct()``
        - ``SBML_DFS.STOICHIOMETRY``: always 0
        - ``SBML_DFS.SBO_TERM``: ``gp.getSBOTermID()``
    """
    record = {}
    record[SBML_DFS.RSC_ID] = gp.getId()
    record[SBML_DFS.SC_ID] = gp.getGeneProduct()
    # Stoichiometry is fixed at 0 for gene products.
    record[SBML_DFS.STOICHIOMETRY] = 0
    record[SBML_DFS.SBO_TERM] = gp.getSBOTermID()
    return record
681
+
682
+
683
def _extract_gene_products(association: libsbml.Association) -> list[dict]:
    """Collect every gene product found in an association tree.

    The tree is walked depth-first, children in index order. A node counts
    as a gene-product leaf when it has the attribute named by
    ``SBML_DEFS.REACTION_ATTR_GET_GENE_PRODUCT``; otherwise, if it has
    ``getNumAssociations``, its children are visited. Nodes with neither
    attribute contribute nothing.

    Parameters
    ----------
    association : libsbml.Association
        Root node of the association tree.

    Returns
    -------
    list[dict]
        One ``_get_gene_product_dict`` record per leaf, in visiting order.
        Duplicates are not removed here.
    """

    def _walk(node):
        # Leaf: emit its record and stop descending.
        if hasattr(node, SBML_DEFS.REACTION_ATTR_GET_GENE_PRODUCT):
            yield _get_gene_product_dict(node)
            return
        # Neither a leaf nor a container: nothing to emit.
        if not hasattr(node, "getNumAssociations"):
            return
        for child_index in range(node.getNumAssociations()):
            yield from _walk(node.getAssociation(child_index))

    return list(_walk(association))