napistu 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
napistu/sbml_dfs_utils.py CHANGED
@@ -11,80 +11,356 @@ from fs import open_fs
  import numpy as np
  import pandas as pd
  from napistu import utils
+ from napistu import identifiers
  from napistu import indices

- from napistu import sbml_dfs_core
+ from napistu.constants import BQB
  from napistu.constants import SBML_DFS
+ from napistu.constants import SBML_DFS_SCHEMA
  from napistu.constants import IDENTIFIERS
  from napistu.constants import BQB_DEFINING_ATTRS
  from napistu.constants import BQB_DEFINING_ATTRS_LOOSE
+ from napistu.constants import REQUIRED_REACTION_FROMEDGELIST_COLUMNS
+ from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
+ from napistu.constants import SBO_ROLES_DEFS
+ from napistu.constants import MINI_SBO_FROM_NAME
+ from napistu.constants import MINI_SBO_TO_NAME
+ from napistu.constants import SBO_NAME_TO_ROLE
+ from napistu.constants import ONTOLOGIES
+ from napistu.ingestion.constants import VALID_COMPARTMENTS
+ from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
+ from napistu.ingestion.constants import GENERIC_COMPARTMENT

  logger = logging.getLogger(__name__)


- def unnest_identifiers(id_table: pd.DataFrame, id_var: str) -> pd.DataFrame:
+ # =============================================================================
+ # PUBLIC FUNCTIONS (ALPHABETICAL ORDER)
+ # =============================================================================
+
+
+ def adapt_pw_index(
+     source: str | indices.PWIndex,
+     species: str | Iterable[str] | None,
+     outdir: str | None = None,
+ ) -> indices.PWIndex:
+     """Adapts a pw_index
+
+     Helpful to filter for species before reconstructing.
+
+     Args:
+         source (str | PWIndex): uri for pw_index.csv file or PWIndex object
+         species (str):
+         outdir (str | None, optional): Optional directory to write pw_index to.
+             Defaults to None.
+
+     Returns:
+         indices.PWIndex: Filtered pw index
      """
-     Unnest Identifiers
+     if isinstance(source, str):
+         pw_index = indices.PWIndex(source)
+     elif isinstance(source, indices.PWIndex):
+         pw_index = copy.deepcopy(source)
+     else:
+         raise ValueError("'source' needs to be str or PWIndex.")
+     pw_index.filter(species=species)

-     Take a pd.DataFrame containing an array of Identifiers and
-     return one-row per identifier.
+     if outdir is not None:
+         with open_fs(outdir, create=True) as fs:
+             with fs.open("pw_index.tsv", "w") as f:
+                 pw_index.index.to_csv(f, sep="\t")
+     return pw_index
+
+
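Illustrative usage sketch for the relocated `adapt_pw_index` (not part of the diff; the path and species below are hypothetical):

```python
from napistu import sbml_dfs_utils

# Filter a pathway index to a single species before reconstruction and
# write the filtered index to pw_index.tsv in a scratch directory.
pw_index = sbml_dfs_utils.adapt_pw_index(
    source="/tmp/reactome/pw_index.tsv",  # uri of a pw_index file, or a PWIndex
    species="Homo sapiens",
    outdir="/tmp/reactome_human",
)
```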
+ def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
+     """
+     Add an sbo_role column to the reaction_species table.
+
+     The sbo_role column is a string column that contains the SBO role of the reaction species.
+     The values in the sbo_role column are taken from the sbo_term column.
+
+     The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
+     """
+
+     validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
+
+     reaction_species = (
+         reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
+         .replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
+         .replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
+     )
+
+     undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
+         SBO_NAME_TO_ROLE.values()
+     )
+     if len(undefined_roles) > 0:
+         logger.warning(
+             f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
+         )
+         mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
+         reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
+
+     return reaction_species
+
+
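A minimal sketch of calling `add_sbo_role` (not from the package source), assuming `sbml_dfs` is an existing SBML_dfs object whose `reaction_species` table passes schema validation:

```python
from napistu import sbml_dfs_utils

# reaction_species must validate against the SBML_dfs schema first
annotated = sbml_dfs_utils.add_sbo_role(sbml_dfs.reaction_species)

# sbo_role now holds role labels such as DEFINING / REQUIRED / OPTIONAL
print(annotated["sbo_role"].value_counts())
```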
+ def check_entity_data_index_matching(sbml_dfs, table):
+     """
+     Update the input sbml_dfs's entity_data (dict) index
+     with match_entitydata_index_to_entity,
+     so that the index for dataframe(s) in entity_data (dict) matches the sbml_dfs'
+     corresponding entity, and then passes sbml_dfs.validate()
+     Args
+         sbml_dfs (cpr.SBML_dfs): a cpr.SBML_dfs
+         table (str): table whose data is being consolidated (currently species or reactions)
+     Returns
+         sbml_dfs (cpr.SBML_dfs):
+             sbml_dfs whose entity_data is checked to have the same index
+             as the corresponding entity.
+     """
+
+     table_data = table + "_data"
+
+     entity_data_dict = getattr(sbml_dfs, table_data)
+     entity_schema = sbml_dfs.schema[table]
+     sbml_dfs_entity = getattr(sbml_dfs, table)
+
+     if entity_data_dict != {}:
+         entity_data_types = set.union(set(entity_data_dict.keys()))
+
+         entity_data_dict_checked = {
+             x: match_entitydata_index_to_entity(
+                 entity_data_dict, x, sbml_dfs_entity, entity_schema, table
+             )
+             for x in entity_data_types
+         }
+
+         if table == SBML_DFS.REACTIONS:
+             sbml_dfs.reactions_data = entity_data_dict_checked
+         elif table == SBML_DFS.SPECIES:
+             sbml_dfs.species_data = entity_data_dict_checked
+
+     return sbml_dfs
+
+
+ def construct_formula_string(
+     reaction_species_df: pd.DataFrame,
+     reactions_df: pd.DataFrame,
+     name_var: str,
+ ) -> str:
+     """
+     Construct Formula String
+
+     Convert a table of reaction species into a formula string

      Parameters:
-     id_table: pd.DataFrame
-         a table containing an array of Identifiers
-     id_var: str
-         variable containing Identifiers
+     ----------
+     reaction_species_df: pd.DataFrame
+         Table containing a reaction's species
+     reactions_df: pd.DataFrame
+         sbml_dfs.reactions
+     name_var: str
+         Name used to label species

      Returns:
-     pd.Dataframe containing the index of id_table but expanded
-     to include one row per identifier
+     ----------
+     formula_str: str
+         String representation of a reaction's substrates, products and
+         modifiers

      """

-     # validate inputs
-     utils.match_pd_vars(id_table, {id_var}).assert_present()
+     reaction_species_df["label"] = [
+         _add_stoi_to_species_name(x, y)
+         for x, y in zip(
+             reaction_species_df[SBML_DFS.STOICHIOMETRY], reaction_species_df[name_var]
+         )
+     ]
+
+     rxn_reversible = bool(
+         reactions_df.loc[
+             reaction_species_df[SBML_DFS.R_ID].iloc[0], SBML_DFS.R_ISREVERSIBLE
+         ]
+     )  # convert from a np.bool_ to bool if needed
+     if not isinstance(rxn_reversible, bool):
+         raise TypeError(
+             f"rxn_reversible must be a bool, but got {type(rxn_reversible).__name__}"
+         )

-     N_invalid_ids = sum(id_table[id_var].isna())
-     if N_invalid_ids != 0:
+     if rxn_reversible:
+         arrow_type = " <-> "
+     else:
+         arrow_type = " -> "
+
+     substrates = " + ".join(
+         reaction_species_df["label"][
+             reaction_species_df[SBML_DFS.STOICHIOMETRY] < 0
+         ].tolist()
+     )
+     products = " + ".join(
+         reaction_species_df["label"][
+             reaction_species_df[SBML_DFS.STOICHIOMETRY] > 0
+         ].tolist()
+     )
+     modifiers = " + ".join(
+         reaction_species_df["label"][
+             reaction_species_df[SBML_DFS.STOICHIOMETRY] == 0
+         ].tolist()
+     )
+     if modifiers != "":
+         modifiers = f" ---- modifiers: {modifiers}"
+
+     return f"{substrates}{arrow_type}{products}{modifiers}"
+
+
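A self-contained sketch of `construct_formula_string` (not from the package source); the toy reaction assumes the schema column names (`r_id`, `stoichiometry`, `r_isreversible`) used throughout this module:

```python
import pandas as pd

from napistu import sbml_dfs_utils

# one irreversible reaction: 2 H2 + O2 -> 2 H2O
rxn_species = pd.DataFrame(
    {
        "r_id": ["R00000000"] * 3,
        "stoichiometry": [-2, -1, 2],  # ints so labels render as "2 H2", not "2.0 H2"
        "sc_name": ["H2 [cytosol]", "O2 [cytosol]", "H2O [cytosol]"],
    }
)
reactions = pd.DataFrame(
    {"r_isreversible": [False]},
    index=pd.Index(["R00000000"], name="r_id"),
)

formula = sbml_dfs_utils.construct_formula_string(rxn_species, reactions, "sc_name")
print(formula)  # 2 H2 [cytosol] + O2 [cytosol] -> 2 H2O [cytosol]
```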
+ def find_underspecified_reactions(
+     reaction_species_w_roles: pd.DataFrame,
+ ) -> set:
+
+     # check that both sbo_role and "new" are present
+     if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
          raise ValueError(
-             f'{N_invalid_ids} entries in "id_table" were missing',
-             "entries with no identifiers should still include an Identifiers object",
+             "The sbo_role column is not present in the reaction_species_w_roles table. Please call sbml_dfs_utils.add_sbo_role() first."
+         )
+     if "new" not in reaction_species_w_roles.columns:
+         raise ValueError(
+             "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
+         )
+     # check that new is a boolean column
+     if reaction_species_w_roles["new"].dtype != bool:
+         raise ValueError(
+             "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
          )

-     # Get the identifier as a list of dicts
-     df = id_table[id_var].apply(lambda x: x.ids if len(x.ids) > 0 else 0).to_frame()
-     # Filter out zero length lists
-     df = df.query(f"{id_var} != 0")
-     # Unnest the list of dicts into one dict per row
-     df = df.explode(id_var)
-     # Unnest the dict into a dataframe
-     df = pd.DataFrame(df[id_var].values.tolist(), index=df.index)
-     # Add the entry number as an index
-     df["entry"] = df.groupby(df.index).cumcount()
-     df.set_index("entry", append=True, inplace=True)
-     return df
+     reactions_with_lost_defining_members = set(
+         reaction_species_w_roles.query("~new")
+         .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
+         .tolist()
+     )

+     N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
+     if N_reactions_with_lost_defining_members > 0:
+         logger.info(
+             f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
+         )

- def id_formatter(id_values: Iterable[Any], id_type: str, id_len: int = 8) -> list[str]:
-     id_prefix = utils.extract_regex_match("^([a-zA-Z]+)_id$", id_type).upper()
-     return [id_prefix + format(x, f"0{id_len}d") for x in id_values]
+     # find the cases where all "new" values for a given (r_id, sbo_term) are False
+     reactions_with_lost_requirements = set(
+         reaction_species_w_roles
+         # drop already filtered reactions
+         .query("r_id not in @reactions_with_lost_defining_members")
+         .query("sbo_role == 'REQUIRED'")
+         # entries which have some required attribute but only False values for it
+         .groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
+         .agg({"new": "any"})
+         .query("new == False")
+         .index.get_level_values(SBML_DFS.R_ID)
+     )

+     N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
+     if N_reactions_with_lost_requirements > 0:
+         logger.info(
+             f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
+         )

- def id_formatter_inv(ids: list[str]) -> list[int]:
+     underspecified_reactions = reactions_with_lost_defining_members.union(
+         reactions_with_lost_requirements
+     )
+
+     return underspecified_reactions
+
+
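A sketch of the intended call pattern (not from the package source), assuming `sbml_dfs` is an SBML_dfs object and `kept` is a boolean mask marking which reaction species survive some upstream filter:

```python
from napistu import sbml_dfs_utils

rspecies = sbml_dfs_utils.add_sbo_role(sbml_dfs.reaction_species)
rspecies["new"] = kept  # bool column: species retained after filtering

# r_ids that lost a DEFINING species, or every member of a REQUIRED role
dropped = sbml_dfs_utils.find_underspecified_reactions(rspecies)
```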
+ def filter_to_characteristic_species_ids(
+     species_ids: pd.DataFrame,
+     max_complex_size: int = 4,
+     max_promiscuity: int = 20,
+     defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
+ ) -> pd.DataFrame:
      """
-     ID Formatter Inverter
+     Filter to Characteristic Species IDs
+
+     Remove identifiers corresponding to one component within large protein
+     complexes and non-characteristic annotations such as pubmed references and
+     homologues.
+
+     Parameters
+     ----------
+     species_ids: pd.DataFrame
+         A table of identifiers produced by sbml_dfs.get_identifiers("species")
+     max_complex_size: int
+         The largest size of a complex, where BQB_HAS_PART terms will be retained.
+         In most cases, complexes are handled with specific formation and
+         dissolution reactions, but these identifiers will be pulled in when
+         searching by identifiers or searching the identifiers associated with a
+         species against an external resource such as Open Targets.
+     max_promiscuity: int
+         Maximum number of species where a single molecule can act as a
+         BQB_HAS_PART component associated with a single identifier (and common ontology).
+     defining_biological_qualifiers (list[str]):
+         BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
+         permissive settings would include homologs, different forms of the same gene.
+
+     Returns:
+     --------
+     species_id: pd.DataFrame
+         Input species filtered to characteristic identifiers

-     Convert from internal IDs back to integer IDs
      """

-     id_val = list()
-     for an_id in ids:
-         if re.match("^[A-Z]+[0-9]+$", an_id):
-             id_val.append(int(re.sub("^[A-Z]+", "", an_id)))
-         else:
-             id_val.append(np.nan)  # type: ignore
+     if not isinstance(species_ids, pd.DataFrame):
+         raise TypeError(
+             f"species_ids was a {type(species_ids)} but must be a pd.DataFrame"
+         )

-     return id_val
+     if not isinstance(max_complex_size, int):
+         raise TypeError(
+             f"max_complex_size was a {type(max_complex_size)} but must be an int"
+         )
+
+     if not isinstance(max_promiscuity, int):
+         raise TypeError(
+             f"max_promiscuity was a {type(max_promiscuity)} but must be an int"
+         )
+
+     if not isinstance(defining_biological_qualifiers, list):
+         raise TypeError(
+             f"defining_biological_qualifiers was a {type(defining_biological_qualifiers)} but must be a list"
+         )
+
+     # primary annotations of a species
+     bqb_is_species = species_ids.query("bqb in @defining_biological_qualifiers")
+
+     # add components within modestly sized protein complexes
+     # look at HAS_PART IDs
+     bqb_has_parts_species = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]
+
+     # number of species in a complex
+     n_species_components = bqb_has_parts_species.value_counts(
+         [IDENTIFIERS.ONTOLOGY, SBML_DFS.S_ID]
+     )
+     big_complex_sids = set(
+         n_species_components[
+             n_species_components > max_complex_size
+         ].index.get_level_values(SBML_DFS.S_ID)
+     )
+
+     filtered_bqb_has_parts = _filter_promiscuous_components(
+         bqb_has_parts_species, max_promiscuity
+     )
+
+     # drop species parts if there are many components
+     filtered_bqb_has_parts = filtered_bqb_has_parts[
+         ~filtered_bqb_has_parts[SBML_DFS.S_ID].isin(big_complex_sids)
+     ]
+
+     # combine primary identifiers and rare components
+     characteristic_species_ids = pd.concat(
+         [
+             bqb_is_species,
+             filtered_bqb_has_parts,
+         ]
+     )
+
+     return characteristic_species_ids
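Sketch of typical use against an SBML_dfs object (not from the package source; argument values mirror the defaults):

```python
from napistu import sbml_dfs_utils

species_ids = sbml_dfs.get_identifiers("species")
characteristic = sbml_dfs_utils.filter_to_characteristic_species_ids(
    species_ids,
    max_complex_size=4,   # keep BQB_HAS_PART members of small complexes
    max_promiscuity=20,   # drop components shared across >20 species
)
```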


  def get_current_max_id(sbml_dfs_table: pd.DataFrame) -> int:
@@ -118,57 +394,26 @@ def get_current_max_id(sbml_dfs_table: pd.DataFrame) -> int:
      return current_max_id


- def adapt_pw_index(
-     source: str | indices.PWIndex,
-     species: str | Iterable[str] | None,
-     outdir: str | None = None,
- ) -> indices.PWIndex:
-     """Adapts a pw_index
-
-     Helpful to filter for species before reconstructing.
+ def id_formatter(id_values: Iterable[Any], id_type: str, id_len: int = 8) -> list[str]:
+     id_prefix = utils.extract_regex_match("^([a-zA-Z]+)_id$", id_type).upper()
+     return [id_prefix + format(x, f"0{id_len}d") for x in id_values]

-     Args:
-         source (str | PWIndex): uri for pw_index.csv file or PWIndex object
-         species (str):
-         outdir (str | None, optional): Optional directory to write pw_index to.
-             Defaults to None.

-     Returns:
-         indices.PWIndex: Filtered pw index
+ def id_formatter_inv(ids: list[str]) -> list[int]:
      """
-     if isinstance(source, str):
-         pw_index = indices.PWIndex(source)
-     elif isinstance(source, indices.PWIndex):
-         pw_index = copy.deepcopy(source)
-     else:
-         raise ValueError("'source' needs to be str or PWIndex.")
-     pw_index.filter(species=species)
-
-     if outdir is not None:
-         with open_fs(outdir, create=True) as fs:
-             with fs.open("pw_index.tsv", "w") as f:
-                 pw_index.index.to_csv(f, sep="\t")
-     return pw_index
+     ID Formatter Inverter

+     Convert from internal IDs back to integer IDs
+     """

- def _dogmatic_to_defining_bqbs(dogmatic: bool = False) -> str:
-     if dogmatic:
-         logger.info(
-             "Running in dogmatic mode - differences genes, transcripts, and proteins will "
-             "try to be maintained as separate species."
-         )
-         # preserve differences between genes, transcripts, and proteins
-         defining_biological_qualifiers = BQB_DEFINING_ATTRS
-     else:
-         logger.info(
-             "Running in non-dogmatic mode - genes, transcripts, and proteins will "
-             "be merged if possible."
-         )
-         # merge genes, transcripts, and proteins (if they are defined with
-         # bqb terms which specify their relationships).
-         defining_biological_qualifiers = BQB_DEFINING_ATTRS_LOOSE
+     id_val = list()
+     for an_id in ids:
+         if re.match("^[A-Z]+[0-9]+$", an_id):
+             id_val.append(int(re.sub("^[A-Z]+", "", an_id)))
+         else:
+             id_val.append(np.nan)  # type: ignore

-     return defining_biological_qualifiers
+     return id_val
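The two formatters are inverses of one another; a quick round trip (not part of the diff):

```python
from napistu.sbml_dfs_utils import id_formatter, id_formatter_inv

sids = id_formatter(range(3), "s_id")  # ['S00000000', 'S00000001', 'S00000002']
id_formatter_inv(sids)                 # [0, 1, 2]
```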


  def match_entitydata_index_to_entity(
@@ -200,7 +445,7 @@ def match_entitydata_index_to_entity(
      if len(entity_data_df.index.difference(consensus_entity_df.index)) == 0:
          logger.info(f"{data_table} ids are included in {table} ids")
      else:
-         logger.warnning(
+         logger.warning(
              f"{data_table} have ids are not matched to {table} ids,"
              f"please check mismatched ids first"
          )
@@ -229,79 +474,169 @@ def match_entitydata_index_to_entity(
      return entity_data_df


- def check_entity_data_index_matching(sbml_dfs, table):
-     """
-     Update the input smbl_dfs's entity_data (dict) index
-     with match_entitydata_index_to_entity,
-     so that index for dataframe(s) in entity_data (dict) matches the sbml_dfs'
-     corresponding entity, and then passes sbml_dfs.validate()
-     Args
-         sbml_dfs (cpr.SBML_dfs): a cpr.SBML_dfs
-         table (str): table whose data is being consolidates (currently species or reactions)
-     Returns
-         sbml_dfs (cpr.SBML_dfs):
-             sbml_dfs whose entity_data is checked to have the same index
-             as the corresponding entity.
-     """
+ def species_type_types(x):
+     """Assign a high-level molecule type to a molecular species"""

-     table_data = table + "_data"
+     if isinstance(x, identifiers.Identifiers):
+         if x.filter(["chebi"]):
+             return "metabolite"
+         elif x.filter(["molodex"]):
+             return "drug"
+         else:
+             return "protein"
+     else:
+         return "unknown"
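A sketch applying `species_type_types` across a species table (not from the package source), assuming the species identifiers column holds `identifiers.Identifiers` objects; anything else maps to "unknown":

```python
from napistu import sbml_dfs_utils

species_types = sbml_dfs.species["s_Identifiers"].apply(
    sbml_dfs_utils.species_type_types
)
# values are "metabolite", "drug", "protein", or "unknown"
```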

-     entity_data_dict = getattr(sbml_dfs, table_data)
-     entity_schema = sbml_dfs.schema[table]
-     sbml_dfs_entity = getattr(sbml_dfs, table)

-     if entity_data_dict != {}:
-         entity_data_types = set.union(set(entity_data_dict.keys()))
+ def stub_compartments(
+     stubbed_compartment: str = GENERIC_COMPARTMENT,
+ ) -> pd.DataFrame:
+     """Stub Compartments

-         entity_data_dict_checked = {
-             x: match_entitydata_index_to_entity(
-                 entity_data_dict, x, sbml_dfs_entity, entity_schema, table
-             )
-             for x in entity_data_types
+     Create a compartments table with only a single compartment
+
+     Args:
+         stubbed_compartment (str): the name of a compartment which should match the
+             keys in ingestion.constants.VALID_COMPARTMENTS and ingestion.constants.COMPARTMENTS_GO_TERMS
+
+     Returns:
+         compartments_df (pd.DataFrame): compartments dataframe
+     """
+
+     if stubbed_compartment not in VALID_COMPARTMENTS:
+         raise ValueError(
+             f"{stubbed_compartment} is not defined in ingestion.constants.VALID_COMPARTMENTS"
+         )
+
+     if stubbed_compartment not in COMPARTMENTS_GO_TERMS.keys():
+         raise ValueError(
+             f"{stubbed_compartment} is not defined in ingestion.constants.COMPARTMENTS_GO_TERMS"
+         )
+
+     stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]
+
+     formatted_uri = identifiers.format_uri(
+         uri=identifiers.create_uri_url(
+             ontology=ONTOLOGIES.GO,
+             identifier=stubbed_compartment_id,
+         ),
+         biological_qualifier_type=BQB.IS,
+     )
+
+     compartments_df = pd.DataFrame(
+         {
+             SBML_DFS.C_NAME: [stubbed_compartment],
+             SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
          }
+     )
+     compartments_df.index = id_formatter([0], SBML_DFS.C_ID)  # type: ignore
+     compartments_df.index.name = SBML_DFS.C_ID

-         if table == SBML_DFS.REACTIONS:
-             sbml_dfs.reactions_data = entity_data_dict_checked
-         elif table == SBML_DFS.SPECIES:
-             sbml_dfs.species_data = entity_data_dict_checked
+     return compartments_df

-     return sbml_dfs
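Sketch (not part of the diff): the default stub builds a single GO-annotated generic compartment.

```python
from napistu import sbml_dfs_utils

compartments_df = sbml_dfs_utils.stub_compartments()
# one row, indexed by a generated c_id, holding the compartment name and a
# GO-backed Identifiers object in the compartment identifiers column
```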

+ def unnest_identifiers(id_table: pd.DataFrame, id_var: str) -> pd.DataFrame:
+     """
+     Unnest Identifiers
+
+     Take a pd.DataFrame containing an array of Identifiers and
+     return one row per identifier.
+
+     Parameters:
+     id_table: pd.DataFrame
+         a table containing an array of Identifiers
+     id_var: str
+         variable containing Identifiers
+
+     Returns:
+     pd.DataFrame containing the index of id_table but expanded
+     to include one row per identifier

- def get_characteristic_species_ids(
-     sbml_dfs: sbml_dfs_core.SBML_dfs, dogmatic: bool = True
- ) -> pd.DataFrame:
      """
-     Get Characteristic Species IDs

-     List the systematic identifiers which are characteristic of molecular species, e.g., excluding subcomponents, and optionally, treating proteins, transcripts, and genes equiavlently.
+     # validate inputs
+     utils.match_pd_vars(id_table, {id_var}).assert_present()
+
+     N_invalid_ids = sum(id_table[id_var].isna())
+     if N_invalid_ids != 0:
+         raise ValueError(
+             f'{N_invalid_ids} entries in "id_table" were missing',
+             "entries with no identifiers should still include an Identifiers object",
+         )
+
+     # Get the identifier as a list of dicts
+     df = id_table[id_var].apply(lambda x: x.ids if len(x.ids) > 0 else 0).to_frame()
+     # Filter out zero length lists
+     df = df.query(f"{id_var} != 0")
+     # Unnest the list of dicts into one dict per row
+     df = df.explode(id_var)
+     # Unnest the dict into a dataframe
+     df = pd.DataFrame(df[id_var].values.tolist(), index=df.index)
+     # Add the entry number as an index
+     df["entry"] = df.groupby(df.index).cumcount()
+     df.set_index("entry", append=True, inplace=True)
+     return df
+
+
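Sketch of unnesting a species identifier column (not from the package source); the result is indexed by (s_id, entry) with one row per identifier:

```python
from napistu import sbml_dfs_utils

id_table = sbml_dfs_utils.unnest_identifiers(sbml_dfs.species, "s_Identifiers")
# columns are the Identifiers fields, e.g. ontology / identifier / bqb
```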
+ def validate_sbml_dfs_table(table_data: pd.DataFrame, table_name: str) -> None:
+     """
+     Validate a standalone table against the SBML_dfs schema.
+
+     This function validates a table against the schema defined in SBML_DFS_SCHEMA,
+     without requiring an SBML_dfs object. Useful for validating tables before
+     creating an SBML_dfs object.

      Parameters
      ----------
-     sbml_dfs : sbml_dfs_core.SBML_dfs
-         The SBML_dfs object.
-     dogmatic : bool, default=True
-         Whether to use the dogmatic flag to determine which BQB attributes are valid.
+     table_data : pd.DataFrame
+         The table to validate
+     table_name : str
+         Name of the table in the SBML_dfs schema
+
+     Raises
+     ------
+     ValueError
+         If table_name is not in schema or validation fails
+     """
+     if table_name not in SBML_DFS_SCHEMA.SCHEMA:
+         raise ValueError(
+             f"{table_name} is not a valid table name in SBML_DFS_SCHEMA. "
+             f"Valid tables are: {', '.join(SBML_DFS_SCHEMA.SCHEMA.keys())}"
+         )

-     Returns
-     -------
-     pd.DataFrame
-         A DataFrame containing the systematic identifiers which are characteristic of molecular species.
+     table_schema = SBML_DFS_SCHEMA.SCHEMA[table_name]
+     _perform_sbml_dfs_table_validation(table_data, table_schema, table_name)
+
+
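Sketch (not part of the diff): validating a standalone table before assembling an SBML_dfs object; a bad table name or a schema violation raises ValueError.

```python
from napistu import sbml_dfs_utils

# e.g. the one-row table produced by stub_compartments() above
sbml_dfs_utils.validate_sbml_dfs_table(compartments_df, "compartments")
```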
+ # =============================================================================
+ # PRIVATE FUNCTIONS (ALPHABETICAL ORDER)
+ # =============================================================================
+
+
+ def _add_stoi_to_species_name(stoi: float | int, name: str) -> str:
      """
+     Add Stoi To Species Name

-     # select valid BQB attributes based on dogmatic flag
-     defining_biological_qualifiers = _dogmatic_to_defining_bqbs(dogmatic)
+     Add # of molecules to a species name

-     # pre-summarize ontologies
-     species_identifiers = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
+     Parameters:
+     ----------
+     stoi: float or int
+         Number of molecules
+     name: str
+         Name of species

-     # drop some BQB_HAS_PART annotations
-     species_identifiers = sbml_dfs_core.filter_to_characteristic_species_ids(
-         species_identifiers,
-         defining_biological_qualifiers=defining_biological_qualifiers,
-     )
+     Returns:
+     ----------
+     name: str
+         Name containing number of species
+
+     """

-     return species_identifiers
+     if stoi in [-1, 0, 1]:
+         return name
+     else:
+         return str(abs(stoi)) + " " + name


  def _dogmatic_to_defining_bqbs(dogmatic: bool = False) -> str:
@@ -325,8 +660,458 @@ def _dogmatic_to_defining_bqbs(dogmatic: bool = False) -> str:
      return defining_biological_qualifiers


- def _stub_ids(ids):
-     """Stub with a blank ID if an ids list is blank; otherwise create an Identifiers object from the provided ids"""
+ def _edgelist_create_compartmentalized_species(
+     interaction_edgelist, species_df, compartments_df, interaction_source
+ ):
+     """
+     Create compartmentalized species from interactions.
+
+     Parameters
+     ----------
+     interaction_edgelist : pd.DataFrame
+         Interaction data containing species-compartment combinations
+     species_df : pd.DataFrame
+         Processed species data with IDs
+     compartments_df : pd.DataFrame
+         Processed compartments data with IDs
+     interaction_source : source.Source
+         Source object to assign to compartmentalized species
+
+     Returns
+     -------
+     pd.DataFrame
+         Compartmentalized species with formatted names and IDs
+     """
+     # Get all distinct upstream and downstream compartmentalized species
+     comp_species = pd.concat(
+         [
+             interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
+                 {
+                     "upstream_name": SBML_DFS.S_NAME,
+                     "upstream_compartment": SBML_DFS.C_NAME,
+                 },
+                 axis=1,
+             ),
+             interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
+                 {
+                     "downstream_name": SBML_DFS.S_NAME,
+                     "downstream_compartment": SBML_DFS.C_NAME,
+                 },
+                 axis=1,
+             ),
+         ]
+     ).drop_duplicates()
+
+     # Add species and compartment IDs
+     comp_species_w_ids = comp_species.merge(
+         species_df[SBML_DFS.S_NAME].reset_index(), how="left", on=SBML_DFS.S_NAME
+     ).merge(
+         compartments_df[SBML_DFS.C_NAME].reset_index(), how="left", on=SBML_DFS.C_NAME
+     )
+
+     # Validate merge was successful
+     _sbml_dfs_from_edgelist_check_cspecies_merge(comp_species_w_ids, comp_species)
+
+     # Format compartmentalized species with names, source, and IDs
+     comp_species_w_ids[SBML_DFS.SC_NAME] = [
+         f"{s} [{c}]"
+         for s, c in zip(
+             comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
+         )
+     ]
+     comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
+     comp_species_w_ids[SBML_DFS.SC_ID] = id_formatter(
+         range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
+     )
+
+     return comp_species_w_ids.set_index(SBML_DFS.SC_ID)[
+         [SBML_DFS.SC_NAME, SBML_DFS.S_ID, SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
+     ]
+
+
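For orientation, a guess at the minimal input shapes these `_edgelist_*` helpers consume (not from the package source; the authoritative required column set lives in `INTERACTION_EDGELIST_EXPECTED_VARS`, which is not shown in this diff, and column capitalization here is illustrative):

```python
import pandas as pd

interaction_edgelist = pd.DataFrame(
    {
        "upstream_name": ["TP53"],
        "downstream_name": ["MDM2"],
        "upstream_compartment": ["nucleoplasm"],
        "downstream_compartment": ["nucleoplasm"],
        "sbo_term": ["SBO:0000459"],  # upstream role, e.g. stimulator
    }
)
# species and compartments carry names plus identifiers.Identifiers objects
species_df = pd.DataFrame({"s_name": ["TP53", "MDM2"], "s_Identifiers": [None, None]})
compartments_df = pd.DataFrame({"c_name": ["nucleoplasm"], "c_Identifiers": [None]})
```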
+ def _edgelist_create_reactions_and_species(
+     interaction_edgelist,
+     comp_species,
+     species_df,
+     compartments_df,
+     interaction_source,
+     upstream_stoichiometry,
+     downstream_stoichiometry,
+     downstream_sbo_name,
+     extra_reactions_columns,
+ ):
+     """
+     Create reactions and reaction species from interactions.
+
+     Parameters
+     ----------
+     interaction_edgelist : pd.DataFrame
+         Original interaction data
+     comp_species : pd.DataFrame
+         Compartmentalized species with IDs
+     species_df : pd.DataFrame
+         Processed species data with IDs
+     compartments_df : pd.DataFrame
+         Processed compartments data with IDs
+     interaction_source : source.Source
+         Source object for reactions
+     upstream_stoichiometry : int
+         Stoichiometry for upstream species
+     downstream_stoichiometry : int
+         Stoichiometry for downstream species
+     downstream_sbo_name : str
+         SBO term name for downstream species
+     extra_reactions_columns : list
+         Names of extra columns to preserve
+
+     Returns
+     -------
+     tuple
+         (reactions_df, reaction_species_df, reactions_data)
+     """
+     # Add compartmentalized species IDs to interactions
+     comp_species_w_names = (
+         comp_species.reset_index()
+         .merge(species_df[SBML_DFS.S_NAME].reset_index())
+         .merge(compartments_df[SBML_DFS.C_NAME].reset_index())
+     )
+
+     interaction_w_cspecies = interaction_edgelist.merge(
+         comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
+             {
+                 SBML_DFS.SC_ID: "sc_id_up",
+                 SBML_DFS.S_NAME: "upstream_name",
+                 SBML_DFS.C_NAME: "upstream_compartment",
+             },
+             axis=1,
+         ),
+         how="left",
+     ).merge(
+         comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
+             {
+                 SBML_DFS.SC_ID: "sc_id_down",
+                 SBML_DFS.S_NAME: "downstream_name",
+                 SBML_DFS.C_NAME: "downstream_compartment",
+             },
+             axis=1,
+         ),
+         how="left",
+     )[
+         REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
+     ]
+
+     # Validate merge didn't create duplicates
+     if interaction_edgelist.shape[0] != interaction_w_cspecies.shape[0]:
+         raise ValueError(
+             f"Merging compartmentalized species resulted in row count change "
+             f"from {interaction_edgelist.shape[0]} to {interaction_w_cspecies.shape[0]}"
+         )
+
+     # Create reaction IDs FIRST - before using them
+     interaction_w_cspecies[SBML_DFS.R_ID] = id_formatter(
+         range(interaction_w_cspecies.shape[0]), SBML_DFS.R_ID
+     )
+
+     # Create reactions DataFrame
+     interactions_copy = interaction_w_cspecies.copy()
+     interactions_copy[SBML_DFS.R_SOURCE] = interaction_source
+
+     reactions_columns = [
+         SBML_DFS.R_NAME,
+         SBML_DFS.R_IDENTIFIERS,
+         SBML_DFS.R_SOURCE,
+         SBML_DFS.R_ISREVERSIBLE,
+     ]
+
+     reactions_df = interactions_copy.set_index(SBML_DFS.R_ID)[
+         reactions_columns + extra_reactions_columns
+     ]
+
+     # Separate extra data
+     reactions_data = reactions_df[extra_reactions_columns]
+     reactions_df = reactions_df[reactions_columns]
+
+     # Create reaction species relationships - NOW r_id exists
+     reaction_species_df = pd.concat(
+         [
+             # Upstream species (modifiers/stimulators/inhibitors)
+             interaction_w_cspecies[["sc_id_up", "sbo_term", SBML_DFS.R_ID]]
+             .assign(stoichiometry=upstream_stoichiometry)
+             .rename({"sc_id_up": "sc_id"}, axis=1),
+             # Downstream species (products)
+             interaction_w_cspecies[["sc_id_down", SBML_DFS.R_ID]]
+             .assign(
+                 stoichiometry=downstream_stoichiometry,
+                 sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
+             )
+             .rename({"sc_id_down": "sc_id"}, axis=1),
+         ]
+     )
+
+     reaction_species_df["rsc_id"] = id_formatter(
+         range(reaction_species_df.shape[0]), "rsc_id"
+     )
+
+     reaction_species_df = reaction_species_df.set_index("rsc_id")
+
+     return reactions_df, reaction_species_df, reactions_data
+
+
+ def _edgelist_identify_extra_columns(
+     interaction_edgelist, species_df, keep_reactions_data, keep_species_data
+ ):
+     """
+     Identify extra columns in input data that should be preserved.
+
+     Parameters
+     ----------
+     interaction_edgelist : pd.DataFrame
+         Interaction data containing potential extra columns
+     species_df : pd.DataFrame
+         Species data containing potential extra columns
+     keep_reactions_data : bool or str
+         Whether to keep extra reaction columns
+     keep_species_data : bool or str
+         Whether to keep extra species columns
+
+     Returns
+     -------
+     dict
+         Dictionary with 'reactions' and 'species' keys containing lists of extra column names
+     """
+     extra_reactions_columns = []
+     extra_species_columns = []
+
+     if keep_reactions_data is not False:
+         extra_reactions_columns = [
+             c
+             for c in interaction_edgelist.columns
+             if c not in INTERACTION_EDGELIST_EXPECTED_VARS
+         ]
+
+     if keep_species_data is not False:
+         extra_species_columns = [
+             c
+             for c in species_df.columns
+             if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
+         ]
+
+     return {"reactions": extra_reactions_columns, "species": extra_species_columns}
+
+
+ def _edgelist_process_compartments(compartments_df, interaction_source):
+     """
+     Format compartments DataFrame with source and ID columns.
+
+     Parameters
+     ----------
+     compartments_df : pd.DataFrame
+         Raw compartments data
+     interaction_source : source.Source
+         Source object to assign to compartments
+
+     Returns
+     -------
+     pd.DataFrame
+         Processed compartments with IDs, indexed by compartment ID
+     """
+     compartments = compartments_df.copy()
+     compartments[SBML_DFS.C_SOURCE] = interaction_source
+     compartments[SBML_DFS.C_ID] = id_formatter(
+         range(compartments.shape[0]), SBML_DFS.C_ID
+     )
+     return compartments.set_index(SBML_DFS.C_ID)[
+         [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
+     ]
+
+
+ def _edgelist_process_species(species_df, interaction_source, extra_species_columns):
+     """
+     Format species DataFrame and extract extra data.
+
+     Parameters
+     ----------
+     species_df : pd.DataFrame
+         Raw species data
+     interaction_source : source.Source
+         Source object to assign to species
+     extra_species_columns : list
+         Names of extra columns to preserve separately
+
+     Returns
+     -------
+     tuple of pd.DataFrame
+         Processed species DataFrame and species extra data DataFrame
+     """
+     species = species_df.copy()
+     species[SBML_DFS.S_SOURCE] = interaction_source
+     species[SBML_DFS.S_ID] = id_formatter(range(species.shape[0]), SBML_DFS.S_ID)
+
+     required_cols = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]
+     species_indexed = species.set_index(SBML_DFS.S_ID)[
+         required_cols + extra_species_columns
+     ]
+
+     # Separate extra data from main species table
+     species_data = species_indexed[extra_species_columns]
+     processed_species = species_indexed[required_cols]
+
+     return processed_species, species_data
+
+
+ def _edgelist_validate_inputs(
+     interaction_edgelist: pd.DataFrame,
+     species_df: pd.DataFrame,
+     compartments_df: pd.DataFrame,
+ ) -> None:
+     """
+     Validate input DataFrames have required columns.
+
+     Parameters
+     ----------
+     interaction_edgelist : pd.DataFrame
+         Interaction data to validate
+     species_df : pd.DataFrame
+         Species data to validate
+     compartments_df : pd.DataFrame
+         Compartments data to validate
+     """
+
+     # check compartments
+     compartments_df_expected_vars = {SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS}
+     compartments_df_columns = set(compartments_df.columns.tolist())
+     missing_required_fields = compartments_df_expected_vars.difference(
+         compartments_df_columns
+     )
+     if len(missing_required_fields) > 0:
+         raise ValueError(
+             f"{', '.join(missing_required_fields)} are required variables"
+             ' in "compartments_df" but were not present in the input file.'
+         )
+
+     # check species
+     species_df_expected_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
+     species_df_columns = set(species_df.columns.tolist())
+     missing_required_fields = species_df_expected_vars.difference(species_df_columns)
+     if len(missing_required_fields) > 0:
+         raise ValueError(
+             f"{', '.join(missing_required_fields)} are required"
+             ' variables in "species_df" but were not present '
+             "in the input file."
+         )
+
+     # check interactions
+     interaction_edgelist_columns = set(interaction_edgelist.columns.tolist())
+     missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
+         interaction_edgelist_columns
+     )
+     if len(missing_required_fields) > 0:
+         raise ValueError(
+             f"{', '.join(missing_required_fields)} are required "
+             'variables in "interaction_edgelist" but were not '
+             "present in the input file."
+         )
+
+     return None
+
+
+ def _filter_promiscuous_components(
+     bqb_has_parts_species: pd.DataFrame, max_promiscuity: int
+ ) -> pd.DataFrame:
+
+     # number of complexes a species is part of
+     n_complexes_involvedin = bqb_has_parts_species.value_counts(
+         [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+     )
+     promiscuous_component_identifiers_index = n_complexes_involvedin[
+         n_complexes_involvedin > max_promiscuity
+     ].index
+     promiscuous_component_identifiers = pd.Series(
+         data=[True] * len(promiscuous_component_identifiers_index),
+         index=promiscuous_component_identifiers_index,
+         name="is_shared_component",
+         dtype=bool,
+     )
+
+     if len(promiscuous_component_identifiers) == 0:
+         return bqb_has_parts_species
+
+     filtered_bqb_has_parts = bqb_has_parts_species.merge(
+         promiscuous_component_identifiers,
+         left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER],
+         right_index=True,
+         how="left",
+     )
+
+     filtered_bqb_has_parts["is_shared_component"] = (
+         filtered_bqb_has_parts["is_shared_component"].astype("boolean").fillna(False)
+     )
+     # drop identifiers shared as components across many species
+     filtered_bqb_has_parts = filtered_bqb_has_parts[
+         ~filtered_bqb_has_parts["is_shared_component"]
+     ].drop(["is_shared_component"], axis=1)
+
+     return filtered_bqb_has_parts
+
+
+ def _find_underspecified_reactions(
+     reaction_species_w_roles: pd.DataFrame,
+ ) -> set:
+
+     # check that both sbo_role and "new" are present
+     if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
+         raise ValueError(
+             "The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
+         )
+     if "new" not in reaction_species_w_roles.columns:
+         raise ValueError(
+             "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
+         )
+     # check that new is a boolean column
+     if reaction_species_w_roles["new"].dtype != bool:
+         raise ValueError(
+             "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
+         )
+
+     reactions_with_lost_defining_members = set(
+         reaction_species_w_roles.query("~new")
+         .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
+         .tolist()
+     )
+
+     N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
+     if N_reactions_with_lost_defining_members > 0:
+         logger.info(
+             f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
+         )
+
+     # find the cases where all "new" values for a given (r_id, sbo_term) are False
+     reactions_with_lost_requirements = set(
+         reaction_species_w_roles
+         # drop already filtered reactions
+         .query("r_id not in @reactions_with_lost_defining_members")
+         .query("sbo_role == 'REQUIRED'")
+         # entries which have some required attribute but only False values for it
+         .groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
+         .agg({"new": "any"})
+         .query("new == False")
+         .index.get_level_values(SBML_DFS.R_ID)
+     )
+
+     N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
+     if N_reactions_with_lost_requirements > 0:
+         logger.info(
+             f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
+         )
+
+     underspecified_reactions = reactions_with_lost_defining_members.union(
+         reactions_with_lost_requirements
+     )
+
+     return underspecified_reactions
+
+
+ def _id_dict_to_df(ids):
      if len(ids) == 0:
          return pd.DataFrame(
              {
@@ -338,3 +1123,157 @@ def _stub_ids(ids):
          )
      else:
          return pd.DataFrame(ids)
+
+
+ def _perform_sbml_dfs_table_validation(
+     table_data: pd.DataFrame,
+     table_schema: dict,
+     table_name: str,
+ ) -> None:
+     """
+     Core validation logic for SBML_dfs tables.
+
+     This function performs the actual validation checks for any table against its schema,
+     regardless of whether it's part of an SBML_dfs object or standalone.
+
+     Parameters
+     ----------
+     table_data : pd.DataFrame
+         The table data to validate
+     table_schema : dict
+         Schema definition for the table
+     table_name : str
+         Name of the table (for error messages)
+
+     Raises
+     ------
+     ValueError
+         If the table does not conform to its schema:
+         - Not a DataFrame
+         - Wrong index name
+         - Duplicate primary keys
+         - Missing required variables
+         - Empty table
+     """
+     if not isinstance(table_data, pd.DataFrame):
+         raise ValueError(
+             f"{table_name} must be a pd.DataFrame, but was a {type(table_data)}"
+         )
+
+     # check index
+     expected_index_name = table_schema["pk"]
+     if table_data.index.name != expected_index_name:
+         raise ValueError(
+             f"the index name for {table_name} was not the pk: {expected_index_name}"
+         )
+
+     # check that all entries in the index are unique
+     if len(set(table_data.index.tolist())) != table_data.shape[0]:
+         duplicated_pks = table_data.index.value_counts()
+         duplicated_pks = duplicated_pks[duplicated_pks > 1]
+
+         example_duplicates = duplicated_pks.index[0 : min(duplicated_pks.shape[0], 5)]
+         raise ValueError(
+             f"{duplicated_pks.shape[0]} primary keys were duplicated "
+             f"including {', '.join(example_duplicates)}"
+         )
+
+     # check variables
+     expected_vars = set(table_schema["vars"])
+     table_vars = set(list(table_data.columns))
+
+     extra_vars = table_vars.difference(expected_vars)
+     if len(extra_vars) != 0:
+         logger.debug(
+             f"{len(extra_vars)} extra variables were found for {table_name}: "
+             f"{', '.join(extra_vars)}"
+         )
+
+     missing_vars = expected_vars.difference(table_vars)
+     if len(missing_vars) != 0:
+         raise ValueError(
+             f"Missing {len(missing_vars)} required variables for {table_name}: "
+             f"{', '.join(missing_vars)}"
+         )
+
+     # check for empty table
+     if table_data.shape[0] == 0:
+         raise ValueError(f"{table_name} contained no entries")
+
+
+ def _sbml_dfs_from_edgelist_check_cspecies_merge(
+     merged_species: pd.DataFrame, original_species: pd.DataFrame
+ ) -> None:
+     """Check for a mismatch between the provided species data and species implied by the edgelist."""
+
+     # check for 1-many merge
+     if merged_species.shape[0] != original_species.shape[0]:
+         raise ValueError(
+             "Merging compartmentalized species to species_df"
+             " and compartments_df by names resulted in an "
+             f"increase in the tables from {original_species.shape[0]}"
+             f" to {merged_species.shape[0]} indicating that names were"
+             " not unique"
+         )
+
+     # check for missing species and compartments
+     missing_compartments = merged_species[merged_species[SBML_DFS.C_ID].isna()][
+         SBML_DFS.C_NAME
+     ].unique()
+     if len(missing_compartments) >= 1:
+         raise ValueError(
+             f"{len(missing_compartments)} compartments were present in"
+             ' "interaction_edgelist" but not "compartments_df":'
+             f" {', '.join(missing_compartments)}"
+         )
+
+     missing_species = merged_species[merged_species[SBML_DFS.S_ID].isna()][
+         SBML_DFS.S_NAME
+     ].unique()
+     if len(missing_species) >= 1:
+         raise ValueError(
+             f"{len(missing_species)} species were present in "
+             '"interaction_edgelist" but not "species_df":'
+             f" {', '.join(missing_species)}"
+         )
+
+     return None
+
+
+ def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
+     """Validates a table against a reference
+
+     This checks that the table has the same index name, no duplicates in the index,
+     and that all values in the index are in the reference table.
+
+     Args:
+         data_table (pd.DataFrame): a table with data that should
+             match the reference
+         ref_table (pd.DataFrame): a reference table
+
+     Raises:
+         TypeError: data_table is not a pd.DataFrame
+         ValueError: not same index name
+         ValueError: index contains duplicates
+         ValueError: index not subset of index of reactions table
+     """
+     if not isinstance(data_table, pd.DataFrame):
+         raise TypeError(
+             f"The data table was type {type(data_table).__name__}"
+             " but must be a pd.DataFrame"
+         )
+     ref_index_name = ref_table.index.name
+     if data_table.index.name != ref_index_name:
+         raise ValueError(
+             "the index name for reaction data table was not"
+             f" {ref_index_name}: {data_table.index.name}"
+         )
+     ids = data_table.index
+     if any(ids.duplicated()):
+         raise ValueError(
+             "the index for reaction data table contained duplicate values"
+         )
+     if not all(ids.isin(ref_table.index)):
+         raise ValueError(
+             "the index for reaction data table contained values"
+             " not found in the reactions table"
+         )
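Sketch of how this validator is meant to be used (not from the package source): checking an annotation table against the entity table it describes.

```python
from napistu import sbml_dfs_utils

# Hypothetical: reactions_data must be r_id-indexed and a subset of reactions
sbml_dfs_utils._validate_matching_data(reactions_data, sbml_dfs.reactions)
```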