napistu 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. napistu/__main__.py +38 -27
  2. napistu/consensus.py +22 -27
  3. napistu/constants.py +91 -65
  4. napistu/context/filtering.py +2 -1
  5. napistu/identifiers.py +3 -6
  6. napistu/indices.py +3 -1
  7. napistu/ingestion/bigg.py +6 -6
  8. napistu/ingestion/sbml.py +298 -295
  9. napistu/ingestion/string.py +16 -19
  10. napistu/ingestion/trrust.py +22 -27
  11. napistu/ingestion/yeast.py +2 -1
  12. napistu/matching/interactions.py +4 -4
  13. napistu/matching/species.py +1 -1
  14. napistu/modify/uncompartmentalize.py +1 -1
  15. napistu/network/net_create.py +1 -1
  16. napistu/network/paths.py +1 -1
  17. napistu/ontologies/dogma.py +2 -1
  18. napistu/ontologies/genodexito.py +5 -1
  19. napistu/ontologies/renaming.py +4 -0
  20. napistu/sbml_dfs_core.py +1343 -2167
  21. napistu/sbml_dfs_utils.py +1086 -143
  22. napistu/utils.py +52 -41
  23. {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/METADATA +2 -2
  24. {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/RECORD +40 -40
  25. tests/conftest.py +113 -13
  26. tests/test_consensus.py +161 -4
  27. tests/test_context_filtering.py +2 -2
  28. tests/test_gaps.py +26 -15
  29. tests/test_network_net_create.py +1 -1
  30. tests/test_network_precompute.py +1 -1
  31. tests/test_ontologies_genodexito.py +3 -0
  32. tests/test_ontologies_mygene.py +3 -0
  33. tests/test_ontologies_renaming.py +28 -24
  34. tests/test_sbml_dfs_core.py +260 -211
  35. tests/test_sbml_dfs_utils.py +194 -36
  36. tests/test_utils.py +19 -0
  37. {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/WHEEL +0 -0
  38. {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/entry_points.txt +0 -0
  39. {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/licenses/LICENSE +0 -0
  40. {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/top_level.txt +0 -0
napistu/sbml_dfs_utils.py CHANGED
@@ -11,80 +11,356 @@ from fs import open_fs
  import numpy as np
  import pandas as pd
  from napistu import utils
+ from napistu import identifiers
  from napistu import indices

- from napistu import sbml_dfs_core
+ from napistu.constants import BQB
  from napistu.constants import SBML_DFS
+ from napistu.constants import SBML_DFS_SCHEMA
  from napistu.constants import IDENTIFIERS
  from napistu.constants import BQB_DEFINING_ATTRS
  from napistu.constants import BQB_DEFINING_ATTRS_LOOSE
+ from napistu.constants import REQUIRED_REACTION_FROMEDGELIST_COLUMNS
+ from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
+ from napistu.constants import SBO_ROLES_DEFS
+ from napistu.constants import MINI_SBO_FROM_NAME
+ from napistu.constants import MINI_SBO_TO_NAME
+ from napistu.constants import SBO_NAME_TO_ROLE
+ from napistu.constants import ONTOLOGIES
+ from napistu.ingestion.constants import VALID_COMPARTMENTS
+ from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
+ from napistu.ingestion.constants import GENERIC_COMPARTMENT

  logger = logging.getLogger(__name__)


- def unnest_identifiers(id_table: pd.DataFrame, id_var: str) -> pd.DataFrame:
+ # =============================================================================
+ # PUBLIC FUNCTIONS (ALPHABETICAL ORDER)
+ # =============================================================================
+
+
+ def adapt_pw_index(
+     source: str | indices.PWIndex,
+     species: str | Iterable[str] | None,
+     outdir: str | None = None,
+ ) -> indices.PWIndex:
+     """Adapts a pw_index
+
+     Helpful to filter for species before reconstructing.
+
+     Args:
+         source (str | PWIndex): uri for pw_index.csv file or PWIndex object
+         species (str):
+         outdir (str | None, optional): Optional directory to write pw_index to.
+             Defaults to None.
+
+     Returns:
+         indices.PWIndex: Filtered pw index
      """
-     Unnest Identifiers
+     if isinstance(source, str):
+         pw_index = indices.PWIndex(source)
+     elif isinstance(source, indices.PWIndex):
+         pw_index = copy.deepcopy(source)
+     else:
+         raise ValueError("'source' needs to be str or PWIndex.")
+     pw_index.filter(species=species)

-     Take a pd.DataFrame containing an array of Identifiers and
-     return one-row per identifier.
+     if outdir is not None:
+         with open_fs(outdir, create=True) as fs:
+             with fs.open("pw_index.tsv", "w") as f:
+                 pw_index.index.to_csv(f, sep="\t")
+     return pw_index
+
+
+ def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
+     """
+     Add an sbo_role column to the reaction_species table.
+
+     The sbo_role column is a string column that contains the SBO role of the reaction species.
+     The values in the sbo_role column are taken from the sbo_term column.
+
+     The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
+     """
+
+     validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
+
+     reaction_species = (
+         reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
+         .replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
+         .replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
+     )
+
+     undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
+         SBO_NAME_TO_ROLE.values()
+     )
+     if len(undefined_roles) > 0:
+         logger.warning(
+             f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
+         )
+         mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
+         reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
+
+     return reaction_species
+
+
+ def check_entity_data_index_matching(sbml_dfs, table):
+     """
+     Update the input sbml_dfs's entity_data (dict) index
+     with match_entitydata_index_to_entity,
+     so that the index for dataframe(s) in entity_data (dict) matches the sbml_dfs'
+     corresponding entity, and then passes sbml_dfs.validate()
+     Args
+         sbml_dfs (cpr.SBML_dfs): a cpr.SBML_dfs
+         table (str): table whose data is being consolidated (currently species or reactions)
+     Returns
+         sbml_dfs (cpr.SBML_dfs):
+             sbml_dfs whose entity_data is checked to have the same index
+             as the corresponding entity.
+     """
+
+     table_data = table + "_data"
+
+     entity_data_dict = getattr(sbml_dfs, table_data)
+     entity_schema = sbml_dfs.schema[table]
+     sbml_dfs_entity = getattr(sbml_dfs, table)
+
+     if entity_data_dict != {}:
+         entity_data_types = set.union(set(entity_data_dict.keys()))
+
+         entity_data_dict_checked = {
+             x: match_entitydata_index_to_entity(
+                 entity_data_dict, x, sbml_dfs_entity, entity_schema, table
+             )
+             for x in entity_data_types
+         }
+
+         if table == SBML_DFS.REACTIONS:
+             sbml_dfs.reactions_data = entity_data_dict_checked
+         elif table == SBML_DFS.SPECIES:
+             sbml_dfs.species_data = entity_data_dict_checked
+
+     return sbml_dfs
+
+
+ def construct_formula_string(
+     reaction_species_df: pd.DataFrame,
+     reactions_df: pd.DataFrame,
+     name_var: str,
+ ) -> str:
+     """
+     Construct Formula String
+
+     Convert a table of reaction species into a formula string

      Parameters:
-     id_table: pd.DataFrame
-         a table containing an array of Identifiers
-     id_var: str
-         variable containing Identifiers
+     ----------
+     reaction_species_df: pd.DataFrame
+         Table containing a reaction's species
+     reactions_df: pd.DataFrame
+         sbml.reactions
+     name_var: str
+         Name used to label species

      Returns:
-         pd.Dataframe containing the index of id_table but expanded
-         to include one row per identifier
+     ----------
+     formula_str: str
+         String representation of a reaction's substrates, products and
+         modifiers

      """

-     # validate inputs
-     utils.match_pd_vars(id_table, {id_var}).assert_present()
+     reaction_species_df["label"] = [
+         _add_stoi_to_species_name(x, y)
+         for x, y in zip(
+             reaction_species_df[SBML_DFS.STOICHIOMETRY], reaction_species_df[name_var]
+         )
+     ]
+
+     rxn_reversible = bool(
+         reactions_df.loc[
+             reaction_species_df[SBML_DFS.R_ID].iloc[0], SBML_DFS.R_ISREVERSIBLE
+         ]
+     )  # convert from a np.bool_ to bool if needed
+     if not isinstance(rxn_reversible, bool):
+         raise TypeError(
+             f"rxn_reversible must be a bool, but got {type(rxn_reversible).__name__}"
+         )

-     N_invalid_ids = sum(id_table[id_var].isna())
-     if N_invalid_ids != 0:
+     if rxn_reversible:
+         arrow_type = " <-> "
+     else:
+         arrow_type = " -> "
+
+     substrates = " + ".join(
+         reaction_species_df["label"][
+             reaction_species_df[SBML_DFS.STOICHIOMETRY] < 0
+         ].tolist()
+     )
+     products = " + ".join(
+         reaction_species_df["label"][
+             reaction_species_df[SBML_DFS.STOICHIOMETRY] > 0
+         ].tolist()
+     )
+     modifiers = " + ".join(
+         reaction_species_df["label"][
+             reaction_species_df[SBML_DFS.STOICHIOMETRY] == 0
+         ].tolist()
+     )
+     if modifiers != "":
+         modifiers = f" ---- modifiers: [{modifiers}]"
+
+     return f"{substrates}{arrow_type}{products}{modifiers}"
+
+
+ def find_underspecified_reactions(
+     reaction_species_w_roles: pd.DataFrame,
+ ) -> set[str]:
+
+     # check that both sbo_role and "new" are present
+     if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
          raise ValueError(
-             f'{N_invalid_ids} entries in "id_table" were missing',
-             "entries with no identifiers should still include an Identifiers object",
+             "The sbo_role column is not present in the reaction_species_w_roles table. Please call sbml_dfs_utils.add_sbo_role() first."
+         )
+     if "new" not in reaction_species_w_roles.columns:
+         raise ValueError(
+             "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
+         )
+     # check that new is a boolean column
+     if reaction_species_w_roles["new"].dtype != bool:
+         raise ValueError(
+             "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
          )

-     # Get the identifier as a list of dicts
-     df = id_table[id_var].apply(lambda x: x.ids if len(x.ids) > 0 else 0).to_frame()
-     # Filter out zero length lists
-     df = df.query(f"{id_var} != 0")
-     # Unnest the list of dicts into one dict per row
-     df = df.explode(id_var)
-     # Unnest the dict into a dataframe
-     df = pd.DataFrame(df[id_var].values.tolist(), index=df.index)
-     # Add the entry number as an index
-     df["entry"] = df.groupby(df.index).cumcount()
-     df.set_index("entry", append=True, inplace=True)
-     return df
+     reactions_with_lost_defining_members = set(
+         reaction_species_w_roles.query("~new")
+         .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
+         .tolist()
+     )

+     N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
+     if N_reactions_with_lost_defining_members > 0:
+         logger.info(
+             f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
+         )

- def id_formatter(id_values: Iterable[Any], id_type: str, id_len: int = 8) -> list[str]:
-     id_prefix = utils.extract_regex_match("^([a-zA-Z]+)_id$", id_type).upper()
-     return [id_prefix + format(x, f"0{id_len}d") for x in id_values]
+     # find the cases where all "new" values for a given (r_id, sbo_term) are False
+     reactions_with_lost_requirements = set(
+         reaction_species_w_roles
+         # drop already filtered reactions
+         .query("r_id not in @reactions_with_lost_defining_members")
+         .query("sbo_role == 'REQUIRED'")
+         # entries with a required attribute where all values for that attribute are False
+         .groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
+         .agg({"new": "any"})
+         .query("new == False")
+         .index.get_level_values(SBML_DFS.R_ID)
+     )

+     N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
+     if N_reactions_with_lost_requirements > 0:
+         logger.info(
+             f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
+         )

- def id_formatter_inv(ids: list[str]) -> list[int]:
+     underspecified_reactions = reactions_with_lost_defining_members.union(
+         reactions_with_lost_requirements
+     )
+
+     return underspecified_reactions
+
+
+ def filter_to_characteristic_species_ids(
+     species_ids: pd.DataFrame,
+     max_complex_size: int = 4,
+     max_promiscuity: int = 20,
+     defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
+ ) -> pd.DataFrame:
      """
-     ID Formatter Inverter
+     Filter to Characteristic Species IDs
+
+     Remove identifiers corresponding to one component within a large protein
+     complex and non-characteristic annotations such as pubmed references and
+     homologues.
+
+     Parameters
+     ----------
+     species_ids: pd.DataFrame
+         A table of identifiers produced by sbml_dfs.get_identifiers("species")
+     max_complex_size: int
+         The largest size of a complex, where BQB_HAS_PART terms will be retained.
+         In most cases, complexes are handled with specific formation and
+         dissolution reactions, but these identifiers will be pulled in when
+         searching by identifiers or searching the identifiers associated with a
+         species against an external resource such as Open Targets.
+     max_promiscuity: int
+         Maximum number of species where a single molecule can act as a
+         BQB_HAS_PART component associated with a single identifier (and common ontology).
+     defining_biological_qualifiers (list[str]):
+         BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
+         permissive settings would include homologs, different forms of the same gene.
+
+     Returns:
+     --------
+     species_id: pd.DataFrame
+         Input species filtered to characteristic identifiers

-     Convert from internal IDs back to integer IDs
      """

-     id_val = list()
-     for an_id in ids:
-         if re.match("^[A-Z]+[0-9]+$", an_id):
-             id_val.append(int(re.sub("^[A-Z]+", "", an_id)))
-         else:
-             id_val.append(np.nan)  # type: ignore
+     if not isinstance(species_ids, pd.DataFrame):
+         raise TypeError(
+             f"species_ids was a {type(species_ids)} but must be a pd.DataFrame"
+         )

-     return id_val
+     if not isinstance(max_complex_size, int):
+         raise TypeError(
+             f"max_complex_size was a {type(max_complex_size)} but must be an int"
+         )
+
+     if not isinstance(max_promiscuity, int):
+         raise TypeError(
+             f"max_promiscuity was a {type(max_promiscuity)} but must be an int"
+         )
+
+     if not isinstance(defining_biological_qualifiers, list):
+         raise TypeError(
+             f"defining_biological_qualifiers was a {type(defining_biological_qualifiers)} but must be a list"
+         )
+
+     # primary annotations of a species
+     bqb_is_species = species_ids.query("bqb in @defining_biological_qualifiers")
+
+     # add components within modestly sized protein complexes
+     # look at HAS_PART IDs
+     bqb_has_parts_species = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]
+
+     # number of species in a complex
+     n_species_components = bqb_has_parts_species.value_counts(
+         [IDENTIFIERS.ONTOLOGY, SBML_DFS.S_ID]
+     )
+     big_complex_sids = set(
+         n_species_components[
+             n_species_components > max_complex_size
+         ].index.get_level_values(SBML_DFS.S_ID)
+     )
+
+     filtered_bqb_has_parts = _filter_promiscuous_components(
+         bqb_has_parts_species, max_promiscuity
+     )
+
+     # drop species parts if there are many components
+     filtered_bqb_has_parts = filtered_bqb_has_parts[
+         ~filtered_bqb_has_parts[SBML_DFS.S_ID].isin(big_complex_sids)
+     ]
+
+     # combine primary identifiers and rare components
+     characteristic_species_ids = pd.concat(
+         [
+             bqb_is_species,
+             filtered_bqb_has_parts,
+         ]
+     )
+
+     return characteristic_species_ids


  def get_current_max_id(sbml_dfs_table: pd.DataFrame) -> int:
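
Two of the new public helpers above are easy to sanity-check in isolation. First, construct_formula_string: a minimal sketch, assuming napistu 0.3.7 is installed and that the SBML_DFS constants resolve to the literal column names used below (r_id, stoichiometry, r_isreversible); sc_name is an arbitrary label column chosen for the example.

    import pandas as pd
    from napistu import sbml_dfs_utils

    # one irreversible reaction, indexed by r_id as in an SBML_dfs reactions table
    reactions_df = pd.DataFrame(
        {"r_isreversible": [False]}, index=pd.Index(["R00000000"], name="r_id")
    )
    # its participants; negative stoichiometry = substrate, positive = product
    rxn_species = pd.DataFrame(
        {
            "r_id": ["R00000000"] * 3,
            "stoichiometry": [-1.0, -1.0, 1.0],
            "sc_name": ["A [cytosol]", "B [cytosol]", "C [cytosol]"],
        }
    )

    print(sbml_dfs_utils.construct_formula_string(rxn_species, reactions_df, "sc_name"))
    # expected: A [cytosol] + B [cytosol] -> C [cytosol]

Second, find_underspecified_reactions, which consumes the sbo_role column normally produced by add_sbo_role plus a caller-supplied boolean "new" column; the frame below is hand-built with placeholder SBO term strings rather than real add_sbo_role output.

    import pandas as pd
    from napistu import sbml_dfs_utils

    rxn_species = pd.DataFrame(
        {
            "r_id": ["R00000000", "R00000000", "R00000001", "R00000001"],
            "sbo_term": ["SBO:0000000"] * 4,  # placeholder terms
            "sbo_role": ["DEFINING", "DEFINING", "DEFINING", "REQUIRED"],
            "new": [True, False, True, True],  # False = species would be dropped
        }
    )

    # R00000000 loses one of its DEFINING members and is flagged;
    # R00000001 keeps its DEFINING and REQUIRED members and survives
    print(sbml_dfs_utils.find_underspecified_reactions(rxn_species))  # {'R00000000'}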
@@ -118,57 +394,26 @@ def get_current_max_id(sbml_dfs_table: pd.DataFrame) -> int:
      return current_max_id


- def adapt_pw_index(
-     source: str | indices.PWIndex,
-     species: str | Iterable[str] | None,
-     outdir: str | None = None,
- ) -> indices.PWIndex:
-     """Adapts a pw_index
-
-     Helpful to filter for species before reconstructing.
+ def id_formatter(id_values: Iterable[Any], id_type: str, id_len: int = 8) -> list[str]:
+     id_prefix = utils.extract_regex_match("^([a-zA-Z]+)_id$", id_type).upper()
+     return [id_prefix + format(x, f"0{id_len}d") for x in id_values]

-     Args:
-         source (str | PWIndex): uri for pw_index.csv file or PWIndex object
-         species (str):
-         outdir (str | None, optional): Optional directory to write pw_index to.
-             Defaults to None.

-     Returns:
-         indices.PWIndex: Filtered pw index
+ def id_formatter_inv(ids: list[str]) -> list[int]:
      """
-     if isinstance(source, str):
-         pw_index = indices.PWIndex(source)
-     elif isinstance(source, indices.PWIndex):
-         pw_index = copy.deepcopy(source)
-     else:
-         raise ValueError("'source' needs to be str or PWIndex.")
-     pw_index.filter(species=species)
-
-     if outdir is not None:
-         with open_fs(outdir, create=True) as fs:
-             with fs.open("pw_index.tsv", "w") as f:
-                 pw_index.index.to_csv(f, sep="\t")
-     return pw_index
+     ID Formatter Inverter

+     Convert from internal IDs back to integer IDs
+     """

- def _dogmatic_to_defining_bqbs(dogmatic: bool = False) -> str:
-     if dogmatic:
-         logger.info(
-             "Running in dogmatic mode - differences genes, transcripts, and proteins will "
-             "try to be maintained as separate species."
-         )
-         # preserve differences between genes, transcripts, and proteins
-         defining_biological_qualifiers = BQB_DEFINING_ATTRS
-     else:
-         logger.info(
-             "Running in non-dogmatic mode - genes, transcripts, and proteins will "
-             "be merged if possible."
-         )
-         # merge genes, transcripts, and proteins (if they are defined with
-         # bqb terms which specify their relationships).
-         defining_biological_qualifiers = BQB_DEFINING_ATTRS_LOOSE
+     id_val = list()
+     for an_id in ids:
+         if re.match("^[A-Z]+[0-9]+$", an_id):
+             id_val.append(int(re.sub("^[A-Z]+", "", an_id)))
+         else:
+             id_val.append(np.nan)  # type: ignore

-     return defining_biological_qualifiers
+     return id_val


  def match_entitydata_index_to_entity(
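
The relocated id_formatter / id_formatter_inv pair round-trips between integer offsets and the zero-padded internal IDs used throughout these tables; a quick sketch, assuming the package is installed:

    from napistu import sbml_dfs_utils

    ids = sbml_dfs_utils.id_formatter(range(3), "s_id")  # prefix comes from the "^([a-zA-Z]+)_id$" match
    print(ids)                                   # ['S00000000', 'S00000001', 'S00000002']
    print(sbml_dfs_utils.id_formatter_inv(ids))  # [0, 1, 2]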
@@ -200,7 +445,7 @@ def match_entitydata_index_to_entity(
      if len(entity_data_df.index.difference(consensus_entity_df.index)) == 0:
          logger.info(f"{data_table} ids are included in {table} ids")
      else:
-         logger.warnning(
+         logger.warning(
              f"{data_table} have ids are not matched to {table} ids,"
              f"please check mismatched ids first"
          )
@@ -229,79 +474,173 @@ def match_entitydata_index_to_entity(
      return entity_data_df


- def check_entity_data_index_matching(sbml_dfs, table):
-     """
-     Update the input smbl_dfs's entity_data (dict) index
-     with match_entitydata_index_to_entity,
-     so that index for dataframe(s) in entity_data (dict) matches the sbml_dfs'
-     corresponding entity, and then passes sbml_dfs.validate()
-     Args
-         sbml_dfs (cpr.SBML_dfs): a cpr.SBML_dfs
-         table (str): table whose data is being consolidates (currently species or reactions)
-     Returns
-         sbml_dfs (cpr.SBML_dfs):
-             sbml_dfs whose entity_data is checked to have the same index
-             as the corresponding entity.
-     """
+ def species_type_types(x):
+     """Assign a high-level molecule type to a molecular species"""

-     table_data = table + "_data"
+     if isinstance(x, identifiers.Identifiers):
+         if x.filter(["chebi"]):
+             return "metabolite"
+         elif x.filter(["molodex"]):
+             return "drug"
+         else:
+             return "protein"
+     else:
+         return "unknown"

-     entity_data_dict = getattr(sbml_dfs, table_data)
-     entity_schema = sbml_dfs.schema[table]
-     sbml_dfs_entity = getattr(sbml_dfs, table)

-     if entity_data_dict != {}:
-         entity_data_types = set.union(set(entity_data_dict.keys()))
+ def stub_compartments(
+     stubbed_compartment: str = GENERIC_COMPARTMENT,
+ ) -> pd.DataFrame:
+     """Stub Compartments

-         entity_data_dict_checked = {
-             x: match_entitydata_index_to_entity(
-                 entity_data_dict, x, sbml_dfs_entity, entity_schema, table
-             )
-             for x in entity_data_types
+     Create a compartments table with only a single compartment
+
+     Args:
+         stubbed_compartment (str): the name of a compartment which should match the
+             keys in ingestion.constants.VALID_COMPARTMENTS and ingestion.constants.COMPARTMENTS_GO_TERMS
+
+     Returns:
+         compartments_df (pd.DataFrame): compartments dataframe
+     """
+
+     if stubbed_compartment not in VALID_COMPARTMENTS:
+         raise ValueError(
+             f"{stubbed_compartment} is not defined in ingestion.constants.VALID_COMPARTMENTS"
+         )
+
+     if stubbed_compartment not in COMPARTMENTS_GO_TERMS.keys():
+         raise ValueError(
+             f"{stubbed_compartment} is not defined in ingestion.constants.COMPARTMENTS_GO_TERMS"
+         )
+
+     stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]
+
+     formatted_uri = identifiers.format_uri(
+         uri=identifiers.create_uri_url(
+             ontology=ONTOLOGIES.GO,
+             identifier=stubbed_compartment_id,
+         ),
+         biological_qualifier_type=BQB.IS,
+     )
+
+     compartments_df = pd.DataFrame(
+         {
+             SBML_DFS.C_NAME: [stubbed_compartment],
+             SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
          }
+     )
+     compartments_df.index = id_formatter([0], SBML_DFS.C_ID)  # type: ignore
+     compartments_df.index.name = SBML_DFS.C_ID

-         if table == SBML_DFS.REACTIONS:
-             sbml_dfs.reactions_data = entity_data_dict_checked
-         elif table == SBML_DFS.SPECIES:
-             sbml_dfs.species_data = entity_data_dict_checked
+     return compartments_df

-     return sbml_dfs

+ def unnest_identifiers(id_table: pd.DataFrame, id_var: str) -> pd.DataFrame:
+     """
+     Unnest Identifiers
+
+     Take a pd.DataFrame containing an array of Identifiers and
+     return one row per identifier.

- def get_characteristic_species_ids(
-     sbml_dfs: sbml_dfs_core.SBML_dfs, dogmatic: bool = True
- ) -> pd.DataFrame:
+     Parameters:
+     id_table: pd.DataFrame
+         a table containing an array of Identifiers
+     id_var: str
+         variable containing Identifiers
+
+     Returns:
+         pd.Dataframe containing the index of id_table but expanded
+         to include one row per identifier
+
+     """
+
+     # validate inputs
+     utils.match_pd_vars(id_table, {id_var}).assert_present()
+
+     N_invalid_ids = sum(id_table[id_var].isna())
+     if N_invalid_ids != 0:
+
+         print("Rows with missing identifiers:")
+         print(id_table.loc[id_table[id_var].isna(), id_var])
+
+         raise ValueError(
+             f'{N_invalid_ids} entries in "id_table" were missing',
+             "entries with no identifiers should still include an Identifiers object",
+         )
+
+     # Get the identifier as a list of dicts
+     df = id_table[id_var].apply(lambda x: x.ids if len(x.ids) > 0 else 0).to_frame()
+     # Filter out zero length lists
+     df = df.query(f"{id_var} != 0")
+     # Unnest the list of dicts into one dict per row
+     df = df.explode(id_var)
+     # Unnest the dict into a dataframe
+     df = pd.DataFrame(df[id_var].values.tolist(), index=df.index)
+     # Add the entry number as an index
+     df["entry"] = df.groupby(df.index).cumcount()
+     df.set_index("entry", append=True, inplace=True)
+     return df
+
+
+ def validate_sbml_dfs_table(table_data: pd.DataFrame, table_name: str) -> None:
      """
-     Get Characteristic Species IDs
+     Validate a standalone table against the SBML_dfs schema.

-     List the systematic identifiers which are characteristic of molecular species, e.g., excluding subcomponents, and optionally, treating proteins, transcripts, and genes equiavlently.
+     This function validates a table against the schema defined in SBML_DFS_SCHEMA,
+     without requiring an SBML_dfs object. Useful for validating tables before
+     creating an SBML_dfs object.

      Parameters
      ----------
-     sbml_dfs : sbml_dfs_core.SBML_dfs
-         The SBML_dfs object.
-     dogmatic : bool, default=True
-         Whether to use the dogmatic flag to determine which BQB attributes are valid.
+     table_data : pd.DataFrame
+         The table to validate
+     table_name : str
+         Name of the table in the SBML_dfs schema
+
+     Raises
+     ------
+     ValueError
+         If table_name is not in schema or validation fails
+     """
+     if table_name not in SBML_DFS_SCHEMA.SCHEMA:
+         raise ValueError(
+             f"{table_name} is not a valid table name in SBML_DFS_SCHEMA. "
+             f"Valid tables are: {', '.join(SBML_DFS_SCHEMA.SCHEMA.keys())}"
+         )

-     Returns
-     -------
-     pd.DataFrame
-         A DataFrame containing the systematic identifiers which are characteristic of molecular species.
+     table_schema = SBML_DFS_SCHEMA.SCHEMA[table_name]
+     _perform_sbml_dfs_table_validation(table_data, table_schema, table_name)
+
+
+ # =============================================================================
+ # PRIVATE FUNCTIONS (ALPHABETICAL ORDER)
+ # =============================================================================
+
+
+ def _add_stoi_to_species_name(stoi: float | int, name: str) -> str:
      """
+     Add Stoi To Species Name

-     # select valid BQB attributes based on dogmatic flag
-     defining_biological_qualifiers = _dogmatic_to_defining_bqbs(dogmatic)
+     Add # of molecules to a species name
+
+     Parameters:
+     ----------
+     stoi: float or int
+         Number of molecules
+     name: str
+         Name of species

-     # pre-summarize ontologies
-     species_identifiers = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
+     Returns:
+     ----------
+     name: str
+         Name containing number of species

-     # drop some BQB_HAS_PART annotations
-     species_identifiers = sbml_dfs_core.filter_to_characteristic_species_ids(
-         species_identifiers,
-         defining_biological_qualifiers=defining_biological_qualifiers,
-     )
+     """

-     return species_identifiers
+     if stoi in [-1, 0, 1]:
+         return name
+     else:
+         return str(abs(stoi)) + " " + name


  def _dogmatic_to_defining_bqbs(dogmatic: bool = False) -> str:
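
The new validate_sbml_dfs_table fails fast on malformed input. A minimal sketch of the error path; it assumes "compartments" is one of the SBML_DFS_SCHEMA table names (if that guess is wrong, the unknown-table branch raises ValueError instead, so the except clause still fires):

    import pandas as pd
    from napistu import sbml_dfs_utils

    bad_table = pd.DataFrame({"c_name": ["cytosol"]})  # default RangeIndex, not the expected pk

    try:
        sbml_dfs_utils.validate_sbml_dfs_table(bad_table, "compartments")
    except ValueError as e:
        print(e)  # an unknown-table or wrong-index-name complaint, both ValueError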
@@ -325,8 +664,458 @@ def _dogmatic_to_defining_bqbs(dogmatic: bool = False) -> str:
      return defining_biological_qualifiers


- def _stub_ids(ids):
-     """Stub with a blank ID if an ids list is blank; otherwise create an Identifiers object from the provided ids"""
+ def _edgelist_create_compartmentalized_species(
+     interaction_edgelist, species_df, compartments_df, interaction_source
+ ):
+     """
+     Create compartmentalized species from interactions.
+
+     Parameters
+     ----------
+     interaction_edgelist : pd.DataFrame
+         Interaction data containing species-compartment combinations
+     species_df : pd.DataFrame
+         Processed species data with IDs
+     compartments_df : pd.DataFrame
+         Processed compartments data with IDs
+     interaction_source : source.Source
+         Source object to assign to compartmentalized species
+
+     Returns
+     -------
+     pd.DataFrame
+         Compartmentalized species with formatted names and IDs
+     """
+     # Get all distinct upstream and downstream compartmentalized species
+     comp_species = pd.concat(
+         [
+             interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
+                 {
+                     "upstream_name": SBML_DFS.S_NAME,
+                     "upstream_compartment": SBML_DFS.C_NAME,
+                 },
+                 axis=1,
+             ),
+             interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
+                 {
+                     "downstream_name": SBML_DFS.S_NAME,
+                     "downstream_compartment": SBML_DFS.C_NAME,
+                 },
+                 axis=1,
+             ),
+         ]
+     ).drop_duplicates()
+
+     # Add species and compartment IDs
+     comp_species_w_ids = comp_species.merge(
+         species_df[SBML_DFS.S_NAME].reset_index(), how="left", on=SBML_DFS.S_NAME
+     ).merge(
+         compartments_df[SBML_DFS.C_NAME].reset_index(), how="left", on=SBML_DFS.C_NAME
+     )
+
+     # Validate merge was successful
+     _sbml_dfs_from_edgelist_check_cspecies_merge(comp_species_w_ids, comp_species)
+
+     # Format compartmentalized species with names, source, and IDs
+     comp_species_w_ids[SBML_DFS.SC_NAME] = [
+         f"{s} [{c}]"
+         for s, c in zip(
+             comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
+         )
+     ]
+     comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
+     comp_species_w_ids[SBML_DFS.SC_ID] = id_formatter(
+         range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
+     )
+
+     return comp_species_w_ids.set_index(SBML_DFS.SC_ID)[
+         [SBML_DFS.SC_NAME, SBML_DFS.S_ID, SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
+     ]
+
+
+ def _edgelist_create_reactions_and_species(
+     interaction_edgelist,
+     comp_species,
+     species_df,
+     compartments_df,
+     interaction_source,
+     upstream_stoichiometry,
+     downstream_stoichiometry,
+     downstream_sbo_name,
+     extra_reactions_columns,
+ ):
+     """
+     Create reactions and reaction species from interactions.
+
+     Parameters
+     ----------
+     interaction_edgelist : pd.DataFrame
+         Original interaction data
+     comp_species : pd.DataFrame
+         Compartmentalized species with IDs
+     species_df : pd.DataFrame
+         Processed species data with IDs
+     compartments_df : pd.DataFrame
+         Processed compartments data with IDs
+     interaction_source : source.Source
+         Source object for reactions
+     upstream_stoichiometry : int
+         Stoichiometry for upstream species
+     downstream_stoichiometry : int
+         Stoichiometry for downstream species
+     downstream_sbo_name : str
+         SBO term name for downstream species
+     extra_reactions_columns : list
+         Names of extra columns to preserve
+
+     Returns
+     -------
+     tuple
+         (reactions_df, reaction_species_df, reactions_data)
+     """
+     # Add compartmentalized species IDs to interactions
+     comp_species_w_names = (
+         comp_species.reset_index()
+         .merge(species_df[SBML_DFS.S_NAME].reset_index())
+         .merge(compartments_df[SBML_DFS.C_NAME].reset_index())
+     )
+
+     interaction_w_cspecies = interaction_edgelist.merge(
+         comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
+             {
+                 SBML_DFS.SC_ID: "sc_id_up",
+                 SBML_DFS.S_NAME: "upstream_name",
+                 SBML_DFS.C_NAME: "upstream_compartment",
+             },
+             axis=1,
+         ),
+         how="left",
+     ).merge(
+         comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
+             {
+                 SBML_DFS.SC_ID: "sc_id_down",
+                 SBML_DFS.S_NAME: "downstream_name",
+                 SBML_DFS.C_NAME: "downstream_compartment",
+             },
+             axis=1,
+         ),
+         how="left",
+     )[
+         REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
+     ]
+
+     # Validate merge didn't create duplicates
+     if interaction_edgelist.shape[0] != interaction_w_cspecies.shape[0]:
+         raise ValueError(
+             f"Merging compartmentalized species resulted in row count change "
+             f"from {interaction_edgelist.shape[0]} to {interaction_w_cspecies.shape[0]}"
+         )
+
+     # Create reaction IDs FIRST - before using them
+     interaction_w_cspecies[SBML_DFS.R_ID] = id_formatter(
+         range(interaction_w_cspecies.shape[0]), SBML_DFS.R_ID
+     )
+
+     # Create reactions DataFrame
+     interactions_copy = interaction_w_cspecies.copy()
+     interactions_copy[SBML_DFS.R_SOURCE] = interaction_source
+
+     reactions_columns = [
+         SBML_DFS.R_NAME,
+         SBML_DFS.R_IDENTIFIERS,
+         SBML_DFS.R_SOURCE,
+         SBML_DFS.R_ISREVERSIBLE,
+     ]
+
+     reactions_df = interactions_copy.set_index(SBML_DFS.R_ID)[
+         reactions_columns + extra_reactions_columns
+     ]
+
+     # Separate extra data
+     reactions_data = reactions_df[extra_reactions_columns]
+     reactions_df = reactions_df[reactions_columns]
+
+     # Create reaction species relationships - NOW r_id exists
+     reaction_species_df = pd.concat(
+         [
+             # Upstream species (modifiers/stimulators/inhibitors)
+             interaction_w_cspecies[["sc_id_up", "sbo_term", SBML_DFS.R_ID]]
+             .assign(stoichiometry=upstream_stoichiometry)
+             .rename({"sc_id_up": "sc_id"}, axis=1),
+             # Downstream species (products)
+             interaction_w_cspecies[["sc_id_down", SBML_DFS.R_ID]]
+             .assign(
+                 stoichiometry=downstream_stoichiometry,
+                 sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
+             )
+             .rename({"sc_id_down": "sc_id"}, axis=1),
+         ]
+     )
+
+     reaction_species_df["rsc_id"] = id_formatter(
+         range(reaction_species_df.shape[0]), "rsc_id"
+     )
+
+     reaction_species_df = reaction_species_df.set_index("rsc_id")
+
+     return reactions_df, reaction_species_df, reactions_data
+
+
+ def _edgelist_identify_extra_columns(
+     interaction_edgelist, species_df, keep_reactions_data, keep_species_data
+ ):
+     """
+     Identify extra columns in input data that should be preserved.
+
+     Parameters
+     ----------
+     interaction_edgelist : pd.DataFrame
+         Interaction data containing potential extra columns
+     species_df : pd.DataFrame
+         Species data containing potential extra columns
+     keep_reactions_data : bool or str
+         Whether to keep extra reaction columns
+     keep_species_data : bool or str
+         Whether to keep extra species columns
+
+     Returns
+     -------
+     dict
+         Dictionary with 'reactions' and 'species' keys containing lists of extra column names
+     """
+     extra_reactions_columns = []
+     extra_species_columns = []
+
+     if keep_reactions_data is not False:
+         extra_reactions_columns = [
+             c
+             for c in interaction_edgelist.columns
+             if c not in INTERACTION_EDGELIST_EXPECTED_VARS
+         ]
+
+     if keep_species_data is not False:
+         extra_species_columns = [
+             c
+             for c in species_df.columns
+             if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
+         ]
+
+     return {"reactions": extra_reactions_columns, "species": extra_species_columns}
+
+
+ def _edgelist_process_compartments(compartments_df, interaction_source):
+     """
+     Format compartments DataFrame with source and ID columns.
+
+     Parameters
+     ----------
+     compartments_df : pd.DataFrame
+         Raw compartments data
+     interaction_source : source.Source
+         Source object to assign to compartments
+
+     Returns
+     -------
+     pd.DataFrame
+         Processed compartments with IDs, indexed by compartment ID
+     """
+     compartments = compartments_df.copy()
+     compartments[SBML_DFS.C_SOURCE] = interaction_source
+     compartments[SBML_DFS.C_ID] = id_formatter(
+         range(compartments.shape[0]), SBML_DFS.C_ID
+     )
+     return compartments.set_index(SBML_DFS.C_ID)[
+         [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
+     ]
+
+
+ def _edgelist_process_species(species_df, interaction_source, extra_species_columns):
+     """
+     Format species DataFrame and extract extra data.
+
+     Parameters
+     ----------
+     species_df : pd.DataFrame
+         Raw species data
+     interaction_source : source.Source
+         Source object to assign to species
+     extra_species_columns : list
+         Names of extra columns to preserve separately
+
+     Returns
+     -------
+     tuple of pd.DataFrame
+         Processed species DataFrame and species extra data DataFrame
+     """
+     species = species_df.copy()
+     species[SBML_DFS.S_SOURCE] = interaction_source
+     species[SBML_DFS.S_ID] = id_formatter(range(species.shape[0]), SBML_DFS.S_ID)
+
+     required_cols = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]
+     species_indexed = species.set_index(SBML_DFS.S_ID)[
+         required_cols + extra_species_columns
+     ]
+
+     # Separate extra data from main species table
+     species_data = species_indexed[extra_species_columns]
+     processed_species = species_indexed[required_cols]
+
+     return processed_species, species_data
+
+
+ def _edgelist_validate_inputs(
+     interaction_edgelist: pd.DataFrame,
+     species_df: pd.DataFrame,
+     compartments_df: pd.DataFrame,
+ ) -> None:
+     """
+     Validate input DataFrames have required columns.
+
+     Parameters
+     ----------
+     interaction_edgelist : pd.DataFrame
+         Interaction data to validate
+     species_df : pd.DataFrame
+         Species data to validate
+     compartments_df : pd.DataFrame
+         Compartments data to validate
+     """
+
+     # check compartments
+     compartments_df_expected_vars = {SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS}
+     compartments_df_columns = set(compartments_df.columns.tolist())
+     missing_required_fields = compartments_df_expected_vars.difference(
+         compartments_df_columns
+     )
+     if len(missing_required_fields) > 0:
+         raise ValueError(
+             f"{', '.join(missing_required_fields)} are required variables"
+             ' in "compartments_df" but were not present in the input file.'
+         )
+
+     # check species
+     species_df_expected_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
+     species_df_columns = set(species_df.columns.tolist())
+     missing_required_fields = species_df_expected_vars.difference(species_df_columns)
+     if len(missing_required_fields) > 0:
+         raise ValueError(
+             f"{', '.join(missing_required_fields)} are required"
+             ' variables in "species_df" but were not present '
+             "in the input file."
+         )
+
+     # check interactions
+     interaction_edgelist_columns = set(interaction_edgelist.columns.tolist())
+     missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
+         interaction_edgelist_columns
+     )
+     if len(missing_required_fields) > 0:
+         raise ValueError(
+             f"{', '.join(missing_required_fields)} are required "
+             'variables in "interaction_edgelist" but were not '
+             "present in the input file."
+         )
+
+     return None
+
+
+ def _filter_promiscuous_components(
+     bqb_has_parts_species: pd.DataFrame, max_promiscuity: int
+ ) -> pd.DataFrame:
+
+     # number of complexes a species is part of
+     n_complexes_involvedin = bqb_has_parts_species.value_counts(
+         [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+     )
+     promiscuous_component_identifiers_index = n_complexes_involvedin[
+         n_complexes_involvedin > max_promiscuity
+     ].index
+     promiscuous_component_identifiers = pd.Series(
+         data=[True] * len(promiscuous_component_identifiers_index),
+         index=promiscuous_component_identifiers_index,
+         name="is_shared_component",
+         dtype=bool,
+     )
+
+     if len(promiscuous_component_identifiers) == 0:
+         return bqb_has_parts_species
+
+     filtered_bqb_has_parts = bqb_has_parts_species.merge(
+         promiscuous_component_identifiers,
+         left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER],
+         right_index=True,
+         how="left",
+     )
+
+     filtered_bqb_has_parts["is_shared_component"] = (
+         filtered_bqb_has_parts["is_shared_component"].astype("boolean").fillna(False)
+     )
+     # drop identifiers shared as components across many species
+     filtered_bqb_has_parts = filtered_bqb_has_parts[
+         ~filtered_bqb_has_parts["is_shared_component"]
+     ].drop(["is_shared_component"], axis=1)
+
+     return filtered_bqb_has_parts
+
+
+ def _find_underspecified_reactions(
+     reaction_species_w_roles: pd.DataFrame,
+ ) -> set[str]:
+
+     # check that both sbo_role and "new" are present
+     if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
+         raise ValueError(
+             "The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
+         )
+     if "new" not in reaction_species_w_roles.columns:
+         raise ValueError(
+             "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
+         )
+     # check that new is a boolean column
+     if reaction_species_w_roles["new"].dtype != bool:
+         raise ValueError(
+             "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
+         )
+
+     reactions_with_lost_defining_members = set(
+         reaction_species_w_roles.query("~new")
+         .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
+         .tolist()
+     )
+
+     N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
+     if N_reactions_with_lost_defining_members > 0:
+         logger.info(
+             f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
+         )
+
+     # find the cases where all "new" values for a given (r_id, sbo_term) are False
+     reactions_with_lost_requirements = set(
+         reaction_species_w_roles
+         # drop already filtered reactions
+         .query("r_id not in @reactions_with_lost_defining_members")
+         .query("sbo_role == 'REQUIRED'")
+         # entries with a required attribute where all values for that attribute are False
+         .groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
+         .agg({"new": "any"})
+         .query("new == False")
+         .index.get_level_values(SBML_DFS.R_ID)
+     )
+
+     N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
+     if N_reactions_with_lost_requirements > 0:
+         logger.info(
+             f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
+         )
+
+     underspecified_reactions = reactions_with_lost_defining_members.union(
+         reactions_with_lost_requirements
+     )
+
+     return underspecified_reactions
+
+
+ def _id_dict_to_df(ids):
      if len(ids) == 0:
          return pd.DataFrame(
              {
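
The promiscuity filter behind filter_to_characteristic_species_ids can be exercised directly. A sketch assuming the IDENTIFIERS and SBML_DFS constants resolve to the literal "ontology", "identifier", and "s_id" column names:

    import pandas as pd
    from napistu.sbml_dfs_utils import _filter_promiscuous_components

    # P1 appears as a BQB_HAS_PART component of two species, P2 of one
    bqb_has_parts = pd.DataFrame(
        {
            "ontology": ["uniprot"] * 3,
            "identifier": ["P1", "P1", "P2"],
            "s_id": ["S00000000", "S00000001", "S00000002"],
        }
    )

    # with max_promiscuity=1, P1 is dropped as a shared component; only the P2 row survives
    print(_filter_promiscuous_components(bqb_has_parts, max_promiscuity=1))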
@@ -338,3 +1127,157 @@ def _stub_ids(ids):
          )
      else:
          return pd.DataFrame(ids)
+
+
+ def _perform_sbml_dfs_table_validation(
+     table_data: pd.DataFrame,
+     table_schema: dict,
+     table_name: str,
+ ) -> None:
+     """
+     Core validation logic for SBML_dfs tables.
+
+     This function performs the actual validation checks for any table against its schema,
+     regardless of whether it's part of an SBML_dfs object or standalone.
+
+     Parameters
+     ----------
+     table_data : pd.DataFrame
+         The table data to validate
+     table_schema : dict
+         Schema definition for the table
+     table_name : str
+         Name of the table (for error messages)
+
+     Raises
+     ------
+     ValueError
+         If the table does not conform to its schema:
+         - Not a DataFrame
+         - Wrong index name
+         - Duplicate primary keys
+         - Missing required variables
+         - Empty table
+     """
+     if not isinstance(table_data, pd.DataFrame):
+         raise ValueError(
+             f"{table_name} must be a pd.DataFrame, but was a {type(table_data)}"
+         )
+
+     # check index
+     expected_index_name = table_schema["pk"]
+     if table_data.index.name != expected_index_name:
+         raise ValueError(
+             f"the index name for {table_name} was not the pk: {expected_index_name}"
+         )
+
+     # check that all entries in the index are unique
+     if len(set(table_data.index.tolist())) != table_data.shape[0]:
+         duplicated_pks = table_data.index.value_counts()
+         duplicated_pks = duplicated_pks[duplicated_pks > 1]
+
+         example_duplicates = duplicated_pks.index[0 : min(duplicated_pks.shape[0], 5)]
+         raise ValueError(
+             f"{duplicated_pks.shape[0]} primary keys were duplicated "
+             f"including {', '.join(example_duplicates)}"
+         )
+
+     # check variables
+     expected_vars = set(table_schema["vars"])
+     table_vars = set(list(table_data.columns))
+
+     extra_vars = table_vars.difference(expected_vars)
+     if len(extra_vars) != 0:
+         logger.debug(
+             f"{len(extra_vars)} extra variables were found for {table_name}: "
+             f"{', '.join(extra_vars)}"
+         )
+
+     missing_vars = expected_vars.difference(table_vars)
+     if len(missing_vars) != 0:
+         raise ValueError(
+             f"Missing {len(missing_vars)} required variables for {table_name}: "
+             f"{', '.join(missing_vars)}"
+         )
+
+     # check for empty table
+     if table_data.shape[0] == 0:
+         raise ValueError(f"{table_name} contained no entries")
+
+
+ def _sbml_dfs_from_edgelist_check_cspecies_merge(
+     merged_species: pd.DataFrame, original_species: pd.DataFrame
+ ) -> None:
+     """Check for a mismatch between the provided species data and species implied by the edgelist."""
+
+     # check for 1-many merge
+     if merged_species.shape[0] != original_species.shape[0]:
+         raise ValueError(
+             "Merging compartmentalized species to species_df"
+             " and compartments_df by names resulted in an "
+             f"increase in the tables from {original_species.shape[0]}"
+             f" to {merged_species.shape[0]} indicating that names were"
+             " not unique"
+         )
+
+     # check for missing species and compartments
+     missing_compartments = merged_species[merged_species[SBML_DFS.C_ID].isna()][
+         SBML_DFS.C_NAME
+     ].unique()
+     if len(missing_compartments) >= 1:
+         raise ValueError(
+             f"{len(missing_compartments)} compartments were present in"
+             ' "interaction_edgelist" but not "compartments_df":'
+             f" {', '.join(missing_compartments)}"
+         )
+
+     missing_species = merged_species[merged_species[SBML_DFS.S_ID].isna()][
+         SBML_DFS.S_NAME
+     ].unique()
+     if len(missing_species) >= 1:
+         raise ValueError(
+             f"{len(missing_species)} species were present in "
+             '"interaction_edgelist" but not "species_df":'
+             f" {', '.join(missing_species)}"
+         )
+
+     return None
+
+
+ def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
+     """Validates a table against a reference
+
+     This checks if the table has the same index, no duplicates in the index
+     and that all values in the index are in the reference table.
+
+     Args:
+         data_table (pd.DataFrame): a table with data that should
+             match the reference
+         ref_table (pd.DataFrame): a reference table
+
+     Raises:
+         TypeError: data_table is not a pd.DataFrame
+         ValueError: not same index name
+         ValueError: index contains duplicates
+         ValueError: index not subset of index of reactions table
+     """
+     if not isinstance(data_table, pd.DataFrame):
+         raise TypeError(
+             f"The data table was type {type(data_table).__name__}"
+             " but must be a pd.DataFrame"
+         )
+     ref_index_name = ref_table.index.name
+     if data_table.index.name != ref_index_name:
+         raise ValueError(
+             "the index name for reaction data table was not"
+             f" {ref_index_name}: {data_table.index.name}"
+         )
+     ids = data_table.index
+     if any(ids.duplicated()):
+         raise ValueError(
+             "the index for reaction data table contained duplicate values"
+         )
+     if not all(ids.isin(ref_table.index)):
+         raise ValueError(
+             "the index for reaction data table contained values"
+             " not found in the reactions table"
+         )
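
Finally, _validate_matching_data enforces the contract between an entity_data table and its parent entity table. A minimal passing example over hand-built frames, assuming the code above:

    import pandas as pd
    from napistu.sbml_dfs_utils import _validate_matching_data

    reactions = pd.DataFrame(
        {"r_name": ["rxn A", "rxn B"]},
        index=pd.Index(["R00000000", "R00000001"], name="r_id"),
    )
    reactions_data = pd.DataFrame(
        {"score": [0.9]}, index=pd.Index(["R00000000"], name="r_id")
    )

    # same index name, unique ids, and a subset of the reference index: no exception
    _validate_matching_data(reactions_data, reactions)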