napistu 0.1.0__py3-none-any.whl → 0.2.4.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. napistu/__init__.py +1 -1
  2. napistu/consensus.py +1010 -513
  3. napistu/constants.py +24 -0
  4. napistu/gcs/constants.py +2 -2
  5. napistu/gcs/downloads.py +57 -25
  6. napistu/gcs/utils.py +21 -0
  7. napistu/identifiers.py +105 -6
  8. napistu/ingestion/constants.py +0 -1
  9. napistu/ingestion/obo.py +24 -8
  10. napistu/ingestion/psi_mi.py +20 -5
  11. napistu/ingestion/reactome.py +8 -32
  12. napistu/mcp/__init__.py +69 -0
  13. napistu/mcp/__main__.py +180 -0
  14. napistu/mcp/codebase.py +182 -0
  15. napistu/mcp/codebase_utils.py +298 -0
  16. napistu/mcp/constants.py +72 -0
  17. napistu/mcp/documentation.py +166 -0
  18. napistu/mcp/documentation_utils.py +235 -0
  19. napistu/mcp/execution.py +382 -0
  20. napistu/mcp/profiles.py +73 -0
  21. napistu/mcp/server.py +86 -0
  22. napistu/mcp/tutorials.py +124 -0
  23. napistu/mcp/tutorials_utils.py +230 -0
  24. napistu/mcp/utils.py +47 -0
  25. napistu/mechanism_matching.py +782 -26
  26. napistu/modify/constants.py +41 -0
  27. napistu/modify/curation.py +4 -1
  28. napistu/modify/gaps.py +243 -156
  29. napistu/modify/pathwayannot.py +26 -8
  30. napistu/network/neighborhoods.py +16 -7
  31. napistu/network/net_create.py +209 -54
  32. napistu/network/net_propagation.py +118 -0
  33. napistu/network/net_utils.py +1 -32
  34. napistu/rpy2/netcontextr.py +10 -7
  35. napistu/rpy2/rids.py +7 -5
  36. napistu/sbml_dfs_core.py +46 -29
  37. napistu/sbml_dfs_utils.py +37 -1
  38. napistu/source.py +8 -2
  39. napistu/utils.py +67 -8
  40. napistu-0.2.4.dev2.dist-info/METADATA +84 -0
  41. napistu-0.2.4.dev2.dist-info/RECORD +95 -0
  42. {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/WHEEL +1 -1
  43. tests/conftest.py +11 -5
  44. tests/test_consensus.py +4 -1
  45. tests/test_gaps.py +127 -0
  46. tests/test_gcs.py +3 -2
  47. tests/test_igraph.py +14 -0
  48. tests/test_mcp_documentation_utils.py +13 -0
  49. tests/test_mechanism_matching.py +658 -0
  50. tests/test_net_propagation.py +89 -0
  51. tests/test_net_utils.py +83 -0
  52. tests/test_sbml.py +2 -0
  53. tests/{test_sbml_dfs_create.py → test_sbml_dfs_core.py} +68 -4
  54. tests/test_utils.py +81 -0
  55. napistu-0.1.0.dist-info/METADATA +0 -56
  56. napistu-0.1.0.dist-info/RECORD +0 -77
  57. {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/entry_points.txt +0 -0
  58. {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/licenses/LICENSE +0 -0
  59. {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/top_level.txt +0 -0
napistu/consensus.py CHANGED
@@ -31,111 +31,43 @@ def construct_consensus_model(
     dogmatic: bool = True,
 ) -> sbml_dfs_core.SBML_dfs:
     """
-    Construct Consensus Model
+    Construct a Consensus Model by merging shared entities across pathway models.
 
-    Turn a dictionary of pathway models into a single consensus model by merging shared entities.
+    This function takes a dictionary of pathway models and merges shared entities (compartments, species, reactions, etc.)
+    into a single consensus model, using a set of rules for entity identity and merging.
 
-    Parameters:
-    ----------
-    sbml_dfs_dict: dict{cpr.SBML_dfs}
-        A dictionary of SBML_dfs from different models
-    pw_index: indices.PWIndex
-        An index of all tables being aggregated
-    dogmatic: bool
-        If True then try to preserve genes, transcript, and proteins as separate species. If False
-        then try to merge them.
-
-    Returns:
+    Parameters
     ----------
-    A cpr.SBML_dfs object containing the consensus model
-
+    sbml_dfs_dict : dict[str, sbml_dfs_core.SBML_dfs]
+        A dictionary of SBML_dfs objects from different models, keyed by model name.
+    pw_index : indices.PWIndex
+        An index of all tables being aggregated, used for cross-referencing entities.
+    dogmatic : bool, default=True
+        If True, preserve genes, transcripts, and proteins as separate species. If False, merge them when possible.
+
+    Returns
+    -------
+    sbml_dfs_core.SBML_dfs
+        A consensus SBML_dfs object containing the merged model.
     """
-
+    # Validate inputs
     logger.info("Reporting possible issues in component models")
     _check_sbml_dfs_dict(sbml_dfs_dict)
     assert isinstance(pw_index, indices.PWIndex)
-    # select valid BQB attributes based on dogmatic flag
-    defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(dogmatic)
-
-    logger.info("Defining compartments based on unique ids")
-    comp_consensus_entities, comp_lookup_table = construct_meta_entities_identifiers(
-        sbml_dfs_dict=sbml_dfs_dict, pw_index=pw_index, table="compartments"
-    )
-
-    logger.info("Defining species based on unique ids")
-    spec_consensus_entities, spec_lookup_table = construct_meta_entities_identifiers(
-        sbml_dfs_dict=sbml_dfs_dict,
-        pw_index=pw_index,
-        table=SBML_DFS.SPECIES,
-        defining_biological_qualifiers=defining_biological_qualifiers,
-    )
-
-    logger.info(
-        "Defining compartmentalized species based on unique species x compartments"
-    )
-    compspec_consensus_instances, compspec_lookup_table = construct_meta_entities_fk(
-        sbml_dfs_dict,
-        pw_index,
-        table=SBML_DFS.COMPARTMENTALIZED_SPECIES,
-        fk_lookup_tables={
-            SBML_DFS.C_ID: comp_lookup_table,
-            SBML_DFS.S_ID: spec_lookup_table,
-        },
-    )
-
-    logger.info(
-        "Define reactions based on membership of identical compartmentalized species"
-    )
-    rxn_consensus_species, rxn_lookup_table = construct_meta_entities_members(
-        sbml_dfs_dict,
-        pw_index,
-        table=SBML_DFS.REACTIONS,
-        defined_by=SBML_DFS.REACTION_SPECIES,
-        defined_lookup_tables={SBML_DFS.SC_ID: compspec_lookup_table},
-        defining_attrs=[SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY],
-    )
 
-    logger.info("Annotating reversibility based on merged reactions")
-    rxn_consensus_species = _resolve_reversibility(
-        sbml_dfs_dict, rxn_consensus_species, rxn_lookup_table
-    )
+    # Select valid BQB attributes based on dogmatic flag
+    defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(dogmatic)
 
-    # define reaction species with species
-    logger.info("Define reaction species based on reactions")
-    rxnspec_consensus_instances, rxnspec_lookup_table = construct_meta_entities_fk(
-        sbml_dfs_dict,
-        pw_index,
-        table=SBML_DFS.REACTION_SPECIES,
-        fk_lookup_tables={
-            SBML_DFS.R_ID: rxn_lookup_table,
-            SBML_DFS.SC_ID: compspec_lookup_table,
-        },
-        # retain species with different roles
-        extra_defining_attrs=[SBML_DFS.SBO_TERM],
+    # Step 1: Create consensus entities for all primary tables
+    consensus_entities, lookup_tables = _create_consensus_entities(
+        sbml_dfs_dict, pw_index, defining_biological_qualifiers
     )
 
-    sbml_tbl_dict = {
-        SBML_DFS.COMPARTMENTS: comp_consensus_entities,
-        SBML_DFS.SPECIES: spec_consensus_entities,
-        SBML_DFS.COMPARTMENTALIZED_SPECIES: compspec_consensus_instances,
-        SBML_DFS.REACTIONS: rxn_consensus_species,
-        SBML_DFS.REACTION_SPECIES: rxnspec_consensus_instances,
-    }
-
-    sbml_dfs = sbml_dfs_core.SBML_dfs(sbml_tbl_dict)  # type: ignore
+    # Step 2: Create the consensus SBML_dfs object
+    sbml_dfs = sbml_dfs_core.SBML_dfs(consensus_entities)  # type: ignore
 
-    # add species and reactions data from component models
-    consensus_species_data = merge_entity_data(
-        sbml_dfs_dict, lookup_table=spec_lookup_table, table=SBML_DFS.SPECIES
-    )
-    for k in consensus_species_data.keys():
-        sbml_dfs.add_species_data(k, consensus_species_data[k])
-
-    consensus_reactions_data = merge_entity_data(
-        sbml_dfs_dict, lookup_table=rxn_lookup_table, table=SBML_DFS.REACTIONS
-    )
-    for k in consensus_reactions_data.keys():
-        sbml_dfs.add_reactions_data(k, consensus_reactions_data[k])
+    # Step 3: Add entity data from component models
+    sbml_dfs = _add_entity_data(sbml_dfs, sbml_dfs_dict, lookup_tables)
 
     return sbml_dfs
 
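For orientation, the refactored pipeline can be driven end to end as in the minimal sketch below. The index file path and the `pw_index.index` attribute are illustrative assumptions for this example, not APIs confirmed by this diff:

    from napistu import consensus, indices

    # hypothetical pathway index file listing the source models
    pw_index = indices.PWIndex("data/pw_index.tsv")

    # parse every model referenced by the index into an SBML_dfs object
    sbml_dfs_dict = consensus.construct_sbml_dfs_dict(pw_index.index)

    # merge shared compartments, species, and reactions into one model
    consensus_model = consensus.construct_consensus_model(
        sbml_dfs_dict, pw_index, dogmatic=True
    )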
@@ -144,18 +76,22 @@ def construct_sbml_dfs_dict(
     pw_index: pd.DataFrame, strict: bool = True
 ) -> dict[str, sbml_dfs_core.SBML_dfs]:
     """
-    Construct SBML DFs Dict
-
-    Convert all models in the pathway index into SBML_dfs and add them to a dict.
-
-    Parameters:
-    pw_index: indices.PWIndex
-        An index of all tables being aggregated
-    strict (bool): if set to `false` errorenous files are skipped with warning. Default: True
+    Construct a dictionary of SBML_dfs objects from a pathway index.
 
-    Returns:
-    dict(sbml_dfs_core.SBML_dfs)
+    This function converts all models in the pathway index into SBML_dfs objects and adds them to a dictionary.
+    Optionally, it can skip erroneous files with a warning instead of raising an error.
 
+    Parameters
+    ----------
+    pw_index : pd.DataFrame
+        An index of all tables being aggregated, containing model metadata and file paths.
+    strict : bool, default=True
+        If True, raise an error on any file that cannot be loaded. If False, skip erroneous files with a warning.
+
+    Returns
+    -------
+    dict[str, sbml_dfs_core.SBML_dfs]
+        A dictionary mapping model names to SBML_dfs objects.
     """
 
     sbml_dfs_dict = dict()
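Per the new docstring, the only behavioral knob here is `strict`; a minimal sketch of the non-strict mode, reusing the objects from the previous example:

    # with strict=False, files that fail to parse are skipped with a logged
    # warning rather than aborting the whole aggregation
    sbml_dfs_dict = consensus.construct_sbml_dfs_dict(pw_index.index, strict=False)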
@@ -182,18 +118,22 @@ def unnest_SBML_df(
     sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], table: str
 ) -> pd.DataFrame:
     """
-    Unnest SBML_dfs
+    Unnest and concatenate a specific table from multiple SBML_dfs models.
 
-    Merge corresponding tables from a set of models
-
-    sbml_dfs_dict: dict{cpr.SBML_dfs}
-        A dictionary of SBML_dfs from different models
-    table: str
-        A table to aggregate (e.g., species, reactions, compartments)
-
-    Returns:
-    pd.Dataframe, a table with a multindex of model and an entity_id
+    This function merges corresponding tables from a set of models into a single DataFrame,
+    adding the model name as an index level.
 
+    Parameters
+    ----------
+    sbml_dfs_dict : dict[str, sbml_dfs_core.SBML_dfs]
+        A dictionary of SBML_dfs objects from different models, keyed by model name.
+    table : str
+        The name of the table to aggregate (e.g., 'species', 'reactions', 'compartments').
+
+    Returns
+    -------
+    pd.DataFrame
+        A concatenated table with a MultiIndex of model and entity ID.
     """
 
     # check that all sbml_dfs have the same schema
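The "MultiIndex of model and entity ID" that unnest_SBML_df returns corresponds to the standard pandas concat-with-keys pattern; a self-contained sketch in plain pandas (illustrative, not the package's internal code):

    import pandas as pd

    species_a = pd.DataFrame({"s_name": ["ATP"]}, index=pd.Index(["S1"], name="s_id"))
    species_b = pd.DataFrame({"s_name": ["ADP"]}, index=pd.Index(["S1"], name="s_id"))

    # concatenating with keys adds the model name as the outer index level
    combined = pd.concat({"model_a": species_a, "model_b": species_b}, names=["model"])
    print(combined.index.names)  # ['model', 's_id']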
@@ -222,31 +162,30 @@ def construct_meta_entities_identifiers(
     defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
 ) -> tuple[pd.DataFrame, pd.Series]:
     """
-    Construct Meta Entities Defined by Identifiers
-
-    Aggregating across one entity type for a set of pathway models merge entities which share identifiers
+    Construct meta-entities by merging entities across models that share identifiers.
 
-    Parameters:
-    ----------
-    sbml_df_dict (dict{"model": cpr.SBML_dfs}):
-        A dictionary of cpr.SBML_dfs
-    pw_index (indices.PWIndex):
-        An index of all tables being aggregated
-    table (str):
-        A table/entity set from the sbml_dfs to work-with
-    fk_lookup_tables (dict):
-        Dictionary containing lookup tables for all foreign keys used by the table
-    defining_biological_qualifiers (list[str]):
-        BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
-        permissive settings could merge homologs, different forms of the same gene.
+    Aggregates a single entity type from a set of pathway models and merges entities that share identifiers
+    (as defined by the provided biological qualifiers).
 
-    Returns:
+    Parameters
     ----------
-    new_id_table: pd.DataFrame
-        Matching the schema of one of the tables within sbml_df_dict
-    lookup_table: pd.Series
-        Matches the index of the aggregated entities to new_ids
-
+    sbml_dfs_dict : dict[str, sbml_dfs_core.SBML_dfs]
+        A dictionary of SBML_dfs objects from different models, keyed by model name.
+    pw_index : indices.PWIndex
+        An index of all tables being aggregated.
+    table : str
+        The name of the table/entity set to aggregate (e.g., 'species', 'compartments').
+    fk_lookup_tables : dict, optional
+        Dictionary containing lookup tables for all foreign keys used by the table (default: empty dict).
+    defining_biological_qualifiers : list[str], optional
+        List of BQB codes which define distinct entities. Defaults to BQB_DEFINING_ATTRS.
+
+    Returns
+    -------
+    new_id_table : pd.DataFrame
+        Table matching the schema of one of the input models, with merged entities.
+    lookup_table : pd.Series
+        Series mapping the index of the aggregated entities to new consensus IDs.
     """
 
     # combine sbml_dfs by adding model to the index and concatinating all dfs
@@ -281,96 +220,58 @@ def reduce_to_consensus_ids(
     defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
 ) -> tuple[pd.DataFrame, pd.Series]:
     """
-    Reduce to Consensus
-
-    Reduce a table of entities to unique entries based on identifiers.
+    Reduce a table of entities to unique entries based on consensus identifiers.
 
-    Parameters:
-    ----------
-    sbml_df: pd.DataFrame
-        One type of entity from sbml_dfs_dict expanded to include
-        model its index, as produced by unnest_SBML_df(sbml_dfs_dict)
-    table_schema: dict
-        Schema for the table sbml_df
-    pw_index: indices.PWIndex
-        An index of all tables being aggregated
-    defining_biological_qualifiers: list(str)
-        A list of biological qualifier types which define distinct entities
+    This function clusters entities that share identifiers (as defined by the provided biological qualifiers)
+    and produces a new table of unique entities, along with a lookup table mapping original entities to consensus IDs.
 
-    Returns:
+    Parameters
     ----------
-    new_id_table: pd.DataFrame
-        Matching the schema of one of the tables within sbml_df_dict
-    lookup_table: pd.Series
-        Matches the index of the aggregated entities to new_ids
+    sbml_df : pd.DataFrame
+        Table of entities from multiple models, with model in the index (as produced by unnest_SBML_df).
+    table_schema : dict
+        Schema for the table being reduced.
+    pw_index : indices.PWIndex, optional
+        An index of all tables being aggregated (default: None).
+    defining_biological_qualifiers : list[str], optional
+        List of biological qualifier types which define distinct entities. Defaults to BQB_DEFINING_ATTRS.
+
+    Returns
+    -------
+    new_id_table : pd.DataFrame
+        Table matching the schema of one of the input models, with merged entities.
+    lookup_table : pd.Series
+        Series mapping the index of the aggregated entities to new consensus IDs.
     """
-
+    # Step 1: Build consensus identifiers to create clusters of equivalent entities
     indexed_cluster, cluster_consensus_identifiers = build_consensus_identifiers(
         sbml_df, table_schema, defining_biological_qualifiers
     )
 
-    # add cluster to reduce non-identifier attributes
+    # Step 2: Join cluster information to the original table
     agg_table_harmonized = sbml_df.join(indexed_cluster)
-    # create a new numbering schema off of cluster #s and id type
-    # print(agg_table_harmonized["cluster"])
-    # print(table_schema["pk"])
-
-    agg_table_harmonized["new_id"] = sbml_dfs_utils.id_formatter(
-        agg_table_harmonized["cluster"], table_schema["pk"]
-    )
 
-    lookup_table = agg_table_harmonized["new_id"]
+    # Step 3: Create lookup table for entity IDs
+    lookup_table = _create_entity_lookup_table(agg_table_harmonized, table_schema)
 
-    # add nameness_score as a measure of how-readable a possible name would be
-    # (this will help to select names which are more human readable after the merge)
+    # Step 4: Add nameness scores to help select representative names
     agg_table_harmonized = utils._add_nameness_score_wrapper(
         agg_table_harmonized, "label", table_schema
     )
 
-    # reduce to one row per new_id and set as the primary key of the source table
-    agg_table_reduced = (
-        agg_table_harmonized.reset_index(drop=True)
-        .sort_values(["nameness_score"])
-        .rename(columns={"new_id": table_schema["pk"]})
-        .groupby(table_schema["pk"])
-        .first()
-        .drop("nameness_score", axis=1)
-    )
-
-    new_id_table = (
-        agg_table_reduced.drop(table_schema["id"], axis=1)
-        .merge(cluster_consensus_identifiers, left_on="cluster", right_index=True)
-        .drop("cluster", axis=1)
+    # Step 5: Prepare the consensus table with one row per unique entity
+    new_id_table = _prepare_consensus_table(
+        agg_table_harmonized, table_schema, cluster_consensus_identifiers
     )
 
+    # Step 6: Add source information if required
     if "source" in table_schema.keys():
-        if type(pw_index) is not indices.PWIndex:
-            raise ValueError(
-                f"pw_index must be provided as a indices.PWIndex if there is a source but was type {type(pw_index)}"
-            )
-
-        # track the model(s) that each entity came from
-        new_sources = create_consensus_sources(
-            agg_table_harmonized, lookup_table, table_schema, pw_index
-        )
-        assert isinstance(new_sources, pd.Series)
-
-        new_id_table = new_id_table.drop(
-            table_schema[SOURCE_SPEC.SOURCE], axis=1
-        ).merge(new_sources, left_index=True, right_index=True)
-
-    # check that the index name and variables match the source
-    if set(sbml_df.index.names).difference({SOURCE_SPEC.MODEL}) != set(
-        new_id_table.index.names
-    ):
-        raise ValueError(
-            "The newly constructed id table's index does not match the inputs"
+        new_id_table = _add_consensus_sources(
+            new_id_table, agg_table_harmonized, lookup_table, table_schema, pw_index
         )
 
-    if set(sbml_df) != set(new_id_table.columns):
-        raise ValueError(
-            "The newly constructed id table's variables do not match the inputs"
-        )
+    # Step 7: Validate the resulting table
+    _validate_consensus_table(new_id_table, sbml_df)
 
     return new_id_table, lookup_table
 
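The lookup table built in Steps 2-3 simply gives every member of a cluster the same formatted consensus ID. A toy illustration in plain pandas (the zero-padded formatter below is a hypothetical stand-in for sbml_dfs_utils.id_formatter, which is not reproduced here):

    import pandas as pd

    clustered = pd.DataFrame(
        {"cluster": [0, 0, 1]},
        index=pd.MultiIndex.from_tuples(
            [("model_a", "S1"), ("model_b", "S1"), ("model_b", "S2")],
            names=["model", "s_id"],
        ),
    )

    # entities sharing a cluster receive the same new primary key
    clustered["new_id"] = ["S" + str(c).zfill(8) for c in clustered["cluster"]]
    lookup_table = clustered["new_id"]  # maps (model, old id) -> consensus id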
@@ -381,163 +282,85 @@ def build_consensus_identifiers(
     defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
 ) -> tuple[pd.Series, pd.DataFrame]:
     """
-    Build Consensus Identifiers
-
-    Take a set of entities spanning multiple models and find all unique entities.
+    Build consensus identifiers by clustering entities that share biological identifiers.
 
-    Defining attributes provided in defining_biological_qualifiers will
-    be used for grouping; other identifiers will be added back at the end.
+    This function takes a set of entities spanning multiple models and finds all unique entities
+    by grouping them according to the provided biological qualifiers. It returns a mapping from
+    original entities to clusters and a DataFrame of consensus identifier objects for each cluster.
 
-    Parameters:
-    ----------
-    sbml_df: pd.DataFrame
-        One type of entity from sbml_dfs_dict expanded to include model its index,
-        as produced by unnest_SBML_df(sbml_dfs_dict)
-    table_schema: dict
-        Schema for the table sbml_df
-    defining_biological_qualifiers: [str]
-        A list of biological qualifier types which should be used for grouping
-
-    Returns:
+    Parameters
     ----------
-    indexed_cluster: pd.Series
-        Maps the index from sbml_df onto a set of clusters which define unique entities
-    cluster_consensus_identifiers_df: pd.DataFrame
-        Maps an index of clusters onto a consensus cpr.identifiers.Identifiers object
+    sbml_df : pd.DataFrame
+        Table of entities from multiple models, with model in the index (as produced by unnest_SBML_df).
+    table_schema : dict
+        Schema for the table being processed.
+    defining_biological_qualifiers : list[str], optional
+        List of biological qualifier types to use for grouping. Defaults to BQB_DEFINING_ATTRS.
+
+    Returns
+    -------
+    indexed_cluster : pd.Series
+        Series mapping the index from sbml_df onto a set of clusters which define unique entities.
+    cluster_consensus_identifiers_df : pd.DataFrame
+        DataFrame mapping clusters to consensus identifiers (Identifiers objects).
     """
-
-    # create a table which is one row per entry
+    # Step 1: Extract and validate identifiers
     meta_identifiers = sbml_dfs_utils.unnest_identifiers(sbml_df, table_schema["id"])
-    # check the identifiers for missing attributes
     _validate_meta_identifiers(meta_identifiers)
 
-    # remove some biological qualifier types types to avoid over-grouping
-
-    valid_identifiers = meta_identifiers.copy()
-    valid_identifiers = valid_identifiers[
-        meta_identifiers[IDENTIFIERS.BQB].isin(defining_biological_qualifiers)
-    ]
-
-    # catch entries which no longer have any identifiers
-    # add a dummy identifier to these which will still uniquely tag them
-
-    filtered_entries = sbml_df.reset_index().merge(
-        valid_identifiers.reset_index(),
-        left_on=sbml_df.index.names,
-        right_on=sbml_df.index.names,
-        how="outer",
-    )[sbml_df.index.names + [IDENTIFIERS.IDENTIFIER]]
-    filtered_entries = filtered_entries[
-        filtered_entries[IDENTIFIERS.IDENTIFIER].isnull()
-    ]
-    if filtered_entries.shape[0] != 0:
-        logger.warning(
-            f"{filtered_entries.shape[0]} entries didn't possess identifiers and thus cannot be merged"
-        )
-
-    filtered_entries[SOURCE_SPEC.ENTRY] = 0
-    filtered_entries[IDENTIFIERS.ONTOLOGY] = "none"
-    filtered_entries[IDENTIFIERS.ONTOLOGY] = [
-        "dummy_value_" + str(val)
-        for val in random.sample(range(1, 100000000), filtered_entries.shape[0])
-    ]
-    filtered_entries[IDENTIFIERS.URL] = None
-    filtered_entries[IDENTIFIERS.BQB] = None
-
-    filtered_entries = filtered_entries.set_index(
-        sbml_df.index.names + [SOURCE_SPEC.ENTRY]
-    )
-
-    valid_identifiers = pd.concat([valid_identifiers, filtered_entries])
-
-    # combine multi-index into a single variable; combine ontology + identifiers as a single variable
-    valid_identifiers = utils.format_identifiers_as_edgelist(
-        valid_identifiers, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+    # Step 2: Filter identifiers by biological qualifier type
+    valid_identifiers = _filter_identifiers_by_qualifier(
+        meta_identifiers, defining_biological_qualifiers
    )
 
-    # create a unique tag for a species from the original index
-    indexed_species_tags = (
-        valid_identifiers.reset_index()
-        .set_index(valid_identifiers.index.names, drop=False)[sbml_df.index.names]
-        .astype(str)
-        .apply("__".join, axis=1)
-    )
-    valid_identifiers.loc[:, "model_spec"] = indexed_species_tags
+    # Step 3: Handle entries that don't have identifiers
+    valid_identifiers = _handle_entries_without_identifiers(sbml_df, valid_identifiers)
 
-    # convert index-identifier edge list into a network
-    # doing this will allow any entities with matching ontologies to be
-    # added to the same cluster so that they can be merged
-    id_edgelist = pd.concat(
-        [
-            valid_identifiers[["ind", "id"]],
-            # add id-ind edges so that identifiers corresponding to the same entity are grouped
-            # these entries will be discarded when merging the results back in by "ind"
-            valid_identifiers[["model_spec", "id"]].rename(
-                columns={"model_spec": "ind"}
-            ),
-        ]
-    )
+    # Step 4: Prepare edgelist for clustering
+    id_edgelist = _prepare_identifier_edgelist(valid_identifiers, sbml_df)
 
-    # aggregate index entries which have overlapping identifiers
-    # using a greedy graph-based approach
+    # Step 5: Cluster entities based on shared identifiers
     ind_clusters = utils.find_weakly_connected_subgraphs(id_edgelist)
 
-    # add clusters to identifier entries
-    valid_identifiers = valid_identifiers.reset_index().merge(ind_clusters)
-
-    # all entries for the same (model, id) will have the same cluster so convert back to
-    # sbml_df index to facilitate join
-    indexed_cluster = valid_identifiers.groupby(sbml_df.index.names).first()["cluster"]
-
-    # combine equivalent entries into a single Identifiers object
-    # include identifiers which were filtered by bqb
-
-    all_cluster_identifiers = meta_identifiers.reset_index().merge(
-        indexed_cluster, left_on=sbml_df.index.names, right_index=True
+    # Step 6: Map entity indices to clusters
+    valid_identifiers_with_clusters = valid_identifiers.reset_index().merge(
+        ind_clusters
     )
+    indexed_cluster = valid_identifiers_with_clusters.groupby(
+        sbml_df.index.names
+    ).first()["cluster"]
 
-    cluster_consensus_identifiers = {
-        k: identifiers.Identifiers(
-            list(
-                v[
-                    [
-                        IDENTIFIERS.ONTOLOGY,
-                        IDENTIFIERS.IDENTIFIER,
-                        IDENTIFIERS.URL,
-                        IDENTIFIERS.BQB,
-                    ]
-                ]
-                .T.to_dict()
-                .values()
-            )
-        )
-        for k, v in all_cluster_identifiers.groupby("cluster")
-    }
-
-    # recover clusters which don't have any identifiers
-    catchup_clusters = {
-        c: identifiers.Identifiers(list())
-        for c in set(ind_clusters["cluster"].tolist()).difference(
-            cluster_consensus_identifiers
-        )
-    }
-    cluster_consensus_identifiers = {
-        **cluster_consensus_identifiers,
-        **catchup_clusters,
-    }
-
-    cluster_consensus_identifiers_df = pd.DataFrame(
-        cluster_consensus_identifiers, index=[table_schema["id"]]
-    ).T
-    cluster_consensus_identifiers_df.index.name = "cluster"
+    # Step 7: Create consensus identifiers for each cluster
+    cluster_consensus_identifiers_df = _create_cluster_identifiers(
+        meta_identifiers, indexed_cluster, sbml_df, ind_clusters, table_schema
+    )
 
     return indexed_cluster, cluster_consensus_identifiers_df
 
 
 def pre_consensus_ontology_check(
     sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], tablename: str
-):
-    """Check for shared ontologies across source models."""
+) -> tuple[list, pd.DataFrame]:
+    """
+    Check for shared ontologies across source models for a given table.
+
+    For compartments, species, or reactions tables, this function returns the set of ontologies
+    shared among all SBML_dfs in the input dictionary, as well as a DataFrame summarizing ontologies per model.
+
+    Parameters
+    ----------
+    sbml_dfs_dict : dict[str, sbml_dfs_core.SBML_dfs]
+        Dictionary of SBML_dfs objects from different models, keyed by model name.
+    tablename : str
+        Name of the table to check (should be one of 'compartments', 'species', or 'reactions').
+
+    Returns
+    -------
+    shared_onto_list : list
+        List of ontologies shared by all models for the specified table.
+    sbml_dict_onto_df : pd.DataFrame
+        DataFrame summarizing ontologies present in each model for the specified table.
+    """
 
     # tablename: compartments/species/reactions tables with Identifiers
     # returns shared ontologies among sbml_dfs in sbml_dfs_dict for
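Steps 4-6 above amount to a connected-components computation over an entity-identifier edgelist: any two entities linked through a shared ontology/identifier pair fall into one cluster. A minimal sketch using networkx as an illustrative substitute for utils.find_weakly_connected_subgraphs (names and identifiers below are made up):

    import networkx as nx

    # edges link each tagged entity ("ind") to its ontology::identifier ("id")
    id_edgelist = [
        ("model_a__S1", "uniprot::P01112"),
        ("model_b__S9", "uniprot::P01112"),  # shares an identifier with model_a__S1
        ("model_b__S2", "chebi::15422"),
    ]

    g = nx.Graph()
    g.add_edges_from(id_edgelist)

    # entities sharing any identifier land in the same component (cluster)
    clusters = {
        node: i for i, comp in enumerate(nx.connected_components(g)) for node in comp
    }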
@@ -572,23 +395,23 @@ def pre_consensus_ontology_check(
     return shared_onto_list, sbml_dict_onto_df
 
 
-def _validate_meta_identifiers(meta_identifiers: pd.DataFrame) -> None:
-    """Flag cases where meta identifers are totally missing or BQB codes are not included"""
+def post_consensus_species_ontology_check(sbml_dfs: sbml_dfs_core.SBML_dfs) -> set[str]:
+    """
+    Check and return the set of ontologies shared by different sources in a consensus model's species table.
 
-    if meta_identifiers.shape[0] == 0:
-        raise ValueError(
-            '"meta_identifiers" was empty; some identifiers should be present'
-        )
-
-    n_null = sum(meta_identifiers["bqb"].isnull())
-    if n_null > 0:
-        msg = f"{n_null} identifiers were missing a bqb code and will not be mergeable"
-        logger.warn(msg)
-
-    return None
+    This function examines the species table in a consensus SBML_dfs object, determines the ontologies
+    present for each source model, and returns the intersection of ontologies shared by all sources.
 
+    Parameters
+    ----------
+    sbml_dfs : sbml_dfs_core.SBML_dfs
+        The consensus SBML_dfs object containing merged species from multiple models.
 
-def post_consensus_species_ontology_check(sbml_dfs: sbml_dfs_core.SBML_dfs) -> set[str]:
+    Returns
+    -------
+    set[str]
+        Set of ontology terms shared by all sources in the consensus model's species table.
+    """
     # Checking the ontology in "species" shared by different sources in a consensus model
     # returns a set of shared ontologies by different sources
 
@@ -636,27 +459,6 @@ def post_consensus_species_ontology_check(sbml_dfs: sbml_dfs_core.SBML_dfs) -> set[str]:
     return shared_onto_set
 
 
-def _update_foreign_keys(
-    agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
-) -> pd.DataFrame:
-    """Update one or more foreign keys based on old-to-new foreign key lookup table(s)."""
-
-    for fk in table_schema["fk"]:
-        updated_fks = (
-            agg_tbl[fk]
-            .reset_index()
-            .merge(
-                fk_lookup_tables[fk], left_on=[SOURCE_SPEC.MODEL, fk], right_index=True
-            )
-            .drop(fk, axis=1)
-            .rename(columns={"new_id": fk})
-            .set_index(["model", table_schema["pk"]])
-        )
-        agg_tbl = agg_tbl.drop(columns=fk).join(updated_fks)
-
-    return agg_tbl
-
-
 def pre_consensus_compartment_check(
     sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], tablename: str
 ) -> tuple[list, dict]:
@@ -855,146 +657,57 @@ def construct_meta_entities_members(
         Matching the schema of one of the tables within sbml_df_dict
     lookup_table: pd.Series
         Matches the index of the aggregated entities to new_ids
-
     """
-
     logger.info(
         f"Merging {table} based on identical membership ({' + '.join(defining_attrs)})"
     )
 
-    # combine sbml_dfs by adding model to the index and concatinating all dfs
-    agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=defined_by)
-
-    # to debug and see names of species
-    # comp_species = unnest_SBML_df(sbml_dfs_dict, table="compartmentalized_species")
-    # agg_tbl = agg_tbl.merge(comp_species, left_on = ["model", "sc_id"], right_index = True )
-
-    # since all sbml_dfs have the same schema pull out one schema for reference
+    # Step 1: Get schemas for both tables
     table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]
     defined_by_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[defined_by]
 
-    # update ids using previously created lookup tables
-    for k in defined_lookup_tables.keys():
-        agg_tbl = (
-            agg_tbl.merge(
-                defined_lookup_tables[k],
-                left_on=[SOURCE_SPEC.MODEL, k],
-                right_index=True,
-            )
-            .drop(k, axis=1)
-            .rename(columns={"new_id": k})
-        )
-
-    # create a set of species x compartment instances for each reaction
-    defining_fk = set(defined_by_schema["fk"]).difference({table_schema["pk"]})
-
-    if (
-        len(defining_fk) != 1
-        or len(defining_fk.intersection(set(defined_by_schema["fk"]))) != 1
-    ):
-        raise ValueError(
-            f"A foreign key could not be found in {defined_by} which was a primary key in {table}"
-        )
-    else:
-        defining_fk = list(defining_fk)[0]
-
-    # define what it is to be a unique member based on a combination of defining_attrs
-    valid_defining_attrs = agg_tbl.columns.values.tolist()
-    invalid_defining_attrs = [
-        x for x in defining_attrs if x not in valid_defining_attrs
-    ]
-
-    if len(invalid_defining_attrs) != 0:
-        raise ValueError(
-            f"{', '.join(invalid_defining_attrs)} was not found; "
-            f"valid defining_attrs are {', '.join(valid_defining_attrs)}"
-        )
-
-    # create unique members
-    agg_tbl["member"] = agg_tbl[defining_attrs].astype(str).apply("__".join, axis=1)
-
-    # members are aggregated by reaction
-    membership_df = (
-        agg_tbl.reset_index()
-        .groupby(["model", table_schema["pk"]])
-        .agg(membership=("member", lambda x: (list(set(x)))))
+    # Step 2: Prepare the member table and validate its structure
+    agg_tbl, defining_fk = _prepare_member_table(
+        sbml_dfs_dict,
+        defined_by,
+        defined_lookup_tables,
+        table_schema,
+        defined_by_schema,
+        defining_attrs,
+        table,
     )
 
-    # check whether members are duplicated within a given group
-    # suggesting that distinct entities have been coerced into
-    # the same entity
-    for i in range(membership_df.shape[0]):
-        members = membership_df["membership"].iloc[i]
-        if len(members) != len(set(members)):
-            _ = agg_tbl.reset_index().merge(
-                membership_df.iloc[i : i + 1],
-                how="inner",
-                left_on=[SOURCE_SPEC.MODEL, table_schema["pk"]],
-                right_index=True,
-            )
-
-            raise ValueError(
-                "Members were duplicated suggesting overmerging in the source "
-            )
-
-    membership_df["member_string"] = [
-        _create_member_string(x) for x in membership_df["membership"]
-    ]
-
-    membership_lookup = membership_df.reset_index()
+    # Step 3: Create lookup table for entity membership
+    membership_lookup = _create_membership_lookup(agg_tbl, table_schema)
 
-    consensus_entities = membership_lookup.groupby("member_string").first()
-    consensus_entities["new_id"] = sbml_dfs_utils.id_formatter(
-        range(consensus_entities.shape[0]), table_schema["pk"]
+    # Step 4: Create consensus entities and lookup table
+    consensus_entities, lookup_table = _create_entity_consensus(
+        membership_lookup, table_schema
     )
 
-    lookup_table = membership_lookup.merge(
-        consensus_entities["new_id"], left_on="member_string", right_index=True
-    ).set_index([SOURCE_SPEC.MODEL, table_schema["pk"]])["new_id"]
-
-    # logging merges that occurred
+    # Step 5: Log merger information
     report_consensus_merges(
         lookup_table, table_schema, sbml_dfs_dict=sbml_dfs_dict, n_example_merges=5
     )
 
+    # Step 6: Get primary entity table and merge identifiers
     agg_primary_table = unnest_SBML_df(sbml_dfs_dict, table=table)
 
-    # add nameness_score as a measure of how-readable a possible name would be
-    # (this will help to select names which are more human readable after the merge)
-    agg_primary_table = utils._add_nameness_score_wrapper(
-        agg_primary_table, "label", table_schema
-    )
-
-    new_id_table = (
-        agg_primary_table.join(lookup_table)
-        .reset_index(drop=True)
-        .sort_values(["nameness_score"])
-        .rename(columns={"new_id": table_schema["pk"]})
-        .groupby(table_schema["pk"])
-        .first()[table_schema["vars"]]
-    )
-
-    # merge identifiers
     logger.info(f"Merging {table} identifiers")
-    indexed_old_identifiers = (
-        agg_primary_table.join(lookup_table)
-        .reset_index(drop=True)
-        .rename(columns={"new_id": table_schema["pk"]})
-        .groupby(table_schema["pk"])[table_schema["id"]]
+    updated_identifiers = _merge_entity_identifiers(
+        agg_primary_table, lookup_table, table_schema
    )
 
-    # combine merged identifiers into single identifier objects indexed by new id
-    updated_identifiers = indexed_old_identifiers.agg(identifiers.merge_identifiers)
-
-    # add merged identifiers back to new_id table overwriting existing ids
-    new_id_table = new_id_table.drop(table_schema["id"], axis=1).merge(
-        updated_identifiers, left_index=True, right_index=True
+    # Step 7: Create consensus table with merged entities
+    new_id_table = _create_consensus_table(
+        agg_primary_table, lookup_table, updated_identifiers, table_schema
    )
 
+    # Step 8: Add source information if present
     if "source" in table_schema.keys():
         logger.info(f"Merging {table} sources")
 
-        # track the model(s) that each entity came from
+        # Track the model(s) that each entity came from
         new_sources = create_consensus_sources(
             agg_primary_table.merge(lookup_table, left_index=True, right_index=True),
             lookup_table,
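The "identical membership" criterion used here reduces each reaction to a canonical member string built from its defining attributes; reactions whose strings match are merged. A toy version of the idea (illustrative only; the real string construction lives in _create_member_string, which is not shown in this diff):

    import pandas as pd

    reaction_species = pd.DataFrame(
        {
            "r_id": ["R1", "R1", "R2", "R2"],
            "sc_id": ["SC1", "SC2", "SC1", "SC2"],
            "stoichiometry": [-1, 1, -1, 1],
        }
    )

    # each participant becomes "sc_id__stoichiometry"
    reaction_species["member"] = (
        reaction_species[["sc_id", "stoichiometry"]].astype(str).apply("__".join, axis=1)
    )

    # reactions with identical sorted participant sets collapse together
    member_strings = reaction_species.groupby("r_id")["member"].agg(
        lambda m: "_".join(sorted(m))
    )
    print(member_strings["R1"] == member_strings["R2"])  # True -> R1 merges with R2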
@@ -1190,6 +903,163 @@ def report_consensus_merges(
     return None
 
 
+def _create_entity_lookup_table(
+    agg_table_harmonized: pd.DataFrame, table_schema: dict
+) -> pd.Series:
+    """
+    Create a lookup table mapping original entity IDs to new consensus IDs.
+
+    Parameters:
+    ----------
+    agg_table_harmonized: pd.DataFrame
+        Table with cluster assignments for each entity
+    table_schema: dict
+        Schema for the table
+
+    Returns:
+    ----------
+    pd.Series
+        Lookup table mapping old entity IDs to new consensus IDs
+    """
+    # Create a new ID based on cluster number and entity type
+    agg_table_harmonized["new_id"] = sbml_dfs_utils.id_formatter(
+        agg_table_harmonized["cluster"], table_schema["pk"]
+    )
+
+    # Return the lookup series
+    return agg_table_harmonized["new_id"]
+
+
+def _prepare_consensus_table(
+    agg_table_harmonized: pd.DataFrame,
+    table_schema: dict,
+    cluster_consensus_identifiers: pd.DataFrame,
+) -> pd.DataFrame:
+    """
+    Prepare a consensus table with one row per unique entity.
+
+    Parameters:
+    ----------
+    agg_table_harmonized: pd.DataFrame
+        Table with nameness scores and cluster assignments
+    table_schema: dict
+        Schema for the table
+    cluster_consensus_identifiers: pd.DataFrame
+        Consensus identifiers for each cluster
+
+    Returns:
+    ----------
+    pd.DataFrame
+        New consensus table with merged entities
+    """
+    # Sort by nameness score and keep one row per new entity ID
+    agg_table_reduced = (
+        agg_table_harmonized.reset_index(drop=True)
+        .sort_values(["nameness_score"])
+        .rename(columns={"new_id": table_schema["pk"]})
+        .groupby(table_schema["pk"])
+        .first()
+        .drop("nameness_score", axis=1)
+    )
+
+    # Join in the consensus identifiers and drop the temporary cluster column
+    new_id_table = (
+        agg_table_reduced.drop(table_schema["id"], axis=1)
+        .merge(cluster_consensus_identifiers, left_on="cluster", right_index=True)
+        .drop("cluster", axis=1)
+    )
+
+    return new_id_table
+
+
+def _add_consensus_sources(
+    new_id_table: pd.DataFrame,
+    agg_table_harmonized: pd.DataFrame,
+    lookup_table: pd.Series,
+    table_schema: dict,
+    pw_index: indices.PWIndex | None,
+) -> pd.DataFrame:
+    """
+    Add source information to the consensus table.
+
+    Parameters:
+    ----------
+    new_id_table: pd.DataFrame
+        Consensus table without source information
+    agg_table_harmonized: pd.DataFrame
+        Original table with cluster assignments
+    lookup_table: pd.Series
+        Maps old IDs to new consensus IDs
+    table_schema: dict
+        Schema for the table
+    pw_index: indices.PWIndex | None
+        An index of all tables being aggregated
+
+    Returns:
+    ----------
+    pd.DataFrame
+        Consensus table with source information added
+    """
+    if type(pw_index) is not indices.PWIndex:
+        raise ValueError(
+            f"pw_index must be provided as a indices.PWIndex if there is a source but was type {type(pw_index)}"
+        )
+
+    # Track the model(s) that each entity came from
+    new_sources = create_consensus_sources(
+        agg_table_harmonized, lookup_table, table_schema, pw_index
+    )
+    assert isinstance(new_sources, pd.Series)
+
+    # Add the sources to the consensus table
+    updated_table = new_id_table.drop(table_schema[SOURCE_SPEC.SOURCE], axis=1).merge(
+        new_sources, left_index=True, right_index=True
+    )
+
+    return updated_table
+
+
+def _validate_consensus_table(
+    new_id_table: pd.DataFrame, sbml_df: pd.DataFrame
+) -> None:
+    """
+    Validate that the new consensus table has the same structure as the original.
+
+    Parameters:
+    ----------
+    new_id_table: pd.DataFrame
+        Newly created consensus table
+    sbml_df: pd.DataFrame
+        Original table from which consensus was built
+
+    Raises:
+    ------
+    ValueError
+        If index names or columns don't match
+    """
+    # Check that the index names match
+    if set(sbml_df.index.names).difference({SOURCE_SPEC.MODEL}) != set(
+        new_id_table.index.names
+    ):
+        raise ValueError(
+            f"The newly constructed id table's index does not match the inputs.\n"
+            f"Expected index names: {sbml_df.index.names}\n"
+            f"Actual index names: {new_id_table.index.names}"
+        )
+
+    # Check that the columns match
+    if set(sbml_df) != set(new_id_table.columns):
+        missing_in_new = set(sbml_df) - set(new_id_table.columns)
+        extra_in_new = set(new_id_table.columns) - set(sbml_df)
+        raise ValueError(
+            "The newly constructed id table's variables do not match the inputs.\n"
+            f"Expected columns: {list(sbml_df.columns)}\n"
+            f"Actual columns: {list(new_id_table.columns)}\n"
+            f"Missing in new: {missing_in_new}\n"
+            f"Extra in new: {extra_in_new}"
+        )
+
+
 def merge_entity_data(
     sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
     lookup_table: pd.Series,
@@ -1232,35 +1102,619 @@ def merge_entity_data(
1232
1102
  return entity_data
1233
1103
 
1234
1104
 
1235
- def _check_sbml_dfs_dict(sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]) -> None:
1236
- """Check models in SBML_dfs for problems which can be reported up-front
1237
-
1238
- Args:
1239
- sbml_dfs_dict (dict(pd.DataFrame)): a dict of sbml_dfs models;
1240
- primarily used as an input for construct_consensus_model
1241
-
1242
- Returns:
1243
- None
1244
-
1105
+ def _create_consensus_entities(
1106
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
1107
+ pw_index: indices.PWIndex,
1108
+ defining_biological_qualifiers: list[str],
1109
+ ) -> tuple[dict, dict]:
1245
1110
  """
1111
+ Create consensus entities for all primary tables in the model.
1246
1112
 
1247
- for k, v in sbml_dfs_dict.items():
1248
- _check_sbml_dfs(sbml_dfs=v, model_label=k)
1249
- return None
1113
+ This helper function creates consensus compartments, species, compartmentalized species,
1114
+ reactions, and reaction species by finding shared entities across source models.
1250
1115
 
1116
+ Parameters:
1117
+ ----------
1118
+ sbml_dfs_dict: dict{cpr.SBML_dfs}
1119
+ A dictionary of SBML_dfs from different models
1120
+ pw_index: indices.PWIndex
1121
+ An index of all tables being aggregated
1122
+ defining_biological_qualifiers: list[str]
1123
+ Biological qualifier terms that define distinct entities
1251
1124
 
1252
- def _check_sbml_dfs(
1253
- sbml_dfs: sbml_dfs_core.SBML_dfs, model_label: str, N_examples: int | str = 5
1254
- ) -> None:
1255
- """Check SBML_dfs for identifiers which are associated with different entities before a merge."""
1125
+ Returns:
1126
+ ----------
1127
+ tuple:
1128
+ - dict of consensus entities tables
1129
+ - dict of lookup tables
1130
+ """
1131
+ # Step 1: Compartments
1132
+ logger.info("Defining compartments based on unique ids")
1133
+ comp_consensus_entities, comp_lookup_table = construct_meta_entities_identifiers(
1134
+ sbml_dfs_dict=sbml_dfs_dict, pw_index=pw_index, table="compartments"
1135
+ )
1256
1136
 
1257
- ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
1258
- defining_ids = ids[ids[IDENTIFIERS.BQB].isin(BQB_DEFINING_ATTRS)]
1137
+ # Step 2: Species
1138
+ logger.info("Defining species based on unique ids")
1139
+ spec_consensus_entities, spec_lookup_table = construct_meta_entities_identifiers(
1140
+ sbml_dfs_dict=sbml_dfs_dict,
1141
+ pw_index=pw_index,
1142
+ table=SBML_DFS.SPECIES,
1143
+ defining_biological_qualifiers=defining_biological_qualifiers,
1144
+ )
1259
1145
 
1260
- defining_identifier_counts = defining_ids.value_counts(
1261
- [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
1146
+ # Step 3: Compartmentalized species
1147
+ logger.info(
1148
+ "Defining compartmentalized species based on unique species x compartments"
1262
1149
  )
1263
- degenerate_defining_identities = (
1150
+ compspec_consensus_instances, compspec_lookup_table = construct_meta_entities_fk(
1151
+ sbml_dfs_dict,
1152
+ pw_index,
1153
+ table=SBML_DFS.COMPARTMENTALIZED_SPECIES,
1154
+ fk_lookup_tables={
1155
+ SBML_DFS.C_ID: comp_lookup_table,
1156
+ SBML_DFS.S_ID: spec_lookup_table,
1157
+ },
1158
+ )
1159
+
1160
+ # Step 4: Reactions
1161
+ logger.info(
1162
+ "Define reactions based on membership of identical compartmentalized species"
1163
+ )
1164
+ rxn_consensus_species, rxn_lookup_table = construct_meta_entities_members(
1165
+ sbml_dfs_dict,
1166
+ pw_index,
1167
+ table=SBML_DFS.REACTIONS,
1168
+ defined_by=SBML_DFS.REACTION_SPECIES,
1169
+ defined_lookup_tables={SBML_DFS.SC_ID: compspec_lookup_table},
1170
+ defining_attrs=[SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY],
1171
+ )
1172
+
1173
+ logger.info("Annotating reversibility based on merged reactions")
1174
+ rxn_consensus_species = _resolve_reversibility(
1175
+ sbml_dfs_dict, rxn_consensus_species, rxn_lookup_table
1176
+ )
1177
+
1178
+ # Step 5: Reaction species
1179
+ logger.info("Define reaction species based on reactions")
1180
+ rxnspec_consensus_instances, rxnspec_lookup_table = construct_meta_entities_fk(
1181
+ sbml_dfs_dict,
1182
+ pw_index,
1183
+ table=SBML_DFS.REACTION_SPECIES,
1184
+ fk_lookup_tables={
1185
+ SBML_DFS.R_ID: rxn_lookup_table,
1186
+ SBML_DFS.SC_ID: compspec_lookup_table,
1187
+ },
1188
+ # retain species with different roles
1189
+ extra_defining_attrs=[SBML_DFS.SBO_TERM],
1190
+ )
1191
+
1192
+ consensus_entities = {
1193
+ SBML_DFS.COMPARTMENTS: comp_consensus_entities,
1194
+ SBML_DFS.SPECIES: spec_consensus_entities,
1195
+ SBML_DFS.COMPARTMENTALIZED_SPECIES: compspec_consensus_instances,
1196
+ SBML_DFS.REACTIONS: rxn_consensus_species,
1197
+ SBML_DFS.REACTION_SPECIES: rxnspec_consensus_instances,
1198
+ }
1199
+
1200
+ lookup_tables = {
1201
+ SBML_DFS.COMPARTMENTS: comp_lookup_table,
1202
+ SBML_DFS.SPECIES: spec_lookup_table,
1203
+ SBML_DFS.COMPARTMENTALIZED_SPECIES: compspec_lookup_table,
1204
+ SBML_DFS.REACTIONS: rxn_lookup_table,
1205
+ SBML_DFS.REACTION_SPECIES: rxnspec_lookup_table,
1206
+ }
1207
+
1208
+ return consensus_entities, lookup_tables
1209
+
1210
+
1211
+ def _add_entity_data(
1212
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
1213
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
1214
+ lookup_tables: dict,
1215
+ ) -> sbml_dfs_core.SBML_dfs:
1216
+ """
1217
+ Add entity data from component models to the consensus model.
1218
+
1219
+ Parameters:
1220
+ ----------
1221
+ sbml_dfs: sbml_dfs_core.SBML_dfs
1222
+ The consensus model being built
1223
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]
1224
+ A dictionary of SBML_dfs from different models
1225
+ lookup_tables: dict
1226
+ Dictionary of lookup tables for translating between old and new entity IDs
1227
+
1228
+ Returns:
1229
+ ----------
1230
+ sbml_dfs_core.SBML_dfs
1231
+ The updated consensus model
1232
+ """
1233
+ # Add species data
1234
+ consensus_species_data = merge_entity_data(
1235
+ sbml_dfs_dict,
1236
+ lookup_table=lookup_tables[SBML_DFS.SPECIES],
1237
+ table=SBML_DFS.SPECIES,
1238
+ )
1239
+ for k in consensus_species_data.keys():
1240
+ sbml_dfs.add_species_data(k, consensus_species_data[k])
1241
+
1242
+ # Add reactions data
1243
+ consensus_reactions_data = merge_entity_data(
1244
+ sbml_dfs_dict,
1245
+ lookup_table=lookup_tables[SBML_DFS.REACTIONS],
1246
+ table=SBML_DFS.REACTIONS,
1247
+ )
1248
+ for k in consensus_reactions_data.keys():
1249
+ sbml_dfs.add_reactions_data(k, consensus_reactions_data[k])
1250
+
1251
+ return sbml_dfs
1252
+
1253
+
1254
+ def _prepare_member_table(
1255
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
1256
+ defined_by: str,
1257
+ defined_lookup_tables: dict,
1258
+ table_schema: dict,
1259
+ defined_by_schema: dict,
1260
+ defining_attrs: list[str],
1261
+ table: str = SBML_DFS.REACTIONS,
1262
+ ) -> tuple[pd.DataFrame, str]:
1263
+ """
1264
+ Prepare a table of members and validate their structure.
1265
+
1266
+ Parameters:
1267
+ ----------
1268
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]
1269
+ Dictionary of SBML_dfs from different models
1270
+ defined_by: str
1271
+ Name of the table whose entries define membership
1272
+ defined_lookup_tables: dict
1273
+ Lookup tables for updating IDs
1274
+ table_schema: dict
1275
+ Schema for the main table
1276
+ defined_by_schema: dict
1277
+ Schema for the defining table
1278
+ defining_attrs: list[str]
1279
+ Attributes that define a unique member
1280
+ table: str
1281
+ Name of the main table (default: REACTIONS)
1282
+
1283
+ Returns:
1284
+ ----------
1285
+ tuple:
1286
+ - Updated aggregated table with member strings
1287
+ - Name of the foreign key
1288
+ """
1289
+ # Combine models into a single table
1290
+ agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=defined_by)
1291
+
1292
+ # Update IDs using previously created lookup tables
1293
+ for k in defined_lookup_tables.keys():
1294
+ agg_tbl = (
1295
+ agg_tbl.merge(
1296
+ defined_lookup_tables[k],
1297
+ left_on=[SOURCE_SPEC.MODEL, k],
1298
+ right_index=True,
1299
+ )
1300
+ .drop(k, axis=1)
1301
+ .rename(columns={"new_id": k})
1302
+ )
1303
+
1304
+ # Identify the foreign key
1305
+ defining_fk = set(defined_by_schema["fk"]).difference({table_schema["pk"]})
1306
+
1307
+ if (
1308
+ len(defining_fk) != 1
1309
+ or len(defining_fk.intersection(set(defined_by_schema["fk"]))) != 1
1310
+ ):
1311
+ raise ValueError(
1312
+ f"A foreign key could not be found in {defined_by} which was a primary key in {table}"
1313
+ )
1314
+ else:
1315
+ defining_fk = list(defining_fk)[0]
1316
+
1317
+ # Validate defining attributes
1318
+ valid_defining_attrs = agg_tbl.columns.values.tolist()
1319
+ invalid_defining_attrs = [
1320
+ x for x in defining_attrs if x not in valid_defining_attrs
1321
+ ]
1322
+
1323
+ if len(invalid_defining_attrs) != 0:
1324
+ raise ValueError(
1325
+ f"{', '.join(invalid_defining_attrs)} was not found; "
1326
+ f"valid defining_attrs are {', '.join(valid_defining_attrs)}"
1327
+ )
1328
+
1329
+ # Create unique member strings
1330
+ agg_tbl["member"] = agg_tbl[defining_attrs].astype(str).apply("__".join, axis=1)
1331
+
1332
+ return agg_tbl, defining_fk
1333
+
1334
+
1335
+ def _create_membership_lookup(
1336
+ agg_tbl: pd.DataFrame, table_schema: dict
1337
+ ) -> pd.DataFrame:
1338
+ """
1339
+ Create a lookup table for entity membership.
1340
+
1341
+ Parameters:
1342
+ ----------
1343
+ agg_tbl: pd.DataFrame
1344
+ Table with member information
1345
+ table_schema: dict
1346
+ Schema for the table
1347
+
1348
+ Returns:
1349
+ ----------
1350
+ pd.DataFrame
1351
+ Lookup table mapping entity IDs to member strings
1352
+ """
1353
+ # Group members by entity
1354
+ membership_df = (
1355
+ agg_tbl.reset_index()
1356
+ .groupby(["model", table_schema["pk"]])
1357
+ .agg(membership=("member", lambda x: (list(set(x)))))
1358
+ )
1359
+
1360
+ # Check for duplicated members within an entity
1361
+ for i in range(membership_df.shape[0]):
1362
+ members = membership_df["membership"].iloc[i]
1363
+ if len(members) != len(set(members)):
1364
+ raise ValueError(
1365
+ "Members were duplicated suggesting overmerging in the source"
1366
+ )
1367
+
1368
+ # Convert membership lists to strings for comparison
1369
+ membership_df["member_string"] = [
1370
+ _create_member_string(x) for x in membership_df["membership"]
1371
+ ]
1372
+
1373
+ return membership_df.reset_index()
1374
+
1375
+
1376
+ def _create_entity_consensus(
1377
+ membership_lookup: pd.DataFrame, table_schema: dict
1378
+ ) -> tuple[pd.DataFrame, pd.Series]:
1379
+ """
1380
+ Create consensus entities based on membership.
1381
+
1382
+ Parameters:
1383
+ ----------
1384
+ membership_lookup: pd.DataFrame
1385
+ Table mapping entities to their member strings
1386
+ table_schema: dict
1387
+ Schema for the table
1388
+
1389
+ Returns:
1390
+ ----------
1391
+ tuple:
1392
+ - Consensus entities DataFrame
1393
+ - Lookup table mapping old IDs to new IDs
1394
+ """
1395
+ # Group by member string to find entities with identical members
1396
+ consensus_entities = membership_lookup.groupby("member_string").first()
1397
+
1398
+ # Create new IDs for the consensus entities
1399
+ consensus_entities["new_id"] = sbml_dfs_utils.id_formatter(
1400
+ range(consensus_entities.shape[0]), table_schema["pk"]
1401
+ )
1402
+
1403
+ # Create lookup table mapping original entities to consensus entities
1404
+ lookup_table = membership_lookup.merge(
1405
+ consensus_entities["new_id"], left_on="member_string", right_index=True
1406
+ ).set_index([SOURCE_SPEC.MODEL, table_schema["pk"]])["new_id"]
1407
+
1408
+ return consensus_entities, lookup_table
1409
+
1410
+
1411
+ def _merge_entity_identifiers(
1412
+ agg_primary_table: pd.DataFrame, lookup_table: pd.Series, table_schema: dict
1413
+ ) -> pd.Series:
1414
+ """
1415
+ Merge identifiers from multiple entities.
1416
+
1417
+ Parameters:
1418
+ ----------
1419
+ agg_primary_table: pd.DataFrame
1420
+ Table of entities
1421
+ lookup_table: pd.Series
1422
+ Lookup table mapping old IDs to new IDs
1423
+ table_schema: dict
1424
+ Schema for the table
1425
+
1426
+ Returns:
1427
+ ----------
1428
+ pd.Series
1429
+ Series mapping new IDs to merged identifier objects
1430
+ """
1431
+ # Combine entities with the same consensus ID
1432
+ indexed_old_identifiers = (
1433
+ agg_primary_table.join(lookup_table)
1434
+ .reset_index(drop=True)
1435
+ .rename(columns={"new_id": table_schema["pk"]})
1436
+ .groupby(table_schema["pk"])[table_schema["id"]]
1437
+ )
1438
+
1439
+ # Merge identifier objects
1440
+ return indexed_old_identifiers.agg(identifiers.merge_identifiers)
1441
+
1442
+
1443
+def _create_consensus_table(
+    agg_primary_table: pd.DataFrame,
+    lookup_table: pd.Series,
+    updated_identifiers: pd.Series,
+    table_schema: dict,
+) -> pd.DataFrame:
+    """
+    Create a consensus table with merged entities.
+
+    Parameters:
+    ----------
+    agg_primary_table: pd.DataFrame
+        Table of entities
+    lookup_table: pd.Series
+        Lookup table mapping old IDs to new IDs
+    updated_identifiers: pd.Series
+        Series mapping new IDs to merged identifier objects
+    table_schema: dict
+        Schema for the table
+
+    Returns:
+    ----------
+    pd.DataFrame
+        Consensus table with one row per unique entity
+    """
+    # Add nameness scores to help select representative names
+    agg_primary_table_scored = utils._add_nameness_score_wrapper(
+        agg_primary_table, "label", table_schema
+    )
+
+    # Create a table with one row per consensus entity
+    new_id_table = (
+        agg_primary_table_scored.join(lookup_table)
+        .reset_index(drop=True)
+        .sort_values(["nameness_score"])
+        .rename(columns={"new_id": table_schema["pk"]})
+        .groupby(table_schema["pk"])
+        .first()[table_schema["vars"]]
+    )
+
+    # Replace identifiers with merged versions
+    new_id_table = new_id_table.drop(table_schema["id"], axis=1).merge(
+        updated_identifiers, left_index=True, right_index=True
+    )
+
+    return new_id_table
+
+
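The sort-then-first idiom carries the weight here: after scoring, the most name-like label wins for each consensus entity. A toy version of just that idiom (the scoring itself lives in utils._add_nameness_score_wrapper, which is not shown in this diff; the scores below are hypothetical and lower is assumed to mean more name-like, consistent with the ascending sort):

import pandas as pd

labels = pd.DataFrame(
    {
        "s_id": ["S0001", "S0001"],
        "s_name": ["epidermal growth factor receptor", "EGFR"],
        "nameness_score": [0.7, 0.1],  # hypothetical; lower = more name-like
    }
)

best = labels.sort_values("nameness_score").groupby("s_id").first()
# best.loc["S0001", "s_name"] == "EGFR"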
+def _filter_identifiers_by_qualifier(
+    meta_identifiers: pd.DataFrame, defining_biological_qualifiers: list[str]
+) -> pd.DataFrame:
+    """
+    Filter identifiers to only include those with specific biological qualifiers.
+
+    Parameters:
+    ----------
+    meta_identifiers: pd.DataFrame
+        Table of identifiers
+    defining_biological_qualifiers: list[str]
+        List of biological qualifier types to keep
+
+    Returns:
+    ----------
+    pd.DataFrame
+        Filtered identifiers
+    """
+    valid_identifiers = meta_identifiers.copy()
+    return valid_identifiers[
+        meta_identifiers[IDENTIFIERS.BQB].isin(defining_biological_qualifiers)
+    ]
+
+
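This filter embodies a central merging rule: only "defining" qualifiers (e.g. BQB_IS) establish identity, while contextual qualifiers (e.g. BQB_OCCURS_IN) must not trigger merges. A compact illustration with literal qualifier strings and invented column names (the real code reads both from constants):

import pandas as pd

ids = pd.DataFrame(
    {
        "identifier": ["P01112", "GO:0005886", "15422"],
        "bqb": ["BQB_IS", "BQB_OCCURS_IN", "BQB_IS"],
    }
)

defining = ids[ids["bqb"].isin(["BQB_IS"])]
# GO:0005886 (a location annotation) is dropped, so two species that merely
# share a compartment are never merged into one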
+def _handle_entries_without_identifiers(
+    sbml_df: pd.DataFrame, valid_identifiers: pd.DataFrame
+) -> pd.DataFrame:
+    """
+    Handle entities that don't have identifiers by adding dummy identifiers.
+
+    Parameters:
+    ----------
+    sbml_df: pd.DataFrame
+        Original table of entities
+    valid_identifiers: pd.DataFrame
+        Table of identifiers that passed filtering
+
+    Returns:
+    ----------
+    pd.DataFrame
+        Valid identifiers with dummy entries added
+    """
+    # Find entries which no longer have any identifiers
+    filtered_entries = sbml_df.reset_index().merge(
+        valid_identifiers.reset_index(),
+        left_on=sbml_df.index.names,
+        right_on=sbml_df.index.names,
+        how="outer",
+    )[sbml_df.index.names + [IDENTIFIERS.IDENTIFIER]]
+
+    filtered_entries = filtered_entries[
+        filtered_entries[IDENTIFIERS.IDENTIFIER].isnull()
+    ]
+
+    if filtered_entries.shape[0] == 0:
+        return valid_identifiers
+
+    # Add dummy identifiers to these entries
+    logger.warning(
+        f"{filtered_entries.shape[0]} entries didn't possess identifiers and thus cannot be merged"
+    )
+
+    filtered_entries[SOURCE_SPEC.ENTRY] = 0
+    filtered_entries[IDENTIFIERS.ONTOLOGY] = "none"
+    filtered_entries[IDENTIFIERS.IDENTIFIER] = [
+        "dummy_value_" + str(val)
+        for val in random.sample(range(1, 100000000), filtered_entries.shape[0])
+    ]
+    filtered_entries[IDENTIFIERS.URL] = None
+    filtered_entries[IDENTIFIERS.BQB] = None
+
+    filtered_entries = filtered_entries.set_index(
+        sbml_df.index.names + [SOURCE_SPEC.ENTRY]
+    )
+
+    # Combine original valid identifiers with dummy identifiers
+    return pd.concat([valid_identifiers, filtered_entries])
+
+
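The dummy values matter: each identifier-less entity receives a unique throwaway identifier, so these entities survive the merge intact rather than all collapsing onto a shared null. The guarantee comes from sampling without replacement:

import random

random.seed(0)  # seeding only for a reproducible illustration
dummies = ["dummy_value_" + str(v) for v in random.sample(range(1, 100000000), 3)]
assert len(set(dummies)) == 3  # sampled without replacement: all distinct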
+def _prepare_identifier_edgelist(
+    valid_identifiers: pd.DataFrame, sbml_df: pd.DataFrame
+) -> pd.DataFrame:
+    """
+    Prepare an edgelist for clustering identifiers.
+
+    Parameters:
+    ----------
+    valid_identifiers: pd.DataFrame
+        Table of identifiers
+    sbml_df: pd.DataFrame
+        Original table of entities
+
+    Returns:
+    ----------
+    pd.DataFrame
+        Edgelist connecting entities to their identifiers
+    """
+    # Format identifiers as edgelist
+    formatted_identifiers = utils.format_identifiers_as_edgelist(
+        valid_identifiers, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+    )
+
+    # Create a unique tag for each entity from the original index
+    indexed_species_tags = (
+        formatted_identifiers.reset_index()
+        .set_index(formatted_identifiers.index.names, drop=False)[sbml_df.index.names]
+        .astype(str)
+        .apply("__".join, axis=1)
+    )
+    formatted_identifiers.loc[:, "model_spec"] = indexed_species_tags
+
+    # Create edgelist that connects entities to identifiers
+    id_edgelist = pd.concat(
+        [
+            formatted_identifiers[["ind", "id"]],
+            # Add edges connecting model-specific instances to their identifiers
+            formatted_identifiers[["model_spec", "id"]].rename(
+                columns={"model_spec": "ind"}
+            ),
+        ]
+    )
+
+    return id_edgelist
+
+
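This hunk only builds the edgelist; the clustering happens downstream and is not part of this diff. As a sketch of how such an entity-identifier edgelist resolves into merge groups, connected components over the bipartite graph do the job (python-igraph shown here as an assumed stand-in for napistu's actual clustering call; node labels are invented):

import igraph as ig

edges = [
    ("model1__S1", "uniprot__P01112"),
    ("model2__S9", "uniprot__P01112"),  # the shared identifier bridges both models
    ("model1__S2", "chebi__15422"),
]

g = ig.Graph.TupleList(edges, directed=False)
components = g.connected_components()
# model1__S1 and model2__S9 land in one component -> one consensus species;
# model1__S2 stays alone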
+def _create_cluster_identifiers(
+    meta_identifiers: pd.DataFrame,
+    indexed_cluster: pd.Series,
+    sbml_df: pd.DataFrame,
+    ind_clusters: pd.DataFrame,
+    table_schema: dict,
+) -> pd.DataFrame:
+    """
+    Create identifier objects for each cluster.
+
+    Parameters
+    ----------
+    meta_identifiers : pd.DataFrame
+        All identifiers (including those filtered out by BQB)
+    indexed_cluster : pd.Series
+        Maps entity indices to cluster IDs
+    sbml_df : pd.DataFrame
+        Original table of entities
+    ind_clusters : pd.DataFrame
+        Cluster assignments from graph algorithm
+    table_schema : dict
+        Schema for the table, used to determine the correct identifier column name
+
+    Returns
+    -------
+    pd.DataFrame
+        Table mapping clusters to their consensus identifiers, with the identifier column named according to the schema
+    """
+    # Combine all identifiers with cluster assignments
+    all_cluster_identifiers = meta_identifiers.reset_index().merge(
+        indexed_cluster, left_on=sbml_df.index.names, right_index=True
+    )
+
+    # Create an Identifiers object for each cluster
+    cluster_consensus_identifiers = {
+        k: identifiers.Identifiers(
+            list(
+                v[
+                    [
+                        IDENTIFIERS.ONTOLOGY,
+                        IDENTIFIERS.IDENTIFIER,
+                        IDENTIFIERS.URL,
+                        IDENTIFIERS.BQB,
+                    ]
+                ]
+                .T.to_dict()
+                .values()
+            )
+        )
+        for k, v in all_cluster_identifiers.groupby("cluster")
+    }
+
+    # Handle clusters that don't have any identifiers
+    catchup_clusters = {
+        c: identifiers.Identifiers(list())
+        for c in set(ind_clusters["cluster"].tolist()).difference(
+            cluster_consensus_identifiers
+        )
+    }
+    cluster_consensus_identifiers = {
+        **cluster_consensus_identifiers,
+        **catchup_clusters,
+    }
+
+    # Convert to DataFrame with correct column name
+    id_col = table_schema["id"]
+    cluster_consensus_identifiers_df = pd.DataFrame(
+        cluster_consensus_identifiers, index=[id_col]
+    ).T
+    cluster_consensus_identifiers_df.index.name = "cluster"
+    return cluster_consensus_identifiers_df
+
+
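One subtlety worth spelling out: clusters formed purely from dummy-identifier entities contribute no rows to the groupby, so they would vanish without the catch-up step. The set-difference-then-merge pattern in isolation (plain lists stand in for Identifiers objects):

clusters_with_ids = {0: ["uniprot:P01112"], 2: ["chebi:15422"]}
all_clusters = {0, 1, 2}

# cluster 1 had no identifiers; give it an explicit empty entry
catchup = {c: [] for c in all_clusters.difference(clusters_with_ids)}
merged = {**clusters_with_ids, **catchup}
assert set(merged) == all_clusters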
+def _check_sbml_dfs_dict(sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]) -> None:
+    """Check models in SBML_dfs for problems which can be reported up-front
+
+    Args:
+        sbml_dfs_dict (dict[str, sbml_dfs_core.SBML_dfs]): a dict of sbml_dfs models;
+            primarily used as an input for construct_consensus_model
+
+    Returns:
+        None
+
+    """
+
+    for k, v in sbml_dfs_dict.items():
+        _check_sbml_dfs(sbml_dfs=v, model_label=k)
+    return None
+
+
+def _check_sbml_dfs(
+    sbml_dfs: sbml_dfs_core.SBML_dfs, model_label: str, N_examples: int | str = 5
+) -> None:
+    """Check SBML_dfs for identifiers which are associated with different entities before a merge."""
+
+    ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
+    defining_ids = ids[ids[IDENTIFIERS.BQB].isin(BQB_DEFINING_ATTRS)]
+
+    defining_identifier_counts = defining_ids.value_counts(
+        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+    )
+    degenerate_defining_identities = (
         defining_identifier_counts[defining_identifier_counts > 1]
         .rename("N")
         .reset_index()
@@ -1314,9 +1768,46 @@ def _validate_meta_identifiers(meta_identifiers: pd.DataFrame) -> None:
     return None
 
 
+def _validate_meta_identifiers(meta_identifiers: pd.DataFrame) -> None:
+    """Flag cases where meta identifiers are totally missing or BQB codes are not included"""
+
+    if meta_identifiers.shape[0] == 0:
+        raise ValueError(
+            '"meta_identifiers" was empty; some identifiers should be present'
+        )
+
+    n_null = sum(meta_identifiers["bqb"].isnull())
+    if n_null > 0:
+        msg = f"{n_null} identifiers were missing a bqb code and will not be mergeable"
+        logger.warning(msg)
+
+    return None
+
+
+def _update_foreign_keys(
+    agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
+) -> pd.DataFrame:
+    for fk in table_schema["fk"]:
+        updated_fks = (
+            agg_tbl[fk]
+            .reset_index()
+            .merge(
+                fk_lookup_tables[fk], left_on=[SOURCE_SPEC.MODEL, fk], right_index=True
+            )
+            .drop(fk, axis=1)
+            .rename(columns={"new_id": fk})
+            .set_index(["model", table_schema["pk"]])
+        )
+        agg_tbl = agg_tbl.drop(columns=fk).join(updated_fks)
+
+    return agg_tbl
+
+
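To see what _update_foreign_keys accomplishes, consider compartmentalized species whose c_id compartment keys must follow compartments that were just merged. A pared-down sketch using a plain map in place of the per-model merge (the sc_id/c_id names follow napistu's schema conventions, but the data and lookup are invented):

import pandas as pd

# old-to-new compartment lookup produced by the compartment consensus step
lookup = pd.Series({"comp_a": "C0001", "comp_b": "C0001"}, name="new_id")

compartmentalized = pd.DataFrame(
    {"sc_id": ["SC1", "SC2"], "c_id": ["comp_a", "comp_b"]}
)
compartmentalized["c_id"] = compartmentalized["c_id"].map(lookup)
# both rows now reference the consensus compartment C0001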
 def _update_foreign_keys(
     agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
 ) -> pd.DataFrame:
+    """Update one or more foreign keys based on old-to-new foreign key lookup table(s)."""
+
     for fk in table_schema["fk"]:
         updated_fks = (
             agg_tbl[fk]
@@ -1378,8 +1869,14 @@ def _resolve_reversibility(
         SBML_DFS.R_ISREVERSIBLE, axis=1
     ).join(r_id_reversibility)
 
-    assert rxns_w_reversibility.shape[0] == rxn_consensus_species.shape[0]
-    assert all(rxns_w_reversibility[SBML_DFS.R_ISREVERSIBLE].isin([True, False]))
+    if rxns_w_reversibility.shape[0] != rxn_consensus_species.shape[0]:
+        raise ValueError(
+            "rxns_w_reversibility and rxn_consensus_species must have the same number of rows"
+        )
+    if not all(rxns_w_reversibility[SBML_DFS.R_ISREVERSIBLE].isin([True, False])):
+        raise ValueError(
+            "All rxns_w_reversibility[R_ISREVERSIBLE] must be True or False"
+        )
 
     return rxns_w_reversibility
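A note on this final hunk: swapping asserts for explicit ValueError raises keeps the consistency checks active even under python -O, which strips assert statements at runtime. A minimal contrast (check_rows is a hypothetical helper, not part of the package):

def check_rows(a_rows: int, b_rows: int) -> None:
    # an assert here would vanish under `python -O`; the raise never does
    if a_rows != b_rows:
        raise ValueError("row counts must match")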