napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
napistu/consensus.py ADDED
@@ -0,0 +1,1557 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ import random
6
+
7
+ import pandas as pd
8
+ from tqdm import tqdm
9
+
10
+ from napistu import identifiers
11
+ from napistu import indices
12
+ from napistu import sbml_dfs_core
13
+ from napistu import sbml_dfs_utils
14
+ from napistu import source
15
+ from napistu import utils
16
+ from napistu.ingestion import sbml
17
+
18
+ from napistu.constants import SBML_DFS
19
+ from napistu.constants import IDENTIFIERS
20
+ from napistu.constants import SOURCE_SPEC
21
+ from napistu.constants import BQB_DEFINING_ATTRS
22
+
23
+ logger = logging.getLogger(__name__)
24
+ # set the level to show logger.info messages
25
+ logging.basicConfig(level=logging.DEBUG)
26
+
27
+
28
+ def construct_consensus_model(
29
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
30
+ pw_index: indices.PWIndex,
31
+ dogmatic: bool = True,
32
+ ) -> sbml_dfs_core.SBML_dfs:
33
+ """
34
+ Construct Consensus Model
35
+
36
+ Turn a dictionary of pathway models into a single consensus model by merging shared entities.
37
+
38
+ Parameters:
39
+ ----------
40
+ sbml_dfs_dict: dict{cpr.SBML_dfs}
41
+ A dictionary of SBML_dfs from different models
42
+ pw_index: indices.PWIndex
43
+ An index of all tables being aggregated
44
+ dogmatic: bool
45
+ If True then try to preserve genes, transcripts, and proteins as separate species. If False
46
+ then try to merge them.
47
+
48
+ Returns:
49
+ ----------
50
+ A cpr.SBML_dfs object containing the consensus model
51
+
52
+ """
53
+
54
+ logger.info("Reporting possible issues in component models")
55
+ _check_sbml_dfs_dict(sbml_dfs_dict)
56
+ assert isinstance(pw_index, indices.PWIndex)
57
+ # select valid BQB attributes based on dogmatic flag
58
+ defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(dogmatic)
59
+
60
+ logger.info("Defining compartments based on unique ids")
61
+ comp_consensus_entities, comp_lookup_table = construct_meta_entities_identifiers(
62
+ sbml_dfs_dict=sbml_dfs_dict, pw_index=pw_index, table="compartments"
63
+ )
64
+
65
+ logger.info("Defining species based on unique ids")
66
+ spec_consensus_entities, spec_lookup_table = construct_meta_entities_identifiers(
67
+ sbml_dfs_dict=sbml_dfs_dict,
68
+ pw_index=pw_index,
69
+ table=SBML_DFS.SPECIES,
70
+ defining_biological_qualifiers=defining_biological_qualifiers,
71
+ )
72
+
73
+ logger.info(
74
+ "Defining compartmentalized species based on unique species x compartments"
75
+ )
76
+ compspec_consensus_instances, compspec_lookup_table = construct_meta_entities_fk(
77
+ sbml_dfs_dict,
78
+ pw_index,
79
+ table=SBML_DFS.COMPARTMENTALIZED_SPECIES,
80
+ fk_lookup_tables={
81
+ SBML_DFS.C_ID: comp_lookup_table,
82
+ SBML_DFS.S_ID: spec_lookup_table,
83
+ },
84
+ )
85
+
86
+ logger.info(
87
+ "Define reactions based on membership of identical compartmentalized species"
88
+ )
89
+ rxn_consensus_species, rxn_lookup_table = construct_meta_entities_members(
90
+ sbml_dfs_dict,
91
+ pw_index,
92
+ table=SBML_DFS.REACTIONS,
93
+ defined_by=SBML_DFS.REACTION_SPECIES,
94
+ defined_lookup_tables={SBML_DFS.SC_ID: compspec_lookup_table},
95
+ defining_attrs=[SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY],
96
+ )
97
+
98
+ logger.info("Annotating reversibility based on merged reactions")
99
+ rxn_consensus_species = _resolve_reversibility(
100
+ sbml_dfs_dict, rxn_consensus_species, rxn_lookup_table
101
+ )
102
+
103
+ # define reaction species with species
104
+ logger.info("Define reaction species based on reactions")
105
+ rxnspec_consensus_instances, rxnspec_lookup_table = construct_meta_entities_fk(
106
+ sbml_dfs_dict,
107
+ pw_index,
108
+ table=SBML_DFS.REACTION_SPECIES,
109
+ fk_lookup_tables={
110
+ SBML_DFS.R_ID: rxn_lookup_table,
111
+ SBML_DFS.SC_ID: compspec_lookup_table,
112
+ },
113
+ # retain species with different roles
114
+ extra_defining_attrs=[SBML_DFS.SBO_TERM],
115
+ )
116
+
117
+ sbml_tbl_dict = {
118
+ SBML_DFS.COMPARTMENTS: comp_consensus_entities,
119
+ SBML_DFS.SPECIES: spec_consensus_entities,
120
+ SBML_DFS.COMPARTMENTALIZED_SPECIES: compspec_consensus_instances,
121
+ SBML_DFS.REACTIONS: rxn_consensus_species,
122
+ SBML_DFS.REACTION_SPECIES: rxnspec_consensus_instances,
123
+ }
124
+
125
+ sbml_dfs = sbml_dfs_core.SBML_dfs(sbml_tbl_dict) # type: ignore
126
+
127
+ # add species and reactions data from component models
128
+ consensus_species_data = merge_entity_data(
129
+ sbml_dfs_dict, lookup_table=spec_lookup_table, table=SBML_DFS.SPECIES
130
+ )
131
+ for k in consensus_species_data.keys():
132
+ sbml_dfs.add_species_data(k, consensus_species_data[k])
133
+
134
+ consensus_reactions_data = merge_entity_data(
135
+ sbml_dfs_dict, lookup_table=rxn_lookup_table, table=SBML_DFS.REACTIONS
136
+ )
137
+ for k in consensus_reactions_data.keys():
138
+ sbml_dfs.add_reactions_data(k, consensus_reactions_data[k])
139
+
140
+ return sbml_dfs
141
+
142
+ # A minimal usage sketch of the two entry points above follows this function.
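# Sketch (not part of the package source): how the entry points in this module are
# typically composed. It assumes `pw_index` is an already-built indices.PWIndex
# describing the SBML files to merge; its construction is outside this module.
#
#   from napistu import consensus
#
#   sbml_dfs_dict = consensus.construct_sbml_dfs_dict(pw_index, strict=False)
#   consensus_model = consensus.construct_consensus_model(
#       sbml_dfs_dict, pw_index, dogmatic=True
#   )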
143
+ def construct_sbml_dfs_dict(
144
+ pw_index: indices.PWIndex, strict: bool = True
145
+ ) -> dict[str, sbml_dfs_core.SBML_dfs]:
146
+ """
147
+ Construct SBML DFs Dict
148
+
149
+ Convert all models in the pathway index into SBML_dfs and add them to a dict.
150
+
151
+ Parameters:
152
+ pw_index: indices.PWIndex
153
+ An index of all tables being aggregated
154
+ strict (bool): if set to `False`, erroneous files are skipped with a warning. Default: True
155
+
156
+ Returns:
157
+ dict(sbml_dfs_core.SBML_dfs)
158
+
159
+ """
160
+
161
+ sbml_dfs_dict = dict()
162
+ for i in tqdm(pw_index.index.index.tolist()):
163
+ pw_entry = pw_index.index.loc[i]
164
+ logger.info(f"processing {pw_entry[SOURCE_SPEC.NAME]}")
165
+
166
+ sbml_path = os.path.join(pw_index.base_path, pw_entry[SOURCE_SPEC.FILE])
167
+ try:
168
+ sbml_obj = sbml.SBML(sbml_path)
169
+ sbml_dfs_dict[pw_entry[SOURCE_SPEC.PATHWAY_ID]] = sbml_dfs_core.SBML_dfs(
170
+ sbml_obj
171
+ )
172
+ except ValueError as e:
173
+ if strict:
174
+ raise e
175
+ logger.warning(
176
+ f"{pw_entry[SOURCE_SPEC.NAME]} not successfully loaded:", exc_info=True
177
+ )
178
+ return sbml_dfs_dict
179
+
180
+
181
+ def unnest_SBML_df(
182
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], table: str
183
+ ) -> pd.DataFrame:
184
+ """
185
+ Unnest SBML_dfs
186
+
187
+ Merge corresponding tables from a set of models
188
+
189
+ sbml_dfs_dict: dict{cpr.SBML_dfs}
190
+ A dictionary of SBML_dfs from different models
191
+ table: str
192
+ A table to aggregate (e.g., species, reactions, compartments)
193
+
194
+ Returns:
195
+ pd.DataFrame, a table with a MultiIndex of model and an entity id
196
+
197
+ """
198
+
199
+ # check that all sbml_dfs have the same schema
200
+ _test_same_schema(sbml_dfs_dict)
201
+ table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]
202
+
203
+ df_list = [
204
+ getattr(sbml_dfs_dict[x], table).assign(model=x) for x in sbml_dfs_dict.keys()
205
+ ]
206
+ df_concat = pd.concat(df_list)
207
+
208
+ # add model to index columns
209
+ if df_concat.size != 0:
210
+ df_concat = df_concat.reset_index().set_index(
211
+ [SOURCE_SPEC.MODEL, table_schema["pk"]]
212
+ )
213
+
214
+ return df_concat
215
+
216
+
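# Sketch (illustration only, toy data): the (model, primary key) MultiIndex that
# unnest_SBML_df returns, using small stand-in species tables rather than real
# SBML_dfs attributes.
#
#   import pandas as pd
#
#   model_a = pd.DataFrame({"s_name": ["ATP"]}, index=pd.Index(["S0001"], name="s_id"))
#   model_b = pd.DataFrame({"s_name": ["GTP"]}, index=pd.Index(["S0001"], name="s_id"))
#   stacked = pd.concat(
#       [model_a.assign(model="model_a"), model_b.assign(model="model_b")]
#   )
#   stacked = stacked.reset_index().set_index(["model", "s_id"])
#   # the same s_id from different models stays distinct because "model" is part of the index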
217
+ def construct_meta_entities_identifiers(
218
+ sbml_dfs_dict: dict,
219
+ pw_index: indices.PWIndex,
220
+ table: str,
221
+ fk_lookup_tables: dict = {},
222
+ defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
223
+ ) -> tuple[pd.DataFrame, pd.Series]:
224
+ """
225
+ Construct Meta Entities Defined by Identifiers
226
+
227
+ Aggregate across one entity type for a set of pathway models, merging entities which share identifiers
228
+
229
+ Parameters:
230
+ ----------
231
+ sbml_df_dict (dict{"model": cpr.SBML_dfs}):
232
+ A dictionary of cpr.SBML_dfs
233
+ pw_index (indices.PWIndex):
234
+ An index of all tables being aggregated
235
+ table (str):
236
+ A table/entity set from the sbml_dfs to work with
237
+ fk_lookup_tables (dict):
238
+ Dictionary containing lookup tables for all foreign keys used by the table
239
+ defining_biological_qualifiers (list[str]):
240
+ BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
241
+ permissive settings could merge homologs or different forms of the same gene.
242
+
243
+ Returns:
244
+ ----------
245
+ new_id_table: pd.DataFrame
246
+ Matching the schema of one of the tables within sbml_df_dict
247
+ lookup_table: pd.Series
248
+ Matches the index of the aggregated entities to new_ids
249
+
250
+ """
251
+
252
+ # combine sbml_dfs by adding model to the index and concatenating all dfs
253
+ agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=table)
254
+
255
+ # since all sbml_dfs have the same schema pull out one schema for reference
256
+ table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]
257
+
258
+ # update foreign keys using provided lookup tables
259
+ if "fk" in table_schema.keys():
260
+ agg_tbl = _update_foreign_keys(agg_tbl, table_schema, fk_lookup_tables)
261
+
262
+ new_id_table, lookup_table = reduce_to_consensus_ids(
263
+ sbml_df=agg_tbl,
264
+ table_schema=table_schema,
265
+ pw_index=pw_index,
266
+ defining_biological_qualifiers=defining_biological_qualifiers,
267
+ )
268
+
269
+ # logging merges that occurred
270
+ report_consensus_merges(
271
+ lookup_table, table_schema, agg_tbl=agg_tbl, n_example_merges=5
272
+ )
273
+
274
+ return new_id_table, lookup_table
275
+
276
+
277
+ def reduce_to_consensus_ids(
278
+ sbml_df: pd.DataFrame,
279
+ table_schema: dict,
280
+ pw_index: indices.PWIndex | None = None,
281
+ defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
282
+ ) -> tuple[pd.DataFrame, pd.Series]:
283
+ """
284
+ Reduce to Consensus
285
+
286
+ Reduce a table of entities to unique entries based on identifiers.
287
+
288
+ Parameters:
289
+ ----------
290
+ sbml_df: pd.DataFrame
291
+ One type of entity from sbml_dfs_dict expanded to include
292
+ model in its index, as produced by unnest_SBML_df(sbml_dfs_dict)
293
+ table_schema: dict
294
+ Schema for the table sbml_df
295
+ pw_index: indices.PWIndex
296
+ An index of all tables being aggregated
297
+ defining_biological_qualifiers: list(str)
298
+ A list of biological qualifier types which define distinct entities
299
+
300
+ Returns:
301
+ ----------
302
+ new_id_table: pd.DataFrame
303
+ Matching the schema of one of the tables within sbml_df_dict
304
+ lookup_table: pd.Series
305
+ Matches the index of the aggregated entities to new_ids
306
+ """
307
+
308
+ indexed_cluster, cluster_consensus_identifiers = build_consensus_identifiers(
309
+ sbml_df, table_schema, defining_biological_qualifiers
310
+ )
311
+
312
+ # add cluster to reduce non-identifier attributes
313
+ agg_table_harmonized = sbml_df.join(indexed_cluster)
314
+ # create a new numbering schema off of cluster #s and id type
317
+
318
+ agg_table_harmonized["new_id"] = sbml_dfs_utils.id_formatter(
319
+ agg_table_harmonized["cluster"], table_schema["pk"]
320
+ )
321
+
322
+ lookup_table = agg_table_harmonized["new_id"]
323
+
324
+ # add nameness_score as a measure of how-readable a possible name would be
325
+ # (this will help to select names which are more human readable after the merge)
326
+ agg_table_harmonized = utils._add_nameness_score_wrapper(
327
+ agg_table_harmonized, "label", table_schema
328
+ )
329
+
330
+ # reduce to one row per new_id and set as the primary key of the source table
331
+ agg_table_reduced = (
332
+ agg_table_harmonized.reset_index(drop=True)
333
+ .sort_values(["nameness_score"])
334
+ .rename(columns={"new_id": table_schema["pk"]})
335
+ .groupby(table_schema["pk"])
336
+ .first()
337
+ .drop("nameness_score", axis=1)
338
+ )
339
+
340
+ new_id_table = (
341
+ agg_table_reduced.drop(table_schema["id"], axis=1)
342
+ .merge(cluster_consensus_identifiers, left_on="cluster", right_index=True)
343
+ .drop("cluster", axis=1)
344
+ )
345
+
346
+ if "source" in table_schema.keys():
347
+ if type(pw_index) is not indices.PWIndex:
348
+ raise ValueError(
349
+ f"pw_index must be provided as a indices.PWIndex if there is a source but was type {type(pw_index)}"
350
+ )
351
+
352
+ # track the model(s) that each entity came from
353
+ new_sources = create_consensus_sources(
354
+ agg_table_harmonized, lookup_table, table_schema, pw_index
355
+ )
356
+ assert isinstance(new_sources, pd.Series)
357
+
358
+ new_id_table = new_id_table.drop(
359
+ table_schema[SOURCE_SPEC.SOURCE], axis=1
360
+ ).merge(new_sources, left_index=True, right_index=True)
361
+
362
+ # check that the index name and variables match the source
363
+ if set(sbml_df.index.names).difference({SOURCE_SPEC.MODEL}) != set(
364
+ new_id_table.index.names
365
+ ):
366
+ raise ValueError(
367
+ "The newly constructed id table's index does not match the inputs"
368
+ )
369
+
370
+ if set(sbml_df) != set(new_id_table.columns):
371
+ raise ValueError(
372
+ "The newly constructed id table's variables do not match the inputs"
373
+ )
374
+
375
+ return new_id_table, lookup_table
376
+
377
+
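# Sketch (illustration only): the representative-row selection used above. Rows that
# collapse to the same new id are sorted by nameness_score and the first row is kept;
# the assumption here, per utils._add_nameness_score_wrapper's apparent convention,
# is that a lower score marks a more human-readable label.
#
#   import pandas as pd
#
#   rows = pd.DataFrame(
#       {
#           "new_id": ["C00001", "C00001"],
#           "c_name": ["cytosol", "GO:0005829"],
#           "nameness_score": [0, 3],
#       }
#   )
#   representative = rows.sort_values("nameness_score").groupby("new_id").first()
#   # -> the label "cytosol" wins for C00001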
378
+ def build_consensus_identifiers(
379
+ sbml_df: pd.DataFrame,
380
+ table_schema: dict,
381
+ defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
382
+ ) -> tuple[pd.Series, pd.DataFrame]:
383
+ """
384
+ Build Consensus Identifiers
385
+
386
+ Take a set of entities spanning multiple models and find all unique entities.
387
+
388
+ Defining attributes provided in defining_biological_qualifiers will
389
+ be used for grouping; other identifiers will be added back at the end.
390
+
391
+ Parameters:
392
+ ----------
393
+ sbml_df: pd.DataFrame
394
+ One type of entity from sbml_dfs_dict expanded to include model in its index,
395
+ as produced by unnest_SBML_df(sbml_dfs_dict)
396
+ table_schema: dict
397
+ Schema for the table sbml_df
398
+ defining_biological_qualifiers: [str]
399
+ A list of biological qualifier types which should be used for grouping
400
+
401
+ Returns:
402
+ ----------
403
+ indexed_cluster: pd.Series
404
+ Maps the index from sbml_df onto a set of clusters which define unique entities
405
+ cluster_consensus_identifiers_df: pd.DataFrame
406
+ Maps an index of clusters onto a consensus cpr.identifiers.Identifiers object
407
+ """
408
+
409
+ # create a table which is one row per entry
410
+ meta_identifiers = sbml_dfs_utils.unnest_identifiers(sbml_df, table_schema["id"])
411
+ # check the identifiers for missing attributes
412
+ _validate_meta_identifiers(meta_identifiers)
413
+
414
+ # remove some biological qualifier types to avoid over-grouping
415
+
416
+ valid_identifiers = meta_identifiers.copy()
417
+ valid_identifiers = valid_identifiers[
418
+ meta_identifiers[IDENTIFIERS.BQB].isin(defining_biological_qualifiers)
419
+ ]
420
+
421
+ # catch entries which no longer have any identifiers
422
+ # add a dummy identifier to these which will still uniquely tag them
423
+
424
+ filtered_entries = sbml_df.reset_index().merge(
425
+ valid_identifiers.reset_index(),
426
+ left_on=sbml_df.index.names,
427
+ right_on=sbml_df.index.names,
428
+ how="outer",
429
+ )[sbml_df.index.names + [IDENTIFIERS.IDENTIFIER]]
430
+ filtered_entries = filtered_entries[
431
+ filtered_entries[IDENTIFIERS.IDENTIFIER].isnull()
432
+ ]
433
+ if filtered_entries.shape[0] != 0:
434
+ logger.warning(
435
+ f"{filtered_entries.shape[0]} entries didn't possess identifiers and thus cannot be merged"
436
+ )
437
+
438
+ filtered_entries[SOURCE_SPEC.ENTRY] = 0
439
+ filtered_entries[IDENTIFIERS.ONTOLOGY] = "none"
440
+ filtered_entries[IDENTIFIERS.ONTOLOGY] = [
441
+ "dummy_value_" + str(val)
442
+ for val in random.sample(range(1, 100000000), filtered_entries.shape[0])
443
+ ]
444
+ filtered_entries[IDENTIFIERS.URL] = None
445
+ filtered_entries[IDENTIFIERS.BQB] = None
446
+
447
+ filtered_entries = filtered_entries.set_index(
448
+ sbml_df.index.names + [SOURCE_SPEC.ENTRY]
449
+ )
450
+
451
+ valid_identifiers = pd.concat([valid_identifiers, filtered_entries])
452
+
453
+ # combine multi-index into a single variable; combine ontology + identifiers as a single variable
454
+ valid_identifiers = utils.format_identifiers_as_edgelist(
455
+ valid_identifiers, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
456
+ )
457
+
458
+ # create a unique tag for a species from the original index
459
+ indexed_species_tags = (
460
+ valid_identifiers.reset_index()
461
+ .set_index(valid_identifiers.index.names, drop=False)[sbml_df.index.names]
462
+ .astype(str)
463
+ .apply("__".join, axis=1)
464
+ )
465
+ valid_identifiers.loc[:, "model_spec"] = indexed_species_tags
466
+
467
+ # convert index-identifier edge list into a network
468
+ # doing this will allow any entities with matching ontologies to be
469
+ # added to the same cluster so that they can be merged
470
+ id_edgelist = pd.concat(
471
+ [
472
+ valid_identifiers[["ind", "id"]],
473
+ # add id-ind edges so that identifiers corresponding to the same entity are grouped
474
+ # these entries will be discarded when merging the results back in by "ind"
475
+ valid_identifiers[["model_spec", "id"]].rename(
476
+ columns={"model_spec": "ind"}
477
+ ),
478
+ ]
479
+ )
480
+
481
+ # aggregate index entries which have overlapping identifiers
482
+ # using a greedy graph-based approach
483
+ ind_clusters = utils.find_weakly_connected_subgraphs(id_edgelist)
484
+
485
+ # add clusters to identifier entries
486
+ valid_identifiers = valid_identifiers.reset_index().merge(ind_clusters)
487
+
488
+ # all entries for the same (model, id) will have the same cluster so convert back to
489
+ # sbml_df index to facilitate join
490
+ indexed_cluster = valid_identifiers.groupby(sbml_df.index.names).first()["cluster"]
491
+
492
+ # combine equivalent entries into a single Identifiers object
493
+ # include identifiers which were filtered by bqb
494
+
495
+ all_cluster_identifiers = meta_identifiers.reset_index().merge(
496
+ indexed_cluster, left_on=sbml_df.index.names, right_index=True
497
+ )
498
+
499
+ cluster_consensus_identifiers = {
500
+ k: identifiers.Identifiers(
501
+ list(
502
+ v[
503
+ [
504
+ IDENTIFIERS.ONTOLOGY,
505
+ IDENTIFIERS.IDENTIFIER,
506
+ IDENTIFIERS.URL,
507
+ IDENTIFIERS.BQB,
508
+ ]
509
+ ]
510
+ .T.to_dict()
511
+ .values()
512
+ )
513
+ )
514
+ for k, v in all_cluster_identifiers.groupby("cluster")
515
+ }
516
+
517
+ # recover clusters which don't have any identifiers
518
+ catchup_clusters = {
519
+ c: identifiers.Identifiers(list())
520
+ for c in set(ind_clusters["cluster"].tolist()).difference(
521
+ cluster_consensus_identifiers
522
+ )
523
+ }
524
+ cluster_consensus_identifiers = {
525
+ **cluster_consensus_identifiers,
526
+ **catchup_clusters,
527
+ }
528
+
529
+ cluster_consensus_identifiers_df = pd.DataFrame(
530
+ cluster_consensus_identifiers, index=[table_schema["id"]]
531
+ ).T
532
+ cluster_consensus_identifiers_df.index.name = "cluster"
533
+
534
+ return indexed_cluster, cluster_consensus_identifiers_df
535
+
536
+
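# Sketch (illustration only): the grouping above delegates to
# utils.find_weakly_connected_subgraphs; the idea is that entities from different
# models sharing any (ontology, identifier) pair land in the same cluster. A toy,
# self-contained version with a hand-rolled union-find (not the package's
# implementation):
#
#   edges = [
#       ("modelA__s1", "uniprot__P01112"),
#       ("modelB__s7", "uniprot__P01112"),
#       ("modelB__s7", "ensembl_gene__ENSG00000174775"),
#   ]
#   parent: dict[str, str] = {}
#   def find(x):
#       parent.setdefault(x, x)
#       while parent[x] != x:
#           parent[x] = parent[parent[x]]
#           x = parent[x]
#       return x
#   for a, b in edges:
#       parent[find(a)] = find(b)
#   clusters = {n: find(n) for n in list(parent)}
#   # modelA__s1 and modelB__s7 resolve to the same root, so they would be merged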
537
+ def pre_consensus_ontology_check(
538
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], tablename: str
539
+ ):
540
+ """Check for shared ontologies across source models."""
541
+
542
+ # tablename: compartments/species/reactions tables with Identifiers
543
+ # returns shared ontologies among sbml_dfs in sbml_dfs_dict for
544
+ # compartments/species/reactions tables
545
+
546
+ if tablename in [SBML_DFS.COMPARTMENTS, SBML_DFS.SPECIES, SBML_DFS.REACTIONS]:
547
+ sbml_onto_lists = []
548
+ for df_key, sbml_dfs_ind in sbml_dfs_dict.items():
549
+ sbml_onto_df_ind = sbml_dfs_ind.get_identifiers(tablename).value_counts(
550
+ IDENTIFIERS.ONTOLOGY
551
+ )
552
+ sbml_onto_lists.append(sbml_onto_df_ind.index.to_list())
553
+
554
+ shared_onto_set = set.intersection(*map(set, sbml_onto_lists))
555
+ shared_onto_list = list(shared_onto_set)
556
+
557
+ sbml_name_list = list(sbml_dfs_dict.keys())
558
+ sbml_dict_onto_df = pd.DataFrame({"single_sbml_dfs": sbml_name_list})
559
+ sbml_dict_onto_df[IDENTIFIERS.ONTOLOGY] = sbml_onto_lists
560
+
561
+ else:
562
+ logger.error(
563
+ f"{tablename} entry doesn't have identifiers and thus cannot check its ontology"
564
+ )
565
+ shared_onto_list = []
566
+ sbml_dict_onto_df = []
567
+
568
+ logger.info(
569
+ f"Shared ontologies for {tablename} are {shared_onto_list} before building a consensus model."
570
+ )
571
+
572
+ return shared_onto_list, sbml_dict_onto_df
573
+
574
+
575
+ def _validate_meta_identifiers(meta_identifiers: pd.DataFrame) -> None:
576
+ """Flag cases where meta identifers are totally missing or BQB codes are not included"""
577
+
578
+ if meta_identifiers.shape[0] == 0:
579
+ raise ValueError(
580
+ '"meta_identifiers" was empty; some identifiers should be present'
581
+ )
582
+
583
+ n_null = sum(meta_identifiers["bqb"].isnull())
584
+ if n_null > 0:
585
+ msg = f"{n_null} identifiers were missing a bqb code and will not be mergeable"
586
+ logger.warning(msg)
587
+
588
+ return None
589
+
590
+
591
+ def post_consensus_species_ontology_check(sbml_dfs: sbml_dfs_core.SBML_dfs) -> set[str]:
592
+ # Checking the ontology in "species" shared by different sources in a consensus model
593
+ # returns a set of shared ontologies by different sources
594
+
595
+ consensus_sbmldf_tbl_var = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
596
+
597
+ # get the sources of species in the consensus model
598
+ consensus_sbmldf_tbl_var_sc = (
599
+ source.unnest_sources(sbml_dfs.species, SBML_DFS.S_SOURCE, verbose=False)
600
+ .reset_index()
601
+ .sort_values([SOURCE_SPEC.NAME])
602
+ )
603
+
604
+ # merge columns with source info to the model's species identifiers df.
605
+ consensus_sbmldf_tbl_var_w_sc = consensus_sbmldf_tbl_var.merge(
606
+ consensus_sbmldf_tbl_var_sc.loc[
607
+ :,
608
+ [
609
+ SBML_DFS.S_ID,
610
+ SOURCE_SPEC.MODEL,
611
+ SOURCE_SPEC.FILE,
612
+ SOURCE_SPEC.PATHWAY_ID,
613
+ SOURCE_SPEC.SOURCE,
614
+ SOURCE_SPEC.NAME,
615
+ ],
616
+ ],
617
+ on=SBML_DFS.S_ID,
618
+ )
619
+
620
+ # get the model/source and its ontology set to a separate df
621
+ shared_ontology_df = (
622
+ consensus_sbmldf_tbl_var_w_sc.groupby(SOURCE_SPEC.NAME)[IDENTIFIERS.ONTOLOGY]
623
+ .apply(set)
624
+ .reset_index(name="onto_expanded")
625
+ )
626
+
627
+ # the intersection set among ontology sets of all sources
628
+ shared_onto_set = shared_ontology_df.onto_expanded[0]
629
+ for i in range(1, len(shared_ontology_df.onto_expanded)):
630
+ shared_onto_set = shared_onto_set.intersection(
631
+ shared_ontology_df.onto_expanded[i]
632
+ )
633
+
634
+ logger.info(f"shared ontologies in the consesus model are: {shared_onto_set}")
635
+
636
+ return shared_onto_set
637
+
638
+
639
+ def _update_foreign_keys(
640
+ agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
641
+ ) -> pd.DataFrame:
642
+ """Update one or more foreign keys based on old-to-new foreign key lookup table(s)."""
643
+
644
+ for fk in table_schema["fk"]:
645
+ updated_fks = (
646
+ agg_tbl[fk]
647
+ .reset_index()
648
+ .merge(
649
+ fk_lookup_tables[fk], left_on=[SOURCE_SPEC.MODEL, fk], right_index=True
650
+ )
651
+ .drop(fk, axis=1)
652
+ .rename(columns={"new_id": fk})
653
+ .set_index(["model", table_schema["pk"]])
654
+ )
655
+ agg_tbl = agg_tbl.drop(columns=fk).join(updated_fks)
656
+
657
+ return agg_tbl
658
+
659
+
660
+ def pre_consensus_compartment_check(
661
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], tablename: str
662
+ ) -> tuple[list, dict]:
663
+ """Find compartments shared across models."""
664
+
665
+ # tablename: compartments only
666
+ # returns shared c_name in compartments of sbml_dfs in sbml_dfs_dict for
667
+
668
+ if tablename in [SBML_DFS.COMPARTMENTS]:
669
+ sbml_cname_list = []
670
+ for df_key, sbml_dfs_ind in sbml_dfs_dict.items():
671
+ sbml_df_ind_cname = sbml_dfs_ind.get_identifiers(tablename).value_counts(
672
+ SBML_DFS.C_NAME
673
+ )
674
+ sbml_cname_list.append(sbml_df_ind_cname.index.to_list())
675
+
676
+ shared_cname_set = set.intersection(*map(set, sbml_cname_list))
677
+ shared_cname_list = list(shared_cname_set)
678
+
679
+ sbml_name_list = list(sbml_dfs_dict.keys())
680
+ sbml_dict_cname_df = pd.DataFrame({"single_sbml_dfs": sbml_name_list})
681
+ sbml_dict_cname_df["c_names"] = sbml_cname_list
682
+
683
+ else:
684
+ logger.error(f"{tablename} entry doesn't have c_name")
685
+
686
+ logger.info(
687
+ f"Shared compartments for {tablename} are {shared_cname_list} before building a consensus model."
688
+ )
689
+
690
+ return shared_cname_list, sbml_dict_cname_df
691
+
692
+
693
+ def post_consensus_source_check(
694
+ sbml_dfs: sbml_dfs_core.SBML_dfs, table_name: str
695
+ ) -> pd.DataFrame:
696
+ """Provide sources of tables in a consensus model; the output df will be used to determine whether models are merged."""
697
+
698
+ table_source = sbml_dfs.schema[table_name][SOURCE_SPEC.SOURCE]
699
+ table_pk = sbml_dfs.schema[table_name]["pk"]
700
+
701
+ sbml_dfs_tbl = getattr(sbml_dfs, table_name)
702
+ sbml_dfs_tbl_pathway_source = (
703
+ source.unnest_sources(sbml_dfs_tbl, table_source, verbose=False)
704
+ .reset_index()
705
+ .sort_values(["name"])
706
+ )
707
+
708
+ sbml_dfs_tbl_pathway_source["pathway"] = sbml_dfs_tbl_pathway_source.groupby(
709
+ [table_pk]
710
+ )["name"].transform(lambda x: " + ".join(set(x)))
711
+
712
+ sbml_dfs_tbl_pathway_source = (
713
+ sbml_dfs_tbl_pathway_source[[table_pk, "pathway"]]
714
+ .drop_duplicates()
715
+ .set_index(table_pk)
716
+ )
717
+
718
+ tbl_pathway_source_df = pd.DataFrame(
719
+ sbml_dfs_tbl_pathway_source["pathway"].value_counts()
720
+ )
721
+
722
+ return tbl_pathway_source_df
723
+
724
+
725
+ def construct_meta_entities_fk(
726
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
727
+ pw_index: indices.PWIndex,
728
+ table: str = SBML_DFS.COMPARTMENTALIZED_SPECIES,
729
+ fk_lookup_tables: dict = {},
730
+ extra_defining_attrs: list = [],
731
+ ) -> tuple[pd.DataFrame, pd.Series]:
732
+ """
733
+ Construct Meta Entities Defined by Foreign Keys
734
+
735
+ Aggregate across one entity type for a set of pathway
736
+ models, merging entities which are defined by their foreign keys.
737
+
738
+ Parameters:
739
+ ----------
740
+ sbml_df_dict: dict{"model": cpr.SBML_dfs}
741
+ A dictionary of cpr.SBML_dfs
742
+ pw_index: indices.PWIndex
743
+ An index of all tables being aggregated
744
+ table:
745
+ A table/entity set from the sbml_dfs to work with
746
+ fk_lookup_tables: dict
747
+ Dictionary containing lookup tables for all foreign keys used by the table
748
+ extra_defining_attrs: list
749
+ List of terms which uniquely define a reaction species in addition
750
+ to the foreign keys. A common case is when a species is a modifier
751
+ and a substrate in a reaction.
752
+
753
+ Returns:
754
+ ----------
755
+ new_id_table: pd.DataFrame
756
+ Matching the schema of one of the tables within sbml_df_dict
757
+ lookup_table: pd.Series
758
+ Matches the index of the aggregated entities to new_ids
759
+
760
+ """
761
+
762
+ if not isinstance(extra_defining_attrs, list):
763
+ raise TypeError("extra_defining_attrs must be a list")
764
+
765
+ # combine sbml_dfs by adding model to the index and concatenating all dfs
766
+ agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=table)
767
+
768
+ # since all sbml_dfs have the same schema pull out one schema for reference
769
+ table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]
770
+
771
+ # update foreign keys using provided lookup tables
772
+ agg_tbl = _update_foreign_keys(agg_tbl, table_schema, fk_lookup_tables)
773
+
774
+ # add nameness_score as a measure of how-readable a possible name would be
775
+ # (this will help to select names which are more human readable after the merge)
776
+ agg_tbl = utils._add_nameness_score_wrapper(agg_tbl, "label", table_schema)
777
+
778
+ # reduce to unique elements
779
+ induced_entities = (
780
+ agg_tbl.reset_index(drop=True)
781
+ .sort_values(["nameness_score"])
782
+ .groupby(table_schema["fk"] + extra_defining_attrs)
783
+ .first()
784
+ .drop("nameness_score", axis=1)
785
+ )
786
+ induced_entities["new_id"] = sbml_dfs_utils.id_formatter(
787
+ range(induced_entities.shape[0]), table_schema["pk"]
788
+ )
789
+
790
+ new_id_table = (
791
+ induced_entities.reset_index()
792
+ .rename(columns={"new_id": table_schema["pk"]})
793
+ .set_index(table_schema["pk"])[table_schema["vars"]]
794
+ )
795
+
796
+ lookup_table = agg_tbl[table_schema["fk"] + extra_defining_attrs].merge(
797
+ induced_entities,
798
+ left_on=table_schema["fk"] + extra_defining_attrs,
799
+ right_index=True,
800
+ )["new_id"]
801
+
802
+ # logging merges that occurred
803
+ report_consensus_merges(
804
+ lookup_table, table_schema, agg_tbl=agg_tbl, n_example_merges=5
805
+ )
806
+
807
+ if "source" in table_schema.keys():
808
+ # track the model(s) that each entity came from
809
+ new_sources = create_consensus_sources(
810
+ agg_tbl.merge(lookup_table, left_index=True, right_index=True),
811
+ lookup_table,
812
+ table_schema,
813
+ pw_index,
814
+ )
815
+ assert isinstance(new_sources, pd.Series)
816
+
817
+ new_id_table = new_id_table.drop(table_schema["source"], axis=1).merge(
818
+ new_sources, left_index=True, right_index=True
819
+ )
820
+
821
+ return new_id_table, lookup_table
822
+
823
+
824
+ def construct_meta_entities_members(
825
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
826
+ pw_index: indices.PWIndex | None,
827
+ table: str = SBML_DFS.REACTIONS,
828
+ defined_by: str = SBML_DFS.REACTION_SPECIES,
829
+ defined_lookup_tables: dict = {},
830
+ defining_attrs: list[str] = [SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY],
831
+ ) -> tuple[pd.DataFrame, pd.Series]:
832
+ """
833
+ Construct Meta Entities Defined by Membership
834
+
835
+ Aggregating across one entity type for a set of pathway models, merge entities with the same members.
836
+
837
+ Parameters:
838
+ ----------
839
+ sbml_df_dict: dict{"model": cpr.SBML_dfs}
840
+ A dictionary of cpr.SBML_dfs
841
+ pw_index: indices.PWIndex
842
+ An index of all tables being aggregated
843
+ table: str
844
+ A table/entity set from the sbml_dfs to work with
845
+ defined_by: dict
846
+ A table/entity set whose entries are members of "table"
847
+ defined_lookup_tables: {pd.Series}
848
+ Lookup table for updating the ids of "defined_by"
849
+ defining_attrs: [str]
850
+ A list of attributes which jointly define a unique entity
851
+
852
+ Returns:
853
+ ----------
854
+ new_id_table: pd.DataFrame
855
+ Matching the schema of one of the tables within sbml_df_dict
856
+ lookup_table: pd.Series
857
+ Matches the index of the aggregated entities to new_ids
858
+
859
+ """
860
+
861
+ logger.info(
862
+ f"Merging {table} based on identical membership ({' + '.join(defining_attrs)})"
863
+ )
864
+
865
+ # combine sbml_dfs by adding model to the index and concatenating all dfs
866
+ agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=defined_by)
867
+
868
+ # to debug and see names of species
869
+ # comp_species = unnest_SBML_df(sbml_dfs_dict, table="compartmentalized_species")
870
+ # agg_tbl = agg_tbl.merge(comp_species, left_on = ["model", "sc_id"], right_index = True )
871
+
872
+ # since all sbml_dfs have the same schema pull out one schema for reference
873
+ table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]
874
+ defined_by_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[defined_by]
875
+
876
+ # update ids using previously created lookup tables
877
+ for k in defined_lookup_tables.keys():
878
+ agg_tbl = (
879
+ agg_tbl.merge(
880
+ defined_lookup_tables[k],
881
+ left_on=[SOURCE_SPEC.MODEL, k],
882
+ right_index=True,
883
+ )
884
+ .drop(k, axis=1)
885
+ .rename(columns={"new_id": k})
886
+ )
887
+
888
+ # create a set of species x compartment instances for each reaction
889
+ defining_fk = set(defined_by_schema["fk"]).difference({table_schema["pk"]})
890
+
891
+ if (
892
+ len(defining_fk) != 1
893
+ or len(defining_fk.intersection(set(defined_by_schema["fk"]))) != 1
894
+ ):
895
+ raise ValueError(
896
+ f"A foreign key could not be found in {defined_by} which was a primary key in {table}"
897
+ )
898
+ else:
899
+ defining_fk = list(defining_fk)[0]
900
+
901
+ # define what it is to be a unique member based on a combination of defining_attrs
902
+ valid_defining_attrs = agg_tbl.columns.values.tolist()
903
+ invalid_defining_attrs = [
904
+ x for x in defining_attrs if x not in valid_defining_attrs
905
+ ]
906
+
907
+ if len(invalid_defining_attrs) != 0:
908
+ raise ValueError(
909
+ f"{', '.join(invalid_defining_attrs)} was not found; "
910
+ f"valid defining_attrs are {', '.join(valid_defining_attrs)}"
911
+ )
912
+
913
+ # create unique members
914
+ agg_tbl["member"] = agg_tbl[defining_attrs].astype(str).apply("__".join, axis=1)
915
+
916
+ # members are aggregated by reaction
917
+ membership_df = (
918
+ agg_tbl.reset_index()
919
+ .groupby(["model", table_schema["pk"]])
920
+ .agg(membership=("member", lambda x: (list(set(x)))))
921
+ )
922
+
923
+ # check whether members are duplicated within a given group
924
+ # suggesting that distinct entities have been coerced into
925
+ # the same entity
926
+ for i in range(membership_df.shape[0]):
927
+ members = membership_df["membership"].iloc[i]
928
+ if len(members) != len(set(members)):
929
+ _ = agg_tbl.reset_index().merge(
930
+ membership_df.iloc[i : i + 1],
931
+ how="inner",
932
+ left_on=[SOURCE_SPEC.MODEL, table_schema["pk"]],
933
+ right_index=True,
934
+ )
935
+
936
+ raise ValueError(
937
+ "Members were duplicated suggesting overmerging in the source "
938
+ )
939
+
940
+ membership_df["member_string"] = [
941
+ _create_member_string(x) for x in membership_df["membership"]
942
+ ]
943
+
944
+ membership_lookup = membership_df.reset_index()
945
+
946
+ consensus_entities = membership_lookup.groupby("member_string").first()
947
+ consensus_entities["new_id"] = sbml_dfs_utils.id_formatter(
948
+ range(consensus_entities.shape[0]), table_schema["pk"]
949
+ )
950
+
951
+ lookup_table = membership_lookup.merge(
952
+ consensus_entities["new_id"], left_on="member_string", right_index=True
953
+ ).set_index([SOURCE_SPEC.MODEL, table_schema["pk"]])["new_id"]
954
+
955
+ # logging merges that occurred
956
+ report_consensus_merges(
957
+ lookup_table, table_schema, sbml_dfs_dict=sbml_dfs_dict, n_example_merges=5
958
+ )
959
+
960
+ agg_primary_table = unnest_SBML_df(sbml_dfs_dict, table=table)
961
+
962
+ # add nameness_score as a measure of how-readable a possible name would be
963
+ # (this will help to select names which are more human readable after the merge)
964
+ agg_primary_table = utils._add_nameness_score_wrapper(
965
+ agg_primary_table, "label", table_schema
966
+ )
967
+
968
+ new_id_table = (
969
+ agg_primary_table.join(lookup_table)
970
+ .reset_index(drop=True)
971
+ .sort_values(["nameness_score"])
972
+ .rename(columns={"new_id": table_schema["pk"]})
973
+ .groupby(table_schema["pk"])
974
+ .first()[table_schema["vars"]]
975
+ )
976
+
977
+ # merge identifiers
978
+ logger.info(f"Merging {table} identifiers")
979
+ indexed_old_identifiers = (
980
+ agg_primary_table.join(lookup_table)
981
+ .reset_index(drop=True)
982
+ .rename(columns={"new_id": table_schema["pk"]})
983
+ .groupby(table_schema["pk"])[table_schema["id"]]
984
+ )
985
+
986
+ # combine merged identifiers into single identifier objects indexed by new id
987
+ updated_identifiers = indexed_old_identifiers.agg(identifiers.merge_identifiers)
988
+
989
+ # add merged identifiers back to new_id table overwriting existing ids
990
+ new_id_table = new_id_table.drop(table_schema["id"], axis=1).merge(
991
+ updated_identifiers, left_index=True, right_index=True
992
+ )
993
+
994
+ if "source" in table_schema.keys():
995
+ logger.info(f"Merging {table} sources")
996
+
997
+ # track the model(s) that each entity came from
998
+ new_sources = create_consensus_sources(
999
+ agg_primary_table.merge(lookup_table, left_index=True, right_index=True),
1000
+ lookup_table,
1001
+ table_schema,
1002
+ pw_index,
1003
+ )
1004
+
1005
+ new_id_table = new_id_table.drop(table_schema["source"], axis=1).merge(
1006
+ new_sources, left_index=True, right_index=True
1007
+ )
1008
+
1009
+ return new_id_table, lookup_table
1010
+
1011
+
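# Sketch (illustration only): the membership-based merge above summarizes each reaction
# by the sorted set of its "sc_id__stoichiometry" members, and reactions with identical
# member strings collapse into one consensus reaction. The sc_id values are invented:
#
#   members_model_a = ["SC0002__1.0", "SC0001__-1.0"]
#   members_model_b = ["SC0001__-1.0", "SC0002__1.0"]
#   key_a = "_".join(sorted(set(members_model_a)))
#   key_b = "_".join(sorted(set(members_model_b)))
#   assert key_a == key_b  # same members -> same consensus reaction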
1012
+ def create_consensus_sources(
1013
+ agg_tbl: pd.DataFrame,
1014
+ lookup_table: pd.Series,
1015
+ table_schema: dict,
1016
+ pw_index: indices.PWIndex | None,
1017
+ ) -> pd.Series:
1018
+ """
1019
+ Create Consensus Sources
1020
+
1021
+ Annotate the source of to-be-merged species with the models they came from, and combine with existing annotations.
1022
+
1023
+ Parameters:
1024
+ ----------
1025
+ agg_tbl: pd.DataFrame
1026
+ A table containing existing source.Source objects and a many-1
1027
+ "new_id" of their post-aggregation consensus entity
1028
+ lookup_table: pd.Series
1029
+ A series where the index are old identifiers and the values are
1030
+ post-aggregation new identifiers
1031
+ table_schema: dict
1032
+ Summary of the schema for the relevant entity type
1033
+ pw_index: indices.PWIndex
1034
+ An index of all tables being aggregated
1035
+
1036
+ Returns:
1037
+ ----------
1038
+ new_sources: pd.Series
1039
+ Mapping where the index is new identifiers and values are aggregated source.Source objects
1040
+
1041
+ """
1042
+
1043
+ logger.info("Creating source table")
1044
+ # Sources for all new entries
1045
+ new_sources = source.create_source_table(lookup_table, table_schema, pw_index)
1046
+
1047
+ # create a pd.Series with an index of all new_ids (which will be rewritten as the entity primary keys)
1048
+ # and values of source.Source objects (where multiple Sources may match an index value).
1049
+ logger.info("Aggregating old sources")
1050
+ indexed_old_sources = (
1051
+ agg_tbl.reset_index(drop=True)
1052
+ .rename(columns={"new_id": table_schema["pk"]})
1053
+ .groupby(table_schema["pk"])[table_schema["source"]]
1054
+ )
1055
+
1056
+ # combine old sources into a single source.Source object per index value
1057
+ aggregated_old_sources = indexed_old_sources.agg(source.merge_sources)
1058
+
1059
+ aligned_sources = new_sources.merge(
1060
+ aggregated_old_sources, left_index=True, right_index=True
1061
+ )
1062
+ assert isinstance(aligned_sources, pd.DataFrame)
1063
+
1064
+ logger.info("Returning new source table")
1065
+ new_sources = aligned_sources.apply(source.merge_sources, axis=1).rename(table_schema["source"]) # type: ignore
1066
+ assert isinstance(new_sources, pd.Series)
1067
+
1068
+ return new_sources
1069
+
1070
+
1071
+ def report_consensus_merges(
1072
+ lookup_table: pd.Series,
1073
+ table_schema: dict,
1074
+ agg_tbl: pd.DataFrame | None = None,
1075
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs] | None = None,
1076
+ n_example_merges: int = 3,
1077
+ ) -> None:
1078
+ """
1079
+ Report Consensus Merges
1080
+
1081
+ Print a summary of merges that occurred
1082
+
1083
+ Parameters:
1084
+ ----------
1085
+ lookup_table : pd.Series
1086
+ An index of "model" and the entities primary key with values of new_id
1087
+ table_schema : dict
1088
+ Schema of the table being merged
1089
+ agg_tbl : pd.DataFrame or None
1090
+ Contains the original model, primary keys and a label. Required if the primary key is not r_id (i.e., not reactions)
1091
+ sbml_dfs_dict : dict or None
1092
+ The dict of models being aggregated. Used to create reaction formulas if the primary key is r_id
1093
+ n_example_merges : int
1094
+ Number of example merges to report details on
1095
+
1096
+ Returns:
1097
+ ----------
1098
+ None
1099
+ """
1100
+
1101
+ entity_merge_num = lookup_table.value_counts()
1102
+ merged_entities = entity_merge_num[entity_merge_num != 1]
1103
+
1104
+ if merged_entities.shape[0] == 0:
1105
+ logger.warning(f"No merging occurred for {table_schema['pk']}")
1106
+ return None
1107
+
1108
+ if "label" not in table_schema.keys():
1109
+ # we don't need to track unnamed species
1110
+ return None
1111
+
1112
+ logger.info(
1113
+ f">>>> {merged_entities.sum()} {table_schema['pk']} entries merged into {merged_entities.shape[0]}"
1114
+ )
1115
+
1116
+ merges_lookup = lookup_table[
1117
+ lookup_table.isin(merged_entities.index.tolist())
1118
+ ].reset_index()
1119
+
1120
+ if table_schema["pk"] == "r_id":
1121
+ logger.info(
1122
+ "Creating formulas for to-be-merged reactions to help with reporting merges of reactions"
1123
+ " with inconsistently named reactants"
1124
+ )
1125
+ if not isinstance(sbml_dfs_dict, dict):
1126
+ raise ValueError(
1127
+ f"sbml_dfs_dict was a {type(sbml_dfs_dict)} and must be a dict if the table_schema pk is r_id"
1128
+ )
1129
+
1130
+ indexed_models = merges_lookup.set_index("model").sort_index()
1131
+ merges_dict = dict()
1132
+ for mod in indexed_models.index.unique():
1133
+ merges_dict[mod] = sbml_dfs_core.reaction_summaries(
1134
+ sbml_dfs_dict[mod], indexed_models.loc[mod]["r_id"]
1135
+ )
1136
+ merge_labels = pd.concat(merges_dict, names=["model", "r_id"]).rename("label")
1137
+
1138
+ # add labels to models + r_id
1139
+ merges_lookup = merges_lookup.merge(
1140
+ merge_labels, how="left", left_on=["model", "r_id"], right_index=True
1141
+ )
1142
+
1143
+ logger.info("Done creating reaction formulas")
1144
+
1145
+ else:
1146
+ if type(agg_tbl) is not pd.DataFrame:
1147
+ raise ValueError(
1148
+ f"agg_tbl was a {type(agg_tbl)} and must be a pd.DataFrame if the table_schema pk is NOT r_id"
1149
+ )
1150
+
1151
+ merges_lookup = merges_lookup.merge(
1152
+ agg_tbl[table_schema["label"]],
1153
+ left_on=["model", table_schema["pk"]],
1154
+ right_index=True,
1155
+ ).rename(columns={table_schema["label"]: "label"})
1156
+
1157
+ indexed_merges_lookup = merges_lookup.set_index("new_id")
1158
+
1159
+ # filter to entries with non-identical labels
1160
+
1161
+ logger.info("Testing for identical formulas of to-be-merged reactions")
1162
+
1163
+ index_label_counts = (
1164
+ indexed_merges_lookup["label"].drop_duplicates().index.value_counts()
1165
+ )
1166
+ inexact_merges = index_label_counts[index_label_counts > 1].index.tolist()
1167
+
1168
+ if len(inexact_merges) == 0:
1169
+ logger.info("All merges names matched exactly")
1170
+ else:
1171
+ logger.warning(
1172
+ f"\n{len(inexact_merges)} merges were of entities with distinct names, including:\n"
1173
+ )
1174
+
1175
+ inexact_merges_samples = random.sample(
1176
+ inexact_merges, min(len(inexact_merges), n_example_merges)
1177
+ )
1178
+
1179
+ inexact_merge_collapses = (
1180
+ indexed_merges_lookup.loc[inexact_merges_samples]["label"]
1181
+ .drop_duplicates()
1182
+ .groupby(level=0)
1183
+ .agg(" & ".join)
1184
+ )
1185
+
1186
+ logger.warning("\n\n".join(inexact_merge_collapses.tolist()) + "\n")
1187
+
1188
+ logger.info("==============================\n")
1189
+
1190
+ return None
1191
+
1192
+
1193
+ def merge_entity_data(
1194
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
1195
+ lookup_table: pd.Series,
1196
+ table: str,
1197
+ ) -> dict:
1198
+ """
1199
+ Merge Entity Data
1200
+
1201
+ Merge the entity data tables from the component models, mapping their entries onto consensus ids
1202
+
1203
+ Args
1204
+ sbml_dfs_dict (dict): dictionary where keys are to-be-merged model names and values
1205
+ are sbml_dfs_core.SBML_dfs
1206
+ lookup_table (pd.Series): a series where the index is an old model and primary key and the
1207
+ value is the new consensus id
1208
+ table (str): table whose data is being consolidated (currently species or reactions)
1209
+
1210
+ Returns:
1211
+ entity_data (dict): dictionary containing pd.DataFrames which aggregate all of the
1212
+ individual entity_data tables in "sbml_dfs_dict"
1213
+
1214
+ """
1215
+
1216
+ entity_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]
1217
+ data_table_name = table + "_data"
1218
+
1219
+ entity_data_dict = {
1220
+ k: getattr(sbml_dfs_dict[k], data_table_name) for k in sbml_dfs_dict.keys()
1221
+ }
1222
+
1223
+ entity_data_types = set.union(*[set(v.keys()) for v in entity_data_dict.values()])
1224
+
1225
+ entity_data = {
1226
+ x: _merge_entity_data_create_consensus(
1227
+ entity_data_dict, lookup_table, entity_schema, x, table
1228
+ )
1229
+ for x in entity_data_types
1230
+ }
1231
+
1232
+ return entity_data
1233
+
1234
+
1235
+ def _check_sbml_dfs_dict(sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]) -> None:
1236
+ """Check models in SBML_dfs for problems which can be reported up-front
1237
+
1238
+ Args:
1239
+ sbml_dfs_dict (dict(pd.DataFrame)): a dict of sbml_dfs models;
1240
+ primarily used as an input for construct_consensus_model
1241
+
1242
+ Returns:
1243
+ None
1244
+
1245
+ """
1246
+
1247
+ for k, v in sbml_dfs_dict.items():
1248
+ _check_sbml_dfs(sbml_dfs=v, model_label=k)
1249
+ return None
1250
+
1251
+
1252
+ def _check_sbml_dfs(
1253
+ sbml_dfs: sbml_dfs_core.SBML_dfs, model_label: str, N_examples: int | str = 5
1254
+ ) -> None:
1255
+ """Check SBML_dfs for identifiers which are associated with different entities before a merge."""
1256
+
1257
+ ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
1258
+ defining_ids = ids[ids[IDENTIFIERS.BQB].isin(BQB_DEFINING_ATTRS)]
1259
+
1260
+ defining_identifier_counts = defining_ids.value_counts(
1261
+ [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
1262
+ )
1263
+ degenerate_defining_identities = (
1264
+ defining_identifier_counts[defining_identifier_counts > 1]
1265
+ .rename("N")
1266
+ .reset_index()
1267
+ .set_index(IDENTIFIERS.ONTOLOGY)
1268
+ )
1269
+
1270
+ if degenerate_defining_identities.shape[0] > 0:
1271
+ logger.info(
1272
+ "Some defining identifiers are present multiple times "
1273
+ f"in {model_label} and will likely result in species merging "
1274
+ )
1275
+
1276
+ degen_defining_id_list = list()
1277
+ for k in degenerate_defining_identities.index.unique():
1278
+ n_degen = degenerate_defining_identities.loc[k].shape[0]
1279
+ example_duplicates = utils.ensure_pd_df(
1280
+ degenerate_defining_identities.loc[k].sample(min([n_degen, N_examples]))
1281
+ )
1282
+
1283
+ degen_defining_id_list.append(
1284
+ k
1285
+ + f" has {n_degen} duplicates including: "
1286
+ + ", ".join(
1287
+ [
1288
+ f"{x} ({y})"
1289
+ for x, y in zip(
1290
+ example_duplicates[IDENTIFIERS.IDENTIFIER].tolist(),
1291
+ example_duplicates["N"].tolist(),
1292
+ )
1293
+ ]
1294
+ )
1295
+ )
1296
+
1297
+ logger.info("\n".join(degen_defining_id_list))
1298
+ return None
1299
+
1300
+
1301
+ def _validate_meta_identifiers(meta_identifiers: pd.DataFrame) -> None:
1302
+ """Check Identifiers to make sure they aren't empty and flag cases where IDs are missing BQB terms."""
1303
+
1304
+ if meta_identifiers.shape[0] == 0:
1305
+ raise ValueError(
1306
+ '"meta_identifiers" was empty; some identifiers should be present'
1307
+ )
1308
+
1309
+ n_null = sum(meta_identifiers[IDENTIFIERS.BQB].isnull())
1310
+ if n_null > 0:
1311
+ msg = f"{n_null} identifiers were missing a bqb code and will not be mergeable"
1312
+ logger.warning(msg)
1313
+
1314
+ return None
1315
+
1316
+
1317
+ def _update_foreign_keys(
1318
+ agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
1319
+ ) -> pd.DataFrame:
1320
+ for fk in table_schema["fk"]:
1321
+ updated_fks = (
1322
+ agg_tbl[fk]
1323
+ .reset_index()
1324
+ .merge(
1325
+ fk_lookup_tables[fk], left_on=[SOURCE_SPEC.MODEL, fk], right_index=True
1326
+ )
1327
+ .drop(fk, axis=1)
1328
+ .rename(columns={"new_id": fk})
1329
+ .set_index(["model", table_schema["pk"]])
1330
+ )
1331
+ agg_tbl = agg_tbl.drop(columns=fk).join(updated_fks)
1332
+
1333
+ return agg_tbl
1334
+
1335
+
1336
+ def _resolve_reversibility(
1337
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
1338
+ rxn_consensus_species: pd.DataFrame,
1339
+ rxn_lookup_table: pd.Series,
1340
+ ) -> pd.DataFrame:
1341
+ """
1342
+ For a set of merged reactions determine what their consensus reaction reversibilities are
1343
+ """
1344
+
1345
+ agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=SBML_DFS.REACTIONS)
1346
+
1347
+ if not all(agg_tbl[SBML_DFS.R_ISREVERSIBLE].isin([True, False])):
1348
+ invalid_levels = agg_tbl[~agg_tbl[SBML_DFS.R_ISREVERSIBLE].isin([True, False])][
1349
+ SBML_DFS.R_ISREVERSIBLE
1350
+ ].unique()
1351
+ raise ValueError(
1352
+ "One or more aggregated models included invalid values for r_isreversible in the reactions table: "
1353
+ f"{', '.join(invalid_levels)}"
1354
+ )
1355
+
1356
+ # add new ids to aggregated reactions by indexes
1357
+ # map each new r_id to every distinct value of is_irreversible from reactions it originated from
1358
+ # in most cases there will only be a single level
1359
+ r_id_to_all_reversibilities = (
1360
+ agg_tbl.join(rxn_lookup_table)
1361
+ .reset_index()[["new_id", SBML_DFS.R_ISREVERSIBLE]]
1362
+ .rename({"new_id": SBML_DFS.R_ID}, axis=1)
1363
+ .drop_duplicates()
1364
+ )
1365
+
1366
+ # when a reaction could be irreversible or reversible define it as reversible.
1367
+ r_id_reversibility = (
1368
+ r_id_to_all_reversibilities.sort_values(
1369
+ SBML_DFS.R_ISREVERSIBLE, ascending=False
1370
+ )
1371
+ .groupby(SBML_DFS.R_ID)
1372
+ .first()
1373
+ )
1374
+
1375
+ # drop existing reversibility since it is selected arbitrarily and replace
1376
+ # with consensus reversibility which respects priorities
1377
+ rxns_w_reversibility = rxn_consensus_species.drop(
1378
+ SBML_DFS.R_ISREVERSIBLE, axis=1
1379
+ ).join(r_id_reversibility)
1380
+
1381
+ assert rxns_w_reversibility.shape[0] == rxn_consensus_species.shape[0]
1382
+ assert all(rxns_w_reversibility[SBML_DFS.R_ISREVERSIBLE].isin([True, False]))
1383
+
1384
+ return rxns_w_reversibility
1385
+
1386
+
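# Sketch (illustration only, toy data): the reversibility rule implemented above. When
# source reactions that merge into one consensus reaction disagree, "reversible" wins,
# because the boolean column is sorted descending before taking the first row:
#
#   import pandas as pd
#
#   pairs = pd.DataFrame(
#       {"r_id": ["R00001", "R00001", "R00002"], "r_isreversible": [False, True, False]}
#   )
#   consensus_rev = (
#       pairs.sort_values("r_isreversible", ascending=False).groupby("r_id").first()
#   )
#   # R00001 -> True (reversible wins), R00002 -> False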
1387
+ def _merge_entity_data_create_consensus(
1388
+ entity_data_dict: dict,
1389
+ lookup_table: pd.Series,
1390
+ entity_schema: dict,
1391
+ an_entity_data_type: str,
1392
+ table: str,
1393
+ ) -> pd.DataFrame:
1394
+ """
1395
+ Merge Entity Data - Create Consensus
1396
+
1397
+ Create a consensus entity data table, reporting cases where a single "new" id is associated with multiple different values of entity_var
1398
+
1399
+ Args
1400
+ entity_data_dict (dict): dictionary containing all model's "an_entity_data_type" dictionaries
1401
+ lookup_table (pd.Series): a series where the index is an old model and primary key and the
1402
+ value is the new consensus id
1403
+ entity_schema (dict): schema for "table"
1404
+ an_entity_data_type (str): data_type from species/reactions_data in entity_data_dict
1405
+ table (str): table whose data is being consolidated (currently species or reactions)
1406
+
1407
+ Returns:
1408
+ consensus_entity_data (pd.DataFrame) table where index is primary key of "table" and
1409
+ values are all distinct annotations from "an_entity_data_type".
1410
+
1411
+ """
1412
+
1413
+ models_w_entity_data_type = [
1414
+ k for k, v in entity_data_dict.items() if an_entity_data_type in v.keys()
1415
+ ]
1416
+
1417
+ logger.info(
1418
+ f"Merging {len(models_w_entity_data_type)} models with {an_entity_data_type} data in the {table} table"
1419
+ )
1420
+
1421
+ # check that all tables have the same index and column names
1422
+ distinct_indices = {
1423
+ ", ".join(entity_data_dict[x][an_entity_data_type].index.names)
1424
+ for x in models_w_entity_data_type
1425
+ }
1426
+ if len(distinct_indices) > 1:
1427
+ raise ValueError(
1428
+ f"Multiple tables with the same {an_entity_data_type} cannot be combined"
1429
+ " because they have different index names:"
1430
+ f"{' & '.join(list(distinct_indices))}"
1431
+ )
1432
+ distinct_cols = {
1433
+ ", ".join(entity_data_dict[x][an_entity_data_type].columns.tolist())
1434
+ for x in models_w_entity_data_type
1435
+ }
1436
+ if len(distinct_cols) > 1:
1437
+ raise ValueError(
1438
+ f"Multiple tables with the same {an_entity_data_type} cannot be combined"
1439
+ " because they have different column names:"
1440
+ f"{' & '.join(list(distinct_cols))}"
1441
+ )
1442
+
1443
+ # stack all models
1444
+ combined_entity_data = pd.concat(
1445
+ {k: entity_data_dict[k][an_entity_data_type] for k in models_w_entity_data_type}
1446
+ )
1447
+ combined_entity_data.index.names = ["model", entity_schema["pk"]]
1448
+ if isinstance(combined_entity_data, pd.Series):
1449
+ # enforce that attributes should always be DataFrames
1450
+ combined_entity_data = combined_entity_data.to_frame()
1451
+
1452
+ # create a table indexed by the NEW primary key containing all the entity data of type an_entity_data_type
1453
+ # right now the index may map to multiple rows if entities were consolidated
1454
+ combined_entity_data = (
1455
+ combined_entity_data.join(lookup_table)
1456
+ .reset_index(drop=True)
1457
+ .rename({"new_id": entity_schema["pk"]}, axis=1)
1458
+ .set_index(entity_schema["pk"])
1459
+ .sort_index()
1460
+ )
1461
+
1462
+ # report cases where merges produce id-variable combinations with distinct values
1463
+ _merge_entity_data_report_mismatches(
1464
+ combined_entity_data, entity_schema, an_entity_data_type, table
1465
+ )
1466
+
1467
+ # save one value for each id-variable combination
1468
+ # (this will accept the first value regardless of the above mismatches.)
1469
+ consensus_entity_data = (
1470
+ combined_entity_data.reset_index().groupby(entity_schema["pk"]).first()
1471
+ )
1472
+
1473
+ return consensus_entity_data
1474
+
1475
+
1476
+ def _merge_entity_data_report_mismatches(
1477
+ combined_entity_data: pd.DataFrame,
1478
+ entity_schema: dict,
1479
+ an_entity_data_type: str,
1480
+ table: str,
1481
+ ) -> None:
1482
+ """
1483
+ Merge Entity Data - Report Mismatches
1484
+
1485
+ Report cases where a single "new" id is associated with multiple different values of entity_var
1486
+
1487
+ Args
1488
+ combined_entity_data (pd.DataFrame): indexed by table primary key containing all
1489
+ data from "an_entity_data_type"
1490
+ entity_schema (dict): schema for "table"
1491
+ an_entity_data_type (str): data_type from species/reactions_data in combined_entity_data
1492
+ table (str): table whose data is being consolidated (currently species or reactions)
1493
+
1494
+ Returns:
1495
+ None
1496
+
1497
+ """
1498
+
1499
+ data_table_name = table + "_data"
1500
+
1501
+ entity_vars = combined_entity_data.columns
1502
+ for entity_var in entity_vars:
1503
+ unique_counts = (
1504
+ combined_entity_data.reset_index()
1505
+ .groupby(entity_schema["pk"])
1506
+ .agg("nunique")
1507
+ )
1508
+ entities_w_imperfect_matches = unique_counts[
1509
+ unique_counts[entity_var] != 1
1510
+ ].index.tolist()
1511
+
1512
+ if len(entities_w_imperfect_matches) > 0:
1513
+ N_select_entities_w_imperfect_matches = min(
1514
+ 5, len(entities_w_imperfect_matches)
1515
+ )
1516
+ select_entities_w_imperfect_matches = entities_w_imperfect_matches[
1517
+ 0:N_select_entities_w_imperfect_matches
1518
+ ]
1519
+
1520
+ warning_msg_select = [
1521
+ x
1522
+ + ": "
1523
+ + ", ".join(
1524
+ combined_entity_data[entity_var].loc[x].apply(str).unique().tolist()
1525
+ )
1526
+ for x in select_entities_w_imperfect_matches
1527
+ ]
1528
+ full_warning_msg = (
1529
+ f"{len(entities_w_imperfect_matches)} {table} contains multiple values for the {entity_var} variable"
1530
+ f" in the {data_table_name} table of {an_entity_data_type}: "
1531
+ + ". ".join(warning_msg_select)
1532
+ )
1533
+
1534
+ logger.warning(full_warning_msg)
1535
+
1536
+ return None
1537
+
1538
+
1539
+ def _test_same_schema(sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]) -> None:
1540
+ """
1541
+ Ensure that all sbml_dfs in the dict have the same schema
1542
+ """
1543
+
1544
+ if len(sbml_dfs_dict) != 0:
1545
+ # extract all schemas
1546
+ schema_list = [sbml_dfs_dict[x].schema for x in sbml_dfs_dict.keys()]
1547
+ # if multiple entries are present then are they the same?
1548
+ if len(sbml_dfs_dict) > 1:
1549
+ if not all([x == schema_list[0] for x in schema_list]):
1550
+ raise ValueError("sbml_df schemas were not identical")
1551
+
1552
+ return None
1553
+
1554
+
1555
+ def _create_member_string(x: list[str]) -> str:
1556
+ x.sort()
1557
+ return "_".join(x)