napistu 0.1.0__py3-none-any.whl → 0.2.4.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +1 -1
- napistu/consensus.py +1010 -513
- napistu/constants.py +24 -0
- napistu/gcs/constants.py +2 -2
- napistu/gcs/downloads.py +57 -25
- napistu/gcs/utils.py +21 -0
- napistu/identifiers.py +105 -6
- napistu/ingestion/constants.py +0 -1
- napistu/ingestion/obo.py +24 -8
- napistu/ingestion/psi_mi.py +20 -5
- napistu/ingestion/reactome.py +8 -32
- napistu/mcp/__init__.py +69 -0
- napistu/mcp/__main__.py +180 -0
- napistu/mcp/codebase.py +182 -0
- napistu/mcp/codebase_utils.py +298 -0
- napistu/mcp/constants.py +72 -0
- napistu/mcp/documentation.py +166 -0
- napistu/mcp/documentation_utils.py +235 -0
- napistu/mcp/execution.py +382 -0
- napistu/mcp/profiles.py +73 -0
- napistu/mcp/server.py +86 -0
- napistu/mcp/tutorials.py +124 -0
- napistu/mcp/tutorials_utils.py +230 -0
- napistu/mcp/utils.py +47 -0
- napistu/mechanism_matching.py +782 -26
- napistu/modify/constants.py +41 -0
- napistu/modify/curation.py +4 -1
- napistu/modify/gaps.py +243 -156
- napistu/modify/pathwayannot.py +26 -8
- napistu/network/neighborhoods.py +16 -7
- napistu/network/net_create.py +209 -54
- napistu/network/net_propagation.py +118 -0
- napistu/network/net_utils.py +1 -32
- napistu/rpy2/netcontextr.py +10 -7
- napistu/rpy2/rids.py +7 -5
- napistu/sbml_dfs_core.py +46 -29
- napistu/sbml_dfs_utils.py +37 -1
- napistu/source.py +8 -2
- napistu/utils.py +67 -8
- napistu-0.2.4.dev3.dist-info/METADATA +84 -0
- napistu-0.2.4.dev3.dist-info/RECORD +95 -0
- {napistu-0.1.0.dist-info → napistu-0.2.4.dev3.dist-info}/WHEEL +1 -1
- tests/conftest.py +11 -5
- tests/test_consensus.py +4 -1
- tests/test_gaps.py +127 -0
- tests/test_gcs.py +3 -2
- tests/test_igraph.py +14 -0
- tests/test_mcp_documentation_utils.py +13 -0
- tests/test_mechanism_matching.py +658 -0
- tests/test_net_propagation.py +89 -0
- tests/test_net_utils.py +83 -0
- tests/test_sbml.py +2 -0
- tests/{test_sbml_dfs_create.py → test_sbml_dfs_core.py} +68 -4
- tests/test_utils.py +81 -0
- napistu-0.1.0.dist-info/METADATA +0 -56
- napistu-0.1.0.dist-info/RECORD +0 -77
- {napistu-0.1.0.dist-info → napistu-0.2.4.dev3.dist-info}/entry_points.txt +0 -0
- {napistu-0.1.0.dist-info → napistu-0.2.4.dev3.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.1.0.dist-info → napistu-0.2.4.dev3.dist-info}/top_level.txt +0 -0
napistu/consensus.py
CHANGED
@@ -31,111 +31,43 @@ def construct_consensus_model(
     dogmatic: bool = True,
 ) -> sbml_dfs_core.SBML_dfs:
     """
-    Construct Consensus Model
+    Construct a Consensus Model by merging shared entities across pathway models.

-
+    This function takes a dictionary of pathway models and merges shared entities (compartments, species, reactions, etc.)
+    into a single consensus model, using a set of rules for entity identity and merging.

-    Parameters
-    ----------
-    sbml_dfs_dict: dict{cpr.SBML_dfs}
-        A dictionary of SBML_dfs from different models
-    pw_index: indices.PWIndex
-        An index of all tables being aggregated
-    dogmatic: bool
-        If True then try to preserve genes, transcript, and proteins as separate species. If False
-        then try to merge them.
-
-    Returns:
+    Parameters
     ----------
-
-
+    sbml_dfs_dict : dict[str, sbml_dfs_core.SBML_dfs]
+        A dictionary of SBML_dfs objects from different models, keyed by model name.
+    pw_index : indices.PWIndex
+        An index of all tables being aggregated, used for cross-referencing entities.
+    dogmatic : bool, default=True
+        If True, preserve genes, transcripts, and proteins as separate species. If False, merge them when possible.
+
+    Returns
+    -------
+    sbml_dfs_core.SBML_dfs
+        A consensus SBML_dfs object containing the merged model.
     """
-
+    # Validate inputs
     logger.info("Reporting possible issues in component models")
     _check_sbml_dfs_dict(sbml_dfs_dict)
     assert isinstance(pw_index, indices.PWIndex)
-    # select valid BQB attributes based on dogmatic flag
-    defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(dogmatic)
-
-    logger.info("Defining compartments based on unique ids")
-    comp_consensus_entities, comp_lookup_table = construct_meta_entities_identifiers(
-        sbml_dfs_dict=sbml_dfs_dict, pw_index=pw_index, table="compartments"
-    )
-
-    logger.info("Defining species based on unique ids")
-    spec_consensus_entities, spec_lookup_table = construct_meta_entities_identifiers(
-        sbml_dfs_dict=sbml_dfs_dict,
-        pw_index=pw_index,
-        table=SBML_DFS.SPECIES,
-        defining_biological_qualifiers=defining_biological_qualifiers,
-    )
-
-    logger.info(
-        "Defining compartmentalized species based on unique species x compartments"
-    )
-    compspec_consensus_instances, compspec_lookup_table = construct_meta_entities_fk(
-        sbml_dfs_dict,
-        pw_index,
-        table=SBML_DFS.COMPARTMENTALIZED_SPECIES,
-        fk_lookup_tables={
-            SBML_DFS.C_ID: comp_lookup_table,
-            SBML_DFS.S_ID: spec_lookup_table,
-        },
-    )
-
-    logger.info(
-        "Define reactions based on membership of identical compartmentalized species"
-    )
-    rxn_consensus_species, rxn_lookup_table = construct_meta_entities_members(
-        sbml_dfs_dict,
-        pw_index,
-        table=SBML_DFS.REACTIONS,
-        defined_by=SBML_DFS.REACTION_SPECIES,
-        defined_lookup_tables={SBML_DFS.SC_ID: compspec_lookup_table},
-        defining_attrs=[SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY],
-    )

-
-
-        sbml_dfs_dict, rxn_consensus_species, rxn_lookup_table
-    )
+    # Select valid BQB attributes based on dogmatic flag
+    defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(dogmatic)

-    #
-
-
-        sbml_dfs_dict,
-        pw_index,
-        table=SBML_DFS.REACTION_SPECIES,
-        fk_lookup_tables={
-            SBML_DFS.R_ID: rxn_lookup_table,
-            SBML_DFS.SC_ID: compspec_lookup_table,
-        },
-        # retain species with different roles
-        extra_defining_attrs=[SBML_DFS.SBO_TERM],
+    # Step 1: Create consensus entities for all primary tables
+    consensus_entities, lookup_tables = _create_consensus_entities(
+        sbml_dfs_dict, pw_index, defining_biological_qualifiers
     )

-
-
-        SBML_DFS.SPECIES: spec_consensus_entities,
-        SBML_DFS.COMPARTMENTALIZED_SPECIES: compspec_consensus_instances,
-        SBML_DFS.REACTIONS: rxn_consensus_species,
-        SBML_DFS.REACTION_SPECIES: rxnspec_consensus_instances,
-    }
-
-    sbml_dfs = sbml_dfs_core.SBML_dfs(sbml_tbl_dict)  # type: ignore
+    # Step 2: Create the consensus SBML_dfs object
+    sbml_dfs = sbml_dfs_core.SBML_dfs(consensus_entities)  # type: ignore

-    #
-
-        sbml_dfs_dict, lookup_table=spec_lookup_table, table=SBML_DFS.SPECIES
-    )
-    for k in consensus_species_data.keys():
-        sbml_dfs.add_species_data(k, consensus_species_data[k])
-
-    consensus_reactions_data = merge_entity_data(
-        sbml_dfs_dict, lookup_table=rxn_lookup_table, table=SBML_DFS.REACTIONS
-    )
-    for k in consensus_reactions_data.keys():
-        sbml_dfs.add_reactions_data(k, consensus_reactions_data[k])
+    # Step 3: Add entity data from component models
+    sbml_dfs = _add_entity_data(sbml_dfs, sbml_dfs_dict, lookup_tables)

     return sbml_dfs

@@ -144,18 +76,22 @@ def construct_sbml_dfs_dict(
     pw_index: pd.DataFrame, strict: bool = True
 ) -> dict[str, sbml_dfs_core.SBML_dfs]:
     """
-    Construct
-
-    Convert all models in the pathway index into SBML_dfs and add them to a dict.
-
-    Parameters:
-    pw_index: indices.PWIndex
-        An index of all tables being aggregated
-    strict (bool): if set to `false` errorenous files are skipped with warning. Default: True
+    Construct a dictionary of SBML_dfs objects from a pathway index.

-
-
+    This function converts all models in the pathway index into SBML_dfs objects and adds them to a dictionary.
+    Optionally, it can skip erroneous files with a warning instead of raising an error.

+    Parameters
+    ----------
+    pw_index : pd.DataFrame
+        An index of all tables being aggregated, containing model metadata and file paths.
+    strict : bool, default=True
+        If True, raise an error on any file that cannot be loaded. If False, skip erroneous files with a warning.
+
+    Returns
+    -------
+    dict[str, sbml_dfs_core.SBML_dfs]
+        A dictionary mapping model names to SBML_dfs objects.
     """

     sbml_dfs_dict = dict()
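As a rough orientation to the API documented in the two hunks above, a minimal end-to-end sketch might look like the following. This is an illustrative sketch only, not part of the diff: the pathway-index path, model names, and the exact indices.PWIndex constructor signature are assumptions based on the docstrings.

# Hypothetical usage sketch (assumed module layout and PWIndex constructor)
from napistu import consensus, indices

pw_index = indices.PWIndex("data/pw_index.tsv")  # hypothetical path
sbml_dfs_dict = consensus.construct_sbml_dfs_dict(pw_index, strict=True)
consensus_model = consensus.construct_consensus_model(
    sbml_dfs_dict, pw_index, dogmatic=True
)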
@@ -182,18 +118,22 @@ def unnest_SBML_df(
     sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], table: str
 ) -> pd.DataFrame:
     """
-    Unnest SBML_dfs
+    Unnest and concatenate a specific table from multiple SBML_dfs models.

-
-
-    sbml_dfs_dict: dict{cpr.SBML_dfs}
-        A dictionary of SBML_dfs from different models
-    table: str
-        A table to aggregate (e.g., species, reactions, compartments)
-
-    Returns:
-        pd.Dataframe, a table with a multindex of model and an entity_id
+    This function merges corresponding tables from a set of models into a single DataFrame,
+    adding the model name as an index level.

+    Parameters
+    ----------
+    sbml_dfs_dict : dict[str, sbml_dfs_core.SBML_dfs]
+        A dictionary of SBML_dfs objects from different models, keyed by model name.
+    table : str
+        The name of the table to aggregate (e.g., 'species', 'reactions', 'compartments').
+
+    Returns
+    -------
+    pd.DataFrame
+        A concatenated table with a MultiIndex of model and entity ID.
     """

     # check that all sbml_dfs have the same schema
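The (model, entity ID) MultiIndex behaviour described in the unnest_SBML_df docstring can be pictured with a small pandas-only sketch; this shows the generic pd.concat pattern and is not the napistu implementation.

import pandas as pd

# Two toy per-model species tables with the same schema
species_a = pd.DataFrame({"s_name": ["ATP"]}, index=pd.Index(["S1"], name="s_id"))
species_b = pd.DataFrame({"s_name": ["ADP"]}, index=pd.Index(["S1"], name="s_id"))

# Concatenate with the model name as the outer index level, mirroring the
# (model, entity_id) MultiIndex described above
combined = pd.concat({"model_a": species_a, "model_b": species_b}, names=["model"])
print(combined.index.names)  # ['model', 's_id']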
@@ -222,31 +162,30 @@ def construct_meta_entities_identifiers(
     defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
 ) -> tuple[pd.DataFrame, pd.Series]:
     """
-    Construct
-
-    Aggregating across one entity type for a set of pathway models merge entities which share identifiers
+    Construct meta-entities by merging entities across models that share identifiers.

-
-
-    sbml_df_dict (dict{"model": cpr.SBML_dfs}):
-        A dictionary of cpr.SBML_dfs
-    pw_index (indices.PWIndex):
-        An index of all tables being aggregated
-    table (str):
-        A table/entity set from the sbml_dfs to work-with
-    fk_lookup_tables (dict):
-        Dictionary containing lookup tables for all foreign keys used by the table
-    defining_biological_qualifiers (list[str]):
-        BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
-        permissive settings could merge homologs, different forms of the same gene.
+    Aggregates a single entity type from a set of pathway models and merges entities that share identifiers
+    (as defined by the provided biological qualifiers).

-
+    Parameters
     ----------
-
-
-
-
-
+    sbml_dfs_dict : dict[str, sbml_dfs_core.SBML_dfs]
+        A dictionary of SBML_dfs objects from different models, keyed by model name.
+    pw_index : indices.PWIndex
+        An index of all tables being aggregated.
+    table : str
+        The name of the table/entity set to aggregate (e.g., 'species', 'compartments').
+    fk_lookup_tables : dict, optional
+        Dictionary containing lookup tables for all foreign keys used by the table (default: empty dict).
+    defining_biological_qualifiers : list[str], optional
+        List of BQB codes which define distinct entities. Defaults to BQB_DEFINING_ATTRS.
+
+    Returns
+    -------
+    new_id_table : pd.DataFrame
+        Table matching the schema of one of the input models, with merged entities.
+    lookup_table : pd.Series
+        Series mapping the index of the aggregated entities to new consensus IDs.
     """

     # combine sbml_dfs by adding model to the index and concatinating all dfs
@@ -281,96 +220,58 @@ def reduce_to_consensus_ids(
     defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
 ) -> tuple[pd.DataFrame, pd.Series]:
     """
-    Reduce to
-
-    Reduce a table of entities to unique entries based on identifiers.
+    Reduce a table of entities to unique entries based on consensus identifiers.

-
-
-    sbml_df: pd.DataFrame
-        One type of entity from sbml_dfs_dict expanded to include
-        model its index, as produced by unnest_SBML_df(sbml_dfs_dict)
-    table_schema: dict
-        Schema for the table sbml_df
-    pw_index: indices.PWIndex
-        An index of all tables being aggregated
-    defining_biological_qualifiers: list(str)
-        A list of biological qualifier types which define distinct entities
+    This function clusters entities that share identifiers (as defined by the provided biological qualifiers)
+    and produces a new table of unique entities, along with a lookup table mapping original entities to consensus IDs.

-
+    Parameters
     ----------
-
-
-
-
+    sbml_df : pd.DataFrame
+        Table of entities from multiple models, with model in the index (as produced by unnest_SBML_df).
+    table_schema : dict
+        Schema for the table being reduced.
+    pw_index : indices.PWIndex, optional
+        An index of all tables being aggregated (default: None).
+    defining_biological_qualifiers : list[str], optional
+        List of biological qualifier types which define distinct entities. Defaults to BQB_DEFINING_ATTRS.
+
+    Returns
+    -------
+    new_id_table : pd.DataFrame
+        Table matching the schema of one of the input models, with merged entities.
+    lookup_table : pd.Series
+        Series mapping the index of the aggregated entities to new consensus IDs.
     """
-
+    # Step 1: Build consensus identifiers to create clusters of equivalent entities
     indexed_cluster, cluster_consensus_identifiers = build_consensus_identifiers(
         sbml_df, table_schema, defining_biological_qualifiers
     )

-    #
+    # Step 2: Join cluster information to the original table
     agg_table_harmonized = sbml_df.join(indexed_cluster)
-    # create a new numbering schema off of cluster #s and id type
-    # print(agg_table_harmonized["cluster"])
-    # print(table_schema["pk"])
-
-    agg_table_harmonized["new_id"] = sbml_dfs_utils.id_formatter(
-        agg_table_harmonized["cluster"], table_schema["pk"]
-    )

-
+    # Step 3: Create lookup table for entity IDs
+    lookup_table = _create_entity_lookup_table(agg_table_harmonized, table_schema)

-    #
-    # (this will help to select names which are more human readable after the merge)
+    # Step 4: Add nameness scores to help select representative names
     agg_table_harmonized = utils._add_nameness_score_wrapper(
         agg_table_harmonized, "label", table_schema
     )

-    #
-
-        agg_table_harmonized
-        .sort_values(["nameness_score"])
-        .rename(columns={"new_id": table_schema["pk"]})
-        .groupby(table_schema["pk"])
-        .first()
-        .drop("nameness_score", axis=1)
-    )
-
-    new_id_table = (
-        agg_table_reduced.drop(table_schema["id"], axis=1)
-        .merge(cluster_consensus_identifiers, left_on="cluster", right_index=True)
-        .drop("cluster", axis=1)
+    # Step 5: Prepare the consensus table with one row per unique entity
+    new_id_table = _prepare_consensus_table(
+        agg_table_harmonized, table_schema, cluster_consensus_identifiers
     )

+    # Step 6: Add source information if required
     if "source" in table_schema.keys():
-
-
-                f"pw_index must be provided as a indices.PWIndex if there is a source but was type {type(pw_index)}"
-            )
-
-        # track the model(s) that each entity came from
-        new_sources = create_consensus_sources(
-            agg_table_harmonized, lookup_table, table_schema, pw_index
-        )
-        assert isinstance(new_sources, pd.Series)
-
-        new_id_table = new_id_table.drop(
-            table_schema[SOURCE_SPEC.SOURCE], axis=1
-        ).merge(new_sources, left_index=True, right_index=True)
-
-    # check that the index name and variables match the source
-    if set(sbml_df.index.names).difference({SOURCE_SPEC.MODEL}) != set(
-        new_id_table.index.names
-    ):
-        raise ValueError(
-            "The newly constructed id table's index does not match the inputs"
+        new_id_table = _add_consensus_sources(
+            new_id_table, agg_table_harmonized, lookup_table, table_schema, pw_index
         )

-
-
-            "The newly constructed id table's variables do not match the inputs"
-        )
+    # Step 7: Validate the resulting table
+    _validate_consensus_table(new_id_table, sbml_df)

     return new_id_table, lookup_table

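The clustering step referenced in reduce_to_consensus_ids above and build_consensus_identifiers below (grouping entities that share any defining identifier) amounts to finding connected components in an entity-identifier graph. A minimal sketch of that idea follows; it uses networkx for illustration rather than napistu's own utils.find_weakly_connected_subgraphs, and the node labels are made up.

import networkx as nx

# Toy edgelist: each edge links an entity (model__id) to one of its identifiers
edges = [
    ("model_a__S1", "uniprot:P01112"),
    ("model_b__S9", "uniprot:P01112"),  # shares an identifier with model_a__S1
    ("model_a__S2", "chebi:15422"),
]

g = nx.Graph(edges)
clusters = {
    node: i
    for i, component in enumerate(nx.connected_components(g))
    for node in component
}
# model_a__S1 and model_b__S9 fall in the same cluster and would be merged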
@@ -381,163 +282,85 @@ def build_consensus_identifiers(
     defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
 ) -> tuple[pd.Series, pd.DataFrame]:
     """
-    Build
-
-    Take a set of entities spanning multiple models and find all unique entities.
+    Build consensus identifiers by clustering entities that share biological identifiers.

-
-
+    This function takes a set of entities spanning multiple models and finds all unique entities
+    by grouping them according to the provided biological qualifiers. It returns a mapping from
+    original entities to clusters and a DataFrame of consensus identifier objects for each cluster.

-    Parameters
-    ----------
-    sbml_df: pd.DataFrame
-        One type of entity from sbml_dfs_dict expanded to include model its index,
-        as produced by unnest_SBML_df(sbml_dfs_dict)
-    table_schema: dict
-        Schema for the table sbml_df
-    defining_biological_qualifiers: [str]
-        A list of biological qualifier types which should be used for grouping
-
-    Returns:
+    Parameters
     ----------
-
-
-
-
+    sbml_df : pd.DataFrame
+        Table of entities from multiple models, with model in the index (as produced by unnest_SBML_df).
+    table_schema : dict
+        Schema for the table being processed.
+    defining_biological_qualifiers : list[str], optional
+        List of biological qualifier types to use for grouping. Defaults to BQB_DEFINING_ATTRS.
+
+    Returns
+    -------
+    indexed_cluster : pd.Series
+        Series mapping the index from sbml_df onto a set of clusters which define unique entities.
+    cluster_consensus_identifiers_df : pd.DataFrame
+        DataFrame mapping clusters to consensus identifiers (Identifiers objects).
     """
-
-    # create a table which is one row per entry
+    # Step 1: Extract and validate identifiers
     meta_identifiers = sbml_dfs_utils.unnest_identifiers(sbml_df, table_schema["id"])
-    # check the identifiers for missing attributes
     _validate_meta_identifiers(meta_identifiers)

-    #
-
-
-    valid_identifiers = valid_identifiers[
-        meta_identifiers[IDENTIFIERS.BQB].isin(defining_biological_qualifiers)
-    ]
-
-    # catch entries which no longer have any identifiers
-    # add a dummy identifier to these which will still uniquely tag them
-
-    filtered_entries = sbml_df.reset_index().merge(
-        valid_identifiers.reset_index(),
-        left_on=sbml_df.index.names,
-        right_on=sbml_df.index.names,
-        how="outer",
-    )[sbml_df.index.names + [IDENTIFIERS.IDENTIFIER]]
-    filtered_entries = filtered_entries[
-        filtered_entries[IDENTIFIERS.IDENTIFIER].isnull()
-    ]
-    if filtered_entries.shape[0] != 0:
-        logger.warning(
-            f"{filtered_entries.shape[0]} entries didn't possess identifiers and thus cannot be merged"
-        )
-
-        filtered_entries[SOURCE_SPEC.ENTRY] = 0
-        filtered_entries[IDENTIFIERS.ONTOLOGY] = "none"
-        filtered_entries[IDENTIFIERS.ONTOLOGY] = [
-            "dummy_value_" + str(val)
-            for val in random.sample(range(1, 100000000), filtered_entries.shape[0])
-        ]
-        filtered_entries[IDENTIFIERS.URL] = None
-        filtered_entries[IDENTIFIERS.BQB] = None
-
-        filtered_entries = filtered_entries.set_index(
-            sbml_df.index.names + [SOURCE_SPEC.ENTRY]
-        )
-
-        valid_identifiers = pd.concat([valid_identifiers, filtered_entries])
-
-    # combine multi-index into a single variable; combine ontology + identifiers as a single variable
-    valid_identifiers = utils.format_identifiers_as_edgelist(
-        valid_identifiers, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+    # Step 2: Filter identifiers by biological qualifier type
+    valid_identifiers = _filter_identifiers_by_qualifier(
+        meta_identifiers, defining_biological_qualifiers
     )

-    #
-
-        valid_identifiers.reset_index()
-        .set_index(valid_identifiers.index.names, drop=False)[sbml_df.index.names]
-        .astype(str)
-        .apply("__".join, axis=1)
-    )
-    valid_identifiers.loc[:, "model_spec"] = indexed_species_tags
+    # Step 3: Handle entries that don't have identifiers
+    valid_identifiers = _handle_entries_without_identifiers(sbml_df, valid_identifiers)

-    #
-
-    # added to the same cluster so that they can be merged
-    id_edgelist = pd.concat(
-        [
-            valid_identifiers[["ind", "id"]],
-            # add id-ind edges so that identifiers corresponding to the same entity are grouped
-            # these entries will be discarded when merging the results back in by "ind"
-            valid_identifiers[["model_spec", "id"]].rename(
-                columns={"model_spec": "ind"}
-            ),
-        ]
-    )
+    # Step 4: Prepare edgelist for clustering
+    id_edgelist = _prepare_identifier_edgelist(valid_identifiers, sbml_df)

-    #
-    # using a greedy graph-based approach
+    # Step 5: Cluster entities based on shared identifiers
     ind_clusters = utils.find_weakly_connected_subgraphs(id_edgelist)

-    #
-
-
-    # all entries for the same (model, id) will have the same cluster so convert back to
-    # sbml_df index to facilitate join
-    indexed_cluster = valid_identifiers.groupby(sbml_df.index.names).first()["cluster"]
-
-    # combine equivalent entries into a single Identifiers object
-    # include identifiers which were filtered by bqb
-
-    all_cluster_identifiers = meta_identifiers.reset_index().merge(
-        indexed_cluster, left_on=sbml_df.index.names, right_index=True
+    # Step 6: Map entity indices to clusters
+    valid_identifiers_with_clusters = valid_identifiers.reset_index().merge(
+        ind_clusters
     )
+    indexed_cluster = valid_identifiers_with_clusters.groupby(
+        sbml_df.index.names
+    ).first()["cluster"]

-
-
-
-
-            [
-                IDENTIFIERS.ONTOLOGY,
-                IDENTIFIERS.IDENTIFIER,
-                IDENTIFIERS.URL,
-                IDENTIFIERS.BQB,
-            ]
-        ]
-        .T.to_dict()
-        .values()
-        )
-    )
-    for k, v in all_cluster_identifiers.groupby("cluster")
-    }
-
-    # recover clusters which don't have any identifiers
-    catchup_clusters = {
-        c: identifiers.Identifiers(list())
-        for c in set(ind_clusters["cluster"].tolist()).difference(
-            cluster_consensus_identifiers
-        )
-    }
-    cluster_consensus_identifiers = {
-        **cluster_consensus_identifiers,
-        **catchup_clusters,
-    }
-
-    cluster_consensus_identifiers_df = pd.DataFrame(
-        cluster_consensus_identifiers, index=[table_schema["id"]]
-    ).T
-    cluster_consensus_identifiers_df.index.name = "cluster"
+    # Step 7: Create consensus identifiers for each cluster
+    cluster_consensus_identifiers_df = _create_cluster_identifiers(
+        meta_identifiers, indexed_cluster, sbml_df, ind_clusters, table_schema
+    )

     return indexed_cluster, cluster_consensus_identifiers_df


 def pre_consensus_ontology_check(
     sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], tablename: str
-):
-    """
+) -> tuple[list, pd.DataFrame]:
+    """
+    Check for shared ontologies across source models for a given table.
+
+    For compartments, species, or reactions tables, this function returns the set of ontologies
+    shared among all SBML_dfs in the input dictionary, as well as a DataFrame summarizing ontologies per model.
+
+    Parameters
+    ----------
+    sbml_dfs_dict : dict[str, sbml_dfs_core.SBML_dfs]
+        Dictionary of SBML_dfs objects from different models, keyed by model name.
+    tablename : str
+        Name of the table to check (should be one of 'compartments', 'species', or 'reactions').
+
+    Returns
+    -------
+    shared_onto_list : list
+        List of ontologies shared by all models for the specified table.
+    sbml_dict_onto_df : pd.DataFrame
+        DataFrame summarizing ontologies present in each model for the specified table.
+    """

     # tablename: compartments/species/reactions tables with Identifiers
     # returns shared ontologies among sbml_dfs in sbml_dfs_dict for
@@ -572,23 +395,23 @@ def pre_consensus_ontology_check(
     return shared_onto_list, sbml_dict_onto_df


-def
-    """
+def post_consensus_species_ontology_check(sbml_dfs: sbml_dfs_core.SBML_dfs) -> set[str]:
+    """
+    Check and return the set of ontologies shared by different sources in a consensus model's species table.

-
-
-            '"meta_identifiers" was empty; some identifiers should be present'
-        )
-
-    n_null = sum(meta_identifiers["bqb"].isnull())
-    if n_null > 0:
-        msg = f"{n_null} identifiers were missing a bqb code and will not be mergeable"
-        logger.warn(msg)
-
-    return None
+    This function examines the species table in a consensus SBML_dfs object, determines the ontologies
+    present for each source model, and returns the intersection of ontologies shared by all sources.

+    Parameters
+    ----------
+    sbml_dfs : sbml_dfs_core.SBML_dfs
+        The consensus SBML_dfs object containing merged species from multiple models.

-
+    Returns
+    -------
+    set[str]
+        Set of ontology terms shared by all sources in the consensus model's species table.
+    """
     # Checking the ontology in "species" shared by different sources in a consensus model
     # returns a set of shared ontologies by different sources

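The "shared ontologies" computation described in the two ontology-check docstrings reduces to a set intersection across per-model ontology sets. A schematic version, with made-up model names and ontology labels (not the napistu code), is:

# Hypothetical per-model ontology sets derived from each model's species identifiers
ontologies_by_model = {
    "reactome": {"uniprot", "chebi", "ensembl_gene"},
    "string": {"uniprot", "ensembl_gene"},
}

# Intersection across all sources = ontologies usable for consensus merging
shared_ontologies = set.intersection(*ontologies_by_model.values())
print(shared_ontologies)  # {'uniprot', 'ensembl_gene'}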
@@ -636,27 +459,6 @@ def post_consensus_species_ontology_check(sbml_dfs: sbml_dfs_core.SBML_dfs) -> s
     return shared_onto_set


-def _update_foreign_keys(
-    agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
-) -> pd.DataFrame:
-    """Update one or more foreign keys based on old-to-new foreign key lookup table(s)."""
-
-    for fk in table_schema["fk"]:
-        updated_fks = (
-            agg_tbl[fk]
-            .reset_index()
-            .merge(
-                fk_lookup_tables[fk], left_on=[SOURCE_SPEC.MODEL, fk], right_index=True
-            )
-            .drop(fk, axis=1)
-            .rename(columns={"new_id": fk})
-            .set_index(["model", table_schema["pk"]])
-        )
-        agg_tbl = agg_tbl.drop(columns=fk).join(updated_fks)
-
-    return agg_tbl
-
-
 def pre_consensus_compartment_check(
     sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], tablename: str
 ) -> tuple[list, dict]:
@@ -855,146 +657,57 @@ def construct_meta_entities_members(
         Matching the schema of one of the tables within sbml_df_dict
     lookup_table: pd.Series
         Matches the index of the aggregated entities to new_ids
-
     """
-
     logger.info(
         f"Merging {table} based on identical membership ({' + '.join(defining_attrs)})"
     )

-    #
-    agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=defined_by)
-
-    # to debug and see names of species
-    # comp_species = unnest_SBML_df(sbml_dfs_dict, table="compartmentalized_species")
-    # agg_tbl = agg_tbl.merge(comp_species, left_on = ["model", "sc_id"], right_index = True )
-
-    # since all sbml_dfs have the same schema pull out one schema for reference
+    # Step 1: Get schemas for both tables
     table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]
     defined_by_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[defined_by]

-    #
-
-
-
-
-
-
-
-
-            .rename(columns={"new_id": k})
-        )
-
-    # create a set of species x compartment instances for each reaction
-    defining_fk = set(defined_by_schema["fk"]).difference({table_schema["pk"]})
-
-    if (
-        len(defining_fk) != 1
-        or len(defining_fk.intersection(set(defined_by_schema["fk"]))) != 1
-    ):
-        raise ValueError(
-            f"A foreign key could not be found in {defined_by} which was a primary key in {table}"
-        )
-    else:
-        defining_fk = list(defining_fk)[0]
-
-    # define what it is to be a unique member based on a combination of defining_attrs
-    valid_defining_attrs = agg_tbl.columns.values.tolist()
-    invalid_defining_attrs = [
-        x for x in defining_attrs if x not in valid_defining_attrs
-    ]
-
-    if len(invalid_defining_attrs) != 0:
-        raise ValueError(
-            f"{', '.join(invalid_defining_attrs)} was not found; "
-            f"valid defining_attrs are {', '.join(valid_defining_attrs)}"
-        )
-
-    # create unique members
-    agg_tbl["member"] = agg_tbl[defining_attrs].astype(str).apply("__".join, axis=1)
-
-    # members are aggregated by reaction
-    membership_df = (
-        agg_tbl.reset_index()
-        .groupby(["model", table_schema["pk"]])
-        .agg(membership=("member", lambda x: (list(set(x)))))
+    # Step 2: Prepare the member table and validate its structure
+    agg_tbl, defining_fk = _prepare_member_table(
+        sbml_dfs_dict,
+        defined_by,
+        defined_lookup_tables,
+        table_schema,
+        defined_by_schema,
+        defining_attrs,
+        table,
     )

-    #
-
-    # the same entity
-    for i in range(membership_df.shape[0]):
-        members = membership_df["membership"].iloc[i]
-        if len(members) != len(set(members)):
-            _ = agg_tbl.reset_index().merge(
-                membership_df.iloc[i : i + 1],
-                how="inner",
-                left_on=[SOURCE_SPEC.MODEL, table_schema["pk"]],
-                right_index=True,
-            )
-
-            raise ValueError(
-                "Members were duplicated suggesting overmerging in the source "
-            )
-
-    membership_df["member_string"] = [
-        _create_member_string(x) for x in membership_df["membership"]
-    ]
-
-    membership_lookup = membership_df.reset_index()
+    # Step 3: Create lookup table for entity membership
+    membership_lookup = _create_membership_lookup(agg_tbl, table_schema)

-
-    consensus_entities
-
+    # Step 4: Create consensus entities and lookup table
+    consensus_entities, lookup_table = _create_entity_consensus(
+        membership_lookup, table_schema
     )

-
-        consensus_entities["new_id"], left_on="member_string", right_index=True
-    ).set_index([SOURCE_SPEC.MODEL, table_schema["pk"]])["new_id"]
-
-    # logging merges that occurred
+    # Step 5: Log merger information
     report_consensus_merges(
         lookup_table, table_schema, sbml_dfs_dict=sbml_dfs_dict, n_example_merges=5
     )

+    # Step 6: Get primary entity table and merge identifiers
     agg_primary_table = unnest_SBML_df(sbml_dfs_dict, table=table)

-    # add nameness_score as a measure of how-readable a possible name would be
-    # (this will help to select names which are more human readable after the merge)
-    agg_primary_table = utils._add_nameness_score_wrapper(
-        agg_primary_table, "label", table_schema
-    )
-
-    new_id_table = (
-        agg_primary_table.join(lookup_table)
-        .reset_index(drop=True)
-        .sort_values(["nameness_score"])
-        .rename(columns={"new_id": table_schema["pk"]})
-        .groupby(table_schema["pk"])
-        .first()[table_schema["vars"]]
-    )
-
-    # merge identifiers
     logger.info(f"Merging {table} identifiers")
-
-        agg_primary_table
-        .reset_index(drop=True)
-        .rename(columns={"new_id": table_schema["pk"]})
-        .groupby(table_schema["pk"])[table_schema["id"]]
+    updated_identifiers = _merge_entity_identifiers(
+        agg_primary_table, lookup_table, table_schema
     )

-    #
-
-
-    # add merged identifiers back to new_id table overwriting existing ids
-    new_id_table = new_id_table.drop(table_schema["id"], axis=1).merge(
-        updated_identifiers, left_index=True, right_index=True
+    # Step 7: Create consensus table with merged entities
+    new_id_table = _create_consensus_table(
+        agg_primary_table, lookup_table, updated_identifiers, table_schema
     )

+    # Step 8: Add source information if present
     if "source" in table_schema.keys():
         logger.info(f"Merging {table} sources")

-        #
+        # Track the model(s) that each entity came from
         new_sources = create_consensus_sources(
             agg_primary_table.merge(lookup_table, left_index=True, right_index=True),
             lookup_table,
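The membership-based merge above (two reactions are identical when their full sets of compartmentalized species and stoichiometries match) can be pictured with a small pandas groupby on a canonical member string. The sketch below is illustrative only and assumes, as the diff suggests, that member fields are joined with "__"; the sorting and "|" join are details chosen here, not napistu's exact encoding.

import pandas as pd

reaction_species = pd.DataFrame(
    {
        "model": ["a", "a", "b", "b"],
        "r_id": ["R1", "R1", "R7", "R7"],
        "sc_id": ["SC1", "SC2", "SC1", "SC2"],
        "stoichiometry": [-1, 1, -1, 1],
    }
)

# Canonical, order-independent membership string per (model, reaction)
member_strings = (
    reaction_species.assign(
        member=lambda df: df["sc_id"] + "__" + df["stoichiometry"].astype(str)
    )
    .groupby(["model", "r_id"])["member"]
    .apply(lambda members: "|".join(sorted(members)))
)
# R1 (model a) and R7 (model b) yield the same string, so they would be merged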
@@ -1190,6 +903,163 @@ def report_consensus_merges(
     return None


+def _create_entity_lookup_table(
+    agg_table_harmonized: pd.DataFrame, table_schema: dict
+) -> pd.Series:
+    """
+    Create a lookup table mapping original entity IDs to new consensus IDs.
+
+    Parameters:
+    ----------
+    agg_table_harmonized: pd.DataFrame
+        Table with cluster assignments for each entity
+    table_schema: dict
+        Schema for the table
+
+    Returns:
+    ----------
+    pd.Series
+        Lookup table mapping old entity IDs to new consensus IDs
+    """
+    # Create a new ID based on cluster number and entity type
+    agg_table_harmonized["new_id"] = sbml_dfs_utils.id_formatter(
+        agg_table_harmonized["cluster"], table_schema["pk"]
+    )
+
+    # Return the lookup series
+    return agg_table_harmonized["new_id"]
+
+
+def _prepare_consensus_table(
+    agg_table_harmonized: pd.DataFrame,
+    table_schema: dict,
+    cluster_consensus_identifiers: pd.DataFrame,
+) -> pd.DataFrame:
+    """
+    Prepare a consensus table with one row per unique entity.
+
+    Parameters:
+    ----------
+    agg_table_harmonized: pd.DataFrame
+        Table with nameness scores and cluster assignments
+    table_schema: dict
+        Schema for the table
+    cluster_consensus_identifiers: pd.DataFrame
+        Consensus identifiers for each cluster
+
+    Returns:
+    ----------
+    pd.DataFrame
+        New consensus table with merged entities
+    """
+    # Sort by nameness score and keep one row per new entity ID
+    agg_table_reduced = (
+        agg_table_harmonized.reset_index(drop=True)
+        .sort_values(["nameness_score"])
+        .rename(columns={"new_id": table_schema["pk"]})
+        .groupby(table_schema["pk"])
+        .first()
+        .drop("nameness_score", axis=1)
+    )
+
+    # Join in the consensus identifiers and drop the temporary cluster column
+    new_id_table = (
+        agg_table_reduced.drop(table_schema["id"], axis=1)
+        .merge(cluster_consensus_identifiers, left_on="cluster", right_index=True)
+        .drop("cluster", axis=1)
+    )
+
+    return new_id_table
+
+
+def _add_consensus_sources(
+    new_id_table: pd.DataFrame,
+    agg_table_harmonized: pd.DataFrame,
+    lookup_table: pd.Series,
+    table_schema: dict,
+    pw_index: indices.PWIndex | None,
+) -> pd.DataFrame:
+    """
+    Add source information to the consensus table.
+
+    Parameters:
+    ----------
+    new_id_table: pd.DataFrame
+        Consensus table without source information
+    agg_table_harmonized: pd.DataFrame
+        Original table with cluster assignments
+    lookup_table: pd.Series
+        Maps old IDs to new consensus IDs
+    table_schema: dict
+        Schema for the table
+    pw_index: indices.PWIndex | None
+        An index of all tables being aggregated
+
+    Returns:
+    ----------
+    pd.DataFrame
+        Consensus table with source information added
+    """
+    if type(pw_index) is not indices.PWIndex:
+        raise ValueError(
+            f"pw_index must be provided as a indices.PWIndex if there is a source but was type {type(pw_index)}"
+        )
+
+    # Track the model(s) that each entity came from
+    new_sources = create_consensus_sources(
+        agg_table_harmonized, lookup_table, table_schema, pw_index
+    )
+    assert isinstance(new_sources, pd.Series)
+
+    # Add the sources to the consensus table
+    updated_table = new_id_table.drop(table_schema[SOURCE_SPEC.SOURCE], axis=1).merge(
+        new_sources, left_index=True, right_index=True
+    )
+
+    return updated_table
+
+
+def _validate_consensus_table(
+    new_id_table: pd.DataFrame, sbml_df: pd.DataFrame
+) -> None:
+    """
+    Validate that the new consensus table has the same structure as the original.
+
+    Parameters:
+    ----------
+    new_id_table: pd.DataFrame
+        Newly created consensus table
+    sbml_df: pd.DataFrame
+        Original table from which consensus was built
+
+    Raises:
+    ------
+    ValueError
+        If index names or columns don't match
+    """
+    # Check that the index names match
+    if set(sbml_df.index.names).difference({SOURCE_SPEC.MODEL}) != set(
+        new_id_table.index.names
+    ):
+        raise ValueError(
+            f"The newly constructed id table's index does not match the inputs.\n"
+            f"Expected index names: {sbml_df.index.names}\n"
+            f"Actual index names: {new_id_table.index.names}"
+        )
+
+    # Check that the columns match
+    if set(sbml_df) != set(new_id_table.columns):
+        missing_in_new = set(sbml_df) - set(new_id_table.columns)
+        extra_in_new = set(new_id_table.columns) - set(sbml_df)
+        raise ValueError(
+            "The newly constructed id table's variables do not match the inputs.\n"
+            f"Expected columns: {list(sbml_df.columns)}\n"
+            f"Actual columns: {list(new_id_table.columns)}\n"
+            f"Missing in new: {missing_in_new}\n"
+            f"Extra in new: {extra_in_new}"
+        )
+
+
 def merge_entity_data(
     sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
     lookup_table: pd.Series,
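The "nameness score" pattern used by _prepare_consensus_table and _create_consensus_table (sort by a readability score, then keep the first row per consensus ID) is a standard pandas idiom. A toy version follows; the scoring rule shown (label length) is an assumption for illustration, not napistu's actual _add_nameness_score_wrapper logic.

import pandas as pd

candidates = pd.DataFrame(
    {
        "s_id": ["S_new_1", "S_new_1"],
        "s_name": ["HRAS", "hras_homolog_2_predicted"],
    }
)

# Assumed toy score: shorter, cleaner labels score lower (i.e., better)
candidates["nameness_score"] = candidates["s_name"].str.len()

best_names = (
    candidates.sort_values("nameness_score")
    .groupby("s_id")
    .first()
    .drop(columns="nameness_score")
)
# Keeps "HRAS" as the representative name for the merged entity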
@@ -1232,35 +1102,619 @@ def merge_entity_data(
|
|
1232
1102
|
return entity_data
|
1233
1103
|
|
1234
1104
|
|
1235
|
-
def
|
1236
|
-
|
1237
|
-
|
1238
|
-
|
1239
|
-
|
1240
|
-
primarily used as an input for construct_consensus_model
|
1241
|
-
|
1242
|
-
Returns:
|
1243
|
-
None
|
1244
|
-
|
1105
|
+
def _create_consensus_entities(
|
1106
|
+
sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
|
1107
|
+
pw_index: indices.PWIndex,
|
1108
|
+
defining_biological_qualifiers: list[str],
|
1109
|
+
) -> tuple[dict, dict]:
|
1245
1110
|
"""
|
1111
|
+
Create consensus entities for all primary tables in the model.
|
1246
1112
|
|
1247
|
-
|
1248
|
-
|
1249
|
-
return None
|
1113
|
+
This helper function creates consensus compartments, species, compartmentalized species,
|
1114
|
+
reactions, and reaction species by finding shared entities across source models.
|
1250
1115
|
|
1116
|
+
Parameters:
|
1117
|
+
----------
|
1118
|
+
sbml_dfs_dict: dict{cpr.SBML_dfs}
|
1119
|
+
A dictionary of SBML_dfs from different models
|
1120
|
+
pw_index: indices.PWIndex
|
1121
|
+
An index of all tables being aggregated
|
1122
|
+
defining_biological_qualifiers: list[str]
|
1123
|
+
Biological qualifier terms that define distinct entities
|
1251
1124
|
|
1252
|
-
|
1253
|
-
|
1254
|
-
|
1255
|
-
|
1125
|
+
Returns:
|
1126
|
+
----------
|
1127
|
+
tuple:
|
1128
|
+
- dict of consensus entities tables
|
1129
|
+
- dict of lookup tables
|
1130
|
+
"""
|
1131
|
+
# Step 1: Compartments
|
1132
|
+
logger.info("Defining compartments based on unique ids")
|
1133
|
+
comp_consensus_entities, comp_lookup_table = construct_meta_entities_identifiers(
|
1134
|
+
sbml_dfs_dict=sbml_dfs_dict, pw_index=pw_index, table="compartments"
|
1135
|
+
)
|
1256
1136
|
|
1257
|
-
|
1258
|
-
|
1137
|
+
# Step 2: Species
|
1138
|
+
logger.info("Defining species based on unique ids")
|
1139
|
+
spec_consensus_entities, spec_lookup_table = construct_meta_entities_identifiers(
|
1140
|
+
sbml_dfs_dict=sbml_dfs_dict,
|
1141
|
+
pw_index=pw_index,
|
1142
|
+
table=SBML_DFS.SPECIES,
|
1143
|
+
defining_biological_qualifiers=defining_biological_qualifiers,
|
1144
|
+
)
|
1259
1145
|
|
1260
|
-
|
1261
|
-
|
1146
|
+
# Step 3: Compartmentalized species
|
1147
|
+
logger.info(
|
1148
|
+
"Defining compartmentalized species based on unique species x compartments"
|
1262
1149
|
)
|
1263
|
-
|
1150
|
+
compspec_consensus_instances, compspec_lookup_table = construct_meta_entities_fk(
|
1151
|
+
sbml_dfs_dict,
|
1152
|
+
pw_index,
|
1153
|
+
table=SBML_DFS.COMPARTMENTALIZED_SPECIES,
|
1154
|
+
fk_lookup_tables={
|
1155
|
+
SBML_DFS.C_ID: comp_lookup_table,
|
1156
|
+
SBML_DFS.S_ID: spec_lookup_table,
|
1157
|
+
},
|
1158
|
+
)
|
1159
|
+
|
1160
|
+
# Step 4: Reactions
|
1161
|
+
logger.info(
|
1162
|
+
"Define reactions based on membership of identical compartmentalized species"
|
1163
|
+
)
|
1164
|
+
rxn_consensus_species, rxn_lookup_table = construct_meta_entities_members(
|
1165
|
+
sbml_dfs_dict,
|
1166
|
+
pw_index,
|
1167
|
+
table=SBML_DFS.REACTIONS,
|
1168
|
+
defined_by=SBML_DFS.REACTION_SPECIES,
|
1169
|
+
defined_lookup_tables={SBML_DFS.SC_ID: compspec_lookup_table},
|
1170
|
+
defining_attrs=[SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY],
|
1171
|
+
)
|
1172
|
+
|
1173
|
+
logger.info("Annotating reversibility based on merged reactions")
|
1174
|
+
rxn_consensus_species = _resolve_reversibility(
|
1175
|
+
sbml_dfs_dict, rxn_consensus_species, rxn_lookup_table
|
1176
|
+
)
|
1177
|
+
|
1178
|
+
# Step 5: Reaction species
|
1179
|
+
logger.info("Define reaction species based on reactions")
|
1180
|
+
rxnspec_consensus_instances, rxnspec_lookup_table = construct_meta_entities_fk(
|
1181
|
+
sbml_dfs_dict,
|
1182
|
+
pw_index,
|
1183
|
+
table=SBML_DFS.REACTION_SPECIES,
|
1184
|
+
fk_lookup_tables={
|
1185
|
+
SBML_DFS.R_ID: rxn_lookup_table,
|
1186
|
+
SBML_DFS.SC_ID: compspec_lookup_table,
|
1187
|
+
},
|
1188
|
+
# retain species with different roles
|
1189
|
+
extra_defining_attrs=[SBML_DFS.SBO_TERM],
|
1190
|
+
)
|
1191
|
+
|
1192
|
+
consensus_entities = {
|
1193
|
+
SBML_DFS.COMPARTMENTS: comp_consensus_entities,
|
1194
|
+
SBML_DFS.SPECIES: spec_consensus_entities,
|
1195
|
+
SBML_DFS.COMPARTMENTALIZED_SPECIES: compspec_consensus_instances,
|
1196
|
+
SBML_DFS.REACTIONS: rxn_consensus_species,
|
1197
|
+
SBML_DFS.REACTION_SPECIES: rxnspec_consensus_instances,
|
1198
|
+
}
|
1199
|
+
|
1200
|
+
lookup_tables = {
|
1201
|
+
SBML_DFS.COMPARTMENTS: comp_lookup_table,
|
1202
|
+
SBML_DFS.SPECIES: spec_lookup_table,
|
1203
|
+
SBML_DFS.COMPARTMENTALIZED_SPECIES: compspec_lookup_table,
|
1204
|
+
SBML_DFS.REACTIONS: rxn_lookup_table,
|
1205
|
+
SBML_DFS.REACTION_SPECIES: rxnspec_lookup_table,
|
1206
|
+
}
|
1207
|
+
|
1208
|
+
return consensus_entities, lookup_tables
|
1209
|
+
|
1210
|
+
|
1211
|
+
def _add_entity_data(
|
1212
|
+
sbml_dfs: sbml_dfs_core.SBML_dfs,
|
1213
|
+
sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
|
1214
|
+
lookup_tables: dict,
|
1215
|
+
) -> sbml_dfs_core.SBML_dfs:
|
1216
|
+
"""
|
1217
|
+
Add entity data from component models to the consensus model.
|
1218
|
+
|
1219
|
+
Parameters:
|
1220
|
+
----------
|
1221
|
+
sbml_dfs: sbml_dfs_core.SBML_dfs
|
1222
|
+
The consensus model being built
|
1223
|
+
sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]
|
1224
|
+
A dictionary of SBML_dfs from different models
|
1225
|
+
lookup_tables: dict
|
1226
|
+
Dictionary of lookup tables for translating between old and new entity IDs
|
1227
|
+
|
1228
|
+
Returns:
|
1229
|
+
----------
|
1230
|
+
sbml_dfs_core.SBML_dfs
|
1231
|
+
The updated consensus model
|
1232
|
+
"""
|
1233
|
+
# Add species data
|
1234
|
+
consensus_species_data = merge_entity_data(
|
1235
|
+
sbml_dfs_dict,
|
1236
|
+
lookup_table=lookup_tables[SBML_DFS.SPECIES],
|
1237
|
+
table=SBML_DFS.SPECIES,
|
1238
|
+
)
|
1239
|
+
for k in consensus_species_data.keys():
|
1240
|
+
sbml_dfs.add_species_data(k, consensus_species_data[k])
|
1241
|
+
|
1242
|
+
# Add reactions data
|
1243
|
+
consensus_reactions_data = merge_entity_data(
|
1244
|
+
sbml_dfs_dict,
|
1245
|
+
lookup_table=lookup_tables[SBML_DFS.REACTIONS],
|
1246
|
+
table=SBML_DFS.REACTIONS,
|
1247
|
+
)
|
1248
|
+
for k in consensus_reactions_data.keys():
|
1249
|
+
sbml_dfs.add_reactions_data(k, consensus_reactions_data[k])
|
1250
|
+
|
1251
|
+
return sbml_dfs
|
1252
|
+
|
1253
|
+
|
1254
|
+
def _prepare_member_table(
|
1255
|
+
sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
|
1256
|
+
defined_by: str,
|
1257
|
+
defined_lookup_tables: dict,
|
1258
|
+
table_schema: dict,
|
1259
|
+
defined_by_schema: dict,
|
1260
|
+
defining_attrs: list[str],
|
1261
|
+
table: str = SBML_DFS.REACTIONS,
|
1262
|
+
) -> tuple[pd.DataFrame, str]:
|
1263
|
+
"""
|
1264
|
+
Prepare a table of members and validate their structure.
|
1265
|
+
|
1266
|
+
Parameters:
|
1267
|
+
----------
|
1268
|
+
sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]
|
1269
|
+
Dictionary of SBML_dfs from different models
|
1270
|
+
defined_by: str
|
1271
|
+
Name of the table whose entries define membership
|
1272
|
+
defined_lookup_tables: dict
|
1273
|
+
Lookup tables for updating IDs
|
1274
|
+
table_schema: dict
|
1275
|
+
Schema for the main table
|
1276
|
+
defined_by_schema: dict
|
1277
|
+
Schema for the defining table
|
1278
|
+
defining_attrs: list[str]
|
1279
|
+
Attributes that define a unique member
|
1280
|
+
table: str
|
1281
|
+
Name of the main table (default: REACTIONS)
|
1282
|
+
|
1283
|
+
Returns:
|
1284
|
+
----------
|
1285
|
+
tuple:
|
1286
|
+
- Updated aggregated table with member strings
|
1287
|
+
- Name of the foreign key
|
1288
|
+
"""
|
1289
|
+
# Combine models into a single table
|
1290
|
+
agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=defined_by)
|
1291
|
+
|
1292
|
+
# Update IDs using previously created lookup tables
|
1293
|
+
for k in defined_lookup_tables.keys():
|
1294
|
+
agg_tbl = (
|
1295
|
+
agg_tbl.merge(
|
1296
|
+
defined_lookup_tables[k],
|
1297
|
+
left_on=[SOURCE_SPEC.MODEL, k],
|
1298
|
+
right_index=True,
|
1299
|
+
)
|
1300
|
+
.drop(k, axis=1)
|
1301
|
+
.rename(columns={"new_id": k})
|
1302
|
+
)
|
1303
|
+
|
1304
|
+
# Identify the foreign key
|
1305
|
+
defining_fk = set(defined_by_schema["fk"]).difference({table_schema["pk"]})
|
1306
|
+
|
1307
|
+
if (
|
1308
|
+
len(defining_fk) != 1
|
1309
|
+
or len(defining_fk.intersection(set(defined_by_schema["fk"]))) != 1
|
1310
|
+
):
|
1311
|
+
raise ValueError(
|
1312
|
+
f"A foreign key could not be found in {defined_by} which was a primary key in {table}"
|
1313
|
+
)
|
1314
|
+
else:
|
1315
|
+
defining_fk = list(defining_fk)[0]
|
1316
|
+
|
1317
|
+
# Validate defining attributes
|
1318
|
+
valid_defining_attrs = agg_tbl.columns.values.tolist()
|
1319
|
+
invalid_defining_attrs = [
|
1320
|
+
x for x in defining_attrs if x not in valid_defining_attrs
|
1321
|
+
]
|
1322
|
+
|
1323
|
+
if len(invalid_defining_attrs) != 0:
|
1324
|
+
raise ValueError(
|
1325
|
+
f"{', '.join(invalid_defining_attrs)} was not found; "
|
1326
|
+
f"valid defining_attrs are {', '.join(valid_defining_attrs)}"
|
1327
|
+
)
|
1328
|
+
|
1329
|
+
# Create unique member strings
|
1330
|
+
agg_tbl["member"] = agg_tbl[defining_attrs].astype(str).apply("__".join, axis=1)
|
1331
|
+
|
1332
|
+
return agg_tbl, defining_fk
|
1333
|
+
|
1334
|
+
|
1335
|
+
def _create_membership_lookup(
|
1336
|
+
agg_tbl: pd.DataFrame, table_schema: dict
|
1337
|
+
) -> pd.DataFrame:
|
1338
|
+
"""
|
1339
|
+
Create a lookup table for entity membership.
|
1340
|
+
|
1341
|
+
Parameters:
|
1342
|
+
----------
|
1343
|
+
agg_tbl: pd.DataFrame
|
1344
|
+
Table with member information
|
1345
|
+
table_schema: dict
|
1346
|
+
Schema for the table
|
1347
|
+
|
1348
|
+
Returns:
|
1349
|
+
----------
|
1350
|
+
pd.DataFrame
|
1351
|
+
Lookup table mapping entity IDs to member strings
|
1352
|
+
"""
|
1353
|
+
# Group members by entity
|
1354
|
+
membership_df = (
|
1355
|
+
agg_tbl.reset_index()
|
1356
|
+
.groupby(["model", table_schema["pk"]])
|
1357
|
+
.agg(membership=("member", lambda x: (list(set(x)))))
|
1358
|
+
)
|
1359
|
+
|
1360
|
+
# Check for duplicated members within an entity
|
1361
|
+
for i in range(membership_df.shape[0]):
|
1362
|
+
members = membership_df["membership"].iloc[i]
|
1363
|
+
if len(members) != len(set(members)):
|
1364
|
+
raise ValueError(
|
1365
|
+
"Members were duplicated suggesting overmerging in the source"
|
1366
|
+
)
|
1367
|
+
|
1368
|
+
# Convert membership lists to strings for comparison
|
1369
|
+
membership_df["member_string"] = [
|
1370
|
+
_create_member_string(x) for x in membership_df["membership"]
|
1371
|
+
]
|
1372
|
+
|
1373
|
+
return membership_df.reset_index()
|
1374
|
+
|
1375
|
+
|
1376
|
+
def _create_entity_consensus(
|
1377
|
+
membership_lookup: pd.DataFrame, table_schema: dict
|
1378
|
+
) -> tuple[pd.DataFrame, pd.Series]:
|
1379
|
+
"""
|
1380
|
+
Create consensus entities based on membership.
|
1381
|
+
|
1382
|
+
Parameters:
|
1383
|
+
----------
|
1384
|
+
membership_lookup: pd.DataFrame
|
1385
|
+
Table mapping entities to their member strings
|
1386
|
+
table_schema: dict
|
1387
|
+
Schema for the table
|
1388
|
+
|
1389
|
+
Returns:
|
1390
|
+
----------
|
1391
|
+
tuple:
|
1392
|
+
- Consensus entities DataFrame
|
1393
|
+
- Lookup table mapping old IDs to new IDs
|
1394
|
+
"""
|
1395
|
+
# Group by member string to find entities with identical members
|
1396
|
+
consensus_entities = membership_lookup.groupby("member_string").first()
|
1397
|
+
|
1398
|
+
# Create new IDs for the consensus entities
|
1399
|
+
consensus_entities["new_id"] = sbml_dfs_utils.id_formatter(
|
1400
|
+
range(consensus_entities.shape[0]), table_schema["pk"]
|
1401
|
+
)
|
1402
|
+
|
1403
|
+
# Create lookup table mapping original entities to consensus entities
|
1404
|
+
lookup_table = membership_lookup.merge(
|
1405
|
+
consensus_entities["new_id"], left_on="member_string", right_index=True
|
1406
|
+
).set_index([SOURCE_SPEC.MODEL, table_schema["pk"]])["new_id"]
|
1407
|
+
|
1408
|
+
return consensus_entities, lookup_table
|
1409
|
+
|
1410
|
+
|
1411
|
+
def _merge_entity_identifiers(
|
1412
|
+
agg_primary_table: pd.DataFrame, lookup_table: pd.Series, table_schema: dict
|
1413
|
+
) -> pd.Series:
|
1414
|
+
"""
|
1415
|
+
Merge identifiers from multiple entities.
|
1416
|
+
|
1417
|
+
Parameters:
|
1418
|
+
----------
|
1419
|
+
agg_primary_table: pd.DataFrame
|
1420
|
+
Table of entities
|
1421
|
+
lookup_table: pd.Series
|
1422
|
+
Lookup table mapping old IDs to new IDs
|
1423
|
+
table_schema: dict
|
1424
|
+
Schema for the table
|
1425
|
+
|
1426
|
+
Returns:
|
1427
|
+
----------
|
1428
|
+
pd.Series
|
1429
|
+
Series mapping new IDs to merged identifier objects
|
1430
|
+
"""
|
1431
|
+
# Combine entities with the same consensus ID
|
1432
|
+
indexed_old_identifiers = (
|
1433
|
+
agg_primary_table.join(lookup_table)
|
1434
|
+
.reset_index(drop=True)
|
1435
|
+
.rename(columns={"new_id": table_schema["pk"]})
|
1436
|
+
.groupby(table_schema["pk"])[table_schema["id"]]
|
1437
|
+
)
|
1438
|
+
|
1439
|
+
# Merge identifier objects
|
1440
|
+
return indexed_old_identifiers.agg(identifiers.merge_identifiers)
|
1441
|
+
|
1442
|
+
|
1443
|
+
def _create_consensus_table(
|
1444
|
+
agg_primary_table: pd.DataFrame,
|
1445
|
+
lookup_table: pd.Series,
|
1446
|
+
updated_identifiers: pd.Series,
|
1447
|
+
table_schema: dict,
|
1448
|
+
) -> pd.DataFrame:
|
1449
|
+
"""
|
1450
|
+
Create a consensus table with merged entities.
|
1451
|
+
|
1452
|
+
Parameters:
|
1453
|
+
----------
|
1454
|
+
agg_primary_table: pd.DataFrame
|
1455
|
+
Table of entities
|
1456
|
+
lookup_table: pd.Series
|
1457
|
+
Lookup table mapping old IDs to new IDs
|
1458
|
+
updated_identifiers: pd.Series
|
1459
|
+
Series mapping new IDs to merged identifier objects
|
1460
|
+
table_schema: dict
|
1461
|
+
Schema for the table
|
1462
|
+
|
1463
|
+
Returns:
|
1464
|
+
----------
|
1465
|
+
pd.DataFrame
|
1466
|
+
Consensus table with one row per unique entity
|
1467
|
+
"""
|
1468
|
+
# Add nameness scores to help select representative names
|
1469
|
+
agg_primary_table_scored = utils._add_nameness_score_wrapper(
|
1470
|
+
agg_primary_table, "label", table_schema
|
1471
|
+
)
|
1472
|
+
|
1473
|
+
# Create a table with one row per consensus entity
|
1474
|
+
new_id_table = (
|
1475
|
+
agg_primary_table_scored.join(lookup_table)
|
1476
|
+
.reset_index(drop=True)
|
1477
|
+
.sort_values(["nameness_score"])
|
1478
|
+
.rename(columns={"new_id": table_schema["pk"]})
|
1479
|
+
.groupby(table_schema["pk"])
|
1480
|
+
.first()[table_schema["vars"]]
|
1481
|
+
)
|
1482
|
+
|
1483
|
+
# Replace identifiers with merged versions
|
1484
|
+
new_id_table = new_id_table.drop(table_schema["id"], axis=1).merge(
|
1485
|
+
updated_identifiers, left_index=True, right_index=True
|
1486
|
+
)
|
1487
|
+
|
1488
|
+
return new_id_table
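
An illustrative sketch (not part of the diff) of how one representative row is chosen per consensus entity; the length-based score below is a stand-in for utils._add_nameness_score_wrapper:

import pandas as pd

rows = pd.DataFrame(
    {
        "new_id": ["S00000001", "S00000001"],
        "s_name": ["ABC1", "abc1 homolog-like protein variant 2"],
    }
)

# Stand-in score: shorter, cleaner names score lower and are preferred
rows["nameness_score"] = rows["s_name"].str.len()

# Sort so the best-named row comes first, then keep one row per consensus id
representative = (
    rows.sort_values("nameness_score").groupby("new_id").first()[["s_name"]]
)
print(representative)  # S00000001 -> "ABC1"
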
|
1489
|
+
|
1490
|
+
|
1491
|
+
def _filter_identifiers_by_qualifier(
|
1492
|
+
meta_identifiers: pd.DataFrame, defining_biological_qualifiers: list[str]
|
1493
|
+
) -> pd.DataFrame:
|
1494
|
+
"""
|
1495
|
+
Filter identifiers to only include those with specific biological qualifiers.
|
1496
|
+
|
1497
|
+
Parameters:
|
1498
|
+
----------
|
1499
|
+
meta_identifiers: pd.DataFrame
|
1500
|
+
Table of identifiers
|
1501
|
+
defining_biological_qualifiers: list[str]
|
1502
|
+
List of biological qualifier types to keep
|
1503
|
+
|
1504
|
+
Returns:
|
1505
|
+
----------
|
1506
|
+
pd.DataFrame
|
1507
|
+
Filtered identifiers
|
1508
|
+
"""
|
1509
|
+
valid_identifiers = meta_identifiers.copy()
|
1510
|
+
return valid_identifiers[
|
1511
|
+
meta_identifiers[IDENTIFIERS.BQB].isin(defining_biological_qualifiers)
|
1512
|
+
]
|
1513
|
+
|
1514
|
+
|
1515
|
+
def _handle_entries_without_identifiers(
|
1516
|
+
sbml_df: pd.DataFrame, valid_identifiers: pd.DataFrame
|
1517
|
+
) -> pd.DataFrame:
|
1518
|
+
"""
|
1519
|
+
Handle entities that don't have identifiers by adding dummy identifiers.
|
1520
|
+
|
1521
|
+
Parameters:
|
1522
|
+
----------
|
1523
|
+
sbml_df: pd.DataFrame
|
1524
|
+
Original table of entities
|
1525
|
+
valid_identifiers: pd.DataFrame
|
1526
|
+
Table of identifiers that passed filtering
|
1527
|
+
|
1528
|
+
Returns:
|
1529
|
+
----------
|
1530
|
+
pd.DataFrame
|
1531
|
+
Valid identifiers with dummy entries added
|
1532
|
+
"""
|
1533
|
+
# Find entries which no longer have any identifiers
|
1534
|
+
filtered_entries = sbml_df.reset_index().merge(
|
1535
|
+
valid_identifiers.reset_index(),
|
1536
|
+
left_on=sbml_df.index.names,
|
1537
|
+
right_on=sbml_df.index.names,
|
1538
|
+
how="outer",
|
1539
|
+
)[sbml_df.index.names + [IDENTIFIERS.IDENTIFIER]]
|
1540
|
+
|
1541
|
+
filtered_entries = filtered_entries[
|
1542
|
+
filtered_entries[IDENTIFIERS.IDENTIFIER].isnull()
|
1543
|
+
]
|
1544
|
+
|
1545
|
+
if filtered_entries.shape[0] == 0:
|
1546
|
+
return valid_identifiers
|
1547
|
+
|
1548
|
+
# Add dummy identifiers to these entries
|
1549
|
+
logger.warning(
|
1550
|
+
f"{filtered_entries.shape[0]} entries didn't possess identifiers and thus cannot be merged"
|
1551
|
+
)
|
1552
|
+
|
1553
|
+
filtered_entries[SOURCE_SPEC.ENTRY] = 0
|
1554
|
+
filtered_entries[IDENTIFIERS.ONTOLOGY] = "none"
|
1555
|
+
filtered_entries[IDENTIFIERS.IDENTIFIER] = [
|
1556
|
+
"dummy_value_" + str(val)
|
1557
|
+
for val in random.sample(range(1, 100000000), filtered_entries.shape[0])
|
1558
|
+
]
|
1559
|
+
filtered_entries[IDENTIFIERS.URL] = None
|
1560
|
+
filtered_entries[IDENTIFIERS.BQB] = None
|
1561
|
+
|
1562
|
+
filtered_entries = filtered_entries.set_index(
|
1563
|
+
sbml_df.index.names + [SOURCE_SPEC.ENTRY]
|
1564
|
+
)
|
1565
|
+
|
1566
|
+
# Combine original valid identifiers with dummy identifiers
|
1567
|
+
return pd.concat([valid_identifiers, filtered_entries])
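
An illustrative sketch (not part of the diff) of the orphan-handling step above; column names are simplified, and the unique dummy values keep identifier-less entries from ever clustering together:

import pandas as pd
import random

entities = pd.DataFrame({"s_id": ["S1", "S2", "S3"]}).set_index("s_id")
ids = pd.DataFrame({"s_id": ["S1"], "identifier": ["uniprot:P1"]})

# An outer merge exposes entities whose identifiers were all filtered out
merged = entities.reset_index().merge(ids, on="s_id", how="outer")
orphans = merged[merged["identifier"].isnull()].copy()

# Each orphan gets a unique dummy identifier so it forms its own cluster
orphans["ontology"] = "none"
orphans["identifier"] = [
    f"dummy_value_{v}" for v in random.sample(range(1, 10**8), orphans.shape[0])
]
print(pd.concat([merged[merged["identifier"].notnull()], orphans]))
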
|
1568
|
+
|
1569
|
+
|
1570
|
+
def _prepare_identifier_edgelist(
|
1571
|
+
valid_identifiers: pd.DataFrame, sbml_df: pd.DataFrame
|
1572
|
+
) -> pd.DataFrame:
|
1573
|
+
"""
|
1574
|
+
Prepare an edgelist for clustering identifiers.
|
1575
|
+
|
1576
|
+
Parameters:
|
1577
|
+
----------
|
1578
|
+
valid_identifiers: pd.DataFrame
|
1579
|
+
Table of identifiers
|
1580
|
+
sbml_df: pd.DataFrame
|
1581
|
+
Original table of entities
|
1582
|
+
|
1583
|
+
Returns:
|
1584
|
+
----------
|
1585
|
+
pd.DataFrame
|
1586
|
+
Edgelist connecting entities to their identifiers
|
1587
|
+
"""
|
1588
|
+
# Format identifiers as edgelist
|
1589
|
+
formatted_identifiers = utils.format_identifiers_as_edgelist(
|
1590
|
+
valid_identifiers, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
|
1591
|
+
)
|
1592
|
+
|
1593
|
+
# Create a unique tag for each entity from the original index
|
1594
|
+
indexed_species_tags = (
|
1595
|
+
formatted_identifiers.reset_index()
|
1596
|
+
.set_index(formatted_identifiers.index.names, drop=False)[sbml_df.index.names]
|
1597
|
+
.astype(str)
|
1598
|
+
.apply("__".join, axis=1)
|
1599
|
+
)
|
1600
|
+
formatted_identifiers.loc[:, "model_spec"] = indexed_species_tags
|
1601
|
+
|
1602
|
+
# Create edgelist that connects entities to identifiers
|
1603
|
+
id_edgelist = pd.concat(
|
1604
|
+
[
|
1605
|
+
formatted_identifiers[["ind", "id"]],
|
1606
|
+
# Add edges connecting model-specific instances to their identifiers
|
1607
|
+
formatted_identifiers[["model_spec", "id"]].rename(
|
1608
|
+
columns={"model_spec": "ind"}
|
1609
|
+
),
|
1610
|
+
]
|
1611
|
+
)
|
1612
|
+
|
1613
|
+
return id_edgelist
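
An illustrative sketch (not part of the diff) of how the edgelist drives clustering downstream; networkx is used here purely for illustration and is an assumption, not necessarily the graph library the package uses:

import pandas as pd
import networkx as nx  # assumption: any connected-components routine would do

# Edges connect model-specific entity tags to shared identifier nodes
id_edgelist = pd.DataFrame(
    {
        "ind": ["reactome__S1", "string__S9", "string__S9"],
        "id": ["uniprot__P1", "uniprot__P1", "chebi__2"],
    }
)

# Entities sharing any identifier land in the same connected component,
# which is what defines a consensus cluster
graph = nx.Graph()
graph.add_edges_from(id_edgelist.itertuples(index=False, name=None))
print(list(nx.connected_components(graph)))
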
|
1614
|
+
|
1615
|
+
|
1616
|
+
def _create_cluster_identifiers(
|
1617
|
+
meta_identifiers: pd.DataFrame,
|
1618
|
+
indexed_cluster: pd.Series,
|
1619
|
+
sbml_df: pd.DataFrame,
|
1620
|
+
ind_clusters: pd.DataFrame,
|
1621
|
+
table_schema: dict,
|
1622
|
+
) -> pd.DataFrame:
|
1623
|
+
"""
|
1624
|
+
Create identifier objects for each cluster.
|
1625
|
+
|
1626
|
+
Parameters
|
1627
|
+
----------
|
1628
|
+
meta_identifiers : pd.DataFrame
|
1629
|
+
All identifiers (including those filtered out by BQB)
|
1630
|
+
indexed_cluster : pd.Series
|
1631
|
+
Maps entity indices to cluster IDs
|
1632
|
+
sbml_df : pd.DataFrame
|
1633
|
+
Original table of entities
|
1634
|
+
ind_clusters : pd.DataFrame
|
1635
|
+
Cluster assignments from graph algorithm
|
1636
|
+
table_schema : dict
|
1637
|
+
Schema for the table, used to determine the correct identifier column name
|
1638
|
+
|
1639
|
+
Returns
|
1640
|
+
-------
|
1641
|
+
pd.DataFrame
|
1642
|
+
Table mapping clusters to their consensus identifiers, with the identifier column named according to the schema
|
1643
|
+
"""
|
1644
|
+
# Combine all identifiers with cluster assignments
|
1645
|
+
all_cluster_identifiers = meta_identifiers.reset_index().merge(
|
1646
|
+
indexed_cluster, left_on=sbml_df.index.names, right_index=True
|
1647
|
+
)
|
1648
|
+
|
1649
|
+
# Create an Identifiers object for each cluster
|
1650
|
+
cluster_consensus_identifiers = {
|
1651
|
+
k: identifiers.Identifiers(
|
1652
|
+
list(
|
1653
|
+
v[
|
1654
|
+
[
|
1655
|
+
IDENTIFIERS.ONTOLOGY,
|
1656
|
+
IDENTIFIERS.IDENTIFIER,
|
1657
|
+
IDENTIFIERS.URL,
|
1658
|
+
IDENTIFIERS.BQB,
|
1659
|
+
]
|
1660
|
+
]
|
1661
|
+
.T.to_dict()
|
1662
|
+
.values()
|
1663
|
+
)
|
1664
|
+
)
|
1665
|
+
for k, v in all_cluster_identifiers.groupby("cluster")
|
1666
|
+
}
|
1667
|
+
|
1668
|
+
# Handle clusters that don't have any identifiers
|
1669
|
+
catchup_clusters = {
|
1670
|
+
c: identifiers.Identifiers(list())
|
1671
|
+
for c in set(ind_clusters["cluster"].tolist()).difference(
|
1672
|
+
cluster_consensus_identifiers
|
1673
|
+
)
|
1674
|
+
}
|
1675
|
+
cluster_consensus_identifiers = {
|
1676
|
+
**cluster_consensus_identifiers,
|
1677
|
+
**catchup_clusters,
|
1678
|
+
}
|
1679
|
+
|
1680
|
+
# Convert to DataFrame with correct column name
|
1681
|
+
id_col = table_schema["id"]
|
1682
|
+
cluster_consensus_identifiers_df = pd.DataFrame(
|
1683
|
+
cluster_consensus_identifiers, index=[id_col]
|
1684
|
+
).T
|
1685
|
+
cluster_consensus_identifiers_df.index.name = "cluster"
|
1686
|
+
return cluster_consensus_identifiers_df
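
An illustrative sketch (not part of the diff) of assembling per-cluster identifier collections, including empty placeholders for clusters with no identifiers; plain lists of dicts stand in for napistu Identifiers objects:

import pandas as pd

assignments = pd.DataFrame(
    {
        "cluster": [0, 0, 2],
        "ontology": ["uniprot", "chebi", "uniprot"],
        "identifier": ["P1", "17234", "Q9"],
    }
)
all_clusters = {0, 1, 2}

# One identifier collection per cluster that has identifiers
per_cluster = {
    k: v[["ontology", "identifier"]].to_dict("records")
    for k, v in assignments.groupby("cluster")
}

# Clusters without identifiers still need an (empty) entry
per_cluster.update({c: [] for c in all_clusters - per_cluster.keys()})
print(per_cluster)  # {0: [...], 2: [...], 1: []}
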
|
1687
|
+
|
1688
|
+
|
1689
|
+
def _check_sbml_dfs_dict(sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]) -> None:
|
1690
|
+
"""Check models in SBML_dfs for problems which can be reported up-front
|
1691
|
+
|
1692
|
+
Args:
|
1693
|
+
sbml_dfs_dict (dict[str, sbml_dfs_core.SBML_dfs]): a dict of sbml_dfs models;
|
1694
|
+
primarily used as an input for construct_consensus_model
|
1695
|
+
|
1696
|
+
Returns:
|
1697
|
+
None
|
1698
|
+
|
1699
|
+
"""
|
1700
|
+
|
1701
|
+
for k, v in sbml_dfs_dict.items():
|
1702
|
+
_check_sbml_dfs(sbml_dfs=v, model_label=k)
|
1703
|
+
return None
|
1704
|
+
|
1705
|
+
|
1706
|
+
def _check_sbml_dfs(
|
1707
|
+
sbml_dfs: sbml_dfs_core.SBML_dfs, model_label: str, N_examples: int | str = 5
|
1708
|
+
) -> None:
|
1709
|
+
"""Check SBML_dfs for identifiers which are associated with different entities before a merge."""
|
1710
|
+
|
1711
|
+
ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
|
1712
|
+
defining_ids = ids[ids[IDENTIFIERS.BQB].isin(BQB_DEFINING_ATTRS)]
|
1713
|
+
|
1714
|
+
defining_identifier_counts = defining_ids.value_counts(
|
1715
|
+
[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
|
1716
|
+
)
|
1717
|
+
degenerate_defining_identities = (
|
1264
1718
|
defining_identifier_counts[defining_identifier_counts > 1]
|
1265
1719
|
.rename("N")
|
1266
1720
|
.reset_index()
|
@@ -1314,9 +1768,46 @@ def _validate_meta_identifiers(meta_identifiers: pd.DataFrame) -> None:
|
|
1314
1768
|
return None
|
1315
1769
|
|
1316
1770
|
|
1771
|
+
def _validate_meta_identifiers(meta_identifiers: pd.DataFrame) -> None:
|
1772
|
+
"""Flag cases where meta identifers are totally missing or BQB codes are not included"""
|
1773
|
+
|
1774
|
+
if meta_identifiers.shape[0] == 0:
|
1775
|
+
raise ValueError(
|
1776
|
+
'"meta_identifiers" was empty; some identifiers should be present'
|
1777
|
+
)
|
1778
|
+
|
1779
|
+
n_null = sum(meta_identifiers["bqb"].isnull())
|
1780
|
+
if n_null > 0:
|
1781
|
+
msg = f"{n_null} identifiers were missing a bqb code and will not be mergeable"
|
1782
|
+
logger.warning(msg)
|
1783
|
+
|
1784
|
+
return None
|
1785
|
+
|
1786
|
+
|
1787
|
+
def _update_foreign_keys(
|
1788
|
+
agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
|
1789
|
+
) -> pd.DataFrame:
|
1790
|
+
for fk in table_schema["fk"]:
|
1791
|
+
updated_fks = (
|
1792
|
+
agg_tbl[fk]
|
1793
|
+
.reset_index()
|
1794
|
+
.merge(
|
1795
|
+
fk_lookup_tables[fk], left_on=[SOURCE_SPEC.MODEL, fk], right_index=True
|
1796
|
+
)
|
1797
|
+
.drop(fk, axis=1)
|
1798
|
+
.rename(columns={"new_id": fk})
|
1799
|
+
.set_index(["model", table_schema["pk"]])
|
1800
|
+
)
|
1801
|
+
agg_tbl = agg_tbl.drop(columns=fk).join(updated_fks)
|
1802
|
+
|
1803
|
+
return agg_tbl
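
An illustrative sketch (not part of the diff) of the foreign-key rewrite above; table and column names ("sc_id", "c_id") are simplified stand-ins for the schema-driven names the function receives:

import pandas as pd

# Compartmentalized species point at per-model compartments via the "c_id" foreign key
agg_tbl = pd.DataFrame(
    {"model": ["m1", "m2"], "sc_id": ["SC1", "SC5"], "c_id": ["C1", "C9"]}
).set_index(["model", "sc_id"])

# Lookup from (model, old foreign key) to the consensus compartment id
fk_lookup = pd.Series(
    ["C00000001", "C00000001"],
    index=pd.MultiIndex.from_tuples([("m1", "C1"), ("m2", "C9")], names=["model", "c_id"]),
    name="new_id",
)

# Swap the old per-model foreign key for its consensus id
updated_fks = (
    agg_tbl["c_id"]
    .reset_index()
    .merge(fk_lookup, left_on=["model", "c_id"], right_index=True)
    .drop("c_id", axis=1)
    .rename(columns={"new_id": "c_id"})
    .set_index(["model", "sc_id"])
)
print(agg_tbl.drop(columns="c_id").join(updated_fks))
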
|
1804
|
+
|
1805
|
+
|
1317
1806
|
def _update_foreign_keys(
|
1318
1807
|
agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
|
1319
1808
|
) -> pd.DataFrame:
|
1809
|
+
"""Update one or more foreign keys based on old-to-new foreign key lookup table(s)."""
|
1810
|
+
|
1320
1811
|
for fk in table_schema["fk"]:
|
1321
1812
|
updated_fks = (
|
1322
1813
|
agg_tbl[fk]
|
@@ -1378,8 +1869,14 @@ def _resolve_reversibility(
|
|
1378
1869
|
SBML_DFS.R_ISREVERSIBLE, axis=1
|
1379
1870
|
).join(r_id_reversibility)
|
1380
1871
|
|
1381
|
-
|
1382
|
-
|
1872
|
+
if rxns_w_reversibility.shape[0] != rxn_consensus_species.shape[0]:
|
1873
|
+
raise ValueError(
|
1874
|
+
"rxns_w_reversibility and rxn_consensus_species must have the same number of rows"
|
1875
|
+
)
|
1876
|
+
if not all(rxns_w_reversibility[SBML_DFS.R_ISREVERSIBLE].isin([True, False])):
|
1877
|
+
raise ValueError(
|
1878
|
+
"All rxns_w_reversibility[R_ISREVERSIBLE] must be True or False"
|
1879
|
+
)
|
1383
1880
|
|
1384
1881
|
return rxns_w_reversibility
|
1385
1882
|
|