napistu-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
napistu/consensus.py
ADDED
@@ -0,0 +1,1557 @@
from __future__ import annotations

import logging
import os
import random

import pandas as pd
from tqdm import tqdm

from napistu import identifiers
from napistu import indices
from napistu import sbml_dfs_core
from napistu import sbml_dfs_utils
from napistu import source
from napistu import utils
from napistu.ingestion import sbml

from napistu.constants import SBML_DFS
from napistu.constants import IDENTIFIERS
from napistu.constants import SOURCE_SPEC
from napistu.constants import BQB_DEFINING_ATTRS

logger = logging.getLogger(__name__)
# set the level to show logger.info messages
logging.basicConfig(level=logging.DEBUG)


def construct_consensus_model(
    sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
    pw_index: indices.PWIndex,
    dogmatic: bool = True,
) -> sbml_dfs_core.SBML_dfs:
    """
    Construct Consensus Model

    Turn a dictionary of pathway models into a single consensus model by merging shared entities.

    Parameters:
    ----------
    sbml_dfs_dict: dict{cpr.SBML_dfs}
        A dictionary of SBML_dfs from different models
    pw_index: indices.PWIndex
        An index of all tables being aggregated
    dogmatic: bool
        If True then try to preserve genes, transcripts, and proteins as separate species. If False
        then try to merge them.

    Returns:
    ----------
    A cpr.SBML_dfs object containing the consensus model

    """

    logger.info("Reporting possible issues in component models")
    _check_sbml_dfs_dict(sbml_dfs_dict)
    assert isinstance(pw_index, indices.PWIndex)
    # select valid BQB attributes based on dogmatic flag
    defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(dogmatic)

    logger.info("Defining compartments based on unique ids")
    comp_consensus_entities, comp_lookup_table = construct_meta_entities_identifiers(
        sbml_dfs_dict=sbml_dfs_dict, pw_index=pw_index, table="compartments"
    )

    logger.info("Defining species based on unique ids")
    spec_consensus_entities, spec_lookup_table = construct_meta_entities_identifiers(
        sbml_dfs_dict=sbml_dfs_dict,
        pw_index=pw_index,
        table=SBML_DFS.SPECIES,
        defining_biological_qualifiers=defining_biological_qualifiers,
    )

    logger.info(
        "Defining compartmentalized species based on unique species x compartments"
    )
    compspec_consensus_instances, compspec_lookup_table = construct_meta_entities_fk(
        sbml_dfs_dict,
        pw_index,
        table=SBML_DFS.COMPARTMENTALIZED_SPECIES,
        fk_lookup_tables={
            SBML_DFS.C_ID: comp_lookup_table,
            SBML_DFS.S_ID: spec_lookup_table,
        },
    )

    logger.info(
        "Define reactions based on membership of identical compartmentalized species"
    )
    rxn_consensus_species, rxn_lookup_table = construct_meta_entities_members(
        sbml_dfs_dict,
        pw_index,
        table=SBML_DFS.REACTIONS,
        defined_by=SBML_DFS.REACTION_SPECIES,
        defined_lookup_tables={SBML_DFS.SC_ID: compspec_lookup_table},
        defining_attrs=[SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY],
    )

    logger.info("Annotating reversibility based on merged reactions")
    rxn_consensus_species = _resolve_reversibility(
        sbml_dfs_dict, rxn_consensus_species, rxn_lookup_table
    )

    # define reaction species with species
    logger.info("Define reaction species based on reactions")
    rxnspec_consensus_instances, rxnspec_lookup_table = construct_meta_entities_fk(
        sbml_dfs_dict,
        pw_index,
        table=SBML_DFS.REACTION_SPECIES,
        fk_lookup_tables={
            SBML_DFS.R_ID: rxn_lookup_table,
            SBML_DFS.SC_ID: compspec_lookup_table,
        },
        # retain species with different roles
        extra_defining_attrs=[SBML_DFS.SBO_TERM],
    )

    sbml_tbl_dict = {
        SBML_DFS.COMPARTMENTS: comp_consensus_entities,
        SBML_DFS.SPECIES: spec_consensus_entities,
        SBML_DFS.COMPARTMENTALIZED_SPECIES: compspec_consensus_instances,
        SBML_DFS.REACTIONS: rxn_consensus_species,
        SBML_DFS.REACTION_SPECIES: rxnspec_consensus_instances,
    }

    sbml_dfs = sbml_dfs_core.SBML_dfs(sbml_tbl_dict)  # type: ignore

    # add species and reactions data from component models
    consensus_species_data = merge_entity_data(
        sbml_dfs_dict, lookup_table=spec_lookup_table, table=SBML_DFS.SPECIES
    )
    for k in consensus_species_data.keys():
        sbml_dfs.add_species_data(k, consensus_species_data[k])

    consensus_reactions_data = merge_entity_data(
        sbml_dfs_dict, lookup_table=rxn_lookup_table, table=SBML_DFS.REACTIONS
    )
    for k in consensus_reactions_data.keys():
        sbml_dfs.add_reactions_data(k, consensus_reactions_data[k])

    return sbml_dfs


def construct_sbml_dfs_dict(
    pw_index: pd.DataFrame, strict: bool = True
) -> dict[str, sbml_dfs_core.SBML_dfs]:
    """
    Construct SBML DFs Dict

    Convert all models in the pathway index into SBML_dfs and add them to a dict.

    Parameters:
    pw_index: indices.PWIndex
        An index of all tables being aggregated
    strict (bool): if set to False, erroneous files are skipped with a warning. Default: True

    Returns:
        dict(sbml_dfs_core.SBML_dfs)

    """

    sbml_dfs_dict = dict()
    for i in tqdm(pw_index.index.index.tolist()):
        pw_entry = pw_index.index.loc[i]
        logger.info(f"processing {pw_entry[SOURCE_SPEC.NAME]}")

        sbml_path = os.path.join(pw_index.base_path, pw_entry[SOURCE_SPEC.FILE])
        try:
            sbml_obj = sbml.SBML(sbml_path)
            sbml_dfs_dict[pw_entry[SOURCE_SPEC.PATHWAY_ID]] = sbml_dfs_core.SBML_dfs(
                sbml_obj
            )
        except ValueError as e:
            if strict:
                raise e
            logger.warning(
                f"{pw_entry[SOURCE_SPEC.NAME]} not successfully loaded:", exc_info=True
            )
    return sbml_dfs_dict
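

# A hypothetical end-to-end usage sketch for the two constructors above.
# The pathway-index path and the PWIndex constructor arguments are
# assumptions for illustration, not guarantees about the napistu API:
#
#     from napistu import consensus, indices
#
#     pw_index = indices.PWIndex("/path/to/pw_index.tsv")
#     sbml_dfs_dict = consensus.construct_sbml_dfs_dict(pw_index, strict=False)
#     consensus_model = consensus.construct_consensus_model(
#         sbml_dfs_dict, pw_index, dogmatic=True
#     )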


def unnest_SBML_df(
    sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], table: str
) -> pd.DataFrame:
    """
    Unnest SBML_dfs

    Merge corresponding tables from a set of models

    sbml_dfs_dict: dict{cpr.SBML_dfs}
        A dictionary of SBML_dfs from different models
    table: str
        A table to aggregate (e.g., species, reactions, compartments)

    Returns:
        pd.DataFrame, a table with a MultiIndex of model and entity_id

    """

    # check that all sbml_dfs have the same schema
    _test_same_schema(sbml_dfs_dict)
    table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]

    df_list = [
        getattr(sbml_dfs_dict[x], table).assign(model=x) for x in sbml_dfs_dict.keys()
    ]
    df_concat = pd.concat(df_list)

    # add model to index columns
    if df_concat.size != 0:
        df_concat = df_concat.reset_index().set_index(
            [SOURCE_SPEC.MODEL, table_schema["pk"]]
        )

    return df_concat


def construct_meta_entities_identifiers(
    sbml_dfs_dict: dict,
    pw_index: indices.PWIndex,
    table: str,
    fk_lookup_tables: dict = {},
    defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
) -> tuple[pd.DataFrame, pd.Series]:
    """
    Construct Meta Entities Defined by Identifiers

    Aggregating across one entity type for a set of pathway models, merge entities which share identifiers.

    Parameters:
    ----------
    sbml_dfs_dict (dict{"model": cpr.SBML_dfs}):
        A dictionary of cpr.SBML_dfs
    pw_index (indices.PWIndex):
        An index of all tables being aggregated
    table (str):
        A table/entity set from the sbml_dfs to work with
    fk_lookup_tables (dict):
        Dictionary containing lookup tables for all foreign keys used by the table
    defining_biological_qualifiers (list[str]):
        BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
        permissive settings could merge homologs or different forms of the same gene.

    Returns:
    ----------
    new_id_table: pd.DataFrame
        Matching the schema of one of the tables within sbml_dfs_dict
    lookup_table: pd.Series
        Matches the index of the aggregated entities to new_ids

    """

    # combine sbml_dfs by adding model to the index and concatenating all dfs
    agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=table)

    # since all sbml_dfs have the same schema pull out one schema for reference
    table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]

    # update foreign keys using provided lookup tables
    if "fk" in table_schema.keys():
        agg_tbl = _update_foreign_keys(agg_tbl, table_schema, fk_lookup_tables)

    new_id_table, lookup_table = reduce_to_consensus_ids(
        sbml_df=agg_tbl,
        table_schema=table_schema,
        pw_index=pw_index,
        defining_biological_qualifiers=defining_biological_qualifiers,
    )

    # logging merges that occurred
    report_consensus_merges(
        lookup_table, table_schema, agg_tbl=agg_tbl, n_example_merges=5
    )

    return new_id_table, lookup_table


def reduce_to_consensus_ids(
    sbml_df: pd.DataFrame,
    table_schema: dict,
    pw_index: indices.PWIndex | None = None,
    defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
) -> tuple[pd.DataFrame, pd.Series]:
    """
    Reduce to Consensus

    Reduce a table of entities to unique entries based on identifiers.

    Parameters:
    ----------
    sbml_df: pd.DataFrame
        One type of entity from sbml_dfs_dict expanded to include model in
        its index, as produced by unnest_SBML_df(sbml_dfs_dict)
    table_schema: dict
        Schema for the table sbml_df
    pw_index: indices.PWIndex
        An index of all tables being aggregated
    defining_biological_qualifiers: list(str)
        A list of biological qualifier types which define distinct entities

    Returns:
    ----------
    new_id_table: pd.DataFrame
        Matching the schema of one of the tables within sbml_dfs_dict
    lookup_table: pd.Series
        Matches the index of the aggregated entities to new_ids
    """

    indexed_cluster, cluster_consensus_identifiers = build_consensus_identifiers(
        sbml_df, table_schema, defining_biological_qualifiers
    )

    # add cluster to reduce non-identifier attributes
    agg_table_harmonized = sbml_df.join(indexed_cluster)
    # create a new numbering schema off of cluster #s and id type
    # print(agg_table_harmonized["cluster"])
    # print(table_schema["pk"])

    agg_table_harmonized["new_id"] = sbml_dfs_utils.id_formatter(
        agg_table_harmonized["cluster"], table_schema["pk"]
    )

    lookup_table = agg_table_harmonized["new_id"]

    # add nameness_score as a measure of how readable a possible name would be
    # (this will help to select names which are more human readable after the merge)
    agg_table_harmonized = utils._add_nameness_score_wrapper(
        agg_table_harmonized, "label", table_schema
    )

    # reduce to one row per new_id and set as the primary key of the source table
    agg_table_reduced = (
        agg_table_harmonized.reset_index(drop=True)
        .sort_values(["nameness_score"])
        .rename(columns={"new_id": table_schema["pk"]})
        .groupby(table_schema["pk"])
        .first()
        .drop("nameness_score", axis=1)
    )

    new_id_table = (
        agg_table_reduced.drop(table_schema["id"], axis=1)
        .merge(cluster_consensus_identifiers, left_on="cluster", right_index=True)
        .drop("cluster", axis=1)
    )

    if "source" in table_schema.keys():
        if type(pw_index) is not indices.PWIndex:
            raise ValueError(
                f"pw_index must be provided as an indices.PWIndex if there is a source but was type {type(pw_index)}"
            )

        # track the model(s) that each entity came from
        new_sources = create_consensus_sources(
            agg_table_harmonized, lookup_table, table_schema, pw_index
        )
        assert isinstance(new_sources, pd.Series)

        new_id_table = new_id_table.drop(
            table_schema[SOURCE_SPEC.SOURCE], axis=1
        ).merge(new_sources, left_index=True, right_index=True)

    # check that the index name and variables match the source
    if set(sbml_df.index.names).difference({SOURCE_SPEC.MODEL}) != set(
        new_id_table.index.names
    ):
        raise ValueError(
            "The newly constructed id table's index does not match the inputs"
        )

    if set(sbml_df) != set(new_id_table.columns):
        raise ValueError(
            "The newly constructed id table's variables do not match the inputs"
        )

    return new_id_table, lookup_table


def build_consensus_identifiers(
    sbml_df: pd.DataFrame,
    table_schema: dict,
    defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
) -> tuple[pd.Series, pd.DataFrame]:
    """
    Build Consensus Identifiers

    Take a set of entities spanning multiple models and find all unique entities.

    Defining attributes provided in defining_biological_qualifiers will
    be used for grouping; other identifiers will be added back at the end.

    Parameters:
    ----------
    sbml_df: pd.DataFrame
        One type of entity from sbml_dfs_dict expanded to include model in its index,
        as produced by unnest_SBML_df(sbml_dfs_dict)
    table_schema: dict
        Schema for the table sbml_df
    defining_biological_qualifiers: [str]
        A list of biological qualifier types which should be used for grouping

    Returns:
    ----------
    indexed_cluster: pd.Series
        Maps the index from sbml_df onto a set of clusters which define unique entities
    cluster_consensus_identifiers_df: pd.DataFrame
        Maps an index of clusters onto a consensus cpr.identifiers.Identifiers object
    """

    # create a table which is one row per entry
    meta_identifiers = sbml_dfs_utils.unnest_identifiers(sbml_df, table_schema["id"])
    # check the identifiers for missing attributes
    _validate_meta_identifiers(meta_identifiers)

    # remove some biological qualifier types to avoid over-grouping

    valid_identifiers = meta_identifiers.copy()
    valid_identifiers = valid_identifiers[
        meta_identifiers[IDENTIFIERS.BQB].isin(defining_biological_qualifiers)
    ]

    # catch entries which no longer have any identifiers
    # add a dummy identifier to these which will still uniquely tag them

    filtered_entries = sbml_df.reset_index().merge(
        valid_identifiers.reset_index(),
        left_on=sbml_df.index.names,
        right_on=sbml_df.index.names,
        how="outer",
    )[sbml_df.index.names + [IDENTIFIERS.IDENTIFIER]]
    filtered_entries = filtered_entries[
        filtered_entries[IDENTIFIERS.IDENTIFIER].isnull()
    ]
    if filtered_entries.shape[0] != 0:
        logger.warning(
            f"{filtered_entries.shape[0]} entries didn't possess identifiers and thus cannot be merged"
        )

    filtered_entries[SOURCE_SPEC.ENTRY] = 0
    filtered_entries[IDENTIFIERS.ONTOLOGY] = "none"
    filtered_entries[IDENTIFIERS.ONTOLOGY] = [
        "dummy_value_" + str(val)
        for val in random.sample(range(1, 100000000), filtered_entries.shape[0])
    ]
    filtered_entries[IDENTIFIERS.URL] = None
    filtered_entries[IDENTIFIERS.BQB] = None

    filtered_entries = filtered_entries.set_index(
        sbml_df.index.names + [SOURCE_SPEC.ENTRY]
    )

    valid_identifiers = pd.concat([valid_identifiers, filtered_entries])

    # combine multi-index into a single variable; combine ontology + identifiers as a single variable
    valid_identifiers = utils.format_identifiers_as_edgelist(
        valid_identifiers, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
    )

    # create a unique tag for a species from the original index
    indexed_species_tags = (
        valid_identifiers.reset_index()
        .set_index(valid_identifiers.index.names, drop=False)[sbml_df.index.names]
        .astype(str)
        .apply("__".join, axis=1)
    )
    valid_identifiers.loc[:, "model_spec"] = indexed_species_tags

    # convert index-identifier edge list into a network
    # doing this will allow any entities with matching ontologies to be
    # added to the same cluster so that they can be merged
    id_edgelist = pd.concat(
        [
            valid_identifiers[["ind", "id"]],
            # add id-ind edges so that identifiers corresponding to the same entity are grouped
            # these entries will be discarded when merging the results back in by "ind"
            valid_identifiers[["model_spec", "id"]].rename(
                columns={"model_spec": "ind"}
            ),
        ]
    )

    # aggregate index entries which have overlapping identifiers
    # using a greedy graph-based approach
    ind_clusters = utils.find_weakly_connected_subgraphs(id_edgelist)

    # add clusters to identifier entries
    valid_identifiers = valid_identifiers.reset_index().merge(ind_clusters)

    # all entries for the same (model, id) will have the same cluster so convert back to
    # sbml_df index to facilitate join
    indexed_cluster = valid_identifiers.groupby(sbml_df.index.names).first()["cluster"]

    # combine equivalent entries into a single Identifiers object
    # include identifiers which were filtered by bqb

    all_cluster_identifiers = meta_identifiers.reset_index().merge(
        indexed_cluster, left_on=sbml_df.index.names, right_index=True
    )

    cluster_consensus_identifiers = {
        k: identifiers.Identifiers(
            list(
                v[
                    [
                        IDENTIFIERS.ONTOLOGY,
                        IDENTIFIERS.IDENTIFIER,
                        IDENTIFIERS.URL,
                        IDENTIFIERS.BQB,
                    ]
                ]
                .T.to_dict()
                .values()
            )
        )
        for k, v in all_cluster_identifiers.groupby("cluster")
    }

    # recover clusters which don't have any identifiers
    catchup_clusters = {
        c: identifiers.Identifiers(list())
        for c in set(ind_clusters["cluster"].tolist()).difference(
            cluster_consensus_identifiers
        )
    }
    cluster_consensus_identifiers = {
        **cluster_consensus_identifiers,
        **catchup_clusters,
    }

    cluster_consensus_identifiers_df = pd.DataFrame(
        cluster_consensus_identifiers, index=[table_schema["id"]]
    ).T
    cluster_consensus_identifiers_df.index.name = "cluster"

    return indexed_cluster, cluster_consensus_identifiers_df
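

# An editorial sketch of the clustering idea used above, with made-up data.
# The real work happens in utils.find_weakly_connected_subgraphs; its exact
# return format is an assumption here:
#
#     import pandas as pd
#     # two rows of an (entity, ontology__identifier) edgelist; both entities
#     # share uniprot__P12345, so they fall into one connected component and
#     # would be merged into a single consensus species
#     id_edgelist = pd.DataFrame(
#         {
#             "ind": ["modelA__s1", "modelB__s9"],
#             "id": ["uniprot__P12345", "uniprot__P12345"],
#         }
#     )
#     # find_weakly_connected_subgraphs() assigns one cluster per connected
#     # component, e.g. both "ind" values mapped to the same cluster number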


def pre_consensus_ontology_check(
    sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], tablename: str
):
    """Check for shared ontologies across source models."""

    # tablename: compartments/species/reactions tables with Identifiers
    # returns shared ontologies among sbml_dfs in sbml_dfs_dict for
    # compartments/species/reactions tables

    if tablename in [SBML_DFS.COMPARTMENTS, SBML_DFS.SPECIES, SBML_DFS.REACTIONS]:
        sbml_onto_lists = []
        for df_key, sbml_dfs_ind in sbml_dfs_dict.items():
            sbml_onto_df_ind = sbml_dfs_ind.get_identifiers(tablename).value_counts(
                IDENTIFIERS.ONTOLOGY
            )
            sbml_onto_lists.append(sbml_onto_df_ind.index.to_list())

        shared_onto_set = set.intersection(*map(set, sbml_onto_lists))
        shared_onto_list = list(shared_onto_set)

        sbml_name_list = list(sbml_dfs_dict.keys())
        sbml_dict_onto_df = pd.DataFrame({"single_sbml_dfs": sbml_name_list})
        sbml_dict_onto_df[IDENTIFIERS.ONTOLOGY] = sbml_onto_lists

    else:
        logger.error(
            f"{tablename} entry doesn't have identifiers and thus cannot check its ontology"
        )
        shared_onto_list = []
        sbml_dict_onto_df = []

    logger.info(
        f"Shared ontologies for {tablename} are {shared_onto_list} before building a consensus model."
    )

    return shared_onto_list, sbml_dict_onto_df


def _validate_meta_identifiers(meta_identifiers: pd.DataFrame) -> None:
    """Flag cases where meta identifiers are totally missing or BQB codes are not included"""

    if meta_identifiers.shape[0] == 0:
        raise ValueError(
            '"meta_identifiers" was empty; some identifiers should be present'
        )

    n_null = sum(meta_identifiers["bqb"].isnull())
    if n_null > 0:
        msg = f"{n_null} identifiers were missing a bqb code and will not be mergeable"
        logger.warning(msg)

    return None


def post_consensus_species_ontology_check(sbml_dfs: sbml_dfs_core.SBML_dfs) -> set[str]:
    # Checking the ontology in "species" shared by different sources in a consensus model
    # returns a set of shared ontologies by different sources

    consensus_sbmldf_tbl_var = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)

    # get the sources of species in the consensus model
    consensus_sbmldf_tbl_var_sc = (
        source.unnest_sources(sbml_dfs.species, SBML_DFS.S_SOURCE, verbose=False)
        .reset_index()
        .sort_values([SOURCE_SPEC.NAME])
    )

    # merge columns with source info to the model's species identifiers df.
    consensus_sbmldf_tbl_var_w_sc = consensus_sbmldf_tbl_var.merge(
        consensus_sbmldf_tbl_var_sc.loc[
            :,
            [
                SBML_DFS.S_ID,
                SOURCE_SPEC.MODEL,
                SOURCE_SPEC.FILE,
                SOURCE_SPEC.PATHWAY_ID,
                SOURCE_SPEC.SOURCE,
                SOURCE_SPEC.NAME,
            ],
        ],
        on=SBML_DFS.S_ID,
    )

    # get the model/source and its ontology set to a separate df
    shared_ontology_df = (
        consensus_sbmldf_tbl_var_w_sc.groupby(SOURCE_SPEC.NAME)[IDENTIFIERS.ONTOLOGY]
        .apply(set)
        .reset_index(name="onto_expanded")
    )

    # the intersection set among ontology sets of all sources
    shared_onto_set = shared_ontology_df.onto_expanded[0]
    for i in range(1, len(shared_ontology_df.onto_expanded)):
        shared_onto_set = shared_onto_set.intersection(
            shared_ontology_df.onto_expanded[i]
        )

    logger.info(f"shared ontologies in the consensus model are: {shared_onto_set}")

    return shared_onto_set


def _update_foreign_keys(
    agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
) -> pd.DataFrame:
    """Update one or more foreign keys based on old-to-new foreign key lookup table(s)."""

    for fk in table_schema["fk"]:
        updated_fks = (
            agg_tbl[fk]
            .reset_index()
            .merge(
                fk_lookup_tables[fk], left_on=[SOURCE_SPEC.MODEL, fk], right_index=True
            )
            .drop(fk, axis=1)
            .rename(columns={"new_id": fk})
            .set_index(["model", table_schema["pk"]])
        )
        agg_tbl = agg_tbl.drop(columns=fk).join(updated_fks)

    return agg_tbl


def pre_consensus_compartment_check(
    sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], tablename: str
) -> tuple[list, dict]:
    """Find compartments shared across models."""

    # tablename: compartments only
    # returns shared c_name values in compartments of sbml_dfs in sbml_dfs_dict

    if tablename in [SBML_DFS.COMPARTMENTS]:
        sbml_cname_list = []
        for df_key, sbml_dfs_ind in sbml_dfs_dict.items():
            sbml_df_ind_cname = sbml_dfs_ind.get_identifiers(tablename).value_counts(
                SBML_DFS.C_NAME
            )
            sbml_cname_list.append(sbml_df_ind_cname.index.to_list())

        shared_cname_set = set.intersection(*map(set, sbml_cname_list))
        shared_cname_list = list(shared_cname_set)

        sbml_name_list = list(sbml_dfs_dict.keys())
        sbml_dict_cname_df = pd.DataFrame({"single_sbml_dfs": sbml_name_list})
        sbml_dict_cname_df["c_names"] = sbml_cname_list

    else:
        logger.error(f"{tablename} entry doesn't have c_name")

    logger.info(
        f"Shared compartments for {tablename} are {shared_cname_list} before building a consensus model."
    )

    return shared_cname_list, sbml_dict_cname_df


def post_consensus_source_check(
    sbml_dfs: sbml_dfs_core.SBML_dfs, table_name: str
) -> pd.DataFrame:
    """Provide sources of tables in a consensus model; the output df will be used to determine whether models are merged."""

    table_source = sbml_dfs.schema[table_name][SOURCE_SPEC.SOURCE]
    table_pk = sbml_dfs.schema[table_name]["pk"]

    sbml_dfs_tbl = getattr(sbml_dfs, table_name)
    sbml_dfs_tbl_pathway_source = (
        source.unnest_sources(sbml_dfs_tbl, table_source, verbose=False)
        .reset_index()
        .sort_values(["name"])
    )

    sbml_dfs_tbl_pathway_source["pathway"] = sbml_dfs_tbl_pathway_source.groupby(
        [table_pk]
    )["name"].transform(lambda x: " + ".join(set(x)))

    sbml_dfs_tbl_pathway_source = (
        sbml_dfs_tbl_pathway_source[[table_pk, "pathway"]]
        .drop_duplicates()
        .set_index(table_pk)
    )

    tbl_pathway_source_df = pd.DataFrame(
        sbml_dfs_tbl_pathway_source["pathway"].value_counts()
    )

    return tbl_pathway_source_df


def construct_meta_entities_fk(
    sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
    pw_index: pd.DataFrame,
    table: str = SBML_DFS.COMPARTMENTALIZED_SPECIES,
    fk_lookup_tables: dict = {},
    extra_defining_attrs: list = [],
) -> tuple[pd.DataFrame, pd.Series]:
    """
    Construct Meta Entities Defined by Foreign Keys

    Aggregating across one entity type for a set of pathway
    models, merge entities which are defined by their foreign keys.

    Parameters:
    ----------
    sbml_dfs_dict: dict{"model": cpr.SBML_dfs}
        A dictionary of cpr.SBML_dfs
    pw_index: indices.PWIndex
        An index of all tables being aggregated
    table: str
        A table/entity set from the sbml_dfs to work with
    fk_lookup_tables: dict
        Dictionary containing lookup tables for all foreign keys used by the table
    extra_defining_attrs: list
        List of terms which uniquely define a reaction species in addition
        to the foreign keys. A common case is when a species is a modifier
        and a substrate in a reaction.

    Returns:
    ----------
    new_id_table: pd.DataFrame
        Matching the schema of one of the tables within sbml_dfs_dict
    lookup_table: pd.Series
        Matches the index of the aggregated entities to new_ids

    """

    if not isinstance(extra_defining_attrs, list):
        raise TypeError("extra_defining_attrs must be a list")

    # combine sbml_dfs by adding model to the index and concatenating all dfs
    agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=table)

    # since all sbml_dfs have the same schema pull out one schema for reference
    table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]

    # update foreign keys using provided lookup tables
    agg_tbl = _update_foreign_keys(agg_tbl, table_schema, fk_lookup_tables)

    # add nameness_score as a measure of how readable a possible name would be
    # (this will help to select names which are more human readable after the merge)
    agg_tbl = utils._add_nameness_score_wrapper(agg_tbl, "label", table_schema)

    # reduce to unique elements
    induced_entities = (
        agg_tbl.reset_index(drop=True)
        .sort_values(["nameness_score"])
        .groupby(table_schema["fk"] + extra_defining_attrs)
        .first()
        .drop("nameness_score", axis=1)
    )
    induced_entities["new_id"] = sbml_dfs_utils.id_formatter(
        range(induced_entities.shape[0]), table_schema["pk"]
    )

    new_id_table = (
        induced_entities.reset_index()
        .rename(columns={"new_id": table_schema["pk"]})
        .set_index(table_schema["pk"])[table_schema["vars"]]
    )

    lookup_table = agg_tbl[table_schema["fk"] + extra_defining_attrs].merge(
        induced_entities,
        left_on=table_schema["fk"] + extra_defining_attrs,
        right_index=True,
    )["new_id"]

    # logging merges that occurred
    report_consensus_merges(
        lookup_table, table_schema, agg_tbl=agg_tbl, n_example_merges=5
    )

    if "source" in table_schema.keys():
        # track the model(s) that each entity came from
        new_sources = create_consensus_sources(
            agg_tbl.merge(lookup_table, left_index=True, right_index=True),
            lookup_table,
            table_schema,
            pw_index,
        )
        assert isinstance(new_sources, pd.Series)

        new_id_table = new_id_table.drop(table_schema["source"], axis=1).merge(
            new_sources, left_index=True, right_index=True
        )

    return new_id_table, lookup_table


def construct_meta_entities_members(
    sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
    pw_index: indices.PWIndex | None,
    table: str = SBML_DFS.REACTIONS,
    defined_by: str = SBML_DFS.REACTION_SPECIES,
    defined_lookup_tables: dict = {},
    defining_attrs: list[str] = [SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY],
) -> tuple[pd.DataFrame, pd.Series]:
    """
    Construct Meta Entities Defined by Membership

    Aggregating across one entity type for a set of pathway models, merge entities with the same members.

    Parameters:
    ----------
    sbml_dfs_dict: dict{"model": cpr.SBML_dfs}
        A dictionary of cpr.SBML_dfs
    pw_index: indices.PWIndex
        An index of all tables being aggregated
    table: str
        A table/entity set from the sbml_dfs to work with
    defined_by: str
        A table/entity set whose entries are members of "table"
    defined_lookup_tables: {pd.Series}
        Lookup table for updating the ids of "defined_by"
    defining_attrs: [str]
        A list of attributes which jointly define a unique entity

    Returns:
    ----------
    new_id_table: pd.DataFrame
        Matching the schema of one of the tables within sbml_dfs_dict
    lookup_table: pd.Series
        Matches the index of the aggregated entities to new_ids

    """

    logger.info(
        f"Merging {table} based on identical membership ({' + '.join(defining_attrs)})"
    )

    # combine sbml_dfs by adding model to the index and concatenating all dfs
    agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=defined_by)

    # to debug and see names of species
    # comp_species = unnest_SBML_df(sbml_dfs_dict, table="compartmentalized_species")
    # agg_tbl = agg_tbl.merge(comp_species, left_on = ["model", "sc_id"], right_index = True )

    # since all sbml_dfs have the same schema pull out one schema for reference
    table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]
    defined_by_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[defined_by]

    # update ids using previously created lookup tables
    for k in defined_lookup_tables.keys():
        agg_tbl = (
            agg_tbl.merge(
                defined_lookup_tables[k],
                left_on=[SOURCE_SPEC.MODEL, k],
                right_index=True,
            )
            .drop(k, axis=1)
            .rename(columns={"new_id": k})
        )

    # create a set of species x compartment instances for each reaction
    defining_fk = set(defined_by_schema["fk"]).difference({table_schema["pk"]})

    if (
        len(defining_fk) != 1
        or len(defining_fk.intersection(set(defined_by_schema["fk"]))) != 1
    ):
        raise ValueError(
            f"A foreign key could not be found in {defined_by} which was a primary key in {table}"
        )
    else:
        defining_fk = list(defining_fk)[0]

    # define what it is to be a unique member based on a combination of defining_attrs
    valid_defining_attrs = agg_tbl.columns.values.tolist()
    invalid_defining_attrs = [
        x for x in defining_attrs if x not in valid_defining_attrs
    ]

    if len(invalid_defining_attrs) != 0:
        raise ValueError(
            f"{', '.join(invalid_defining_attrs)} was not found; "
            f"valid defining_attrs are {', '.join(valid_defining_attrs)}"
        )

    # create unique members
    agg_tbl["member"] = agg_tbl[defining_attrs].astype(str).apply("__".join, axis=1)

    # members are aggregated by reaction
    membership_df = (
        agg_tbl.reset_index()
        .groupby(["model", table_schema["pk"]])
        .agg(membership=("member", lambda x: (list(set(x)))))
    )

    # check whether members are duplicated within a given group
    # suggesting that distinct entities have been coerced into
    # the same entity
    for i in range(membership_df.shape[0]):
        members = membership_df["membership"].iloc[i]
        if len(members) != len(set(members)):
            _ = agg_tbl.reset_index().merge(
                membership_df.iloc[i : i + 1],
                how="inner",
                left_on=[SOURCE_SPEC.MODEL, table_schema["pk"]],
                right_index=True,
            )

            raise ValueError(
                "Members were duplicated suggesting overmerging in the source"
            )

    membership_df["member_string"] = [
        _create_member_string(x) for x in membership_df["membership"]
    ]

    membership_lookup = membership_df.reset_index()

    consensus_entities = membership_lookup.groupby("member_string").first()
    consensus_entities["new_id"] = sbml_dfs_utils.id_formatter(
        range(consensus_entities.shape[0]), table_schema["pk"]
    )

    lookup_table = membership_lookup.merge(
        consensus_entities["new_id"], left_on="member_string", right_index=True
    ).set_index([SOURCE_SPEC.MODEL, table_schema["pk"]])["new_id"]

    # logging merges that occurred
    report_consensus_merges(
        lookup_table, table_schema, sbml_dfs_dict=sbml_dfs_dict, n_example_merges=5
    )

    agg_primary_table = unnest_SBML_df(sbml_dfs_dict, table=table)

    # add nameness_score as a measure of how readable a possible name would be
    # (this will help to select names which are more human readable after the merge)
    agg_primary_table = utils._add_nameness_score_wrapper(
        agg_primary_table, "label", table_schema
    )

    new_id_table = (
        agg_primary_table.join(lookup_table)
        .reset_index(drop=True)
        .sort_values(["nameness_score"])
        .rename(columns={"new_id": table_schema["pk"]})
        .groupby(table_schema["pk"])
        .first()[table_schema["vars"]]
    )

    # merge identifiers
    logger.info(f"Merging {table} identifiers")
    indexed_old_identifiers = (
        agg_primary_table.join(lookup_table)
        .reset_index(drop=True)
        .rename(columns={"new_id": table_schema["pk"]})
        .groupby(table_schema["pk"])[table_schema["id"]]
    )

    # combine merged identifiers into single identifier objects indexed by new id
    updated_identifiers = indexed_old_identifiers.agg(identifiers.merge_identifiers)

    # add merged identifiers back to new_id table overwriting existing ids
    new_id_table = new_id_table.drop(table_schema["id"], axis=1).merge(
        updated_identifiers, left_index=True, right_index=True
    )

    if "source" in table_schema.keys():
        logger.info(f"Merging {table} sources")

        # track the model(s) that each entity came from
        new_sources = create_consensus_sources(
            agg_primary_table.merge(lookup_table, left_index=True, right_index=True),
            lookup_table,
            table_schema,
            pw_index,
        )

        new_id_table = new_id_table.drop(table_schema["source"], axis=1).merge(
            new_sources, left_index=True, right_index=True
        )

    return new_id_table, lookup_table
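

# An editorial sketch of the membership merge above, with made-up data. Each
# reaction is reduced to a canonical string of its (sc_id, stoichiometry)
# members (the real helper is _create_member_string, whose exact format is an
# assumption here), and reactions with identical member strings collapse:
#
#     import pandas as pd
#     rxn_species = pd.DataFrame(
#         {
#             "model": ["A", "A", "B", "B"],
#             "r_id": ["R1", "R1", "R7", "R7"],
#             "member": ["SC1__-1.0", "SC2__1.0", "SC2__1.0", "SC1__-1.0"],
#         }
#     )
#     member_strings = rxn_species.groupby(["model", "r_id"])["member"].apply(
#         lambda x: "_".join(sorted(x))
#     )
#     # both reactions yield "SC1__-1.0_SC2__1.0", so R1 (model A) and
#     # R7 (model B) would be assigned the same consensus r_id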


def create_consensus_sources(
    agg_tbl: pd.DataFrame,
    lookup_table: pd.Series,
    table_schema: dict,
    pw_index: indices.PWIndex | None,
) -> pd.Series:
    """
    Create Consensus Sources

    Annotate the source of to-be-merged species with the models they came from, and combine with existing annotations.

    Parameters:
    ----------
    agg_tbl: pd.DataFrame
        A table containing existing source.Source objects and a many-1
        "new_id" of their post-aggregation consensus entity
    lookup_table: pd.Series
        A series where the index are old identifiers and the values are
        post-aggregation new identifiers
    table_schema: dict
        Summary of the schema for the operant entity type
    pw_index: indices.PWIndex
        An index of all tables being aggregated

    Returns:
    ----------
    new_sources: pd.Series
        Mapping where the index is new identifiers and values are aggregated source.Source objects

    """

    logger.info("Creating source table")
    # Sources for all new entries
    new_sources = source.create_source_table(lookup_table, table_schema, pw_index)

    # create a pd.Series with an index of all new_ids (which will be rewritten as the entity primary keys)
    # and values of source.Source objects (where multiple Sources may match an index value).
    logger.info("Aggregating old sources")
    indexed_old_sources = (
        agg_tbl.reset_index(drop=True)
        .rename(columns={"new_id": table_schema["pk"]})
        .groupby(table_schema["pk"])[table_schema["source"]]
    )

    # combine old sources into a single source.Source object per index value
    aggregated_old_sources = indexed_old_sources.agg(source.merge_sources)

    aligned_sources = new_sources.merge(
        aggregated_old_sources, left_index=True, right_index=True
    )
    assert isinstance(aligned_sources, pd.DataFrame)

    logger.info("Returning new source table")
    new_sources = aligned_sources.apply(source.merge_sources, axis=1).rename(table_schema["source"])  # type: ignore
    assert isinstance(new_sources, pd.Series)

    return new_sources


def report_consensus_merges(
    lookup_table: pd.Series,
    table_schema: dict,
    agg_tbl: pd.DataFrame | None = None,
    sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs] | None = None,
    n_example_merges: int = 3,
) -> None:
    """
    Report Consensus Merges

    Print a summary of merges that occurred

    Parameters:
    ----------
    lookup_table : pd.Series
        An index of "model" and the entity's primary key with values of new_id
    table_schema : dict
        Schema of the table being merged
    agg_tbl : pd.DataFrame or None
        Contains the original model, primary keys and a label. Required if the primary key is not r_id (i.e., reactions)
    sbml_dfs_dict : dict or None
        The dict of full models across all models. Used to create reaction formulas if the primary key is r_id
    n_example_merges : int
        Number of example merges to report details on

    Returns:
    ----------
    None
    """

    entity_merge_num = lookup_table.value_counts()
    merged_entities = entity_merge_num[entity_merge_num != 1]

    if merged_entities.shape[0] == 0:
        logger.warning(f"No merging occurred for {table_schema['pk']}")
        return None

    if "label" not in table_schema.keys():
        # we don't need to track unnamed species
        return None

    logger.info(
        f">>>> {merged_entities.sum()} {table_schema['pk']} entries merged into {merged_entities.shape[0]}"
    )

    merges_lookup = lookup_table[
        lookup_table.isin(merged_entities.index.tolist())
    ].reset_index()

    if table_schema["pk"] == "r_id":
        logger.info(
            "Creating formulas for to-be-merged reactions to help with reporting merges of reactions"
            " with inconsistently named reactants"
        )
        if not isinstance(sbml_dfs_dict, dict):
            raise ValueError(
                f"sbml_dfs_dict was a {type(sbml_dfs_dict)} and must be a dict if the table_schema pk is r_id"
            )

        indexed_models = merges_lookup.set_index("model").sort_index()
        merges_dict = dict()
        for mod in indexed_models.index.unique():
            merges_dict[mod] = sbml_dfs_core.reaction_summaries(
                sbml_dfs_dict[mod], indexed_models.loc[mod]["r_id"]
            )
        merge_labels = pd.concat(merges_dict, names=["model", "r_id"]).rename("label")

        # add labels to models + r_id
        merges_lookup = merges_lookup.merge(
            merge_labels, how="left", left_on=["model", "r_id"], right_index=True
        )

        logger.info("Done creating reaction formulas")

    else:
        if type(agg_tbl) is not pd.DataFrame:
            raise ValueError(
                f"agg_tbl was a {type(agg_tbl)} and must be a pd.DataFrame if the table_schema pk is NOT r_id"
            )

        merges_lookup = merges_lookup.merge(
            agg_tbl[table_schema["label"]],
            left_on=["model", table_schema["pk"]],
            right_index=True,
        ).rename(columns={table_schema["label"]: "label"})

    indexed_merges_lookup = merges_lookup.set_index("new_id")

    # filter to entries with non-identical labels

    logger.info("Testing for identical formulas of to-be-merged reactions")

    index_label_counts = (
        indexed_merges_lookup["label"].drop_duplicates().index.value_counts()
    )
    inexact_merges = index_label_counts[index_label_counts > 1].index.tolist()

    if len(inexact_merges) == 0:
        logger.info("All merged names matched exactly")
    else:
        logger.warning(
            f"\n{len(inexact_merges)} merges were of entities with distinct names, including:\n"
        )

        inexact_merges_samples = random.sample(
            inexact_merges, min(len(inexact_merges), n_example_merges)
        )

        inexact_merge_collapses = (
            indexed_merges_lookup.loc[inexact_merges_samples]["label"]
            .drop_duplicates()
            .groupby(level=0)
            .agg(" & ".join)
        )

        logger.warning("\n\n".join(inexact_merge_collapses.tolist()) + "\n")

    logger.info("==============================\n")

    return None


def merge_entity_data(
    sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
    lookup_table: pd.Series,
    table: str,
) -> dict:
    """
    Merge Entity Data

    Aggregate the entity data tables from a set of models onto consensus ids.

    Args
        sbml_dfs_dict (dict): dictionary where keys are to-be-merged model names and values
            are sbml_dfs_core.SBML_dfs
        lookup_table (pd.Series): a series where the index is an old model and primary key and the
            value is the new consensus id
        table (str): table whose data is being consolidated (currently species or reactions)

    Returns:
        entity_data (dict): dictionary containing pd.DataFrames which aggregate all of the
            individual entity_data tables in "sbml_dfs_dict"

    """

    entity_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]
    data_table_name = table + "_data"

    entity_data_dict = {
        k: getattr(sbml_dfs_dict[k], data_table_name) for k in sbml_dfs_dict.keys()
    }

    entity_data_types = set.union(*[set(v.keys()) for v in entity_data_dict.values()])

    entity_data = {
        x: _merge_entity_data_create_consensus(
            entity_data_dict, lookup_table, entity_schema, x, table
        )
        for x in entity_data_types
    }

    return entity_data
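

# An editorial sketch of how a lookup table reindexes entity data (made-up
# data; the real aggregation happens in _merge_entity_data_create_consensus):
#
#     import pandas as pd
#     lookup = pd.Series(
#         ["S00001", "S00001"],
#         index=pd.MultiIndex.from_tuples(
#             [("modelA", "s1"), ("modelB", "s9")], names=["model", "s_id"]
#         ),
#         name="new_id",
#     )
#     # species_data rows keyed by ("modelA", "s1") and ("modelB", "s9")
#     # would both map onto the consensus id "S00001" after the merge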


def _check_sbml_dfs_dict(sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]) -> None:
    """Check models in SBML_dfs for problems which can be reported up-front

    Args:
        sbml_dfs_dict (dict(pd.DataFrame)): a dict of sbml_dfs models;
            primarily used as an input for construct_consensus_model

    Returns:
        None

    """

    for k, v in sbml_dfs_dict.items():
        _check_sbml_dfs(sbml_dfs=v, model_label=k)
    return None


def _check_sbml_dfs(
    sbml_dfs: sbml_dfs_core.SBML_dfs, model_label: str, N_examples: int | str = 5
) -> None:
    """Check SBML_dfs for identifiers which are associated with different entities before a merge."""

    ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
    defining_ids = ids[ids[IDENTIFIERS.BQB].isin(BQB_DEFINING_ATTRS)]

    defining_identifier_counts = defining_ids.value_counts(
        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
    )
    degenerate_defining_identities = (
        defining_identifier_counts[defining_identifier_counts > 1]
        .rename("N")
        .reset_index()
        .set_index(IDENTIFIERS.ONTOLOGY)
    )

    if degenerate_defining_identities.shape[0] > 0:
        logger.info(
            "Some defining identifiers are present multiple times "
            f"in {model_label} and will likely result in species merging "
        )

        degen_defining_id_list = list()
        for k in degenerate_defining_identities.index.unique():
            n_degen = degenerate_defining_identities.loc[k].shape[0]
            example_duplicates = utils.ensure_pd_df(
                degenerate_defining_identities.loc[k].sample(min([n_degen, N_examples]))
            )

            degen_defining_id_list.append(
                k
                + f" has {n_degen} duplicates including: "
                + ", ".join(
                    [
                        f"{x} ({y})"
                        for x, y in zip(
                            example_duplicates[IDENTIFIERS.IDENTIFIER].tolist(),
                            example_duplicates["N"].tolist(),
                        )
                    ]
                )
            )

        logger.info("\n".join(degen_defining_id_list))
    return None


def _validate_meta_identifiers(meta_identifiers: pd.DataFrame) -> None:
    """Check Identifiers to make sure they aren't empty and flag cases where IDs are missing BQB terms."""

    if meta_identifiers.shape[0] == 0:
        raise ValueError(
            '"meta_identifiers" was empty; some identifiers should be present'
        )

    n_null = sum(meta_identifiers[IDENTIFIERS.BQB].isnull())
    if n_null > 0:
        msg = f"{n_null} identifiers were missing a bqb code and will not be mergeable"
        logger.warning(msg)

    return None


def _update_foreign_keys(
    agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
) -> pd.DataFrame:
    for fk in table_schema["fk"]:
        updated_fks = (
            agg_tbl[fk]
            .reset_index()
            .merge(
                fk_lookup_tables[fk], left_on=[SOURCE_SPEC.MODEL, fk], right_index=True
            )
            .drop(fk, axis=1)
            .rename(columns={"new_id": fk})
            .set_index(["model", table_schema["pk"]])
        )
        agg_tbl = agg_tbl.drop(columns=fk).join(updated_fks)

    return agg_tbl


def _resolve_reversibility(
    sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
    rxn_consensus_species: pd.DataFrame,
    rxn_lookup_table: pd.Series,
) -> pd.DataFrame:
    """
    For a set of merged reactions determine what their consensus reaction reversibilities are
    """

    agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=SBML_DFS.REACTIONS)

    if not all(agg_tbl[SBML_DFS.R_ISREVERSIBLE].isin([True, False])):
        invalid_levels = agg_tbl[~agg_tbl[SBML_DFS.R_ISREVERSIBLE].isin([True, False])][
            SBML_DFS.R_ISREVERSIBLE
        ].unique()
        raise ValueError(
            "One or more aggregated models included invalid values for r_isreversible in the reactions table: "
            f"{', '.join(invalid_levels)}"
        )

    # add new ids to aggregated reactions by indexes
    # map each new r_id to every distinct value of is_irreversible from reactions it originated from
    # in most cases there will only be a single level
    r_id_to_all_reversibilities = (
        agg_tbl.join(rxn_lookup_table)
        .reset_index()[["new_id", SBML_DFS.R_ISREVERSIBLE]]
        .rename({"new_id": SBML_DFS.R_ID}, axis=1)
        .drop_duplicates()
    )

    # when a reaction could be irreversible or reversible define it as reversible.
    r_id_reversibility = (
        r_id_to_all_reversibilities.sort_values(
            SBML_DFS.R_ISREVERSIBLE, ascending=False
        )
        .groupby(SBML_DFS.R_ID)
        .first()
    )

    # drop existing reversibility since it is selected arbitrarily and replace
    # with consensus reversibility which respects priorities
    rxns_w_reversibility = rxn_consensus_species.drop(
        SBML_DFS.R_ISREVERSIBLE, axis=1
    ).join(r_id_reversibility)

    assert rxns_w_reversibility.shape[0] == rxn_consensus_species.shape[0]
    assert all(rxns_w_reversibility[SBML_DFS.R_ISREVERSIBLE].isin([True, False]))

    return rxns_w_reversibility
|
1387
|
+
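# --- Illustrative sketch (not package code) ---
# The "reversible wins" rule in _resolve_reversibility works because sorting a
# boolean column in descending order puts True first within each merged
# reaction, and groupby(...).first() then keeps it:
import pandas as pd

_all_reversibilities = pd.DataFrame(
    {"r_id": ["R1", "R1", "R2"], "r_isreversible": [False, True, False]}
)
_consensus = (
    _all_reversibilities.sort_values("r_isreversible", ascending=False)
    .groupby("r_id")
    .first()
)
# R1 merged a reversible and an irreversible source reaction -> reversible
assert _consensus.loc["R1", "r_isreversible"]
assert not _consensus.loc["R2", "r_isreversible"]
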
def _merge_entity_data_create_consensus(
    entity_data_dict: dict,
    lookup_table: pd.Series,
    entity_schema: dict,
    an_entity_data_type: str,
    table: str,
) -> pd.DataFrame:
    """
    Merge Entity Data - Create Consensus

    Stack each model's "an_entity_data_type" table and consolidate the rows
    under the new consensus ids.

    Args:
        entity_data_dict (dict): dictionary containing all models' "an_entity_data_type" dictionaries
        lookup_table (pd.Series): a series whose index is an old model and primary key and whose
            value is the new consensus id
        entity_schema (dict): schema for "table"
        an_entity_data_type (str): data type from species/reactions_data in entity_data_dict
        table (str): table whose data is being consolidated (currently species or reactions)

    Returns:
        consensus_entity_data (pd.DataFrame): table whose index is the primary key of "table" and whose
            values are all distinct annotations from "an_entity_data_type".

    """

    models_w_entity_data_type = [
        k for k, v in entity_data_dict.items() if an_entity_data_type in v.keys()
    ]

    logger.info(
        f"Merging {len(models_w_entity_data_type)} models with {an_entity_data_type} data in the {table} table"
    )

    # check that all tables have the same index and column names
    distinct_indices = {
        ", ".join(entity_data_dict[x][an_entity_data_type].index.names)
        for x in models_w_entity_data_type
    }
    if len(distinct_indices) > 1:
        raise ValueError(
            f"Multiple tables with the same {an_entity_data_type} cannot be combined"
            " because they have different index names: "
            f"{' & '.join(list(distinct_indices))}"
        )
    distinct_cols = {
        ", ".join(entity_data_dict[x][an_entity_data_type].columns.tolist())
        for x in models_w_entity_data_type
    }
    if len(distinct_cols) > 1:
        raise ValueError(
            f"Multiple tables with the same {an_entity_data_type} cannot be combined"
            " because they have different column names: "
            f"{' & '.join(list(distinct_cols))}"
        )

    # stack all models
    combined_entity_data = pd.concat(
        {k: entity_data_dict[k][an_entity_data_type] for k in models_w_entity_data_type}
    )
    combined_entity_data.index.names = ["model", entity_schema["pk"]]
    if isinstance(combined_entity_data, pd.Series):
        # enforce that attributes should always be DataFrames
        combined_entity_data = combined_entity_data.to_frame()

    # create a table indexed by the NEW primary key containing all the entity data of type an_entity_data_type
    # right now the index may map to multiple rows if entities were consolidated
    combined_entity_data = (
        combined_entity_data.join(lookup_table)
        .reset_index(drop=True)
        .rename({"new_id": entity_schema["pk"]}, axis=1)
        .set_index(entity_schema["pk"])
        .sort_index()
    )

    # report cases where merges produce id-variable combinations with distinct values
    _merge_entity_data_report_mismatches(
        combined_entity_data, entity_schema, an_entity_data_type, table
    )

    # save one value for each id-variable combination
    # (this will accept the first value regardless of the above mismatches)
    consensus_entity_data = (
        combined_entity_data.reset_index().groupby(entity_schema["pk"]).first()
    )

    return consensus_entity_data

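# --- Illustrative sketch (not package code) ---
# The consolidation above is concat-with-keys plus a first-wins groupby. A toy
# version with hypothetical model names, a "weight" attribute, and two species
# (S1 in m1, S9 in m2) that merge into one consensus id:
import pandas as pd

_per_model = {
    "m1": pd.DataFrame({"weight": [1.0]}, index=pd.Index(["S1"], name="s_id")),
    "m2": pd.DataFrame({"weight": [2.0]}, index=pd.Index(["S9"], name="s_id")),
}
_lookup = pd.Series(
    ["S_new", "S_new"],
    index=pd.MultiIndex.from_tuples(
        [("m1", "S1"), ("m2", "S9")], names=["model", "s_id"]
    ),
    name="new_id",
)

_combined = pd.concat(_per_model)
_combined.index.names = ["model", "s_id"]
_combined = (
    _combined.join(_lookup)
    .reset_index(drop=True)
    .rename({"new_id": "s_id"}, axis=1)
    .set_index("s_id")
)
# first() arbitrarily keeps m1's value (1.0) for the merged species
assert _combined.reset_index().groupby("s_id").first().loc["S_new", "weight"] == 1.0
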
def _merge_entity_data_report_mismatches(
    combined_entity_data: pd.DataFrame,
    entity_schema: dict,
    an_entity_data_type: str,
    table: str,
) -> None:
    """
    Merge Entity Data - Report Mismatches

    Report cases where a single "new" id is associated with multiple different values of an entity variable.

    Args:
        combined_entity_data (pd.DataFrame): indexed by the primary key of "table", containing all
            data from "an_entity_data_type"
        entity_schema (dict): schema for "table"
        an_entity_data_type (str): data type from species/reactions_data in combined_entity_data
        table (str): table whose data is being consolidated (currently species or reactions)

    Returns:
        None

    """

    data_table_name = table + "_data"

    entity_vars = combined_entity_data.columns
    for entity_var in entity_vars:
        unique_counts = (
            combined_entity_data.reset_index()
            .groupby(entity_schema["pk"])
            .agg("nunique")
        )
        entities_w_imperfect_matches = unique_counts[
            unique_counts[entity_var] != 1
        ].index.tolist()

        if len(entities_w_imperfect_matches) > 0:
            N_select_entities_w_imperfect_matches = min(
                5, len(entities_w_imperfect_matches)
            )
            select_entities_w_imperfect_matches = entities_w_imperfect_matches[
                0:N_select_entities_w_imperfect_matches
            ]

            warning_msg_select = [
                x
                + ": "
                + ", ".join(
                    combined_entity_data[entity_var].loc[x].apply(str).unique().tolist()
                )
                for x in select_entities_w_imperfect_matches
            ]
            full_warning_msg = (
                f"{len(entities_w_imperfect_matches)} {table} contain multiple values for the {entity_var} variable"
                f" in the {data_table_name} table of {an_entity_data_type}: "
                + ". ".join(warning_msg_select)
            )

            logger.warning(full_warning_msg)

    return None

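# --- Illustrative sketch (not package code) ---
# Mismatch reporting reduces to a per-id nunique() check; using the toy data
# shape from the previous sketch:
import pandas as pd

_combined = pd.DataFrame(
    {"s_id": ["S_new", "S_new", "S2"], "weight": [1.0, 2.0, 3.0]}
).set_index("s_id")
_unique_counts = _combined.reset_index().groupby("s_id").agg("nunique")
# S_new's merged rows disagree on weight, so it would be logged as a mismatch
assert _unique_counts[_unique_counts["weight"] != 1].index.tolist() == ["S_new"]
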
def _test_same_schema(sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]) -> None:
    """
    Ensure that all sbml_dfs in the dict have the same schema.
    """

    if len(sbml_dfs_dict) != 0:
        # extract all schemas
        schema_list = [sbml_dfs_dict[x].schema for x in sbml_dfs_dict.keys()]
        # if multiple entries are present then are they the same?
        if len(sbml_dfs_dict) > 1:
            if not all([x == schema_list[0] for x in schema_list]):
                raise ValueError("sbml_df schemas were not identical")

    return None

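# --- Illustrative sketch (not package code) ---
# The schema comparison above is the generic "all elements equal the first"
# idiom over arbitrary comparable objects:
_example_schemas = [{"pk": "s_id"}, {"pk": "s_id"}]
assert all(x == _example_schemas[0] for x in _example_schemas)
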
def _create_member_string(x: list[str]) -> str:
    # note: list.sort() mutates the caller's list
    x.sort()
    return "_".join(x)
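# --- Illustrative sketch (not package code) ---
# _create_member_string yields an order-invariant key for a set of member ids
# because the list is sorted before joining:
assert (
    _create_member_string(["S2", "S1"])
    == _create_member_string(["S1", "S2"])
    == "S1_S2"
)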