napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
@@ -0,0 +1,1381 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import copy
|
4
|
+
import logging
|
5
|
+
import os
|
6
|
+
import re
|
7
|
+
|
8
|
+
from fs import open_fs
|
9
|
+
import numpy as np
|
10
|
+
import pandas as pd
|
11
|
+
|
12
|
+
from napistu import identifiers
|
13
|
+
from napistu import sbml_dfs_core
|
14
|
+
from napistu import sbml_dfs_utils
|
15
|
+
from napistu import source
|
16
|
+
from napistu import utils
|
17
|
+
|
18
|
+
from napistu.constants import SBML_DFS
|
19
|
+
from napistu.constants import BQB
|
20
|
+
from napistu.constants import IDENTIFIERS
|
21
|
+
from napistu.constants import SBOTERM_NAMES
|
22
|
+
from napistu.constants import MINI_SBO_FROM_NAME
|
23
|
+
from napistu.constants import ONTOLOGIES
|
24
|
+
from napistu.constants import ENSEMBL_PREFIX_TO_ONTOLOGY
|
25
|
+
from napistu.modify.constants import COFACTOR_SCHEMA
|
26
|
+
from napistu.modify.constants import COFACTOR_CHEBI_IDS
|
27
|
+
|
28
|
+
logger = logging.getLogger(__name__)
|
29
|
+
|
30
|
+
|
31
|
+
def identify_cofactors(sbml_dfs: sbml_dfs_core.SBML_dfs) -> pd.Series:
    """
    Identify Cofactors

    Find cofactors which are playing a supporting role in a reaction (e.g., ATP -> ADP or water).

    Parameters:
    ----------
    sbml_dfs: SBML_dfs
        A pathway model

    Returns:
    ----------
    pd.Series with index of rsc_ids and values containing the reason why a
    reaction species is a cofactor. Empty if no cofactor filter was triggered.

    Raises:
    ----------
    TypeError
        If sbml_dfs is not an sbml_dfs_core.SBML_dfs
    ValueError
        If no species carry ChEBI identifiers
    """

    # load definitions of cofactors and their systematic IDs
    cofactor_ids_list = COFACTOR_CHEBI_IDS[ONTOLOGIES.CHEBI].tolist()

    if not isinstance(sbml_dfs, sbml_dfs_core.SBML_dfs):
        raise TypeError(
            f"sbml_dfs was type {type(sbml_dfs)} and must be an sbml_dfs_core.SBML_dfs"
        )

    # find sbml_dfs species matching possible cofactors
    species_identifiers = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
    # filter to small molecules ignoring cases where a small molecule is just a part of the species
    # (BQB.IS annotations define the species; BQB.HAS_PART would indicate membership only)
    species_identifiers = species_identifiers[
        [
            o == ONTOLOGIES.CHEBI and b == BQB.IS
            for o, b in zip(
                species_identifiers[IDENTIFIERS.ONTOLOGY],
                species_identifiers[IDENTIFIERS.BQB],
            )
        ]
    ]

    species_identifiers = species_identifiers.rename(
        columns={IDENTIFIERS.IDENTIFIER: ONTOLOGIES.CHEBI}
    )

    if species_identifiers.shape[0] == 0:
        raise ValueError("No species had ChEBI IDs, cofactors can not be filtered")

    # ChEBI accessions are numeric; cast so they compare against the integer
    # IDs in COFACTOR_CHEBI_IDS
    species_identifiers[ONTOLOGIES.CHEBI] = species_identifiers[
        ONTOLOGIES.CHEBI
    ].astype(int)
    species_identifiers = species_identifiers[
        species_identifiers[ONTOLOGIES.CHEBI].isin(cofactor_ids_list)
    ]

    logger.info(
        f"There were {species_identifiers.shape[0]} cofactor species: "
        f"{', '.join(species_identifiers[SBML_DFS.S_NAME].tolist())}"
    )

    # report cofactors that were not found
    cofactors_missed = COFACTOR_CHEBI_IDS[
        ~COFACTOR_CHEBI_IDS[ONTOLOGIES.CHEBI].isin(
            species_identifiers[ONTOLOGIES.CHEBI].tolist()
        )
    ]["cofactor"].tolist()
    if len(cofactors_missed) != 0:
        logger.warning(
            f"{len(cofactors_missed)} of {len(cofactor_ids_list)} "
            "cofactors were not located in the pathway model: "
            f"{', '.join(cofactors_missed)}"
        )

    # join species to cofactor schema using labels
    cofactor_species = (
        species_identifiers.reset_index()
        .merge(COFACTOR_CHEBI_IDS)
        .set_index(SBML_DFS.S_ID)
    )
    cofactor_cspecies = sbml_dfs.compartmentalized_species.merge(
        cofactor_species["cofactor"], left_on=SBML_DFS.S_ID, right_index=True
    )
    # filter reaction species to cofactor species
    cofactor_rscspecies = sbml_dfs.reaction_species.merge(
        cofactor_cspecies["cofactor"], left_on=SBML_DFS.SC_ID, right_index=True
    )
    # drop entries which arent produced or consumed (e.g., catalysts/regulators)
    cofactor_rscspecies = cofactor_rscspecies[
        cofactor_rscspecies[SBML_DFS.STOICHIOMETRY] != 0
    ]

    logger.info(
        f"Cofactor species are present {cofactor_rscspecies.shape[0]} times in reactions"
    )

    # loop through reactions with cofactors at test
    reactions = set(cofactor_rscspecies[SBML_DFS.R_ID].tolist())

    logger.info(
        f"{len(reactions)} of {sbml_dfs.reactions.shape[0]} reactions include cofactor species"
    )

    # apply every cofactor filter rule to each reaction's cofactor species
    filtered_rscs = list()
    for rxn in reactions:
        one_rxns_species = cofactor_rscspecies[
            cofactor_rscspecies[SBML_DFS.R_ID] == rxn
        ]

        for filter_type, cofactor_filter in COFACTOR_SCHEMA.items():
            dropped_species = filter_one_reactions_cofactors(
                one_rxns_species, filter_type, cofactor_filter
            )
            if dropped_species is not None:
                filtered_rscs.append(dropped_species)

    if not filtered_rscs:
        # no cofactor filter fired for any reaction; return an empty result
        # instead of letting pd.concat raise "No objects to concatenate"
        return pd.Series(dtype=object, name="filter_reason")

    return pd.concat(filtered_rscs)
|
147
|
+
|
148
|
+
|
149
|
+
def filter_one_reactions_cofactors(
    one_rxns_species: pd.DataFrame, filter_type: str, cofactor_filter: dict
) -> pd.Series | None:
    """
    Filter One Reaction's Cofactors

    Apply a cofactor filter to one reaction's species.

    Parameters:
    ----------
    one_rxns_species (pd.DataFrame):
        Rows of reactions species containing cofactors; must include a
        "cofactor" label column and a "stoichiometry" column
    filter_type: str
        Reason to filter species with this filter
    cofactor_filter: dict
        Species included in filter. Recognized keys:
        - "if_all": all of these labels must be present for the filter to fire
        - "except_any" (optional): skip the filter if any of these labels are present
        - "as_substrate" (optional): at least one of these labels must appear
          with negative stoichiometry (i.e., as a substrate)

    Returns:
    ----------
    pd.Series with index of rsc_ids and values containing the reason why a
    reaction species is a cofactor, or None if filter was not triggered.
    """

    # see if all required cofactor species are present
    rsc_labels_set = set(one_rxns_species["cofactor"].tolist())
    missing_reqs = set(cofactor_filter["if_all"]).difference(rsc_labels_set)
    if len(missing_reqs) != 0:
        return None

    # ignore cases involving "except_any" species
    if "except_any" in cofactor_filter:
        detected_exceptions = set(cofactor_filter["except_any"]).intersection(
            rsc_labels_set
        )
        if len(detected_exceptions) != 0:
            return None

    # consider a reaction only if "as_substrate" is a substrate
    # (negative stoichiometry marks consumption)
    if "as_substrate" in cofactor_filter:
        substrates_set = set(
            one_rxns_species[one_rxns_species["stoichiometry"] < 0]["cofactor"].tolist()
        )
        substrates_detected = set(cofactor_filter["as_substrate"]).intersection(
            substrates_set
        )

        if len(substrates_detected) == 0:
            return None

    # label the matched species with the reason (filter_type) they were flagged
    dropped_species = one_rxns_species[
        one_rxns_species["cofactor"].isin(cofactor_filter["if_all"])
    ]

    return dropped_species.assign(filter_reason=filter_type)["filter_reason"]
|
205
|
+
|
206
|
+
|
207
|
+
def drop_cofactors(sbml_dfs: sbml_dfs_core.SBML_dfs) -> sbml_dfs_core.SBML_dfs:
    """
    Drop Cofactors

    Remove reaction species when they are acting as cofactors.

    Parameters:
    ----------
    sbml_dfs: SBML_dfs
        A pathway model

    Returns:
    ----------
    sbml_dfs (SBML_dfs):
        A pathway model with some reaction species filtered
    """

    # flag every cofactor-like reaction species along with the reason it matched
    all_cofactors = identify_cofactors(sbml_dfs)

    logger.info(
        f"{all_cofactors.shape[0]} of {sbml_dfs.reaction_species.shape[0]}"
        " reaction species will be filtered as cofactors"
    )

    # summarize how often each filter reason fired
    reason_counts = all_cofactors.value_counts().to_frame()
    logger.info(utils.style_df(reason_counts))

    # shallow-copy the model, then subset reaction_species to non-cofactor rows
    pruned_sbml_dfs = copy.copy(sbml_dfs)
    keep_mask = ~pruned_sbml_dfs.reaction_species.index.isin(
        all_cofactors.index.tolist()
    )
    pruned_sbml_dfs.reaction_species = pruned_sbml_dfs.reaction_species[keep_mask]

    return pruned_sbml_dfs
|
240
|
+
|
241
|
+
|
242
|
+
def add_complex_formation_species(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Add Complex Formation - Species

    Define all species in complexes and format newly created species.

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A relational mechanistic network

    Returns
    -------
    merged_membership: pd.DataFrame
        A table of complexes and their component members
    new_species_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.species
    complex_component_species_ids: pd.DataFrame
        All complex components
    """

    # define all species: BQB.IS annotations define a species itself while
    # BQB.HAS_PART annotations define the members of a complex
    species_ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
    species_defining_attributes = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.IS]
    complex_membership = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]

    # find the species corresponding to complex components (if they exist);
    # a left merge keeps components with no existing species (component_s_id = NaN)
    merged_membership = complex_membership.merge(
        species_defining_attributes[
            [
                SBML_DFS.S_ID,
                IDENTIFIERS.ONTOLOGY,
                IDENTIFIERS.IDENTIFIER,
                IDENTIFIERS.URL,
            ]
        ].rename({SBML_DFS.S_ID: "component_s_id"}, axis=1),
        how="left",
    )

    # define unique component species
    complex_component_species = merged_membership[
        [
            "component_s_id",
            IDENTIFIERS.ONTOLOGY,
            IDENTIFIERS.IDENTIFIER,
            IDENTIFIERS.URL,
        ]
    ].drop_duplicates()

    # turn unlisted identifiers back into identifier format
    complex_component_species[SBML_DFS.S_IDENTIFIERS] = [
        identifiers.Identifiers(
            [
                {
                    IDENTIFIERS.ONTOLOGY: complex_component_species[
                        IDENTIFIERS.ONTOLOGY
                    ].iloc[i],
                    IDENTIFIERS.IDENTIFIER: complex_component_species[
                        IDENTIFIERS.IDENTIFIER
                    ].iloc[i],
                    IDENTIFIERS.URL: complex_component_species[IDENTIFIERS.URL].iloc[i],
                    IDENTIFIERS.BQB: BQB.IS,
                }
            ]
        )
        for i in range(0, complex_component_species.shape[0])
    ]

    # create an identifier -> source lookup by collapsing all sources with the same defining id
    indexed_members = merged_membership.set_index(
        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER, IDENTIFIERS.URL]
    ).sort_index()
    collapsed_sources = [
        source.merge_sources(indexed_members.loc[ind][SBML_DFS.S_SOURCE].tolist())
        for ind in indexed_members.index.unique()
    ]
    collapsed_sources = pd.Series(
        collapsed_sources, index=indexed_members.index.unique(), name=SBML_DFS.S_SOURCE
    )

    # add sources to unique complex components
    complex_component_species = complex_component_species.merge(
        collapsed_sources,
        left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER, IDENTIFIERS.URL],
        right_index=True,
    )

    # define the maximum current id so that we can make new ids without collisions
    max_existing_sid = max(
        sbml_dfs_utils.id_formatter_inv(sbml_dfs.species.index.tolist())
    )
    # if s_ids used an alternative convention then they'll be nans here; which is fine
    if max_existing_sid is np.nan:
        max_existing_sid = int(-1)

    # components lacking an existing s_id need newly minted species entries.
    # .copy() avoids assigning into a view of complex_component_species below
    # (and matches the equivalent logic in _add_entity_sets_species)
    new_species = complex_component_species[
        complex_component_species["component_s_id"].isna()
    ].copy()
    new_species["component_s_id"] = sbml_dfs_utils.id_formatter(
        range(max_existing_sid + 1, max_existing_sid + new_species.shape[0] + 1),
        SBML_DFS.S_ID,
    )

    # format new species and add to sbml_dfs.species
    new_species_for_sbml_dfs = (
        new_species.rename(
            {"component_s_id": SBML_DFS.S_ID, "identifier": SBML_DFS.S_NAME}, axis=1
        )[[SBML_DFS.S_ID, SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]]
        .set_index(SBML_DFS.S_ID)
        .sort_index()
    )

    # prepend zzauto so the string comes late alphanumerically. this way a properly named species will
    # be preferred when merging species by identifiers
    new_species_for_sbml_dfs[SBML_DFS.S_NAME] = (
        "zzauto " + new_species_for_sbml_dfs[SBML_DFS.S_NAME]
    )

    # combine existing and newly defined complex components
    complex_component_species_ids = pd.concat(
        [
            complex_component_species[
                ~complex_component_species["component_s_id"].isna()
            ],
            new_species,
        ]
    )

    return merged_membership, new_species_for_sbml_dfs, complex_component_species_ids
|
373
|
+
|
374
|
+
|
375
|
+
def add_complex_formation(sbml_dfs: sbml_dfs_core.SBML_dfs):
    """
    Add Complex Formation

    Using Reactome-style complex annotations,
    where complex components are an attribute of complexes,
    add explicit complex formation reactions.

    Reactome represents complexers using BQB_HAS_PART
    annotations, which are extracted into identifiers.Identifiers
    objects. This is sufficient to define membership but does
    not include stoichiometry. Also, in this approach components
    are defined by their identifiers (URIs) rather than internal
    s_ids/sc_ids.

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A relational mechanistic network

    Raises
    ------
    NotImplementedError
        Always; this function is a stub. A draft implementation is parked in
        the inert string literal below until the TODO is resolved.
    """

    raise NotImplementedError(
        "TO DO - Need to look closer to see if the unformed complexes really need a formation reaction"
    )


# NOTE(review): the triple-quoted string below is commented-out draft code for
# add_complex_formation; as a bare string expression it is a no-op and is kept
# only for reference until the TODO above is resolved.
""" # define species present in complexes
    (
        merged_membership,
        new_species_for_sbml_dfs,
        complex_component_species_ids,
    ) = add_complex_formation_species(sbml_dfs)

    # define compartmentalized species present in complexes
    (
        new_compartmentalized_species_for_sbml_dfs,
        updated_compartmentalized_membership,
    ) = _add_complex_formation_compartmentalized_species(
        sbml_dfs,
        merged_membership,
        new_species_for_sbml_dfs,
        complex_component_species_ids.drop("s_Source", axis=1),
    )

    # remove complex formation for reactions which already have clear formation reactions
    # to flag these complexes look for cases where the membership of the substrates
    # and products (including complex membership) are the same

    reaction_species_expanded_complexes = sbml_dfs.reaction_species.merge(
        updated_compartmentalized_membership[["sc_id", "component_sc_id"]], how="left"
    )

    # if a species is not a complex then it is its own component
    reaction_species_expanded_complexes["component_sc_id"] = [
        x if z else y
        for x, y, z in zip(
            reaction_species_expanded_complexes["sc_id"],
            reaction_species_expanded_complexes["component_sc_id"],
            reaction_species_expanded_complexes["component_sc_id"].isna(),
        )
    ]

    # check for equal membership of substrates and products
    reaction_species_expanded_complexes = reaction_species_expanded_complexes.set_index(
        "r_id"
    )

    complex_formation_reactions = list()
    for rxn in reaction_species_expanded_complexes.index.unique():
        rxn_species = reaction_species_expanded_complexes.loc[rxn]
        substrates = set(
            rxn_species[rxn_species["stoichiometry"] < 0]["component_sc_id"].tolist()
        )
        products = set(
            rxn_species[rxn_species["stoichiometry"] > 0]["component_sc_id"].tolist()
        )

        if substrates == products:
            complex_formation_reactions.append(rxn)

    # find complexes which are products of complex formation reactions

    compartmentalized_complexes = updated_compartmentalized_membership["sc_id"].unique()

    # is a complex formation reaction
    formed_complexes = sbml_dfs.reaction_species[
        sbml_dfs.reaction_species["r_id"].isin(complex_formation_reactions)
    ]
    # is a complex
    formed_complexes = formed_complexes[
        formed_complexes["sc_id"].isin(compartmentalized_complexes)
    ]
    # complex is product
    formed_complexes = formed_complexes[formed_complexes["stoichiometry"] > 0]

    formed_complexes = formed_complexes["sc_id"].unique()
    _ = set(compartmentalized_complexes).difference(set(formed_complexes))

    # add formation and dissolution reactions for all complexes without explicit formation reactions
    """
|
470
|
+
|
471
|
+
|
472
|
+
def add_entity_sets(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    neo4j_members: str,
) -> sbml_dfs_core.SBML_dfs:
    """
    Add Entity Sets

    Reactome represents some sets of interchangeable molecules as "entity sets".
    Common examples are ligands for a receptor. This function adds members
    of each entity set as a "is a" style reaction.

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A relational mechanistic network
    neo4j_members: str
        Path to a table containing Reactome entity sets and corresponding members.
        This is currently extracted manually with Neo4j.

    Returns
    -------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        An updated database which includes entity set species and formation reactions
    """

    # read and reformat Reactome entity sets table
    member_table = _read_neo4j_members(neo4j_members)

    # create missing species and compartmentalized species
    (
        set_membership,
        new_species,
        set_component_ids,
    ) = _add_entity_sets_species(sbml_dfs, member_table)

    (
        new_cspecies,
        compartmentalized_members,
    ) = _add_complex_formation_compartmentalized_species(
        sbml_dfs,
        set_membership,
        new_species,
        set_component_ids,
    )

    # build the "is a" membership reactions connecting members to their sets
    (
        new_reactions,
        new_rxn_species,
    ) = _add_entity_sets_reactions(
        sbml_dfs,
        new_cspecies,
        compartmentalized_members,
    )

    # append every new entry onto a shallow copy of the input model
    updated_model = copy.copy(sbml_dfs)

    updated_model.species = pd.concat([updated_model.species, new_species])
    updated_model.compartmentalized_species = pd.concat(
        [updated_model.compartmentalized_species, new_cspecies]
    )
    updated_model.reactions = pd.concat([updated_model.reactions, new_reactions])
    updated_model.reaction_species = pd.concat(
        [updated_model.reaction_species, new_rxn_species]
    )

    return updated_model
|
547
|
+
|
548
|
+
|
549
|
+
def add_reactome_identifiers(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    crossref_path: str,
) -> sbml_dfs_core.SBML_dfs:
    """
    Add Reactome Identifiers

    Add reactome-specific identifiers to existing species

    Params
    ------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A pathway model
    crossref_path:
        Path to the cross ref file extracted from Reactome's Neo4j database

    Returns
    -------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A pathway model with updated species' identifiers

    Raises
    ------
    ValueError
        If the crossref merge introduces s_ids absent from sbml_dfs.species,
        or if the rebuilt species table changes row count.
    """

    # parse and filter the Reactome cross-reference export
    # (format is defined by _format_reactome_crossref_ids)
    select_reactome_ids = _format_reactome_crossref_ids(crossref_path)

    # read all current identifiers
    current_ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
    # filter annotations of homologues and literature references
    # (keep only identity [BQB.IS] and complex-membership [BQB.HAS_PART] annotations)
    current_molecular_ids = (
        current_ids[current_ids[IDENTIFIERS.BQB].isin([BQB.IS, BQB.HAS_PART])]
        .set_index([SBML_DFS.S_ID, IDENTIFIERS.BQB])
        .sort_index()
        .copy()
    )

    # combine existing s_ids with additional cross-ref annotations using uniprot ids
    merged_crossrefs = _merge_reactome_crossref_ids(
        current_molecular_ids, select_reactome_ids
    )

    # create identifiers objects for each s_id: stack the original identifier
    # rows with the crossref rows and drop exact duplicates
    combined_ids = (
        pd.concat(
            [
                current_ids[
                    [
                        SBML_DFS.S_ID,
                        IDENTIFIERS.ONTOLOGY,
                        IDENTIFIERS.IDENTIFIER,
                        IDENTIFIERS.URL,
                        IDENTIFIERS.BQB,
                    ]
                ],
                merged_crossrefs[
                    [
                        SBML_DFS.S_ID,
                        IDENTIFIERS.ONTOLOGY,
                        IDENTIFIERS.IDENTIFIER,
                        IDENTIFIERS.URL,
                        IDENTIFIERS.BQB,
                    ]
                ],
            ]
        )
        .reset_index(drop=True)
        .drop_duplicates()
    )

    # one Identifiers object per s_id, built from that s_id's pooled rows
    updated_identifiers = {
        k: identifiers.Identifiers(
            list(
                v[
                    [
                        IDENTIFIERS.ONTOLOGY,
                        IDENTIFIERS.IDENTIFIER,
                        IDENTIFIERS.URL,
                        IDENTIFIERS.BQB,
                    ]
                ]
                .T.to_dict()
                .values()
            )
        )
        for k, v in combined_ids.groupby(SBML_DFS.S_ID)
    }
    updated_identifiers = pd.Series(
        updated_identifiers, index=updated_identifiers.keys()
    )
    updated_identifiers.index.name = SBML_DFS.S_ID
    updated_identifiers.name = "new_Identifiers"

    # add new identifiers to species table; indicator=True records which side
    # each row came from so mismatches can be detected below
    updated_species = sbml_dfs.species.merge(
        updated_identifiers,
        left_index=True,
        right_index=True,
        how="outer",
        indicator=True,
    )

    # right_only rows would mean crossrefs invented s_ids not in the model
    if updated_species[updated_species["_merge"] == "right_only"].shape[0] > 0:
        raise ValueError("Reactome crossrefs added new sids; this shouldn't occur")

    updated_species = pd.concat(
        [
            updated_species[updated_species["_merge"] == "both"]
            .drop([SBML_DFS.S_IDENTIFIERS, "_merge"], axis=1)
            .rename({"new_Identifiers": SBML_DFS.S_IDENTIFIERS}, axis=1),
            # retain original Identifiers if there is not new_Identifiers object
            # (this would occur if there were not identifiers)
            updated_species[updated_species["_merge"] == "left_only"].drop(
                ["new_Identifiers", "_merge"], axis=1
            ),
        ]
    )

    # sanity check: updating identifiers must not change the species row count
    n_species_diff = updated_species.shape[0] - sbml_dfs.species.shape[0]
    if n_species_diff != 0:
        raise ValueError(
            f"There are {n_species_diff} more species in the updated "
            "species table than the original one; this is unexpected behavior"
        )

    # create a copy to return a new object rather than update the provided one
    sbml_dfs_working = copy.copy(sbml_dfs)
    sbml_dfs_working.species = updated_species
    return sbml_dfs_working
|
676
|
+
|
677
|
+
|
678
|
+
def _add_entity_sets_species(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    reactome_members: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Add Entity Sets - Species

    Define all species which are part of "entity sets" in the pathway

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A relational mechanistic network
    reactome_members: pd.DataFrame
        A table of all Reactome entity sets members - obtained using a Neo4j query.
        Expected to contain at least "member_id" and "member_s_name" columns
        plus ontology/identifier/url columns.

    Returns
    -------
    merged_membership: pd.DataFrame
        A table of complexes and their component members
    new_species_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.species
    set_component_species_ids: pd.DataFrame
        All set components
    """

    # restrict to Reactome identity annotations (BQB.IS) of existing species
    species_ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
    reactome_ids = species_ids[
        species_ids[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.REACTOME
    ].copy()
    reactome_ids = reactome_ids[reactome_ids[IDENTIFIERS.BQB] == BQB.IS]

    # compare Reactome ids in sbml_dfs and reactome_members to make sure
    # they are for the same species
    identifiers.check_reactome_identifier_compatibility(
        reactome_members["member_id"], reactome_ids[IDENTIFIERS.IDENTIFIER]
    )

    # merge each species' entity sets to define entities which must exist in this pathway
    merged_membership = (
        reactome_ids[[SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER, SBML_DFS.S_SOURCE]]
        .rename({IDENTIFIERS.IDENTIFIER: "set_id"}, axis=1)
        .merge(reactome_members)
    )

    # define unique component species
    set_component_species = merged_membership[
        [
            "member_id",
            IDENTIFIERS.ONTOLOGY,
            IDENTIFIERS.IDENTIFIER,
            IDENTIFIERS.URL,
            "member_s_name",
        ]
    ].drop_duplicates()

    distinct_members = set_component_species.set_index(
        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER, IDENTIFIERS.URL, "member_s_name"]
    ).sort_index()

    # since reactome IDs are compartmentalized, use external IDs only
    # to determine distinct species, but then add reactome IDs as well
    distinct_members = pd.Series(
        [
            identifiers.Identifiers(
                # one entry for the external (defining) identifier ...
                [
                    {
                        IDENTIFIERS.ONTOLOGY: ind[0],
                        IDENTIFIERS.IDENTIFIER: str(ind[1]),
                        IDENTIFIERS.URL: ind[2],
                        IDENTIFIERS.BQB: BQB.IS,
                    }
                ]
                # ... plus one entry per associated Reactome member id
                + [
                    {
                        IDENTIFIERS.ONTOLOGY: ONTOLOGIES.REACTOME,
                        IDENTIFIERS.IDENTIFIER: x,
                        IDENTIFIERS.URL: "",
                        IDENTIFIERS.BQB: BQB.IS,
                    }
                    for x in utils.safe_series_tolist(
                        distinct_members.loc[ind, "member_id"]
                    )
                ]
            )
            for ind in distinct_members.index.unique()
        ],
        index=distinct_members.index.unique(),
        name=SBML_DFS.S_IDENTIFIERS,
    )

    utils.check_unique_index(distinct_members, "distinct_members")

    # combine identical species' sources
    indexed_members = merged_membership.set_index(
        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER, IDENTIFIERS.URL]
    ).sort_index()

    collapsed_sources = [
        source._safe_source_merge(indexed_members.loc[ind][SBML_DFS.S_SOURCE])
        for ind in indexed_members.index.unique()
    ]
    collapsed_sources = pd.Series(
        collapsed_sources, index=indexed_members.index.unique(), name=SBML_DFS.S_SOURCE
    )

    # add sources to unique set components
    distinct_members = distinct_members.to_frame().join(collapsed_sources.to_frame())

    utils.check_unique_index(distinct_members, "distinct_members (with sources)")

    # define set members which already exist as species versus those that must be added
    set_component_species["is_already_included"] = set_component_species[
        "member_id"
    ].isin(reactome_ids[IDENTIFIERS.IDENTIFIER])

    # define the maximum current id so that we can make new ids without collisions
    max_existing_sid = max(
        sbml_dfs_utils.id_formatter_inv(sbml_dfs.species.index.tolist())
    )
    # if s_ids used an alternative convention then they'll be nans here; which is fine
    if max_existing_sid is np.nan:
        max_existing_sid = int(-1)

    # mint fresh sequential s_ids for members without an existing species
    new_species = set_component_species[
        ~set_component_species["is_already_included"]
    ].copy()
    new_species["component_s_id"] = sbml_dfs_utils.id_formatter(
        range(max_existing_sid + 1, max_existing_sid + new_species.shape[0] + 1),
        SBML_DFS.S_ID,
    )

    # define new unique species
    new_species_for_sbml_dfs = (
        new_species.merge(
            distinct_members,
            left_on=[
                IDENTIFIERS.ONTOLOGY,
                IDENTIFIERS.IDENTIFIER,
                IDENTIFIERS.URL,
                "member_s_name",
            ],
            right_index=True,
        )[
            [
                "component_s_id",
                "member_s_name",
                SBML_DFS.S_IDENTIFIERS,
                SBML_DFS.S_SOURCE,
            ]
        ]
        .rename(
            {"component_s_id": SBML_DFS.S_ID, "member_s_name": SBML_DFS.S_NAME}, axis=1
        )
        .set_index(SBML_DFS.S_ID)
        .sort_index()
    )

    utils.check_unique_index(new_species_for_sbml_dfs, "new_species_for_sbml_dfs")

    # combine existing and newly defined set components
    set_component_species_ids = pd.concat(
        [
            # already-known members: attach their existing s_id as component_s_id
            set_component_species[set_component_species["is_already_included"]].merge(
                reactome_ids[[SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER]].rename(
                    {
                        IDENTIFIERS.IDENTIFIER: "member_id",
                        SBML_DFS.S_ID: "component_s_id",
                    },
                    axis=1,
                )
            ),
            new_species,
        ]
    )

    return merged_membership, new_species_for_sbml_dfs, set_component_species_ids
|
855
|
+
|
856
|
+
|
857
|
+
def _add_entity_sets_reactions(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    new_compartmentalized_species_for_sbml_dfs: pd.DataFrame,
    updated_compartmentalized_membership: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Add Entity Sets - Reactions

    Create reactions which indicate membership in an entity set.

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A relational mechanistic network
    new_compartmentalized_species_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.compartmentalized_species
    updated_compartmentalized_membership: pd.DataFrame
        Compartmentalized complex components with updated IDs

    Returns
    -------
    new_reactions_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.reactions
    new_reaction_species_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.reaction_species

    Raises
    ------
    ValueError
        If any set component could not be matched to a compartmentalized species.
    """

    all_compartmentalized_species = pd.concat(
        [sbml_dfs.compartmentalized_species, new_compartmentalized_species_for_sbml_dfs]
    )

    # create a table with named "entity sets" and their members
    # each row will be turned into an "IS A" reaction
    named_set_components = updated_compartmentalized_membership[
        [SBML_DFS.SC_ID, SBML_DFS.SC_NAME, SBML_DFS.SC_SOURCE, "component_sc_id"]
    ].merge(
        all_compartmentalized_species[[SBML_DFS.SC_NAME]].rename(
            {SBML_DFS.SC_NAME: "component_sc_name"}, axis=1
        ),
        left_on="component_sc_id",
        right_index=True,
        how="left",
    )

    if any(named_set_components["component_sc_name"].isna()):
        raise ValueError("Some components could not be merged")

    # define newly added reactions
    max_existing_rid = max(
        sbml_dfs_utils.id_formatter_inv(sbml_dfs.reactions.index.tolist())
    )
    # if r_ids used an alternative convention then they'll be nans here; which is fine.
    # use pd.isna() rather than an identity check against np.nan: a NaN produced
    # inside id_formatter_inv is not guaranteed to be the np.nan singleton
    if pd.isna(max_existing_rid):
        max_existing_rid = int(-1)

    # name the reaction following the "IS A" convention
    named_set_components[SBML_DFS.R_NAME] = [
        f"{comp_sc} IS A {sc}"
        for comp_sc, sc in zip(
            named_set_components["component_sc_name"],
            named_set_components[SBML_DFS.SC_NAME],
        )
    ]

    named_set_components[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
        range(
            max_existing_rid + 1,
            max_existing_rid + named_set_components.shape[0] + 1,
        ),
        SBML_DFS.R_ID,
    )

    # membership reactions inherit the entity set's source; they carry no
    # standalone identifiers of their own
    named_set_components[SBML_DFS.R_SOURCE] = named_set_components[SBML_DFS.SC_SOURCE]
    named_set_components[SBML_DFS.R_IDENTIFIERS] = [
        identifiers.Identifiers([]) for i in range(0, named_set_components.shape[0])
    ]

    new_reactions_for_sbml_dfs = (
        named_set_components[
            [SBML_DFS.R_ID, SBML_DFS.R_NAME, SBML_DFS.R_IDENTIFIERS, SBML_DFS.R_SOURCE]
        ]
        .set_index(SBML_DFS.R_ID)
        .sort_index()
        # "IS A" membership is directional: member -> set
        .assign(r_isreversible=False)
    )

    # define newly added reactions' species

    max_existing_rscid = max(
        sbml_dfs_utils.id_formatter_inv(sbml_dfs.reaction_species.index.tolist())
    )
    if pd.isna(max_existing_rscid):
        max_existing_rscid = int(-1)

    # each "IS A" reaction consumes the member (stoichiometry -1, reactant)
    # and produces the entity set (stoichiometry +1, product)
    new_reaction_species_for_sbml_dfs = pd.concat(
        [
            named_set_components[["component_sc_id", SBML_DFS.R_ID]]
            .rename({"component_sc_id": SBML_DFS.SC_ID}, axis=1)
            .assign(stoichiometry=-1)
            .assign(sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]),
            named_set_components[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
            .assign(stoichiometry=1)
            .assign(sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]),
        ]
    ).sort_values([SBML_DFS.R_ID, SBML_DFS.STOICHIOMETRY])

    new_reaction_species_for_sbml_dfs[SBML_DFS.RSC_ID] = sbml_dfs_utils.id_formatter(
        range(
            max_existing_rscid + 1,
            max_existing_rscid + new_reaction_species_for_sbml_dfs.shape[0] + 1,
        ),
        SBML_DFS.RSC_ID,
    )

    new_reaction_species_for_sbml_dfs = new_reaction_species_for_sbml_dfs.set_index(
        SBML_DFS.RSC_ID
    ).sort_index()

    return new_reactions_for_sbml_dfs, new_reaction_species_for_sbml_dfs
|
975
|
+
|
976
|
+
|
977
|
+
def _add_complex_formation_compartmentalized_species(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    merged_membership: pd.DataFrame,
    new_species_for_sbml_dfs: pd.DataFrame,
    complex_component_species_ids: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Add Complex Formation - Compartmentalized Species

    Define all compartmentalized species in complexes and format newly created
    compartmentalized species.

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A relational mechanistic network
    merged_membership: pd.DataFrame
        A table of complexes and their component members
    new_species_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.species
    complex_component_species_ids: pd.DataFrame
        All complex components

    Returns
    -------
    new_compartmentalized_species_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.compartmentalized_species
    updated_compartmentalized_membership: pd.DataFrame
        Compartmentalized complex components with updated IDs

    Raises
    ------
    ValueError
        If any new compartmentalized species lacks a species or compartment name.
    """

    # filter compartmentalized species to complexes
    complexes = merged_membership[SBML_DFS.S_ID].unique()
    compartmentalized_complexes = sbml_dfs.compartmentalized_species[
        sbml_dfs.compartmentalized_species[SBML_DFS.S_ID].isin(complexes)
    ]

    # create appropriate compartmentalized species
    # merge compartmentalized complexes with their membership
    merged_compartmentalized_membership = (
        compartmentalized_complexes.reset_index().merge(
            merged_membership[
                [
                    SBML_DFS.S_ID,
                    IDENTIFIERS.ONTOLOGY,
                    IDENTIFIERS.IDENTIFIER,
                    IDENTIFIERS.URL,
                ]
            ].merge(complex_component_species_ids)
        )
    )

    # define all of the compartmentalized species that should exist;
    # a left join leaves sc_id as NaN for (component, compartment) pairs
    # that do not yet exist in the model
    complex_component_compartmentalized_species = (
        merged_compartmentalized_membership[["component_s_id", SBML_DFS.C_ID]]
        .drop_duplicates()
        .merge(
            sbml_dfs.compartmentalized_species[[SBML_DFS.S_ID, SBML_DFS.C_ID]]
            .reset_index()
            .rename({SBML_DFS.S_ID: "component_s_id"}, axis=1),
            how="left",
        )
    )

    new_compartmentalized_species = complex_component_compartmentalized_species[
        complex_component_compartmentalized_species[SBML_DFS.SC_ID].isna()
    ].copy()

    # add new identifiers
    max_existing_scid = max(
        sbml_dfs_utils.id_formatter_inv(
            sbml_dfs.compartmentalized_species.index.tolist()
        )
    )
    # pd.isna() rather than "is np.nan": a computed NaN is not guaranteed to be
    # the np.nan singleton, so an identity check can silently miss it
    if pd.isna(max_existing_scid):
        max_existing_scid = int(-1)

    new_compartmentalized_species[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
        range(
            max_existing_scid + 1,
            max_existing_scid + new_compartmentalized_species.shape[0] + 1,
        ),
        SBML_DFS.SC_ID,
    )

    all_species = pd.concat([sbml_dfs.species, new_species_for_sbml_dfs])

    # name new sc_ids and inherit sources from their complexes
    new_compartmentalized_species_names = new_compartmentalized_species.merge(
        all_species[SBML_DFS.S_NAME],
        left_on="component_s_id",
        right_index=True,
        how="left",
    ).merge(
        sbml_dfs.compartments[SBML_DFS.C_NAME],
        left_on=SBML_DFS.C_ID,
        right_index=True,
        how="left",
    )

    if any(new_compartmentalized_species_names[SBML_DFS.S_NAME].isna()):
        raise ValueError("Some species were unnamed")
    if any(new_compartmentalized_species_names[SBML_DFS.C_NAME].isna()):
        raise ValueError("Some compartments were unnamed")

    # name compartmentalized species as "species [compartment]"
    new_compartmentalized_species_names[SBML_DFS.SC_NAME] = [
        f"{s_name} [{c_name}]"
        for s_name, c_name in zip(
            new_compartmentalized_species_names[SBML_DFS.S_NAME],
            new_compartmentalized_species_names[SBML_DFS.C_NAME],
        )
    ]

    # add sources from the complexes that compartmentalized species belong to
    indexed_cmembers = (
        merged_compartmentalized_membership[
            ["component_s_id", SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
        ]
        .set_index(["component_s_id", SBML_DFS.C_ID])
        .sort_index()
    )

    # NOTE(review): ind is a (component_s_id, c_id) tuple from the 2-level
    # MultiIndex, so len(ind) == 1 appears to never hold and the merge_sources
    # branch looks unreachable; the intent may have been to merge when
    # .loc[ind] returns multiple rows — confirm before changing.
    collapsed_csources = [
        (
            source.merge_sources(indexed_cmembers.loc[ind][SBML_DFS.SC_SOURCE].tolist())
            if len(ind) == 1
            else indexed_cmembers.loc[ind][SBML_DFS.SC_SOURCE]
        )
        for ind in indexed_cmembers.index.unique()
    ]
    collapsed_csources = pd.Series(
        collapsed_csources,
        index=indexed_cmembers.index.unique(),
        name=SBML_DFS.SC_SOURCE,
    )

    new_compartmentalized_species_names = new_compartmentalized_species_names.merge(
        collapsed_csources, left_on=["component_s_id", SBML_DFS.C_ID], right_index=True
    )

    new_compartmentalized_species_for_sbml_dfs = (
        new_compartmentalized_species_names[
            [
                SBML_DFS.SC_ID,
                SBML_DFS.SC_NAME,
                "component_s_id",
                SBML_DFS.C_ID,
                SBML_DFS.SC_SOURCE,
            ]
        ]
        .rename({"component_s_id": SBML_DFS.S_ID}, axis=1)
        .set_index(SBML_DFS.SC_ID)
    )

    utils.check_unique_index(
        new_compartmentalized_species_for_sbml_dfs,
        "new_compartmentalized_species_for_sbml_dfs",
    )

    # combine old and new compartmentalized species using current sc_ids
    complex_compartmentalized_components_ids = pd.concat(
        [
            complex_component_compartmentalized_species[
                ~complex_component_compartmentalized_species[SBML_DFS.SC_ID].isna()
            ],
            new_compartmentalized_species,
        ]
    ).rename({SBML_DFS.SC_ID: "component_sc_id"}, axis=1)

    updated_compartmentalized_membership = merged_compartmentalized_membership[
        [
            SBML_DFS.SC_ID,
            SBML_DFS.SC_NAME,
            SBML_DFS.S_ID,
            SBML_DFS.C_ID,
            "component_s_id",
            SBML_DFS.SC_SOURCE,
        ]
    ].merge(complex_compartmentalized_components_ids)

    return (
        new_compartmentalized_species_for_sbml_dfs,
        updated_compartmentalized_membership,
    )
|
1161
|
+
|
1162
|
+
|
1163
|
+
def _read_neo4j_members(neo4j_members: str) -> pd.DataFrame:
    """Read a table containing entity sets (members) derived from Reactome's Neo4J database."""

    # Entity sets are categories of molecular species sharing a common
    # property (e.g. serving as ligands for a receptor). These set -> member
    # relationships are absent from the Reactome .sbml export, so they are
    # pulled out of the Neo4j database instead.
    members_dir, members_file = os.path.split(neo4j_members)
    with open_fs(members_dir) as member_fs, member_fs.open(members_file, "rb") as fh:
        reactome_members = pd.read_csv(fh).assign(url="")

    # normalize ontology labels to lowercase
    reactome_members[IDENTIFIERS.ONTOLOGY] = reactome_members[
        IDENTIFIERS.ONTOLOGY
    ].str.lower()

    # add an uncompartmentalized name by stripping a trailing " [compartment]"
    reactome_members["member_s_name"] = reactome_members["member_name"].map(
        lambda member_name: re.sub(" \\[[A-Za-z ]+\\]$", "", member_name)
    )
    # identifiers are treated uniformly as strings downstream
    reactome_members[IDENTIFIERS.IDENTIFIER] = reactome_members[
        IDENTIFIERS.IDENTIFIER
    ].astype(str)

    return reactome_members
|
1189
|
+
|
1190
|
+
|
1191
|
+
def _merge_reactome_crossref_ids(
    current_molecular_ids: pd.DataFrame,
    select_reactome_ids: pd.DataFrame,
) -> pd.DataFrame:
    """
    Merge Reactome CrossRef IDs

    Combine existing molecular IDs with Reactome crossref identifiers.

    Params
    ------
    current_molecular_ids: pd.DataFrame
        Molecular features in the current pathway model
    select_reactome_ids: pd.DataFrame
        Crossref identifiers produced by _format_reactome_crossref_ids()

    Returns
    -------
    merged_crossrefs: pd.DataFrame
        Molecular feature sids matched to crossref annotations

    """

    # reactome IDs to identifiers.Identifiers
    id_indices = current_molecular_ids.index.unique()

    # loop through all s_id x bqb pairs
    # NOTE(review): if no entry carries a uniprot (or reactome) ontology these
    # lists stay empty and pd.concat below raises ValueError — confirm callers
    # guarantee at least one uniprot annotation
    uniprot_ids = list()
    uniprot_ids_w_reactome = list()
    for ind in id_indices:
        ind_ids = current_molecular_ids.loc[ind]
        ontologies_present = ind_ids[IDENTIFIERS.ONTOLOGY].unique()
        if ONTOLOGIES.UNIPROT in ontologies_present:
            # return all (s_id, bqb) -> uniprot entries
            # save the uniprot source since it will be propagated to new ids joined to the uniprot id
            entry_uniprot_ids = (
                ind_ids.loc[ind_ids[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.UNIPROT]
                .reset_index()[
                    [
                        SBML_DFS.S_ID,
                        IDENTIFIERS.BQB,
                        IDENTIFIERS.IDENTIFIER,
                        SBML_DFS.S_SOURCE,
                    ]
                ]
                .rename({IDENTIFIERS.IDENTIFIER: ONTOLOGIES.UNIPROT}, axis=1)
            )
            # remove trailing dashes in uniprot ids since they are not present in the crossref identifiers
            entry_uniprot_ids[ONTOLOGIES.UNIPROT] = entry_uniprot_ids[
                ONTOLOGIES.UNIPROT
            ].replace("\\-[0-9]+$", "", regex=True)

            uniprot_ids.append(entry_uniprot_ids)

            # add reactome ids to lookup if they exist (they won't for BQB_HAS_PART qualifiers)
            if ONTOLOGIES.REACTOME in ontologies_present:
                # create the all x all cross of bqb-matched reactome and uniprot ids
                entry_reactome = (
                    ind_ids.loc[ind_ids[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.REACTOME]
                    .reset_index()[
                        [SBML_DFS.S_ID, IDENTIFIERS.BQB, IDENTIFIERS.IDENTIFIER]
                    ]
                    .rename({IDENTIFIERS.IDENTIFIER: "reactome_id"}, axis=1)
                )
                uniprot_ids_w_reactome.append(entry_uniprot_ids.merge(entry_reactome))

    uniprot_ids = pd.concat(uniprot_ids)
    uniprot_ids_w_reactome = pd.concat(uniprot_ids_w_reactome)

    # join crossrefs onto entries that have both uniprot and reactome ids
    uni_rct_with_crossrefs = uniprot_ids_w_reactome.merge(select_reactome_ids)
    # check ontologies
    uni_rct_with_crossrefs_ensembl_genes = uni_rct_with_crossrefs.loc[
        uni_rct_with_crossrefs[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.ENSEMBL_GENE,
        SBML_DFS.S_ID,
    ].unique()

    failed_joins = uniprot_ids_w_reactome[
        ~uniprot_ids_w_reactome[SBML_DFS.S_ID].isin(
            uni_rct_with_crossrefs_ensembl_genes
        )
    ]
    # most of the failed joins are pathogens so they wouldn't match to human ensembl genes
    if failed_joins.shape[0] > 0:
        logged_join_fails = failed_joins.sample(min(failed_joins.shape[0], 5)).drop(
            SBML_DFS.S_SOURCE, axis=1
        )
        logger.warning(
            f"{failed_joins.shape[0]} network uniprot IDs were not matched to the Reactome Crossref IDs"
        )

        # fix: the styled example table was previously computed but its return
        # value was discarded; log it, matching the second warning block below
        logger.warning(
            utils.style_df(logged_join_fails, headers="keys", hide_index=True)
        )

    # entries without reactome IDs join just by uniprot
    # outer join back to uni_rct_with_crossrefs so we won't consider a uniprot-only match
    # when a uniprot + reactome match worked [its not entirely clear that this does anything]
    uni_no_rct_with_crossrefs = uniprot_ids.merge(select_reactome_ids).merge(
        uni_rct_with_crossrefs[[SBML_DFS.S_ID, IDENTIFIERS.BQB]].drop_duplicates(),
        how="outer",
        indicator=True,
    )
    # anti-join: keep only (s_id, bqb) pairs not already matched above
    uni_no_rct_with_crossrefs = uni_no_rct_with_crossrefs[
        uni_no_rct_with_crossrefs["_merge"] == "left_only"
    ].drop("_merge", axis=1)

    merged_crossrefs = pd.concat([uni_rct_with_crossrefs, uni_no_rct_with_crossrefs])
    assert (
        uni_rct_with_crossrefs.shape[0] + uni_no_rct_with_crossrefs.shape[0]
    ) == merged_crossrefs.shape[0]

    # report (s_id, bqb) pairs that have a protein ID but no matched gene ID
    species_with_protein_and_no_gene = current_molecular_ids[
        current_molecular_ids[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.UNIPROT
    ].merge(
        merged_crossrefs.loc[
            merged_crossrefs[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.ENSEMBL_GENE,
            [SBML_DFS.S_ID, IDENTIFIERS.BQB],
        ].drop_duplicates(),
        how="outer",
        left_index=True,
        right_on=[SBML_DFS.S_ID, IDENTIFIERS.BQB],
        indicator=True,
    )
    species_with_protein_and_no_gene = species_with_protein_and_no_gene[
        species_with_protein_and_no_gene["_merge"] == "left_only"
    ][[SBML_DFS.S_ID, SBML_DFS.S_NAME, IDENTIFIERS.BQB]].drop_duplicates()

    if species_with_protein_and_no_gene.shape[0] > 0:
        logged_join_fails = species_with_protein_and_no_gene.sample(
            min(species_with_protein_and_no_gene.shape[0], 5)
        )

        logger.warning(
            f"A gene ID could not be found for {species_with_protein_and_no_gene.shape[0]} "
            "(species, bqb) pairs with a protein ID"
        )

        logger.warning(
            utils.style_df(logged_join_fails, headers="keys", hide_index=True)
        )

    return merged_crossrefs
|
1333
|
+
|
1334
|
+
|
1335
|
+
def _format_reactome_crossref_ids(
    crossref_path: str,
) -> pd.DataFrame:
    """
    Format Reactome CrossRef IDs

    Read and reformat Reactome's crossref identifiers.

    Params
    ------
    crossref_path: str
        Path to the cross ref file extracted from Reactome's Neo4j database

    Returns
    -------
    select_reactome_ids: pd.DataFrame
        Crossref identifiers (Pharos and Ensembl entries only)

    """

    base, path = os.path.split(crossref_path)
    with open_fs(base) as bfs:
        with bfs.open(path, "rb") as f:
            reactome_ids = pd.read_csv(f)

    # only use ensembl and pharos for now

    # rename pharos ontology to the internal ontology constant
    pharos_ids = reactome_ids[
        reactome_ids[IDENTIFIERS.ONTOLOGY] == "Pharos - Targets"
    ].copy()
    pharos_ids[IDENTIFIERS.ONTOLOGY] = ONTOLOGIES.PHAROS

    # format ensembl ids using conventions in identifiers.Identifiers
    ensembl_ids = reactome_ids[reactome_ids[IDENTIFIERS.ONTOLOGY] == "Ensembl"].copy()
    # distinguish ensembl genes/transcripts/proteins by the 4-character
    # identifier prefix (e.g. ENSG/ENST/ENSP)
    ensembl_ids["ontology_prefix"] = ensembl_ids[IDENTIFIERS.IDENTIFIER].str.slice(
        start=0, stop=4
    )
    ensembl_ids[IDENTIFIERS.ONTOLOGY] = [
        ENSEMBL_PREFIX_TO_ONTOLOGY[p] for p in ensembl_ids["ontology_prefix"]
    ]
    ensembl_ids = ensembl_ids.drop("ontology_prefix", axis=1)

    select_reactome_ids = pd.concat([pharos_ids, ensembl_ids])

    return select_reactome_ids
|