napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
napistu/modify/gaps.py
ADDED
@@ -0,0 +1,635 @@
from __future__ import annotations

import copy
import logging

import igraph as ig
import numpy as np
import pandas as pd

from napistu import identifiers
from napistu import sbml_dfs_core
from napistu import sbml_dfs_utils
from napistu import source
from napistu import utils
from napistu.network import net_create

from napistu.constants import SBML_DFS
from napistu.constants import COMPARTMENTS
from napistu.constants import IDENTIFIERS
from napistu.constants import MINI_SBO_FROM_NAME
from napistu.constants import SBOTERM_NAMES
from napistu.constants import SOURCE_SPEC

logger = logging.getLogger(__name__)

def add_transportation_reactions(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    exchange_compartment: str = COMPARTMENTS["CYTOSOL"],
) -> sbml_dfs_core.SBML_dfs:
    """
    Add Transportation Reactions

    Identifies proteins whose various compartmentalized forms cannot reach one
    another via existing transportation reactions and then adds transportation
    reactions which connect all forms of a protein.

    Parameters:

    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic model containing a set of molecular species which exist
        in multiple compartments and are interconverted by reactions
    exchange_compartment: str
        The name of an exchange compartment matching a c_name from sbml_dfs.compartments

    Returns:

    sbml_df_with_exchange: sbml_dfs_core.SBML_dfs
        The input sbml_dfs with additional transport reactions and compartmentalized species
        (in the exchange compartment) added.

    """

    # validate arguments
    if not any(sbml_dfs.compartments[SBML_DFS.C_NAME] == exchange_compartment):
        raise ValueError(
            f"{exchange_compartment} is not a compartment defined in sbml_dfs.compartments"
        )

    # find species which need transport reactions
    species_needing_transport_rxns = _identify_species_needing_transport_reactions(
        sbml_dfs=sbml_dfs
    )

    sbml_df_with_exchange = update_sbml_df_with_exchange(
        species_needing_transport_rxns=species_needing_transport_rxns,
        sbml_dfs=sbml_dfs,
        exchange_compartment=exchange_compartment,
    )

    return sbml_df_with_exchange

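# Illustrative usage sketch (editor's addition, not part of the released gaps.py):
# how the public entry point above might be driven end to end. `sbml_dfs` is a
# placeholder for any existing SBML_dfs model (e.g. one built via napistu.ingestion);
# everything else comes from this module's imports.
def _sketch_gap_filling_usage(sbml_dfs: sbml_dfs_core.SBML_dfs) -> sbml_dfs_core.SBML_dfs:
    # add bidirectional transport reactions through the default exchange compartment (cytosol)
    gap_filled_model = add_transportation_reactions(
        sbml_dfs, exchange_compartment=COMPARTMENTS["CYTOSOL"]
    )
    # the returned model is validated inside update_sbml_df_with_exchange(), so it can
    # be used directly, e.g. to rebuild a network graph with napistu.network.net_create
    return gap_filled_model
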
def update_sbml_df_with_exchange(
    species_needing_transport_rxns: np.ndarray,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    exchange_compartment: str = COMPARTMENTS["CYTOSOL"],
) -> sbml_dfs_core.SBML_dfs:
    """

    Update SBML_dfs With Exchange

    Add transportation reactions between all locations of a set of molecular species by
    including bidirectional exchange reactions through an exchange compartment.

    Parameters:

    species_needing_transport_rxns: np.ndarray
        Vector of molecular species (s_ids) with no or insufficient transportation reactions
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic model containing a set of molecular species which exist
        in multiple compartments and are interconverted by reactions
    exchange_compartment: str
        The name of an exchange compartment matching a c_name from sbml_dfs.compartments

    Returns:

    update_sbml_df_with_exchange: sbml_dfs_core.SBML_dfs
        The input sbml_dfs with additional transport reactions and compartmentalized species
        (in the exchange compartment) added.

    """

    exchange_compartment_id = sbml_dfs.compartments[
        sbml_dfs.compartments[SBML_DFS.C_NAME] == exchange_compartment
    ].index.tolist()
    if len(exchange_compartment_id) != 1:
        raise ValueError(
            "The provided exchange compartment matched "
            f"{len(exchange_compartment_id)} compartments - this is unexpected behavior"
        )
    exchange_compartment_id = exchange_compartment_id[0]

    # create a source object with provenance information for the entities that we'll add to the sbml_dfs
    gap_filling_source_obj = source.Source(
        pd.Series(
            {
                SOURCE_SPEC.MODEL: "gap filling",
                SOURCE_SPEC.PATHWAY_ID: "gap_filling",
                SOURCE_SPEC.NAME: "Gap filling to enable transport between all compartments where species is present",
            }
        )
        .to_frame()
        .T
    )

    # initialize an empty identifiers object for gap filled reactions
    gap_filling_id_obj = identifiers.Identifiers([])

    # find species which need exchange reactions but which are not currently present in the exchange compartment
    existing_exchange_cspecies = sbml_dfs.compartmentalized_species[
        sbml_dfs.compartmentalized_species[SBML_DFS.C_ID] == exchange_compartment_id
    ]
    new_exchange_cspecies = set(species_needing_transport_rxns).difference(
        set(existing_exchange_cspecies[SBML_DFS.S_ID].tolist())
    )

    logger.info(
        f"{len(new_exchange_cspecies)} new compartmentalized species must "
        f"be added to the {exchange_compartment} to add protein transportation gap filling"
    )

    # since compartmentalized species are defined by their sid and cid
    # add the defining foreign keys for all new exchange species
    # then we'll add the primary key by autoincrementing existing keys
    new_exchange_cspecies_fks = (
        pd.DataFrame({SBML_DFS.S_ID: list(new_exchange_cspecies)})
        .assign(c_id=exchange_compartment_id)
        .merge(
            sbml_dfs.species[SBML_DFS.S_NAME],
            how="left",
            left_on=SBML_DFS.S_ID,
            right_index=True,
        )
    )
    new_exchange_cspecies_fks[SBML_DFS.SC_NAME] = [
        f"{s_name} [{exchange_compartment}]"
        for s_name in new_exchange_cspecies_fks[SBML_DFS.S_NAME]
    ]
    new_exchange_cspecies_fks = new_exchange_cspecies_fks.drop(SBML_DFS.S_NAME, axis=1)
    new_exchange_cspecies_fks[SBML_DFS.SC_SOURCE] = gap_filling_source_obj

    # update index by incrementing existing keys
    existing_sc_ids = sbml_dfs_utils.id_formatter_inv(
        sbml_dfs.compartmentalized_species.index.tolist()
    )
    # filter np.nan which will be introduced if the key is not the default format
    existing_sc_ids = [x for x in existing_sc_ids if x is not np.nan]
    current_max_sc_id = max(existing_sc_ids)

    new_int_ids = [
        1 + current_max_sc_id + x for x in new_exchange_cspecies_fks.index.tolist()
    ]
    new_exchange_cspecies_fks[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
        new_int_ids, id_type=SBML_DFS.SC_ID
    )
    new_exchange_cspecies_df = new_exchange_cspecies_fks.set_index(SBML_DFS.SC_ID)

    # add new compartmentalized species to sbml_dfs model
    updated_sbml_dfs = copy.deepcopy(sbml_dfs)
    updated_sbml_dfs.compartmentalized_species = pd.concat(
        [updated_sbml_dfs.compartmentalized_species, new_exchange_cspecies_df]
    )

    # define all new transport reactions as an edgelist

    # pull out all cspecies of species needing transport
    cspecies_needing_transport = (
        updated_sbml_dfs.compartmentalized_species[
            updated_sbml_dfs.compartmentalized_species[SBML_DFS.S_ID].isin(
                species_needing_transport_rxns
            )
        ]
        .reset_index()
        .drop(SBML_DFS.SC_SOURCE, axis=1)
    )

    exchange_cspecies = cspecies_needing_transport[
        cspecies_needing_transport[SBML_DFS.C_ID] == exchange_compartment_id
    ].drop(SBML_DFS.C_ID, axis=1)
    non_exchange_cspecies = cspecies_needing_transport[
        cspecies_needing_transport[SBML_DFS.C_ID] != exchange_compartment_id
    ].drop(SBML_DFS.C_ID, axis=1)

    transport_rxn_edgelist = pd.concat(
        [
            # exchange compartment -> non-exchange compartment
            exchange_cspecies.rename(
                {SBML_DFS.SC_ID: "sc_id_from", SBML_DFS.SC_NAME: "sc_name_from"}, axis=1
            ).merge(
                non_exchange_cspecies.rename(
                    {SBML_DFS.SC_ID: "sc_id_to", SBML_DFS.SC_NAME: "sc_name_to"}, axis=1
                )
            ),
            # non-exchange compartment -> exchange compartment
            non_exchange_cspecies.rename(
                {SBML_DFS.SC_ID: "sc_id_from", SBML_DFS.SC_NAME: "sc_name_from"}, axis=1
            ).merge(
                exchange_cspecies.rename(
                    {SBML_DFS.SC_ID: "sc_id_to", SBML_DFS.SC_NAME: "sc_name_to"}, axis=1
                )
            ),
        ]
    )

    # we should add two reactions for each non-exchange compartment cspecies
    # one transporting from the exchange compartment and one transporting into the
    # exchange compartment
    assert transport_rxn_edgelist.shape[0] == 2 * non_exchange_cspecies.shape[0]

    # the rows in this edgelist correspond to new reactions that we'll add
    # to the model
    transport_rxn_edgelist[SBML_DFS.R_NAME] = [
        f"{x} -> {y} gap-filling transport"
        for x, y in zip(
            transport_rxn_edgelist["sc_name_from"], transport_rxn_edgelist["sc_name_to"]
        )
    ]
    transport_rxn_edgelist = transport_rxn_edgelist.reset_index(drop=True)

    # create new reactions, update index by incrementing existing keys

    existing_r_ids = sbml_dfs_utils.id_formatter_inv(sbml_dfs.reactions.index.tolist())
    # filter np.nan which will be introduced if the key is not the default format
    existing_r_ids = [x for x in existing_r_ids if x is not np.nan]
    current_max_r_id = max(existing_r_ids)

    new_int_ids = [
        1 + current_max_r_id + x for x in transport_rxn_edgelist.index.tolist()
    ]
    transport_rxn_edgelist[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
        new_int_ids, id_type=SBML_DFS.R_ID
    )
    new_reactions = (
        transport_rxn_edgelist[[SBML_DFS.R_ID, SBML_DFS.R_NAME]]
        .set_index(SBML_DFS.R_ID)
        .assign(r_Identifiers=gap_filling_id_obj)
        .assign(r_Source=gap_filling_source_obj)
    )

    logger.info(
        f"{len(new_reactions)} new reactions must "
        f"be added to the {exchange_compartment} to add molecular species transportation reactions"
    )

    # add new reactions
    updated_sbml_dfs.reactions = pd.concat([updated_sbml_dfs.reactions, new_reactions])

    # create new reaction species
    # each reaction adds two reaction species - the from and to compartmentalized species
    new_reaction_species = pd.concat(
        [
            transport_rxn_edgelist[["sc_id_from", SBML_DFS.R_ID]]
            .rename({"sc_id_from": SBML_DFS.SC_ID}, axis=1)
            .assign(stoichiometry=-1)
            # substrate
            .assign(sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]),
            transport_rxn_edgelist[["sc_id_to", SBML_DFS.R_ID]]
            .rename({"sc_id_to": SBML_DFS.SC_ID}, axis=1)
            .assign(stoichiometry=1)
            # product
            .assign(sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]),
        ]
    ).reset_index(drop=True)

    existing_rsc_ids = sbml_dfs_utils.id_formatter_inv(
        sbml_dfs.reaction_species.index.tolist()
    )
    # filter np.nan which will be introduced if the key is not the default format
    existing_rsc_ids = [x for x in existing_rsc_ids if x is not np.nan]
    current_max_rsc_id = max(existing_rsc_ids)

    new_int_ids = [
        1 + current_max_rsc_id + x for x in new_reaction_species.index.tolist()
    ]
    new_reaction_species[SBML_DFS.RSC_ID] = sbml_dfs_utils.id_formatter(
        new_int_ids, id_type=SBML_DFS.RSC_ID
    )
    new_reaction_species = new_reaction_species.set_index(SBML_DFS.RSC_ID)

    updated_sbml_dfs.reaction_species = pd.concat(
        [updated_sbml_dfs.reaction_species, new_reaction_species]
    )

    updated_sbml_dfs = sbml_dfs_utils.check_entity_data_index_matching(
        updated_sbml_dfs, SBML_DFS.REACTIONS
    )

    updated_sbml_dfs.validate()

    return updated_sbml_dfs

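# Illustrative sketch (editor's addition, not part of the released gaps.py): the
# primary-key pattern used three times above for sc_ids, r_ids and rsc_ids, pulled out
# on its own. `existing_ids` stands in for an index such as
# sbml_dfs.compartmentalized_species.index; the helper name is hypothetical.
def _sketch_autoincrement_ids(existing_ids: list, n_new: int, id_type: str):
    # recover the integer portion of each formatted key; keys that do not follow the
    # default format come back as np.nan and are dropped
    int_ids = sbml_dfs_utils.id_formatter_inv(list(existing_ids))
    int_ids = [x for x in int_ids if x is not np.nan]
    # new keys continue numbering after the current maximum
    new_int_ids = [max(int_ids) + offset + 1 for offset in range(n_new)]
    return sbml_dfs_utils.id_formatter(new_int_ids, id_type=id_type)
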
def _identify_species_needing_transport_reactions(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
) -> np.ndarray:
    """
    Identify Molecular Species Needing Transport Reactions

    Determine whether each molecular species has sufficient transport reactions
    so all of the compartments where it exists are connected.

    Parameters:

    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic model containing a set of molecular species which exist
        in multiple compartments and are interconverted by reactions

    Returns:

    species_needing_transport_rxns: np.ndarray
        Vector of molecular species (s_ids) with no or insufficient transportation reactions

    """

    # ensure that all genic reaction species can be produced and transported to each
    # compartment where they should exist.
    # we should be able to follow a directed path from a synthesized protein
    # (by default in the nucleoplasm) possibly through multiple complexes and to every
    # other compartmentalized species
    #
    # if a path does not exist then we can create one assuming a path which
    # looks like nucleoplasm > cytoplasm > other compartment

    species_ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)

    # identify all pure protein species - all of their cspecies should be connected
    pure_protein_species = (
        species_ids.query("ontology == 'uniprot' and bqb in ('BQB_IS')")[
            [SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER]
        ]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # identify all species containing protein - these are the species which can be used
    # as links for evaluating whether cspecies are connected

    partial_protein_cspecies = (
        species_ids.query(
            "ontology == 'uniprot' and bqb in ('BQB_IS', 'BQB_HAS_PART')"
        )[[SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER]]
        .drop_duplicates()
        .merge(
            sbml_dfs.compartmentalized_species.reset_index()[
                [SBML_DFS.SC_ID, SBML_DFS.S_ID, SBML_DFS.C_ID]
            ]
        )
        .set_index(IDENTIFIERS.IDENTIFIER)
        .sort_index()
    )

    # create a directed graph
    directed_graph = net_create.create_cpr_graph(
        sbml_dfs, directed=True, graph_type="bipartite"
    )

    # consider each s_id and protein separately
    # if one s_id matches multiple proteins then
    # ideally they should have the same paths but this
    # may not be true if they are part of different protein complexes
    #
    # as a result we can identify compartmentalized species and transport reactions
    # that must exist to support each s_id - identifier pair and then
    # take the union of new entities over proteins matching a given s_id

    cspecies_path_tuple_dict = dict()
    for row in pure_protein_species.itertuples():
        s_id = row.s_id
        uniprot = row.identifier

        comp_specs = sbml_dfs.compartmentalized_species[
            sbml_dfs.compartmentalized_species[SBML_DFS.S_ID] == s_id
        ]

        if comp_specs.shape[0] == 1:
            # the species only exists in one compartment so no transport reactions are needed
            cspecies_path_tuple_dict[(s_id, uniprot)] = {"type": "single-compartment"}
        else:
            # find whether there are valid transportation routes between all of a protein's compartments
            existing_cspecies_paths = _find_existing_inter_cspecies_paths(
                comp_specs, uniprot, directed_graph, partial_protein_cspecies
            )
            if existing_cspecies_paths is not None:
                cspecies_path_tuple_dict[(s_id, uniprot)] = (
                    _eval_existing_inter_cspecies_paths(
                        comp_specs, existing_cspecies_paths
                    )
                )
            else:
                cspecies_path_tuple_dict[(s_id, uniprot)] = {
                    "type": "unreachable cspecies - no transport reactions"
                }

    # reformat dict as a pd.DataFrame
    species_transport_status_dict_list = list()
    for k, v in cspecies_path_tuple_dict.items():
        entry = {SBML_DFS.S_ID: k[0], IDENTIFIERS.IDENTIFIER: k[1], **v}

        species_transport_status_dict_list.append(entry)

    species_transport_status_df = pd.DataFrame(species_transport_status_dict_list)

    # optional logging
    # logger.info(_log_protein_transport_gapfilling(species_transport_status_df))

    # define proteins whose compartmentalized forms are not connected
    proteins_needing_transport_rxns = species_transport_status_df[
        species_transport_status_df["type"].isin(
            [
                "unreachable cspecies - no transport reactions",
                "unreachable cspecies - inadequate transport reactions",
            ]
        )
    ]

    # convert from proteins needing gap filling to species that they match
    # multiple proteins may match a single species so if any of them
    # need gap filling then gap filling will be added for the whole species
    species_needing_transport_rxns = proteins_needing_transport_rxns[
        SBML_DFS.S_ID
    ].unique()

    return species_needing_transport_rxns

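# Illustrative note (editor's addition, not part of the released gaps.py): the status
# strings produced above and how they are interpreted downstream. Only the two
# "unreachable" categories cause a species to be returned for gap filling; the other
# two leave the model untouched.
_SKETCH_TRANSPORT_STATUS_ACTIONS = {
    "single-compartment": "only one compartmentalized form; no transport needed",
    "valid transportation paths": "existing transport reactions already connect all forms",
    "unreachable cspecies - no transport reactions": "add exchange-compartment transport",
    "unreachable cspecies - inadequate transport reactions": "add exchange-compartment transport",
}
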
def _eval_existing_inter_cspecies_paths(
    comp_specs: pd.DataFrame, existing_cspecies_paths: pd.DataFrame
) -> dict:
    """
    Evaluate Existing Inter Compartmentalized Species Paths

    Determine whether paths between compartments found in
    _find_existing_inter_cspecies_paths()
    cover all of the compartments where the protein exists.

    Parameters:

    comp_specs: pd.DataFrame
        Compartmentalized species for a single s_id
    existing_cspecies_paths: pd.DataFrame
        An edgelist of a from and to compartmentalized species
        and a label of the path connecting them.

    Returns:

    species_transport_status: dict
        type: the status category the species falls in
        ?msg: an optional message describing the type

    """

    # If the largest connected component includes all compartmentalized species
    # then we can assume that the transportation reactions which exist are adequate. Note that
    # because the subgraph is directed its topology may still be kind of funky.

    # find the largest connected component
    largest_connected_component = (
        ig.Graph.TupleList(
            existing_cspecies_paths.itertuples(index=False), directed=False
        )
        .clusters()
        .giant()
    )
    largest_connected_component_vertices = [
        v["name"] for v in largest_connected_component.vs
    ]

    if not isinstance(largest_connected_component_vertices, list):
        raise TypeError("largest_connected_component must be a list")

    missing_cspecies = set(comp_specs.index.tolist()).difference(
        set(largest_connected_component_vertices)
    )

    existing_trans_msg = " & ".join(existing_cspecies_paths["paths_str"].tolist())
    if len(missing_cspecies) != 0:
        msg = (
            f"{', '.join(comp_specs['sc_name'][missing_cspecies].tolist())} "  # type: ignore
            "compartmentalized species were not part of transport reactions though "
            f"some transport paths could be found {existing_trans_msg}. Bidirectional "
            "transport reactions with cytoplasm will be added for this species in "
            "all other compartments"
        )
        return {
            "type": "unreachable cspecies - inadequate transport reactions",
            "msg": msg,
        }

    else:
        msg = f"transportation paths between compartmentalized species already exist {existing_trans_msg}"
        return {"type": "valid transportation paths", "msg": msg}

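# Illustrative sketch (editor's addition, not part of the released gaps.py): the
# connectivity criterion used above, applied to a toy edgelist. The sc_ids are
# hypothetical; the point is that any compartmentalized species absent from the giant
# connected component of the path edgelist is reported as unreachable.
def _sketch_giant_component_check() -> set:
    toy_paths = pd.DataFrame(
        {"from": ["SC00000001", "SC00000002"], "to": ["SC00000002", "SC00000001"]}
    )
    all_cspecies = {"SC00000001", "SC00000002", "SC00000003"}
    giant = (
        ig.Graph.TupleList(toy_paths.itertuples(index=False), directed=False)
        .clusters()
        .giant()
    )
    reachable = {v["name"] for v in giant.vs}
    # SC00000003 never appears in a path, so it would be flagged for gap filling
    return all_cspecies.difference(reachable)
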
def _find_existing_inter_cspecies_paths(
    comp_specs: pd.DataFrame,
    uniprot_id: str,
    directed_graph: ig.Graph,
    partial_protein_cspecies: pd.DataFrame,
) -> pd.DataFrame | None:
    """
    Find Existing Inter Compartmentalized Species Paths

    Determine which of the compartments a protein exists in can be reached from one another by
    traversing a directed graph of reactions and molecular species including the protein
    (i.e., paths can involve complexes of the protein of interest).

    Parameters:

    comp_specs: pd.DataFrame
        Compartmentalized species for a single s_id
    uniprot_id: str
        The Uniprot ID for the protein of interest
    directed_graph: ig.Graph
        An igraph version of the sbml_dfs model
    partial_protein_cspecies: pd.DataFrame
        A table of proteins included in each species ID (this includes BQB_HAS_PART
        qualifiers in addition to the BQB_IS qualifiers which generally define
        distinct species)

    Returns:

    existing_cspecies_paths: pd.DataFrame or None
        An edgelist of a from and to compartmentalized species and a label of the path
        connecting them.

    """

    reaction_vertices = np.where(
        [x == "reaction" for x in directed_graph.vs["node_type"]]
    )[0]

    # find all species which include the protein of interest
    valid_links = set(partial_protein_cspecies.loc[uniprot_id][SBML_DFS.SC_ID].tolist())

    # define a subgraph which only uses reactions & species which include the protein of interest
    protein_match_vec = [x in valid_links for x in directed_graph.vs["name"]]
    protein_vertices = np.where(protein_match_vec)[0]
    combined_vertices = np.concatenate((reaction_vertices, protein_vertices), axis=None)

    proteinaceous_subgraph = directed_graph.subgraph(vertices=combined_vertices)

    # find paths along subgraph

    paths_df_dict = dict()
    for a_cspecies in comp_specs.index.tolist():
        to_cspecies = list(set(comp_specs.index.tolist()).difference({a_cspecies}))

        # find a path from a_cspecies to each to_cspecies
        paths = proteinaceous_subgraph.get_shortest_paths(
            v=a_cspecies, to=to_cspecies, output="vpath"
        )

        # create a tabular summary of possible paths (whether or not a valid path was found)
        paths_df = pd.DataFrame(
            {"from": [a_cspecies] * len(to_cspecies), "to": to_cspecies, "paths": paths}
        )

        # filter to valid paths
        paths_df = paths_df.iloc[np.where([p != [] for p in paths_df["paths"]])[0]]
        paths_df["paths_str"] = [
            " -> ".join([proteinaceous_subgraph.vs[x]["node_name"] for x in p])
            for p in paths_df["paths"]
        ]
        paths_df = paths_df.drop("paths", axis=1)

        paths_df_dict[a_cspecies] = paths_df

    existing_cspecies_paths = pd.concat(paths_df_dict.values())

    if existing_cspecies_paths.shape[0] == 0:
        return None
    else:
        return existing_cspecies_paths

def _log_protein_transport_gapfilling(
    species_transport_status_df: pd.DataFrame,
) -> None:
    print(
        utils.style_df(
            species_transport_status_df.value_counts("type").to_frame().reset_index(),
            headers=["Transport Category", "# of Entries"],
            hide_index=True,
        )
    )

    transport_messages_fails = species_transport_status_df[
        species_transport_status_df["type"].isin(
            ["unreachable cspecies - inadequate transport reactions"]
        )
    ]
    if transport_messages_fails.shape[0] > 0:
        print(
            f"Example messages for {transport_messages_fails.shape[0]} species with "
            "some transportation reactions but where not all compartments can be reached\n"
        )

        n_messages = min(5, transport_messages_fails.shape[0])
        transport_message_df = transport_messages_fails.sample(n_messages)

        print("\n\n".join(transport_message_df["msg"].tolist()))

    transport_messages_successes = species_transport_status_df[
        species_transport_status_df["type"].isin(["valid transportation paths"])
    ]
    if transport_messages_successes.shape[0] > 0:
        print(
            "---------------------\nExample messages for "
            f"{transport_messages_successes.shape[0]} species where existing transportation "
            "reactions are sufficient and no gap filling will be applied\n"
        )

        n_messages = min(5, transport_messages_successes.shape[0])
        transport_message_df = transport_messages_successes.sample(n_messages)

        print("\n\n".join(transport_message_df["msg"].tolist()))

    return None