napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
napistu/modify/gaps.py ADDED
@@ -0,0 +1,635 @@
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ import logging
5
+
6
+ import igraph as ig
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from napistu import identifiers
11
+ from napistu import sbml_dfs_core
12
+ from napistu import sbml_dfs_utils
13
+ from napistu import source
14
+ from napistu import utils
15
+ from napistu.network import net_create
16
+
17
+ from napistu.constants import SBML_DFS
18
+ from napistu.constants import COMPARTMENTS
19
+ from napistu.constants import IDENTIFIERS
20
+ from napistu.constants import MINI_SBO_FROM_NAME
21
+ from napistu.constants import SBOTERM_NAMES
22
+ from napistu.constants import SOURCE_SPEC
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
def add_transportation_reactions(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    exchange_compartment: str = COMPARTMENTS["CYTOSOL"],
) -> sbml_dfs_core.SBML_dfs:
    """
    Add Transportation Reactions

    Identify proteins whose compartmentalized forms cannot reach one another
    via the model's existing transportation reactions, then add bidirectional
    transportation reactions (through the exchange compartment) connecting
    all forms of each such protein.

    Parameters:

    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic model containing a set of molecular species which exist
        in multiple compartments and are interconverted by reactions
    exchange_compartment: str
        The name of an exchange compartment matching a c_name from sbml_dfs.compartments

    Returns:

    sbml_df_with_exchange: sbml_dfs_core.SBML_dfs
        The input sbml_dfs with additional transport reactions and compartmentalized species
        (in the exchange compartment) added.
    """

    # the exchange compartment must exist in the model before we route through it
    compartment_names = sbml_dfs.compartments[SBML_DFS.C_NAME]
    if not (compartment_names == exchange_compartment).any():
        raise ValueError(
            f"{exchange_compartment} is not a compartment defined in sbml_dfs.compartments"
        )

    # species whose compartmentalized forms are not mutually reachable
    underconnected_species = _identify_species_needing_transport_reactions(
        sbml_dfs=sbml_dfs
    )

    # wire those species together via the exchange compartment
    return update_sbml_df_with_exchange(
        species_needing_transport_rxns=underconnected_species,
        sbml_dfs=sbml_dfs,
        exchange_compartment=exchange_compartment,
    )
70
+
71
+
72
def update_sbml_df_with_exchange(
    species_needing_transport_rxns: np.ndarray,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    exchange_compartment: str = COMPARTMENTS["CYTOSOL"],
) -> sbml_dfs_core.SBML_dfs:
    """
    Update SBML_dfs With Exchange

    Add transportation reactions between all locations of a set of molecular
    species by including bidirectional exchange reactions through an exchange
    compartment. New compartmentalized species, reactions, and reaction
    species are appended to a deep copy of the input model.

    Parameters:

    species_needing_transport_rxns: np.ndarray
        Vector of molecular species (s_ids) with no or insufficient transportation reactions
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic model containing a set of molecular species which exist
        in multiple compartments and are interconverted by reactions
    exchange_compartment: str
        The name of an exchange compartment matching a c_name from sbml_dfs.compartments

    Returns:

    updated_sbml_dfs: sbml_dfs_core.SBML_dfs
        The input sbml_dfs with additional transport reactions and compartmentalized species
        (in the exchange compartment) added.

    Raises:

    ValueError
        If the exchange compartment name matches zero or multiple compartments.
    """

    # resolve the exchange compartment name to exactly one c_id
    exchange_compartment_id = sbml_dfs.compartments[
        sbml_dfs.compartments[SBML_DFS.C_NAME] == exchange_compartment
    ].index.tolist()
    if len(exchange_compartment_id) != 1:
        raise ValueError(
            "The provided exchange compartment matched "
            f"{len(exchange_compartment_id)} compartments - this is unexpected behavior"
        )
    exchange_compartment_id = exchange_compartment_id[0]

    # create a source object with provenance information for the entities that we'll add to the sbml_dfs
    gap_filling_source_obj = source.Source(
        pd.Series(
            {
                SOURCE_SPEC.MODEL: "gap filling",
                SOURCE_SPEC.PATHWAY_ID: "gap_filling",
                SOURCE_SPEC.NAME: "Gap filling to enable transport between all compartments where species is present",
            }
        )
        .to_frame()
        .T
    )

    # initialize an empty identifiers object for gap filled reactions
    # (gap-filled reactions have no external identifiers by construction)
    gap_filling_id_obj = identifiers.Identifiers([])

    # find species which need exchange reactions but which are not currently present in the exchange compartment
    existing_exchange_cspecies = sbml_dfs.compartmentalized_species[
        sbml_dfs.compartmentalized_species[SBML_DFS.C_ID] == exchange_compartment_id
    ]
    new_exchange_cspecies = set(species_needing_transport_rxns).difference(
        set(existing_exchange_cspecies[SBML_DFS.S_ID].tolist())
    )

    logger.info(
        f"{len(new_exchange_cspecies)} new compartmentalized species must "
        f"be added to the {exchange_compartment} to add protein transportation gap filling"
    )

    # since compartmentalized species are defined by their sid and cid
    # add the defining foreign keys for all new exchange species
    # then we'll add the primary key by autoincrementing existing keys
    new_exchange_cspecies_fks = (
        pd.DataFrame({SBML_DFS.S_ID: list(new_exchange_cspecies)})
        .assign(c_id=exchange_compartment_id)
        .merge(
            sbml_dfs.species[SBML_DFS.S_NAME],
            how="left",
            left_on=SBML_DFS.S_ID,
            right_index=True,
        )
    )
    # human-readable names follow the existing "<species> [<compartment>]" convention
    new_exchange_cspecies_fks[SBML_DFS.SC_NAME] = [
        f"{s_name} [{exchange_compartment}]"
        for s_name in new_exchange_cspecies_fks[SBML_DFS.S_NAME]
    ]
    new_exchange_cspecies_fks = new_exchange_cspecies_fks.drop(SBML_DFS.S_NAME, axis=1)
    new_exchange_cspecies_fks[SBML_DFS.SC_SOURCE] = gap_filling_source_obj

    # update index by incrementing existing keys
    existing_sc_ids = sbml_dfs_utils.id_formatter_inv(
        sbml_dfs.compartmentalized_species.index.tolist()
    )
    # filter np.nan which will be introduced if the key is not the default format
    existing_sc_ids = [x for x in existing_sc_ids if x is not np.nan]
    current_max_sc_id = max(existing_sc_ids)

    new_int_ids = [
        1 + current_max_sc_id + x for x in new_exchange_cspecies_fks.index.tolist()
    ]
    new_exchange_cspecies_fks[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
        new_int_ids, id_type=SBML_DFS.SC_ID
    )
    new_exchange_cspecies_df = new_exchange_cspecies_fks.set_index(SBML_DFS.SC_ID)

    # add new compartmentalized species to a copy of the model so the
    # caller's sbml_dfs is left untouched
    updated_sbml_dfs = copy.deepcopy(sbml_dfs)
    updated_sbml_dfs.compartmentalized_species = pd.concat(
        [updated_sbml_dfs.compartmentalized_species, new_exchange_cspecies_df]
    )

    # define all new transport reactions as an edgelist

    # pull out all cspecies of species needing transport
    cspecies_needing_transport = (
        updated_sbml_dfs.compartmentalized_species[
            updated_sbml_dfs.compartmentalized_species[SBML_DFS.S_ID].isin(
                species_needing_transport_rxns
            )
        ]
        .reset_index()
        .drop(SBML_DFS.SC_SOURCE, axis=1)
    )

    # split into exchange-compartment forms (hubs) and all other forms
    exchange_cspecies = cspecies_needing_transport[
        cspecies_needing_transport[SBML_DFS.C_ID] == exchange_compartment_id
    ].drop(SBML_DFS.C_ID, axis=1)
    non_exchange_cspecies = cspecies_needing_transport[
        cspecies_needing_transport[SBML_DFS.C_ID] != exchange_compartment_id
    ].drop(SBML_DFS.C_ID, axis=1)

    # the merges below join each exchange form to every non-exchange form of
    # the same s_id (merge keys are the shared remaining columns, e.g. s_id)
    transport_rxn_edgelist = pd.concat(
        [
            # exchange compartment -> non-exchange compartment
            exchange_cspecies.rename(
                {SBML_DFS.SC_ID: "sc_id_from", SBML_DFS.SC_NAME: "sc_name_from"}, axis=1
            ).merge(
                non_exchange_cspecies.rename(
                    {SBML_DFS.SC_ID: "sc_id_to", SBML_DFS.SC_NAME: "sc_name_to"}, axis=1
                )
            ),
            # non-exchange compartment -> exchange compartment
            non_exchange_cspecies.rename(
                {SBML_DFS.SC_ID: "sc_id_from", SBML_DFS.SC_NAME: "sc_name_from"}, axis=1
            ).merge(
                exchange_cspecies.rename(
                    {SBML_DFS.SC_ID: "sc_id_to", SBML_DFS.SC_NAME: "sc_name_to"}, axis=1
                )
            ),
        ]
    )

    # we should add two reactions for each non-exchange compartment cspecies
    # one transporting from the exchange compartment and one transporting into the
    # exchange compartment
    assert transport_rxn_edgelist.shape[0] == 2 * non_exchange_cspecies.shape[0]

    # the rows in this edgelist correspond to new reactions that we'll add
    # to the model
    transport_rxn_edgelist[SBML_DFS.R_NAME] = [
        f"{x} -> {y} gap-filling transport"
        for x, y in zip(
            transport_rxn_edgelist["sc_name_from"], transport_rxn_edgelist["sc_name_to"]
        )
    ]
    transport_rxn_edgelist = transport_rxn_edgelist.reset_index(drop=True)

    # create new reactions, update index by incrementing existing keys

    existing_r_ids = sbml_dfs_utils.id_formatter_inv(sbml_dfs.reactions.index.tolist())
    # filter np.nan which will be introduced if the key is not the default format
    existing_r_ids = [x for x in existing_r_ids if x is not np.nan]
    current_max_r_id = max(existing_r_ids)

    new_int_ids = [
        1 + current_max_r_id + x for x in transport_rxn_edgelist.index.tolist()
    ]
    transport_rxn_edgelist[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
        new_int_ids, id_type=SBML_DFS.R_ID
    )
    new_reactions = (
        transport_rxn_edgelist[[SBML_DFS.R_ID, SBML_DFS.R_NAME]]
        .set_index(SBML_DFS.R_ID)
        .assign(r_Identifiers=gap_filling_id_obj)
        .assign(r_Source=gap_filling_source_obj)
    )

    logger.info(
        f"{len(new_reactions)} new reactions must "
        f"be added to the {exchange_compartment} to add molecular species transportation reactions"
    )

    # add new reactions
    updated_sbml_dfs.reactions = pd.concat([updated_sbml_dfs.reactions, new_reactions])

    # create new reaction species
    # each reaction adds two reaction species - the from and to compartmentalized species
    new_reaction_species = pd.concat(
        [
            transport_rxn_edgelist[["sc_id_from", SBML_DFS.R_ID]]
            .rename({"sc_id_from": SBML_DFS.SC_ID}, axis=1)
            .assign(stoichiometry=-1)
            # substrate
            .assign(sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]),
            transport_rxn_edgelist[["sc_id_to", SBML_DFS.R_ID]]
            .rename({"sc_id_to": SBML_DFS.SC_ID}, axis=1)
            .assign(stoichiometry=1)
            # product
            .assign(sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]),
        ]
    ).reset_index(drop=True)

    # autoincrement reaction-species primary keys past the existing maximum
    existing_rsc_ids = sbml_dfs_utils.id_formatter_inv(
        sbml_dfs.reaction_species.index.tolist()
    )
    # filter np.nan which will be introduced if the key is not the default format
    existing_rsc_ids = [x for x in existing_rsc_ids if x is not np.nan]
    current_max_rsc_id = max(existing_rsc_ids)

    new_int_ids = [
        1 + current_max_rsc_id + x for x in new_reaction_species.index.tolist()
    ]
    new_reaction_species[SBML_DFS.RSC_ID] = sbml_dfs_utils.id_formatter(
        new_int_ids, id_type=SBML_DFS.RSC_ID
    )
    new_reaction_species = new_reaction_species.set_index(SBML_DFS.RSC_ID)

    updated_sbml_dfs.reaction_species = pd.concat(
        [updated_sbml_dfs.reaction_species, new_reaction_species]
    )

    # keep any reactions_data tables aligned with the expanded reactions index
    updated_sbml_dfs = sbml_dfs_utils.check_entity_data_index_matching(
        updated_sbml_dfs, SBML_DFS.REACTIONS
    )

    # raises if the updated model is internally inconsistent
    updated_sbml_dfs.validate()

    return updated_sbml_dfs
310
+
311
+
312
def _identify_species_needing_transport_reactions(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
) -> np.ndarray:
    """
    Identify Molecular Species Needing Transport Reactions

    Determine whether each molecular species has sufficient transport reactions
    so all of the compartments where it exists are connected.

    Parameters:

    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic model containing a set of molecular species which exist
        in multiple compartments and are interconverted by reactions

    Returns:

    species_needing_transport_rxns: np.ndarray
        Vector of molecular species (s_ids) with no or insufficient transportation reactions
    """

    # ensure that all genic reaction species can be produced and transported to each
    # compartment where they should exist.
    # we should be able to follow a directed path from a synthesized protein
    # (by default in the nucleoplasm) possibly through multiple complexes and to every
    # other compartmentalized species
    #
    # if a path does not exist then we can create one assuming a path which
    # look like nucleoplasm > cytoplasm > other compartment

    species_ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)

    # identify all pure protein species (BQB_IS uniprot annotations) -
    # all of their cspecies should be connected
    pure_protein_species = (
        species_ids.query("ontology == 'uniprot' and bqb in ('BQB_IS')")[
            [SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER]
        ]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # identify all species containing protein (BQB_IS or BQB_HAS_PART, i.e.,
    # including complexes) - these are the species which can be used
    # as links for evaluating whether cspecies are connected

    partial_protein_cspecies = (
        species_ids.query(
            "ontology == 'uniprot' and bqb in ('BQB_IS', 'BQB_HAS_PART')"
        )[[SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER]]
        .drop_duplicates()
        .merge(
            sbml_dfs.compartmentalized_species.reset_index()[
                [SBML_DFS.SC_ID, SBML_DFS.S_ID, SBML_DFS.C_ID]
            ]
        )
        # indexed by uniprot identifier for fast .loc lookups downstream
        .set_index(IDENTIFIERS.IDENTIFIER)
        .sort_index()
    )

    # create a directed bipartite graph (species and reaction vertices)
    directed_graph = net_create.create_cpr_graph(
        sbml_dfs, directed=True, graph_type="bipartite"
    )

    # consider each s_id and protein separately
    # if one s_id matches multiple proteins then
    # ideally they should have the same paths but this
    # may not be true if they are part of different protein complexes
    #
    # as a result we can identify compartmentalized species and transport reactions
    # that must exist to support each s_id - identifier pair and then
    # take the union of new entities over proteins matching a given s_id

    # maps (s_id, uniprot) -> a status dict with at least a "type" key
    cspecies_path_tuple_dict = dict()
    for row in pure_protein_species.itertuples():
        s_id = row.s_id
        uniprot = row.identifier

        comp_specs = sbml_dfs.compartmentalized_species[
            sbml_dfs.compartmentalized_species[SBML_DFS.S_ID] == s_id
        ]

        if comp_specs.shape[0] == 1:
            # the species only exists in one compartment so no transport reactions are needed
            cspecies_path_tuple_dict[(s_id, uniprot)] = {"type": "single-compartment"}
        else:
            # find whether there are valid transportation routes between all a proteins' compartments
            existing_cspecies_paths = _find_existing_inter_cspecies_paths(
                comp_specs, uniprot, directed_graph, partial_protein_cspecies
            )
            if existing_cspecies_paths is not None:
                # some paths exist - check whether they connect every compartment
                cspecies_path_tuple_dict[(s_id, uniprot)] = (
                    _eval_existing_inter_cspecies_paths(
                        comp_specs, existing_cspecies_paths
                    )
                )
            else:
                cspecies_path_tuple_dict[(s_id, uniprot)] = {
                    "type": "unreachable cspecies - no transport reactions"
                }

    # reformat dict as a pd.DataFrame
    species_transport_status_dict_list = list()
    for k, v in cspecies_path_tuple_dict.items():
        entry = {SBML_DFS.S_ID: k[0], IDENTIFIERS.IDENTIFIER: k[1], **v}

        species_transport_status_dict_list.append(entry)

    species_transport_status_df = pd.DataFrame(species_transport_status_dict_list)

    # optional logging
    # logger.info(_log_protein_transport_gapfilling(species_transport_status_df))

    # define proteins whose compartmentalized forms are not connected
    proteins_needing_transport_rxns = species_transport_status_df[
        species_transport_status_df["type"].isin(
            [
                "unreachable cspecies - no transport reactions",
                "unreachable cspecies - inadequate transport reactions",
            ]
        )
    ]

    # convert from proteins needing gap filling to species that they match
    # multiple proteins may match a single species so if any of them
    # need gap filling then gap filling will be added for the whole species
    species_needing_transport_rxns = proteins_needing_transport_rxns[
        SBML_DFS.S_ID
    ].unique()

    return species_needing_transport_rxns
443
+
444
+
445
def _eval_existing_inter_cspecies_paths(
    comp_specs: pd.DataFrame, existing_cspecies_paths: pd.DataFrame
) -> dict:
    """
    Evaluate Existing Inter Compartmentalized Species Paths

    Determine whether paths between compartments found in
    _find_existing_inter_cspecies_paths()
    cover all of the compartments where the protein exists.

    Parameters:

    comp_specs: pd.DataFrame
        Compartmentalized species for a single s_id (indexed by sc_id)
    existing_cspecies_paths: pd.DataFrame
        An edgelist of a from and to compartmentalized species
        and a label of the path connecting them.

    Returns:

    species_transport_status: dict
        type: the status category the species falls in
        ?msg: an optional message describing the type
    """

    # If the largest connected component includes all compartmentalized species
    # then we can assume that the transportation reactions which exist are adequate. Note that
    # because the subgraph is directed its topology may still be kind of funky.

    # find the largest (weakly) connected component of the path edgelist
    largest_connected_component = (
        ig.Graph.TupleList(
            existing_cspecies_paths.itertuples(index=False), directed=False
        )
        .clusters()
        .giant()
    )
    largest_connected_component_vertices = [
        v["name"] for v in largest_connected_component.vs
    ]

    # cspecies absent from the giant component cannot be reached via existing transport
    missing_cspecies = set(comp_specs.index.tolist()).difference(
        set(largest_connected_component_vertices)
    )

    existing_trans_msg = " & ".join(existing_cspecies_paths["paths_str"].tolist())
    if len(missing_cspecies) != 0:
        # fix: the original assigned only the first f-string fragment to msg -
        # the trailing string literals were standalone statements (the inline
        # comment after the f-string blocked implicit concatenation).
        # Also index with a list rather than a set: set indexers are not
        # supported by pandas.
        missing_names = comp_specs["sc_name"][list(missing_cspecies)].tolist()
        msg = (
            f"{', '.join(missing_names)} "
            "compartmentalized species were not part of transport reactions though "
            f"some transport paths could be found {existing_trans_msg}. Bidirectional "
            "transport reactions with cytoplasm will be added for this species in "
            "all other compartments"
        )
        return {
            "type": "unreachable cspecies - inadequate transport reactions",
            "msg": msg,
        }

    else:
        msg = f"transportation paths between compartmentalized species already exist {existing_trans_msg}"
        return {"type": "valid transportation paths", "msg": msg}
509
+
510
+
511
def _find_existing_inter_cspecies_paths(
    comp_specs: pd.DataFrame,
    uniprot_id: str,
    directed_graph: ig.Graph,
    partial_protein_cspecies: pd.DataFrame,
) -> pd.DataFrame | None:
    """
    Find Existing Inter Compartmentalized Species Paths

    Determine which compartments a protein exists in can be reached from one another by
    traversing a directed graph of reactions and molecular species including the protein
    (i.e., paths can involve complexes of the protein of interest).

    Parameters:

    comp_specs: pd.DataFrame
        Compartmentalized species for a single s_id (indexed by sc_id)
    uniprot_id: str
        The Uniprot ID for the protein of interest
    directed_graph: ig.Graph
        An igraph version of the sbml_dfs model
    partial_protein_cspecies: pd.DataFrame
        A table of proteins included in each species ID (this includes BQB_HAS_PART
        qualifiers in addition to the BQB_IS qualifiers which generally define
        distinct species), indexed by identifier

    Returns:

    existing_cspecies_paths: pd.DataFrame or None
        An edgelist of a from and to compartmentalized species and a label of the path
        connecting them, or None when no inter-compartment path exists.
    """

    # reaction vertices are always kept so paths can traverse reactions
    reaction_vertices = np.where(
        [x == "reaction" for x in directed_graph.vs["node_type"]]
    )[0]

    # find all species which include the protein of interest.
    # fix: use a list indexer with a column selection so a Series is returned
    # even when the uniprot id matches a single row - the original
    # .loc[uniprot_id][SC_ID].tolist() returned a bare scalar in that case
    # and crashed with AttributeError on .tolist()
    valid_links = set(
        partial_protein_cspecies.loc[[uniprot_id], SBML_DFS.SC_ID].tolist()
    )

    # define a subgraph which only uses reactions & species which include the protein of interest
    protein_vertices = np.where(
        [x in valid_links for x in directed_graph.vs["name"]]
    )[0]
    combined_vertices = np.concatenate((reaction_vertices, protein_vertices), axis=None)

    proteinaceous_subgraph = directed_graph.subgraph(vertices=combined_vertices)

    # find paths along the subgraph from each cspecies to every other cspecies

    all_cspecies = comp_specs.index.tolist()
    paths_df_dict = dict()
    for a_cspecies in all_cspecies:
        to_cspecies = list(set(all_cspecies).difference({a_cspecies}))

        # find a path from a_cspecies to each to_cspecies ([] when unreachable)
        paths = proteinaceous_subgraph.get_shortest_paths(
            v=a_cspecies, to=to_cspecies, output="vpath"
        )

        # create a tabular summary of possible paths (whether or not a valid path was found)
        paths_df = pd.DataFrame(
            {"from": [a_cspecies] * len(to_cspecies), "to": to_cspecies, "paths": paths}
        )

        # filter to valid paths and render each as a human-readable label
        paths_df = paths_df.iloc[np.where([p != [] for p in paths_df["paths"]])[0]]
        paths_df["paths_str"] = [
            " -> ".join([proteinaceous_subgraph.vs[x]["node_name"] for x in p])
            for p in paths_df["paths"]
        ]
        paths_df = paths_df.drop("paths", axis=1)

        paths_df_dict[a_cspecies] = paths_df

    existing_cspecies_paths = pd.concat(paths_df_dict.values())

    # None signals "no transport reactions at all" to the caller
    if existing_cspecies_paths.shape[0] == 0:
        return None
    else:
        return existing_cspecies_paths
591
+
592
+
593
def _log_protein_transport_gapfilling(
    species_transport_status_df: pd.DataFrame,
) -> None:
    """Print a per-category summary of transport gap filling plus example messages."""

    # tabulate how many (s_id, protein) entries fall in each status category
    category_counts = (
        species_transport_status_df.value_counts("type").to_frame().reset_index()
    )
    print(
        utils.style_df(
            category_counts,
            headers=["Transport Category", "# of Entries"],
            hide_index=True,
        )
    )

    # entries with some transport reactions but disconnected compartments
    inadequate_entries = species_transport_status_df[
        species_transport_status_df["type"].isin(
            ["unreachable cspecies - inadequate transport reactions"]
        )
    ]
    if inadequate_entries.shape[0] > 0:
        print(
            f"Example messages for {inadequate_entries.shape[0]} species with "
            "some transportation reactions but where not all compartments can be reached\n"
        )

        # show at most five randomly chosen example messages
        sampled_rows = inadequate_entries.sample(min(5, inadequate_entries.shape[0]))
        print("\n\n".join(sampled_rows["msg"].tolist()))

    # entries whose existing transport reactions already suffice
    adequate_entries = species_transport_status_df[
        species_transport_status_df["type"].isin(["valid transportation paths"])
    ]
    if adequate_entries.shape[0] > 0:
        print(
            "---------------------\nExample messages for "
            f"{adequate_entries.shape[0]} species where existing transportation "
            "reactions are sufficient and no gap filling will be applied\n"
        )

        sampled_rows = adequate_entries.sample(min(5, adequate_entries.shape[0]))
        print("\n\n".join(sampled_rows["msg"].tolist()))

    return None