napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
@@ -0,0 +1,597 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ import igraph as ig
6
+ import pandas as pd
7
+ from napistu import sbml_dfs_core
8
+ from napistu import utils
9
+ from napistu.constants import SBML_DFS
10
+ from napistu.constants import CPR_EDGELIST
11
+ from napistu.constants import CPR_EDGELIST_REQ_VARS
12
+ from napistu.constants import IDENTIFIERS
13
+ from napistu.constants import IDENTIFIER_EDGELIST_REQ_VARS
14
+ from napistu.constants import SPECIES_IDENTIFIERS_REQUIRED_VARS
15
+ from napistu.network.constants import CPR_GRAPH_EDGES
16
+ from napistu.network import paths
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
def features_to_pathway_species(
    feature_identifiers: pd.DataFrame,
    species_identifiers: pd.DataFrame,
    ontologies: set,
    feature_id_var: str,
) -> pd.DataFrame:
    """
    Features to Pathway Species

    Match a table of molecular species to their corresponding species in a pathway representation.

    Parameters:
    feature_identifiers: pd.DataFrame
        pd.Dataframe containing a "feature_id_var" variable used to match entries
    species_identifiers: pd.DataFrame
        A table of molecular species identifiers produced from sbml_dfs.get_identifiers("species")
        generally using sbml_dfs_core.export_sbml_dfs()
    ontologies: set
        A set of ontologies used to match features to pathway species
    feature_id_var: str
        Variable in "feature_identifiers" containing identifiers

    Returns:
    pathway_species: pd.DataFrame
        species_identifiers joined to feature_identifiers based on shared identifiers.
        When no identifier matches, a warning is logged and the (empty) merged
        table is returned.

    Raises:
    ValueError
        If "feature_id_var" is not a column of feature_identifiers, if
        species_identifiers is missing required variables, or if any of the
        requested ontologies is absent from species_identifiers.
    """

    # map features to molecular features in the pathway
    if feature_id_var not in feature_identifiers.columns.to_list():
        raise ValueError(
            f"{feature_id_var} must be a variable in 'feature_identifiers', "
            f"possible variables are {', '.join(feature_identifiers.columns.tolist())}"
        )

    # check identifiers table
    _check_species_identifiers_table(species_identifiers)

    available_ontologies = set(species_identifiers[IDENTIFIERS.ONTOLOGY].tolist())
    unavailable_ontologies = ontologies.difference(available_ontologies)

    # no ontologies present
    if len(unavailable_ontologies) == len(ontologies):
        raise ValueError(
            f"None of the requested ontologies ({', '.join(ontologies)}) "
            "were used to annotate pathway species. Available ontologies are: "
            f"{', '.join(available_ontologies)}"
        )

    # 1+ desired ontologies are not present
    if len(unavailable_ontologies) > 0:
        raise ValueError(
            f"Some of the requested ontologies ({', '.join(unavailable_ontologies)}) "
            "were NOT used to annotate pathway species. Available ontologies are: "
            f"{', '.join(available_ontologies)}"
        )

    relevant_identifiers = species_identifiers[
        species_identifiers[IDENTIFIERS.ONTOLOGY].isin(ontologies)
    ]

    # map features to pathway species (inner join on the identifier value)
    pathway_species = feature_identifiers.merge(
        relevant_identifiers, left_on=feature_id_var, right_on=IDENTIFIERS.IDENTIFIER
    )

    if pathway_species.shape[0] == 0:
        # The empty table is still returned (not None): the declared return
        # type is a DataFrame and edgelist_to_pathway_species indexes the result.
        logger.warning(
            "None of the provided species identifiers matched entries of the pathway; "
            "returning an empty table"
        )

    # TODO: report the fraction of unmapped species

    return pathway_species
95
+
96
+
97
def edgelist_to_pathway_species(
    formatted_edgelist: pd.DataFrame, species_identifiers: pd.DataFrame, ontologies: set
) -> pd.DataFrame:
    """
    Edgelist to Pathway Species

    Map both endpoints of every edge in an edgelist onto pathway species by
    joining on shared identifiers.

    Parameters:
    formatted_edgelist: pd.DataFrame
        Table with "identifier_upstream" and "identifier_downstream" variables
        holding the identifiers to match
    species_identifiers: pd.DataFrame
        A table of molecular species identifiers produced from
        sbml_dfs.get_identifiers("species") generally using
        sbml_dfs_core.export_sbml_dfs()
    ontologies: set
        A set of ontologies used to match features to pathway species

    Returns:
    edges_on_pathway: pd.DataFrame
        formatted_edgelist with upstream features mapped to "s_id_upstream"
        and downstream species mapped to "s_id_downstream". Only edges whose
        two endpoints both matched are kept (both joins are inner joins).

    Raises:
    ValueError
        If either identifier variable is missing from formatted_edgelist.
    """

    endpoint_vars = {
        CPR_EDGELIST.IDENTIFIER_UPSTREAM,
        CPR_EDGELIST.IDENTIFIER_DOWNSTREAM,
    }
    absent_vars = endpoint_vars.difference(set(formatted_edgelist.columns.tolist()))

    if len(absent_vars) > 0:
        raise ValueError(
            f"{len(absent_vars)} required variables were "
            "missing from 'formatted_edgelist': "
            f"{', '.join(absent_vars)}"
        )

    # stack upstream and downstream identifiers into one de-duplicated column;
    # the concatenated Series is unnamed, so to_frame() labels its column 0
    stacked_identifiers = pd.concat(
        [
            formatted_edgelist[CPR_EDGELIST.IDENTIFIER_UPSTREAM],
            formatted_edgelist[CPR_EDGELIST.IDENTIFIER_DOWNSTREAM],
        ]
    )
    distinct_identifiers = stacked_identifiers.drop_duplicates().reset_index(drop=True)
    distinct_identifiers = distinct_identifiers.to_frame().rename(
        {0: "feature_id"}, axis=1
    )

    # look up the pathway species (s_id) behind each distinct identifier
    features_on_pathway = features_to_pathway_species(
        feature_identifiers=distinct_identifiers,
        species_identifiers=species_identifiers,
        ontologies=ontologies,
        feature_id_var="feature_id",
    )

    # one identifier -> s_id lookup, relabelled once per edge endpoint
    s_id_lookup = features_on_pathway[[SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER]]
    upstream_lookup = s_id_lookup.rename(
        {
            SBML_DFS.S_ID: CPR_EDGELIST.S_ID_UPSTREAM,
            IDENTIFIERS.IDENTIFIER: CPR_EDGELIST.IDENTIFIER_UPSTREAM,
        },
        axis=1,
    )
    downstream_lookup = s_id_lookup.rename(
        {
            SBML_DFS.S_ID: CPR_EDGELIST.S_ID_DOWNSTREAM,
            IDENTIFIERS.IDENTIFIER: CPR_EDGELIST.IDENTIFIER_DOWNSTREAM,
        },
        axis=1,
    )

    # attach s_ids to both ends of each edge (joins use the shared columns)
    edges_with_upstream = formatted_edgelist.merge(upstream_lookup)
    edges_on_pathway = edges_with_upstream.merge(downstream_lookup)

    return edges_on_pathway
180
+
181
+
182
def edgelist_to_scids(
    formatted_edgelist: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    species_identifiers: pd.DataFrame,
    ontologies: set,
):
    """
    Edgelist to Compartmentalized Species IDs

    Map an edgelist of possible mechanistic interactions onto the
    compartmentalized species of a pathway model.

    Parameters:
    formatted_edgelist: pd.DataFrame
        Table with "identifier_upstream" and "identifier_downstream"
        variables holding the identifiers to match
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic model
    species_identifiers: pd.DataFrame
        A table of molecular species identifiers produced from
        sbml_dfs.get_identifiers("species") generally using
        sbml_dfs_core.export_sbml_dfs()
    ontologies: set
        A set of ontologies used to match features to pathway species

    Returns:
    edgelist_w_scids: pd.DataFrame
        formatted_edgelist with upstream features mapped to "sc_id_upstream"
        and downstream species mapped to "sc_id_downstream". A species found
        in several compartments yields one row per compartmentalized pairing.
    """

    _check_species_identifiers_table(species_identifiers)

    # first resolve identifiers to species ids (s_ids)
    edges_on_pathway = edgelist_to_pathway_species(
        formatted_edgelist=formatted_edgelist,
        species_identifiers=species_identifiers,
        ontologies=ontologies,
    )

    # distinct species pairs which need to be expanded to sc_ids
    pair_vars = [CPR_EDGELIST.S_ID_UPSTREAM, CPR_EDGELIST.S_ID_DOWNSTREAM]
    s_id_pairs = edges_on_pathway[pair_vars].drop_duplicates()

    # s_id <-> sc_id lookups, one labelled for each end of an edge
    # NOTE(review): assumes compartmentalized_species is indexed by sc_id so
    # that reset_index() exposes it as a column - confirm against SBML_dfs
    upstream_cspecies = (
        sbml_dfs.compartmentalized_species[[SBML_DFS.S_ID]]
        .reset_index()
        .rename(
            {
                SBML_DFS.S_ID: CPR_EDGELIST.S_ID_UPSTREAM,
                SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_UPSTREAM,
            },
            axis=1,
        )
    )
    downstream_cspecies = (
        sbml_dfs.compartmentalized_species[[SBML_DFS.S_ID]]
        .reset_index()
        .rename(
            {
                SBML_DFS.S_ID: CPR_EDGELIST.S_ID_DOWNSTREAM,
                SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_DOWNSTREAM,
            },
            axis=1,
        )
    )
    sc_id_pairs = s_id_pairs.merge(upstream_cspecies).merge(downstream_cspecies)

    # join the (s_id_upstream, s_id_downstream) -> sc_ids lookup back onto the edges
    edgelist_w_scids = edges_on_pathway.merge(sc_id_pairs)

    n_mapped = edgelist_w_scids.shape[0]
    logger_msg = (
        f"{n_mapped} interactions mapped "
        "onto pairs of compartmentalized species in the mechanistic model"
    )
    # an empty mapping is worth a warning; otherwise this is informational
    report = logger.warning if n_mapped == 0 else logger.info
    report(logger_msg)

    return edgelist_w_scids
262
+
263
+
264
def filter_to_direct_mechanistic_interactions(
    formatted_edgelist: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    species_identifiers: pd.DataFrame,
    ontologies: set,
) -> pd.DataFrame:
    """
    Filter to Direct Mechanistic Interactions

    Filter an edgelist to direct mechanistic interactions, i.e. pairs of
    compartmentalized species which take part in the same reaction.

    Parameters:
    formatted_edgelist: pd.DataFrame
        pd.Dataframe containing a "identifier_upstream" and "identifier_downstream"
        variables used to match entries (or an edgelist which already carries
        the s_id/sc_id variables, in which case identifier matching is skipped)
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic model
    species_identifiers: pd.DataFrame
        A table of molecular species identifiers
        produced from sbml_dfs.get_identifiers("species") generally
        using sbml_dfs_core.export_sbml_dfs()
    ontologies: set
        A set of ontologies used to match features to pathway species

    Returns:
    edgelist_w_direct_mechanistic_interactions: pd.DataFrame
        formatted_edgelist filtered to mechanistic reactions present in the
        pathway representation, annotated with the reaction id/name and the
        upstream/downstream species names
    """

    edgelist_w_scids = _edgelist_to_scids_if_needed(
        formatted_edgelist, sbml_dfs, species_identifiers, ontologies
    )

    # reduce to distinct sc_id pairs
    # NOTE(review): CPR_EDGELIST_REQ_VARS is defined elsewhere and may be a set
    # (TODO confirm); pandas >= 2.0 rejects set indexers, so pass a list.
    sc_id_pairs = edgelist_w_scids[list(CPR_EDGELIST_REQ_VARS)].drop_duplicates()

    reaction_species = sbml_dfs.reaction_species
    stoichiometry = reaction_species[SBML_DFS.STOICHIOMETRY]

    def _participants(mask: pd.Series, sc_id_var: str) -> pd.DataFrame:
        """Select (r_id, sc_id) rows matching mask, labelling sc_id as sc_id_var."""
        return reaction_species[mask][[SBML_DFS.R_ID, SBML_DFS.SC_ID]].rename(
            {SBML_DFS.SC_ID: sc_id_var}, axis=1
        )

    # roles are read off the sign of the stoichiometry
    modifiers = stoichiometry == 0
    substrates = stoichiometry < 0
    products = stoichiometry > 0

    # define all existing direct regulatory interactions; each pairing joins
    # the two roles within a reaction (the shared column is the reaction id)
    role_pairings = (
        (modifiers, substrates),  # pair 0 -> <0 # modifiers affect substrates
        (substrates, products),  # pair <0 -> >0 # substrates affect products
        (modifiers, products),  # pair 0 -> >0 # modifiers affect products
    )
    pathway_interactions = pd.concat(
        [
            _participants(upstream_mask, CPR_EDGELIST.SC_ID_UPSTREAM).merge(
                _participants(downstream_mask, CPR_EDGELIST.SC_ID_DOWNSTREAM)
            )
            for upstream_mask, downstream_mask in role_pairings
        ]
    ).reset_index(drop=True)

    # filter pathway interactions based on matches to sc_id_pairs
    direct_edge_interactions = (
        sc_id_pairs.merge(pathway_interactions)
        # add species metadata for matches
        .merge(
            sbml_dfs.species[SBML_DFS.S_NAME]
            .to_frame()
            .rename({SBML_DFS.S_NAME: CPR_EDGELIST.S_NAME_UPSTREAM}, axis=1),
            left_on=CPR_EDGELIST.S_ID_UPSTREAM,
            right_index=True,
        )
        .merge(
            sbml_dfs.species[SBML_DFS.S_NAME]
            .to_frame()
            .rename({SBML_DFS.S_NAME: CPR_EDGELIST.S_NAME_DOWNSTREAM}, axis=1),
            left_on=CPR_EDGELIST.S_ID_DOWNSTREAM,
            right_index=True,
        )
        # add metadata for reactions where interaction occurs
        .merge(
            sbml_dfs.reactions[SBML_DFS.R_NAME].to_frame(),
            left_on=SBML_DFS.R_ID,
            right_index=True,
        )
    )

    # keep only the input edges backed by a direct interaction, with annotations
    edgelist_w_direct_mechanistic_interactions = edgelist_w_scids.merge(
        direct_edge_interactions[
            [
                CPR_EDGELIST.SC_ID_UPSTREAM,
                CPR_EDGELIST.SC_ID_DOWNSTREAM,
                SBML_DFS.R_ID,
                CPR_EDGELIST.S_NAME_UPSTREAM,
                CPR_EDGELIST.S_NAME_DOWNSTREAM,
                SBML_DFS.R_NAME,
            ]
        ]
    )

    return edgelist_w_direct_mechanistic_interactions
381
+
382
+
383
def filter_to_indirect_mechanistic_interactions(
    formatted_edgelist: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    species_identifiers: pd.DataFrame,
    cpr_graph: ig.Graph,
    ontologies: set,
    precomputed_distances=None,
    max_path_length=10,
) -> pd.DataFrame | None:
    """
    Filter to Indirect Mechanistic Interactions

    Filter an edgelist to indirect mechanistic interactions.
    Indirect relationships are identified by searching a
    network for paths from an upstream species to a downstream species

    Parameters:
    formatted_edgelist: pd.DataFrame
        pd.Dataframe containing a "identifier_upstream" and
        "identifier_downstream" variables used to match entries
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic model
    species_identifiers: pandas.DataFrame
        A table of molecular species identifiers produced from
        sbml_dfs.get_identifiers("species") generally using sbml_dfs_core.export_sbml_dfs()
    cpr_graph: igraph.Graph
        A network representation of the sbml_dfs model
    ontologies: set
        A set of ontologies used to match features to pathway species
    precomputed_distances: None or a pd.DataFrame containing path lengths and weights
        between pairs of cspecies.
    max_path_length: int
        Maximum number of steps to consider.

    Returns:
    edgelist_w_indirect_mechanistic_interactions: pd.DataFrame or None
        formatted_edgelist filtered to mechanistic reactions which can be described
        by an indirect mechanism. The mechanism is described by a path weight, length,
        and a vpath and epath list of vertices and edges which were traversed to create the path.
        None is returned (with a warning) when no pair can be linked by a path.

    Raises:
    ValueError
        If the destination of a path cannot be determined from its final edges.
    """

    edgelist_w_scids = _edgelist_to_scids_if_needed(
        formatted_edgelist, sbml_dfs, species_identifiers, ontologies
    )

    if precomputed_distances is not None:
        # rename to match conventions in precomputed_distances
        # filter by these precomputed distances and then restore naming
        edgelist_w_scids = paths._filter_paths_by_precomputed_distances(
            edgelist_w_scids.rename(
                {
                    CPR_EDGELIST.SC_ID_UPSTREAM: CPR_EDGELIST.SC_ID_ORIGIN,
                    CPR_EDGELIST.SC_ID_DOWNSTREAM: CPR_EDGELIST.SC_ID_DEST,
                },
                axis=1,
            ),
            precomputed_distances,
        ).rename(
            {
                CPR_EDGELIST.SC_ID_ORIGIN: CPR_EDGELIST.SC_ID_UPSTREAM,
                CPR_EDGELIST.SC_ID_DEST: CPR_EDGELIST.SC_ID_DOWNSTREAM,
            },
            axis=1,
        )

    # find paths from 1 upstream to all desired downstream sc_ids
    # (this is the convention with igraph)
    indexed_origin_vertices = edgelist_w_scids.set_index(CPR_EDGELIST.SC_ID_UPSTREAM)

    # loop through upstream cspecies and find paths to all downstream species
    global_dict = dict()
    for an_origin_index in indexed_origin_vertices.index.unique():  # type: ignore
        origin_targets = indexed_origin_vertices.loc[
            an_origin_index
        ]  # type: pd.DataFrame

        # if indexing only a single entry pd.DataFrame becomes a pd.Series
        # convert back to DataFrame for consistency
        origin_targets = utils.ensure_pd_df(origin_targets)

        # log entry for debugging
        logger.debug(
            f"finding paths from {an_origin_index} to "
            f"{origin_targets.shape[0]} target vertices"
        )

        # find all paths from indexed_origin to desired destination
        shortest_paths = paths.find_shortest_reaction_paths(
            cpr_graph,
            sbml_dfs,
            origin=an_origin_index,
            # find all unique destinations (as a list for compatibility with igraph dest)
            dest=origin_targets[CPR_EDGELIST.SC_ID_DOWNSTREAM].unique().tolist(),
            weight_var=CPR_GRAPH_EDGES.WEIGHTS,
        )

        if shortest_paths is None:
            continue

        vertices, edges = shortest_paths
        indexed_edges = edges.set_index("path")
        indexed_vertices = vertices.set_index("path")

        paths_list = list()
        for ind in indexed_edges.index.unique():
            one_path = indexed_edges.loc[ind]

            # a single-edge path is returned as a Series; make sure that we
            # are working with a DF (isinstance also covers Series subclasses)
            if isinstance(one_path, pd.Series):
                one_path = one_path.to_frame().T

            n_steps = one_path.shape[0]
            if n_steps > max_path_length:
                continue

            # find the destination node
            # this is annoying because if the graph is undirected
            # its not clear if the from or to edge is the actual destination
            # when taking advantage of the fact that igraph lets you
            # look up multiple destinations at once this information is lost
            #
            # the destination is the endpoint of the last edge which is neither
            # the origin nor an endpoint of the second-to-last edge
            ancestor_species = {an_origin_index}
            if n_steps > 1:
                penultimate_edge = one_path.iloc[-2]
                ancestor_species = ancestor_species.union(
                    {
                        penultimate_edge[CPR_GRAPH_EDGES.FROM],
                        penultimate_edge[CPR_GRAPH_EDGES.TO],
                    }
                )

            terminal_edge = one_path.iloc[-1]
            ending_cspecies = {
                terminal_edge[CPR_GRAPH_EDGES.FROM],
                terminal_edge[CPR_GRAPH_EDGES.TO],
            }.difference(
                ancestor_species
            )  # type: ignore

            if len(ending_cspecies) != 1:
                raise ValueError(
                    "The terminal edge could not be determined when summarizing paths"
                )
            ending_cspecies = ending_cspecies.pop()

            path_series = pd.Series(
                {
                    CPR_GRAPH_EDGES.FROM: an_origin_index,
                    CPR_GRAPH_EDGES.TO: ending_cspecies,
                    "weight": sum(one_path[CPR_GRAPH_EDGES.WEIGHTS]),
                    "path_length": n_steps,
                    "vpath": indexed_vertices.loc[ind],
                    "epath": one_path,
                }  # type: ignore
            )  # type: pd.Series

            paths_list.append(path_series)

        if paths_list:
            origin_paths = pd.DataFrame(paths_list)
            global_dict[an_origin_index] = origin_paths

    if not global_dict:
        logger.warning(
            "None of the provide molecular pairs could be mechanistically linked with a network path"
        )
        return None

    all_shortest_paths = pd.concat(global_dict.values())

    # keep only the requested pairs for which a path was found
    indirect_shortest_paths = edgelist_w_scids.merge(
        all_shortest_paths,
        left_on=[CPR_EDGELIST.SC_ID_UPSTREAM, CPR_EDGELIST.SC_ID_DOWNSTREAM],
        right_on=[CPR_GRAPH_EDGES.FROM, CPR_GRAPH_EDGES.TO],
    )

    return indirect_shortest_paths
553
+
554
+
555
def _edgelist_to_scids_if_needed(
    edgelist: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    species_identifiers: pd.DataFrame,
    ontologies: set,
) -> pd.DataFrame:
    """
    Return an edgelist carrying compartmentalized species ids.

    If the edgelist already has every variable in CPR_EDGELIST_REQ_VARS it is
    returned untouched; otherwise its identifiers are matched to the pathway
    with edgelist_to_scids().
    """

    already_mapped = utils.match_pd_vars(edgelist, CPR_EDGELIST_REQ_VARS).are_present
    if already_mapped:
        logger.info(
            f"An edgelist with {', '.join(CPR_EDGELIST_REQ_VARS)} was provided; identifier matching will be skipped"
        )
        return edgelist

    # identifier matching is needed: validate both inputs before mapping
    utils.match_pd_vars(edgelist, IDENTIFIER_EDGELIST_REQ_VARS).assert_present()
    _check_species_identifiers_table(species_identifiers)

    return edgelist_to_scids(
        edgelist,
        sbml_dfs=sbml_dfs,
        species_identifiers=species_identifiers,
        ontologies=ontologies,
    )
581
+
582
+
583
def _check_species_identifiers_table(
    species_identifiers: pd.DataFrame,
    required_vars: set = SPECIES_IDENTIFIERS_REQUIRED_VARS,
):
    """
    Validate that a species identifiers table has all required columns.

    Parameters:
    species_identifiers: pd.DataFrame
        The table to validate
    required_vars: set
        Column names which must be present

    Returns:
    None

    Raises:
    ValueError
        If one or more required columns are absent; the message lists them.
    """

    observed_vars = set(species_identifiers.columns.tolist())
    missing_required_vars = required_vars.difference(observed_vars)

    if missing_required_vars:
        raise ValueError(
            f"{len(missing_required_vars)} required variables "
            "were missing from the species_identifiers table: "
            f"{', '.join(missing_required_vars)}"
        )

    return None
@@ -0,0 +1,10 @@
1
+ from __future__ import annotations
2
+
3
+ from importlib.metadata import PackageNotFoundError
4
+ from importlib.metadata import version
5
+
6
try:
    # Look the version up under the name this distribution is published as
    # ("napistu"); querying the legacy "calicolabs-cpr" name never matches a
    # normal install, which left __version__ permanently undefined.
    __version__ = version("napistu")
except PackageNotFoundError:
    # package is not installed (e.g. running from a source checkout);
    # __version__ is intentionally left undefined in that case
    pass
@@ -0,0 +1,86 @@
1
+ """Module to contain constants for the modify submodule"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pandas as pd
6
+
7
# Labels recognised as annotation types by the modify submodule.
# NOTE(review): presumably these are the curation categories accepted by
# napistu.modify.curation - confirm against the callers of this constant.
VALID_ANNOTATION_TYPES = [
    "foci",
    "reactions",
    "species",
    "compartments",
    "compartmentalized_species",
    "reaction_species",
    "remove",
]
16
+
17
# Rules describing when a set of cofactors should be filtered from a reaction,
# keyed by a human-readable rule name. Each rule may define:
# if_all defines reaction species which must all be present for a filter to occur
# except_any defines reaction species which will override "if_all"
# as_substrate defines reaction species which must be present as a substrate for filtering to occur
# The species labels used here are the "cofactor" labels of COFACTOR_CHEBI_IDS.
COFACTOR_SCHEMA = {
    "ATP PO4 donation": {"if_all": ["ATP", "ADP"], "except_any": ["AMP"]},
    "GTP PO4 donation": {"if_all": ["GTP", "GDP"]},
    "ATP PPi donation": {"if_all": ["ATP", "AMP"], "except_any": ["ADP"]},
    "NADH H- donation": {"if_all": ["NADH", "NAD+"], "as_substrate": ["NADH"]},
    "NADPH H- donation": {"if_all": ["NADPH", "NADP+"], "as_substrate": ["NADPH"]},
    "SAH methyltransferase": {"if_all": ["SAH", "SAM"]},
    "Glutathione oxidation": {"if_all": ["GSSG", "GSH"], "except_any": ["NADPH"]},
    # rule currently disabled:
    # "Glutamine aminotransferase" :
    # {"if_all" : ["Gln", "Glu"],
    # "except_any" : ["ATP"]},
    "Water": {"if_all": ["water"]},
    "PO4": {"if_all": ["PO4"]},
    "PPi": {"if_all": ["PPi"]},
    "H+": {"if_all": ["H+"]},
    "O2": {"if_all": ["O2"]},
    "CO2": {"if_all": ["CO2"]},
    "Na+": {"if_all": ["Na+"]},
    "Cl-": {"if_all": ["Cl-"]},
    "CoA": {"if_all": ["CoA"]},
    "HCO3-": {"if_all": ["HCO3"]},
}
42
+
43
# ChEBI ids grouped by cofactor label; a label may map to several ids
# (e.g. alternative charge states). Insertion order fixes the row order of
# COFACTOR_CHEBI_IDS.
_CHEBI_IDS_BY_COFACTOR = {
    "ADP": (
        456216,  # ADP(3−)
        16761,
    ),
    "AMP": (16027,),
    "ATP": (
        30616,  # ATP(4-)
        15422,
    ),
    "CO2": (16526,),
    "HCO3": (17544,),
    "H2CO3": (28976,),
    "GDP": (17552,),
    "GSH": (16856,),
    "GSSG": (17858,),
    "GTP": (15996,),
    "Glu": (29985,),
    "Gln": (58359,),
    "H+": (15378, 24636),
    "O2": (15379,),
    "NADH": (
        57945,  # NADH(2−)
        16908,  # NADH
    ),
    "NAD+": (
        57540,  # NAD(1-)
        15846,  # NAD(+)
    ),
    "NADPH": (16474,),
    "NADP+": (
        18009,
        58349,  # NADP(3−)
    ),
    "PO4": (18367,),
    "PPi": (
        29888,  # H2PO4
        18361,  # PPi4-
    ),
    "SAH": (16680,),
    "SAM": (15414,),
    "water": (
        15377,
        16234,  # HO-
    ),
    "Na+": (29101,),
    "Cl-": (29311,),
    "CoA": (1146900, 57287),
    "acetyl-CoA": (15351,),
    "FAD": (16238,),
    "FADH2": (17877,),
    "UDP": (17659,),
}

# Long-format lookup table: one row per (cofactor label, ChEBI id) pair.
COFACTOR_CHEBI_IDS = pd.DataFrame(
    [
        (cofactor_label, chebi_id)
        for cofactor_label, chebi_ids in _CHEBI_IDS_BY_COFACTOR.items()
        for chebi_id in chebi_ids
    ],
    columns=["cofactor", "chebi"],
)