napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
@@ -0,0 +1,464 @@
1
+ """Module containing functions to interoperate with rcpr's netcontextr functions"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+ from tempfile import NamedTemporaryFile
8
+ from typing import Any
9
+ from typing import Callable
10
+ from typing import Iterable
11
+
12
+ import pandas as pd
13
+ from napistu import sbml_dfs_core
14
+ from napistu import utils
15
+ from napistu.rpy2 import has_rpy2
16
+ from napistu.rpy2 import warn_if_no_rpy2
17
+
18
+ from napistu.rpy2.constants import COL_GENE
19
+ from napistu.rpy2.constants import COL_PROTEIN_1
20
+ from napistu.rpy2.constants import COL_PROTEIN_2
21
+ from napistu.rpy2.constants import FIELD_INTERACTIONS
22
+ from napistu.rpy2.constants import FIELD_GENES
23
+ from napistu.rpy2.constants import FIELD_REACTIONS
24
+ from napistu.rpy2.constants import COL_ROLE
25
+ from napistu.rpy2.constants import COL_REACTION_ID
26
+ from napistu.rpy2.constants import COL_STOICHIOMETRY
27
+ from napistu.rpy2.constants import NETCONTEXTR_ONTOLOGY
28
+ from napistu.rpy2.constants import NETCONTEXTR_SBO_MAP
29
+
30
+ if has_rpy2:
31
+ from napistu.rpy2.callr import pandas_to_r_dataframe
32
+ from rpy2.robjects import ListVector
33
+ import rpy2.robjects as robjs
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
@warn_if_no_rpy2
def _none2null(none_obj):
    """Return the R ``NULL`` object.

    Args:
        none_obj: Not inspected; the same R ``NULL`` is returned for any input.
            Presumably this is meant to be called with Python ``None`` --
            confirm against callers.

    Returns:
        The result of evaluating ``NULL`` in the embedded R session.
    """
    r_null = robjs.r("NULL")
    return r_null
41
+
42
+
43
@warn_if_no_rpy2
def sbml_dfs_to_rcpr_string_graph(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    reaction_data: str = "string",
    identifier_ontology: str = "ensembl_gene",
    rescale_data: Callable[[pd.DataFrame], pd.DataFrame] | None = lambda x: x / 1000,
) -> ListVector:
    """Build an rcpr string graph from an sbml_dfs.

    The result mimics the structure returned by `rcpr::createStringGraph`.

    Args:
        sbml_dfs (SBML_dfs): the sbml_dfs from string. Every reaction is
            expected to have exactly two reactands, and s_id / sc_id are
            expected to map 1:1.
        reaction_data (str, optional): Key of the `reactions_data` entry that
            holds the string scores. Defaults to 'string'.
        identifier_ontology (str, optional): Ontology used to pick the protein
            identifiers. Defaults to `ensembl_gene` (default in rcpr).
        rescale_data (Callable[pd.DataFrame], optional): Function applied to
            the score table before it is joined; pass None to skip rescaling.
            Defaults to lambda x: x/1000 (default in rcpr).

    Returns:
        A ListVector with two R dataframes:
        - `genes`: the gene column plus the extra columns `sc_id` and `s_id`
        - `interactions`: the two protein columns, the string scores and the
          extra column `r_id`

        The extra id columns allow rcpr results to be mapped back onto the
        sbml_dfs.

    Raises:
        AssertionError: if s_id, sc_id or the gene column are not unique, if
            the gene column contains missing values, or if the number of
            reaction participants is not twice the number of reactions.
    """
    # One gene identifier per species for the requested ontology
    gene_ids = sbml_dfs.species["s_Identifiers"].map(
        lambda ids: ids.hoist(identifier_ontology)
    )
    gene_frame = gene_ids.rename(COL_GENE).to_frame()

    # Attach the compartmentalized species ids (sc_id) through s_id
    cspecies_lookup = sbml_dfs.compartmentalized_species[["s_id"]].reset_index(
        drop=False
    )
    dat_gene = gene_frame.merge(
        cspecies_lookup, left_index=True, right_on="s_id"
    )[[COL_GENE, "sc_id", "s_id"]]

    # Validate the 1:1 assumptions stated in the docstring
    assert dat_gene["s_id"].is_unique
    assert dat_gene["sc_id"].is_unique
    assert dat_gene[COL_GENE].is_unique
    assert dat_gene[COL_GENE].hasnans is False

    # Long table: one row per (gene, reaction) participation
    participants = sbml_dfs.reaction_species[["r_id", "sc_id"]]
    dat_reactions = dat_gene[["sc_id", COL_GENE]].merge(participants, on="sc_id")[
        [COL_GENE, "r_id"]
    ]
    # Each reaction must contribute exactly two rows
    assert (
        dat_reactions.shape[0] == 2 * sbml_dfs.reactions.shape[0]
    ), "There should be exactly 2 reactants per reactions"

    # The first row of each reaction gets flag False and the second True, so
    # pivoting on the flag spreads the two participants into two columns.
    dat_reactions["flag"] = dat_reactions["r_id"].duplicated()
    dat_interactions = dat_reactions.pivot(
        index="r_id", columns="flag", values=COL_GENE
    )
    dat_interactions.columns = pd.Index([COL_PROTEIN_1, COL_PROTEIN_2], dtype=object)

    reaction_df = sbml_dfs.reactions_data[reaction_data]
    if rescale_data is not None:
        reaction_df = rescale_data(reaction_df)

    dat_interactions = dat_interactions.join(reaction_df).reset_index(drop=False)

    genes = pandas_to_r_dataframe(dat_gene)
    interactions = pandas_to_r_dataframe(dat_interactions)

    return ListVector({FIELD_GENES: genes, FIELD_INTERACTIONS: interactions})
128
+
129
+
130
@warn_if_no_rpy2
def load_and_clean_hpa_data(
    rcpr,
    uri_hpa: str,
):
    """Load and clean HPA data through rcpr.

    Args:
        rcpr (): The rpy2 rcpr object
        uri_hpa (str): The uri of the HPA data. A path that exists locally is
            used directly; anything else is first copied to a temporary file.

    Returns:
        rpy2 object: The cleaned HPA data
    """
    with NamedTemporaryFile() as tmp_file:
        # R cannot work with gcs uris, so a non-local uri is copied into the
        # temporary file, which lives until R has finished reading it.
        is_local = os.path.exists(uri_hpa)
        if not is_local:
            utils.copy_uri(uri_hpa, tmp_file.name)
        local_path = uri_hpa if is_local else tmp_file.name

        return rcpr.load_and_clean_hpa_data(local_path)
159
+
160
+
161
@warn_if_no_rpy2
def load_and_clean_gtex_data(rcpr_rpy2, uri_gtex: str, by_tissue_zfpkm: bool = False):
    """Load and clean GTEx data through rcpr.

    Args:
        rcpr_rpy2 (): The rpy2 rcpr object
        uri_gtex (str): The uri of the GTEx data. A path that exists locally
            is used directly; anything else is first copied to a temporary
            file.
        by_tissue_zfpkm (bool, optional): Whether to additionally pass the
            loaded data through rcpr's `gene_expression_by_tissue`
            (normalization by tissue using zfpkm). Defaults to False.

    Returns:
        rpy2 object: The cleaned GTEx data
    """
    with NamedTemporaryFile() as tmp_file:
        # R cannot work with gcs uris, so a non-local uri is copied into the
        # temporary file, which lives until R has finished with it.
        is_local = os.path.exists(uri_gtex)
        if not is_local:
            utils.copy_uri(uri_gtex, tmp_file.name)
        local_path = uri_gtex if is_local else tmp_file.name

        tissue_data = rcpr_rpy2.load_and_clean_gtex_data(local_path)
        if by_tissue_zfpkm:
            tissue_data = rcpr_rpy2.gene_expression_by_tissue(tissue_data)

        return tissue_data
190
+
191
+
192
def annotate_genes(
    rcpr, rcpr_graph: ListVector, data, field_name: str, **kwargs
) -> ListVector:
    """Annotate the genes of a graph with the given gene data.

    Thin pass-through to rcpr's `annotate_genes`; see the rcpr documentation
    for the exact format required for `data`.

    Args:
        rcpr (): The rpy2 rcpr object
        rcpr_graph (ListVector): The graph to annotate
        data: The gene data, in the format expected by rcpr
        field_name (str): The name of the column in the gene data to annotate with
        **kwargs: Forwarded unchanged to rcpr's `annotate_genes`

    Returns:
        ListVector: The annotated graph
    """
    annotate = rcpr.annotate_genes
    return annotate(rcpr_graph, data, field_name, **kwargs)
212
+
213
+
214
def trim_network_by_gene_attribute(
    rcpr,
    rcpr_graph: ListVector,
    field_name: str,
    field_value: Any = None,
    **kwargs,
) -> ListVector:
    """Trim the network by a gene attribute.

    See the R function `rcpr::trim_network_by_gene_attribute` for
    more details.

    Args:
        rcpr (): The rpy2 rcpr object
        rcpr_graph (ListVector): The graph to trim
        field_name (str): The name of the column in the gene data to trim by
        field_value (Any): One or more values to trim by. When None, R's
            `NaN` is passed instead.
        **kwargs: Forwarded unchanged to the R function

    Returns:
        ListVector: The trimmed graph
    """
    # None is swapped for R's NaN before the call
    value = robjs.r("NaN") if field_value is None else field_value
    return rcpr.trim_network_by_gene_attribute(
        rcpr_graph, field_name=field_name, field_value=value, **kwargs
    )
241
+
242
+
243
def apply_context_to_sbml_dfs(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    rcpr_graph: ListVector,
    inplace=True,
    remove_species=False,
) -> sbml_dfs_core.SBML_dfs:
    """Apply the context of a trimmed rcpr graph to the SBML dfs.

    Reactions of the sbml_dfs whose `r_id` no longer appears in the
    `interactions` table of the graph are removed. This is currently an
    in-place modification of the sbml_dfs object.

    Args:
        sbml_dfs (SBML_dfs): The SBML dfs to apply the context to
        rcpr_graph (ListVector): The graph to apply the context from
        inplace (bool, optional): Whether to modify the sbml_dfs in-place
            when applying the context. Defaults to True. "False" not yet
            implemented.
        remove_species (bool, optional): Whether to remove
            (compartmentalized) species that are no longer in the reactions.
            Defaults to False.

    Returns:
        SBML_dfs: The same sbml_dfs object, with the context applied

    Raises:
        NotImplementedError: if `inplace` is False
        ValueError: if the graph contains r_ids that are not in the sbml_dfs
    """
    if not inplace:
        raise NotImplementedError("Only inplace is currently supported")

    # r_ids that survived trimming vs. r_ids originally present
    r_ids_new = set(rcpr_graph.rx("interactions")[0].rx("r_id")[0])
    r_ids_old = set(sbml_dfs.reactions.index.tolist())

    # Trimming may only drop reactions; it must never introduce new ones
    unknown_ids = r_ids_new - r_ids_old
    if unknown_ids:
        # sorted + str keeps the message deterministic and avoids a TypeError
        # from str.join masking this error when ids are not plain strings
        listed = ", ".join(sorted(map(str, unknown_ids)))
        raise ValueError(
            f"New reactions present in rcpr, not present in sbml_dfs: {listed}"
        )

    r_ids_to_remove = r_ids_old - r_ids_new
    sbml_dfs.remove_reactions(r_ids_to_remove, remove_species=remove_species)

    return sbml_dfs
287
+
288
+
289
def sbml_dfs_to_rcpr_reactions(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    identifier_ontology: str = NETCONTEXTR_ONTOLOGY,
) -> ListVector:
    """Build an rcpr reaction graph from an sbml_dfs.

    The output follows the format validated by
    `rcpr::validate_netcontextr_reactions`.

    The conversion works as follows:
    - The `reactions` dataframe is built by
        - mapping `reaction_species` to `genes` through the `species`
          identifiers of the `identifier_ontology`. One species may be split
          into multiple `genes` and multiple species may be combined into a
          single `gene`.
        - converting `sbo_terms` to roles.
        - renaming `r_id` to `reaction_id`.
    - The `genes` dataframe holds all unique `genes` from the `reactions`.

    Args:
        sbml_dfs (SBML_dfs): an sbml_dfs.
        identifier_ontology (str, optional): The ontology to use for the
            identifiers. Defaults to `NETCONTEXTR_ONTOLOGY`.

    Returns:
        A ListVector of two R dataframes:
        - `genes`: a dataframe with the gene column only
        - `reactions`: the table produced by `_get_reactions`, i.e. the
          reaction data split up into individual gene participations.
    """
    reaction_table = _get_reactions(sbml_dfs, identifier_ontology)

    # No 1:1 mapping between genes and species exists: several species can
    # share a gene annotation and one species can carry several gene ids, so
    # the gene table is simply the set of distinct genes.
    gene_table = reaction_table[[COL_GENE]].drop_duplicates()

    return ListVector(
        {
            FIELD_GENES: pandas_to_r_dataframe(gene_table),
            FIELD_REACTIONS: pandas_to_r_dataframe(reaction_table),
        }
    )
332
+
333
+
334
def trim_reactions_by_gene_attribute(
    rcpr,
    rcpr_reactions: ListVector,
    field_name: str,
    field_value: Any = None,
    **kwargs,
) -> ListVector:
    """Trim rcpr reactions by a gene attribute.

    See the R function `rcpr::trim_reactions_by_gene_attribute` for
    more details.

    Args:
        rcpr (): The rpy2 rcpr object
        rcpr_reactions (ListVector): The reactions to trim
        field_name (str): The name of the column in the gene data to trim by
        field_value (Any): One or more values to trim by. When None, R's
            `NaN` is passed instead.
        **kwargs: Forwarded unchanged to the R function

    Returns:
        ListVector: The trimmed reactions
    """
    # None is swapped for R's NaN before the call
    value = robjs.r("NaN") if field_value is None else field_value
    return rcpr.trim_reactions_by_gene_attribute(
        rcpr_reactions, field_name=field_name, field_value=value, **kwargs
    )
361
+
362
+
363
def apply_reactions_context_to_sbml_dfs(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    rcpr_reactions: ListVector,
    considered_reactions: Iterable[str] | None = None,
    inplace=True,
    remove_species=False,
) -> sbml_dfs_core.SBML_dfs:
    """Apply the context of trimmed rcpr reactions to the SBML dfs.

    This is currently an in-place modification of the sbml_dfs object.

    Args:
        sbml_dfs (sbml_dfs_core.SBML_dfs): The SBML dfs to apply the context to
        rcpr_reactions (ListVector): The contextualized rcpr reactions
        considered_reactions (Iterable[str], optional): The reactions that were
            considered for contextualisation. If None, all reactions that are
            in the sbml_dfs are considered and filtered out if they are not part of
            the rcpr_reactions. If provided, only reactions considered and not part
            of the rcpr_reactions are removed. Defaults to None.
        inplace (bool, optional): Whether to apply the context inplace.
            Only True currently implemented.
        remove_species (bool, optional): Whether to remove
            (compartmentalized) species that are no longer in the reactions.
            Defaults to False.

    Returns:
        SBML_dfs: The same sbml_dfs object, with the context applied

    Raises:
        NotImplementedError: if `inplace` is False
        ValueError: if rcpr_reactions contains reaction ids that are not part
            of the considered reactions
    """
    if not inplace:
        raise NotImplementedError("Only inplace is currently supported")

    # r_ids that survived trimming
    r_ids_new = _get_rids_from_rcpr_reactions(rcpr_reactions)

    # r_ids that were eligible for removal
    if considered_reactions is None:
        r_ids_old = set(sbml_dfs.reactions.index.tolist())
    else:
        r_ids_old = set(considered_reactions)

    # Trimming may only drop reactions; it must never introduce new ones
    unknown_ids = r_ids_new - r_ids_old
    if unknown_ids:
        # sorted + str keeps the message deterministic and avoids a TypeError
        # from str.join masking this error when ids are not plain strings
        listed = ", ".join(sorted(map(str, unknown_ids)))
        raise ValueError(
            "New reactions present in rcpr, not present in the considered "
            f"reactions: {listed}"
        )

    r_ids_to_remove = r_ids_old - r_ids_new
    sbml_dfs.remove_reactions(r_ids_to_remove, remove_species=remove_species)

    return sbml_dfs
417
+
418
+
419
def _get_rids_from_rcpr_reactions(rcpr_reactions: ListVector) -> set[str]:
    """Collect the distinct reaction ids found in the rcpr reactions.

    Args:
        rcpr_reactions (ListVector): rcpr reactions holding a reactions table
            with a reaction id column.

    Returns:
        set[str]: The unique values of the reaction id column
    """
    reactions_table = rcpr_reactions.rx(FIELD_REACTIONS)[0]
    reaction_id_column = reactions_table.rx(COL_REACTION_ID)[0]
    return set(reaction_id_column)
422
+
423
+
424
def _get_reactions(
    sbml_dfs: sbml_dfs_core.SBML_dfs, identifier_ontology: str = NETCONTEXTR_ONTOLOGY
) -> pd.DataFrame:
    """Get the per-gene reaction participation table from the sbml_dfs.

    Species identifiers of the requested ontology are exploded to one row per
    gene, linked to reaction species through `sc_id`, and each `sbo_term` is
    translated to an rcpr role via `NETCONTEXTR_SBO_MAP`. Rows whose sbo term
    has no role mapping are dropped, with a warning.

    Args:
        sbml_dfs (SBML_dfs): an sbml_dfs.
        identifier_ontology (str, optional): The ontology to use for the gene
            identifiers. Defaults to `NETCONTEXTR_ONTOLOGY`.

    Returns:
        pd.DataFrame: one row per (gene, reaction species) with the role,
        gene, reaction id and stoichiometry columns plus `rsc_id`.
    """
    dat_reaction = (
        sbml_dfs.species["s_Identifiers"]
        # Get all identifiers for the given ontology
        .map(lambda ids: ids.hoist(identifier_ontology, squeeze=False))
        # Species without any identifier of this ontology are dropped
        .map(lambda x: x if len(x) > 0 else None)
        .dropna()
        .rename(COL_GENE)
        .to_frame()
        # One row per gene identifier
        .explode(COL_GENE)
        # Merge with compartmentalized species to get the sc_id
        .merge(
            sbml_dfs.compartmentalized_species[["s_id"]].reset_index(drop=False),
            left_index=True,
            right_on="s_id",
        )[[COL_GENE, "sc_id"]]
        # reset_index turns the reaction_species index into a regular column
        # (returned below as "rsc_id")
        .merge(
            sbml_dfs.reaction_species[
                ["r_id", "sc_id", "sbo_term", "stoichiometry"]
            ].reset_index(drop=False),
            on="sc_id",
        )
        .assign(**{COL_ROLE: lambda x: x["sbo_term"].map(NETCONTEXTR_SBO_MAP)})
        .rename({"r_id": COL_REACTION_ID, "stoichiometry": COL_STOICHIOMETRY}, axis=1)
    )

    # Rows whose sbo term could not be translated to a role
    unmapped = dat_reaction[COL_ROLE].isna()
    if unmapped.any():
        missing_sbo_terms = dat_reaction.loc[unmapped, "sbo_term"].unique()
        # str() guards the join against non-string sbo terms (e.g. NaN)
        logger.warning(
            f"Found {unmapped.sum()} reactions had an sbo term that was not "
            "mappable to a rcpr role. These are ignored. "
            f"The sbo terms are: {', '.join(map(str, missing_sbo_terms))}"
        )

    dat_reaction = dat_reaction.loc[~unmapped, :]
    return dat_reaction[
        [COL_ROLE, COL_GENE, COL_REACTION_ID, COL_STOICHIOMETRY, "rsc_id"]
    ]