napistu 0.2.5.dev6__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. napistu/__main__.py +126 -96
  2. napistu/constants.py +35 -41
  3. napistu/context/__init__.py +10 -0
  4. napistu/context/discretize.py +462 -0
  5. napistu/context/filtering.py +387 -0
  6. napistu/gcs/__init__.py +1 -1
  7. napistu/identifiers.py +74 -15
  8. napistu/indices.py +68 -0
  9. napistu/ingestion/__init__.py +1 -1
  10. napistu/ingestion/bigg.py +47 -62
  11. napistu/ingestion/constants.py +18 -133
  12. napistu/ingestion/gtex.py +113 -0
  13. napistu/ingestion/hpa.py +147 -0
  14. napistu/ingestion/sbml.py +0 -97
  15. napistu/ingestion/string.py +2 -2
  16. napistu/matching/__init__.py +10 -0
  17. napistu/matching/constants.py +18 -0
  18. napistu/matching/interactions.py +518 -0
  19. napistu/matching/mount.py +529 -0
  20. napistu/matching/species.py +510 -0
  21. napistu/mcp/__init__.py +7 -4
  22. napistu/mcp/__main__.py +128 -72
  23. napistu/mcp/client.py +16 -25
  24. napistu/mcp/codebase.py +201 -153
  25. napistu/mcp/component_base.py +170 -0
  26. napistu/mcp/config.py +223 -0
  27. napistu/mcp/constants.py +45 -2
  28. napistu/mcp/documentation.py +253 -136
  29. napistu/mcp/documentation_utils.py +13 -48
  30. napistu/mcp/execution.py +372 -305
  31. napistu/mcp/health.py +49 -67
  32. napistu/mcp/profiles.py +10 -6
  33. napistu/mcp/server.py +161 -80
  34. napistu/mcp/tutorials.py +139 -87
  35. napistu/modify/__init__.py +1 -1
  36. napistu/modify/gaps.py +1 -1
  37. napistu/network/__init__.py +1 -1
  38. napistu/network/constants.py +101 -34
  39. napistu/network/data_handling.py +388 -0
  40. napistu/network/ig_utils.py +351 -0
  41. napistu/network/napistu_graph_core.py +354 -0
  42. napistu/network/neighborhoods.py +40 -40
  43. napistu/network/net_create.py +373 -309
  44. napistu/network/net_propagation.py +47 -19
  45. napistu/network/{net_utils.py → ng_utils.py} +124 -272
  46. napistu/network/paths.py +67 -51
  47. napistu/network/precompute.py +11 -11
  48. napistu/ontologies/__init__.py +10 -0
  49. napistu/ontologies/constants.py +129 -0
  50. napistu/ontologies/dogma.py +243 -0
  51. napistu/ontologies/genodexito.py +649 -0
  52. napistu/ontologies/mygene.py +369 -0
  53. napistu/ontologies/renaming.py +198 -0
  54. napistu/rpy2/__init__.py +229 -86
  55. napistu/rpy2/callr.py +47 -77
  56. napistu/rpy2/constants.py +24 -23
  57. napistu/rpy2/rids.py +61 -648
  58. napistu/sbml_dfs_core.py +587 -222
  59. napistu/scverse/__init__.py +15 -0
  60. napistu/scverse/constants.py +28 -0
  61. napistu/scverse/loading.py +727 -0
  62. napistu/utils.py +118 -10
  63. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
  64. napistu-0.3.1.dist-info/RECORD +133 -0
  65. tests/conftest.py +22 -0
  66. tests/test_context_discretize.py +56 -0
  67. tests/test_context_filtering.py +267 -0
  68. tests/test_identifiers.py +100 -0
  69. tests/test_indices.py +65 -0
  70. tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
  71. tests/test_matching_interactions.py +108 -0
  72. tests/test_matching_mount.py +305 -0
  73. tests/test_matching_species.py +394 -0
  74. tests/test_mcp_config.py +193 -0
  75. tests/test_mcp_documentation_utils.py +12 -3
  76. tests/test_mcp_server.py +356 -0
  77. tests/test_network_data_handling.py +397 -0
  78. tests/test_network_ig_utils.py +23 -0
  79. tests/test_network_neighborhoods.py +19 -0
  80. tests/test_network_net_create.py +459 -0
  81. tests/test_network_ng_utils.py +30 -0
  82. tests/test_network_paths.py +56 -0
  83. tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
  84. tests/test_ontologies_genodexito.py +58 -0
  85. tests/test_ontologies_mygene.py +39 -0
  86. tests/test_ontologies_renaming.py +110 -0
  87. tests/test_rpy2_callr.py +79 -0
  88. tests/test_rpy2_init.py +151 -0
  89. tests/test_sbml.py +0 -31
  90. tests/test_sbml_dfs_core.py +134 -10
  91. tests/test_scverse_loading.py +778 -0
  92. tests/test_set_coverage.py +2 -2
  93. tests/test_utils.py +121 -1
  94. napistu/mechanism_matching.py +0 -1353
  95. napistu/rpy2/netcontextr.py +0 -467
  96. napistu-0.2.5.dev6.dist-info/RECORD +0 -97
  97. tests/test_igraph.py +0 -367
  98. tests/test_mechanism_matching.py +0 -784
  99. tests/test_net_utils.py +0 -149
  100. tests/test_netcontextr.py +0 -105
  101. tests/test_rpy2.py +0 -61
  102. /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
  103. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
  104. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
  105. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
  106. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
  107. /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
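
The largest single-file rewrite in this release is napistu/rpy2/rids.py, shown below: the module shrinks from roughly 700 to 110 lines, the @warn_if_no_rpy2 decorator gives way to @require_rpy2, and six near-identical per-ontology mapping blocks collapse into one helper driven by a BIOC_ONTOLOGY_MAPPING lookup table. As a minimal sketch of that table-driven dispatch pattern (illustrative only; the registry contents and the fetch_table callable are hypothetical stand-ins, not napistu code):

    import pandas as pd

    # Hypothetical registry mapping each ontology to its (table, column) pair,
    # standing in for the BIOC_ONTOLOGY_MAPPING constant used in the diff below.
    ONTOLOGY_TABLES: dict[str, tuple[str, str]] = {
        "ensembl_gene": ("ENSG_table", "ensembl_id"),
        "uniprot": ("UNIPROT_table", "uniprot_id"),
    }

    def fetch_mapping(ontology: str, fetch_table) -> pd.DataFrame:
        """Fetch one ontology <-> entrez mapping via registry dispatch."""
        if ontology not in ONTOLOGY_TABLES:
            raise ValueError(f"Unsupported ontology: {ontology}")
        table_name, column_name = ONTOLOGY_TABLES[ontology]
        # fetch_table is any callable returning the named table as a DataFrame,
        # e.g. a thin wrapper around an R/Bioconductor call
        df = fetch_table(table_name)
        return df.rename(columns={column_name: ontology}).set_index("entrez_gene_id")

One registry entry per ontology replaces one if-block per ontology, so adding support for a new ontology becomes a data change rather than a code change.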
napistu/rpy2/rids.py CHANGED
@@ -3,697 +3,110 @@ from __future__ import annotations
 import logging
 
 import pandas as pd
-from napistu import constants
-from napistu import identifiers
-from napistu import sbml_dfs_core
-from napistu import source
-from napistu import utils
-from napistu.rpy2 import callr
-from napistu.rpy2 import report_r_exceptions
-from napistu.rpy2 import warn_if_no_rpy2
+from napistu.rpy2 import (
+    require_rpy2,
+    report_r_exceptions,
+)
+
+from napistu.rpy2.callr import bioconductor_org_r_function, r_dataframe_to_pandas
 
-from napistu.constants import SBML_DFS
-from napistu.constants import BQB
-from napistu.constants import IDENTIFIERS
 from napistu.constants import ONTOLOGIES
-from napistu.constants import ONTOLOGY_ALIASES
 from napistu.rpy2.constants import BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES
-from napistu.rpy2.constants import BIOC_DOGMATIC_MAPPING_ONTOLOGIES
-from napistu.rpy2.constants import BIOC_PROTEIN_ONTOLOGIES
-from napistu.rpy2.constants import BIOC_NAME_ONTOLOGIES
-from napistu.rpy2.constants import BIOC_GENE_ONTOLOGIES  # noqa
+from napistu.rpy2.constants import BIOC_ONTOLOGY_MAPPING
 from napistu.rpy2.constants import BIOC_NOMENCLATURE
 
 logger = logging.getLogger(__name__)
 
 
-@warn_if_no_rpy2
-@report_r_exceptions
-def expand_identifiers(
-    sbml_dfs: sbml_dfs_core.SBML_dfs,
-    id_type: str,
-    species: str,
-    expanded_ontologies: list[str],
-    r_paths: str | None = None,
-) -> pd.Series:
-    """
-    Expand Identifiers
-
-    Update a table's identifiers to include additional related ontologies
-
-    Ontologies are pulled from the bioconductor "org" packages. This is effective, but inelegant.
-
-    Parameters
-    ----------
-    sbml_dfs : SBML_dfs
-        A relational pathway model built around reactions interconverting compartmentalized species.
-    id_type: str
-        Identifiers to expand: species, compartments, or reactions
-    species: str
-        Species name
-    expanded_ontologies: list
-        Ontologies to add or complete
-    r_paths: str
-        Path to an R packages directory
-
-    Returns
-    -------
-    a pd.Series with identifiers as the index and updated Identifiers objects as values
-    """
-
-    if not isinstance(sbml_dfs, sbml_dfs_core.SBML_dfs):
-        raise TypeError("sbml_dfs is not an sbml_dfs_core.SBML_dfs object")
-
-    # pull out all identifiers as a pd.DataFrame
-    all_entity_identifiers = sbml_dfs.get_identifiers(id_type)
-    if not isinstance(all_entity_identifiers, pd.DataFrame):
-        raise TypeError("all_entity_identifiers must be a pandas DataFrame")
-
-    if id_type == "species":
-        all_entity_identifiers = _check_species_identifiers_entrez_gene_ontology(
-            all_entity_identifiers
-        )
-
-        valid_expanded_ontologies = BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES
-    elif id_type in ["reactions", "compartments"]:
-        raise NotImplementedError(
-            f"No converters implemented to expand {id_type} annotations"
-        )
-    else:
-        raise ValueError(f"{id_type} is an invalid id_type")
-
-    invalid_expanded_ontologies = set(expanded_ontologies).difference(
-        valid_expanded_ontologies
-    )
-
-    if len(invalid_expanded_ontologies) != 0:
-        raise NotImplementedError(
-            f"No converters implemented to expand {id_type} annotations to {', '.join(invalid_expanded_ontologies)}"
-        )
-
-    # find entries in valid_expanded_ontologies which are already present
-    # these are the entries that will be used to expand to other ontologies
-    # or fill in ontologies with incomplete annotations
-    starting_ontologies = valid_expanded_ontologies.intersection(
-        set(all_entity_identifiers["ontology"])
-    )
-
-    if len(starting_ontologies) == 0:
-        raise ValueError(f"No ontologies with {id_type} converters are present")
-
-    required_conversion_ontologies = set(starting_ontologies).union(
-        set(expanded_ontologies)
-    )
-
-    # pull down entrez ids + mapping to other ontologies
-    mapping_ontologies = required_conversion_ontologies.intersection(
-        BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES
-    )
-
-    mappings_dict = create_bioconductor_mapping_tables(
-        mappings=mapping_ontologies, species=species, r_paths=r_paths
-    )
-
-    # start with entrez IDs (since all other ontologies are mapped to them in the
-    # bioconductor "org" packages)
-
-    # get these values by just looking up the mappings between entrez genes and genomic loci
-    running_ids = merge_bioconductor_mappings(mappings_dict, mapping_ontologies)
-
-    # map from existing ontologies to expanded ontologies
-    ontology_mappings = list()
-    # starting w/
-    for start in starting_ontologies:
-        # ending w/
-        for end in expanded_ontologies:
-            if start == end:
-                continue
-            lookup = (
-                running_ids[[start, end]]
-                .rename(columns={start: IDENTIFIERS.IDENTIFIER, end: "new_identifier"})
-                .assign(ontology=start)
-                .assign(new_ontology=end)
-            )
-            ontology_mappings.append(lookup)
-
-    ontology_mappings_df = pd.concat(ontology_mappings).dropna()
-
-    # old identifiers joined with new identifiers
-
-    # first, define the names of keys and ids
-    table_pk_var = sbml_dfs.schema[id_type]["pk"]
-    table_id_var = sbml_dfs.schema[id_type]["id"]
-
-    # retain bqb terms to define how an identifier is related to sid
-    # this relation will be preserved for the new ids
-
-    merged_identifiers = all_entity_identifiers[
-        [
-            table_pk_var,
-            IDENTIFIERS.ONTOLOGY,
-            IDENTIFIERS.IDENTIFIER,
-            IDENTIFIERS.BQB,
-        ]
-    ].merge(ontology_mappings_df)
-
-    # new, possibly redundant identifiers
-    new_identifiers = merged_identifiers[
-        [table_pk_var, "new_ontology", "new_identifier", IDENTIFIERS.BQB]
-    ].rename(
-        columns={
-            "new_ontology": IDENTIFIERS.ONTOLOGY,
-            "new_identifier": IDENTIFIERS.IDENTIFIER,
-        }
-    )
-
-    expanded_identifiers_df = (
-        pd.concat(
-            [
-                all_entity_identifiers[
-                    [
-                        table_pk_var,
-                        IDENTIFIERS.ONTOLOGY,
-                        IDENTIFIERS.IDENTIFIER,
-                        IDENTIFIERS.URL,
-                        IDENTIFIERS.BQB,
-                    ]
-                ],
-                new_identifiers,
-                # ignore new identifier if it already exists
-            ]
-        )
-        # remove duplicated identifiers
-        .groupby([table_pk_var, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER])
-        .first()
-        .reset_index()
-        .set_index(table_pk_var)
-    )
-
-    # create a dictionary of new Identifiers objects
-    expanded_identifiers_dict = {
-        i: _expand_identifiers_new_entries(i, expanded_identifiers_df)
-        for i in expanded_identifiers_df.index.unique()
-    }
-
-    output = pd.Series(expanded_identifiers_dict).rename(table_id_var)
-    output.index.name = table_pk_var
-
-    return output
-
-
-@warn_if_no_rpy2
+@require_rpy2
 @report_r_exceptions
 def create_bioconductor_mapping_tables(
     mappings: set[str], species: str, r_paths: str | None = None
 ) -> dict[str, pd.DataFrame]:
-    """
-    Create Bioconductor Mapping Tables
+    """Create Bioconductor Mapping Tables.
 
     Creating a dictionary of mappings between entrez and other ontologies.
 
-    Args:
-        mappings (set):
-            A set of ontologies to work with. The valid ontologies are:
-            "ensembl_gene", "ensembl_transcript", and "uniprot".
-        species (str):
-            The organismal species that we are working with (e.g., Homo sapiens).
-        r_paths (str, optional):
-            Optional path to a library of R packages.
+    Parameters
+    ----------
+    mappings : set[str]
+        A set of ontologies to work with. The valid ontologies are:
+        "ensembl_gene", "ensembl_transcript", and "uniprot".
+    species : str
+        The organismal species that we are working with (e.g., Homo sapiens).
+    r_paths : str | None, optional
+        Optional path to a library of R packages.
 
-    Returns:
-        mappings_dict (dict):
-            A table of entrez ids, and tables mapping from each ontology in "mappings" to entrez.
+    Returns
+    -------
+    dict[str, pd.DataFrame]
+        A table of entrez ids, and tables mapping from each ontology in "mappings" to entrez.
 
+    Raises
+    ------
+    ValueError
+        If any of the requested mappings are not supported
     """
-
-    if not isinstance(mappings, set):
-        raise TypeError(f"mappings must be a set, but got {type(mappings).__name__}")
-    if not isinstance(species, str):
-        raise TypeError(f"species must be a str, but got {type(species).__name__}")
-
     logger.info(
-        f"Creating mapping tables from entrez genes to/from {', '.join(mappings)}"
+        "Creating mapping tables from entrez genes to/from %s", ", ".join(mappings)
     )
 
     invalid_mappings = set(mappings).difference(BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES)
-
     if len(invalid_mappings) > 0:
         raise ValueError(
             f"{len(invalid_mappings)} mappings could not be created: {', '.join(invalid_mappings)}.\n"
             f"The valid mappings are {', '.join(BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES)}"
         )
 
-    mappings_dict = dict()
+    mappings_dict = {}
 
-    # all mappings are with respect to entrez. so we will always want to obtain entrez ids
-    mappings_dict[ONTOLOGIES.NCBI_ENTREZ_GENE] = (
-        callr.r_dataframe_to_pandas(
-            callr.bioconductor_org_r_function(
-                BIOC_NOMENCLATURE.CHR_TBL, species, r_paths=None
-            )
-        )
-        .drop(BIOC_NOMENCLATURE.CHROMOSOME, axis=1)
-        .rename(
-            columns={BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE}
-        )
-        .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
-    )
-
-    if ONTOLOGIES.ENSEMBL_GENE in mappings:
-        # "entrez <> ensembl genes"
-        mappings_dict[ONTOLOGIES.ENSEMBL_GENE] = (
-            callr.r_dataframe_to_pandas(
-                callr.bioconductor_org_r_function(
-                    BIOC_NOMENCLATURE.ENSG_TBL, species, r_paths=r_paths
-                )
-            )
-            .rename(
-                columns={
-                    BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
-                    BIOC_NOMENCLATURE.ENSEMBL_GENE: ONTOLOGIES.ENSEMBL_GENE,
-                }
-            )
-            .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
-        )
-
-    if ONTOLOGIES.ENSEMBL_TRANSCRIPT in mappings:
-        # "entrez <> ensembl transcripts"
-        mappings_dict[ONTOLOGIES.ENSEMBL_TRANSCRIPT] = (
-            callr.r_dataframe_to_pandas(
-                callr.bioconductor_org_r_function(
-                    BIOC_NOMENCLATURE.ENST_TBL, species, r_paths=r_paths
-                )
-            )
-            .rename(
-                columns={
-                    BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
-                    BIOC_NOMENCLATURE.ENSEMBL_TRANSCRIPT: ONTOLOGIES.ENSEMBL_TRANSCRIPT,
-                }
-            )
-            .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
-        )
-
-    if ONTOLOGIES.ENSEMBL_PROTEIN in mappings:
-        # "entrez <> ensembl proteins"
-        mappings_dict[ONTOLOGIES.ENSEMBL_PROTEIN] = (
-            callr.r_dataframe_to_pandas(
-                callr.bioconductor_org_r_function(
-                    BIOC_NOMENCLATURE.ENSP_TBL, species, r_paths=r_paths
-                )
-            )
-            .rename(
-                columns={
-                    BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
-                    BIOC_NOMENCLATURE.ENSEMBL_PROTEIN: ONTOLOGIES.ENSEMBL_PROTEIN,
-                }
-            )
-            .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
-        )
-
-    if ONTOLOGIES.UNIPROT in mappings:
-        # "entrez <> uniprot"
-        mappings_dict[ONTOLOGIES.UNIPROT] = (
-            callr.r_dataframe_to_pandas(
-                callr.bioconductor_org_r_function(
-                    BIOC_NOMENCLATURE.UNIPROT_TBL, species, r_paths=r_paths
-                )
-            )
-            .rename(
-                columns={
-                    BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
-                    BIOC_NOMENCLATURE.UNIPROT: ONTOLOGIES.UNIPROT,
-                }
-            )
-            .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
-        )
-
-    if ONTOLOGIES.GENE_NAME in mappings:
-        # "entrez <> gene name"
-        mappings_dict[ONTOLOGIES.GENE_NAME] = (
-            callr.r_dataframe_to_pandas(
-                callr.bioconductor_org_r_function(
-                    BIOC_NOMENCLATURE.NAME_TBL, species, r_paths=r_paths
-                )
-            )
-            .rename(
-                columns={
-                    BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
-                    BIOC_NOMENCLATURE.GENE_NAME: ONTOLOGIES.GENE_NAME,
-                }
-            )
-            .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
-        )
-
-    if ONTOLOGIES.SYMBOL in mappings:
-        # "entrez <> gene symbol"
-        mappings_dict[ONTOLOGIES.SYMBOL] = (
-            callr.r_dataframe_to_pandas(
-                callr.bioconductor_org_r_function(
-                    BIOC_NOMENCLATURE.SYMBOL_TBL, species, r_paths=r_paths
-                )
-            )
-            .rename(
-                columns={
-                    BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
-                    BIOC_NOMENCLATURE.SYMBOL: ONTOLOGIES.SYMBOL,
-                }
-            )
-            .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
-        )
+    # Create mapping tables for each requested ontology
+    for ontology in mappings:
+        mappings_dict[ontology] = _create_single_mapping(ontology, species, r_paths)
 
     return mappings_dict
 
 
-def merge_bioconductor_mappings(
-    mappings_dict: dict, mapping_ontologies: set[str]
+def _create_single_mapping(
+    ontology: str, species: str, r_paths: str | None = None
 ) -> pd.DataFrame:
-    """Combine multiple ontologies by recursively joining on Entrez Gene"""
-
-    running_ids = mappings_dict[ONTOLOGIES.NCBI_ENTREZ_GENE]
-
-    for mapping in mapping_ontologies:
-        logger.debug(f"adding entries for {mapping} to running_ids")
-        mapping_df = mappings_dict[mapping]
-
-        running_ids = running_ids.join(mapping_df)
+    """Create a single mapping table for a given ontology.
 
-    running_ids = running_ids.reset_index()
-
-    return running_ids
-
-
-def stack_bioconductor_mappings(
-    mappings_dict: dict[str, pd.DataFrame], mapping_ontologies: set[str]
-) -> pd.DataFrame:
-    """
-    Stack Bioconductor Mappings
-
-    Convert a dict of mappings between entrez identifiers and other identifiers to a single table.
-
-    Args:
-        mappings_dict (dict):
-            A dictionary containing mappings between entrez and other ontologies.
-        mapping_ontologies (set):
-            A set of mappings to combine.
-
-    Returns:
-        mappings_df (pd.DataFrame):
-            A table containing entrez_gene_id, ontology, and identifier.
-    """
-
-    mappings_list = list()
-    for ont in mapping_ontologies:
-        one_mapping_df = (
-            mappings_dict[ont].assign(ontology=ont).rename({ont: "identifier"}, axis=1)
-        )
-
-        mappings_list.append(one_mapping_df)
-
-    return pd.concat(mappings_list)
-
-
-def _check_species_identifiers_entrez_gene_ontology(
-    entity_identifiers_df: pd.DataFrame,
-) -> pd.DataFrame:
-    """
-    Check whether species ontologies contain ncbigene or ncbi_gene
-    If so, replaced them to ncbi_entrez_gene.
-    Return: entity_identifiers_df with proper gene ontology types.
-    """
-
-    intersect_gene_onto = set(entity_identifiers_df["ontology"]).intersection(
-        ONTOLOGY_ALIASES.NCBI_ENTREZ_GENE
-    )
-
-    # if entity_identifiers_df contains members of ENTREZ_ONTOLOGY_ALIASES,
-    # replace to ncbi_entrez_gene
-    if intersect_gene_onto:
-        logger.info(
-            f" Replace unmatching ontology {', '.join(intersect_gene_onto)} to {ONTOLOGIES.NCBI_ENTREZ_GENE}."
-        )
-
-        filtered_onto_df = entity_identifiers_df[
-            entity_identifiers_df["ontology"].isin(list(intersect_gene_onto))
-        ]
-
-        entity_identifiers_df.loc[filtered_onto_df.index, "ontology"] = (
-            ONTOLOGIES.NCBI_ENTREZ_GENE
-        )
-
-    return entity_identifiers_df
-
-
-def update_expanded_identifiers(
-    model: sbml_dfs_core.SBML_dfs, id_type: str, expanded_ids: pd.Series
-) -> sbml_dfs_core.SBML_dfs:
-    """Update the expanded identifiers for a model.
-
-    Args:
-        model (sbml_dfs_core.SBML_dfs): _description_
-        id_type (str): _description_
-        expanded_ids (str): _description_
-    """
-    ids = getattr(model, id_type)
-
-    # make sure expanded_ids and original model.species have same number of s_ids
-    # if a s_id only in model.species, adding it to expanded_ids.
-    if ids.shape[0] != expanded_ids.shape[0]:
-        matched_expanded_ids = expanded_ids.combine_first(ids[SBML_DFS.S_IDENTIFIERS])
-        logger.debug(
-            f"{ids.shape[0] - expanded_ids.shape[0]} "
-            "ids are not included in expanded ids"
-        )
-    else:
-        matched_expanded_ids = expanded_ids
-
-    updated_ids = ids.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
-        pd.DataFrame(matched_expanded_ids)
-    )
-
-    setattr(model, id_type, updated_ids)
-
-    return model
-
-
-def create_dogmatic_sbml_dfs(
-    species: str, r_paths: str | None = None
-) -> sbml_dfs_core.SBML_dfs:
-    """
-    Create Dogmatic SMBL_DFs
-
-    Create an SBML_dfs model which is pretty much just proteins and no
-    reactions, as well as annotations linking proteins to genes, and
-    creating nice labels for genes/proteins.
-
-    Args:
-        species (str):
-            An organismal species (e.g., Homo sapiens)
-        r_paths (str or None)
-            Optional, p]ath to an R packages directory
+    Parameters
+    ----------
+    ontology : str
+        The ontology to map (e.g. ENSEMBL_GENE, UNIPROT)
+    species : str
+        The organismal species to map
+    r_paths : str | None, optional
+        Optional path to R packages directory
 
-    Returns:
-        dogmatic_sbml_dfs (sbml.SBML_dfs)
-            A pathway model which (pretty much) just contains proteins and
-            diverse identifiers
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing the mapping between entrez and the target ontology
    """
 
-    dogmatic_mappings = connect_dogmatic_mappings(species)
+    if ontology not in BIOC_ONTOLOGY_MAPPING:
+        raise ValueError(f"Unsupported ontology: {ontology}")
 
-    logger.info("Creating inputs for sbml_dfs_from_edgelist()")
+    table_name, column_name = BIOC_ONTOLOGY_MAPPING[ontology]
 
-    # format entries for sbml_dfs_from_edgelist()
-    species_df = dogmatic_mappings["cluster_consensus_identifiers_df"].join(
-        dogmatic_mappings["s_name_series"]
+    df = r_dataframe_to_pandas(
+        bioconductor_org_r_function(table_name, species, r_paths=r_paths)
     )
 
-    # stub required but invariant variables
-    compartments_df = sbml_dfs_core._stub_compartments()
-    interaction_source = source.Source(init=True)
+    # Drop chromosome column if this is the chromosome table
+    # this was only introduced so we had a table with 1 row per unique entrez id
+    if table_name == BIOC_NOMENCLATURE.CHR_TBL:
+        df = df.drop(BIOC_NOMENCLATURE.CHROMOSOME, axis=1)
 
-    # interactions table. This is required to create the sbml_dfs but we'll drop the info later
-    interaction_edgelist = species_df.rename(
+    # Rename columns and set index
+    df = df.rename(
         columns={
-            "s_name": "upstream_name",
-            SBML_DFS.S_IDENTIFIERS: SBML_DFS.R_IDENTIFIERS,
+            BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
+            column_name: ontology,
         }
-    )
-    interaction_edgelist["downstream_name"] = interaction_edgelist["upstream_name"]
-    interaction_edgelist["upstream_compartment"] = "cellular_component"
-    interaction_edgelist["downstream_compartment"] = "cellular_component"
-    interaction_edgelist["r_name"] = interaction_edgelist["upstream_name"]
-    interaction_edgelist["sbo_term"] = constants.MINI_SBO_FROM_NAME["reactant"]
-    interaction_edgelist["r_isreversible"] = False
-
-    dogmatic_sbml_dfs = sbml_dfs_core.sbml_dfs_from_edgelist(
-        interaction_edgelist=interaction_edgelist,
-        species_df=species_df,
-        compartments_df=compartments_df,
-        interaction_source=interaction_source,
-        upstream_stoichiometry=-1,
-        downstream_stoichiometry=1,
-        downstream_sbo_name="product",
-    )
-
-    # remove all reactions except 1 (so it still passes sbml_dfs.validate())
-    # this self reaction will be removed when creating the graph
-    dogmatic_sbml_dfs.remove_reactions(dogmatic_sbml_dfs.reactions.index.tolist()[1::])
-
-    return dogmatic_sbml_dfs
-
-
-def connect_dogmatic_mappings(species: str, r_paths: str | None = None) -> dict:
-    """
-    Connect Dogmatic Mappings
-
-    Merge all ontologies into greedy clusters based on shared associations to entrez ids
-
-    Args:
-        species (str):
-            An organismal species (e.g., Homo sapiens)
-        r_paths (str or None)
-            Optional, p]ath to an R packages directory
-
-    Returns:
-        dict with:
-        - s_name_series: a series where the index is distinct molecular species and the values are names.
-        - cluster_consensus_identifiers_df: a pd.DataFrame where the index is distinct molecular species
-        and values are identifiers objects.
-    """
-
-    mappings_dict = create_bioconductor_mapping_tables(
-        mappings=BIOC_DOGMATIC_MAPPING_ONTOLOGIES,
-        species=species,
-        r_paths=r_paths,
-    )
-
-    protein_mappings = stack_bioconductor_mappings(
-        mappings_dict, set(BIOC_PROTEIN_ONTOLOGIES)
-    )
-
-    # apply greedy graph-based clustering to connect proteins with a common mapping to entrez
-    edgelist_df = utils.format_identifiers_as_edgelist(
-        protein_mappings, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
-    )
-    connected_indices = utils.find_weakly_connected_subgraphs(
-        edgelist_df[["ind", "id"]]
-    )
-
-    # add clusters to proteins. Each cluster will be a distinct molecular species
-    protein_mappings_w_clusters = protein_mappings.reset_index().merge(
-        connected_indices
-    )
-
-    # combine entrez + cluster so we can pass cluster to non-protein attributes
-    entrez_clusters = protein_mappings_w_clusters[
-        [ONTOLOGIES.NCBI_ENTREZ_GENE, "cluster"]
-    ].drop_duplicates()
-    other_ontologies = BIOC_DOGMATIC_MAPPING_ONTOLOGIES.difference(
-        set(BIOC_PROTEIN_ONTOLOGIES)
-    )
-    other_mappings = stack_bioconductor_mappings(mappings_dict, other_ontologies)
-    other_mappings_w_clusters = entrez_clusters.merge(
-        other_mappings, left_on=ONTOLOGIES.NCBI_ENTREZ_GENE, right_index=True
-    )
-
-    possible_names = pd.concat(
-        [
-            protein_mappings_w_clusters.query(
-                "ontology in @BIOC_NAME_ONTOLOGIES.keys()"
-            ),
-            other_mappings_w_clusters.query("ontology in @BIOC_NAME_ONTOLOGIES.keys()"),
-        ]
-    )[["cluster", IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]]
-
-    possible_names.loc[:, "ontology_preference"] = possible_names[
-        IDENTIFIERS.ONTOLOGY
-    ].map(BIOC_NAME_ONTOLOGIES)
-
-    # remove possible names which are present in multiple clusters.
-    # all clusters will need unique names to use sbml_dfs_from_edgelist()
-    id_counts = (
-        possible_names[["cluster", IDENTIFIERS.IDENTIFIER]]
-        .drop_duplicates()
-        .value_counts(IDENTIFIERS.IDENTIFIER)
-    )
-    possible_names = possible_names[
-        ~possible_names[IDENTIFIERS.IDENTIFIER].isin(
-            id_counts[id_counts > 1].index.tolist()
-        )
-    ]
-
-    s_name_series = (
-        utils._add_nameness_score(possible_names, IDENTIFIERS.IDENTIFIER)
-        .sort_values(["ontology_preference", "nameness_score"])
-        .groupby("cluster")
-        .first()
-        .rename(columns={IDENTIFIERS.IDENTIFIER: SBML_DFS.S_NAME})[SBML_DFS.S_NAME]
-    )
-
-    protein_ids = protein_mappings_w_clusters.assign(bqb=BQB.IS)[
-        ["cluster", IDENTIFIERS.IDENTIFIER, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.BQB]
-    ]
-    gene_ids = other_mappings_w_clusters.query(
-        "ontology in @BIOC_GENE_ONTOLOGIES"
-    ).assign(bqb=BQB.IS_ENCODED_BY)[
-        ["cluster", IDENTIFIERS.IDENTIFIER, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.BQB]
-    ]
-    entrez_ids = entrez_clusters.assign(
-        ontology=ONTOLOGIES.NCBI_ENTREZ_GENE, bqb=BQB.IS_ENCODED_BY
-    ).rename(columns={ONTOLOGIES.NCBI_ENTREZ_GENE: IDENTIFIERS.IDENTIFIER})[
-        ["cluster", IDENTIFIERS.IDENTIFIER, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.BQB]
-    ]
-
-    # combine all ids to setup a single cluster-level Identifiers
-    all_ids = pd.concat([protein_ids, gene_ids, entrez_ids])
-    all_ids.loc[:, IDENTIFIERS.URL] = [
-        identifiers.create_uri_url(x, y)
-        for x, y in zip(all_ids[IDENTIFIERS.ONTOLOGY], all_ids[IDENTIFIERS.IDENTIFIER])
-    ]
-
-    # create one Identifiers object for each new species
-    cluster_consensus_identifiers = {
-        k: identifiers.Identifiers(
-            list(
-                v[
-                    [
-                        IDENTIFIERS.ONTOLOGY,
-                        IDENTIFIERS.IDENTIFIER,
-                        IDENTIFIERS.URL,
-                        IDENTIFIERS.BQB,
-                    ]
-                ]
-                .reset_index(drop=True)
-                .T.to_dict()
-                .values()
-            )
-        )
-        for k, v in all_ids.groupby("cluster")
-    }
-
-    cluster_consensus_identifiers_df = pd.DataFrame(
-        cluster_consensus_identifiers, index=[SBML_DFS.S_IDENTIFIERS]
-    ).T
-    cluster_consensus_identifiers_df.index.name = "cluster"
-
-    out_dict = {
-        "s_name_series": s_name_series,
-        "cluster_consensus_identifiers_df": cluster_consensus_identifiers_df,
-    }
-
-    return out_dict
-
-
-@warn_if_no_rpy2
-def _expand_identifiers_new_entries(
-    sysid: str, expanded_identifiers_df: pd.DataFrame
-) -> identifiers.Identifiers:
-    """Expand Identifiers to include Bioconductor annotations"""
-    entry = expanded_identifiers_df.loc[sysid]
-
-    if type(entry) is pd.Series:
-        sysis_id_list = [entry.to_dict()]
-    else:
-        # multiple annotations
-        sysis_id_list = list(entry.reset_index(drop=True).T.to_dict().values())
+    ).set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
 
-    return identifiers.Identifiers(sysis_id_list)
+    return df
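
For orientation, here is a hedged usage sketch of the refactored entry point as it reads in the diff above. It assumes a working rpy2 setup plus the appropriate Bioconductor "org" annotation package for the species, and it has not been run against the released wheel.

    from napistu.constants import ONTOLOGIES
    from napistu.rpy2.rids import create_bioconductor_mapping_tables

    # Each requested ontology yields a DataFrame indexed by ncbi_entrez_gene;
    # an unsupported ontology raises ValueError per the new docstring.
    mapping_tables = create_bioconductor_mapping_tables(
        mappings={ONTOLOGIES.ENSEMBL_GENE, ONTOLOGIES.UNIPROT},
        species="Homo sapiens",
    )
    uniprot_by_entrez = mapping_tables[ONTOLOGIES.UNIPROT]

Note that the new implementation drops the old isinstance checks on mappings and species; validation now rests on the invalid_mappings test and on _create_single_mapping's BIOC_ONTOLOGY_MAPPING lookup.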