napistu 0.2.5.dev6__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. napistu/__main__.py +126 -96
  2. napistu/constants.py +35 -41
  3. napistu/context/__init__.py +10 -0
  4. napistu/context/discretize.py +462 -0
  5. napistu/context/filtering.py +387 -0
  6. napistu/gcs/__init__.py +1 -1
  7. napistu/identifiers.py +74 -15
  8. napistu/indices.py +68 -0
  9. napistu/ingestion/__init__.py +1 -1
  10. napistu/ingestion/bigg.py +47 -62
  11. napistu/ingestion/constants.py +18 -133
  12. napistu/ingestion/gtex.py +113 -0
  13. napistu/ingestion/hpa.py +147 -0
  14. napistu/ingestion/sbml.py +0 -97
  15. napistu/ingestion/string.py +2 -2
  16. napistu/matching/__init__.py +10 -0
  17. napistu/matching/constants.py +18 -0
  18. napistu/matching/interactions.py +518 -0
  19. napistu/matching/mount.py +529 -0
  20. napistu/matching/species.py +510 -0
  21. napistu/mcp/__init__.py +7 -4
  22. napistu/mcp/__main__.py +128 -72
  23. napistu/mcp/client.py +16 -25
  24. napistu/mcp/codebase.py +201 -153
  25. napistu/mcp/component_base.py +170 -0
  26. napistu/mcp/config.py +223 -0
  27. napistu/mcp/constants.py +45 -2
  28. napistu/mcp/documentation.py +253 -136
  29. napistu/mcp/documentation_utils.py +13 -48
  30. napistu/mcp/execution.py +372 -305
  31. napistu/mcp/health.py +49 -67
  32. napistu/mcp/profiles.py +10 -6
  33. napistu/mcp/server.py +161 -80
  34. napistu/mcp/tutorials.py +139 -87
  35. napistu/modify/__init__.py +1 -1
  36. napistu/modify/gaps.py +1 -1
  37. napistu/network/__init__.py +1 -1
  38. napistu/network/constants.py +101 -34
  39. napistu/network/data_handling.py +388 -0
  40. napistu/network/ig_utils.py +351 -0
  41. napistu/network/napistu_graph_core.py +354 -0
  42. napistu/network/neighborhoods.py +40 -40
  43. napistu/network/net_create.py +373 -309
  44. napistu/network/net_propagation.py +47 -19
  45. napistu/network/{net_utils.py → ng_utils.py} +124 -272
  46. napistu/network/paths.py +67 -51
  47. napistu/network/precompute.py +11 -11
  48. napistu/ontologies/__init__.py +10 -0
  49. napistu/ontologies/constants.py +129 -0
  50. napistu/ontologies/dogma.py +243 -0
  51. napistu/ontologies/genodexito.py +649 -0
  52. napistu/ontologies/mygene.py +369 -0
  53. napistu/ontologies/renaming.py +198 -0
  54. napistu/rpy2/__init__.py +229 -86
  55. napistu/rpy2/callr.py +47 -77
  56. napistu/rpy2/constants.py +24 -23
  57. napistu/rpy2/rids.py +61 -648
  58. napistu/sbml_dfs_core.py +587 -222
  59. napistu/scverse/__init__.py +15 -0
  60. napistu/scverse/constants.py +28 -0
  61. napistu/scverse/loading.py +727 -0
  62. napistu/utils.py +118 -10
  63. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
  64. napistu-0.3.1.dist-info/RECORD +133 -0
  65. tests/conftest.py +22 -0
  66. tests/test_context_discretize.py +56 -0
  67. tests/test_context_filtering.py +267 -0
  68. tests/test_identifiers.py +100 -0
  69. tests/test_indices.py +65 -0
  70. tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
  71. tests/test_matching_interactions.py +108 -0
  72. tests/test_matching_mount.py +305 -0
  73. tests/test_matching_species.py +394 -0
  74. tests/test_mcp_config.py +193 -0
  75. tests/test_mcp_documentation_utils.py +12 -3
  76. tests/test_mcp_server.py +356 -0
  77. tests/test_network_data_handling.py +397 -0
  78. tests/test_network_ig_utils.py +23 -0
  79. tests/test_network_neighborhoods.py +19 -0
  80. tests/test_network_net_create.py +459 -0
  81. tests/test_network_ng_utils.py +30 -0
  82. tests/test_network_paths.py +56 -0
  83. tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
  84. tests/test_ontologies_genodexito.py +58 -0
  85. tests/test_ontologies_mygene.py +39 -0
  86. tests/test_ontologies_renaming.py +110 -0
  87. tests/test_rpy2_callr.py +79 -0
  88. tests/test_rpy2_init.py +151 -0
  89. tests/test_sbml.py +0 -31
  90. tests/test_sbml_dfs_core.py +134 -10
  91. tests/test_scverse_loading.py +778 -0
  92. tests/test_set_coverage.py +2 -2
  93. tests/test_utils.py +121 -1
  94. napistu/mechanism_matching.py +0 -1353
  95. napistu/rpy2/netcontextr.py +0 -467
  96. napistu-0.2.5.dev6.dist-info/RECORD +0 -97
  97. tests/test_igraph.py +0 -367
  98. tests/test_mechanism_matching.py +0 -784
  99. tests/test_net_utils.py +0 -149
  100. tests/test_netcontextr.py +0 -105
  101. tests/test_rpy2.py +0 -61
  102. /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
  103. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
  104. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
  105. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
  106. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
  107. /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
@@ -0,0 +1,649 @@
1
+ from typing import Dict, List, Optional, Set
2
+ import logging
3
+
4
+ import pandas as pd
5
+ from pydantic import BaseModel, Field, field_validator
6
+
7
+ from napistu import sbml_dfs_core
8
+ from napistu import identifiers
9
+ from napistu.ontologies.mygene import create_python_mapping_tables
10
+ from napistu.constants import SBML_DFS, ONTOLOGIES, IDENTIFIERS, SBML_DFS_SCHEMA
11
+ from napistu.ontologies.constants import INTERCONVERTIBLE_GENIC_ONTOLOGIES
12
+ from napistu.ontologies.constants import GENODEXITO_DEFS
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class Genodexito:
    """A tool for mapping gene identifiers across ontologies.

    Genodexito provides a unified interface for mapping between different gene identifier
    ontologies (e.g. Ensembl, Entrez, UniProt). It supports both an R-centric workflow
    using Bioconductor through RPy2, as well as a Python-centric workflow using MyGene.info.

    The class automatically handles fallback between the two methods if one fails.

    Parameters
    ----------
    species : str, optional
        The organismal species to map identifiers for, by default "Homo sapiens"
    preferred_method : str, optional
        Which mapping method to try first ("bioconductor" or "python"), by default "bioconductor"
    allow_fallback : bool, optional
        Whether to allow falling back to the other method if preferred fails, by default True
    r_paths : Optional[List[str]], optional
        Optional paths to R libraries for Bioconductor, by default None
    test_mode : bool, optional
        If True, limit queries to 1000 genes for testing purposes, by default False

    Attributes
    ----------
    mappings : Optional[Dict[str, pd.DataFrame]]
        Dictionary of mapping tables between ontologies
    mapper_used : Optional[str]
        Which mapping method was successfully used ("bioconductor" or "python")
    merged_mappings : Optional[pd.DataFrame]
        Combined wide-format mapping table
    stacked_mappings : Optional[pd.DataFrame]
        Combined long-format mapping table

    Methods
    -------
    create_mapping_tables(mappings: Set[str], overwrite: bool = False)
        Create mapping tables between different ontologies. This is the primary method
        to fetch and store identifier mappings. Must be called before using other methods.

    merge_mappings(ontologies: Optional[Set[str]] = None)
        Create a wide-format table where each row is an Entrez gene ID and columns
        contain the corresponding identifiers in other ontologies.

    stack_mappings(ontologies: Optional[Set[str]] = None)
        Create a long-format table combining all mappings, with columns for
        ontology type and identifier values.

    expand_sbml_dfs_ids(sbml_dfs: sbml_dfs_core.SBML_dfs, ontologies: Optional[Set[str]] = None)
        Update the expanded identifiers for a model by adding additional related
        ontologies pulled from Bioconductor or MyGene.info.

    Examples
    --------
    >>> # Initialize mapper with Python method
    >>> geno = Genodexito(preferred_method="python")
    >>>
    >>> # Create mapping tables for specific ontologies
    >>> mappings = {'ensembl_gene', 'symbol', 'uniprot'}
    >>> geno.create_mapping_tables(mappings)
    >>>
    >>> # Create merged wide-format table
    >>> geno.merge_mappings()
    >>> print(geno.merged_mappings.head())
    >>>
    >>> # Create stacked long-format table
    >>> geno.stack_mappings()
    >>> print(geno.stacked_mappings.head())
    """

    def __init__(
        self,
        species: str = "Homo sapiens",
        preferred_method: str = GENODEXITO_DEFS.BIOCONDUCTOR,
        allow_fallback: bool = True,
        r_paths: Optional[List[str]] = None,
        test_mode: bool = False,
    ) -> None:
        """
        Initialize unified gene mapper

        Parameters
        ----------
        species : str, optional
            Species name, by default "Homo sapiens"
        preferred_method : str, optional
            Which mapping method to try first ("bioconductor" or "python"), by default "bioconductor"
        allow_fallback : bool, optional
            Whether to allow falling back to other method if preferred fails, by default True
        r_paths : Optional[List[str]], optional
            Optional paths to R libraries for Bioconductor, by default None
        test_mode : bool, optional
            If True, limit queries to 1000 genes for testing purposes, by default False
        """
        # Validate configuration using Pydantic model
        # (raises pydantic.ValidationError on bad preferred_method / r_paths)
        config = GenodexitoConfig(
            species=species,
            preferred_method=preferred_method,
            allow_fallback=allow_fallback,
            r_paths=r_paths,
            test_mode=test_mode,
        )

        # copy validated values back onto the instance
        self.species = config.species
        self.preferred_method = config.preferred_method
        self.allow_fallback = config.allow_fallback
        self.r_paths = config.r_paths
        self.test_mode = config.test_mode

        # Initialize empty attributes; populated lazily by
        # create_mapping_tables() / merge_mappings() / stack_mappings()
        self.mappings: Optional[Dict[str, pd.DataFrame]] = None
        self.mapper_used: Optional[str] = None
        self.merged_mappings: Optional[pd.DataFrame] = None
        self.stacked_mappings: Optional[pd.DataFrame] = None

    def create_mapping_tables(
        self, mappings: Set[str], overwrite: bool = False
    ) -> None:
        """Create mapping tables between different ontologies.

        This is a drop-in replacement for create_bioconductor_mapping_tables that handles
        both Bioconductor and Python-based mapping methods.

        Parameters
        ----------
        mappings : Set[str]
            Set of ontologies to create mappings for
        overwrite : bool, optional
            Whether to overwrite existing mappings, by default False

        Returns
        -------
        None
            Updates self.mappings and self.mapper_used in place
        """

        # check for existing mappings; warn-and-return rather than raise so
        # repeated calls are harmless unless overwrite=True is requested
        if self.mappings is not None and not overwrite:
            logger.warning(
                f"Mapping tables for {self.species} already exist. Use overwrite=True to create new mappings."
            )
            return None

        if self.preferred_method == GENODEXITO_DEFS.BIOCONDUCTOR:
            try:
                # Only import R functionality when needed
                # (rpy2/Bioconductor may not be installed in python-only setups)
                from napistu.rpy2.rids import create_bioconductor_mapping_tables

                self.mappings = create_bioconductor_mapping_tables(
                    mappings=mappings, species=self.species, r_paths=self.r_paths
                )
                self.mapper_used = GENODEXITO_DEFS.BIOCONDUCTOR
            except Exception as e:
                if self.allow_fallback:
                    logger.warning(
                        f"Error creating bioconductor mapping tables for {self.species} with {mappings}. Falling back to python."
                    )
                    self.mappings = create_python_mapping_tables(
                        mappings=mappings,
                        species=self.species,
                        test_mode=self.test_mode,
                    )
                    self.mapper_used = GENODEXITO_DEFS.PYTHON
                else:
                    logger.error(
                        f"Error creating bioconductor mapping tables for {self.species} with {mappings} and fallback is disabled."
                    )
                    raise e

        elif self.preferred_method == GENODEXITO_DEFS.PYTHON:
            try:
                self.mappings = create_python_mapping_tables(
                    mappings=mappings, species=self.species, test_mode=self.test_mode
                )
                self.mapper_used = GENODEXITO_DEFS.PYTHON
            except Exception as e:
                if self.allow_fallback:
                    logger.warning(
                        f"Error creating mygene Python mapping tables for {self.species} with {mappings}. Trying the bioconductor fallback."
                    )
                    # Only import R functionality when needed
                    from napistu.rpy2.rids import create_bioconductor_mapping_tables

                    self.mappings = create_bioconductor_mapping_tables(
                        mappings=mappings, species=self.species, r_paths=self.r_paths
                    )
                    self.mapper_used = GENODEXITO_DEFS.BIOCONDUCTOR
                else:
                    logger.error(
                        f"Error creating Python mapping tables for {self.species} with {mappings} and fallback is disabled."
                    )
                    raise e

        else:
            # unreachable when constructed through __init__ (GenodexitoConfig
            # validates preferred_method), but guards direct attribute edits
            raise ValueError(f"Invalid preferred_method: {self.preferred_method}")

        return None

    def merge_mappings(self, ontologies: Optional[Set[str]] = None) -> None:
        """Merge mappings into a single wide table.

        Creates a wide-format table where each row is an Entrez gene ID and
        columns contain the corresponding identifiers in other ontologies.

        Parameters
        ----------
        ontologies : Optional[Set[str]], optional
            Set of ontologies to include in merged table, by default None
            If None, uses all available ontologies

        Returns
        -------
        None
            Updates self.merged_mappings in place

        Raises
        ------
        ValueError
            If mappings don't exist or requested ontologies are invalid
        TypeError
            If any identifiers are not strings
        ValueError
            If any mapping tables contain NA values
        """

        # mappings must exist and be valid
        self._check_mappings()
        ontologies = self._use_mappings(ontologies)

        # the Entrez table anchors the wide format; _check_mappings guarantees
        # it is present. Joins below are index-based — assumes every mapping
        # table is indexed by the Entrez gene id (TODO confirm against
        # create_python_mapping_tables / create_bioconductor_mapping_tables).
        running_ids = self.mappings[ONTOLOGIES.NCBI_ENTREZ_GENE]

        for mapping in ontologies:
            logger.debug(f"adding entries for {mapping} to running_ids")
            mapping_df = self.mappings[mapping]

            running_ids = running_ids.join(mapping_df)

        running_ids = running_ids.reset_index()

        self.merged_mappings = running_ids

        return None

    def stack_mappings(self, ontologies: Optional[Set[str]] = None) -> None:
        """Stack mappings into a single long table.

        Convert a dict of mappings between Entrez identifiers and other identifiers
        into a single long-format table.

        Parameters
        ----------
        ontologies : Optional[Set[str]], optional
            Set of ontologies to include in stacked table, by default None
            If None, uses all available ontologies

        Returns
        -------
        None
            Updates self.stacked_mappings in place

        Raises
        ------
        ValueError
            If mappings don't exist or requested ontologies are invalid
        TypeError
            If any identifiers are not strings
        ValueError
            If any mapping tables contain NA values
        """

        # mappings must exist and be valid
        self._check_mappings()
        ontologies = self._use_mappings(ontologies)

        mappings_list = list()
        for ont in ontologies:
            # tag each table with its ontology and normalize the value column
            # name so all tables can be concatenated into one long frame
            one_mapping_df = (
                self.mappings[ont]
                .assign(ontology=ont)
                .rename({ont: IDENTIFIERS.IDENTIFIER}, axis=1)
            )

            mappings_list.append(one_mapping_df)

        self.stacked_mappings = pd.concat(mappings_list)

    def expand_sbml_dfs_ids(
        self, sbml_dfs: sbml_dfs_core.SBML_dfs, ontologies: Optional[Set[str]] = None
    ) -> sbml_dfs_core.SBML_dfs:
        """Update the expanded identifiers for a model.

        Parameters
        ----------
        sbml_dfs : sbml_dfs_core.SBML_dfs
            The SBML model to update with expanded identifiers
        ontologies : Optional[Set[str]], optional
            Set of ontologies to use for mapping. If None, uses all available ontologies
            from INTERCONVERTIBLE_GENIC_ONTOLOGIES.

        Returns
        -------
        sbml_dfs_core.SBML_dfs
            Updated SBML model with expanded identifiers (mutated in place and
            also returned)
        """

        ids = getattr(sbml_dfs, "species")

        # If no ontologies specified, use all available ones
        if ontologies is None:
            ontologies = INTERCONVERTIBLE_GENIC_ONTOLOGIES
        else:
            # Ensure ncbi_entrez_gene is included in the ontologies
            # (it is the hub every mapping table is keyed on)
            ontologies = set(ontologies)
            ontologies.add(ONTOLOGIES.NCBI_ENTREZ_GENE)

            invalid_ontologies = ontologies - INTERCONVERTIBLE_GENIC_ONTOLOGIES
            if invalid_ontologies:
                raise ValueError(
                    f"Invalid ontologies: {', '.join(invalid_ontologies)}.\n"
                    f"Valid options are: {', '.join(sorted(INTERCONVERTIBLE_GENIC_ONTOLOGIES))}"
                )

        # create mapping tables if they don't exist
        if self.mappings is None:
            self.create_mapping_tables(ontologies)

        # select and validate mappings
        ontologies = self._use_mappings(ontologies)

        # NOTE(review): an existing merged_mappings is reused as-is even if it
        # was built from a different ontology set — confirm callers re-merge
        # when changing ontologies
        if self.merged_mappings is None:
            self.merge_mappings(ontologies)

        # merge existing and new identifiers
        expanded_ids = self._create_expanded_identifiers(sbml_dfs, ontologies)

        # make sure expanded_ids and original model.species have same number of s_ids
        # if a s_id only in model.species, adding it to expanded_ids.
        if ids.shape[0] != expanded_ids.shape[0]:
            matched_expanded_ids = expanded_ids.combine_first(
                ids[SBML_DFS.S_IDENTIFIERS]
            )
            logger.debug(
                f"{ids.shape[0] - expanded_ids.shape[0]} "
                "ids are not included in expanded ids"
            )
        else:
            matched_expanded_ids = expanded_ids

        # swap the identifiers column for the expanded one, preserving all
        # other species columns
        updated_ids = ids.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
            pd.DataFrame(matched_expanded_ids)
        )

        setattr(sbml_dfs, "species", updated_ids)

        return sbml_dfs

    def _check_mappings(self) -> None:
        """Check that mappings exist and contain required ontologies.

        Raises
        ------
        ValueError
            If mappings don't exist or don't contain NCBI_ENTREZ_GENE
        TypeError
            If any identifiers are not strings
        ValueError
            If any mapping tables contain NA values
        """
        if self.mappings is None:
            raise ValueError(
                f"Mapping tables for {self.species} do not exist. Use create_mapping_tables to create new mappings."
            )

        # entrez should always be present if any mappings exist
        if ONTOLOGIES.NCBI_ENTREZ_GENE not in self.mappings.keys():
            raise ValueError(
                f"Mapping tables for {self.species} do not contain {ONTOLOGIES.NCBI_ENTREZ_GENE}. Use create_mapping_tables to create new mappings."
            )

        # Check that all identifiers are strings
        # (dtype "object" is used as a proxy for string-valued columns)
        for ontology, df in self.mappings.items():
            # Check index (which should be NCBI_ENTREZ_GENE)
            if not df.index.dtype == "object":
                raise TypeError(
                    f"Index of mapping table for {ontology} contains non-string values. "
                    f"Found type: {df.index.dtype}"
                )

            # Check all columns
            for col in df.columns:
                if not df[col].dtype == "object":
                    raise TypeError(
                        f"Column {col} in mapping table for {ontology} contains non-string values. "
                        f"Found type: {df[col].dtype}"
                    )

            # Check for NA values in index
            if df.index.isna().any():
                raise ValueError(
                    f"Mapping table for {ontology} contains NA values in index (NCBI_ENTREZ_GENE). "
                    f"Found {df.index.isna().sum()} NA values."
                )

            # Check for NA values in columns
            na_counts = df.isna().sum()
            if na_counts.any():
                na_cols = na_counts[na_counts > 0].index.tolist()
                raise ValueError(
                    f"Mapping table for {ontology} contains NA values in columns: {na_cols}. "
                    f"NA counts per column: {na_counts[na_cols].to_dict()}"
                )

    def _use_mappings(self, ontologies: Optional[Set[str]]) -> Set[str]:
        """Validate and process ontologies for mapping operations.

        Parameters
        ----------
        ontologies : Optional[Set[str]]
            Set of ontologies to validate. If None, uses all available mappings.

        Returns
        -------
        Set[str]
            Set of validated ontologies to use

        Raises
        ------
        ValueError
            If mappings don't exist or ontologies are invalid
        """

        if self.mappings is None:
            raise ValueError(
                f"Mapping tables for {self.species} do not exist. Use create_mapping_tables to create new mappings."
            )

        if ontologies is None:
            return set(self.mappings.keys())

        # validate provided mappings to see if they are genic ontologies within the controlled vocabulary
        never_valid_mappings = set(ontologies) - INTERCONVERTIBLE_GENIC_ONTOLOGIES
        if never_valid_mappings:
            raise ValueError(
                f"Invalid mappings: {', '.join(never_valid_mappings)}. "
                f"Valid mappings are {', '.join(INTERCONVERTIBLE_GENIC_ONTOLOGIES)}"
            )

        # validate provided mappings against existing mappings
        missing_mappings = set(ontologies) - set(self.mappings.keys())
        if missing_mappings:
            raise ValueError(
                f"Missing mappings: {', '.join(missing_mappings)}. "
                f"Recreate mappings by calling create_mapping_tables() while including "
                f"{', '.join(missing_mappings)} and other mappings of interest."
            )

        return ontologies

    def _create_expanded_identifiers(
        self,
        sbml_dfs: sbml_dfs_core.SBML_dfs,
        ontologies: Optional[Set[str]] = None,
    ) -> pd.Series:
        """Create expanded identifiers for SBML species.

        Update a table's identifiers to include additional related ontologies.
        Ontologies are pulled from the bioconductor "org" packages or MyGene.info.

        Parameters
        ----------
        sbml_dfs : sbml_dfs_core.SBML_dfs
            A relational pathway model built around reactions interconverting
            compartmentalized species
        ontologies : Optional[Set[str]], optional
            Ontologies to add or complete, by default None
            If None, uses all available ontologies

        Returns
        -------
        pd.Series
            Series with identifiers as the index and updated Identifiers objects as values

        Raises
        ------
        ValueError
            If merged mappings don't exist or all requested ontologies already exist
        TypeError
            If identifiers are not in expected format
        """

        ontologies = self._use_mappings(ontologies)
        if self.merged_mappings is None:
            raise ValueError(
                "Merged mappings do not exist. Use merge_mappings() to create new mappings."
            )

        # pull out all identifiers as a pd.DataFrame
        all_entity_identifiers = sbml_dfs.get_identifiers("species")
        if not isinstance(all_entity_identifiers, pd.DataFrame):
            raise TypeError("all_entity_identifiers must be a pandas DataFrame")

        # find entries in valid_expanded_ontologies which are already present
        # these are the entries that will be used to expand to other ontologies
        # or fill in ontologies with incomplete annotations
        starting_ontologies = ontologies.intersection(
            set(all_entity_identifiers["ontology"])
        )

        if len(starting_ontologies) == 0:
            raise ValueError(
                f"None of the ontologies currently in the sbml_dfs match `ontologies`. The currently included ontologies are {set(all_entity_identifiers['ontology'])}. If there are major genic ontologies in this list then you may need to use ontologies.clean_ontologies() to convert from aliases to ontologies in the ONTOLOGIES controlled vocabulary."
            )

        expanded_ontologies = ontologies - starting_ontologies
        if len(expanded_ontologies) == 0:
            raise ValueError(
                "All of the requested ontologies already exist in species' s_Identifiers"
            )

        # map from existing ontologies to expanded ontologies
        # build one (identifier, new_identifier) lookup per ordered pair
        ontology_mappings = list()
        # starting w/
        for start in starting_ontologies:
            # ending w/
            for end in expanded_ontologies:
                if start == end:
                    continue
                lookup = (
                    self.merged_mappings[[start, end]]
                    .rename(
                        columns={start: IDENTIFIERS.IDENTIFIER, end: "new_identifier"}
                    )
                    .assign(ontology=start)
                    .assign(new_ontology=end)
                )

                ontology_mappings.append(lookup)

        # dropna removes pairs where either side of the mapping is missing
        ontology_mappings_df = pd.concat(ontology_mappings).dropna()

        # old identifiers joined with new identifiers

        # first, define the names of keys and ids
        table_pk_var = SBML_DFS_SCHEMA.SCHEMA[SBML_DFS.SPECIES]["pk"]

        # retain bqb terms to define how an identifier is related to sid
        # this relation will be preserved for the new ids

        # merge keys are the shared columns (ontology, identifier) — the
        # default pandas natural join
        merged_identifiers = all_entity_identifiers[
            [
                table_pk_var,
                IDENTIFIERS.ONTOLOGY,
                IDENTIFIERS.IDENTIFIER,
                IDENTIFIERS.BQB,
            ]
        ].merge(ontology_mappings_df)

        # new, possibly redundant identifiers
        new_identifiers = merged_identifiers[
            [table_pk_var, "new_ontology", "new_identifier", IDENTIFIERS.BQB]
        ].rename(
            columns={
                "new_ontology": IDENTIFIERS.ONTOLOGY,
                "new_identifier": IDENTIFIERS.IDENTIFIER,
            }
        )

        expanded_identifiers_df = pd.concat(
            [
                all_entity_identifiers[
                    [
                        table_pk_var,
                        IDENTIFIERS.ONTOLOGY,
                        IDENTIFIERS.IDENTIFIER,
                        IDENTIFIERS.URL,
                        IDENTIFIERS.BQB,
                    ]
                ],
                new_identifiers,
                # ignore new identifier if it already exists
                # NOTE(review): no explicit dedup happens here — presumably
                # identifiers.df_to_identifiers collapses duplicates; confirm
            ]
        )

        # roll the long table back up into one Identifiers object per species
        output = identifiers.df_to_identifiers(
            expanded_identifiers_df, SBML_DFS.SPECIES
        )

        return output
604
+
605
+
606
class GenodexitoConfig(BaseModel):
    """Validated configuration for :class:`Genodexito`.

    Fields mirror the ``Genodexito.__init__`` parameters; instantiating this
    model performs all argument validation.

    Attributes:
        species: Species name to use for mapping
        preferred_method: Which mapping method to try first
        allow_fallback: Whether to allow fallback to other method
        r_paths: Optional paths to R libraries
        test_mode: Whether to limit queries for testing
    """

    species: str = Field(default="Homo sapiens", description="Species name to use")
    preferred_method: str = Field(
        default=GENODEXITO_DEFS.BIOCONDUCTOR,
        description="Which mapping method to try first",
    )
    allow_fallback: bool = Field(
        default=True, description="Whether to allow fallback to other method"
    )
    r_paths: Optional[List[str]] = Field(
        default=None, description="Optional paths to R libraries"
    )
    test_mode: bool = Field(
        default=False, description="Whether to limit queries for testing"
    )

    @field_validator("preferred_method")
    @classmethod
    def validate_preferred_method(cls, v: str) -> str:
        """Reject any preferred_method outside the supported set."""
        supported = {GENODEXITO_DEFS.BIOCONDUCTOR, GENODEXITO_DEFS.PYTHON}
        if v in supported:
            return v
        raise ValueError(
            f"Invalid preferred_method: {v}. "
            f"Must be one of: {GENODEXITO_DEFS.BIOCONDUCTOR}, {GENODEXITO_DEFS.PYTHON}"
        )

    @field_validator("r_paths")
    @classmethod
    def validate_r_paths(cls, v: Optional[List[str]]) -> Optional[List[str]]:
        """Ensure every supplied R library path is a string."""
        if v is None:
            return v
        if any(not isinstance(path, str) for path in v):
            raise ValueError("All elements in r_paths must be strings")
        return v
+ return v