napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. napistu/__main__.py +126 -96
  2. napistu/constants.py +35 -41
  3. napistu/context/__init__.py +10 -0
  4. napistu/context/discretize.py +462 -0
  5. napistu/context/filtering.py +387 -0
  6. napistu/gcs/__init__.py +1 -1
  7. napistu/identifiers.py +74 -15
  8. napistu/indices.py +68 -0
  9. napistu/ingestion/__init__.py +1 -1
  10. napistu/ingestion/bigg.py +47 -62
  11. napistu/ingestion/constants.py +18 -133
  12. napistu/ingestion/gtex.py +113 -0
  13. napistu/ingestion/hpa.py +147 -0
  14. napistu/ingestion/sbml.py +0 -97
  15. napistu/ingestion/string.py +2 -2
  16. napistu/matching/__init__.py +10 -0
  17. napistu/matching/constants.py +18 -0
  18. napistu/matching/interactions.py +518 -0
  19. napistu/matching/mount.py +529 -0
  20. napistu/matching/species.py +510 -0
  21. napistu/mcp/__init__.py +7 -4
  22. napistu/mcp/__main__.py +128 -72
  23. napistu/mcp/client.py +16 -25
  24. napistu/mcp/codebase.py +201 -145
  25. napistu/mcp/component_base.py +170 -0
  26. napistu/mcp/config.py +223 -0
  27. napistu/mcp/constants.py +45 -2
  28. napistu/mcp/documentation.py +253 -136
  29. napistu/mcp/documentation_utils.py +13 -48
  30. napistu/mcp/execution.py +372 -305
  31. napistu/mcp/health.py +47 -65
  32. napistu/mcp/profiles.py +10 -6
  33. napistu/mcp/server.py +161 -80
  34. napistu/mcp/tutorials.py +139 -87
  35. napistu/modify/__init__.py +1 -1
  36. napistu/modify/gaps.py +1 -1
  37. napistu/network/__init__.py +1 -1
  38. napistu/network/constants.py +101 -34
  39. napistu/network/data_handling.py +388 -0
  40. napistu/network/ig_utils.py +351 -0
  41. napistu/network/napistu_graph_core.py +354 -0
  42. napistu/network/neighborhoods.py +40 -40
  43. napistu/network/net_create.py +373 -309
  44. napistu/network/net_propagation.py +47 -19
  45. napistu/network/{net_utils.py → ng_utils.py} +124 -272
  46. napistu/network/paths.py +67 -51
  47. napistu/network/precompute.py +11 -11
  48. napistu/ontologies/__init__.py +10 -0
  49. napistu/ontologies/constants.py +129 -0
  50. napistu/ontologies/dogma.py +243 -0
  51. napistu/ontologies/genodexito.py +649 -0
  52. napistu/ontologies/mygene.py +369 -0
  53. napistu/ontologies/renaming.py +198 -0
  54. napistu/rpy2/__init__.py +229 -86
  55. napistu/rpy2/callr.py +47 -77
  56. napistu/rpy2/constants.py +24 -23
  57. napistu/rpy2/rids.py +61 -648
  58. napistu/sbml_dfs_core.py +587 -222
  59. napistu/scverse/__init__.py +15 -0
  60. napistu/scverse/constants.py +28 -0
  61. napistu/scverse/loading.py +727 -0
  62. napistu/utils.py +118 -10
  63. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
  64. napistu-0.3.1.dist-info/RECORD +133 -0
  65. tests/conftest.py +22 -0
  66. tests/test_context_discretize.py +56 -0
  67. tests/test_context_filtering.py +267 -0
  68. tests/test_identifiers.py +100 -0
  69. tests/test_indices.py +65 -0
  70. tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
  71. tests/test_matching_interactions.py +108 -0
  72. tests/test_matching_mount.py +305 -0
  73. tests/test_matching_species.py +394 -0
  74. tests/test_mcp_config.py +193 -0
  75. tests/test_mcp_documentation_utils.py +12 -3
  76. tests/test_mcp_server.py +156 -19
  77. tests/test_network_data_handling.py +397 -0
  78. tests/test_network_ig_utils.py +23 -0
  79. tests/test_network_neighborhoods.py +19 -0
  80. tests/test_network_net_create.py +459 -0
  81. tests/test_network_ng_utils.py +30 -0
  82. tests/test_network_paths.py +56 -0
  83. tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
  84. tests/test_ontologies_genodexito.py +58 -0
  85. tests/test_ontologies_mygene.py +39 -0
  86. tests/test_ontologies_renaming.py +110 -0
  87. tests/test_rpy2_callr.py +79 -0
  88. tests/test_rpy2_init.py +151 -0
  89. tests/test_sbml.py +0 -31
  90. tests/test_sbml_dfs_core.py +134 -10
  91. tests/test_scverse_loading.py +778 -0
  92. tests/test_set_coverage.py +2 -2
  93. tests/test_utils.py +121 -1
  94. napistu/mechanism_matching.py +0 -1353
  95. napistu/rpy2/netcontextr.py +0 -467
  96. napistu-0.2.5.dev7.dist-info/RECORD +0 -98
  97. tests/test_igraph.py +0 -367
  98. tests/test_mechanism_matching.py +0 -784
  99. tests/test_net_utils.py +0 -149
  100. tests/test_netcontextr.py +0 -105
  101. tests/test_rpy2.py +0 -61
  102. /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
  103. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
  104. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
  105. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
  106. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
  107. /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
napistu/ontologies/mygene.py (added)
@@ -0,0 +1,369 @@
+ import logging
+ from typing import Dict, List, Set, Union
+ from types import GeneratorType
+ import itertools
+
+ import mygene
+ import pandas as pd
+
+ from napistu.constants import ONTOLOGIES
+ from napistu.ontologies.constants import (
+     MYGENE_DEFS,
+     NAPISTU_FROM_MYGENE_FIELDS,
+     NAPISTU_TO_MYGENE_FIELDS,
+     INTERCONVERTIBLE_GENIC_ONTOLOGIES,
+     MYGENE_QUERY_DEFS_LIST,
+     MYGENE_DEFAULT_QUERIES,
+     SPECIES_TO_TAXID,
+ )
+
+ # Configure logging to suppress biothings warnings
+ logging.getLogger("biothings.client").setLevel(logging.ERROR)
+ logger = logging.getLogger(__name__)
+
+
+ def create_python_mapping_tables(
+     mappings: Set[str], species: str = "Homo sapiens", test_mode: bool = False
+ ) -> Dict[str, pd.DataFrame]:
+     """Create genome-wide mapping tables between Entrez and other gene identifiers.
+
+     Python equivalent of create_bioconductor_mapping_tables using MyGene.info API.
+
+     Parameters
+     ----------
+     mappings : Set[str]
+         Set of ontologies to create mappings for. Must be valid ontologies from
+         INTERCONVERTIBLE_GENIC_ONTOLOGIES.
+     species : str, default "Homo sapiens"
+         Species name (e.g., "Homo sapiens", "Mus musculus"). Must be a key in
+         SPECIES_TO_TAXID or a valid NCBI taxonomy ID.
+     test_mode : bool, default False
+         If True, only fetch the first 1000 genes for testing purposes.
+
+     Returns
+     -------
+     Dict[str, pd.DataFrame]
+         Dictionary with ontology names as keys and DataFrames as values.
+         Each DataFrame has Entrez gene IDs as index and mapped identifiers as values.
+
+     Raises
+     ------
+     ValueError
+         If any requested mappings are invalid or species is not recognized.
+     ImportError
+         If mygene package is not available.
+
+     Notes
+     -----
+     The function uses MyGene.info API to fetch gene annotations and creates mapping
+     tables between different gene identifier systems. It supports various ontologies
+     like Ensembl genes/transcripts/proteins, UniProt, gene symbols, etc.
+
+     Examples
+     --------
+     >>> mappings = {'ensembl_gene', 'symbol', 'uniprot'}
+     >>> tables = create_python_mapping_tables(mappings, 'Homo sapiens')
+     >>> print(tables['symbol'].head())
+     """
+
+     mygene_fields = _format_mygene_fields(mappings)
+
+     # Convert species name to taxonomy ID
+     taxa_id = _format_mygene_species(species)
+
+     # Initialize MyGene client
+     mg = mygene.MyGeneInfo()
+
+     # Fetch comprehensive gene data
+     logger.info("Fetching genome-wide gene data from MyGene...")
+     all_genes_df = _fetch_mygene_data_all_queries(
+         mg=mg, taxa_id=taxa_id, fields=mygene_fields, test_mode=test_mode
+     )
+
+     if all_genes_df.empty:
+         raise ValueError(f"No gene data retrieved for species: {species}")
+
+     logger.info(f"Retrieved {len(all_genes_df)} genes and RNAs")
+     mapping_tables = _create_mygene_mapping_tables(all_genes_df, mygene_fields)
+
+     return mapping_tables
+
+
+ def _fetch_mygene_data_all_queries(
+     mg: mygene.MyGeneInfo,
+     taxa_id: int,
+     fields: List[str],
+     query_strategies: List[str] = MYGENE_DEFAULT_QUERIES,
+     test_mode: bool = False,
+ ) -> pd.DataFrame:
+     """Fetch comprehensive gene data from MyGene using multiple query strategies.
+
+     Parameters
+     ----------
+     mg : mygene.MyGeneInfo
+         Initialized MyGene.info client
+     taxa_id : int
+         NCBI taxonomy ID for the species
+     fields : List[str]
+         List of MyGene.info fields to retrieve
+     query_strategies : List[str], default MYGENE_DEFAULT_QUERIES
+         List of query strategies to use from MYGENE_QUERY_DEFS_LIST
+     test_mode : bool, default False
+         If True, only fetch first 1000 genes
+
+     Returns
+     -------
+     pd.DataFrame
+         Combined DataFrame with gene data from all queries
+
+     Raises
+     ------
+     ValueError
+         If any query strategies are invalid
+     """
+
+     all_results = []
+
+     # Validate queries
+     invalid_queries = set(query_strategies) - set(MYGENE_QUERY_DEFS_LIST)
+     if invalid_queries:
+         raise ValueError(
+             f"Invalid queries: {', '.join(invalid_queries)}. "
+             f"Valid queries are: {', '.join(MYGENE_QUERY_DEFS_LIST)}"
+         )
+
+     for query in query_strategies:
+         results_df = _fetch_mygene_data(
+             mg=mg, query=query, taxa_id=taxa_id, fields=fields, test_mode=test_mode
+         )
+
+         all_results.append(results_df)
+
+     return pd.concat(all_results)
+
+
+ def _format_mygene_fields(mappings: Set[str]) -> Set[str]:
+     """Format and validate ontology mappings for MyGene.info queries.
+
+     Parameters
+     ----------
+     mappings : Set[str]
+         Set of ontologies to validate and convert to MyGene.info field names
+
+     Returns
+     -------
+     Set[str]
+         Set of valid MyGene.info field names including NCBI_ENTREZ_GENE
+
+     Raises
+     ------
+     ValueError
+         If any mappings are invalid
+     """
+     # Validate inputs
+     invalid_mappings = mappings - INTERCONVERTIBLE_GENIC_ONTOLOGIES
+     if invalid_mappings:
+         raise ValueError(
+             f"Invalid mappings: {', '.join(invalid_mappings)}. "
+             f"Valid options are: {', '.join(INTERCONVERTIBLE_GENIC_ONTOLOGIES)}"
+         )
+
+     logger.info(
+         f"Creating mapping tables from entrez genes to/from {', '.join(mappings)}"
+     )
+
+     # Convert ontologies to MyGene fields and ensure NCBI_ENTREZ_GENE is included
+     mygene_fields = {NAPISTU_TO_MYGENE_FIELDS[ontology] for ontology in mappings}
+     mygene_fields.add(MYGENE_DEFS.NCBI_ENTREZ_GENE)
+
+     return mygene_fields
+
+
+ def _format_mygene_species(species: Union[str, int]) -> int:
+     """Convert species name or taxonomy ID to NCBI taxonomy ID.
+
+     Parameters
+     ----------
+     species : Union[str, int]
+         Species name (e.g. "Homo sapiens") or NCBI taxonomy ID
+
+     Returns
+     -------
+     int
+         NCBI taxonomy ID
+
+     Raises
+     ------
+     ValueError
+         If species name is not recognized
+     """
+     if isinstance(species, int):
+         logger.debug(f"Using taxonomy ID: {species}")
+         return species
+     else:
+         if species not in SPECIES_TO_TAXID:
+             raise ValueError(
+                 f"Invalid species: {species}. Please use a species name in "
+                 "SPECIES_TO_TAXID or directly pass the NCBI Taxonomy ID."
+             )
+
+         taxid = SPECIES_TO_TAXID[species]
+         logger.debug(f"Using species name: {species}; taxid: {taxid}")
+
+         return taxid
+
+
+ def _fetch_mygene_data(
+     mg: mygene.MyGeneInfo,
+     query: str,
+     taxa_id: int,
+     fields: List[str],
+     test_mode: bool = False,
+ ) -> pd.DataFrame:
+     """Fetch gene data from MyGene.info for a single query.
+
+     Parameters
+     ----------
+     mg : mygene.MyGeneInfo
+         Initialized MyGene.info client
+     query : str
+         Query string to search for genes
+     taxa_id : int
+         NCBI taxonomy ID for the species
+     fields : List[str]
+         List of MyGene.info fields to retrieve
+     test_mode : bool, default False
+         If True, only fetch first 1000 genes
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame containing gene data from the query
+
+     Raises
+     ------
+     ValueError
+         If query results are not in expected format
+     """
+     logger.debug(f"Querying: {query}")
+
+     result = mg.query(query, species=taxa_id, fields=",".join(fields), fetch_all=True)
+
+     # Validate result is a generator
+     if isinstance(result, GeneratorType):
+         all_hits = []
+
+         if test_mode:
+             # Only look at first 1000 genes in test mode
+             result = itertools.islice(result, 1000)
+
+         for i, gene in enumerate(result):
+             all_hits.append(gene)
+
+     else:
+         raise ValueError("The query results are not a generator")
+
+     results_df = pd.DataFrame(all_hits).assign(query_type=query)
+
+     if results_df.empty:
+         logger.warning(
+             f"No results found for {query} of species taxa id: {taxa_id} "
+             f"and fields: {', '.join(fields)}"
+         )
+         return pd.DataFrame()
+     else:
+         logger.info(f"Retrieved {results_df.shape[0]} genes from {query}")
+         return results_df
+
+
+ def unnest_mygene_ontology(df: pd.DataFrame, field: str) -> pd.DataFrame:
+     """Unnest a column containing list of dicts in MyGene.info results.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         DataFrame containing MyGene.info results
+     field : str
+         Field name to unnest, must contain a period to indicate nesting
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame with unnested values, containing columns for entrez ID and the
+         unnested field value
+
+     Raises
+     ------
+     ValueError
+         If field format is invalid or data structure is unexpected
+     """
+     if "." in field:
+         # Extract nested ontology field
+         col_name, key_name = field.split(".")
+     else:
+         raise ValueError(
+             f"This function should only be called on a nested mygene ontology "
+             f"field; but you passed: {field} (the period indicates nesting)"
+         )
+
+     valid_df = df.dropna()
+     rows = []
+     for i, row in valid_df.iterrows():
+         entrez = row[MYGENE_DEFS.NCBI_ENTREZ_GENE]
+
+         if isinstance(row[col_name], list):
+             for item in row[col_name]:
+                 rows.append([entrez, item[key_name]])
+         elif isinstance(row[col_name], dict):
+             rows.append([entrez, row[col_name][key_name]])
+         else:
+             raise ValueError(f"Unexpected type: {type(row[col_name])} for row {i}")
+
+     return pd.DataFrame(rows, columns=[MYGENE_DEFS.NCBI_ENTREZ_GENE, field])
+
+
+ def _create_mygene_mapping_tables(
+     mygene_results_df: pd.DataFrame, mygene_fields: Set[str]
+ ) -> Dict[str, pd.DataFrame]:
+     """Create mapping tables from MyGene.info query results.
+
+     Parameters
+     ----------
+     mygene_results_df : pd.DataFrame
+         DataFrame containing MyGene.info query results
+     mygene_fields : Set[str]
+         Set of MyGene.info fields that were queried
+
+     Returns
+     -------
+     Dict[str, pd.DataFrame]
+         Dictionary mapping ontology names to DataFrames containing identifier mappings
+     """
+     mapping_tables = {}
+     for field in mygene_fields:
+         logger.info(f"Processing field: {field}")
+
+         # Select entrezgene + query field
+         if field == MYGENE_DEFS.NCBI_ENTREZ_GENE:
+             tbl = mygene_results_df.loc[:, [MYGENE_DEFS.NCBI_ENTREZ_GENE]]
+         elif "." in field:
+             ontology, entity = field.split(".")
+             tbl = unnest_mygene_ontology(
+                 mygene_results_df.loc[:, [MYGENE_DEFS.NCBI_ENTREZ_GENE, ontology]],
+                 field,
+             )
+         else:
+             tbl = mygene_results_df.loc[:, [MYGENE_DEFS.NCBI_ENTREZ_GENE, field]]
+
+         mapping_tables[NAPISTU_FROM_MYGENE_FIELDS[field]] = (
+             # Rename records
+             tbl.rename(columns={c: NAPISTU_FROM_MYGENE_FIELDS[c] for c in tbl.columns})
+             # Force all records to be strings
+             .astype(str)
+             # Remove duplicates
+             .drop_duplicates()
+             # Set index
+             .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
+         )
+
+     return mapping_tables
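
The tables returned by create_python_mapping_tables are keyed by ontology name and indexed by Entrez gene ID, so any two of them can be joined on that index to translate between identifier systems. A minimal usage sketch, assuming napistu 0.3.1 and the mygene package are installed and MyGene.info is reachable; the chosen ontology names and the join itself are illustrative, not part of the diff:

    from napistu.ontologies.mygene import create_python_mapping_tables

    # test_mode=True limits each query to its first 1000 genes, keeping the example fast.
    tables = create_python_mapping_tables(
        mappings={"symbol", "uniprot"}, species="Homo sapiens", test_mode=True
    )

    # Every table is indexed by Entrez gene ID, so joining on the index yields a
    # symbol <-> uniprot crosswalk routed through Entrez.
    symbol_to_uniprot = tables["symbol"].join(tables["uniprot"], how="inner")
    print(symbol_to_uniprot.head())
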
napistu/ontologies/renaming.py (added)
@@ -0,0 +1,198 @@
+ """Module for handling ontology aliases and validation."""
+
+ from __future__ import annotations
+
+ from typing import Dict, Set
+ import logging
+ from pydantic import BaseModel, field_validator
+ from napistu.constants import (
+     ONTOLOGY_SPECIES_ALIASES,
+     ONTOLOGIES_LIST,
+     IDENTIFIERS,
+     SBML_DFS,
+ )
+ import pandas as pd
+ from napistu import identifiers, sbml_dfs_core
+
+ logger = logging.getLogger(__name__)
+
+
+ def rename_species_ontologies(
+     sbml_dfs: sbml_dfs_core.SBML_dfs, aliases=ONTOLOGY_SPECIES_ALIASES
+ ):
+     """
+     Rename ontologies in the species identifiers table of an SBML_dfs object using provided aliases.
+
+     This function updates the ontology names in the species identifiers of the given SBML_dfs object
+     according to the provided alias mapping. It validates the alias mapping, logs which ontologies will be updated,
+     and replaces any matching aliases in the species identifiers with their canonical ontology names.
+
+     Parameters
+     ----------
+     sbml_dfs : napistu.sbml_dfs_core.SBML_dfs
+         The SBML_dfs object whose species table will be updated in-place.
+     aliases : dict[str, set[str]], optional
+         Dictionary mapping canonical ontology names to sets of their aliases. By default, uses ONTOLOGY_SPECIES_ALIASES.
+         All keys must be valid ontologies from ONTOLOGIES_LIST. Values must not overlap between keys or with keys themselves.
+
+     Returns
+     -------
+     None
+         The function updates sbml_dfs.species in-place and does not return a value.
+
+     Raises
+     ------
+     ValueError
+         If the alias mapping is invalid (e.g., keys not in ONTOLOGIES_LIST, overlapping values, or values used as keys),
+         or if there is no overlap between the provided aliases and the ontologies present in the species identifiers.
+
+     Examples
+     --------
+     >>> from napistu.ontologies.renaming import rename_species_ontologies
+     >>> sbml_dfs = ...  # an SBML_dfs object
+     >>> aliases = {"ncbi_entrez_gene": {"ncbigene", "ncbi_gene"}, "uniprot": {"uniprot_id"}}
+     >>> rename_species_ontologies(sbml_dfs, aliases)
+     """
+
+     species_identifiers = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
+
+     aliases = OntologySet(ontologies=aliases).ontologies
+     alias_mapping = _create_alias_mapping(aliases)
+
+     _log_ontology_updates(alias_mapping, set(species_identifiers[IDENTIFIERS.ONTOLOGY]))
+
+     species_identifiers[IDENTIFIERS.ONTOLOGY] = species_identifiers[
+         IDENTIFIERS.ONTOLOGY
+     ].map(lambda x: alias_mapping.get(x, x))
+
+     species_identifiers = identifiers.df_to_identifiers(
+         species_identifiers, SBML_DFS.SPECIES
+     )
+
+     updated_species = sbml_dfs.species.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
+         pd.DataFrame(species_identifiers)
+     )
+
+     setattr(sbml_dfs, "species", updated_species)
+
+
+ class OntologySet(BaseModel):
+     """Validate ontology mappings.
+
+     This model ensures that:
+     1. All keys are valid ontologies from ONTOLOGIES_LIST
+     2. The dict maps strings to sets of strings
+     3. Values in the sets do not overlap between different keys
+     4. Values in the sets are not also used as keys
+
+     Attributes
+     ----------
+     ontologies : Dict[str, Set[str]]
+         Dictionary mapping ontology names to sets of their aliases
+     """
+
+     ontologies: Dict[str, Set[str]]
+
+     @field_validator("ontologies")
+     @classmethod
+     def validate_ontologies(cls, v: Dict[str, Set[str]]) -> Dict[str, Set[str]]:
+         """Validate the ontology mapping structure.
+
+         Parameters
+         ----------
+         v : Dict[str, Set[str]]
+             Dictionary mapping ontology names to sets of their aliases
+
+         Returns
+         -------
+         Dict[str, Set[str]]
+             The validated ontology mapping dictionary
+
+         Raises
+         ------
+         ValueError
+             If any keys are not valid ontologies from ONTOLOGIES_LIST
+             If any values overlap between different ontologies
+             If any values are also used as ontology keys
+         """
+         # Check that all keys are valid ontologies
+         invalid_ontologies = set(v.keys()) - set(ONTOLOGIES_LIST)
+         if invalid_ontologies:
+             raise ValueError(
+                 f"Invalid ontologies: {', '.join(invalid_ontologies)}. "
+                 f"Must be one of: {', '.join(ONTOLOGIES_LIST)}"
+             )
+
+         # Check that values don't overlap between keys and aren't used as keys
+         all_values = set()
+         keys = set(v.keys())
+         for key, values in v.items():
+             # Check for overlap with other values
+             overlap = values & all_values
+             if overlap:
+                 raise ValueError(
+                     f"Found overlapping values {overlap} under multiple ontologies"
+                 )
+             # Check for overlap with keys
+             key_overlap = values & keys
+             if key_overlap:
+                 raise ValueError(
+                     f"Found values {key_overlap} that are also used as ontology keys"
+                 )
+             all_values.update(values)
+
+         return v
+
+
+ def _create_alias_mapping(ontology_dict: Dict[str, Set[str]]) -> Dict[str, str]:
+     """Create a mapping from aliases to canonical ontology names.
+
+     Only creates mappings for the aliases specified in the input dictionary.
+     Does not include mappings for canonical names to themselves.
+
+     Parameters
+     ----------
+     ontology_dict : Dict[str, Set[str]]
+         Dictionary mapping ontologies to their aliases
+
+     Returns
+     -------
+     Dict[str, str]
+         Dictionary mapping each alias to its canonical ontology name
+     """
+     mapping = {}
+     for ontology, aliases in ontology_dict.items():
+         # Only map aliases to canonical names
+         for alias in aliases:
+             mapping[alias] = ontology
+     return mapping
+
+
+ def _log_ontology_updates(
+     alias_mapping: Dict[str, str], species_ontologies: Set[str]
+ ) -> None:
+     """Log which ontology aliases will be updated.
+
+     Parameters
+     ----------
+     alias_mapping : Dict[str, str]
+         Dictionary mapping old ontology names to new ones
+     species_ontologies : Set[str]
+         Set of ontology names present in the species identifiers
+
+     Raises
+     ------
+     ValueError
+         If there is no overlap between the aliases and species ontologies
+     """
+     # Find which aliases are present in the species data
+     updatable_aliases = set(alias_mapping.keys()) & species_ontologies
+     if not updatable_aliases:
+         raise ValueError(
+             "The set of ontologies in the species identifiers and aliases do not overlap. "
+             "Please provide an updated aliases dict."
+         )
+
+     # Log which ontologies will be updated
+     updates = [f"{old} -> {alias_mapping[old]}" for old in updatable_aliases]
+     logger.info(f"Updating the following ontologies: {', '.join(updates)}")