napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +1 -3
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -145
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +47 -65
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dev1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +156 -19
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev7.dist-info/RECORD +0 -98
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
@@ -0,0 +1,369 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Dict, List, Set, Union
|
3
|
+
from types import GeneratorType
|
4
|
+
import itertools
|
5
|
+
|
6
|
+
import mygene
|
7
|
+
import pandas as pd
|
8
|
+
|
9
|
+
from napistu.constants import ONTOLOGIES
|
10
|
+
from napistu.ontologies.constants import (
|
11
|
+
MYGENE_DEFS,
|
12
|
+
NAPISTU_FROM_MYGENE_FIELDS,
|
13
|
+
NAPISTU_TO_MYGENE_FIELDS,
|
14
|
+
INTERCONVERTIBLE_GENIC_ONTOLOGIES,
|
15
|
+
MYGENE_QUERY_DEFS_LIST,
|
16
|
+
MYGENE_DEFAULT_QUERIES,
|
17
|
+
SPECIES_TO_TAXID,
|
18
|
+
)
|
19
|
+
|
20
|
+
# Configure logging to suppress biothings warnings
|
21
|
+
logging.getLogger("biothings.client").setLevel(logging.ERROR)
|
22
|
+
logger = logging.getLogger(__name__)
|
23
|
+
|
24
|
+
|
25
|
+
def create_python_mapping_tables(
    mappings: Set[str], species: str = "Homo sapiens", test_mode: bool = False
) -> Dict[str, pd.DataFrame]:
    """Build genome-wide identifier mapping tables keyed by Entrez gene ID.

    Pure-Python counterpart of create_bioconductor_mapping_tables, backed by
    the MyGene.info web service.

    Parameters
    ----------
    mappings : Set[str]
        Ontologies to map to/from Entrez. Each entry must belong to
        INTERCONVERTIBLE_GENIC_ONTOLOGIES.
    species : str, default "Homo sapiens"
        Species name present in SPECIES_TO_TAXID, or an NCBI taxonomy ID.
    test_mode : bool, default False
        When True, restrict each query to its first 1000 genes.

    Returns
    -------
    Dict[str, pd.DataFrame]
        One DataFrame per requested ontology, with Entrez gene IDs as the
        index and the mapped identifiers as values.

    Raises
    ------
    ValueError
        If a requested mapping or the species is not recognized, or if no
        gene data could be retrieved.

    Examples
    --------
    >>> mappings = {'ensembl_gene', 'symbol', 'uniprot'}
    >>> tables = create_python_mapping_tables(mappings, 'Homo sapiens')
    >>> print(tables['symbol'].head())
    """

    fields = _format_mygene_fields(mappings)

    # Convert species name to taxonomy ID
    taxonomy_id = _format_mygene_species(species)

    # Initialize MyGene client
    client = mygene.MyGeneInfo()

    # Fetch comprehensive gene data
    logger.info("Fetching genome-wide gene data from MyGene...")
    gene_data = _fetch_mygene_data_all_queries(
        mg=client, taxa_id=taxonomy_id, fields=fields, test_mode=test_mode
    )

    if gene_data.empty:
        raise ValueError(f"No gene data retrieved for species: {species}")

    logger.info(f"Retrieved {len(gene_data)} genes and RNAs")

    return _create_mygene_mapping_tables(gene_data, fields)
|
90
|
+
|
91
|
+
|
92
|
+
def _fetch_mygene_data_all_queries(
    mg: mygene.MyGeneInfo,
    taxa_id: int,
    fields: List[str],
    query_strategies: List[str] = MYGENE_DEFAULT_QUERIES,
    test_mode: bool = False,
) -> pd.DataFrame:
    """Run every query strategy against MyGene.info and stack the results.

    Parameters
    ----------
    mg : mygene.MyGeneInfo
        Initialized MyGene.info client
    taxa_id : int
        NCBI taxonomy ID for the species
    fields : List[str]
        MyGene.info fields to retrieve for each gene
    query_strategies : List[str], default MYGENE_DEFAULT_QUERIES
        Names of query strategies; each must appear in MYGENE_QUERY_DEFS_LIST
    test_mode : bool, default False
        If True, each query only fetches its first 1000 genes

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of the per-query result frames

    Raises
    ------
    ValueError
        If any entry of query_strategies is not a known strategy
    """

    # Reject unknown strategies before issuing any network calls.
    unknown = set(query_strategies) - set(MYGENE_QUERY_DEFS_LIST)
    if unknown:
        raise ValueError(
            f"Invalid queries: {', '.join(unknown)}. "
            f"Valid queries are: {', '.join(MYGENE_QUERY_DEFS_LIST)}"
        )

    frames = [
        _fetch_mygene_data(
            mg=mg, query=strategy, taxa_id=taxa_id, fields=fields, test_mode=test_mode
        )
        for strategy in query_strategies
    ]

    return pd.concat(frames)
|
143
|
+
|
144
|
+
|
145
|
+
def _format_mygene_fields(mappings: Set[str]) -> Set[str]:
    """Translate Napistu ontology names into MyGene.info field names.

    Parameters
    ----------
    mappings : Set[str]
        Ontologies to convert; each must be in INTERCONVERTIBLE_GENIC_ONTOLOGIES

    Returns
    -------
    Set[str]
        Corresponding MyGene.info fields, always including the Entrez gene field

    Raises
    ------
    ValueError
        If any requested ontology is not interconvertible
    """
    unsupported = mappings - INTERCONVERTIBLE_GENIC_ONTOLOGIES
    if unsupported:
        raise ValueError(
            f"Invalid mappings: {', '.join(unsupported)}. "
            f"Valid options are: {', '.join(INTERCONVERTIBLE_GENIC_ONTOLOGIES)}"
        )

    logger.info(
        f"Creating mapping tables from entrez genes to/from {', '.join(mappings)}"
    )

    # Every mapping table is keyed by Entrez gene, so that field is always
    # requested alongside the translated ontology fields.
    return {NAPISTU_TO_MYGENE_FIELDS[name] for name in mappings} | {
        MYGENE_DEFS.NCBI_ENTREZ_GENE
    }
|
180
|
+
|
181
|
+
|
182
|
+
def _format_mygene_species(species: Union[str, int]) -> int:
    """Resolve a species name or taxonomy ID to an NCBI taxonomy ID.

    Parameters
    ----------
    species : Union[str, int]
        Species name (e.g. "Homo sapiens") or NCBI taxonomy ID

    Returns
    -------
    int
        NCBI taxonomy ID

    Raises
    ------
    ValueError
        If a species name is not present in SPECIES_TO_TAXID
    """
    # An integer is already a taxonomy ID; pass it straight through.
    if isinstance(species, int):
        logger.debug(f"Using taxonomy ID: {species}")
        return species

    if species not in SPECIES_TO_TAXID:
        raise ValueError(
            f"Invalid species: {species}. Please use a species name in "
            "SPECIES_TO_TAXID or directly pass the NCBI Taxonomy ID."
        )

    taxid = SPECIES_TO_TAXID[species]
    logger.debug(f"Using species name: {species}; taxid: {taxid}")
    return taxid
|
214
|
+
|
215
|
+
|
216
|
+
def _fetch_mygene_data(
    mg: mygene.MyGeneInfo,
    query: str,
    taxa_id: int,
    fields: List[str],
    test_mode: bool = False,
) -> pd.DataFrame:
    """Fetch gene data from MyGene.info for a single query.

    Parameters
    ----------
    mg : mygene.MyGeneInfo
        Initialized MyGene.info client
    query : str
        Query string to search for genes
    taxa_id : int
        NCBI taxonomy ID for the species
    fields : List[str]
        List of MyGene.info fields to retrieve
    test_mode : bool, default False
        If True, only fetch first 1000 genes

    Returns
    -------
    pd.DataFrame
        DataFrame containing gene data from the query; empty if no hits

    Raises
    ------
    ValueError
        If query results are not in expected format
    """
    logger.debug(f"Querying: {query}")

    result = mg.query(query, species=taxa_id, fields=",".join(fields), fetch_all=True)

    # With fetch_all=True the client should hand back a generator of hits;
    # anything else indicates an unexpected response shape, so fail fast.
    if not isinstance(result, GeneratorType):
        raise ValueError("The query results are not a generator")

    if test_mode:
        # Only look at first 1000 genes in test mode
        result = itertools.islice(result, 1000)

    all_hits = list(result)

    results_df = pd.DataFrame(all_hits).assign(query_type=query)

    if results_df.empty:
        logger.warning(
            f"No results found for {query} of species taxa id: {taxa_id} "
            f"and fields: {', '.join(fields)}"
        )
        return pd.DataFrame()

    logger.info(f"Retrieved {results_df.shape[0]} genes from {query}")
    return results_df
|
277
|
+
|
278
|
+
|
279
|
+
def unnest_mygene_ontology(df: pd.DataFrame, field: str) -> pd.DataFrame:
    """Expand a nested MyGene.info column into (entrez, value) rows.

    Parameters
    ----------
    df : pd.DataFrame
        MyGene.info results restricted to the Entrez column and the nested column
    field : str
        Dotted field name ("column.key") identifying the nested value to extract

    Returns
    -------
    pd.DataFrame
        Two columns: the Entrez gene ID and the extracted nested field value

    Raises
    ------
    ValueError
        If `field` is not dotted, or a cell is neither a list nor a dict
    """
    if "." not in field:
        raise ValueError(
            f"This function should only be called on a nested mygene ontology "
            f"field; but you passed: {field} (the period indicates nesting)"
        )

    # Split the dotted field into the frame column and the nested dict key.
    column, nested_key = field.split(".")

    records = []
    # Rows with missing values carry nothing to unnest.
    for idx, row in df.dropna().iterrows():
        entrez_id = row[MYGENE_DEFS.NCBI_ENTREZ_GENE]
        cell = row[column]

        if isinstance(cell, list):
            # One output row per element of the nested list.
            records.extend([entrez_id, entry[nested_key]] for entry in cell)
        elif isinstance(cell, dict):
            records.append([entrez_id, cell[nested_key]])
        else:
            raise ValueError(f"Unexpected type: {type(cell)} for row {idx}")

    return pd.DataFrame(records, columns=[MYGENE_DEFS.NCBI_ENTREZ_GENE, field])
|
323
|
+
|
324
|
+
|
325
|
+
def _create_mygene_mapping_tables(
    mygene_results_df: pd.DataFrame, mygene_fields: Set[str]
) -> Dict[str, pd.DataFrame]:
    """Create mapping tables from MyGene.info query results.

    Parameters
    ----------
    mygene_results_df : pd.DataFrame
        DataFrame containing MyGene.info query results
    mygene_fields : Set[str]
        Set of MyGene.info fields that were queried

    Returns
    -------
    Dict[str, pd.DataFrame]
        Dictionary mapping ontology names to DataFrames containing identifier
        mappings, each indexed by Entrez gene ID
    """
    mapping_tables = {}
    for field in mygene_fields:
        logger.info(f"Processing field: {field}")

        # Select entrezgene + query field
        if field == MYGENE_DEFS.NCBI_ENTREZ_GENE:
            # The Entrez field maps to itself; a single-column table suffices.
            tbl = mygene_results_df.loc[:, [MYGENE_DEFS.NCBI_ENTREZ_GENE]]
        elif "." in field:
            # Nested fields (e.g. "ensembl.gene") must be unnested; only the
            # top-level column name is needed to subset the frame. The nested
            # key is re-derived inside unnest_mygene_ontology.
            ontology, _ = field.split(".")
            tbl = unnest_mygene_ontology(
                mygene_results_df.loc[:, [MYGENE_DEFS.NCBI_ENTREZ_GENE, ontology]],
                field,
            )
        else:
            tbl = mygene_results_df.loc[:, [MYGENE_DEFS.NCBI_ENTREZ_GENE, field]]

        mapping_tables[NAPISTU_FROM_MYGENE_FIELDS[field]] = (
            # Rename records
            tbl.rename(columns={c: NAPISTU_FROM_MYGENE_FIELDS[c] for c in tbl.columns})
            # Force all records to be strings
            .astype(str)
            # Remove duplicates
            .drop_duplicates()
            # Set index
            .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
        )

    return mapping_tables
|
@@ -0,0 +1,198 @@
|
|
1
|
+
"""Module for handling ontology aliases and validation."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
from typing import Dict, Set
|
6
|
+
import logging
|
7
|
+
from pydantic import BaseModel, field_validator
|
8
|
+
from napistu.constants import (
|
9
|
+
ONTOLOGY_SPECIES_ALIASES,
|
10
|
+
ONTOLOGIES_LIST,
|
11
|
+
IDENTIFIERS,
|
12
|
+
SBML_DFS,
|
13
|
+
)
|
14
|
+
import pandas as pd
|
15
|
+
from napistu import identifiers, sbml_dfs_core
|
16
|
+
|
17
|
+
logger = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
def rename_species_ontologies(
    sbml_dfs: sbml_dfs_core.SBML_dfs, aliases=ONTOLOGY_SPECIES_ALIASES
):
    """
    Rename ontologies in the species identifiers table of an SBML_dfs object using provided aliases.

    This function updates the ontology names in the species identifiers of the given SBML_dfs object
    according to the provided alias mapping. It validates the alias mapping, logs which ontologies
    will be updated, and replaces any matching aliases in the species identifiers with their
    canonical ontology names.

    Parameters
    ----------
    sbml_dfs : napistu.sbml_dfs_core.SBML_dfs
        The SBML_dfs object whose species table will be updated in-place.
    aliases : dict[str, set[str]], optional
        Dictionary mapping canonical ontology names to sets of their aliases. By default, uses
        ONTOLOGY_SPECIES_ALIASES. All keys must be valid ontologies from ONTOLOGIES_LIST.
        Values must not overlap between keys or with keys themselves.

    Returns
    -------
    None
        The function updates sbml_dfs.species in-place and does not return a value.

    Raises
    ------
    ValueError
        If the alias mapping is invalid (e.g., keys not in ONTOLOGIES_LIST, overlapping values,
        or values used as keys), or if there is no overlap between the provided aliases and the
        ontologies present in the species identifiers.

    Examples
    --------
    >>> from napistu.ontologies.renaming import rename_species_ontologies
    >>> sbml_dfs = ...  # an SBML_dfs object
    >>> aliases = {"ncbi_entrez_gene": {"ncbigene", "ncbi_gene"}, "uniprot": {"uniprot_id"}}
    >>> rename_species_ontologies(sbml_dfs, aliases)
    """

    species_identifiers = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)

    # Validate the alias dict and flatten it into {alias: canonical_name}.
    aliases = OntologySet(ontologies=aliases).ontologies
    alias_mapping = _create_alias_mapping(aliases)

    _log_ontology_updates(alias_mapping, set(species_identifiers[IDENTIFIERS.ONTOLOGY]))

    # Replace aliased ontology names; names without an alias pass through unchanged.
    species_identifiers[IDENTIFIERS.ONTOLOGY] = species_identifiers[
        IDENTIFIERS.ONTOLOGY
    ].map(lambda x: alias_mapping.get(x, x))

    # Re-pack the flat identifier table into per-species identifier objects.
    species_identifiers = identifiers.df_to_identifiers(
        species_identifiers, SBML_DFS.SPECIES
    )

    updated_species = sbml_dfs.species.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
        pd.DataFrame(species_identifiers)
    )

    # Plain attribute assignment: setattr() with a constant attribute name is
    # equivalent but obscures intent (flake8-bugbear B010).
    sbml_dfs.species = updated_species
|
77
|
+
|
78
|
+
|
79
|
+
class OntologySet(BaseModel):
    """Validated mapping of canonical ontology names to alias sets.

    Guarantees enforced on the ``ontologies`` dict:
    1. Every key is a recognized ontology from ONTOLOGIES_LIST
    2. The dict maps strings to sets of strings
    3. No alias appears under more than one ontology
    4. No alias is itself used as an ontology key

    Attributes
    ----------
    ontologies : Dict[str, Set[str]]
        Dictionary mapping ontology names to sets of their aliases
    """

    ontologies: Dict[str, Set[str]]

    @field_validator("ontologies")
    @classmethod
    def validate_ontologies(cls, v: Dict[str, Set[str]]) -> Dict[str, Set[str]]:
        """Validate the ontology mapping structure.

        Parameters
        ----------
        v : Dict[str, Set[str]]
            Dictionary mapping ontology names to sets of their aliases

        Returns
        -------
        Dict[str, Set[str]]
            The validated ontology mapping dictionary

        Raises
        ------
        ValueError
            If a key is not a valid ontology from ONTOLOGIES_LIST, if an alias
            appears under two ontologies, or if an alias collides with a key
        """
        # Every key must be a known ontology.
        unknown_keys = set(v.keys()) - set(ONTOLOGIES_LIST)
        if unknown_keys:
            raise ValueError(
                f"Invalid ontologies: {', '.join(unknown_keys)}. "
                f"Must be one of: {', '.join(ONTOLOGIES_LIST)}"
            )

        # Aliases must be unique across ontologies and distinct from the keys.
        canonical_names = set(v.keys())
        seen_aliases: Set[str] = set()
        for alias_set in v.values():
            duplicated = alias_set & seen_aliases
            if duplicated:
                raise ValueError(
                    f"Found overlapping values {duplicated} under multiple ontologies"
                )
            shadowing = alias_set & canonical_names
            if shadowing:
                raise ValueError(
                    f"Found values {shadowing} that are also used as ontology keys"
                )
            seen_aliases |= alias_set

        return v
|
145
|
+
|
146
|
+
|
147
|
+
def _create_alias_mapping(ontology_dict: Dict[str, Set[str]]) -> Dict[str, str]:
|
148
|
+
"""Create a mapping from aliases to canonical ontology names.
|
149
|
+
|
150
|
+
Only creates mappings for the aliases specified in the input dictionary.
|
151
|
+
Does not include mappings for canonical names to themselves.
|
152
|
+
|
153
|
+
Parameters
|
154
|
+
----------
|
155
|
+
ontology_dict : Dict[str, Set[str]]
|
156
|
+
Dictionary mapping ontologies to their aliases
|
157
|
+
|
158
|
+
Returns
|
159
|
+
-------
|
160
|
+
Dict[str, str]
|
161
|
+
Dictionary mapping each alias to its canonical ontology name
|
162
|
+
"""
|
163
|
+
mapping = {}
|
164
|
+
for ontology, aliases in ontology_dict.items():
|
165
|
+
# Only map aliases to canonical names
|
166
|
+
for alias in aliases:
|
167
|
+
mapping[alias] = ontology
|
168
|
+
return mapping
|
169
|
+
|
170
|
+
|
171
|
+
def _log_ontology_updates(
|
172
|
+
alias_mapping: Dict[str, str], species_ontologies: Set[str]
|
173
|
+
) -> None:
|
174
|
+
"""Log which ontology aliases will be updated.
|
175
|
+
|
176
|
+
Parameters
|
177
|
+
----------
|
178
|
+
alias_mapping : Dict[str, str]
|
179
|
+
Dictionary mapping old ontology names to new ones
|
180
|
+
species_ontologies : Set[str]
|
181
|
+
Set of ontology names present in the species identifiers
|
182
|
+
|
183
|
+
Raises
|
184
|
+
------
|
185
|
+
ValueError
|
186
|
+
If there is no overlap between the aliases and species ontologies
|
187
|
+
"""
|
188
|
+
# Find which aliases are present in the species data
|
189
|
+
updatable_aliases = set(alias_mapping.keys()) & species_ontologies
|
190
|
+
if not updatable_aliases:
|
191
|
+
raise ValueError(
|
192
|
+
"The set of ontologies in the species identifiers and aliases do not overlap. "
|
193
|
+
"Please provide an updated aliases dict."
|
194
|
+
)
|
195
|
+
|
196
|
+
# Log which ontologies will be updated
|
197
|
+
updates = [f"{old} -> {alias_mapping[old]}" for old in updatable_aliases]
|
198
|
+
logger.info(f"Updating the following ontologies: {', '.join(updates)}")
|