napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +1 -3
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -145
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +47 -65
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dev1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +156 -19
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev7.dist-info/RECORD +0 -98
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
@@ -0,0 +1,649 @@
|
|
1
|
+
from typing import Dict, List, Optional, Set
|
2
|
+
import logging
|
3
|
+
|
4
|
+
import pandas as pd
|
5
|
+
from pydantic import BaseModel, Field, field_validator
|
6
|
+
|
7
|
+
from napistu import sbml_dfs_core
|
8
|
+
from napistu import identifiers
|
9
|
+
from napistu.ontologies.mygene import create_python_mapping_tables
|
10
|
+
from napistu.constants import SBML_DFS, ONTOLOGIES, IDENTIFIERS, SBML_DFS_SCHEMA
|
11
|
+
from napistu.ontologies.constants import INTERCONVERTIBLE_GENIC_ONTOLOGIES
|
12
|
+
from napistu.ontologies.constants import GENODEXITO_DEFS
|
13
|
+
|
14
|
+
logger = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
class Genodexito:
    """A tool for mapping gene identifiers across ontologies.

    Genodexito provides a unified interface for mapping between different gene identifier
    ontologies (e.g. Ensembl, Entrez, UniProt). It supports both an R-centric workflow
    using Bioconductor through RPy2, as well as a Python-centric workflow using MyGene.info.

    The class automatically handles fallback between the two methods if one fails.

    Parameters
    ----------
    species : str, optional
        The organismal species to map identifiers for, by default "Homo sapiens"
    preferred_method : str, optional
        Which mapping method to try first ("bioconductor" or "python"), by default "bioconductor"
    allow_fallback : bool, optional
        Whether to allow falling back to the other method if preferred fails, by default True
    r_paths : Optional[List[str]], optional
        Optional paths to R libraries for Bioconductor, by default None
    test_mode : bool, optional
        If True, limit queries to 1000 genes for testing purposes, by default False

    Attributes
    ----------
    mappings : Optional[Dict[str, pd.DataFrame]]
        Dictionary of mapping tables between ontologies, keyed by ontology name.
        Each table is indexed by NCBI Entrez gene ID (see _check_mappings).
    mapper_used : Optional[str]
        Which mapping method was successfully used ("bioconductor" or "python")
    merged_mappings : Optional[pd.DataFrame]
        Combined wide-format mapping table (one row per Entrez gene)
    stacked_mappings : Optional[pd.DataFrame]
        Combined long-format mapping table (ontology/identifier columns)

    Methods
    -------
    create_mapping_tables(mappings: Set[str], overwrite: bool = False)
        Create mapping tables between different ontologies. This is the primary method
        to fetch and store identifier mappings. Must be called before using other methods.

    merge_mappings(ontologies: Optional[Set[str]] = None)
        Create a wide-format table where each row is an Entrez gene ID and columns
        contain the corresponding identifiers in other ontologies.

    stack_mappings(ontologies: Optional[Set[str]] = None)
        Create a long-format table combining all mappings, with columns for
        ontology type and identifier values.

    expand_sbml_dfs_ids(sbml_dfs: sbml_dfs_core.SBML_dfs, ontologies: Optional[Set[str]] = None)
        Update the expanded identifiers for a model by adding additional related
        ontologies pulled from Bioconductor or MyGene.info.

    Examples
    --------
    >>> # Initialize mapper with Python method
    >>> geno = Genodexito(preferred_method="python")
    >>>
    >>> # Create mapping tables for specific ontologies
    >>> mappings = {'ensembl_gene', 'symbol', 'uniprot'}
    >>> geno.create_mapping_tables(mappings)
    >>>
    >>> # Create merged wide-format table
    >>> geno.merge_mappings()
    >>> print(geno.merged_mappings.head())
    >>>
    >>> # Create stacked long-format table
    >>> geno.stack_mappings()
    >>> print(geno.stacked_mappings.head())
    """

    def __init__(
        self,
        species: str = "Homo sapiens",
        preferred_method: str = GENODEXITO_DEFS.BIOCONDUCTOR,
        allow_fallback: bool = True,
        r_paths: Optional[List[str]] = None,
        test_mode: bool = False,
    ) -> None:
        """
        Initialize unified gene mapper

        Parameters
        ----------
        species : str, optional
            Species name, by default "Homo sapiens"
        preferred_method : str, optional
            Which mapping method to try first ("bioconductor" or "python"), by default "bioconductor"
        allow_fallback : bool, optional
            Whether to allow falling back to other method if preferred fails, by default True
        r_paths : Optional[List[str]], optional
            Optional paths to R libraries for Bioconductor, by default None
        test_mode : bool, optional
            If True, limit queries to 1000 genes for testing purposes, by default False

        Raises
        ------
        pydantic.ValidationError
            If preferred_method is not a recognized method or r_paths contains
            non-string entries (raised by GenodexitoConfig validation).
        """
        # Validate configuration using Pydantic model
        config = GenodexitoConfig(
            species=species,
            preferred_method=preferred_method,
            allow_fallback=allow_fallback,
            r_paths=r_paths,
            test_mode=test_mode,
        )

        # copy the validated values back onto the instance
        self.species = config.species
        self.preferred_method = config.preferred_method
        self.allow_fallback = config.allow_fallback
        self.r_paths = config.r_paths
        self.test_mode = config.test_mode

        # Initialize empty attributes; populated lazily by create_mapping_tables(),
        # merge_mappings() and stack_mappings()
        self.mappings: Optional[Dict[str, pd.DataFrame]] = None
        self.mapper_used: Optional[str] = None
        self.merged_mappings: Optional[pd.DataFrame] = None
        self.stacked_mappings: Optional[pd.DataFrame] = None

    def create_mapping_tables(
        self, mappings: Set[str], overwrite: bool = False
    ) -> None:
        """Create mapping tables between different ontologies.

        This is a drop-in replacement for create_bioconductor_mapping_tables that handles
        both Bioconductor and Python-based mapping methods.

        Parameters
        ----------
        mappings : Set[str]
            Set of ontologies to create mappings for
        overwrite : bool, optional
            Whether to overwrite existing mappings, by default False

        Returns
        -------
        None
            Updates self.mappings and self.mapper_used in place

        Raises
        ------
        ValueError
            If self.preferred_method is not a recognized method.
        Exception
            Re-raises the underlying mapper error when the preferred method fails
            and allow_fallback is False.
        """

        # check for existing mappings; no-op (with a warning) unless overwrite=True
        if self.mappings is not None and not overwrite:
            logger.warning(
                f"Mapping tables for {self.species} already exist. Use overwrite=True to create new mappings."
            )
            return None

        if self.preferred_method == GENODEXITO_DEFS.BIOCONDUCTOR:
            try:
                # Only import R functionality when needed
                from napistu.rpy2.rids import create_bioconductor_mapping_tables

                self.mappings = create_bioconductor_mapping_tables(
                    mappings=mappings, species=self.species, r_paths=self.r_paths
                )
                self.mapper_used = GENODEXITO_DEFS.BIOCONDUCTOR
            except Exception as e:
                if self.allow_fallback:
                    # NOTE(review): the original bioconductor exception `e` is not
                    # logged here; only a generic warning is emitted before falling
                    # back to the Python mapper.
                    logger.warning(
                        f"Error creating bioconductor mapping tables for {self.species} with {mappings}. Falling back to python."
                    )
                    self.mappings = create_python_mapping_tables(
                        mappings=mappings,
                        species=self.species,
                        test_mode=self.test_mode,
                    )
                    self.mapper_used = GENODEXITO_DEFS.PYTHON
                else:
                    logger.error(
                        f"Error creating bioconductor mapping tables for {self.species} with {mappings} and fallback is disabled."
                    )
                    raise e

        elif self.preferred_method == GENODEXITO_DEFS.PYTHON:
            try:
                self.mappings = create_python_mapping_tables(
                    mappings=mappings, species=self.species, test_mode=self.test_mode
                )
                self.mapper_used = GENODEXITO_DEFS.PYTHON
            except Exception as e:
                if self.allow_fallback:
                    logger.warning(
                        f"Error creating mygene Python mapping tables for {self.species} with {mappings}. Trying the bioconductor fallback."
                    )
                    # Only import R functionality when needed
                    from napistu.rpy2.rids import create_bioconductor_mapping_tables

                    self.mappings = create_bioconductor_mapping_tables(
                        mappings=mappings, species=self.species, r_paths=self.r_paths
                    )
                    self.mapper_used = GENODEXITO_DEFS.BIOCONDUCTOR
                else:
                    logger.error(
                        f"Error creating Python mapping tables for {self.species} with {mappings} and fallback is disabled."
                    )
                    raise e

        else:
            raise ValueError(f"Invalid preferred_method: {self.preferred_method}")

        return None

    def merge_mappings(self, ontologies: Optional[Set[str]] = None) -> None:
        """Merge mappings into a single wide table.

        Creates a wide-format table where each row is an Entrez gene ID and
        columns contain the corresponding identifiers in other ontologies.

        Parameters
        ----------
        ontologies : Optional[Set[str]], optional
            Set of ontologies to include in merged table, by default None
            If None, uses all available ontologies

        Returns
        -------
        None
            Updates self.merged_mappings in place

        Raises
        ------
        ValueError
            If mappings don't exist or requested ontologies are invalid
        TypeError
            If any identifiers are not strings
        ValueError
            If any mapping tables contain NA values
        """

        # mappings must exist and be valid
        self._check_mappings()
        ontologies = self._use_mappings(ontologies)

        # seed with the Entrez table so every row is anchored on an Entrez gene ID
        running_ids = self.mappings[ONTOLOGIES.NCBI_ENTREZ_GENE]

        for mapping in ontologies:
            logger.debug(f"adding entries for {mapping} to running_ids")
            mapping_df = self.mappings[mapping]

            # index-aligned (Entrez) left join; assumes each table is indexed by
            # Entrez gene ID (validated in _check_mappings).
            # NOTE(review): if `ontologies` itself contains NCBI_ENTREZ_GENE this
            # joins the seed table with itself — presumably that table carries no
            # overlapping data columns; confirm against the mapper outputs.
            running_ids = running_ids.join(mapping_df)

        # move the Entrez index into a regular column for the wide output
        running_ids = running_ids.reset_index()

        self.merged_mappings = running_ids

        return None

    def stack_mappings(self, ontologies: Optional[Set[str]] = None) -> None:
        """Stack mappings into a single long table.

        Convert a dict of mappings between Entrez identifiers and other identifiers
        into a single long-format table.

        Parameters
        ----------
        ontologies : Optional[Set[str]], optional
            Set of ontologies to include in stacked table, by default None
            If None, uses all available ontologies

        Returns
        -------
        None
            Updates self.stacked_mappings in place

        Raises
        ------
        ValueError
            If mappings don't exist or requested ontologies are invalid
        TypeError
            If any identifiers are not strings
        ValueError
            If any mapping tables contain NA values
        """

        # mappings must exist and be valid
        self._check_mappings()
        ontologies = self._use_mappings(ontologies)

        mappings_list = list()
        for ont in ontologies:
            # tag each table with its source ontology and normalize the value
            # column name to the shared IDENTIFIER column
            one_mapping_df = (
                self.mappings[ont]
                .assign(ontology=ont)
                .rename({ont: IDENTIFIERS.IDENTIFIER}, axis=1)
            )

            mappings_list.append(one_mapping_df)

        # concatenated long table; the Entrez index of each input table is retained
        self.stacked_mappings = pd.concat(mappings_list)

    def expand_sbml_dfs_ids(
        self, sbml_dfs: sbml_dfs_core.SBML_dfs, ontologies: Optional[Set[str]] = None
    ) -> sbml_dfs_core.SBML_dfs:
        """Update the expanded identifiers for a model.

        Parameters
        ----------
        sbml_dfs : sbml_dfs_core.SBML_dfs
            The SBML model to update with expanded identifiers
        ontologies : Optional[Set[str]], optional
            Set of ontologies to use for mapping. If None, uses all available ontologies
            from INTERCONVERTIBLE_GENIC_ONTOLOGIES.

        Returns
        -------
        sbml_dfs_core.SBML_dfs
            Updated SBML model with expanded identifiers (the same object,
            mutated in place via its `species` attribute, is also returned)

        Raises
        ------
        ValueError
            If any requested ontology is outside INTERCONVERTIBLE_GENIC_ONTOLOGIES
        """

        # current species table of the model
        ids = getattr(sbml_dfs, "species")

        # If no ontologies specified, use all available ones
        if ontologies is None:
            ontologies = INTERCONVERTIBLE_GENIC_ONTOLOGIES
        else:
            # Ensure ncbi_entrez_gene is included in the ontologies
            ontologies = set(ontologies)
            ontologies.add(ONTOLOGIES.NCBI_ENTREZ_GENE)

            invalid_ontologies = ontologies - INTERCONVERTIBLE_GENIC_ONTOLOGIES
            if invalid_ontologies:
                raise ValueError(
                    f"Invalid ontologies: {', '.join(invalid_ontologies)}.\n"
                    f"Valid options are: {', '.join(sorted(INTERCONVERTIBLE_GENIC_ONTOLOGIES))}"
                )

        # create mapping tables if they don't exist
        if self.mappings is None:
            self.create_mapping_tables(ontologies)

        # select and validate mappings
        ontologies = self._use_mappings(ontologies)

        # NOTE(review): if merged_mappings already exists from an earlier call it
        # is reused even when `ontologies` differs — confirm this staleness is
        # acceptable for callers.
        if self.merged_mappings is None:
            self.merge_mappings(ontologies)

        # merge existing and new identifiers
        expanded_ids = self._create_expanded_identifiers(sbml_dfs, ontologies)

        # make sure expanded_ids and original model.species have same number of s_ids
        # if a s_id only in model.species, adding it to expanded_ids.
        if ids.shape[0] != expanded_ids.shape[0]:
            # fall back to the model's original Identifiers for species missing
            # from the expanded set
            matched_expanded_ids = expanded_ids.combine_first(
                ids[SBML_DFS.S_IDENTIFIERS]
            )
            logger.debug(
                f"{ids.shape[0] - expanded_ids.shape[0]} "
                "ids are not included in expanded ids"
            )
        else:
            matched_expanded_ids = expanded_ids

        # swap in the expanded identifier column, preserving all other columns
        updated_ids = ids.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
            pd.DataFrame(matched_expanded_ids)
        )

        setattr(sbml_dfs, "species", updated_ids)

        return sbml_dfs

    def _check_mappings(self) -> None:
        """Check that mappings exist and contain required ontologies.

        Raises
        ------
        ValueError
            If mappings don't exist or don't contain NCBI_ENTREZ_GENE
        TypeError
            If any identifiers are not strings
        ValueError
            If any mapping tables contain NA values
        """
        if self.mappings is None:
            raise ValueError(
                f"Mapping tables for {self.species} do not exist. Use create_mapping_tables to create new mappings."
            )

        # entrez should always be present if any mappings exist
        if ONTOLOGIES.NCBI_ENTREZ_GENE not in self.mappings.keys():
            raise ValueError(
                f"Mapping tables for {self.species} do not contain {ONTOLOGIES.NCBI_ENTREZ_GENE}. Use create_mapping_tables to create new mappings."
            )

        # Check that all identifiers are strings
        # (dtype "object" is used as a proxy for str columns)
        for ontology, df in self.mappings.items():
            # Check index (which should be NCBI_ENTREZ_GENE)
            if not df.index.dtype == "object":
                raise TypeError(
                    f"Index of mapping table for {ontology} contains non-string values. "
                    f"Found type: {df.index.dtype}"
                )

            # Check all columns
            for col in df.columns:
                if not df[col].dtype == "object":
                    raise TypeError(
                        f"Column {col} in mapping table for {ontology} contains non-string values. "
                        f"Found type: {df[col].dtype}"
                    )

            # Check for NA values in index
            if df.index.isna().any():
                raise ValueError(
                    f"Mapping table for {ontology} contains NA values in index (NCBI_ENTREZ_GENE). "
                    f"Found {df.index.isna().sum()} NA values."
                )

            # Check for NA values in columns
            na_counts = df.isna().sum()
            if na_counts.any():
                na_cols = na_counts[na_counts > 0].index.tolist()
                raise ValueError(
                    f"Mapping table for {ontology} contains NA values in columns: {na_cols}. "
                    f"NA counts per column: {na_counts[na_cols].to_dict()}"
                )

    def _use_mappings(self, ontologies: Optional[Set[str]]) -> Set[str]:
        """Validate and process ontologies for mapping operations.

        Parameters
        ----------
        ontologies : Optional[Set[str]]
            Set of ontologies to validate. If None, uses all available mappings.

        Returns
        -------
        Set[str]
            Set of validated ontologies to use

        Raises
        ------
        ValueError
            If mappings don't exist or ontologies are invalid
        """

        if self.mappings is None:
            raise ValueError(
                f"Mapping tables for {self.species} do not exist. Use create_mapping_tables to create new mappings."
            )

        # default to everything that has already been fetched
        if ontologies is None:
            return set(self.mappings.keys())

        # validate provided mappings to see if they are genic ontologies within the controlled vocabulary
        never_valid_mappings = set(ontologies) - INTERCONVERTIBLE_GENIC_ONTOLOGIES
        if never_valid_mappings:
            raise ValueError(
                f"Invalid mappings: {', '.join(never_valid_mappings)}. "
                f"Valid mappings are {', '.join(INTERCONVERTIBLE_GENIC_ONTOLOGIES)}"
            )

        # validate provided mappings against existing mappings
        missing_mappings = set(ontologies) - set(self.mappings.keys())
        if missing_mappings:
            raise ValueError(
                f"Missing mappings: {', '.join(missing_mappings)}. "
                f"Recreate mappings by calling create_mapping_tables() while including "
                f"{', '.join(missing_mappings)} and other mappings of interest."
            )

        return ontologies

    def _create_expanded_identifiers(
        self,
        sbml_dfs: sbml_dfs_core.SBML_dfs,
        ontologies: Optional[Set[str]] = None,
    ) -> pd.Series:
        """Create expanded identifiers for SBML species.

        Update a table's identifiers to include additional related ontologies.
        Ontologies are pulled from the bioconductor "org" packages or MyGene.info.

        Parameters
        ----------
        sbml_dfs : sbml_dfs_core.SBML_dfs
            A relational pathway model built around reactions interconverting
            compartmentalized species
        ontologies : Optional[Set[str]], optional
            Ontologies to add or complete, by default None
            If None, uses all available ontologies

        Returns
        -------
        pd.Series
            Series with identifiers as the index and updated Identifiers objects as values

        Raises
        ------
        ValueError
            If merged mappings don't exist or all requested ontologies already exist
        TypeError
            If identifiers are not in expected format
        """

        ontologies = self._use_mappings(ontologies)
        if self.merged_mappings is None:
            raise ValueError(
                "Merged mappings do not exist. Use merge_mappings() to create new mappings."
            )

        # pull out all identifiers as a pd.DataFrame
        all_entity_identifiers = sbml_dfs.get_identifiers("species")
        if not isinstance(all_entity_identifiers, pd.DataFrame):
            raise TypeError("all_entity_identifiers must be a pandas DataFrame")

        # find entries in valid_expanded_ontologies which are already present
        # these are the entries that will be used to expand to other ontologies
        # or fill in ontologies with incomplete annotations
        starting_ontologies = ontologies.intersection(
            set(all_entity_identifiers["ontology"])
        )

        if len(starting_ontologies) == 0:
            raise ValueError(
                f"None of the ontologies currently in the sbml_dfs match `ontologies`. The currently included ontologies are {set(all_entity_identifiers['ontology'])}. If there are major genic ontologies in this list then you may need to use ontologies.clean_ontologies() to convert from aliases to ontologies in the ONTOLOGIES controlled vocabulary."
            )

        expanded_ontologies = ontologies - starting_ontologies
        if len(expanded_ontologies) == 0:
            raise ValueError(
                "All of the requested ontologies already exist in species' s_Identifiers"
            )

        # map from existing ontologies to expanded ontologies
        # one lookup table per (start, end) ontology pair
        ontology_mappings = list()
        # starting w/
        for start in starting_ontologies:
            # ending w/
            for end in expanded_ontologies:
                if start == end:
                    continue
                lookup = (
                    self.merged_mappings[[start, end]]
                    .rename(
                        columns={start: IDENTIFIERS.IDENTIFIER, end: "new_identifier"}
                    )
                    .assign(ontology=start)
                    .assign(new_ontology=end)
                )

                ontology_mappings.append(lookup)

        # dropna removes pairs where either side of the mapping is missing
        ontology_mappings_df = pd.concat(ontology_mappings).dropna()

        # old identifiers joined with new identifiers

        # first, define the names of keys and ids
        table_pk_var = SBML_DFS_SCHEMA.SCHEMA[SBML_DFS.SPECIES]["pk"]

        # retain bqb terms to define how an identifier is related to sid
        # this relation will be preserved for the new ids

        # inner merge on the shared ontology/identifier columns
        merged_identifiers = all_entity_identifiers[
            [
                table_pk_var,
                IDENTIFIERS.ONTOLOGY,
                IDENTIFIERS.IDENTIFIER,
                IDENTIFIERS.BQB,
            ]
        ].merge(ontology_mappings_df)

        # new, possibly redundant identifiers
        new_identifiers = merged_identifiers[
            [table_pk_var, "new_ontology", "new_identifier", IDENTIFIERS.BQB]
        ].rename(
            columns={
                "new_ontology": IDENTIFIERS.ONTOLOGY,
                "new_identifier": IDENTIFIERS.IDENTIFIER,
            }
        )

        # NOTE(review): new_identifiers carries no URL column, so after this
        # concat the URL field is NA for every newly added identifier.
        expanded_identifiers_df = pd.concat(
            [
                all_entity_identifiers[
                    [
                        table_pk_var,
                        IDENTIFIERS.ONTOLOGY,
                        IDENTIFIERS.IDENTIFIER,
                        IDENTIFIERS.URL,
                        IDENTIFIERS.BQB,
                    ]
                ],
                new_identifiers,
                # ignore new identifier if it already exists
            ]
        )

        # collapse the long table back into one Identifiers object per species
        output = identifiers.df_to_identifiers(
            expanded_identifiers_df, SBML_DFS.SPECIES
        )

        return output
|
604
|
+
|
605
|
+
|
606
|
+
class GenodexitoConfig(BaseModel):
    """Validated configuration bundle consumed by Genodexito.__init__.

    Attributes:
        species: Species name to use for mapping
        preferred_method: Which mapping method to try first
        allow_fallback: Whether to allow fallback to other method
        r_paths: Optional paths to R libraries
        test_mode: Whether to limit queries for testing
    """

    species: str = Field(default="Homo sapiens", description="Species name to use")
    preferred_method: str = Field(
        default=GENODEXITO_DEFS.BIOCONDUCTOR,
        description="Which mapping method to try first",
    )
    allow_fallback: bool = Field(
        default=True, description="Whether to allow fallback to other method"
    )
    r_paths: Optional[List[str]] = Field(
        default=None, description="Optional paths to R libraries"
    )
    test_mode: bool = Field(
        default=False, description="Whether to limit queries for testing"
    )

    @field_validator("preferred_method")
    @classmethod
    def validate_preferred_method(cls, value: str) -> str:
        """Reject any preferred_method outside the two supported mappers."""
        allowed = {GENODEXITO_DEFS.BIOCONDUCTOR, GENODEXITO_DEFS.PYTHON}
        if value in allowed:
            return value
        raise ValueError(
            f"Invalid preferred_method: {value}. "
            f"Must be one of: {GENODEXITO_DEFS.BIOCONDUCTOR}, {GENODEXITO_DEFS.PYTHON}"
        )

    @field_validator("r_paths")
    @classmethod
    def validate_r_paths(cls, value: Optional[List[str]]) -> Optional[List[str]]:
        """Require every entry of r_paths (when provided) to be a string."""
        if value is None:
            return value
        if any(not isinstance(entry, str) for entry in value):
            raise ValueError("All elements in r_paths must be strings")
        return value