napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -145
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +47 -65
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +156 -19
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev7.dist-info/RECORD +0 -98
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
@@ -0,0 +1,510 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
from typing import Optional, Union, Set, Dict, List
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
import pandas as pd
|
8
|
+
|
9
|
+
from napistu import identifiers
|
10
|
+
from napistu.constants import ONTOLOGIES_LIST, SBML_DFS, IDENTIFIERS
|
11
|
+
from napistu.matching.constants import FEATURE_ID_VAR_DEFAULT
|
12
|
+
|
13
|
+
logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
def features_to_pathway_species(
    feature_identifiers: pd.DataFrame,
    species_identifiers: pd.DataFrame,
    ontologies: set,
    feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
    expand_identifiers: bool = False,
    identifier_delimiter: str = "/",
    verbose: bool = False,
) -> Optional[pd.DataFrame]:
    """
    Features to Pathway Species

    Match a table of molecular species to their corresponding species in a pathway representation.

    Parameters:
    feature_identifiers: pd.DataFrame
        pd.Dataframe containing a "feature_identifiers_var" variable used to match entries
    species_identifiers: pd.DataFrame
        A table of molecular species identifiers produced from sbml_dfs.get_identifiers("species")
        generally using sbml_dfs_core.export_sbml_dfs()
    ontologies: set
        A set of ontologies used to match features to pathway species
    feature_identifiers_var: str
        Variable in "feature_identifiers" containing identifiers
    feature_id_var: str
        Variable uniquely identifying each feature; created if it does not exist
    expand_identifiers: bool, default=False
        If True, split identifiers in feature_identifiers_var by identifier_delimiter and explode into multiple rows
    identifier_delimiter: str, default="/"
        Delimiter to use for splitting identifiers if expand_identifiers is True
    verbose: bool, default=False
        If True, log mapping statistics at the end of the function

    Returns:
    pathway_species: pd.DataFrame or None
        species_identifiers joined to feature_identifiers based on shared identifiers,
        or None if no feature identifier matched any pathway species.

    Raises:
    ValueError
        If feature_identifiers_var is missing from feature_identifiers, or if some/all
        of the requested ontologies were not used to annotate pathway species.
    """

    # Check for identifier column
    if feature_identifiers_var not in feature_identifiers.columns.to_list():
        raise ValueError(
            f"{feature_identifiers_var} must be a variable in 'feature_identifiers', "
            f"possible variables are {', '.join(feature_identifiers.columns.tolist())}"
        )

    # Respect or create feature_id column
    feature_identifiers = _ensure_feature_id_var(feature_identifiers, feature_id_var)

    # Optionally expand identifiers into multiple rows
    if expand_identifiers:
        # Count the number of expansions by counting delimiters
        n_expansions = (
            feature_identifiers[feature_identifiers_var]
            .astype(str)
            .str.count(identifier_delimiter)
            .sum()
        )
        if n_expansions > 0:
            logger.info(
                f"Expanding identifiers: {n_expansions} delimiters found in '{feature_identifiers_var}', will expand to more rows."
            )

        # Split, strip whitespace, and explode
        feature_identifiers = feature_identifiers.copy()
        feature_identifiers[feature_identifiers_var] = (
            feature_identifiers[feature_identifiers_var]
            .astype(str)
            .str.split(identifier_delimiter)
            .apply(lambda lst: [x.strip() for x in lst])
        )
        feature_identifiers = feature_identifiers.explode(
            feature_identifiers_var, ignore_index=True
        )

    # check identifiers table
    identifiers._check_species_identifiers_table(species_identifiers)

    available_ontologies = set(species_identifiers[IDENTIFIERS.ONTOLOGY].tolist())
    unavailable_ontologies = ontologies.difference(available_ontologies)

    # no ontologies present
    if len(unavailable_ontologies) == len(ontologies):
        raise ValueError(
            f"None of the requested ontologies ({', '.join(ontologies)}) "
            "were used to annotate pathway species. Available ontologies are: "
            f"{', '.join(available_ontologies)}"
        )

    # 1+ desired ontologies are not present
    if len(unavailable_ontologies) > 0:
        raise ValueError(
            f"Some of the requested ontologies ({', '.join(unavailable_ontologies)}) "
            "were NOT used to annotate pathway species. Available ontologies are: "
            f"{', '.join(available_ontologies)}"
        )

    relevant_identifiers = species_identifiers[
        species_identifiers[IDENTIFIERS.ONTOLOGY].isin(ontologies)
    ]

    # map features to pathway species
    pathway_species = feature_identifiers.merge(
        relevant_identifiers,
        left_on=feature_identifiers_var,
        right_on=IDENTIFIERS.IDENTIFIER,
    )

    if pathway_species.shape[0] == 0:
        logger.warning(
            "None of the provided species identifiers matched entries of the pathway; returning None"
        )
        # BUG FIX: the original had a bare `None` expression here (a no-op), so the
        # function fell through and returned an empty DataFrame despite the warning
        # promising None. Return None explicitly as documented.
        return None

    # report the fraction of unmapped species
    if verbose:
        _log_feature_species_mapping_stats(pathway_species, feature_id_var)

    return pathway_species
|
+
|
135
|
+
def match_features_to_wide_pathway_species(
    wide_df: pd.DataFrame,
    species_identifiers: pd.DataFrame,
    ontologies: Optional[Union[Set[str], Dict[str, str]]] = None,
    feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Convert a wide-format DataFrame with multiple ontology columns to long format,
    and match features to pathway species by ontology and identifier.

    Parameters
    ----------
    wide_df : pd.DataFrame
        DataFrame with ontology identifier columns and any number of results columns.
        All non-ontology columns are treated as results.
    species_identifiers : pd.DataFrame
        DataFrame as required by features_to_pathway_species
    ontologies : Optional[Union[Set[str], Dict[str, str]]], default=None
        Either:
        - Set of columns to treat as ontologies (these should be entries in ONTOLOGIES_LIST)
        - Dict mapping wide column names to ontology names in the ONTOLOGIES_LIST controlled vocabulary
        - None to automatically detect valid ontology columns based on ONTOLOGIES_LIST
    feature_identifiers_var : str, default="identifier"
        Name for the identifier column in the long format
    feature_id_var: str, default=FEATURE_ID_VAR_DEFAULT
        Name for the feature id column in the long format
    verbose : bool, default=False
        Whether to print verbose output

    Returns
    -------
    pd.DataFrame
        Output of match_by_ontology_and_identifier

    Examples
    --------
    >>> # Example with auto-detected ontology columns and multiple results
    >>> wide_df = pd.DataFrame({
    ...     'uniprot': ['P12345', 'Q67890'],
    ...     'chebi': ['15377', '16810'],
    ...     'log2fc': [1.0, 2.0],
    ...     'pvalue': [0.01, 0.05]
    ... })
    >>> result = match_features_to_wide_pathway_species(
    ...     wide_df=wide_df,
    ...     species_identifiers=species_identifiers
    ... )

    >>> # Example with custom ontology mapping
    >>> wide_df = pd.DataFrame({
    ...     'protein_id': ['P12345', 'Q67890'],
    ...     'compound_id': ['15377', '16810'],
    ...     'expression': [1.0, 2.0],
    ...     'confidence': [0.8, 0.9]
    ... })
    >>> result = match_features_to_wide_pathway_species(
    ...     wide_df=wide_df,
    ...     species_identifiers=species_identifiers,
    ...     ontologies={'protein_id': 'uniprot', 'compound_id': 'chebi'}
    ... )
    """
    # Work on a copy so the caller's frame is never mutated
    wide_df = wide_df.copy()

    # Validate the ontology specification; the result is the set of
    # ontology column names that will be melted into long format
    validated_ontologies = _validate_wide_ontologies(wide_df, ontologies)
    identifier_columns = list(validated_ontologies)

    # A dict specification maps source column names onto controlled-vocabulary
    # ontology names; apply that renaming before melting
    if isinstance(ontologies, dict):
        wide_df = wide_df.rename(columns=ontologies)

    # Guarantee a per-feature id column exists
    wide_df = _ensure_feature_id_var(wide_df, feature_id_var)

    # Everything that is not an ontology column carries results
    results_columns = list(set(wide_df.columns) - set(identifier_columns))
    if not results_columns:
        raise ValueError("No results columns found in DataFrame")

    logger.info(f"Using columns as results: {results_columns}")

    # Reshape: one row per (feature, ontology, identifier), dropping
    # rows where the identifier was missing
    melted = wide_df.melt(
        id_vars=results_columns,
        value_vars=identifier_columns,
        var_name=IDENTIFIERS.ONTOLOGY,
        value_name=feature_identifiers_var,
    ).dropna(subset=[feature_identifiers_var])

    logger.debug(f"Final long format shape: {melted.shape}")

    # Delegate the actual matching to the per-ontology matcher
    matched = match_by_ontology_and_identifier(
        feature_identifiers=melted,
        species_identifiers=species_identifiers,
        ontologies=validated_ontologies,
        feature_identifiers_var=feature_identifiers_var,
    )

    if verbose:
        _log_feature_species_mapping_stats(matched, feature_id_var)

    return matched
|
+
|
243
|
+
def match_by_ontology_and_identifier(
    feature_identifiers: pd.DataFrame,
    species_identifiers: pd.DataFrame,
    ontologies: Union[str, Set[str], List[str]],
    feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Match features to pathway species based on both ontology and identifier matches.
    Performs separate matching for each ontology and concatenates the results.

    Parameters
    ----------
    feature_identifiers : pd.DataFrame
        DataFrame containing feature identifiers and results.
        Must have columns [ontology, feature_identifiers_var, results]
    species_identifiers : pd.DataFrame
        DataFrame containing species identifiers from pathway.
        Must have columns [ontology, identifier]
    ontologies : Union[str, Set[str], List[str]]
        Ontologies to match on. Can be:
        - A single ontology string
        - A set of ontology strings
        - A list of ontology strings
    feature_identifiers_var : str, default="identifier"
        Name of the identifier column in feature_identifiers
    verbose : bool, default=False
        Whether to print verbose output

    Returns
    -------
    pd.DataFrame
        Concatenated results of matching for each ontology.
        Contains all columns from features_to_pathway_species()

    Examples
    --------
    >>> # Match using a single ontology
    >>> result = match_by_ontology_and_identifier(
    ...     feature_identifiers=features_df,
    ...     species_identifiers=species_df,
    ...     ontologies="uniprot"
    ... )

    >>> # Match using multiple ontologies
    >>> result = match_by_ontology_and_identifier(
    ...     feature_identifiers=features_df,
    ...     species_identifiers=species_df,
    ...     ontologies={"uniprot", "chebi"}
    ... )
    """
    # Normalize the ontology argument to a set
    if isinstance(ontologies, str):
        ontologies = {ontologies}
    elif isinstance(ontologies, list):
        ontologies = set(ontologies)

    # Reject anything outside the controlled vocabulary up front
    invalid_onts = ontologies - set(ONTOLOGIES_LIST)
    if invalid_onts:
        raise ValueError(
            f"Invalid ontologies specified: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
        )

    matched_dfs = []

    # Match each ontology independently and pool the results
    for ont in ontologies:
        # Restrict features to this ontology; the ontology column is dropped
        # because features_to_pathway_species does not expect it
        ont_mask = feature_identifiers[IDENTIFIERS.ONTOLOGY] == ont
        ont_features = (
            feature_identifiers.loc[ont_mask].drop(columns=[IDENTIFIERS.ONTOLOGY]).copy()
        )

        if ont_features.empty:
            logger.warning(f"No features found for ontology: {ont}")
            continue

        # Restrict pathway species to this ontology
        ont_species = species_identifiers.loc[
            species_identifiers[IDENTIFIERS.ONTOLOGY] == ont
        ].copy()

        if ont_species.empty:
            logger.warning(f"No species found for ontology: {ont}")
            continue

        logger.debug(
            f"Matching {len(ont_features)} features to {len(ont_species)} species for ontology {ont}"
        )

        matched = features_to_pathway_species(
            feature_identifiers=ont_features,
            species_identifiers=ont_species,
            ontologies={ont},
            feature_identifiers_var=feature_identifiers_var,
            verbose=verbose,
        )

        if matched.empty:
            logger.warning(f"No matches found for ontology: {ont}")
            continue

        matched_dfs.append(matched)

    if not matched_dfs:
        logger.warning("No matches found for any ontology")
        return pd.DataFrame()  # Return empty DataFrame with correct columns

    # Pool the per-ontology matches into a single frame
    result = pd.concat(matched_dfs, axis=0, ignore_index=True)

    logger.info(
        f"Found {len(result)} total matches across {len(matched_dfs)} ontologies"
    )

    return result
|
+
|
365
|
+
def _validate_wide_ontologies(
    wide_df: pd.DataFrame,
    ontologies: Optional[Union[str, Set[str], Dict[str, str]]] = None,
) -> Set[str]:
    """
    Validate ontology specifications against the wide DataFrame and ONTOLOGIES_LIST.

    Parameters
    ----------
    wide_df : pd.DataFrame
        DataFrame with one column per ontology and a results column
    ontologies : Optional[Union[str, Set[str], Dict[str, str]]]
        Either:
        - String specifying a single ontology column
        - Set of columns to treat as ontologies
        - Dict mapping wide column names to ontology names
        - None to automatically detect ontology columns based on ONTOLOGIES_LIST

    Returns
    -------
    Set[str]
        Set of validated ontology names. For dictionary mappings, returns the target ontology names.

    Raises
    ------
    ValueError
        If validation fails for any ontology specification or no valid ontologies are found
    """
    # A single string behaves like a one-element set
    if isinstance(ontologies, str):
        ontologies = {ontologies}

    valid_ontologies = set(ONTOLOGIES_LIST)

    if isinstance(ontologies, dict):
        # Mapping case: keys are DataFrame columns, values are ontology names.
        # Both sides must check out.
        missing_cols = set(ontologies.keys()) - set(wide_df.columns)
        if missing_cols:
            raise ValueError(f"Source columns not found in DataFrame: {missing_cols}")
        invalid_onts = set(ontologies.values()) - valid_ontologies
        if invalid_onts:
            raise ValueError(
                f"Invalid ontologies in mapping: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
            )
        # The caller renames columns afterwards, so report the target names
        ontology_cols = set(ontologies.values())
    elif isinstance(ontologies, set):
        # Explicit set case: each name must be a real column AND a valid ontology
        missing_cols = ontologies - set(wide_df.columns)
        if missing_cols:
            raise ValueError(
                f"Specified ontology columns not found in DataFrame: {missing_cols}"
            )
        invalid_onts = ontologies - valid_ontologies
        if invalid_onts:
            raise ValueError(
                f"Invalid ontologies in set: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
            )
        ontology_cols = ontologies
    else:
        # Auto-detection: any column whose name is a known ontology
        ontology_cols = set(wide_df.columns) & valid_ontologies
        if not ontology_cols:
            raise ValueError(
                f"No valid ontology columns found in DataFrame. Column names must match one of: {ONTOLOGIES_LIST}"
            )
        logger.info(f"Auto-detected ontology columns: {ontology_cols}")

    logger.debug(f"Validated ontology columns: {ontology_cols}")
    return ontology_cols
|
+
|
440
|
+
def _log_feature_species_mapping_stats(
    pathway_species: pd.DataFrame, feature_id_var: str = FEATURE_ID_VAR_DEFAULT
):
    """
    Log statistics about the mapping between feature_id and s_id in the pathway_species DataFrame.

    Parameters
    ----------
    pathway_species : pd.DataFrame
        Matched features-to-species table; expected to contain feature_id_var,
        SBML_DFS.S_ID and SBML_DFS.S_NAME columns.
    feature_id_var : str, default=FEATURE_ID_VAR_DEFAULT
        Name of the feature id column.
    """

    # BUG FIX: the original checked `feature_id_var in pathway_species.columns`
    # only AFTER already indexing pathway_species[feature_id_var], so the guard
    # could never fire (a missing column raised KeyError first). Check up front.
    if feature_id_var not in pathway_species.columns:
        logger.warning(
            f"Cannot log mapping statistics: column '{feature_id_var}' not found"
        )
        return

    # Percent change in feature_ids
    n_feature_ids = pathway_species[feature_id_var].nunique()
    # assumes feature ids are 0-based and contiguous (as created by
    # _ensure_feature_id_var), so max+1 approximates the input count —
    # TODO confirm for callers supplying their own feature_id column
    n_input_feature_ids = pathway_species[feature_id_var].max() + 1
    percent_change = (
        100 * (n_feature_ids - n_input_feature_ids) / n_input_feature_ids
        if n_input_feature_ids
        else 0
    )
    logger.info(
        f"{percent_change:+.1f}% change in feature_ids ({n_feature_ids} vs {n_input_feature_ids})"
    )

    # Number of times an s_id maps to 1+ feature_ids (with s_name)
    s_id_counts = pathway_species.groupby(SBML_DFS.S_ID)[feature_id_var].nunique()
    s_id_multi = s_id_counts[s_id_counts > 1]
    logger.info(f"{len(s_id_multi)} s_id(s) map to more than one feature_id.")
    if not s_id_multi.empty:
        examples = pathway_species[
            pathway_species[SBML_DFS.S_ID].isin(s_id_multi.index)
        ][[SBML_DFS.S_ID, SBML_DFS.S_NAME, feature_id_var]]
        logger.info(
            f"Examples of s_id mapping to multiple feature_ids (showing up to 3):\n{examples.groupby([SBML_DFS.S_ID, SBML_DFS.S_NAME])[feature_id_var].apply(list).head(3)}"
        )

    # Number of times a feature_id maps to 1+ s_ids (with s_name)
    feature_id_counts = pathway_species.groupby(feature_id_var)[SBML_DFS.S_ID].nunique()
    feature_id_multi = feature_id_counts[feature_id_counts > 1]
    logger.info(f"{len(feature_id_multi)} feature_id(s) map to more than one s_id.")
    if not feature_id_multi.empty:
        examples = pathway_species[
            pathway_species[feature_id_var].isin(feature_id_multi.index)
        ][[feature_id_var, SBML_DFS.S_ID, SBML_DFS.S_NAME]]
        logger.info(
            f"Examples of feature_id mapping to multiple s_ids (showing up to 3):\n{examples.groupby([feature_id_var])[[SBML_DFS.S_ID, SBML_DFS.S_NAME]].apply(lambda df: list(df.itertuples(index=False, name=None))).head(3)}"
        )
|
+
|
488
|
+
def _ensure_feature_id_var(
    df: pd.DataFrame, feature_id_var: str = FEATURE_ID_VAR_DEFAULT
) -> pd.DataFrame:
    """
    Ensure the DataFrame has a feature_id column, creating one if it doesn't exist.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to check/modify
    feature_id_var : str, default=FEATURE_ID_VAR_DEFAULT
        Name of the feature ID column

    Returns
    -------
    pd.DataFrame
        DataFrame with guaranteed feature_id column
    """
    # Column already present: hand back the caller's frame untouched
    if feature_id_var in df.columns:
        return df

    logger.warning(f"No {feature_id_var} column found in DataFrame, creating one")
    # Copy before mutating so the input frame is not modified in place,
    # then assign sequential 0-based ids
    augmented = df.copy()
    augmented[feature_id_var] = np.arange(len(augmented))
    return augmented
|
napistu/mcp/__init__.py
CHANGED
@@ -18,15 +18,18 @@ except ImportError:
|
|
18
18
|
is_available = False
|
19
19
|
|
20
20
|
if is_available:
|
21
|
-
from .server import create_server
|
22
|
-
from .profiles import get_profile
|
21
|
+
from napistu.mcp.server import create_server
|
22
|
+
from napistu.mcp.profiles import get_profile
|
23
|
+
from napistu.mcp.constants import MCP_PROFILES
|
23
24
|
|
24
|
-
def start_server(
|
25
|
+
def start_server(
|
26
|
+
profile_name: str = MCP_PROFILES.EXECUTION, **kwargs
|
27
|
+
) -> Dict[str, Any]:
|
25
28
|
"""
|
26
29
|
Start an MCP server with a specific profile.
|
27
30
|
|
28
31
|
Args:
|
29
|
-
profile_name: Name of the profile ('
|
32
|
+
profile_name: Name of the profile ('execution', 'docs', or 'full')
|
30
33
|
**kwargs: Additional configuration options
|
31
34
|
|
32
35
|
Returns:
|