chatspatial 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatspatial/__init__.py +11 -0
- chatspatial/__main__.py +141 -0
- chatspatial/cli/__init__.py +7 -0
- chatspatial/config.py +53 -0
- chatspatial/models/__init__.py +85 -0
- chatspatial/models/analysis.py +513 -0
- chatspatial/models/data.py +2462 -0
- chatspatial/server.py +1763 -0
- chatspatial/spatial_mcp_adapter.py +720 -0
- chatspatial/tools/__init__.py +3 -0
- chatspatial/tools/annotation.py +1903 -0
- chatspatial/tools/cell_communication.py +1603 -0
- chatspatial/tools/cnv_analysis.py +605 -0
- chatspatial/tools/condition_comparison.py +595 -0
- chatspatial/tools/deconvolution/__init__.py +402 -0
- chatspatial/tools/deconvolution/base.py +318 -0
- chatspatial/tools/deconvolution/card.py +244 -0
- chatspatial/tools/deconvolution/cell2location.py +326 -0
- chatspatial/tools/deconvolution/destvi.py +144 -0
- chatspatial/tools/deconvolution/flashdeconv.py +101 -0
- chatspatial/tools/deconvolution/rctd.py +317 -0
- chatspatial/tools/deconvolution/spotlight.py +216 -0
- chatspatial/tools/deconvolution/stereoscope.py +109 -0
- chatspatial/tools/deconvolution/tangram.py +135 -0
- chatspatial/tools/differential.py +625 -0
- chatspatial/tools/embeddings.py +298 -0
- chatspatial/tools/enrichment.py +1863 -0
- chatspatial/tools/integration.py +807 -0
- chatspatial/tools/preprocessing.py +723 -0
- chatspatial/tools/spatial_domains.py +808 -0
- chatspatial/tools/spatial_genes.py +836 -0
- chatspatial/tools/spatial_registration.py +441 -0
- chatspatial/tools/spatial_statistics.py +1476 -0
- chatspatial/tools/trajectory.py +495 -0
- chatspatial/tools/velocity.py +405 -0
- chatspatial/tools/visualization/__init__.py +155 -0
- chatspatial/tools/visualization/basic.py +393 -0
- chatspatial/tools/visualization/cell_comm.py +699 -0
- chatspatial/tools/visualization/cnv.py +320 -0
- chatspatial/tools/visualization/core.py +684 -0
- chatspatial/tools/visualization/deconvolution.py +852 -0
- chatspatial/tools/visualization/enrichment.py +660 -0
- chatspatial/tools/visualization/integration.py +205 -0
- chatspatial/tools/visualization/main.py +164 -0
- chatspatial/tools/visualization/multi_gene.py +739 -0
- chatspatial/tools/visualization/persistence.py +335 -0
- chatspatial/tools/visualization/spatial_stats.py +469 -0
- chatspatial/tools/visualization/trajectory.py +639 -0
- chatspatial/tools/visualization/velocity.py +411 -0
- chatspatial/utils/__init__.py +115 -0
- chatspatial/utils/adata_utils.py +1372 -0
- chatspatial/utils/compute.py +327 -0
- chatspatial/utils/data_loader.py +499 -0
- chatspatial/utils/dependency_manager.py +462 -0
- chatspatial/utils/device_utils.py +165 -0
- chatspatial/utils/exceptions.py +185 -0
- chatspatial/utils/image_utils.py +267 -0
- chatspatial/utils/mcp_utils.py +137 -0
- chatspatial/utils/path_utils.py +243 -0
- chatspatial/utils/persistence.py +78 -0
- chatspatial/utils/scipy_compat.py +143 -0
- chatspatial-1.1.0.dist-info/METADATA +242 -0
- chatspatial-1.1.0.dist-info/RECORD +67 -0
- chatspatial-1.1.0.dist-info/WHEEL +5 -0
- chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
- chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
- chatspatial-1.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1863 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Enrichment analysis tools for spatial transcriptomics data.
|
|
3
|
+
|
|
4
|
+
This module provides both standard and spatially-aware enrichment analysis methods:
|
|
5
|
+
- Standard methods: GSEA, ORA, ssGSEA, Enrichr (via gseapy)
|
|
6
|
+
- Spatial methods: EnrichMap-based spatial enrichment analysis
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from typing import TYPE_CHECKING, Optional, Union
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
from scipy import stats
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from ..models.data import EnrichmentParameters
|
|
18
|
+
from ..spatial_mcp_adapter import ToolContext
|
|
19
|
+
from statsmodels.stats.multitest import multipletests
|
|
20
|
+
|
|
21
|
+
from ..models.analysis import EnrichmentResult
|
|
22
|
+
from ..utils.adata_utils import store_analysis_metadata, to_dense
|
|
23
|
+
from ..utils.dependency_manager import require
|
|
24
|
+
from ..utils.exceptions import ParameterError, ProcessingError
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ============================================================================
|
|
30
|
+
# MCP RESPONSE OPTIMIZATION
|
|
31
|
+
# ============================================================================
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _filter_significant_statistics(
|
|
35
|
+
gene_set_statistics: dict,
|
|
36
|
+
enrichment_scores: dict,
|
|
37
|
+
pvalues: dict,
|
|
38
|
+
adjusted_pvalues: dict,
|
|
39
|
+
method: str,
|
|
40
|
+
fdr_threshold: Optional[float] = None,
|
|
41
|
+
) -> tuple:
|
|
42
|
+
"""
|
|
43
|
+
Filter all enrichment result dictionaries to only include significant pathways.
|
|
44
|
+
|
|
45
|
+
This dramatically reduces MCP response size for large gene set databases
|
|
46
|
+
(e.g., KEGG 311 pathways, GO 10,000 terms) while preserving all important
|
|
47
|
+
information for users.
|
|
48
|
+
|
|
49
|
+
Args:
|
|
50
|
+
gene_set_statistics: Full statistics for all gene sets
|
|
51
|
+
enrichment_scores: Enrichment scores for all gene sets
|
|
52
|
+
pvalues: P-values for all gene sets
|
|
53
|
+
adjusted_pvalues: FDR-corrected p-values for all gene sets
|
|
54
|
+
method: Enrichment method used ('gsea', 'ora', 'enrichr', 'ssgsea')
|
|
55
|
+
fdr_threshold: FDR threshold for significance (default: None for method-based auto)
|
|
56
|
+
Method-based defaults (based on statistical best practices):
|
|
57
|
+
- GSEA: FDR < 0.25 (official recommendation from Subramanian et al. 2005)
|
|
58
|
+
- ORA/Enrichr: FDR < 0.05 (standard statistical threshold)
|
|
59
|
+
- ssGSEA: No filtering (no p-values produced)
|
|
60
|
+
|
|
61
|
+
Returns:
|
|
62
|
+
Tuple of (filtered_statistics, filtered_scores, filtered_pvals, filtered_adj_pvals)
|
|
63
|
+
|
|
64
|
+
Example:
|
|
65
|
+
Before: 311 pathways × 4 dicts × 100 chars = 124KB (KEGG)
|
|
66
|
+
After: ~15 significant pathways × 4 dicts × 100 chars = 6KB (95% reduction)
|
|
67
|
+
|
|
68
|
+
References:
|
|
69
|
+
- GSEA: Subramanian et al. (2005) PNAS 102(43):15545-15550
|
|
70
|
+
"We recommend an FDR cutoff of 25% when dealing with a single database"
|
|
71
|
+
- ORA: Standard multiple testing correction threshold (Benjamini & Hochberg 1995)
|
|
72
|
+
"""
|
|
73
|
+
if not adjusted_pvalues:
|
|
74
|
+
# No p-values available (e.g., ssGSEA), return all results without filtering
|
|
75
|
+
return gene_set_statistics, enrichment_scores, pvalues, adjusted_pvalues
|
|
76
|
+
|
|
77
|
+
# Auto-determine threshold based on ANALYSIS METHOD if not specified
|
|
78
|
+
# This is statistically principled: different methods have different FDR standards
|
|
79
|
+
if fdr_threshold is None:
|
|
80
|
+
method_lower = method.lower()
|
|
81
|
+
if method_lower == "gsea":
|
|
82
|
+
# GSEA official recommendation: FDR < 0.25
|
|
83
|
+
# From Subramanian et al. 2005: "An FDR of 25% indicates that the result
|
|
84
|
+
# is likely to be valid 3 out of 4 times"
|
|
85
|
+
fdr_threshold = 0.25
|
|
86
|
+
elif method_lower in ("ora", "enrichr", "pathway_ora", "pathway_enrichr"):
|
|
87
|
+
# ORA and Enrichr: standard statistical threshold
|
|
88
|
+
# Based on Benjamini-Hochberg FDR control at 5%
|
|
89
|
+
fdr_threshold = 0.05
|
|
90
|
+
else:
|
|
91
|
+
# Default fallback for unknown methods
|
|
92
|
+
fdr_threshold = 0.05
|
|
93
|
+
|
|
94
|
+
# Find significant pathways
|
|
95
|
+
significant = {
|
|
96
|
+
name
|
|
97
|
+
for name, fdr in adjusted_pvalues.items()
|
|
98
|
+
if fdr is not None and fdr < fdr_threshold
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
# Filter all dictionaries
|
|
102
|
+
filtered_stats = {
|
|
103
|
+
name: stats
|
|
104
|
+
for name, stats in gene_set_statistics.items()
|
|
105
|
+
if name in significant
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
filtered_scores = {
|
|
109
|
+
name: score for name, score in enrichment_scores.items() if name in significant
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
filtered_pvals = {
|
|
113
|
+
name: pval for name, pval in pvalues.items() if name in significant
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
filtered_adj_pvals = {
|
|
117
|
+
name: adj_pval
|
|
118
|
+
for name, adj_pval in adjusted_pvalues.items()
|
|
119
|
+
if name in significant
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
return filtered_stats, filtered_scores, filtered_pvals, filtered_adj_pvals
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# ============================================================================
|
|
126
|
+
# GENE SET UTILITIES
|
|
127
|
+
# ============================================================================
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _filter_gene_sets_by_size(
|
|
131
|
+
gene_sets: dict[str, list[str]], min_size: int, max_size: int
|
|
132
|
+
) -> dict[str, list[str]]:
|
|
133
|
+
"""
|
|
134
|
+
Filter gene sets by size constraints.
|
|
135
|
+
|
|
136
|
+
Parameters
|
|
137
|
+
----------
|
|
138
|
+
gene_sets : Dict[str, List[str]]
|
|
139
|
+
Dictionary mapping gene set names to gene lists
|
|
140
|
+
min_size : int
|
|
141
|
+
Minimum number of genes required
|
|
142
|
+
max_size : int
|
|
143
|
+
Maximum number of genes allowed
|
|
144
|
+
|
|
145
|
+
Returns
|
|
146
|
+
-------
|
|
147
|
+
Dict[str, List[str]]
|
|
148
|
+
Filtered gene sets within size constraints
|
|
149
|
+
"""
|
|
150
|
+
return {
|
|
151
|
+
name: genes
|
|
152
|
+
for name, genes in gene_sets.items()
|
|
153
|
+
if min_size <= len(genes) <= max_size
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# ============================================================================
|
|
158
|
+
# SPARSE MATRIX UTILITIES
|
|
159
|
+
# ============================================================================
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _compute_std_sparse_compatible(X, axis=0, ddof=1):
|
|
163
|
+
"""
|
|
164
|
+
Compute standard deviation compatible with both dense and sparse matrices.
|
|
165
|
+
|
|
166
|
+
For sparse matrices, uses the formula: std = sqrt(E[X^2] - E[X]^2) with Bessel correction.
|
|
167
|
+
For dense matrices, uses numpy's built-in std method.
|
|
168
|
+
|
|
169
|
+
Args:
|
|
170
|
+
X: Input matrix (can be sparse or dense)
|
|
171
|
+
axis: Axis along which to compute std (0 for columns, 1 for rows)
|
|
172
|
+
ddof: Delta Degrees of Freedom for Bessel correction (default: 1)
|
|
173
|
+
|
|
174
|
+
Returns:
|
|
175
|
+
1D numpy array of standard deviations
|
|
176
|
+
"""
|
|
177
|
+
import scipy.sparse as sp
|
|
178
|
+
|
|
179
|
+
if sp.issparse(X):
|
|
180
|
+
# Sparse matrix: use mathematical formula
|
|
181
|
+
n = X.shape[axis]
|
|
182
|
+
mean = np.array(X.mean(axis=axis)).flatten()
|
|
183
|
+
mean_of_squares = np.array(X.power(2).mean(axis=axis)).flatten()
|
|
184
|
+
|
|
185
|
+
# Compute variance with Bessel correction: n/(n-ddof)
|
|
186
|
+
variance = mean_of_squares - np.power(mean, 2)
|
|
187
|
+
variance = np.maximum(variance, 0) # Avoid numerical errors
|
|
188
|
+
if ddof > 0:
|
|
189
|
+
variance = variance * n / (n - ddof) # Bessel correction
|
|
190
|
+
|
|
191
|
+
return np.sqrt(variance)
|
|
192
|
+
else:
|
|
193
|
+
# Dense matrix: use numpy's built-in method
|
|
194
|
+
return np.array(X.std(axis=axis, ddof=ddof)).flatten()
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# ============================================================================
|
|
198
|
+
# GENE FORMAT CONVERSION UTILITIES
|
|
199
|
+
# ============================================================================
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _convert_gene_format_for_matching(
|
|
203
|
+
pathway_genes: list[str], dataset_genes: set, species: str
|
|
204
|
+
) -> tuple[list[str], dict[str, str]]:
|
|
205
|
+
"""
|
|
206
|
+
Rule-based gene format conversion to match dataset format.
|
|
207
|
+
|
|
208
|
+
Handles common gene format variations between pathway databases and datasets:
|
|
209
|
+
- Uppercase (GENE) vs Title case (Gene) vs lowercase (gene)
|
|
210
|
+
- Species-specific formatting rules
|
|
211
|
+
- Special prefixes like Gm/GM/gm for mouse genes
|
|
212
|
+
|
|
213
|
+
Args:
|
|
214
|
+
pathway_genes: Gene names from pathway database (usually uppercase from gseapy)
|
|
215
|
+
dataset_genes: Available gene names in dataset
|
|
216
|
+
species: Species specified by user ("mouse" or "human")
|
|
217
|
+
|
|
218
|
+
Returns:
|
|
219
|
+
(dataset_format_genes, conversion_map)
|
|
220
|
+
dataset_format_genes: Gene names in dataset format that can be found
|
|
221
|
+
conversion_map: Maps dataset_format -> original_pathway_format
|
|
222
|
+
"""
|
|
223
|
+
dataset_format_genes = []
|
|
224
|
+
conversion_map = {}
|
|
225
|
+
|
|
226
|
+
for gene in pathway_genes:
|
|
227
|
+
# Try direct match first
|
|
228
|
+
if gene in dataset_genes:
|
|
229
|
+
dataset_format_genes.append(gene)
|
|
230
|
+
conversion_map[gene] = gene
|
|
231
|
+
continue
|
|
232
|
+
|
|
233
|
+
# Apply multiple format conversion rules
|
|
234
|
+
format_variations = []
|
|
235
|
+
|
|
236
|
+
if species == "mouse":
|
|
237
|
+
# Mouse-specific format rules (order matters for efficiency)
|
|
238
|
+
# Rule 1: Title case (most common): Cd5l, Gbp2b
|
|
239
|
+
if len(gene) > 1:
|
|
240
|
+
format_variations.append(gene[0].upper() + gene[1:].lower())
|
|
241
|
+
# Rule 2: All lowercase: cd5l, gbp2b
|
|
242
|
+
format_variations.append(gene.lower())
|
|
243
|
+
# Rule 3: All uppercase: CD5L, GBP2B
|
|
244
|
+
format_variations.append(gene.upper())
|
|
245
|
+
# Rule 4: Capitalize first letter only
|
|
246
|
+
format_variations.append(gene.capitalize())
|
|
247
|
+
|
|
248
|
+
# Special rule for Gm-prefixed genes (common in mouse)
|
|
249
|
+
if gene.upper().startswith("GM"):
|
|
250
|
+
format_variations.extend(
|
|
251
|
+
[
|
|
252
|
+
"gm" + gene[2:].lower(), # gm42418
|
|
253
|
+
"Gm" + gene[2:].lower(), # Gm42418
|
|
254
|
+
"GM" + gene[2:].upper(), # GM42418
|
|
255
|
+
]
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
elif species == "human":
|
|
259
|
+
# Human-specific format rules
|
|
260
|
+
# Rule 1: All uppercase (most common): HES1, FABP4
|
|
261
|
+
format_variations.append(gene.upper())
|
|
262
|
+
# Rule 2: All lowercase: hes1, fabp4
|
|
263
|
+
format_variations.append(gene.lower())
|
|
264
|
+
# Rule 3: Capitalize first letter
|
|
265
|
+
format_variations.append(gene.capitalize())
|
|
266
|
+
|
|
267
|
+
# Remove duplicates while preserving order
|
|
268
|
+
seen = set()
|
|
269
|
+
unique_variations = []
|
|
270
|
+
for variation in format_variations:
|
|
271
|
+
if variation not in seen and variation != gene: # Skip if same as original
|
|
272
|
+
seen.add(variation)
|
|
273
|
+
unique_variations.append(variation)
|
|
274
|
+
|
|
275
|
+
# Try each format variation against dataset
|
|
276
|
+
for variant in unique_variations:
|
|
277
|
+
if variant in dataset_genes:
|
|
278
|
+
dataset_format_genes.append(variant) # Use dataset's actual format
|
|
279
|
+
conversion_map[variant] = gene
|
|
280
|
+
break # Stop after first match
|
|
281
|
+
|
|
282
|
+
return dataset_format_genes, conversion_map
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
# ============================================================================
|
|
286
|
+
# ENRICHR DATABASE MAPPING
|
|
287
|
+
# ============================================================================
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def map_gene_set_database_to_enrichr_library(database_name: str, species: str) -> str:
    """Map user-friendly database names to actual Enrichr library names.

    Only "KEGG_Pathways" depends on the species: "human" (case-insensitive)
    selects the human KEGG library and every other species string selects the
    mouse library. All other databases map to a single library regardless of
    species, so ``species`` may be None for them.

    Args:
        database_name: User-friendly database name from MCP interface
        species: Species ('human', 'mouse', or 'zebrafish')

    Returns:
        Actual Enrichr library name

    Raises:
        ParameterError: If database_name is not supported, or if
            "KEGG_Pathways" is requested without a species string
    """
    # Evaluated defensively so that a missing species does not break the
    # species-independent lookups or mask the unknown-database error below.
    is_human = isinstance(species, str) and species.lower() == "human"

    mapping = {
        "GO_Biological_Process": "GO_Biological_Process_2025",
        "GO_Molecular_Function": "GO_Molecular_Function_2025",
        "GO_Cellular_Component": "GO_Cellular_Component_2025",
        "KEGG_Pathways": "KEGG_2021_Human" if is_human else "KEGG_2019_Mouse",
        "Reactome_Pathways": "Reactome_Pathways_2024",
        "MSigDB_Hallmark": "MSigDB_Hallmark_2020",
        "Cell_Type_Markers": "CellMarker_Augmented_2021",
    }

    if database_name not in mapping:
        available_options = list(mapping)
        raise ParameterError(
            f"Unknown gene set database: {database_name}. "
            f"Available options: {available_options}"
        )

    # KEGG is the one species-specific library; refuse to guess the organism.
    if database_name == "KEGG_Pathways" and not isinstance(species, str):
        raise ParameterError(
            "A species (e.g. 'human' or 'mouse') is required to select the "
            "KEGG_Pathways library"
        )

    return mapping[database_name]
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
# ============================================================================
|
|
326
|
+
# ENRICHMENT ANALYSIS FUNCTIONS
|
|
327
|
+
# ============================================================================
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def perform_gsea(
    adata,
    gene_sets: dict[str, list[str]],
    ranking_key: Optional[str] = None,
    method: str = "signal_to_noise",
    permutation_num: int = 1000,
    min_size: int = 10,
    max_size: int = 500,
    species: Optional[str] = None,
    database: Optional[str] = None,
    ctx: "ToolContext" = None,
) -> "EnrichmentResult":
    """
    Perform Gene Set Enrichment Analysis (GSEA).

    Builds a per-gene ranking, runs ``gseapy.prerank`` on it, stores the full
    result table on ``adata`` and returns a summary restricted to the gene
    sets that pass the GSEA FDR filter.

    The ranking is chosen as follows:

    1. ``adata.var[ranking_key]`` when ``ranking_key`` is given and present.
    2. Otherwise, if ``adata.obs`` has a "condition" (preferred) or "group"
       column:
       - exactly two groups: signal-to-noise ratio (mean1 - mean2) / (std1 + std2),
         with each std floored at 0.2 * |mean|;
       - any other number of groups: coefficient of variation over all cells.
    3. Otherwise ``adata.var["highly_variable_rank"]``, then
       ``adata.var["dispersions_norm"]``, then the coefficient of variation.

    When a ranking is computed from expression, ``adata.raw`` is used if it
    exists and ``adata.uns`` has no "log1p" entry; otherwise ``adata.X``.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix
    gene_sets : Dict[str, List[str]]
        Gene sets to test
    ranking_key : Optional[str]
        Key in adata.var for pre-computed ranking. If None, compute from expression
    method : str
        Label for the ranking method. NOTE(review): within this function it is
        only written to the stored metadata as "ranking_method"; it does not
        select how the ranking is computed.
    permutation_num : int
        Number of permutations
    min_size : int
        Minimum gene set size
    max_size : int
        Maximum gene set size
    species : Optional[str]
        Species for the analysis (e.g., 'mouse', 'human'); recorded in metadata
    database : Optional[str]
        Gene set database used (e.g., 'KEGG_Pathways', 'GO_Biological_Process');
        recorded in metadata
    ctx : ToolContext
        MCP tool context, passed to the dependency check

    Returns
    -------
    EnrichmentResult
        Result object with method "gsea", the number of gene sets supplied,
        the count of gene sets with FDR q-val < 0.05, the filtered score /
        p-value / statistics dictionaries, and up to 10 top enriched (NES > 0)
        and top depleted (NES < 0) term names.

    Raises
    ------
    Exception
        Any exception raised while running prerank or assembling the results
        is logged and re-raised unchanged.

    Notes
    -----
    Side effects: writes ``adata.uns["gsea_results"]`` and
    ``adata.uns["enrichment_gene_sets"]`` and calls ``store_analysis_metadata``.
    """
    # Verify gseapy is available before importing it lazily.
    require("gseapy", ctx, feature="GSEA analysis")
    import gseapy as gp

    # Prepare ranking
    # NOTE(review): a ranking_key that is not a column of adata.var silently
    # falls through to the computed ranking below — confirm this is intended.
    if ranking_key and ranking_key in adata.var:
        # Use pre-computed ranking
        ranking = adata.var[ranking_key].to_dict()
    else:
        # Compute ranking from expression data
        # Use raw data when available and not log-normalized (full gene set for GSEA)
        # IMPORTANT: Keep X and var_names consistent to avoid dimension mismatch
        if "log1p" not in adata.uns and adata.raw is not None:
            X = adata.raw.X
            var_names = adata.raw.var_names
        else:
            X = adata.X
            var_names = adata.var_names

        # Compute gene ranking metric
        # IMPORTANT: GSEA requires biologically meaningful ranking, not just variance
        # Reference: Subramanian et al. (2005) PNAS, GSEA-MSIGDB documentation

        if "condition" in adata.obs or "group" in adata.obs:
            # "condition" takes precedence over "group" when both exist.
            group_key = "condition" if "condition" in adata.obs else "group"
            groups = adata.obs[group_key].unique()

            if len(groups) == 2:
                # Binary comparison: Use Signal-to-Noise Ratio (GSEA default)
                # S2N = (μ1 - μ2) / (σ1 + σ2)
                # This captures both differential expression AND expression stability
                # The sign of S2N depends on the order returned by unique().
                group1_mask = adata.obs[group_key] == groups[0]
                group2_mask = adata.obs[group_key] == groups[1]

                # Compute means
                mean1 = np.array(X[group1_mask, :].mean(axis=0)).flatten()
                mean2 = np.array(X[group2_mask, :].mean(axis=0)).flatten()

                # Compute standard deviations (sparse-compatible)
                std1 = _compute_std_sparse_compatible(X[group1_mask, :], axis=0, ddof=1)
                std2 = _compute_std_sparse_compatible(X[group2_mask, :], axis=0, ddof=1)

                # Apply minimum std threshold (GSEA standard: 0.2 * |mean|)
                # This prevents division by zero and reduces noise from low-variance genes
                # NOTE(review): a gene whose mean and std are both 0 in both
                # groups still has a zero denominator below.
                min_std_factor = 0.2
                std1 = np.maximum(std1, min_std_factor * np.abs(mean1))
                std2 = np.maximum(std2, min_std_factor * np.abs(mean2))

                # Compute Signal-to-Noise Ratio
                s2n = (mean1 - mean2) / (std1 + std2)
                # strict=True: raise if var_names and s2n differ in length.
                ranking = dict(zip(var_names, s2n, strict=True))

            else:
                # Multi-group: Use Coefficient of Variation (normalized variance)
                # CV = σ / μ - accounts for mean-variance relationship
                # This is more appropriate than raw variance for genes with different expression levels
                # This branch is also taken when only one group is present.
                mean = np.array(X.mean(axis=0)).flatten()
                std = _compute_std_sparse_compatible(X, axis=0, ddof=1)

                # Compute CV (avoid division by zero)
                cv = np.zeros_like(mean)
                nonzero_mask = np.abs(mean) > 1e-10
                cv[nonzero_mask] = std[nonzero_mask] / np.abs(mean[nonzero_mask])

                ranking = dict(zip(var_names, cv, strict=False))
        else:
            # No group information: Use best available ranking method
            if "highly_variable_rank" in adata.var:
                # Prefer pre-computed HVG ranking (most robust)
                # NOTE(review): scores are sorted in descending order below;
                # presumably a smaller rank means more variable — confirm the
                # resulting direction is the intended one.
                ranking = adata.var["highly_variable_rank"].to_dict()
            elif "dispersions_norm" in adata.var:
                # Use Seurat-style normalized dispersion
                ranking = adata.var["dispersions_norm"].to_dict()
            else:
                # Fallback: Coefficient of Variation (better than raw variance)
                # Use sparse-compatible std calculation
                mean = np.array(X.mean(axis=0)).flatten()
                std = _compute_std_sparse_compatible(X, axis=0, ddof=1)

                cv = np.zeros_like(mean)
                nonzero_mask = np.abs(mean) > 1e-10
                cv[nonzero_mask] = std[nonzero_mask] / np.abs(mean[nonzero_mask])

                ranking = dict(zip(var_names, cv, strict=False))

    # Run GSEA preranked
    try:
        # Convert ranking dict to DataFrame for gseapy
        ranking_df = pd.DataFrame.from_dict(ranking, orient="index", columns=["score"])
        ranking_df.index.name = "gene"
        ranking_df = ranking_df.sort_values("score", ascending=False)

        # Fixed seed and a single process keep the permutation test repeatable;
        # no plots or output directory are produced.
        res = gp.prerank(
            rnk=ranking_df,  # Pass DataFrame instead of dict
            gene_sets=gene_sets,
            processes=1,
            permutation_num=permutation_num,
            min_size=min_size,
            max_size=max_size,
            seed=42,
            verbose=False,
            no_plot=True,
            outdir=None,
        )

        # Extract results
        results_df = res.res2d

        # Prepare output
        enrichment_scores = {}
        pvalues = {}
        adjusted_pvalues = {}
        gene_set_statistics = {}

        # One entry per gene set, keyed by its "Term" name.
        for _idx, row in results_df.iterrows():
            term = row["Term"]
            enrichment_scores[term] = row["ES"]
            pvalues[term] = row["NOM p-val"]
            adjusted_pvalues[term] = row["FDR q-val"]
            gene_set_statistics[term] = {
                "es": row["ES"],
                "nes": row["NES"],
                "pval": row["NOM p-val"],
                "fdr": row["FDR q-val"],
                "size": row.get(
                    "Matched_size", row.get("Gene %", 0)
                ),  # Different versions use different column names
                # Only the first 10 leading-edge genes are kept.
                "lead_genes": (
                    row.get("Lead_genes", "").split(";")[:10]
                    if "Lead_genes" in row
                    else []
                ),
            }

        # Get top enriched and depleted
        # NOTE(review): with a descending NES sort, head(10) of the NES < 0 rows
        # yields the negative terms closest to zero, not the most negative ones.
        results_df_sorted = results_df.sort_values("NES", ascending=False)
        top_enriched = (
            results_df_sorted[results_df_sorted["NES"] > 0].head(10)["Term"].tolist()
        )
        top_depleted = (
            results_df_sorted[results_df_sorted["NES"] < 0].head(10)["Term"].tolist()
        )

        # Save results to adata.uns for visualization
        # Store full results DataFrame for visualization
        adata.uns["gsea_results"] = results_df

        # Store gene set membership for validation
        adata.uns["enrichment_gene_sets"] = gene_sets

        # Store metadata for scientific provenance tracking
        # NOTE(review): n_significant here (and in the return value) counts
        # FDR q-val < 0.05, while the dictionaries returned below are filtered
        # with the GSEA default of 0.25, so the two can disagree.
        store_analysis_metadata(
            adata,
            analysis_name="enrichment_gsea",
            method="gsea",
            parameters={
                "permutation_num": permutation_num,
                "ranking_method": method,
                "min_size": min_size,
                "max_size": max_size,
                "ranking_key": ranking_key,
            },
            results_keys={"uns": ["gsea_results", "enrichment_gene_sets"]},
            statistics={
                "n_gene_sets": len(gene_sets),
                "n_significant": len(results_df[results_df["FDR q-val"] < 0.05]),
            },
            species=species,
            database=database,
        )

        # Filter all result dictionaries to only significant pathways (reduces MCP response size)
        # Uses method-based FDR threshold: GSEA = 0.25 (Subramanian et al. 2005)
        (
            filtered_statistics,
            filtered_scores,
            filtered_pvals,
            filtered_adj_pvals,
        ) = _filter_significant_statistics(
            gene_set_statistics,
            enrichment_scores,
            pvalues,
            adjusted_pvalues,
            method="gsea",  # Method-based FDR: 0.25 for GSEA
        )

        # top_gene_sets / top_depleted_sets come from the unfiltered table, so
        # they may name terms that are absent from the filtered dictionaries.
        return EnrichmentResult(
            method="gsea",
            n_gene_sets=len(gene_sets),
            n_significant=len(results_df[results_df["FDR q-val"] < 0.05]),
            enrichment_scores=filtered_scores,
            pvalues=filtered_pvals,
            adjusted_pvalues=filtered_adj_pvals,
            gene_set_statistics=filtered_statistics,
            top_gene_sets=top_enriched,
            top_depleted_sets=top_depleted,
        )

    except Exception as e:
        # Log for diagnostics, then propagate the original exception.
        logger.error(f"GSEA failed: {e}")
        raise
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
def perform_ora(
    adata,
    gene_sets: dict[str, list[str]],
    gene_list: Optional[list[str]] = None,
    pvalue_threshold: float = 0.05,
    min_size: int = 10,
    max_size: int = 500,
    species: Optional[str] = None,
    database: Optional[str] = None,
    ctx: "ToolContext" = None,
) -> "EnrichmentResult":
    """
    Perform Over-Representation Analysis (ORA).

    Each gene set is tested for over-representation among the query genes
    with a one-sided Fisher's exact test, followed by Benjamini-Hochberg
    correction across all tested gene sets.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix. Results are written to ``adata.uns``
        (``ora_results``, ``gsea_results`` and ``enrichment_gene_sets``).
    gene_sets : Dict[str, List[str]]
        Gene sets to test
    gene_list : Optional[List[str]]
        List of genes to test. If None, the query genes are taken from
        ``adata.uns["rank_genes_groups"]``; if that is absent, from
        ``adata.var["highly_variable"]``; otherwise the 500 genes with the
        highest coefficient of variation are used.
    pvalue_threshold : float
        P-value threshold for selecting DEGs (only used if rank_genes_groups exists)
    min_size : int
        Minimum gene set size (after intersecting with the background)
    max_size : int
        Maximum gene set size (after intersecting with the background)
    species : Optional[str]
        Species for the analysis (e.g., 'mouse', 'human')
    database : Optional[str]
        Gene set database used (e.g., 'KEGG_Pathways', 'GO_Biological_Process')
    ctx : ToolContext
        MCP tool context for logging (not used inside this function)

    Returns
    -------
    EnrichmentResult
        Result object whose score / p-value dictionaries are restricted to
        the pathways kept by ``_filter_significant_statistics``.

    Notes
    -----
    LogFC filtering removed: ORA should use genes pre-filtered by find_markers.
    Different statistical methods (Wilcoxon, t-test) produce different logFC scales,
    making a fixed threshold inappropriate. Gene filtering is the responsibility of
    differential expression analysis, not enrichment analysis.
    """
    # Get gene list if not provided
    if gene_list is None:
        if "rank_genes_groups" in adata.uns:
            result = adata.uns["rank_genes_groups"]
            # names is a structured array: one field per group, one row per rank
            names = result["names"]

            # Not every rank_genes_groups result carries p-values;
            # prefer adjusted ones when present.
            pvals = None
            if "pvals_adj" in result:
                pvals = result["pvals_adj"]
            elif "pvals" in result:
                pvals = result["pvals"]

            # Merge the DEGs of all groups (the set removes duplicates)
            degs_set = set()
            for group_name in names.dtype.names:
                group_genes = np.asarray(names[group_name])
                if pvals is None:
                    # Top 100 genes when no pvals
                    selected = group_genes[:100]
                else:
                    group_pvals = np.asarray(pvals[group_name], dtype=float)
                    # "p < threshold" (rather than skipping "p >= threshold")
                    # so that NaN p-values are excluded instead of slipping
                    # through the comparison.
                    selected = group_genes[group_pvals < pvalue_threshold]
                degs_set.update(selected.tolist())

            gene_list = list(degs_set)
        elif "highly_variable" in adata.var:
            # Use highly variable genes
            gene_list = adata.var_names[adata.var["highly_variable"]].tolist()
        else:
            # Use top variable genes (based on Coefficient of Variation)
            # CV = σ/μ is more appropriate than raw variance
            mean = np.array(adata.X.mean(axis=0)).flatten()
            std = _compute_std_sparse_compatible(adata.X, axis=0, ddof=1)

            # Compute CV (avoid division by zero)
            cv = np.zeros_like(mean)
            nonzero_mask = np.abs(mean) > 1e-10
            cv[nonzero_mask] = std[nonzero_mask] / np.abs(mean[nonzero_mask])

            top_indices = np.argsort(cv)[-500:]
            gene_list = adata.var_names[top_indices].tolist()

    # Background genes
    # IMPORTANT: Use adata.raw if available, as rank_genes_groups may have been
    # run on raw data with different gene name casing (e.g., uppercase)
    # while filtered adata.var_names may be lowercase
    if adata.raw is not None:
        background_genes = set(adata.raw.var_names)
    else:
        background_genes = set(adata.var_names)

    query_genes = set(gene_list) & background_genes

    # If nothing matches directly, fall back to case-insensitive matching
    # and map each hit back to the background's own spelling.
    if len(query_genes) == 0 and len(gene_list) > 0:
        gene_name_map = {g.upper(): g for g in background_genes}
        query_genes = {
            gene_name_map[gene.upper()]
            for gene in gene_list
            if gene.upper() in gene_name_map
        }

    # Perform hypergeometric test for each gene set
    enrichment_scores = {}
    pvalues = {}
    gene_set_statistics = {}
    n_query = len(query_genes)
    n_background = len(background_genes)

    for gs_name, gs_genes in gene_sets.items():
        gs_genes_set = set(gs_genes) & background_genes

        if len(gs_genes_set) < min_size or len(gs_genes_set) > max_size:
            continue

        # 2x2 contingency table
        # a: genes in both query and gene set
        # b: genes in query but not in gene set
        # c: genes in gene set but not in query
        # d: genes in neither
        # query_genes and gs_genes_set are both subsets of background_genes,
        # so b, c and d follow from the set sizes.
        overlap = query_genes & gs_genes_set
        a = len(overlap)
        b = n_query - a
        c = len(gs_genes_set) - a
        d = n_background - a - b - c

        # Fisher's exact test (one-sided: over-representation only)
        odds_ratio, p_value = stats.fisher_exact(
            [[a, b], [c, d]], alternative="greater"
        )

        enrichment_scores[gs_name] = odds_ratio
        pvalues[gs_name] = p_value

        gene_set_statistics[gs_name] = {
            "odds_ratio": odds_ratio,
            "pval": p_value,
            "overlap": a,
            "query_size": n_query,
            "gs_size": len(gs_genes_set),
            # Capped at 20 genes; sorted so the reported subset is reproducible
            "overlapping_genes": sorted(overlap)[:20],
        }

    # Multiple testing correction
    if pvalues:
        pval_array = np.array(list(pvalues.values()))
        _, adjusted_pvals, _, _ = multipletests(pval_array, method="fdr_bh")
        adjusted_pvalues = dict(zip(pvalues.keys(), adjusted_pvals, strict=False))
    else:
        adjusted_pvalues = {}

    # Get top results
    sorted_by_pval = sorted(pvalues.items(), key=lambda x: x[1])
    top_gene_sets = [x[0] for x in sorted_by_pval[:10]]

    # Save results to adata.uns for visualization
    # Create DataFrame for visualization compatibility
    ora_df = pd.DataFrame(
        {
            "pathway": list(enrichment_scores),
            "odds_ratio": list(enrichment_scores.values()),
            "pvalue": [pvalues.get(k, 1.0) for k in enrichment_scores],
            "adjusted_pvalue": [
                adjusted_pvalues.get(k, 1.0) for k in enrichment_scores
            ],
        }
    )
    ora_df["NES"] = ora_df["odds_ratio"]  # Use odds_ratio as score for visualization
    ora_df = ora_df.sort_values("pvalue")

    adata.uns["ora_results"] = ora_df
    # Also save as gsea_results for visualization compatibility
    adata.uns["gsea_results"] = ora_df

    # Store gene set membership for validation
    adata.uns["enrichment_gene_sets"] = gene_sets

    n_significant = sum(
        1 for p in adjusted_pvalues.values() if p is not None and p < 0.05
    )

    # Store metadata for scientific provenance tracking
    store_analysis_metadata(
        adata,
        analysis_name="enrichment_ora",
        method="ora",
        parameters={
            "pvalue_threshold": pvalue_threshold,
            "min_size": min_size,
            "max_size": max_size,
            "n_query_genes": n_query,
        },
        results_keys={"uns": ["ora_results", "enrichment_gene_sets"]},
        statistics={
            "n_gene_sets": len(gene_sets),
            "n_significant": n_significant,
            "n_query_genes": n_query,
        },
        species=species,
        database=database,
    )

    # Filter all result dictionaries to only significant pathways (reduces MCP response size)
    # Uses method-based FDR threshold: ORA = 0.05 (standard statistical threshold)
    (
        filtered_statistics,
        filtered_scores,
        filtered_pvals,
        filtered_adj_pvals,
    ) = _filter_significant_statistics(
        gene_set_statistics,
        enrichment_scores,
        pvalues,
        adjusted_pvalues,
        method="ora",  # Method-based FDR: 0.05 for ORA
    )

    return EnrichmentResult(
        method="ora",
        n_gene_sets=len(gene_sets),
        n_significant=n_significant,
        enrichment_scores=filtered_scores,
        pvalues=filtered_pvals,
        adjusted_pvalues=filtered_adj_pvals,
        gene_set_statistics=filtered_statistics,
        top_gene_sets=top_gene_sets,
        top_depleted_sets=[],  # ORA does not produce depleted gene sets
    )
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
def perform_ssgsea(
    adata,
    gene_sets: dict[str, list[str]],
    min_size: int = 10,
    max_size: int = 500,
    species: Optional[str] = None,
    database: Optional[str] = None,
    ctx: "ToolContext" = None,
) -> "EnrichmentResult":
    """
    Perform single-sample Gene Set Enrichment Analysis (ssGSEA).

    This calculates enrichment scores for each sample independently and
    stores them in ``adata.obs`` as ``ssgsea_<gene set name>`` columns.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix
    gene_sets : Dict[str, List[str]]
        Gene sets to test
    min_size : int
        Minimum gene set size
    max_size : int
        Maximum gene set size
    species : Optional[str]
        Species for the analysis (e.g., 'mouse', 'human')
    database : Optional[str]
        Gene set database used (e.g., 'KEGG_Pathways', 'GO_Biological_Process')
    ctx : ToolContext
        MCP tool context, passed to the dependency check

    Returns
    -------
    EnrichmentResult
        ``enrichment_scores`` holds the mean score per gene set across
        samples. No p-values are produced, so ``pvalues`` and
        ``adjusted_pvalues`` are None, ``n_significant`` is 0 and
        ``gene_set_statistics`` is empty.

    Raises
    ------
    ProcessingError
        If the object returned by gseapy does not expose a ``results`` dict.
    """
    require("gseapy", ctx, feature="ssGSEA analysis")
    import gseapy as gp

    # Memory-efficient batch processing for large datasets:
    # datasets with more than BATCH_SIZE samples are scored in batches so that
    # only one batch at a time is converted to a dense matrix.
    BATCH_SIZE = 500
    n_samples = adata.n_obs

    def _run_ssgsea(expr_df):
        """Run gseapy ssGSEA on a genes x samples expression DataFrame."""
        return gp.ssgsea(
            data=expr_df,
            gene_sets=gene_sets,
            min_size=min_size,
            max_size=max_size,
            permutation_num=0,
            no_plot=True,
            threads=1,
            seed=42,
        )

    try:
        if n_samples <= BATCH_SIZE:
            # Small dataset: process all samples at once
            expr_df = pd.DataFrame(
                to_dense(adata.X).T, index=adata.var_names, columns=adata.obs_names
            )
            results = getattr(_run_ssgsea(expr_df), "results", None)
        else:
            # Large dataset: peak dense memory is O(n_genes × batch_size)
            # instead of O(n_genes × n_samples)
            results = {}
            for batch_start in range(0, n_samples, BATCH_SIZE):
                batch_end = min(batch_start + BATCH_SIZE, n_samples)
                batch_indices = list(range(batch_start, batch_end))

                # Only this batch is converted to dense
                batch_X = to_dense(adata.X[batch_indices, :])
                batch_df = pd.DataFrame(
                    batch_X.T,
                    index=adata.var_names,
                    columns=adata.obs_names[batch_indices],
                )

                batch_results = getattr(_run_ssgsea(batch_df), "results", None)
                # Batches whose results are not a dict are ignored
                if isinstance(batch_results, dict):
                    results.update(batch_results)

                # Free batch memory before the next allocation
                del batch_X, batch_df

        if not isinstance(results, dict):
            error_msg = "ssGSEA results format not recognized."
            logger.error(error_msg)
            raise ProcessingError(error_msg)

        # results maps sample name -> DataFrame (expected columns: "Term", "ES");
        # reorganize it into a gene sets x samples matrix.
        all_samples = list(results.keys())
        all_gene_sets = set()
        for sample_df in results.values():
            if isinstance(sample_df, pd.DataFrame) and "Term" in sample_df.columns:
                all_gene_sets.update(sample_df["Term"].values)
        all_gene_sets = list(all_gene_sets)

        scores_matrix = pd.DataFrame(
            index=all_gene_sets, columns=all_samples, dtype=float
        )

        for sample, df in results.items():
            if not (
                isinstance(df, pd.DataFrame)
                and "Term" in df.columns
                and "ES" in df.columns
            ):
                continue
            for term, es in zip(df["Term"], df["ES"], strict=False):
                if term in scores_matrix.index:
                    scores_matrix.at[term, sample] = es

        scores_df = scores_matrix.fillna(0)  # Fill missing values with 0

        # Mean score per gene set, plus per-sample scores written to adata.obs
        enrichment_scores = {}
        if not scores_df.empty:
            for gs_name in scores_df.index:
                scores = scores_df.loc[gs_name].values
                enrichment_scores[gs_name] = float(np.mean(scores))
                # NOTE(review): assumes the column order of scores_df (the key
                # order of the gseapy results) matches the row order of
                # adata.obs - confirm against gseapy's output ordering.
                adata.obs[f"ssgsea_{gs_name}"] = scores

        # Store gene set membership for validation
        adata.uns["enrichment_gene_sets"] = gene_sets

        # Store metadata for scientific provenance tracking
        obs_keys = [f"ssgsea_{gs_name}" for gs_name in scores_df.index]
        store_analysis_metadata(
            adata,
            analysis_name="enrichment_ssgsea",
            method="ssgsea",
            parameters={
                "min_size": min_size,
                "max_size": max_size,
            },
            results_keys={"obs": obs_keys, "uns": ["enrichment_gene_sets"]},
            statistics={
                "n_gene_sets": len(gene_sets),
                "n_samples": adata.n_obs,
            },
            species=species,
            database=database,
        )

        # Get top gene sets by mean enrichment
        sorted_by_mean = sorted(
            enrichment_scores.items(), key=lambda x: x[1], reverse=True
        )
        top_gene_sets = [x[0] for x in sorted_by_mean[:10]]

        return EnrichmentResult(
            method="ssgsea",
            n_gene_sets=len(gene_sets),
            # IMPORTANT: ssGSEA does NOT perform significance testing.
            # All gene sets receive enrichment scores, but these are
            # sample-level metrics without associated p-values, so reporting
            # 0 significant pathways is the honest value. Use GSEA or ORA for
            # significance testing.
            n_significant=0,
            enrichment_scores=enrichment_scores,  # Mean scores per gene set
            pvalues=None,
            adjusted_pvalues=None,
            # Empty to reduce response size (no p-values available)
            gene_set_statistics={},
            top_gene_sets=top_gene_sets,
            top_depleted_sets=[],  # ssGSEA doesn't produce depleted sets
        )

    except Exception as e:
        logger.error(f"ssGSEA failed: {e}")
        raise
|
|
1034
|
+
|
|
1035
|
+
|
|
1036
|
+
def perform_enrichr(
    gene_list: list[str],
    gene_sets: Optional[str] = None,
    organism: str = "human",
    ctx: "ToolContext" = None,
) -> "EnrichmentResult":
    """
    Perform enrichment analysis using Enrichr web service.

    Parameters
    ----------
    gene_list : List[str]
        List of genes to analyze
    gene_sets : Optional[str]
        User-facing database name, mapped to an Enrichr library name via
        ``map_gene_set_database_to_enrichr_library``. If None, a default
        set of libraries is queried. Any non-string value is passed to
        gseapy unchanged.
    organism : str
        Organism ('human' or 'mouse')
    ctx : ToolContext
        MCP tool context, passed to the dependency check

    Returns
    -------
    EnrichmentResult
        Result object whose score / p-value dictionaries are restricted to
        the pathways kept by ``_filter_significant_statistics``;
        ``top_gene_sets`` lists the 10 terms with the highest combined score.
    """
    require("gseapy", ctx, feature="Enrichr analysis")
    import gseapy as gp

    # Resolve the Enrichr libraries to query without rebinding the parameter
    if gene_sets is None:
        # Default gene set libraries
        libraries = [
            "GO_Biological_Process_2023",
            "GO_Molecular_Function_2023",
            "GO_Cellular_Component_2023",
            "KEGG_2021_Human" if organism == "human" else "KEGG_2019_Mouse",
            "Reactome_2022",
            "MSigDB_Hallmark_2020",
        ]
    elif isinstance(gene_sets, str):
        # Map user-friendly database name to actual Enrichr library name
        libraries = [map_gene_set_database_to_enrichr_library(gene_sets, organism)]
    else:
        libraries = gene_sets

    # Run Enrichr
    try:
        enr = gp.enrichr(
            gene_list=gene_list,
            gene_sets=libraries,
            organism=organism.capitalize(),
            outdir=None,
            cutoff=0.05,
        )

        # enr.results is used as a DataFrame with one row per tested term
        all_results = enr.results

        # Prepare output (all dictionaries are keyed by term name)
        enrichment_scores = {}
        pvalues = {}
        adjusted_pvalues = {}
        gene_set_statistics = {}

        for _idx, row in all_results.iterrows():
            term = row["Term"]
            enrichment_scores[term] = row["Combined Score"]
            pvalues[term] = row["P-value"]
            adjusted_pvalues[term] = row["Adjusted P-value"]

            # Overlapping genes arrive as a ";"-separated string
            genes_str = row["Genes"]
            genes_list = genes_str.split(";") if isinstance(genes_str, str) else []

            gene_set_statistics[term] = {
                "combined_score": row["Combined Score"],
                "pval": row["P-value"],
                "adjusted_pval": row["Adjusted P-value"],
                "z_score": row.get("Z-score", np.nan),
                "overlap": row["Overlap"],
                "genes": genes_list,
                "odds_ratio": row.get("Odds Ratio", 1.0),
            }

        # Get top results
        all_results_sorted = all_results.sort_values("Combined Score", ascending=False)
        top_gene_sets = all_results_sorted.head(10)["Term"].tolist()

        n_significant = len(all_results[all_results["Adjusted P-value"] < 0.05])

        # Filter all result dictionaries to only significant pathways (reduces MCP response size)
        # Uses method-based FDR threshold: Enrichr = 0.05 (same as ORA, hypergeometric-based)
        (
            filtered_statistics,
            filtered_scores,
            filtered_pvals,
            filtered_adj_pvals,
        ) = _filter_significant_statistics(
            gene_set_statistics,
            enrichment_scores,
            pvalues,
            adjusted_pvalues,
            method="enrichr",  # Method-based FDR: 0.05 for Enrichr
        )

        return EnrichmentResult(
            method="enrichr",
            n_gene_sets=len(all_results),
            n_significant=n_significant,
            enrichment_scores=filtered_scores,
            pvalues=filtered_pvals,
            adjusted_pvalues=filtered_adj_pvals,
            gene_set_statistics=filtered_statistics,
            top_gene_sets=top_gene_sets,
            top_depleted_sets=[],  # Enrichr doesn't produce depleted sets
        )

    except Exception as e:
        logger.error(f"Enrichr failed: {e}")
        raise
|
|
1153
|
+
|
|
1154
|
+
|
|
1155
|
+
# ============================================================================
|
|
1156
|
+
# Spatial Enrichment Analysis Functions (EnrichMap-based)
|
|
1157
|
+
# ============================================================================
|
|
1158
|
+
|
|
1159
|
+
|
|
1160
|
+
async def perform_spatial_enrichment(
    data_id: str,
    ctx: "ToolContext",
    gene_sets: Union[list[str], dict[str, list[str]]],
    score_keys: Optional[Union[str, list[str]]] = None,
    spatial_key: str = "spatial",
    n_neighbors: int = 6,
    smoothing: bool = True,
    correct_spatial_covariates: bool = True,
    batch_key: Optional[str] = None,
    species: str = "unknown",
    database: Optional[str] = None,
) -> "EnrichmentResult":
    """
    Perform spatially-aware gene set enrichment analysis using EnrichMap.

    This is a coroutine: it awaits ``ctx`` for data access and logging, so
    callers must ``await`` it.

    Parameters
    ----------
    data_id : str
        Identifier for the spatial data in the data store
    ctx : ToolContext
        MCP tool context for data access and logging
    gene_sets : Union[List[str], Dict[str, List[str]]]
        Either a single gene list or a dictionary of gene sets where keys are
        signature names and values are lists of genes
    score_keys : Optional[Union[str, List[str]]]
        Name for the gene signature if gene_sets is a list (defaults to
        "enrichmap_signature"). Ignored if gene_sets is already a dictionary
    spatial_key : str
        Key in adata.obsm containing spatial coordinates (default: "spatial")
    n_neighbors : int
        Number of nearest spatial neighbors for smoothing (default: 6)
    smoothing : bool
        Whether to perform spatial smoothing (default: True)
    correct_spatial_covariates : bool
        Whether to correct for spatial covariates using GAM (default: True)
    batch_key : Optional[str]
        Column in adata.obs for batch-wise normalization
    species : str
        Species for the analysis (e.g., 'mouse', 'human')
    database : Optional[str]
        Gene set database used (e.g., 'KEGG_Pathways', 'GO_Biological_Process')

    Returns
    -------
    EnrichmentResult
        ``enrichment_scores`` holds the maximum per-spot score of each
        successfully scored signature and ``top_gene_sets`` the 10 highest
        of them. No p-values are produced, so ``pvalues`` and
        ``adjusted_pvalues`` are None and ``gene_set_statistics`` is empty.

    Raises
    ------
    ProcessingError
        If ``spatial_key`` is missing from ``adata.obsm``, if no signature has
        at least 2 genes in the dataset, or if EnrichMap scoring fails for
        every signature.
    """
    # Check if EnrichMap is available
    require("enrichmap", ctx, feature="spatial enrichment analysis")

    import enrichmap as em

    # Get data using standard ctx pattern
    adata = await ctx.get_adata(data_id)

    # Validate spatial coordinates
    if spatial_key not in adata.obsm:
        raise ProcessingError(
            f"Spatial coordinates '{spatial_key}' not found in adata.obsm"
        )

    # Convert single gene list to dictionary format.
    # score_keys is used as the dictionary key, so it must be hashable
    # (a single string) in this branch.
    if isinstance(gene_sets, list):
        if score_keys is None:
            score_keys = "enrichmap_signature"
        gene_sets = {score_keys: gene_sets}

    # Validate gene sets with format conversion
    available_genes = set(adata.var_names)
    validated_gene_sets = {}

    for sig_name, genes in gene_sets.items():
        # Try direct matching first
        common_genes = [gene for gene in genes if gene in available_genes]

        # If fewer than half match and we know the species, try format conversion
        if len(common_genes) < len(genes) * 0.5 and species != "unknown":
            dataset_format_genes, _ = _convert_gene_format_for_matching(
                genes, available_genes, species
            )

            if len(dataset_format_genes) > len(common_genes):
                # Format conversion helped, use dataset format genes for EnrichMap
                common_genes = dataset_format_genes

        if len(common_genes) < 2:
            await ctx.warning(
                f"Signature '{sig_name}' has {len(common_genes)} genes in the dataset. Skipping."
            )
            continue

        validated_gene_sets[sig_name] = common_genes
        await ctx.info(
            f"Signature '{sig_name}': {len(common_genes)}/{len(genes)} genes found"
        )

    if not validated_gene_sets:
        raise ProcessingError(
            f"No valid gene signatures found (≥2 genes). "
            f"Dataset: {len(available_genes)} genes, requested: {len(gene_sets)} signatures. "
            f"Check species (human/mouse) and gene name format."
        )

    # Run EnrichMap scoring - process each gene set individually so that one
    # failing signature does not abort the others
    failed_signatures = []
    successful_signatures = []

    for sig_name, genes in validated_gene_sets.items():
        try:
            em.tl.score(
                adata=adata,
                gene_set=genes,
                score_key=sig_name,
                spatial_key=spatial_key,
                n_neighbors=n_neighbors,
                smoothing=smoothing,
                correct_spatial_covariates=correct_spatial_covariates,
                batch_key=batch_key,
            )
            successful_signatures.append(sig_name)

        except Exception as e:
            # Deliberate best-effort: record the failure and continue
            await ctx.warning(f"EnrichMap failed for '{sig_name}': {e}")
            failed_signatures.append((sig_name, str(e)))

    # Check if any signatures were processed successfully
    if not successful_signatures:
        error_details = "; ".join(
            [f"{name}: {error}" for name, error in failed_signatures]
        )
        raise ProcessingError(
            f"All EnrichMap scoring failed. This may indicate:\n"
            f"1. EnrichMap package installation issues\n"
            f"2. Incompatible gene names or data format\n"
            f"3. Insufficient spatial information\n"
            f"Details: {error_details}"
        )

    # Update validated_gene_sets to only include successful ones
    validated_gene_sets = {
        sig: validated_gene_sets[sig] for sig in successful_signatures
    }

    if ctx and failed_signatures:
        await ctx.warning(
            f"Failed to process {len(failed_signatures)} gene sets: {[name for name, _ in failed_signatures]}"
        )

    # Collect results.
    # NOTE(review): assumes em.tl.score stores its output in
    # adata.obs["<score_key>_score"] - confirm against the EnrichMap API.
    score_columns = [f"{sig}_score" for sig in validated_gene_sets]

    # Calculate summary statistics
    summary_stats = {}
    for sig_name in validated_gene_sets:
        score_col = f"{sig_name}_score"
        scores = adata.obs[score_col]

        summary_stats[sig_name] = {
            "mean": float(scores.mean()),
            "std": float(scores.std()),
            "min": float(scores.min()),
            "max": float(scores.max()),
            "median": float(scores.median()),
            "q25": float(scores.quantile(0.25)),
            "q75": float(scores.quantile(0.75)),
            "n_genes": len(validated_gene_sets[sig_name]),
        }

    # Store gene set membership for validation
    adata.uns["enrichment_gene_sets"] = validated_gene_sets

    # Store metadata for scientific provenance tracking
    store_analysis_metadata(
        adata,
        analysis_name="enrichment_spatial",
        method="spatial_enrichmap",
        parameters={
            "spatial_key": spatial_key,
            "n_neighbors": n_neighbors,
            "smoothing": smoothing,
            "correct_spatial_covariates": correct_spatial_covariates,
            "batch_key": batch_key,
        },
        results_keys={
            "obs": score_columns,
            "uns": ["gene_contributions", "enrichment_gene_sets"],
        },
        statistics={
            "n_gene_sets": len(validated_gene_sets),
            "n_successful_signatures": len(successful_signatures),
            "n_failed_signatures": len(failed_signatures),
        },
        species=species,
        database=database,
    )

    # Create enrichment scores (use max score per gene set)
    enrichment_scores = {
        sig_name: float(sig_stats["max"])
        for sig_name, sig_stats in summary_stats.items()
    }

    # Sort by enrichment score to get top gene sets
    sorted_sigs = sorted(enrichment_scores.items(), key=lambda x: x[1], reverse=True)
    top_gene_sets = [sig_name for sig_name, _ in sorted_sigs[:10]]

    # Spatial enrichment doesn't provide p-values, so gene_set_statistics is
    # returned empty to reduce MCP response size (no significance filtering
    # possible)
    return EnrichmentResult(
        method="spatial_enrichmap",
        n_gene_sets=len(validated_gene_sets),
        n_significant=len(successful_signatures),
        enrichment_scores=enrichment_scores,
        pvalues=None,
        adjusted_pvalues=None,
        gene_set_statistics={},  # Empty to reduce response size (no p-values available)
        spatial_scores_key=None,  # Scores are in obs columns, not obsm
        top_gene_sets=top_gene_sets,
        top_depleted_sets=[],  # Spatial enrichment doesn't produce depleted sets
    )
|
|
1388
|
+
|
|
1389
|
+
|
|
1390
|
+
# ============================================================================
|
|
1391
|
+
# Gene Set Loading Functions
|
|
1392
|
+
# ============================================================================
|
|
1393
|
+
# Simplified from GeneSetLoader class - no need for class overhead when
|
|
1394
|
+
# functions are only called once from load_gene_sets()
|
|
1395
|
+
|
|
1396
|
+
|
|
1397
|
+
def _get_organism_name(species: str) -> str:
|
|
1398
|
+
"""Get organism name for gseapy from species code."""
|
|
1399
|
+
return "Homo sapiens" if species.lower() == "human" else "Mus musculus"
|
|
1400
|
+
|
|
1401
|
+
|
|
1402
|
+
def load_msigdb_gene_sets(
    species: str,
    collection: str = "H",
    subcollection: Optional[str] = None,
    min_size: int = 10,
    max_size: int = 500,
) -> dict[str, list[str]]:
    """
    Fetch gene sets for an MSigDB-style collection code through gseapy.

    Each handled collection is mapped to a named library that is retrieved
    with ``gseapy.get_library``. Only the combinations listed under
    ``collection`` are handled; for any other collection/subcollection pair
    no library is fetched and an empty dictionary is passed to the size
    filter.

    Parameters
    ----------
    species : str
        Species for gene sets ('human' or 'mouse')
    collection : str
        Collection code:
        - H: ``MSigDB_Hallmark_2020`` (fetched only when gseapy lists it)
        - C2 with subcollection 'CP:KEGG': ``KEGG_2021_Human`` for human,
          ``KEGG_2019_Mouse`` otherwise
        - C2 with subcollection 'CP:REACTOME': ``Reactome_2022``
        - C5: the 2023 GO libraries selected by ``subcollection``
        - C8: ``CellMarker_Augmented_2021``
    subcollection : Optional[str]
        Subcollection selector. For C2 it must be 'CP:KEGG' or
        'CP:REACTOME'. For C5 it may be 'GO:BP', 'GO:MF' or 'GO:CC';
        ``None`` merges all three GO libraries into one dictionary.
    min_size : int
        Minimum gene set size
    max_size : int
        Maximum gene set size

    Returns
    -------
    Dict[str, List[str]]
        Result of ``_filter_gene_sets_by_size`` applied to the fetched sets

    Raises
    ------
    ProcessingError
        Wraps any exception raised while importing gseapy, fetching a
        library or filtering.
    """
    try:
        # Imported lazily so a missing gseapy surfaces as ProcessingError
        import gseapy as gp

        organism = _get_organism_name(species)
        selected = {}

        if collection == "H":
            # Hallmark: only fetch when the library is actually listed
            available = gp.get_library_name(organism=organism)
            if "MSigDB_Hallmark_2020" in available:
                selected = gp.get_library("MSigDB_Hallmark_2020", organism=organism)

        elif collection == "C2" and subcollection == "CP:KEGG":
            # KEGG library name depends on the species
            kegg_library = (
                "KEGG_2021_Human" if species.lower() == "human" else "KEGG_2019_Mouse"
            )
            selected = gp.get_library(kegg_library, organism=organism)

        elif collection == "C2" and subcollection == "CP:REACTOME":
            selected = gp.get_library("Reactome_2022", organism=organism)

        elif collection == "C5":
            # GO: merge every aspect that matches (all of them when None)
            go_libraries = (
                ("GO:BP", "GO_Biological_Process_2023"),
                ("GO:MF", "GO_Molecular_Function_2023"),
                ("GO:CC", "GO_Cellular_Component_2023"),
            )
            for tag, library in go_libraries:
                if subcollection is None or subcollection == tag:
                    selected.update(gp.get_library(library, organism=organism))

        elif collection == "C8":
            # Cell type signatures
            selected = gp.get_library("CellMarker_Augmented_2021", organism=organism)

        return _filter_gene_sets_by_size(selected, min_size, max_size)

    except Exception as e:
        raise ProcessingError(f"Failed to load MSigDB gene sets: {e}") from e
|
|
1491
|
+
|
|
1492
|
+
|
|
1493
|
+
def load_go_gene_sets(
    species: str,
    aspect: str = "BP",
    min_size: int = 10,
    max_size: int = 500,
) -> dict[str, list[str]]:
    """
    Fetch one Gene Ontology library through gseapy and size-filter it.

    Parameters
    ----------
    species : str
        Species for gene sets ('human' or 'mouse')
    aspect : str
        GO aspect: 'BP' (biological process), 'MF' (molecular function),
        'CC' (cellular component)
    min_size : int
        Minimum gene set size
    max_size : int
        Maximum gene set size

    Returns
    -------
    Dict[str, List[str]]
        Dictionary of GO gene sets

    Raises
    ------
    ParameterError
        If ``aspect`` is not one of 'BP', 'MF', 'CC' (checked before any
        library access).
    ProcessingError
        Wraps any exception raised while importing gseapy, fetching the
        library or filtering.
    """
    libraries_by_aspect = {
        "BP": "GO_Biological_Process_2023",
        "MF": "GO_Molecular_Function_2023",
        "CC": "GO_Cellular_Component_2023",
    }

    if aspect not in libraries_by_aspect:
        raise ParameterError(f"Invalid GO aspect: {aspect}")

    try:
        import gseapy as gp

        go_terms = gp.get_library(
            libraries_by_aspect[aspect], organism=_get_organism_name(species)
        )
        return _filter_gene_sets_by_size(go_terms, min_size, max_size)

    except Exception as e:
        raise ProcessingError(f"Failed to load GO gene sets: {e}") from e
|
|
1540
|
+
|
|
1541
|
+
|
|
1542
|
+
def load_kegg_gene_sets(
    species: str, min_size: int = 10, max_size: int = 500
) -> dict[str, list[str]]:
    """
    Fetch the KEGG pathway library through gseapy and size-filter it.

    ``KEGG_2021_Human`` is used when ``species`` is 'human'
    (case-insensitive); every other value selects ``KEGG_2019_Mouse``.

    Parameters
    ----------
    species : str
        Species for gene sets ('human' or 'mouse')
    min_size : int
        Minimum gene set size
    max_size : int
        Maximum gene set size

    Returns
    -------
    Dict[str, List[str]]
        Dictionary of KEGG pathway gene sets

    Raises
    ------
    ProcessingError
        Wraps any exception raised while importing gseapy, fetching the
        library or filtering.
    """
    try:
        import gseapy as gp

        organism = _get_organism_name(species)
        library = (
            "KEGG_2021_Human" if species.lower() == "human" else "KEGG_2019_Mouse"
        )
        pathways = gp.get_library(library, organism=organism)

        return _filter_gene_sets_by_size(pathways, min_size, max_size)

    except Exception as e:
        raise ProcessingError(f"Failed to load KEGG pathways: {e}") from e
|
|
1578
|
+
|
|
1579
|
+
|
|
1580
|
+
def load_reactome_gene_sets(
    species: str, min_size: int = 10, max_size: int = 500
) -> dict[str, list[str]]:
    """
    Fetch the ``Reactome_2022`` library through gseapy and size-filter it.

    Parameters
    ----------
    species : str
        Species for gene sets ('human' or 'mouse')
    min_size : int
        Minimum gene set size
    max_size : int
        Maximum gene set size

    Returns
    -------
    Dict[str, List[str]]
        Dictionary of Reactome pathway gene sets

    Raises
    ------
    ProcessingError
        Wraps any exception raised while importing gseapy, fetching the
        library or filtering.
    """
    try:
        import gseapy as gp

        pathways = gp.get_library(
            "Reactome_2022", organism=_get_organism_name(species)
        )
        # Same size filter as the other loaders, for consistency
        return _filter_gene_sets_by_size(pathways, min_size, max_size)

    except Exception as e:
        raise ProcessingError(f"Failed to load Reactome pathways: {e}") from e
|
|
1612
|
+
|
|
1613
|
+
|
|
1614
|
+
def load_cell_marker_gene_sets(
    species: str, min_size: int = 5, max_size: int = 200
) -> dict[str, list[str]]:
    """
    Fetch the ``CellMarker_Augmented_2021`` library through gseapy and
    size-filter it.

    Parameters
    ----------
    species : str
        Species for gene sets ('human' or 'mouse')
    min_size : int
        Minimum gene set size
    max_size : int
        Maximum gene set size

    Returns
    -------
    Dict[str, List[str]]
        Dictionary of cell type marker gene sets

    Raises
    ------
    ProcessingError
        Wraps any exception raised while importing gseapy, fetching the
        library or filtering.
    """
    try:
        import gseapy as gp

        markers = gp.get_library(
            "CellMarker_Augmented_2021", organism=_get_organism_name(species)
        )
        return _filter_gene_sets_by_size(markers, min_size, max_size)

    except Exception as e:
        raise ProcessingError(f"Failed to load cell markers: {e}") from e
|
|
1646
|
+
|
|
1647
|
+
|
|
1648
|
+
def load_gene_sets(
    database: str,
    species: str = "human",
    min_genes: int = 10,
    max_genes: int = 500,
    ctx: "ToolContext" = None,
) -> dict[str, list[str]]:
    """
    Dispatch to the loader that matches a database name.

    Parameters
    ----------
    database : str
        Database name:
        - GO_Biological_Process, GO_Molecular_Function, GO_Cellular_Component
        - KEGG_Pathways
        - Reactome_Pathways
        - MSigDB_Hallmark
        - Cell_Type_Markers
    species : str
        Species ('human' or 'mouse')
    min_genes : int
        Minimum gene set size
    max_genes : int
        Maximum gene set size
    ctx : ToolContext
        MCP tool context; accepted for interface compatibility but not
        used inside this function

    Returns
    -------
    Dict[str, List[str]]
        Dictionary of gene sets

    Raises
    ------
    ParameterError
        If ``database`` is not one of the names listed above.
    """
    # Each entry pairs a loader with the positional arguments it receives;
    # only the selected loader is ever invoked.
    size_bounds = (min_genes, max_genes)
    loaders = {
        "GO_Biological_Process": (load_go_gene_sets, (species, "BP", *size_bounds)),
        "GO_Molecular_Function": (load_go_gene_sets, (species, "MF", *size_bounds)),
        "GO_Cellular_Component": (load_go_gene_sets, (species, "CC", *size_bounds)),
        "KEGG_Pathways": (load_kegg_gene_sets, (species, *size_bounds)),
        "Reactome_Pathways": (load_reactome_gene_sets, (species, *size_bounds)),
        "MSigDB_Hallmark": (load_msigdb_gene_sets, (species, "H", None, *size_bounds)),
        "Cell_Type_Markers": (load_cell_marker_gene_sets, (species, *size_bounds)),
    }

    if database not in loaders:
        raise ParameterError(
            f"Unknown database: {database}. Available: {list(loaders)}"
        )

    loader, loader_args = loaders[database]
    return loader(*loader_args)
|
|
1711
|
+
|
|
1712
|
+
|
|
1713
|
+
# ============================================================================
|
|
1714
|
+
# UNIFIED ENRICHMENT ANALYSIS ENTRY POINT
|
|
1715
|
+
# ============================================================================
|
|
1716
|
+
|
|
1717
|
+
|
|
1718
|
+
async def analyze_enrichment(
    data_id: str,
    ctx: "ToolContext",
    params: "EnrichmentParameters",
) -> EnrichmentResult:
    """
    Single entry point that runs one gene set enrichment method on a dataset.

    Steps, in order:
    - validate that ``params`` was supplied
    - fetch the dataset via ``ctx.get_adata``
    - take ``params.gene_sets``, or load them from
      ``params.gene_set_database`` when no gene sets were given
    - dispatch on ``params.method`` (spatial_enrichmap, pathway_gsea,
      pathway_ora, pathway_ssgsea, pathway_enrichr)
    - log a completion message through ``ctx.info``

    Args:
        data_id: Dataset ID
        ctx: ToolContext for data access and logging
        params: EnrichmentParameters with method, species, database, etc.

    Returns:
        EnrichmentResult produced by the selected method

    Raises:
        ParameterError: If params is None or params.method is not one of
            the handled methods
        ProcessingError: If gene set loading fails or no gene sets are
            available after loading
    """
    # Import here to avoid circular imports
    from ..utils.adata_utils import get_highly_variable_genes

    if params is None:
        raise ParameterError(
            "params parameter is required for enrichment analysis.\n"
            "You must provide EnrichmentParameters with at least 'species' specified.\n"
            "Example: params={'species': 'mouse', 'method': 'pathway_ora'}"
        )

    adata = await ctx.get_adata(data_id)

    # Explicit gene sets win; the database is consulted only without them
    gene_sets = params.gene_sets
    load_from_database = gene_sets is None and params.gene_set_database
    if load_from_database:
        await ctx.info(f"Loading gene sets from {params.gene_set_database}")
        try:
            gene_sets = load_gene_sets(
                database=params.gene_set_database,
                species=params.species,
                min_genes=params.min_genes,
                max_genes=params.max_genes,
                ctx=ctx,
            )
            await ctx.info(
                f"Loaded {len(gene_sets)} gene sets from {params.gene_set_database}"
            )
        except Exception as e:
            await ctx.error(f"Gene set database loading failed: {e}")
            raise ProcessingError(
                f"Failed to load gene sets from {params.gene_set_database}: {e}\n\n"
                f"SOLUTIONS:\n"
                f"1. Check your internet connection\n"
                f"2. Verify species parameter: '{params.species}'\n"
                f"3. Try a different database (KEGG_Pathways, GO_Biological_Process)\n"
                f"4. Provide custom gene sets via 'gene_sets' parameter"
            ) from e

    # Every method, including Enrichr, requires a non-empty collection here
    if gene_sets is None or len(gene_sets) == 0:
        raise ProcessingError(
            "No valid gene sets available. "
            "Please provide gene sets via 'gene_sets' parameter or "
            "specify a valid 'gene_set_database'."
        )

    # Run the requested method and remember which completion note to log
    method = params.method

    if method == "spatial_enrichmap":
        outcome = perform_spatial_enrichment(
            data_id=data_id,
            ctx=ctx,
            gene_sets=gene_sets,
            score_keys=params.score_keys,
            spatial_key=params.spatial_key,
            n_neighbors=params.n_neighbors,
            smoothing=params.smoothing,
            correct_spatial_covariates=params.correct_spatial_covariates,
            batch_key=params.batch_key,
            species=params.species,
            database=params.gene_set_database,
        )
        completion_note = (
            "Spatial enrichment complete. Use visualize_data with "
            "plot_type='pathway_enrichment' to visualize."
        )

    elif method == "pathway_gsea":
        outcome = perform_gsea(
            adata=adata,
            gene_sets=gene_sets,
            ranking_key=params.score_keys,
            permutation_num=params.n_permutations,
            min_size=params.min_genes,
            max_size=params.max_genes,
            species=params.species,
            database=params.gene_set_database,
            ctx=ctx,
        )
        completion_note = "GSEA complete. Use visualize_data to see results."

    elif method == "pathway_ora":
        outcome = perform_ora(
            adata=adata,
            gene_sets=gene_sets,
            pvalue_threshold=params.pvalue_cutoff,
            min_size=params.min_genes,
            max_size=params.max_genes,
            species=params.species,
            database=params.gene_set_database,
            ctx=ctx,
        )
        completion_note = "ORA complete. Use visualize_data to see results."

    elif method == "pathway_ssgsea":
        outcome = perform_ssgsea(
            adata=adata,
            gene_sets=gene_sets,
            min_size=params.min_genes,
            max_size=params.max_genes,
            species=params.species,
            database=params.gene_set_database,
            ctx=ctx,
        )
        completion_note = "ssGSEA complete. Use visualize_data to see results."

    elif method == "pathway_enrichr":
        # NOTE(review): Enrichr receives the database *name*, not the
        # loaded gene_sets dict — confirm this is intended.
        gene_list = get_highly_variable_genes(adata, max_genes=500)
        outcome = perform_enrichr(
            gene_list=gene_list,
            gene_sets=params.gene_set_database,
            organism=params.species,
            ctx=ctx,
        )
        completion_note = "Enrichr complete. Use visualize_data to see results."

    else:
        raise ParameterError(f"Unknown enrichment method: {method}")

    await ctx.info(completion_note)
    return outcome
|