chatspatial-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. chatspatial/__init__.py +11 -0
  2. chatspatial/__main__.py +141 -0
  3. chatspatial/cli/__init__.py +7 -0
  4. chatspatial/config.py +53 -0
  5. chatspatial/models/__init__.py +85 -0
  6. chatspatial/models/analysis.py +513 -0
  7. chatspatial/models/data.py +2462 -0
  8. chatspatial/server.py +1763 -0
  9. chatspatial/spatial_mcp_adapter.py +720 -0
  10. chatspatial/tools/__init__.py +3 -0
  11. chatspatial/tools/annotation.py +1903 -0
  12. chatspatial/tools/cell_communication.py +1603 -0
  13. chatspatial/tools/cnv_analysis.py +605 -0
  14. chatspatial/tools/condition_comparison.py +595 -0
  15. chatspatial/tools/deconvolution/__init__.py +402 -0
  16. chatspatial/tools/deconvolution/base.py +318 -0
  17. chatspatial/tools/deconvolution/card.py +244 -0
  18. chatspatial/tools/deconvolution/cell2location.py +326 -0
  19. chatspatial/tools/deconvolution/destvi.py +144 -0
  20. chatspatial/tools/deconvolution/flashdeconv.py +101 -0
  21. chatspatial/tools/deconvolution/rctd.py +317 -0
  22. chatspatial/tools/deconvolution/spotlight.py +216 -0
  23. chatspatial/tools/deconvolution/stereoscope.py +109 -0
  24. chatspatial/tools/deconvolution/tangram.py +135 -0
  25. chatspatial/tools/differential.py +625 -0
  26. chatspatial/tools/embeddings.py +298 -0
  27. chatspatial/tools/enrichment.py +1863 -0
  28. chatspatial/tools/integration.py +807 -0
  29. chatspatial/tools/preprocessing.py +723 -0
  30. chatspatial/tools/spatial_domains.py +808 -0
  31. chatspatial/tools/spatial_genes.py +836 -0
  32. chatspatial/tools/spatial_registration.py +441 -0
  33. chatspatial/tools/spatial_statistics.py +1476 -0
  34. chatspatial/tools/trajectory.py +495 -0
  35. chatspatial/tools/velocity.py +405 -0
  36. chatspatial/tools/visualization/__init__.py +155 -0
  37. chatspatial/tools/visualization/basic.py +393 -0
  38. chatspatial/tools/visualization/cell_comm.py +699 -0
  39. chatspatial/tools/visualization/cnv.py +320 -0
  40. chatspatial/tools/visualization/core.py +684 -0
  41. chatspatial/tools/visualization/deconvolution.py +852 -0
  42. chatspatial/tools/visualization/enrichment.py +660 -0
  43. chatspatial/tools/visualization/integration.py +205 -0
  44. chatspatial/tools/visualization/main.py +164 -0
  45. chatspatial/tools/visualization/multi_gene.py +739 -0
  46. chatspatial/tools/visualization/persistence.py +335 -0
  47. chatspatial/tools/visualization/spatial_stats.py +469 -0
  48. chatspatial/tools/visualization/trajectory.py +639 -0
  49. chatspatial/tools/visualization/velocity.py +411 -0
  50. chatspatial/utils/__init__.py +115 -0
  51. chatspatial/utils/adata_utils.py +1372 -0
  52. chatspatial/utils/compute.py +327 -0
  53. chatspatial/utils/data_loader.py +499 -0
  54. chatspatial/utils/dependency_manager.py +462 -0
  55. chatspatial/utils/device_utils.py +165 -0
  56. chatspatial/utils/exceptions.py +185 -0
  57. chatspatial/utils/image_utils.py +267 -0
  58. chatspatial/utils/mcp_utils.py +137 -0
  59. chatspatial/utils/path_utils.py +243 -0
  60. chatspatial/utils/persistence.py +78 -0
  61. chatspatial/utils/scipy_compat.py +143 -0
  62. chatspatial-1.1.0.dist-info/METADATA +242 -0
  63. chatspatial-1.1.0.dist-info/RECORD +67 -0
  64. chatspatial-1.1.0.dist-info/WHEEL +5 -0
  65. chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
  66. chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
  67. chatspatial-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1863 @@
1
+ """
2
+ Enrichment analysis tools for spatial transcriptomics data.
3
+
4
+ This module provides both standard and spatially-aware enrichment analysis methods:
5
+ - Standard methods: GSEA, ORA, ssGSEA, Enrichr (via gseapy)
6
+ - Spatial methods: EnrichMap-based spatial enrichment analysis
7
+ """
8
+
9
+ import logging
10
+ from typing import TYPE_CHECKING, Optional, Union
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+ from scipy import stats
15
+
16
+ if TYPE_CHECKING:
17
+ from ..models.data import EnrichmentParameters
18
+ from ..spatial_mcp_adapter import ToolContext
19
+ from statsmodels.stats.multitest import multipletests
20
+
21
+ from ..models.analysis import EnrichmentResult
22
+ from ..utils.adata_utils import store_analysis_metadata, to_dense
23
+ from ..utils.dependency_manager import require
24
+ from ..utils.exceptions import ParameterError, ProcessingError
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ============================================================================
30
+ # MCP RESPONSE OPTIMIZATION
31
+ # ============================================================================
32
+
33
+
34
+ def _filter_significant_statistics(
35
+ gene_set_statistics: dict,
36
+ enrichment_scores: dict,
37
+ pvalues: dict,
38
+ adjusted_pvalues: dict,
39
+ method: str,
40
+ fdr_threshold: Optional[float] = None,
41
+ ) -> tuple:
42
+ """
43
+ Filter all enrichment result dictionaries to only include significant pathways.
44
+
45
+ This dramatically reduces MCP response size for large gene set databases
46
+ (e.g., KEGG 311 pathways, GO 10,000 terms) while preserving all important
47
+ information for users.
48
+
49
+ Args:
50
+ gene_set_statistics: Full statistics for all gene sets
51
+ enrichment_scores: Enrichment scores for all gene sets
52
+ pvalues: P-values for all gene sets
53
+ adjusted_pvalues: FDR-corrected p-values for all gene sets
54
+ method: Enrichment method used ('gsea', 'ora', 'enrichr', 'ssgsea')
55
+ fdr_threshold: FDR threshold for significance (default: None for method-based auto)
56
+ Method-based defaults (based on statistical best practices):
57
+ - GSEA: FDR < 0.25 (official recommendation from Subramanian et al. 2005)
58
+ - ORA/Enrichr: FDR < 0.05 (standard statistical threshold)
59
+ - ssGSEA: No filtering (no p-values produced)
60
+
61
+ Returns:
62
+ Tuple of (filtered_statistics, filtered_scores, filtered_pvals, filtered_adj_pvals)
63
+
64
+ Example:
65
+ Before: 311 pathways × 4 dicts × 100 chars = 124KB (KEGG)
66
+ After: ~15 significant pathways × 4 dicts × 100 chars = 6KB (95% reduction)
67
+
68
+ References:
69
+ - GSEA: Subramanian et al. (2005) PNAS 102(43):15545-15550
70
+ "We recommend an FDR cutoff of 25% when dealing with a single database"
71
+ - ORA: Standard multiple testing correction threshold (Benjamini & Hochberg 1995)
72
+ """
73
+ if not adjusted_pvalues:
74
+ # No p-values available (e.g., ssGSEA), return all results without filtering
75
+ return gene_set_statistics, enrichment_scores, pvalues, adjusted_pvalues
76
+
77
+ # Auto-determine threshold based on ANALYSIS METHOD if not specified
78
+ # This is statistically principled: different methods have different FDR standards
79
+ if fdr_threshold is None:
80
+ method_lower = method.lower()
81
+ if method_lower == "gsea":
82
+ # GSEA official recommendation: FDR < 0.25
83
+ # From Subramanian et al. 2005: "An FDR of 25% indicates that the result
84
+ # is likely to be valid 3 out of 4 times"
85
+ fdr_threshold = 0.25
86
+ elif method_lower in ("ora", "enrichr", "pathway_ora", "pathway_enrichr"):
87
+ # ORA and Enrichr: standard statistical threshold
88
+ # Based on Benjamini-Hochberg FDR control at 5%
89
+ fdr_threshold = 0.05
90
+ else:
91
+ # Default fallback for unknown methods
92
+ fdr_threshold = 0.05
93
+
94
+ # Find significant pathways
95
+ significant = {
96
+ name
97
+ for name, fdr in adjusted_pvalues.items()
98
+ if fdr is not None and fdr < fdr_threshold
99
+ }
100
+
101
+ # Filter all dictionaries
102
+ filtered_stats = {
103
+ name: stats
104
+ for name, stats in gene_set_statistics.items()
105
+ if name in significant
106
+ }
107
+
108
+ filtered_scores = {
109
+ name: score for name, score in enrichment_scores.items() if name in significant
110
+ }
111
+
112
+ filtered_pvals = {
113
+ name: pval for name, pval in pvalues.items() if name in significant
114
+ }
115
+
116
+ filtered_adj_pvals = {
117
+ name: adj_pval
118
+ for name, adj_pval in adjusted_pvalues.items()
119
+ if name in significant
120
+ }
121
+
122
+ return filtered_stats, filtered_scores, filtered_pvals, filtered_adj_pvals
123
+
124
+
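A toy illustration of the filtering behaviour (values are made up; with `method="gsea"` the 0.25 FDR cutoff applies, with `method="ora"` it would be 0.05):

```python
stats_all = {"PATHWAY_A": {"nes": 2.1}, "PATHWAY_B": {"nes": 0.3}}
scores = {"PATHWAY_A": 0.62, "PATHWAY_B": 0.05}
pvals = {"PATHWAY_A": 0.001, "PATHWAY_B": 0.40}
fdrs = {"PATHWAY_A": 0.01, "PATHWAY_B": 0.55}

kept_stats, kept_scores, kept_p, kept_fdr = _filter_significant_statistics(
    stats_all, scores, pvals, fdrs, method="gsea"
)
# Only PATHWAY_A survives (FDR 0.01 < 0.25); all four returned dicts share its key.
```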
125
+ # ============================================================================
126
+ # GENE SET UTILITIES
127
+ # ============================================================================
128
+
129
+
130
+ def _filter_gene_sets_by_size(
131
+ gene_sets: dict[str, list[str]], min_size: int, max_size: int
132
+ ) -> dict[str, list[str]]:
133
+ """
134
+ Filter gene sets by size constraints.
135
+
136
+ Parameters
137
+ ----------
138
+ gene_sets : Dict[str, List[str]]
139
+ Dictionary mapping gene set names to gene lists
140
+ min_size : int
141
+ Minimum number of genes required
142
+ max_size : int
143
+ Maximum number of genes allowed
144
+
145
+ Returns
146
+ -------
147
+ Dict[str, List[str]]
148
+ Filtered gene sets within size constraints
149
+ """
150
+ return {
151
+ name: genes
152
+ for name, genes in gene_sets.items()
153
+ if min_size <= len(genes) <= max_size
154
+ }
155
+
156
+
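For example, with illustrative gene sets:

```python
gene_sets = {
    "tiny": ["GENE1"],
    "ok": ["GENE1", "GENE2", "GENE3"],
    "huge": ["GENE%d" % i for i in range(600)],
}
_filter_gene_sets_by_size(gene_sets, min_size=2, max_size=500)
# -> {"ok": ["GENE1", "GENE2", "GENE3"]}
```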
157
+ # ============================================================================
158
+ # SPARSE MATRIX UTILITIES
159
+ # ============================================================================
160
+
161
+
162
+ def _compute_std_sparse_compatible(X, axis=0, ddof=1):
163
+ """
164
+ Compute standard deviation compatible with both dense and sparse matrices.
165
+
166
+ For sparse matrices, uses the formula: std = sqrt(E[X^2] - E[X]^2) with Bessel correction.
167
+ For dense matrices, uses numpy's built-in std method.
168
+
169
+ Args:
170
+ X: Input matrix (can be sparse or dense)
171
+ axis: Axis along which to compute std (0 for columns, 1 for rows)
172
+ ddof: Delta Degrees of Freedom for Bessel correction (default: 1)
173
+
174
+ Returns:
175
+ 1D numpy array of standard deviations
176
+ """
177
+ import scipy.sparse as sp
178
+
179
+ if sp.issparse(X):
180
+ # Sparse matrix: use mathematical formula
181
+ n = X.shape[axis]
182
+ mean = np.array(X.mean(axis=axis)).flatten()
183
+ mean_of_squares = np.array(X.power(2).mean(axis=axis)).flatten()
184
+
185
+ # Compute variance with Bessel correction: n/(n-ddof)
186
+ variance = mean_of_squares - np.power(mean, 2)
187
+ variance = np.maximum(variance, 0) # Avoid numerical errors
188
+ if ddof > 0:
189
+ variance = variance * n / (n - ddof) # Bessel correction
190
+
191
+ return np.sqrt(variance)
192
+ else:
193
+ # Dense matrix: use numpy's built-in method
194
+ return np.array(X.std(axis=axis, ddof=ddof)).flatten()
195
+
196
+
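The sparse path is algebraically equivalent to numpy's unbiased standard deviation; a quick check on a small matrix (illustrative only):

```python
import numpy as np
import scipy.sparse as sp

X_dense = np.array([[0.0, 1.0, 2.0], [2.0, 3.0, 0.0], [4.0, 0.0, 1.0]])
X_sparse = sp.csr_matrix(X_dense)

np.allclose(
    _compute_std_sparse_compatible(X_sparse, axis=0, ddof=1),
    X_dense.std(axis=0, ddof=1),
)  # True
```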
197
+ # ============================================================================
198
+ # GENE FORMAT CONVERSION UTILITIES
199
+ # ============================================================================
200
+
201
+
202
+ def _convert_gene_format_for_matching(
203
+ pathway_genes: list[str], dataset_genes: set, species: str
204
+ ) -> tuple[list[str], dict[str, str]]:
205
+ """
206
+ Rule-based gene format conversion to match dataset format.
207
+
208
+ Handles common gene format variations between pathway databases and datasets:
209
+ - Uppercase (GENE) vs Title case (Gene) vs lowercase (gene)
210
+ - Species-specific formatting rules
211
+ - Special prefixes like Gm/GM/gm for mouse genes
212
+
213
+ Args:
214
+ pathway_genes: Gene names from pathway database (usually uppercase from gseapy)
215
+ dataset_genes: Available gene names in dataset
216
+ species: Species specified by user ("mouse" or "human")
217
+
218
+ Returns:
219
+ (dataset_format_genes, conversion_map)
220
+ dataset_format_genes: Gene names in dataset format that can be found
221
+ conversion_map: Maps dataset_format -> original_pathway_format
222
+ """
223
+ dataset_format_genes = []
224
+ conversion_map = {}
225
+
226
+ for gene in pathway_genes:
227
+ # Try direct match first
228
+ if gene in dataset_genes:
229
+ dataset_format_genes.append(gene)
230
+ conversion_map[gene] = gene
231
+ continue
232
+
233
+ # Apply multiple format conversion rules
234
+ format_variations = []
235
+
236
+ if species == "mouse":
237
+ # Mouse-specific format rules (order matters for efficiency)
238
+ # Rule 1: Title case (most common): Cd5l, Gbp2b
239
+ if len(gene) > 1:
240
+ format_variations.append(gene[0].upper() + gene[1:].lower())
241
+ # Rule 2: All lowercase: cd5l, gbp2b
242
+ format_variations.append(gene.lower())
243
+ # Rule 3: All uppercase: CD5L, GBP2B
244
+ format_variations.append(gene.upper())
245
+ # Rule 4: Capitalize first letter only
246
+ format_variations.append(gene.capitalize())
247
+
248
+ # Special rule for Gm-prefixed genes (common in mouse)
249
+ if gene.upper().startswith("GM"):
250
+ format_variations.extend(
251
+ [
252
+ "gm" + gene[2:].lower(), # gm42418
253
+ "Gm" + gene[2:].lower(), # Gm42418
254
+ "GM" + gene[2:].upper(), # GM42418
255
+ ]
256
+ )
257
+
258
+ elif species == "human":
259
+ # Human-specific format rules
260
+ # Rule 1: All uppercase (most common): HES1, FABP4
261
+ format_variations.append(gene.upper())
262
+ # Rule 2: All lowercase: hes1, fabp4
263
+ format_variations.append(gene.lower())
264
+ # Rule 3: Capitalize first letter
265
+ format_variations.append(gene.capitalize())
266
+
267
+ # Remove duplicates while preserving order
268
+ seen = set()
269
+ unique_variations = []
270
+ for variation in format_variations:
271
+ if variation not in seen and variation != gene: # Skip if same as original
272
+ seen.add(variation)
273
+ unique_variations.append(variation)
274
+
275
+ # Try each format variation against dataset
276
+ for variant in unique_variations:
277
+ if variant in dataset_genes:
278
+ dataset_format_genes.append(variant) # Use dataset's actual format
279
+ conversion_map[variant] = gene
280
+ break # Stop after first match
281
+
282
+ return dataset_format_genes, conversion_map
283
+
284
+
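For example, uppercase symbols from a pathway database matched against a mouse dataset stored in title case (gene names here are illustrative):

```python
pathway_genes = ["CD5L", "GM42418", "NOTAREALGENE"]
dataset_genes = {"Cd5l", "Gm42418", "Actb"}

matched, mapping = _convert_gene_format_for_matching(
    pathway_genes, dataset_genes, species="mouse"
)
# matched -> ["Cd5l", "Gm42418"]   (dataset spelling)
# mapping -> {"Cd5l": "CD5L", "Gm42418": "GM42418"}; the unmatched gene is dropped.
```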
285
+ # ============================================================================
286
+ # ENRICHR DATABASE MAPPING
287
+ # ============================================================================
288
+
289
+
290
+ def map_gene_set_database_to_enrichr_library(database_name: str, species: str) -> str:
291
+ """Map user-friendly database names to actual Enrichr library names.
292
+
293
+ Args:
294
+ database_name: User-friendly database name from MCP interface
295
+ species: Species ('human', 'mouse', or 'zebrafish')
296
+
297
+ Returns:
298
+ Actual Enrichr library name
299
+
300
+ Raises:
301
+ ParameterError: If database_name is not supported
302
+ """
303
+ mapping = {
304
+ "GO_Biological_Process": "GO_Biological_Process_2025",
305
+ "GO_Molecular_Function": "GO_Molecular_Function_2025",
306
+ "GO_Cellular_Component": "GO_Cellular_Component_2025",
307
+ "KEGG_Pathways": (
308
+ "KEGG_2021_Human" if species.lower() == "human" else "KEGG_2019_Mouse"
309
+ ),
310
+ "Reactome_Pathways": "Reactome_Pathways_2024",
311
+ "MSigDB_Hallmark": "MSigDB_Hallmark_2020",
312
+ "Cell_Type_Markers": "CellMarker_Augmented_2021",
313
+ }
314
+
315
+ if database_name not in mapping:
316
+ available_options = list(mapping)
317
+ raise ParameterError(
318
+ f"Unknown gene set database: {database_name}. "
319
+ f"Available options: {available_options}"
320
+ )
321
+
322
+ return mapping[database_name]
323
+
324
+
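How the mapping resolves a few common requests:

```python
map_gene_set_database_to_enrichr_library("KEGG_Pathways", "human")
# -> "KEGG_2021_Human"
map_gene_set_database_to_enrichr_library("KEGG_Pathways", "mouse")
# -> "KEGG_2019_Mouse"
map_gene_set_database_to_enrichr_library("Cell_Type_Markers", "human")
# -> "CellMarker_Augmented_2021"; unknown names raise ParameterError.
```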
325
+ # ============================================================================
326
+ # ENRICHMENT ANALYSIS FUNCTIONS
327
+ # ============================================================================
328
+
329
+
330
+ def perform_gsea(
331
+ adata,
332
+ gene_sets: dict[str, list[str]],
333
+ ranking_key: Optional[str] = None,
334
+ method: str = "signal_to_noise",
335
+ permutation_num: int = 1000,
336
+ min_size: int = 10,
337
+ max_size: int = 500,
338
+ species: Optional[str] = None,
339
+ database: Optional[str] = None,
340
+ ctx: "ToolContext" = None,
341
+ ) -> "EnrichmentResult":
342
+ """
343
+ Perform Gene Set Enrichment Analysis (GSEA).
344
+
345
+ Parameters
346
+ ----------
347
+ adata : AnnData
348
+ Annotated data matrix
349
+ gene_sets : Dict[str, List[str]]
350
+ Gene sets to test
351
+ ranking_key : Optional[str]
352
+ Key in adata.var for pre-computed ranking. If None, compute from expression
353
+ method : str
354
+ Ranking-metric label recorded in the analysis metadata (when ranking_key is None the ranking itself is derived automatically from the data)
355
+ permutation_num : int
356
+ Number of permutations
357
+ min_size : int
358
+ Minimum gene set size
359
+ max_size : int
360
+ Maximum gene set size
361
+ species : Optional[str]
362
+ Species for the analysis (e.g., 'mouse', 'human')
363
+ database : Optional[str]
364
+ Gene set database used (e.g., 'KEGG_Pathways', 'GO_Biological_Process')
365
+ ctx : ToolContext
366
+ MCP tool context for logging
367
+
368
+ Returns
369
+ -------
370
+ EnrichmentResult with enrichment scores, p-values, adjusted p-values and per-gene-set statistics (filtered to significant sets)
371
+ """
372
+ require("gseapy", ctx, feature="GSEA analysis")
373
+ import gseapy as gp
374
+
375
+ # Prepare ranking
376
+ if ranking_key and ranking_key in adata.var:
377
+ # Use pre-computed ranking
378
+ ranking = adata.var[ranking_key].to_dict()
379
+ else:
380
+ # Compute ranking from expression data
381
+ # Use raw data when available and not log-normalized (full gene set for GSEA)
382
+ # IMPORTANT: Keep X and var_names consistent to avoid dimension mismatch
383
+ if "log1p" not in adata.uns and adata.raw is not None:
384
+ X = adata.raw.X
385
+ var_names = adata.raw.var_names
386
+ else:
387
+ X = adata.X
388
+ var_names = adata.var_names
389
+
390
+ # Compute gene ranking metric
391
+ # IMPORTANT: GSEA requires biologically meaningful ranking, not just variance
392
+ # Reference: Subramanian et al. (2005) PNAS, GSEA-MSIGDB documentation
393
+
394
+ if "condition" in adata.obs or "group" in adata.obs:
395
+ group_key = "condition" if "condition" in adata.obs else "group"
396
+ groups = adata.obs[group_key].unique()
397
+
398
+ if len(groups) == 2:
399
+ # Binary comparison: Use Signal-to-Noise Ratio (GSEA default)
400
+ # S2N = (μ1 - μ2) / (σ1 + σ2)
401
+ # This captures both differential expression AND expression stability
402
+ group1_mask = adata.obs[group_key] == groups[0]
403
+ group2_mask = adata.obs[group_key] == groups[1]
404
+
405
+ # Compute means
406
+ mean1 = np.array(X[group1_mask, :].mean(axis=0)).flatten()
407
+ mean2 = np.array(X[group2_mask, :].mean(axis=0)).flatten()
408
+
409
+ # Compute standard deviations (sparse-compatible)
410
+ std1 = _compute_std_sparse_compatible(X[group1_mask, :], axis=0, ddof=1)
411
+ std2 = _compute_std_sparse_compatible(X[group2_mask, :], axis=0, ddof=1)
412
+
413
+ # Apply minimum std threshold (GSEA standard: 0.2 * |mean|)
414
+ # This prevents division by zero and reduces noise from low-variance genes
415
+ min_std_factor = 0.2
416
+ std1 = np.maximum(std1, min_std_factor * np.abs(mean1))
417
+ std2 = np.maximum(std2, min_std_factor * np.abs(mean2))
418
+
419
+ # Compute Signal-to-Noise Ratio
420
+ s2n = (mean1 - mean2) / (std1 + std2)
421
+ ranking = dict(zip(var_names, s2n, strict=True))
422
+
423
+ else:
424
+ # Multi-group: Use Coefficient of Variation (normalized variance)
425
+ # CV = σ / μ - accounts for mean-variance relationship
426
+ # This is more appropriate than raw variance for genes with different expression levels
427
+ mean = np.array(X.mean(axis=0)).flatten()
428
+ std = _compute_std_sparse_compatible(X, axis=0, ddof=1)
429
+
430
+ # Compute CV (avoid division by zero)
431
+ cv = np.zeros_like(mean)
432
+ nonzero_mask = np.abs(mean) > 1e-10
433
+ cv[nonzero_mask] = std[nonzero_mask] / np.abs(mean[nonzero_mask])
434
+
435
+ ranking = dict(zip(var_names, cv, strict=False))
436
+ else:
437
+ # No group information: Use best available ranking method
438
+ if "highly_variable_rank" in adata.var:
439
+ # Prefer pre-computed HVG ranking (most robust)
440
+ ranking = adata.var["highly_variable_rank"].to_dict()
441
+ elif "dispersions_norm" in adata.var:
442
+ # Use Seurat-style normalized dispersion
443
+ ranking = adata.var["dispersions_norm"].to_dict()
444
+ else:
445
+ # Fallback: Coefficient of Variation (better than raw variance)
446
+ # Use sparse-compatible std calculation
447
+ mean = np.array(X.mean(axis=0)).flatten()
448
+ std = _compute_std_sparse_compatible(X, axis=0, ddof=1)
449
+
450
+ cv = np.zeros_like(mean)
451
+ nonzero_mask = np.abs(mean) > 1e-10
452
+ cv[nonzero_mask] = std[nonzero_mask] / np.abs(mean[nonzero_mask])
453
+
454
+ ranking = dict(zip(var_names, cv, strict=False))
455
+
456
+ # Run GSEA preranked
457
+ try:
458
+ # Convert ranking dict to DataFrame for gseapy
459
+ ranking_df = pd.DataFrame.from_dict(ranking, orient="index", columns=["score"])
460
+ ranking_df.index.name = "gene"
461
+ ranking_df = ranking_df.sort_values("score", ascending=False)
462
+
463
+ res = gp.prerank(
464
+ rnk=ranking_df, # Pass DataFrame instead of dict
465
+ gene_sets=gene_sets,
466
+ processes=1,
467
+ permutation_num=permutation_num,
468
+ min_size=min_size,
469
+ max_size=max_size,
470
+ seed=42,
471
+ verbose=False,
472
+ no_plot=True,
473
+ outdir=None,
474
+ )
475
+
476
+ # Extract results
477
+ results_df = res.res2d
478
+
479
+ # Prepare output
480
+ enrichment_scores = {}
481
+ pvalues = {}
482
+ adjusted_pvalues = {}
483
+ gene_set_statistics = {}
484
+
485
+ for _idx, row in results_df.iterrows():
486
+ term = row["Term"]
487
+ enrichment_scores[term] = row["ES"]
488
+ pvalues[term] = row["NOM p-val"]
489
+ adjusted_pvalues[term] = row["FDR q-val"]
490
+ gene_set_statistics[term] = {
491
+ "es": row["ES"],
492
+ "nes": row["NES"],
493
+ "pval": row["NOM p-val"],
494
+ "fdr": row["FDR q-val"],
495
+ "size": row.get(
496
+ "Matched_size", row.get("Gene %", 0)
497
+ ), # Different versions use different column names
498
+ "lead_genes": (
499
+ row.get("Lead_genes", "").split(";")[:10]
500
+ if "Lead_genes" in row
501
+ else []
502
+ ),
503
+ }
504
+
505
+ # Get top enriched and depleted
506
+ results_df_sorted = results_df.sort_values("NES", ascending=False)
507
+ top_enriched = (
508
+ results_df_sorted[results_df_sorted["NES"] > 0].head(10)["Term"].tolist()
509
+ )
510
+ top_depleted = (
511
+ results_df_sorted[results_df_sorted["NES"] < 0].head(10)["Term"].tolist()
512
+ )
513
+
514
+ # Save results to adata.uns for visualization
515
+ # Store full results DataFrame for visualization
516
+ adata.uns["gsea_results"] = results_df
517
+
518
+ # Store gene set membership for validation
519
+ adata.uns["enrichment_gene_sets"] = gene_sets
520
+
521
+ # Store metadata for scientific provenance tracking
522
+ store_analysis_metadata(
523
+ adata,
524
+ analysis_name="enrichment_gsea",
525
+ method="gsea",
526
+ parameters={
527
+ "permutation_num": permutation_num,
528
+ "ranking_method": method,
529
+ "min_size": min_size,
530
+ "max_size": max_size,
531
+ "ranking_key": ranking_key,
532
+ },
533
+ results_keys={"uns": ["gsea_results", "enrichment_gene_sets"]},
534
+ statistics={
535
+ "n_gene_sets": len(gene_sets),
536
+ "n_significant": len(results_df[results_df["FDR q-val"] < 0.05]),
537
+ },
538
+ species=species,
539
+ database=database,
540
+ )
541
+
542
+ # Filter all result dictionaries to only significant pathways (reduces MCP response size)
543
+ # Uses method-based FDR threshold: GSEA = 0.25 (Subramanian et al. 2005)
544
+ (
545
+ filtered_statistics,
546
+ filtered_scores,
547
+ filtered_pvals,
548
+ filtered_adj_pvals,
549
+ ) = _filter_significant_statistics(
550
+ gene_set_statistics,
551
+ enrichment_scores,
552
+ pvalues,
553
+ adjusted_pvalues,
554
+ method="gsea", # Method-based FDR: 0.25 for GSEA
555
+ )
556
+
557
+ return EnrichmentResult(
558
+ method="gsea",
559
+ n_gene_sets=len(gene_sets),
560
+ n_significant=len(results_df[results_df["FDR q-val"] < 0.05]),
561
+ enrichment_scores=filtered_scores,
562
+ pvalues=filtered_pvals,
563
+ adjusted_pvalues=filtered_adj_pvals,
564
+ gene_set_statistics=filtered_statistics,
565
+ top_gene_sets=top_enriched,
566
+ top_depleted_sets=top_depleted,
567
+ )
568
+
569
+ except Exception as e:
570
+ logger.error(f"GSEA failed: {e}")
571
+ raise
572
+
573
+
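A minimal usage sketch, assuming `adata` is a preprocessed AnnData whose `obs["condition"]` has exactly two levels (so the signal-to-noise branch is used) and that `ctx` may be omitted outside the MCP server:

```python
hallmark_sets = load_msigdb_gene_sets(species="human", collection="H")

result = perform_gsea(
    adata,
    gene_sets=hallmark_sets,
    permutation_num=1000,
    min_size=10,
    max_size=500,
    species="human",
    database="MSigDB_Hallmark",
)
print(result.n_significant)        # gene sets with FDR q-val < 0.05
print(result.top_gene_sets[:3])    # top positively enriched terms by NES
```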
574
+ def perform_ora(
575
+ adata,
576
+ gene_sets: dict[str, list[str]],
577
+ gene_list: Optional[list[str]] = None,
578
+ pvalue_threshold: float = 0.05,
579
+ min_size: int = 10,
580
+ max_size: int = 500,
581
+ species: Optional[str] = None,
582
+ database: Optional[str] = None,
583
+ ctx: "ToolContext" = None,
584
+ ) -> "EnrichmentResult":
585
+ """
586
+ Perform Over-Representation Analysis (ORA).
587
+
588
+ Parameters
589
+ ----------
590
+ adata : AnnData
591
+ Annotated data matrix
592
+ gene_sets : Dict[str, List[str]]
593
+ Gene sets to test
594
+ gene_list : Optional[List[str]]
595
+ List of genes to test. If None, use DEGs from rank_genes_groups
596
+ pvalue_threshold : float
597
+ P-value threshold for selecting DEGs (only used if rank_genes_groups exists)
598
+ min_size : int
599
+ Minimum gene set size
600
+ max_size : int
601
+ Maximum gene set size
602
+ species : Optional[str]
603
+ Species for the analysis (e.g., 'mouse', 'human')
604
+ database : Optional[str]
605
+ Gene set database used (e.g., 'KEGG_Pathways', 'GO_Biological_Process')
606
+ ctx : ToolContext
607
+ MCP tool context for logging
608
+
609
+ Returns
610
+ -------
611
+ EnrichmentResult with odds ratios, p-values, adjusted p-values and per-gene-set statistics (filtered to significant sets)
612
+
613
+ Notes
614
+ -----
615
+ LogFC filtering removed: ORA should use genes pre-filtered by find_markers.
616
+ Different statistical methods (Wilcoxon, t-test) produce different logFC scales,
617
+ making a fixed threshold inappropriate. Gene filtering is the responsibility of
618
+ differential expression analysis, not enrichment analysis.
619
+ """
620
+ # Get gene list if not provided
621
+ if gene_list is None:
622
+ # Try to get DEGs from adata
623
+ if "rank_genes_groups" in adata.uns:
624
+ # Get DEGs
625
+ result = adata.uns["rank_genes_groups"]
626
+ names = result["names"]
627
+
628
+ # Check if pvals exist (not all rank_genes_groups have pvals)
629
+ pvals = None
630
+ if "pvals_adj" in result:
631
+ pvals = result["pvals_adj"]
632
+ elif "pvals" in result:
633
+ pvals = result["pvals"]
634
+
635
+ # Get DEGs from all groups and merge
636
+ # IMPORTANT: names is a numpy recarray with shape (n_genes,)
637
+ # and dtype.names contains group names as fields
638
+ # Access genes by group name: names[group_name][i]
639
+ degs_set = set() # Use set for O(1) duplicate check
640
+
641
+ # Iterate over all groups
642
+ for group_name in names.dtype.names:
643
+ for i in range(len(names)):
644
+ # Skip genes that don't pass filter criteria
645
+ if pvals is not None and pvals[group_name][i] >= pvalue_threshold:
646
+ continue
647
+ if pvals is None and i >= 100: # Top 100 genes when no pvals
648
+ continue
649
+
650
+ degs_set.add(names[group_name][i])
651
+
652
+ gene_list = list(degs_set)
653
+ else:
654
+ # Use highly variable genes
655
+ if "highly_variable" in adata.var:
656
+ gene_list = adata.var_names[adata.var["highly_variable"]].tolist()
657
+ else:
658
+ # Use top variable genes (based on Coefficient of Variation)
659
+ # CV = σ/μ is more appropriate than raw variance
660
+ mean = np.array(adata.X.mean(axis=0)).flatten()
661
+ std = _compute_std_sparse_compatible(adata.X, axis=0, ddof=1)
662
+
663
+ # Compute CV (avoid division by zero)
664
+ cv = np.zeros_like(mean)
665
+ nonzero_mask = np.abs(mean) > 1e-10
666
+ cv[nonzero_mask] = std[nonzero_mask] / np.abs(mean[nonzero_mask])
667
+
668
+ top_indices = np.argsort(cv)[-500:]
669
+ gene_list = adata.var_names[top_indices].tolist()
670
+
671
+ # Background genes
672
+ # IMPORTANT: Use adata.raw if available, as rank_genes_groups may have been
673
+ # run on raw data with different gene name casing (e.g., uppercase)
674
+ # while filtered adata.var_names may be lowercase
675
+ if adata.raw is not None:
676
+ background_genes = set(adata.raw.var_names)
677
+ else:
678
+ background_genes = set(adata.var_names)
679
+
680
+ # Case-insensitive matching as fallback for gene name format differences
681
+ # (e.g., MT.CO1 vs MT-CO1, uppercase vs lowercase)
682
+ query_genes = set(gene_list) & background_genes
683
+
684
+ # If no direct matches, try case-insensitive matching
685
+ if len(query_genes) == 0 and len(gene_list) > 0:
686
+ # Create case-insensitive lookup
687
+ gene_name_map = {g.upper(): g for g in background_genes}
688
+ query_genes = set()
689
+ for gene in gene_list:
690
+ if gene.upper() in gene_name_map:
691
+ query_genes.add(gene_name_map[gene.upper()])
692
+
693
+ # Perform hypergeometric test for each gene set
694
+ enrichment_scores = {}
695
+ pvalues = {}
696
+ gene_set_statistics = {}
697
+
698
+ for gs_name, gs_genes in gene_sets.items():
699
+ gs_genes_set = set(gs_genes) & background_genes
700
+
701
+ if len(gs_genes_set) < min_size or len(gs_genes_set) > max_size:
702
+ continue
703
+
704
+ # Hypergeometric test
705
+ # a: genes in both query and gene set
706
+ # b: genes in query but not in gene set
707
+ # c: genes in gene set but not in query
708
+ # d: genes in neither
709
+
710
+ a = len(query_genes & gs_genes_set)
711
+ b = len(query_genes - gs_genes_set)
712
+ c = len(gs_genes_set - query_genes)
713
+ d = len(background_genes - query_genes - gs_genes_set)
714
+
715
+ # Fisher's exact test
716
+ odds_ratio, p_value = stats.fisher_exact(
717
+ [[a, b], [c, d]], alternative="greater"
718
+ )
719
+
720
+ enrichment_scores[gs_name] = odds_ratio
721
+ pvalues[gs_name] = p_value
722
+
723
+ gene_set_statistics[gs_name] = {
724
+ "odds_ratio": odds_ratio,
725
+ "pval": p_value,
726
+ "overlap": a,
727
+ "query_size": len(query_genes),
728
+ "gs_size": len(gs_genes_set),
729
+ "overlapping_genes": list(query_genes & gs_genes_set)[:20], # Top 20
730
+ }
731
+
732
+ # Multiple testing correction
733
+ if pvalues:
734
+ pval_array = np.array(list(pvalues.values()))
735
+ _, adjusted_pvals, _, _ = multipletests(pval_array, method="fdr_bh")
736
+ adjusted_pvalues = dict(zip(pvalues.keys(), adjusted_pvals, strict=False))
737
+ else:
738
+ adjusted_pvalues = {}
739
+
740
+ # Get top results
741
+ sorted_by_pval = sorted(pvalues.items(), key=lambda x: x[1])
742
+ top_gene_sets = [x[0] for x in sorted_by_pval[:10]]
743
+
744
+ # Save results to adata.uns for visualization
745
+ # Create DataFrame for visualization compatibility
746
+ ora_df = pd.DataFrame(
747
+ {
748
+ "pathway": list(enrichment_scores),
749
+ "odds_ratio": list(enrichment_scores.values()),
750
+ "pvalue": [pvalues.get(k, 1.0) for k in enrichment_scores],
751
+ "adjusted_pvalue": [
752
+ adjusted_pvalues.get(k, 1.0) for k in enrichment_scores
753
+ ],
754
+ }
755
+ )
756
+ ora_df["NES"] = ora_df["odds_ratio"] # Use odds_ratio as score for visualization
757
+ ora_df = ora_df.sort_values("pvalue")
758
+
759
+ adata.uns["ora_results"] = ora_df
760
+ adata.uns["gsea_results"] = (
761
+ ora_df # Also save as gsea_results for visualization compatibility
762
+ )
763
+
764
+ # Store gene set membership for validation
765
+ adata.uns["enrichment_gene_sets"] = gene_sets
766
+
767
+ # Store metadata for scientific provenance tracking
768
+ store_analysis_metadata(
769
+ adata,
770
+ analysis_name="enrichment_ora",
771
+ method="ora",
772
+ parameters={
773
+ "pvalue_threshold": pvalue_threshold,
774
+ "min_size": min_size,
775
+ "max_size": max_size,
776
+ "n_query_genes": len(query_genes),
777
+ },
778
+ results_keys={"uns": ["ora_results", "enrichment_gene_sets"]},
779
+ statistics={
780
+ "n_gene_sets": len(gene_sets),
781
+ "n_significant": sum(
782
+ 1 for p in adjusted_pvalues.values() if p is not None and p < 0.05
783
+ ),
784
+ "n_query_genes": len(query_genes),
785
+ },
786
+ species=species,
787
+ database=database,
788
+ )
789
+
790
+ # Filter all result dictionaries to only significant pathways (reduces MCP response size)
791
+ # Uses method-based FDR threshold: ORA = 0.05 (standard statistical threshold)
792
+ (
793
+ filtered_statistics,
794
+ filtered_scores,
795
+ filtered_pvals,
796
+ filtered_adj_pvals,
797
+ ) = _filter_significant_statistics(
798
+ gene_set_statistics,
799
+ enrichment_scores,
800
+ pvalues,
801
+ adjusted_pvalues,
802
+ method="ora", # Method-based FDR: 0.05 for ORA
803
+ )
804
+
805
+ return EnrichmentResult(
806
+ method="ora",
807
+ n_gene_sets=len(gene_sets),
808
+ n_significant=sum(
809
+ 1 for p in adjusted_pvalues.values() if p is not None and p < 0.05
810
+ ),
811
+ enrichment_scores=filtered_scores,
812
+ pvalues=filtered_pvals,
813
+ adjusted_pvalues=filtered_adj_pvals,
814
+ gene_set_statistics=filtered_statistics,
815
+ top_gene_sets=top_gene_sets,
816
+ top_depleted_sets=[], # ORA does not produce depleted gene sets
817
+ )
818
+
819
+
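The hypergeometric step above is a one-sided Fisher's exact test on a 2×2 table; a worked example with round, illustrative numbers:

```python
from scipy import stats

n_background = 20_000   # genes in adata (or adata.raw)
n_query = 200           # DEGs passing the p-value filter
n_set = 300             # pathway genes present in the background
a = 40                  # overlap: query ∩ gene set
b = n_query - a         # query only
c = n_set - a           # gene set only
d = n_background - n_query - c   # neither

odds_ratio, p_value = stats.fisher_exact([[a, b], [c, d]], alternative="greater")
# odds_ratio ≈ 18.8; the expected overlap under the null is only
# 200 * 300 / 20000 = 3 genes, so p_value is vanishingly small.
```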
820
+ def perform_ssgsea(
821
+ adata,
822
+ gene_sets: dict[str, list[str]],
823
+ min_size: int = 10,
824
+ max_size: int = 500,
825
+ species: Optional[str] = None,
826
+ database: Optional[str] = None,
827
+ ctx: "ToolContext" = None,
828
+ ) -> "EnrichmentResult":
829
+ """
830
+ Perform single-sample Gene Set Enrichment Analysis (ssGSEA).
831
+
832
+ This calculates enrichment scores for each sample independently.
833
+
834
+ Parameters
835
+ ----------
836
+ adata : AnnData
837
+ Annotated data matrix
838
+ gene_sets : Dict[str, List[str]]
839
+ Gene sets to test
840
+ min_size : int
841
+ Minimum gene set size
842
+ max_size : int
843
+ Maximum gene set size
844
+ species : Optional[str]
845
+ Species for the analysis (e.g., 'mouse', 'human')
846
+ database : Optional[str]
847
+ Gene set database used (e.g., 'KEGG_Pathways', 'GO_Biological_Process')
848
+ ctx : ToolContext
849
+ MCP tool context for logging
850
+
851
+ Returns
852
+ -------
853
+ EnrichmentResult with mean per-sample enrichment scores (ssGSEA produces no p-values)
854
+ """
855
+ require("gseapy", ctx, feature="ssGSEA analysis")
856
+ import gseapy as gp
857
+
858
+ # Memory-efficient batch processing for large datasets
859
+ # Threshold: process in batches when the dataset exceeds BATCH_SIZE samples to avoid OOM
860
+ BATCH_SIZE = 500
861
+ n_samples = adata.n_obs
862
+
863
+ # Run ssGSEA (with batch processing for large datasets)
864
+ try:
865
+ if n_samples <= BATCH_SIZE:
866
+ # Small dataset: process all at once (original behavior)
867
+ expr_df = pd.DataFrame(
868
+ to_dense(adata.X).T, index=adata.var_names, columns=adata.obs_names
869
+ )
870
+ res = gp.ssgsea(
871
+ data=expr_df,
872
+ gene_sets=gene_sets,
873
+ min_size=min_size,
874
+ max_size=max_size,
875
+ permutation_num=0,
876
+ no_plot=True,
877
+ threads=1,
878
+ seed=42,
879
+ )
880
+ else:
881
+ # Large dataset: batch processing to reduce peak memory
882
+ # Memory reduction: O(n_genes × n_samples) -> O(n_genes × batch_size)
883
+ all_batch_results = []
884
+
885
+ for batch_start in range(0, n_samples, BATCH_SIZE):
886
+ batch_end = min(batch_start + BATCH_SIZE, n_samples)
887
+ batch_indices = list(range(batch_start, batch_end))
888
+
889
+ # Extract batch - only convert this batch to dense
890
+ batch_X = to_dense(adata.X[batch_indices, :])
891
+ batch_df = pd.DataFrame(
892
+ batch_X.T,
893
+ index=adata.var_names,
894
+ columns=adata.obs_names[batch_indices],
895
+ )
896
+
897
+ batch_res = gp.ssgsea(
898
+ data=batch_df,
899
+ gene_sets=gene_sets,
900
+ min_size=min_size,
901
+ max_size=max_size,
902
+ permutation_num=0,
903
+ no_plot=True,
904
+ threads=1,
905
+ seed=42,
906
+ )
907
+
908
+ if hasattr(batch_res, "results"):
909
+ all_batch_results.append(batch_res.results)
910
+
911
+ # Free batch memory
912
+ del batch_X, batch_df
913
+
914
+ # Merge batch results into unified format
915
+ # Create a mock result object with combined results
916
+ class CombinedResult:
917
+ def __init__(self, results_list):
918
+ self.results = {}
919
+ for batch_results in results_list:
920
+ if isinstance(batch_results, dict):
921
+ self.results.update(batch_results)
922
+
923
+ res = CombinedResult(all_batch_results)
924
+
925
+ # Extract results - ssGSEA stores enrichment scores in res.results
926
+ if hasattr(res, "results") and isinstance(res.results, dict):
927
+ # res.results is a dict where keys are sample names and values are DataFrames
928
+ # We need to reorganize this into gene sets x samples format
929
+ all_samples = list(res.results.keys())
930
+ all_gene_sets = set()
931
+
932
+ # Get all gene sets
933
+ for sample_df in res.results.values():
934
+ if isinstance(sample_df, pd.DataFrame) and "Term" in sample_df.columns:
935
+ all_gene_sets.update(sample_df["Term"].values)
936
+
937
+ all_gene_sets = list(all_gene_sets)
938
+
939
+ # Create scores matrix
940
+ scores_matrix = pd.DataFrame(
941
+ index=all_gene_sets, columns=all_samples, dtype=float
942
+ )
943
+
944
+ # Fill in scores
945
+ for sample, df in res.results.items():
946
+ if (
947
+ isinstance(df, pd.DataFrame)
948
+ and "Term" in df.columns
949
+ and "ES" in df.columns
950
+ ):
951
+ for _, row in df.iterrows():
952
+ if row["Term"] in scores_matrix.index:
953
+ scores_matrix.loc[row["Term"], sample] = row["ES"]
954
+
955
+ scores_df = scores_matrix.fillna(0) # Fill missing values with 0
956
+ else:
957
+ error_msg = "ssGSEA results format not recognized."
958
+ logger.error(error_msg)
959
+ raise ProcessingError(error_msg)
960
+
961
+ # Calculate statistics across samples
962
+ enrichment_scores = {}
963
+ gene_set_statistics = {}
964
+
965
+ if not scores_df.empty:
966
+ for gs_name in scores_df.index:
967
+ scores = scores_df.loc[gs_name].values
968
+ enrichment_scores[gs_name] = float(np.mean(scores))
969
+
970
+ gene_set_statistics[gs_name] = {
971
+ "mean_score": float(np.mean(scores)),
972
+ "std_score": float(np.std(scores)),
973
+ "min_score": float(np.min(scores)),
974
+ "max_score": float(np.max(scores)),
975
+ "size": len(gene_sets.get(gs_name, [])),
976
+ }
977
+
978
+ # Add scores to adata
979
+ for gs_name in scores_df.index:
980
+ adata.obs[f"ssgsea_{gs_name}"] = scores_df.loc[gs_name].values
981
+
982
+ # Store gene set membership for validation
983
+ adata.uns["enrichment_gene_sets"] = gene_sets
984
+
985
+ # Store metadata for scientific provenance tracking
986
+ obs_keys = [f"ssgsea_{gs_name}" for gs_name in scores_df.index]
987
+ store_analysis_metadata(
988
+ adata,
989
+ analysis_name="enrichment_ssgsea",
990
+ method="ssgsea",
991
+ parameters={
992
+ "min_size": min_size,
993
+ "max_size": max_size,
994
+ },
995
+ results_keys={"obs": obs_keys, "uns": ["enrichment_gene_sets"]},
996
+ statistics={
997
+ "n_gene_sets": len(gene_sets),
998
+ "n_samples": adata.n_obs,
999
+ },
1000
+ species=species,
1001
+ database=database,
1002
+ )
1003
+
1004
+ # Get top gene sets by mean enrichment
1005
+ sorted_by_mean = sorted(
1006
+ enrichment_scores.items(), key=lambda x: x[1], reverse=True
1007
+ )
1008
+ top_gene_sets = [x[0] for x in sorted_by_mean[:10]]
1009
+
1010
+ # ssGSEA doesn't provide p-values, so return empty gene_set_statistics
1011
+ # to reduce MCP response size (no significance filtering possible)
1012
+ pvalues = None
1013
+ adjusted_pvalues = None
1014
+
1015
+ return EnrichmentResult(
1016
+ method="ssgsea",
1017
+ n_gene_sets=len(gene_sets),
1018
+ # IMPORTANT: ssGSEA does NOT perform significance testing
1019
+ # Setting n_significant=0 is honest: no pathways are "statistically significant"
1020
+ # All gene sets receive enrichment scores, but these are sample-level metrics
1021
+ # without associated p-values. Use GSEA or ORA for significance testing.
1022
+ n_significant=0, # ssGSEA doesn't test significance - no p-values produced
1023
+ enrichment_scores=enrichment_scores, # Mean scores per gene set
1024
+ pvalues=pvalues,
1025
+ adjusted_pvalues=adjusted_pvalues,
1026
+ gene_set_statistics={}, # Empty to reduce response size (no p-values available)
1027
+ top_gene_sets=top_gene_sets,
1028
+ top_depleted_sets=[], # ssGSEA doesn't produce depleted sets
1029
+ )
1030
+
1031
+ except Exception as e:
1032
+ logger.error(f"ssGSEA failed: {e}")
1033
+ raise
1034
+
1035
+
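A minimal usage sketch, assuming `adata` is a normalized AnnData, gene sets loaded as above, and `ctx` supplied by the MCP server in normal use:

```python
hallmark_sets = load_msigdb_gene_sets(species="human", collection="H")
result = perform_ssgsea(adata, gene_sets=hallmark_sets, min_size=10, max_size=500)

# Per-spot scores land in adata.obs, one column per gene set,
# e.g. "ssgsea_HALLMARK_HYPOXIA"; no p-values are produced.
score_cols = [c for c in adata.obs.columns if c.startswith("ssgsea_")]
```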
1036
+ def perform_enrichr(
1037
+ gene_list: list[str],
1038
+ gene_sets: Optional[str] = None,
1039
+ organism: str = "human",
1040
+ ctx: "ToolContext" = None,
1041
+ ) -> "EnrichmentResult":
1042
+ """
1043
+ Perform enrichment analysis using Enrichr web service.
1044
+
1045
+ Parameters
1046
+ ----------
1047
+ gene_list : List[str]
1048
+ List of genes to analyze
1049
+ gene_sets : Optional[str]
1050
+ Enrichr library name. If None, use default libraries
1051
+ organism : str
1052
+ Organism ('human' or 'mouse')
1053
+ ctx : ToolContext
1054
+ MCP tool context for logging
1055
+
1056
+ Returns
1057
+ -------
1058
+ EnrichmentResult with combined scores, p-values and adjusted p-values (filtered to significant sets)
1059
+ """
1060
+ require("gseapy", ctx, feature="Enrichr analysis")
1061
+ import gseapy as gp
1062
+
1063
+ # Default gene set libraries
1064
+ if gene_sets is None:
1065
+ gene_sets = [
1066
+ "GO_Biological_Process_2023",
1067
+ "GO_Molecular_Function_2023",
1068
+ "GO_Cellular_Component_2023",
1069
+ "KEGG_2021_Human" if organism == "human" else "KEGG_2019_Mouse",
1070
+ "Reactome_2022",
1071
+ "MSigDB_Hallmark_2020",
1072
+ ]
1073
+ elif isinstance(gene_sets, str):
1074
+ # Map user-friendly database name to actual Enrichr library name
1075
+ enrichr_library = map_gene_set_database_to_enrichr_library(gene_sets, organism)
1076
+ gene_sets = [enrichr_library]
1077
+
1078
+ # Run Enrichr
1079
+ try:
1080
+ enr = gp.enrichr(
1081
+ gene_list=gene_list,
1082
+ gene_sets=gene_sets,
1083
+ organism=organism.capitalize(),
1084
+ outdir=None,
1085
+ cutoff=0.05,
1086
+ )
1087
+
1088
+ # Get results - enr.results is already a DataFrame
1089
+ all_results = enr.results
1090
+
1091
+ # Prepare output
1092
+ enrichment_scores = {}
1093
+ pvalues = {}
1094
+ adjusted_pvalues = {}
1095
+ gene_set_statistics = {}
1096
+
1097
+ # Process all results in a single pass (optimized: 3 loops -> 1)
1098
+ genes_found_in_results = []
1099
+ for _idx, row in all_results.iterrows():
1100
+ term = row["Term"]
1101
+ enrichment_scores[term] = row["Combined Score"]
1102
+ pvalues[term] = row["P-value"]
1103
+ adjusted_pvalues[term] = row["Adjusted P-value"]
1104
+
1105
+ genes_str = row["Genes"]
1106
+ genes_list = genes_str.split(";") if isinstance(genes_str, str) else []
1107
+ genes_found_in_results.extend(genes_list)
1108
+
1109
+ gene_set_statistics[term] = {
1110
+ "combined_score": row["Combined Score"],
1111
+ "pval": row["P-value"],
1112
+ "adjusted_pval": row["Adjusted P-value"],
1113
+ "z_score": row.get("Z-score", np.nan),
1114
+ "overlap": row["Overlap"],
1115
+ "genes": genes_list,
1116
+ "odds_ratio": row.get("Odds Ratio", 1.0),
1117
+ }
1118
+
1119
+ # Get top results
1120
+ all_results_sorted = all_results.sort_values("Combined Score", ascending=False)
1121
+ top_gene_sets = all_results_sorted.head(10)["Term"].tolist()
1122
+
1123
+ # Filter all result dictionaries to only significant pathways (reduces MCP response size)
1124
+ # Uses method-based FDR threshold: Enrichr = 0.05 (same as ORA, hypergeometric-based)
1125
+ (
1126
+ filtered_statistics,
1127
+ filtered_scores,
1128
+ filtered_pvals,
1129
+ filtered_adj_pvals,
1130
+ ) = _filter_significant_statistics(
1131
+ gene_set_statistics,
1132
+ enrichment_scores,
1133
+ pvalues,
1134
+ adjusted_pvalues,
1135
+ method="enrichr", # Method-based FDR: 0.05 for Enrichr
1136
+ )
1137
+
1138
+ return EnrichmentResult(
1139
+ method="enrichr",
1140
+ n_gene_sets=len(all_results),
1141
+ n_significant=len(all_results[all_results["Adjusted P-value"] < 0.05]),
1142
+ enrichment_scores=filtered_scores,
1143
+ pvalues=filtered_pvals,
1144
+ adjusted_pvalues=filtered_adj_pvals,
1145
+ gene_set_statistics=filtered_statistics,
1146
+ top_gene_sets=top_gene_sets,
1147
+ top_depleted_sets=[], # Enrichr doesn't produce depleted sets
1148
+ )
1149
+
1150
+ except Exception as e:
1151
+ logger.error(f"Enrichr failed: {e}")
1152
+ raise
1153
+
1154
+
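A minimal sketch (requires network access to the Enrichr web service; the marker list is illustrative and `ctx` is omitted as above):

```python
markers = ["CD3D", "CD3E", "CD2", "IL7R", "TRAC"]

result = perform_enrichr(
    gene_list=markers,
    gene_sets="Cell_Type_Markers",   # resolved to "CellMarker_Augmented_2021" above
    organism="human",
)
result.top_gene_sets[:5]             # terms ranked by Combined Score
```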
1155
+ # ============================================================================
1156
+ # Spatial Enrichment Analysis Functions (EnrichMap-based)
1157
+ # ============================================================================
1158
+
1159
+
1160
+ async def perform_spatial_enrichment(
1161
+ data_id: str,
1162
+ ctx: "ToolContext",
1163
+ gene_sets: Union[list[str], dict[str, list[str]]],
1164
+ score_keys: Optional[Union[str, list[str]]] = None,
1165
+ spatial_key: str = "spatial",
1166
+ n_neighbors: int = 6,
1167
+ smoothing: bool = True,
1168
+ correct_spatial_covariates: bool = True,
1169
+ batch_key: Optional[str] = None,
1170
+ species: str = "unknown",
1171
+ database: Optional[str] = None,
1172
+ ) -> "EnrichmentResult":
1173
+ """
1174
+ Perform spatially-aware gene set enrichment analysis using EnrichMap.
1175
+
1176
+ Parameters
1177
+ ----------
1178
+ data_id : str
1179
+ Identifier for the spatial data in the data store
1180
+ ctx : ToolContext
1181
+ MCP tool context for data access and logging
1182
+ gene_sets : Union[List[str], Dict[str, List[str]]]
1183
+ Either a single gene list or a dictionary of gene sets where keys are
1184
+ signature names and values are lists of genes
1185
+ score_keys : Optional[Union[str, List[str]]]
1186
+ Names for the gene signatures if gene_sets is a list. Ignored if gene_sets
1187
+ is already a dictionary
1188
+ spatial_key : str
1189
+ Key in adata.obsm containing spatial coordinates (default: "spatial")
1190
+ n_neighbors : int
1191
+ Number of nearest spatial neighbors for smoothing (default: 6)
1192
+ smoothing : bool
1193
+ Whether to perform spatial smoothing (default: True)
1194
+ correct_spatial_covariates : bool
1195
+ Whether to correct for spatial covariates using GAM (default: True)
1196
+ batch_key : Optional[str]
1197
+ Column in adata.obs for batch-wise normalization
1198
+ species : str
1199
+ Species for the analysis (e.g., 'mouse', 'human')
1200
+ database : Optional[str]
1201
+ Gene set database used (e.g., 'KEGG_Pathways', 'GO_Biological_Process')
1202
+
1203
+ Returns
1204
+ -------
1205
+ EnrichmentResult
1207
+ Result object where:
1208
+ - enrichment_scores: maximum smoothed score per signature
1209
+ - top_gene_sets: signatures ranked by that score
1210
+ - n_significant: number of signatures scored successfully
1211
+ - per-spot scores are written to adata.obs["<signature>_score"]
1212
+ - gene_set_statistics and p-values are empty (EnrichMap scores carry no p-values)
1212
+ """
1213
+ # Check if EnrichMap is available
1214
+ require("enrichmap", ctx, feature="spatial enrichment analysis")
1215
+
1216
+ # Import EnrichMap
1217
+ import enrichmap as em
1218
+
1219
+ # Get data using standard ctx pattern
1220
+ adata = await ctx.get_adata(data_id)
1221
+
1222
+ # Validate spatial coordinates
1223
+ if spatial_key not in adata.obsm:
1224
+ raise ProcessingError(
1225
+ f"Spatial coordinates '{spatial_key}' not found in adata.obsm"
1226
+ )
1227
+
1228
+ # Convert single gene list to dictionary format
1229
+ if isinstance(gene_sets, list):
1230
+ if score_keys is None:
1231
+ score_keys = "enrichmap_signature"
1232
+ gene_sets = {score_keys: gene_sets}
1233
+
1234
+ # Validate gene sets with format conversion
1235
+ available_genes = set(adata.var_names)
1236
+ validated_gene_sets = {}
1237
+
1238
+ for sig_name, genes in gene_sets.items():
1239
+ # Try direct matching first
1240
+ common_genes = [gene for gene in genes if gene in available_genes]
1241
+
1242
+ # If few matches and we know the species, try format conversion
1243
+ if len(common_genes) < len(genes) * 0.5 and species != "unknown":
1244
+ dataset_format_genes, _ = _convert_gene_format_for_matching(
1245
+ genes, available_genes, species
1246
+ )
1247
+
1248
+ if len(dataset_format_genes) > len(common_genes):
1249
+ # Format conversion helped, use dataset format genes for EnrichMap
1250
+ common_genes = dataset_format_genes
1251
+
1252
+ if len(common_genes) < 2:
1253
+ await ctx.warning(
1254
+ f"Signature '{sig_name}' has {len(common_genes)} genes in the dataset. Skipping."
1255
+ )
1256
+ continue
1257
+ validated_gene_sets[sig_name] = common_genes
1258
+ await ctx.info(
1259
+ f"Signature '{sig_name}': {len(common_genes)}/{len(genes)} genes found"
1260
+ )
1261
+
1262
+ if not validated_gene_sets:
1263
+ raise ProcessingError(
1264
+ f"No valid gene signatures found (≥2 genes). "
1265
+ f"Dataset: {len(available_genes)} genes, requested: {len(gene_sets)} signatures. "
1266
+ f"Check species (human/mouse) and gene name format."
1267
+ )
1268
+
1269
+ # Run EnrichMap scoring - process each gene set individually
1270
+ failed_signatures = []
1271
+ successful_signatures = []
1272
+
1273
+ for sig_name, genes in validated_gene_sets.items():
1274
+ try:
1275
+ em.tl.score(
1276
+ adata=adata,
1277
+ gene_set=genes, # Fixed: use gene_set (correct API parameter name)
1278
+ score_key=sig_name, # Fixed: provide explicit score_key
1279
+ spatial_key=spatial_key,
1280
+ n_neighbors=n_neighbors,
1281
+ smoothing=smoothing,
1282
+ correct_spatial_covariates=correct_spatial_covariates,
1283
+ batch_key=batch_key,
1284
+ )
1285
+ successful_signatures.append(sig_name)
1286
+
1287
+ except Exception as e:
1288
+ await ctx.warning(f"EnrichMap failed for '{sig_name}': {e}")
1289
+ failed_signatures.append((sig_name, str(e)))
1290
+
1291
+ # Check if any signatures were processed successfully
1292
+ if not successful_signatures:
1293
+ error_details = "; ".join(
1294
+ [f"{name}: {error}" for name, error in failed_signatures]
1295
+ )
1296
+ raise ProcessingError(
1297
+ f"All EnrichMap scoring failed. This may indicate:\n"
1298
+ f"1. EnrichMap package installation issues\n"
1299
+ f"2. Incompatible gene names or data format\n"
1300
+ f"3. Insufficient spatial information\n"
1301
+ f"Details: {error_details}"
1302
+ )
1303
+
1304
+ # Update validated_gene_sets to only include successful ones
1305
+ validated_gene_sets = {
1306
+ sig: validated_gene_sets[sig] for sig in successful_signatures
1307
+ }
1308
+
1309
+ if ctx and failed_signatures:
1310
+ await ctx.warning(
1311
+ f"Failed to process {len(failed_signatures)} gene sets: {[name for name, _ in failed_signatures]}"
1312
+ )
1313
+
1314
+ # Collect results
1315
+ score_columns = [f"{sig}_score" for sig in validated_gene_sets]
1316
+
1317
+ # Calculate summary statistics
1318
+ summary_stats = {}
1319
+ for sig_name in validated_gene_sets:
1320
+ score_col = f"{sig_name}_score"
1321
+ scores = adata.obs[score_col]
1322
+
1323
+ summary_stats[sig_name] = {
1324
+ "mean": float(scores.mean()),
1325
+ "std": float(scores.std()),
1326
+ "min": float(scores.min()),
1327
+ "max": float(scores.max()),
1328
+ "median": float(scores.median()),
1329
+ "q25": float(scores.quantile(0.25)),
1330
+ "q75": float(scores.quantile(0.75)),
1331
+ "n_genes": len(validated_gene_sets[sig_name]),
1332
+ }
1333
+
1334
+ # Store gene set membership for validation
1335
+ adata.uns["enrichment_gene_sets"] = validated_gene_sets
1336
+
1337
+ # Store metadata for scientific provenance tracking
1338
+ store_analysis_metadata(
1339
+ adata,
1340
+ analysis_name="enrichment_spatial",
1341
+ method="spatial_enrichmap",
1342
+ parameters={
1343
+ "spatial_key": spatial_key,
1344
+ "n_neighbors": n_neighbors,
1345
+ "smoothing": smoothing,
1346
+ "correct_spatial_covariates": correct_spatial_covariates,
1347
+ "batch_key": batch_key,
1348
+ },
1349
+ results_keys={
1350
+ "obs": score_columns,
1351
+ "uns": ["gene_contributions", "enrichment_gene_sets"],
1352
+ },
1353
+ statistics={
1354
+ "n_gene_sets": len(validated_gene_sets),
1355
+ "n_successful_signatures": len(successful_signatures),
1356
+ "n_failed_signatures": len(failed_signatures),
1357
+ },
1358
+ species=species,
1359
+ database=database,
1360
+ )
1361
+
1362
+ # Create enrichment scores (use max score per gene set)
1363
+ enrichment_scores = {
1364
+ sig_name: float(stats["max"]) for sig_name, stats in summary_stats.items()
1365
+ }
1366
+
1367
+ # Sort by enrichment score to get top gene sets
1368
+ sorted_sigs = sorted(enrichment_scores.items(), key=lambda x: x[1], reverse=True)
1369
+ top_gene_sets = [sig_name for sig_name, _ in sorted_sigs[:10]]
1370
+
1371
+ # Spatial enrichment doesn't provide p-values, so return empty gene_set_statistics
1372
+ # to reduce MCP response size (no significance filtering possible)
1373
+ pvalues = None
1374
+ adjusted_pvalues = None
1375
+
1376
+ return EnrichmentResult(
1377
+ method="spatial_enrichmap",
1378
+ n_gene_sets=len(validated_gene_sets),
1379
+ n_significant=len(successful_signatures),
1380
+ enrichment_scores=enrichment_scores,
1381
+ pvalues=pvalues,
1382
+ adjusted_pvalues=adjusted_pvalues,
1383
+ gene_set_statistics={}, # Empty to reduce response size (no p-values available)
1384
+ spatial_scores_key=None, # Scores are in obs columns, not obsm
1385
+ top_gene_sets=top_gene_sets,
1386
+ top_depleted_sets=[], # Spatial enrichment doesn't produce depleted sets
1387
+ )
1388
+
1389
+
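Because the function is a coroutine, it has to be awaited; a sketch assuming `ctx` and the registered `data_id` come from the MCP server, with an illustrative hypoxia signature:

```python
async def run_spatial_enrichment(ctx):
    return await perform_spatial_enrichment(
        data_id="visium_sample_1",
        ctx=ctx,
        gene_sets={"hypoxia": ["VEGFA", "SLC2A1", "CA9", "LDHA", "PGK1"]},
        n_neighbors=6,
        smoothing=True,
        species="human",
        database="custom_signatures",
    )
# Per-spot scores are written to adata.obs["hypoxia_score"].
```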
1390
+ # ============================================================================
1391
+ # Gene Set Loading Functions
1392
+ # ============================================================================
1393
+ # Simplified from GeneSetLoader class - no need for class overhead when
1394
+ # functions are only called once from load_gene_sets()
1395
+
1396
+
1397
+ def _get_organism_name(species: str) -> str:
1398
+ """Get organism name for gseapy from species code."""
1399
+ return "Homo sapiens" if species.lower() == "human" else "Mus musculus"
1400
+
1401
+
1402
+ def load_msigdb_gene_sets(
1403
+ species: str,
1404
+ collection: str = "H",
1405
+ subcollection: Optional[str] = None,
1406
+ min_size: int = 10,
1407
+ max_size: int = 500,
1408
+ ) -> dict[str, list[str]]:
1409
+ """
1410
+ Load gene sets from MSigDB using gseapy.
1411
+
1412
+ Parameters
1413
+ ----------
1414
+ species : str
1415
+ Species for gene sets ('human' or 'mouse')
1416
+ collection : str
1417
+ MSigDB collection name:
1418
+ - H: hallmark gene sets
1419
+ - C1: positional gene sets
1420
+ - C2: curated gene sets (e.g., CGP, CP:KEGG, CP:REACTOME)
1421
+ - C3: motif gene sets
1422
+ - C4: computational gene sets
1423
+ - C5: GO gene sets (CC, BP, MF)
1424
+ - C6: oncogenic signatures
1425
+ - C7: immunologic signatures
1426
+ - C8: cell type signatures
1427
+ subcollection : Optional[str]
1428
+ Subcollection for specific databases (e.g., 'CP:KEGG', 'GO:BP')
1429
+ min_size : int
1430
+ Minimum gene set size
1431
+ max_size : int
1432
+ Maximum gene set size
1433
+
1434
+ Returns
1435
+ -------
1436
+ Dict[str, List[str]]
1437
+ Dictionary of gene sets
1438
+ """
1439
+ try:
1440
+ import gseapy as gp
1441
+
1442
+ organism = _get_organism_name(species)
1443
+ gene_sets_dict = {}
1444
+
1445
+ if collection == "H":
1446
+ # Hallmark gene sets
1447
+ gene_sets = gp.get_library_name(organism=organism)
1448
+ if "MSigDB_Hallmark_2020" in gene_sets:
1449
+ gene_sets_dict = gp.get_library(
1450
+ "MSigDB_Hallmark_2020", organism=organism
1451
+ )
1452
+
1453
+ elif collection == "C2" and subcollection == "CP:KEGG":
1454
+ # KEGG pathways
1455
+ if species.lower() == "human":
1456
+ gene_sets_dict = gp.get_library("KEGG_2021_Human", organism=organism)
1457
+ else:
1458
+ gene_sets_dict = gp.get_library("KEGG_2019_Mouse", organism=organism)
1459
+
1460
+ elif collection == "C2" and subcollection == "CP:REACTOME":
1461
+ # Reactome pathways
1462
+ gene_sets_dict = gp.get_library("Reactome_2022", organism=organism)
1463
+
1464
+ elif collection == "C5":
1465
+ # GO gene sets
1466
+ if subcollection == "GO:BP" or subcollection is None:
1467
+ gene_sets_dict.update(
1468
+ gp.get_library("GO_Biological_Process_2023", organism=organism)
1469
+ )
1470
+ if subcollection == "GO:MF" or subcollection is None:
1471
+ gene_sets_dict.update(
1472
+ gp.get_library("GO_Molecular_Function_2023", organism=organism)
1473
+ )
1474
+ if subcollection == "GO:CC" or subcollection is None:
1475
+ gene_sets_dict.update(
1476
+ gp.get_library("GO_Cellular_Component_2023", organism=organism)
1477
+ )
1478
+
1479
+ elif collection == "C8":
1480
+ # Cell type signatures
1481
+ gene_sets_dict = gp.get_library(
1482
+ "CellMarker_Augmented_2021", organism=organism
1483
+ )
1484
+
1485
+ # Filter by size
1486
+ filtered_sets = _filter_gene_sets_by_size(gene_sets_dict, min_size, max_size)
1487
+ return filtered_sets
1488
+
1489
+ except Exception as e:
1490
+ raise ProcessingError(f"Failed to load MSigDB gene sets: {e}") from e
1491
+
1492
+
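Typical calls (requires gseapy and network access to Enrichr's library endpoint):

```python
hallmarks = load_msigdb_gene_sets(species="human", collection="H")
kegg = load_msigdb_gene_sets(species="human", collection="C2", subcollection="CP:KEGG")
go_bp = load_msigdb_gene_sets(species="mouse", collection="C5", subcollection="GO:BP")
# Each is a {set_name: [genes]} dict already filtered to 10-500 genes.
```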
1493
+ def load_go_gene_sets(
+     species: str,
+     aspect: str = "BP",
+     min_size: int = 10,
+     max_size: int = 500,
+ ) -> dict[str, list[str]]:
+     """
+     Load GO terms using gseapy.
+
+     Parameters
+     ----------
+     species : str
+         Species for gene sets ('human' or 'mouse')
+     aspect : str
+         GO aspect: 'BP' (biological process), 'MF' (molecular function),
+         'CC' (cellular component)
+     min_size : int
+         Minimum gene set size
+     max_size : int
+         Maximum gene set size
+
+     Returns
+     -------
+     Dict[str, List[str]]
+         Dictionary of GO gene sets
+     """
+     aspect_map = {
+         "BP": "GO_Biological_Process_2023",
+         "MF": "GO_Molecular_Function_2023",
+         "CC": "GO_Cellular_Component_2023",
+     }
+
+     if aspect not in aspect_map:
+         raise ParameterError(f"Invalid GO aspect: {aspect}")
+
+     try:
+         import gseapy as gp
+
+         organism = _get_organism_name(species)
+         gene_sets = gp.get_library(aspect_map[aspect], organism=organism)
+
+         # Filter by size
+         filtered_sets = _filter_gene_sets_by_size(gene_sets, min_size, max_size)
+         return filtered_sets
+
+     except Exception as e:
+         raise ProcessingError(f"Failed to load GO gene sets: {e}") from e
+
+
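An illustrative call for load_go_gene_sets, under the same import-path and network assumptions as above:

    from chatspatial.tools.enrichment import load_go_gene_sets

    # Biological Process terms for mouse, keeping only sets with 15-300 genes
    go_bp = load_go_gene_sets("mouse", aspect="BP", min_size=15, max_size=300)

    # An unsupported aspect fails fast with ParameterError instead of returning an empty dict
    try:
        load_go_gene_sets("mouse", aspect="XX")
    except Exception as err:
        print(err)  # "Invalid GO aspect: XX"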
1542
+ def load_kegg_gene_sets(
+     species: str, min_size: int = 10, max_size: int = 500
+ ) -> dict[str, list[str]]:
+     """
+     Load KEGG pathways using gseapy.
+
+     Parameters
+     ----------
+     species : str
+         Species for gene sets ('human' or 'mouse')
+     min_size : int
+         Minimum gene set size
+     max_size : int
+         Maximum gene set size
+
+     Returns
+     -------
+     Dict[str, List[str]]
+         Dictionary of KEGG pathway gene sets
+     """
+     try:
+         import gseapy as gp
+
+         organism = _get_organism_name(species)
+
+         if species.lower() == "human":
+             gene_sets = gp.get_library("KEGG_2021_Human", organism=organism)
+         else:
+             gene_sets = gp.get_library("KEGG_2019_Mouse", organism=organism)
+
+         # Filter by size
+         filtered_sets = _filter_gene_sets_by_size(gene_sets, min_size, max_size)
+         return filtered_sets
+
+     except Exception as e:
+         raise ProcessingError(f"Failed to load KEGG pathways: {e}") from e
+
+
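A short sketch of the KEGG loader; the species argument only selects which Enrichr KEGG library is fetched, and any non-human species falls back to the mouse library:

    from chatspatial.tools.enrichment import load_kegg_gene_sets

    human_kegg = load_kegg_gene_sets("human")              # KEGG_2021_Human
    mouse_kegg = load_kegg_gene_sets("mouse", min_size=5)  # KEGG_2019_Mouse, smaller sets kept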
1580
+ def load_reactome_gene_sets(
+     species: str, min_size: int = 10, max_size: int = 500
+ ) -> dict[str, list[str]]:
+     """
+     Load Reactome pathways using gseapy.
+
+     Parameters
+     ----------
+     species : str
+         Species for gene sets ('human' or 'mouse')
+     min_size : int
+         Minimum gene set size
+     max_size : int
+         Maximum gene set size
+
+     Returns
+     -------
+     Dict[str, List[str]]
+         Dictionary of Reactome pathway gene sets
+     """
+     try:
+         import gseapy as gp
+
+         organism = _get_organism_name(species)
+         gene_sets = gp.get_library("Reactome_2022", organism=organism)
+
+         # Filter by size (use shared utility for consistency)
+         filtered_sets = _filter_gene_sets_by_size(gene_sets, min_size, max_size)
+         return filtered_sets
+
+     except Exception as e:
+         raise ProcessingError(f"Failed to load Reactome pathways: {e}") from e
+
+
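Usage sketch for the Reactome loader; both species share the Reactome_2022 library, so only the size window differs between these calls:

    from chatspatial.tools.enrichment import load_reactome_gene_sets

    broad = load_reactome_gene_sets("human")  # default 10-500 genes
    narrow = load_reactome_gene_sets("human", min_size=20, max_size=200)
    # The tighter window keeps a subset of the broader result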
1614
+ def load_cell_marker_gene_sets(
+     species: str, min_size: int = 5, max_size: int = 200
+ ) -> dict[str, list[str]]:
+     """
+     Load cell type marker gene sets using gseapy.
+
+     Parameters
+     ----------
+     species : str
+         Species for gene sets ('human' or 'mouse')
+     min_size : int
+         Minimum gene set size
+     max_size : int
+         Maximum gene set size
+
+     Returns
+     -------
+     Dict[str, List[str]]
+         Dictionary of cell type marker gene sets
+     """
+     try:
+         import gseapy as gp
+
+         organism = _get_organism_name(species)
+         gene_sets = gp.get_library("CellMarker_Augmented_2021", organism=organism)
+
+         # Filter by size
+         filtered_sets = _filter_gene_sets_by_size(gene_sets, min_size, max_size)
+         return filtered_sets
+
+     except Exception as e:
+         raise ProcessingError(f"Failed to load cell markers: {e}") from e
+
+
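Usage sketch for the cell-marker loader; note the tighter default window (5-200 genes), which suits the typically small CellMarker signatures:

    from chatspatial.tools.enrichment import load_cell_marker_gene_sets

    markers = load_cell_marker_gene_sets("human")
    print(list(markers)[:5])  # inspect the first few signature names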
1648
+ def load_gene_sets(
+     database: str,
+     species: str = "human",
+     min_genes: int = 10,
+     max_genes: int = 500,
+     ctx: Optional["ToolContext"] = None,
+ ) -> dict[str, list[str]]:
+     """
+     Load gene sets from specified database.
+
+     Parameters
+     ----------
+     database : str
+         Database name:
+         - GO_Biological_Process, GO_Molecular_Function, GO_Cellular_Component
+         - KEGG_Pathways
+         - Reactome_Pathways
+         - MSigDB_Hallmark
+         - Cell_Type_Markers
+     species : str
+         Species ('human' or 'mouse')
+     min_genes : int
+         Minimum gene set size
+     max_genes : int
+         Maximum gene set size
+     ctx : Optional[ToolContext]
+         MCP tool context for logging (currently unused by the loaders)
+
+     Returns
+     -------
+     Dict[str, List[str]]
+         Dictionary of gene sets
+     """
+     # Direct function calls - no class overhead
+     database_map = {
+         "GO_Biological_Process": lambda: load_go_gene_sets(
+             species, "BP", min_genes, max_genes
+         ),
+         "GO_Molecular_Function": lambda: load_go_gene_sets(
+             species, "MF", min_genes, max_genes
+         ),
+         "GO_Cellular_Component": lambda: load_go_gene_sets(
+             species, "CC", min_genes, max_genes
+         ),
+         "KEGG_Pathways": lambda: load_kegg_gene_sets(species, min_genes, max_genes),
+         "Reactome_Pathways": lambda: load_reactome_gene_sets(
+             species, min_genes, max_genes
+         ),
+         "MSigDB_Hallmark": lambda: load_msigdb_gene_sets(
+             species, "H", None, min_genes, max_genes
+         ),
+         "Cell_Type_Markers": lambda: load_cell_marker_gene_sets(
+             species, min_genes, max_genes
+         ),
+     }
+
+     if database not in database_map:
+         raise ParameterError(
+             f"Unknown database: {database}. Available: {list(database_map)}"
+         )
+
+     gene_sets = database_map[database]()
+     return gene_sets
+
+
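A sketch of the dispatcher; database names are the string keys listed in the docstring, and anything else raises ParameterError with the supported list:

    from chatspatial.tools.enrichment import load_gene_sets

    gene_sets = load_gene_sets(
        "KEGG_Pathways", species="mouse", min_genes=10, max_genes=500
    )

    try:
        load_gene_sets("WikiPathways")  # not in database_map
    except Exception as err:
        print(err)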
1713
+ # ============================================================================
+ # UNIFIED ENRICHMENT ANALYSIS ENTRY POINT
+ # ============================================================================
+
+
1718
+ async def analyze_enrichment(
+     data_id: str,
+     ctx: "ToolContext",
+     params: "EnrichmentParameters",
+ ) -> EnrichmentResult:
+     """
+     Unified entry point for gene set enrichment analysis.
+
+     This function handles all enrichment methods with a consistent interface:
+     - Gene set loading from databases
+     - Method dispatch (GSEA, ORA, ssGSEA, Enrichr, spatial)
+     - Error handling with clear messages
+
+     Args:
+         data_id: Dataset ID
+         ctx: ToolContext for data access and logging
+         params: EnrichmentParameters with method, species, database, etc.
+
+     Returns:
+         EnrichmentResult with enrichment scores and statistics
+
+     Raises:
+         ParameterError: If params is None or invalid
+         ProcessingError: If gene set loading or analysis fails
+     """
+     # Import here to avoid circular imports
+     from ..utils.adata_utils import get_highly_variable_genes
+
+     # Validate params
+     if params is None:
+         raise ParameterError(
+             "params parameter is required for enrichment analysis.\n"
+             "You must provide EnrichmentParameters with at least 'species' specified.\n"
+             "Example: params={'species': 'mouse', 'method': 'pathway_ora'}"
+         )
+
+     # Get adata
+     adata = await ctx.get_adata(data_id)
+
+     # Load gene sets
+     gene_sets = params.gene_sets
+     if gene_sets is None and params.gene_set_database:
+         await ctx.info(f"Loading gene sets from {params.gene_set_database}")
+         try:
+             gene_sets = load_gene_sets(
+                 database=params.gene_set_database,
+                 species=params.species,
+                 min_genes=params.min_genes,
+                 max_genes=params.max_genes,
+                 ctx=ctx,
+             )
+             await ctx.info(
+                 f"Loaded {len(gene_sets)} gene sets from {params.gene_set_database}"
+             )
+         except Exception as e:
+             await ctx.error(f"Gene set database loading failed: {e}")
+             raise ProcessingError(
+                 f"Failed to load gene sets from {params.gene_set_database}: {e}\n\n"
+                 f"SOLUTIONS:\n"
+                 f"1. Check your internet connection\n"
+                 f"2. Verify species parameter: '{params.species}'\n"
+                 f"3. Try a different database (KEGG_Pathways, GO_Biological_Process)\n"
+                 f"4. Provide custom gene sets via 'gene_sets' parameter"
+             ) from e
+
+     # Validate gene sets
+     if gene_sets is None or len(gene_sets) == 0:
+         raise ProcessingError(
+             "No valid gene sets available. "
+             "Please provide gene sets via 'gene_sets' parameter or "
+             "specify a valid 'gene_set_database'."
+         )
+
+     # Dispatch to appropriate method
+     if params.method == "spatial_enrichmap":
+         result = perform_spatial_enrichment(
+             data_id=data_id,
+             ctx=ctx,
+             gene_sets=gene_sets,
+             score_keys=params.score_keys,
+             spatial_key=params.spatial_key,
+             n_neighbors=params.n_neighbors,
+             smoothing=params.smoothing,
+             correct_spatial_covariates=params.correct_spatial_covariates,
+             batch_key=params.batch_key,
+             species=params.species,
+             database=params.gene_set_database,
+         )
+         await ctx.info(
+             "Spatial enrichment complete. Use visualize_data with "
+             "plot_type='pathway_enrichment' to visualize."
+         )
+
+     elif params.method == "pathway_gsea":
+         result = perform_gsea(
+             adata=adata,
+             gene_sets=gene_sets,
+             ranking_key=params.score_keys,
+             permutation_num=params.n_permutations,
+             min_size=params.min_genes,
+             max_size=params.max_genes,
+             species=params.species,
+             database=params.gene_set_database,
+             ctx=ctx,
+         )
+         await ctx.info("GSEA complete. Use visualize_data to see results.")
+
+     elif params.method == "pathway_ora":
+         result = perform_ora(
+             adata=adata,
+             gene_sets=gene_sets,
+             pvalue_threshold=params.pvalue_cutoff,
+             min_size=params.min_genes,
+             max_size=params.max_genes,
+             species=params.species,
+             database=params.gene_set_database,
+             ctx=ctx,
+         )
+         await ctx.info("ORA complete. Use visualize_data to see results.")
+
+     elif params.method == "pathway_ssgsea":
+         result = perform_ssgsea(
+             adata=adata,
+             gene_sets=gene_sets,
+             min_size=params.min_genes,
+             max_size=params.max_genes,
+             species=params.species,
+             database=params.gene_set_database,
+             ctx=ctx,
+         )
+         await ctx.info("ssGSEA complete. Use visualize_data to see results.")
+
+     elif params.method == "pathway_enrichr":
+         gene_list = get_highly_variable_genes(adata, max_genes=500)
+         result = perform_enrichr(
+             gene_list=gene_list,
+             gene_sets=params.gene_set_database,
+             organism=params.species,
+             ctx=ctx,
+         )
+         await ctx.info("Enrichr complete. Use visualize_data to see results.")
+
+     else:
+         raise ParameterError(f"Unknown enrichment method: {params.method}")
+
+     return result
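A hypothetical end-to-end sketch of calling the unified entry point. The ToolContext instance is supplied by the MCP server at runtime, and EnrichmentParameters is assumed to be the model referenced in the signature; its import path and any constructor fields beyond those read above are assumptions:

    from chatspatial.tools.enrichment import analyze_enrichment

    async def run_ora(ctx, data_id: str):
        # Assumed import path for the parameters model; only fields accessed by
        # analyze_enrichment (method, species, gene_set_database, ...) are set,
        # the rest rely on the model's defaults.
        from chatspatial.models import EnrichmentParameters
        params = EnrichmentParameters(
            method="pathway_ora",
            species="mouse",
            gene_set_database="GO_Biological_Process",
        )
        return await analyze_enrichment(data_id, ctx, params)

    # Executed inside the server's event loop, e.g.:
    # result = await run_ora(ctx, "my_dataset")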