chatspatial 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. chatspatial/__init__.py +11 -0
  2. chatspatial/__main__.py +141 -0
  3. chatspatial/cli/__init__.py +7 -0
  4. chatspatial/config.py +53 -0
  5. chatspatial/models/__init__.py +85 -0
  6. chatspatial/models/analysis.py +513 -0
  7. chatspatial/models/data.py +2462 -0
  8. chatspatial/server.py +1763 -0
  9. chatspatial/spatial_mcp_adapter.py +720 -0
  10. chatspatial/tools/__init__.py +3 -0
  11. chatspatial/tools/annotation.py +1903 -0
  12. chatspatial/tools/cell_communication.py +1603 -0
  13. chatspatial/tools/cnv_analysis.py +605 -0
  14. chatspatial/tools/condition_comparison.py +595 -0
  15. chatspatial/tools/deconvolution/__init__.py +402 -0
  16. chatspatial/tools/deconvolution/base.py +318 -0
  17. chatspatial/tools/deconvolution/card.py +244 -0
  18. chatspatial/tools/deconvolution/cell2location.py +326 -0
  19. chatspatial/tools/deconvolution/destvi.py +144 -0
  20. chatspatial/tools/deconvolution/flashdeconv.py +101 -0
  21. chatspatial/tools/deconvolution/rctd.py +317 -0
  22. chatspatial/tools/deconvolution/spotlight.py +216 -0
  23. chatspatial/tools/deconvolution/stereoscope.py +109 -0
  24. chatspatial/tools/deconvolution/tangram.py +135 -0
  25. chatspatial/tools/differential.py +625 -0
  26. chatspatial/tools/embeddings.py +298 -0
  27. chatspatial/tools/enrichment.py +1863 -0
  28. chatspatial/tools/integration.py +807 -0
  29. chatspatial/tools/preprocessing.py +723 -0
  30. chatspatial/tools/spatial_domains.py +808 -0
  31. chatspatial/tools/spatial_genes.py +836 -0
  32. chatspatial/tools/spatial_registration.py +441 -0
  33. chatspatial/tools/spatial_statistics.py +1476 -0
  34. chatspatial/tools/trajectory.py +495 -0
  35. chatspatial/tools/velocity.py +405 -0
  36. chatspatial/tools/visualization/__init__.py +155 -0
  37. chatspatial/tools/visualization/basic.py +393 -0
  38. chatspatial/tools/visualization/cell_comm.py +699 -0
  39. chatspatial/tools/visualization/cnv.py +320 -0
  40. chatspatial/tools/visualization/core.py +684 -0
  41. chatspatial/tools/visualization/deconvolution.py +852 -0
  42. chatspatial/tools/visualization/enrichment.py +660 -0
  43. chatspatial/tools/visualization/integration.py +205 -0
  44. chatspatial/tools/visualization/main.py +164 -0
  45. chatspatial/tools/visualization/multi_gene.py +739 -0
  46. chatspatial/tools/visualization/persistence.py +335 -0
  47. chatspatial/tools/visualization/spatial_stats.py +469 -0
  48. chatspatial/tools/visualization/trajectory.py +639 -0
  49. chatspatial/tools/visualization/velocity.py +411 -0
  50. chatspatial/utils/__init__.py +115 -0
  51. chatspatial/utils/adata_utils.py +1372 -0
  52. chatspatial/utils/compute.py +327 -0
  53. chatspatial/utils/data_loader.py +499 -0
  54. chatspatial/utils/dependency_manager.py +462 -0
  55. chatspatial/utils/device_utils.py +165 -0
  56. chatspatial/utils/exceptions.py +185 -0
  57. chatspatial/utils/image_utils.py +267 -0
  58. chatspatial/utils/mcp_utils.py +137 -0
  59. chatspatial/utils/path_utils.py +243 -0
  60. chatspatial/utils/persistence.py +78 -0
  61. chatspatial/utils/scipy_compat.py +143 -0
  62. chatspatial-1.1.0.dist-info/METADATA +242 -0
  63. chatspatial-1.1.0.dist-info/RECORD +67 -0
  64. chatspatial-1.1.0.dist-info/WHEEL +5 -0
  65. chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
  66. chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
  67. chatspatial-1.1.0.dist-info/top_level.txt +1 -0
chatspatial/server.py ADDED (1763 lines)
"""
Main server implementation for ChatSpatial using the Spatial MCP Adapter.
"""

import os
import sys
import warnings
from typing import Any, Optional, Union

# Suppress warnings to speed up startup
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# CRITICAL: Disable progress bars to prevent stdout pollution
# This protects against accidental stdout usage if server is imported directly
os.environ["TQDM_DISABLE"] = "1"

# Suppress scanpy/squidpy verbosity
try:
    import scanpy as sc

    sc.settings.verbosity = 0
except ImportError:
    pass

from mcp.server.fastmcp import Context  # noqa: E402
from mcp.types import ImageContent  # noqa: E402

from .models.analysis import AnnotationResult  # noqa: E402
from .models.analysis import CellCommunicationResult  # noqa: E402
from .models.analysis import CNVResult  # noqa: E402
from .models.analysis import ConditionComparisonResult  # noqa: E402
from .models.analysis import DeconvolutionResult  # noqa: E402
from .models.analysis import DifferentialExpressionResult  # noqa: E402
from .models.analysis import EnrichmentResult  # noqa: E402
from .models.analysis import IntegrationResult  # noqa: E402
from .models.analysis import PreprocessingResult  # noqa: E402
from .models.analysis import RNAVelocityResult  # noqa: E402
from .models.analysis import SpatialDomainResult  # noqa: E402
from .models.analysis import SpatialStatisticsResult  # noqa: E402
from .models.analysis import SpatialVariableGenesResult  # noqa: E402
from .models.analysis import TrajectoryResult  # noqa: E402
from .models.data import AnnotationParameters  # noqa: E402
from .models.data import CellCommunicationParameters  # noqa: E402
from .models.data import CNVParameters  # noqa: E402
from .models.data import ColumnInfo  # noqa: E402
from .models.data import ConditionComparisonParameters  # noqa: E402
from .models.data import DeconvolutionParameters  # noqa: E402
from .models.data import DifferentialExpressionParameters  # noqa: E402
from .models.data import EnrichmentParameters  # noqa: E402
from .models.data import IntegrationParameters  # noqa: E402
from .models.data import PreprocessingParameters  # noqa: E402
from .models.data import RNAVelocityParameters  # noqa: E402
from .models.data import SpatialDataset  # noqa: E402
from .models.data import SpatialDomainParameters  # noqa: E402
from .models.data import SpatialStatisticsParameters  # noqa: E402
from .models.data import SpatialVariableGenesParameters  # noqa: E402
from .models.data import TrajectoryParameters  # noqa: E402
from .models.data import VisualizationParameters  # noqa: E402
from .spatial_mcp_adapter import ToolContext  # noqa: E402
from .spatial_mcp_adapter import create_spatial_mcp_server  # noqa: E402
from .spatial_mcp_adapter import get_tool_annotations  # noqa: E402
from .utils.exceptions import DataNotFoundError  # noqa: E402
from .utils.mcp_utils import mcp_tool_error_handler  # noqa: E402

# Create MCP server and adapter
mcp, adapter = create_spatial_mcp_server("ChatSpatial")

# Get data manager and visualization registry from adapter
# These module-level aliases provide consistent access patterns
data_manager = adapter.data_manager
visualization_registry = adapter.visualization_registry


def validate_dataset(data_id: str) -> None:
    """Validate that a dataset exists in the data store

    Args:
        data_id: Dataset ID

    Raises:
        DataNotFoundError: If the dataset is not found
    """
    if not data_manager.dataset_exists(data_id):
        raise DataNotFoundError(f"Dataset {data_id} not found")


@mcp.tool(annotations=get_tool_annotations("load_data"))
@mcp_tool_error_handler()
async def load_data(
    data_path: str,
    data_type: str = "auto",
    name: Optional[str] = None,
    context: Optional[Context] = None,
) -> SpatialDataset:
    """Load spatial transcriptomics data with comprehensive metadata profile

    Returns detailed information about the dataset structure to help with analysis:
    - Cell and gene counts
    - Available metadata columns with types and sample values
    - Multi-dimensional data (spatial coordinates, dimensionality reduction, etc.)
    - Gene expression profiles

    Args:
        data_path: Path to the data file or directory
        data_type: Type of spatial data (auto, 10x_visium, slide_seq, merfish, seqfish, other, h5ad).
            If 'auto', will try to determine the type from the file extension or directory structure.
        name: Optional name for the dataset

    Returns:
        Comprehensive dataset information including metadata profiles
    """
    # Create ToolContext for consistent logging
    ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)

    await ctx.info(f"Loading data from {data_path} (type: {data_type})")

    # Load data using data manager
    data_id = await data_manager.load_dataset(data_path, data_type, name)
    dataset_info = await data_manager.get_dataset(data_id)

    await ctx.info(
        f"Successfully loaded {dataset_info['type']} data with "
        f"{dataset_info['n_cells']} cells and {dataset_info['n_genes']} genes"
    )

    # Convert column info from dict to ColumnInfo objects
    obs_columns = (
        [ColumnInfo(**col) for col in dataset_info.get("obs_columns", [])]
        if dataset_info.get("obs_columns")
        else None
    )
    var_columns = (
        [ColumnInfo(**col) for col in dataset_info.get("var_columns", [])]
        if dataset_info.get("var_columns")
        else None
    )

    # Return comprehensive dataset information
    return SpatialDataset(
        id=data_id,
        name=dataset_info["name"],
        data_type=dataset_info["type"],  # Use normalized type from dataset_info
        description=f"Spatial data: {dataset_info['n_cells']} cells × {dataset_info['n_genes']} genes",
        n_cells=dataset_info["n_cells"],
        n_genes=dataset_info["n_genes"],
        spatial_coordinates_available=dataset_info["spatial_coordinates_available"],
        tissue_image_available=dataset_info["tissue_image_available"],
        obs_columns=obs_columns,
        var_columns=var_columns,
        obsm_keys=dataset_info.get("obsm_keys"),
        uns_keys=dataset_info.get("uns_keys"),
        top_highly_variable_genes=dataset_info.get("top_highly_variable_genes"),
        top_expressed_genes=dataset_info.get("top_expressed_genes"),
    )


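# Illustrative usage sketch (editor's addition, not part of the released file):
# loading a dataset in-process and reading the returned metadata profile. The
# Visium path is a placeholder, and direct calls assume the @mcp.tool decorator
# returns the wrapped coroutine unchanged.
async def _example_load_data() -> None:
    dataset = await load_data("/path/to/visium_dir", data_type="10x_visium")
    # The returned SpatialDataset carries the profile used by downstream tools
    print(dataset.id, dataset.n_cells, dataset.n_genes)
    print(dataset.obs_columns)  # metadata columns with types and sample values

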
@mcp.tool(annotations=get_tool_annotations("preprocess_data"))
@mcp_tool_error_handler()
async def preprocess_data(
    data_id: str,
    params: PreprocessingParameters = PreprocessingParameters(),
    context: Optional[Context] = None,
) -> PreprocessingResult:
    """Preprocess spatial transcriptomics data

    Args:
        data_id: Dataset ID
        params: Preprocessing parameters

    Returns:
        Preprocessing result

    Notes:
        Available normalization methods:
        - log: Standard log normalization (default)
        - sct: SCTransform v2 variance-stabilizing normalization (requires pysctransform)
            Install: pip install 'chatspatial[sct]'
            Best for raw UMI counts from 10x platforms (Visium, etc.)
            Based on regularized negative binomial regression (Hafemeister & Satija 2019)
        - pearson_residuals: Analytic Pearson residuals (built-in, similar to SCTransform)
            Faster than SCTransform with comparable results for most analyses
        - none: No normalization
        - scvi: Use scVI for normalization and dimensionality reduction

        SCTransform-specific parameters (only used when normalization='sct';
        see the sketch after this function):
        - sct_method: 'fix-slope' (v2, default) or 'offset' (v1)
        - sct_var_features_n: Number of variable features (default: 3000)
        - sct_exclude_poisson: Exclude Poisson genes from regularization (default: True)
        - sct_n_cells: Number of cells for parameter estimation (default: 5000)

        When use_scvi_preprocessing=True, scVI will be used for advanced preprocessing
        including denoising and batch effect correction.

        Advanced configuration options:
        - n_neighbors: Number of neighbors for graph construction (default: 15)
        - clustering_resolution: Leiden clustering resolution (default: 1.0)
        - clustering_key: Key name for storing clustering results (default: "leiden")
        - spatial_key: Key name for spatial coordinates in obsm (default: None, auto-detected)
        - batch_key: Key name for batch information in obs (default: "batch")

        IMPORTANT: This preprocessing creates a filtered gene set for analysis efficiency.
        Raw data is automatically preserved in adata.raw for downstream analyses requiring
        comprehensive gene coverage (e.g., cell communication analysis with LIANA+).

        Cell communication analysis automatically uses adata.raw when available.
    """
    # Validate dataset
    validate_dataset(data_id)

    # Create ToolContext
    ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)

    # Lazy import (avoid name conflict with MCP tool)
    from .tools.preprocessing import preprocess_data as preprocess_func

    # Call preprocessing function
    result = await preprocess_func(data_id, ctx, params)

    # Note: No writeback needed - adata modifications are in-place on the same object

    # Save preprocessing result
    await data_manager.save_result(data_id, "preprocessing", result)

    return result


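# Illustrative sketch (editor's addition, not part of the released file):
# selecting SCTransform normalization for raw Visium UMI counts. The field
# names follow the docstring above; the exact PreprocessingParameters schema
# (in particular the 'normalization' field name) is an assumption.
async def _example_sct_preprocessing() -> None:
    params = PreprocessingParameters(
        normalization="sct",       # SCTransform v2 (requires pysctransform)
        sct_var_features_n=3000,   # number of variable features to keep
    )
    await preprocess_data("data1", params=params)

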
@mcp.tool(annotations=get_tool_annotations("compute_embeddings"))
@mcp_tool_error_handler()
async def compute_embeddings(
    data_id: str,
    compute_pca: bool = True,
    compute_neighbors: bool = True,
    compute_umap: bool = True,
    compute_clustering: bool = True,
    compute_diffmap: bool = False,
    compute_spatial_neighbors: bool = True,
    n_pcs: int = 30,
    n_neighbors: int = 15,
    clustering_resolution: float = 1.0,
    clustering_method: str = "leiden",
    force: bool = False,
    context: Optional[Context] = None,
) -> dict[str, Any]:
    """Compute dimensionality reduction, clustering, and neighbor graphs.

    This tool provides explicit control over embedding computations.
    Analysis tools compute these lazily on demand, but you can use this tool to:
    - Control computation parameters (n_pcs, n_neighbors, resolution)
    - Force recomputation with different parameters
    - Compute specific embeddings independently

    Args:
        data_id: Dataset ID
        compute_pca: Compute PCA dimensionality reduction
        compute_neighbors: Compute k-NN neighbor graph
        compute_umap: Compute UMAP embedding
        compute_clustering: Compute Leiden/Louvain clustering
        compute_diffmap: Compute diffusion map for trajectory analysis
        compute_spatial_neighbors: Compute spatial neighborhood graph
        n_pcs: Number of principal components (default: 30)
        n_neighbors: Number of neighbors for k-NN graph (default: 15)
        clustering_resolution: Clustering resolution (default: 1.0)
        clustering_method: Clustering algorithm ('leiden' or 'louvain')
        force: Force recomputation even if results already exist

    Returns:
        Summary of computed embeddings
    """
    # Validate dataset
    validate_dataset(data_id)

    # Lazy import
    from .tools.embeddings import EmbeddingParameters
    from .tools.embeddings import compute_embeddings as compute_embeddings_func

    # Create parameters
    params = EmbeddingParameters(
        compute_pca=compute_pca,
        compute_neighbors=compute_neighbors,
        compute_umap=compute_umap,
        compute_clustering=compute_clustering,
        compute_diffmap=compute_diffmap,
        compute_spatial_neighbors=compute_spatial_neighbors,
        n_pcs=n_pcs,
        n_neighbors=n_neighbors,
        clustering_resolution=clustering_resolution,
        clustering_method=clustering_method,
        force=force,
    )

    # Create ToolContext
    ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)

    # Call function
    result = await compute_embeddings_func(data_id, ctx, params)

    return result.model_dump()


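# Illustrative sketch (editor's addition, not part of the released file):
# recomputing clustering at a coarser resolution, forcing recomputation over
# any cached results. All keyword names come from the signature above.
async def _example_recompute_clustering() -> None:
    summary = await compute_embeddings(
        "data1",
        clustering_resolution=0.5,
        clustering_method="leiden",
        force=True,
    )
    print(summary)

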
@mcp.tool(annotations=get_tool_annotations("visualize_data"))
@mcp_tool_error_handler()  # Handles type-aware error formatting for Image/str returns
async def visualize_data(
    data_id: str,
    params: VisualizationParameters = VisualizationParameters(),
    context: Optional[Context] = None,
) -> Union[ImageContent, str]:  # Simplified: ImageContent or str (MCP 2025 best practice)
    """Visualize spatial transcriptomics data

    Args:
        data_id: Dataset ID
        params: Visualization parameters including:
            - plot_type: Type of visualization. Available types:
                * Basic plots: spatial, heatmap, violin, umap, dotplot
                * Analysis results: cell_communication, deconvolution,
                  trajectory, rna_velocity, spatial_statistics
                * Multi-gene/correlation: multi_gene, lr_pairs, gene_correlation
                * Enrichment: pathway_enrichment (use subtype for spatial EnrichMap)
                * Integration/QC: spatial_interaction, batch_integration
                * CNV analysis: cnv_heatmap, spatial_cnv
                * High-resolution: card_imputation
            - feature: Gene or feature to visualize (single/multiple genes). For cell types,
              use method-specific columns: 'cell_type_tangram', 'cell_type_scanvi',
              'cell_type_cellassign', or clustering: 'leiden', 'louvain'.
              For spatial domains: use the domain_key returned by identify_spatial_domains
              (e.g., 'spatial_domains_spagcn', 'spatial_domains_leiden')
            - cluster_key: Column in adata.obs for grouping (e.g., 'leiden', 'cell_type').
              REQUIRED for heatmap, violin, and dotplot
            - subtype: Visualization variant. Required for certain plot_types:
                * deconvolution: 'spatial_multi', 'dominant_type', 'diversity', 'stacked_bar', 'scatterpie', 'umap'
                * spatial_statistics: 'neighborhood', 'co_occurrence', 'ripley', 'moran', 'centrality', 'getis_ord'
                * pathway_enrichment: 'barplot', 'dotplot', 'spatial_score', 'spatial_correlogram'
            - deconv_method: Deconvolution method ('cell2location', 'rctd', etc.).
              Auto-selected if only one result exists
            - batch_key: Column for batch/sample identifier (default: 'batch'). Required for batch_integration
            - colormap: Color scheme (default: 'coolwarm')
            - figure_size: Tuple (width, height) in inches. Auto-determined if None
            - dpi: Image resolution (default: 300, publication quality)
            - spot_size: Spot size for spatial plots (default: 150). Adjust for density: dense data 100-150, sparse 150-200
            - alpha_img: Background tissue image opacity (default: 0.3). Lower = dimmer background
            - n_cell_types: Number of top cell types in deconvolution (default: 4, max: 10)
            - lr_pairs: List of (ligand, receptor) tuples for lr_pairs plot_type

    Returns:
        Visualization image

    Examples:
        # Basic spatial plot
        {"plot_type": "spatial", "feature": "Cd7", "colormap": "viridis"}

        # Cell type visualization
        {"plot_type": "spatial", "feature": "cell_type_tangram", "colormap": "tab20",
         "spot_size": 150, "alpha_img": 0.3}

        # Violin plot (cluster_key required)
        {"plot_type": "violin", "feature": ["Cd7", "Cd3d"], "cluster_key": "leiden"}

        # Heatmap (cluster_key required)
        {"plot_type": "heatmap", "feature": ["Cd7", "Cd3d"], "cluster_key": "cell_type"}

        # Dotplot - marker gene expression (cluster_key required)
        {"plot_type": "dotplot", "feature": ["Cd3d", "Cd4", "Cd8a", "Cd19"],
         "cluster_key": "cell_type", "colormap": "Reds"}

        # Spatial domains (use domain_key from identify_spatial_domains result)
        {"plot_type": "spatial", "feature": "spatial_domains_spagcn", "colormap": "tab20"}

        # Deconvolution results
        {"plot_type": "deconvolution", "subtype": "dominant_type", "deconv_method": "cell2location",
         "n_cell_types": 6}

        # Spatial statistics
        {"plot_type": "spatial_statistics", "subtype": "neighborhood", "cluster_key": "leiden"}

        # Ligand-receptor pairs
        {"plot_type": "lr_pairs", "lr_pairs": [("Fn1", "Cd79a"), ("Vegfa", "Nrp2")]}

        # Batch integration QC
        {"plot_type": "batch_integration", "batch_key": "sample_id"}
    """
    # Import to avoid name conflict
    from .tools.visualization import visualize_data as visualize_func

    # Validate dataset
    validate_dataset(data_id)

    # Create ToolContext for clean data access
    ctx = ToolContext(
        _data_manager=data_manager,
        _mcp_context=context,
        _visualization_registry=visualization_registry,
    )

    # Parameter validation is handled by the Pydantic model;
    # params is already a validated VisualizationParameters instance

    # Call visualization function with ToolContext
    image = await visualize_func(data_id, ctx, params)

    # Store visualization params and return the image
    if image is not None:
        # Generate cache key with subtype if applicable
        # This handles plot types with subtypes (e.g., deconvolution, spatial_statistics)
        subtype = params.subtype  # Optional field with default None

        if subtype:
            cache_key = f"{data_id}_{params.plot_type}_{subtype}"
        else:
            cache_key = f"{data_id}_{params.plot_type}"

        # Handle two return types: str (large images) or ImageContent (small images)
        # Extract file_path if the image was saved to disk
        file_path = None
        # Large image: file path returned as text (MCP 2025 best practice)
        # Extract path from message (format: "Visualization saved: <path>\n...")
        if isinstance(image, str) and "Visualization saved:" in image:
            file_path = image.split("\n")[0].replace("Visualization saved: ", "")

        # Store visualization params in registry (for regeneration on demand)
        ctx.store_visualization(cache_key, params, file_path)

        await ctx.info(
            f"Visualization type: {params.plot_type}, feature: {params.feature or 'N/A'}"
        )

        return image

    else:
        # Return an error message if no image was generated
        return "Visualization generation failed; please check the data and parameter settings."


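# Illustrative sketch (editor's addition, not part of the released file): a
# basic spatial gene plot, mirroring the first docstring example above. Large
# results come back as a "Visualization saved: <path>" string rather than
# inline ImageContent, per the handling in visualize_data.
async def _example_spatial_plot() -> None:
    params = VisualizationParameters(plot_type="spatial", feature="Cd7")
    image_or_path = await visualize_data("data1", params=params)
    if isinstance(image_or_path, str):
        # e.g. "Visualization saved: <path>"
        print(image_or_path.split("\n")[0])

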
@mcp.tool(annotations=get_tool_annotations("save_visualization"))
@mcp_tool_error_handler()
async def save_visualization(
    data_id: str,
    plot_type: str,
    subtype: Optional[str] = None,
    output_dir: str = "./outputs",
    filename: Optional[str] = None,
    format: str = "png",
    dpi: Optional[int] = None,
    context: Optional[Context] = None,
) -> str:
    """Save a visualization to disk at publication quality

    This function regenerates visualizations from stored metadata and the original
    data, then exports at the requested quality. This secure approach avoids
    unsafe pickle deserialization.

    Args:
        data_id: Dataset ID
        plot_type: Type of plot to save (e.g., 'spatial', 'umap', 'deconvolution', 'spatial_statistics')
        subtype: Optional subtype for plot types with variants (e.g., 'neighborhood', 'scatterpie')
            - For pathway_enrichment: 'enrichment_plot', 'barplot', 'dotplot', 'spatial'
            - For deconvolution: 'spatial_multi', 'dominant_type', 'diversity', 'stacked_bar', 'scatterpie', 'umap'
            - For spatial_statistics: 'neighborhood', 'co_occurrence', 'ripley', 'moran', 'centrality', 'getis_ord'
        output_dir: Directory to save the file (default: ./outputs)
        filename: Custom filename (optional, auto-generated if not provided)
        format: Image format (png, jpg, pdf, svg)
        dpi: DPI for the saved image (default: 300; use 300+ for publication quality)

    Returns:
        Path to the saved file

    Examples:
        Save a spatial plot: save_visualization("data1", "spatial")
        Save with subtype: save_visualization("data1", "spatial_statistics", subtype="neighborhood")
        Save deconvolution: save_visualization("data1", "deconvolution", subtype="scatterpie", format="pdf")
        Save for publication: save_visualization("data1", "spatial", dpi=300, format="png")
    """
    from .tools.visualization import save_visualization as save_func

    # Create ToolContext for unified data access
    ctx = ToolContext(
        _data_manager=data_manager,
        _mcp_context=context,
        _visualization_registry=visualization_registry,
    )

    result = await save_func(
        data_id=data_id,
        ctx=ctx,
        plot_type=plot_type,
        subtype=subtype,
        output_dir=output_dir,
        filename=filename,
        format=format,
        dpi=dpi,
    )

    return result


@mcp.tool(annotations=get_tool_annotations("export_all_visualizations"))
@mcp_tool_error_handler()
async def export_all_visualizations(
    data_id: str,
    output_dir: str = "./exports",
    format: str = "png",
    dpi: Optional[int] = None,
    context: Optional[Context] = None,
) -> list[str]:
    """Export all cached visualizations for a dataset to disk

    This function regenerates each visualization from stored metadata and the original
    data, then exports at the requested quality. This secure approach avoids
    unsafe pickle deserialization.

    Args:
        data_id: Dataset ID to export visualizations for
        output_dir: Directory to save files (default: ./exports)
        format: Image format (png, jpg, jpeg, pdf, svg, eps, ps, tiff) (default: png)
        dpi: DPI for raster formats (default: 300 for publication quality)

    Returns:
        List of paths to saved files

    Examples:
        # Export all visualizations as PNG
        export_all_visualizations("data1")

        # Export all as PDF for publication
        export_all_visualizations("data1", format="pdf", dpi=300)

        # Export to a custom directory as SVG
        export_all_visualizations("data1", "./my_exports", format="svg")
    """
    from .tools.visualization import export_all_visualizations as export_func

    # Create ToolContext for unified data access
    ctx = ToolContext(
        _data_manager=data_manager,
        _mcp_context=context,
        _visualization_registry=visualization_registry,
    )

    result = await export_func(
        data_id=data_id,
        ctx=ctx,
        output_dir=output_dir,
        format=format,
        dpi=dpi,
    )

    return result


@mcp.tool(annotations=get_tool_annotations("clear_visualization_cache"))
@mcp_tool_error_handler()
async def clear_visualization_cache(
    data_id: Optional[str] = None,
    context: Optional[Context] = None,
) -> int:
    """Clear the visualization cache to free memory

    Args:
        data_id: Optional dataset ID to clear specific visualizations (if None, clears all)

    Returns:
        Number of visualizations cleared

    Examples:
        Clear all visualizations: clear_visualization_cache()
        Clear for a specific dataset: clear_visualization_cache("data1")
    """
    from .tools.visualization import clear_visualization_cache as clear_func

    # Create ToolContext for unified data access
    ctx = ToolContext(
        _data_manager=data_manager,
        _mcp_context=context,
        _visualization_registry=visualization_registry,
    )

    result = await clear_func(ctx=ctx, data_id=data_id)

    return result


@mcp.tool(annotations=get_tool_annotations("annotate_cell_types"))
@mcp_tool_error_handler()
async def annotate_cell_types(
    data_id: str,
    params: AnnotationParameters = AnnotationParameters(),
    context: Optional[Context] = None,
) -> AnnotationResult:
    """Annotate cell types in spatial transcriptomics data

    Args:
        data_id: Dataset ID
        params: Annotation parameters

    Returns:
        Annotation result with cell type information and optional visualization

    Notes:
        Annotation methods (status):
        - tangram: Implemented (requires reference_data_id and PREPROCESSED reference data with HVGs)
        - scanvi: Implemented (deep learning label transfer via scvi-tools, requires reference_data_id)
        - cellassign: Implemented (via scvi-tools, requires marker_genes parameter)
        - mllmcelltype: Implemented (multimodal LLM classifier)
        - sctype: Implemented (requires R and rpy2)
        - singler: Implemented (Python-based via singler/celldex packages, requires singler_reference parameter)

        For methods requiring reference data (tangram, scanvi, singler):
        - tangram/scanvi: reference_data_id must point to a loaded AND PREPROCESSED single-cell dataset
        - IMPORTANT: Reference data MUST be preprocessed with preprocess_data() before use!
        - cell_type_key: Leave as None for auto-detection. Only set if you know the exact column name in the reference data
        - Common cell type column names: 'cell_type', 'cell_types', 'celltype'
        - singler: Can use either reference_data_id OR singler_reference (celldex built-in references)

        Tangram-specific notes (see the sketch after this function):
        - Method: Deep learning-based spatial mapping of single-cell to spatial transcriptomics
        - Requires: reference_data_id with PREPROCESSED single-cell data
        - Mapping modes (mode parameter):
            * mode="cells" (default): Maps individual cells to spatial locations
                - Preserves single-cell heterogeneity and fine-grained resolution
                - More computationally intensive (GPU recommended for large datasets)
                - Best for: same-specimen data, when cell-level detail is critical
            * mode="clusters" (recommended for cross-specimen): Aggregates cells by type before mapping
                - Dramatically improves performance, runs on a standard laptop
                - Official recommendation: "Our choice when scRNAseq and spatial data come from different specimens"
                - Requires: cluster_label parameter (e.g., "cell_type")
                - Best for: different specimens, limited resources, cell type distributions
                - Trades single-cell resolution for stability and speed
        - Confidence scores: Automatically normalized to the [0, 1] probability range
        - GPU acceleration: Set tangram_device='cuda:0' if a GPU is available
        - Other parameters: tangram_density_prior, tangram_learning_rate, tangram_lambda_r

        scANVI-specific notes:
        - Method: Semi-supervised variational inference for label transfer
        - Requires: Both datasets must have a 'counts' layer (raw counts)
        - Architecture: Configurable via scanvi_n_latent, scanvi_n_hidden, scanvi_dropout_rate
        - Small datasets (<1000 genes/cells): Use scanvi_n_latent=3-5, scanvi_dropout_rate=0.2,
          scanvi_use_scvi_pretrain=False, num_epochs=50 to prevent NaN errors
        - Returns probabilistic cell type predictions with confidence scores
        - GPU acceleration available

        SingleR-specific notes:
        - Method: Reference-based correlation matching for cell type annotation
        - Reference options:
            * Built-in celldex references (via singler_reference parameter):
                - Human: 'hpca' (recommended), 'blueprint_encode', 'dice', 'monaco_immune', 'novershtern_hematopoietic'
                - Mouse: 'immgen' (recommended), 'mouse_rnaseq'
            * Custom reference (via reference_data_id parameter)
        - Common mistakes:
            * 'HumanPrimaryCellAtlasData' - WRONG, use 'hpca'
            * 'ImmGenData' - WRONG, use 'immgen'
        - Returns correlation-based confidence scores for cell type assignments
        - No GPU required (Python-based implementation via singler/celldex packages)
    """
    # Validate dataset
    validate_dataset(data_id)

    # Validate reference data for methods that require it
    if (
        params.method in ["tangram", "scanvi", "singler"]
        and params.reference_data_id
        and not data_manager.dataset_exists(params.reference_data_id)
    ):
        raise DataNotFoundError(
            f"Reference dataset {params.reference_data_id} not found"
        )

    # Create ToolContext for clean data access (no redundant dict wrapping)
    ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)

    # Lazy import annotation tool (avoids slow startup)
    from .tools.annotation import annotate_cell_types

    # Call annotation function with ToolContext
    result = await annotate_cell_types(data_id, ctx, params)

    # Note: No writeback needed - adata modifications are in-place on the same object

    # Save annotation result
    await data_manager.save_result(data_id, "annotation", result)

    # Visualization should be done separately via visualization tools

    return result


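# Illustrative sketch (editor's addition, not part of the released file):
# cross-specimen Tangram annotation using cluster-level mapping, per the notes
# above. 'method' and 'reference_data_id' are real fields (used in the
# validation code above); 'mode' and 'cluster_label' follow the docstring and
# their exact field names on AnnotationParameters are assumptions.
async def _example_tangram_annotation() -> None:
    params = AnnotationParameters(
        method="tangram",
        reference_data_id="ref1",   # preprocessed single-cell reference
        mode="clusters",            # recommended when specimens differ
        cluster_label="cell_type",  # required in cluster mode
    )
    await annotate_cell_types("data1", params=params)

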
@mcp.tool(annotations=get_tool_annotations("analyze_spatial_statistics"))
@mcp_tool_error_handler()
async def analyze_spatial_statistics(
    data_id: str,
    params: SpatialStatisticsParameters = SpatialStatisticsParameters(),
    context: Optional[Context] = None,
) -> SpatialStatisticsResult:
    """Analyze spatial statistics and autocorrelation patterns

    Args:
        data_id: Dataset ID
        params: Analysis parameters

    Returns:
        Spatial statistics analysis result with statistics and optional visualization

    Notes:
        Available analysis types (implemented):
        - moran: Global Moran's I spatial autocorrelation (squidpy)
        - local_moran: Local Moran's I (LISA) for spatial clustering detection
        - geary: Geary's C spatial autocorrelation (squidpy)
        - getis_ord: Getis-Ord Gi* hot/cold spot detection (esda/PySAL)
            * Detects statistically significant spatial clusters of high/low values
            * Parameters: getis_ord_alpha (significance level), getis_ord_correction (FDR/Bonferroni)
            * Returns raw and corrected hotspot/coldspot counts
        - neighborhood: Neighborhood enrichment (squidpy)
        - co_occurrence: Co-occurrence analysis (squidpy)
        - centrality: Graph centrality scores (squidpy)
        - ripley: Ripley's K/L spatial point patterns
        - bivariate_moran: Bivariate Moran's I for gene pair correlation

        **Categorical data analysis (choose based on the number of categories;
        see the sketch after this function):**
        - join_count: Traditional Join Count for BINARY data (exactly 2 categories)
            * Use for: binary presence/absence, case/control, treated/untreated
            * Returns: global statistics (BB/WW/BW joins, p-value)
            * Reference: Cliff & Ord (1981)

        - local_join_count: Local Join Count for MULTI-CATEGORY data (>2 categories)
            * Use for: cell types, tissue domains, multi-class categorical variables
            * Returns: per-category local clustering statistics with p-values
            * Identifies WHERE each category spatially clusters
            * Reference: Anselin & Li (2019)

        - network_properties: Spatial network analysis
        - spatial_centrality: Spatial-specific centrality measures
    """
    # Validate dataset
    validate_dataset(data_id)

    # Create ToolContext for clean data access (no redundant dict wrapping)
    ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)

    # Lazy import spatial_statistics (squidpy is slow to import)
    from .tools.spatial_statistics import (
        analyze_spatial_statistics as _analyze_spatial_statistics,
    )

    # Call spatial statistics analysis function with ToolContext
    result = await _analyze_spatial_statistics(data_id, ctx, params)

    # Note: No writeback needed - adata modifications are in-place on the same object

    # Save spatial statistics result
    await data_manager.save_result(data_id, "spatial_statistics", result)

    # Note: Visualization should be created separately using the visualization tools;
    # this maintains a clean separation between analysis and visualization

    return result


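# Illustrative sketch (editor's addition, not part of the released file):
# choosing between the two categorical statistics described above. The
# analysis-type field name on SpatialStatisticsParameters is an assumption
# made for illustration.
async def _example_categorical_spatial_stats() -> None:
    # Binary labels (2 categories) -> join_count (Cliff & Ord 1981);
    # multi-category labels (e.g., cell types) -> local_join_count (Anselin & Li 2019)
    params = SpatialStatisticsParameters(analysis_type="local_join_count")
    await analyze_spatial_statistics("data1", params=params)

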
@mcp.tool(annotations=get_tool_annotations("find_markers"))
@mcp_tool_error_handler()
async def find_markers(
    data_id: str,
    group_key: str,
    group1: Optional[str] = None,
    group2: Optional[str] = None,
    method: str = "wilcoxon",
    n_top_genes: int = 25,  # Number of top differentially expressed genes to return
    pseudocount: float = 1.0,  # Pseudocount for log2 fold change calculation
    min_cells: int = 3,  # Minimum cells per group for statistical testing
    sample_key: Optional[str] = None,  # Sample key for pseudobulk (pydeseq2)
    context: Optional[Context] = None,
) -> DifferentialExpressionResult:
    """Find differentially expressed genes between groups

    Args:
        data_id: Dataset ID
        group_key: Column name defining groups
        group1: First group (if None, compare against all others)
        group2: Second group (if None, compare group1 against all others)
        method: Statistical test method
        n_top_genes: Number of top differentially expressed genes to return
        pseudocount: Pseudocount added to expression values before log2 fold change
            calculation to avoid log(0). Default: 1.0 (standard practice).
            Lower values (0.1-0.5) increase sensitivity to low-expression genes.
            Higher values (1-10) stabilize fold changes for sparse data.
        min_cells: Minimum number of cells per group for statistical testing.
            Default: 3 (minimum required for Wilcoxon test).
            Increase to 10-30 for more robust statistical results.
            Groups with fewer cells are automatically skipped with a warning.
        sample_key: Column name in adata.obs for sample/replicate identifier.
            REQUIRED for 'pydeseq2' method to perform pseudobulk aggregation.
            Common values: 'sample', 'patient_id', 'batch', 'replicate'.

    Returns:
        Differential expression result with top marker genes
    """
    # Validate dataset
    validate_dataset(data_id)

    # Create ToolContext for clean data access (no redundant dict wrapping)
    ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)

    # Create params object for unified signature pattern
    params = DifferentialExpressionParameters(
        group_key=group_key,
        group1=group1,
        group2=group2,
        method=method,  # type: ignore[arg-type]
        n_top_genes=n_top_genes,
        pseudocount=pseudocount,
        min_cells=min_cells,
        sample_key=sample_key,
    )

    # Lazy import differential expression tool
    from .tools.differential import differential_expression

    # Call differential expression function with unified (data_id, ctx, params) signature
    result = await differential_expression(data_id, ctx, params)

    # Note: No writeback needed - adata modifications are in-place on the same object

    # Save differential expression result
    await data_manager.save_result(data_id, "differential_expression", result)

    return result


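# Illustrative sketch (editor's addition, not part of the released file):
# pseudobulk differential expression with pydeseq2, which requires sample_key
# for aggregation. All keyword names come from the signature above.
async def _example_pseudobulk_markers() -> None:
    result = await find_markers(
        "data1",
        group_key="cell_type",
        group1="Tumor",           # compare Tumor vs all other groups
        method="pydeseq2",
        sample_key="patient_id",  # biological replicates for pseudobulk
    )
    print(result)

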
@mcp.tool(annotations=get_tool_annotations("compare_conditions"))
@mcp_tool_error_handler()
async def compare_conditions(
    data_id: str,
    condition_key: str,
    condition1: str,
    condition2: str,
    sample_key: str,
    cell_type_key: Optional[str] = None,
    method: str = "pseudobulk",
    n_top_genes: int = 50,
    min_cells_per_sample: int = 10,
    min_samples_per_condition: int = 2,
    padj_threshold: float = 0.05,
    log2fc_threshold: float = 0.0,
    context: Optional[Context] = None,
) -> ConditionComparisonResult:
    """Compare experimental conditions across multiple biological samples.

    This tool performs pseudobulk differential expression analysis to compare
    conditions (e.g., Treatment vs Control) across biological replicates.
    It properly accounts for sample-level variation using DESeq2.

    Args:
        data_id: Dataset ID
        condition_key: Column name in adata.obs containing experimental conditions
            (e.g., 'treatment', 'disease_status', 'timepoint')
        condition1: First condition for comparison (typically experimental group)
        condition2: Second condition for comparison (typically control group)
        sample_key: Column name in adata.obs identifying biological replicates
            (e.g., 'patient_id', 'sample', 'replicate')
        cell_type_key: Optional column for cell type stratification. If provided,
            analysis is performed separately for each cell type.
        method: Analysis method (currently only 'pseudobulk' is supported)
        n_top_genes: Number of top genes to return per comparison
        min_cells_per_sample: Minimum cells required per sample to be included
        min_samples_per_condition: Minimum samples required per condition
        padj_threshold: Adjusted p-value threshold for significance
        log2fc_threshold: Log2 fold change threshold for significance

    Returns:
        ConditionComparisonResult with differential expression results

    Example:
        # Global comparison
        compare_conditions(
            data_id="data1",
            condition_key="treatment",
            condition1="Drug",
            condition2="Control",
            sample_key="patient_id",
        )

        # Cell type stratified
        compare_conditions(
            data_id="data1",
            condition_key="treatment",
            condition1="Drug",
            condition2="Control",
            sample_key="patient_id",
            cell_type_key="cell_type",
        )
    """
    # Validate dataset
    validate_dataset(data_id)

    # Create ToolContext
    ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)

    # Create params object
    params = ConditionComparisonParameters(
        condition_key=condition_key,
        condition1=condition1,
        condition2=condition2,
        sample_key=sample_key,
        cell_type_key=cell_type_key,
        method=method,  # type: ignore[arg-type]
        n_top_genes=n_top_genes,
        min_cells_per_sample=min_cells_per_sample,
        min_samples_per_condition=min_samples_per_condition,
        padj_threshold=padj_threshold,
        log2fc_threshold=log2fc_threshold,
    )

    # Lazy import
    from .tools.condition_comparison import compare_conditions as _compare_conditions

    # Run analysis
    result = await _compare_conditions(data_id, ctx, params)

    # Save result
    await data_manager.save_result(data_id, "condition_comparison", result)

    return result


@mcp.tool(annotations=get_tool_annotations("analyze_cnv"))
@mcp_tool_error_handler()
async def analyze_cnv(
    data_id: str,
    reference_key: str,
    reference_categories: list[str],
    method: str = "infercnvpy",
    window_size: int = 100,
    step: int = 10,
    exclude_chromosomes: Optional[list[str]] = None,
    dynamic_threshold: Optional[float] = 1.5,
    cluster_cells: bool = False,
    dendrogram: bool = False,
    numbat_genome: str = "hg38",
    numbat_allele_data_key: str = "allele_counts",
    numbat_t: float = 0.15,
    numbat_max_entropy: float = 0.8,
    numbat_min_cells: int = 10,
    numbat_ncores: int = 1,
    numbat_skip_nj: bool = False,
    context: Optional[Context] = None,
) -> CNVResult:
    """Analyze copy number variations (CNVs) in spatial transcriptomics data

    Supports two CNV analysis methods:
    - infercnvpy: Expression-based CNV inference (default, fast)
    - Numbat: Haplotype-aware CNV analysis (requires allele data, more accurate)

    Args:
        data_id: Dataset identifier
        reference_key: Column name in adata.obs for cell type labels
        reference_categories: List of cell types to use as reference (normal cells)
        method: CNV analysis method ("infercnvpy" or "numbat", default: "infercnvpy")
        window_size: Number of genes for CNV averaging window (default: 100)
        step: Step size for sliding window (default: 10)
        exclude_chromosomes: Chromosomes to exclude (e.g., ['chrX', 'chrY'])
        dynamic_threshold: Threshold for dynamic CNV calling (default: 1.5)
        cluster_cells: Whether to cluster cells by CNV pattern
        dendrogram: Whether to compute hierarchical clustering dendrogram
        numbat_*: Numbat-specific options (genome build, allele data key,
            thresholds, cores; see Notes)
        context: MCP context

    Returns:
        CNV analysis result with statistics and visualization availability

    Notes:
        CNV analysis methods:
        - infercnvpy: Expression-based (implemented, no allele data required)
        - numbat: Haplotype-aware (implemented when rpy2 installed, requires allele data)

        Numbat-specific notes:
        - Method: Haplotype-aware CNV analysis with phylogeny reconstruction
        - Requires: Allele-specific counts in adata.layers or adata.obsm
        - Allele data preparation: Use cellSNP-lite, pileup_and_phase, or similar tools
        - Genome options: hg38, hg19, mm10, mm39
        - Returns: CNV matrix, clone assignments, phylogeny tree
        - GPU acceleration: Not applicable (R-based method)

    Examples:
        # Basic infercnvpy analysis
        analyze_cnv("data1", "cell_type", ["T cells", "B cells"])

        # Numbat analysis (requires allele data)
        analyze_cnv("data1", "cell_type", ["T cells", "B cells"],
                    method="numbat", numbat_genome="hg38")

        # With clustering
        analyze_cnv("data1", "leiden", ["0", "1"], cluster_cells=True)
    """
    # Validate dataset
    validate_dataset(data_id)

    # Create ToolContext for clean data access (no redundant dict wrapping)
    ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)

    # Create CNVParameters object
    # Type: ignore needed for Literal parameters validated at runtime by Pydantic
    params = CNVParameters(
        method=method,  # type: ignore[arg-type]
        reference_key=reference_key,
        reference_categories=reference_categories,
        window_size=window_size,
        step=step,
        exclude_chromosomes=exclude_chromosomes,
        dynamic_threshold=dynamic_threshold,
        cluster_cells=cluster_cells,
        dendrogram=dendrogram,
        numbat_genome=numbat_genome,  # type: ignore[arg-type]
        numbat_allele_data_key=numbat_allele_data_key,
        numbat_t=numbat_t,
        numbat_max_entropy=numbat_max_entropy,
        numbat_min_cells=numbat_min_cells,
        numbat_ncores=numbat_ncores,
        numbat_skip_nj=numbat_skip_nj,
    )

    # Lazy import CNV analysis tool
    from .tools.cnv_analysis import infer_cnv

    # Call CNV inference function with ToolContext
    result = await infer_cnv(data_id=data_id, ctx=ctx, params=params)

    # Note: No writeback needed - adata modifications are in-place on the same object

    # Save CNV result
    await data_manager.save_result(data_id, "cnv_analysis", result)

    return result


@mcp.tool(annotations=get_tool_annotations("analyze_velocity_data"))
@mcp_tool_error_handler()
async def analyze_velocity_data(
    data_id: str,
    params: RNAVelocityParameters = RNAVelocityParameters(),
    context: Optional[Context] = None,
) -> RNAVelocityResult:
    """Analyze RNA velocity to understand cellular dynamics

    Args:
        data_id: Dataset ID
        params: RNA velocity parameters

    Returns:
        RNA velocity analysis result

    Notes:
        Velocity methods (status):
        - scvelo: scVelo with three modes (implemented, tested)
            - deterministic: Deterministic rate model
            - stochastic: Stochastic rate model (default)
            - dynamical: Dynamical model with ODE fitting
        - velovi: VeloVI deep learning method (implemented, requires scvi-tools, tested)
    """
    # Validate dataset
    validate_dataset(data_id)

    # Create ToolContext for clean data access (no redundant dict wrapping)
    ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)

    # Lazy import velocity analysis tool
    from .tools.velocity import analyze_rna_velocity

    # Call RNA velocity function with ToolContext
    result = await analyze_rna_velocity(data_id, ctx, params)

    # Note: No writeback needed - adata modifications are in-place on the same object

    # Save velocity result
    await data_manager.save_result(data_id, "rna_velocity", result)

    # Visualization should be done separately via visualization tools

    return result


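# Illustrative sketch (editor's addition, not part of the released file):
# running scVelo's dynamical model, per the mode list above. The 'method' and
# 'mode' field names on RNAVelocityParameters are assumptions based on the
# docstring.
async def _example_dynamical_velocity() -> None:
    params = RNAVelocityParameters(method="scvelo", mode="dynamical")
    await analyze_velocity_data("data1", params=params)

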
@mcp.tool(annotations=get_tool_annotations("analyze_trajectory_data"))
@mcp_tool_error_handler()
async def analyze_trajectory_data(
    data_id: str,
    params: TrajectoryParameters = TrajectoryParameters(),
    context: Optional[Context] = None,
) -> TrajectoryResult:
    """Infer cellular trajectories and pseudotime

    Args:
        data_id: Dataset ID
        params: Trajectory analysis parameters

    Returns:
        Trajectory analysis result

    Notes:
        Trajectory methods (status):
        - dpt: Diffusion pseudotime (implemented)
        - palantir: Probabilistic trajectory inference (implemented when palantir installed)
        - cellrank: RNA velocity-based trajectory inference (implemented when cellrank installed)
        - velovi: scvi-tools VeloVI (implemented when scvi-tools available)
    """
    # Validate dataset
    validate_dataset(data_id)

    # Create ToolContext
    ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)

    # Lazy import trajectory function
    from .tools.trajectory import analyze_trajectory

    # Call trajectory function
    result = await analyze_trajectory(data_id, ctx, params)

    # Note: No writeback needed - adata modifications are in-place on the same object

    # Save trajectory result
    await data_manager.save_result(data_id, "trajectory", result)

    # Visualization should be done separately via visualization tools

    return result


@mcp.tool(annotations=get_tool_annotations("integrate_samples"))
@mcp_tool_error_handler()
async def integrate_samples(
    data_ids: list[str],
    params: IntegrationParameters = IntegrationParameters(),
    context: Optional[Context] = None,
) -> IntegrationResult:
    """Integrate multiple spatial transcriptomics samples

    Args:
        data_ids: List of dataset IDs to integrate
        params: Integration parameters

    Returns:
        Integration result with integrated dataset ID

    Notes:
        Integration methods (status):
        - harmony, bbknn, scanorama: Classical methods (implemented)
        - scvi: Deep learning method (implemented, requires scvi-tools)

        Removed methods:
        - multivi: Requires MuData format (not compatible with current workflow)
        - contrastivevi: Not integrated (designed for Perturb-seq use cases)
    """
    # Validate all datasets first
    for data_id in data_ids:
        validate_dataset(data_id)

    # Create ToolContext for clean data access
    ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)

    # Lazy import to avoid slow startup
    from .tools.integration import integrate_samples as integrate_func

    # Call integration function with ToolContext
    # Note: integrate_func uses ctx.add_dataset() to store the integrated dataset
    result = await integrate_func(data_ids, ctx, params)

    # Save integration result
    integrated_id = result.data_id
    await data_manager.save_result(integrated_id, "integration", result)

    return result


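# Illustrative sketch (editor's addition, not part of the released file):
# Harmony integration of two loaded samples. The 'method' field name on
# IntegrationParameters is an assumption based on the docstring above;
# result.data_id is the field read by the tool itself.
async def _example_harmony_integration() -> None:
    result = await integrate_samples(
        ["data1", "data2"],
        IntegrationParameters(method="harmony"),
    )
    print(result.data_id)  # ID of the newly stored integrated dataset

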
1171
+ @mcp.tool(annotations=get_tool_annotations("deconvolve_data"))
1172
+ @mcp_tool_error_handler()
1173
+ async def deconvolve_data(
1174
+ data_id: str,
1175
+ params: DeconvolutionParameters, # No default - LLM must provide parameters
1176
+ context: Optional[Context] = None,
1177
+ ) -> DeconvolutionResult:
1178
+ """Deconvolve spatial spots to estimate cell type proportions
1179
+
1180
+ Args:
1181
+ data_id: Dataset ID
1182
+ params: Deconvolution parameters including:
1183
+ - method: Deconvolution method to use
1184
+ - cell_type_key: Key in reference data for cell types (REQUIRED)
1185
+ - reference_data_id: Reference single-cell dataset ID (required for most methods)
1186
+
1187
+ Cell2location-specific parameters (official scvi-tools recommendations):
1188
+ Phase 1 (Critical fixes):
1189
+ - ref_model_epochs: Reference model training epochs (default: 250)
1190
+ - n_epochs: Cell2location model training epochs (default: 30000)
1191
+ - n_cells_per_spot: Expected cells per location (default: 30, tissue-dependent)
1192
+ - detection_alpha: RNA detection sensitivity (NEW DEFAULT 2024: 20, old: 200)
1193
+ - batch_key: Batch column for batch effect correction (default: None)
1194
+ - categorical_covariate_keys: Technical covariates list (default: None)
1195
+ - apply_gene_filtering: Apply official gene filtering (default: True)
1196
+ - gene_filter_*: Gene filtering thresholds (cell_count_cutoff=5, etc.)
1197
+
1198
+ Phase 2 (Training enhancements):
1199
+ - ref_model_lr: Reference model learning rate (default: 0.002)
1200
+ - cell2location_lr: Cell2location learning rate (default: 0.005)
1201
+ - ref_model_train_size: Training data fraction for ref model (default: 1.0)
1202
+ - cell2location_train_size: Training data fraction for cell2location (default: 1.0)
1203
+ - enable_qc_plots: Generate QC diagnostic plots (default: False)
1204
+ - qc_output_dir: Output directory for QC plots (default: None)
1205
+
1206
+ Phase 3 (Runtime optimization):
1207
+ - early_stopping: Enable early stopping to reduce training time (default: True)
1208
+ - early_stopping_patience: Epochs to wait before stopping (default: 45)
1209
+ - early_stopping_threshold: Minimum relative change threshold (default: 0.0)
1210
+ - use_aggressive_training: Use train_aggressive() for better convergence (default: True)
1211
+ - validation_size: Validation set fraction for early stopping (default: 0.1)
1212
+
1213
+ Returns:
1214
+ Deconvolution result with cell type proportions
1215
+
1216
+ Notes:
1217
+ Deconvolution methods (status):
1218
+ - cell2location, destvi, stereoscope, tangram: Implemented when scvi-tools available
1219
+ - rctd: Implemented via rpy2/R when R packages are installed (spacexr)
1220
+ * Supports 3 modes: 'doublet' (high-res), 'full' (low-res, default), 'multi' (greedy)
1221
+ * Mode selection via rctd_mode parameter
1222
+ * Reference: Cable et al. (2022) Nat. Biotechnol.
1223
+ - spotlight: Implemented via rpy2/R when R packages are installed
1224
+ - card: Implemented via rpy2/R when CARD package is installed
1225
+ * Unique feature: Models spatial correlation of cell type compositions via CAR model
1226
+ * Optional imputation: Create enhanced high-resolution spatial maps
1227
+ * Parameters: card_imputation, card_NumGrids, card_ineibor, card_minCountGene, card_minCountSpot
1228
+ * Reference: Ma & Zhou (2022) Nat. Biotechnol.
1229
+
1230
+ RCTD-specific notes:
1231
+ - Method: Robust decomposition of cell type mixtures using platform-free approach
1232
+ - Mode selection guide:
1233
+ * 'doublet': For high-resolution data (Slide-seq ~10μm, MERFISH, Visium HD)
1234
+ - Assigns 1-2 cell types per spot, identifies singlets vs doublets
1235
+ * 'full' (default): For low-resolution data (standard Visium 55μm spots)
1236
+ - Can assign any number of cell types, best for multi-cellular spots
1237
+ * 'multi': Greedy algorithm alternative to 'full'
1238
+ - More constrained than 'full', useful for intermediate resolutions
1239
+ - Additional parameters: rctd_confidence_threshold, rctd_doublet_threshold, max_cores
1240
+
+ CARD-specific notes:
+ - Method: Spatially informed cell type deconvolution using a CAR (conditional autoregressive) model
+ - Unique capability: Models spatial correlation of cell type compositions across tissue locations
+ - Imputation feature (optional via card_imputation=True):
+ * Creates enhanced spatial maps at arbitrarily higher resolution than the original measurement
+ * Imputes cell type compositions and gene expression at unmeasured locations
+ * Fast: ~0.4 s for all genes (reported 5816x faster than BayesSpace)
+ * Use cases: Enhance Visium to near-cellular resolution, fill tissue gaps, smooth artifacts
+ - Imputation parameters:
+ * card_NumGrids: Number of grid points (2000=standard, 5000=high-res, 10000=ultra)
+ * card_ineibor: Neighbors for smoothing (10=default, higher=smoother)
+ - Quality control: card_minCountGene, card_minCountSpot
+ - Multi-sample support: card_sample_key for batch effects
+ - Visualization: Use plot_type='card_imputation' to visualize imputed results (see the example below)
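+ - Example (minimal parameter sketch; assumes a 'method' field on the parameter model):
+ params = {
+ "method": "card",
+ "reference_data_id": "ref1",  # hypothetical reference dataset ID
+ "card_imputation": True,
+ "card_NumGrids": 5000,  # high-resolution imputation grid
+ }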
+
+ Cell2location uses two-stage training (sketched below):
+ 1. Reference model (NB regression): Learns cell type signatures (ref_model_epochs, default 250)
+ 2. Cell2location model: Maps cell types to spatial locations (n_epochs, default 30000)
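+
+ API sketch of the two stages (illustrative only; this follows the public cell2location tutorials rather than this server's exact internals, and adata_ref/adata_vis plus the "batch"/"cell_type" keys are placeholders):
+ import cell2location
+ from cell2location.utils.filtering import filter_genes
+ # Official gene filtering (apply_gene_filtering / gene_filter_* thresholds)
+ selected = filter_genes(adata_ref, cell_count_cutoff=5, cell_percentage_cutoff2=0.03, nonz_mean_cutoff=1.12)
+ adata_ref = adata_ref[:, selected].copy()
+ # Stage 1: NB regression reference model (ref_model_epochs)
+ cell2location.models.RegressionModel.setup_anndata(adata=adata_ref, batch_key="batch", labels_key="cell_type")
+ ref_model = cell2location.models.RegressionModel(adata_ref)
+ ref_model.train(max_epochs=250)
+ adata_ref = ref_model.export_posterior(adata_ref)
+ # inf_aver: per-cell-type signature matrix taken from adata_ref.varm after export_posterior
+ # Stage 2: map signatures onto locations (n_epochs, n_cells_per_spot, detection_alpha)
+ cell2location.models.Cell2location.setup_anndata(adata=adata_vis)
+ c2l_model = cell2location.models.Cell2location(adata_vis, cell_state_df=inf_aver, N_cells_per_location=30, detection_alpha=20)
+ c2l_model.train(max_epochs=30000)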
+ """
+ # Validate dataset
+ validate_dataset(data_id)
+
+ # Validate reference data if provided
+ if params.reference_data_id and not data_manager.dataset_exists(
+ params.reference_data_id
+ ):
+ raise DataNotFoundError(
+ f"Reference dataset {params.reference_data_id} not found"
+ )
+
+ # Create ToolContext for clean data access (no redundant dict wrapping)
+ ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)
+
+ # Lazy import deconvolution tool
+ from .tools.deconvolution import deconvolve_spatial_data
+
+ # Call deconvolution function with ToolContext
+ result = await deconvolve_spatial_data(data_id, ctx, params)
+
+ # Note: No writeback needed - adata modifications are in-place on the same object
+
+ # Save deconvolution result
+ await data_manager.save_result(data_id, "deconvolution", result)
+
+ # Visualization should be done separately via visualization tools
+
+ return result
+
+
+ @mcp.tool(annotations=get_tool_annotations("identify_spatial_domains"))
+ @mcp_tool_error_handler()
+ async def identify_spatial_domains(
+ data_id: str,
+ params: SpatialDomainParameters = SpatialDomainParameters(),
+ context: Optional[Context] = None,
+ ) -> SpatialDomainResult:
+ """Identify spatial domains and tissue architecture
+
+ Args:
+ data_id: Dataset ID
+ params: Spatial domain parameters
+
+ Returns:
+ Spatial domain result with identified domains
+
+ Notes:
+ Spatial domain methods (status):
+ - spagcn: SpaGCN graph convolutional network (implemented; optional dependency SpaGCN)
+ - leiden / louvain: clustering-based (implemented; no extra dependencies)
+ - stagate: STAGATE graph attention auto-encoder (implemented; optional dependency STAGATE)
+ - graphst: GraphST graph self-supervised contrastive learning (implemented; optional dependency GraphST)
+ - stlearn / sedr / bayesspace: not implemented in this server; planned/experimental
+ """
1314
+ # Validate dataset first
1315
+ validate_dataset(data_id)
1316
+
1317
+ # Create ToolContext for clean data access
1318
+ ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)
1319
+
1320
+ # Lazy import to avoid slow startup
1321
+ from .tools.spatial_domains import identify_spatial_domains as identify_domains_func
1322
+
1323
+ # Call spatial domains function with ToolContext
1324
+ result = await identify_domains_func(data_id, ctx, params)
1325
+
1326
+ # Note: No writeback needed - adata modifications are in-place on the same object
1327
+
1328
+ # Save spatial domains result
1329
+ await data_manager.save_result(data_id, "spatial_domains", result)
1330
+
1331
+ return result
1332
+
1333
+
1334
+ @mcp.tool(annotations=get_tool_annotations("analyze_cell_communication"))
1335
+ @mcp_tool_error_handler()
1336
+ async def analyze_cell_communication(
1337
+ data_id: str,
1338
+ params: CellCommunicationParameters, # No default - LLM must provide parameters
1339
+ context: Optional[Context] = None,
1340
+ ) -> CellCommunicationResult:
1341
+ """Analyze cell-cell communication patterns
1342
+
1343
+ Args:
1344
+ data_id: Dataset ID
1345
+ params: Cell communication parameters
1346
+
1347
+ Returns:
1348
+ Cell communication analysis result
1349
+
1350
+ Notes:
1351
+ Cell communication methods (status):
1352
+ - liana: Implemented (global/cluster and spatial bivariate modes; requires liana)
1353
+ - cellphonedb: Implemented (statistical analysis with spatial microenvironments; requires cellphonedb)
1354
+ - cellchat_r: Implemented (native R CellChat with full features; requires rpy2 and CellChat R package)
1355
+ - nichenet / connectome / cytotalk / squidpy: Not implemented in this server
1356
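+
+ Example (minimal sketch; assumes a 'method' field on CellCommunicationParameters):
+ params = {"method": "liana", "species": "human", "liana_resource": "consensus"}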
+
+ IMPORTANT: For comprehensive cell communication analysis:
+
+ **Species-specific configuration:**
+ - species="mouse" + liana_resource="mouseconsensus" for mouse data
+ - species="human" + liana_resource="consensus" for human data
+ - species="zebrafish" for zebrafish data
+
+ **Available LIANA resources (liana_resource parameter):**
+ - "consensus" (default, recommended): Consensus of multiple databases
+ - "mouseconsensus": Mouse-specific consensus database
+ - "cellphonedb": CellPhoneDB database (curated, stringent)
+ - "celltalkdb": CellTalkDB database (large, comprehensive)
+ - "icellnet": iCellNet database (immune cell focus)
+ - "cellchatdb": CellChat database
+ - "connectomedb2020": Connectome database 2020
+ - "baccin2019", "cellcall", "cellinker", "embrace", "guide2pharma",
+ "hpmr", "italk", "kirouac2010", "lrdb", "ramilowski2015": Additional resources
+
+ **Common failure scenarios and solutions:**
+ 1. "Too few features from resource found in data":
+ - When available, adata.raw is used automatically for comprehensive gene coverage
+ - Ensure species matches the data (mouse vs. human)
+ - Use a species-appropriate resource (mouseconsensus for mouse)
+
+ 2. Missing spatial connectivity:
+ - Run spatial neighbor computation in the preprocessing step (see below)
+
+ 3. Missing cell type annotations:
+ - Ensure the cell_type_key column exists, or run annotation first
+
+ **Spatial connectivity computation (preprocessing step):**
+
+ The spatial neighborhood definition profoundly impacts cell communication analysis results.
+ Choose parameters based on your spatial transcriptomics platform and biological question:
+
+ **Platform-specific recommendations:**
+
+ 10x Visium (hexagonal grid, 55 µm spots, 100 µm center-to-center spacing):
+ • coord_type: "grid" (for hexagonal layout) or "generic" (for custom)
+ • n_neighs: 6 (direct neighbors in the hexagonal grid)
+ • n_rings: 1-2 (grid mode: 1=first ring only, 2=first+second ring)
+ • radius: 150-200 pixels (distance-based; roughly captures the first neighbor ring)
+ ├─ Local interactions (paracrine signaling): n_neighs=6 or n_rings=1
+ ├─ Microenvironment analysis: n_neighs=12-18 or n_rings=2
+ └─ Broader spatial context: radius=300-500 pixels
+
+ Slide-seq/Slide-seqV2 (10 µm beads, high density):
+ • coord_type: "generic"
+ • n_neighs: 10-30 (higher density requires more neighbors)
+ • radius: 50-100 µm (typical cell-cell signaling range)
+ ├─ Dense regions: n_neighs=20-30
+ ├─ Sparse regions: n_neighs=10-15
+ └─ Distance-based: radius=50-100 µm (matches biological signaling range)
+
+ MERFISH/seqFISH+ (single-cell resolution, <1 µm precision):
+ • coord_type: "generic"
+ • n_neighs: 3-10 (nearest cell neighbors)
+ • radius: 20-50 µm (direct cell-cell contact to short-range paracrine)
+ ├─ Direct contact: n_neighs=3-5 or radius=10-20 µm
+ ├─ Paracrine signaling: n_neighs=5-10 or radius=30-50 µm
+ └─ Microenvironment: radius=50-100 µm
+
+ **Biological considerations:**
+
+ Cell communication distance ranges (from the literature):
+ • Juxtacrine signaling: 0-10 µm (direct contact)
+ • Paracrine signaling: 10-100 µm (e.g., Wnt/Wg: ~50-100 µm)
+ • Broader microenvironment: 100-500 µm
+
+ Analysis goal-based selection:
+ • Identify direct cell-cell interactions → use smaller neighborhoods (n_neighs=6-10, radius=50-100 µm)
+ • Study tissue microenvironments → use larger neighborhoods (n_neighs=15-30, radius=200-500 µm)
+ • Rare cell type interactions → use an adaptive/larger k to avoid missing signals
+ • Abundant cell types → use a smaller k to avoid spurious connections
+
+ **Parameter tradeoffs:**
+ • Larger neighborhoods: capture long-range signals but lose spatial specificity
+ • Smaller neighborhoods: high spatial precision but may miss important interactions
+ • Fixed k (n_neighs): same count for every spot, so the effective neighborhood radius varies with local density
+ • Distance-based (radius): more biologically meaningful, but neighbor counts vary per spot
+
+ **Examples:**
+
+ Visium - local paracrine signaling:
+ # Step 1: Compute spatial neighbors (preprocessing)
+ import squidpy as sq
+ sq.gr.spatial_neighbors(adata, coord_type='grid', n_rings=1)
+
+ # Step 2: Analyze communication
+ params = {
+ "species": "human",
+ "liana_resource": "consensus"
+ }
+
+ Visium - microenvironment analysis:
+ # Step 1: Compute spatial neighbors (preprocessing)
+ import squidpy as sq
+ sq.gr.spatial_neighbors(adata, coord_type='generic', n_neighs=18)
+
+ # Step 2: Analyze communication
+ params = {
+ "species": "human"
+ }
+
+ MERFISH - direct cell-cell contact:
+ # Step 1: Compute spatial neighbors (preprocessing)
+ import squidpy as sq
+ sq.gr.spatial_neighbors(adata, coord_type='generic', radius=20)
+
+ # Step 2: Analyze communication
+ params = {
+ "species": "mouse",
+ "liana_resource": "mouseconsensus"
+ }
+
+ **References:**
+ • Squidpy framework: Palla et al., Nat Methods 2022
+ • LIANA+: Dimitrov et al., Nat Cell Biol 2024
+ • Visium resolution: 10x Genomics Technical Note
+ • Signaling ranges: literature-based (Wnt/Wg: ~50-100 µm)
+ """
+ # Validate dataset first
+ validate_dataset(data_id)
+
+ # Create ToolContext for clean data access
+ ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)
+
+ # Lazy import to avoid slow startup
+ from .tools.cell_communication import (
+ analyze_cell_communication as analyze_comm_func,
+ )
+
+ # Call cell communication function with ToolContext
+ result = await analyze_comm_func(data_id, ctx, params)
+
+ # Note: No writeback needed - adata modifications are in-place on the same object
+
+ # Save communication result
+ await data_manager.save_result(data_id, "cell_communication", result)
+
+ # Visualization should be done separately via visualization tools
+
+ return result
+
+
+ @mcp.tool(annotations=get_tool_annotations("analyze_enrichment"))
+ @mcp_tool_error_handler()
+ async def analyze_enrichment(
+ data_id: str,
+ params: Optional[EnrichmentParameters] = None,
+ context: Optional[Context] = None,
+ ) -> EnrichmentResult:
+ """Perform gene set enrichment analysis
+
+ Args:
+ data_id: Dataset ID
+ params: Enrichment analysis parameters (REQUIRED: species must be specified)
+
+ Returns:
+ Enrichment analysis result
+
+ IMPORTANT - Species and Database Selection:
+ You MUST specify the 'species' parameter explicitly. No default species is assumed.
+
+ Recommended database combinations by species:
+
+ FOR MOUSE DATA (species="mouse"):
+ - "KEGG_Pathways" (recommended, uses KEGG_2019_Mouse internally)
+ - "Reactome_Pathways" (comprehensive pathway database)
+ - "MSigDB_Hallmark" (curated hallmark gene sets)
+ - "GO_Biological_Process" (works but may have fewer matches)
+
+ FOR HUMAN DATA (species="human"):
+ - "KEGG_Pathways" (recommended, uses KEGG_2021_Human internally)
+ - "Reactome_Pathways" (comprehensive pathway database)
+ - "MSigDB_Hallmark" (curated hallmark gene sets)
+ - "GO_Biological_Process" (standard GO terms)
+
+ Available gene_set_database options:
+ - "GO_Biological_Process" (default, auto-adapts to species)
+ - "GO_Molecular_Function" (GO molecular function terms)
+ - "GO_Cellular_Component" (GO cellular component terms)
+ - "KEGG_Pathways" (species-specific: KEGG_2021_Human or KEGG_2019_Mouse)
+ - "Reactome_Pathways" (Reactome_2022 pathway database)
+ - "MSigDB_Hallmark" (MSigDB_Hallmark_2020 curated gene sets)
+ - "Cell_Type_Markers" (cell type marker genes)
+ - Custom gene sets via the gene_sets parameter
+
+ Methods available:
+ - "pathway_ora": Over-representation analysis (recommended)
+ - "pathway_enrichr": Enrichr web service
+ - "pathway_gsea": Gene Set Enrichment Analysis
+ - "pathway_ssgsea": Single-sample GSEA
+ - "spatial_enrichmap": Spatial enrichment mapping
+
+ Complete results are preserved in adata.uns for downstream visualization and analysis.
+
+ Example usage:
+ For mouse data: params={"species": "mouse", "gene_set_database": "KEGG_Pathways"}
+ For human data: params={"species": "human", "gene_set_database": "KEGG_Pathways"}
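+ For GSEA (sketch; assumes a 'method' field on EnrichmentParameters): params={"species": "human", "method": "pathway_gsea"}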
+ """
+ from .tools.enrichment import analyze_enrichment as analyze_enrichment_func
+
+ # Validate dataset
+ validate_dataset(data_id)
+
+ # Create ToolContext
+ ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)
+
+ # Call enrichment analysis (all business logic is in tools/enrichment.py)
+ result = await analyze_enrichment_func(data_id, ctx, params)
+
+ # Save result
+ await data_manager.save_result(data_id, "enrichment", result)
+
+ return result
+
+
+ @mcp.tool(annotations=get_tool_annotations("find_spatial_genes"))
+ @mcp_tool_error_handler()
+ async def find_spatial_genes(
+ data_id: str,
+ params: SpatialVariableGenesParameters = SpatialVariableGenesParameters(),
+ context: Optional[Context] = None,
+ ) -> SpatialVariableGenesResult:
+ """Identify spatially variable genes using various methods
+
+ Args:
+ data_id: Dataset ID
+ params: Spatial variable gene parameters
+
+ Returns:
+ Spatial variable genes result
+
+ Notes:
+ Available methods:
+ - sparkx: SPARK-X non-parametric method (default; best accuracy)
+ - spatialde: SpatialDE Gaussian process-based method (statistically rigorous)
+
+ Method selection via the params.method parameter.
+ Each method has specific parameters; see the SpatialVariableGenesParameters model.
+
+ Performance comparison (3000 spots × 20000 genes):
+ - SPARK-X: ~2-5 min (best accuracy)
+ - SpatialDE: ~15-30 min (best statistical rigor)
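+
+ Example usage:
+ params = {"method": "sparkx"}  # default; fastest on large datasets
+ params = {"method": "spatialde"}  # Gaussian-process test; slower but statistically rigorous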
+ """
+ # Validate dataset
+ validate_dataset(data_id)
+
+ # Create ToolContext for clean data access (no redundant dict wrapping)
+ ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)
+
+ # Lazy import spatial genes tool
+ from .tools.spatial_genes import identify_spatial_genes
+
+ # Call spatial genes function with ToolContext
+ result = await identify_spatial_genes(data_id, ctx, params)
+
+ # Note: No writeback needed - adata modifications are in-place on the same object
+
+ # Save spatial genes result
+ await data_manager.save_result(data_id, "spatial_genes", result)
+
+ # Visualization should be done separately via visualization tools
+
+ return result
+
+
+ @mcp.tool(annotations=get_tool_annotations("register_spatial_data"))
+ @mcp_tool_error_handler()
+ async def register_spatial_data(
+ source_id: str,
+ target_id: str,
+ method: str = "paste",
+ landmarks: Optional[list[dict[str, Any]]] = None,
+ context: Optional[Context] = None,
+ ) -> dict[str, Any]:
+ """Register/align spatial transcriptomics data across sections
+
+ Args:
+ source_id: Source dataset ID
+ target_id: Target dataset ID to align to
+ method: Registration method (paste, stalign)
+ landmarks: Optional landmark correspondences for methods that support them
+ (accepted but not currently forwarded by this wrapper)
+
+ Returns:
+ Registration result with transformation matrix
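+
+ Example:
+ register_spatial_data(source_id="data1", target_id="data2", method="paste")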
+ """
+ # Validate datasets first
+ validate_dataset(source_id)
+ validate_dataset(target_id)
+
+ # Create ToolContext for unified data access
+ ctx = ToolContext(_data_manager=data_manager, _mcp_context=context)
+
+ # Lazy import to avoid slow startup
+ from .tools.spatial_registration import register_spatial_slices_mcp
+
+ # Call registration function using ToolContext
+ # Note: registration modifies adata in-place, changes reflected via reference
+ result = await register_spatial_slices_mcp(source_id, target_id, ctx, method)
+
+ # Save registration result
+ await data_manager.save_result(source_id, "registration", result)
+
+ return result
+
+
+ # ============== Publication Export Tools ==============
+
+
+ @mcp.tool(annotations=get_tool_annotations("save_data"))
+ @mcp_tool_error_handler()
+ async def save_data(
+ data_id: str,
+ output_path: Optional[str] = None,
+ context: Optional[Context] = None,
+ ) -> str:
+ """Manually save dataset to disk
+
+ Saves the current state of the dataset, including all analysis results
+ and metadata, to a compressed H5AD file.
+
+ Args:
+ data_id: Dataset ID to save
+ output_path: Optional custom save path. If not provided, saves to:
+ - the CHATSPATIAL_DATA_DIR environment variable location, or
+ - a .chatspatial_saved/ directory next to the original data
+
+ Returns:
+ Path where data was saved
+
+ Examples:
+ # Save to default location
+ save_data("data1")
+
+ # Save to custom location
+ save_data("data1", output_path="/path/to/save/my_analysis.h5ad")
+
+ Note:
+ Saved files include all preprocessing, analysis results, and metadata.
+ Use the CHATSPATIAL_DATA_DIR environment variable for centralized storage.
+ """
+ from .utils.persistence import save_adata
+
+ # Validate dataset exists
+ validate_dataset(data_id)
+
+ if context:
+ await context.info(f"Saving dataset '{data_id}'...")
+
+ # Get dataset info
+ dataset_info = await data_manager.get_dataset(data_id)
+ adata = dataset_info["adata"]
+ original_path = dataset_info.get("path", "")
+
+ try:
+ if output_path:
+ # User specified custom path
+ from pathlib import Path
+
+ # Resolve to absolute path to avoid confusion about save location
+ save_path = Path(output_path).resolve()
+ save_path.parent.mkdir(parents=True, exist_ok=True)
+ adata.write_h5ad(save_path, compression="gzip", compression_opts=4)
+ else:
+ # Use default location
+ save_path = save_adata(data_id, adata, original_path)
+
+ # Always return absolute path so user knows exact location
+ absolute_path = save_path.resolve()
+
+ if context:
+ await context.info(f"Dataset saved to: {absolute_path}")
+
+ return f"Dataset '{data_id}' saved to: {absolute_path}"
+
+ except Exception as e:
+ error_msg = f"Failed to save dataset: {e}"
+ if context:
+ await context.error(error_msg)
+ raise
+
+
+ def main():
+ """Run the MCP server"""
+ import argparse
+
+ parser = argparse.ArgumentParser(description="ChatSpatial MCP Server")
+ parser.add_argument(
+ "--transport",
+ choices=["stdio", "sse"],
+ default="stdio",
+ help="Transport protocol to use (default: stdio)",
+ )
+
+ args = parser.parse_args()
+
+ print(
+ f"Starting ChatSpatial server with {args.transport} transport...",
+ file=sys.stderr,
+ )
+ mcp.run(transport=args.transport)
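+
+ # Example: launch this module directly, selecting the transport flag defined above:
+ #   python -m chatspatial.server --transport sse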
+
+
+ if __name__ == "__main__":
+ main()