chatspatial 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. chatspatial/__init__.py +11 -0
  2. chatspatial/__main__.py +141 -0
  3. chatspatial/cli/__init__.py +7 -0
  4. chatspatial/config.py +53 -0
  5. chatspatial/models/__init__.py +85 -0
  6. chatspatial/models/analysis.py +513 -0
  7. chatspatial/models/data.py +2462 -0
  8. chatspatial/server.py +1763 -0
  9. chatspatial/spatial_mcp_adapter.py +720 -0
  10. chatspatial/tools/__init__.py +3 -0
  11. chatspatial/tools/annotation.py +1903 -0
  12. chatspatial/tools/cell_communication.py +1603 -0
  13. chatspatial/tools/cnv_analysis.py +605 -0
  14. chatspatial/tools/condition_comparison.py +595 -0
  15. chatspatial/tools/deconvolution/__init__.py +402 -0
  16. chatspatial/tools/deconvolution/base.py +318 -0
  17. chatspatial/tools/deconvolution/card.py +244 -0
  18. chatspatial/tools/deconvolution/cell2location.py +326 -0
  19. chatspatial/tools/deconvolution/destvi.py +144 -0
  20. chatspatial/tools/deconvolution/flashdeconv.py +101 -0
  21. chatspatial/tools/deconvolution/rctd.py +317 -0
  22. chatspatial/tools/deconvolution/spotlight.py +216 -0
  23. chatspatial/tools/deconvolution/stereoscope.py +109 -0
  24. chatspatial/tools/deconvolution/tangram.py +135 -0
  25. chatspatial/tools/differential.py +625 -0
  26. chatspatial/tools/embeddings.py +298 -0
  27. chatspatial/tools/enrichment.py +1863 -0
  28. chatspatial/tools/integration.py +807 -0
  29. chatspatial/tools/preprocessing.py +723 -0
  30. chatspatial/tools/spatial_domains.py +808 -0
  31. chatspatial/tools/spatial_genes.py +836 -0
  32. chatspatial/tools/spatial_registration.py +441 -0
  33. chatspatial/tools/spatial_statistics.py +1476 -0
  34. chatspatial/tools/trajectory.py +495 -0
  35. chatspatial/tools/velocity.py +405 -0
  36. chatspatial/tools/visualization/__init__.py +155 -0
  37. chatspatial/tools/visualization/basic.py +393 -0
  38. chatspatial/tools/visualization/cell_comm.py +699 -0
  39. chatspatial/tools/visualization/cnv.py +320 -0
  40. chatspatial/tools/visualization/core.py +684 -0
  41. chatspatial/tools/visualization/deconvolution.py +852 -0
  42. chatspatial/tools/visualization/enrichment.py +660 -0
  43. chatspatial/tools/visualization/integration.py +205 -0
  44. chatspatial/tools/visualization/main.py +164 -0
  45. chatspatial/tools/visualization/multi_gene.py +739 -0
  46. chatspatial/tools/visualization/persistence.py +335 -0
  47. chatspatial/tools/visualization/spatial_stats.py +469 -0
  48. chatspatial/tools/visualization/trajectory.py +639 -0
  49. chatspatial/tools/visualization/velocity.py +411 -0
  50. chatspatial/utils/__init__.py +115 -0
  51. chatspatial/utils/adata_utils.py +1372 -0
  52. chatspatial/utils/compute.py +327 -0
  53. chatspatial/utils/data_loader.py +499 -0
  54. chatspatial/utils/dependency_manager.py +462 -0
  55. chatspatial/utils/device_utils.py +165 -0
  56. chatspatial/utils/exceptions.py +185 -0
  57. chatspatial/utils/image_utils.py +267 -0
  58. chatspatial/utils/mcp_utils.py +137 -0
  59. chatspatial/utils/path_utils.py +243 -0
  60. chatspatial/utils/persistence.py +78 -0
  61. chatspatial/utils/scipy_compat.py +143 -0
  62. chatspatial-1.1.0.dist-info/METADATA +242 -0
  63. chatspatial-1.1.0.dist-info/RECORD +67 -0
  64. chatspatial-1.1.0.dist-info/WHEEL +5 -0
  65. chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
  66. chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
  67. chatspatial-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,836 @@
1
+ """
2
+ Spatial Variable Genes (SVG) identification for ChatSpatial MCP.
3
+
4
+ This module provides implementations for SVG detection methods including SpatialDE and SPARK-X,
5
+ enabling comprehensive spatial transcriptomics analysis. Each method offers distinct advantages
6
+ for identifying genes with spatial expression patterns.
7
+
8
+ Methods Overview:
9
+ - SPARK-X (default): Non-parametric statistical method, best accuracy, requires R
10
+ - SpatialDE: Gaussian process-based kernel method, statistically rigorous
11
+
12
+ The module integrates these tools into the ChatSpatial MCP framework, handling data preparation,
13
+ execution, result formatting, and error management across different computational backends.
14
+ """
15
+
16
+ from typing import TYPE_CHECKING, Any
17
+
18
+ if TYPE_CHECKING:
19
+ from ..spatial_mcp_adapter import ToolContext
20
+
21
+ from collections import Counter
22
+
23
+ import numpy as np
24
+ import pandas as pd
25
+ import scipy.sparse as sp
26
+
27
+ from ..models.analysis import SpatialVariableGenesResult # noqa: E402
28
+ from ..models.data import SpatialVariableGenesParameters # noqa: E402
29
+ from ..utils import validate_var_column # noqa: E402
30
+ from ..utils.adata_utils import require_spatial_coords, to_dense # noqa: E402
31
+ from ..utils.dependency_manager import require # noqa: E402
32
+ from ..utils.exceptions import DataNotFoundError # noqa: E402
33
+ from ..utils.exceptions import DataError, ParameterError, ProcessingError
34
+ from ..utils.mcp_utils import suppress_output # noqa: E402
35
+
36
+ # =============================================================================
37
+ # Shared Utilities for Spatial Variable Gene Detection
38
+ # =============================================================================
39
+
40
+ # Default limit for spatial_genes list returned to LLM
41
+ # Full results stored in adata.var for complete access
42
+ DEFAULT_TOP_GENES_LIMIT = 500
43
+
44
+
45
+ def _ensure_unique_gene_names(gene_names: list[str]) -> list[str]:
46
+ """Ensure gene names are unique by adding suffixes to duplicates.
47
+
48
+ Required for R-based methods (SPARK-X) that use gene names as rownames.
49
+
50
+ Args:
51
+ gene_names: List of gene names (may contain duplicates)
52
+
53
+ Returns:
54
+ List of unique gene names with suffixes added to duplicates
55
+ """
56
+ if len(gene_names) == len(set(gene_names)):
57
+ return gene_names
58
+
59
+ gene_counts = Counter(gene_names)
60
+ unique_names = []
61
+ seen_counts: dict[str, int] = {}
62
+
63
+ for gene in gene_names:
64
+ if gene_counts[gene] > 1:
65
+ if gene not in seen_counts:
66
+ seen_counts[gene] = 0
67
+ unique_names.append(gene)
68
+ else:
69
+ seen_counts[gene] += 1
70
+ unique_names.append(f"{gene}_{seen_counts[gene]}")
71
+ else:
72
+ unique_names.append(gene)
73
+
74
+ return unique_names
75
+
76
+
77
+ def _calculate_sparse_gene_stats(X) -> tuple[np.ndarray, np.ndarray]:
78
+ """Calculate gene statistics on sparse or dense matrix.
79
+
80
+ Efficiently computes gene totals and expression counts without densifying
81
+ the entire matrix.
82
+
83
+ Args:
84
+ X: Gene expression matrix (cells × genes), sparse or dense
85
+
86
+ Returns:
87
+ Tuple of (gene_totals, n_expressed_per_gene) as 1D arrays
88
+ """
89
+ is_sparse = sp.issparse(X)
90
+
91
+ if is_sparse:
92
+ gene_totals = np.array(X.sum(axis=0)).flatten()
93
+ n_expressed = np.array((X > 0).sum(axis=0)).flatten()
94
+ else:
95
+ gene_totals = np.asarray(X.sum(axis=0)).flatten()
96
+ n_expressed = np.asarray((X > 0).sum(axis=0)).flatten()
97
+
98
+ return gene_totals, n_expressed
99
+
100
+
101
async def identify_spatial_genes(
    data_id: str,
    ctx: "ToolContext",
    params: SpatialVariableGenesParameters,
) -> SpatialVariableGenesResult:
    """
    Entry point for spatially variable gene (SVG) detection.

    Loads the dataset through the tool context, checks that spatial
    coordinates are present under ``params.spatial_key``, and then hands off
    to the backend selected by ``params.method``:

    - ``"sparkx"``: SPARK-X (R-based, accessed via rpy2)
    - ``"spatialde"``: SpatialDE (expects raw count data)

    Args:
        data_id: Dataset identifier in data store
        ctx: ToolContext used to fetch the AnnData object (and for logging
            inside the backends)
        params: Method selection and method-specific settings
            (see SpatialVariableGenesParameters)

    Returns:
        The SpatialVariableGenesResult produced by the selected backend.

    Raises:
        ParameterError: If ``params.method`` is neither ``"spatialde"`` nor
            ``"sparkx"``.

    Any exception raised by ``ctx.get_adata``, ``require_spatial_coords`` or
    the selected backend propagates unchanged.
    """
    # Fetch the dataset and make sure it carries spatial coordinates
    adata = await ctx.get_adata(data_id)
    require_spatial_coords(adata, spatial_key=params.spatial_key)

    # Dispatch with guard clauses; anything unrecognised falls through to the error
    if params.method == "sparkx":
        return await _identify_spatial_genes_sparkx(data_id, adata, params, ctx)
    if params.method == "spatialde":
        return await _identify_spatial_genes_spatialde(data_id, adata, params, ctx)

    raise ParameterError(
        f"Unsupported method: {params.method}. Available methods: spatialde, sparkx"
    )
154
+
155
+
156
async def _identify_spatial_genes_spatialde(
    data_id: str,
    adata: Any,
    params: SpatialVariableGenesParameters,
    ctx: "ToolContext",
) -> SpatialVariableGenesResult:
    """
    Identify spatial variable genes using the SpatialDE statistical framework.

    Workflow implemented here (follows the SpatialDE README recipe):
        1. Take counts from adata.raw when present, otherwise adata.X
           (rejected if its maximum is <= 10, which is treated as normalized).
        2. Drop genes with total counts < 3 (computed without densifying).
        3. If params.n_top_genes is set and smaller than the number of
           remaining genes, keep the first n_top_genes highly variable genes
           when adata.var["highly_variable"] flags enough of them; otherwise
           keep the n_top_genes genes with the highest total counts.
        4. Densify only the selected genes.
        5. NaiveDE.stabilize, then NaiveDE.regress_out on log(total_counts).
        6. SpatialDE.run on the first two spatial coordinates.
        7. Storey q-values via SpatialDE.util.qvalue (params.spatialde_pi0 is
           forwarded when given, otherwise pi0 is estimated by qvalue).

    Side effects:
        - Writes adata.var["spatialde_pval"], ["spatialde_qval"] and
          ["spatialde_l"] (aligned on gene name; genes without a result
          receive NaN).
        - Records parameters and summary statistics via
          store_analysis_metadata.
        - Emits a ctx.warning when more than 5000 genes are tested.

    Args:
        data_id: Dataset identifier, used in the result and results key.
        adata: AnnData-like object with obsm[params.spatial_key], var, X and
            optionally raw.
        params: Uses spatial_key, n_top_genes, spatialde_pi0 and
            spatialde_kernel.
        ctx: ToolContext used for warnings.

    Returns:
        SpatialVariableGenesResult with the genes at q-value < 0.05 (sorted by
        q-value, truncated to params.n_top_genes or DEFAULT_TOP_GENES_LIMIT),
        the total number of significant genes and the number of genes tested.

    Raises:
        DataError: If adata.raw is absent and adata.X looks normalized.

    NOTE(review): params.spatialde_kernel is only recorded in the stored
    metadata; it is not passed to SpatialDE.run in this function.
    NOTE(review): results_key is returned in the result, but this function
    does not itself write anything under that key.

    References:
        Svensson et al. (2018) "SpatialDE: identification of spatially variable genes"
        Nature Methods, DOI: 10.1038/nmeth.4636
        Official tutorial: https://github.com/Teichlab/SpatialDE
    """
    # Use centralized dependency manager for consistent error handling
    require("spatialde")  # Raises ImportError with install instructions if missing

    # Apply scipy compatibility patch for SpatialDE (scipy >= 1.14 removed scipy.misc.derivative)
    from ..utils.scipy_compat import patch_scipy_misc_derivative

    patch_scipy_misc_derivative()

    import NaiveDE
    import SpatialDE
    from SpatialDE.util import qvalue

    # Prepare spatial coordinates
    coords = pd.DataFrame(
        adata.obsm[params.spatial_key][:, :2],  # Ensure 2D coordinates
        columns=["x", "y"],
        index=adata.obs_names,
    )

    # Get raw count data for SpatialDE preprocessing.
    # Genes are filtered on the (possibly sparse) matrix first; only the
    # selected genes are converted to dense later.
    if adata.raw is not None:
        raw_data = adata.raw.X
        var_names = adata.raw.var_names
        var_df = adata.var  # For HVG lookup
    else:
        # Check if current data appears to be raw counts
        data_max = adata.X.max() if hasattr(adata.X, "max") else np.max(adata.X)
        if data_max <= 10:  # Likely already normalized
            raise DataError(
                "SpatialDE requires raw counts. Data appears normalized (max<=10)."
            )

        raw_data = adata.X
        var_names = adata.var_names
        var_df = adata.var

    # Step 1: Filter low-expression genes (SpatialDE README: total_counts >= 3)
    gene_totals, _ = _calculate_sparse_gene_stats(raw_data)

    keep_genes_mask = gene_totals >= 3
    selected_var_names = var_names[keep_genes_mask]

    # Step 2: Optionally restrict to the top N genes BEFORE densification
    final_genes = selected_var_names

    if params.n_top_genes is not None and params.n_top_genes < len(selected_var_names):
        hvg_genes = None
        if "highly_variable" in var_df.columns:
            # Membership-based mask: adata.raw may contain genes that were
            # filtered out of adata.var, so a direct .loc lookup by raw gene
            # names can raise KeyError. Missing or NaN flags count as "not HVG".
            hvg_names = var_df.index[var_df["highly_variable"].eq(True).to_numpy()]
            hvg_genes = selected_var_names[selected_var_names.isin(hvg_names)]

        if hvg_genes is not None and len(hvg_genes) >= params.n_top_genes:
            # Enough HVGs available: use them
            final_genes = hvg_genes[: params.n_top_genes]
        else:
            # No HVG annotation, or not enough HVGs: select by total expression
            gene_totals_filtered = gene_totals[keep_genes_mask]
            top_indices = np.argsort(gene_totals_filtered)[-params.n_top_genes :][::-1]
            final_genes = selected_var_names[top_indices]

    # Step 3: Slice to the final genes, THEN convert to dense
    if adata.raw is not None:
        final_adata_subset = adata.raw[:, final_genes]
    else:
        final_adata_subset = adata[:, final_genes]

    # Now create DataFrame from the SUBSET (much smaller memory footprint)
    counts = pd.DataFrame(
        to_dense(final_adata_subset.X),
        columns=final_adata_subset.var_names,
        index=final_adata_subset.obs_names,
    )

    # Performance warning for large gene sets
    n_genes = counts.shape[1]
    n_spots = counts.shape[0]
    if n_genes > 5000:
        estimated_time = int(n_genes / 14000 * 10)  # Based on 14k genes = 10 min
        await ctx.warning(
            f"WARNING:Running SpatialDE on {n_genes} genes × {n_spots} spots may take {estimated_time}-{estimated_time*2} minutes.\n"
            f" • Official benchmark: ~10 min for 14,000 genes\n"
            f" • Tip: Use n_top_genes=1000-3000 to test fewer genes\n"
            f" • Or use method='sparkx' for faster analysis (2-5 min)"
        )

    # Calculate total counts per spot for regress_out
    total_counts = pd.DataFrame(
        {"total_counts": counts.sum(axis=1)}, index=counts.index
    )

    # Apply official SpatialDE preprocessing workflow
    # Variance stabilization
    norm_expr = NaiveDE.stabilize(counts.T).T

    # Regress out library size effects
    resid_expr = NaiveDE.regress_out(
        total_counts, norm_expr.T, "np.log(total_counts)"
    ).T

    # Run SpatialDE
    results = SpatialDE.run(coords.values, resid_expr)

    # Multiple testing correction using Storey q-value method
    if params.spatialde_pi0 is not None:
        # User-specified pi0 value
        results["qval"] = qvalue(results["pval"].values, pi0=params.spatialde_pi0)
    else:
        # Adaptive pi0 estimation (qvalue's default behaviour)
        results["qval"] = qvalue(results["pval"].values)

    # Sort by q-value
    results = results.sort_values("qval")

    # Filter significant genes (FDR standard threshold)
    significant_genes_all = results.loc[results["qval"] < 0.05, "g"].tolist()
    n_significant = len(significant_genes_all)

    # Limit for MCP response (full results stored in adata.var)
    limit = params.n_top_genes or DEFAULT_TOP_GENES_LIMIT
    significant_genes = significant_genes_all[:limit]

    # Store results in adata; index by gene once and reuse for all columns
    results_key = f"spatialde_results_{data_id}"
    results_by_gene = results.set_index("g")
    adata.var["spatialde_pval"] = results_by_gene["pval"]
    adata.var["spatialde_qval"] = results_by_gene["qval"]
    adata.var["spatialde_l"] = results_by_gene["l"]

    # Store scientific metadata for reproducibility
    from ..utils.adata_utils import store_analysis_metadata

    store_analysis_metadata(
        adata,
        analysis_name="spatial_genes_spatialde",
        method="spatialde_official_workflow",
        parameters={
            "kernel": params.spatialde_kernel,
            "preprocessing": "NaiveDE.stabilize + NaiveDE.regress_out",
            "gene_filter_threshold": 3,
            "n_genes_tested": n_genes,
            "n_spots": n_spots,
            "pi0": (
                params.spatialde_pi0 if params.spatialde_pi0 is not None else "adaptive"
            ),
        },
        results_keys={
            "var": ["spatialde_pval", "spatialde_qval", "spatialde_l"],
            "obs": [],
            "obsm": [],
            "uns": [],
        },
        statistics={
            "n_genes_analyzed": len(results),
            "n_significant_genes": n_significant,
        },
    )

    # Note: Detailed statistics (gene_statistics, p_values, q_values) are excluded
    # from MCP response via Field(exclude=True) in SpatialVariableGenesResult.
    # Full results are accessible via adata.var['spatialde_pval', 'spatialde_qval'].

    result = SpatialVariableGenesResult(
        data_id=data_id,
        method="spatialde",
        n_genes_analyzed=len(results),
        n_significant_genes=n_significant,
        spatial_genes=significant_genes,
        results_key=results_key,
    )

    return result
399
+
400
+
401
+ async def _identify_spatial_genes_sparkx(
402
+ data_id: str,
403
+ adata: Any,
404
+ params: SpatialVariableGenesParameters,
405
+ ctx: "ToolContext",
406
+ ) -> SpatialVariableGenesResult:
407
+ """
408
+ Identify spatial variable genes using the SPARK-X non-parametric method.
409
+
410
+ SPARK-X is an efficient non-parametric method for detecting spatially variable
411
+ genes without assuming specific distribution models. It uses spatial covariance
412
+ testing and is particularly effective for large-scale datasets. The method is
413
+ implemented in R and accessed via rpy2.
414
+
415
+ Method Advantages:
416
+ - Non-parametric: No distributional assumptions required
417
+ - Computationally efficient: Scales well with gene count
418
+ - Robust: Handles various spatial patterns effectively
419
+ - Flexible: Works with both single and mixture spatial kernels
420
+
421
+ Gene Filtering Pipeline (based on SPARK-X paper + 2024 best practices):
422
+ TIER 1 - Standard Filtering (SPARK-X paper):
423
+ - filter_mt_genes: Remove mitochondrial genes (MT-*, mt-*) [default: True]
424
+ - filter_ribo_genes: Remove ribosomal genes (RPS*, RPL*) [default: False]
425
+ - Expression filtering: Min percentage + total counts
426
+
427
+ TIER 2 - Advanced Options (2024 best practice from PMC11537352):
428
+ - test_only_hvg: Test only highly variable genes [default: False]
429
+ * Reduces housekeeping gene dominance
430
+ * Requires prior HVG computation in preprocessing
431
+
432
+ TIER 3 - Quality Warnings:
433
+ - warn_housekeeping: Warn if >30% top genes are housekeeping [default: True]
434
+ * Alerts about potential biological interpretation issues
435
+
436
+ Key Parameters:
437
+ - sparkx_option: 'single' or 'mixture' kernel (default: 'mixture')
438
+ - sparkx_percentage: Min percentage of cells expressing gene (default: 0.1)
439
+ - sparkx_min_total_counts: Min total counts per gene (default: 10)
440
+ - sparkx_num_core: Number of CPU cores for parallel processing
441
+ - filter_mt_genes: Filter mitochondrial genes (default: True)
442
+ - filter_ribo_genes: Filter ribosomal genes (default: False)
443
+ - test_only_hvg: Test only HVGs (default: False)
444
+ - warn_housekeeping: Warn about housekeeping dominance (default: True)
445
+
446
+ Data Processing:
447
+ - Automatically filters low-expression genes based on parameters
448
+ - Uses raw counts when available (adata.raw), otherwise current matrix
449
+ - Handles duplicate gene names by adding suffixes
450
+
451
+ Returns:
452
+ Results including:
453
+ - List of significant spatial genes (adjusted p-value < 0.05)
454
+ - Raw p-values from spatial covariance test
455
+ - Benjamini-Yekutieli (BY) adjusted p-values
456
+ - Results dataframe with all tested genes
457
+ - Quality warnings if housekeeping genes dominate
458
+
459
+ Requirements:
460
+ - R installation with SPARK package
461
+ - rpy2 Python package for R integration
462
+ - Raw count data preferred (will use adata.raw if available)
463
+
464
+ Performance:
465
+ - Faster than SpatialDE, the other method in this module
466
+ - ~2-5 minutes for typical datasets (3000 spots × 20000 genes)
467
+ - Memory efficient through gene filtering
468
+
469
+ References:
470
+ - SPARK-X paper: Sun et al. (2021) Genome Biology
471
+ - HVG+SVG best practice: PMC11537352 (2024)
472
+ """
473
+ # Use centralized dependency manager for consistent error handling
474
+ require("rpy2") # Raises ImportError with install instructions if missing
475
+ from rpy2 import robjects as ro
476
+ from rpy2.rinterface_lib import openrlib # For thread safety
477
+ from rpy2.robjects import conversion, default_converter
478
+ from rpy2.robjects.packages import importr
479
+
480
+ # Prepare spatial coordinates - SPARK needs data.frame format
481
+ coords_array = adata.obsm[params.spatial_key][:, :2].astype(float)
482
+ n_spots, n_genes = adata.shape
483
+
484
+ # ==================== OPTIMIZED: Filter on sparse matrix, then convert ====================
485
+ # Strategy: Keep data sparse throughout filtering, only convert final filtered result
486
+ # Benefit: For 30k cells × 20k genes → 3k genes: save ~15GB memory
487
+
488
+ # Get sparse count matrix - DO NOT convert to dense yet!
489
+ if adata.raw is not None:
490
+ sparse_counts = adata.raw.X # Keep sparse!
491
+ gene_names = [str(name) for name in adata.raw.var_names]
492
+ n_genes = len(gene_names)
493
+ else:
494
+ sparse_counts = adata.X # Keep sparse!
495
+ gene_names = [str(name) for name in adata.var_names]
496
+ n_genes = len(gene_names)
497
+
498
+ # Ensure gene names are unique (required for SPARK-X R rownames)
499
+ gene_names = _ensure_unique_gene_names(gene_names)
500
+
501
+ # ==================== Gene Filtering Pipeline (ON SPARSE MATRIX) ====================
502
+ # Following SPARK-X paper best practices + 2024 literature recommendations
503
+ # All filtering done on sparse matrix to minimize memory usage
504
+
505
+ # Initialize gene mask (all True = keep all genes initially)
506
+ gene_mask = np.ones(len(gene_names), dtype=bool)
507
+
508
+ # Get var annotation source (prefer raw for complete gene annotations)
509
+ var_source = adata.raw if adata.raw is not None else adata
510
+
511
+ # TIER 1: Mitochondrial gene filtering (SPARK-X paper standard practice)
512
+ # Reuse preprocessing annotations when available for consistency
513
+ if params.filter_mt_genes:
514
+ mt_mask = None
515
+
516
+ # Try to reuse preprocessing annotations (elegant consistency)
517
+ if "mt" in var_source.var.columns:
518
+ mt_mask = var_source.var["mt"].values
519
+ else:
520
+ # Fallback to pattern-based detection
521
+ mt_mask = np.array([gene.startswith(("MT-", "mt-")) for gene in gene_names])
522
+
523
+ n_mt_genes = mt_mask.sum()
524
+ if n_mt_genes > 0:
525
+ gene_mask &= ~mt_mask # Exclude MT genes
526
+
527
+ # TIER 1: Ribosomal gene filtering (optional)
528
+ # Reuse preprocessing annotations when available for consistency
529
+ if params.filter_ribo_genes:
530
+ ribo_mask = None
531
+
532
+ # Try to reuse preprocessing annotations (elegant consistency)
533
+ if "ribo" in var_source.var.columns:
534
+ ribo_mask = var_source.var["ribo"].values
535
+ else:
536
+ # Fallback to pattern-based detection
537
+ ribo_mask = np.array(
538
+ [gene.startswith(("RPS", "RPL", "Rps", "Rpl")) for gene in gene_names]
539
+ )
540
+
541
+ n_ribo_genes = ribo_mask.sum()
542
+ if n_ribo_genes > 0:
543
+ gene_mask &= ~ribo_mask # Exclude ribosomal genes
544
+
545
+ # TIER 2: HVG-only testing (2024 best practice from PMC11537352)
546
+ if params.test_only_hvg:
547
+ # Check if HVGs are available in adata.var (the preprocessed data)
548
+ validate_var_column(
549
+ adata,
550
+ "highly_variable",
551
+ "Highly variable genes marker (test_only_hvg=True requires this)",
552
+ )
553
+
554
+ # Get HVG list from preprocessed data (adata.var)
555
+ hvg_genes_set = set(adata.var_names[adata.var["highly_variable"]])
556
+
557
+ if len(hvg_genes_set) == 0:
558
+ raise DataNotFoundError("No HVGs found. Run preprocessing first.")
559
+
560
+ # Filter gene_names to only include HVGs
561
+ hvg_mask = np.array([gene in hvg_genes_set for gene in gene_names])
562
+ n_hvg = hvg_mask.sum()
563
+
564
+ if n_hvg == 0:
565
+ # No overlap between current gene list and HVGs
566
+ raise DataError(
567
+ f"test_only_hvg=True but no overlap found between current gene list ({len(gene_names)} genes) "
568
+ f"and HVGs ({len(hvg_genes_set)} genes). "
569
+ "This may occur if adata.raw contains different genes than the preprocessed data. "
570
+ "Try setting test_only_hvg=False or ensure adata.raw is None."
571
+ )
572
+
573
+ gene_mask &= hvg_mask # Keep only HVGs
574
+
575
+ # TIER 1: Apply SPARK-X standard filtering (expression-based) - ON SPARSE MATRIX
576
+ percentage = params.sparkx_percentage
577
+ min_total_counts = params.sparkx_min_total_counts
578
+
579
+ # Calculate gene statistics on sparse matrix (efficient!)
580
+ gene_totals, n_expressed = _calculate_sparse_gene_stats(sparse_counts)
581
+
582
+ # Filter genes: must be expressed in at least percentage of cells AND have min total counts
583
+ min_cells = int(np.ceil(n_spots * percentage))
584
+ expr_mask = (n_expressed >= min_cells) & (gene_totals >= min_total_counts)
585
+
586
+ gene_mask &= expr_mask # Combine with previous filters
587
+
588
+ # Apply combined filter mask to sparse matrix (still sparse!)
589
+ if gene_mask.sum() < len(gene_names):
590
+ filtered_sparse = sparse_counts[:, gene_mask]
591
+ gene_names = [
592
+ gene for gene, keep in zip(gene_names, gene_mask, strict=False) if keep
593
+ ]
594
+ else:
595
+ filtered_sparse = sparse_counts
596
+
597
+ # NOW convert filtered sparse matrix to dense (much smaller!)
598
+ # copy=True ensures we don't modify original for dense input
599
+ counts_matrix = to_dense(filtered_sparse, copy=True)
600
+
601
+ # Ensure counts are non-negative integers
602
+ counts_matrix = np.maximum(counts_matrix, 0).astype(int)
603
+
604
+ # Update gene count after filtering
605
+ n_genes = len(gene_names)
606
+
607
+ # Transpose for SPARK format (genes × spots)
608
+ counts_transposed = counts_matrix.T
609
+
610
+ # Create spot names
611
+ spot_names = [str(name) for name in adata.obs_names]
612
+
613
+ # Wrap ALL R operations in thread lock and localconverter for proper contextvars handling
614
+ # This prevents "Conversion rules missing" errors in multithreaded/async environments
615
+ with openrlib.rlock: # Thread safety lock
616
+ with conversion.localconverter(default_converter): # Conversion context
617
+ # Import SPARK package inside context (FIX for contextvars issue)
618
+ try:
619
+ spark = importr("SPARK")
620
+ except Exception as e:
621
+ raise ImportError(
622
+ f"SPARK not installed in R. Install with: install.packages('SPARK'). Error: {e}"
623
+ ) from e
624
+
625
+ # Convert to R format (already in context)
626
+ # Count matrix: genes × spots
627
+ r_counts = ro.r.matrix(
628
+ ro.IntVector(counts_transposed.flatten()),
629
+ nrow=n_genes,
630
+ ncol=n_spots,
631
+ byrow=True,
632
+ )
633
+ r_counts.rownames = ro.StrVector(gene_names)
634
+ r_counts.colnames = ro.StrVector(spot_names)
635
+
636
+ # Coordinates as data.frame (SPARK requirement)
637
+ coords_df = pd.DataFrame(coords_array, columns=["x", "y"], index=spot_names)
638
+ r_coords = ro.r["data.frame"](
639
+ x=ro.FloatVector(coords_df["x"]),
640
+ y=ro.FloatVector(coords_df["y"]),
641
+ row_names=ro.StrVector(coords_df.index),
642
+ )
643
+
644
+ try:
645
+ # Execute SPARK-X analysis inside context (FIX for contextvars issue)
646
+ # Keep suppress_output for MCP communication compatibility
647
+ with suppress_output():
648
+ results = spark.sparkx(
649
+ count_in=r_counts,
650
+ locus_in=r_coords,
651
+ X_in=ro.NULL, # No additional covariates (could be extended in future)
652
+ numCores=params.sparkx_num_core,
653
+ option=params.sparkx_option,
654
+ verbose=False, # Ensure verbose is off for cleaner MCP communication
655
+ )
656
+
657
+ # Extract p-values from results (inside context for proper conversion)
658
+ # SPARK-X returns res_mtest as a data.frame with columns:
659
+ # - combinedPval: combined p-values across spatial kernels
660
+ # - adjustedPval: BY-adjusted p-values (Benjamini-Yekutieli FDR correction)
661
+ # Reference: SPARK R package documentation
662
+ try:
663
+ pvals = results.rx2("res_mtest")
664
+ if pvals is None:
665
+ raise ProcessingError(
666
+ "SPARK-X returned None for res_mtest. "
667
+ "This may indicate the analysis failed silently."
668
+ )
669
+
670
+ # Verify expected data.frame format
671
+ is_dataframe = ro.r["is.data.frame"](pvals)[0]
672
+ if not is_dataframe:
673
+ raise ProcessingError(
674
+ "SPARK-X output format error. Requires SPARK >= 1.1.0."
675
+ )
676
+
677
+ # Extract combinedPval (raw p-values combined across kernels)
678
+ combined_pvals = ro.r["$"](pvals, "combinedPval")
679
+ if combined_pvals is None:
680
+ raise ProcessingError(
681
+ "SPARK-X res_mtest missing 'combinedPval' column. "
682
+ "This is required for spatial gene identification."
683
+ )
684
+ pval_list = [float(p) for p in combined_pvals]
685
+
686
+ # Extract adjustedPval (BY-corrected p-values from SPARK-X)
687
+ adjusted_pvals = ro.r["$"](pvals, "adjustedPval")
688
+ if adjusted_pvals is None:
689
+ raise ProcessingError(
690
+ "SPARK-X res_mtest missing 'adjustedPval' column. "
691
+ "This column contains BY-corrected p-values for multiple testing."
692
+ )
693
+ adjusted_pval_list = [float(p) for p in adjusted_pvals]
694
+
695
+ # Create results dataframe
696
+ results_df = pd.DataFrame(
697
+ {
698
+ "gene": gene_names[: len(pval_list)],
699
+ "pvalue": pval_list,
700
+ "adjusted_pvalue": adjusted_pval_list, # BY-corrected by SPARK-X
701
+ }
702
+ )
703
+
704
+ # Warn if returned genes much fewer than input genes
705
+ if len(results_df) < n_genes * 0.5:
706
+ await ctx.warning(
707
+ f"SPARK-X returned results for only {len(results_df)}/{n_genes} genes. "
708
+ f"This may indicate a problem with the R environment, SPARK package, or input data. "
709
+ f"Consider checking R logs or trying SpatialDE as an alternative method."
710
+ )
711
+
712
+ except Exception as e:
713
+ # P-value extraction failed - provide clear error message
714
+ raise ProcessingError(
715
+ f"SPARK-X p-value extraction failed: {e}\n\n"
716
+ f"Expected SPARK-X output format:\n"
717
+ f"SPARK-X output invalid. Requires SPARK >= 1.1.0."
718
+ ) from e
719
+
720
+ except Exception as e:
721
+ raise ProcessingError(f"SPARK-X analysis failed: {e}") from e
722
+
723
+ # Sort by adjusted p-value
724
+ results_df = results_df.sort_values("adjusted_pvalue")
725
+
726
+ # Filter significant genes
727
+ significant_genes_all = results_df[results_df["adjusted_pvalue"] < 0.05][
728
+ "gene"
729
+ ].tolist()
730
+
731
+ # Limit for MCP response (full results stored in adata.var)
732
+ limit = params.n_top_genes or DEFAULT_TOP_GENES_LIMIT
733
+ significant_genes = significant_genes_all[:limit]
734
+
735
+ # TIER 3: Housekeeping gene warnings (post-processing quality check)
736
+ if params.warn_housekeeping and len(results_df) > 0:
737
+ # Define housekeeping gene patterns (based on literature)
738
+ housekeeping_patterns = [
739
+ "RPS", # Ribosomal protein small subunit
740
+ "RPL", # Ribosomal protein large subunit
741
+ "Rps", # Mouse ribosomal small
742
+ "Rpl", # Mouse ribosomal large
743
+ "MT-", # Mitochondrial (human)
744
+ "mt-", # Mitochondrial (mouse)
745
+ "ACTB", # Beta-actin
746
+ "GAPDH", # Glyceraldehyde-3-phosphate dehydrogenase
747
+ "EEF1A1", # Eukaryotic translation elongation factor 1 alpha 1
748
+ "TUBA1B", # Tubulin alpha 1b
749
+ "B2M", # Beta-2-microglobulin
750
+ ]
751
+
752
+ # Check the top-ranked genes by adjusted p-value (up to 50; not filtered by significance)
753
+ top_genes_to_check = results_df.head(50)["gene"].tolist()
754
+
755
+ # Mark housekeeping genes
756
+ housekeeping_genes = [
757
+ gene
758
+ for gene in top_genes_to_check
759
+ if any(
760
+ gene.startswith(pattern) or gene == pattern
761
+ for pattern in housekeeping_patterns
762
+ )
763
+ ]
764
+
765
+ n_housekeeping = len(housekeeping_genes)
766
+ n_top = len(top_genes_to_check)
767
+ housekeeping_ratio = n_housekeeping / n_top if n_top > 0 else 0
768
+
769
+ # Warn if >30% are housekeeping genes
770
+ if housekeeping_ratio > 0.3:
771
+ await ctx.warning(
772
+ f"WARNING:Housekeeping gene dominance detected: {n_housekeeping}/{n_top} ({housekeeping_ratio*100:.1f}%) of top genes are housekeeping genes.\n"
773
+ f" • Housekeeping genes found: {', '.join(housekeeping_genes[:10])}{'...' if len(housekeeping_genes) > 10 else ''}\n"
774
+ f" • These genes may not represent true spatial patterns\n"
775
+ f" • Recommendations:\n"
776
+ f" 1. Use test_only_hvg=True to reduce housekeeping dominance (2024 best practice)\n"
777
+ f" 2. Use filter_ribo_genes=True to filter ribosomal genes\n"
778
+ f" 3. Focus on genes with clear biological relevance\n"
779
+ f" • Note: This is a quality warning, not an error"
780
+ )
781
+
782
+ # Store results in adata
783
+ results_key = f"sparkx_results_{data_id}"
784
+ adata.var["sparkx_pval"] = pd.Series(
785
+ dict(zip(results_df["gene"], results_df["pvalue"], strict=False)),
786
+ name="sparkx_pval",
787
+ ).reindex(adata.var_names, fill_value=1.0)
788
+
789
+ adata.var["sparkx_qval"] = pd.Series(
790
+ dict(zip(results_df["gene"], results_df["adjusted_pvalue"], strict=False)),
791
+ name="sparkx_qval",
792
+ ).reindex(adata.var_names, fill_value=1.0)
793
+
794
+ # Store scientific metadata for reproducibility
795
+ from ..utils.adata_utils import store_analysis_metadata
796
+
797
+ store_analysis_metadata(
798
+ adata,
799
+ analysis_name="spatial_genes_sparkx",
800
+ method="sparkx",
801
+ parameters={
802
+ "num_core": params.sparkx_num_core,
803
+ "percentage": params.sparkx_percentage,
804
+ "min_total_counts": params.sparkx_min_total_counts,
805
+ "option": params.sparkx_option,
806
+ "filter_mt_genes": params.filter_mt_genes,
807
+ "filter_ribo_genes": params.filter_ribo_genes,
808
+ "test_only_hvg": params.test_only_hvg,
809
+ "warn_housekeeping": params.warn_housekeeping,
810
+ },
811
+ results_keys={
812
+ "var": ["sparkx_pval", "sparkx_qval"],
813
+ "obs": [],
814
+ "obsm": [],
815
+ "uns": [],
816
+ },
817
+ statistics={
818
+ "n_genes_analyzed": len(results_df),
819
+ "n_significant_genes": len(significant_genes_all),
820
+ },
821
+ )
822
+
823
+ # Note: Detailed statistics (gene_statistics, p_values, q_values) are excluded
824
+ # from MCP response via Field(exclude=True) in SpatialVariableGenesResult.
825
+ # Full results are accessible via adata.var[['sparkx_pval', 'sparkx_qval']].
826
+
827
+ result = SpatialVariableGenesResult(
828
+ data_id=data_id,
829
+ method="sparkx",
830
+ n_genes_analyzed=len(results_df),
831
+ n_significant_genes=len(significant_genes_all),
832
+ spatial_genes=significant_genes,
833
+ results_key=results_key,
834
+ )
835
+
836
+ return result