chatspatial 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatspatial/__init__.py +11 -0
- chatspatial/__main__.py +141 -0
- chatspatial/cli/__init__.py +7 -0
- chatspatial/config.py +53 -0
- chatspatial/models/__init__.py +85 -0
- chatspatial/models/analysis.py +513 -0
- chatspatial/models/data.py +2462 -0
- chatspatial/server.py +1763 -0
- chatspatial/spatial_mcp_adapter.py +720 -0
- chatspatial/tools/__init__.py +3 -0
- chatspatial/tools/annotation.py +1903 -0
- chatspatial/tools/cell_communication.py +1603 -0
- chatspatial/tools/cnv_analysis.py +605 -0
- chatspatial/tools/condition_comparison.py +595 -0
- chatspatial/tools/deconvolution/__init__.py +402 -0
- chatspatial/tools/deconvolution/base.py +318 -0
- chatspatial/tools/deconvolution/card.py +244 -0
- chatspatial/tools/deconvolution/cell2location.py +326 -0
- chatspatial/tools/deconvolution/destvi.py +144 -0
- chatspatial/tools/deconvolution/flashdeconv.py +101 -0
- chatspatial/tools/deconvolution/rctd.py +317 -0
- chatspatial/tools/deconvolution/spotlight.py +216 -0
- chatspatial/tools/deconvolution/stereoscope.py +109 -0
- chatspatial/tools/deconvolution/tangram.py +135 -0
- chatspatial/tools/differential.py +625 -0
- chatspatial/tools/embeddings.py +298 -0
- chatspatial/tools/enrichment.py +1863 -0
- chatspatial/tools/integration.py +807 -0
- chatspatial/tools/preprocessing.py +723 -0
- chatspatial/tools/spatial_domains.py +808 -0
- chatspatial/tools/spatial_genes.py +836 -0
- chatspatial/tools/spatial_registration.py +441 -0
- chatspatial/tools/spatial_statistics.py +1476 -0
- chatspatial/tools/trajectory.py +495 -0
- chatspatial/tools/velocity.py +405 -0
- chatspatial/tools/visualization/__init__.py +155 -0
- chatspatial/tools/visualization/basic.py +393 -0
- chatspatial/tools/visualization/cell_comm.py +699 -0
- chatspatial/tools/visualization/cnv.py +320 -0
- chatspatial/tools/visualization/core.py +684 -0
- chatspatial/tools/visualization/deconvolution.py +852 -0
- chatspatial/tools/visualization/enrichment.py +660 -0
- chatspatial/tools/visualization/integration.py +205 -0
- chatspatial/tools/visualization/main.py +164 -0
- chatspatial/tools/visualization/multi_gene.py +739 -0
- chatspatial/tools/visualization/persistence.py +335 -0
- chatspatial/tools/visualization/spatial_stats.py +469 -0
- chatspatial/tools/visualization/trajectory.py +639 -0
- chatspatial/tools/visualization/velocity.py +411 -0
- chatspatial/utils/__init__.py +115 -0
- chatspatial/utils/adata_utils.py +1372 -0
- chatspatial/utils/compute.py +327 -0
- chatspatial/utils/data_loader.py +499 -0
- chatspatial/utils/dependency_manager.py +462 -0
- chatspatial/utils/device_utils.py +165 -0
- chatspatial/utils/exceptions.py +185 -0
- chatspatial/utils/image_utils.py +267 -0
- chatspatial/utils/mcp_utils.py +137 -0
- chatspatial/utils/path_utils.py +243 -0
- chatspatial/utils/persistence.py +78 -0
- chatspatial/utils/scipy_compat.py +143 -0
- chatspatial-1.1.0.dist-info/METADATA +242 -0
- chatspatial-1.1.0.dist-info/RECORD +67 -0
- chatspatial-1.1.0.dist-info/WHEEL +5 -0
- chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
- chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
- chatspatial-1.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,836 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Spatial Variable Genes (SVG) identification for ChatSpatial MCP.
|
|
3
|
+
|
|
4
|
+
This module provides implementations for SVG detection methods including SpatialDE and SPARK-X,
|
|
5
|
+
enabling comprehensive spatial transcriptomics analysis. Each method offers distinct advantages
|
|
6
|
+
for identifying genes with spatial expression patterns.
|
|
7
|
+
|
|
8
|
+
Methods Overview:
|
|
9
|
+
- SPARK-X (default): Non-parametric statistical method, best accuracy, requires R
|
|
10
|
+
- SpatialDE: Gaussian process-based kernel method, statistically rigorous
|
|
11
|
+
|
|
12
|
+
The module integrates these tools into the ChatSpatial MCP framework, handling data preparation,
|
|
13
|
+
execution, result formatting, and error management across different computational backends.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from typing import TYPE_CHECKING, Any
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from ..spatial_mcp_adapter import ToolContext
|
|
20
|
+
|
|
21
|
+
from collections import Counter
|
|
22
|
+
|
|
23
|
+
import numpy as np
|
|
24
|
+
import pandas as pd
|
|
25
|
+
import scipy.sparse as sp
|
|
26
|
+
|
|
27
|
+
from ..models.analysis import SpatialVariableGenesResult # noqa: E402
|
|
28
|
+
from ..models.data import SpatialVariableGenesParameters # noqa: E402
|
|
29
|
+
from ..utils import validate_var_column # noqa: E402
|
|
30
|
+
from ..utils.adata_utils import require_spatial_coords, to_dense # noqa: E402
|
|
31
|
+
from ..utils.dependency_manager import require # noqa: E402
|
|
32
|
+
from ..utils.exceptions import DataNotFoundError # noqa: E402
|
|
33
|
+
from ..utils.exceptions import DataError, ParameterError, ProcessingError
|
|
34
|
+
from ..utils.mcp_utils import suppress_output # noqa: E402
|
|
35
|
+
|
|
36
|
+
# =============================================================================
|
|
37
|
+
# Shared Utilities for Spatial Variable Gene Detection
|
|
38
|
+
# =============================================================================
|
|
39
|
+
|
|
40
|
+
# Default limit for spatial_genes list returned to LLM
|
|
41
|
+
# Full results stored in adata.var for complete access
|
|
42
|
+
# Used as the response cap whenever params.n_top_genes is None or 0
# (`params.n_top_genes or DEFAULT_TOP_GENES_LIMIT`).
DEFAULT_TOP_GENES_LIMIT = 500
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _ensure_unique_gene_names(gene_names: list[str]) -> list[str]:
    """Return gene names made unique by suffixing repeated occurrences.

    Required for R-based methods (SPARK-X) that use gene names as rownames.

    The first occurrence of every name is kept unchanged. Each later
    occurrence becomes ``"<name>_<k>"`` with ``k`` counting up from 1.
    A candidate that is already taken (either present in the input or
    generated earlier) is skipped, so the output never contains duplicates,
    e.g. ``["A", "A", "A_1"]`` -> ``["A", "A_2", "A_1"]``.

    Args:
        gene_names: List of gene names (may contain duplicates)

    Returns:
        The input list itself when it is already unique; otherwise a new
        list of the same length and order whose entries are all distinct.
    """
    if len(gene_names) == len(set(gene_names)):
        return gene_names

    # Every input name is reserved up front so a generated suffix can never
    # shadow a gene that appears (earlier or later) in the original list.
    taken = set(gene_names)
    seen: set[str] = set()
    last_suffix: dict[str, int] = {}
    unique_names = []

    for gene in gene_names:
        if gene not in seen:
            # First occurrence keeps its original name.
            seen.add(gene)
            unique_names.append(gene)
            continue

        suffix = last_suffix.get(gene, 0) + 1
        candidate = f"{gene}_{suffix}"
        while candidate in taken:
            suffix += 1
            candidate = f"{gene}_{suffix}"

        last_suffix[gene] = suffix
        taken.add(candidate)
        unique_names.append(candidate)

    return unique_names
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _calculate_sparse_gene_stats(X) -> tuple[np.ndarray, np.ndarray]:
    """Compute per-gene totals and per-gene counts of positive entries.

    Both reductions are taken along axis 0 directly on ``X``, so a sparse
    input is reduced without first being converted to a dense matrix.

    Args:
        X: Gene expression matrix (cells × genes), sparse or dense

    Returns:
        Tuple ``(gene_totals, n_expressed_per_gene)``, each flattened to 1D
    """
    # Sparse reductions go through np.array, dense ones through np.asarray;
    # either way the result is flattened into a fresh 1D array.
    as_ndarray = np.array if sp.issparse(X) else np.asarray

    column_sums = as_ndarray(X.sum(axis=0)).flatten()
    positive_counts = as_ndarray((X > 0).sum(axis=0)).flatten()

    return column_sums, positive_counts
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
async def identify_spatial_genes(
    data_id: str,
    ctx: "ToolContext",
    params: SpatialVariableGenesParameters,
) -> SpatialVariableGenesResult:
    """
    Entry point for spatial variable gene detection.

    Loads the dataset through ``ctx``, checks that spatial coordinates are
    available under ``params.spatial_key`` and then hands off to the
    implementation selected by ``params.method``.

    Method Selection Guide:
        - "sparkx": SPARK-X; works with raw counts or normalized data
          (~2-5 min for 3000 spots × 20000 genes)
        - "spatialde": SpatialDE; works with raw count data
          (~15-30 min, scales with spot count squared)

    Args:
        data_id: Dataset identifier in data store
        ctx: ToolContext for data access and logging
        params: Method-specific parameters (see SpatialVariableGenesParameters)

    Returns:
        The SpatialVariableGenesResult produced by the selected method.

    Raises:
        ParameterError: If ``params.method`` is neither "spatialde" nor "sparkx".
        Any error raised by ``ctx.get_adata``, ``require_spatial_coords`` or
        the selected implementation propagates unchanged.
    """
    adata = await ctx.get_adata(data_id)

    # Fail early when the coordinates the methods rely on are unavailable.
    require_spatial_coords(adata, spatial_key=params.spatial_key)

    # Checked in order; the first matching method name wins.
    handlers = (
        ("spatialde", _identify_spatial_genes_spatialde),
        ("sparkx", _identify_spatial_genes_sparkx),
    )
    for method_name, handler in handlers:
        if params.method == method_name:
            return await handler(data_id, adata, params, ctx)

    raise ParameterError(
        f"Unsupported method: {params.method}. Available methods: spatialde, sparkx"
    )
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
async def _identify_spatial_genes_spatialde(
    data_id: str,
    adata: Any,
    params: SpatialVariableGenesParameters,
    ctx: "ToolContext",
) -> SpatialVariableGenesResult:
    """
    Identify spatial variable genes using the SpatialDE statistical framework.

    Workflow implemented here (follows the SpatialDE README):
        1. Filter low-expression genes (total_counts >= 3)
        2. Optionally restrict to ``params.n_top_genes`` genes, preferring
           genes flagged ``highly_variable`` in ``adata.var`` and otherwise
           taking the genes with the highest total counts
        3. Variance stabilization (NaiveDE.stabilize)
        4. Regress out library size effects (NaiveDE.regress_out)
        5. Run SpatialDE.run on the first two spatial coordinates
        6. Compute Storey q-values (SpatialDE.util.qvalue), optionally with
           a user-supplied ``params.spatialde_pi0``

    Data source:
        - ``adata.raw`` is used when present; otherwise ``adata.X`` is used,
          provided its maximum value is greater than 10 (values <= 10 are
          treated as already normalized and rejected).
        - Gene filtering happens before densification so that only the
          selected genes are converted to a dense DataFrame.

    Side effects:
        - Writes ``spatialde_pval``, ``spatialde_qval`` and ``spatialde_l``
          to ``adata.var``. Values are aligned on ``adata.var``'s index, so
          tested genes that are absent from ``adata.var`` are not stored.
        - Records the run through ``store_analysis_metadata``.
        - Emits a ``ctx.warning`` with a runtime estimate when more than
          5000 genes are tested.

    NOTE(review): ``params.spatialde_kernel`` is only recorded in the stored
    metadata; it is not passed to ``SpatialDE.run`` in this function.

    Args:
        data_id: Dataset identifier, used for the result and ``results_key``
        adata: AnnData-like object holding counts and spatial coordinates
        params: Parameters; this function reads ``spatial_key``,
            ``n_top_genes``, ``spatialde_pi0`` and ``spatialde_kernel``
        ctx: ToolContext used for warnings

    Returns:
        SpatialVariableGenesResult with the number of genes analyzed, the
        number of genes with q-value < 0.05, and those genes sorted by
        q-value and truncated to ``params.n_top_genes`` (or
        DEFAULT_TOP_GENES_LIMIT when that is unset).

    Raises:
        DataError: If ``adata.raw`` is missing and ``adata.X`` looks
            normalized (max <= 10), or if no gene passes the
            total_counts >= 3 filter.
        ImportError: From ``require("spatialde")`` when the dependency is
            missing (per the dependency manager's contract).

    References:
        Svensson et al. (2018) "SpatialDE: identification of spatially variable genes"
        Nature Methods, DOI: 10.1038/nmeth.4636
        Official tutorial: https://github.com/Teichlab/SpatialDE
    """
    # Use centralized dependency manager for consistent error handling
    require("spatialde")  # Raises ImportError with install instructions if missing

    # Apply scipy compatibility patch for SpatialDE (scipy >= 1.14 removed scipy.misc.derivative)
    from ..utils.scipy_compat import patch_scipy_misc_derivative

    patch_scipy_misc_derivative()

    import NaiveDE
    import SpatialDE
    from SpatialDE.util import qvalue

    # Prepare spatial coordinates
    coords = pd.DataFrame(
        adata.obsm[params.spatial_key][:, :2],  # Ensure 2D coordinates
        columns=["x", "y"],
        index=adata.obs_names,
    )

    # Get raw count data for SpatialDE preprocessing
    # OPTIMIZATION: Filter genes on SPARSE matrix first, then convert only selected genes to dense
    if adata.raw is not None:
        raw_data = adata.raw.X
        var_names = adata.raw.var_names
        var_df = adata.var  # For HVG lookup (may cover fewer genes than raw)
    else:
        # Check if current data appears to be raw counts
        data_max = adata.X.max() if hasattr(adata.X, "max") else np.max(adata.X)
        if data_max <= 10:  # Likely already normalized
            raise DataError(
                "SpatialDE requires raw counts. Data appears normalized (max<=10)."
            )

        raw_data = adata.X
        var_names = adata.var_names
        var_df = adata.var

    # Step 1: Filter low-expression genes ON SPARSE MATRIX (Official recommendation)
    # SpatialDE README: "Filter practically unobserved genes" with total_counts >= 3
    gene_totals, _ = _calculate_sparse_gene_stats(raw_data)

    keep_genes_mask = gene_totals >= 3
    selected_var_names = var_names[keep_genes_mask]

    # Fail with a clear message instead of an opaque error further down.
    if len(selected_var_names) == 0:
        raise DataError(
            "SpatialDE requires genes with total counts >= 3, but none were found."
        )

    # Step 2: Select top N genes ON SPARSE MATRIX (if requested)
    # This further reduces genes BEFORE densification
    final_genes = selected_var_names

    if params.n_top_genes is not None and params.n_top_genes < len(selected_var_names):
        hvg_genes = None
        if "highly_variable" in var_df.columns:
            # Membership test instead of .loc: when counts come from adata.raw,
            # some candidate genes may be missing from adata.var, and .loc would
            # raise KeyError for them. Missing genes simply count as non-HVG.
            is_hvg = var_df["highly_variable"].eq(True)
            hvg_names = var_df.index[is_hvg.to_numpy()]
            hvg_genes = selected_var_names[selected_var_names.isin(hvg_names)]

        if hvg_genes is not None and len(hvg_genes) >= params.n_top_genes:
            # Enough HVGs: use the first n_top_genes of them
            final_genes = hvg_genes[: params.n_top_genes]
        else:
            # No HVG annotation or not enough HVGs: select by total expression
            gene_totals_filtered = gene_totals[keep_genes_mask]
            top_indices = np.argsort(gene_totals_filtered)[-params.n_top_genes :][::-1]
            final_genes = selected_var_names[top_indices]

    # Step 3: Slice sparse matrix to final genes, THEN convert to dense
    # This is where the memory optimization happens: only convert selected genes
    if adata.raw is not None:
        final_adata_subset = adata.raw[:, final_genes]
    else:
        final_adata_subset = adata[:, final_genes]

    # Now create DataFrame from the SUBSET (much smaller memory footprint)
    counts = pd.DataFrame(
        to_dense(final_adata_subset.X),
        columns=final_adata_subset.var_names,
        index=final_adata_subset.obs_names,
    )

    # Performance warning for large gene sets
    n_genes = counts.shape[1]
    n_spots = counts.shape[0]
    if n_genes > 5000:
        estimated_time = int(n_genes / 14000 * 10)  # Based on 14k genes = 10 min
        await ctx.warning(
            f"WARNING:Running SpatialDE on {n_genes} genes × {n_spots} spots may take {estimated_time}-{estimated_time*2} minutes.\n"
            f" • Official benchmark: ~10 min for 14,000 genes\n"
            f" • Tip: Use n_top_genes=1000-3000 to test fewer genes\n"
            f" • Or use method='sparkx' for faster analysis (2-5 min)"
        )

    # Calculate total counts per spot for regress_out
    total_counts = pd.DataFrame(
        {"total_counts": counts.sum(axis=1)}, index=counts.index
    )

    # Apply official SpatialDE preprocessing workflow
    # Variance stabilization (NaiveDE expects genes × spots, hence the transposes)
    norm_expr = NaiveDE.stabilize(counts.T).T

    # Regress out library size effects
    resid_expr = NaiveDE.regress_out(
        total_counts, norm_expr.T, "np.log(total_counts)"
    ).T

    # Run SpatialDE
    results = SpatialDE.run(coords.values, resid_expr)

    # Multiple testing correction using Storey q-value method
    if params.spatialde_pi0 is not None:
        # User-specified pi0 value
        results["qval"] = qvalue(results["pval"].values, pi0=params.spatialde_pi0)
    else:
        # Adaptive pi0 estimation (SpatialDE default, recommended)
        results["qval"] = qvalue(results["pval"].values)

    # Sort by q-value
    results = results.sort_values("qval")

    # Filter significant genes (FDR standard threshold)
    significant_genes_all = results[results["qval"] < 0.05]["g"].tolist()

    # Limit for MCP response (full results stored in adata.var)
    limit = params.n_top_genes or DEFAULT_TOP_GENES_LIMIT
    significant_genes = significant_genes_all[:limit]

    # Store results in adata (index the results by gene once, reuse for each column)
    results_key = f"spatialde_results_{data_id}"
    results_by_gene = results.set_index("g")
    adata.var["spatialde_pval"] = results_by_gene["pval"]
    adata.var["spatialde_qval"] = results_by_gene["qval"]
    adata.var["spatialde_l"] = results_by_gene["l"]

    # Store scientific metadata for reproducibility
    from ..utils.adata_utils import store_analysis_metadata

    store_analysis_metadata(
        adata,
        analysis_name="spatial_genes_spatialde",
        method="spatialde_official_workflow",
        parameters={
            "kernel": params.spatialde_kernel,
            "preprocessing": "NaiveDE.stabilize + NaiveDE.regress_out",
            "gene_filter_threshold": 3,
            "n_genes_tested": n_genes,
            "n_spots": n_spots,
            "pi0": (
                params.spatialde_pi0 if params.spatialde_pi0 is not None else "adaptive"
            ),
        },
        results_keys={
            "var": ["spatialde_pval", "spatialde_qval", "spatialde_l"],
            "obs": [],
            "obsm": [],
            "uns": [],
        },
        statistics={
            "n_genes_analyzed": len(results),
            "n_significant_genes": len(significant_genes_all),
        },
    )

    # Note: Detailed statistics (gene_statistics, p_values, q_values) are excluded
    # from MCP response via Field(exclude=True) in SpatialVariableGenesResult.
    # Full results are accessible via adata.var['spatialde_pval', 'spatialde_qval'].

    result = SpatialVariableGenesResult(
        data_id=data_id,
        method="spatialde",
        n_genes_analyzed=len(results),
        n_significant_genes=len(significant_genes_all),
        spatial_genes=significant_genes,
        results_key=results_key,
    )

    return result
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
async def _identify_spatial_genes_sparkx(
|
|
402
|
+
data_id: str,
|
|
403
|
+
adata: Any,
|
|
404
|
+
params: SpatialVariableGenesParameters,
|
|
405
|
+
ctx: "ToolContext",
|
|
406
|
+
) -> SpatialVariableGenesResult:
|
|
407
|
+
"""
|
|
408
|
+
Identify spatial variable genes using the SPARK-X non-parametric method.
|
|
409
|
+
|
|
410
|
+
SPARK-X is an efficient non-parametric method for detecting spatially variable
|
|
411
|
+
genes without assuming specific distribution models. It uses spatial covariance
|
|
412
|
+
testing and is particularly effective for large-scale datasets. The method is
|
|
413
|
+
implemented in R and accessed via rpy2.
|
|
414
|
+
|
|
415
|
+
Method Advantages:
|
|
416
|
+
- Non-parametric: No distributional assumptions required
|
|
417
|
+
- Computationally efficient: Scales well with gene count
|
|
418
|
+
- Robust: Handles various spatial patterns effectively
|
|
419
|
+
- Flexible: Works with both single and mixture spatial kernels
|
|
420
|
+
|
|
421
|
+
Gene Filtering Pipeline (based on SPARK-X paper + 2024 best practices):
|
|
422
|
+
TIER 1 - Standard Filtering (SPARK-X paper):
|
|
423
|
+
- filter_mt_genes: Remove mitochondrial genes (MT-*, mt-*) [default: True]
|
|
424
|
+
- filter_ribo_genes: Remove ribosomal genes (RPS*, RPL*) [default: False]
|
|
425
|
+
- Expression filtering: Min percentage + total counts
|
|
426
|
+
|
|
427
|
+
TIER 2 - Advanced Options (2024 best practice from PMC11537352):
|
|
428
|
+
- test_only_hvg: Test only highly variable genes [default: False]
|
|
429
|
+
* Reduces housekeeping gene dominance
|
|
430
|
+
* Requires prior HVG computation in preprocessing
|
|
431
|
+
|
|
432
|
+
TIER 3 - Quality Warnings:
|
|
433
|
+
- warn_housekeeping: Warn if >30% top genes are housekeeping [default: True]
|
|
434
|
+
* Alerts about potential biological interpretation issues
|
|
435
|
+
|
|
436
|
+
Key Parameters:
|
|
437
|
+
- sparkx_option: 'single' or 'mixture' kernel (default: 'mixture')
|
|
438
|
+
- sparkx_percentage: Min percentage of cells expressing gene (default: 0.1)
|
|
439
|
+
- sparkx_min_total_counts: Min total counts per gene (default: 10)
|
|
440
|
+
- sparkx_num_core: Number of CPU cores for parallel processing
|
|
441
|
+
- filter_mt_genes: Filter mitochondrial genes (default: True)
|
|
442
|
+
- filter_ribo_genes: Filter ribosomal genes (default: False)
|
|
443
|
+
- test_only_hvg: Test only HVGs (default: False)
|
|
444
|
+
- warn_housekeeping: Warn about housekeeping dominance (default: True)
|
|
445
|
+
|
|
446
|
+
Data Processing:
|
|
447
|
+
- Automatically filters low-expression genes based on parameters
|
|
448
|
+
- Uses raw counts when available (adata.raw), otherwise current matrix
|
|
449
|
+
- Handles duplicate gene names by adding suffixes
|
|
450
|
+
|
|
451
|
+
Returns:
|
|
452
|
+
Results including:
|
|
453
|
+
- List of significant spatial genes (adjusted p-value < 0.05)
|
|
454
|
+
- Raw p-values from spatial covariance test
|
|
455
|
+
- Bonferroni-adjusted p-values
|
|
456
|
+
- Results dataframe with all tested genes
|
|
457
|
+
- Quality warnings if housekeeping genes dominate
|
|
458
|
+
|
|
459
|
+
Requirements:
|
|
460
|
+
- R installation with SPARK package
|
|
461
|
+
- rpy2 Python package for R integration
|
|
462
|
+
- Raw count data preferred (will use adata.raw if available)
|
|
463
|
+
|
|
464
|
+
Performance:
|
|
465
|
+
- Faster of the two supported methods (SPARK-X vs. SpatialDE)
|
|
466
|
+
- ~2-5 minutes for typical datasets (3000 spots × 20000 genes)
|
|
467
|
+
- Memory efficient through gene filtering
|
|
468
|
+
|
|
469
|
+
References:
|
|
470
|
+
- SPARK-X paper: Sun et al. (2021) Genome Biology
|
|
471
|
+
- HVG+SVG best practice: PMC11537352 (2024)
|
|
472
|
+
"""
|
|
473
|
+
# Use centralized dependency manager for consistent error handling
|
|
474
|
+
require("rpy2") # Raises ImportError with install instructions if missing
|
|
475
|
+
from rpy2 import robjects as ro
|
|
476
|
+
from rpy2.rinterface_lib import openrlib # For thread safety
|
|
477
|
+
from rpy2.robjects import conversion, default_converter
|
|
478
|
+
from rpy2.robjects.packages import importr
|
|
479
|
+
|
|
480
|
+
# Prepare spatial coordinates - SPARK needs data.frame format
|
|
481
|
+
coords_array = adata.obsm[params.spatial_key][:, :2].astype(float)
|
|
482
|
+
n_spots, n_genes = adata.shape
|
|
483
|
+
|
|
484
|
+
# ==================== OPTIMIZED: Filter on sparse matrix, then convert ====================
|
|
485
|
+
# Strategy: Keep data sparse throughout filtering, only convert final filtered result
|
|
486
|
+
# Benefit: For 30k cells × 20k genes → 3k genes: save ~15GB memory
|
|
487
|
+
|
|
488
|
+
# Get sparse count matrix - DO NOT convert to dense yet!
|
|
489
|
+
if adata.raw is not None:
|
|
490
|
+
sparse_counts = adata.raw.X # Keep sparse!
|
|
491
|
+
gene_names = [str(name) for name in adata.raw.var_names]
|
|
492
|
+
n_genes = len(gene_names)
|
|
493
|
+
else:
|
|
494
|
+
sparse_counts = adata.X # Keep sparse!
|
|
495
|
+
gene_names = [str(name) for name in adata.var_names]
|
|
496
|
+
n_genes = len(gene_names)
|
|
497
|
+
|
|
498
|
+
# Ensure gene names are unique (required for SPARK-X R rownames)
|
|
499
|
+
gene_names = _ensure_unique_gene_names(gene_names)
|
|
500
|
+
|
|
501
|
+
# ==================== Gene Filtering Pipeline (ON SPARSE MATRIX) ====================
|
|
502
|
+
# Following SPARK-X paper best practices + 2024 literature recommendations
|
|
503
|
+
# All filtering done on sparse matrix to minimize memory usage
|
|
504
|
+
|
|
505
|
+
# Initialize gene mask (all True = keep all genes initially)
|
|
506
|
+
gene_mask = np.ones(len(gene_names), dtype=bool)
|
|
507
|
+
|
|
508
|
+
# Get var annotation source (prefer raw for complete gene annotations)
|
|
509
|
+
var_source = adata.raw if adata.raw is not None else adata
|
|
510
|
+
|
|
511
|
+
# TIER 1: Mitochondrial gene filtering (SPARK-X paper standard practice)
|
|
512
|
+
# Reuse preprocessing annotations when available for consistency
|
|
513
|
+
if params.filter_mt_genes:
|
|
514
|
+
mt_mask = None
|
|
515
|
+
|
|
516
|
+
# Try to reuse preprocessing annotations (elegant consistency)
|
|
517
|
+
if "mt" in var_source.var.columns:
|
|
518
|
+
mt_mask = var_source.var["mt"].values
|
|
519
|
+
else:
|
|
520
|
+
# Fallback to pattern-based detection
|
|
521
|
+
mt_mask = np.array([gene.startswith(("MT-", "mt-")) for gene in gene_names])
|
|
522
|
+
|
|
523
|
+
n_mt_genes = mt_mask.sum()
|
|
524
|
+
if n_mt_genes > 0:
|
|
525
|
+
gene_mask &= ~mt_mask # Exclude MT genes
|
|
526
|
+
|
|
527
|
+
# TIER 1: Ribosomal gene filtering (optional)
|
|
528
|
+
# Reuse preprocessing annotations when available for consistency
|
|
529
|
+
if params.filter_ribo_genes:
|
|
530
|
+
ribo_mask = None
|
|
531
|
+
|
|
532
|
+
# Try to reuse preprocessing annotations (elegant consistency)
|
|
533
|
+
if "ribo" in var_source.var.columns:
|
|
534
|
+
ribo_mask = var_source.var["ribo"].values
|
|
535
|
+
else:
|
|
536
|
+
# Fallback to pattern-based detection
|
|
537
|
+
ribo_mask = np.array(
|
|
538
|
+
[gene.startswith(("RPS", "RPL", "Rps", "Rpl")) for gene in gene_names]
|
|
539
|
+
)
|
|
540
|
+
|
|
541
|
+
n_ribo_genes = ribo_mask.sum()
|
|
542
|
+
if n_ribo_genes > 0:
|
|
543
|
+
gene_mask &= ~ribo_mask # Exclude ribosomal genes
|
|
544
|
+
|
|
545
|
+
# TIER 2: HVG-only testing (2024 best practice from PMC11537352)
|
|
546
|
+
if params.test_only_hvg:
|
|
547
|
+
# Check if HVGs are available in adata.var (the preprocessed data)
|
|
548
|
+
validate_var_column(
|
|
549
|
+
adata,
|
|
550
|
+
"highly_variable",
|
|
551
|
+
"Highly variable genes marker (test_only_hvg=True requires this)",
|
|
552
|
+
)
|
|
553
|
+
|
|
554
|
+
# Get HVG list from preprocessed data (adata.var)
|
|
555
|
+
hvg_genes_set = set(adata.var_names[adata.var["highly_variable"]])
|
|
556
|
+
|
|
557
|
+
if len(hvg_genes_set) == 0:
|
|
558
|
+
raise DataNotFoundError("No HVGs found. Run preprocessing first.")
|
|
559
|
+
|
|
560
|
+
# Filter gene_names to only include HVGs
|
|
561
|
+
hvg_mask = np.array([gene in hvg_genes_set for gene in gene_names])
|
|
562
|
+
n_hvg = hvg_mask.sum()
|
|
563
|
+
|
|
564
|
+
if n_hvg == 0:
|
|
565
|
+
# No overlap between current gene list and HVGs
|
|
566
|
+
raise DataError(
|
|
567
|
+
f"test_only_hvg=True but no overlap found between current gene list ({len(gene_names)} genes) "
|
|
568
|
+
f"and HVGs ({len(hvg_genes_set)} genes). "
|
|
569
|
+
"This may occur if adata.raw contains different genes than the preprocessed data. "
|
|
570
|
+
"Try setting test_only_hvg=False or ensure adata.raw is None."
|
|
571
|
+
)
|
|
572
|
+
|
|
573
|
+
gene_mask &= hvg_mask # Keep only HVGs
|
|
574
|
+
|
|
575
|
+
# TIER 1: Apply SPARK-X standard filtering (expression-based) - ON SPARSE MATRIX
|
|
576
|
+
percentage = params.sparkx_percentage
|
|
577
|
+
min_total_counts = params.sparkx_min_total_counts
|
|
578
|
+
|
|
579
|
+
# Calculate gene statistics on sparse matrix (efficient!)
|
|
580
|
+
gene_totals, n_expressed = _calculate_sparse_gene_stats(sparse_counts)
|
|
581
|
+
|
|
582
|
+
# Filter genes: must be expressed in at least percentage of cells AND have min total counts
|
|
583
|
+
min_cells = int(np.ceil(n_spots * percentage))
|
|
584
|
+
expr_mask = (n_expressed >= min_cells) & (gene_totals >= min_total_counts)
|
|
585
|
+
|
|
586
|
+
gene_mask &= expr_mask # Combine with previous filters
|
|
587
|
+
|
|
588
|
+
# Apply combined filter mask to sparse matrix (still sparse!)
|
|
589
|
+
if gene_mask.sum() < len(gene_names):
|
|
590
|
+
filtered_sparse = sparse_counts[:, gene_mask]
|
|
591
|
+
gene_names = [
|
|
592
|
+
gene for gene, keep in zip(gene_names, gene_mask, strict=False) if keep
|
|
593
|
+
]
|
|
594
|
+
else:
|
|
595
|
+
filtered_sparse = sparse_counts
|
|
596
|
+
|
|
597
|
+
# NOW convert filtered sparse matrix to dense (much smaller!)
|
|
598
|
+
# copy=True ensures we don't modify original for dense input
|
|
599
|
+
counts_matrix = to_dense(filtered_sparse, copy=True)
|
|
600
|
+
|
|
601
|
+
# Ensure counts are non-negative integers
|
|
602
|
+
counts_matrix = np.maximum(counts_matrix, 0).astype(int)
|
|
603
|
+
|
|
604
|
+
# Update gene count after filtering
|
|
605
|
+
n_genes = len(gene_names)
|
|
606
|
+
|
|
607
|
+
# Transpose for SPARK format (genes × spots)
|
|
608
|
+
counts_transposed = counts_matrix.T
|
|
609
|
+
|
|
610
|
+
# Create spot names
|
|
611
|
+
spot_names = [str(name) for name in adata.obs_names]
|
|
612
|
+
|
|
613
|
+
# Wrap ALL R operations in thread lock and localconverter for proper contextvars handling
|
|
614
|
+
# This prevents "Conversion rules missing" errors in multithreaded/async environments
|
|
615
|
+
with openrlib.rlock: # Thread safety lock
|
|
616
|
+
with conversion.localconverter(default_converter): # Conversion context
|
|
617
|
+
# Import SPARK package inside context (FIX for contextvars issue)
|
|
618
|
+
try:
|
|
619
|
+
spark = importr("SPARK")
|
|
620
|
+
except Exception as e:
|
|
621
|
+
raise ImportError(
|
|
622
|
+
f"SPARK not installed in R. Install with: install.packages('SPARK'). Error: {e}"
|
|
623
|
+
) from e
|
|
624
|
+
|
|
625
|
+
# Convert to R format (already in context)
|
|
626
|
+
# Count matrix: genes × spots
|
|
627
|
+
r_counts = ro.r.matrix(
|
|
628
|
+
ro.IntVector(counts_transposed.flatten()),
|
|
629
|
+
nrow=n_genes,
|
|
630
|
+
ncol=n_spots,
|
|
631
|
+
byrow=True,
|
|
632
|
+
)
|
|
633
|
+
r_counts.rownames = ro.StrVector(gene_names)
|
|
634
|
+
r_counts.colnames = ro.StrVector(spot_names)
|
|
635
|
+
|
|
636
|
+
# Coordinates as data.frame (SPARK requirement)
|
|
637
|
+
coords_df = pd.DataFrame(coords_array, columns=["x", "y"], index=spot_names)
|
|
638
|
+
r_coords = ro.r["data.frame"](
|
|
639
|
+
x=ro.FloatVector(coords_df["x"]),
|
|
640
|
+
y=ro.FloatVector(coords_df["y"]),
|
|
641
|
+
row_names=ro.StrVector(coords_df.index),
|
|
642
|
+
)
|
|
643
|
+
|
|
644
|
+
try:
|
|
645
|
+
# Execute SPARK-X analysis inside context (FIX for contextvars issue)
|
|
646
|
+
# Keep suppress_output for MCP communication compatibility
|
|
647
|
+
with suppress_output():
|
|
648
|
+
results = spark.sparkx(
|
|
649
|
+
count_in=r_counts,
|
|
650
|
+
locus_in=r_coords,
|
|
651
|
+
X_in=ro.NULL, # No additional covariates (could be extended in future)
|
|
652
|
+
numCores=params.sparkx_num_core,
|
|
653
|
+
option=params.sparkx_option,
|
|
654
|
+
verbose=False, # Ensure verbose is off for cleaner MCP communication
|
|
655
|
+
)
|
|
656
|
+
|
|
657
|
+
# Extract p-values from results (inside context for proper conversion)
|
|
658
|
+
# SPARK-X returns res_mtest as a data.frame with columns:
|
|
659
|
+
# - combinedPval: combined p-values across spatial kernels
|
|
660
|
+
# - adjustedPval: BY-adjusted p-values (Benjamini-Yekutieli FDR correction)
|
|
661
|
+
# Reference: SPARK R package documentation
|
|
662
|
+
try:
|
|
663
|
+
pvals = results.rx2("res_mtest")
|
|
664
|
+
if pvals is None:
|
|
665
|
+
raise ProcessingError(
|
|
666
|
+
"SPARK-X returned None for res_mtest. "
|
|
667
|
+
"This may indicate the analysis failed silently."
|
|
668
|
+
)
|
|
669
|
+
|
|
670
|
+
# Verify expected data.frame format
|
|
671
|
+
is_dataframe = ro.r["is.data.frame"](pvals)[0]
|
|
672
|
+
if not is_dataframe:
|
|
673
|
+
raise ProcessingError(
|
|
674
|
+
"SPARK-X output format error. Requires SPARK >= 1.1.0."
|
|
675
|
+
)
|
|
676
|
+
|
|
677
|
+
# Extract combinedPval (raw p-values combined across kernels)
|
|
678
|
+
combined_pvals = ro.r["$"](pvals, "combinedPval")
|
|
679
|
+
if combined_pvals is None:
|
|
680
|
+
raise ProcessingError(
|
|
681
|
+
"SPARK-X res_mtest missing 'combinedPval' column. "
|
|
682
|
+
"This is required for spatial gene identification."
|
|
683
|
+
)
|
|
684
|
+
pval_list = [float(p) for p in combined_pvals]
|
|
685
|
+
|
|
686
|
+
# Extract adjustedPval (BY-corrected p-values from SPARK-X)
|
|
687
|
+
adjusted_pvals = ro.r["$"](pvals, "adjustedPval")
|
|
688
|
+
if adjusted_pvals is None:
|
|
689
|
+
raise ProcessingError(
|
|
690
|
+
"SPARK-X res_mtest missing 'adjustedPval' column. "
|
|
691
|
+
"This column contains BY-corrected p-values for multiple testing."
|
|
692
|
+
)
|
|
693
|
+
adjusted_pval_list = [float(p) for p in adjusted_pvals]
|
|
694
|
+
|
|
695
|
+
# Create results dataframe
|
|
696
|
+
results_df = pd.DataFrame(
|
|
697
|
+
{
|
|
698
|
+
"gene": gene_names[: len(pval_list)],
|
|
699
|
+
"pvalue": pval_list,
|
|
700
|
+
"adjusted_pvalue": adjusted_pval_list, # BY-corrected by SPARK-X
|
|
701
|
+
}
|
|
702
|
+
)
|
|
703
|
+
|
|
704
|
+
# Warn if returned genes much fewer than input genes
|
|
705
|
+
if len(results_df) < n_genes * 0.5:
|
|
706
|
+
await ctx.warning(
|
|
707
|
+
f"SPARK-X returned results for only {len(results_df)}/{n_genes} genes. "
|
|
708
|
+
f"This may indicate a problem with the R environment, SPARK package, or input data. "
|
|
709
|
+
f"Consider checking R logs or trying SpatialDE as an alternative method."
|
|
710
|
+
)
|
|
711
|
+
|
|
712
|
+
except Exception as e:
|
|
713
|
+
# P-value extraction failed - provide clear error message
|
|
714
|
+
raise ProcessingError(
|
|
715
|
+
f"SPARK-X p-value extraction failed: {e}\n\n"
|
|
716
|
+
f"Expected SPARK-X output format:\n"
|
|
717
|
+
f"SPARK-X output invalid. Requires SPARK >= 1.1.0."
|
|
718
|
+
) from e
|
|
719
|
+
|
|
720
|
+
except Exception as e:
|
|
721
|
+
raise ProcessingError(f"SPARK-X analysis failed: {e}") from e
|
|
722
|
+
|
|
723
|
+
# Sort by adjusted p-value
|
|
724
|
+
results_df = results_df.sort_values("adjusted_pvalue")
|
|
725
|
+
|
|
726
|
+
# Filter significant genes
|
|
727
|
+
significant_genes_all = results_df[results_df["adjusted_pvalue"] < 0.05][
|
|
728
|
+
"gene"
|
|
729
|
+
].tolist()
|
|
730
|
+
|
|
731
|
+
# Limit for MCP response (full results stored in adata.var)
|
|
732
|
+
limit = params.n_top_genes or DEFAULT_TOP_GENES_LIMIT
|
|
733
|
+
significant_genes = significant_genes_all[:limit]
|
|
734
|
+
|
|
735
|
+
# TIER 3: Housekeeping gene warnings (post-processing quality check)
|
|
736
|
+
if params.warn_housekeeping and len(results_df) > 0:
|
|
737
|
+
# Define housekeeping gene patterns (based on literature)
|
|
738
|
+
housekeeping_patterns = [
|
|
739
|
+
"RPS", # Ribosomal protein small subunit
|
|
740
|
+
"RPL", # Ribosomal protein large subunit
|
|
741
|
+
"Rps", # Mouse ribosomal small
|
|
742
|
+
"Rpl", # Mouse ribosomal large
|
|
743
|
+
"MT-", # Mitochondrial (human)
|
|
744
|
+
"mt-", # Mitochondrial (mouse)
|
|
745
|
+
"ACTB", # Beta-actin
|
|
746
|
+
"GAPDH", # Glyceraldehyde-3-phosphate dehydrogenase
|
|
747
|
+
"EEF1A1", # Eukaryotic translation elongation factor 1 alpha 1
|
|
748
|
+
"TUBA1B", # Tubulin alpha 1b
|
|
749
|
+
"B2M", # Beta-2-microglobulin
|
|
750
|
+
]
|
|
751
|
+
|
|
752
|
+
# Check the top-ranked genes by adjusted p-value (up to 50; not restricted to significant ones)
|
|
753
|
+
top_genes_to_check = results_df.head(50)["gene"].tolist()
|
|
754
|
+
|
|
755
|
+
# Mark housekeeping genes
|
|
756
|
+
housekeeping_genes = [
|
|
757
|
+
gene
|
|
758
|
+
for gene in top_genes_to_check
|
|
759
|
+
if any(
|
|
760
|
+
gene.startswith(pattern) or gene == pattern
|
|
761
|
+
for pattern in housekeeping_patterns
|
|
762
|
+
)
|
|
763
|
+
]
|
|
764
|
+
|
|
765
|
+
n_housekeeping = len(housekeeping_genes)
|
|
766
|
+
n_top = len(top_genes_to_check)
|
|
767
|
+
housekeeping_ratio = n_housekeeping / n_top if n_top > 0 else 0
|
|
768
|
+
|
|
769
|
+
# Warn if >30% are housekeeping genes
|
|
770
|
+
if housekeeping_ratio > 0.3:
|
|
771
|
+
await ctx.warning(
|
|
772
|
+
f"WARNING:Housekeeping gene dominance detected: {n_housekeeping}/{n_top} ({housekeeping_ratio*100:.1f}%) of top genes are housekeeping genes.\n"
|
|
773
|
+
f" • Housekeeping genes found: {', '.join(housekeeping_genes[:10])}{'...' if len(housekeeping_genes) > 10 else ''}\n"
|
|
774
|
+
f" • These genes may not represent true spatial patterns\n"
|
|
775
|
+
f" • Recommendations:\n"
|
|
776
|
+
f" 1. Use test_only_hvg=True to reduce housekeeping dominance (2024 best practice)\n"
|
|
777
|
+
f" 2. Use filter_ribo_genes=True to filter ribosomal genes\n"
|
|
778
|
+
f" 3. Focus on genes with clear biological relevance\n"
|
|
779
|
+
f" • Note: This is a quality warning, not an error"
|
|
780
|
+
)
|
|
781
|
+
|
|
782
|
+
# Store results in adata
|
|
783
|
+
results_key = f"sparkx_results_{data_id}"
|
|
784
|
+
adata.var["sparkx_pval"] = pd.Series(
|
|
785
|
+
dict(zip(results_df["gene"], results_df["pvalue"], strict=False)),
|
|
786
|
+
name="sparkx_pval",
|
|
787
|
+
).reindex(adata.var_names, fill_value=1.0)
|
|
788
|
+
|
|
789
|
+
adata.var["sparkx_qval"] = pd.Series(
|
|
790
|
+
dict(zip(results_df["gene"], results_df["adjusted_pvalue"], strict=False)),
|
|
791
|
+
name="sparkx_qval",
|
|
792
|
+
).reindex(adata.var_names, fill_value=1.0)
|
|
793
|
+
|
|
794
|
+
# Store scientific metadata for reproducibility
|
|
795
|
+
from ..utils.adata_utils import store_analysis_metadata
|
|
796
|
+
|
|
797
|
+
store_analysis_metadata(
|
|
798
|
+
adata,
|
|
799
|
+
analysis_name="spatial_genes_sparkx",
|
|
800
|
+
method="sparkx",
|
|
801
|
+
parameters={
|
|
802
|
+
"num_core": params.sparkx_num_core,
|
|
803
|
+
"percentage": params.sparkx_percentage,
|
|
804
|
+
"min_total_counts": params.sparkx_min_total_counts,
|
|
805
|
+
"option": params.sparkx_option,
|
|
806
|
+
"filter_mt_genes": params.filter_mt_genes,
|
|
807
|
+
"filter_ribo_genes": params.filter_ribo_genes,
|
|
808
|
+
"test_only_hvg": params.test_only_hvg,
|
|
809
|
+
"warn_housekeeping": params.warn_housekeeping,
|
|
810
|
+
},
|
|
811
|
+
results_keys={
|
|
812
|
+
"var": ["sparkx_pval", "sparkx_qval"],
|
|
813
|
+
"obs": [],
|
|
814
|
+
"obsm": [],
|
|
815
|
+
"uns": [],
|
|
816
|
+
},
|
|
817
|
+
statistics={
|
|
818
|
+
"n_genes_analyzed": len(results_df),
|
|
819
|
+
"n_significant_genes": len(significant_genes_all),
|
|
820
|
+
},
|
|
821
|
+
)
|
|
822
|
+
|
|
823
|
+
# Note: Detailed statistics (gene_statistics, p_values, q_values) are excluded
|
|
824
|
+
# from MCP response via Field(exclude=True) in SpatialVariableGenesResult.
|
|
825
|
+
# Full results are accessible via adata.var['sparkx_pval', 'sparkx_qval'].
|
|
826
|
+
|
|
827
|
+
result = SpatialVariableGenesResult(
|
|
828
|
+
data_id=data_id,
|
|
829
|
+
method="sparkx",
|
|
830
|
+
n_genes_analyzed=len(results_df),
|
|
831
|
+
n_significant_genes=len(significant_genes_all),
|
|
832
|
+
spatial_genes=significant_genes,
|
|
833
|
+
results_key=results_key,
|
|
834
|
+
)
|
|
835
|
+
|
|
836
|
+
return result
|