chatspatial 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatspatial/__init__.py +11 -0
- chatspatial/__main__.py +141 -0
- chatspatial/cli/__init__.py +7 -0
- chatspatial/config.py +53 -0
- chatspatial/models/__init__.py +85 -0
- chatspatial/models/analysis.py +513 -0
- chatspatial/models/data.py +2462 -0
- chatspatial/server.py +1763 -0
- chatspatial/spatial_mcp_adapter.py +720 -0
- chatspatial/tools/__init__.py +3 -0
- chatspatial/tools/annotation.py +1903 -0
- chatspatial/tools/cell_communication.py +1603 -0
- chatspatial/tools/cnv_analysis.py +605 -0
- chatspatial/tools/condition_comparison.py +595 -0
- chatspatial/tools/deconvolution/__init__.py +402 -0
- chatspatial/tools/deconvolution/base.py +318 -0
- chatspatial/tools/deconvolution/card.py +244 -0
- chatspatial/tools/deconvolution/cell2location.py +326 -0
- chatspatial/tools/deconvolution/destvi.py +144 -0
- chatspatial/tools/deconvolution/flashdeconv.py +101 -0
- chatspatial/tools/deconvolution/rctd.py +317 -0
- chatspatial/tools/deconvolution/spotlight.py +216 -0
- chatspatial/tools/deconvolution/stereoscope.py +109 -0
- chatspatial/tools/deconvolution/tangram.py +135 -0
- chatspatial/tools/differential.py +625 -0
- chatspatial/tools/embeddings.py +298 -0
- chatspatial/tools/enrichment.py +1863 -0
- chatspatial/tools/integration.py +807 -0
- chatspatial/tools/preprocessing.py +723 -0
- chatspatial/tools/spatial_domains.py +808 -0
- chatspatial/tools/spatial_genes.py +836 -0
- chatspatial/tools/spatial_registration.py +441 -0
- chatspatial/tools/spatial_statistics.py +1476 -0
- chatspatial/tools/trajectory.py +495 -0
- chatspatial/tools/velocity.py +405 -0
- chatspatial/tools/visualization/__init__.py +155 -0
- chatspatial/tools/visualization/basic.py +393 -0
- chatspatial/tools/visualization/cell_comm.py +699 -0
- chatspatial/tools/visualization/cnv.py +320 -0
- chatspatial/tools/visualization/core.py +684 -0
- chatspatial/tools/visualization/deconvolution.py +852 -0
- chatspatial/tools/visualization/enrichment.py +660 -0
- chatspatial/tools/visualization/integration.py +205 -0
- chatspatial/tools/visualization/main.py +164 -0
- chatspatial/tools/visualization/multi_gene.py +739 -0
- chatspatial/tools/visualization/persistence.py +335 -0
- chatspatial/tools/visualization/spatial_stats.py +469 -0
- chatspatial/tools/visualization/trajectory.py +639 -0
- chatspatial/tools/visualization/velocity.py +411 -0
- chatspatial/utils/__init__.py +115 -0
- chatspatial/utils/adata_utils.py +1372 -0
- chatspatial/utils/compute.py +327 -0
- chatspatial/utils/data_loader.py +499 -0
- chatspatial/utils/dependency_manager.py +462 -0
- chatspatial/utils/device_utils.py +165 -0
- chatspatial/utils/exceptions.py +185 -0
- chatspatial/utils/image_utils.py +267 -0
- chatspatial/utils/mcp_utils.py +137 -0
- chatspatial/utils/path_utils.py +243 -0
- chatspatial/utils/persistence.py +78 -0
- chatspatial/utils/scipy_compat.py +143 -0
- chatspatial-1.1.0.dist-info/METADATA +242 -0
- chatspatial-1.1.0.dist-info/RECORD +67 -0
- chatspatial-1.1.0.dist-info/WHEEL +5 -0
- chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
- chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
- chatspatial-1.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,723 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Preprocessing tools for spatial transcriptomics data.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import traceback
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import scanpy as sc
|
|
9
|
+
import scipy.sparse
|
|
10
|
+
|
|
11
|
+
from ..models.analysis import PreprocessingResult
|
|
12
|
+
from ..models.data import PreprocessingParameters
|
|
13
|
+
from ..spatial_mcp_adapter import ToolContext
|
|
14
|
+
from ..utils.adata_utils import (
|
|
15
|
+
ensure_unique_var_names_async,
|
|
16
|
+
sample_expression_values,
|
|
17
|
+
standardize_adata,
|
|
18
|
+
)
|
|
19
|
+
from ..utils.compute import ensure_pca
|
|
20
|
+
from ..utils.dependency_manager import require, validate_r_package
|
|
21
|
+
from ..utils.exceptions import (
|
|
22
|
+
DataError,
|
|
23
|
+
DependencyError,
|
|
24
|
+
ParameterError,
|
|
25
|
+
ProcessingError,
|
|
26
|
+
)
|
|
27
|
+
from ..utils.mcp_utils import mcp_tool_error_handler
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@mcp_tool_error_handler()
|
|
31
|
+
async def preprocess_data(
|
|
32
|
+
data_id: str,
|
|
33
|
+
ctx: ToolContext,
|
|
34
|
+
params: PreprocessingParameters = PreprocessingParameters(),
|
|
35
|
+
) -> PreprocessingResult:
|
|
36
|
+
"""Preprocess spatial transcriptomics data
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
data_id: Dataset ID
|
|
40
|
+
ctx: Tool context for data access and logging
|
|
41
|
+
params: Preprocessing parameters
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
Preprocessing result summary
|
|
45
|
+
"""
|
|
46
|
+
try:
|
|
47
|
+
# Get AnnData directly via ToolContext
|
|
48
|
+
adata = await ctx.get_adata(data_id)
|
|
49
|
+
|
|
50
|
+
# Standardize data format at the entry point
|
|
51
|
+
try:
|
|
52
|
+
adata = standardize_adata(
|
|
53
|
+
adata, copy=False, strict=False, preserve_original=True
|
|
54
|
+
)
|
|
55
|
+
except Exception as e:
|
|
56
|
+
await ctx.warning(
|
|
57
|
+
f"Data standardization failed: {e}. Proceeding with original data."
|
|
58
|
+
)
|
|
59
|
+
# Continue with original data if standardization fails
|
|
60
|
+
|
|
61
|
+
# Validate input data
|
|
62
|
+
if adata.n_obs == 0 or adata.n_vars == 0:
|
|
63
|
+
raise DataError(
|
|
64
|
+
f"Dataset {data_id} is empty: {adata.n_obs} cells, {adata.n_vars} genes"
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
# Handle duplicate gene names (must be done before gene-based operations)
|
|
68
|
+
await ensure_unique_var_names_async(adata, ctx, "data")
|
|
69
|
+
|
|
70
|
+
# 1. Calculate QC metrics (including mitochondrial percentage)
|
|
71
|
+
try:
|
|
72
|
+
# Identify mitochondrial genes (MT-* for human, mt-* for mouse)
|
|
73
|
+
adata.var["mt"] = adata.var_names.str.startswith(("MT-", "mt-"))
|
|
74
|
+
|
|
75
|
+
# Identify ribosomal genes (RPS*, RPL* for human, Rps*, Rpl* for mouse)
|
|
76
|
+
adata.var["ribo"] = adata.var_names.str.startswith(
|
|
77
|
+
("RPS", "RPL", "Rps", "Rpl")
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
# FIX: Adjust percent_top for small datasets
|
|
81
|
+
#
|
|
82
|
+
# Problem: sc.pp.calculate_qc_metrics() uses default percent_top=[50, 100, 200, 500]
|
|
83
|
+
# to calculate "percentage of counts in top N genes". When n_genes < 500,
|
|
84
|
+
# scanpy raises IndexError: "Positions outside range of features"
|
|
85
|
+
# (see scanpy/preprocessing/_qc.py line 392: check_ns decorator)
|
|
86
|
+
#
|
|
87
|
+
# Solution: Dynamically adjust percent_top to only include values < n_genes
|
|
88
|
+
n_genes = adata.n_vars
|
|
89
|
+
default_percent_top = [50, 100, 200, 500]
|
|
90
|
+
|
|
91
|
+
# Filter to only include values that are valid for this dataset
|
|
92
|
+
safe_percent_top = [p for p in default_percent_top if p < n_genes]
|
|
93
|
+
|
|
94
|
+
# For very small datasets (n_genes <= 50, so no default value fits), create proportional values
|
|
95
|
+
if not safe_percent_top:
|
|
96
|
+
safe_percent_top = []
|
|
97
|
+
for fraction in [0.1, 0.25, 0.5]:
|
|
98
|
+
val = max(1, int(n_genes * fraction))
|
|
99
|
+
if val < n_genes and val not in safe_percent_top:
|
|
100
|
+
safe_percent_top.append(val)
|
|
101
|
+
|
|
102
|
+
# Add the largest possible value (n_genes - 1) if reasonable
|
|
103
|
+
if n_genes > 1 and (n_genes - 1) not in safe_percent_top:
|
|
104
|
+
safe_percent_top.append(n_genes - 1)
|
|
105
|
+
|
|
106
|
+
safe_percent_top = (
|
|
107
|
+
sorted(set(safe_percent_top)) if safe_percent_top else None
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
# Calculate QC metrics including mitochondrial and ribosomal percentages
|
|
111
|
+
sc.pp.calculate_qc_metrics(
|
|
112
|
+
adata,
|
|
113
|
+
qc_vars=["mt", "ribo"],
|
|
114
|
+
percent_top=safe_percent_top,
|
|
115
|
+
inplace=True,
|
|
116
|
+
)
|
|
117
|
+
except Exception as e:
|
|
118
|
+
raise ProcessingError(
|
|
119
|
+
f"QC metrics failed: {e}. "
|
|
120
|
+
f"Data: {adata.n_obs}×{adata.n_vars}, type: {type(adata.X).__name__}"
|
|
121
|
+
) from e
|
|
122
|
+
|
|
123
|
+
# Store original QC metrics before filtering (including mito stats)
|
|
124
|
+
mito_pct_col = "pct_counts_mt" if "pct_counts_mt" in adata.obs else None
|
|
125
|
+
qc_metrics = {
|
|
126
|
+
"n_cells_before_filtering": int(adata.n_obs),
|
|
127
|
+
"n_genes_before_filtering": int(adata.n_vars),
|
|
128
|
+
"median_genes_per_cell": float(np.median(adata.obs.n_genes_by_counts)),
|
|
129
|
+
"median_umi_per_cell": float(np.median(adata.obs.total_counts)),
|
|
130
|
+
}
|
|
131
|
+
# Add mitochondrial stats if available
|
|
132
|
+
if mito_pct_col:
|
|
133
|
+
qc_metrics["median_mito_pct"] = float(np.median(adata.obs[mito_pct_col]))
|
|
134
|
+
qc_metrics["max_mito_pct"] = float(np.max(adata.obs[mito_pct_col]))
|
|
135
|
+
qc_metrics["n_mt_genes"] = int(adata.var["mt"].sum())
|
|
136
|
+
|
|
137
|
+
# 2. Apply user-controlled data filtering and subsampling
|
|
138
|
+
min_cells = params.filter_genes_min_cells
|
|
139
|
+
if min_cells is not None and min_cells > 0:
|
|
140
|
+
sc.pp.filter_genes(adata, min_cells=min_cells)
|
|
141
|
+
|
|
142
|
+
min_genes = params.filter_cells_min_genes
|
|
143
|
+
if min_genes is not None and min_genes > 0:
|
|
144
|
+
sc.pp.filter_cells(adata, min_genes=min_genes)
|
|
145
|
+
|
|
146
|
+
# Apply mitochondrial percentage filtering (BEST PRACTICE for spatial data)
|
|
147
|
+
# High mito% indicates damaged cells that have lost cytoplasmic mRNA
|
|
148
|
+
if params.filter_mito_pct is not None and mito_pct_col:
|
|
149
|
+
high_mito_mask = adata.obs[mito_pct_col] > params.filter_mito_pct
|
|
150
|
+
n_high_mito = high_mito_mask.sum()
|
|
151
|
+
|
|
152
|
+
if n_high_mito > 0:
|
|
153
|
+
adata = adata[~high_mito_mask].copy()
|
|
154
|
+
# Update qc_metrics with mito filtering info
|
|
155
|
+
qc_metrics["n_spots_filtered_mito"] = int(n_high_mito)
|
|
156
|
+
elif params.filter_mito_pct is not None and not mito_pct_col:
|
|
157
|
+
await ctx.warning(
|
|
158
|
+
"Mitochondrial filtering requested but no mito genes detected. "
|
|
159
|
+
"This may indicate non-standard gene naming or imaging-based data."
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
# Apply spot subsampling if requested
|
|
163
|
+
if params.subsample_spots is not None and params.subsample_spots < adata.n_obs:
|
|
164
|
+
sc.pp.subsample(
|
|
165
|
+
adata,
|
|
166
|
+
n_obs=params.subsample_spots,
|
|
167
|
+
random_state=params.subsample_random_seed,
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
# Record whether gene subsampling was requested; it is applied later, after HVG selection
|
|
171
|
+
gene_subsample_requested = params.subsample_genes is not None
|
|
172
|
+
|
|
173
|
+
# Save raw data before normalization (required for some analysis methods)
|
|
174
|
+
|
|
175
|
+
# IMPORTANT: Create a proper frozen copy for .raw to preserve counts
|
|
176
|
+
# Using `adata.raw = adata` creates a view that gets modified during normalization
|
|
177
|
+
# We need to create an independent AnnData object to truly preserve counts
|
|
178
|
+
import anndata as ad_module
|
|
179
|
+
|
|
180
|
+
# Memory optimization: AnnData.raw internally copies var, so no need for .copy()
|
|
181
|
+
# obs MUST be copied to prevent contamination from later preprocessing steps
|
|
182
|
+
# uns can be empty dict as raw doesn't need metadata
|
|
183
|
+
adata.raw = ad_module.AnnData(
|
|
184
|
+
X=adata.X.copy(), # Must copy - will be modified during normalization
|
|
185
|
+
var=adata.var, # No copy needed - AnnData internally creates independent copy
|
|
186
|
+
obs=adata.obs.copy(), # Must copy - will be modified by clustering/annotation
|
|
187
|
+
uns={}, # Empty dict - raw doesn't need uns metadata
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
# Store counts layer for scVI-tools compatibility (Cell2location, scANVI, DestVI)
|
|
191
|
+
# Note: This layer follows adata through HVG subsetting, complementing adata.raw
|
|
192
|
+
# - adata.raw: Full gene set (for cell communication needing complete L-R coverage)
|
|
193
|
+
# - adata.layers["counts"]: HVG subset after filtering (for scVI-tools alignment)
|
|
194
|
+
adata.layers["counts"] = adata.X.copy()
|
|
195
|
+
|
|
196
|
+
# Store preprocessing metadata following scanpy/anndata conventions
|
|
197
|
+
# This metadata enables downstream tools to reuse gene annotations
|
|
198
|
+
adata.uns["preprocessing"] = {
|
|
199
|
+
"normalization": params.normalization,
|
|
200
|
+
"raw_preserved": True,
|
|
201
|
+
"counts_layer": True,
|
|
202
|
+
"n_genes_before_norm": adata.n_vars,
|
|
203
|
+
# Gene type annotations - downstream tools should reuse these
|
|
204
|
+
"gene_annotations": {
|
|
205
|
+
"mt_column": "mt" if "mt" in adata.var.columns else None,
|
|
206
|
+
"ribo_column": "ribo" if "ribo" in adata.var.columns else None,
|
|
207
|
+
"n_mt_genes": (
|
|
208
|
+
int(adata.var["mt"].sum()) if "mt" in adata.var.columns else 0
|
|
209
|
+
),
|
|
210
|
+
"n_ribo_genes": (
|
|
211
|
+
int(adata.var["ribo"].sum()) if "ribo" in adata.var.columns else 0
|
|
212
|
+
),
|
|
213
|
+
},
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
# Update QC metrics after filtering
|
|
217
|
+
qc_metrics.update(
|
|
218
|
+
{
|
|
219
|
+
"n_cells_after_filtering": int(adata.n_obs),
|
|
220
|
+
"n_genes_after_filtering": int(adata.n_vars),
|
|
221
|
+
}
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
# 3. Normalize data
|
|
225
|
+
# Log normalization configuration (developer log)
|
|
226
|
+
norm_config = {
|
|
227
|
+
"Method": params.normalization,
|
|
228
|
+
"Target sum": (
|
|
229
|
+
f"{params.normalize_target_sum:.0f}"
|
|
230
|
+
if params.normalize_target_sum is not None
|
|
231
|
+
else "ADAPTIVE (using median counts)"
|
|
232
|
+
),
|
|
233
|
+
}
|
|
234
|
+
if params.scale:
|
|
235
|
+
norm_config["Scale clipping"] = (
|
|
236
|
+
f"±{params.scale_max_value} SD"
|
|
237
|
+
if params.scale_max_value is not None
|
|
238
|
+
else "NONE (preserving all outliers)"
|
|
239
|
+
)
|
|
240
|
+
ctx.log_config("Normalization Configuration", norm_config)
|
|
241
|
+
|
|
242
|
+
if params.normalization == "log":
|
|
243
|
+
# Standard log normalization
|
|
244
|
+
# Check if data appears to be already normalized
|
|
245
|
+
X_sample = sample_expression_values(adata)
|
|
246
|
+
|
|
247
|
+
# Check for negative values (indicates already log-normalized data)
|
|
248
|
+
if np.any(X_sample < 0):
|
|
249
|
+
error_msg = (
|
|
250
|
+
"Log normalization requires non-negative data (raw or normalized counts). "
|
|
251
|
+
"Data contains negative values, suggesting it has already been log-normalized. "
|
|
252
|
+
"Options:\n"
|
|
253
|
+
"• Use normalization='none' if data is already pre-processed\n"
|
|
254
|
+
"• Load raw count data instead of processed data\n"
|
|
255
|
+
"• Remove the log transformation from your data before re-processing"
|
|
256
|
+
)
|
|
257
|
+
raise DataError(error_msg)
|
|
258
|
+
|
|
259
|
+
if params.normalize_target_sum is not None:
|
|
260
|
+
sc.pp.normalize_total(adata, target_sum=params.normalize_target_sum)
|
|
261
|
+
else:
|
|
262
|
+
# Calculate median for adaptive normalization
|
|
263
|
+
calculated_median = np.median(np.array(adata.X.sum(axis=1)).flatten())
|
|
264
|
+
sc.pp.normalize_total(adata, target_sum=calculated_median)
|
|
265
|
+
sc.pp.log1p(adata)
|
|
266
|
+
elif params.normalization == "sct":
|
|
267
|
+
# SCTransform v2 variance-stabilizing normalization via R's sctransform
|
|
268
|
+
# Check R sctransform availability using centralized dependency manager
|
|
269
|
+
try:
|
|
270
|
+
validate_r_package("sctransform", ctx)
|
|
271
|
+
validate_r_package("Matrix", ctx)
|
|
272
|
+
except ImportError as e:
|
|
273
|
+
full_error = (
|
|
274
|
+
f"SCTransform requires R and the sctransform package.\n\n"
|
|
275
|
+
f"ERROR: {e}\n\n"
|
|
276
|
+
"INSTALLATION:\n"
|
|
277
|
+
" 1. Install R (https://cran.r-project.org/)\n"
|
|
278
|
+
" 2. In R: install.packages('sctransform')\n"
|
|
279
|
+
" 3. pip install 'rpy2>=3.5.0'\n\n"
|
|
280
|
+
"ALTERNATIVES:\n"
|
|
281
|
+
"• Use normalization='pearson_residuals' (built-in, similar results)\n"
|
|
282
|
+
"• Use normalization='log' (standard method)"
|
|
283
|
+
)
|
|
284
|
+
raise DependencyError(full_error) from e
|
|
285
|
+
|
|
286
|
+
# Check if data appears to be raw counts (required for SCTransform)
|
|
287
|
+
X_sample = sample_expression_values(adata)
|
|
288
|
+
|
|
289
|
+
# Check for non-integer values (indicates normalized data)
|
|
290
|
+
if np.any((X_sample % 1) != 0):
|
|
291
|
+
raise DataError(
|
|
292
|
+
"SCTransform requires raw count data (integers). "
|
|
293
|
+
"Use normalization='log' for normalized data."
|
|
294
|
+
)
|
|
295
|
+
|
|
296
|
+
# Map method parameter to vst.flavor
|
|
297
|
+
vst_flavor = "v2" if params.sct_method == "fix-slope" else "v1"
|
|
298
|
+
|
|
299
|
+
try:
|
|
300
|
+
# Import rpy2 modules
|
|
301
|
+
import rpy2.robjects as ro
|
|
302
|
+
from rpy2.robjects import numpy2ri
|
|
303
|
+
from rpy2.robjects.conversion import localconverter
|
|
304
|
+
|
|
305
|
+
# Note: counts layer (adata.layers["counts"]) was already saved earlier in this function, before normalization
|
|
306
|
+
# It will be properly subsetted if SCT filters genes
|
|
307
|
+
# Convert to sparse CSC matrix (genes × cells) for R's dgCMatrix
|
|
308
|
+
if scipy.sparse.issparse(adata.X):
|
|
309
|
+
counts_sparse = scipy.sparse.csc_matrix(adata.X.T)
|
|
310
|
+
else:
|
|
311
|
+
counts_sparse = scipy.sparse.csc_matrix(adata.X.T)
|
|
312
|
+
|
|
313
|
+
# Transfer sparse matrix components to R
|
|
314
|
+
with localconverter(ro.default_converter + numpy2ri.converter):
|
|
315
|
+
ro.globalenv["sp_data"] = counts_sparse.data.astype(np.float64)
|
|
316
|
+
ro.globalenv["sp_indices"] = counts_sparse.indices.astype(np.int32)
|
|
317
|
+
ro.globalenv["sp_indptr"] = counts_sparse.indptr.astype(np.int32)
|
|
318
|
+
ro.globalenv["n_genes"] = counts_sparse.shape[0]
|
|
319
|
+
ro.globalenv["n_cells"] = counts_sparse.shape[1]
|
|
320
|
+
ro.globalenv["gene_names"] = ro.StrVector(adata.var_names.tolist())
|
|
321
|
+
ro.globalenv["cell_names"] = ro.StrVector(adata.obs_names.tolist())
|
|
322
|
+
ro.globalenv["vst_flavor"] = vst_flavor
|
|
323
|
+
ro.globalenv["n_cells_param"] = (
|
|
324
|
+
params.sct_n_cells if params.sct_n_cells else ro.NULL
|
|
325
|
+
)
|
|
326
|
+
|
|
327
|
+
# Reconstruct sparse matrix and run SCTransform in R
|
|
328
|
+
ro.r(
|
|
329
|
+
"""
|
|
330
|
+
library(Matrix)
|
|
331
|
+
library(sctransform)
|
|
332
|
+
|
|
333
|
+
# Create dgCMatrix from components
|
|
334
|
+
umi_matrix <- new(
|
|
335
|
+
"dgCMatrix",
|
|
336
|
+
x = as.numeric(sp_data),
|
|
337
|
+
i = as.integer(sp_indices),
|
|
338
|
+
p = as.integer(sp_indptr),
|
|
339
|
+
Dim = as.integer(c(n_genes, n_cells)),
|
|
340
|
+
Dimnames = list(gene_names, cell_names)
|
|
341
|
+
)
|
|
342
|
+
|
|
343
|
+
# Run SCTransform
|
|
344
|
+
suppressWarnings({
|
|
345
|
+
vst_result <- sctransform::vst(
|
|
346
|
+
umi = umi_matrix,
|
|
347
|
+
vst.flavor = vst_flavor,
|
|
348
|
+
return_gene_attr = TRUE,
|
|
349
|
+
return_cell_attr = TRUE,
|
|
350
|
+
n_cells = n_cells_param,
|
|
351
|
+
verbosity = 0
|
|
352
|
+
)
|
|
353
|
+
})
|
|
354
|
+
|
|
355
|
+
# Convert output to dense matrix for transfer
|
|
356
|
+
pearson_residuals <- as.matrix(vst_result$y)
|
|
357
|
+
residual_variance <- vst_result$gene_attr$residual_variance
|
|
358
|
+
# Extract gene names that survived SCTransform filtering
|
|
359
|
+
kept_genes <- rownames(vst_result$y)
|
|
360
|
+
"""
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
# Extract results from R
|
|
364
|
+
with localconverter(ro.default_converter + numpy2ri.converter):
|
|
365
|
+
pearson_residuals = np.array(ro.r("pearson_residuals"))
|
|
366
|
+
residual_variance = np.array(ro.r("residual_variance"))
|
|
367
|
+
kept_genes = list(ro.r("kept_genes"))
|
|
368
|
+
|
|
369
|
+
# CRITICAL FIX: Subset adata to match genes returned by SCTransform
|
|
370
|
+
# R's sctransform internally filters genes, so we need to subset
|
|
371
|
+
n_genes_before_sct = adata.n_vars
|
|
372
|
+
if len(kept_genes) != adata.n_vars:
|
|
373
|
+
n_filtered = adata.n_vars - len(kept_genes)
|
|
374
|
+
# Subset adata to keep only genes returned by SCTransform
|
|
375
|
+
adata = adata[:, kept_genes].copy()
|
|
376
|
+
else:
|
|
377
|
+
n_filtered = 0
|
|
378
|
+
|
|
379
|
+
# Transpose back to cells × genes for AnnData format
|
|
380
|
+
adata.X = pearson_residuals.T
|
|
381
|
+
|
|
382
|
+
# Store SCTransform metadata
|
|
383
|
+
adata.uns["sctransform"] = {
|
|
384
|
+
"method": params.sct_method,
|
|
385
|
+
"vst_flavor": vst_flavor,
|
|
386
|
+
"var_features_n": params.sct_var_features_n,
|
|
387
|
+
"exclude_poisson": params.sct_exclude_poisson,
|
|
388
|
+
"n_cells": params.sct_n_cells,
|
|
389
|
+
"n_genes_before": n_genes_before_sct,
|
|
390
|
+
"n_genes_after": len(kept_genes),
|
|
391
|
+
"n_genes_filtered_by_sct": n_filtered,
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
# Mark highly variable genes based on residual variance
|
|
395
|
+
# Now adata has been subset, so residual_variance should match adata.n_vars
|
|
396
|
+
if len(residual_variance) != adata.n_vars:
|
|
397
|
+
error_msg = (
|
|
398
|
+
f"Dimension mismatch after SCTransform: "
|
|
399
|
+
f"residual_variance has {len(residual_variance)} values "
|
|
400
|
+
f"but adata has {adata.n_vars} genes"
|
|
401
|
+
)
|
|
402
|
+
raise ProcessingError(error_msg)
|
|
403
|
+
|
|
404
|
+
adata.var["sct_residual_variance"] = residual_variance
|
|
405
|
+
|
|
406
|
+
# Select top N genes by residual variance
|
|
407
|
+
n_hvg = min(params.sct_var_features_n, len(residual_variance))
|
|
408
|
+
top_hvg_indices = np.argsort(residual_variance)[-n_hvg:]
|
|
409
|
+
adata.var["highly_variable"] = False
|
|
410
|
+
adata.var.iloc[
|
|
411
|
+
top_hvg_indices, adata.var.columns.get_loc("highly_variable")
|
|
412
|
+
] = True
|
|
413
|
+
|
|
414
|
+
except MemoryError as e:
|
|
415
|
+
raise MemoryError(
|
|
416
|
+
f"Memory error for SCTransform on {adata.n_obs}×{adata.n_vars} matrix. "
|
|
417
|
+
f"Use normalization='log' or subsample data."
|
|
418
|
+
) from e
|
|
419
|
+
except Exception as e:
|
|
420
|
+
raise ProcessingError(f"SCTransform failed: {e}") from e
|
|
421
|
+
elif params.normalization == "pearson_residuals":
|
|
422
|
+
# Modern Pearson residuals normalization (recommended for UMI data)
|
|
423
|
+
|
|
424
|
+
# Check if method is available
|
|
425
|
+
if not hasattr(sc.experimental.pp, "normalize_pearson_residuals"):
|
|
426
|
+
error_msg = (
|
|
427
|
+
"Pearson residuals normalization not available (requires scanpy>=1.9.0).\n"
|
|
428
|
+
"Options:\n"
|
|
429
|
+
"• Install newer scanpy: pip install 'scanpy>=1.9.0'\n"
|
|
430
|
+
"• Use log normalization instead: params.normalization='log'\n"
|
|
431
|
+
"• Skip normalization if data is pre-processed: params.normalization='none'"
|
|
432
|
+
)
|
|
433
|
+
raise DependencyError(error_msg)
|
|
434
|
+
|
|
435
|
+
# Check if data appears to be raw counts
|
|
436
|
+
X_sample = sample_expression_values(adata)
|
|
437
|
+
|
|
438
|
+
# Check for non-integer values (indicates normalized data)
|
|
439
|
+
if np.any((X_sample % 1) != 0):
|
|
440
|
+
raise DataError(
|
|
441
|
+
"Pearson residuals requires raw count data (integers). "
|
|
442
|
+
"Data contains non-integer values. "
|
|
443
|
+
"Use params.normalization='none' if data is already normalized, "
|
|
444
|
+
"or params.normalization='log' for standard normalization."
|
|
445
|
+
)
|
|
446
|
+
|
|
447
|
+
# Execute normalization
|
|
448
|
+
try:
|
|
449
|
+
# Apply Pearson residuals normalization (to all genes)
|
|
450
|
+
# Note: Highly variable gene selection happens later in the pipeline
|
|
451
|
+
sc.experimental.pp.normalize_pearson_residuals(adata)
|
|
452
|
+
except MemoryError as e:
|
|
453
|
+
raise MemoryError(
|
|
454
|
+
f"Insufficient memory for Pearson residuals on {adata.n_obs}×{adata.n_vars} matrix. "
|
|
455
|
+
"Try reducing n_hvgs or use 'log' normalization."
|
|
456
|
+
) from e
|
|
457
|
+
except Exception as e:
|
|
458
|
+
raise ProcessingError(
|
|
459
|
+
f"Pearson residuals normalization failed: {e}. "
|
|
460
|
+
"Consider using 'log' normalization instead."
|
|
461
|
+
) from e
|
|
462
|
+
elif params.normalization == "none":
|
|
463
|
+
# Explicitly skip normalization
|
|
464
|
+
|
|
465
|
+
# CRITICAL: Check if data appears to be raw counts
|
|
466
|
+
# HVG selection requires normalized data for statistical validity
|
|
467
|
+
X_sample = sample_expression_values(adata)
|
|
468
|
+
|
|
469
|
+
# Check if data looks raw (all integers and high values)
|
|
470
|
+
if np.all((X_sample % 1) == 0) and np.max(X_sample) > 100:
|
|
471
|
+
error_msg = (
|
|
472
|
+
"STATISTICAL ERROR: Cannot perform HVG selection on raw counts with normalization='none'\n\n"
|
|
473
|
+
"Your data appears to be raw counts (integer values with max > 100), but you specified "
|
|
474
|
+
"normalization='none'. Highly variable gene (HVG) selection requires normalized data "
|
|
475
|
+
"for statistical validity because:\n"
|
|
476
|
+
"• Raw count variance scales non-linearly with expression level\n"
|
|
477
|
+
"• This prevents accurate comparison of variability across genes\n"
|
|
478
|
+
"• Scanpy's HVG algorithm will fail with 'infinity' errors\n\n"
|
|
479
|
+
"REQUIRED ACTIONS:\n"
|
|
480
|
+
"Option 1 (Recommended): Use normalization='log' for standard log-normalization\n"
|
|
481
|
+
"Option 2: Use normalization='pearson_residuals' for variance-stabilizing normalization\n"
|
|
482
|
+
"Option 3: Pre-normalize your data externally, then reload with normalized values\n\n"
|
|
483
|
+
"WARNING: If your data is already normalized but appears raw, verify data integrity."
|
|
484
|
+
)
|
|
485
|
+
raise DataError(error_msg)
|
|
486
|
+
elif params.normalization == "scvi":
|
|
487
|
+
# scVI deep learning-based normalization
|
|
488
|
+
# Uses variational autoencoder to learn latent representation
|
|
489
|
+
require("scvi", feature="scVI normalization")
|
|
490
|
+
import scvi
|
|
491
|
+
|
|
492
|
+
# Check if data appears to be raw counts (required for scVI)
|
|
493
|
+
X_sample = sample_expression_values(adata)
|
|
494
|
+
|
|
495
|
+
# Check for negative values (indicates already normalized data)
|
|
496
|
+
if np.any(X_sample < 0):
|
|
497
|
+
raise DataError(
|
|
498
|
+
"scVI requires non-negative count data. Data contains negative values."
|
|
499
|
+
)
|
|
500
|
+
|
|
501
|
+
try:
|
|
502
|
+
# Note: counts layer (adata.layers["counts"]) was already saved earlier in this function, before normalization
|
|
503
|
+
# scVI requires this layer for proper count-based modeling
|
|
504
|
+
|
|
505
|
+
# Setup AnnData for scVI using the pre-saved counts layer
|
|
506
|
+
scvi.model.SCVI.setup_anndata(
|
|
507
|
+
adata,
|
|
508
|
+
layer="counts",
|
|
509
|
+
batch_key=(
|
|
510
|
+
params.batch_key
|
|
511
|
+
if params.batch_key in adata.obs.columns
|
|
512
|
+
else None
|
|
513
|
+
),
|
|
514
|
+
)
|
|
515
|
+
|
|
516
|
+
# Create scVI model with user-specified parameters
|
|
517
|
+
scvi_model = scvi.model.SCVI(
|
|
518
|
+
adata,
|
|
519
|
+
n_hidden=params.scvi_n_hidden,
|
|
520
|
+
n_latent=params.scvi_n_latent,
|
|
521
|
+
n_layers=params.scvi_n_layers,
|
|
522
|
+
dropout_rate=params.scvi_dropout_rate,
|
|
523
|
+
gene_likelihood=params.scvi_gene_likelihood,
|
|
524
|
+
)
|
|
525
|
+
|
|
526
|
+
# Train the model with user-configurable parameters
|
|
527
|
+
scvi_model.train(
|
|
528
|
+
max_epochs=params.scvi_max_epochs,
|
|
529
|
+
early_stopping=params.scvi_early_stopping,
|
|
530
|
+
early_stopping_patience=params.scvi_early_stopping_patience,
|
|
531
|
+
early_stopping_monitor="elbo_validation",
|
|
532
|
+
train_size=params.scvi_train_size,
|
|
533
|
+
)
|
|
534
|
+
|
|
535
|
+
# Get latent representation (replaces PCA)
|
|
536
|
+
adata.obsm["X_scvi"] = scvi_model.get_latent_representation()
|
|
537
|
+
|
|
538
|
+
# Get normalized expression for downstream analysis
|
|
539
|
+
# This is the denoised, batch-corrected expression
|
|
540
|
+
normalized_expr = scvi_model.get_normalized_expression(
|
|
541
|
+
library_size=1e4 # Normalize to 10k counts
|
|
542
|
+
)
|
|
543
|
+
# Store as dense array (normalized expression is typically dense)
|
|
544
|
+
if hasattr(normalized_expr, "values"):
|
|
545
|
+
adata.X = normalized_expr.values
|
|
546
|
+
else:
|
|
547
|
+
adata.X = np.array(normalized_expr)
|
|
548
|
+
|
|
549
|
+
# Apply log1p for downstream compatibility
|
|
550
|
+
adata.X = np.log1p(adata.X)
|
|
551
|
+
|
|
552
|
+
# Store scVI metadata
|
|
553
|
+
adata.uns["scvi"] = {
|
|
554
|
+
"n_hidden": params.scvi_n_hidden,
|
|
555
|
+
"n_latent": params.scvi_n_latent,
|
|
556
|
+
"n_layers": params.scvi_n_layers,
|
|
557
|
+
"dropout_rate": params.scvi_dropout_rate,
|
|
558
|
+
"gene_likelihood": params.scvi_gene_likelihood,
|
|
559
|
+
"training_completed": True,
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
except Exception as e:
|
|
563
|
+
raise ProcessingError(f"scVI normalization failed: {e}") from e
|
|
564
|
+
else:
|
|
565
|
+
# Catch unknown normalization methods
|
|
566
|
+
valid_methods = ["log", "sct", "pearson_residuals", "none", "scvi"]
|
|
567
|
+
raise ParameterError(
|
|
568
|
+
f"Unknown normalization method: '{params.normalization}'. "
|
|
569
|
+
f"Valid options are: {', '.join(valid_methods)}"
|
|
570
|
+
)
|
|
571
|
+
|
|
572
|
+
# 4. Find highly variable genes and apply gene subsampling
|
|
573
|
+
# Determine number of HVGs to select
|
|
574
|
+
if gene_subsample_requested:
|
|
575
|
+
# User wants to subsample genes
|
|
576
|
+
n_hvgs = min(params.subsample_genes, adata.n_vars - 1, params.n_hvgs)
|
|
577
|
+
else:
|
|
578
|
+
# Use standard HVG selection
|
|
579
|
+
n_hvgs = min(params.n_hvgs, adata.n_vars - 1)
|
|
580
|
+
|
|
581
|
+
# Statistical warning: Very low HVG count may lead to unstable clustering
|
|
582
|
+
# Based on literature consensus: 500-5000 genes recommended, 1000-2000 typical
|
|
583
|
+
# References:
|
|
584
|
+
# - Bioconductor OSCA: "any value from 500 to 5000 is reasonable"
|
|
585
|
+
# - Single-cell best practices: typical range 1000-2000
|
|
586
|
+
if n_hvgs < 500:
|
|
587
|
+
await ctx.warning(
|
|
588
|
+
f"Using only {n_hvgs} HVGs is below the recommended minimum of 500 genes.\n"
|
|
589
|
+
f" • Literature consensus: 500-5000 genes (typical: 1000-2000)\n"
|
|
590
|
+
f" • Low gene counts may lead to unstable clustering results\n"
|
|
591
|
+
f" • Recommended: Use n_hvgs=1000-2000 for most analyses\n"
|
|
592
|
+
f" • Current dataset: {adata.n_obs} cells × {adata.n_vars} total genes"
|
|
593
|
+
)
|
|
594
|
+
|
|
595
|
+
# Check if we should use all genes (for very small gene sets like MERFISH)
|
|
596
|
+
if adata.n_vars < 100:
|
|
597
|
+
adata.var["highly_variable"] = True
|
|
598
|
+
else:
|
|
599
|
+
# Attempt HVG selection - no fallback for failures
|
|
600
|
+
try:
|
|
601
|
+
sc.pp.highly_variable_genes(adata, n_top_genes=n_hvgs)
|
|
602
|
+
except Exception as e:
|
|
603
|
+
raise ProcessingError(
|
|
604
|
+
f"HVG selection failed: {e}. "
|
|
605
|
+
f"Data: {adata.n_obs}×{adata.n_vars}, requested: {n_hvgs} HVGs."
|
|
606
|
+
) from e
|
|
607
|
+
|
|
608
|
+
# Exclude mitochondrial genes from HVG selection (BEST PRACTICE)
|
|
609
|
+
# Mito genes can dominate HVG due to high expression and technical variation
|
|
610
|
+
if params.remove_mito_genes and "mt" in adata.var.columns:
|
|
611
|
+
n_mito_hvg = (adata.var["highly_variable"] & adata.var["mt"]).sum()
|
|
612
|
+
if n_mito_hvg > 0:
|
|
613
|
+
adata.var.loc[adata.var["mt"], "highly_variable"] = False
|
|
614
|
+
|
|
615
|
+
# Exclude ribosomal genes from HVG selection (optional)
|
|
616
|
+
if params.remove_ribo_genes and "ribo" in adata.var.columns:
|
|
617
|
+
n_ribo_hvg = (adata.var["highly_variable"] & adata.var["ribo"]).sum()
|
|
618
|
+
if n_ribo_hvg > 0:
|
|
619
|
+
adata.var.loc[adata.var["ribo"], "highly_variable"] = False
|
|
620
|
+
|
|
621
|
+
# Apply gene subsampling if requested
|
|
622
|
+
if gene_subsample_requested and params.subsample_genes < adata.n_vars:
|
|
623
|
+
# Ensure HVG selection was successful
|
|
624
|
+
if "highly_variable" not in adata.var:
|
|
625
|
+
raise ProcessingError(
|
|
626
|
+
"Gene subsampling failed: no HVGs identified. Run HVG selection first."
|
|
627
|
+
)
|
|
628
|
+
|
|
629
|
+
if not adata.var["highly_variable"].any():
|
|
630
|
+
raise DataError(
|
|
631
|
+
"Gene subsampling requested but no genes were marked as highly variable. "
|
|
632
|
+
"Check HVG selection parameters or data quality."
|
|
633
|
+
)
|
|
634
|
+
|
|
635
|
+
# Use properly identified HVGs
|
|
636
|
+
adata = adata[:, adata.var["highly_variable"]].copy()
|
|
637
|
+
|
|
638
|
+
# 5. Batch effect correction (if applicable)
|
|
639
|
+
if (
|
|
640
|
+
params.batch_key in adata.obs
|
|
641
|
+
and len(adata.obs[params.batch_key].unique()) > 1
|
|
642
|
+
):
|
|
643
|
+
try:
|
|
644
|
+
# Use Harmony for batch correction (modern standard, works on PCA space)
|
|
645
|
+
# Harmony is more robust than ComBat for single-cell/spatial data
|
|
646
|
+
# Use centralized dependency manager for consistent error handling
|
|
647
|
+
require(
|
|
648
|
+
"harmonypy"
|
|
649
|
+
) # Raises ImportError with install instructions if missing
|
|
650
|
+
import scanpy.external as sce
|
|
651
|
+
|
|
652
|
+
# Harmony requires PCA - use lazy computation
|
|
653
|
+
ensure_pca(adata, n_comps=min(50, adata.n_vars - 1))
|
|
654
|
+
|
|
655
|
+
sce.pp.harmony_integrate(adata, key=params.batch_key)
|
|
656
|
+
except Exception as e:
|
|
657
|
+
raise ProcessingError(
|
|
658
|
+
f"Harmony batch correction failed: {e}. "
|
|
659
|
+
f"Check batch sizes or try scVI/BBKNN integration."
|
|
660
|
+
) from e
|
|
661
|
+
|
|
662
|
+
# 6. Scale data (if requested)
|
|
663
|
+
if params.scale:
|
|
664
|
+
try:
|
|
665
|
+
# Trust scanpy's internal zero-variance handling and sparse matrix optimization
|
|
666
|
+
sc.pp.scale(adata, max_value=params.scale_max_value)
|
|
667
|
+
|
|
668
|
+
# Clean up any NaN/Inf values that might remain (sparse-matrix safe)
|
|
669
|
+
# Only apply if we have a max_value for clipping
|
|
670
|
+
if params.scale_max_value is not None:
|
|
671
|
+
if hasattr(adata.X, "data"):
|
|
672
|
+
# Sparse matrix - only modify the data array
|
|
673
|
+
adata.X.data = np.nan_to_num(
|
|
674
|
+
adata.X.data,
|
|
675
|
+
nan=0.0,
|
|
676
|
+
posinf=params.scale_max_value,
|
|
677
|
+
neginf=-params.scale_max_value,
|
|
678
|
+
)
|
|
679
|
+
else:
|
|
680
|
+
# Dense matrix
|
|
681
|
+
adata.X = np.nan_to_num(
|
|
682
|
+
adata.X,
|
|
683
|
+
nan=0.0,
|
|
684
|
+
posinf=params.scale_max_value,
|
|
685
|
+
neginf=-params.scale_max_value,
|
|
686
|
+
)
|
|
687
|
+
|
|
688
|
+
except Exception as e:
|
|
689
|
+
await ctx.warning(f"Scaling failed: {e}. Continuing without scaling.")
|
|
690
|
+
|
|
691
|
+
# Store preprocessing metadata for downstream tools
|
|
692
|
+
# PCA, UMAP, clustering, and spatial neighbors are computed lazily
|
|
693
|
+
# by analysis tools using ensure_* functions from utils.compute
|
|
694
|
+
adata.uns["preprocessing"]["completed"] = True
|
|
695
|
+
adata.uns["preprocessing"]["n_pcs"] = params.n_pcs
|
|
696
|
+
adata.uns["preprocessing"]["n_neighbors"] = params.n_neighbors
|
|
697
|
+
adata.uns["preprocessing"][
|
|
698
|
+
"clustering_resolution"
|
|
699
|
+
] = params.clustering_resolution
|
|
700
|
+
|
|
701
|
+
# Store the processed AnnData object back via ToolContext
|
|
702
|
+
await ctx.set_adata(data_id, adata)
|
|
703
|
+
|
|
704
|
+
# Return preprocessing result
|
|
705
|
+
# Note: clusters=0 indicates clustering not yet performed
|
|
706
|
+
# Analysis tools will compute clustering lazily when needed
|
|
707
|
+
return PreprocessingResult(
|
|
708
|
+
data_id=data_id,
|
|
709
|
+
n_cells=adata.n_obs,
|
|
710
|
+
n_genes=adata.n_vars,
|
|
711
|
+
n_hvgs=(
|
|
712
|
+
int(sum(adata.var.highly_variable))
|
|
713
|
+
if "highly_variable" in adata.var
|
|
714
|
+
else 0
|
|
715
|
+
),
|
|
716
|
+
clusters=0, # Clustering computed lazily by analysis tools
|
|
717
|
+
qc_metrics=qc_metrics,
|
|
718
|
+
)
|
|
719
|
+
|
|
720
|
+
except Exception as e:
|
|
721
|
+
error_msg = f"Error in preprocessing: {e}"
|
|
722
|
+
tb = traceback.format_exc()
|
|
723
|
+
raise ProcessingError(f"{error_msg}\n{tb}") from e
|