chatspatial-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. chatspatial/__init__.py +11 -0
  2. chatspatial/__main__.py +141 -0
  3. chatspatial/cli/__init__.py +7 -0
  4. chatspatial/config.py +53 -0
  5. chatspatial/models/__init__.py +85 -0
  6. chatspatial/models/analysis.py +513 -0
  7. chatspatial/models/data.py +2462 -0
  8. chatspatial/server.py +1763 -0
  9. chatspatial/spatial_mcp_adapter.py +720 -0
  10. chatspatial/tools/__init__.py +3 -0
  11. chatspatial/tools/annotation.py +1903 -0
  12. chatspatial/tools/cell_communication.py +1603 -0
  13. chatspatial/tools/cnv_analysis.py +605 -0
  14. chatspatial/tools/condition_comparison.py +595 -0
  15. chatspatial/tools/deconvolution/__init__.py +402 -0
  16. chatspatial/tools/deconvolution/base.py +318 -0
  17. chatspatial/tools/deconvolution/card.py +244 -0
  18. chatspatial/tools/deconvolution/cell2location.py +326 -0
  19. chatspatial/tools/deconvolution/destvi.py +144 -0
  20. chatspatial/tools/deconvolution/flashdeconv.py +101 -0
  21. chatspatial/tools/deconvolution/rctd.py +317 -0
  22. chatspatial/tools/deconvolution/spotlight.py +216 -0
  23. chatspatial/tools/deconvolution/stereoscope.py +109 -0
  24. chatspatial/tools/deconvolution/tangram.py +135 -0
  25. chatspatial/tools/differential.py +625 -0
  26. chatspatial/tools/embeddings.py +298 -0
  27. chatspatial/tools/enrichment.py +1863 -0
  28. chatspatial/tools/integration.py +807 -0
  29. chatspatial/tools/preprocessing.py +723 -0
  30. chatspatial/tools/spatial_domains.py +808 -0
  31. chatspatial/tools/spatial_genes.py +836 -0
  32. chatspatial/tools/spatial_registration.py +441 -0
  33. chatspatial/tools/spatial_statistics.py +1476 -0
  34. chatspatial/tools/trajectory.py +495 -0
  35. chatspatial/tools/velocity.py +405 -0
  36. chatspatial/tools/visualization/__init__.py +155 -0
  37. chatspatial/tools/visualization/basic.py +393 -0
  38. chatspatial/tools/visualization/cell_comm.py +699 -0
  39. chatspatial/tools/visualization/cnv.py +320 -0
  40. chatspatial/tools/visualization/core.py +684 -0
  41. chatspatial/tools/visualization/deconvolution.py +852 -0
  42. chatspatial/tools/visualization/enrichment.py +660 -0
  43. chatspatial/tools/visualization/integration.py +205 -0
  44. chatspatial/tools/visualization/main.py +164 -0
  45. chatspatial/tools/visualization/multi_gene.py +739 -0
  46. chatspatial/tools/visualization/persistence.py +335 -0
  47. chatspatial/tools/visualization/spatial_stats.py +469 -0
  48. chatspatial/tools/visualization/trajectory.py +639 -0
  49. chatspatial/tools/visualization/velocity.py +411 -0
  50. chatspatial/utils/__init__.py +115 -0
  51. chatspatial/utils/adata_utils.py +1372 -0
  52. chatspatial/utils/compute.py +327 -0
  53. chatspatial/utils/data_loader.py +499 -0
  54. chatspatial/utils/dependency_manager.py +462 -0
  55. chatspatial/utils/device_utils.py +165 -0
  56. chatspatial/utils/exceptions.py +185 -0
  57. chatspatial/utils/image_utils.py +267 -0
  58. chatspatial/utils/mcp_utils.py +137 -0
  59. chatspatial/utils/path_utils.py +243 -0
  60. chatspatial/utils/persistence.py +78 -0
  61. chatspatial/utils/scipy_compat.py +143 -0
  62. chatspatial-1.1.0.dist-info/METADATA +242 -0
  63. chatspatial-1.1.0.dist-info/RECORD +67 -0
  64. chatspatial-1.1.0.dist-info/WHEEL +5 -0
  65. chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
  66. chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
  67. chatspatial-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,723 @@
"""
Preprocessing tools for spatial transcriptomics data.
"""

import traceback

import numpy as np
import scanpy as sc
import scipy.sparse

from ..models.analysis import PreprocessingResult
from ..models.data import PreprocessingParameters
from ..spatial_mcp_adapter import ToolContext
from ..utils.adata_utils import (
    ensure_unique_var_names_async,
    sample_expression_values,
    standardize_adata,
)
from ..utils.compute import ensure_pca
from ..utils.dependency_manager import require, validate_r_package
from ..utils.exceptions import (
    DataError,
    DependencyError,
    ParameterError,
    ProcessingError,
)
from ..utils.mcp_utils import mcp_tool_error_handler


@mcp_tool_error_handler()
async def preprocess_data(
    data_id: str,
    ctx: ToolContext,
    params: PreprocessingParameters = PreprocessingParameters(),
) -> PreprocessingResult:
    """Preprocess spatial transcriptomics data.

    Args:
        data_id: Dataset ID
        ctx: Tool context for data access and logging
        params: Preprocessing parameters

    Returns:
        Preprocessing result summary
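
    Example (illustrative; "sample_1" is a placeholder dataset ID):
        params = PreprocessingParameters(normalization="log", n_hvgs=2000)
        result = await preprocess_data("sample_1", ctx, params)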
    """
    try:
        # Get the AnnData directly via ToolContext
        adata = await ctx.get_adata(data_id)

        # Standardize the data format at the entry point
        try:
            adata = standardize_adata(
                adata, copy=False, strict=False, preserve_original=True
            )
        except Exception as e:
            await ctx.warning(
                f"Data standardization failed: {e}. Proceeding with original data."
            )
            # Continue with the original data if standardization fails

        # Validate the input data
        if adata.n_obs == 0 or adata.n_vars == 0:
            raise DataError(
                f"Dataset {data_id} is empty: {adata.n_obs} cells, {adata.n_vars} genes"
            )

        # Handle duplicate gene names (must be done before gene-based operations)
        await ensure_unique_var_names_async(adata, ctx, "data")

        # 1. Calculate QC metrics (including mitochondrial percentage)
        try:
            # Identify mitochondrial genes (MT-* for human, mt-* for mouse)
            adata.var["mt"] = adata.var_names.str.startswith(("MT-", "mt-"))

            # Identify ribosomal genes (RPS*/RPL* for human, Rps*/Rpl* for mouse)
            adata.var["ribo"] = adata.var_names.str.startswith(
                ("RPS", "RPL", "Rps", "Rpl")
            )

            # FIX: Adjust percent_top for small datasets
            #
            # Problem: sc.pp.calculate_qc_metrics() uses the default
            # percent_top=[50, 100, 200, 500] to compute the "percentage of counts
            # in the top N genes". When n_genes < 500, scanpy raises
            # IndexError: "Positions outside range of features"
            # (see scanpy/preprocessing/_qc.py line 392: check_ns decorator)
            #
            # Solution: dynamically adjust percent_top to only include values < n_genes
            n_genes = adata.n_vars
            default_percent_top = [50, 100, 200, 500]

            # Keep only the values that are valid for this dataset
            safe_percent_top = [p for p in default_percent_top if p < n_genes]

            # For very small datasets (n_genes < 50), create proportional values
            if not safe_percent_top:
                for fraction in [0.1, 0.25, 0.5]:
                    val = max(1, int(n_genes * fraction))
                    if val < n_genes and val not in safe_percent_top:
                        safe_percent_top.append(val)

                # Add the largest possible value (n_genes - 1) if reasonable
                if n_genes > 1 and (n_genes - 1) not in safe_percent_top:
                    safe_percent_top.append(n_genes - 1)

            safe_percent_top = (
                sorted(set(safe_percent_top)) if safe_percent_top else None
            )
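            # Worked examples of the adjustment (values derived from the logic above):
            #   n_genes=5000 -> [50, 100, 200, 500] (defaults unchanged)
            #   n_genes=300  -> [50, 100, 200]
            #   n_genes=30   -> [3, 7, 15, 29]
            #   n_genes=1    -> None (percent_top metric skipped)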

            # Calculate QC metrics including mitochondrial and ribosomal percentages
            sc.pp.calculate_qc_metrics(
                adata,
                qc_vars=["mt", "ribo"],
                percent_top=safe_percent_top,
                inplace=True,
            )
        except Exception as e:
            raise ProcessingError(
                f"QC metrics failed: {e}. "
                f"Data: {adata.n_obs}×{adata.n_vars}, type: {type(adata.X).__name__}"
            ) from e

        # Store original QC metrics before filtering (including mito stats)
        mito_pct_col = "pct_counts_mt" if "pct_counts_mt" in adata.obs else None
        qc_metrics = {
            "n_cells_before_filtering": int(adata.n_obs),
            "n_genes_before_filtering": int(adata.n_vars),
            "median_genes_per_cell": float(np.median(adata.obs.n_genes_by_counts)),
            "median_umi_per_cell": float(np.median(adata.obs.total_counts)),
        }
        # Add mitochondrial stats if available
        if mito_pct_col:
            qc_metrics["median_mito_pct"] = float(np.median(adata.obs[mito_pct_col]))
            qc_metrics["max_mito_pct"] = float(np.max(adata.obs[mito_pct_col]))
            qc_metrics["n_mt_genes"] = int(adata.var["mt"].sum())

        # 2. Apply user-controlled data filtering and subsampling
        min_cells = params.filter_genes_min_cells
        if min_cells is not None and min_cells > 0:
            sc.pp.filter_genes(adata, min_cells=min_cells)

        min_genes = params.filter_cells_min_genes
        if min_genes is not None and min_genes > 0:
            sc.pp.filter_cells(adata, min_genes=min_genes)

        # Apply mitochondrial percentage filtering (best practice for spatial data):
        # a high mito% indicates damaged cells that have lost cytoplasmic mRNA
        if params.filter_mito_pct is not None and mito_pct_col:
            high_mito_mask = adata.obs[mito_pct_col] > params.filter_mito_pct
            n_high_mito = high_mito_mask.sum()

            if n_high_mito > 0:
                adata = adata[~high_mito_mask].copy()
                # Update qc_metrics with mito filtering info
                qc_metrics["n_spots_filtered_mito"] = int(n_high_mito)
        elif params.filter_mito_pct is not None and not mito_pct_col:
            await ctx.warning(
                "Mitochondrial filtering requested but no mito genes detected. "
                "This may indicate non-standard gene naming or imaging-based data."
            )

        # Apply spot subsampling if requested
        if params.subsample_spots is not None and params.subsample_spots < adata.n_obs:
            sc.pp.subsample(
                adata,
                n_obs=params.subsample_spots,
                random_state=params.subsample_random_seed,
            )

        # Gene subsampling (if requested) is applied after HVG selection
        gene_subsample_requested = params.subsample_genes is not None

        # Save raw data before normalization (required for some analysis methods)
        #
        # IMPORTANT: Create a proper frozen copy for .raw to preserve counts.
        # Using `adata.raw = adata` creates a view that gets modified during
        # normalization; an independent AnnData object is needed to truly
        # preserve counts
        import anndata as ad_module

        # Memory optimization: AnnData.raw internally copies var, so no .copy()
        # is needed there. obs MUST be copied to prevent contamination from later
        # preprocessing steps; uns can be an empty dict as raw doesn't need metadata
        adata.raw = ad_module.AnnData(
            X=adata.X.copy(),  # Must copy - will be modified during normalization
            var=adata.var,  # No copy needed - AnnData creates an independent copy internally
            obs=adata.obs.copy(),  # Must copy - will be modified by clustering/annotation
            uns={},  # Empty dict - raw doesn't need uns metadata
        )

        # Store a counts layer for scVI-tools compatibility (Cell2location, scANVI, DestVI).
        # Note: this layer follows adata through HVG subsetting, complementing adata.raw
        # - adata.raw: full gene set (for cell communication needing complete L-R coverage)
        # - adata.layers["counts"]: HVG subset after filtering (for scVI-tools alignment)
        adata.layers["counts"] = adata.X.copy()
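        # Recovery sketch for downstream tools (illustrative use of standard
        # AnnData API, not a call made here):
        #   full_counts = adata.raw.to_adata()    # all genes, preserved counts
        #   hvg_counts = adata.layers["counts"]   # counts aligned to current vars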

        # Store preprocessing metadata following scanpy/anndata conventions.
        # This metadata enables downstream tools to reuse gene annotations
        adata.uns["preprocessing"] = {
            "normalization": params.normalization,
            "raw_preserved": True,
            "counts_layer": True,
            "n_genes_before_norm": adata.n_vars,
            # Gene type annotations - downstream tools should reuse these
            "gene_annotations": {
                "mt_column": "mt" if "mt" in adata.var.columns else None,
                "ribo_column": "ribo" if "ribo" in adata.var.columns else None,
                "n_mt_genes": (
                    int(adata.var["mt"].sum()) if "mt" in adata.var.columns else 0
                ),
                "n_ribo_genes": (
                    int(adata.var["ribo"].sum()) if "ribo" in adata.var.columns else 0
                ),
            },
        }

        # Update QC metrics after filtering
        qc_metrics.update(
            {
                "n_cells_after_filtering": int(adata.n_obs),
                "n_genes_after_filtering": int(adata.n_vars),
            }
        )

        # 3. Normalize data
        # Log the normalization configuration (developer log)
        norm_config = {
            "Method": params.normalization,
            "Target sum": (
                f"{params.normalize_target_sum:.0f}"
                if params.normalize_target_sum is not None
                else "ADAPTIVE (using median counts)"
            ),
        }
        if params.scale:
            norm_config["Scale clipping"] = (
                f"±{params.scale_max_value} SD"
                if params.scale_max_value is not None
                else "NONE (preserving all outliers)"
            )
        ctx.log_config("Normalization Configuration", norm_config)

        if params.normalization == "log":
            # Standard log normalization.
            # Check whether the data appears to be already normalized
            X_sample = sample_expression_values(adata)

            # Negative values indicate already log-normalized data
            if np.any(X_sample < 0):
                error_msg = (
                    "Log normalization requires non-negative data (raw or normalized counts). "
                    "Data contains negative values, suggesting it has already been log-normalized. "
                    "Options:\n"
                    "• Use normalization='none' if data is already pre-processed\n"
                    "• Load raw count data instead of processed data\n"
                    "• Remove the log transformation from your data before re-processing"
                )
                raise DataError(error_msg)

            if params.normalize_target_sum is not None:
                sc.pp.normalize_total(adata, target_sum=params.normalize_target_sum)
            else:
                # Calculate the median for adaptive normalization
                calculated_median = np.median(np.array(adata.X.sum(axis=1)).flatten())
                sc.pp.normalize_total(adata, target_sum=calculated_median)
            sc.pp.log1p(adata)
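            # Worked example (hypothetical numbers): if the median total count is
            # 8,432, a cell with 4,216 counts is scaled by 2x before log1p, so a
            # gene with 10 counts after scaling becomes log1p(10) ≈ 2.40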
        elif params.normalization == "sct":
            # SCTransform v2 variance-stabilizing normalization via R's sctransform.
            # Check R sctransform availability using the centralized dependency manager
            try:
                validate_r_package("sctransform", ctx)
                validate_r_package("Matrix", ctx)
            except ImportError as e:
                full_error = (
                    f"SCTransform requires R and the sctransform package.\n\n"
                    f"ERROR: {e}\n\n"
                    "INSTALLATION:\n"
                    "  1. Install R (https://cran.r-project.org/)\n"
                    "  2. In R: install.packages('sctransform')\n"
                    "  3. pip install 'rpy2>=3.5.0'\n\n"
                    "ALTERNATIVES:\n"
                    "• Use normalization='pearson_residuals' (built-in, similar results)\n"
                    "• Use normalization='log' (standard method)"
                )
                raise DependencyError(full_error) from e

            # Check that the data appears to be raw counts (required for SCTransform)
            X_sample = sample_expression_values(adata)

            # Non-integer values indicate normalized data
            if np.any((X_sample % 1) != 0):
                raise DataError(
                    "SCTransform requires raw count data (integers). "
                    "Use normalization='log' for normalized data."
                )

            # Map the method parameter to vst.flavor
            vst_flavor = "v2" if params.sct_method == "fix-slope" else "v1"

            try:
                # Import rpy2 modules
                import rpy2.robjects as ro
                from rpy2.robjects import numpy2ri
                from rpy2.robjects.conversion import localconverter

                # Note: the counts layer was already saved in the unified
                # preprocessing step above; it will be properly subsetted if
                # SCT filters genes.
                # Convert to a sparse CSC matrix (genes × cells) for R's dgCMatrix;
                # csc_matrix accepts both sparse and dense input, so no branching
                # is needed
                counts_sparse = scipy.sparse.csc_matrix(adata.X.T)

                # Transfer the sparse matrix components to R
                with localconverter(ro.default_converter + numpy2ri.converter):
                    ro.globalenv["sp_data"] = counts_sparse.data.astype(np.float64)
                    ro.globalenv["sp_indices"] = counts_sparse.indices.astype(np.int32)
                    ro.globalenv["sp_indptr"] = counts_sparse.indptr.astype(np.int32)
                    ro.globalenv["n_genes"] = counts_sparse.shape[0]
                    ro.globalenv["n_cells"] = counts_sparse.shape[1]
                    ro.globalenv["gene_names"] = ro.StrVector(adata.var_names.tolist())
                    ro.globalenv["cell_names"] = ro.StrVector(adata.obs_names.tolist())
                    ro.globalenv["vst_flavor"] = vst_flavor
                    ro.globalenv["n_cells_param"] = (
                        params.sct_n_cells if params.sct_n_cells else ro.NULL
                    )

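                # CSC -> dgCMatrix slot correspondence (for reference; the slot
                # names on the right are R's):
                #   counts_sparse.data    -> x  (non-zero values)
                #   counts_sparse.indices -> i  (0-based row indices)
                #   counts_sparse.indptr  -> p  (column pointers)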
                # Reconstruct the sparse matrix and run SCTransform in R
                ro.r(
                    """
                    library(Matrix)
                    library(sctransform)

                    # Create dgCMatrix from components
                    umi_matrix <- new(
                        "dgCMatrix",
                        x = as.numeric(sp_data),
                        i = as.integer(sp_indices),
                        p = as.integer(sp_indptr),
                        Dim = as.integer(c(n_genes, n_cells)),
                        Dimnames = list(gene_names, cell_names)
                    )

                    # Run SCTransform
                    suppressWarnings({
                        vst_result <- sctransform::vst(
                            umi = umi_matrix,
                            vst.flavor = vst_flavor,
                            return_gene_attr = TRUE,
                            return_cell_attr = TRUE,
                            n_cells = n_cells_param,
                            verbosity = 0
                        )
                    })

                    # Convert the output to a dense matrix for transfer
                    pearson_residuals <- as.matrix(vst_result$y)
                    residual_variance <- vst_result$gene_attr$residual_variance
                    # Extract the gene names that survived SCTransform filtering
                    kept_genes <- rownames(vst_result$y)
                    """
                )

                # Extract results from R
                with localconverter(ro.default_converter + numpy2ri.converter):
                    pearson_residuals = np.array(ro.r("pearson_residuals"))
                    residual_variance = np.array(ro.r("residual_variance"))
                    kept_genes = list(ro.r("kept_genes"))

                # CRITICAL FIX: subset adata to match the genes returned by
                # SCTransform. R's sctransform internally filters genes, so we
                # need to subset accordingly
                n_genes_before_sct = adata.n_vars
                if len(kept_genes) != adata.n_vars:
                    n_filtered = adata.n_vars - len(kept_genes)
                    # Keep only the genes returned by SCTransform
                    adata = adata[:, kept_genes].copy()
                else:
                    n_filtered = 0

                # Transpose back to cells × genes for the AnnData format
                adata.X = pearson_residuals.T

                # Store SCTransform metadata
                adata.uns["sctransform"] = {
                    "method": params.sct_method,
                    "vst_flavor": vst_flavor,
                    "var_features_n": params.sct_var_features_n,
                    "exclude_poisson": params.sct_exclude_poisson,
                    "n_cells": params.sct_n_cells,
                    "n_genes_before": n_genes_before_sct,
                    "n_genes_after": len(kept_genes),
                    "n_genes_filtered_by_sct": n_filtered,
                }

                # Mark highly variable genes based on residual variance.
                # adata has been subset, so residual_variance should match adata.n_vars
                if len(residual_variance) != adata.n_vars:
                    error_msg = (
                        f"Dimension mismatch after SCTransform: "
                        f"residual_variance has {len(residual_variance)} values "
                        f"but adata has {adata.n_vars} genes"
                    )
                    raise ProcessingError(error_msg)

                adata.var["sct_residual_variance"] = residual_variance

                # Select the top N genes by residual variance
                n_hvg = min(params.sct_var_features_n, len(residual_variance))
                top_hvg_indices = np.argsort(residual_variance)[-n_hvg:]
                adata.var["highly_variable"] = False
                adata.var.iloc[
                    top_hvg_indices, adata.var.columns.get_loc("highly_variable")
                ] = True
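                # Worked example (hypothetical values): residual_variance = [0.2, 3.1, 1.5, 0.9]
                # with n_hvg=2 gives np.argsort(...)[-2:] == [2, 1], flagging the
                # two most variable genes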

            except MemoryError as e:
                raise MemoryError(
                    f"Memory error for SCTransform on {adata.n_obs}×{adata.n_vars} matrix. "
                    f"Use normalization='log' or subsample data."
                ) from e
            except Exception as e:
                raise ProcessingError(f"SCTransform failed: {e}") from e
        elif params.normalization == "pearson_residuals":
            # Modern Pearson residuals normalization (recommended for UMI data)

            # Check that the method is available
            if not hasattr(sc.experimental.pp, "normalize_pearson_residuals"):
                error_msg = (
                    "Pearson residuals normalization not available (requires scanpy>=1.9.0).\n"
                    "Options:\n"
                    "• Install newer scanpy: pip install 'scanpy>=1.9.0'\n"
                    "• Use log normalization instead: params.normalization='log'\n"
                    "• Skip normalization if data is pre-processed: params.normalization='none'"
                )
                raise DependencyError(error_msg)

            # Check that the data appears to be raw counts
            X_sample = sample_expression_values(adata)

            # Non-integer values indicate normalized data
            if np.any((X_sample % 1) != 0):
                raise DataError(
                    "Pearson residuals requires raw count data (integers). "
                    "Data contains non-integer values. "
                    "Use params.normalization='none' if data is already normalized, "
                    "or params.normalization='log' for standard normalization."
                )

            # Execute normalization
            try:
                # Apply Pearson residuals normalization (to all genes).
                # Note: highly variable gene selection happens later in the pipeline
                sc.experimental.pp.normalize_pearson_residuals(adata)
            except MemoryError as e:
                raise MemoryError(
                    f"Insufficient memory for Pearson residuals on {adata.n_obs}×{adata.n_vars} matrix. "
                    "Try reducing n_hvgs or use 'log' normalization."
                ) from e
            except Exception as e:
                raise ProcessingError(
                    f"Pearson residuals normalization failed: {e}. "
                    "Consider using 'log' normalization instead."
                ) from e
        elif params.normalization == "none":
            # Explicitly skip normalization

            # CRITICAL: check whether the data appears to be raw counts.
            # HVG selection requires normalized data for statistical validity
            X_sample = sample_expression_values(adata)

            # Data looks raw if it is all integers with high values
            if np.all((X_sample % 1) == 0) and np.max(X_sample) > 100:
                error_msg = (
                    "STATISTICAL ERROR: Cannot perform HVG selection on raw counts with normalization='none'\n\n"
                    "Your data appears to be raw counts (integer values with max > 100), but you specified "
                    "normalization='none'. Highly variable gene (HVG) selection requires normalized data "
                    "for statistical validity because:\n"
                    "• Raw count variance scales non-linearly with expression level\n"
                    "• This prevents accurate comparison of variability across genes\n"
                    "• Scanpy's HVG algorithm will fail with 'infinity' errors\n\n"
                    "REQUIRED ACTIONS:\n"
                    "Option 1 (Recommended): Use normalization='log' for standard log-normalization\n"
                    "Option 2: Use normalization='pearson_residuals' for variance-stabilizing normalization\n"
                    "Option 3: Pre-normalize your data externally, then reload with normalized values\n\n"
                    "WARNING: If your data is already normalized but appears raw, verify data integrity."
                )
                raise DataError(error_msg)
        elif params.normalization == "scvi":
            # scVI deep-learning-based normalization.
            # Uses a variational autoencoder to learn a latent representation
            require("scvi", feature="scVI normalization")
            import scvi

            # Check that the data appears to be raw counts (required for scVI)
            X_sample = sample_expression_values(adata)

            # Negative values indicate already-normalized data
            if np.any(X_sample < 0):
                raise DataError(
                    "scVI requires non-negative count data. Data contains negative values."
                )

            try:
                # Note: the counts layer was already saved in the unified
                # preprocessing step above; scVI requires this layer for proper
                # count-based modeling.

                # Setup AnnData for scVI using the pre-saved counts layer
                scvi.model.SCVI.setup_anndata(
                    adata,
                    layer="counts",
                    batch_key=(
                        params.batch_key
                        if params.batch_key in adata.obs.columns
                        else None
                    ),
                )

                # Create the scVI model with user-specified parameters
                scvi_model = scvi.model.SCVI(
                    adata,
                    n_hidden=params.scvi_n_hidden,
                    n_latent=params.scvi_n_latent,
                    n_layers=params.scvi_n_layers,
                    dropout_rate=params.scvi_dropout_rate,
                    gene_likelihood=params.scvi_gene_likelihood,
                )

                # Train the model with user-configurable parameters
                scvi_model.train(
                    max_epochs=params.scvi_max_epochs,
                    early_stopping=params.scvi_early_stopping,
                    early_stopping_patience=params.scvi_early_stopping_patience,
                    early_stopping_monitor="elbo_validation",
                    train_size=params.scvi_train_size,
                )

                # Get the latent representation (replaces PCA)
                adata.obsm["X_scvi"] = scvi_model.get_latent_representation()
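                # Downstream note (pipeline convention, not enforced here):
                # analysis tools can use adata.obsm["X_scvi"] in place of X_pca
                # as the batch-corrected embedding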

                # Get normalized expression for downstream analysis.
                # This is the denoised, batch-corrected expression
                normalized_expr = scvi_model.get_normalized_expression(
                    library_size=1e4  # Normalize to 10k counts
                )
                # Store as a dense array (normalized expression is typically dense)
                if hasattr(normalized_expr, "values"):
                    adata.X = normalized_expr.values
                else:
                    adata.X = np.array(normalized_expr)

                # Apply log1p for downstream compatibility
                adata.X = np.log1p(adata.X)

                # Store scVI metadata
                adata.uns["scvi"] = {
                    "n_hidden": params.scvi_n_hidden,
                    "n_latent": params.scvi_n_latent,
                    "n_layers": params.scvi_n_layers,
                    "dropout_rate": params.scvi_dropout_rate,
                    "gene_likelihood": params.scvi_gene_likelihood,
                    "training_completed": True,
                }

            except Exception as e:
                raise ProcessingError(f"scVI normalization failed: {e}") from e
        else:
            # Catch unknown normalization methods
            valid_methods = ["log", "sct", "pearson_residuals", "none", "scvi"]
            raise ParameterError(
                f"Unknown normalization method: '{params.normalization}'. "
                f"Valid options are: {', '.join(valid_methods)}"
            )

        # 4. Find highly variable genes and apply gene subsampling
        # Determine the number of HVGs to select
        if gene_subsample_requested:
            # User wants to subsample genes
            n_hvgs = min(params.subsample_genes, adata.n_vars - 1, params.n_hvgs)
        else:
            # Use standard HVG selection
            n_hvgs = min(params.n_hvgs, adata.n_vars - 1)

        # Statistical warning: a very low HVG count may lead to unstable clustering.
        # Literature consensus: 500-5000 genes recommended, 1000-2000 typical.
        # References:
        # - Bioconductor OSCA: "any value from 500 to 5000 is reasonable"
        # - Single-cell best practices: typical range 1000-2000
        if n_hvgs < 500:
            await ctx.warning(
                f"Using only {n_hvgs} HVGs is below the recommended minimum of 500 genes.\n"
                f"  • Literature consensus: 500-5000 genes (typical: 1000-2000)\n"
                f"  • Low gene counts may lead to unstable clustering results\n"
                f"  • Recommended: Use n_hvgs=1000-2000 for most analyses\n"
                f"  • Current dataset: {adata.n_obs} cells × {adata.n_vars} total genes"
            )

        # Use all genes for very small gene panels (e.g. MERFISH)
        if adata.n_vars < 100:
            adata.var["highly_variable"] = True
        else:
            # Attempt HVG selection - no fallback for failures
            try:
                sc.pp.highly_variable_genes(adata, n_top_genes=n_hvgs)
            except Exception as e:
                raise ProcessingError(
                    f"HVG selection failed: {e}. "
                    f"Data: {adata.n_obs}×{adata.n_vars}, requested: {n_hvgs} HVGs."
                ) from e

        # Exclude mitochondrial genes from HVG selection (best practice):
        # mito genes can dominate HVGs due to high expression and technical variation
        if params.remove_mito_genes and "mt" in adata.var.columns:
            n_mito_hvg = (adata.var["highly_variable"] & adata.var["mt"]).sum()
            if n_mito_hvg > 0:
                adata.var.loc[adata.var["mt"], "highly_variable"] = False

        # Exclude ribosomal genes from HVG selection (optional)
        if params.remove_ribo_genes and "ribo" in adata.var.columns:
            n_ribo_hvg = (adata.var["highly_variable"] & adata.var["ribo"]).sum()
            if n_ribo_hvg > 0:
                adata.var.loc[adata.var["ribo"], "highly_variable"] = False

        # Apply gene subsampling if requested
        if gene_subsample_requested and params.subsample_genes < adata.n_vars:
            # Ensure HVG selection was successful
            if "highly_variable" not in adata.var:
                raise ProcessingError(
                    "Gene subsampling failed: no HVGs identified. Run HVG selection first."
                )

            if not adata.var["highly_variable"].any():
                raise DataError(
                    "Gene subsampling requested but no genes were marked as highly variable. "
                    "Check HVG selection parameters or data quality."
                )

            # Use the properly identified HVGs
            adata = adata[:, adata.var["highly_variable"]].copy()

        # 5. Batch effect correction (if applicable)
        if (
            params.batch_key in adata.obs
            and len(adata.obs[params.batch_key].unique()) > 1
        ):
            try:
                # Use Harmony for batch correction (modern standard, works in PCA
                # space); Harmony is more robust than ComBat for single-cell/spatial
                # data. The centralized dependency manager raises ImportError with
                # install instructions if harmonypy is missing
                require("harmonypy")
                import scanpy.external as sce

                # Harmony requires PCA - use lazy computation
                ensure_pca(adata, n_comps=min(50, adata.n_vars - 1))

                sce.pp.harmony_integrate(adata, key=params.batch_key)
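                # scanpy.external writes the corrected embedding to
                # adata.obsm["X_pca_harmony"] by default (adjusted_basis parameter)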
            except Exception as e:
                raise ProcessingError(
                    f"Harmony batch correction failed: {e}. "
                    f"Check batch sizes or try scVI/BBKNN integration."
                ) from e

        # 6. Scale data (if requested)
        if params.scale:
            try:
                # Trust scanpy's internal zero-variance handling and sparse-matrix
                # optimization
                sc.pp.scale(adata, max_value=params.scale_max_value)

                # Clean up any NaN/Inf values that might remain (sparse-matrix safe).
                # Only applied when a max_value is available for clipping
                if params.scale_max_value is not None:
                    if hasattr(adata.X, "data"):
                        # Sparse matrix - only modify the data array
                        adata.X.data = np.nan_to_num(
                            adata.X.data,
                            nan=0.0,
                            posinf=params.scale_max_value,
                            neginf=-params.scale_max_value,
                        )
                    else:
                        # Dense matrix
                        adata.X = np.nan_to_num(
                            adata.X,
                            nan=0.0,
                            posinf=params.scale_max_value,
                            neginf=-params.scale_max_value,
                        )

            except Exception as e:
                await ctx.warning(f"Scaling failed: {e}. Continuing without scaling.")

        # Store preprocessing metadata for downstream tools.
        # PCA, UMAP, clustering, and spatial neighbors are computed lazily
        # by analysis tools using the ensure_* functions from utils.compute
        adata.uns["preprocessing"]["completed"] = True
        adata.uns["preprocessing"]["n_pcs"] = params.n_pcs
        adata.uns["preprocessing"]["n_neighbors"] = params.n_neighbors
        adata.uns["preprocessing"][
            "clustering_resolution"
        ] = params.clustering_resolution

        # Store the processed AnnData object back via ToolContext
        await ctx.set_adata(data_id, adata)

        # Return the preprocessing result.
        # Note: clusters=0 indicates clustering has not yet been performed;
        # analysis tools will compute clustering lazily when needed
        return PreprocessingResult(
            data_id=data_id,
            n_cells=adata.n_obs,
            n_genes=adata.n_vars,
            n_hvgs=(
                int(sum(adata.var.highly_variable))
                if "highly_variable" in adata.var
                else 0
            ),
            clusters=0,  # Clustering computed lazily by analysis tools
            qc_metrics=qc_metrics,
        )

    except Exception as e:
        error_msg = f"Error in preprocessing: {e}"
        tb = traceback.format_exc()
        raise ProcessingError(f"{error_msg}\n{tb}") from e