chatspatial 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatspatial/__init__.py +11 -0
- chatspatial/__main__.py +141 -0
- chatspatial/cli/__init__.py +7 -0
- chatspatial/config.py +53 -0
- chatspatial/models/__init__.py +85 -0
- chatspatial/models/analysis.py +513 -0
- chatspatial/models/data.py +2462 -0
- chatspatial/server.py +1763 -0
- chatspatial/spatial_mcp_adapter.py +720 -0
- chatspatial/tools/__init__.py +3 -0
- chatspatial/tools/annotation.py +1903 -0
- chatspatial/tools/cell_communication.py +1603 -0
- chatspatial/tools/cnv_analysis.py +605 -0
- chatspatial/tools/condition_comparison.py +595 -0
- chatspatial/tools/deconvolution/__init__.py +402 -0
- chatspatial/tools/deconvolution/base.py +318 -0
- chatspatial/tools/deconvolution/card.py +244 -0
- chatspatial/tools/deconvolution/cell2location.py +326 -0
- chatspatial/tools/deconvolution/destvi.py +144 -0
- chatspatial/tools/deconvolution/flashdeconv.py +101 -0
- chatspatial/tools/deconvolution/rctd.py +317 -0
- chatspatial/tools/deconvolution/spotlight.py +216 -0
- chatspatial/tools/deconvolution/stereoscope.py +109 -0
- chatspatial/tools/deconvolution/tangram.py +135 -0
- chatspatial/tools/differential.py +625 -0
- chatspatial/tools/embeddings.py +298 -0
- chatspatial/tools/enrichment.py +1863 -0
- chatspatial/tools/integration.py +807 -0
- chatspatial/tools/preprocessing.py +723 -0
- chatspatial/tools/spatial_domains.py +808 -0
- chatspatial/tools/spatial_genes.py +836 -0
- chatspatial/tools/spatial_registration.py +441 -0
- chatspatial/tools/spatial_statistics.py +1476 -0
- chatspatial/tools/trajectory.py +495 -0
- chatspatial/tools/velocity.py +405 -0
- chatspatial/tools/visualization/__init__.py +155 -0
- chatspatial/tools/visualization/basic.py +393 -0
- chatspatial/tools/visualization/cell_comm.py +699 -0
- chatspatial/tools/visualization/cnv.py +320 -0
- chatspatial/tools/visualization/core.py +684 -0
- chatspatial/tools/visualization/deconvolution.py +852 -0
- chatspatial/tools/visualization/enrichment.py +660 -0
- chatspatial/tools/visualization/integration.py +205 -0
- chatspatial/tools/visualization/main.py +164 -0
- chatspatial/tools/visualization/multi_gene.py +739 -0
- chatspatial/tools/visualization/persistence.py +335 -0
- chatspatial/tools/visualization/spatial_stats.py +469 -0
- chatspatial/tools/visualization/trajectory.py +639 -0
- chatspatial/tools/visualization/velocity.py +411 -0
- chatspatial/utils/__init__.py +115 -0
- chatspatial/utils/adata_utils.py +1372 -0
- chatspatial/utils/compute.py +327 -0
- chatspatial/utils/data_loader.py +499 -0
- chatspatial/utils/dependency_manager.py +462 -0
- chatspatial/utils/device_utils.py +165 -0
- chatspatial/utils/exceptions.py +185 -0
- chatspatial/utils/image_utils.py +267 -0
- chatspatial/utils/mcp_utils.py +137 -0
- chatspatial/utils/path_utils.py +243 -0
- chatspatial/utils/persistence.py +78 -0
- chatspatial/utils/scipy_compat.py +143 -0
- chatspatial-1.1.0.dist-info/METADATA +242 -0
- chatspatial-1.1.0.dist-info/RECORD +67 -0
- chatspatial-1.1.0.dist-info/WHEEL +5 -0
- chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
- chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
- chatspatial-1.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,2462 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data models for spatial transcriptomics analysis.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Annotated, Literal, Optional, Union
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
10
|
+
from typing_extensions import Self
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ColumnInfo(BaseModel):
    """Profile of a single metadata column, used when summarising a dataset.

    A column is described by its name, whether it is categorical or
    numerical, and how many distinct values it holds.  ``sample_values`` and
    ``range`` are optional extras; both default to ``None``.
    """

    name: str
    dtype: Literal["categorical", "numerical"]
    n_unique: int

    # Sample values for categorical
    sample_values: Optional[list[str]] = None

    # Value range for numerical
    range: Optional[tuple[float, float]] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SpatialDataset(BaseModel):
    """Spatial transcriptomics dataset description with a metadata profile.

    Only ``id``, ``name`` and ``data_type`` are required.  Counts default to
    zero, availability flags default to ``False`` and every profile list
    defaults to ``None``.
    """

    # --- Identity ---
    id: str
    name: str
    data_type: Literal[
        "10x_visium",
        "slide_seq",
        "merfish",
        "seqfish",
        "other",
        "h5ad",
        "auto",
    ]
    description: Optional[str] = None

    # --- Basic statistics ---
    n_cells: int = 0
    n_genes: int = 0
    spatial_coordinates_available: bool = False
    tissue_image_available: bool = False

    # --- Metadata profiles - let LLM interpret the structure ---
    # Cell-level metadata
    obs_columns: Optional[list[ColumnInfo]] = None
    # Gene-level metadata
    var_columns: Optional[list[ColumnInfo]] = None
    # Multi-dimensional data keys
    obsm_keys: Optional[list[str]] = None
    # Unstructured data keys
    uns_keys: Optional[list[str]] = None

    # --- Gene expression profiles ---
    top_highly_variable_genes: Optional[list[str]] = None
    top_expressed_genes: Optional[list[str]] = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class PreprocessingParameters(BaseModel):
    """Preprocessing parameters model.

    Collects the user-facing options for filtering/subsampling, mitochondrial
    and ribosomal gene handling, normalization and scaling, SCTransform and
    scVI settings, storage key names, and neighbor-graph/clustering settings.
    Every field has a default, so the model can be built with no arguments.
    """

    # ``cluster_key`` carries the alias ``clustering_key``.  Without
    # ``populate_by_name`` pydantic only matches the alias on input, so a
    # caller passing ``cluster_key=...`` would silently get the default.
    # Enabling it accepts both spellings; the alias keeps working as before.
    model_config = ConfigDict(populate_by_name=True)

    # Data filtering and subsampling parameters (user controlled)
    # Filter genes expressed in < N cells
    filter_genes_min_cells: Optional[Annotated[int, Field(gt=0)]] = 3
    # Filter cells expressing < N genes
    filter_cells_min_genes: Optional[Annotated[int, Field(gt=0)]] = 30
    # Subsample to N spots (None = no subsampling)
    subsample_spots: Optional[Annotated[int, Field(gt=0, le=50000)]] = None
    # Keep top N variable genes (None = keep all filtered genes)
    subsample_genes: Optional[Annotated[int, Field(gt=0, le=50000)]] = None
    # Random seed for subsampling
    subsample_random_seed: int = 42

    # ========== Mitochondrial and Ribosomal Gene Filtering ==========
    filter_mito_pct: Optional[float] = Field(
        default=20.0,
        ge=0.0,
        le=100.0,
        description=(
            "Filter spots/cells with mitochondrial percentage above this threshold.\n\n"
            "DEFAULT: 20.0 (remove spots with >20% mitochondrial reads)\n\n"
            "RATIONALE:\n"
            "High mitochondrial content often indicates cell stress, damage, or apoptosis.\n"
            "Damaged cells release cytoplasmic mRNA while retaining mitochondrial transcripts.\n\n"
            "RECOMMENDED VALUES:\n"
            "• 20.0 (default): Standard threshold for most tissues\n"
            "• 5-10: Stringent filtering for high-quality data\n"
            "• 30-50: Relaxed for tissues with naturally high mito (muscle, neurons)\n"
            "• None: Disable filtering (not recommended)\n\n"
            "TISSUE-SPECIFIC CONSIDERATIONS:\n"
            "• Brain: White matter naturally has higher mito% than gray matter\n"
            "• Muscle/Heart: High mito% is biologically normal\n"
            "• Tumor samples: May have elevated mito% due to metabolic changes\n\n"
            "REFERENCE:\n"
            "OSTA Book: lmweber.org/OSTA/pages/seq-quality-control.html"
        ),
    )
    remove_mito_genes: bool = Field(
        default=True,
        description=(
            "Remove mitochondrial genes (MT-*, mt-*) before HVG selection.\n\n"
            "DEFAULT: True (recommended for most analyses)\n\n"
            "RATIONALE:\n"
            "Mitochondrial genes can dominate HVG selection due to high expression\n"
            "and technical variation, masking biologically relevant genes.\n\n"
            "WHEN TO ENABLE (True):\n"
            "• Standard spatial transcriptomics analysis\n"
            "• Clustering and cell type identification\n"
            "• Trajectory analysis\n\n"
            "WHEN TO DISABLE (False):\n"
            "• Studying mitochondrial biology or metabolism\n"
            "• Analyzing mitochondrial heteroplasmy\n"
            "• When mito genes are biologically relevant to your question\n\n"
            "NOTE: Genes are only excluded from HVG selection, not removed from data.\n"
            "They remain available in adata.raw for downstream analyses."
        ),
    )
    remove_ribo_genes: bool = Field(
        default=False,
        description=(
            "Remove ribosomal genes (RPS*, RPL*, Rps*, Rpl*) before HVG selection.\n\n"
            "DEFAULT: False (ribosomal genes often carry biological signal)\n\n"
            "RATIONALE:\n"
            "Ribosomal genes are highly expressed housekeeping genes. While they\n"
            "add noise in some analyses, they can be informative for cell state.\n\n"
            "WHEN TO ENABLE (True):\n"
            "• When ribosomal genes dominate your HVG list\n"
            "• For cleaner clustering focused on cell type markers\n"
            "• Following certain published pipelines that recommend it\n\n"
            "WHEN TO KEEP DISABLED (False):\n"
            "• Standard analyses (ribosomal content varies by cell type)\n"
            "• Studying translation or ribosome biogenesis\n"
            "• When unsure - ribosomal genes rarely cause problems"
        ),
    )

    # Normalization and scaling parameters
    normalization: Literal["log", "sct", "pearson_residuals", "none", "scvi"] = Field(
        default="log",
        description=(
            "Normalization method for gene expression data.\n\n"
            "AVAILABLE OPTIONS:\n"
            "• 'log' (default): Standard log(x+1) normalization after library size correction. "
            "Robust and widely used for most analyses.\n"
            "• 'sct': SCTransform v2 variance-stabilizing normalization via R's sctransform package. "
            "Best for raw UMI counts from 10x platforms. Based on regularized negative binomial regression.\n"
            "• 'pearson_residuals': Analytic Pearson residuals (scanpy built-in, similar to SCTransform). "
            "Requires raw integer counts and scanpy>=1.9.0. Faster than SCTransform with similar results.\n"
            "• 'none': Skip normalization. Use when data is already pre-normalized.\n"
            "• 'scvi': Deep learning-based normalization using scVI variational autoencoder. "
            "Learns a latent representation (X_scvi) that replaces PCA. Best for batch correction and denoising.\n\n"
            "REQUIREMENTS:\n"
            "• sct: R with sctransform package (R -e 'install.packages(\"sctransform\")') + rpy2\n"
            "• pearson_residuals: Raw count data (integers only), scanpy>=1.9.0\n"
            "• scvi: scvi-tools package (pip install scvi-tools), raw count data\n"
            "• none: Data should already be normalized (will warn if raw counts detected)\n\n"
            "RECOMMENDATIONS:\n"
            "• For raw Visium/Xenium/MERFISH data: 'sct', 'pearson_residuals', or 'log'\n"
            "• For Seurat workflow compatibility: 'sct' (SCTransform v2)\n"
            "• For speed with similar results: 'pearson_residuals'\n"
            "• For pre-processed data: 'none'\n"
            "• For batch effect correction and denoising: 'scvi' (deep learning-based)"
        ),
    )
    scale: bool = Field(
        default=False,
        description=(
            "Scale gene expression to unit variance before PCA.\n\n"
            "DEFAULT: False (following Scanpy spatial transcriptomics best practices)\n\n"
            "RATIONALE:\n"
            "The standard Scanpy spatial transcriptomics tutorials do NOT include scaling:\n"
            "  normalize_total → log1p → HVG selection → PCA\n"
            "Scaling is omitted because log-normalization already stabilizes variance.\n\n"
            "WHEN TO ENABLE (scale=True):\n"
            "• Using methods that explicitly require scaled input (e.g., GraphST)\n"
            "• When gene expression magnitudes vary dramatically\n"
            "• For compatibility with Seurat's ScaleData() workflow\n\n"
            "WHEN TO KEEP DISABLED (scale=False):\n"
            "• Standard Visium/spatial analysis with Scanpy/Squidpy\n"
            "• Using SCTransform normalization (already variance-stabilized)\n"
            "• Using Pearson residuals normalization\n\n"
            "REFERENCE:\n"
            "Scanpy spatial tutorial: scanpy-tutorials.readthedocs.io/en/latest/spatial/"
        ),
    )
    n_hvgs: Annotated[int, Field(gt=0, le=5000)] = 2000
    n_pcs: Annotated[int, Field(gt=0, le=100)] = 30

    # ========== Normalization Control Parameters ==========
    normalize_target_sum: Optional[float] = Field(
        default=None,  # Adaptive default - uses median counts
        ge=1.0,  # Must be positive if specified
        le=1e8,  # Reasonable upper bound
        description=(
            "Target sum for total count normalization per cell/spot. "
            "Controls the library size after normalization. "
            "\n"
            "RECOMMENDED VALUES BY TECHNOLOGY:\n"
            "• None (default): Uses median of total counts - most adaptive, recommended for unknown data\n"
            "• 1e4 (10,000): Standard for 10x Visium spatial transcriptomics\n"
            "• 1e6 (1,000,000): CPM normalization, standard for MERFISH/CosMx/Xenium\n"
            "• Custom value: Match to your expected counts per cell/spot\n"
            "\n"
            "DECISION GUIDE:\n"
            "- Multi-cellular spots (Visium): Use 1e4\n"
            "- Single-cell imaging (MERFISH, Xenium, CosMx): Use 1e6\n"
            "- High-depth sequencing: Consider 1e5 or higher\n"
            "- Low-depth/targeted panels: Consider 1e3-1e4\n"
            "- Cross-sample integration: Use same value for all samples\n"
            # Corrected: None here means median-based normalization (see the
            # first bullet above); skipping is done via normalization='none'.
            "- Spatial domain analysis: Consider skipping normalization (normalization='none')\n"
            "\n"
            "SCIENTIFIC RATIONALE:\n"
            "This parameter scales all cells/spots to have the same total count, "
            "removing technical variation due to sequencing depth or capture efficiency. "
            "The choice affects the magnitude of normalized expression values and "
            "can influence downstream analyses like HVG selection and clustering."
        ),
    )

    scale_max_value: Optional[float] = Field(
        default=10.0,
        ge=1.0,  # Must be positive if specified
        le=100.0,  # Reasonable upper bound
        description=(
            "Maximum value for clipping after scaling to unit variance (in standard deviations). "
            "Prevents extreme outliers from dominating downstream analyses. "
            "\n"
            "RECOMMENDED VALUES:\n"
            "• 10.0 (default): Standard in single-cell field, balances outlier control with data preservation\n"
            "• None: No clipping - preserves all variation, use for high-quality data\n"
            "• 5.0-8.0: More aggressive clipping for noisy data\n"
            "• 15.0-20.0: Less aggressive for clean imaging data\n"
            "\n"
            "DECISION GUIDE BY DATA TYPE:\n"
            "- Standard scRNA-seq or Visium: 10.0\n"
            "- High-quality imaging (MERFISH/Xenium): 15.0 or None\n"
            "- Noisy/low-quality data: 5.0-8.0\n"
            "- Exploratory analysis: Start with 10.0\n"
            "- Final analysis: Consider None to preserve all variation\n"
            "\n"
            "TECHNICAL DETAILS:\n"
            "After scaling each gene to zero mean and unit variance, "
            "values exceeding ±max_value standard deviations are clipped. "
            "This prevents a few extreme values from dominating PCA and clustering. "
            "Lower values increase robustness but may remove biological signal."
        ),
    )

    # SCTransform preprocessing parameters (requires R + sctransform package via rpy2)
    # Installation: R -e 'install.packages("sctransform")' && pip install rpy2
    sct_var_features_n: int = Field(
        default=3000,
        ge=100,
        le=10000,
        description="Number of highly variable features for SCTransform (default: 3000)",
    )
    sct_method: Literal["offset", "fix-slope"] = Field(
        default="fix-slope",
        description=(
            "SCTransform regularization method:\n"
            "• 'fix-slope' (default, v2): Fixed slope regularization, more robust and recommended.\n"
            "• 'offset': Original offset model from v1."
        ),
    )
    sct_exclude_poisson: bool = Field(
        default=True,
        description="Exclude Poisson genes from regularization (v2 default: True). "
        "Improves robustness by excluding genes where variance ≤ mean.",
    )
    sct_n_cells: Optional[int] = Field(
        default=5000,
        ge=100,
        description="Number of cells to subsample for parameter estimation (default: 5000). "
        "Set to None to use all cells (slower but may be more accurate for small datasets).",
    )

    # scVI preprocessing parameters - architecture
    use_scvi_preprocessing: bool = False  # Whether to use scVI for preprocessing
    scvi_n_hidden: int = 128
    scvi_n_latent: int = 10
    scvi_n_layers: int = 1
    scvi_dropout_rate: float = 0.1
    scvi_gene_likelihood: Literal["zinb", "nb", "poisson"] = "zinb"

    # scVI preprocessing parameters - training (user-configurable)
    scvi_max_epochs: Annotated[int, Field(gt=0, le=2000)] = Field(
        default=400,
        description=(
            "Maximum number of training epochs for scVI. "
            "Default 400 is sufficient for most datasets with early stopping enabled. "
            "Increase to 600-800 for large/complex datasets without early stopping."
        ),
    )
    scvi_early_stopping: bool = Field(
        default=True,
        description=(
            "Whether to enable early stopping based on validation ELBO. "
            "STRONGLY RECOMMENDED: Prevents overfitting and reduces training time. "
            "Set to False only for debugging or when you need exact epoch control."
        ),
    )
    scvi_early_stopping_patience: Annotated[int, Field(gt=0, le=100)] = Field(
        default=20,
        description=(
            "Number of epochs to wait for validation improvement before stopping. "
            "Default 20 balances convergence detection with training stability. "
            "Increase to 30-50 for noisy data, decrease to 10-15 for faster training."
        ),
    )
    scvi_train_size: Annotated[float, Field(gt=0.5, le=1.0)] = Field(
        default=0.9,
        description=(
            "Fraction of data used for training (rest for validation). "
            "Default 0.9 (90% train, 10% validation) is standard practice. "
            "Use 1.0 to disable validation (NOT RECOMMENDED - no early stopping)."
        ),
    )

    # Key naming parameters (configurable hard-coded keys)
    # Key name for storing clustering results.  Input may use either
    # "clustering_key" (the alias) or "cluster_key" (see model_config above).
    cluster_key: str = Field(default="leiden", alias="clustering_key")
    # Changed from hardcoded "spatial" to allow auto-detection
    spatial_key: Optional[str] = Field(
        default=None,
        description="Spatial coordinate key in obsm (auto-detected if None)",
    )
    batch_key: str = "batch"  # Key name for batch information in obs

    # User-controllable parameters (scientifically-informed defaults)
    n_neighbors: Annotated[int, Field(gt=2, le=100)] = Field(
        default=15,
        description=(
            "Number of neighbors for k-NN graph construction. "
            "Default 15 aligns with Scanpy industry standard and UMAP developer recommendations (10-15 range). "
            "Larger values (20-50) preserve more global structure, smaller values (5-10) emphasize local patterns. "
            "For spatial transcriptomics: 15 captures meaningful tissue neighborhoods in both Visium (55μm) and Visium HD (2μm) data."
        ),
    )
    clustering_resolution: Annotated[float, Field(gt=0.1, le=2.0)] = Field(
        default=1.0,
        description=(
            "Leiden clustering resolution parameter controlling clustering coarseness. "
            "Higher values (1.5-2.0) produce more numerous, smaller clusters; "
            "lower values (0.2-0.5) produce fewer, broader clusters. "
            "Common values: 0.25, 0.5, 1.0. Default 1.0 matches scanpy standard and works well for most spatial datasets."
        ),
    )
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
class DifferentialExpressionParameters(BaseModel):
    """Parameter bundle for differential expression analysis.

    Holds every option of a differential expression run so that it can be
    passed around as a single object, in line with the unified
    (data_id, ctx, params) signature pattern.  ``group_key`` is the only
    required field; all others have defaults.
    """

    # Required: the obs column that defines the groups.
    group_key: str = Field(
        ...,
        description=(
            "Column name in adata.obs for grouping cells/spots. "
            "Common values: 'leiden', 'louvain', 'cell_type', 'seurat_clusters'"
        ),
    )

    # Optional pair of groups to compare.
    group1: Optional[str] = Field(
        default=None,
        description=(
            "First group for comparison. If None, find markers for all groups "
            "(one-vs-rest comparison for each group)."
        ),
    )
    group2: Optional[str] = Field(
        default=None,
        description=(
            "Second group for comparison. If None or 'rest', compare group1 against "
            "all other cells. Only used when group1 is specified."
        ),
    )

    # Statistical test selection.
    method: Literal[
        "wilcoxon",
        "t-test",
        "t-test_overestim_var",
        "logreg",
        "pydeseq2",
    ] = Field(
        default="wilcoxon",
        description=(
            "Statistical method for differential expression analysis.\n"
            "• 'wilcoxon' (default): Wilcoxon rank-sum test, robust to outliers\n"
            "• 't-test': Standard t-test, assumes normal distribution\n"
            "• 't-test_overestim_var': t-test with overestimated variance\n"
            "• 'logreg': Logistic regression\n"
            "• 'pydeseq2': DESeq2 pseudobulk method (requires sample_key for aggregation)\n"
            "  - More accurate for multi-sample studies\n"
            "  - Accounts for biological replicates and batch effects\n"
            "  - Requires: pip install pydeseq2"
        ),
    )

    # Replicate identifier; the description marks it as needed for 'pydeseq2'.
    sample_key: Optional[str] = Field(
        default=None,
        description=(
            "Column name in adata.obs for sample/replicate identifier.\n"
            "REQUIRED for 'pydeseq2' method to perform pseudobulk aggregation.\n"
            "Common values: 'sample', 'patient_id', 'batch', 'replicate'\n"
            "Each unique value becomes a pseudobulk sample by summing counts within groups."
        ),
    )

    # Result size and numeric settings.
    n_top_genes: Annotated[int, Field(gt=0, le=500)] = Field(
        default=50,
        description=(
            "Number of top differentially expressed genes to return per group. "
            "Default: 50. Range: 1-500."
        ),
    )
    pseudocount: Annotated[float, Field(gt=0, le=100)] = Field(
        default=1.0,
        description=(
            "Pseudocount added before log2 fold change calculation to avoid log(0).\n"
            "• 1.0 (default): Standard practice, stable for most data\n"
            "• 0.1-0.5: More sensitive to low-expression changes\n"
            "• 1-10: More stable for sparse/noisy data"
        ),
    )
    min_cells: Annotated[int, Field(gt=0, le=1000)] = Field(
        default=3,
        description=(
            "Minimum number of cells per group for statistical testing.\n"
            "• 3 (default): Minimum required for Wilcoxon test\n"
            "• 10-30: More robust statistical results\n"
            "Groups with fewer cells are automatically skipped with a warning."
        ),
    )
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
class VisualizationParameters(BaseModel):
|
|
432
|
+
"""Visualization parameters model"""
|
|
433
|
+
|
|
434
|
+
model_config = ConfigDict(extra="forbid") # Strict validation after preprocessing
|
|
435
|
+
|
|
436
|
+
feature: Optional[Union[str, list[str]]] = Field(
|
|
437
|
+
None,
|
|
438
|
+
description="Single feature or list of features (accepts both 'feature' and 'features')",
|
|
439
|
+
) # Single feature or list of features
|
|
440
|
+
|
|
441
|
+
@model_validator(mode="before")
@classmethod
def preprocess_params(cls, data):
    """Coerce the accepted shorthand inputs into a field mapping.

    Runs before field validation and handles these input shapes:

    - ``None``: replaced by an empty dict.
    - ``str``: treated as the feature for a spatial plot; a leading
      ``"gene:"`` prefix is dropped (so "gene:CCL21" and "CCL21" are
      equivalent).
    - ``dict``: shallow-copied, and a ``"features"`` key is renamed to
      ``"feature"`` when ``"feature"`` is not already present.
    - anything else (e.g. an existing instance): returned unchanged.
    """
    if data is None:
        return {}

    if isinstance(data, str):
        # Both string forms yield the same mapping; only the prefix differs.
        prefix = "gene:"
        name = data[len(prefix):] if data.startswith(prefix) else data
        return {"feature": name, "plot_type": "spatial"}

    if isinstance(data, dict):
        # Work on a copy so the caller's dict is not mutated by the rename.
        normalized = data.copy()
        if "feature" not in normalized and "features" in normalized:
            normalized["feature"] = normalized.pop("features")
        return normalized

    return data
|
|
474
|
+
|
|
475
|
+
plot_type: Literal[
|
|
476
|
+
"spatial",
|
|
477
|
+
"heatmap",
|
|
478
|
+
"violin",
|
|
479
|
+
"umap",
|
|
480
|
+
"dotplot", # Marker gene expression dotplot
|
|
481
|
+
"cell_communication",
|
|
482
|
+
"deconvolution",
|
|
483
|
+
"trajectory",
|
|
484
|
+
"rna_velocity",
|
|
485
|
+
"spatial_statistics",
|
|
486
|
+
"multi_gene",
|
|
487
|
+
"lr_pairs",
|
|
488
|
+
"gene_correlation",
|
|
489
|
+
"pathway_enrichment",
|
|
490
|
+
"spatial_interaction",
|
|
491
|
+
"batch_integration", # Batch integration quality assessment
|
|
492
|
+
"cnv_heatmap", # CNV analysis heatmap
|
|
493
|
+
"spatial_cnv", # CNV spatial projection
|
|
494
|
+
"card_imputation", # CARD imputation high-resolution results
|
|
495
|
+
] = "spatial"
|
|
496
|
+
colormap: str = "coolwarm"
|
|
497
|
+
|
|
498
|
+
# Unified subtype parameter for all visualization types with subtypes
|
|
499
|
+
subtype: Optional[str] = Field(
|
|
500
|
+
None,
|
|
501
|
+
description=(
|
|
502
|
+
"Unified subtype parameter for visualization variants. "
|
|
503
|
+
"Usage depends on plot_type:\n"
|
|
504
|
+
"- rna_velocity: 'stream' (default, velocity embedding stream), "
|
|
505
|
+
"'phase' (spliced vs unspliced phase plot), 'proportions' (pie chart of spliced/unspliced ratios), "
|
|
506
|
+
"'heatmap' (gene expression by latent_time), 'paga' (PAGA with velocity arrows)\n"
|
|
507
|
+
"- trajectory: 'pseudotime' (default, pseudotime on embedding), "
|
|
508
|
+
"'circular' (CellRank circular projection), 'fate_map' (aggregated fate probabilities), "
|
|
509
|
+
"'gene_trends' (gene expression along lineages), 'fate_heatmap' (smoothed expression heatmap), "
|
|
510
|
+
"'palantir' (Palantir comprehensive results)\n"
|
|
511
|
+
"- pathway_enrichment: 'barplot', 'dotplot' (traditional ORA/GSEA), "
|
|
512
|
+
"'spatial_score', 'spatial_correlogram', 'spatial_variogram', 'spatial_cross_correlation' (spatial EnrichMap)\n"
|
|
513
|
+
"- deconvolution: 'spatial_multi', 'dominant_type', 'diversity', 'stacked_bar', 'scatterpie', 'umap'\n"
|
|
514
|
+
"- spatial_statistics: 'neighborhood', 'co_occurrence', 'ripley', 'moran', 'centrality', 'getis_ord'\n"
|
|
515
|
+
"- Other plot types may not require this parameter"
|
|
516
|
+
),
|
|
517
|
+
)
|
|
518
|
+
cluster_key: Optional[str] = Field(
|
|
519
|
+
None,
|
|
520
|
+
description=(
|
|
521
|
+
"Column name in adata.obs containing cluster or cell type labels "
|
|
522
|
+
"(e.g., 'leiden', 'louvain', 'cell_type'). "
|
|
523
|
+
"REQUIRED for plot_type='heatmap' and 'violin'. "
|
|
524
|
+
"NOTE: ChatSpatial uses 'cluster_key' (not 'groupby' as in Scanpy) "
|
|
525
|
+
"for consistency with Squidpy spatial analysis functions."
|
|
526
|
+
),
|
|
527
|
+
)
|
|
528
|
+
|
|
529
|
+
# Multi-gene visualization parameters
|
|
530
|
+
multi_panel: bool = False # Whether to create multi-panel plots
|
|
531
|
+
panel_layout: Optional[tuple[int, int]] = (
|
|
532
|
+
None # (rows, cols) - auto-determined if None
|
|
533
|
+
)
|
|
534
|
+
|
|
535
|
+
# GridSpec subplot spacing parameters (for multi-panel plots)
|
|
536
|
+
subplot_wspace: float = Field(
|
|
537
|
+
0.0,
|
|
538
|
+
ge=-0.3, # Allow larger negative values for extreme tight spacing
|
|
539
|
+
le=1.0,
|
|
540
|
+
description=(
|
|
541
|
+
"Horizontal spacing between subplots (GridSpec wspace parameter). "
|
|
542
|
+
"Fraction of average subplot width. "
|
|
543
|
+
"Default 0.0 provides tight spacing for spatial plots with colorbars. "
|
|
544
|
+
"Common values: 0.0 (tight), 0.05 (compact), 0.1 (normal), 0.2 (loose). "
|
|
545
|
+
"Negative values (-0.1 to -0.2) create overlapping spacing for extreme compactness."
|
|
546
|
+
),
|
|
547
|
+
)
|
|
548
|
+
subplot_hspace: float = Field(
|
|
549
|
+
0.3,
|
|
550
|
+
ge=0.0,
|
|
551
|
+
le=1.0,
|
|
552
|
+
description=(
|
|
553
|
+
"Vertical spacing between subplots (GridSpec hspace parameter). "
|
|
554
|
+
"Fraction of average subplot height. "
|
|
555
|
+
"Default 0.3 provides comfortable spacing. "
|
|
556
|
+
"Common values: 0.2 (tight), 0.3 (normal), 0.4 (loose)."
|
|
557
|
+
),
|
|
558
|
+
)
|
|
559
|
+
|
|
560
|
+
# Colorbar parameters (for spatial plots with make_axes_locatable)
|
|
561
|
+
colorbar_pad: float = Field(
|
|
562
|
+
0.02,
|
|
563
|
+
ge=0.0,
|
|
564
|
+
le=0.2,
|
|
565
|
+
description=(
|
|
566
|
+
"Distance between subplot and colorbar (as fraction of subplot width). "
|
|
567
|
+
"Default 0.02 provides tight spacing. "
|
|
568
|
+
"Common values: 0.02 (tight), 0.03 (compact), 0.05 (normal)."
|
|
569
|
+
),
|
|
570
|
+
)
|
|
571
|
+
colorbar_size: str = Field(
|
|
572
|
+
"3%",
|
|
573
|
+
description=(
|
|
574
|
+
"Width of colorbar as percentage of subplot width. "
|
|
575
|
+
"Default '3%' provides narrow colorbar to save space. "
|
|
576
|
+
"Common values: '3%' (narrow), '4%' (compact), '5%' (normal)."
|
|
577
|
+
),
|
|
578
|
+
)
|
|
579
|
+
|
|
580
|
+
# Ligand-receptor pair parameters
|
|
581
|
+
lr_pairs: Optional[list[tuple[str, str]]] = None # List of (ligand, receptor) pairs
|
|
582
|
+
lr_database: str = "cellchat" # Database for LR pairs
|
|
583
|
+
plot_top_pairs: int = Field(
|
|
584
|
+
6,
|
|
585
|
+
gt=0,
|
|
586
|
+
le=100,
|
|
587
|
+
description="Number of top LR pairs to display in cell communication visualization. Default: 6. For chord diagrams, use higher values (e.g., 50) to show more interactions.",
|
|
588
|
+
)
|
|
589
|
+
|
|
590
|
+
# Gene correlation parameters
|
|
591
|
+
correlation_method: Literal["pearson", "spearman", "kendall"] = "pearson"
|
|
592
|
+
show_correlation_stats: bool = True
|
|
593
|
+
|
|
594
|
+
# Figure parameters
|
|
595
|
+
figure_size: Optional[tuple[int, int]] = (
|
|
596
|
+
None # (width, height) - auto-determined if None
|
|
597
|
+
)
|
|
598
|
+
dpi: int = 300 # Publication quality (Nature/Cell standard)
|
|
599
|
+
alpha: float = 0.9 # Spot transparency (higher = more opaque)
|
|
600
|
+
spot_size: Optional[float] = Field(
|
|
601
|
+
150.0,
|
|
602
|
+
description=(
|
|
603
|
+
"Size of spots in spatial plots (in pixels). "
|
|
604
|
+
"Default 150 provides good balance for most 10x Visium data. "
|
|
605
|
+
"Adjust based on data density: "
|
|
606
|
+
"dense (>3000 spots): 100-150, "
|
|
607
|
+
"sparse (<2000 spots): 150-200. "
|
|
608
|
+
"Set to None for scanpy auto-sizing (not recommended - usually too small)."
|
|
609
|
+
),
|
|
610
|
+
)
|
|
611
|
+
alpha_img: float = Field(
|
|
612
|
+
0.3,
|
|
613
|
+
ge=0.0,
|
|
614
|
+
le=1.0,
|
|
615
|
+
description=(
|
|
616
|
+
"Background tissue image transparency (lower = dimmer, helps spots stand out). "
|
|
617
|
+
"Default 0.3 provides good contrast. "
|
|
618
|
+
"Increase to 0.4-0.5 to emphasize tissue structure."
|
|
619
|
+
),
|
|
620
|
+
)
|
|
621
|
+
show_tissue_image: bool = Field(
|
|
622
|
+
True,
|
|
623
|
+
description=(
|
|
624
|
+
"Whether to show tissue histology image in spatial plots. "
|
|
625
|
+
"If False, only plot spots on coordinates without background image. "
|
|
626
|
+
"This option only applies when tissue image is available. "
|
|
627
|
+
"When False, spots are plotted on a clean coordinate system for clearer visualization. "
|
|
628
|
+
"Default: True"
|
|
629
|
+
),
|
|
630
|
+
)
|
|
631
|
+
|
|
632
|
+
# Color parameters
|
|
633
|
+
vmin: Optional[float] = None # Minimum value for color scale
|
|
634
|
+
vmax: Optional[float] = None # Maximum value for color scale
|
|
635
|
+
color_scale: Literal["linear", "log", "sqrt"] = "linear" # Color scaling
|
|
636
|
+
|
|
637
|
+
# Display parameters
|
|
638
|
+
title: Optional[str] = None
|
|
639
|
+
show_legend: bool = True
|
|
640
|
+
show_colorbar: bool = True
|
|
641
|
+
show_axes: bool = True
|
|
642
|
+
add_gene_labels: bool = True # Whether to add gene names as labels
|
|
643
|
+
|
|
644
|
+
# Trajectory visualization parameters
|
|
645
|
+
basis: Optional[str] = (
|
|
646
|
+
None # Basis for trajectory visualization (e.g., 'spatial', 'umap', 'pca')
|
|
647
|
+
)
|
|
648
|
+
|
|
649
|
+
# GSEA visualization parameters
|
|
650
|
+
gsea_results_key: str = "gsea_results" # Key in adata.uns for GSEA results
|
|
651
|
+
n_top_pathways: int = 10 # Number of top pathways to show in barplot
|
|
652
|
+
|
|
653
|
+
# NEW: Spatial plot enhancement parameters
|
|
654
|
+
add_outline: bool = Field(
|
|
655
|
+
False, description="Add cluster outline/contour overlay to spatial plots"
|
|
656
|
+
)
|
|
657
|
+
outline_color: str = Field("black", description="Color for cluster outlines")
|
|
658
|
+
outline_width: float = Field(
|
|
659
|
+
0.4, description="Line width for cluster outlines (Nature/Cell standard)"
|
|
660
|
+
)
|
|
661
|
+
outline_cluster_key: Optional[str] = Field(
|
|
662
|
+
None, description="Cluster key for outlines (e.g., 'leiden')"
|
|
663
|
+
)
|
|
664
|
+
|
|
665
|
+
# NEW: UMAP enhancement parameters
|
|
666
|
+
size_by: Optional[str] = Field(
|
|
667
|
+
None,
|
|
668
|
+
description="Feature for point size encoding in UMAP (dual color+size encoding)",
|
|
669
|
+
)
|
|
670
|
+
show_velocity: bool = Field(
|
|
671
|
+
False, description="Overlay RNA velocity vectors on UMAP"
|
|
672
|
+
)
|
|
673
|
+
velocity_scale: float = Field(1.0, description="Scaling factor for velocity arrows")
|
|
674
|
+
|
|
675
|
+
# NEW: Heatmap enhancement parameters
|
|
676
|
+
obs_annotation: Optional[list[str]] = Field(
|
|
677
|
+
None, description="List of obs keys to show as column annotations"
|
|
678
|
+
)
|
|
679
|
+
var_annotation: Optional[list[str]] = Field(
|
|
680
|
+
None, description="List of var keys to show as row annotations"
|
|
681
|
+
)
|
|
682
|
+
annotation_colors: Optional[dict[str, str]] = Field(
|
|
683
|
+
None, description="Custom colors for annotations"
|
|
684
|
+
)
|
|
685
|
+
|
|
686
|
+
# NEW: Integration assessment parameters
|
|
687
|
+
batch_key: str = Field(
|
|
688
|
+
"batch", description="Key in adata.obs for batch/sample identifier"
|
|
689
|
+
)
|
|
690
|
+
integration_method: Optional[str] = Field(
|
|
691
|
+
None, description="Integration method used (for display)"
|
|
692
|
+
)
|
|
693
|
+
|
|
694
|
+
# Dotplot visualization parameters
|
|
695
|
+
dotplot_dendrogram: bool = Field(
|
|
696
|
+
False,
|
|
697
|
+
description="Whether to show dendrogram for gene clustering in dotplot",
|
|
698
|
+
)
|
|
699
|
+
dotplot_swap_axes: bool = Field(
|
|
700
|
+
False,
|
|
701
|
+
description="Swap axes to show genes on x-axis and groups on y-axis",
|
|
702
|
+
)
|
|
703
|
+
dotplot_standard_scale: Optional[Literal["var", "group"]] = Field(
|
|
704
|
+
None,
|
|
705
|
+
description=(
|
|
706
|
+
"Standardize expression values for dotplot. "
|
|
707
|
+
"'var' = standardize per gene (row), "
|
|
708
|
+
"'group' = standardize per group (column)"
|
|
709
|
+
),
|
|
710
|
+
)
|
|
711
|
+
dotplot_dot_max: Optional[float] = Field(
|
|
712
|
+
None,
|
|
713
|
+
ge=0.0,
|
|
714
|
+
le=1.0,
|
|
715
|
+
description=(
|
|
716
|
+
"Maximum dot size as fraction (0-1). "
|
|
717
|
+
"If None, maximum observed fraction is used"
|
|
718
|
+
),
|
|
719
|
+
)
|
|
720
|
+
dotplot_dot_min: Optional[float] = Field(
|
|
721
|
+
None,
|
|
722
|
+
ge=0.0,
|
|
723
|
+
le=1.0,
|
|
724
|
+
description=(
|
|
725
|
+
"Minimum dot size as fraction (0-1). "
|
|
726
|
+
"If None, minimum observed fraction is used"
|
|
727
|
+
),
|
|
728
|
+
)
|
|
729
|
+
dotplot_smallest_dot: float = Field(
|
|
730
|
+
0.0,
|
|
731
|
+
ge=0.0,
|
|
732
|
+
le=50.0,
|
|
733
|
+
description=(
|
|
734
|
+
"Size of dot when expression fraction is 0. "
|
|
735
|
+
"Default 0 hides genes with no expression in a group"
|
|
736
|
+
),
|
|
737
|
+
)
|
|
738
|
+
dotplot_var_groups: Optional[dict[str, list[str]]] = Field(
|
|
739
|
+
None,
|
|
740
|
+
description=(
|
|
741
|
+
"Group genes by category for organized display. "
|
|
742
|
+
"Example: {'T cell markers': ['CD3D', 'CD4'], 'B cell markers': ['CD19', 'MS4A1']}"
|
|
743
|
+
),
|
|
744
|
+
)
|
|
745
|
+
dotplot_categories_order: Optional[list[str]] = Field(
|
|
746
|
+
None,
|
|
747
|
+
description="Custom order for groups (clusters/cell types) on the axis",
|
|
748
|
+
)
|
|
749
|
+
|
|
750
|
+
# Deconvolution visualization parameters
|
|
751
|
+
n_cell_types: Annotated[
|
|
752
|
+
int,
|
|
753
|
+
Field(
|
|
754
|
+
gt=0,
|
|
755
|
+
le=10,
|
|
756
|
+
description="Number of top cell types to show in deconvolution visualization. Must be between 1-10. Default: 4",
|
|
757
|
+
),
|
|
758
|
+
] = 4
|
|
759
|
+
deconv_method: Optional[str] = Field(
|
|
760
|
+
None,
|
|
761
|
+
description=(
|
|
762
|
+
"Deconvolution method name (e.g., 'cell2location', 'rctd'). "
|
|
763
|
+
"If None and only one result exists, auto-select and notify. "
|
|
764
|
+
"If None and multiple results exist, raise error requiring explicit specification. "
|
|
765
|
+
"This ensures you visualize the intended analysis for scientific reproducibility."
|
|
766
|
+
),
|
|
767
|
+
)
|
|
768
|
+
min_proportion_threshold: float = Field(
|
|
769
|
+
0.3,
|
|
770
|
+
ge=0.0,
|
|
771
|
+
le=1.0,
|
|
772
|
+
description="Minimum proportion threshold for marking spots as 'pure' vs 'mixed' (dominant_type visualization). Default: 0.3",
|
|
773
|
+
)
|
|
774
|
+
show_mixed_spots: bool = Field(
|
|
775
|
+
True,
|
|
776
|
+
description="Whether to mark mixed/heterogeneous spots in dominant_type visualization. Default: True",
|
|
777
|
+
)
|
|
778
|
+
pie_scale: float = Field(
|
|
779
|
+
0.4,
|
|
780
|
+
gt=0.0,
|
|
781
|
+
le=2.0,
|
|
782
|
+
description="Size scale factor for pie charts in scatterpie visualization. Default: 0.4",
|
|
783
|
+
)
|
|
784
|
+
scatterpie_alpha: float = Field(
|
|
785
|
+
1.0,
|
|
786
|
+
ge=0.0,
|
|
787
|
+
le=1.0,
|
|
788
|
+
description="Transparency of pie charts in scatterpie visualization (0=transparent, 1=opaque). Default: 1.0",
|
|
789
|
+
)
|
|
790
|
+
max_spots: int = Field(
|
|
791
|
+
100,
|
|
792
|
+
gt=0,
|
|
793
|
+
le=1000,
|
|
794
|
+
description="Maximum number of spots to show in stacked_bar visualization. Default: 100",
|
|
795
|
+
)
|
|
796
|
+
sort_by: Literal["dominant_type", "spatial", "cluster"] = Field(
|
|
797
|
+
"dominant_type",
|
|
798
|
+
description="Sorting method for stacked_bar visualization. Options: dominant_type (group by dominant cell type), spatial (spatial order), cluster (cluster order). Default: dominant_type",
|
|
799
|
+
)
|
|
800
|
+
|
|
801
|
+
@model_validator(mode="after")
def validate_conditional_parameters(self) -> Self:
    """Enforce the ``subtype`` rules that depend on ``plot_type``.

    Returns:
        This instance. When ``plot_type`` is ``"deconvolution"`` and
        ``subtype`` is falsy, ``subtype`` is first set to
        ``"spatial_multi"``.

    Raises:
        ValueError: If ``plot_type`` is ``"spatial_statistics"`` and
            ``subtype`` is falsy or a whitespace-only string.
    """
    plot_kind = self.plot_type
    chosen_subtype = self.subtype

    if plot_kind == "spatial_statistics":
        # A subtype counts as missing when it is falsy or only whitespace.
        is_blank = not chosen_subtype or (
            isinstance(chosen_subtype, str) and not chosen_subtype.strip()
        )
        if is_blank:
            subtype_names = ", ".join(
                (
                    "neighborhood",
                    "co_occurrence",
                    "ripley",
                    "moran",
                    "centrality",
                    "getis_ord",
                )
            )
            raise ValueError(
                "Parameter dependency error: subtype is required when plot_type='spatial_statistics'.\n"
                f"Available subtypes: {subtype_names}\n"
                "Example usage: VisualizationParameters(plot_type='spatial_statistics', subtype='neighborhood')\n"
                "For more details, see spatial statistics documentation."
            )
    elif plot_kind == "deconvolution" and not chosen_subtype:
        # Fall back to the default deconvolution visualization type.
        self.subtype = "spatial_multi"

    return self
|
|
830
|
+
|
|
831
|
+
|
|
832
|
+
class AnnotationParameters(BaseModel):
    """Parameters controlling cell type annotation.

    ``method`` selects the annotation backend. Most remaining fields are
    grouped by backend through a name prefix (``tangram_``, ``scanvi_``,
    ``cellassign_``, ``mllm_``, ``sctype_``, ``singler_``); the unprefixed
    fields are shared inputs such as reference data and marker genes.
    """

    # Annotation backend to run.
    method: Literal[
        "tangram",
        "scanvi",
        "cellassign",
        "mllmcelltype",
        "sctype",
        "singler",
    ] = "tangram"
    marker_genes: Optional[dict[str, list[str]]] = None
    reference_data: Optional[str] = None
    # For Tangram method - ID of reference single-cell dataset
    reference_data_id: Optional[str] = None
    # For Tangram method - genes to use for mapping
    training_genes: Optional[list[str]] = None
    # For Tangram/ScanVI methods - number of training epochs
    # (reduced for faster training)
    num_epochs: int = 100
    # Tangram mapping mode: 'cells' (cell-level) or 'clusters' (cluster-level)
    tangram_mode: Literal["cells", "clusters"] = "cells"
    # For mLLMCellType method - cluster label in spatial data.
    # Only required when method='mllmcelltype'
    cluster_label: Optional[str] = None
    cell_type_key: Optional[str] = Field(
        None,
        description=(
            "Column name for cell types in REFERENCE data. "
            "\n\n"
            "REQUIRED FOR METHODS USING REFERENCE DATA:\n"
            " • tangram: REQUIRED - maps spatial data to reference using cell type labels\n"
            " • scanvi: REQUIRED - transfers labels from reference to query data\n"
            " • singler: REQUIRED - correlates expression with reference cell types\n"
            "\n"
            "NOT REQUIRED FOR METHODS WITHOUT REFERENCE:\n"
            " • cellassign: Not needed - uses marker_genes parameter instead\n"
            " • sctype: Not needed - uses built-in database or custom markers\n"
            " • mllmcelltype: Not needed - uses LLM for annotation\n"
            "\n"
            "Common column names in reference data: 'cell_type', 'cell_types', 'celltype', 'annotation', 'label', 'cell_type_original'\n"
            "\n"
            "The LLM will auto-detect from metadata if not specified, but explicit specification is recommended."
        ),
    )

    # ------------------------------------------------------------------
    # Tangram-specific parameters (aligned with scvi.external.Tangram API)
    # ------------------------------------------------------------------
    # Density prior for mapping
    tangram_density_prior: Literal["rna_count_based", "uniform"] = "rna_count_based"
    # Device for computation ('cpu' or 'cuda:0')
    tangram_device: str = "cpu"
    # Learning rate for optimization
    tangram_learning_rate: float = 0.1
    # Whether to compute validation metrics
    tangram_compute_validation: bool = False
    # Whether to project gene expression
    tangram_project_genes: bool = False

    # Tangram regularization parameters (optional)
    # Regularization parameter for entropy term in Tangram loss
    tangram_lambda_r: Optional[float] = None
    # Neighborhood regularization parameter for spatial smoothness
    tangram_lambda_neighborhood: Optional[float] = None

    # ------------------------------------------------------------------
    # General parameters for batch effect and data handling
    # ------------------------------------------------------------------
    # For batch effect correction
    batch_key: Optional[str] = None
    # Which layer to use for analysis
    layer: Optional[str] = None

    # ------------------------------------------------------------------
    # scANVI parameters (scvi-tools semi-supervised label transfer)
    # ------------------------------------------------------------------
    scanvi_n_hidden: int = Field(
        128,
        description="Number of hidden units per layer. Official default: 128",
    )
    scanvi_n_latent: int = Field(
        10,
        description=(
            "Dimensionality of latent space. Official default: 10\n"
            "scvi-tools recommendation for large integration: 30\n"
            "WARNING:Empirical (not official): Small datasets may need 3-5 to avoid NaN"
        ),
    )
    scanvi_n_layers: int = Field(
        1,
        description=(
            "Number of hidden layers. Official default: 1\n"
            "scvi-tools recommendation for large integration: 2"
        ),
    )
    scanvi_dropout_rate: float = Field(
        0.1,
        description=(
            "Dropout rate for regularization. Official default: 0.1\n"
            "WARNING:Empirical (not official): 0.2-0.3 may help small datasets"
        ),
    )
    scanvi_unlabeled_category: str = Field(
        "Unknown",
        description="Label for unlabeled cells in semi-supervised learning",
    )

    # SCVI pretraining parameters (official best practice)
    scanvi_use_scvi_pretrain: bool = Field(
        True,
        description=(
            "Whether to pretrain with SCVI before SCANVI training. Default: True\n"
            "Official scvi-tools best practice: SCVI pretraining improves stability\n"
            "WARNING:For small datasets: Set to False if encountering NaN errors"
        ),
    )
    scanvi_scvi_epochs: int = Field(
        200,
        description="Number of epochs for SCVI pretraining. Default: 200",
    )
    scanvi_scanvi_epochs: int = Field(
        20,
        description=(
            "Number of epochs for SCANVI model training after SCVI pretraining. Default: 20\n"
            "This is the second stage training that fine-tunes the model for label transfer.\n"
            "Official scvi-tools recommendation: 20 epochs is usually sufficient after pretraining.\n"
            "Increase to 50-100 for complex datasets or if label transfer accuracy is low."
        ),
    )
    scanvi_n_samples_per_label: int = Field(
        100,
        description="Number of samples per label for semi-supervised training",
    )

    # Query training parameters
    scanvi_query_epochs: int = Field(
        100,
        description=(
            "Number of epochs for training on query data. Default: 100\n"
            "WARNING:For small datasets: Recommend 50 to prevent overfitting"
        ),
    )
    scanvi_check_val_every_n_epoch: int = Field(
        10,
        description="Validation check frequency during training",
    )

    # ------------------------------------------------------------------
    # CellAssign parameters
    # ------------------------------------------------------------------
    cellassign_n_hidden: int = 100
    cellassign_learning_rate: float = 0.001
    cellassign_max_iter: int = 200

    # ------------------------------------------------------------------
    # mLLMCellType parameters
    # ------------------------------------------------------------------
    # Number of marker genes per cluster
    mllm_n_marker_genes: Annotated[int, Field(gt=0, le=50)] = 20
    # Species
    mllm_species: Literal["human", "mouse"] = "human"
    # Tissue type (e.g., "brain", "liver")
    mllm_tissue: Optional[str] = None
    # LLM provider (use 'gemini' not 'google')
    mllm_provider: Literal[
        "openai",
        "anthropic",
        "gemini",
        "deepseek",
        "qwen",
        "zhipu",
        "stepfun",
        "minimax",
        "grok",
        "openrouter",
    ] = "openai"
    # Model name. Defaults: openai="gpt-5", anthropic="claude-sonnet-4-20250514",
    # gemini="gemini-2.5-pro-preview-03-25"
    # Examples: "gpt-5", "claude-sonnet-4-5-20250929", "claude-opus-4-1-20250805",
    # "gemini-2.5-pro", "qwen-max-2025-01-25"
    mllm_model: Optional[str] = None
    # API key for the LLM provider
    mllm_api_key: Optional[str] = None
    # Additional context for annotation
    mllm_additional_context: Optional[str] = None
    # Whether to use caching for API calls
    mllm_use_cache: bool = True
    # Custom API endpoints
    mllm_base_urls: Optional[Union[str, dict[str, str]]] = None
    # Whether to print detailed logs
    mllm_verbose: bool = False
    # Force reanalysis bypassing cache
    mllm_force_rerun: bool = False

    # Multi-model consensus parameters (interactive_consensus_annotation)
    # Whether to use multi-model consensus
    mllm_use_consensus: bool = False
    # List of models for consensus
    mllm_models: Optional[list[Union[str, dict[str, str]]]] = None
    # Dict mapping provider to API key
    mllm_api_keys: Optional[dict[str, str]] = None
    # Agreement threshold for consensus
    mllm_consensus_threshold: float = 0.7
    # Entropy threshold for controversy detection
    mllm_entropy_threshold: float = 1.0
    # Maximum discussion rounds
    mllm_max_discussion_rounds: int = 3
    # Model for consensus checking
    mllm_consensus_model: Optional[Union[str, dict[str, str]]] = None
    # Specific clusters to analyze
    mllm_clusters_to_analyze: Optional[list[str]] = None

    # ------------------------------------------------------------------
    # ScType parameters
    # ------------------------------------------------------------------
    # Tissue type (supported: "Adrenal", "Brain", "Eye", "Heart", "Hippocampus",
    # "Immune system", "Intestine", "Kidney", "Liver", "Lung", "Muscle",
    # "Pancreas", "Placenta", "Spleen", "Stomach", "Thymus")
    sctype_tissue: Optional[str] = None
    # Custom database path (if None, uses default ScTypeDB)
    sctype_db_: Optional[str] = None
    # Whether input data is scaled
    sctype_scaled: bool = True
    # Custom markers: {"CellType": {"positive": [...], "negative": [...]}}
    sctype_custom_markers: Optional[dict[str, dict[str, list[str]]]] = None
    # Whether to cache results to avoid repeated R calls
    sctype_use_cache: bool = True

    # ------------------------------------------------------------------
    # SingleR parameters (for enhanced marker_genes method)
    # ------------------------------------------------------------------
    singler_reference: Optional[str] = Field(
        None,
        description=(
            "Reference dataset name from celldex package (Python naming convention).\n\n"
            "Valid references:\n"
            " Human: 'hpca' (Human Primary Cell Atlas, recommended), 'blueprint_encode', "
            "'dice', 'monaco_immune', 'novershtern_hematopoietic'\n"
            " Mouse: 'immgen' (ImmGen, recommended), 'mouse_rnaseq'\n\n"
            "Common mistakes:\n"
            " 'HumanPrimaryCellAtlasData' - WRONG, use 'hpca'\n"
            " 'ImmGenData' - WRONG, use 'immgen'\n\n"
            "If None, uses species-appropriate default ('hpca' for human, 'immgen' for mouse)."
        ),
    )
    singler_integrated: bool = Field(
        False,
        description="Whether to use integrated annotation with multiple references",
    )
    singler_fine_tune: bool = Field(
        True,
        description="Whether to perform fine-tuning step in SingleR annotation (refines labels based on marker genes)",
    )
    # Number of threads for parallel processing
    num_threads: int = 4
|
|
1057
|
+
|
|
1058
|
+
|
|
1059
|
+
class SpatialStatisticsParameters(BaseModel):
    """Parameters controlling spatial statistics analyses.

    ``analysis_type`` selects the statistic. ``cluster_key`` is documented
    below as required for the group-based analyses; gene selection,
    parallelism, and the Moran / Local Moran / Getis-Ord / bivariate Moran
    options follow in their own sections.
    """

    analysis_type: Literal[
        "neighborhood",
        "co_occurrence",
        "ripley",
        "moran",
        # Added: Local Moran's I (LISA)
        "local_moran",
        "geary",
        "centrality",
        "getis_ord",
        "bivariate_moran",
        # Traditional Join Count for binary data (2 categories)
        "join_count",
        # Local Join Count for multi-category data (>2 categories)
        "local_join_count",
        "network_properties",
        "spatial_centrality",
    ] = "neighborhood"
    cluster_key: Optional[str] = Field(
        None,
        description=(
            "Column name for cluster/cell type labels in adata.obs. "
            "\n\n"
            "REQUIRED FOR GROUP-BASED ANALYSES:\n"
            " • neighborhood: REQUIRED - analyzes enrichment between cell type groups\n"
            " • co_occurrence: REQUIRED - measures spatial co-occurrence of groups\n"
            " • ripley: REQUIRED - analyzes spatial point patterns by group\n"
            " • join_count: REQUIRED - for BINARY categorical data (2 categories)\n"
            " • local_join_count: REQUIRED - for MULTI-CATEGORY data (>2 categories)\n"
            "\n"
            "OPTIONAL/NOT REQUIRED FOR GENE-BASED ANALYSES:\n"
            " • moran: Not required - analyzes gene expression spatial patterns\n"
            " • local_moran: Not required - identifies local spatial clusters for genes\n"
            " • geary: Not required - measures gene expression spatial autocorrelation\n"
            " • getis_ord: Not required - detects hot/cold spots for gene expression\n"
            " • bivariate_moran: Not required - analyzes gene pair spatial correlation\n"
            " • centrality: Not required - computes spatial network centrality\n"
            " • network_properties: Not required - analyzes spatial network structure\n"
            " • spatial_centrality: Not required - measures spatial importance\n"
            "\n"
            "Common column names: 'leiden', 'louvain', 'cell_type', 'cell_type_tangram', 'seurat_clusters', 'clusters'\n"
            "\n"
            "The LLM will auto-detect from metadata if not specified for required analyses."
        ),
    )
    n_neighbors: Annotated[int, Field(gt=0)] = Field(
        8,
        description=(
            "Number of nearest neighbors for spatial graph construction. "
            "Default: 8 (recommended by ArcGIS for Getis-Ord analysis). "
            "Adjust based on dataset density and spatial scale."
        ),
    )

    # ------------------------------------------------------------------
    # Unified gene selection parameter (NEW)
    # ------------------------------------------------------------------
    genes: Optional[list[str]] = Field(
        None,
        description="Specific genes to analyze. If None, uses HVG or defaults based on analysis type",
    )
    n_top_genes: Annotated[int, Field(gt=0, le=500)] = Field(
        20,
        description="Number of top HVGs to analyze (default 20, up to 500 for comprehensive analysis)",
    )

    # ------------------------------------------------------------------
    # Parallel processing parameters
    # ------------------------------------------------------------------
    n_jobs: Optional[int] = Field(
        1,
        description="Number of parallel jobs. 1 = no parallelization (recommended for small datasets), None = auto-detect, -1 = all cores",
    )
    backend: Literal["loky", "threading", "multiprocessing"] = Field(
        "threading",
        description="Parallelization backend (threading is safer than loky)",
    )

    # ------------------------------------------------------------------
    # Moran's I specific parameters
    # ------------------------------------------------------------------
    moran_n_perms: Annotated[int, Field(gt=0, le=10000)] = Field(
        10,
        description="Number of permutations (default 10 for speed, use 100+ for publication)",
    )
    moran_two_tailed: bool = Field(False, description="Use two-tailed test")

    # ------------------------------------------------------------------
    # Local Moran's I (LISA) specific parameters
    # ------------------------------------------------------------------
    local_moran_permutations: Annotated[int, Field(gt=0, le=9999)] = Field(
        999,
        description=(
            "Number of permutations for pseudo p-value calculation in Local Moran's I. "
            "Higher values increase precision: 99 -> precision 0.01, 999 -> precision 0.001. "
            "Default 999 is standard practice. Use 9999 for publication-quality results."
        ),
    )
    local_moran_alpha: Annotated[float, Field(gt=0.0, lt=1.0)] = Field(
        0.05,
        description=(
            "Significance level (alpha) for Local Moran's I hotspot/coldspot detection. "
            "Used with FDR correction to determine significant spatial clusters. "
            "Common values: 0.05 (standard), 0.01 (conservative), 0.10 (exploratory)."
        ),
    )
    local_moran_fdr_correction: bool = Field(
        True,
        description=(
            "Whether to apply FDR (False Discovery Rate) correction for multiple testing. "
            "STRONGLY RECOMMENDED: Each location is tested separately, creating a multiple "
            "testing problem. FDR correction controls the expected proportion of false positives. "
            "Set to False only for exploratory analysis."
        ),
    )

    # ------------------------------------------------------------------
    # Getis-Ord Gi* specific parameters
    # ------------------------------------------------------------------
    getis_ord_correction: Literal["bonferroni", "fdr_bh", "none"] = Field(
        "fdr_bh",
        description=(
            "Multiple testing correction method for Getis-Ord analysis. "
            "Options: 'fdr_bh' (Benjamini-Hochberg FDR, recommended for multi-gene), "
            "'bonferroni' (conservative), 'none' (no correction)"
        ),
    )
    getis_ord_alpha: Annotated[float, Field(gt=0.0, le=1.0)] = Field(
        0.05,
        description=(
            "Significance level (alpha) for Getis-Ord hotspot detection. "
            "Determines Z-score threshold via norm.ppf(1 - alpha/2). "
            "Common values: 0.05 (z=1.96), 0.01 (z=2.576), 0.10 (z=1.645)"
        ),
    )

    # ------------------------------------------------------------------
    # Bivariate Moran's I specific parameters
    # ------------------------------------------------------------------
    gene_pairs: Optional[list[tuple[str, str]]] = Field(
        None,
        description="Gene pairs for bivariate analysis",
    )
|
|
1189
|
+
|
|
1190
|
+
|
|
1191
|
+
class RNAVelocityParameters(BaseModel):
    """Parameters controlling RNA velocity analysis.

    Unknown fields are rejected (``extra="forbid"``). ``method`` chooses
    between the scVelo and VELOVI backends; the ``velovi_`` fields are
    grouped with the latter.
    """

    # Strict validation - no extra parameters allowed
    model_config = ConfigDict(extra="forbid")

    # Velocity computation method selection
    method: Literal["scvelo", "velovi"] = "scvelo"

    # scVelo specific parameters
    scvelo_mode: Literal["deterministic", "stochastic", "dynamical"] = "stochastic"
    n_pcs: Annotated[int, Field(gt=0, le=100)] = 30
    basis: str = "spatial"

    # Preprocessing parameters for velocity computation
    # Minimum shared counts for filtering
    min_shared_counts: Annotated[int, Field(gt=0)] = 30
    # Number of top genes to retain
    n_top_genes: Annotated[int, Field(gt=0)] = 2000
    # Number of neighbors for moments computation
    n_neighbors: Annotated[int, Field(gt=0)] = 30

    # VELOVI specific parameters
    velovi_n_hidden: int = 128
    velovi_n_latent: int = 10
    velovi_n_layers: int = 1
    velovi_n_epochs: int = 1000
    velovi_dropout_rate: float = 0.1
    velovi_learning_rate: float = 1e-3
    velovi_use_gpu: bool = False
|
|
1223
|
+
|
|
1224
|
+
|
|
1225
|
+
class TrajectoryParameters(BaseModel):
    """Parameters controlling trajectory analysis.

    ``method`` selects one of CellRank, Palantir or DPT; the ``cellrank_``
    and ``palantir_`` fields are grouped with their respective methods.
    """

    method: Literal["cellrank", "palantir", "dpt"] = "cellrank"
    # Spatial information weight
    spatial_weight: Annotated[float, Field(ge=0.0, le=1.0)] = 0.5
    # For Palantir method
    root_cells: Optional[list[str]] = None

    # CellRank specific parameters
    # (velocity_weight, connectivity_weight)
    cellrank_kernel_weights: tuple[float, float] = (0.8, 0.2)
    # Number of macrostates for CellRank
    cellrank_n_states: Annotated[int, Field(gt=0, le=20)] = 5

    # Palantir specific parameters
    # Number of diffusion components
    palantir_n_diffusion_components: Annotated[int, Field(gt=0, le=50)] = 10
    # Number of waypoints for Palantir
    palantir_num_waypoints: Annotated[int, Field(gt=0)] = 500

    # Fallback control
    # Removed: allow_fallback_to_dpt - No longer doing automatic fallbacks
    # LLMs should explicitly choose which method to use
|
|
1254
|
+
|
|
1255
|
+
|
|
1256
|
+
class IntegrationParameters(BaseModel):
    """Parameters controlling multi-sample integration.

    ``method`` selects the integration backend. The ``scvi_`` fields, together
    with ``use_gpu`` and ``n_epochs``, are grouped as scvi-tools options.
    """

    method: Literal["harmony", "bbknn", "scanorama", "scvi"] = "harmony"
    # Batch information key
    batch_key: str = "batch"
    # Number of principal components for integration
    n_pcs: Annotated[int, Field(gt=0, le=100)] = 30
    # Whether to align spatial coordinates
    align_spatial: bool = True
    # Reference batch for spatial alignment
    reference_batch: Optional[str] = None

    # Common scvi-tools parameters
    # Whether to use GPU acceleration for scvi-tools methods
    use_gpu: bool = False
    # Number of training epochs (None = auto-determine)
    n_epochs: Optional[int] = None

    # scVI integration parameters
    scvi_n_hidden: int = 128
    scvi_n_latent: int = 10
    scvi_n_layers: int = 1
    scvi_dropout_rate: float = 0.1
    scvi_gene_likelihood: Literal["zinb", "nb", "poisson"] = "zinb"
|
|
1277
|
+
|
|
1278
|
+
|
|
1279
|
+
class DeconvolutionParameters(BaseModel):
    """Parameters for spatial deconvolution.

    ``method`` selects the deconvolution backend. Apart from the shared
    fields (``reference_data_id``, ``cell_type_key``, ``use_gpu``,
    ``max_cores``), fields are prefixed with the backend they belong to
    (``cell2location_*``, ``spotlight_*``, ``destvi_*``, ``stereoscope_*``,
    ``rctd_*``, ``card_*``, ``tangram_*``, ``flashdeconv_*``).

    ``cell_type_key`` has no default and must always be supplied.
    """

    method: Literal[
        "flashdeconv",
        "cell2location",
        "rctd",
        "destvi",
        "stereoscope",
        "spotlight",
        "tangram",
        "card",
    ] = "flashdeconv"

    # Reference single-cell data for deconvolution.
    reference_data_id: Optional[str] = None

    # REQUIRED: Key in reference data for cell type information. LLM will
    # infer from metadata. Common values: 'cell_type', 'celltype',
    # 'annotation', 'label'.
    cell_type_key: str

    # Universal GPU parameter
    use_gpu: bool = Field(
        False,
        description=(
            "Whether to use GPU acceleration for training. "
            "Supported by: Cell2location, DestVI, Stereoscope, Tangram. "
            "Not supported by: RCTD, SPOTlight, CARD (R-based methods). "
            "Requires CUDA-compatible GPU and proper PyTorch installation."
        ),
    )

    # Cell2location specific parameters
    cell2location_ref_model_epochs: Annotated[int, Field(gt=0)] = Field(
        250,
        description=(
            "Number of epochs for Cell2location reference model training (NB regression). "
            "This is the first stage training for estimating reference cell type signatures. "
            "Official recommendation: 250. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_n_epochs: Annotated[int, Field(gt=0)] = Field(
        30000,
        description=(
            "Number of epochs for Cell2location spatial mapping model training. "
            "Official recommendation: 30000. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_n_cells_per_spot: Annotated[int, Field(gt=0)] = Field(
        30,
        description=(
            "Expected number of cells per spatial location for Cell2location. "
            "This is tissue-dependent (e.g., 30 for Visium, 5-10 for MERFISH). "
            "Official recommendation: 30 for Visium data. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_detection_alpha: Annotated[float, Field(gt=0)] = Field(
        20.0,
        description=(
            "RNA detection sensitivity parameter for Cell2location. "
            "NEW DEFAULT (2024): 20 for high technical variability, 200 for low variability. "
            "Recommendation: test both values on your data. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )

    # Batch and covariate correction for cell2location
    cell2location_batch_key: Optional[str] = Field(
        None,
        description=(
            "Column name in adata.obs for batch information (e.g., 'sample_id', 'batch'). "
            "Used for batch effect correction in Cell2location. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_categorical_covariate_keys: Optional[list[str]] = Field(
        None,
        description=(
            "List of column names in adata.obs for categorical technical covariates "
            "(e.g., ['platform', 'donor_id']) for Cell2location. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )

    # Gene filtering parameters (Cell2location-specific preprocessing)
    cell2location_apply_gene_filtering: bool = Field(
        True,
        description=(
            "Apply Cell2location's recommended permissive gene filtering before training. "
            "ONLY USED BY CELL2LOCATION. This is NOT the same as HVG selection:\n"
            "• Cell2location uses permissive filtering to keep rare cell type markers\n"
            "• Yields ~10k-16k genes (more than typical 2k HVGs)\n"
            "• Official recommendation: avoid further gene selection for robust results\n"
            "Other methods use different strategies (see spotlight_n_top_genes parameter)."
        ),
    )
    cell2location_gene_filter_cell_count_cutoff: int = Field(
        5,
        description=(
            "Minimum cells expressing a gene for Cell2location filtering (official default: 5). "
            "Low cutoff preserves rare cell type markers. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_gene_filter_cell_percentage_cutoff2: float = Field(
        0.03,
        description=(
            "Minimum percentage of cells expressing for Cell2location (official default: 0.03 = 3%). "
            "Genes detected in ≥3% of cells are always included. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_gene_filter_nonz_mean_cutoff: float = Field(
        1.12,
        description=(
            "Minimum non-zero mean expression for Cell2location (official default: 1.12). "
            "For genes between cutoffs, only keep if avg expression in non-zero cells > 1.12. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )

    # Phase 2: Training enhancement parameters (Cell2location)
    cell2location_ref_model_lr: Annotated[float, Field(gt=0)] = Field(
        0.002,
        description=(
            "Reference model learning rate for Cell2location (official default: 0.002 with ClippedAdam optimizer). "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_lr: Annotated[float, Field(gt=0)] = Field(
        0.005,
        description=(
            "Cell2location model learning rate (official default: 0.005). "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_ref_model_train_size: Annotated[float, Field(gt=0, le=1)] = Field(
        1.0,
        description=(
            "Fraction of reference data for training in Cell2location. "
            "DEFAULT: 1.0 (official tutorial recommendation - use all data). "
            "IMPORTANT: RegressionModel validation is not yet implemented, so train_size=1 is standard practice. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_train_size: Annotated[float, Field(gt=0, le=1)] = Field(
        1.0,
        description=(
            "Fraction of spatial data for training in Cell2location. "
            "DEFAULT: 1.0 (official tutorial: 'we need to estimate cell abundance at all locations'). "
            "Using train_size=1 ensures all spatial locations are included in training. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_enable_qc_plots: bool = Field(
        False,
        description=(
            "Generate QC diagnostic plots for Cell2location (ELBO history, convergence diagnostics). "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_qc_output_dir: Optional[str] = Field(
        None,
        description=(
            "Output directory for Cell2location QC plots (None = plots not saved to disk). "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )

    # Phase 3: Runtime optimization parameters (Cell2location)
    cell2location_early_stopping: bool = Field(
        False,
        description=(
            "Enable early stopping to reduce Cell2location training time. "
            "DEFAULT: False (following official tutorial best practice). "
            "IMPORTANT: RegressionModel does not support validation, so early stopping is not recommended. "
            "Official tutorial uses train_size=1 without early stopping. "
            "Only enable if you have specific convergence monitoring needs. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_early_stopping_patience: Annotated[int, Field(gt=0)] = Field(
        45,
        description=(
            "Epochs to wait before stopping if no improvement for Cell2location (official default: 45). "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    # The bound is inclusive (ge=0) because 0.0 is both the default and the
    # documented "any improvement" setting. A strict gt=0 bound would reject
    # the default whenever a caller passes it explicitly, and would publish
    # a schema whose default lies outside its own allowed range.
    cell2location_early_stopping_threshold: Annotated[float, Field(ge=0)] = Field(
        0.0,
        description=(
            "Minimum relative change to qualify as improvement for Cell2location (0 = any improvement). "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_use_aggressive_training: bool = Field(
        False,
        description=(
            "Use train_aggressive() method for large-scale datasets in Cell2location. "
            "DEFAULT: False (standard train() method, following official tutorial). "
            "WHEN TO USE: Only for datasets with >50k locations that require mini-batch training due to GPU memory constraints. "
            "Standard Visium datasets (<50k locations) should use train_size=1 with batch_size=None (official best practice). "
            "Aggressive training implements amortised inference for scalability to 100k-1M+ locations. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )
    cell2location_validation_size: Annotated[float, Field(gt=0, lt=1)] = Field(
        0.1,
        description=(
            "Fraction of data for validation set in Cell2location (required if early_stopping=True). "
            "NOTE: Official tutorial uses train_size=1 (no validation split) for standard workflows. "
            "ONLY USED BY CELL2LOCATION METHOD."
        ),
    )

    # SPOTlight specific parameters
    spotlight_n_top_genes: Annotated[int, Field(gt=0, le=5000)] = Field(
        2000,
        description=(
            "Number of top highly variable genes (HVGs) to use for SPOTlight deconvolution. "
            "ONLY USED BY SPOTLIGHT METHOD. Other methods use different gene selection strategies:\n"
            "• Cell2location: Uses permissive gene filtering (apply_gene_filtering parameter)\n"
            "• RCTD/DestVI/Stereoscope/CARD/Tangram: Use all common genes between datasets\n"
            "Default: 2000. Recommended range: 1000-3000 for standard Visium data."
        ),
    )
    # Single-value Literal: only the 'ns' model is exposed (see description).
    spotlight_nmf_model: Literal["ns"] = Field(
        "ns",
        description=(
            "NMF model type for SPOTlight. ONLY USED BY SPOTLIGHT METHOD.\n\n"
            "Currently only 'ns' (non-smooth NMF) is supported. This method produces "
            "sparser, more interpretable deconvolution results.\n\n"
            "NOTE: SPOTlight documentation mentions 'std' (standard NMF) as an option, "
            "but it is currently broken in SPOTlight (internally creates 'stdNMF' algorithm "
            "which doesn't exist in the NMF package registry). We only expose working parameters.\n\n"
            "Reference: Elosua-Bayes et al. (2021) Nucleic Acids Research."
        ),
    )
    spotlight_min_prop: Annotated[float, Field(ge=0, le=1)] = Field(
        0.01,
        description=(
            "Minimum cell type proportion threshold for SPOTlight. "
            "Cell types contributing less than this value are filtered out as noise. "
            "Official default: 0.01 (1%). "
            "Lower values = keep more cell types but more noise. "
            "Higher values = stricter filtering but may lose rare cell types. "
            "ONLY USED BY SPOTLIGHT METHOD."
        ),
    )
    spotlight_scale: bool = Field(
        True,
        description=(
            "Whether to scale/normalize data in SPOTlight. "
            "Affects gene expression scale handling. "
            "Default: True (recommended). "
            "ONLY USED BY SPOTLIGHT METHOD."
        ),
    )
    spotlight_weight_id: str = Field(
        "mean.AUC",
        description=(
            "Column name for marker gene weights in SPOTlight. "
            "Specifies which metric to use for weighting marker genes. "
            "Common values: 'mean.AUC' (default), 'median.AUC'. "
            "ONLY USED BY SPOTLIGHT METHOD."
        ),
    )

    # DestVI parameters
    destvi_n_epochs: Annotated[int, Field(gt=0)] = Field(
        2000,
        description=(
            "Number of epochs for DestVI training. "
            "Official recommendation: 2000 (minimum 1000). "
            "ONLY USED BY DESTVI METHOD."
        ),
    )
    destvi_n_hidden: int = 128
    destvi_n_latent: int = 10
    destvi_n_layers: int = 1
    destvi_dropout_rate: float = 0.1
    destvi_learning_rate: float = 1e-3

    # DestVI advanced parameters (official scvi-tools defaults)
    destvi_train_size: Annotated[float, Field(gt=0.0, le=1.0)] = Field(
        default=0.9,
        description=(
            "Fraction of data to use for training DestVI (rest for validation). "
            "Official scvi-tools default: 0.9. "
            "Lower values (0.8) provide more robust validation but less training data. "
            "ONLY USED BY DESTVI METHOD."
        ),
    )
    destvi_vamp_prior_p: Annotated[int, Field(ge=1)] = Field(
        default=15,
        description=(
            "Number of VampPrior components for DestVI. "
            "Official scvi-tools default: 15. "
            "Higher values may improve modeling of complex cell type distributions. "
            "ONLY USED BY DESTVI METHOD."
        ),
    )
    destvi_l1_reg: Annotated[float, Field(ge=0.0)] = Field(
        default=10.0,
        description=(
            "L1 regularization strength for DestVI to encourage sparsity. "
            "Official scvi-tools default: 10.0. "
            "Higher values encourage sparser cell type assignments per spot. "
            "ONLY USED BY DESTVI METHOD."
        ),
    )

    # Stereoscope parameters
    stereoscope_n_epochs: int = 150000
    stereoscope_learning_rate: float = 0.01
    stereoscope_batch_size: int = 128

    # RCTD specific parameters
    rctd_mode: Literal["full", "doublet", "multi"] = Field(
        "full",
        description=(
            "RCTD deconvolution mode (Cable et al. 2022):\n"
            "• 'doublet': Assigns 1-2 cell types per spot, classifies each as 'singlet' or 'doublet'. "
            "Recommended for HIGH-RESOLUTION spatial data (Slide-seq ~10μm, MERFISH, Visium HD)\n"
            "• 'full' (default): Assigns any number of cell types per spot. "
            "Recommended for LOW-RESOLUTION data (standard Visium 55μm spots, 100μm spacing)\n"
            "• 'multi': Extension of doublet mode using greedy algorithm to add multiple cell types. "
            "Alternative to 'full' with more constraints on cell type mixing"
        ),
    )
    # Maximum number of cores to use (1 to 16).
    max_cores: Annotated[int, Field(gt=0, le=16)] = 4
    # Confidence threshold for cell type assignment (higher = more stringent).
    rctd_confidence_threshold: Annotated[float, Field(gt=0)] = 10.0
    # Threshold for doublet detection (used in doublet/multi modes).
    rctd_doublet_threshold: Annotated[float, Field(gt=0)] = 25.0
    rctd_max_multi_types: Annotated[int, Field(ge=2, le=10)] = Field(
        4,
        description=(
            "Maximum number of cell types per spot in RCTD multi mode. "
            "Recommended: 4-6 for Visium (100μm spots), 2-3 for higher resolution. "
            "Must be less than total number of cell types in reference data."
        ),
    )

    # CARD specific parameters
    card_minCountGene: Annotated[int, Field(gt=0)] = Field(
        100,
        description="Minimum total counts per gene across all spots for CARD quality control filtering",
    )
    card_minCountSpot: Annotated[int, Field(gt=0)] = Field(
        5,
        description="Minimum number of spots where a gene must be expressed for CARD quality control",
    )
    card_sample_key: Optional[str] = Field(
        None,
        description="Optional sample/batch column name in reference data for multi-sample CARD analysis",
    )
    card_imputation: bool = Field(
        False,
        description=(
            "Enable CARD spatial imputation to create enhanced high-resolution spatial maps. "
            "CARD's unique CAR (Conditional AutoRegressive) model allows imputation at unmeasured locations, "
            "constructing refined tissue maps with arbitrarily higher resolution than the original measurement. "
            "Extremely fast: 0.4s for all genes (5816x faster than BayesSpace). "
            "Use for: Enhancing Visium to near-cellular resolution, filling tissue gaps, smoothing artifacts"
        ),
    )
    card_NumGrids: Annotated[int, Field(gt=0)] = Field(
        2000,
        description=(
            "Number of spatial grid points for CARD imputation (default: 2000). "
            "Higher values = finer spatial resolution but increased computation. "
            "Typical values: 2000 (standard), 5000 (high-res), 10000 (ultra high-res). "
            "The imputed map will have ~NumGrids locations covering the tissue area"
        ),
    )
    card_ineibor: Annotated[int, Field(gt=0)] = Field(
        10,
        description=(
            "Number of nearest neighbors for CARD spatial imputation (default: 10). "
            "Controls the spatial smoothness of imputed results. "
            "Higher values = smoother maps, lower values = preserve local variation"
        ),
    )

    # Tangram specific parameters
    tangram_n_epochs: Annotated[int, Field(gt=0)] = Field(
        1000,
        description=(
            "Number of epochs for Tangram spatial mapping. "
            "Official recommendation: 1000. "
            "ONLY USED BY TANGRAM METHOD."
        ),
    )
    tangram_mode: Literal["cells", "clusters", "constrained"] = Field(
        "cells",
        description=(
            "Tangram mapping mode. "
            "'cells': Cell-level mapping (default). "
            "'clusters': Cluster-level mapping (requires cluster_label). "
            "'constrained': Constrained optimization with target_count. "
            "Official recommendation: 'cells' for most applications. "
            "ONLY USED BY TANGRAM METHOD."
        ),
    )
    tangram_learning_rate: Annotated[float, Field(gt=0)] = Field(
        0.1,
        description=(
            "Learning rate for Tangram optimizer. "
            "Official default: 0.1. "
            "Higher values = faster convergence but less stable. "
            "Lower values = more stable but slower. "
            "ONLY USED BY TANGRAM METHOD."
        ),
    )
    tangram_density_prior: Literal["rna_count_based", "uniform"] = Field(
        "rna_count_based",
        description=(
            "Spatial density prior for Tangram. "
            "'rna_count_based': Weight by RNA counts (default, recommended). "
            "'uniform': Equal weight for all spots. "
            "Official recommendation: 'rna_count_based' for better biological interpretation. "
            "ONLY USED BY TANGRAM METHOD."
        ),
    )

    # FlashDeconv specific parameters (DEFAULT METHOD - ultra-fast, atlas-scale)
    flashdeconv_sketch_dim: Annotated[int, Field(gt=0, le=2048)] = Field(
        512,
        description=(
            "Dimension of the sketched space for FlashDeconv. "
            "Higher values preserve more information but increase computation. "
            "Default: 512 (recommended for most datasets). "
            "ONLY USED BY FLASHDECONV METHOD."
        ),
    )
    # NOTE(review): the description below mentions 'auto', but this field is
    # annotated as a strictly positive float, so the string 'auto' does not
    # satisfy the declared type. Confirm whether 'auto' should be accepted.
    flashdeconv_lambda_spatial: Annotated[float, Field(gt=0)] = Field(
        5000.0,
        description=(
            "Spatial regularization strength for FlashDeconv. "
            "Higher values encourage smoother spatial patterns. "
            "Recommended values by platform:\n"
            "• Standard Visium (55μm): 1000-10000 (default: 5000)\n"
            "• Visium HD (16μm): 5000-20000\n"
            "• Visium HD (8μm): 10000-50000\n"
            "• Visium HD (2μm): 50000-100000\n"
            "• Stereo-seq/Seq-Scope: 50000-200000\n"
            "Use 'auto' for automatic tuning (may underestimate for real data). "
            "ONLY USED BY FLASHDECONV METHOD."
        ),
    )
    flashdeconv_n_hvg: Annotated[int, Field(gt=0, le=5000)] = Field(
        2000,
        description=(
            "Number of highly variable genes to select for FlashDeconv. "
            "Default: 2000. "
            "ONLY USED BY FLASHDECONV METHOD."
        ),
    )
    flashdeconv_n_markers_per_type: Annotated[int, Field(gt=0, le=500)] = Field(
        50,
        description=(
            "Number of marker genes per cell type for FlashDeconv. "
            "Default: 50. "
            "ONLY USED BY FLASHDECONV METHOD."
        ),
    )
|
|
1748
|
+
|
|
1749
|
+
|
|
1750
|
+
class SpatialDomainParameters(BaseModel):
    """Options for identifying spatial domains.

    ``method`` picks the algorithm. The remaining fields are grouped by the
    algorithm they configure (SpaGCN, leiden/louvain clustering, STAGATE,
    GraphST), plus a few general settings.
    """

    method: Literal["spagcn", "leiden", "louvain", "stagate", "graphst"] = "spagcn"

    # Number of spatial domains to identify (1 to 50).
    n_domains: Annotated[int, Field(gt=0, le=50)] = 7

    # ----- SpaGCN -----

    # Weight given to histology in SpaGCN.
    spagcn_s: Annotated[float, Field(gt=0.0)] = 1.0
    # Area of each spot when extracting color intensity.
    spagcn_b: Annotated[int, Field(gt=0)] = 49
    # Percentage of total expression contributed by neighborhoods.
    spagcn_p: Annotated[float, Field(ge=0.0, le=1.0)] = 0.5
    # Whether to use the histology image in SpaGCN.
    spagcn_use_histology: bool = True
    # Random seed for SpaGCN.
    spagcn_random_seed: int = 100

    # ----- General clustering -----

    # Resolution for leiden/louvain clustering.
    resolution: float = 0.5
    # Whether to use highly variable genes only.
    use_highly_variable: bool = True
    # Whether to refine spatial domains using spatial smoothing.
    refine_domains: bool = True
    # Threshold for refinement: only relabel if >=threshold of neighbors
    # differ (0.5 = 50%, following SpaGCN).
    refinement_threshold: Annotated[float, Field(ge=0.0, le=1.0)] = 0.5

    # ----- leiden/louvain specific -----
    # The field default is None; the values quoted in parentheses are the
    # defaults noted by the original author.

    # Number of neighbors for clustering (default: 15).
    cluster_n_neighbors: Optional[Annotated[int, Field(gt=0)]] = None
    # Weight for spatial information (default: 0.3).
    cluster_spatial_weight: Optional[Annotated[float, Field(ge=0.0, le=1.0)]] = None
    # Resolution parameter for clustering.
    cluster_resolution: Optional[float] = None

    # ----- STAGATE -----
    # As above, None is the field default and the quoted values are the
    # defaults noted by the original author.

    # Radius cutoff for spatial neighbors (default: 150).
    stagate_rad_cutoff: Optional[float] = None
    # Learning rate (default: 0.001).
    stagate_learning_rate: Optional[float] = None
    # Weight decay (default: 0.0001).
    stagate_weight_decay: Optional[float] = None
    # Number of training epochs (default: 1000).
    stagate_epochs: Optional[int] = None
    # Dimension of output representation (default: 15).
    stagate_dim_output: Optional[int] = None
    # Random seed (default: 42).
    stagate_random_seed: Optional[int] = None

    # ----- GraphST -----

    # Whether to use GPU acceleration.
    graphst_use_gpu: bool = False
    # Clustering method for GraphST.
    graphst_clustering_method: Literal["mclust", "leiden", "louvain"] = "leiden"
    # Whether to refine domains using spatial info.
    graphst_refinement: bool = True
    # Radius for spatial refinement.
    graphst_radius: int = 50
    # Random seed for GraphST.
    graphst_random_seed: int = 42
    # Number of clusters (if None, uses n_domains).
    graphst_n_clusters: Optional[int] = None

    # ----- Timeout -----

    # Timeout in seconds (default: 600).
    timeout: Optional[int] = None
|
|
1816
|
+
|
|
1817
|
+
|
|
1818
|
+
class SpatialVariableGenesParameters(BaseModel):
    """Options for identifying spatially variable genes.

    ``method`` chooses between SpatialDE and SPARK-X. Fields prefixed
    ``spatialde_`` or ``sparkx_`` configure the respective method; the
    remaining fields are shared settings and gene filters.
    """

    # ----- Method selection -----

    # Default to SPARK-X (best accuracy).
    method: Literal["spatialde", "sparkx"] = "sparkx"

    # ----- Common parameters for all methods -----

    # Number of top spatial variable genes to return (None = all significant).
    n_top_genes: Optional[Annotated[int, Field(gt=0, le=5000)]] = None
    # Key in obsm containing spatial coordinates.
    spatial_key: str = "spatial"

    # ----- SpatialDE -----

    # Whether data is already normalized.
    spatialde_normalized: bool = True
    # Kernel function type for SpatialDE.
    spatialde_kernel: str = "SE"
    spatialde_pi0: Optional[float] = Field(
        default=None,
        gt=0.0,
        le=1.0,
        description=(
            "Prior probability of null hypothesis for SpatialDE q-value estimation. "
            "This represents the expected proportion of genes WITHOUT spatial patterns. "
            "\n\n"
            "VALUES:\n"
            "- None (default, RECOMMENDED): Uses adaptive pi0 estimation from SpatialDE\n"
            "- 0.9: Assumes 10% of genes have spatial patterns (conservative)\n"
            "- 0.5: Assumes 50% of genes have spatial patterns (moderate)\n"
            "- 0.1: Assumes 90% of genes have spatial patterns (aggressive, may increase false positives)\n"
            "\n"
            "SCIENTIFIC NOTE:\n"
            "The pi0 parameter directly affects the stringency of FDR correction. "
            "Lower pi0 values assume more genes are truly spatial, leading to more "
            "liberal q-value estimates and potentially more false positives. "
            "The default adaptive estimation (None) is recommended for most analyses "
            "as it learns pi0 from the data distribution."
        ),
    )

    # ----- SPARK-X -----

    # Percentage of total expression for filtering.
    sparkx_percentage: Annotated[float, Field(gt=0.0, le=1.0)] = 0.1
    # Minimum total counts per gene.
    sparkx_min_total_counts: Annotated[int, Field(gt=0)] = 10
    # Number of cores for parallel processing (1 to 16).
    sparkx_num_core: Annotated[int, Field(gt=0, le=16)] = 1
    # Kernel testing: "single" (faster) or "mixture" (11 kernels).
    sparkx_option: Literal["single", "mixture"] = "mixture"
    # Whether to print detailed R output.
    sparkx_verbose: bool = False

    # ----- Gene filtering -----

    # Filter mitochondrial genes (MT-*) - standard practice.
    filter_mt_genes: bool = True
    # Filter ribosomal genes (RPS*, RPL*) - optional, may remove housekeeping.
    filter_ribo_genes: bool = False
    # Test only highly variable genes - 2024 best practice for reducing
    # housekeeping dominance. Requires preprocessing with HVG detection
    # first; set to False to test all genes (not recommended).
    test_only_hvg: bool = True
    # Warn if >30% of top genes are housekeeping genes.
    warn_housekeeping: bool = True
|
|
1885
|
+
|
|
1886
|
+
|
|
1887
|
+
class CellCommunicationParameters(BaseModel):
    """Cell-cell communication analysis parameters model with explicit user control"""

    # Layout of this model (field order is preserved on purpose):
    #   method selection -> species / LR resource -> spatial switch ->
    #   cell type column -> LIANA -> expression filtering -> result size ->
    #   CellPhoneDB -> CellChat (R) -> FastCCC.
    # `species` and `cell_type_key` have no default and must be supplied.

    # ---------------------------------------------------------------
    # Basic method selection
    # ---------------------------------------------------------------
    # Accepted backends:
    #   "liana"       - LIANA+ framework (Python, supports multiple resources)
    #   "cellphonedb" - CellPhoneDB v5 (Python)
    #   "cellchat_r"  - Native R CellChat (full features with mediator
    #                   proteins & pathways)
    #   "fastccc"     - FastCCC permutation-free framework
    #                   (Nature Comm 2025, ultra-fast)
    method: Literal["liana", "cellphonedb", "cellchat_r", "fastccc"] = "liana"

    # ---------------------------------------------------------------
    # Species and resource control
    # ---------------------------------------------------------------
    # REQUIRED (no default): species of the ligand-receptor database.
    #   "human"     - human data (genes like ACTB, GAPDH - all uppercase)
    #   "mouse"     - mouse data (genes like Actb, Gapdh - capitalized)
    #   "zebrafish" - zebrafish data
    species: Literal["human", "mouse", "zebrafish"]

    # Ligand-receptor resource handed to LIANA (intended to mirror the
    # resources LIANA+ supports).  Defaults to "consensus".
    liana_resource: Literal[
        # Consensus of multiple databases (default, recommended)
        "consensus",
        # Mouse consensus database
        "mouseconsensus",
        # Baccin et al. 2019 resource
        "baccin2019",
        # CellCall database
        "cellcall",
        # CellChat database
        "cellchatdb",
        # CellLinker database
        "cellinker",
        # CellPhoneDB database (curated, stringent)
        "cellphonedb",
        # CellTalkDB database (large)
        "celltalkdb",
        # Connectome database 2020
        "connectomedb2020",
        # EMBRACE database
        "embrace",
        # Guide to Pharmacology
        "guide2pharma",
        # Human Plasma Membrane Receptome
        "hpmr",
        # iCellNet database (immune focus)
        "icellnet",
        # iTALK database
        "italk",
        # Kirouac et al. 2010
        "kirouac2010",
        # LRdb database
        "lrdb",
        # Ramilowski et al. 2015
        "ramilowski2015",
    ] = "consensus"

    # ---------------------------------------------------------------
    # Spatial analysis control
    # ---------------------------------------------------------------
    # Whether to perform spatial bivariate analysis.
    perform_spatial_analysis: bool = True

    # ---------------------------------------------------------------
    # Cell type control
    # ---------------------------------------------------------------
    # REQUIRED (no default): column holding the cell type labels; the name is
    # unified with the other tools.  The LLM is expected to infer it from the
    # metadata.  Common values: 'cell_type', 'celltype', 'leiden', 'louvain',
    # 'seurat_clusters'.
    cell_type_key: str

    # ---------------------------------------------------------------
    # LIANA specific parameters
    # ---------------------------------------------------------------
    # Local spatial metric.
    liana_local_metric: Literal["cosine", "pearson", "spearman", "jaccard"] = "cosine"
    # Global spatial metric.
    liana_global_metric: Literal["morans", "lee"] = "morans"
    # Number of permutations for LIANA (1000 minimum for publication-quality
    # p-values).
    liana_n_perms: Annotated[int, Field(gt=0)] = 1000
    # Minimum expression proportion.
    liana_nz_prop: Annotated[float, Field(gt=0.0, le=1.0)] = 0.2
    # Bandwidth for spatial connectivity; no range constraint is declared.
    liana_bandwidth: Optional[int] = None
    # Cutoff for spatial connectivity.
    liana_cutoff: Annotated[float, Field(gt=0.0, le=1.0)] = 0.1
    liana_significance_alpha: Annotated[float, Field(gt=0.0, lt=1.0)] = Field(
        default=0.05,
        description=(
            "Significance threshold (alpha) for FDR-corrected p-values in LIANA analysis.\n"
            "Default: 0.05 (standard statistical threshold).\n"
            "Use 0.01 for more stringent filtering, 0.10 for exploratory analysis.\n"
            "This controls both cluster-level (magnitude_rank) and spatial (FDR-corrected) significance."
        ),
    )

    # ---------------------------------------------------------------
    # Expression filtering parameters
    # ---------------------------------------------------------------
    # Minimum cells expressing ligand or receptor (required by LIANA for
    # statistical validity).  Zero is accepted by the constraint (ge=0).
    min_cells: Annotated[int, Field(ge=0)] = 3

    # ---------------------------------------------------------------
    # Result control
    # ---------------------------------------------------------------
    # Number of top LR pairs to include in results (chord diagrams may use
    # 50+); capped at 100.
    plot_top_pairs: Annotated[int, Field(gt=0, le=100)] = 6

    # ---------------------------------------------------------------
    # CellPhoneDB specific parameters
    # ---------------------------------------------------------------
    # Expression threshold.
    cellphonedb_threshold: Annotated[float, Field(gt=0.0, le=1.0)] = 0.1
    # Statistical permutations.
    cellphonedb_iterations: Annotated[int, Field(gt=0, le=10000)] = 1000
    # Result decimal precision.
    cellphonedb_result_precision: Annotated[int, Field(gt=0, le=5)] = 3
    # P-value significance threshold.
    cellphonedb_pvalue: Annotated[float, Field(gt=0.0, le=1.0)] = 0.05
    # Whether to use spatial microenvironments.
    cellphonedb_use_microenvironments: bool = True
    # Spatial radius for microenvironments; must be positive when given.
    cellphonedb_spatial_radius: Optional[Annotated[float, Field(gt=0.0)]] = None
    # Random seed for reproducible results.
    cellphonedb_debug_seed: Optional[int] = None

    # Multiple testing correction for CellPhoneDB.  When the minimum p-value
    # across multiple cell type pairs is used, correction is needed to control
    # the false positive rate (e.g., 7 clusters = 49 pairs → FPR 91.9% without
    # correction).
    #   "fdr_bh"     - Benjamini-Hochberg FDR (default, recommended; balances
    #                  sensitivity & specificity)
    #   "bonferroni" - Bonferroni correction (most conservative, controls FWER)
    #   "sidak"      - Šidák correction (similar to Bonferroni but more
    #                  accurate for independent tests)
    #   "none"       - No correction (NOT recommended, leads to ~92% FPR with
    #                  7 clusters)
    cellphonedb_correction_method: Literal["fdr_bh", "bonferroni", "sidak", "none"] = "fdr_bh"

    # ---------------------------------------------------------------
    # CellChat R specific parameters (only used when method="cellchat_r")
    # ---------------------------------------------------------------
    # CellChatDB category to use:
    #   "Secreted Signaling" - ligand-receptor pairs for secreted signaling
    #   "ECM-Receptor"       - extracellular matrix-receptor interactions
    #   "Cell-Cell Contact"  - direct cell-cell contact interactions
    #   "All"                - use all categories (default)
    cellchat_db_category: Literal[
        "Secreted Signaling",
        "ECM-Receptor",
        "Cell-Cell Contact",
        "All",
    ] = "All"

    # CellChat expression aggregation method:
    #   "triMean"         - Tukey's trimean (robust, default, produces fewer
    #                       interactions)
    #   "truncatedMean"   - truncated mean (more interactions, use with the
    #                       trim parameter)
    #   "thresholdedMean" / "median" - also accepted by the Literal
    cellchat_type: Literal["triMean", "truncatedMean", "thresholdedMean", "median"] = "triMean"

    # Trim proportion for truncatedMean method (0.1 = 10% truncated mean).
    cellchat_trim: Annotated[float, Field(ge=0.0, le=0.5)] = 0.1

    # Whether to consider cell population size effect in communication
    # probability.
    cellchat_population_size: bool = True

    # Minimum number of cells required in each cell group for
    # filterCommunication.
    cellchat_min_cells: Annotated[int, Field(ge=1)] = 10

    # Whether to use spatial distance constraints (for spatial data).
    cellchat_distance_use: bool = True

    # Maximum interaction/diffusion range of ligands in microns (for spatial
    # data).
    cellchat_interaction_range: Annotated[float, Field(gt=0.0)] = 250.0

    # Scale factor for distance calculation (adjust based on imaging
    # technology).
    cellchat_scale_distance: Annotated[float, Field(gt=0.0)] = 0.01

    # Number of nearest neighbors for defining contact-dependent signaling;
    # used for spatial data to determine which cells are in contact range.
    cellchat_contact_knn_k: Annotated[int, Field(ge=1)] = 6

    # Alternative to contact_knn_k: explicit distance threshold for contact
    # signaling.  If None, contact_knn_k is used instead (recommended for most
    # spatial data).
    cellchat_contact_range: Optional[Annotated[float, Field(gt=0.0)]] = None

    # CellChat spatial conversion factors (platform-specific).
    cellchat_pixel_ratio: Annotated[float, Field(gt=0.0)] = Field(
        default=0.5,
        description=(
            "Conversion factor from image pixels to micrometers (um).\n"
            "Platform-specific defaults:\n"
            " - Visium (10x): 0.5 (1 pixel ≈ 0.5 um at full resolution)\n"
            " - MERFISH: Varies by imaging setup, typically 0.1-1.0\n"
            " - Slide-seq: ~0.5 (10 um beads)\n"
            " - CosMx: 0.18 (imaging resolution)\n"
            "Used in CellChat's spatial.factors for coordinate conversion."
        ),
    )

    cellchat_spatial_tol: Annotated[float, Field(gt=0.0)] = Field(
        default=27.5,
        description=(
            "Spatial tolerance (half of spot/cell diameter) in micrometers.\n"
            "Platform-specific defaults:\n"
            " - Visium (10x): 27.5 um (spot diameter ~55um, half is ~27.5)\n"
            " - MERFISH: 5-10 um (single cell resolution)\n"
            " - Slide-seq: 5 um (10 um bead diameter / 2)\n"
            " - CosMx: 5-10 um (single cell resolution)\n"
            "Used in CellChat's spatial.factors.tol for defining spatial proximity."
        ),
    )

    # ---------------------------------------------------------------
    # FastCCC specific parameters
    # ---------------------------------------------------------------
    # FastCCC is a permutation-free framework using FFT-based convolution.
    # Reference: Nature Communications 2025 (https://github.com/Svvord/FastCCC)
    # Key advantage: ultra-fast (16M cells in minutes vs hours for permutation
    # methods).

    fastccc_single_unit_summary: Literal["Mean", "Median", "Q3", "Quantile_0.9"] = Field(
        default="Mean",
        description=(
            "Aggregation method for single-unit gene expression within cell types.\n"
            "Options:\n"
            " - 'Mean': Mean expression (default, most commonly used)\n"
            " - 'Median': Median expression (robust to outliers)\n"
            " - 'Q3': Third quartile (75th percentile)\n"
            " - 'Quantile_0.9': 90th percentile (captures high expressors)"
        ),
    )

    fastccc_complex_aggregation: Literal["Minimum", "Average"] = Field(
        default="Minimum",
        description=(
            "Aggregation method for multi-subunit protein complexes.\n"
            "Options:\n"
            " - 'Minimum': Use minimum expression (default, ensures all subunits present)\n"
            " - 'Average': Use average expression across subunits"
        ),
    )

    fastccc_lr_combination: Literal["Arithmetic", "Geometric"] = Field(
        default="Arithmetic",
        description=(
            "Method for combining ligand and receptor scores.\n"
            "Options:\n"
            " - 'Arithmetic': Arithmetic mean of L and R (default)\n"
            " - 'Geometric': Geometric mean (more conservative)"
        ),
    )

    fastccc_min_percentile: Annotated[float, Field(ge=0.0, le=1.0)] = Field(
        default=0.1,
        description=(
            "Minimum expression percentile threshold for filtering lowly expressed genes.\n"
            "Default: 0.1 (10% of cells must express the gene)"
        ),
    )

    fastccc_use_cauchy: bool = Field(
        default=True,
        description=(
            "Whether to use Cauchy combination for multi-method aggregation.\n"
            "When True: Runs multiple parameter combinations and aggregates p-values\n"
            " using Cauchy distribution (more robust, slower)\n"
            "When False: Uses single parameter set (faster)"
        ),
    )

    fastccc_pvalue_threshold: Annotated[float, Field(gt=0.0, le=1.0)] = Field(
        default=0.05,
        description="P-value threshold for identifying significant interactions.",
    )

    fastccc_use_deg: bool = Field(
        default=False,
        description=(
            "Apply differential expression gene filtering before analysis.\n"
            "When True: Only analyze differentially expressed genes (more specific)\n"
            "When False: Analyze all expressed genes (default, more comprehensive)"
        ),
    )
|
|
2147
|
+
|
|
2148
|
+
|
|
2149
|
+
class EnrichmentParameters(BaseModel):
    """Parameters for gene set enrichment analysis"""

    # Unknown keys are rejected at validation time (extra="forbid").
    model_config = ConfigDict(extra="forbid")

    # REQUIRED (no default): species used for gene set matching.
    #   "human"     - human data (genes like CD5L, PTPRC - all uppercase)
    #   "mouse"     - mouse data (genes like Cd5l, Ptprc - capitalize format)
    #   "zebrafish" - zebrafish data
    species: Literal["human", "mouse", "zebrafish"]

    # Enrichment method; defaults to "spatial_enrichmap".
    method: Literal[
        "spatial_enrichmap",
        "pathway_gsea",
        "pathway_ora",
        "pathway_enrichr",
        "pathway_ssgsea",
    ] = "spatial_enrichmap"

    # Gene sets to analyze: either a list of strings or a mapping from a
    # name to a list of strings.
    gene_sets: Optional[Union[list[str], dict[str, list[str]]]] = None
    # Names for gene signatures (a single name or a list of names).
    score_keys: Optional[Union[str, list[str]]] = None

    # Gene set database - choose a species-appropriate option.
    gene_set_database: Optional[
        Literal[
            # Default (auto-adapts to species)
            "GO_Biological_Process",
            # GO molecular function terms
            "GO_Molecular_Function",
            # GO cellular component terms
            "GO_Cellular_Component",
            # KEGG pathways (species-specific: human=2021, mouse=2019)
            "KEGG_Pathways",
            # Reactome pathway database (2022 version)
            "Reactome_Pathways",
            # MSigDB hallmark gene sets (2020 version)
            "MSigDB_Hallmark",
            # Cell type marker genes
            "Cell_Type_Markers",
        ]
    ] = "GO_Biological_Process"

    # ----- Spatial parameters (for spatial_enrichmap) -----
    # Key for spatial coordinates.
    spatial_key: str = "spatial"
    # Number of spatial neighbors.
    n_neighbors: Annotated[int, Field(gt=0)] = 6
    # Whether to perform spatial smoothing.
    smoothing: bool = True
    # Whether to correct for spatial covariates.
    correct_spatial_covariates: bool = True

    # ----- Analysis parameters -----
    # Column for batch-wise normalization.
    batch_key: Optional[str] = None
    # Minimum / maximum genes in a gene set.  No cross-field check that
    # min_genes <= max_genes is declared in this model.
    min_genes: Annotated[int, Field(gt=0)] = 10
    max_genes: Annotated[int, Field(gt=0)] = 500

    # ----- Statistical parameters -----
    # P-value cutoff (strictly between 0 and 1).
    pvalue_cutoff: Annotated[float, Field(gt=0.0, lt=1.0)] = 0.05
    # Multiple testing correction.
    adjust_method: Literal["bonferroni", "fdr", "none"] = "fdr"
    # Number of permutations for GSEA.
    n_permutations: Annotated[int, Field(gt=0)] = 1000
|
|
2208
|
+
|
|
2209
|
+
|
|
2210
|
+
class CNVParameters(BaseModel):
    """Copy Number Variation (CNV) analysis parameters model"""

    # `reference_key` and `reference_categories` are required; every other
    # field has a default.  Fields prefixed with `numbat_` are the
    # Numbat-specific parameters.

    # ----- Method selection -----
    method: Literal["infercnvpy", "numbat"] = Field(
        default="infercnvpy",
        description=(
            "CNV analysis method. 'infercnvpy': expression-based (default), "
            "'numbat': haplotype-aware (requires allele data)"
        ),
    )

    # ----- Reference cell specification (both required) -----
    reference_key: str = Field(
        ...,
        description=(
            "Column name in adata.obs containing cell type or cluster labels "
            "for identifying reference (normal) cells. Common values: "
            "'cell_type', 'leiden', 'louvain', 'seurat_clusters'"
        ),
    )
    reference_categories: list[str] = Field(
        ...,
        description=(
            "List of cell types/clusters to use as reference (normal) cells. "
            "These should be non-malignant cells like immune cells, fibroblasts, etc. "
            "Example: ['T cells', 'B cells', 'Macrophages']"
        ),
    )

    # ----- infercnvpy parameters -----
    window_size: Annotated[int, Field(gt=0, le=500)] = Field(
        default=100,
        description="Number of genes for CNV averaging window (default: 100)",
    )
    step: Annotated[int, Field(gt=0, le=100)] = Field(
        default=10,
        description="Step size for sliding window (default: 10)",
    )

    # ----- Analysis options -----
    exclude_chromosomes: Optional[list[str]] = Field(
        default=None,
        description="Chromosomes to exclude from analysis (e.g., ['chrX', 'chrY', 'chrM'])",
    )
    # Optional: None is accepted in addition to a positive float.
    dynamic_threshold: Optional[float] = Field(
        default=1.5,
        gt=0.0,
        description="Threshold for dynamic CNV calling (default: 1.5)",
    )

    # ----- Clustering and visualization options (infercnvpy) -----
    cluster_cells: bool = Field(
        default=False,
        description="Whether to cluster cells by CNV pattern",
    )
    dendrogram: bool = Field(
        default=False,
        description="Whether to compute hierarchical clustering dendrogram",
    )

    # ----- Numbat-specific parameters -----
    numbat_genome: Literal["hg38", "hg19", "mm10", "mm39"] = Field(
        default="hg38",
        description="Reference genome for Numbat (default: hg38)",
    )
    numbat_allele_data_key: str = Field(
        default="allele_counts",
        description="Layer name in adata containing allele count data",
    )
    numbat_t: Annotated[float, Field(gt=0.0, le=1.0)] = Field(
        default=0.15,
        description="Transition probability threshold (default: 0.15)",
    )
    numbat_max_entropy: Annotated[float, Field(gt=0.0, le=1.0)] = Field(
        default=0.8,
        description=(
            "Maximum entropy threshold. Use 0.8 for spatial data, "
            "0.5 for scRNA-seq (default: 0.8)"
        ),
    )
    numbat_min_cells: Annotated[int, Field(gt=0)] = Field(
        default=10,
        description="Minimum cells per CNV event (default: 10)",
    )
    numbat_ncores: Annotated[int, Field(gt=0, le=16)] = Field(
        default=1,
        description="Number of cores for parallel processing (default: 1)",
    )
    numbat_skip_nj: bool = Field(
        default=False,
        description="Skip neighbor-joining tree reconstruction (default: False)",
    )
|
|
2296
|
+
|
|
2297
|
+
|
|
2298
|
+
class RegistrationParameters(BaseModel):
    """Spatial registration parameters for aligning multiple tissue slices."""

    # Every field has a default.  `paste_*` fields are PASTE-specific,
    # `stalign_*` fields are STalign-specific, `use_gpu` is common to both.

    method: Literal["paste", "stalign"] = Field(
        default="paste",
        description=(
            "Registration method. 'paste': Probabilistic Alignment of ST Experiments "
            "(optimal transport-based, recommended). 'stalign': STalign diffeomorphic "
            "mapping (LDDMM-based, for complex deformations)."
        ),
    )
    # Non-negative when given; None leaves the choice to the consumer.
    reference_idx: Optional[int] = Field(
        default=None,
        ge=0,
        description="Index of reference slice (0-indexed). If None, uses first slice.",
    )

    # ----- PASTE-specific parameters -----
    # The constraint is (0, 1]: zero itself is rejected.
    paste_alpha: Annotated[float, Field(gt=0, le=1)] = Field(
        default=0.1,
        description=(
            "Spatial regularization parameter for PASTE (0-1). "
            "Higher values give more weight to spatial coordinates vs expression. "
            "Default: 0.1 (expression-dominated alignment)."
        ),
    )
    paste_n_components: Annotated[int, Field(gt=0, le=100)] = Field(
        default=30,
        description="Number of PCA components for PASTE center alignment (default: 30).",
    )
    paste_numItermax: Annotated[int, Field(gt=0, le=1000)] = Field(
        default=200,
        description="Maximum iterations for optimal transport solver (default: 200).",
    )

    # ----- STalign-specific parameters -----
    # No positivity constraint is declared on the two dimensions.
    stalign_image_size: tuple[int, int] = Field(
        default=(128, 128),
        description="Image size for STalign rasterization (height, width).",
    )
    stalign_niter: Annotated[int, Field(gt=0, le=500)] = Field(
        default=50,
        description="Number of LDDMM iterations for STalign (default: 50).",
    )
    stalign_a: Annotated[float, Field(gt=0)] = Field(
        default=500.0,
        description="Regularization parameter 'a' for STalign (default: 500).",
    )
    stalign_use_expression: bool = Field(
        default=True,
        description="Use gene expression for STalign intensity (vs uniform).",
    )

    # ----- Common parameters -----
    use_gpu: bool = Field(
        default=False,
        description="Use GPU acceleration (PASTE with PyTorch backend, STalign).",
    )
|
|
2356
|
+
|
|
2357
|
+
|
|
2358
|
+
class ConditionComparisonParameters(BaseModel):
    """Parameters for multi-sample condition comparison analysis.

    This tool compares gene expression between experimental conditions (e.g., Treatment vs Control)
    across multiple biological samples, using proper statistical methods that account for
    sample-level variation.

    Key difference from find_markers:
    - find_markers: Compares cell types/clusters WITHIN a dataset (e.g., T cell vs B cell)
    - compare_conditions: Compares CONDITIONS ACROSS samples (e.g., Treatment vs Control)
    """

    # ----- Required parameters (declared with `...`, i.e. no default) -----
    condition_key: str = Field(
        ...,
        description=(
            "Column name in adata.obs containing experimental conditions. "
            "Examples: 'treatment', 'condition', 'group', 'disease_state'"
        ),
    )

    condition1: str = Field(
        ...,
        description=(
            "First condition for comparison (typically the experimental/treatment group). "
            "Example: 'Treatment', 'Disease', 'Tumor'"
        ),
    )

    condition2: str = Field(
        ...,
        description=(
            "Second condition for comparison (typically the control/reference group). "
            "Example: 'Control', 'Healthy', 'Normal'"
        ),
    )

    sample_key: str = Field(
        ...,
        description=(
            "Column name in adata.obs identifying biological replicates/samples. "
            "This is CRITICAL for proper statistical inference - cells from the same sample "
            "are not independent observations. "
            "Examples: 'sample_id', 'patient_id', 'replicate', 'batch'"
        ),
    )

    # ----- Optional parameters -----
    cell_type_key: Optional[str] = Field(
        default=None,
        description=(
            "Column name in adata.obs for cell type annotations. "
            "If provided, differential expression is performed separately for each cell type, "
            "enabling cell type-specific condition effects. "
            "Examples: 'cell_type', 'leiden', 'cell_type_tangram'"
        ),
    )

    # Only 'pseudobulk' is accepted by the Literal; the other methods named
    # in the description are listed there as not yet implemented.
    method: Literal["pseudobulk"] = Field(
        default="pseudobulk",
        description=(
            "Method for differential expression analysis.\n"
            "• 'pseudobulk' (default): Aggregate cells by sample, then use DESeq2\n"
            " - Best practice for multi-sample studies\n"
            " - Properly accounts for biological variation\n"
            " - Requires at least 2 samples per condition\n"
            "Future methods (not yet implemented):\n"
            "• 'cside': Cell type-Specific Inference of DE (from spacexr)\n"
            "• 'despace': Differential Spatial Patterns (from DESpace)"
        ),
    )

    n_top_genes: Annotated[int, Field(gt=0, le=500)] = Field(
        default=50,
        description="Number of top differentially expressed genes to return per comparison.",
    )

    min_cells_per_sample: Annotated[int, Field(gt=0)] = Field(
        default=10,
        description=(
            "Minimum number of cells per sample to include in analysis. "
            "Samples with fewer cells are excluded to ensure reliable aggregation."
        ),
    )

    # NOTE(review): the constraint (gt=0) admits 1 although the description
    # states that DESeq2 needs at least 2 samples per group - confirm whether
    # the consumer handles a value of 1.
    min_samples_per_condition: Annotated[int, Field(gt=0)] = Field(
        default=2,
        description=(
            "Minimum number of samples required per condition. "
            "DESeq2 requires at least 2 samples per group for variance estimation."
        ),
    )

    padj_threshold: Annotated[float, Field(gt=0, lt=1)] = Field(
        default=0.05,
        description="Adjusted p-value threshold for significance (default: 0.05).",
    )

    log2fc_threshold: Annotated[float, Field(ge=0)] = Field(
        default=0.0,
        description=(
            "Minimum absolute log2 fold change threshold. "
            "Set to 0 for no filtering, or e.g., 1.0 for 2-fold change minimum."
        ),
    )
|