chatspatial 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. chatspatial/__init__.py +11 -0
  2. chatspatial/__main__.py +141 -0
  3. chatspatial/cli/__init__.py +7 -0
  4. chatspatial/config.py +53 -0
  5. chatspatial/models/__init__.py +85 -0
  6. chatspatial/models/analysis.py +513 -0
  7. chatspatial/models/data.py +2462 -0
  8. chatspatial/server.py +1763 -0
  9. chatspatial/spatial_mcp_adapter.py +720 -0
  10. chatspatial/tools/__init__.py +3 -0
  11. chatspatial/tools/annotation.py +1903 -0
  12. chatspatial/tools/cell_communication.py +1603 -0
  13. chatspatial/tools/cnv_analysis.py +605 -0
  14. chatspatial/tools/condition_comparison.py +595 -0
  15. chatspatial/tools/deconvolution/__init__.py +402 -0
  16. chatspatial/tools/deconvolution/base.py +318 -0
  17. chatspatial/tools/deconvolution/card.py +244 -0
  18. chatspatial/tools/deconvolution/cell2location.py +326 -0
  19. chatspatial/tools/deconvolution/destvi.py +144 -0
  20. chatspatial/tools/deconvolution/flashdeconv.py +101 -0
  21. chatspatial/tools/deconvolution/rctd.py +317 -0
  22. chatspatial/tools/deconvolution/spotlight.py +216 -0
  23. chatspatial/tools/deconvolution/stereoscope.py +109 -0
  24. chatspatial/tools/deconvolution/tangram.py +135 -0
  25. chatspatial/tools/differential.py +625 -0
  26. chatspatial/tools/embeddings.py +298 -0
  27. chatspatial/tools/enrichment.py +1863 -0
  28. chatspatial/tools/integration.py +807 -0
  29. chatspatial/tools/preprocessing.py +723 -0
  30. chatspatial/tools/spatial_domains.py +808 -0
  31. chatspatial/tools/spatial_genes.py +836 -0
  32. chatspatial/tools/spatial_registration.py +441 -0
  33. chatspatial/tools/spatial_statistics.py +1476 -0
  34. chatspatial/tools/trajectory.py +495 -0
  35. chatspatial/tools/velocity.py +405 -0
  36. chatspatial/tools/visualization/__init__.py +155 -0
  37. chatspatial/tools/visualization/basic.py +393 -0
  38. chatspatial/tools/visualization/cell_comm.py +699 -0
  39. chatspatial/tools/visualization/cnv.py +320 -0
  40. chatspatial/tools/visualization/core.py +684 -0
  41. chatspatial/tools/visualization/deconvolution.py +852 -0
  42. chatspatial/tools/visualization/enrichment.py +660 -0
  43. chatspatial/tools/visualization/integration.py +205 -0
  44. chatspatial/tools/visualization/main.py +164 -0
  45. chatspatial/tools/visualization/multi_gene.py +739 -0
  46. chatspatial/tools/visualization/persistence.py +335 -0
  47. chatspatial/tools/visualization/spatial_stats.py +469 -0
  48. chatspatial/tools/visualization/trajectory.py +639 -0
  49. chatspatial/tools/visualization/velocity.py +411 -0
  50. chatspatial/utils/__init__.py +115 -0
  51. chatspatial/utils/adata_utils.py +1372 -0
  52. chatspatial/utils/compute.py +327 -0
  53. chatspatial/utils/data_loader.py +499 -0
  54. chatspatial/utils/dependency_manager.py +462 -0
  55. chatspatial/utils/device_utils.py +165 -0
  56. chatspatial/utils/exceptions.py +185 -0
  57. chatspatial/utils/image_utils.py +267 -0
  58. chatspatial/utils/mcp_utils.py +137 -0
  59. chatspatial/utils/path_utils.py +243 -0
  60. chatspatial/utils/persistence.py +78 -0
  61. chatspatial/utils/scipy_compat.py +143 -0
  62. chatspatial-1.1.0.dist-info/METADATA +242 -0
  63. chatspatial-1.1.0.dist-info/RECORD +67 -0
  64. chatspatial-1.1.0.dist-info/WHEEL +5 -0
  65. chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
  66. chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
  67. chatspatial-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2462 @@
1
+ """
2
+ Data models for spatial transcriptomics analysis.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Annotated, Literal, Optional, Union
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
10
+ from typing_extensions import Self
11
+
12
+
13
class ColumnInfo(BaseModel):
    """Metadata column information for dataset profiling"""

    # Column label.
    name: str

    # Coarse column kind; only these two literal values validate.
    dtype: Literal["categorical", "numerical"]

    # Number of distinct values in the column.
    n_unique: int

    # Example values; meant for categorical columns, omitted otherwise.
    sample_values: Optional[list[str]] = Field(default=None)

    # Value range as a 2-tuple of floats; meant for numerical columns.
    # NOTE(review): presumably ordered (min, max) - confirm against the
    # code that builds these profiles.
    range: Optional[tuple[float, float]] = Field(default=None)
21
+
22
+
23
class SpatialDataset(BaseModel):
    """Spatial transcriptomics dataset model with comprehensive metadata profile"""

    # ---- Identity ----
    id: str
    name: str
    # Source platform / file format; restricted to the literals listed here.
    data_type: Literal[
        "10x_visium",
        "slide_seq",
        "merfish",
        "seqfish",
        "other",
        "h5ad",
        "auto",
    ]
    description: Optional[str] = Field(default=None)

    # ---- Basic statistics (all default to "nothing known yet") ----
    n_cells: int = Field(default=0)
    n_genes: int = Field(default=0)
    spatial_coordinates_available: bool = Field(default=False)
    tissue_image_available: bool = Field(default=False)

    # ---- Metadata profiles: the structure is left for the LLM to interpret ----
    # Cell-level metadata columns.
    obs_columns: Optional[list[ColumnInfo]] = Field(default=None)
    # Gene-level metadata columns.
    var_columns: Optional[list[ColumnInfo]] = Field(default=None)
    # Keys of the multi-dimensional data entries.
    obsm_keys: Optional[list[str]] = Field(default=None)
    # Keys of the unstructured data entries.
    uns_keys: Optional[list[str]] = Field(default=None)

    # ---- Gene expression profiles ----
    top_highly_variable_genes: Optional[list[str]] = Field(default=None)
    top_expressed_genes: Optional[list[str]] = Field(default=None)
48
+
49
+
50
class PreprocessingParameters(BaseModel):
    """Preprocessing parameters model"""

    # The class docstring above is intentionally left unchanged: pydantic
    # publishes it as the schema description of this model.
    #
    # This model groups every user-tunable knob of the preprocessing step:
    # QC filtering and subsampling, mitochondrial/ribosomal gene handling,
    # normalization and scaling, SCTransform and scVI options, the obs/obsm
    # key names, and the neighbor-graph / clustering settings.

    # ``cluster_key`` is declared with the alias ``clustering_key``. Without
    # ``populate_by_name`` pydantic only accepts the alias on input, so a
    # caller passing ``cluster_key=...`` had the value silently discarded as
    # an ignored extra and got the default "leiden" instead. Accept both
    # spellings; the generated JSON schema is unaffected.
    model_config = ConfigDict(populate_by_name=True)

    # Data filtering and subsampling parameters (user controlled)
    filter_genes_min_cells: Optional[Annotated[int, Field(gt=0)]] = (
        3  # Filter genes expressed in < N cells
    )
    filter_cells_min_genes: Optional[Annotated[int, Field(gt=0)]] = (
        30  # Filter cells expressing < N genes
    )
    subsample_spots: Optional[Annotated[int, Field(gt=0, le=50000)]] = (
        None  # Subsample to N spots (None = no subsampling)
    )
    subsample_genes: Optional[Annotated[int, Field(gt=0, le=50000)]] = (
        None  # Keep top N variable genes (None = keep all filtered genes)
    )
    subsample_random_seed: int = 42  # Random seed for subsampling

    # ========== Mitochondrial and Ribosomal Gene Filtering ==========
    filter_mito_pct: Optional[float] = Field(
        default=20.0,
        ge=0.0,
        le=100.0,
        description=(
            "Filter spots/cells with mitochondrial percentage above this threshold.\n\n"
            "DEFAULT: 20.0 (remove spots with >20% mitochondrial reads)\n\n"
            "RATIONALE:\n"
            "High mitochondrial content often indicates cell stress, damage, or apoptosis.\n"
            "Damaged cells release cytoplasmic mRNA while retaining mitochondrial transcripts.\n\n"
            "RECOMMENDED VALUES:\n"
            "• 20.0 (default): Standard threshold for most tissues\n"
            "• 5-10: Stringent filtering for high-quality data\n"
            "• 30-50: Relaxed for tissues with naturally high mito (muscle, neurons)\n"
            "• None: Disable filtering (not recommended)\n\n"
            "TISSUE-SPECIFIC CONSIDERATIONS:\n"
            "• Brain: White matter naturally has higher mito% than gray matter\n"
            "• Muscle/Heart: High mito% is biologically normal\n"
            "• Tumor samples: May have elevated mito% due to metabolic changes\n\n"
            "REFERENCE:\n"
            "OSTA Book: lmweber.org/OSTA/pages/seq-quality-control.html"
        ),
    )
    remove_mito_genes: bool = Field(
        default=True,
        description=(
            "Remove mitochondrial genes (MT-*, mt-*) before HVG selection.\n\n"
            "DEFAULT: True (recommended for most analyses)\n\n"
            "RATIONALE:\n"
            "Mitochondrial genes can dominate HVG selection due to high expression\n"
            "and technical variation, masking biologically relevant genes.\n\n"
            "WHEN TO ENABLE (True):\n"
            "• Standard spatial transcriptomics analysis\n"
            "• Clustering and cell type identification\n"
            "• Trajectory analysis\n\n"
            "WHEN TO DISABLE (False):\n"
            "• Studying mitochondrial biology or metabolism\n"
            "• Analyzing mitochondrial heteroplasmy\n"
            "• When mito genes are biologically relevant to your question\n\n"
            "NOTE: Genes are only excluded from HVG selection, not removed from data.\n"
            "They remain available in adata.raw for downstream analyses."
        ),
    )
    remove_ribo_genes: bool = Field(
        default=False,
        description=(
            "Remove ribosomal genes (RPS*, RPL*, Rps*, Rpl*) before HVG selection.\n\n"
            "DEFAULT: False (ribosomal genes often carry biological signal)\n\n"
            "RATIONALE:\n"
            "Ribosomal genes are highly expressed housekeeping genes. While they\n"
            "add noise in some analyses, they can be informative for cell state.\n\n"
            "WHEN TO ENABLE (True):\n"
            "• When ribosomal genes dominate your HVG list\n"
            "• For cleaner clustering focused on cell type markers\n"
            "• Following certain published pipelines that recommend it\n\n"
            "WHEN TO KEEP DISABLED (False):\n"
            "• Standard analyses (ribosomal content varies by cell type)\n"
            "• Studying translation or ribosome biogenesis\n"
            "• When unsure - ribosomal genes rarely cause problems"
        ),
    )

    # Normalization and scaling parameters
    normalization: Literal["log", "sct", "pearson_residuals", "none", "scvi"] = Field(
        default="log",
        description=(
            "Normalization method for gene expression data.\n\n"
            "AVAILABLE OPTIONS:\n"
            "• 'log' (default): Standard log(x+1) normalization after library size correction. "
            "Robust and widely used for most analyses.\n"
            "• 'sct': SCTransform v2 variance-stabilizing normalization via R's sctransform package. "
            "Best for raw UMI counts from 10x platforms. Based on regularized negative binomial regression.\n"
            "• 'pearson_residuals': Analytic Pearson residuals (scanpy built-in, similar to SCTransform). "
            "Requires raw integer counts and scanpy>=1.9.0. Faster than SCTransform with similar results.\n"
            "• 'none': Skip normalization. Use when data is already pre-normalized.\n"
            "• 'scvi': Deep learning-based normalization using scVI variational autoencoder. "
            "Learns a latent representation (X_scvi) that replaces PCA. Best for batch correction and denoising.\n\n"
            "REQUIREMENTS:\n"
            "• sct: R with sctransform package (R -e 'install.packages(\"sctransform\")') + rpy2\n"
            "• pearson_residuals: Raw count data (integers only), scanpy>=1.9.0\n"
            "• scvi: scvi-tools package (pip install scvi-tools), raw count data\n"
            "• none: Data should already be normalized (will warn if raw counts detected)\n\n"
            "RECOMMENDATIONS:\n"
            "• For raw Visium/Xenium/MERFISH data: 'sct', 'pearson_residuals', or 'log'\n"
            "• For Seurat workflow compatibility: 'sct' (SCTransform v2)\n"
            "• For speed with similar results: 'pearson_residuals'\n"
            "• For pre-processed data: 'none'\n"
            "• For batch effect correction and denoising: 'scvi' (deep learning-based)"
        ),
    )
    scale: bool = Field(
        default=False,
        description=(
            "Scale gene expression to unit variance before PCA.\n\n"
            "DEFAULT: False (following Scanpy spatial transcriptomics best practices)\n\n"
            "RATIONALE:\n"
            "The standard Scanpy spatial transcriptomics tutorials do NOT include scaling:\n"
            " normalize_total → log1p → HVG selection → PCA\n"
            "Scaling is omitted because log-normalization already stabilizes variance.\n\n"
            "WHEN TO ENABLE (scale=True):\n"
            "• Using methods that explicitly require scaled input (e.g., GraphST)\n"
            "• When gene expression magnitudes vary dramatically\n"
            "• For compatibility with Seurat's ScaleData() workflow\n\n"
            "WHEN TO KEEP DISABLED (scale=False):\n"
            "• Standard Visium/spatial analysis with Scanpy/Squidpy\n"
            "• Using SCTransform normalization (already variance-stabilized)\n"
            "• Using Pearson residuals normalization\n\n"
            "REFERENCE:\n"
            "Scanpy spatial tutorial: scanpy-tutorials.readthedocs.io/en/latest/spatial/"
        ),
    )
    n_hvgs: Annotated[int, Field(gt=0, le=5000)] = 2000
    n_pcs: Annotated[int, Field(gt=0, le=100)] = 30

    # ========== Normalization Control Parameters ==========
    normalize_target_sum: Optional[float] = Field(
        default=None,  # Adaptive default - uses median counts
        ge=1.0,  # Must be positive if specified
        le=1e8,  # Reasonable upper bound
        description=(
            "Target sum for total count normalization per cell/spot. "
            "Controls the library size after normalization. "
            "\n"
            "RECOMMENDED VALUES BY TECHNOLOGY:\n"
            "• None (default): Uses median of total counts - most adaptive, recommended for unknown data\n"
            "• 1e4 (10,000): Standard for 10x Visium spatial transcriptomics\n"
            "• 1e6 (1,000,000): CPM normalization, standard for MERFISH/CosMx/Xenium\n"
            "• Custom value: Match to your expected counts per cell/spot\n"
            "\n"
            "DECISION GUIDE:\n"
            "- Multi-cellular spots (Visium): Use 1e4\n"
            "- Single-cell imaging (MERFISH, Xenium, CosMx): Use 1e6\n"
            "- High-depth sequencing: Consider 1e5 or higher\n"
            "- Low-depth/targeted panels: Consider 1e3-1e4\n"
            "- Cross-sample integration: Use same value for all samples\n"
            "- Spatial domain analysis: Consider skipping normalization (None)\n"
            "\n"
            "SCIENTIFIC RATIONALE:\n"
            "This parameter scales all cells/spots to have the same total count, "
            "removing technical variation due to sequencing depth or capture efficiency. "
            "The choice affects the magnitude of normalized expression values and "
            "can influence downstream analyses like HVG selection and clustering."
        ),
    )

    scale_max_value: Optional[float] = Field(
        default=10.0,
        ge=1.0,  # Must be positive if specified
        le=100.0,  # Reasonable upper bound
        description=(
            "Maximum value for clipping after scaling to unit variance (in standard deviations). "
            "Prevents extreme outliers from dominating downstream analyses. "
            "\n"
            "RECOMMENDED VALUES:\n"
            "• 10.0 (default): Standard in single-cell field, balances outlier control with data preservation\n"
            "• None: No clipping - preserves all variation, use for high-quality data\n"
            "• 5.0-8.0: More aggressive clipping for noisy data\n"
            "• 15.0-20.0: Less aggressive for clean imaging data\n"
            "\n"
            "DECISION GUIDE BY DATA TYPE:\n"
            "- Standard scRNA-seq or Visium: 10.0\n"
            "- High-quality imaging (MERFISH/Xenium): 15.0 or None\n"
            "- Noisy/low-quality data: 5.0-8.0\n"
            "- Exploratory analysis: Start with 10.0\n"
            "- Final analysis: Consider None to preserve all variation\n"
            "\n"
            "TECHNICAL DETAILS:\n"
            "After scaling each gene to zero mean and unit variance, "
            "values exceeding ±max_value standard deviations are clipped. "
            "This prevents a few extreme values from dominating PCA and clustering. "
            "Lower values increase robustness but may remove biological signal."
        ),
    )

    # SCTransform preprocessing parameters (requires R + sctransform package via rpy2)
    # Installation: R -e 'install.packages("sctransform")' && pip install rpy2
    sct_var_features_n: int = Field(
        default=3000,
        ge=100,
        le=10000,
        description="Number of highly variable features for SCTransform (default: 3000)",
    )
    sct_method: Literal["offset", "fix-slope"] = Field(
        default="fix-slope",
        description=(
            "SCTransform regularization method:\n"
            "• 'fix-slope' (default, v2): Fixed slope regularization, more robust and recommended.\n"
            "• 'offset': Original offset model from v1."
        ),
    )
    sct_exclude_poisson: bool = Field(
        default=True,
        description="Exclude Poisson genes from regularization (v2 default: True). "
        "Improves robustness by excluding genes where variance ≤ mean.",
    )
    sct_n_cells: Optional[int] = Field(
        default=5000,
        ge=100,
        description="Number of cells to subsample for parameter estimation (default: 5000). "
        "Set to None to use all cells (slower but may be more accurate for small datasets).",
    )

    # scVI preprocessing parameters - architecture
    use_scvi_preprocessing: bool = False  # Whether to use scVI for preprocessing
    scvi_n_hidden: int = 128
    scvi_n_latent: int = 10
    scvi_n_layers: int = 1
    scvi_dropout_rate: float = 0.1
    scvi_gene_likelihood: Literal["zinb", "nb", "poisson"] = "zinb"

    # scVI preprocessing parameters - training (user-configurable)
    scvi_max_epochs: Annotated[int, Field(gt=0, le=2000)] = Field(
        default=400,
        description=(
            "Maximum number of training epochs for scVI. "
            "Default 400 is sufficient for most datasets with early stopping enabled. "
            "Increase to 600-800 for large/complex datasets without early stopping."
        ),
    )
    scvi_early_stopping: bool = Field(
        default=True,
        description=(
            "Whether to enable early stopping based on validation ELBO. "
            "STRONGLY RECOMMENDED: Prevents overfitting and reduces training time. "
            "Set to False only for debugging or when you need exact epoch control."
        ),
    )
    scvi_early_stopping_patience: Annotated[int, Field(gt=0, le=100)] = Field(
        default=20,
        description=(
            "Number of epochs to wait for validation improvement before stopping. "
            "Default 20 balances convergence detection with training stability. "
            "Increase to 30-50 for noisy data, decrease to 10-15 for faster training."
        ),
    )
    scvi_train_size: Annotated[float, Field(gt=0.5, le=1.0)] = Field(
        default=0.9,
        description=(
            "Fraction of data used for training (rest for validation). "
            "Default 0.9 (90% train, 10% validation) is standard practice. "
            "Use 1.0 to disable validation (NOT RECOMMENDED - no early stopping)."
        ),
    )

    # Key naming parameters (configurable hard-coded keys)
    # Populated from either "cluster_key" or its alias "clustering_key"
    # (see model_config above).
    cluster_key: str = Field(
        "leiden", alias="clustering_key"
    )  # Key name for storing clustering results
    spatial_key: Optional[str] = Field(
        default=None,
        description="Spatial coordinate key in obsm (auto-detected if None)",
    )  # Changed from hardcoded "spatial" to allow auto-detection
    batch_key: str = "batch"  # Key name for batch information in obs

    # User-controllable parameters (scientifically-informed defaults)
    n_neighbors: Annotated[int, Field(gt=2, le=100)] = Field(
        default=15,
        description=(
            "Number of neighbors for k-NN graph construction. "
            "Default 15 aligns with Scanpy industry standard and UMAP developer recommendations (10-15 range). "
            "Larger values (20-50) preserve more global structure, smaller values (5-10) emphasize local patterns. "
            "For spatial transcriptomics: 15 captures meaningful tissue neighborhoods in both Visium (55μm) and Visium HD (2μm) data."
        ),
    )
    clustering_resolution: Annotated[float, Field(gt=0.1, le=2.0)] = Field(
        default=1.0,
        description=(
            "Leiden clustering resolution parameter controlling clustering coarseness. "
            "Higher values (1.5-2.0) produce more numerous, smaller clusters; "
            "lower values (0.2-0.5) produce fewer, broader clusters. "
            "Common values: 0.25, 0.5, 1.0. Default 1.0 matches scanpy standard and works well for most spatial datasets."
        ),
    )
342
+
343
+
344
class DifferentialExpressionParameters(BaseModel):
    """Differential expression analysis parameters model.

    This model encapsulates all parameters for differential expression analysis,
    following the unified (data_id, ctx, params) signature pattern.
    """

    # Required: the only field without a default.
    group_key: str = Field(
        default=...,
        description=(
            "Column name in adata.obs for grouping cells/spots. "
            "Common values: 'leiden', 'louvain', 'cell_type', 'seurat_clusters'"
        ),
    )

    # Optional pair of groups selecting the comparison to run.
    group1: Optional[str] = Field(
        default=None,
        description=(
            "First group for comparison. If None, find markers for all groups "
            "(one-vs-rest comparison for each group)."
        ),
    )

    group2: Optional[str] = Field(
        default=None,
        description=(
            "Second group for comparison. If None or 'rest', compare group1 against "
            "all other cells. Only used when group1 is specified."
        ),
    )

    # Statistical test; validation rejects anything outside this list.
    method: Literal[
        "wilcoxon",
        "t-test",
        "t-test_overestim_var",
        "logreg",
        "pydeseq2",
    ] = Field(
        default="wilcoxon",
        description=(
            "Statistical method for differential expression analysis.\n"
            "• 'wilcoxon' (default): Wilcoxon rank-sum test, robust to outliers\n"
            "• 't-test': Standard t-test, assumes normal distribution\n"
            "• 't-test_overestim_var': t-test with overestimated variance\n"
            "• 'logreg': Logistic regression\n"
            "• 'pydeseq2': DESeq2 pseudobulk method (requires sample_key for aggregation)\n"
            " - More accurate for multi-sample studies\n"
            " - Accounts for biological replicates and batch effects\n"
            " - Requires: pip install pydeseq2"
        ),
    )

    # Not cross-validated against ``method`` in this model; the description
    # states the pydeseq2 requirement for the caller.
    sample_key: Optional[str] = Field(
        default=None,
        description=(
            "Column name in adata.obs for sample/replicate identifier.\n"
            "REQUIRED for 'pydeseq2' method to perform pseudobulk aggregation.\n"
            "Common values: 'sample', 'patient_id', 'batch', 'replicate'\n"
            "Each unique value becomes a pseudobulk sample by summing counts within groups."
        ),
    )

    # Numeric knobs: bounds live in the Annotated metadata, defaults in Field.
    n_top_genes: Annotated[int, Field(gt=0, le=500)] = Field(
        default=50,
        description=(
            "Number of top differentially expressed genes to return per group. "
            "Default: 50. Range: 1-500."
        ),
    )

    pseudocount: Annotated[float, Field(gt=0, le=100)] = Field(
        default=1.0,
        description=(
            "Pseudocount added before log2 fold change calculation to avoid log(0).\n"
            "• 1.0 (default): Standard practice, stable for most data\n"
            "• 0.1-0.5: More sensitive to low-expression changes\n"
            "• 1-10: More stable for sparse/noisy data"
        ),
    )

    min_cells: Annotated[int, Field(gt=0, le=1000)] = Field(
        default=3,
        description=(
            "Minimum number of cells per group for statistical testing.\n"
            "• 3 (default): Minimum required for Wilcoxon test\n"
            "• 10-30: More robust statistical results\n"
            "Groups with fewer cells are automatically skipped with a warning."
        ),
    )
429
+
430
+
431
+ class VisualizationParameters(BaseModel):
432
+ """Visualization parameters model"""
433
+
434
+ model_config = ConfigDict(extra="forbid") # Strict validation after preprocessing
435
+
436
+ feature: Optional[Union[str, list[str]]] = Field(
437
+ None,
438
+ description="Single feature or list of features (accepts both 'feature' and 'features')",
439
+ ) # Single feature or list of features
440
+
441
@model_validator(mode="before")
@classmethod
def preprocess_params(cls, data):
    """
    Normalize raw visualization input before field validation runs.

    Accepted shapes and what is returned for each:
    - None: an empty dict.
    - str: a dict with "feature" set to the string (a leading "gene:"
      prefix is dropped, so "gene:CCL21" and "CCL21" are equivalent)
      and "plot_type" set to "spatial".
    - dict: a shallow copy in which a "features" key is renamed to
      "feature", but only when "feature" is not already present.
    - anything else: the input object itself, untouched.
    """
    if data is None:
        return {}

    if isinstance(data, str):
        # Shorthand form: the whole string names the feature to plot.
        # removeprefix() leaves strings without the prefix unchanged.
        return {"feature": data.removeprefix("gene:"), "plot_type": "spatial"}

    if not isinstance(data, dict):
        # e.g. an existing VisualizationParameters instance
        return data

    # Work on a copy so the caller's dict is never mutated.
    normalized = data.copy()
    plural_only = "features" in normalized and "feature" not in normalized
    if plural_only:
        normalized["feature"] = normalized.pop("features")
    return normalized
474
+
475
+ plot_type: Literal[
476
+ "spatial",
477
+ "heatmap",
478
+ "violin",
479
+ "umap",
480
+ "dotplot", # Marker gene expression dotplot
481
+ "cell_communication",
482
+ "deconvolution",
483
+ "trajectory",
484
+ "rna_velocity",
485
+ "spatial_statistics",
486
+ "multi_gene",
487
+ "lr_pairs",
488
+ "gene_correlation",
489
+ "pathway_enrichment",
490
+ "spatial_interaction",
491
+ "batch_integration", # Batch integration quality assessment
492
+ "cnv_heatmap", # CNV analysis heatmap
493
+ "spatial_cnv", # CNV spatial projection
494
+ "card_imputation", # CARD imputation high-resolution results
495
+ ] = "spatial"
496
+ colormap: str = "coolwarm"
497
+
498
+ # Unified subtype parameter for all visualization types with subtypes
499
+ subtype: Optional[str] = Field(
500
+ None,
501
+ description=(
502
+ "Unified subtype parameter for visualization variants. "
503
+ "Usage depends on plot_type:\n"
504
+ "- rna_velocity: 'stream' (default, velocity embedding stream), "
505
+ "'phase' (spliced vs unspliced phase plot), 'proportions' (pie chart of spliced/unspliced ratios), "
506
+ "'heatmap' (gene expression by latent_time), 'paga' (PAGA with velocity arrows)\n"
507
+ "- trajectory: 'pseudotime' (default, pseudotime on embedding), "
508
+ "'circular' (CellRank circular projection), 'fate_map' (aggregated fate probabilities), "
509
+ "'gene_trends' (gene expression along lineages), 'fate_heatmap' (smoothed expression heatmap), "
510
+ "'palantir' (Palantir comprehensive results)\n"
511
+ "- pathway_enrichment: 'barplot', 'dotplot' (traditional ORA/GSEA), "
512
+ "'spatial_score', 'spatial_correlogram', 'spatial_variogram', 'spatial_cross_correlation' (spatial EnrichMap)\n"
513
+ "- deconvolution: 'spatial_multi', 'dominant_type', 'diversity', 'stacked_bar', 'scatterpie', 'umap'\n"
514
+ "- spatial_statistics: 'neighborhood', 'co_occurrence', 'ripley', 'moran', 'centrality', 'getis_ord'\n"
515
+ "- Other plot types may not require this parameter"
516
+ ),
517
+ )
518
+ cluster_key: Optional[str] = Field(
519
+ None,
520
+ description=(
521
+ "Column name in adata.obs containing cluster or cell type labels "
522
+ "(e.g., 'leiden', 'louvain', 'cell_type'). "
523
+ "REQUIRED for plot_type='heatmap' and 'violin'. "
524
+ "NOTE: ChatSpatial uses 'cluster_key' (not 'groupby' as in Scanpy) "
525
+ "for consistency with Squidpy spatial analysis functions."
526
+ ),
527
+ )
528
+
529
+ # Multi-gene visualization parameters
530
+ multi_panel: bool = False # Whether to create multi-panel plots
531
+ panel_layout: Optional[tuple[int, int]] = (
532
+ None # (rows, cols) - auto-determined if None
533
+ )
534
+
535
+ # GridSpec subplot spacing parameters (for multi-panel plots)
536
+ subplot_wspace: float = Field(
537
+ 0.0,
538
+ ge=-0.3, # Allow larger negative values for extreme tight spacing
539
+ le=1.0,
540
+ description=(
541
+ "Horizontal spacing between subplots (GridSpec wspace parameter). "
542
+ "Fraction of average subplot width. "
543
+ "Default 0.0 provides tight spacing for spatial plots with colorbars. "
544
+ "Common values: 0.0 (tight), 0.05 (compact), 0.1 (normal), 0.2 (loose). "
545
+ "Negative values (-0.1 to -0.2) create overlapping spacing for extreme compactness."
546
+ ),
547
+ )
548
+ subplot_hspace: float = Field(
549
+ 0.3,
550
+ ge=0.0,
551
+ le=1.0,
552
+ description=(
553
+ "Vertical spacing between subplots (GridSpec hspace parameter). "
554
+ "Fraction of average subplot height. "
555
+ "Default 0.3 provides comfortable spacing. "
556
+ "Common values: 0.2 (tight), 0.3 (normal), 0.4 (loose)."
557
+ ),
558
+ )
559
+
560
+ # Colorbar parameters (for spatial plots with make_axes_locatable)
561
+ colorbar_pad: float = Field(
562
+ 0.02,
563
+ ge=0.0,
564
+ le=0.2,
565
+ description=(
566
+ "Distance between subplot and colorbar (as fraction of subplot width). "
567
+ "Default 0.02 provides tight spacing. "
568
+ "Common values: 0.02 (tight), 0.03 (compact), 0.05 (normal)."
569
+ ),
570
+ )
571
+ colorbar_size: str = Field(
572
+ "3%",
573
+ description=(
574
+ "Width of colorbar as percentage of subplot width. "
575
+ "Default '3%' provides narrow colorbar to save space. "
576
+ "Common values: '3%' (narrow), '4%' (compact), '5%' (normal)."
577
+ ),
578
+ )
579
+
580
+ # Ligand-receptor pair parameters
581
+ lr_pairs: Optional[list[tuple[str, str]]] = None # List of (ligand, receptor) pairs
582
+ lr_database: str = "cellchat" # Database for LR pairs
583
+ plot_top_pairs: int = Field(
584
+ 6,
585
+ gt=0,
586
+ le=100,
587
+ description="Number of top LR pairs to display in cell communication visualization. Default: 6. For chord diagrams, use higher values (e.g., 50) to show more interactions.",
588
+ )
589
+
590
+ # Gene correlation parameters
591
+ correlation_method: Literal["pearson", "spearman", "kendall"] = "pearson"
592
+ show_correlation_stats: bool = True
593
+
594
+ # Figure parameters
595
+ figure_size: Optional[tuple[int, int]] = (
596
+ None # (width, height) - auto-determined if None
597
+ )
598
+ dpi: int = 300 # Publication quality (Nature/Cell standard)
599
+ alpha: float = 0.9 # Spot transparency (higher = more opaque)
600
+ spot_size: Optional[float] = Field(
601
+ 150.0,
602
+ description=(
603
+ "Size of spots in spatial plots (in pixels). "
604
+ "Default 150 provides good balance for most 10x Visium data. "
605
+ "Adjust based on data density: "
606
+ "dense (>3000 spots): 100-150, "
607
+ "sparse (<2000 spots): 150-200. "
608
+ "Set to None for scanpy auto-sizing (not recommended - usually too small)."
609
+ ),
610
+ )
611
+ alpha_img: float = Field(
612
+ 0.3,
613
+ ge=0.0,
614
+ le=1.0,
615
+ description=(
616
+ "Background tissue image transparency (lower = dimmer, helps spots stand out). "
617
+ "Default 0.3 provides good contrast. "
618
+ "Increase to 0.4-0.5 to emphasize tissue structure."
619
+ ),
620
+ )
621
+ show_tissue_image: bool = Field(
622
+ True,
623
+ description=(
624
+ "Whether to show tissue histology image in spatial plots. "
625
+ "If False, only plot spots on coordinates without background image. "
626
+ "This option only applies when tissue image is available. "
627
+ "When False, spots are plotted on a clean coordinate system for clearer visualization. "
628
+ "Default: True"
629
+ ),
630
+ )
631
+
632
+ # Color parameters
633
+ vmin: Optional[float] = None # Minimum value for color scale
634
+ vmax: Optional[float] = None # Maximum value for color scale
635
+ color_scale: Literal["linear", "log", "sqrt"] = "linear" # Color scaling
636
+
637
+ # Display parameters
638
+ title: Optional[str] = None
639
+ show_legend: bool = True
640
+ show_colorbar: bool = True
641
+ show_axes: bool = True
642
+ add_gene_labels: bool = True # Whether to add gene names as labels
643
+
644
+ # Trajectory visualization parameters
645
+ basis: Optional[str] = (
646
+ None # Basis for trajectory visualization (e.g., 'spatial', 'umap', 'pca')
647
+ )
648
+
649
+ # GSEA visualization parameters
650
+ gsea_results_key: str = "gsea_results" # Key in adata.uns for GSEA results
651
+ n_top_pathways: int = 10 # Number of top pathways to show in barplot
652
+
653
+ # NEW: Spatial plot enhancement parameters
654
+ add_outline: bool = Field(
655
+ False, description="Add cluster outline/contour overlay to spatial plots"
656
+ )
657
+ outline_color: str = Field("black", description="Color for cluster outlines")
658
+ outline_width: float = Field(
659
+ 0.4, description="Line width for cluster outlines (Nature/Cell standard)"
660
+ )
661
+ outline_cluster_key: Optional[str] = Field(
662
+ None, description="Cluster key for outlines (e.g., 'leiden')"
663
+ )
664
+
665
+ # NEW: UMAP enhancement parameters
666
+ size_by: Optional[str] = Field(
667
+ None,
668
+ description="Feature for point size encoding in UMAP (dual color+size encoding)",
669
+ )
670
+ show_velocity: bool = Field(
671
+ False, description="Overlay RNA velocity vectors on UMAP"
672
+ )
673
+ velocity_scale: float = Field(1.0, description="Scaling factor for velocity arrows")
674
+
675
+ # NEW: Heatmap enhancement parameters
676
+ obs_annotation: Optional[list[str]] = Field(
677
+ None, description="List of obs keys to show as column annotations"
678
+ )
679
+ var_annotation: Optional[list[str]] = Field(
680
+ None, description="List of var keys to show as row annotations"
681
+ )
682
+ annotation_colors: Optional[dict[str, str]] = Field(
683
+ None, description="Custom colors for annotations"
684
+ )
685
+
686
+ # NEW: Integration assessment parameters
687
+ batch_key: str = Field(
688
+ "batch", description="Key in adata.obs for batch/sample identifier"
689
+ )
690
+ integration_method: Optional[str] = Field(
691
+ None, description="Integration method used (for display)"
692
+ )
693
+
694
+ # Dotplot visualization parameters
695
+ dotplot_dendrogram: bool = Field(
696
+ False,
697
+ description="Whether to show dendrogram for gene clustering in dotplot",
698
+ )
699
+ dotplot_swap_axes: bool = Field(
700
+ False,
701
+ description="Swap axes to show genes on x-axis and groups on y-axis",
702
+ )
703
+ dotplot_standard_scale: Optional[Literal["var", "group"]] = Field(
704
+ None,
705
+ description=(
706
+ "Standardize expression values for dotplot. "
707
+ "'var' = standardize per gene (row), "
708
+ "'group' = standardize per group (column)"
709
+ ),
710
+ )
711
+ dotplot_dot_max: Optional[float] = Field(
712
+ None,
713
+ ge=0.0,
714
+ le=1.0,
715
+ description=(
716
+ "Maximum dot size as fraction (0-1). "
717
+ "If None, maximum observed fraction is used"
718
+ ),
719
+ )
720
+ dotplot_dot_min: Optional[float] = Field(
721
+ None,
722
+ ge=0.0,
723
+ le=1.0,
724
+ description=(
725
+ "Minimum dot size as fraction (0-1). "
726
+ "If None, minimum observed fraction is used"
727
+ ),
728
+ )
729
+ dotplot_smallest_dot: float = Field(
730
+ 0.0,
731
+ ge=0.0,
732
+ le=50.0,
733
+ description=(
734
+ "Size of dot when expression fraction is 0. "
735
+ "Default 0 hides genes with no expression in a group"
736
+ ),
737
+ )
738
+ dotplot_var_groups: Optional[dict[str, list[str]]] = Field(
739
+ None,
740
+ description=(
741
+ "Group genes by category for organized display. "
742
+ "Example: {'T cell markers': ['CD3D', 'CD4'], 'B cell markers': ['CD19', 'MS4A1']}"
743
+ ),
744
+ )
745
+ dotplot_categories_order: Optional[list[str]] = Field(
746
+ None,
747
+ description="Custom order for groups (clusters/cell types) on the axis",
748
+ )
749
+
750
+ # Deconvolution visualization parameters
751
+ n_cell_types: Annotated[
752
+ int,
753
+ Field(
754
+ gt=0,
755
+ le=10,
756
+ description="Number of top cell types to show in deconvolution visualization. Must be between 1-10. Default: 4",
757
+ ),
758
+ ] = 4
759
+ deconv_method: Optional[str] = Field(
760
+ None,
761
+ description=(
762
+ "Deconvolution method name (e.g., 'cell2location', 'rctd'). "
763
+ "If None and only one result exists, auto-select and notify. "
764
+ "If None and multiple results exist, raise error requiring explicit specification. "
765
+ "This ensures you visualize the intended analysis for scientific reproducibility."
766
+ ),
767
+ )
768
+ min_proportion_threshold: float = Field(
769
+ 0.3,
770
+ ge=0.0,
771
+ le=1.0,
772
+ description="Minimum proportion threshold for marking spots as 'pure' vs 'mixed' (dominant_type visualization). Default: 0.3",
773
+ )
774
+ show_mixed_spots: bool = Field(
775
+ True,
776
+ description="Whether to mark mixed/heterogeneous spots in dominant_type visualization. Default: True",
777
+ )
778
+ pie_scale: float = Field(
779
+ 0.4,
780
+ gt=0.0,
781
+ le=2.0,
782
+ description="Size scale factor for pie charts in scatterpie visualization. Default: 0.4",
783
+ )
784
+ scatterpie_alpha: float = Field(
785
+ 1.0,
786
+ ge=0.0,
787
+ le=1.0,
788
+ description="Transparency of pie charts in scatterpie visualization (0=transparent, 1=opaque). Default: 1.0",
789
+ )
790
+ max_spots: int = Field(
791
+ 100,
792
+ gt=0,
793
+ le=1000,
794
+ description="Maximum number of spots to show in stacked_bar visualization. Default: 100",
795
+ )
796
+ sort_by: Literal["dominant_type", "spatial", "cluster"] = Field(
797
+ "dominant_type",
798
+ description="Sorting method for stacked_bar visualization. Options: dominant_type (group by dominant cell type), spatial (spatial order), cluster (cluster order). Default: dominant_type",
799
+ )
800
+
801
@model_validator(mode="after")
def validate_conditional_parameters(self) -> Self:
    """Enforce cross-field requirements after per-field validation.

    * ``plot_type == "spatial_statistics"`` requires a usable ``subtype``:
      ``None``, the empty string and whitespace-only strings are rejected.
    * ``plot_type == "deconvolution"`` with a falsy ``subtype`` has
      ``subtype`` set to ``"spatial_multi"`` on the instance.

    Returns:
        The validated instance itself (``subtype`` may have been filled in).

    Raises:
        ValueError: when a spatial-statistics plot is requested without a
            usable ``subtype``.
    """
    subtype = self.subtype
    # A string made only of whitespace is truthy, so it needs its own check.
    subtype_is_blank = isinstance(subtype, str) and not subtype.strip()

    # Spatial statistics validation
    if self.plot_type == "spatial_statistics" and (not subtype or subtype_is_blank):
        supported = ", ".join(
            (
                "neighborhood",
                "co_occurrence",
                "ripley",
                "moran",
                "centrality",
                "getis_ord",
            )
        )
        raise ValueError(
            "Parameter dependency error: subtype is required when plot_type='spatial_statistics'.\n"
            f"Available subtypes: {supported}\n"
            "Example usage: VisualizationParameters(plot_type='spatial_statistics', subtype='neighborhood')\n"
            "For more details, see spatial statistics documentation."
        )

    # Deconvolution validation - fall back to the default visualization type
    if self.plot_type == "deconvolution" and not subtype:
        self.subtype = "spatial_multi"

    return self
830
+
831
+
832
class AnnotationParameters(BaseModel):
    """Cell type annotation parameters model.

    ``method`` selects the annotation backend. Fields prefixed with
    ``tangram_``, ``scanvi_``, ``cellassign_``, ``mllm_``, ``sctype_`` and
    ``singler_`` belong to the backend named in their section comment.
    """

    method: Literal[
        "tangram",
        "scanvi",
        "cellassign",
        "mllmcelltype",
        "sctype",
        "singler",
    ] = "tangram"
    marker_genes: Optional[dict[str, list[str]]] = None
    reference_data: Optional[str] = None
    # For Tangram method - ID of reference single-cell dataset
    reference_data_id: Optional[str] = None
    # For Tangram method - genes to use for mapping
    training_genes: Optional[list[str]] = None
    # For Tangram/ScanVI methods - number of training epochs
    # (reduced for faster training)
    num_epochs: int = 100
    # Tangram mapping mode: 'cells' (cell-level) or 'clusters' (cluster-level)
    tangram_mode: Literal["cells", "clusters"] = "cells"
    # For mLLMCellType method - cluster label in spatial data.
    # Only required when method='mllmcelltype'
    cluster_label: Optional[str] = None
    cell_type_key: Optional[str] = Field(
        default=None,
        description=(
            "Column name for cell types in REFERENCE data. "
            "\n\n"
            "REQUIRED FOR METHODS USING REFERENCE DATA:\n"
            " • tangram: REQUIRED - maps spatial data to reference using cell type labels\n"
            " • scanvi: REQUIRED - transfers labels from reference to query data\n"
            " • singler: REQUIRED - correlates expression with reference cell types\n"
            "\n"
            "NOT REQUIRED FOR METHODS WITHOUT REFERENCE:\n"
            " • cellassign: Not needed - uses marker_genes parameter instead\n"
            " • sctype: Not needed - uses built-in database or custom markers\n"
            " • mllmcelltype: Not needed - uses LLM for annotation\n"
            "\n"
            "Common column names in reference data: 'cell_type', 'cell_types', 'celltype', 'annotation', 'label', 'cell_type_original'\n"
            "\n"
            "The LLM will auto-detect from metadata if not specified, but explicit specification is recommended."
        ),
    )

    # Tangram-specific parameters (aligned with scvi.external.Tangram API)
    # Density prior for mapping
    tangram_density_prior: Literal["rna_count_based", "uniform"] = "rna_count_based"
    tangram_device: str = "cpu"  # Device for computation ('cpu' or 'cuda:0')
    tangram_learning_rate: float = 0.1  # Learning rate for optimization
    tangram_compute_validation: bool = False  # Whether to compute validation metrics
    tangram_project_genes: bool = False  # Whether to project gene expression

    # Tangram regularization parameters (optional)
    # Regularization parameter for entropy term in Tangram loss
    tangram_lambda_r: Optional[float] = None
    # Neighborhood regularization parameter for spatial smoothness
    tangram_lambda_neighborhood: Optional[float] = None

    # General parameters for batch effect and data handling
    batch_key: Optional[str] = None  # For batch effect correction
    layer: Optional[str] = None  # Which layer to use for analysis

    # scANVI parameters (scvi-tools semi-supervised label transfer)
    scanvi_n_hidden: int = Field(
        default=128,
        description="Number of hidden units per layer. Official default: 128",
    )
    scanvi_n_latent: int = Field(
        default=10,
        description=(
            "Dimensionality of latent space. Official default: 10\n"
            "scvi-tools recommendation for large integration: 30\n"
            "WARNING:Empirical (not official): Small datasets may need 3-5 to avoid NaN"
        ),
    )
    scanvi_n_layers: int = Field(
        default=1,
        description=(
            "Number of hidden layers. Official default: 1\n"
            "scvi-tools recommendation for large integration: 2"
        ),
    )
    scanvi_dropout_rate: float = Field(
        default=0.1,
        description=(
            "Dropout rate for regularization. Official default: 0.1\n"
            "WARNING:Empirical (not official): 0.2-0.3 may help small datasets"
        ),
    )
    scanvi_unlabeled_category: str = Field(
        default="Unknown",
        description="Label for unlabeled cells in semi-supervised learning",
    )

    # SCVI pretraining parameters (official best practice)
    scanvi_use_scvi_pretrain: bool = Field(
        default=True,
        description=(
            "Whether to pretrain with SCVI before SCANVI training. Default: True\n"
            "Official scvi-tools best practice: SCVI pretraining improves stability\n"
            "WARNING:For small datasets: Set to False if encountering NaN errors"
        ),
    )
    scanvi_scvi_epochs: int = Field(
        default=200, description="Number of epochs for SCVI pretraining. Default: 200"
    )
    scanvi_scanvi_epochs: int = Field(
        default=20,
        description=(
            "Number of epochs for SCANVI model training after SCVI pretraining. Default: 20\n"
            "This is the second stage training that fine-tunes the model for label transfer.\n"
            "Official scvi-tools recommendation: 20 epochs is usually sufficient after pretraining.\n"
            "Increase to 50-100 for complex datasets or if label transfer accuracy is low."
        ),
    )
    scanvi_n_samples_per_label: int = Field(
        default=100,
        description="Number of samples per label for semi-supervised training",
    )

    # Query training parameters
    scanvi_query_epochs: int = Field(
        default=100,
        description=(
            "Number of epochs for training on query data. Default: 100\n"
            "WARNING:For small datasets: Recommend 50 to prevent overfitting"
        ),
    )
    scanvi_check_val_every_n_epoch: int = Field(
        default=10, description="Validation check frequency during training"
    )

    # CellAssign parameters
    cellassign_n_hidden: int = 100
    cellassign_learning_rate: float = 0.001
    cellassign_max_iter: int = 200

    # mLLMCellType parameters
    # Number of marker genes per cluster
    mllm_n_marker_genes: Annotated[int, Field(gt=0, le=50)] = 20
    mllm_species: Literal["human", "mouse"] = "human"  # Species
    mllm_tissue: Optional[str] = None  # Tissue type (e.g., "brain", "liver")
    mllm_provider: Literal[
        "openai",
        "anthropic",
        "gemini",
        "deepseek",
        "qwen",
        "zhipu",
        "stepfun",
        "minimax",
        "grok",
        "openrouter",
    ] = "openai"  # LLM provider (use 'gemini' not 'google')
    # Model name. Defaults: openai="gpt-5", anthropic="claude-sonnet-4-20250514", gemini="gemini-2.5-pro-preview-03-25"
    # Examples: "gpt-5", "claude-sonnet-4-5-20250929", "claude-opus-4-1-20250805", "gemini-2.5-pro", "qwen-max-2025-01-25"
    mllm_model: Optional[str] = None
    # API key for the LLM provider. repr=False keeps the secret out of
    # repr()/str() of the model (logs, tracebacks); the field itself and its
    # validation are unchanged.
    mllm_api_key: Optional[str] = Field(default=None, repr=False)
    mllm_additional_context: Optional[str] = None  # Additional context for annotation
    mllm_use_cache: bool = True  # Whether to use caching for API calls
    mllm_base_urls: Optional[Union[str, dict[str, str]]] = None  # Custom API endpoints
    mllm_verbose: bool = False  # Whether to print detailed logs
    mllm_force_rerun: bool = False  # Force reanalysis bypassing cache

    # Multi-model consensus parameters (interactive_consensus_annotation)
    mllm_use_consensus: bool = False  # Whether to use multi-model consensus
    # List of models for consensus
    mllm_models: Optional[list[Union[str, dict[str, str]]]] = None
    # Dict mapping provider to API key. Hidden from repr()/str() for the same
    # reason as mllm_api_key.
    mllm_api_keys: Optional[dict[str, str]] = Field(default=None, repr=False)
    mllm_consensus_threshold: float = 0.7  # Agreement threshold for consensus
    mllm_entropy_threshold: float = 1.0  # Entropy threshold for controversy detection
    mllm_max_discussion_rounds: int = 3  # Maximum discussion rounds
    # Model for consensus checking
    mllm_consensus_model: Optional[Union[str, dict[str, str]]] = None
    mllm_clusters_to_analyze: Optional[list[str]] = None  # Specific clusters to analyze

    # ScType parameters
    # Tissue type (supported: "Adrenal", "Brain", "Eye", "Heart", "Hippocampus", "Immune system", "Intestine", "Kidney", "Liver", "Lung", "Muscle", "Pancreas", "Placenta", "Spleen", "Stomach", "Thymus")
    sctype_tissue: Optional[str] = None
    # Custom database path (if None, uses default ScTypeDB)
    sctype_db_: Optional[str] = None
    sctype_scaled: bool = True  # Whether input data is scaled
    # Custom markers: {"CellType": {"positive": [...], "negative": [...]}}
    sctype_custom_markers: Optional[dict[str, dict[str, list[str]]]] = None
    sctype_use_cache: bool = True  # Whether to cache results to avoid repeated R calls

    # SingleR parameters (for enhanced marker_genes method)
    singler_reference: Optional[str] = Field(
        default=None,
        description=(
            "Reference dataset name from celldex package (Python naming convention).\n\n"
            "Valid references:\n"
            " Human: 'hpca' (Human Primary Cell Atlas, recommended), 'blueprint_encode', "
            "'dice', 'monaco_immune', 'novershtern_hematopoietic'\n"
            " Mouse: 'immgen' (ImmGen, recommended), 'mouse_rnaseq'\n\n"
            "Common mistakes:\n"
            " 'HumanPrimaryCellAtlasData' - WRONG, use 'hpca'\n"
            " 'ImmGenData' - WRONG, use 'immgen'\n\n"
            "If None, uses species-appropriate default ('hpca' for human, 'immgen' for mouse)."
        ),
    )
    singler_integrated: bool = Field(
        default=False,
        description="Whether to use integrated annotation with multiple references",
    )
    singler_fine_tune: bool = Field(
        default=True,
        description="Whether to perform fine-tuning step in SingleR annotation (refines labels based on marker genes)",
    )
    num_threads: int = 4  # Number of threads for parallel processing
1057
+
1058
+
1059
class SpatialStatisticsParameters(BaseModel):
    """Spatial statistics parameters model.

    ``analysis_type`` picks the statistic; the remaining fields configure
    graph construction, gene selection, parallelism and the per-analysis
    options described in each field's ``description``.
    """

    analysis_type: Literal[
        "neighborhood",
        "co_occurrence",
        "ripley",
        "moran",
        "local_moran",  # Local Moran's I (LISA)
        "geary",
        "centrality",
        "getis_ord",
        "bivariate_moran",
        "join_count",  # Traditional Join Count for binary data (2 categories)
        "local_join_count",  # Local Join Count for multi-category data (>2 categories)
        "network_properties",
        "spatial_centrality",
    ] = "neighborhood"

    cluster_key: Optional[str] = Field(
        default=None,
        description=(
            "Column name for cluster/cell type labels in adata.obs. "
            "\n\n"
            "REQUIRED FOR GROUP-BASED ANALYSES:\n"
            " • neighborhood: REQUIRED - analyzes enrichment between cell type groups\n"
            " • co_occurrence: REQUIRED - measures spatial co-occurrence of groups\n"
            " • ripley: REQUIRED - analyzes spatial point patterns by group\n"
            " • join_count: REQUIRED - for BINARY categorical data (2 categories)\n"
            " • local_join_count: REQUIRED - for MULTI-CATEGORY data (>2 categories)\n"
            "\n"
            "OPTIONAL/NOT REQUIRED FOR GENE-BASED ANALYSES:\n"
            " • moran: Not required - analyzes gene expression spatial patterns\n"
            " • local_moran: Not required - identifies local spatial clusters for genes\n"
            " • geary: Not required - measures gene expression spatial autocorrelation\n"
            " • getis_ord: Not required - detects hot/cold spots for gene expression\n"
            " • bivariate_moran: Not required - analyzes gene pair spatial correlation\n"
            " • centrality: Not required - computes spatial network centrality\n"
            " • network_properties: Not required - analyzes spatial network structure\n"
            " • spatial_centrality: Not required - measures spatial importance\n"
            "\n"
            "Common column names: 'leiden', 'louvain', 'cell_type', 'cell_type_tangram', 'seurat_clusters', 'clusters'\n"
            "\n"
            "The LLM will auto-detect from metadata if not specified for required analyses."
        ),
    )
    n_neighbors: Annotated[int, Field(gt=0)] = Field(
        default=8,
        description=(
            "Number of nearest neighbors for spatial graph construction. "
            "Default: 8 (recommended by ArcGIS for Getis-Ord analysis). "
            "Adjust based on dataset density and spatial scale."
        ),
    )

    # ---- Gene selection (shared by the gene-based analyses) ----
    genes: Optional[list[str]] = Field(
        default=None,
        description="Specific genes to analyze. If None, uses HVG or defaults based on analysis type",
    )
    n_top_genes: Annotated[int, Field(gt=0, le=500)] = Field(
        default=20,
        description="Number of top HVGs to analyze (default 20, up to 500 for comprehensive analysis)",
    )

    # ---- Parallel processing ----
    n_jobs: Optional[int] = Field(
        default=1,
        description="Number of parallel jobs. 1 = no parallelization (recommended for small datasets), None = auto-detect, -1 = all cores",
    )
    backend: Literal["loky", "threading", "multiprocessing"] = Field(
        default="threading",
        description="Parallelization backend (threading is safer than loky)",
    )

    # ---- Moran's I ----
    moran_n_perms: Annotated[int, Field(gt=0, le=10000)] = Field(
        default=10,
        description="Number of permutations (default 10 for speed, use 100+ for publication)",
    )
    moran_two_tailed: bool = Field(default=False, description="Use two-tailed test")

    # ---- Local Moran's I (LISA) ----
    local_moran_permutations: Annotated[int, Field(gt=0, le=9999)] = Field(
        default=999,
        description=(
            "Number of permutations for pseudo p-value calculation in Local Moran's I. "
            "Higher values increase precision: 99 -> precision 0.01, 999 -> precision 0.001. "
            "Default 999 is standard practice. Use 9999 for publication-quality results."
        ),
    )
    local_moran_alpha: Annotated[float, Field(gt=0.0, lt=1.0)] = Field(
        default=0.05,
        description=(
            "Significance level (alpha) for Local Moran's I hotspot/coldspot detection. "
            "Used with FDR correction to determine significant spatial clusters. "
            "Common values: 0.05 (standard), 0.01 (conservative), 0.10 (exploratory)."
        ),
    )
    local_moran_fdr_correction: bool = Field(
        default=True,
        description=(
            "Whether to apply FDR (False Discovery Rate) correction for multiple testing. "
            "STRONGLY RECOMMENDED: Each location is tested separately, creating a multiple "
            "testing problem. FDR correction controls the expected proportion of false positives. "
            "Set to False only for exploratory analysis."
        ),
    )

    # ---- Getis-Ord Gi* ----
    getis_ord_correction: Literal["bonferroni", "fdr_bh", "none"] = Field(
        default="fdr_bh",
        description=(
            "Multiple testing correction method for Getis-Ord analysis. "
            "Options: 'fdr_bh' (Benjamini-Hochberg FDR, recommended for multi-gene), "
            "'bonferroni' (conservative), 'none' (no correction)"
        ),
    )
    getis_ord_alpha: Annotated[float, Field(gt=0.0, le=1.0)] = Field(
        default=0.05,
        description=(
            "Significance level (alpha) for Getis-Ord hotspot detection. "
            "Determines Z-score threshold via norm.ppf(1 - alpha/2). "
            "Common values: 0.05 (z=1.96), 0.01 (z=2.576), 0.10 (z=1.645)"
        ),
    )

    # ---- Bivariate Moran's I ----
    gene_pairs: Optional[list[tuple[str, str]]] = Field(
        default=None, description="Gene pairs for bivariate analysis"
    )
1189
+
1190
+
1191
class RNAVelocityParameters(BaseModel):
    """RNA velocity analysis parameters model.

    Unknown keyword arguments are rejected (``extra="forbid"``).
    """

    # Strict validation - no extra parameters allowed
    model_config = ConfigDict(extra="forbid")

    # ---- Velocity computation method selection ----
    method: Literal["scvelo", "velovi"] = "scvelo"

    # ---- scVelo specific parameters ----
    scvelo_mode: Literal["deterministic", "stochastic", "dynamical"] = "stochastic"
    n_pcs: Annotated[int, Field(gt=0, le=100)] = 30
    basis: str = "spatial"

    # ---- Preprocessing parameters for velocity computation ----
    # Minimum shared counts for filtering
    min_shared_counts: Annotated[int, Field(gt=0)] = 30
    # Number of top genes to retain
    n_top_genes: Annotated[int, Field(gt=0)] = 2000
    # Number of neighbors for moments computation
    n_neighbors: Annotated[int, Field(gt=0)] = 30

    # ---- VELOVI specific parameters ----
    velovi_n_hidden: int = 128
    velovi_n_latent: int = 10
    velovi_n_layers: int = 1
    velovi_n_epochs: int = 1000
    velovi_dropout_rate: float = 0.1
    velovi_learning_rate: float = 1e-3
    velovi_use_gpu: bool = False
1223
+
1224
+
1225
class TrajectoryParameters(BaseModel):
    """Trajectory analysis parameters model.

    ``method`` selects the trajectory backend; ``cellrank_*`` and
    ``palantir_*`` fields are the backend-specific options.
    """

    method: Literal["cellrank", "palantir", "dpt"] = "cellrank"
    # Spatial information weight, constrained to [0, 1]
    spatial_weight: Annotated[float, Field(ge=0.0, le=1.0)] = 0.5
    # For Palantir method
    root_cells: Optional[list[str]] = None

    # ---- CellRank specific parameters ----
    # (velocity_weight, connectivity_weight)
    cellrank_kernel_weights: tuple[float, float] = (0.8, 0.2)
    # Number of macrostates for CellRank
    cellrank_n_states: Annotated[int, Field(gt=0, le=20)] = 5

    # ---- Palantir specific parameters ----
    # Number of diffusion components
    palantir_n_diffusion_components: Annotated[int, Field(gt=0, le=50)] = 10
    # Number of waypoints for Palantir
    palantir_num_waypoints: Annotated[int, Field(gt=0)] = 500

    # ---- Fallback control ----
    # Removed: allow_fallback_to_dpt - No longer doing automatic fallbacks
    # LLMs should explicitly choose which method to use
1254
+
1255
+
1256
class IntegrationParameters(BaseModel):
    """Sample integration parameters model.

    ``method`` selects the integration backend; ``scvi_*`` fields are the
    scVI-specific model options.
    """

    method: Literal["harmony", "bbknn", "scanorama", "scvi"] = "harmony"
    # Batch information key
    batch_key: str = "batch"
    # Number of principal components for integration
    n_pcs: Annotated[int, Field(gt=0, le=100)] = 30
    # Whether to align spatial coordinates
    align_spatial: bool = True
    # Reference batch for spatial alignment
    reference_batch: Optional[str] = None

    # ---- Common scvi-tools parameters ----
    # Whether to use GPU acceleration for scvi-tools methods
    use_gpu: bool = False
    # Number of training epochs (None = auto-determine)
    n_epochs: Optional[int] = None

    # ---- scVI integration parameters ----
    scvi_n_hidden: int = 128
    scvi_n_latent: int = 10
    scvi_n_layers: int = 1
    scvi_dropout_rate: float = 0.1
    scvi_gene_likelihood: Literal["zinb", "nb", "poisson"] = "zinb"
1277
+
1278
+
1279
+ class DeconvolutionParameters(BaseModel):
1280
+ """Spatial deconvolution parameters model"""
1281
+
1282
+ method: Literal[
1283
+ "flashdeconv",
1284
+ "cell2location",
1285
+ "rctd",
1286
+ "destvi",
1287
+ "stereoscope",
1288
+ "spotlight",
1289
+ "tangram",
1290
+ "card",
1291
+ ] = "flashdeconv"
1292
+ reference_data_id: Optional[str] = (
1293
+ None # Reference single-cell data for deconvolution
1294
+ )
1295
+ cell_type_key: str # REQUIRED: Key in reference data for cell type information. LLM will infer from metadata. Common values: 'cell_type', 'celltype', 'annotation', 'label'
1296
+
1297
+ # Universal GPU parameter
1298
+ use_gpu: bool = Field(
1299
+ False,
1300
+ description=(
1301
+ "Whether to use GPU acceleration for training. "
1302
+ "Supported by: Cell2location, DestVI, Stereoscope, Tangram. "
1303
+ "Not supported by: RCTD, SPOTlight, CARD (R-based methods). "
1304
+ "Requires CUDA-compatible GPU and proper PyTorch installation."
1305
+ ),
1306
+ )
1307
+
1308
+ # Cell2location specific parameters
1309
+ cell2location_ref_model_epochs: Annotated[int, Field(gt=0)] = Field(
1310
+ 250,
1311
+ description=(
1312
+ "Number of epochs for Cell2location reference model training (NB regression). "
1313
+ "This is the first stage training for estimating reference cell type signatures. "
1314
+ "Official recommendation: 250. "
1315
+ "ONLY USED BY CELL2LOCATION METHOD."
1316
+ ),
1317
+ )
1318
+ cell2location_n_epochs: Annotated[int, Field(gt=0)] = Field(
1319
+ 30000,
1320
+ description=(
1321
+ "Number of epochs for Cell2location spatial mapping model training. "
1322
+ "Official recommendation: 30000. "
1323
+ "ONLY USED BY CELL2LOCATION METHOD."
1324
+ ),
1325
+ )
1326
+ cell2location_n_cells_per_spot: Annotated[int, Field(gt=0)] = Field(
1327
+ 30,
1328
+ description=(
1329
+ "Expected number of cells per spatial location for Cell2location. "
1330
+ "This is tissue-dependent (e.g., 30 for Visium, 5-10 for MERFISH). "
1331
+ "Official recommendation: 30 for Visium data. "
1332
+ "ONLY USED BY CELL2LOCATION METHOD."
1333
+ ),
1334
+ )
1335
+ cell2location_detection_alpha: Annotated[float, Field(gt=0)] = Field(
1336
+ 20.0,
1337
+ description=(
1338
+ "RNA detection sensitivity parameter for Cell2location. "
1339
+ "NEW DEFAULT (2024): 20 for high technical variability, 200 for low variability. "
1340
+ "Recommendation: test both values on your data. "
1341
+ "ONLY USED BY CELL2LOCATION METHOD."
1342
+ ),
1343
+ )
1344
+
1345
+ # Batch and covariate correction for cell2location
1346
+ cell2location_batch_key: Optional[str] = Field(
1347
+ None,
1348
+ description=(
1349
+ "Column name in adata.obs for batch information (e.g., 'sample_id', 'batch'). "
1350
+ "Used for batch effect correction in Cell2location. "
1351
+ "ONLY USED BY CELL2LOCATION METHOD."
1352
+ ),
1353
+ )
1354
+ cell2location_categorical_covariate_keys: Optional[list[str]] = Field(
1355
+ None,
1356
+ description=(
1357
+ "List of column names in adata.obs for categorical technical covariates "
1358
+ "(e.g., ['platform', 'donor_id']) for Cell2location. "
1359
+ "ONLY USED BY CELL2LOCATION METHOD."
1360
+ ),
1361
+ )
1362
+
1363
+ # Gene filtering parameters (Cell2location-specific preprocessing)
1364
+ cell2location_apply_gene_filtering: bool = Field(
1365
+ True,
1366
+ description=(
1367
+ "Apply Cell2location's recommended permissive gene filtering before training. "
1368
+ "ONLY USED BY CELL2LOCATION. This is NOT the same as HVG selection:\n"
1369
+ "• Cell2location uses permissive filtering to keep rare cell type markers\n"
1370
+ "• Yields ~10k-16k genes (more than typical 2k HVGs)\n"
1371
+ "• Official recommendation: avoid further gene selection for robust results\n"
1372
+ "Other methods use different strategies (see spotlight_n_top_genes parameter)."
1373
+ ),
1374
+ )
1375
+ cell2location_gene_filter_cell_count_cutoff: int = Field(
1376
+ 5,
1377
+ description=(
1378
+ "Minimum cells expressing a gene for Cell2location filtering (official default: 5). "
1379
+ "Low cutoff preserves rare cell type markers. "
1380
+ "ONLY USED BY CELL2LOCATION METHOD."
1381
+ ),
1382
+ )
1383
+ cell2location_gene_filter_cell_percentage_cutoff2: float = Field(
1384
+ 0.03,
1385
+ description=(
1386
+ "Minimum percentage of cells expressing for Cell2location (official default: 0.03 = 3%). "
1387
+ "Genes detected in ≥3% of cells are always included. "
1388
+ "ONLY USED BY CELL2LOCATION METHOD."
1389
+ ),
1390
+ )
1391
+ cell2location_gene_filter_nonz_mean_cutoff: float = Field(
1392
+ 1.12,
1393
+ description=(
1394
+ "Minimum non-zero mean expression for Cell2location (official default: 1.12). "
1395
+ "For genes between cutoffs, only keep if avg expression in non-zero cells > 1.12. "
1396
+ "ONLY USED BY CELL2LOCATION METHOD."
1397
+ ),
1398
+ )
1399
+
1400
+ # Phase 2: Training enhancement parameters (Cell2location)
1401
+ cell2location_ref_model_lr: Annotated[float, Field(gt=0)] = Field(
1402
+ 0.002,
1403
+ description=(
1404
+ "Reference model learning rate for Cell2location (official default: 0.002 with ClippedAdam optimizer). "
1405
+ "ONLY USED BY CELL2LOCATION METHOD."
1406
+ ),
1407
+ )
1408
+ cell2location_lr: Annotated[float, Field(gt=0)] = Field(
1409
+ 0.005,
1410
+ description=(
1411
+ "Cell2location model learning rate (official default: 0.005). "
1412
+ "ONLY USED BY CELL2LOCATION METHOD."
1413
+ ),
1414
+ )
1415
+ cell2location_ref_model_train_size: Annotated[float, Field(gt=0, le=1)] = Field(
1416
+ 1.0,
1417
+ description=(
1418
+ "Fraction of reference data for training in Cell2location. "
1419
+ "DEFAULT: 1.0 (official tutorial recommendation - use all data). "
1420
+ "IMPORTANT: RegressionModel validation is not yet implemented, so train_size=1 is standard practice. "
1421
+ "ONLY USED BY CELL2LOCATION METHOD."
1422
+ ),
1423
+ )
1424
+ cell2location_train_size: Annotated[float, Field(gt=0, le=1)] = Field(
1425
+ 1.0,
1426
+ description=(
1427
+ "Fraction of spatial data for training in Cell2location. "
1428
+ "DEFAULT: 1.0 (official tutorial: 'we need to estimate cell abundance at all locations'). "
1429
+ "Using train_size=1 ensures all spatial locations are included in training. "
1430
+ "ONLY USED BY CELL2LOCATION METHOD."
1431
+ ),
1432
+ )
1433
+ cell2location_enable_qc_plots: bool = Field(
1434
+ False,
1435
+ description=(
1436
+ "Generate QC diagnostic plots for Cell2location (ELBO history, convergence diagnostics). "
1437
+ "ONLY USED BY CELL2LOCATION METHOD."
1438
+ ),
1439
+ )
1440
+ cell2location_qc_output_dir: Optional[str] = Field(
1441
+ None,
1442
+ description=(
1443
+ "Output directory for Cell2location QC plots (None = plots not saved to disk). "
1444
+ "ONLY USED BY CELL2LOCATION METHOD."
1445
+ ),
1446
+ )
1447
+
1448
+ # Phase 3: Runtime optimization parameters (Cell2location)
1449
+ cell2location_early_stopping: bool = Field(
1450
+ False,
1451
+ description=(
1452
+ "Enable early stopping to reduce Cell2location training time. "
1453
+ "DEFAULT: False (following official tutorial best practice). "
1454
+ "IMPORTANT: RegressionModel does not support validation, so early stopping is not recommended. "
1455
+ "Official tutorial uses train_size=1 without early stopping. "
1456
+ "Only enable if you have specific convergence monitoring needs. "
1457
+ "ONLY USED BY CELL2LOCATION METHOD."
1458
+ ),
1459
+ )
1460
+ cell2location_early_stopping_patience: Annotated[int, Field(gt=0)] = Field(
1461
+ 45,
1462
+ description=(
1463
+ "Epochs to wait before stopping if no improvement for Cell2location (official default: 45). "
1464
+ "ONLY USED BY CELL2LOCATION METHOD."
1465
+ ),
1466
+ )
1467
# Minimum relative improvement required by Cell2location early stopping.
# The bound is ge=0 (not gt=0): the documented default 0.0 ("any improvement")
# must also validate when a caller passes it explicitly or round-trips a dumped
# model. Pydantic does not validate field defaults unless asked to, so a strict
# gt=0 bound only surfaced as a rejection of explicit 0.0 input.
cell2location_early_stopping_threshold: Annotated[float, Field(ge=0)] = Field(
    0.0,
    description=(
        "Minimum relative change to qualify as improvement for Cell2location (0 = any improvement). "
        "ONLY USED BY CELL2LOCATION METHOD."
    ),
)
1474
+ cell2location_use_aggressive_training: bool = Field(
1475
+ False,
1476
+ description=(
1477
+ "Use train_aggressive() method for large-scale datasets in Cell2location. "
1478
+ "DEFAULT: False (standard train() method, following official tutorial). "
1479
+ "WHEN TO USE: Only for datasets with >50k locations that require mini-batch training due to GPU memory constraints. "
1480
+ "Standard Visium datasets (<50k locations) should use train_size=1 with batch_size=None (official best practice). "
1481
+ "Aggressive training implements amortised inference for scalability to 100k-1M+ locations. "
1482
+ "ONLY USED BY CELL2LOCATION METHOD."
1483
+ ),
1484
+ )
1485
+ cell2location_validation_size: Annotated[float, Field(gt=0, lt=1)] = Field(
1486
+ 0.1,
1487
+ description=(
1488
+ "Fraction of data for validation set in Cell2location (required if early_stopping=True). "
1489
+ "NOTE: Official tutorial uses train_size=1 (no validation split) for standard workflows. "
1490
+ "ONLY USED BY CELL2LOCATION METHOD."
1491
+ ),
1492
+ )
1493
+
1494
+ # SPOTlight specific parameters
1495
+ spotlight_n_top_genes: Annotated[int, Field(gt=0, le=5000)] = Field(
1496
+ 2000,
1497
+ description=(
1498
+ "Number of top highly variable genes (HVGs) to use for SPOTlight deconvolution. "
1499
+ "ONLY USED BY SPOTLIGHT METHOD. Other methods use different gene selection strategies:\n"
1500
+ "• Cell2location: Uses permissive gene filtering (apply_gene_filtering parameter)\n"
1501
+ "• RCTD/DestVI/Stereoscope/CARD/Tangram: Use all common genes between datasets\n"
1502
+ "Default: 2000. Recommended range: 1000-3000 for standard Visium data."
1503
+ ),
1504
+ )
1505
+ spotlight_nmf_model: Literal["ns"] = Field(
1506
+ "ns",
1507
+ description=(
1508
+ "NMF model type for SPOTlight. ONLY USED BY SPOTLIGHT METHOD.\n\n"
1509
+ "Currently only 'ns' (non-smooth NMF) is supported. This method produces "
1510
+ "sparser, more interpretable deconvolution results.\n\n"
1511
+ "NOTE: SPOTlight documentation mentions 'std' (standard NMF) as an option, "
1512
+ "but it is currently broken in SPOTlight (internally creates 'stdNMF' algorithm "
1513
+ "which doesn't exist in the NMF package registry). We only expose working parameters.\n\n"
1514
+ "Reference: Elosua-Bayes et al. (2021) Nucleic Acids Research."
1515
+ ),
1516
+ )
1517
+ spotlight_min_prop: Annotated[float, Field(ge=0, le=1)] = Field(
1518
+ 0.01,
1519
+ description=(
1520
+ "Minimum cell type proportion threshold for SPOTlight. "
1521
+ "Cell types contributing less than this value are filtered out as noise. "
1522
+ "Official default: 0.01 (1%). "
1523
+ "Lower values = keep more cell types but more noise. "
1524
+ "Higher values = stricter filtering but may lose rare cell types. "
1525
+ "ONLY USED BY SPOTLIGHT METHOD."
1526
+ ),
1527
+ )
1528
+ spotlight_scale: bool = Field(
1529
+ True,
1530
+ description=(
1531
+ "Whether to scale/normalize data in SPOTlight. "
1532
+ "Affects gene expression scale handling. "
1533
+ "Default: True (recommended). "
1534
+ "ONLY USED BY SPOTLIGHT METHOD."
1535
+ ),
1536
+ )
1537
+ spotlight_weight_id: str = Field(
1538
+ "mean.AUC",
1539
+ description=(
1540
+ "Column name for marker gene weights in SPOTlight. "
1541
+ "Specifies which metric to use for weighting marker genes. "
1542
+ "Common values: 'mean.AUC' (default), 'median.AUC'. "
1543
+ "ONLY USED BY SPOTLIGHT METHOD."
1544
+ ),
1545
+ )
1546
+
1547
+ # DestVI parameters
1548
+ destvi_n_epochs: Annotated[int, Field(gt=0)] = Field(
1549
+ 2000,
1550
+ description=(
1551
+ "Number of epochs for DestVI training. "
1552
+ "Official recommendation: 2000 (minimum 1000). "
1553
+ "ONLY USED BY DESTVI METHOD."
1554
+ ),
1555
+ )
1556
+ destvi_n_hidden: int = 128
1557
+ destvi_n_latent: int = 10
1558
+ destvi_n_layers: int = 1
1559
+ destvi_dropout_rate: float = 0.1
1560
+ destvi_learning_rate: float = 1e-3
1561
+
1562
+ # DestVI advanced parameters (official scvi-tools defaults)
1563
+ destvi_train_size: Annotated[float, Field(gt=0.0, le=1.0)] = Field(
1564
+ default=0.9,
1565
+ description=(
1566
+ "Fraction of data to use for training DestVI (rest for validation). "
1567
+ "Official scvi-tools default: 0.9. "
1568
+ "Lower values (0.8) provide more robust validation but less training data. "
1569
+ "ONLY USED BY DESTVI METHOD."
1570
+ ),
1571
+ )
1572
+ destvi_vamp_prior_p: Annotated[int, Field(ge=1)] = Field(
1573
+ default=15,
1574
+ description=(
1575
+ "Number of VampPrior components for DestVI. "
1576
+ "Official scvi-tools default: 15. "
1577
+ "Higher values may improve modeling of complex cell type distributions. "
1578
+ "ONLY USED BY DESTVI METHOD."
1579
+ ),
1580
+ )
1581
+ destvi_l1_reg: Annotated[float, Field(ge=0.0)] = Field(
1582
+ default=10.0,
1583
+ description=(
1584
+ "L1 regularization strength for DestVI to encourage sparsity. "
1585
+ "Official scvi-tools default: 10.0. "
1586
+ "Higher values encourage sparser cell type assignments per spot. "
1587
+ "ONLY USED BY DESTVI METHOD."
1588
+ ),
1589
+ )
1590
+
1591
+ # Stereoscope parameters
1592
+ stereoscope_n_epochs: int = 150000
1593
+ stereoscope_learning_rate: float = 0.01
1594
+ stereoscope_batch_size: int = 128
1595
+
1596
+ # RCTD specific parameters
1597
+ rctd_mode: Literal["full", "doublet", "multi"] = Field(
1598
+ "full",
1599
+ description=(
1600
+ "RCTD deconvolution mode (Cable et al. 2022):\n"
1601
+ "• 'doublet': Assigns 1-2 cell types per spot, classifies each as 'singlet' or 'doublet'. "
1602
+ "Recommended for HIGH-RESOLUTION spatial data (Slide-seq ~10μm, MERFISH, Visium HD)\n"
1603
+ "• 'full' (default): Assigns any number of cell types per spot. "
1604
+ "Recommended for LOW-RESOLUTION data (standard Visium 55μm spots, 100μm spacing)\n"
1605
+ "• 'multi': Extension of doublet mode using greedy algorithm to add multiple cell types. "
1606
+ "Alternative to 'full' with more constraints on cell type mixing"
1607
+ ),
1608
+ )
1609
+ max_cores: Annotated[int, Field(gt=0, le=16)] = 4 # Maximum number of cores to use
1610
+ rctd_confidence_threshold: Annotated[float, Field(gt=0)] = (
1611
+ 10.0 # Confidence threshold for cell type assignment (higher = more stringent)
1612
+ )
1613
+ rctd_doublet_threshold: Annotated[float, Field(gt=0)] = (
1614
+ 25.0 # Threshold for doublet detection (used in doublet/multi modes)
1615
+ )
1616
+ rctd_max_multi_types: Annotated[int, Field(ge=2, le=10)] = Field(
1617
+ 4,
1618
+ description=(
1619
+ "Maximum number of cell types per spot in RCTD multi mode. "
1620
+ "Recommended: 4-6 for Visium (100μm spots), 2-3 for higher resolution. "
1621
+ "Must be less than total number of cell types in reference data."
1622
+ ),
1623
+ )
1624
+
1625
+ # CARD specific parameters
1626
+ card_minCountGene: Annotated[int, Field(gt=0)] = Field(
1627
+ 100,
1628
+ description="Minimum total counts per gene across all spots for CARD quality control filtering",
1629
+ )
1630
+ card_minCountSpot: Annotated[int, Field(gt=0)] = Field(
1631
+ 5,
1632
+ description="Minimum number of spots where a gene must be expressed for CARD quality control",
1633
+ )
1634
+ card_sample_key: Optional[str] = Field(
1635
+ None,
1636
+ description="Optional sample/batch column name in reference data for multi-sample CARD analysis",
1637
+ )
1638
+ card_imputation: bool = Field(
1639
+ False,
1640
+ description=(
1641
+ "Enable CARD spatial imputation to create enhanced high-resolution spatial maps. "
1642
+ "CARD's unique CAR (Conditional AutoRegressive) model allows imputation at unmeasured locations, "
1643
+ "constructing refined tissue maps with arbitrarily higher resolution than the original measurement. "
1644
+ "Extremely fast: 0.4s for all genes (5816x faster than BayesSpace). "
1645
+ "Use for: Enhancing Visium to near-cellular resolution, filling tissue gaps, smoothing artifacts"
1646
+ ),
1647
+ )
1648
+ card_NumGrids: Annotated[int, Field(gt=0)] = Field(
1649
+ 2000,
1650
+ description=(
1651
+ "Number of spatial grid points for CARD imputation (default: 2000). "
1652
+ "Higher values = finer spatial resolution but increased computation. "
1653
+ "Typical values: 2000 (standard), 5000 (high-res), 10000 (ultra high-res). "
1654
+ "The imputed map will have ~NumGrids locations covering the tissue area"
1655
+ ),
1656
+ )
1657
+ card_ineibor: Annotated[int, Field(gt=0)] = Field(
1658
+ 10,
1659
+ description=(
1660
+ "Number of nearest neighbors for CARD spatial imputation (default: 10). "
1661
+ "Controls the spatial smoothness of imputed results. "
1662
+ "Higher values = smoother maps, lower values = preserve local variation"
1663
+ ),
1664
+ )
1665
+
1666
+ # Tangram specific parameters
1667
+ tangram_n_epochs: Annotated[int, Field(gt=0)] = Field(
1668
+ 1000,
1669
+ description=(
1670
+ "Number of epochs for Tangram spatial mapping. "
1671
+ "Official recommendation: 1000. "
1672
+ "ONLY USED BY TANGRAM METHOD."
1673
+ ),
1674
+ )
1675
+ tangram_mode: Literal["cells", "clusters", "constrained"] = Field(
1676
+ "cells",
1677
+ description=(
1678
+ "Tangram mapping mode. "
1679
+ "'cells': Cell-level mapping (default). "
1680
+ "'clusters': Cluster-level mapping (requires cluster_label). "
1681
+ "'constrained': Constrained optimization with target_count. "
1682
+ "Official recommendation: 'cells' for most applications. "
1683
+ "ONLY USED BY TANGRAM METHOD."
1684
+ ),
1685
+ )
1686
+ tangram_learning_rate: Annotated[float, Field(gt=0)] = Field(
1687
+ 0.1,
1688
+ description=(
1689
+ "Learning rate for Tangram optimizer. "
1690
+ "Official default: 0.1. "
1691
+ "Higher values = faster convergence but less stable. "
1692
+ "Lower values = more stable but slower. "
1693
+ "ONLY USED BY TANGRAM METHOD."
1694
+ ),
1695
+ )
1696
+ tangram_density_prior: Literal["rna_count_based", "uniform"] = Field(
1697
+ "rna_count_based",
1698
+ description=(
1699
+ "Spatial density prior for Tangram. "
1700
+ "'rna_count_based': Weight by RNA counts (default, recommended). "
1701
+ "'uniform': Equal weight for all spots. "
1702
+ "Official recommendation: 'rna_count_based' for better biological interpretation. "
1703
+ "ONLY USED BY TANGRAM METHOD."
1704
+ ),
1705
+ )
1706
+
1707
+ # FlashDeconv specific parameters (DEFAULT METHOD - ultra-fast, atlas-scale)
1708
+ flashdeconv_sketch_dim: Annotated[int, Field(gt=0, le=2048)] = Field(
1709
+ 512,
1710
+ description=(
1711
+ "Dimension of the sketched space for FlashDeconv. "
1712
+ "Higher values preserve more information but increase computation. "
1713
+ "Default: 512 (recommended for most datasets). "
1714
+ "ONLY USED BY FLASHDECONV METHOD."
1715
+ ),
1716
+ )
1717
+ flashdeconv_lambda_spatial: Annotated[float, Field(gt=0)] = Field(
1718
+ 5000.0,
1719
+ description=(
1720
+ "Spatial regularization strength for FlashDeconv. "
1721
+ "Higher values encourage smoother spatial patterns. "
1722
+ "Recommended values by platform:\n"
1723
+ "• Standard Visium (55μm): 1000-10000 (default: 5000)\n"
1724
+ "• Visium HD (16μm): 5000-20000\n"
1725
+ "• Visium HD (8μm): 10000-50000\n"
1726
+ "• Visium HD (2μm): 50000-100000\n"
1727
+ "• Stereo-seq/Seq-Scope: 50000-200000\n"
1728
+ "Use 'auto' for automatic tuning (may underestimate for real data). "
1729
+ "ONLY USED BY FLASHDECONV METHOD."
1730
+ ),
1731
+ )
1732
+ flashdeconv_n_hvg: Annotated[int, Field(gt=0, le=5000)] = Field(
1733
+ 2000,
1734
+ description=(
1735
+ "Number of highly variable genes to select for FlashDeconv. "
1736
+ "Default: 2000. "
1737
+ "ONLY USED BY FLASHDECONV METHOD."
1738
+ ),
1739
+ )
1740
+ flashdeconv_n_markers_per_type: Annotated[int, Field(gt=0, le=500)] = Field(
1741
+ 50,
1742
+ description=(
1743
+ "Number of marker genes per cell type for FlashDeconv. "
1744
+ "Default: 50. "
1745
+ "ONLY USED BY FLASHDECONV METHOD."
1746
+ ),
1747
+ )
1748
+
1749
+
1750
class SpatialDomainParameters(BaseModel):
    """Spatial domain identification parameters model"""

    # ---- Method selection ----
    # Algorithm used to identify spatial domains.
    method: Literal["spagcn", "leiden", "louvain", "stagate", "graphst"] = "spagcn"
    # Number of spatial domains to identify (validated to 1..50).
    n_domains: Annotated[int, Field(gt=0, le=50)] = 7

    # ---- SpaGCN ----
    # Weight given to histology in SpaGCN.
    spagcn_s: Annotated[float, Field(gt=0.0)] = 1.0
    # Area of each spot when extracting color intensity.
    spagcn_b: Annotated[int, Field(gt=0)] = 49
    # Percentage of total expression contributed by neighborhoods.
    spagcn_p: Annotated[float, Field(ge=0.0, le=1.0)] = 0.5
    # Whether SpaGCN should use the histology image.
    spagcn_use_histology: bool = True
    # Random seed for SpaGCN.
    spagcn_random_seed: int = 100

    # ---- General clustering ----
    # Resolution for leiden/louvain clustering.
    resolution: float = 0.5
    # Restrict the analysis to highly variable genes.
    use_highly_variable: bool = True
    # Refine spatial domains using spatial smoothing.
    refine_domains: bool = True
    # Refinement threshold: only relabel if >= threshold of neighbors differ
    # (0.5 = 50%, following SpaGCN).
    refinement_threshold: Annotated[float, Field(ge=0.0, le=1.0)] = 0.5

    # ---- leiden/louvain specifics (None = fall back to the noted default) ----
    # Number of neighbors for clustering (default: 15).
    cluster_n_neighbors: Optional[Annotated[int, Field(gt=0)]] = None
    # Weight for spatial information (default: 0.3).
    cluster_spatial_weight: Optional[Annotated[float, Field(ge=0.0, le=1.0)]] = None
    # Resolution parameter for clustering.
    cluster_resolution: Optional[float] = None

    # ---- STAGATE (None = fall back to the noted default) ----
    # Radius cutoff for spatial neighbors (default: 150).
    stagate_rad_cutoff: Optional[float] = None
    # Learning rate (default: 0.001).
    stagate_learning_rate: Optional[float] = None
    # Weight decay (default: 0.0001).
    stagate_weight_decay: Optional[float] = None
    # Number of training epochs (default: 1000).
    stagate_epochs: Optional[int] = None
    # Dimension of output representation (default: 15).
    stagate_dim_output: Optional[int] = None
    # Random seed (default: 42).
    stagate_random_seed: Optional[int] = None

    # ---- GraphST ----
    # Whether to use GPU acceleration.
    graphst_use_gpu: bool = False
    # Clustering method applied by GraphST.
    graphst_clustering_method: Literal["mclust", "leiden", "louvain"] = "leiden"
    # Whether to refine domains using spatial information.
    graphst_refinement: bool = True
    # Radius for spatial refinement.
    graphst_radius: int = 50
    # Random seed for GraphST.
    graphst_random_seed: int = 42
    # Number of clusters (if None, uses n_domains).
    graphst_n_clusters: Optional[int] = None

    # ---- Timeout ----
    # Timeout in seconds (default: 600).
    timeout: Optional[int] = None
1816
+
1817
+
1818
class SpatialVariableGenesParameters(BaseModel):
    """Spatial variable genes identification parameters model"""

    # ---- Method selection ----
    # Defaults to SPARK-X (noted as best accuracy).
    method: Literal["spatialde", "sparkx"] = "sparkx"

    # ---- Parameters shared by all methods ----
    # Number of top spatial variable genes to return (None = all significant).
    n_top_genes: Optional[Annotated[int, Field(gt=0, le=5000)]] = None
    # Key in obsm containing spatial coordinates.
    spatial_key: str = "spatial"

    # ---- SpatialDE ----
    # Whether data is already normalized.
    spatialde_normalized: bool = True
    # Kernel function type for SpatialDE.
    spatialde_kernel: str = "SE"
    # Optional fixed pi0 for q-value estimation; the numeric bounds (0, 1] are
    # declared on the Field itself and None leaves estimation adaptive.
    spatialde_pi0: Optional[float] = Field(
        None,
        gt=0.0,
        le=1.0,
        description=(
            "Prior probability of null hypothesis for SpatialDE q-value estimation. "
            "This represents the expected proportion of genes WITHOUT spatial patterns. "
            "\n\n"
            "VALUES:\n"
            "- None (default, RECOMMENDED): Uses adaptive pi0 estimation from SpatialDE\n"
            "- 0.9: Assumes 10% of genes have spatial patterns (conservative)\n"
            "- 0.5: Assumes 50% of genes have spatial patterns (moderate)\n"
            "- 0.1: Assumes 90% of genes have spatial patterns (aggressive, may increase false positives)\n"
            "\n"
            "SCIENTIFIC NOTE:\n"
            "The pi0 parameter directly affects the stringency of FDR correction. "
            "Lower pi0 values assume more genes are truly spatial, leading to more "
            "liberal q-value estimates and potentially more false positives. "
            "The default adaptive estimation (None) is recommended for most analyses "
            "as it learns pi0 from the data distribution."
        ),
    )

    # ---- SPARK-X ----
    # Percentage of total expression for filtering.
    sparkx_percentage: Annotated[float, Field(gt=0.0, le=1.0)] = 0.1
    # Minimum total counts per gene.
    sparkx_min_total_counts: Annotated[int, Field(gt=0)] = 10
    # Number of cores for parallel processing (validated to 1..16).
    sparkx_num_core: Annotated[int, Field(gt=0, le=16)] = 1
    # Kernel testing: "single" (faster) or "mixture" (11 kernels).
    sparkx_option: Literal["single", "mixture"] = "mixture"
    # Whether to print detailed R output.
    sparkx_verbose: bool = False

    # ---- Gene filtering ----
    # Filter mitochondrial genes (MT-*) - standard practice.
    filter_mt_genes: bool = True
    # Filter ribosomal genes (RPS*, RPL*) - optional, may remove housekeeping.
    filter_ribo_genes: bool = False
    # Test only highly variable genes - 2024 best practice for reducing
    # housekeeping dominance. Requires preprocessing with HVG detection first;
    # set to False to test all genes (not recommended).
    test_only_hvg: bool = True
    # Warn if >30% of top genes are housekeeping genes.
    warn_housekeeping: bool = True
1885
+
1886
+
1887
class CellCommunicationParameters(BaseModel):
    """Cell-cell communication analysis parameters model with explicit user control"""

    # ========== Basic Method Selection ==========
    # Available back ends:
    #   "liana"       - LIANA+ framework (Python, supports multiple resources)
    #   "cellphonedb" - CellPhoneDB v5 (Python)
    #   "cellchat_r"  - Native R CellChat (full features with mediator proteins & pathways)
    #   "fastccc"     - FastCCC permutation-free framework (Nature Comm 2025, ultra-fast)
    method: Literal["liana", "cellphonedb", "cellchat_r", "fastccc"] = "liana"

    # ========== Species and Resource Control ==========
    # REQUIRED (no default): species for the ligand-receptor database.
    #   "human"     - human data (genes like ACTB, GAPDH - all uppercase)
    #   "mouse"     - mouse data (genes like Actb, Gapdh - capitalized)
    #   "zebrafish" - zebrafish data
    species: Literal["human", "mouse", "zebrafish"]

    # LR database resource for LIANA (matches actual LIANA+ supported resources).
    liana_resource: Literal[
        # Default: consensus of multiple databases (recommended)
        "consensus",
        # Mouse consensus database
        "mouseconsensus",
        # Baccin et al. 2019 resource
        "baccin2019",
        # CellCall database
        "cellcall",
        # CellChat database
        "cellchatdb",
        # CellLinker database
        "cellinker",
        # CellPhoneDB database (curated, stringent)
        "cellphonedb",
        # CellTalkDB database (large)
        "celltalkdb",
        # Connectome database 2020
        "connectomedb2020",
        # EMBRACE database
        "embrace",
        # Guide to Pharmacology
        "guide2pharma",
        # Human Plasma Membrane Receptome
        "hpmr",
        # iCellNet database (immune focus)
        "icellnet",
        # iTALK database
        "italk",
        # Kirouac et al. 2010
        "kirouac2010",
        # LRdb database
        "lrdb",
        # Ramilowski et al. 2015
        "ramilowski2015",
    ] = "consensus"

    # ========== Spatial Analysis Control ==========
    # Whether to perform spatial bivariate analysis.
    perform_spatial_analysis: bool = True

    # ========== Cell Type Control ==========
    # REQUIRED (no default): which column to use for cell types (unified naming
    # with other tools). LLM will infer from metadata. Common values:
    # 'cell_type', 'celltype', 'leiden', 'louvain', 'seurat_clusters'.
    cell_type_key: str

    # ========== LIANA Specific Parameters ==========
    # Local spatial metric.
    liana_local_metric: Literal["cosine", "pearson", "spearman", "jaccard"] = "cosine"
    # Global spatial metric.
    liana_global_metric: Literal["morans", "lee"] = "morans"
    # Number of permutations for LIANA (1000 minimum for publication-quality p-values).
    liana_n_perms: Annotated[int, Field(gt=0)] = 1000
    # Minimum expression proportion.
    liana_nz_prop: Annotated[float, Field(gt=0.0, le=1.0)] = 0.2
    # Bandwidth for spatial connectivity.
    liana_bandwidth: Optional[int] = None
    # Cutoff for spatial connectivity.
    liana_cutoff: Annotated[float, Field(gt=0.0, le=1.0)] = 0.1
    liana_significance_alpha: Annotated[float, Field(gt=0.0, lt=1.0)] = Field(
        0.05,
        description=(
            "Significance threshold (alpha) for FDR-corrected p-values in LIANA analysis.\n"
            "Default: 0.05 (standard statistical threshold).\n"
            "Use 0.01 for more stringent filtering, 0.10 for exploratory analysis.\n"
            "This controls both cluster-level (magnitude_rank) and spatial (FDR-corrected) significance."
        ),
    )

    # ========== Expression Filtering Parameters ==========
    # Minimum cells expressing ligand or receptor (required by LIANA for
    # statistical validity).
    min_cells: Annotated[int, Field(ge=0)] = 3

    # ========== Result Control ==========
    # Number of top LR pairs to include in results (chord diagrams may use 50+).
    plot_top_pairs: Annotated[int, Field(gt=0, le=100)] = 6

    # ========== CellPhoneDB Specific Parameters ==========
    # Expression threshold.
    cellphonedb_threshold: Annotated[float, Field(gt=0.0, le=1.0)] = 0.1
    # Statistical permutations.
    cellphonedb_iterations: Annotated[int, Field(gt=0, le=10000)] = 1000
    # Result decimal precision.
    cellphonedb_result_precision: Annotated[int, Field(gt=0, le=5)] = 3
    # P-value significance threshold.
    cellphonedb_pvalue: Annotated[float, Field(gt=0.0, le=1.0)] = 0.05
    # Whether to use spatial microenvironments.
    cellphonedb_use_microenvironments: bool = True
    # Spatial radius for microenvironments.
    cellphonedb_spatial_radius: Optional[Annotated[float, Field(gt=0.0)]] = None
    # Random seed for reproducible results.
    cellphonedb_debug_seed: Optional[int] = None

    # Multiple testing correction for CellPhoneDB. When using the minimum
    # p-value across multiple cell type pairs, correction is needed to control
    # the false positive rate (e.g., 7 clusters = 49 pairs -> FPR 91.9% without
    # correction). Options:
    #   "fdr_bh"     - Benjamini-Hochberg FDR (default; recommended, balances
    #                  sensitivity & specificity)
    #   "bonferroni" - Bonferroni correction (most conservative, controls FWER)
    #   "sidak"      - Šidák correction (similar to Bonferroni but more accurate
    #                  for independent tests)
    #   "none"       - No correction (NOT recommended, leads to ~92% FPR with
    #                  7 clusters)
    cellphonedb_correction_method: Literal["fdr_bh", "bonferroni", "sidak", "none"] = "fdr_bh"

    # ========== CellChat R Specific Parameters ==========
    # Everything below in this section is only used when method="cellchat_r".

    # CellChatDB category to use:
    #   "Secreted Signaling" - ligand-receptor pairs for secreted signaling
    #   "ECM-Receptor"       - extracellular matrix-receptor interactions
    #   "Cell-Cell Contact"  - direct cell-cell contact interactions
    #   "All"                - use all categories (default)
    cellchat_db_category: Literal[
        "Secreted Signaling",
        "ECM-Receptor",
        "Cell-Cell Contact",
        "All",
    ] = "All"

    # CellChat expression aggregation method:
    #   "triMean"       - Tukey's trimean (robust, default, produces fewer interactions)
    #   "truncatedMean" - truncated mean (more interactions, use with trim parameter)
    # "thresholdedMean" and "median" are accepted as well.
    cellchat_type: Literal["triMean", "truncatedMean", "thresholdedMean", "median"] = "triMean"

    # Trim proportion for truncatedMean method (0.1 = 10% truncated mean).
    cellchat_trim: Annotated[float, Field(ge=0.0, le=0.5)] = 0.1

    # Whether to consider cell population size effect in communication probability.
    cellchat_population_size: bool = True

    # Minimum number of cells required in each cell group for filterCommunication.
    cellchat_min_cells: Annotated[int, Field(ge=1)] = 10

    # Whether to use spatial distance constraints (for spatial data).
    cellchat_distance_use: bool = True

    # Maximum interaction/diffusion range of ligands in microns (for spatial data).
    cellchat_interaction_range: Annotated[float, Field(gt=0.0)] = 250.0

    # Scale factor for distance calculation (adjust based on imaging technology).
    cellchat_scale_distance: Annotated[float, Field(gt=0.0)] = 0.01

    # Number of nearest neighbors for defining contact-dependent signaling; used
    # for spatial data to determine which cells are in contact range.
    cellchat_contact_knn_k: Annotated[int, Field(ge=1)] = 6

    # Alternative to contact_knn_k: explicit distance threshold for contact
    # signaling. If None, contact_knn_k is used instead (recommended for most
    # spatial data).
    cellchat_contact_range: Optional[Annotated[float, Field(gt=0.0)]] = None

    # CellChat spatial conversion factors (platform-specific).
    cellchat_pixel_ratio: Annotated[float, Field(gt=0.0)] = Field(
        0.5,
        description=(
            "Conversion factor from image pixels to micrometers (um).\n"
            "Platform-specific defaults:\n"
            " - Visium (10x): 0.5 (1 pixel ≈ 0.5 um at full resolution)\n"
            " - MERFISH: Varies by imaging setup, typically 0.1-1.0\n"
            " - Slide-seq: ~0.5 (10 um beads)\n"
            " - CosMx: 0.18 (imaging resolution)\n"
            "Used in CellChat's spatial.factors for coordinate conversion."
        ),
    )

    cellchat_spatial_tol: Annotated[float, Field(gt=0.0)] = Field(
        27.5,
        description=(
            "Spatial tolerance (half of spot/cell diameter) in micrometers.\n"
            "Platform-specific defaults:\n"
            " - Visium (10x): 27.5 um (spot diameter ~55um, half is ~27.5)\n"
            " - MERFISH: 5-10 um (single cell resolution)\n"
            " - Slide-seq: 5 um (10 um bead diameter / 2)\n"
            " - CosMx: 5-10 um (single cell resolution)\n"
            "Used in CellChat's spatial.factors.tol for defining spatial proximity."
        ),
    )

    # ========== FastCCC Specific Parameters ==========
    # FastCCC is a permutation-free framework using FFT-based convolution.
    # Reference: Nature Communications 2025 (https://github.com/Svvord/FastCCC)
    # Key advantage: Ultra-fast (16M cells in minutes vs hours for permutation methods)

    fastccc_single_unit_summary: Literal["Mean", "Median", "Q3", "Quantile_0.9"] = Field(
        "Mean",
        description=(
            "Aggregation method for single-unit gene expression within cell types.\n"
            "Options:\n"
            " - 'Mean': Mean expression (default, most commonly used)\n"
            " - 'Median': Median expression (robust to outliers)\n"
            " - 'Q3': Third quartile (75th percentile)\n"
            " - 'Quantile_0.9': 90th percentile (captures high expressors)"
        ),
    )

    fastccc_complex_aggregation: Literal["Minimum", "Average"] = Field(
        "Minimum",
        description=(
            "Aggregation method for multi-subunit protein complexes.\n"
            "Options:\n"
            " - 'Minimum': Use minimum expression (default, ensures all subunits present)\n"
            " - 'Average': Use average expression across subunits"
        ),
    )

    fastccc_lr_combination: Literal["Arithmetic", "Geometric"] = Field(
        "Arithmetic",
        description=(
            "Method for combining ligand and receptor scores.\n"
            "Options:\n"
            " - 'Arithmetic': Arithmetic mean of L and R (default)\n"
            " - 'Geometric': Geometric mean (more conservative)"
        ),
    )

    fastccc_min_percentile: Annotated[float, Field(ge=0.0, le=1.0)] = Field(
        0.1,
        description=(
            "Minimum expression percentile threshold for filtering lowly expressed genes.\n"
            "Default: 0.1 (10% of cells must express the gene)"
        ),
    )

    fastccc_use_cauchy: bool = Field(
        True,
        description=(
            "Whether to use Cauchy combination for multi-method aggregation.\n"
            "When True: Runs multiple parameter combinations and aggregates p-values\n"
            " using Cauchy distribution (more robust, slower)\n"
            "When False: Uses single parameter set (faster)"
        ),
    )

    fastccc_pvalue_threshold: Annotated[float, Field(gt=0.0, le=1.0)] = Field(
        0.05,
        description="P-value threshold for identifying significant interactions.",
    )

    fastccc_use_deg: bool = Field(
        False,
        description=(
            "Apply differential expression gene filtering before analysis.\n"
            "When True: Only analyze differentially expressed genes (more specific)\n"
            "When False: Analyze all expressed genes (default, more comprehensive)"
        ),
    )
2147
+
2148
+
2149
class EnrichmentParameters(BaseModel):
    """Parameters for gene set enrichment analysis"""

    # Unknown keyword arguments are rejected rather than silently ignored.
    model_config = ConfigDict(extra="forbid")

    # ---- Species (REQUIRED, no default value) ----
    # Must be given explicitly for gene set matching:
    #   "human"     - human data (genes like CD5L, PTPRC - all uppercase)
    #   "mouse"     - mouse data (genes like Cd5l, Ptprc - capitalize format)
    #   "zebrafish" - zebrafish data
    species: Literal["human", "mouse", "zebrafish"]

    # ---- Method selection ----
    # Enrichment method.
    method: Literal[
        "spatial_enrichmap",
        "pathway_gsea",
        "pathway_ora",
        "pathway_enrichr",
        "pathway_ssgsea",
    ] = "spatial_enrichmap"

    # ---- Gene sets ----
    # Gene sets to analyze: either a list of strings or a name -> gene-list mapping.
    gene_sets: Optional[Union[list[str], dict[str, list[str]]]] = None
    # Names for gene signatures.
    score_keys: Optional[Union[str, list[str]]] = None

    # ---- Gene set database (choose a species-appropriate option) ----
    gene_set_database: Optional[
        Literal[
            # Default (auto-adapts to species)
            "GO_Biological_Process",
            # GO molecular function terms
            "GO_Molecular_Function",
            # GO cellular component terms
            "GO_Cellular_Component",
            # KEGG pathways (species-specific: human=2021, mouse=2019)
            "KEGG_Pathways",
            # Reactome pathway database (2022 version)
            "Reactome_Pathways",
            # MSigDB hallmark gene sets (2020 version)
            "MSigDB_Hallmark",
            # Cell type marker genes
            "Cell_Type_Markers",
        ]
    ] = "GO_Biological_Process"

    # ---- Spatial parameters (for spatial_enrichmap) ----
    # Key for spatial coordinates.
    spatial_key: str = "spatial"
    # Number of spatial neighbors.
    n_neighbors: Annotated[int, Field(gt=0)] = 6
    # Whether to perform spatial smoothing.
    smoothing: bool = True
    # Whether to correct for spatial covariates.
    correct_spatial_covariates: bool = True

    # ---- Analysis parameters ----
    # Column for batch-wise normalization.
    batch_key: Optional[str] = None
    # Minimum genes in gene set.
    min_genes: Annotated[int, Field(gt=0)] = 10
    # Maximum genes in gene set.
    max_genes: Annotated[int, Field(gt=0)] = 500

    # ---- Statistical parameters ----
    # P-value cutoff (strictly between 0 and 1).
    pvalue_cutoff: Annotated[float, Field(gt=0.0, lt=1.0)] = 0.05
    # Multiple testing correction.
    adjust_method: Literal["bonferroni", "fdr", "none"] = "fdr"
    # Number of permutations for GSEA.
    n_permutations: Annotated[int, Field(gt=0)] = 1000
2208
+
2209
+
2210
class CNVParameters(BaseModel):
    """Copy Number Variation (CNV) analysis parameters model"""

    # --- Which CNV backend to run -------------------------------------
    method: Literal["infercnvpy", "numbat"] = Field(
        default="infercnvpy",
        description=(
            "CNV analysis method. 'infercnvpy': expression-based (default), "
            "'numbat': haplotype-aware (requires allele data)"
        ),
    )

    # --- Reference (normal) cells: both fields are required ------------
    reference_key: str = Field(
        ...,
        description=(
            "Column name in adata.obs containing cell type or cluster labels "
            "for identifying reference (normal) cells. Common values: "
            "'cell_type', 'leiden', 'louvain', 'seurat_clusters'"
        ),
    )
    reference_categories: list[str] = Field(
        ...,
        description=(
            "List of cell types/clusters to use as reference (normal) cells. "
            "These should be non-malignant cells like immune cells, fibroblasts, etc. "
            "Example: ['T cells', 'B cells', 'Macrophages']"
        ),
    )

    # --- infercnvpy sliding window --------------------------------------
    # window_size is validated to 1..500 and step to 1..100.
    window_size: Annotated[int, Field(gt=0, le=500)] = Field(
        default=100,
        description="Number of genes for CNV averaging window (default: 100)",
    )
    step: Annotated[int, Field(gt=0, le=100)] = Field(
        default=10,
        description="Step size for sliding window (default: 10)",
    )

    # --- General analysis options ---------------------------------------
    exclude_chromosomes: Optional[list[str]] = Field(
        default=None,
        description=(
            "Chromosomes to exclude from analysis (e.g., ['chrX', 'chrY', 'chrM'])"
        ),
    )
    # None is accepted by the annotation; a supplied number must be > 0.
    dynamic_threshold: Optional[float] = Field(
        default=1.5,
        gt=0.0,
        description="Threshold for dynamic CNV calling (default: 1.5)",
    )

    # --- Clustering and visualization options (infercnvpy) --------------
    cluster_cells: bool = Field(
        default=False,
        description="Whether to cluster cells by CNV pattern",
    )
    dendrogram: bool = Field(
        default=False,
        description="Whether to compute hierarchical clustering dendrogram",
    )

    # --- Numbat-only settings -------------------------------------------
    numbat_genome: Literal["hg38", "hg19", "mm10", "mm39"] = Field(
        default="hg38",
        description="Reference genome for Numbat (default: hg38)",
    )
    numbat_allele_data_key: str = Field(
        default="allele_counts",
        description="Layer name in adata containing allele count data",
    )
    numbat_t: Annotated[float, Field(gt=0.0, le=1.0)] = Field(
        default=0.15,
        description="Transition probability threshold (default: 0.15)",
    )
    numbat_max_entropy: Annotated[float, Field(gt=0.0, le=1.0)] = Field(
        default=0.8,
        description=(
            "Maximum entropy threshold. Use 0.8 for spatial data, "
            "0.5 for scRNA-seq (default: 0.8)"
        ),
    )
    numbat_min_cells: Annotated[int, Field(gt=0)] = Field(
        default=10,
        description="Minimum cells per CNV event (default: 10)",
    )
    # Core count is capped at 16 by the validator.
    numbat_ncores: Annotated[int, Field(gt=0, le=16)] = Field(
        default=1,
        description="Number of cores for parallel processing (default: 1)",
    )
    numbat_skip_nj: bool = Field(
        default=False,
        description="Skip neighbor-joining tree reconstruction (default: False)",
    )
2296
+
2297
+
2298
class RegistrationParameters(BaseModel):
    """Spatial registration parameters for aligning multiple tissue slices."""

    # --- Method and reference slice -------------------------------------
    method: Literal["paste", "stalign"] = Field(
        default="paste",
        description=(
            "Registration method. 'paste': Probabilistic Alignment of ST Experiments "
            "(optimal transport-based, recommended). 'stalign': STalign diffeomorphic "
            "mapping (LDDMM-based, for complex deformations)."
        ),
    )
    # None is allowed; a supplied index must be non-negative.
    reference_idx: Optional[int] = Field(
        default=None,
        ge=0,
        description="Index of reference slice (0-indexed). If None, uses first slice.",
    )

    # --- PASTE-only settings --------------------------------------------
    # The validator excludes 0 itself: the accepted range is (0, 1].
    paste_alpha: Annotated[float, Field(gt=0, le=1)] = Field(
        default=0.1,
        description=(
            "Spatial regularization parameter for PASTE (0-1). "
            "Higher values give more weight to spatial coordinates vs expression. "
            "Default: 0.1 (expression-dominated alignment)."
        ),
    )
    paste_n_components: Annotated[int, Field(gt=0, le=100)] = Field(
        default=30,
        description="Number of PCA components for PASTE center alignment (default: 30).",
    )
    paste_numItermax: Annotated[int, Field(gt=0, le=1000)] = Field(
        default=200,
        description="Maximum iterations for optimal transport solver (default: 200).",
    )

    # --- STalign-only settings ------------------------------------------
    stalign_image_size: tuple[int, int] = Field(
        default=(128, 128),
        description="Image size for STalign rasterization (height, width).",
    )
    stalign_niter: Annotated[int, Field(gt=0, le=500)] = Field(
        default=50,
        description="Number of LDDMM iterations for STalign (default: 50).",
    )
    stalign_a: Annotated[float, Field(gt=0)] = Field(
        default=500.0,
        description="Regularization parameter 'a' for STalign (default: 500).",
    )
    stalign_use_expression: bool = Field(
        default=True,
        description="Use gene expression for STalign intensity (vs uniform).",
    )

    # --- Shared by both methods -----------------------------------------
    use_gpu: bool = Field(
        default=False,
        description="Use GPU acceleration (PASTE with PyTorch backend, STalign).",
    )
2356
+
2357
+
2358
class ConditionComparisonParameters(BaseModel):
    """Parameters for multi-sample condition comparison analysis.

    This tool compares gene expression between experimental conditions (e.g., Treatment vs Control)
    across multiple biological samples, using proper statistical methods that account for
    sample-level variation.

    Key difference from find_markers:
    - find_markers: Compares cell types/clusters WITHIN a dataset (e.g., T cell vs B cell)
    - compare_conditions: Compares CONDITIONS ACROSS samples (e.g., Treatment vs Control)
    """

    # --- Required: what to compare and how samples are identified -------
    condition_key: str = Field(
        ...,
        description=(
            "Column name in adata.obs containing experimental conditions. "
            "Examples: 'treatment', 'condition', 'group', 'disease_state'"
        ),
    )
    condition1: str = Field(
        ...,
        description=(
            "First condition for comparison (typically the experimental/treatment group). "
            "Example: 'Treatment', 'Disease', 'Tumor'"
        ),
    )
    condition2: str = Field(
        ...,
        description=(
            "Second condition for comparison (typically the control/reference group). "
            "Example: 'Control', 'Healthy', 'Normal'"
        ),
    )
    sample_key: str = Field(
        ...,
        description=(
            "Column name in adata.obs identifying biological replicates/samples. "
            "This is CRITICAL for proper statistical inference - cells from the same sample "
            "are not independent observations. "
            "Examples: 'sample_id', 'patient_id', 'replicate', 'batch'"
        ),
    )

    # --- Optional: stratification and method ----------------------------
    cell_type_key: Optional[str] = Field(
        default=None,
        description=(
            "Column name in adata.obs for cell type annotations. "
            "If provided, differential expression is performed separately for each cell type, "
            "enabling cell type-specific condition effects. "
            "Examples: 'cell_type', 'leiden', 'cell_type_tangram'"
        ),
    )
    # Only 'pseudobulk' passes validation; the other names mentioned in the
    # description are listed there as not yet implemented.
    method: Literal["pseudobulk"] = Field(
        default="pseudobulk",
        description=(
            "Method for differential expression analysis.\n"
            "• 'pseudobulk' (default): Aggregate cells by sample, then use DESeq2\n"
            " - Best practice for multi-sample studies\n"
            " - Properly accounts for biological variation\n"
            " - Requires at least 2 samples per condition\n"
            "Future methods (not yet implemented):\n"
            "• 'cside': Cell type-Specific Inference of DE (from spacexr)\n"
            "• 'despace': Differential Spatial Patterns (from DESpace)"
        ),
    )

    # --- Output size and sample filtering -------------------------------
    n_top_genes: Annotated[int, Field(gt=0, le=500)] = Field(
        default=50,
        description="Number of top differentially expressed genes to return per comparison.",
    )
    min_cells_per_sample: Annotated[int, Field(gt=0)] = Field(
        default=10,
        description=(
            "Minimum number of cells per sample to include in analysis. "
            "Samples with fewer cells are excluded to ensure reliable aggregation."
        ),
    )
    # NOTE(review): the validator accepts 1 although the description states
    # that DESeq2 needs at least 2 samples per group - confirm whether a
    # stricter lower bound is intended.
    min_samples_per_condition: Annotated[int, Field(gt=0)] = Field(
        default=2,
        description=(
            "Minimum number of samples required per condition. "
            "DESeq2 requires at least 2 samples per group for variance estimation."
        ),
    )

    # --- Significance thresholds ----------------------------------------
    padj_threshold: Annotated[float, Field(gt=0, lt=1)] = Field(
        default=0.05,
        description="Adjusted p-value threshold for significance (default: 0.05).",
    )
    log2fc_threshold: Annotated[float, Field(ge=0)] = Field(
        default=0.0,
        description=(
            "Minimum absolute log2 fold change threshold. "
            "Set to 0 for no filtering, or e.g., 1.0 for 2-fold change minimum."
        ),
    )