spatialcore 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. spatialcore/__init__.py +122 -0
  2. spatialcore/annotation/__init__.py +253 -0
  3. spatialcore/annotation/acquisition.py +529 -0
  4. spatialcore/annotation/annotate.py +603 -0
  5. spatialcore/annotation/cellxgene.py +365 -0
  6. spatialcore/annotation/confidence.py +802 -0
  7. spatialcore/annotation/discovery.py +529 -0
  8. spatialcore/annotation/expression.py +363 -0
  9. spatialcore/annotation/loading.py +529 -0
  10. spatialcore/annotation/markers.py +297 -0
  11. spatialcore/annotation/ontology.py +1282 -0
  12. spatialcore/annotation/patterns.py +247 -0
  13. spatialcore/annotation/pipeline.py +620 -0
  14. spatialcore/annotation/synapse.py +380 -0
  15. spatialcore/annotation/training.py +1457 -0
  16. spatialcore/annotation/validation.py +422 -0
  17. spatialcore/core/__init__.py +34 -0
  18. spatialcore/core/cache.py +118 -0
  19. spatialcore/core/logging.py +135 -0
  20. spatialcore/core/metadata.py +149 -0
  21. spatialcore/core/utils.py +768 -0
  22. spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +86372 -0
  23. spatialcore/data/markers/canonical_markers.json +83 -0
  24. spatialcore/data/ontology_mappings/ontology_index.json +63865 -0
  25. spatialcore/plotting/__init__.py +109 -0
  26. spatialcore/plotting/benchmark.py +477 -0
  27. spatialcore/plotting/celltype.py +329 -0
  28. spatialcore/plotting/confidence.py +413 -0
  29. spatialcore/plotting/spatial.py +505 -0
  30. spatialcore/plotting/utils.py +411 -0
  31. spatialcore/plotting/validation.py +1342 -0
  32. spatialcore-0.1.9.dist-info/METADATA +213 -0
  33. spatialcore-0.1.9.dist-info/RECORD +36 -0
  34. spatialcore-0.1.9.dist-info/WHEEL +5 -0
  35. spatialcore-0.1.9.dist-info/licenses/LICENSE +201 -0
  36. spatialcore-0.1.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,802 @@
1
+ """
2
+ Confidence score transformation, filtering, and decision score storage.
3
+
4
+ This module provides utilities for:
5
+ 1. Transforming CellTypist decision scores to meaningful confidence values
6
+ 2. Storing decision score matrices in AnnData for downstream analysis
7
+ 3. Filtering cells by confidence or cell type count thresholds
8
+
9
+ For spatial transcriptomics, raw CellTypist confidence values may be less
10
+ informative than z-score transformed values, which capture how confident
11
+ a prediction is relative to other cell types.
12
+
13
+ References:
14
+ - CellTypist: https://www.celltypist.org/
15
+ - Domínguez Conde et al., Science (2022)
16
+ """
17
+
18
+ from typing import Dict, List, Literal, Optional, Tuple
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+ import anndata as ad
23
+ from scipy.special import softmax as scipy_softmax
24
+
25
+ from spatialcore.core.logging import get_logger
26
+
27
+ logger = get_logger(__name__)
28
+
29
+
30
+ # ============================================================================
31
+ # Confidence Transformation
32
+ # ============================================================================
33
+
34
ConfidenceMethod = Literal["raw", "zscore", "softmax", "minmax"]


def transform_confidence(
    decision_scores: np.ndarray,
    method: ConfidenceMethod = "zscore",
) -> np.ndarray:
    """
    Transform CellTypist decision scores into one confidence value per cell.

    For every cell (row) the winning type is the column with the highest
    decision score; the returned value describes that winning score
    according to ``method``.

    Parameters
    ----------
    decision_scores : np.ndarray
        Decision score matrix of shape (n_cells, n_types). Any 2D
        array-like (nested lists, ``pandas.DataFrame``) is accepted and
        converted with ``np.asarray``.
    method : {"raw", "zscore", "softmax", "minmax"}, default "zscore"
        Transformation method:

        - "raw": the winning score itself (may be negative/unbounded).
        - "zscore": sigmoid of the winning score's z-score within its row,
          ``sigmoid((winning_score - row_mean) / row_std)``.
        - "softmax": softmax probability of the winning type.
        - "minmax": per-row min-max scaled value of the winning type.

    Returns
    -------
    np.ndarray
        Array of shape (n_cells,) with one value per cell.

    Raises
    ------
    ValueError
        If ``decision_scores`` is not 2D, has fewer than 2 columns, or
        ``method`` is not one of the supported names.

    Notes
    -----
    Properties that follow directly from the formulas used below:

    - "zscore": the winning score is the row maximum, so its z-score is
      never negative and the result is always >= 0.5. With the population
      standard deviation (``np.std`` default) the z-score of a row maximum
      is at most ``sqrt(n_types - 1)``. The z-score is invariant to shifting
      and scaling a row, so it reflects the *shape* of the score
      distribution rather than the absolute margin: ``[5.0, 0.1, 0.1, 0.1]``
      gives ~0.850 while ``[5.0, 4.8, 4.7, 4.6]`` still gives ~0.821.
      A threshold of 0.5 therefore never rejects a cell.
    - "softmax": the winning probability is at least ``1 / n_types``.
    - "minmax": the scaled value of the row maximum is always 1.0, or 0.0
      when the row is (numerically) constant.
      NOTE(review): this makes "minmax" uninformative as a per-cell
      confidence - confirm whether another definition was intended.
    - Rows whose standard deviation (zscore) or range (minmax) is below
      1e-10 use a divisor of 1.0 to avoid division by zero.

    Examples
    --------
    >>> from spatialcore.annotation.confidence import transform_confidence
    >>> import numpy as np
    >>> scores = np.array([
    ...     [5.0, 0.1, 0.1, 0.1],   # One dominant type
    ...     [2.0, 1.8, 1.9, 1.7],   # Evenly spread scores
    ...     [2.0, 2.0, 2.0, 2.0],   # Constant row
    ... ])
    >>> conf = transform_confidence(scores, method="zscore")
    >>> print(f"{conf[0]:.3f} {conf[1]:.3f} {conf[2]:.3f}")
    0.850 0.793 0.500
    """
    # Accept array-likes so callers holding lists or DataFrames do not have
    # to convert first; a real ndarray passes through without copying.
    decision_scores = np.asarray(decision_scores)

    if decision_scores.ndim != 2:
        raise ValueError(
            f"Expected 2D array of shape (n_cells, n_types), "
            f"got shape {decision_scores.shape}"
        )

    n_cells, n_types = decision_scores.shape

    if n_types < 2:
        raise ValueError(
            f"Expected at least 2 cell types, got {n_types}"
        )

    # Winning type (row-wise argmax) and its score for each cell
    cell_idx = np.arange(n_cells)
    winning_idx = np.argmax(decision_scores, axis=1)
    winning_scores = decision_scores[cell_idx, winning_idx]

    if method == "raw":
        return winning_scores

    if method == "zscore":
        mean_scores = np.mean(decision_scores, axis=1)
        std_scores = np.std(decision_scores, axis=1)

        # Constant rows have std == 0; dividing by 1.0 yields z == 0 -> 0.5
        std_scores = np.where(std_scores < 1e-10, 1.0, std_scores)

        z_scores = (winning_scores - mean_scores) / std_scores

        # z_scores >= 0 here, so exp(-z) stays in (0, 1] and cannot overflow
        return 1 / (1 + np.exp(-z_scores))

    if method == "softmax":
        probs = scipy_softmax(decision_scores, axis=1)
        return probs[cell_idx, winning_idx]

    if method == "minmax":
        min_scores = np.min(decision_scores, axis=1, keepdims=True)
        max_scores = np.max(decision_scores, axis=1, keepdims=True)

        # Constant rows have zero range; dividing by 1.0 yields 0.0
        score_range = max_scores - min_scores
        score_range = np.where(score_range < 1e-10, 1.0, score_range)

        scaled = (decision_scores - min_scores) / score_range
        return scaled[cell_idx, winning_idx]

    raise ValueError(
        f"Unknown confidence method: {method}. "
        f"Expected one of: raw, zscore, softmax, minmax"
    )
169
+
170
+
171
+ # ============================================================================
172
+ # Decision Score Storage
173
+ # ============================================================================
174
+
175
def extract_decision_scores(
    adata: ad.AnnData,
    celltypist_result,
    key_added: str = "celltypist",
    copy: bool = False,
) -> ad.AnnData:
    """
    Store a CellTypist decision score matrix in AnnData.

    Reads ``celltypist_result.decision_matrix`` and writes it, cast to
    float32, to ``adata.obsm`` together with the matching cell type names
    in ``adata.uns``.

    Parameters
    ----------
    adata : AnnData
        AnnData object that was annotated with CellTypist.
    celltypist_result
        Object exposing a ``decision_matrix`` attribute (DataFrame or
        array). For a DataFrame the column names are used as cell type
        names; otherwise ``celltypist_result.cell_types`` is used when
        present, else generic names ``type_0, type_1, ...`` are generated.
    key_added : str, default "celltypist"
        Key prefix for stored results:

        - adata.obsm[f"{key_added}_decision_scores"]: Decision matrix
        - adata.uns[f"{key_added}_cell_types"]: Cell type names
    copy : bool, default False
        Whether to return a copy or modify in-place.

    Returns
    -------
    AnnData
        AnnData with decision scores stored in obsm.

    Raises
    ------
    ValueError
        If the result has no ``decision_matrix``, the matrix is not 2D, or
        its row count differs from ``adata.n_obs``.

    Notes
    -----
    Rows are matched to cells by position only; a DataFrame index is not
    compared against ``adata.obs_names``.

    All validation runs before ``adata`` is copied, so invalid input does
    not pay for a copy.

    Examples
    --------
    >>> import celltypist
    >>> from spatialcore.annotation.confidence import extract_decision_scores
    >>> result = celltypist.annotate(adata, model=model)
    >>> adata = extract_decision_scores(adata, result, key_added="ct")
    >>> scores = adata.obsm["ct_decision_scores"]
    >>> cell_types = adata.uns["ct_cell_types"]
    """
    if not hasattr(celltypist_result, "decision_matrix"):
        raise ValueError(
            "CellTypist result does not have decision_matrix. "
            "Ensure you passed the result from celltypist.annotate()."
        )

    decision_matrix = celltypist_result.decision_matrix

    if isinstance(decision_matrix, pd.DataFrame):
        cell_types = list(decision_matrix.columns)
        decision_array = decision_matrix.values
    else:
        # Objects that already expose .shape are used as-is; plain
        # sequences (e.g. nested lists) are converted to an ndarray.
        if hasattr(decision_matrix, "shape"):
            decision_array = decision_matrix
        else:
            decision_array = np.asarray(decision_matrix)

        if len(decision_array.shape) != 2:
            raise ValueError(
                f"Decision matrix must be 2D (n_cells, n_types), "
                f"got shape {tuple(decision_array.shape)}"
            )

        if hasattr(celltypist_result, "cell_types"):
            cell_types = list(celltypist_result.cell_types)
        else:
            # Fallback: generate generic names
            n_types = decision_array.shape[1]
            cell_types = [f"type_{i}" for i in range(n_types)]
            logger.warning(
                "Could not extract cell type names from result. "
                "Using generic names: type_0, type_1, ..."
            )

    if decision_array.shape[0] != adata.n_obs:
        raise ValueError(
            f"Decision matrix has {decision_array.shape[0]} cells, "
            f"but AnnData has {adata.n_obs} cells. "
            f"Ensure the CellTypist result matches the AnnData object."
        )

    # Names that do not line up with columns would mislabel every score
    # downstream; surface it without changing what gets stored.
    if len(cell_types) != decision_array.shape[1]:
        logger.warning(
            f"Got {len(cell_types)} cell type names for "
            f"{decision_array.shape[1]} decision score columns."
        )

    if copy:
        adata = adata.copy()

    adata.obsm[f"{key_added}_decision_scores"] = decision_array.astype(np.float32)
    adata.uns[f"{key_added}_cell_types"] = cell_types

    logger.info(
        f"Stored decision scores: {decision_array.shape[0]:,} cells × "
        f"{decision_array.shape[1]} types in adata.obsm['{key_added}_decision_scores']"
    )

    return adata
279
+
280
+
281
+ # ============================================================================
282
+ # Confidence Filtering
283
+ # ============================================================================
284
+
285
def filter_low_confidence(
    adata: ad.AnnData,
    label_column: str,
    confidence_column: str,
    threshold: float = 0.5,
    unassigned_label: str = "Unassigned",
    copy: bool = False,
) -> ad.AnnData:
    """
    Relabel cells whose confidence is below a threshold.

    Every cell with ``adata.obs[confidence_column] < threshold`` gets
    ``unassigned_label`` written into ``adata.obs[label_column]``.

    Parameters
    ----------
    adata : AnnData
        AnnData object with cell type labels and confidence values.
    label_column : str
        Column in adata.obs containing cell type labels.
    confidence_column : str
        Column in adata.obs containing confidence values.
    threshold : float, default 0.5
        Cells with confidence strictly below this value are relabelled.
    unassigned_label : str, default "Unassigned"
        Label written for low-confidence cells.
    copy : bool, default False
        Whether to return a copy or modify in-place.

    Returns
    -------
    AnnData
        AnnData with low-confidence cells marked as unassigned.

    Raises
    ------
    ValueError
        If ``label_column`` or ``confidence_column`` is missing from
        ``adata.obs``.

    Notes
    -----
    When at least one cell is relabelled, the label column is converted
    with ``astype(str)`` first; when no cell is below the threshold the
    column is left untouched.

    The labels are overwritten in place in the returned object. Keep the
    originals in a separate column beforehand (or use ``copy=True`` and
    retain the input object) if they are needed later.

    Examples
    --------
    >>> from spatialcore.annotation.confidence import filter_low_confidence
    >>> adata = filter_low_confidence(
    ...     adata,
    ...     label_column="celltypist_prediction",
    ...     confidence_column="celltypist_confidence",
    ...     threshold=0.6,
    ... )
    >>> n_unassigned = (adata.obs["celltypist_prediction"] == "Unassigned").sum()
    >>> print(f"Marked {n_unassigned:,} cells as Unassigned")
    """
    if copy:
        adata = adata.copy()

    # Both columns must be present; the label column is checked first.
    required = ((label_column, "Label"), (confidence_column, "Confidence"))
    for column, kind in required:
        if column not in adata.obs.columns:
            raise ValueError(
                f"{kind} column '{column}' not found in adata.obs. "
                f"Available: {list(adata.obs.columns)}"
            )

    below_threshold = adata.obs[confidence_column].values < threshold
    n_flagged = below_threshold.sum()

    # Nothing to relabel: leave the label column exactly as it is.
    if not n_flagged > 0:
        logger.info(f"No cells below confidence threshold {threshold}")
        return adata

    # Convert to plain strings so the new label can always be assigned
    as_text = adata.obs[label_column].astype(str)
    adata.obs[label_column] = as_text
    adata.obs.loc[below_threshold, label_column] = unassigned_label

    pct = 100 * n_flagged / adata.n_obs
    logger.info(
        f"Marked {n_flagged:,} cells ({pct:.1f}%) as '{unassigned_label}' "
        f"(confidence < {threshold})"
    )

    return adata
377
+
378
+
379
def filter_low_count_types(
    adata: ad.AnnData,
    label_column: str,
    min_cells: int = 15,
    unassigned_label: str = "Low_count",
    copy: bool = False,
) -> ad.AnnData:
    """
    Relabel every cell that belongs to a cell type with too few members.

    Cell types observed in fewer than ``min_cells`` cells have all of their
    cells relabelled with ``unassigned_label``.

    Parameters
    ----------
    adata : AnnData
        AnnData object with cell type labels.
    label_column : str
        Column in adata.obs containing cell type labels.
    min_cells : int, default 15
        Minimum cells required for a cell type. Types with fewer cells
        are marked with unassigned_label.
    unassigned_label : str, default "Low_count"
        Label to assign to cells of rare types.
    copy : bool, default False
        Whether to return a copy or modify in-place.

    Returns
    -------
    AnnData
        AnnData with rare cell types marked.

    Raises
    ------
    ValueError
        If ``label_column`` is missing from ``adata.obs``.

    Notes
    -----
    This is different from filter_low_confidence:

    - filter_low_confidence: Marks individual cells with low prediction confidence
    - filter_low_count_types: Marks entire cell types that have too few members

    Only types that actually occur are considered. For a categorical label
    column, ``value_counts()`` also reports unobserved categories with a
    count of 0; those are ignored so they neither inflate the reported
    number of rare types nor trigger a relabel when no cell is affected.

    When at least one type is rare, the label column is converted with
    ``astype(str)`` before the new label is written.

    Examples
    --------
    >>> from spatialcore.annotation.confidence import filter_low_count_types
    >>> adata = filter_low_count_types(
    ...     adata,
    ...     label_column="celltypist_prediction",
    ...     min_cells=20,
    ...     unassigned_label="Rare_type",
    ... )
    >>> print(adata.obs["celltypist_prediction"].value_counts())
    """
    # Validate before copying so a bad column name does not cost a copy
    if label_column not in adata.obs.columns:
        raise ValueError(
            f"Label column '{label_column}' not found in adata.obs. "
            f"Available: {list(adata.obs.columns)}"
        )

    if copy:
        adata = adata.copy()

    # Count cells per type, dropping categories that have no cells at all
    type_counts = adata.obs[label_column].value_counts()
    observed_counts = type_counts[type_counts > 0]
    low_count_types = observed_counts[observed_counts < min_cells].index.tolist()

    if not low_count_types:
        logger.info(f"No cell types with fewer than {min_cells} cells")
        return adata

    # Build the mask from the original column before it is converted
    low_count_mask = adata.obs[label_column].isin(low_count_types)
    n_affected = low_count_mask.sum()

    # Convert to plain strings so the new label can always be assigned
    adata.obs[label_column] = adata.obs[label_column].astype(str)
    adata.obs.loc[low_count_mask, label_column] = unassigned_label

    pct = 100 * n_affected / adata.n_obs
    logger.info(
        f"Marked {n_affected:,} cells ({pct:.1f}%) from "
        f"{len(low_count_types)} rare types as '{unassigned_label}' "
        f"(types with < {min_cells} cells)"
    )
    logger.info(f"  Affected types: {low_count_types[:5]}{'...' if len(low_count_types) > 5 else ''}")

    return adata
469
+
470
+
471
def compute_confidence_from_obsm(
    adata: ad.AnnData,
    decision_scores_key: str = "celltypist_decision_scores",
    method: ConfidenceMethod = "zscore",
    confidence_column: str = "confidence_transformed",
    copy: bool = False,
) -> ad.AnnData:
    """
    Compute transformed confidence from stored decision scores.

    Reads the decision score matrix from ``adata.obsm``, passes it to
    ``transform_confidence`` and stores the per-cell result in
    ``adata.obs[confidence_column]``.

    Parameters
    ----------
    adata : AnnData
        AnnData with decision scores in obsm.
    decision_scores_key : str, default "celltypist_decision_scores"
        Key in adata.obsm containing decision score matrix.
    method : {"raw", "zscore", "softmax", "minmax"}, default "zscore"
        Transformation method (see transform_confidence).
    confidence_column : str, default "confidence_transformed"
        Column name in adata.obs for output confidence values.
    copy : bool, default False
        Whether to return a copy or modify in-place.

    Returns
    -------
    AnnData
        AnnData with confidence values in adata.obs[confidence_column].

    Raises
    ------
    ValueError
        If ``decision_scores_key`` is not present in ``adata.obsm``, or if
        ``transform_confidence`` rejects the matrix or the method.

    Examples
    --------
    >>> from spatialcore.annotation.confidence import compute_confidence_from_obsm
    >>> # Assuming decision scores are already stored
    >>> adata = compute_confidence_from_obsm(adata, method="zscore")
    >>> print(adata.obs["confidence_transformed"].describe())
    """
    # Validate before copying so a missing key does not cost a copy
    if decision_scores_key not in adata.obsm:
        raise ValueError(
            f"Decision scores key '{decision_scores_key}' not found in adata.obsm. "
            f"Available: {list(adata.obsm.keys())}. "
            f"Run extract_decision_scores() first."
        )

    if copy:
        adata = adata.copy()

    # obsm entries may be DataFrames; transform_confidence indexes
    # positionally, so hand it a plain ndarray.
    decision_scores = np.asarray(adata.obsm[decision_scores_key])
    confidence = transform_confidence(decision_scores, method=method)
    adata.obs[confidence_column] = confidence

    logger.info(
        f"Computed {method} confidence in adata.obs['{confidence_column}'] "
        f"(mean={confidence.mean():.3f}, std={confidence.std():.3f})"
    )

    return adata
529
+
530
+
531
+ # ============================================================================
532
+ # Dual-Threshold Marker Validation
533
+ # ============================================================================
534
+
535
def filter_by_marker_validation(
    adata: ad.AnnData,
    label_column: str,
    confidence_column: str,
    canonical_markers: Optional[Dict[str, List[str]]] = None,
    confidence_threshold: float = 0.5,
    n_components: int = 3,
    min_cells_per_type: int = 15,
    unassigned_label: str = "Unassigned",
    copy: bool = False,
) -> Tuple[ad.AnnData, pd.DataFrame]:
    """
    Filter cells using BOTH a confidence threshold AND GMM marker validation.

    A cell keeps its label only if its confidence is at least
    ``confidence_threshold`` and its mean canonical-marker expression is at
    least the threshold returned by ``classify_by_threshold`` for its cell
    type. All other cells receive ``unassigned_label`` in the validated
    label column.

    Parameters
    ----------
    adata : AnnData
        AnnData object with cell type labels and confidence values.
    label_column : str
        Column in adata.obs containing cell type labels.
    confidence_column : str
        Column in adata.obs containing confidence values.
    canonical_markers : Dict[str, List[str]], optional
        Dictionary mapping cell types to marker gene lists. If None, the
        markers are loaded with ``load_canonical_markers()``; if loading
        fails a warning is logged and no markers are used.
    confidence_threshold : float, default 0.5
        Minimum confidence threshold (inclusive).
    n_components : int, default 3
        Number of GMM components passed to ``classify_by_threshold``.
    min_cells_per_type : int, default 15
        Types with fewer cells are not validated; their cells fail marker
        validation and are therefore marked unassigned.
    unassigned_label : str, default "Unassigned"
        Label to assign to cells that fail validation.
    copy : bool, default False
        Whether to return a copy or modify in-place.

    Returns
    -------
    Tuple[AnnData, pd.DataFrame]
        - AnnData with these columns added to ``obs``:
            - `{label_column}_validated`: Final validated labels (categorical)
            - `marker_score`: Mean expression of canonical markers
            - `marker_passes_gmm`: Whether cell passes GMM marker threshold
            - `confidence_passes`: Whether cell passes confidence threshold
            - `validation_pass`: Whether cell passes both thresholds
        - Summary DataFrame, one row per evaluated cell type, sorted by
          ``n_cells`` descending, always with the columns ``cell_type``,
          ``n_cells``, ``has_markers``, ``n_markers``, ``markers`` (first
          three markers), ``gmm_threshold``, ``n_pass_confidence``,
          ``n_pass_marker``, ``n_pass_both`` and ``pct_pass``.

    Raises
    ------
    ValueError
        If ``label_column`` or ``confidence_column`` is missing from
        ``adata.obs``.
    ImportError
        If ``spatialcore.stats.classify`` cannot be imported.

    Notes
    -----
    - Labels are compared as strings. Labels equal (case-insensitively) to
      "unassigned", "unknown" or "low_count" are not evaluated; their cells
      never pass marker validation.
    - Markers are looked up by exact cell type name first, then
      case-insensitively. Types without any marker present in
      ``adata.var_names`` pass marker validation automatically, as do types
      for which the GMM fit raises.
    - Expression is read from ``adata.X``.
    - Input validation and the ``classify_by_threshold`` import run before
      ``adata`` is copied, so failures do not pay for a copy.

    Examples
    --------
    >>> from spatialcore.annotation.confidence import filter_by_marker_validation
    >>> adata, summary = filter_by_marker_validation(
    ...     adata,
    ...     label_column="celltypist",
    ...     confidence_column="celltypist_confidence_transformed",
    ...     confidence_threshold=0.5,
    ... )
    >>> print(summary[["cell_type", "n_cells", "pct_pass"]])

    See Also
    --------
    spatialcore.stats.classify_by_threshold : GMM-3 thresholding function.
    spatialcore.plotting.validation.plot_2d_validation : 2D validation plot.
    """
    # Both columns must be present; the label column is checked first.
    for column, kind in ((label_column, "Label"), (confidence_column, "Confidence")):
        if column not in adata.obs.columns:
            raise ValueError(
                f"{kind} column '{column}' not found in adata.obs. "
                f"Available: {list(adata.obs.columns)}"
            )

    # Fail fast if the GMM helper is unavailable.
    # NOTE(review): the file listing of the 0.1.9 wheel shown with this
    # source contains no spatialcore/stats package, so this import
    # presumably fails for that release - confirm.
    try:
        from spatialcore.stats.classify import classify_by_threshold
    except ImportError as err:
        raise ImportError(
            "spatialcore.stats.classify is required for GMM-3 validation. "
            "Ensure the stats module is properly installed."
        ) from err

    if copy:
        adata = adata.copy()

    # Load canonical markers if not provided (best effort: on failure every
    # type is treated as having no markers)
    if canonical_markers is None:
        try:
            from spatialcore.annotation.markers import load_canonical_markers
            canonical_markers = load_canonical_markers()
            logger.info(f"Loaded canonical markers for {len(canonical_markers)} cell types")
        except Exception as e:
            logger.warning(f"Could not load canonical markers: {e}")
            canonical_markers = {}

    # Case-insensitive lookup table; setdefault keeps the first entry per
    # lowercase name, matching a first-match scan over the dict.
    markers_by_lower_name = {}
    for marker_type, markers in canonical_markers.items():
        markers_by_lower_name.setdefault(marker_type.lower(), markers)

    summary_columns = [
        "cell_type",
        "n_cells",
        "has_markers",
        "n_markers",
        "markers",
        "gmm_threshold",
        "n_pass_confidence",
        "n_pass_marker",
        "n_pass_both",
        "pct_pass",
    ]

    # Per-cell result arrays
    n_cells = adata.n_obs
    marker_scores = np.zeros(n_cells)
    marker_passes = np.zeros(n_cells, dtype=bool)
    confidence_passes = adata.obs[confidence_column].values >= confidence_threshold

    # String labels are computed once and reused for every cell type
    labels = adata.obs[label_column].astype(str)
    skipped_labels = {"unassigned", "unknown", "low_count"}
    cell_types = [ct for ct in labels.unique() if ct.lower() not in skipped_labels]

    summary_rows = []

    for cell_type in cell_types:
        type_mask = (labels == cell_type).to_numpy()
        n_type_cells = int(type_mask.sum())
        n_pass_conf = int((type_mask & confidence_passes).sum())

        # Defaults describe a type that was not marker-validated
        row = {
            "cell_type": cell_type,
            "n_cells": n_type_cells,
            "has_markers": False,
            "n_markers": 0,
            "markers": [],
            "gmm_threshold": np.nan,
            "n_pass_confidence": n_pass_conf,
            "n_pass_marker": 0,
            "n_pass_both": 0,
            "pct_pass": 0.0,
        }

        # Too few cells: marker_passes stays False, so these cells fail
        if n_type_cells < min_cells_per_type:
            logger.debug(f"Skipping {cell_type}: only {n_type_cells} cells (< {min_cells_per_type})")
            summary_rows.append(row)
            continue

        # Exact name match first, then case-insensitive match
        markers_for_type = (
            canonical_markers.get(cell_type)
            or markers_by_lower_name.get(cell_type.lower(), [])
        )
        available_markers = [m for m in markers_for_type if m in adata.var_names]

        if not available_markers:
            logger.debug(f"No canonical markers found for {cell_type}")
            # Without markers only the confidence threshold applies
            marker_passes[type_mask] = True
            row.update(
                n_pass_marker=n_type_cells,
                n_pass_both=n_pass_conf,
                pct_pass=100 * n_pass_conf / n_type_cells,
            )
            summary_rows.append(row)
            continue

        # Mean marker expression per cell of this type (densified if sparse)
        expr_matrix = adata[type_mask, available_markers].X
        if hasattr(expr_matrix, "toarray"):
            expr_matrix = expr_matrix.toarray()
        mean_marker_expr = np.asarray(np.mean(expr_matrix, axis=1)).ravel()
        marker_scores[type_mask] = mean_marker_expr

        # Fit the GMM threshold on this type's marker scores
        try:
            threshold_result = classify_by_threshold(
                mean_marker_expr,
                n_components=n_components,
                method="gmm",
            )
            gmm_threshold = threshold_result["threshold"]
            marker_passes[type_mask] = mean_marker_expr >= gmm_threshold
        except Exception as e:
            # Best effort: a failed fit must not discard the whole type
            logger.warning(f"GMM fitting failed for {cell_type}: {e}. Marking all as pass.")
            marker_passes[type_mask] = True
            gmm_threshold = np.nan

        n_pass_marker = int(marker_passes[type_mask].sum())
        n_pass_both = int((type_mask & confidence_passes & marker_passes).sum())

        row.update(
            has_markers=True,
            n_markers=len(available_markers),
            markers=available_markers[:3],  # First 3 for display
            gmm_threshold=gmm_threshold,
            n_pass_marker=n_pass_marker,
            n_pass_both=n_pass_both,
            pct_pass=100 * n_pass_both / n_type_cells,
        )
        summary_rows.append(row)

    # Store validation results in adata
    adata.obs["marker_score"] = marker_scores
    adata.obs["marker_passes_gmm"] = marker_passes
    adata.obs["confidence_passes"] = confidence_passes
    adata.obs["validation_pass"] = confidence_passes & marker_passes

    # Validated labels: original label where both checks pass
    validated_labels = labels.copy()
    fail_mask = ~adata.obs["validation_pass"]
    n_failed = fail_mask.sum()

    if n_failed > 0:
        validated_labels[fail_mask] = unassigned_label
        pct_failed = 100 * n_failed / n_cells
        logger.info(
            f"Dual-threshold validation: {n_failed:,} cells ({pct_failed:.1f}%) "
            f"marked as '{unassigned_label}'"
        )

    adata.obs[f"{label_column}_validated"] = pd.Categorical(validated_labels)

    # Fixed column set so the summary has the same schema for every input
    summary_df = pd.DataFrame(summary_rows, columns=summary_columns)
    if len(summary_df) > 0:
        summary_df = summary_df.sort_values("n_cells", ascending=False).reset_index(drop=True)

    # Log overall summary (guard the percentage for an empty AnnData)
    n_pass_total = adata.obs["validation_pass"].sum()
    pct_pass_total = 100 * n_pass_total / n_cells if n_cells > 0 else 0.0
    logger.info(
        f"Validation complete: {n_pass_total:,}/{n_cells:,} cells "
        f"({pct_pass_total:.1f}%) passed dual-threshold QC"
    )

    return adata, summary_df