spatialcore 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spatialcore/__init__.py +122 -0
- spatialcore/annotation/__init__.py +253 -0
- spatialcore/annotation/acquisition.py +529 -0
- spatialcore/annotation/annotate.py +603 -0
- spatialcore/annotation/cellxgene.py +365 -0
- spatialcore/annotation/confidence.py +802 -0
- spatialcore/annotation/discovery.py +529 -0
- spatialcore/annotation/expression.py +363 -0
- spatialcore/annotation/loading.py +529 -0
- spatialcore/annotation/markers.py +297 -0
- spatialcore/annotation/ontology.py +1282 -0
- spatialcore/annotation/patterns.py +247 -0
- spatialcore/annotation/pipeline.py +620 -0
- spatialcore/annotation/synapse.py +380 -0
- spatialcore/annotation/training.py +1457 -0
- spatialcore/annotation/validation.py +422 -0
- spatialcore/core/__init__.py +34 -0
- spatialcore/core/cache.py +118 -0
- spatialcore/core/logging.py +135 -0
- spatialcore/core/metadata.py +149 -0
- spatialcore/core/utils.py +768 -0
- spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +86372 -0
- spatialcore/data/markers/canonical_markers.json +83 -0
- spatialcore/data/ontology_mappings/ontology_index.json +63865 -0
- spatialcore/plotting/__init__.py +109 -0
- spatialcore/plotting/benchmark.py +477 -0
- spatialcore/plotting/celltype.py +329 -0
- spatialcore/plotting/confidence.py +413 -0
- spatialcore/plotting/spatial.py +505 -0
- spatialcore/plotting/utils.py +411 -0
- spatialcore/plotting/validation.py +1342 -0
- spatialcore-0.1.9.dist-info/METADATA +213 -0
- spatialcore-0.1.9.dist-info/RECORD +36 -0
- spatialcore-0.1.9.dist-info/WHEEL +5 -0
- spatialcore-0.1.9.dist-info/licenses/LICENSE +201 -0
- spatialcore-0.1.9.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,802 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Confidence score transformation, filtering, and decision score storage.
|
|
3
|
+
|
|
4
|
+
This module provides utilities for:
|
|
5
|
+
1. Transforming CellTypist decision scores to meaningful confidence values
|
|
6
|
+
2. Storing decision score matrices in AnnData for downstream analysis
|
|
7
|
+
3. Filtering cells by confidence or cell type count thresholds
|
|
8
|
+
|
|
9
|
+
For spatial transcriptomics, raw CellTypist confidence values may be less
|
|
10
|
+
informative than z-score transformed values, which capture how confident
|
|
11
|
+
a prediction is relative to other cell types.
|
|
12
|
+
|
|
13
|
+
References:
|
|
14
|
+
- CellTypist: https://www.celltypist.org/
|
|
15
|
+
- Domínguez Conde et al., Science (2022)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from typing import Dict, List, Literal, Optional, Tuple
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
import pandas as pd
|
|
22
|
+
import anndata as ad
|
|
23
|
+
from scipy.special import softmax as scipy_softmax
|
|
24
|
+
|
|
25
|
+
from spatialcore.core.logging import get_logger
|
|
26
|
+
|
|
27
|
+
logger = get_logger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ============================================================================
|
|
31
|
+
# Confidence Transformation
|
|
32
|
+
# ============================================================================
|
|
33
|
+
|
|
34
|
+
# Names of the confidence transforms accepted by the ``method`` arguments
# of the functions in this module.
ConfidenceMethod = Literal["raw", "zscore", "softmax", "minmax"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def transform_confidence(
    decision_scores: np.ndarray,
    method: ConfidenceMethod = "zscore",
) -> np.ndarray:
    """
    Transform per-cell decision scores into one confidence value per cell.

    For every row (cell) the winning type is the column with the highest
    score; the returned value describes that winning score according to
    ``method``.

    Parameters
    ----------
    decision_scores : np.ndarray or array-like
        Decision score matrix of shape (n_cells, n_types). Anything
        ``np.asarray`` can turn into a 2D numeric array is accepted
        (ndarray, nested lists, DataFrame). ndarray inputs are used as-is,
        without copying or changing dtype.
    method : {"raw", "zscore", "softmax", "minmax"}, default "zscore"
        Transformation method:

        - "raw": the winning score itself (may be negative/unbounded).
        - "zscore": sigmoid of the winning score's z-score within its row,
          ``sigmoid((winning_score - row_mean) / row_std)``.
        - "softmax": softmax probability of the winning type within its row.
        - "minmax": the winning score after per-row min-max scaling.

    Returns
    -------
    np.ndarray
        Array of shape (n_cells,) with one value per cell.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names, if the input is not
        2D, or if it has fewer than 2 columns.

    Notes
    -----
    Output ranges that follow directly from the formulas:

    - "zscore": the row maximum is never below the row mean, so the z-score
      is >= 0 and the result is >= 0.5. With the population standard
      deviation used here, the z-score of a single value among ``n_types``
      values is at most ``sqrt(n_types - 1)``, so the result is at most
      ``sigmoid(sqrt(n_types - 1))``. A row whose standard deviation is below
      1e-10 (all scores equal) yields exactly 0.5.
    - "zscore" is invariant to shifting or rescaling a row: it reflects how
      the winner stands out relative to the spread of that row, not the
      absolute gap between scores.
    - "softmax": in (0, 1]; at least ``1 / n_types`` because the winner has
      the largest score.
    - "minmax": the winner is the row maximum, so its scaled value is 1.0
      whenever the row range is at least 1e-10 and 0.0 otherwise.
      NOTE(review): this makes "minmax" nearly constant; confirm whether a
      different statistic was intended.

    Examples
    --------
    >>> from spatialcore.annotation.confidence import transform_confidence
    >>> import numpy as np
    >>> scores = np.array([
    ...     [5.0, 0.1, 0.1, 0.1],
    ...     [2.0, 2.0, 2.0, 2.0],
    ... ])
    >>> conf = transform_confidence(scores, method="zscore")
    >>> print(f"{conf[0]:.3f} {conf[1]:.3f}")
    0.850 0.500
    """
    # Reject unknown methods before doing any numeric work.
    if method not in ("raw", "zscore", "softmax", "minmax"):
        raise ValueError(
            f"Unknown confidence method: {method}. "
            f"Expected one of: raw, zscore, softmax, minmax"
        )

    # Accept array-likes; an ndarray passes through unchanged (no copy).
    decision_scores = np.asarray(decision_scores)

    if decision_scores.ndim != 2:
        raise ValueError(
            f"Expected 2D array of shape (n_cells, n_types), "
            f"got shape {decision_scores.shape}"
        )

    n_cells, n_types = decision_scores.shape

    if n_types < 2:
        raise ValueError(
            f"Expected at least 2 cell types, got {n_types}"
        )

    # Column index and score of the winning type for each cell.
    row_idx = np.arange(n_cells)
    winning_idx = np.argmax(decision_scores, axis=1)
    winning_scores = decision_scores[row_idx, winning_idx]

    if method == "raw":
        return winning_scores

    if method == "zscore":
        mean_scores = np.mean(decision_scores, axis=1)
        std_scores = np.std(decision_scores, axis=1)

        # Avoid division by zero when all types share the same score.
        std_scores = np.where(std_scores < 1e-10, 1.0, std_scores)

        z_scores = (winning_scores - mean_scores) / std_scores

        # z_scores >= 0 here, so exp(-z) <= 1 and cannot overflow.
        return 1 / (1 + np.exp(-z_scores))

    if method == "softmax":
        probs = scipy_softmax(decision_scores, axis=1)
        return probs[row_idx, winning_idx]

    # method == "minmax": per-row min-max scaling, then pick the winner.
    min_scores = np.min(decision_scores, axis=1, keepdims=True)
    max_scores = np.max(decision_scores, axis=1, keepdims=True)

    # Avoid division by zero for flat rows.
    score_range = max_scores - min_scores
    score_range = np.where(score_range < 1e-10, 1.0, score_range)

    scaled = (decision_scores - min_scores) / score_range
    return scaled[row_idx, winning_idx]
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# ============================================================================
|
|
172
|
+
# Decision Score Storage
|
|
173
|
+
# ============================================================================
|
|
174
|
+
|
|
175
|
+
def extract_decision_scores(
    adata: ad.AnnData,
    celltypist_result,
    key_added: str = "celltypist",
    copy: bool = False,
) -> ad.AnnData:
    """
    Store a CellTypist decision score matrix in AnnData.

    Reads ``celltypist_result.decision_matrix`` and writes it, as float32,
    to ``adata.obsm`` together with the matching cell type names in
    ``adata.uns``.

    Parameters
    ----------
    adata : AnnData
        AnnData object the decision matrix belongs to. Must have as many
        observations as the matrix has rows.
    celltypist_result
        Object with a ``decision_matrix`` attribute. If that attribute is a
        pandas DataFrame, its columns are used as cell type names.
        Otherwise the matrix is converted with ``np.asarray`` and names are
        taken from ``celltypist_result.cell_types`` when present, falling
        back to generic ``type_0, type_1, ...`` names (with a warning).
    key_added : str, default "celltypist"
        Key prefix for stored results:

        - adata.obsm[f"{key_added}_decision_scores"]: Decision matrix
        - adata.uns[f"{key_added}_cell_types"]: Cell type names
    copy : bool, default False
        If True, the results are written to (and returned on) a copy of
        ``adata``; otherwise ``adata`` is modified in place.

    Returns
    -------
    AnnData
        The AnnData object the results were written to.

    Raises
    ------
    ValueError
        If the result has no ``decision_matrix``, the matrix is not 2D, its
        row count differs from ``adata.n_obs``, or the number of cell type
        names differs from its column count.

    Examples
    --------
    >>> import celltypist
    >>> from spatialcore.annotation.confidence import extract_decision_scores
    >>> result = celltypist.annotate(adata, model=model)
    >>> adata = extract_decision_scores(adata, result, key_added="ct")
    >>> scores = adata.obsm["ct_decision_scores"]
    >>> cell_types = adata.uns["ct_cell_types"]
    """
    if not hasattr(celltypist_result, "decision_matrix"):
        raise ValueError(
            "CellTypist result does not have decision_matrix. "
            "Ensure you passed the result from celltypist.annotate()."
        )

    decision_matrix = celltypist_result.decision_matrix

    if isinstance(decision_matrix, pd.DataFrame):
        # DataFrame columns carry the cell type names.
        cell_types = list(decision_matrix.columns)
        decision_array = decision_matrix.values
    else:
        # Coerce so that .ndim/.shape/.astype below are always available.
        decision_array = np.asarray(decision_matrix)
        if hasattr(celltypist_result, "cell_types"):
            cell_types = list(celltypist_result.cell_types)
        else:
            cell_types = None  # generic names are built after the 2D check

    if decision_array.ndim != 2:
        raise ValueError(
            f"Expected 2D decision matrix of shape (n_cells, n_types), "
            f"got shape {decision_array.shape}"
        )

    n_rows, n_types = decision_array.shape

    if cell_types is None:
        cell_types = [f"type_{i}" for i in range(n_types)]
        logger.warning(
            "Could not extract cell type names from result. "
            "Using generic names: type_0, type_1, ..."
        )

    if n_rows != adata.n_obs:
        raise ValueError(
            f"Decision matrix has {n_rows} cells, "
            f"but AnnData has {adata.n_obs} cells. "
            f"Ensure the CellTypist result matches the AnnData object."
        )

    # Names are looked up by column position downstream, so a length
    # mismatch would silently mislabel scores.
    if len(cell_types) != n_types:
        raise ValueError(
            f"Got {len(cell_types)} cell type names for a decision matrix "
            f"with {n_types} columns."
        )

    # Copy only after validation so failures do not pay for the copy.
    if copy:
        adata = adata.copy()

    adata.obsm[f"{key_added}_decision_scores"] = decision_array.astype(np.float32)
    adata.uns[f"{key_added}_cell_types"] = cell_types

    logger.info(
        f"Stored decision scores: {n_rows:,} cells × "
        f"{n_types} types in adata.obsm['{key_added}_decision_scores']"
    )

    return adata
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
# ============================================================================
|
|
282
|
+
# Confidence Filtering
|
|
283
|
+
# ============================================================================
|
|
284
|
+
|
|
285
|
+
def filter_low_confidence(
    adata: ad.AnnData,
    label_column: str,
    confidence_column: str,
    threshold: float = 0.5,
    unassigned_label: str = "Unassigned",
    copy: bool = False,
) -> ad.AnnData:
    """
    Relabel cells whose confidence is below a threshold.

    Every cell with ``adata.obs[confidence_column] < threshold`` gets
    ``unassigned_label`` written into ``adata.obs[label_column]``.

    Parameters
    ----------
    adata : AnnData
        AnnData object with cell type labels and confidence values in
        ``adata.obs``.
    label_column : str
        Column in adata.obs containing cell type labels.
    confidence_column : str
        Column in adata.obs containing confidence values.
    threshold : float, default 0.5
        Cells with confidence strictly below this value are relabelled.
    unassigned_label : str, default "Unassigned"
        Label written for low-confidence cells.
    copy : bool, default False
        If True, operate on and return a copy of ``adata``; otherwise
        modify ``adata`` in place.

    Returns
    -------
    AnnData
        The AnnData object that was (possibly) modified.

    Raises
    ------
    ValueError
        If ``label_column`` or ``confidence_column`` is not in adata.obs.

    Notes
    -----
    When at least one cell is relabelled, the label column is cast to
    ``str`` before assignment; when no cell is below the threshold the
    column is left untouched. With ``copy=False`` the previous labels are
    overwritten, so store them in a separate column first if they are
    needed later.

    Examples
    --------
    >>> from spatialcore.annotation.confidence import filter_low_confidence
    >>> adata = filter_low_confidence(
    ...     adata,
    ...     label_column="celltypist_prediction",
    ...     confidence_column="celltypist_confidence",
    ...     threshold=0.6,
    ... )
    >>> n_unassigned = (adata.obs["celltypist_prediction"] == "Unassigned").sum()
    >>> print(f"Marked {n_unassigned:,} cells as Unassigned")
    """
    if copy:
        adata = adata.copy()

    # Both columns must be present before anything is touched.
    if label_column not in adata.obs.columns:
        raise ValueError(
            f"Label column '{label_column}' not found in adata.obs. "
            f"Available: {list(adata.obs.columns)}"
        )
    if confidence_column not in adata.obs.columns:
        raise ValueError(
            f"Confidence column '{confidence_column}' not found in adata.obs. "
            f"Available: {list(adata.obs.columns)}"
        )

    # Boolean mask of the cells that fall below the threshold.
    low_conf_mask = adata.obs[confidence_column].values < threshold
    n_low_conf = low_conf_mask.sum()

    # Nothing to relabel: report and leave the label column as it is.
    if n_low_conf == 0:
        logger.info(f"No cells below confidence threshold {threshold}")
        return adata

    # Cast to str so the new label can be assigned regardless of dtype.
    adata.obs[label_column] = adata.obs[label_column].astype(str)
    adata.obs.loc[low_conf_mask, label_column] = unassigned_label

    pct = 100 * n_low_conf / adata.n_obs
    logger.info(
        f"Marked {n_low_conf:,} cells ({pct:.1f}%) as '{unassigned_label}' "
        f"(confidence < {threshold})"
    )

    return adata
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def filter_low_count_types(
    adata: ad.AnnData,
    label_column: str,
    min_cells: int = 15,
    unassigned_label: str = "Low_count",
    copy: bool = False,
) -> ad.AnnData:
    """
    Relabel all cells belonging to cell types with fewer than min_cells cells.

    Counts the labels in ``adata.obs[label_column]`` and writes
    ``unassigned_label`` for every cell whose label occurs at least once
    but fewer than ``min_cells`` times.

    Parameters
    ----------
    adata : AnnData
        AnnData object with cell type labels.
    label_column : str
        Column in adata.obs containing cell type labels.
    min_cells : int, default 15
        Minimum number of cells a type needs to keep its label.
    unassigned_label : str, default "Low_count"
        Label written for cells of rare types.
    copy : bool, default False
        If True, operate on and return a copy of ``adata``; otherwise
        modify ``adata`` in place.

    Returns
    -------
    AnnData
        The AnnData object that was (possibly) modified.

    Raises
    ------
    ValueError
        If ``label_column`` is not in adata.obs.

    Notes
    -----
    This differs from filter_low_confidence:

    - filter_low_confidence relabels individual cells by their confidence.
    - filter_low_count_types relabels whole cell types by their size.

    Categories of a categorical label column that are declared but not
    present in the data (count 0) are ignored: they contain no cells, so
    they are neither reported as rare types nor cause the column to be
    rewritten. When at least one cell is relabelled, the label column is
    cast to ``str``.

    Examples
    --------
    >>> from spatialcore.annotation.confidence import filter_low_count_types
    >>> adata = filter_low_count_types(
    ...     adata,
    ...     label_column="celltypist_prediction",
    ...     min_cells=20,
    ...     unassigned_label="Rare_type",
    ... )
    >>> print(adata.obs["celltypist_prediction"].value_counts())
    """
    # Validate before copying so a bad column name does not cost a copy.
    if label_column not in adata.obs.columns:
        raise ValueError(
            f"Label column '{label_column}' not found in adata.obs. "
            f"Available: {list(adata.obs.columns)}"
        )

    if copy:
        adata = adata.copy()

    # value_counts() on a categorical column also lists unused categories
    # with a count of 0; those hold no cells and must not count as rare.
    type_counts = adata.obs[label_column].value_counts()
    is_rare = (type_counts > 0) & (type_counts < min_cells)
    low_count_types = type_counts[is_rare].index.tolist()

    if not low_count_types:
        logger.info(f"No cell types with fewer than {min_cells} cells")
        return adata

    # Build the mask before the str cast so it matches the counted labels.
    low_count_mask = adata.obs[label_column].isin(low_count_types)
    n_affected = low_count_mask.sum()

    # Cast to str so the new label can be assigned regardless of dtype.
    adata.obs[label_column] = adata.obs[label_column].astype(str)
    adata.obs.loc[low_count_mask, label_column] = unassigned_label

    pct = 100 * n_affected / adata.n_obs
    logger.info(
        f"Marked {n_affected:,} cells ({pct:.1f}%) from "
        f"{len(low_count_types)} rare types as '{unassigned_label}' "
        f"(types with < {min_cells} cells)"
    )
    logger.info(f"  Affected types: {low_count_types[:5]}{'...' if len(low_count_types) > 5 else ''}")

    return adata
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def compute_confidence_from_obsm(
    adata: ad.AnnData,
    decision_scores_key: str = "celltypist_decision_scores",
    method: ConfidenceMethod = "zscore",
    confidence_column: str = "confidence_transformed",
    copy: bool = False,
) -> ad.AnnData:
    """
    Compute transformed confidence from stored decision scores.

    Reads the decision score matrix from ``adata.obsm[decision_scores_key]``,
    passes it through ``transform_confidence`` and stores the per-cell
    result in ``adata.obs[confidence_column]``.

    Parameters
    ----------
    adata : AnnData
        AnnData with decision scores in obsm.
    decision_scores_key : str, default "celltypist_decision_scores"
        Key in adata.obsm containing the decision score matrix. The entry
        is converted with ``np.asarray`` first, so a DataFrame-valued entry
        works as well as an ndarray.
    method : {"raw", "zscore", "softmax", "minmax"}, default "zscore"
        Transformation method (see transform_confidence).
    confidence_column : str, default "confidence_transformed"
        Column name in adata.obs for the output confidence values.
    copy : bool, default False
        If True, operate on and return a copy of ``adata``; otherwise
        modify ``adata`` in place.

    Returns
    -------
    AnnData
        AnnData with confidence values in adata.obs[confidence_column].

    Raises
    ------
    ValueError
        If ``decision_scores_key`` is not in adata.obsm, or if
        ``transform_confidence`` rejects the matrix or the method.

    Examples
    --------
    >>> from spatialcore.annotation.confidence import compute_confidence_from_obsm
    >>> adata = compute_confidence_from_obsm(adata, method="zscore")
    >>> print(adata.obs["confidence_transformed"].describe())
    """
    # Validate before copying so a missing key does not cost a copy.
    if decision_scores_key not in adata.obsm:
        raise ValueError(
            f"Decision scores key '{decision_scores_key}' not found in adata.obsm. "
            f"Available: {list(adata.obsm.keys())}. "
            f"Run extract_decision_scores() first."
        )

    if copy:
        adata = adata.copy()

    # transform_confidence indexes the matrix positionally, so hand it a
    # plain array; an ndarray entry passes through np.asarray unchanged.
    decision_scores = np.asarray(adata.obsm[decision_scores_key])
    confidence = transform_confidence(decision_scores, method=method)
    adata.obs[confidence_column] = confidence

    logger.info(
        f"Computed {method} confidence in adata.obs['{confidence_column}'] "
        f"(mean={confidence.mean():.3f}, std={confidence.std():.3f})"
    )

    return adata
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
# ============================================================================
|
|
532
|
+
# Dual-Threshold Marker Validation
|
|
533
|
+
# ============================================================================
|
|
534
|
+
|
|
535
|
+
def filter_by_marker_validation(
|
|
536
|
+
adata: ad.AnnData,
|
|
537
|
+
label_column: str,
|
|
538
|
+
confidence_column: str,
|
|
539
|
+
canonical_markers: Optional[Dict[str, List[str]]] = None,
|
|
540
|
+
confidence_threshold: float = 0.5,
|
|
541
|
+
n_components: int = 3,
|
|
542
|
+
min_cells_per_type: int = 15,
|
|
543
|
+
unassigned_label: str = "Unassigned",
|
|
544
|
+
copy: bool = False,
|
|
545
|
+
) -> Tuple[ad.AnnData, pd.DataFrame]:
|
|
546
|
+
"""
|
|
547
|
+
Filter cells using BOTH confidence threshold AND GMM-3 marker validation.
|
|
548
|
+
|
|
549
|
+
Implements dual-threshold QC from spec:
|
|
550
|
+
- X-axis: Confidence (z-score transformed)
|
|
551
|
+
- Y-axis: Marker expression score (GMM-3 threshold)
|
|
552
|
+
|
|
553
|
+
Cells must pass BOTH thresholds to retain their cell type label.
|
|
554
|
+
Uses `classify_by_threshold()` with `n_components=3` internally for
|
|
555
|
+
GMM-3 fitting on marker expression.
|
|
556
|
+
|
|
557
|
+
Parameters
|
|
558
|
+
----------
|
|
559
|
+
adata : AnnData
|
|
560
|
+
AnnData object with cell type labels and confidence values.
|
|
561
|
+
label_column : str
|
|
562
|
+
Column in adata.obs containing cell type labels.
|
|
563
|
+
confidence_column : str
|
|
564
|
+
Column in adata.obs containing confidence values.
|
|
565
|
+
canonical_markers : Dict[str, List[str]], optional
|
|
566
|
+
Dictionary mapping cell types to marker gene lists.
|
|
567
|
+
If None, uses default CANONICAL_MARKERS from markers module.
|
|
568
|
+
confidence_threshold : float, default 0.5
|
|
569
|
+
Minimum confidence threshold. Cells below this are marked unassigned.
|
|
570
|
+
n_components : int, default 3
|
|
571
|
+
Number of GMM components for marker thresholding.
|
|
572
|
+
3 = trimodal (dropout/moderate/high expression).
|
|
573
|
+
min_cells_per_type : int, default 15
|
|
574
|
+
Minimum cells required to validate a cell type. Types with fewer
|
|
575
|
+
cells are marked unassigned.
|
|
576
|
+
unassigned_label : str, default "Unassigned"
|
|
577
|
+
Label to assign to cells that fail validation.
|
|
578
|
+
copy : bool, default False
|
|
579
|
+
Whether to return a copy or modify in-place.
|
|
580
|
+
|
|
581
|
+
Returns
|
|
582
|
+
-------
|
|
583
|
+
Tuple[AnnData, pd.DataFrame]
|
|
584
|
+
- AnnData with filtered labels and validation columns added:
|
|
585
|
+
- `{label_column}_validated`: Final validated labels
|
|
586
|
+
- `marker_score`: Mean expression of canonical markers
|
|
587
|
+
- `marker_passes_gmm`: Whether cell passes GMM marker threshold
|
|
588
|
+
- `confidence_passes`: Whether cell passes confidence threshold
|
|
589
|
+
- `validation_pass`: Whether cell passes both thresholds
|
|
590
|
+
- Summary DataFrame with validation statistics per cell type.
|
|
591
|
+
|
|
592
|
+
Notes
|
|
593
|
+
-----
|
|
594
|
+
**Dual-threshold rationale:**
|
|
595
|
+
|
|
596
|
+
1. Confidence alone may miss cells that are assigned to wrong types
|
|
597
|
+
with deceptively high confidence.
|
|
598
|
+
2. Marker expression alone may miss cells where marker genes are
|
|
599
|
+
not expressed due to dropout or technical noise.
|
|
600
|
+
3. Combining both axes provides more robust QC.
|
|
601
|
+
|
|
602
|
+
**GMM-3 thresholding:**
|
|
603
|
+
|
|
604
|
+
For spatial data with dropouts, marker expression is often trimodal:
|
|
605
|
+
- Component 0: Zero/dropout (no expression)
|
|
606
|
+
- Component 1: Moderate expression
|
|
607
|
+
- Component 2: High expression
|
|
608
|
+
|
|
609
|
+
The threshold is set at the boundary between component 0 and component 1,
|
|
610
|
+
identifying cells with biologically meaningful marker expression.
|
|
611
|
+
|
|
612
|
+
Examples
|
|
613
|
+
--------
|
|
614
|
+
>>> from spatialcore.annotation.confidence import filter_by_marker_validation
|
|
615
|
+
>>> # Filter with default markers
|
|
616
|
+
>>> adata, summary = filter_by_marker_validation(
|
|
617
|
+
... adata,
|
|
618
|
+
... label_column="celltypist",
|
|
619
|
+
... confidence_column="celltypist_confidence_transformed",
|
|
620
|
+
... confidence_threshold=0.5,
|
|
621
|
+
... )
|
|
622
|
+
>>> # Check validation summary
|
|
623
|
+
>>> print(summary[["cell_type", "n_cells", "pct_pass"]])
|
|
624
|
+
|
|
625
|
+
See Also
|
|
626
|
+
--------
|
|
627
|
+
spatialcore.stats.classify_by_threshold : GMM-3 thresholding function.
|
|
628
|
+
spatialcore.plotting.validation.plot_2d_validation : 2D validation plot.
|
|
629
|
+
"""
|
|
630
|
+
if copy:
|
|
631
|
+
adata = adata.copy()
|
|
632
|
+
|
|
633
|
+
# Validate columns exist
|
|
634
|
+
if label_column not in adata.obs.columns:
|
|
635
|
+
raise ValueError(
|
|
636
|
+
f"Label column '{label_column}' not found in adata.obs. "
|
|
637
|
+
f"Available: {list(adata.obs.columns)}"
|
|
638
|
+
)
|
|
639
|
+
if confidence_column not in adata.obs.columns:
|
|
640
|
+
raise ValueError(
|
|
641
|
+
f"Confidence column '{confidence_column}' not found in adata.obs. "
|
|
642
|
+
f"Available: {list(adata.obs.columns)}"
|
|
643
|
+
)
|
|
644
|
+
|
|
645
|
+
# Load canonical markers if not provided
|
|
646
|
+
if canonical_markers is None:
|
|
647
|
+
try:
|
|
648
|
+
from spatialcore.annotation.markers import load_canonical_markers
|
|
649
|
+
canonical_markers = load_canonical_markers()
|
|
650
|
+
logger.info(f"Loaded canonical markers for {len(canonical_markers)} cell types")
|
|
651
|
+
except Exception as e:
|
|
652
|
+
logger.warning(f"Could not load canonical markers: {e}")
|
|
653
|
+
canonical_markers = {}
|
|
654
|
+
|
|
655
|
+
# Get GMM classification function
|
|
656
|
+
try:
|
|
657
|
+
from spatialcore.stats.classify import classify_by_threshold
|
|
658
|
+
except ImportError:
|
|
659
|
+
raise ImportError(
|
|
660
|
+
"spatialcore.stats.classify is required for GMM-3 validation. "
|
|
661
|
+
"Ensure the stats module is properly installed."
|
|
662
|
+
)
|
|
663
|
+
|
|
664
|
+
# Initialize result columns
|
|
665
|
+
n_cells = adata.n_obs
|
|
666
|
+
marker_scores = np.zeros(n_cells)
|
|
667
|
+
marker_passes = np.zeros(n_cells, dtype=bool)
|
|
668
|
+
confidence_passes = adata.obs[confidence_column].values >= confidence_threshold
|
|
669
|
+
|
|
670
|
+
# Get unique cell types, skipping placeholder labels (unassigned / unknown / low_count, case-insensitive)
|
|
671
|
+
cell_types = adata.obs[label_column].astype(str).unique()
|
|
672
|
+
cell_types = [ct for ct in cell_types if ct.lower() not in ["unassigned", "unknown", "low_count"]]
|
|
673
|
+
|
|
674
|
+
# Validation summary
|
|
675
|
+
summary_rows = []
|
|
676
|
+
|
|
677
|
+
for cell_type in cell_types:
|
|
678
|
+
# Get cells of this type
|
|
679
|
+
type_mask = adata.obs[label_column].astype(str) == cell_type
|
|
680
|
+
n_type_cells = type_mask.sum()
|
|
681
|
+
|
|
682
|
+
# Skip if too few cells
|
|
683
|
+
if n_type_cells < min_cells_per_type:
|
|
684
|
+
logger.debug(f"Skipping {cell_type}: only {n_type_cells} cells (< {min_cells_per_type})")
|
|
685
|
+
summary_rows.append({
|
|
686
|
+
"cell_type": cell_type,
|
|
687
|
+
"n_cells": n_type_cells,
|
|
688
|
+
"has_markers": False,
|
|
689
|
+
"gmm_threshold": np.nan,
|
|
690
|
+
"n_pass_confidence": (type_mask & confidence_passes).sum(),
|
|
691
|
+
"n_pass_marker": 0,
|
|
692
|
+
"n_pass_both": 0,
|
|
693
|
+
"pct_pass": 0.0,
|
|
694
|
+
})
|
|
695
|
+
continue
|
|
696
|
+
|
|
697
|
+
# Find matching markers
|
|
698
|
+
markers_for_type = canonical_markers.get(cell_type, [])
|
|
699
|
+
|
|
700
|
+
# Try case-insensitive match if exact match fails
|
|
701
|
+
if not markers_for_type:
|
|
702
|
+
for marker_type, markers in canonical_markers.items():
|
|
703
|
+
if marker_type.lower() == cell_type.lower():
|
|
704
|
+
markers_for_type = markers
|
|
705
|
+
break
|
|
706
|
+
|
|
707
|
+
# Get marker genes that exist in data
|
|
708
|
+
available_markers = [m for m in markers_for_type if m in adata.var_names]
|
|
709
|
+
|
|
710
|
+
if not available_markers:
|
|
711
|
+
logger.debug(f"No canonical markers found for {cell_type}")
|
|
712
|
+
# All cells pass marker validation when no markers are defined or none of them are present in adata.var_names
|
|
713
|
+
marker_passes[type_mask] = True
|
|
714
|
+
summary_rows.append({
|
|
715
|
+
"cell_type": cell_type,
|
|
716
|
+
"n_cells": n_type_cells,
|
|
717
|
+
"has_markers": False,
|
|
718
|
+
"gmm_threshold": np.nan,
|
|
719
|
+
"n_pass_confidence": (type_mask & confidence_passes).sum(),
|
|
720
|
+
"n_pass_marker": n_type_cells,
|
|
721
|
+
"n_pass_both": (type_mask & confidence_passes).sum(),
|
|
722
|
+
"pct_pass": 100 * (type_mask & confidence_passes).sum() / n_type_cells,
|
|
723
|
+
})
|
|
724
|
+
continue
|
|
725
|
+
|
|
726
|
+
# Calculate mean marker expression for cells of this type
|
|
727
|
+
# Expression is always read from adata.X (densified below when sparse); no layer fallback is implemented
|
|
728
|
+
if hasattr(adata.X, "toarray"):
|
|
729
|
+
expr_matrix = adata[type_mask, available_markers].X.toarray()
|
|
730
|
+
else:
|
|
731
|
+
expr_matrix = adata[type_mask, available_markers].X
|
|
732
|
+
|
|
733
|
+
mean_marker_expr = np.mean(expr_matrix, axis=1)
|
|
734
|
+
marker_scores[type_mask] = mean_marker_expr
|
|
735
|
+
|
|
736
|
+
# Fit GMM-3 threshold on marker expression
|
|
737
|
+
try:
|
|
738
|
+
threshold_result = classify_by_threshold(
|
|
739
|
+
mean_marker_expr,
|
|
740
|
+
n_components=n_components,
|
|
741
|
+
method="gmm",
|
|
742
|
+
)
|
|
743
|
+
gmm_threshold = threshold_result["threshold"]
|
|
744
|
+
type_marker_passes = mean_marker_expr >= gmm_threshold
|
|
745
|
+
marker_passes[type_mask] = type_marker_passes
|
|
746
|
+
except Exception as e:
|
|
747
|
+
logger.warning(f"GMM fitting failed for {cell_type}: {e}. Marking all as pass.")
|
|
748
|
+
marker_passes[type_mask] = True
|
|
749
|
+
gmm_threshold = np.nan
|
|
750
|
+
|
|
751
|
+
# Calculate summary stats
|
|
752
|
+
n_pass_conf = (type_mask & confidence_passes).sum()
|
|
753
|
+
n_pass_marker = marker_passes[type_mask].sum()
|
|
754
|
+
n_pass_both = (type_mask & confidence_passes & marker_passes).sum()
|
|
755
|
+
|
|
756
|
+
summary_rows.append({
|
|
757
|
+
"cell_type": cell_type,
|
|
758
|
+
"n_cells": n_type_cells,
|
|
759
|
+
"has_markers": True,
|
|
760
|
+
"n_markers": len(available_markers),
|
|
761
|
+
"markers": available_markers[:3], # First 3 for display
|
|
762
|
+
"gmm_threshold": gmm_threshold,
|
|
763
|
+
"n_pass_confidence": n_pass_conf,
|
|
764
|
+
"n_pass_marker": n_pass_marker,
|
|
765
|
+
"n_pass_both": n_pass_both,
|
|
766
|
+
"pct_pass": 100 * n_pass_both / n_type_cells if n_type_cells > 0 else 0,
|
|
767
|
+
})
|
|
768
|
+
|
|
769
|
+
# Store validation results in adata
|
|
770
|
+
adata.obs["marker_score"] = marker_scores
|
|
771
|
+
adata.obs["marker_passes_gmm"] = marker_passes
|
|
772
|
+
adata.obs["confidence_passes"] = confidence_passes
|
|
773
|
+
adata.obs["validation_pass"] = confidence_passes & marker_passes
|
|
774
|
+
|
|
775
|
+
# Create validated labels column
|
|
776
|
+
validated_labels = adata.obs[label_column].astype(str).copy()
|
|
777
|
+
fail_mask = ~adata.obs["validation_pass"]
|
|
778
|
+
n_failed = fail_mask.sum()
|
|
779
|
+
|
|
780
|
+
if n_failed > 0:
|
|
781
|
+
validated_labels[fail_mask] = unassigned_label
|
|
782
|
+
pct_failed = 100 * n_failed / n_cells
|
|
783
|
+
logger.info(
|
|
784
|
+
f"Dual-threshold validation: {n_failed:,} cells ({pct_failed:.1f}%) "
|
|
785
|
+
f"marked as '{unassigned_label}'"
|
|
786
|
+
)
|
|
787
|
+
|
|
788
|
+
adata.obs[f"{label_column}_validated"] = pd.Categorical(validated_labels)
|
|
789
|
+
|
|
790
|
+
# Create summary DataFrame
|
|
791
|
+
summary_df = pd.DataFrame(summary_rows)
|
|
792
|
+
if len(summary_df) > 0:
|
|
793
|
+
summary_df = summary_df.sort_values("n_cells", ascending=False).reset_index(drop=True)
|
|
794
|
+
|
|
795
|
+
# Log overall summary
|
|
796
|
+
n_pass_total = adata.obs["validation_pass"].sum()
|
|
797
|
+
logger.info(
|
|
798
|
+
f"Validation complete: {n_pass_total:,}/{n_cells:,} cells "
|
|
799
|
+
f"({100*n_pass_total/n_cells:.1f}%) passed dual-threshold QC"
|
|
800
|
+
)
|
|
801
|
+
|
|
802
|
+
return adata, summary_df
|