spatialcore 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spatialcore/__init__.py +122 -0
- spatialcore/annotation/__init__.py +253 -0
- spatialcore/annotation/acquisition.py +529 -0
- spatialcore/annotation/annotate.py +603 -0
- spatialcore/annotation/cellxgene.py +365 -0
- spatialcore/annotation/confidence.py +802 -0
- spatialcore/annotation/discovery.py +529 -0
- spatialcore/annotation/expression.py +363 -0
- spatialcore/annotation/loading.py +529 -0
- spatialcore/annotation/markers.py +297 -0
- spatialcore/annotation/ontology.py +1282 -0
- spatialcore/annotation/patterns.py +247 -0
- spatialcore/annotation/pipeline.py +620 -0
- spatialcore/annotation/synapse.py +380 -0
- spatialcore/annotation/training.py +1457 -0
- spatialcore/annotation/validation.py +422 -0
- spatialcore/core/__init__.py +34 -0
- spatialcore/core/cache.py +118 -0
- spatialcore/core/logging.py +135 -0
- spatialcore/core/metadata.py +149 -0
- spatialcore/core/utils.py +768 -0
- spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +86372 -0
- spatialcore/data/markers/canonical_markers.json +83 -0
- spatialcore/data/ontology_mappings/ontology_index.json +63865 -0
- spatialcore/plotting/__init__.py +109 -0
- spatialcore/plotting/benchmark.py +477 -0
- spatialcore/plotting/celltype.py +329 -0
- spatialcore/plotting/confidence.py +413 -0
- spatialcore/plotting/spatial.py +505 -0
- spatialcore/plotting/utils.py +411 -0
- spatialcore/plotting/validation.py +1342 -0
- spatialcore-0.1.9.dist-info/METADATA +213 -0
- spatialcore-0.1.9.dist-info/RECORD +36 -0
- spatialcore-0.1.9.dist-info/WHEEL +5 -0
- spatialcore-0.1.9.dist-info/licenses/LICENSE +201 -0
- spatialcore-0.1.9.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,768 @@
|
|
|
1
|
+
"""
|
|
2
|
+
General utilities for SpatialCore.
|
|
3
|
+
|
|
4
|
+
This module provides cross-cutting utilities used throughout the codebase:
|
|
5
|
+
- Gene ID mapping (Ensembl → HUGO/HGNC symbols)
|
|
6
|
+
- Expression normalization status detection
|
|
7
|
+
|
|
8
|
+
These functions are not specific to any data source and can be used
|
|
9
|
+
with any AnnData object.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Dict, Optional, Tuple, Union, Any
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import anndata as ad
|
|
18
|
+
|
|
19
|
+
from spatialcore.core.logging import get_logger
|
|
20
|
+
|
|
21
|
+
logger = get_logger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ============================================================================
|
|
25
|
+
# Ensembl to HUGO Gene Mapping
|
|
26
|
+
# ============================================================================
|
|
27
|
+
|
|
28
|
+
# BioMart REST endpoint. Served over HTTPS so the downloaded gene mapping
# cannot be tampered with or observed in transit (was plain http://).
BIOMART_URL = "https://www.ensembl.org/biomart/martservice"

# XML query sent (URL-encoded) to BioMart: human genes, TSV output with
# header, unique rows; columns are Ensembl gene ID, HGNC symbol, gene name.
BIOMART_QUERY_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="default" formatter="TSV" header="1" uniqueRows="1" count="" datasetConfigVersion="0.6">
<Dataset name="hsapiens_gene_ensembl" interface="default">
<Attribute name="ensembl_gene_id"/>
<Attribute name="hgnc_symbol"/>
<Attribute name="external_gene_name"/>
</Dataset>
</Query>"""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def download_ensembl_mapping(
    output_path: Union[str, Path],
    force: bool = False,
) -> Path:
    """
    Download Ensembl-to-HUGO gene mapping from BioMart.

    Downloads a TSV file mapping Ensembl gene IDs (ENSG...) to HUGO/HGNC gene symbols.

    Parameters
    ----------
    output_path : str or Path
        Path to save the mapping TSV file.
    force : bool, default False
        If True, re-download even if file exists.

    Returns
    -------
    Path
        Path to the downloaded TSV file.

    Raises
    ------
    Exception
        Re-raises any download or parsing error. Any partially written or
        unparsable file is deleted before re-raising, so a later call does
        not mistake it for a valid cached mapping.

    Notes
    -----
    The downloaded file contains columns:
    - Gene stable ID (Ensembl ID)
    - HGNC symbol
    - Gene name (external gene name)

    Examples
    --------
    >>> from spatialcore.core.utils import download_ensembl_mapping
    >>> path = download_ensembl_mapping("./cache/ensembl_to_hugo.tsv")
    """
    import urllib.request
    import urllib.parse

    output_path = Path(output_path)

    if output_path.exists() and not force:
        logger.info(f"Mapping file already exists: {output_path}")
        return output_path

    output_path.parent.mkdir(parents=True, exist_ok=True)

    logger.info("Downloading Ensembl-to-HUGO gene mapping from BioMart...")

    # BioMart takes the XML query as a single URL-encoded query parameter.
    query = urllib.parse.quote(BIOMART_QUERY_TEMPLATE)
    url = f"{BIOMART_URL}?query={query}"

    try:
        urllib.request.urlretrieve(url, output_path)

        # Verify the download parses and contains usable HGNC symbols.
        df = pd.read_csv(output_path, sep="\t")
        n_mappings = len(df[df["HGNC symbol"].notna() & (df["HGNC symbol"] != "")])
        logger.info(f"Downloaded {n_mappings:,} gene mappings to: {output_path}")

        return output_path

    except Exception as e:
        logger.error(f"Failed to download from BioMart: {e}")
        # Remove any partial/corrupt file: leaving it behind would satisfy
        # the exists() check above on the next call and poison the cache.
        output_path.unlink(missing_ok=True)
        raise
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def load_ensembl_to_hugo_mapping(
    cache_path: Optional[Union[str, Path]] = None,
    auto_download: bool = True,
) -> Dict[str, str]:
    """
    Load Ensembl ID to HUGO gene symbol mapping.

    Parameters
    ----------
    cache_path : str or Path, optional
        Path to cached TSV file. If None, a default per-user cache location
        (~/.cache/spatialcore/ensembl_to_hugo.tsv) is used.
    auto_download : bool, default True
        If True and the cache file does not exist, download it from BioMart.

    Returns
    -------
    Dict[str, str]
        Mapping from Ensembl ID (e.g., "ENSG00000141510") to HUGO symbol
        (e.g., "TP53").

    Raises
    ------
    FileNotFoundError
        If the cache file is missing and ``auto_download`` is False.

    Examples
    --------
    >>> from spatialcore.core.utils import load_ensembl_to_hugo_mapping
    >>> mapping = load_ensembl_to_hugo_mapping()
    >>> mapping["ENSG00000141510"]
    'TP53'
    """
    tsv_path = (
        Path.home() / ".cache" / "spatialcore" / "ensembl_to_hugo.tsv"
        if cache_path is None
        else Path(cache_path)
    )

    if not tsv_path.exists():
        if not auto_download:
            raise FileNotFoundError(
                f"Mapping file not found: {tsv_path}. "
                "Set auto_download=True to download from BioMart."
            )
        download_ensembl_mapping(tsv_path)

    table = pd.read_csv(tsv_path, sep="\t")

    # Keep only rows carrying a non-empty HGNC symbol.
    table = table.dropna(subset=["HGNC symbol"])
    table = table[table["HGNC symbol"].str.len() > 0]

    # Ensembl stable ID -> HGNC symbol
    mapping = dict(zip(table["Gene stable ID"], table["HGNC symbol"]))

    logger.info(f"Loaded {len(mapping):,} Ensembl to HUGO gene mappings")
    return mapping
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def is_ensembl_id(gene_name: str) -> bool:
    """
    Check if a gene name looks like an Ensembl ID.

    Parameters
    ----------
    gene_name : str
        Gene name to check.

    Returns
    -------
    bool
        True if the name matches Ensembl ID patterns.

    Examples
    --------
    >>> from spatialcore.core.utils import is_ensembl_id
    >>> is_ensembl_id("ENSG00000141510")
    True
    >>> is_ensembl_id("TP53")
    False
    >>> is_ensembl_id("ENSMUSG00000059552")
    True
    """
    if not isinstance(gene_name, str) or not gene_name:
        return False
    # ENSG = human gene, ENST = human transcript; ENSMUS covers all mouse
    # identifiers (including ENSMUSG genes).
    return gene_name.startswith(("ENSG", "ENST", "ENSMUS"))
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _convert_ensembl_to_hugo(
    gene_names: np.ndarray,
    ensembl_to_hugo: Dict[str, str],
) -> Tuple[np.ndarray, Dict[str, int]]:
    """
    Convert Ensembl IDs to HUGO gene symbols where possible.

    Names that are already HUGO symbols pass through unchanged, as do
    Ensembl IDs with no entry in the mapping (they are kept so downstream
    panel subsetting can drop them).

    Parameters
    ----------
    gene_names : np.ndarray
        Array of gene names.
    ensembl_to_hugo : Dict[str, str]
        Mapping from Ensembl ID to HUGO symbol.

    Returns
    -------
    Tuple[np.ndarray, Dict[str, int]]
        (converted_names, stats_dict) where stats_dict counts total genes,
        converted Ensembl IDs, pass-through HUGO symbols, and unmapped
        Ensembl IDs.
    """
    out = []
    n_mapped = 0
    n_passthrough = 0
    n_missing = 0

    for raw_name in gene_names:
        name = str(raw_name)
        if not is_ensembl_id(name):
            # Already a HUGO symbol - pass through unchanged.
            out.append(name)
            n_passthrough += 1
            continue
        if name in ensembl_to_hugo:
            out.append(ensembl_to_hugo[name])
            n_mapped += 1
        else:
            # Unmapped Ensembl ID: keep as-is for later filtering.
            out.append(name)
            n_missing += 1

    stats = {
        "total_genes": len(gene_names),
        "converted_ensembl": n_mapped,
        "already_hugo": n_passthrough,
        "unmapped_ensembl": n_missing,
    }
    return np.array(out), stats
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def normalize_gene_names(
    adata: ad.AnnData,
    ensembl_to_hugo: Optional[Dict[str, str]] = None,
    copy: bool = False,
) -> ad.AnnData:
    """
    Normalize gene names in AnnData to use HUGO gene symbols.

    Two-step process:
    1. If var_names are Ensembl IDs/indices, use feature_name column as starting point
    2. Apply Ensembl→HUGO mapping for any remaining Ensembl IDs

    Safe to call on data that already uses HUGO symbols.

    Parameters
    ----------
    adata : AnnData
        AnnData object with genes in var.
    ensembl_to_hugo : Dict[str, str], optional
        Mapping from Ensembl ID to HUGO symbol. If None, loads from cache.
    copy : bool, default False
        If True, return a copy. Otherwise modify in place.

    Returns
    -------
    AnnData
        AnnData with normalized gene names in var_names.

    Notes
    -----
    CellxGene Census data commonly stores gene identifiers as:
    - Numeric indices with symbols in var['feature_name']
    - Ensembl IDs with symbols in var['feature_name']
    - Mixed content in feature_name (some Ensembl, some HUGO)

    This function handles all these cases. Non-symbol identifiers are
    detected with :func:`is_ensembl_id`, so mouse (ENSMUS*) IDs trigger the
    feature_name fallback just like human ENSG/ENST IDs.

    Examples
    --------
    >>> from spatialcore.core.utils import normalize_gene_names, load_ensembl_to_hugo_mapping
    >>> mapping = load_ensembl_to_hugo_mapping()
    >>> adata = normalize_gene_names(adata, mapping)
    """
    if copy:
        adata = adata.copy()

    # Degenerate input: nothing to rename.
    if adata.n_vars == 0:
        return adata

    # Detect non-symbol identifiers using the same rules as is_ensembl_id so
    # mouse (ENSMUS*) data is handled consistently with human data.
    first_gene = str(adata.var_names[0])
    uses_non_symbol_ids = first_gene.isdigit() or is_ensembl_id(first_gene)

    if not uses_non_symbol_ids:
        logger.info("Gene names already appear to be HUGO symbols")
        # Still check for any remaining Ensembl IDs and convert them
        if ensembl_to_hugo is None:
            ensembl_to_hugo = load_ensembl_to_hugo_mapping()

        converted_names, stats = _convert_ensembl_to_hugo(
            adata.var_names.values, ensembl_to_hugo
        )
        if stats["converted_ensembl"] > 0:
            adata.var_names = pd.Index(converted_names)
            adata.var_names_make_unique()
            logger.info(
                f"Converted {stats['converted_ensembl']:,} remaining Ensembl IDs to HUGO"
            )
        return adata

    # Step 1: Use feature_name column if available
    if "feature_name" in adata.var.columns:
        feature_names = adata.var["feature_name"].values.astype(str)
        adata.var_names = pd.Index(feature_names)
        logger.info("Using 'feature_name' column as gene names")

    # Step 2: Apply Ensembl to HUGO mapping for any remaining Ensembl IDs
    if ensembl_to_hugo is None:
        ensembl_to_hugo = load_ensembl_to_hugo_mapping()

    converted_names, stats = _convert_ensembl_to_hugo(
        adata.var_names.values, ensembl_to_hugo
    )
    adata.var_names = pd.Index(converted_names)

    if stats["converted_ensembl"] > 0:
        logger.info(
            f"Gene mapping: {stats['converted_ensembl']:,} converted, "
            f"{stats['already_hugo']:,} already HUGO, "
            f"{stats['unmapped_ensembl']:,} unmapped"
        )
    else:
        logger.info(f"All {stats['already_hugo']:,} genes already HUGO symbols")

    adata.var_names_make_unique()
    return adata
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
# ============================================================================
|
|
343
|
+
# Expression Normalization Status Detection
|
|
344
|
+
# ============================================================================
|
|
345
|
+
|
|
346
|
+
# Layer names to search for raw counts (in priority order).
# _find_raw_counts_source checks these before falling back to raw.X and X.
RAW_COUNT_LAYERS = ["counts", "raw_counts", "raw"]
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _is_integer_like(
|
|
351
|
+
values: np.ndarray,
|
|
352
|
+
tolerance: float = 1e-6,
|
|
353
|
+
threshold: float = 0.95,
|
|
354
|
+
) -> bool:
|
|
355
|
+
"""
|
|
356
|
+
Check if array values are integer-like within floating point tolerance.
|
|
357
|
+
|
|
358
|
+
Parameters
|
|
359
|
+
----------
|
|
360
|
+
values : np.ndarray
|
|
361
|
+
Array of values to check (should be non-zero values only).
|
|
362
|
+
tolerance : float, default 1e-6
|
|
363
|
+
Tolerance for floating point comparison. Handles values like
|
|
364
|
+
1.0000000000000002 or 2.9999999999999996.
|
|
365
|
+
threshold : float, default 0.95
|
|
366
|
+
Fraction of values that must be integer-like to pass.
|
|
367
|
+
|
|
368
|
+
Returns
|
|
369
|
+
-------
|
|
370
|
+
bool
|
|
371
|
+
True if >= threshold fraction of values are integer-like.
|
|
372
|
+
"""
|
|
373
|
+
if len(values) == 0:
|
|
374
|
+
return False
|
|
375
|
+
|
|
376
|
+
remainder = np.abs(np.mod(values, 1))
|
|
377
|
+
is_integer = (remainder < tolerance) | (remainder > 1 - tolerance)
|
|
378
|
+
fraction_integer = np.mean(is_integer)
|
|
379
|
+
|
|
380
|
+
return fraction_integer >= threshold
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def _get_matrix_sample(
|
|
384
|
+
matrix,
|
|
385
|
+
sample_size: int = 10000,
|
|
386
|
+
) -> np.ndarray:
|
|
387
|
+
"""
|
|
388
|
+
Get a dense sample from a matrix (sparse or dense).
|
|
389
|
+
|
|
390
|
+
Parameters
|
|
391
|
+
----------
|
|
392
|
+
matrix : array-like or sparse matrix
|
|
393
|
+
Expression matrix.
|
|
394
|
+
sample_size : int, default 10000
|
|
395
|
+
Maximum number of cells to sample.
|
|
396
|
+
|
|
397
|
+
Returns
|
|
398
|
+
-------
|
|
399
|
+
np.ndarray
|
|
400
|
+
Dense 2D array sample.
|
|
401
|
+
"""
|
|
402
|
+
from scipy.sparse import issparse
|
|
403
|
+
|
|
404
|
+
n_cells = matrix.shape[0]
|
|
405
|
+
n_sample = min(sample_size, n_cells)
|
|
406
|
+
|
|
407
|
+
if issparse(matrix):
|
|
408
|
+
return matrix[:n_sample].toarray()
|
|
409
|
+
else:
|
|
410
|
+
return np.asarray(matrix[:n_sample])
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def _check_raw_counts(
    matrix,
    sample_size: int = 10000,
    integer_tolerance: float = 1e-6,
    integer_threshold: float = 0.95,
) -> Dict[str, Any]:
    """
    Check if a matrix contains raw counts.

    Parameters
    ----------
    matrix : array-like or sparse matrix
        Expression matrix to check.
    sample_size : int, default 10000
        Number of cells to sample for checking.
    integer_tolerance : float, default 1e-6
        Tolerance for integer comparison.
    integer_threshold : float, default 0.95
        Fraction of values that must be integers.

    Returns
    -------
    Dict[str, Any]
        Dictionary with:
        - is_raw: bool
        - fraction_integer: float
        - min_val: float
        - max_val: float
        - reason: str naming the check that decided the outcome
    """
    sample_data = _get_matrix_sample(matrix, sample_size)

    # Only non-zero values are informative for the integer check.
    flat_sample = sample_data.flatten()
    non_zero = flat_sample[flat_sample != 0]

    if len(non_zero) == 0:
        return {
            "is_raw": False,
            "fraction_integer": 0.0,
            "min_val": 0.0,
            "max_val": 0.0,
            "reason": "all_zeros",
        }

    min_val = float(np.min(sample_data))
    max_val = float(np.max(sample_data))

    # Raw counts cannot be negative
    if min_val < 0:
        return {
            "is_raw": False,
            "fraction_integer": 0.0,
            "min_val": min_val,
            "max_val": max_val,
            "reason": "negative_values",
        }

    # Compute the integer-like fraction once and derive the verdict from it.
    # (Previously the same remainder computation ran twice: once inside
    # _is_integer_like and once again for reporting.)
    remainder = np.abs(np.mod(non_zero, 1))
    fraction_integer = float(np.mean(
        (remainder < integer_tolerance) | (remainder > 1 - integer_tolerance)
    ))

    return {
        "is_raw": fraction_integer >= integer_threshold,
        "fraction_integer": fraction_integer,
        "min_val": min_val,
        "max_val": max_val,
        "reason": "integer_check",
    }
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def _estimate_target_sum(
    matrix,
    sample_size: int = 1000,
) -> Dict[str, Any]:
    """
    Estimate the target sum used for normalization by reversing log1p.

    If data is log1p(counts / total * target_sum), then expm1(X) summed over
    genes should recover target_sum for each cell.

    Parameters
    ----------
    matrix : array-like or sparse matrix
        Expression matrix (assumed to be log1p transformed).
    sample_size : int, default 1000
        Number of cells to sample.

    Returns
    -------
    Dict[str, Any]
        Dictionary with:
        - estimated_target_sum: float (median of per-cell sums)
        - target_sum_std: float (std of per-cell sums)
        - is_log1p_10k: bool (target sum within ~20% of 10,000)
        - is_log1p_cpm: bool (target sum within ~20% of 1,000,000)
    """
    dense = _get_matrix_sample(matrix, sample_size)

    # Undo log1p and sum per cell: for properly log1p-normalized data every
    # non-empty cell sums back to (approximately) the same target.
    per_cell_sums = np.expm1(dense).sum(axis=1)
    per_cell_sums = per_cell_sums[per_cell_sums > 0]

    if per_cell_sums.size == 0:
        return {
            "estimated_target_sum": 0.0,
            "target_sum_std": 0.0,
            "is_log1p_10k": False,
            "is_log1p_cpm": False,
        }

    median_sum = float(np.median(per_cell_sums))

    return {
        "estimated_target_sum": median_sum,
        "target_sum_std": float(np.std(per_cell_sums)),
        "is_log1p_10k": 8_000 < median_sum < 12_000,
        "is_log1p_cpm": 800_000 < median_sum < 1_200_000,
    }
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
def _find_raw_counts_source(
    adata: ad.AnnData,
    sample_size: int = 10000,
    integer_tolerance: float = 1e-6,
    integer_threshold: float = 0.95,
) -> Optional[str]:
    """
    Search for raw counts in layers, adata.raw, and adata.X.

    Parameters
    ----------
    adata : AnnData
        AnnData object to search.
    sample_size : int, default 10000
        Number of cells to sample for checking.
    integer_tolerance : float, default 1e-6
        Tolerance for integer comparison.
    integer_threshold : float, default 0.95
        Fraction of values that must be integers.

    Returns
    -------
    Optional[str]
        Source location if found:
        - "layers/{layer_name}" for layers
        - "raw.X" for adata.raw
        - "X" for adata.X
        - None if no raw counts found
    """
    # Build the candidate list in search-priority order: known count layers
    # first, then adata.raw.X, then adata.X as a last resort. Each entry is
    # (returned source string, human-readable log label, matrix).
    candidates = [
        (f"layers/{layer_name}", f"layers['{layer_name}']", adata.layers[layer_name])
        for layer_name in RAW_COUNT_LAYERS
        if layer_name in adata.layers
    ]
    if adata.raw is not None:
        candidates.append(("raw.X", "raw.X", adata.raw.X))
    candidates.append(("X", "X", adata.X))

    for source, label, candidate_matrix in candidates:
        result = _check_raw_counts(
            candidate_matrix,
            sample_size,
            integer_tolerance,
            integer_threshold,
        )
        if result["is_raw"]:
            logger.debug(
                f"Found raw counts in {label} "
                f"(fraction_integer={result['fraction_integer']:.3f})"
            )
            return source

    return None
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
def check_normalization_status(
    adata: ad.AnnData,
    sample_size: int = 1000,
    integer_tolerance: float = 1e-6,
    integer_threshold: float = 0.95,
) -> Dict[str, Any]:
    """
    Detect the normalization state of expression data with robust validation.

    This function searches for raw counts in layers and adata.raw, and verifies
    log1p normalization by checking the target sum via expm1 reversal.

    Parameters
    ----------
    adata : AnnData
        AnnData object to check.
    sample_size : int, default 1000
        Number of cells to sample for detection.
    integer_tolerance : float, default 1e-6
        Tolerance for integer comparison (handles float precision issues
        like 1.0000000000000002).
    integer_threshold : float, default 0.95
        Fraction of non-zero values that must be integer-like to classify
        as raw counts.

    Returns
    -------
    Dict[str, Any]
        Dictionary with keys:

        - raw_source: str or None
          Location of raw counts ("layers/counts", "raw.X", "X", or None)
        - x_state: str
          State of adata.X: "raw", "log1p_10k", "log1p_cpm", "log1p_other",
          "linear", "negative", "unknown"
        - x_target_sum: float or None
          Estimated target sum if X appears log-transformed
        - is_usable: bool
          True if data can be safely normalized (raw available OR X is log1p_10k)
        - has_log1p_uns: bool
          True if scanpy's "log1p" marker is present in adata.uns
        - stats: dict
          Diagnostic statistics (mean, max, min, fraction_integer)

    Notes
    -----
    **Detection Logic:**

    1. Search for raw counts in: layers["counts"], layers["raw_counts"],
       layers["raw"], adata.raw.X, adata.X (in order)
    2. For each candidate, check if >95% of non-zero values are integers
       within floating point tolerance
    3. For adata.X, if not raw, reverse log1p and check row sums to
       verify target_sum is ~10,000

    **Usability Criteria:**

    Data is considered usable (is_usable=True) if:
    - Raw counts are found anywhere, OR
    - adata.X is verified as log1p normalized to 10,000

    Examples
    --------
    >>> from spatialcore.core.utils import check_normalization_status
    >>> status = check_normalization_status(adata)
    >>> if status["is_usable"]:
    ...     if status["raw_source"]:
    ...         print(f"Will normalize from {status['raw_source']}")
    ...     else:
    ...         print("X is already log1p_10k")
    >>> else:
    ...     print(f"Cannot use: X is {status['x_state']}")
    """
    # Step 1: Search for raw counts (use a larger sample for the integer check)
    raw_source = _find_raw_counts_source(
        adata, sample_size * 10, integer_tolerance, integer_threshold
    )

    # Step 2: Analyze adata.X
    sample_data = _get_matrix_sample(adata.X, sample_size)

    mean_val = float(np.mean(sample_data))
    max_val = float(np.max(sample_data))
    min_val = float(np.min(sample_data))

    # Check if X contains raw counts
    x_raw_check = _check_raw_counts(
        adata.X, sample_size * 10, integer_tolerance, integer_threshold
    )

    stats = {
        "mean": mean_val,
        "max": max_val,
        "min": min_val,
        "fraction_integer": x_raw_check["fraction_integer"],
    }

    # Determine X state
    if x_raw_check["is_raw"]:
        x_state = "raw"
        x_target_sum = None
    elif min_val < 0:
        # Negative values: scaled/regressed data, not usable as counts.
        x_state = "negative"
        x_target_sum = None
    elif max_val < 25 and mean_val < 10 and min_val >= 0:
        # Likely log-transformed, verify target sum via expm1 reversal
        # log1p(10k) typically has: max ~6-9, mean ~3-6 for typical scRNA-seq
        # log1p(1M) typically has: max ~13-15, mean ~8-12
        target_info = _estimate_target_sum(adata.X, sample_size)
        x_target_sum = target_info["estimated_target_sum"]

        if target_info["is_log1p_10k"]:
            x_state = "log1p_10k"
        elif target_info["is_log1p_cpm"]:
            x_state = "log1p_cpm"
        elif x_target_sum > 0:
            x_state = "log1p_other"
        else:
            x_state = "unknown"

        stats["estimated_target_sum"] = x_target_sum
    elif max_val > 25 and x_raw_check["fraction_integer"] < 0.5:
        # Large non-integer values: linear (un-logged) normalized data.
        x_state = "linear"
        x_target_sum = None
    else:
        x_state = "unknown"
        x_target_sum = None

    # Determine if data is usable
    is_usable = (raw_source is not None) or (x_state == "log1p_10k")

    # Check for scanpy log1p metadata as additional confirmation
    has_log1p_uns = "log1p" in adata.uns

    return {
        "raw_source": raw_source,
        "x_state": x_state,
        "x_target_sum": x_target_sum,
        "is_usable": is_usable,
        "has_log1p_uns": has_log1p_uns,
        "stats": stats,
    }
|