spatialcore 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. spatialcore/__init__.py +122 -0
  2. spatialcore/annotation/__init__.py +253 -0
  3. spatialcore/annotation/acquisition.py +529 -0
  4. spatialcore/annotation/annotate.py +603 -0
  5. spatialcore/annotation/cellxgene.py +365 -0
  6. spatialcore/annotation/confidence.py +802 -0
  7. spatialcore/annotation/discovery.py +529 -0
  8. spatialcore/annotation/expression.py +363 -0
  9. spatialcore/annotation/loading.py +529 -0
  10. spatialcore/annotation/markers.py +297 -0
  11. spatialcore/annotation/ontology.py +1282 -0
  12. spatialcore/annotation/patterns.py +247 -0
  13. spatialcore/annotation/pipeline.py +620 -0
  14. spatialcore/annotation/synapse.py +380 -0
  15. spatialcore/annotation/training.py +1457 -0
  16. spatialcore/annotation/validation.py +422 -0
  17. spatialcore/core/__init__.py +34 -0
  18. spatialcore/core/cache.py +118 -0
  19. spatialcore/core/logging.py +135 -0
  20. spatialcore/core/metadata.py +149 -0
  21. spatialcore/core/utils.py +768 -0
  22. spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +86372 -0
  23. spatialcore/data/markers/canonical_markers.json +83 -0
  24. spatialcore/data/ontology_mappings/ontology_index.json +63865 -0
  25. spatialcore/plotting/__init__.py +109 -0
  26. spatialcore/plotting/benchmark.py +477 -0
  27. spatialcore/plotting/celltype.py +329 -0
  28. spatialcore/plotting/confidence.py +413 -0
  29. spatialcore/plotting/spatial.py +505 -0
  30. spatialcore/plotting/utils.py +411 -0
  31. spatialcore/plotting/validation.py +1342 -0
  32. spatialcore-0.1.9.dist-info/METADATA +213 -0
  33. spatialcore-0.1.9.dist-info/RECORD +36 -0
  34. spatialcore-0.1.9.dist-info/WHEEL +5 -0
  35. spatialcore-0.1.9.dist-info/licenses/LICENSE +201 -0
  36. spatialcore-0.1.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,768 @@
1
+ """
2
+ General utilities for SpatialCore.
3
+
4
+ This module provides cross-cutting utilities used throughout the codebase:
5
+ - Gene ID mapping (Ensembl → HUGO/HGNC symbols)
6
+ - Expression normalization status detection
7
+
8
+ These functions are not specific to any data source and can be used
9
+ with any AnnData object.
10
+ """
11
+
12
+ from pathlib import Path
13
+ from typing import Dict, Optional, Tuple, Union, Any
14
+
15
+ import numpy as np
16
+ import pandas as pd
17
+ import anndata as ad
18
+
19
+ from spatialcore.core.logging import get_logger
20
+
21
+ logger = get_logger(__name__)
22
+
23
+
24
# ============================================================================
# Ensembl to HUGO Gene Mapping
# ============================================================================

# BioMart REST endpoint queried by download_ensembl_mapping().
BIOMART_URL = "http://www.ensembl.org/biomart/martservice"

# XML query posted to BioMart: requests Ensembl gene ID, HGNC symbol, and
# external gene name for every human gene (hsapiens_gene_ensembl dataset),
# returned as TSV with a header row (header="1") and duplicates collapsed
# (uniqueRows="1").
BIOMART_QUERY_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="default" formatter="TSV" header="1" uniqueRows="1" count="" datasetConfigVersion="0.6">
<Dataset name="hsapiens_gene_ensembl" interface="default">
<Attribute name="ensembl_gene_id"/>
<Attribute name="hgnc_symbol"/>
<Attribute name="external_gene_name"/>
</Dataset>
</Query>"""
39
+
40
+
41
def download_ensembl_mapping(
    output_path: Union[str, Path],
    force: bool = False,
) -> Path:
    """
    Download Ensembl-to-HUGO gene mapping from BioMart.

    Downloads a TSV file mapping Ensembl gene IDs (ENSG...) to HUGO/HGNC gene symbols.

    Parameters
    ----------
    output_path : str or Path
        Path to save the mapping TSV file.
    force : bool, default False
        If True, re-download even if file exists.

    Returns
    -------
    Path
        Path to the downloaded TSV file.

    Raises
    ------
    Exception
        Re-raises any download or verification error. A partial or corrupt
        download is deleted first, so a later call cannot mistake it for a
        valid cached mapping.

    Notes
    -----
    The downloaded file contains columns:
    - Gene stable ID (Ensembl ID)
    - HGNC symbol
    - Gene name (external gene name)

    Examples
    --------
    >>> from spatialcore.core.utils import download_ensembl_mapping
    >>> path = download_ensembl_mapping("./cache/ensembl_to_hugo.tsv")
    """
    import urllib.request
    import urllib.parse

    output_path = Path(output_path)

    # Reuse an existing cache unless the caller explicitly forces a refresh.
    if output_path.exists() and not force:
        logger.info(f"Mapping file already exists: {output_path}")
        return output_path

    output_path.parent.mkdir(parents=True, exist_ok=True)

    logger.info("Downloading Ensembl-to-HUGO gene mapping from BioMart...")

    # BioMart takes the XML query as a URL-encoded query-string parameter.
    query = urllib.parse.quote(BIOMART_QUERY_TEMPLATE)
    url = f"{BIOMART_URL}?query={query}"

    try:
        urllib.request.urlretrieve(url, output_path)

        # Verify the download by parsing it: BioMart reports errors as a
        # plain-text body, which would fail (or yield garbage) here.
        df = pd.read_csv(output_path, sep="\t")
        n_mappings = len(df[df["HGNC symbol"].notna() & (df["HGNC symbol"] != "")])
        logger.info(f"Downloaded {n_mappings:,} gene mappings to: {output_path}")

        return output_path

    except Exception as e:
        # Remove any partial/corrupt file; otherwise the exists() check
        # above would treat it as a valid cache on the next call.
        output_path.unlink(missing_ok=True)
        logger.error(f"Failed to download from BioMart: {e}")
        raise
104
+
105
+
106
def load_ensembl_to_hugo_mapping(
    cache_path: Optional[Union[str, Path]] = None,
    auto_download: bool = True,
) -> Dict[str, str]:
    """
    Load the Ensembl-ID → HUGO-symbol mapping from the local cache.

    Parameters
    ----------
    cache_path : str or Path, optional
        Path to cached TSV file. If None, uses default cache location.
    auto_download : bool, default True
        If True and cache doesn't exist, download from BioMart.

    Returns
    -------
    Dict[str, str]
        Mapping from Ensembl ID (e.g., "ENSG00000141510") to HUGO symbol (e.g., "TP53").

    Examples
    --------
    >>> from spatialcore.core.utils import load_ensembl_to_hugo_mapping
    >>> mapping = load_ensembl_to_hugo_mapping()
    >>> mapping["ENSG00000141510"]
    'TP53'
    """
    # Resolve the cache location: caller-supplied path, or the default
    # under ~/.cache/spatialcore.
    if cache_path is None:
        cache_path = Path.home() / ".cache" / "spatialcore" / "ensembl_to_hugo.tsv"
    else:
        cache_path = Path(cache_path)

    # Fetch from BioMart on a cache miss, unless the caller opted out.
    if not cache_path.exists():
        if not auto_download:
            raise FileNotFoundError(
                f"Mapping file not found: {cache_path}. "
                "Set auto_download=True to download from BioMart."
            )
        download_ensembl_mapping(cache_path)

    table = pd.read_csv(cache_path, sep="\t")

    # Keep only rows that actually carry an HGNC symbol (non-null and
    # non-empty).
    table = table[table["HGNC symbol"].notna()]
    table = table[table["HGNC symbol"].str.len() > 0]

    # Ensembl ID -> HGNC symbol; later duplicates win, matching dict(zip()).
    mapping = {
        ensembl_id: symbol
        for ensembl_id, symbol in zip(table["Gene stable ID"], table["HGNC symbol"])
    }

    logger.info(f"Loaded {len(mapping):,} Ensembl to HUGO gene mappings")
    return mapping
158
+
159
+
160
def is_ensembl_id(gene_name: str) -> bool:
    """
    Return True when a gene name looks like an Ensembl stable ID.

    Parameters
    ----------
    gene_name : str
        Gene name to check.

    Returns
    -------
    bool
        True if the name matches Ensembl ID patterns.

    Examples
    --------
    >>> from spatialcore.core.utils import is_ensembl_id
    >>> is_ensembl_id("ENSG00000141510")
    True
    >>> is_ensembl_id("TP53")
    False
    >>> is_ensembl_id("ENSMUSG00000059552")
    True
    """
    # Reject None, non-strings, and the empty string up front.
    if not isinstance(gene_name, str) or not gene_name:
        return False
    # Ensembl stable-ID prefixes: ENSG (human gene), ENST (human
    # transcript), ENSMUS (mouse — also covers ENSMUSG gene IDs).
    return gene_name.startswith(("ENSG", "ENST", "ENSMUS"))
192
+
193
+
194
def _convert_ensembl_to_hugo(
    gene_names: np.ndarray,
    ensembl_to_hugo: Dict[str, str],
) -> Tuple[np.ndarray, Dict[str, int]]:
    """
    Map Ensembl IDs to HUGO symbols, passing non-Ensembl names through.

    Safe for HUGO symbols: they are returned unchanged.

    Parameters
    ----------
    gene_names : np.ndarray
        Array of gene names.
    ensembl_to_hugo : Dict[str, str]
        Mapping from Ensembl ID to HUGO symbol.

    Returns
    -------
    Tuple[np.ndarray, Dict[str, int]]
        (converted_names, stats_dict)
    """
    out = []
    n_mapped = 0
    n_passthrough_hugo = 0
    n_passthrough_ensembl = 0

    for raw_name in gene_names:
        name = str(raw_name)
        if not is_ensembl_id(name):
            # Not an Ensembl ID — assume it is already a HUGO symbol.
            out.append(name)
            n_passthrough_hugo += 1
        elif name in ensembl_to_hugo:
            out.append(ensembl_to_hugo[name])
            n_mapped += 1
        else:
            # Keep unmapped Ensembl ID (will be filtered during panel subsetting)
            out.append(name)
            n_passthrough_ensembl += 1

    stats = {
        "total_genes": len(gene_names),
        "converted_ensembl": n_mapped,
        "already_hugo": n_passthrough_hugo,
        "unmapped_ensembl": n_passthrough_ensembl,
    }
    return np.array(out), stats
242
+
243
+
244
def normalize_gene_names(
    adata: ad.AnnData,
    ensembl_to_hugo: Optional[Dict[str, str]] = None,
    copy: bool = False,
) -> ad.AnnData:
    """
    Normalize gene names in AnnData to use HUGO gene symbols.

    Two-step process:
    1. If var_names are Ensembl IDs/indices, use feature_name column as starting point
    2. Apply Ensembl→HUGO mapping for any remaining Ensembl IDs

    Safe to call on data that already uses HUGO symbols.

    Parameters
    ----------
    adata : AnnData
        AnnData object with genes in var.
    ensembl_to_hugo : Dict[str, str], optional
        Mapping from Ensembl ID to HUGO symbol. If None, loads from cache.
    copy : bool, default False
        If True, return a copy. Otherwise modify in place.

    Returns
    -------
    AnnData
        AnnData with normalized gene names in var_names.

    Notes
    -----
    CellxGene Census data commonly stores gene identifiers as:
    - Numeric indices with symbols in var['feature_name']
    - Ensembl IDs with symbols in var['feature_name']
    - Mixed content in feature_name (some Ensembl, some HUGO)

    This function handles all these cases.

    Examples
    --------
    >>> from spatialcore.core.utils import normalize_gene_names, load_ensembl_to_hugo_mapping
    >>> mapping = load_ensembl_to_hugo_mapping()
    >>> adata = normalize_gene_names(adata, mapping)
    """
    if copy:
        adata = adata.copy()

    # Heuristic: inspect only the FIRST var_name to decide whether the index
    # holds non-symbol identifiers (numeric positions or human Ensembl
    # gene/transcript IDs). NOTE(review): mouse prefixes (ENSMUSG) are not
    # checked here; the fallback conversion below still catches them.
    first_gene = str(adata.var_names[0])
    uses_non_symbol_ids = (
        first_gene.isdigit() or
        first_gene.startswith("ENSG") or
        first_gene.startswith("ENST")
    )

    if not uses_non_symbol_ids:
        logger.info("Gene names already appear to be HUGO symbols")
        # Still check for any remaining Ensembl IDs and convert them
        if ensembl_to_hugo is None:
            ensembl_to_hugo = load_ensembl_to_hugo_mapping()

        converted_names, stats = _convert_ensembl_to_hugo(
            adata.var_names.values, ensembl_to_hugo
        )
        # Only touch var_names when something actually changed, to avoid
        # needless index rebuilds and de-duplication suffixes.
        if stats["converted_ensembl"] > 0:
            adata.var_names = pd.Index(converted_names)
            # Distinct Ensembl IDs can map to the same symbol; keep the
            # index unique.
            adata.var_names_make_unique()
            logger.info(
                f"Converted {stats['converted_ensembl']:,} remaining Ensembl IDs to HUGO"
            )
        return adata

    # Step 1: Use feature_name column if available
    # (CellxGene convention: symbols live in var['feature_name'] while
    # var_names hold Ensembl IDs or row numbers.)
    if "feature_name" in adata.var.columns:
        feature_names = adata.var["feature_name"].values.astype(str)
        adata.var_names = pd.Index(feature_names)
        logger.info("Using 'feature_name' column as gene names")

    # Step 2: Apply Ensembl to HUGO mapping for any remaining Ensembl IDs
    # (feature_name itself may contain a mix of symbols and Ensembl IDs).
    if ensembl_to_hugo is None:
        ensembl_to_hugo = load_ensembl_to_hugo_mapping()

    converted_names, stats = _convert_ensembl_to_hugo(
        adata.var_names.values, ensembl_to_hugo
    )
    adata.var_names = pd.Index(converted_names)

    if stats["converted_ensembl"] > 0:
        logger.info(
            f"Gene mapping: {stats['converted_ensembl']:,} converted, "
            f"{stats['already_hugo']:,} already HUGO, "
            f"{stats['unmapped_ensembl']:,} unmapped"
        )
    else:
        logger.info(f"All {stats['already_hugo']:,} genes already HUGO symbols")

    # Mapping or feature_name substitution can introduce duplicates.
    adata.var_names_make_unique()
    return adata
340
+
341
+
342
# ============================================================================
# Expression Normalization Status Detection
# ============================================================================

# Layer names to search for raw counts (in priority order).
# _find_raw_counts_source() checks these layers first, before falling back
# to adata.raw.X and finally adata.X.
RAW_COUNT_LAYERS = ["counts", "raw_counts", "raw"]
348
+
349
+
350
+ def _is_integer_like(
351
+ values: np.ndarray,
352
+ tolerance: float = 1e-6,
353
+ threshold: float = 0.95,
354
+ ) -> bool:
355
+ """
356
+ Check if array values are integer-like within floating point tolerance.
357
+
358
+ Parameters
359
+ ----------
360
+ values : np.ndarray
361
+ Array of values to check (should be non-zero values only).
362
+ tolerance : float, default 1e-6
363
+ Tolerance for floating point comparison. Handles values like
364
+ 1.0000000000000002 or 2.9999999999999996.
365
+ threshold : float, default 0.95
366
+ Fraction of values that must be integer-like to pass.
367
+
368
+ Returns
369
+ -------
370
+ bool
371
+ True if >= threshold fraction of values are integer-like.
372
+ """
373
+ if len(values) == 0:
374
+ return False
375
+
376
+ remainder = np.abs(np.mod(values, 1))
377
+ is_integer = (remainder < tolerance) | (remainder > 1 - tolerance)
378
+ fraction_integer = np.mean(is_integer)
379
+
380
+ return fraction_integer >= threshold
381
+
382
+
383
+ def _get_matrix_sample(
384
+ matrix,
385
+ sample_size: int = 10000,
386
+ ) -> np.ndarray:
387
+ """
388
+ Get a dense sample from a matrix (sparse or dense).
389
+
390
+ Parameters
391
+ ----------
392
+ matrix : array-like or sparse matrix
393
+ Expression matrix.
394
+ sample_size : int, default 10000
395
+ Maximum number of cells to sample.
396
+
397
+ Returns
398
+ -------
399
+ np.ndarray
400
+ Dense 2D array sample.
401
+ """
402
+ from scipy.sparse import issparse
403
+
404
+ n_cells = matrix.shape[0]
405
+ n_sample = min(sample_size, n_cells)
406
+
407
+ if issparse(matrix):
408
+ return matrix[:n_sample].toarray()
409
+ else:
410
+ return np.asarray(matrix[:n_sample])
411
+
412
+
413
def _check_raw_counts(
    matrix,
    sample_size: int = 10000,
    integer_tolerance: float = 1e-6,
    integer_threshold: float = 0.95,
) -> Dict[str, Any]:
    """
    Check if a matrix contains raw counts.

    Raw counts are non-negative and (within floating point tolerance)
    integer-valued.

    Parameters
    ----------
    matrix : array-like or sparse matrix
        Expression matrix to check.
    sample_size : int, default 10000
        Number of cells to sample for checking.
    integer_tolerance : float, default 1e-6
        Tolerance for integer comparison.
    integer_threshold : float, default 0.95
        Fraction of values that must be integers.

    Returns
    -------
    Dict[str, Any]
        Dictionary with:
        - is_raw: bool
        - fraction_integer: float
        - min_val: float
        - max_val: float
        - reason: str ("all_zeros", "negative_values", or "integer_check")
    """
    sample_data = _get_matrix_sample(matrix, sample_size)

    # Only non-zero entries are informative: sparse expression matrices are
    # dominated by zeros, which are trivially integer.
    flat_sample = sample_data.flatten()
    non_zero = flat_sample[flat_sample != 0]

    if len(non_zero) == 0:
        return {
            "is_raw": False,
            "fraction_integer": 0.0,
            "min_val": 0.0,
            "max_val": 0.0,
            "reason": "all_zeros",
        }

    min_val = float(np.min(sample_data))
    max_val = float(np.max(sample_data))

    # Raw counts cannot be negative.
    if min_val < 0:
        return {
            "is_raw": False,
            "fraction_integer": 0.0,
            "min_val": min_val,
            "max_val": max_val,
            "reason": "negative_values",
        }

    # Distance of the fractional part from the nearest integer: integer-like
    # values sit just above 0 or just below 1. Computed ONCE and used for
    # both the classification and the reported fraction (previously this was
    # computed twice — once via _is_integer_like and again for reporting).
    remainder = np.abs(np.mod(non_zero, 1))
    fraction_integer = float(np.mean(
        (remainder < integer_tolerance) | (remainder > 1 - integer_tolerance)
    ))

    return {
        "is_raw": fraction_integer >= integer_threshold,
        "fraction_integer": fraction_integer,
        "min_val": min_val,
        "max_val": max_val,
        "reason": "integer_check",
    }
486
+
487
+
488
def _estimate_target_sum(
    matrix,
    sample_size: int = 1000,
) -> Dict[str, Any]:
    """
    Estimate the normalization target sum by undoing a log1p transform.

    If the matrix holds log1p(counts / total * target_sum), then
    expm1(X).sum(axis=1) should be close to target_sum for every cell.

    Parameters
    ----------
    matrix : array-like or sparse matrix
        Expression matrix (assumed to be log1p transformed).
    sample_size : int, default 1000
        Number of cells to sample.

    Returns
    -------
    Dict[str, Any]
        Dictionary with:
        - estimated_target_sum: float (median of row sums)
        - target_sum_std: float (std of row sums)
        - is_log1p_10k: bool (True if target_sum ~ 10,000)
        - is_log1p_cpm: bool (True if target_sum ~ 1,000,000)
    """
    sample = _get_matrix_sample(matrix, sample_size)

    # Undo the log transform, then total per cell; for properly normalized
    # data every non-empty row should sum to (roughly) the same target.
    per_cell_totals = np.expm1(sample).sum(axis=1)

    # Empty cells (total 0) carry no signal — drop them.
    per_cell_totals = per_cell_totals[per_cell_totals > 0]

    if len(per_cell_totals) == 0:
        return {
            "estimated_target_sum": 0.0,
            "target_sum_std": 0.0,
            "is_log1p_10k": False,
            "is_log1p_cpm": False,
        }

    median_sum = float(np.median(per_cell_totals))
    std_sum = float(np.std(per_cell_totals))

    return {
        "estimated_target_sum": median_sum,
        "target_sum_std": std_sum,
        # 20% tolerance around the two common targets (10k and CPM).
        "is_log1p_10k": 8_000 < median_sum < 12_000,
        "is_log1p_cpm": 800_000 < median_sum < 1_200_000,
    }
548
+
549
+
550
def _find_raw_counts_source(
    adata: ad.AnnData,
    sample_size: int = 10000,
    integer_tolerance: float = 1e-6,
    integer_threshold: float = 0.95,
) -> Optional[str]:
    """
    Search for raw counts in layers, adata.raw, and adata.X.

    Parameters
    ----------
    adata : AnnData
        AnnData object to search.
    sample_size : int, default 10000
        Number of cells to sample for checking.
    integer_tolerance : float, default 1e-6
        Tolerance for integer comparison.
    integer_threshold : float, default 0.95
        Fraction of values that must be integers.

    Returns
    -------
    Optional[str]
        Source location if found:
        - "layers/{layer_name}" for layers
        - "raw.X" for adata.raw
        - "X" for adata.X
        - None if no raw counts found
    """
    # Candidates in priority order: named layers first, then the frozen
    # .raw snapshot, then X itself as a last resort. Each entry is
    # (source label returned to the caller, name used in log messages,
    # matrix to check).
    candidates = [
        (f"layers/{name}", f"layers['{name}']", adata.layers[name])
        for name in RAW_COUNT_LAYERS
        if name in adata.layers
    ]
    if adata.raw is not None:
        candidates.append(("raw.X", "raw.X", adata.raw.X))
    candidates.append(("X", "X", adata.X))

    # Return the first candidate that passes the raw-count check.
    for source, label, candidate_matrix in candidates:
        result = _check_raw_counts(
            candidate_matrix,
            sample_size,
            integer_tolerance,
            integer_threshold,
        )
        if result["is_raw"]:
            logger.debug(
                f"Found raw counts in {label} "
                f"(fraction_integer={result['fraction_integer']:.3f})"
            )
            return source

    return None
625
+
626
+
627
def check_normalization_status(
    adata: ad.AnnData,
    sample_size: int = 1000,
    integer_tolerance: float = 1e-6,
    integer_threshold: float = 0.95,
) -> Dict[str, Any]:
    """
    Detect the normalization state of expression data with robust validation.

    This function searches for raw counts in layers and adata.raw, and verifies
    log1p normalization by checking the target sum via expm1 reversal.

    Parameters
    ----------
    adata : AnnData
        AnnData object to check.
    sample_size : int, default 1000
        Number of cells to sample for detection.
    integer_tolerance : float, default 1e-6
        Tolerance for integer comparison (handles float precision issues
        like 1.0000000000000002).
    integer_threshold : float, default 0.95
        Fraction of non-zero values that must be integer-like to classify
        as raw counts.

    Returns
    -------
    Dict[str, Any]
        Dictionary with keys:

        - raw_source: str or None
            Location of raw counts ("layers/counts", "raw.X", "X", or None)
        - x_state: str
            State of adata.X: "raw", "log1p_10k", "log1p_cpm", "log1p_other",
            "linear", "negative", "unknown"
        - x_target_sum: float or None
            Estimated target sum if X appears log-transformed
        - is_usable: bool
            True if data can be safely normalized (raw available OR X is log1p_10k)
        - has_log1p_uns: bool
            True if adata.uns carries scanpy's "log1p" marker
        - stats: dict
            Diagnostic statistics (mean, max, min, fraction_integer)

    Notes
    -----
    **Detection Logic:**

    1. Search for raw counts in: layers["counts"], layers["raw_counts"],
       layers["raw"], adata.raw.X, adata.X (in order)
    2. For each candidate, check if >95% of non-zero values are integers
       within floating point tolerance
    3. For adata.X, if not raw, reverse log1p and check row sums to
       verify target_sum is ~10,000

    **Usability Criteria:**

    Data is considered usable (is_usable=True) if:
    - Raw counts are found anywhere, OR
    - adata.X is verified as log1p normalized to 10,000

    Examples
    --------
    >>> from spatialcore.core.utils import check_normalization_status
    >>> status = check_normalization_status(adata)
    >>> if status["is_usable"]:
    ...     if status["raw_source"]:
    ...         print(f"Will normalize from {status['raw_source']}")
    ...     else:
    ...         print("X is already log1p_10k")
    >>> else:
    ...     print(f"Cannot use: X is {status['x_state']}")
    """
    # Step 1: Search for raw counts. The 10x larger sample gives the
    # integer-fraction check more values to work with.
    raw_source = _find_raw_counts_source(
        adata, sample_size * 10, integer_tolerance, integer_threshold
    )

    # Step 2: Analyze adata.X
    sample_data = _get_matrix_sample(adata.X, sample_size)

    mean_val = float(np.mean(sample_data))
    max_val = float(np.max(sample_data))
    min_val = float(np.min(sample_data))

    # Check if X itself contains raw counts
    x_raw_check = _check_raw_counts(
        adata.X, sample_size * 10, integer_tolerance, integer_threshold
    )

    stats = {
        "mean": mean_val,
        "max": max_val,
        "min": min_val,
        "fraction_integer": x_raw_check["fraction_integer"],
    }

    # Classify X. Branch order matters: "raw" wins over everything, and
    # negative values rule out both raw counts and log1p data.
    if x_raw_check["is_raw"]:
        x_state = "raw"
        x_target_sum = None
    elif min_val < 0:
        x_state = "negative"
        x_target_sum = None
    elif max_val < 25 and mean_val < 10:
        # min_val >= 0 is guaranteed here by the "negative" branch above.
        # Likely log-transformed; verify target sum via expm1 reversal.
        # log1p(10k) typically has: max ~6-9, mean ~3-6 for typical scRNA-seq
        # log1p(1M) typically has: max ~13-15, mean ~8-12
        target_info = _estimate_target_sum(adata.X, sample_size)
        x_target_sum = target_info["estimated_target_sum"]

        if target_info["is_log1p_10k"]:
            x_state = "log1p_10k"
        elif target_info["is_log1p_cpm"]:
            x_state = "log1p_cpm"
        elif x_target_sum > 0:
            x_state = "log1p_other"
        else:
            x_state = "unknown"

        stats["estimated_target_sum"] = x_target_sum
    elif max_val > 25 and x_raw_check["fraction_integer"] < 0.5:
        # Large, mostly non-integer values: normalized but not log-scaled.
        x_state = "linear"
        x_target_sum = None
    else:
        x_state = "unknown"
        x_target_sum = None

    # Usable means we can (re)normalize safely: either raw counts exist
    # somewhere, or X is already verified log1p-to-10k.
    is_usable = (raw_source is not None) or (x_state == "log1p_10k")

    # Check for scanpy log1p metadata as additional confirmation
    has_log1p_uns = "log1p" in adata.uns

    return {
        "raw_source": raw_source,
        "x_state": x_state,
        "x_target_sum": x_target_sum,
        "is_usable": is_usable,
        "has_log1p_uns": has_log1p_uns,
        "stats": stats,
    }