spatialcore 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. spatialcore/__init__.py +122 -0
  2. spatialcore/annotation/__init__.py +253 -0
  3. spatialcore/annotation/acquisition.py +529 -0
  4. spatialcore/annotation/annotate.py +603 -0
  5. spatialcore/annotation/cellxgene.py +365 -0
  6. spatialcore/annotation/confidence.py +802 -0
  7. spatialcore/annotation/discovery.py +529 -0
  8. spatialcore/annotation/expression.py +363 -0
  9. spatialcore/annotation/loading.py +529 -0
  10. spatialcore/annotation/markers.py +297 -0
  11. spatialcore/annotation/ontology.py +1282 -0
  12. spatialcore/annotation/patterns.py +247 -0
  13. spatialcore/annotation/pipeline.py +620 -0
  14. spatialcore/annotation/synapse.py +380 -0
  15. spatialcore/annotation/training.py +1457 -0
  16. spatialcore/annotation/validation.py +422 -0
  17. spatialcore/core/__init__.py +34 -0
  18. spatialcore/core/cache.py +118 -0
  19. spatialcore/core/logging.py +135 -0
  20. spatialcore/core/metadata.py +149 -0
  21. spatialcore/core/utils.py +768 -0
  22. spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +86372 -0
  23. spatialcore/data/markers/canonical_markers.json +83 -0
  24. spatialcore/data/ontology_mappings/ontology_index.json +63865 -0
  25. spatialcore/plotting/__init__.py +109 -0
  26. spatialcore/plotting/benchmark.py +477 -0
  27. spatialcore/plotting/celltype.py +329 -0
  28. spatialcore/plotting/confidence.py +413 -0
  29. spatialcore/plotting/spatial.py +505 -0
  30. spatialcore/plotting/utils.py +411 -0
  31. spatialcore/plotting/validation.py +1342 -0
  32. spatialcore-0.1.9.dist-info/METADATA +213 -0
  33. spatialcore-0.1.9.dist-info/RECORD +36 -0
  34. spatialcore-0.1.9.dist-info/WHEEL +5 -0
  35. spatialcore-0.1.9.dist-info/licenses/LICENSE +201 -0
  36. spatialcore-0.1.9.dist-info/top_level.txt +1 -0
spatialcore/annotation/loading.py
@@ -0,0 +1,529 @@
+ """
+ Memory-efficient AnnData loading and subsampling utilities.
+
+ This module provides utilities for:
+ 1. Loading large h5ad files using backed mode to minimize memory usage
+ 2. Stratified subsampling to maintain cell type proportions
+ 3. Normalization validation and application
+
+ For large CellxGene datasets (>2 GB), backed mode loads only metadata initially,
+ allowing efficient subsampling before expression data is loaded.
+
+ References:
+     - Scanpy backed mode: https://scanpy.readthedocs.io/en/stable/generated/scanpy.read_h5ad.html
+ """
+
+ import gc
+ from pathlib import Path
+ from typing import Any, Dict, Optional, Union
+
+ import anndata as ad
+ import numpy as np
+ import pandas as pd
+ import scanpy as sc
+
+ from spatialcore.core.logging import get_logger
+
+ logger = get_logger(__name__)
+
+
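+ # Typical workflow (illustrative sketch using the functions defined below):
+ #
+ #     adata = load_adata_backed("dataset.h5ad", max_cells=100_000,
+ #                               label_column="cell_type")
+ #     adata = ensure_normalized(adata, target_sum=1e4)
+ #     summary = get_loading_summary(adata)
+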
+ # ============================================================================
+ # Memory Utilities
+ # ============================================================================
+
+ def get_available_memory_gb() -> float:
+     """
+     Get available system memory in GB.
+
+     Returns
+     -------
+     float
+         Available memory in gigabytes.
+
+     Notes
+     -----
+     Requires psutil. Returns 0.0 if psutil is not available.
+     """
+     try:
+         import psutil
+         return psutil.virtual_memory().available / (1024**3)
+     except ImportError:
+         logger.warning("psutil not installed, cannot check available memory")
+         return 0.0
+
+
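+ # Example (illustrative; output is machine-dependent and requires psutil):
+ # >>> get_available_memory_gb()  # doctest: +SKIP
+ # 12.3
+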
+ def estimate_adata_memory_gb(n_cells: int, n_genes: int, dtype_bytes: int = 4) -> float:
+     """
+     Estimate memory required for a dense expression matrix.
+
+     Parameters
+     ----------
+     n_cells : int
+         Number of cells.
+     n_genes : int
+         Number of genes.
+     dtype_bytes : int, default 4
+         Bytes per value (4 for float32, 8 for float64).
+
+     Returns
+     -------
+     float
+         Estimated memory in gigabytes.
+     """
+     return (n_cells * n_genes * dtype_bytes) / (1024**3)
+
+
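+ # Worked example (illustrative): a 100,000-cell × 20,000-gene float32 matrix
+ # needs 100_000 * 20_000 * 4 / 1024**3 ≈ 7.45 GB when densified.
+ # >>> round(estimate_adata_memory_gb(100_000, 20_000), 2)
+ # 7.45
+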
+ # ============================================================================
+ # Stratified Sampling
+ # ============================================================================
+
+ def _stratified_sample_indices(
+     obs_df: pd.DataFrame,
+     label_column: str,
+     max_cells: int,
+     random_state: int = 42,
+ ) -> np.ndarray:
+     """
+     Get stratified sample indices maintaining cell type proportions.
+
+     Works with backed AnnData by only accessing .obs metadata.
+
+     Parameters
+     ----------
+     obs_df : pd.DataFrame
+         Observation dataframe (adata.obs).
+     label_column : str
+         Column containing cell type labels.
+     max_cells : int
+         Maximum number of cells to sample.
+     random_state : int, default 42
+         Random seed for reproducibility.
+
+     Returns
+     -------
+     np.ndarray
+         Sorted array of selected cell indices.
+     """
+     np.random.seed(random_state)
+
+     labels = obs_df[label_column].values
+     unique_labels = np.unique(labels)
+     total_cells = len(labels)
+     indices = []
+
+     for label in unique_labels:
+         label_indices = np.where(labels == label)[0]
+         # Proportional allocation, rounded up so rare types keep >= 1 cell
+         n_sample = int(np.ceil(max_cells * len(label_indices) / total_cells))
+         n_sample = min(n_sample, len(label_indices))
+
+         sampled = np.random.choice(label_indices, size=n_sample, replace=False)
+         indices.extend(sampled)
+
+     # Trim to exactly max_cells if ceiling rounding pushed us over the budget
+     indices = np.array(indices)
+     if len(indices) > max_cells:
+         indices = np.random.choice(indices, size=max_cells, replace=False)
+
+     return np.sort(indices)
+
+
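+ # Worked example (illustrative): for label sizes 601/299/100 and max_cells=100,
+ # the ceiling allocation draws ceil(60.1)=61, ceil(29.9)=30, and ceil(10.0)=10
+ # cells (101 total), and the final trim step removes one at random to hit 100.
+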
+ def subsample_adata(
+     adata: ad.AnnData,
+     max_cells: int,
+     stratify_by: Optional[str] = None,
+     random_state: int = 42,
+     copy: bool = True,
+ ) -> ad.AnnData:
+     """
+     Subsample AnnData to max_cells, optionally maintaining cell type proportions.
+
+     Parameters
+     ----------
+     adata : AnnData
+         AnnData object to subsample.
+     max_cells : int
+         Maximum number of cells to keep.
+     stratify_by : str, optional
+         Column in adata.obs to use for stratified sampling.
+         If None, random sampling is used.
+     random_state : int, default 42
+         Random seed for reproducibility.
+     copy : bool, default True
+         If True, return a copy. Otherwise return a view of the selected cells.
+
+     Returns
+     -------
+     AnnData
+         Subsampled AnnData object.
+
+     Examples
+     --------
+     >>> from spatialcore.annotation import subsample_adata
+     >>> adata_small = subsample_adata(adata, max_cells=10000, stratify_by="cell_type")
+     """
+     if adata.n_obs <= max_cells:
+         logger.info(f"AnnData has {adata.n_obs:,} cells, no subsampling needed")
+         return adata.copy() if copy else adata
+
+     np.random.seed(random_state)
+
+     if stratify_by and stratify_by in adata.obs.columns:
+         indices = _stratified_sample_indices(
+             adata.obs, stratify_by, max_cells, random_state
+         )
+         logger.info(
+             f"Stratified subsampling by '{stratify_by}': "
+             f"{adata.n_obs:,} → {len(indices):,} cells"
+         )
+     else:
+         if stratify_by:
+             logger.warning(
+                 f"Column '{stratify_by}' not found in adata.obs; "
+                 f"falling back to random sampling"
+             )
+         indices = np.random.choice(adata.n_obs, size=max_cells, replace=False)
+         logger.info(f"Random subsampling: {adata.n_obs:,} → {max_cells:,} cells")
+
+     return adata[indices].copy() if copy else adata[indices]
+
+
+ # ============================================================================
+ # Memory-Efficient Loading
+ # ============================================================================
+
+ def load_adata_backed(
+     path: Union[str, Path],
+     max_cells: Optional[int] = None,
+     label_column: Optional[str] = None,
+     large_file_threshold_gb: float = 2.0,
+     random_state: int = 42,
+ ) -> ad.AnnData:
+     """
+     Load AnnData with memory-efficient strategies for large files.
+
+     Strategy:
+
+     - Small files (below large_file_threshold_gb): load fully into memory
+     - Large files (at or above the threshold): open in backed mode,
+       choose subsample indices, then load only that subset
+
+     Parameters
+     ----------
+     path : str or Path
+         Path to h5ad file.
+     max_cells : int, optional
+         Maximum cells to load. If None, loads all cells.
+     label_column : str, optional
+         Column for stratified sampling. If None, random sampling.
+     large_file_threshold_gb : float, default 2.0
+         File size threshold for using backed mode.
+     random_state : int, default 42
+         Random seed for reproducibility.
+
+     Returns
+     -------
+     AnnData
+         AnnData loaded into memory (not backed).
+
+     Notes
+     -----
+     For large files, this function:
+
+     1. Opens the file in backed mode (loads only metadata)
+     2. Determines subsample indices from the metadata alone
+     3. Loads ONLY the selected cells into memory
+
+     This can reduce memory usage significantly. For example, HLCA (584k cells)
+     can be subsampled to 100k cells without ever loading the full matrix.
+
+     Examples
+     --------
+     >>> from spatialcore.annotation import load_adata_backed
+     >>> # Load up to 100k cells with stratified sampling
+     >>> adata = load_adata_backed(
+     ...     "large_dataset.h5ad",
+     ...     max_cells=100000,
+     ...     label_column="cell_type",
+     ... )
+     """
+     path = Path(path)
+     if not path.exists():
+         raise FileNotFoundError(f"File not found: {path}")
+
+     file_size_gb = path.stat().st_size / (1024**3)
+     available_memory = get_available_memory_gb()
+
+     logger.info(f"Loading: {path.name} ({file_size_gb:.2f} GB)")
+     if available_memory > 0:
+         logger.info(f"  Available memory: {available_memory:.1f} GB")
+
+     if file_size_gb >= large_file_threshold_gb:
+         logger.info("  Large file - using backed mode for memory efficiency")
+
+         # Step 1: Open in backed mode (loads metadata only, not expression data)
+         adata_backed = sc.read_h5ad(str(path), backed="r")
+         logger.info(
+             f"  Opened: {adata_backed.n_obs:,} cells × {adata_backed.n_vars:,} genes (backed)"
+         )
+
+         # Step 2: Determine subsample indices BEFORE loading data
+         n_cells = adata_backed.n_obs
+         if max_cells is not None and n_cells > max_cells:
+             if label_column and label_column in adata_backed.obs.columns:
+                 indices = _stratified_sample_indices(
+                     adata_backed.obs, label_column, max_cells, random_state
+                 )
+                 logger.info(
+                     f"  Stratified subsampling: {n_cells:,} → {len(indices):,} cells"
+                 )
+             else:
+                 np.random.seed(random_state)
+                 indices = np.random.choice(n_cells, size=max_cells, replace=False)
+                 logger.info(f"  Random subsampling: {n_cells:,} → {max_cells:,} cells")
+         else:
+             indices = np.arange(n_cells)
+
+         # Step 3: Load only the selected cells into memory
+         adata = adata_backed[indices].to_memory()
+         logger.info(
+             f"  Loaded into memory: {adata.n_obs:,} cells × {adata.n_vars:,} genes"
+         )
+
+         # Clean up the backed file handle
+         adata_backed.file.close()
+         gc.collect()
+
+     else:
+         # Small file - load directly
+         logger.info("  Loading full file into memory")
+         adata = sc.read_h5ad(str(path))
+
+         # Subsample if needed
+         if max_cells is not None and adata.n_obs > max_cells:
+             adata = subsample_adata(
+                 adata, max_cells, stratify_by=label_column, random_state=random_state
+             )
+
+     return adata
+
+
+ # ============================================================================
+ # Normalization Utilities
+ # ============================================================================
+
+ def _copy_raw_to_x(adata: ad.AnnData, raw_source: str) -> None:
+     """
+     Copy raw counts from a source location to adata.X (in place).
+
+     Parameters
+     ----------
+     adata : AnnData
+         AnnData object to modify.
+     raw_source : str
+         Source location: "layers/{name}", "raw.X", or "X".
+     """
+     from scipy.sparse import issparse
+
+     if raw_source == "X":
+         # Already in X, nothing to do
+         return
+
+     if raw_source == "raw.X":
+         # Note: adata.raw.X must share adata's var dimension for this assignment
+         source_matrix = adata.raw.X
+     elif raw_source.startswith("layers/"):
+         layer_name = raw_source.split("/", 1)[1]
+         source_matrix = adata.layers[layer_name]
+     else:
+         raise ValueError(f"Unknown raw_source: {raw_source}")
+
+     # Copy to X, preserving sparsity
+     if issparse(source_matrix):
+         adata.X = source_matrix.copy()
+     else:
+         adata.X = np.array(source_matrix, copy=True)
+
+     logger.info(f"Copied raw counts from {raw_source} to X")
+
+
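+ # Illustrative raw_source values, matching the format parsed above:
+ # >>> _copy_raw_to_x(adata, "layers/counts")  # copies adata.layers["counts"] into X
+ # >>> _copy_raw_to_x(adata, "raw.X")          # copies adata.raw.X into X
+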
+ def ensure_normalized(
+     adata: ad.AnnData,
+     target_sum: float = 1e4,
+     unsafe_force: bool = False,
+     copy: bool = False,
+ ) -> ad.AnnData:
+     """
+     Ensure data is log1p normalized to target counts per cell.
+
+     This function robustly detects the normalization state by:
+
+     1. Searching for raw counts in layers["counts"], layers["raw_counts"],
+        layers["raw"], adata.raw.X, and adata.X
+     2. Verifying raw counts via an integer test with floating-point tolerance
+     3. Verifying log1p_10k via expm1 row sum estimation
+
+     CellTypist REQUIRES log1p(10k) with exclude_highly_expressed=False.
+
+     Parameters
+     ----------
+     adata : AnnData
+         AnnData object to normalize.
+     target_sum : float, default 1e4
+         Target sum for normalization (10000 for CellTypist).
+     unsafe_force : bool, default False
+         **DANGEROUS**: If True, applies normalization even when the data state
+         cannot be verified. This may produce INCORRECT results if:
+
+         - Data is already log-transformed (double-logging destroys signal)
+         - Data uses a different target sum (e.g., CPM vs 10k)
+         - Data contains negative values (z-scored/batch-corrected)
+         - Data is latent space embeddings (not expression)
+
+         Only use this if you have manually verified your data's state
+         through other means. Incorrect normalization will produce
+         systematically wrong cell type predictions.
+
+         When enabled, logs a WARNING with the detected (unverified) state.
+     copy : bool, default False
+         If True, return a copy. Otherwise modifies in place.
+
+     Returns
+     -------
+     AnnData
+         Normalized AnnData object with log1p(10k) in X.
+
+     Raises
+     ------
+     ValueError
+         If no raw counts are found and adata.X is not verified as log1p_10k,
+         unless ``unsafe_force=True``.
+
+     Notes
+     -----
+     **Safe normalization paths:**
+
+     1. Raw counts found (in layers, raw.X, or X): copy to X, normalize, log1p
+     2. X verified as log1p_10k: no action needed
+
+     **Unsafe states (require unsafe_force=True):**
+
+     - log1p_cpm: normalized to 1M instead of 10k
+     - log1p_other: unknown target sum
+     - linear: normalized but not log-transformed
+     - negative: contains negative values (z-scored?)
+     - unknown: cannot determine state
+
+     Examples
+     --------
+     >>> from spatialcore.annotation import ensure_normalized
+     >>> # Normal usage - will error if data state cannot be verified
+     >>> adata = ensure_normalized(adata, target_sum=10000)
+
+     >>> # Dangerous: force normalization on unverified data
+     >>> adata = ensure_normalized(adata, unsafe_force=True)  # NOT RECOMMENDED
+     """
+     from spatialcore.core.utils import check_normalization_status
+
+     if copy:
+         adata = adata.copy()
+
+     status = check_normalization_status(adata)
+
+     logger.info(
+         f"Normalization status: x_state={status['x_state']}, "
+         f"raw_source={status['raw_source']}"
+     )
+
+     # Path 1: X is already log1p_10k - nothing to do
+     if status["x_state"] == "log1p_10k":
+         logger.info("Data already log1p normalized to 10k")
+         return adata
+
+     # Path 2: Raw counts available - normalize from raw
+     if status["raw_source"] is not None:
+         logger.info(f"Normalizing from raw counts ({status['raw_source']})")
+
+         # Copy raw to X if not already there
+         _copy_raw_to_x(adata, status["raw_source"])
+
+         # Apply normalization
+         # CRITICAL: exclude_highly_expressed=False for CellTypist compatibility
+         sc.pp.normalize_total(
+             adata, target_sum=target_sum, exclude_highly_expressed=False
+         )
+         sc.pp.log1p(adata)
+
+         logger.info(f"Applied normalize_total({target_sum:.0f}) + log1p")
+         return adata
+
+     # Path 3: No raw counts and X is not log1p_10k - unsafe territory
+     if not status["is_usable"]:
+         error_msg = (
+             f"Cannot safely normalize data.\n"
+             f"  Detected X state: {status['x_state']}\n"
+             f"  Estimated target_sum: {status.get('x_target_sum', 'N/A')}\n"
+             f"  Raw counts found: None\n"
+             f"\n"
+             f"To resolve this:\n"
+             f"  1. Provide raw counts in adata.layers['counts'] or adata.raw.X\n"
+             f"  2. Ensure adata.X contains log1p(10k) normalized data\n"
+             f"  3. Use unsafe_force=True if you have manually verified your data\n"
+         )
+
+         if unsafe_force:
+             logger.warning("=" * 60)
+             logger.warning("UNSAFE NORMALIZATION FORCED")
+             logger.warning(f"Detected X state: {status['x_state']}")
+             logger.warning(f"Estimated target_sum: {status.get('x_target_sum', 'N/A')}")
+             logger.warning("This may produce INCORRECT downstream results.")
+             logger.warning("You have been warned.")
+             logger.warning("=" * 60)
+
+             # Apply the full pipeline anyway
+             sc.pp.normalize_total(
+                 adata, target_sum=target_sum, exclude_highly_expressed=False
+             )
+             sc.pp.log1p(adata)
+             logger.warning("Applied normalize_total + log1p on unverified data")
+             return adata
+         else:
+             raise ValueError(error_msg)
+
+     # Should not reach here, but handle gracefully
+     logger.warning(f"Unexpected state: {status}")
+     return adata
+
+
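+ # Illustrative sketch of the expm1 row-sum test described above. This is an
+ # editorial example, NOT the actual check_normalization_status implementation
+ # from spatialcore.core.utils: if X holds log1p(10k) data, expm1(X) rows
+ # should sum back to roughly 10,000 counts.
+ def _looks_like_log1p_10k(X, n_check: int = 100, rtol: float = 0.01) -> bool:
+     from scipy.sparse import issparse
+
+     rows = X[:n_check]  # checking a subset keeps the test cheap on big matrices
+     dense = rows.toarray() if issparse(rows) else np.asarray(rows)
+     row_sums = np.expm1(dense).sum(axis=1)
+     return bool(np.allclose(row_sums, 10_000, rtol=rtol))
+
+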
+ def get_loading_summary(adata: ad.AnnData) -> Dict[str, Any]:
+     """
+     Get summary statistics for loaded AnnData.
+
+     Parameters
+     ----------
+     adata : AnnData
+         Loaded AnnData object.
+
+     Returns
+     -------
+     Dict[str, Any]
+         Summary statistics including cell/gene counts, memory usage, etc.
+     """
+     from scipy.sparse import issparse
+
+     summary = {
+         "n_cells": adata.n_obs,
+         "n_genes": adata.n_vars,
+         "is_sparse": issparse(adata.X),
+         "dtype": str(adata.X.dtype),
+     }
+
+     # Memory estimate (for sparse matrices this counts the data buffer only;
+     # the index arrays add further overhead)
+     if issparse(adata.X):
+         summary["matrix_memory_mb"] = adata.X.data.nbytes / (1024**2)
+         summary["sparsity"] = 1 - (adata.X.nnz / (adata.n_obs * adata.n_vars))
+     else:
+         summary["matrix_memory_mb"] = adata.X.nbytes / (1024**2)
+         summary["sparsity"] = 0.0
+
+     # Cell type info if available
+     for col in ["cell_type", "celltype", "CellType"]:
+         if col in adata.obs.columns:
+             summary["cell_type_column"] = col
+             summary["n_cell_types"] = adata.obs[col].nunique()
+             break
+
+     return summary
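+
+ # Illustrative output for a hypothetical sparse 50k-cell dataset (values vary
+ # by dataset; the keys are the ones populated above):
+ # >>> get_loading_summary(adata)  # doctest: +SKIP
+ # {'n_cells': 50000, 'n_genes': 18000, 'is_sparse': True, 'dtype': 'float32',
+ #  'matrix_memory_mb': 343.3, 'sparsity': 0.9,
+ #  'cell_type_column': 'cell_type', 'n_cell_types': 24}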