spatialcore 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. spatialcore/__init__.py +122 -0
  2. spatialcore/annotation/__init__.py +253 -0
  3. spatialcore/annotation/acquisition.py +529 -0
  4. spatialcore/annotation/annotate.py +603 -0
  5. spatialcore/annotation/cellxgene.py +365 -0
  6. spatialcore/annotation/confidence.py +802 -0
  7. spatialcore/annotation/discovery.py +529 -0
  8. spatialcore/annotation/expression.py +363 -0
  9. spatialcore/annotation/loading.py +529 -0
  10. spatialcore/annotation/markers.py +297 -0
  11. spatialcore/annotation/ontology.py +1282 -0
  12. spatialcore/annotation/patterns.py +247 -0
  13. spatialcore/annotation/pipeline.py +620 -0
  14. spatialcore/annotation/synapse.py +380 -0
  15. spatialcore/annotation/training.py +1457 -0
  16. spatialcore/annotation/validation.py +422 -0
  17. spatialcore/core/__init__.py +34 -0
  18. spatialcore/core/cache.py +118 -0
  19. spatialcore/core/logging.py +135 -0
  20. spatialcore/core/metadata.py +149 -0
  21. spatialcore/core/utils.py +768 -0
  22. spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +86372 -0
  23. spatialcore/data/markers/canonical_markers.json +83 -0
  24. spatialcore/data/ontology_mappings/ontology_index.json +63865 -0
  25. spatialcore/plotting/__init__.py +109 -0
  26. spatialcore/plotting/benchmark.py +477 -0
  27. spatialcore/plotting/celltype.py +329 -0
  28. spatialcore/plotting/confidence.py +413 -0
  29. spatialcore/plotting/spatial.py +505 -0
  30. spatialcore/plotting/utils.py +411 -0
  31. spatialcore/plotting/validation.py +1342 -0
  32. spatialcore-0.1.9.dist-info/METADATA +213 -0
  33. spatialcore-0.1.9.dist-info/RECORD +36 -0
  34. spatialcore-0.1.9.dist-info/WHEEL +5 -0
  35. spatialcore-0.1.9.dist-info/licenses/LICENSE +201 -0
  36. spatialcore-0.1.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,365 @@
1
+ """
2
+ CellxGene reference data download utilities.
3
+
4
+ This module provides utilities for downloading reference datasets from
5
+ CellxGene Census, including:
6
+ - Downloading predefined datasets by key
7
+ - Querying Census with flexible filters (tissue, disease, cell type)
8
+ - Listing available datasets
9
+
10
+ Gene mapping utilities (Ensembl → HUGO) have been moved to spatialcore.core.utils
11
+ and are re-exported here for backward compatibility.
12
+
13
+ References:
14
+ - CellxGene Census: https://chanzuckerberg.github.io/cellxgene-census/
15
+ """
16
+
17
+ from pathlib import Path
18
+ from typing import Dict, List, Optional, Any, Union
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+ import anndata as ad
23
+
24
+ from spatialcore.core.logging import get_logger
25
+
26
+ # Re-export gene mapping utilities from core/utils for backward compatibility
27
+ from spatialcore.core.utils import (
28
+ load_ensembl_to_hugo_mapping,
29
+ normalize_gene_names,
30
+ check_normalization_status,
31
+ download_ensembl_mapping,
32
+ is_ensembl_id,
33
+ )
34
+
35
+ logger = get_logger(__name__)
36
+
37
+ # ============================================================================
38
+ # CellxGene Dataset Registry
39
+ # ============================================================================
40
+
41
# Registry of predefined CellxGene reference datasets.
#
# Outer keys are the short dataset names used to look entries up in this
# module. Every entry carries the same five fields, in this order:
#   dataset_id        - CellxGene dataset identifier (UUID string)
#   description       - human-readable title of the dataset
#   tissue            - tissue label for the dataset
#   cell_type_column  - name of the column carrying cell-type labels
#   expected_cells    - approximate cell count, kept as a display string
CELLXGENE_DATASETS: Dict[str, Dict[str, Any]] = dict(
    # --- Liver ---
    healthy_human_liver=dict(
        dataset_id="4f88c1be-5156-463d-b64d-a3a3a8e0da6d",
        description="Cell types from scRNA-seq and snRNA-seq of healthy human liver",
        tissue="liver",
        cell_type_column="cell_type",
        expected_cells="~100,000",
    ),
    # --- Colon / GI ---
    colon_immune_niches=dict(
        dataset_id="2872f4b0-b171-46e2-abc6-befcf6de6306",
        description="Distinct microbial and immune niches of the human colon",
        tissue="colon",
        cell_type_column="cell_type",
        expected_cells="~41,650",
    ),
    colon_ulcerative_colitis=dict(
        dataset_id="4dd00779-7f73-4f50-89bb-e2d3c6b71b18",
        description="Human Colon during Ulcerative Colitis (Smillie et al.)",
        tissue="colon",
        cell_type_column="cell_type",
        expected_cells="~34,772",
    ),
    colon_crohns_immune=dict(
        dataset_id="518d9049-2a76-44f8-8abc-1e2b59ab5ba1",
        description="Crohn's disease colon immune cells",
        tissue="colon",
        cell_type_column="cell_type",
        expected_cells="~152,509",
    ),
    # --- Lung ---
    human_lung_cell_atlas=dict(
        dataset_id="f72958f5-7f42-4ebb-98da-445b0c6de516",
        description="Human Lung Cell Atlas (HLCA) - Azimuth",
        tissue="lung",
        cell_type_column="ann_finest_level",
        expected_cells="~584,884",
    ),
    lung_covid=dict(
        dataset_id="d8da613f-e681-4c69-b463-e94f5e66847f",
        description="Molecular single-cell lung atlas of lethal COVID-19",
        tissue="lung",
        cell_type_column="cell_type",
        expected_cells="~116,313",
    ),
    # --- Colorectal cancer (CRC) ---
    crc_htan_epithelial_discovery=dict(
        dataset_id="e40c6272-af77-4a10-9385-62a398884f27",
        description="HTAN VUMC CRC Polyps - Epithelial (Discovery)",
        tissue="colon",
        cell_type_column="cell_type",
        expected_cells="~65,088",
    ),
)
96
+
97
+
98
def list_available_datasets() -> pd.DataFrame:
    """
    Summarize the CELLXGENE_DATASETS registry as a table.

    Returns
    -------
    pd.DataFrame
        One row per registry entry, in registry order, with the columns
        ``dataset_key``, ``description``, ``tissue``, ``cell_type_column``
        and ``expected_cells``. ``expected_cells`` falls back to
        ``"unknown"`` when an entry does not define it.
    """
    rows = [
        {
            "dataset_key": name,
            "description": meta["description"],
            "tissue": meta["tissue"],
            "cell_type_column": meta["cell_type_column"],
            # Only this field is optional in a registry entry.
            "expected_cells": meta.get("expected_cells", "unknown"),
        }
        for name, meta in CELLXGENE_DATASETS.items()
    ]
    return pd.DataFrame(rows)
117
+
118
+
119
def download_cellxgene_reference(
    dataset_key: str,
    output_dir: Union[str, Path],
    force: bool = False,
) -> Path:
    """
    Download a reference dataset from CellxGene Census.

    The source h5ad of the selected registry entry is stored as
    ``<output_dir>/<dataset_key>.h5ad``. The transfer is written to a
    temporary sibling file first and only moved into place once it has
    finished, so an interrupted download is never mistaken for a cached
    dataset on the next call.

    Parameters
    ----------
    dataset_key : str
        Key from CELLXGENE_DATASETS registry (e.g., "healthy_human_liver").
    output_dir : str or Path
        Directory to save the downloaded h5ad file. Created if missing.
    force : bool, default False
        If True, re-download and replace the file even if it exists.

    Returns
    -------
    Path
        Path to the downloaded (or already cached) h5ad file.

    Raises
    ------
    ValueError
        If dataset_key is not in CELLXGENE_DATASETS.
    ImportError
        If a download is needed and cellxgene-census is not installed.
        A cached file is returned without requiring the package.

    Examples
    --------
    >>> from spatialcore.annotation import download_cellxgene_reference
    >>> path = download_cellxgene_reference("healthy_human_liver", "./references")
    >>> print(path)
    references/healthy_human_liver.h5ad
    """
    if dataset_key not in CELLXGENE_DATASETS:
        available = ", ".join(CELLXGENE_DATASETS)
        raise ValueError(
            f"Unknown dataset: '{dataset_key}'. Available: {available}"
        )

    output_dir = Path(output_dir)
    output_file = output_dir / f"{dataset_key}.h5ad"

    # Serve the cached copy before touching the optional dependency, so an
    # already-downloaded reference stays usable without cellxgene-census.
    if output_file.exists() and not force:
        logger.info(f"Dataset already exists: {output_file}")
        return output_file

    try:
        import cellxgene_census
    except ImportError as err:
        raise ImportError(
            "cellxgene-census is required for downloading CellxGene data. "
            "Install with: pip install cellxgene-census"
        ) from err

    output_dir.mkdir(parents=True, exist_ok=True)

    dataset_info = CELLXGENE_DATASETS[dataset_key]
    dataset_id = dataset_info["dataset_id"]

    logger.info(f"Downloading {dataset_key} (ID: {dataset_id})...")
    logger.info(f"  Description: {dataset_info['description']}")
    logger.info(f"  Expected cells: {dataset_info.get('expected_cells', 'unknown')}")

    # download_source_h5ad is documented as refusing to write to a path that
    # already exists, so passing the final path would make force=True fail
    # whenever there is something to replace. Download to a fresh temporary
    # name and swap it in afterwards instead.
    partial_file = output_dir / f"{dataset_key}.partial.h5ad"
    if partial_file.exists():
        # Left over from an earlier interrupted attempt.
        partial_file.unlink()

    try:
        cellxgene_census.download_source_h5ad(
            dataset_id,
            to_path=str(partial_file),
        )
        # Path.replace overwrites an existing target (the force=True case).
        partial_file.replace(output_file)
    finally:
        # Only still present when the download or the move failed.
        if partial_file.exists():
            partial_file.unlink()

    logger.info(f"Downloaded to: {output_file}")
    return output_file
192
+
193
+
194
def _build_census_value_filter(
    tissue: Optional[str],
    disease: Optional[str],
    cell_type: Optional[str],
    assay: Optional[str],
) -> str:
    """Return the obs value filter for the given criteria (at least one required)."""
    requested = {
        "tissue": tissue,
        "disease": disease,
        "cell_type": cell_type,
        "assay": assay,
    }
    # repr() emits a correctly quoted and escaped string literal, so values
    # containing an apostrophe (e.g. the assay "10x 3' v3") no longer break
    # the expression. Values without quotes render exactly as before
    # ('liver'). NOTE(review): relies on the value filter accepting
    # Python-style string literals - confirm against the TileDB-SOMA docs.
    clauses = [
        f"{column} == {str(value)!r}"
        for column, value in requested.items()
        if value
    ]
    if not clauses:
        raise ValueError(
            "At least one filter (tissue, disease, cell_type, or assay) is required"
        )
    return " and ".join(["is_primary_data == True", *clauses])


def query_cellxgene_census(
    tissue: Optional[str] = None,
    disease: Optional[str] = None,
    cell_type: Optional[str] = None,
    assay: Optional[str] = None,
    organism: str = "Homo sapiens",
    obs_columns: Optional[List[str]] = None,
    max_cells: Optional[int] = None,
    output_path: Optional[Union[str, Path]] = None,
    random_state: int = 42,
) -> ad.AnnData:
    """
    Query cells from CellxGene Census with flexible filters.

    This provides more flexibility than download_cellxgene_reference() by
    allowing arbitrary tissue/disease/cell_type combinations. The query is
    always restricted to ``is_primary_data == True``.

    Parameters
    ----------
    tissue : str, optional
        Tissue filter (e.g., "liver", "lung", "colon").
    disease : str, optional
        Disease filter (e.g., "normal", "hepatocellular carcinoma").
    cell_type : str, optional
        Cell type filter (e.g., "T cell", "hepatocyte").
    assay : str, optional
        Assay filter (e.g., "10x 3' v3", "Smart-seq2").
    organism : str, default "Homo sapiens"
        Organism to query.
    obs_columns : List[str], optional
        Columns to include in obs. Default: cell_type,
        cell_type_ontology_term_id, disease, assay, dataset_id, tissue.
    max_cells : int, optional
        Maximum cells to return; must be a positive integer when given.
        None (default) downloads every matching cell, which can require a
        lot of memory. When specified, the matching cell IDs are fetched
        first, sampled in memory, and only the sampled cells are
        downloaded.
    output_path : str or Path, optional
        If provided, save result to this h5ad file.
    random_state : int, default 42
        Random seed for subsampling (only used when max_cells is specified).
        The global NumPy random state is left untouched.

    Returns
    -------
    AnnData
        AnnData object with queried cells.

    Raises
    ------
    ImportError
        If cellxgene-census is not installed.
    ValueError
        If no filter criteria provided, or if max_cells is not positive.

    Examples
    --------
    >>> from spatialcore.annotation import query_cellxgene_census
    >>> # Download ALL healthy liver cells
    >>> adata = query_cellxgene_census(
    ...     tissue="liver",
    ...     disease="normal",
    ...     output_path="./references/healthy_liver.h5ad"
    ... )
    >>> # Sample 5000 cells (memory-efficient for development)
    >>> sample = query_cellxgene_census(
    ...     tissue="liver",
    ...     disease="hepatocellular carcinoma",
    ...     max_cells=5000,
    ... )
    """
    try:
        import cellxgene_census
    except ImportError as err:
        raise ImportError(
            "cellxgene-census is required for querying CellxGene data. "
            "Install with: pip install cellxgene-census"
        ) from err

    # A plain truthiness test would silently turn max_cells=0 into
    # "download everything"; reject non-positive limits explicitly instead.
    if max_cells is not None and max_cells < 1:
        raise ValueError(
            f"max_cells must be a positive integer or None, got {max_cells!r}"
        )

    filter_string = _build_census_value_filter(tissue, disease, cell_type, assay)

    # Default obs columns - includes ontology ID if available in Census
    if obs_columns is None:
        obs_columns = [
            "cell_type",
            "cell_type_ontology_term_id",  # CL ID from CellxGene curators
            "disease",
            "assay",
            "dataset_id",
            "tissue",
        ]

    logger.info("Querying CellxGene Census...")
    logger.info(f"  Organism: {organism}")
    logger.info(f"  Filter: {filter_string}")

    with cellxgene_census.open_soma() as census:
        if max_cells is not None:
            # Sample cell IDs BEFORE downloading expression data so that only
            # the cells actually needed are fetched.

            # Convert organism name to Census key format
            # (e.g., "Homo sapiens" -> "homo_sapiens")
            organism_key = organism.lower().replace(" ", "_")

            # Step 1: Get cell IDs matching filter (no expression data)
            logger.info("  Step 1: Counting cells matching filter...")
            experiment = census["census_data"][organism_key]
            obs_df = experiment.obs.read(
                value_filter=filter_string,
                column_names=["soma_joinid"],  # Only the IDs are needed here
            ).concat().to_pandas()
            matching_ids = obs_df["soma_joinid"].values

            total_cells = len(matching_ids)
            logger.info(f"  Found {total_cells:,} cells matching filter")

            # Step 2: Sample cell IDs if needed
            if total_cells > max_cells:
                logger.info(f"  Step 2: Sampling {max_cells:,} cell IDs (memory-efficient)...")
                # A private RandomState draws the same sample as seeding the
                # global generator would, without altering global RNG state.
                rng = np.random.RandomState(random_state)
                sampled_ids = rng.choice(
                    matching_ids,
                    size=max_cells,
                    replace=False,
                )
            else:
                sampled_ids = matching_ids
                logger.info(f"  Step 2: Using all {len(sampled_ids):,} cells (under max_cells limit)")

            # Step 3: Download only the sampled cells
            logger.info(f"  Step 3: Downloading expression data for {len(sampled_ids):,} cells...")
            adata = cellxgene_census.get_anndata(
                census=census,
                organism=organism,
                obs_coords=sampled_ids,
                obs_column_names=obs_columns,
            )
        else:
            # No max_cells limit - download everything (use with caution!)
            logger.warning("  No max_cells limit set - downloading ALL matching cells!")
            logger.warning("  This may use significant memory. Consider setting max_cells.")
            adata = cellxgene_census.get_anndata(
                census=census,
                organism=organism,
                obs_value_filter=filter_string,
                obs_column_names=obs_columns,
            )

    logger.info(f"  Downloaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes")

    # Save if output path provided
    if output_path:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        adata.write_h5ad(output_path)
        logger.info(f"  Saved to: {output_path}")

    return adata