spatialcore 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spatialcore/__init__.py +122 -0
- spatialcore/annotation/__init__.py +253 -0
- spatialcore/annotation/acquisition.py +529 -0
- spatialcore/annotation/annotate.py +603 -0
- spatialcore/annotation/cellxgene.py +365 -0
- spatialcore/annotation/confidence.py +802 -0
- spatialcore/annotation/discovery.py +529 -0
- spatialcore/annotation/expression.py +363 -0
- spatialcore/annotation/loading.py +529 -0
- spatialcore/annotation/markers.py +297 -0
- spatialcore/annotation/ontology.py +1282 -0
- spatialcore/annotation/patterns.py +247 -0
- spatialcore/annotation/pipeline.py +620 -0
- spatialcore/annotation/synapse.py +380 -0
- spatialcore/annotation/training.py +1457 -0
- spatialcore/annotation/validation.py +422 -0
- spatialcore/core/__init__.py +34 -0
- spatialcore/core/cache.py +118 -0
- spatialcore/core/logging.py +135 -0
- spatialcore/core/metadata.py +149 -0
- spatialcore/core/utils.py +768 -0
- spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +86372 -0
- spatialcore/data/markers/canonical_markers.json +83 -0
- spatialcore/data/ontology_mappings/ontology_index.json +63865 -0
- spatialcore/plotting/__init__.py +109 -0
- spatialcore/plotting/benchmark.py +477 -0
- spatialcore/plotting/celltype.py +329 -0
- spatialcore/plotting/confidence.py +413 -0
- spatialcore/plotting/spatial.py +505 -0
- spatialcore/plotting/utils.py +411 -0
- spatialcore/plotting/validation.py +1342 -0
- spatialcore-0.1.9.dist-info/METADATA +213 -0
- spatialcore-0.1.9.dist-info/RECORD +36 -0
- spatialcore-0.1.9.dist-info/WHEEL +5 -0
- spatialcore-0.1.9.dist-info/licenses/LICENSE +201 -0
- spatialcore-0.1.9.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CellxGene reference data download utilities.
|
|
3
|
+
|
|
4
|
+
This module provides utilities for downloading reference datasets from
|
|
5
|
+
CellxGene Census, including:
|
|
6
|
+
- Downloading predefined datasets by key
|
|
7
|
+
- Querying Census with flexible filters (tissue, disease, cell type)
|
|
8
|
+
- Listing available datasets
|
|
9
|
+
|
|
10
|
+
Gene mapping utilities (Ensembl → HUGO) have been moved to spatialcore.core.utils
|
|
11
|
+
and are re-exported here for backward compatibility.
|
|
12
|
+
|
|
13
|
+
References:
|
|
14
|
+
- CellxGene Census: https://chanzuckerberg.github.io/cellxgene-census/
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Dict, List, Optional, Any, Union
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
import pandas as pd
|
|
22
|
+
import anndata as ad
|
|
23
|
+
|
|
24
|
+
from spatialcore.core.logging import get_logger
|
|
25
|
+
|
|
26
|
+
# Re-export gene mapping utilities from core/utils for backward compatibility
|
|
27
|
+
from spatialcore.core.utils import (
|
|
28
|
+
load_ensembl_to_hugo_mapping,
|
|
29
|
+
normalize_gene_names,
|
|
30
|
+
check_normalization_status,
|
|
31
|
+
download_ensembl_mapping,
|
|
32
|
+
is_ensembl_id,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# Module-level logger, obtained from spatialcore's logging helper and named after this module.
logger = get_logger(__name__)
|
|
36
|
+
|
|
37
|
+
# ============================================================================
|
|
38
|
+
# CellxGene Dataset Registry
|
|
39
|
+
# ============================================================================
|
|
40
|
+
|
|
41
|
+
# Registry of predefined CellxGene reference datasets, keyed by a short
# human-readable dataset key (the key is also used as the output file stem by
# download_cellxgene_reference()).
#
# Every entry carries:
#   dataset_id        - identifier handed to cellxgene_census.download_source_h5ad()
#   description       - free-text description shown in logs and listings
#   tissue            - tissue label for the dataset
#   cell_type_column  - presumably the obs column holding the cell-type labels
#                       to use for this dataset (verify against callers)
#   expected_cells    - approximate cell count as a display string; optional,
#                       readers fall back to "unknown" when it is absent
CELLXGENE_DATASETS: Dict[str, Dict[str, Any]] = {
    # Liver datasets
    "healthy_human_liver": {
        "dataset_id": "4f88c1be-5156-463d-b64d-a3a3a8e0da6d",
        "description": "Cell types from scRNA-seq and snRNA-seq of healthy human liver",
        "tissue": "liver",
        "cell_type_column": "cell_type",
        "expected_cells": "~100,000",
    },
    # Colon / GI datasets
    "colon_immune_niches": {
        "dataset_id": "2872f4b0-b171-46e2-abc6-befcf6de6306",
        "description": "Distinct microbial and immune niches of the human colon",
        "tissue": "colon",
        "cell_type_column": "cell_type",
        "expected_cells": "~41,650",
    },
    "colon_ulcerative_colitis": {
        "dataset_id": "4dd00779-7f73-4f50-89bb-e2d3c6b71b18",
        "description": "Human Colon during Ulcerative Colitis (Smillie et al.)",
        "tissue": "colon",
        "cell_type_column": "cell_type",
        "expected_cells": "~34,772",
    },
    "colon_crohns_immune": {
        "dataset_id": "518d9049-2a76-44f8-8abc-1e2b59ab5ba1",
        "description": "Crohn's disease colon immune cells",
        "tissue": "colon",
        "cell_type_column": "cell_type",
        "expected_cells": "~152,509",
    },
    # Lung datasets
    "human_lung_cell_atlas": {
        "dataset_id": "f72958f5-7f42-4ebb-98da-445b0c6de516",
        "description": "Human Lung Cell Atlas (HLCA) - Azimuth",
        "tissue": "lung",
        # Unlike the other entries, this dataset uses a non-default label column.
        "cell_type_column": "ann_finest_level",
        "expected_cells": "~584,884",
    },
    "lung_covid": {
        "dataset_id": "d8da613f-e681-4c69-b463-e94f5e66847f",
        "description": "Molecular single-cell lung atlas of lethal COVID-19",
        "tissue": "lung",
        "cell_type_column": "cell_type",
        "expected_cells": "~116,313",
    },
    # CRC datasets
    "crc_htan_epithelial_discovery": {
        "dataset_id": "e40c6272-af77-4a10-9385-62a398884f27",
        "description": "HTAN VUMC CRC Polyps - Epithelial (Discovery)",
        "tissue": "colon",
        "cell_type_column": "cell_type",
        "expected_cells": "~65,088",
    },
}
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def list_available_datasets() -> pd.DataFrame:
    """
    Summarize the CELLXGENE_DATASETS registry as a table.

    Returns
    -------
    pd.DataFrame
        One row per registered dataset with the columns ``dataset_key``,
        ``description``, ``tissue``, ``cell_type_column`` and
        ``expected_cells`` ("unknown" when a registry entry omits it).
    """
    rows = [
        {
            "dataset_key": name,
            "description": meta["description"],
            "tissue": meta["tissue"],
            "cell_type_column": meta["cell_type_column"],
            "expected_cells": meta.get("expected_cells", "unknown"),
        }
        for name, meta in CELLXGENE_DATASETS.items()
    ]
    return pd.DataFrame(rows)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def download_cellxgene_reference(
    dataset_key: str,
    output_dir: Union[str, Path],
    force: bool = False,
) -> Path:
    """
    Download a reference dataset from CellxGene Census.

    The file is first written to ``<dataset_key>.h5ad.part`` next to the
    target and only moved into place once the download has finished, so an
    interrupted download never leaves a truncated ``.h5ad`` that a later call
    would mistake for a complete one.

    Parameters
    ----------
    dataset_key : str
        Key from CELLXGENE_DATASETS registry (e.g., "healthy_human_liver").
    output_dir : str or Path
        Directory to save the downloaded h5ad file. Created if missing.
    force : bool, default False
        If True, re-download and replace the file even if it already exists.

    Returns
    -------
    Path
        Path to the downloaded h5ad file (``output_dir/<dataset_key>.h5ad``).

    Raises
    ------
    ValueError
        If dataset_key is not in CELLXGENE_DATASETS.
    ImportError
        If cellxgene-census is not installed.

    Examples
    --------
    >>> from spatialcore.annotation import download_cellxgene_reference
    >>> path = download_cellxgene_reference("healthy_human_liver", "./references")
    >>> print(path)
    references/healthy_human_liver.h5ad
    """
    if dataset_key not in CELLXGENE_DATASETS:
        available = ", ".join(CELLXGENE_DATASETS.keys())
        raise ValueError(
            f"Unknown dataset: '{dataset_key}'. Available: {available}"
        )

    try:
        import cellxgene_census
    except ImportError as err:
        raise ImportError(
            "cellxgene-census is required for downloading CellxGene data. "
            "Install with: pip install cellxgene-census"
        ) from err

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{dataset_key}.h5ad"

    if output_file.exists() and not force:
        logger.info(f"Dataset already exists: {output_file}")
        return output_file

    dataset_info = CELLXGENE_DATASETS[dataset_key]
    dataset_id = dataset_info["dataset_id"]

    logger.info(f"Downloading {dataset_key} (ID: {dataset_id})...")
    logger.info(f"  Description: {dataset_info['description']}")
    logger.info(f"  Expected cells: {dataset_info.get('expected_cells', 'unknown')}")

    # download_source_h5ad is documented to refuse an existing ``to_path``,
    # so with force=True we must not point it at the existing file. Download
    # to a fresh sibling path and atomically swap it in afterwards.
    partial_file = output_file.with_name(output_file.name + ".part")
    if partial_file.exists():
        # Stale leftover from a previously interrupted download.
        partial_file.unlink()

    try:
        # Download using Census API
        cellxgene_census.download_source_h5ad(
            dataset_id,
            to_path=str(partial_file),
        )
        # Path.replace overwrites an existing target (needed for force=True).
        partial_file.replace(output_file)
    finally:
        # Only still present if the download or the move failed.
        if partial_file.exists():
            partial_file.unlink()

    logger.info(f"Downloaded to: {output_file}")
    return output_file
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _build_census_filter(
    tissue: Optional[str],
    disease: Optional[str],
    cell_type: Optional[str],
    assay: Optional[str],
) -> str:
    """Build the obs value-filter expression, requiring at least one criterion."""
    clauses = ["is_primary_data == True"]
    criteria = (
        ("tissue", tissue),
        ("disease", disease),
        ("cell_type", cell_type),
        ("assay", assay),
    )
    for column, value in criteria:
        if value:
            # repr() yields a correctly quoted/escaped string literal, so values
            # containing a single quote (e.g. "10x 3' v3") no longer produce a
            # malformed expression. Plain values still render as 'value'.
            clauses.append(f"{column} == {str(value)!r}")

    if len(clauses) == 1:
        raise ValueError(
            "At least one filter (tissue, disease, cell_type, or assay) is required"
        )
    return " and ".join(clauses)


def query_cellxgene_census(
    tissue: Optional[str] = None,
    disease: Optional[str] = None,
    cell_type: Optional[str] = None,
    assay: Optional[str] = None,
    organism: str = "Homo sapiens",
    obs_columns: Optional[List[str]] = None,
    max_cells: Optional[int] = None,
    output_path: Optional[Union[str, Path]] = None,
    random_state: int = 42,
) -> ad.AnnData:
    """
    Query cells from CellxGene Census with flexible filters.

    This provides more flexibility than download_cellxgene_reference() by
    allowing arbitrary tissue/disease/cell_type combinations. Only primary
    data (``is_primary_data == True``) is queried.

    Parameters
    ----------
    tissue : str, optional
        Tissue filter (e.g., "liver", "lung", "colon").
    disease : str, optional
        Disease filter (e.g., "normal", "hepatocellular carcinoma").
    cell_type : str, optional
        Cell type filter (e.g., "T cell", "hepatocyte").
    assay : str, optional
        Assay filter (e.g., "10x 3' v3", "Smart-seq2"). Values containing
        quotes are escaped automatically.
    organism : str, default "Homo sapiens"
        Organism to query.
    obs_columns : List[str], optional
        Columns to include in obs. Default: cell_type,
        cell_type_ontology_term_id, disease, assay, dataset_id, tissue.
    max_cells : int, optional
        Maximum cells to return; must be positive. Default None downloads
        ALL matching cells. If specified, uses memory-efficient sampling:
        queries cell IDs first, samples in memory, then downloads only the
        sampled cells. Use this for testing/development to avoid OOM errors
        on memory-constrained systems.
    output_path : str or Path, optional
        If provided, save result to this h5ad file.
    random_state : int, default 42
        Random seed for subsampling (only used when max_cells is specified).
        The global NumPy random state is left untouched.

    Returns
    -------
    AnnData
        AnnData object with queried cells.

    Raises
    ------
    ValueError
        If no filter criteria provided, or if max_cells is not positive.
    ImportError
        If cellxgene-census is not installed (Linux only, no Windows support).

    Examples
    --------
    >>> from spatialcore.annotation import query_cellxgene_census
    >>> # Production: Download ALL healthy liver cells
    >>> adata = query_cellxgene_census(
    ...     tissue="liver",
    ...     disease="normal",
    ...     output_path="./references/healthy_liver.h5ad"
    ... )
    >>> # Testing: Sample 5000 cells (memory-efficient for development)
    >>> sample = query_cellxgene_census(
    ...     tissue="liver",
    ...     disease="hepatocellular carcinoma",
    ...     max_cells=5000,  # Only for testing
    ... )
    """
    # Validate arguments first so bad input fails fast, before the optional
    # dependency is imported or any network connection is opened.
    filter_string = _build_census_filter(tissue, disease, cell_type, assay)

    if max_cells is not None and max_cells <= 0:
        # Previously max_cells=0 was falsy and silently downloaded everything.
        raise ValueError(
            f"max_cells must be a positive integer or None, got {max_cells!r}"
        )

    try:
        import cellxgene_census
    except ImportError as err:
        raise ImportError(
            "cellxgene-census is required for querying CellxGene data. "
            "Install with: pip install cellxgene-census"
        ) from err

    # Default obs columns - includes ontology ID if available in Census
    if obs_columns is None:
        obs_columns = [
            "cell_type",
            "cell_type_ontology_term_id",  # CL ID from CellxGene curators
            "disease",
            "assay",
            "dataset_id",
            "tissue",
        ]

    logger.info("Querying CellxGene Census...")
    logger.info(f"  Organism: {organism}")
    logger.info(f"  Filter: {filter_string}")

    with cellxgene_census.open_soma() as census:
        if max_cells is not None:
            # Memory-efficient approach: sample cell IDs BEFORE downloading
            # expression data, so only the needed cells are ever fetched.

            # Convert organism name to Census key format
            # (e.g., "Homo sapiens" -> "homo_sapiens")
            organism_key = organism.lower().replace(" ", "_")

            # Step 1: Get cell IDs matching filter (lightweight - no expression data)
            logger.info("  Step 1: Counting cells matching filter...")
            organism_data = census["census_data"][organism_key]
            obs_df = organism_data.obs.read(
                value_filter=filter_string,
                column_names=["soma_joinid"],  # Only get IDs, very lightweight
            ).concat().to_pandas()

            total_cells = len(obs_df)
            logger.info(f"  Found {total_cells:,} cells matching filter")

            # Step 2: Sample cell IDs if needed
            if total_cells > max_cells:
                logger.info(f"  Step 2: Sampling {max_cells:,} cell IDs (memory-efficient)...")
                # A local RandomState draws the same sample as seeding the
                # global generator would, without clobbering the caller's
                # global NumPy random state.
                rng = np.random.RandomState(random_state)
                sampled_ids = rng.choice(
                    obs_df["soma_joinid"].values,
                    size=max_cells,
                    replace=False,
                )
            else:
                sampled_ids = obs_df["soma_joinid"].values
                logger.info(f"  Step 2: Using all {len(sampled_ids):,} cells (under max_cells limit)")

            # Step 3: Download only sampled cells (key memory optimization!)
            logger.info(f"  Step 3: Downloading expression data for {len(sampled_ids):,} cells...")
            adata = cellxgene_census.get_anndata(
                census=census,
                organism=organism,
                obs_coords=sampled_ids,  # Only fetch these specific cells!
                obs_column_names=obs_columns,
            )
        else:
            # No max_cells limit - download everything (use with caution!)
            logger.warning("  No max_cells limit set - downloading ALL matching cells!")
            logger.warning("  This may use significant memory. Consider setting max_cells.")
            adata = cellxgene_census.get_anndata(
                census=census,
                organism=organism,
                obs_value_filter=filter_string,
                obs_column_names=obs_columns,
            )

    logger.info(f"  Downloaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes")

    # Save if output path provided
    if output_path:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        adata.write_h5ad(output_path)
        logger.info(f"  Saved to: {output_path}")

    return adata
|