spatialcore 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {spatialcore-0.2.1 → spatialcore-0.2.3}/PKG-INFO +1 -1
- {spatialcore-0.2.1 → spatialcore-0.2.3}/pyproject.toml +1 -1
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/__init__.py +1 -1
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/cellxgene.py +37 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/validation.py +74 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/core/utils.py +70 -38
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore.egg-info/PKG-INFO +1 -1
- {spatialcore-0.2.1 → spatialcore-0.2.3}/LICENSE +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/README.md +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/setup.cfg +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/__init__.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/acquisition.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/annotate.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/confidence.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/discovery.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/expression.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/loading.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/markers.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/ontology.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/patterns.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/pipeline.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/synapse.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/training.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/core/__init__.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/core/cache.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/core/logging.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/core/metadata.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/data/markers/canonical_markers.json +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/data/ontology_mappings/ontology_index.json +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/__init__.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/benchmark.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/celltype.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/confidence.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/spatial.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/utils.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/validation.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/r_bridge/__init__.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/r_bridge/subprocess_runner.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/spatial/__init__.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/spatial/autocorrelation.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/spatial/distance.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/spatial/domains.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/spatial/neighborhoods.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/stats/__init__.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/stats/_thresholding.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/stats/classify.py +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore.egg-info/SOURCES.txt +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore.egg-info/dependency_links.txt +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore.egg-info/requires.txt +0 -0
- {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore.egg-info/top_level.txt +0 -0
|
@@ -5,7 +5,7 @@ A thin, robust wrapper around standard libraries to ensure Python and R users
|
|
|
5
5
|
get the exact same result for the same biological question.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
__version__ = "0.2.1"
|
|
8
|
+
__version__ = "0.2.3"
|
|
9
9
|
|
|
10
10
|
# Track which modules are available in this installation
|
|
11
11
|
_available_modules: list[str] = []
|
|
@@ -201,6 +201,7 @@ def query_cellxgene_census(
|
|
|
201
201
|
max_cells: Optional[int] = None,
|
|
202
202
|
output_path: Optional[Union[str, Path]] = None,
|
|
203
203
|
random_state: int = 42,
|
|
204
|
+
validate_labels: bool = True,
|
|
204
205
|
) -> ad.AnnData:
|
|
205
206
|
"""
|
|
206
207
|
Query cells from CellxGene Census with flexible filters.
|
|
@@ -232,6 +233,9 @@ def query_cellxgene_census(
|
|
|
232
233
|
If provided, save result to this h5ad file.
|
|
233
234
|
random_state : int, default 42
|
|
234
235
|
Random seed for subsampling (only used when max_cells is specified).
|
|
236
|
+
validate_labels : bool, default True
|
|
237
|
+
If True, check for label-to-ontology inconsistencies in CellxGene
|
|
238
|
+
columns (cell_type vs cell_type_ontology_term_id) and log warnings.
|
|
235
239
|
|
|
236
240
|
Returns
|
|
237
241
|
-------
|
|
@@ -355,6 +359,39 @@ def query_cellxgene_census(
|
|
|
355
359
|
|
|
356
360
|
logger.info(f" Downloaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes")
|
|
357
361
|
|
|
362
|
+
if validate_labels:
|
|
363
|
+
if (
|
|
364
|
+
"cell_type" in adata.obs.columns
|
|
365
|
+
and "cell_type_ontology_term_id" in adata.obs.columns
|
|
366
|
+
):
|
|
367
|
+
from spatialcore.annotation.validation import (
|
|
368
|
+
check_label_ontology_consistency,
|
|
369
|
+
)
|
|
370
|
+
|
|
371
|
+
consistency = check_label_ontology_consistency(
|
|
372
|
+
adata,
|
|
373
|
+
label_column="cell_type",
|
|
374
|
+
ontology_column="cell_type_ontology_term_id",
|
|
375
|
+
)
|
|
376
|
+
|
|
377
|
+
if consistency.n_labels_with_multiple_ids > 0:
|
|
378
|
+
examples = []
|
|
379
|
+
for label in sorted(consistency.labels_with_multiple_ids.keys())[:5]:
|
|
380
|
+
ids = ", ".join(consistency.labels_with_multiple_ids[label])
|
|
381
|
+
examples.append(f"{label} -> {ids}")
|
|
382
|
+
logger.warning(
|
|
383
|
+
"CellxGene label/ontology mismatch: %d labels map to multiple CL IDs. "
|
|
384
|
+
"Examples: %s",
|
|
385
|
+
consistency.n_labels_with_multiple_ids,
|
|
386
|
+
"; ".join(examples),
|
|
387
|
+
)
|
|
388
|
+
|
|
389
|
+
if consistency.n_hierarchical_labels > 0:
|
|
390
|
+
logger.warning(
|
|
391
|
+
"CellxGene labels look hierarchical (parent/child in one label): %s",
|
|
392
|
+
", ".join(sorted(consistency.hierarchical_labels)[:5]),
|
|
393
|
+
)
|
|
394
|
+
|
|
358
395
|
# Save if output path provided
|
|
359
396
|
if output_path:
|
|
360
397
|
output_path = Path(output_path)
|
|
@@ -347,9 +347,83 @@ def validate_cell_type_column(
|
|
|
347
347
|
log_fn = logger.error if issue.severity == "error" else logger.warning
|
|
348
348
|
log_fn(f" {issue.code}: {issue.message}")
|
|
349
349
|
|
|
350
|
+
return result
|
|
351
|
+
|
|
350
352
|
return result
|
|
351
353
|
|
|
352
354
|
|
|
355
|
+
@dataclass
class LabelOntologyConsistencyResult:
    """
    Outcome of a label-versus-ontology-ID consistency check.

    Bundles the names of the two compared columns with the labels that
    were flagged and the corresponding counts.
    """

    # Names of the two obs columns that were compared.
    label_column: str
    ontology_column: str
    # Number of distinct labels that were examined.
    n_labels: int
    # Labels paired with more than one ontology ID: the count, then the
    # label -> list-of-IDs mapping.
    n_labels_with_multiple_ids: int
    labels_with_multiple_ids: Dict[str, List[str]]
    # Labels flagged as hierarchical: the count, then the labels themselves.
    n_hierarchical_labels: int
    hierarchical_labels: List[str]
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
# Separators treated as a sign that one label string encodes several levels:
# a whitespace-delimited ">" or "->", or a bare ";" or "|" anywhere in the label.
_HIERARCHY_PATTERN = re.compile(r"(?:\s>\s|\s->\s|;|\|)")
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def check_label_ontology_consistency(
    adata: ad.AnnData,
    label_column: str,
    ontology_column: str,
    detect_hierarchy: bool = True,
) -> LabelOntologyConsistencyResult:
    """
    Report labels whose ontology mapping is ambiguous.

    A label is flagged when it co-occurs with more than one distinct
    ontology ID starting with ``"CL:"``. Such labels can collapse onto a
    single ID when IDs are later inferred from the label text alone.

    Parameters
    ----------
    adata : AnnData
        Object whose ``obs`` table holds both columns.
    label_column : str
        Name of the ``obs`` column with the human-readable labels.
    ontology_column : str
        Name of the ``obs`` column with the ontology IDs.
    detect_hierarchy : bool, default True
        If True, also collect labels that contain one of the separators
        matched by ``_HIERARCHY_PATTERN``.

    Returns
    -------
    LabelOntologyConsistencyResult
        Counts and details of the flagged labels. ``n_labels`` counts the
        distinct non-null labels.

    Raises
    ------
    ValueError
        If ``label_column`` or ``ontology_column`` is not in ``adata.obs``.
    """
    obs = adata.obs

    if label_column not in obs.columns:
        raise ValueError(
            f"Label column '{label_column}' not found in adata.obs. "
            f"Available columns: {list(obs.columns)}"
        )
    if ontology_column not in obs.columns:
        raise ValueError(
            f"Ontology column '{ontology_column}' not found in adata.obs. "
            f"Available columns: {list(obs.columns)}"
        )

    label_values = obs[label_column].dropna().astype(str)

    # Distinct (label, ID) pairs; rows missing either value are dropped
    # before the pairs are compared as strings.
    id_pairs = (
        obs[[label_column, ontology_column]]
        .dropna()
        .drop_duplicates()
        .astype(str)
    )
    # Only IDs carrying the "CL:" prefix take part in the check.
    cl_pairs = id_pairs[id_pairs[ontology_column].str.startswith("CL:")]

    ambiguous: Dict[str, List[str]] = {}
    if not cl_pairs.empty:
        ids_by_label = cl_pairs.groupby(label_column)[ontology_column].unique()
        for label, ids in ids_by_label.items():
            ordered_ids = sorted(set(ids))
            if len(ordered_ids) < 2:
                continue
            ambiguous[str(label)] = ordered_ids

    flagged: List[str] = []
    if detect_hierarchy:
        flagged = [
            str(candidate)
            for candidate in label_values.unique()
            if _HIERARCHY_PATTERN.search(str(candidate))
        ]

    return LabelOntologyConsistencyResult(
        label_column=label_column,
        ontology_column=ontology_column,
        n_labels=int(label_values.nunique()),
        n_labels_with_multiple_ids=len(ambiguous),
        labels_with_multiple_ids=ambiguous,
        n_hierarchical_labels=len(flagged),
        hierarchical_labels=flagged,
    )
|
|
425
|
+
|
|
426
|
+
|
|
353
427
|
def validate_multiple_columns(
|
|
354
428
|
adatas: List[ad.AnnData],
|
|
355
429
|
columns: List[str],
|
|
@@ -251,6 +251,36 @@ def _convert_ensembl_to_hugo(
|
|
|
251
251
|
return np.array(converted), stats
|
|
252
252
|
|
|
253
253
|
|
|
254
|
+
def _normalize_var_names(
    var_names: pd.Index,
    var_df: pd.DataFrame,
    ensembl_to_hugo: Dict[str, str],
) -> Tuple[np.ndarray, Dict[str, int], bool, bool]:
    """
    Normalize var_names using feature_name and Ensembl -> HUGO mapping.

    Parameters
    ----------
    var_names : pd.Index
        Current gene identifiers.
    var_df : pd.DataFrame
        The matching ``var`` table; its ``feature_name`` column, when
        present, replaces ``var_names`` as the starting point if the
        identifiers do not look like symbols.
    ensembl_to_hugo : Dict[str, str]
        Mapping handed to ``_convert_ensembl_to_hugo``.

    Returns
    -------
    Tuple[np.ndarray, Dict[str, int], bool, bool]
        ``(converted_names, stats, uses_non_symbol_ids, used_feature_name)``.
        The first two items are whatever ``_convert_ensembl_to_hugo``
        returns; the flags tell whether non-symbol IDs were detected and
        whether ``feature_name`` was used as the base names.

    Notes
    -----
    The ID style is inferred from the first entry of ``var_names`` only.
    An empty index is treated as not using non-symbol IDs instead of
    raising ``IndexError``; the empty array is still passed through to
    ``_convert_ensembl_to_hugo``.
    """
    # Guard the empty index: there is no first entry to inspect.
    first_gene = str(var_names[0]) if len(var_names) > 0 else ""
    uses_non_symbol_ids = (
        first_gene.isdigit()
        or first_gene.startswith(("ENSG", "ENST"))
    )

    base_names = var_names.values
    used_feature_name = False
    if uses_non_symbol_ids and "feature_name" in var_df.columns:
        base_names = var_df["feature_name"].values.astype(str)
        used_feature_name = True

    converted_names, stats = _convert_ensembl_to_hugo(
        np.asarray(base_names), ensembl_to_hugo
    )
    return converted_names, stats, uses_non_symbol_ids, used_feature_name
|
|
282
|
+
|
|
283
|
+
|
|
254
284
|
def normalize_gene_names(
|
|
255
285
|
adata: ad.AnnData,
|
|
256
286
|
ensembl_to_hugo: Optional[Dict[str, str]] = None,
|
|
@@ -278,6 +308,7 @@ def normalize_gene_names(
|
|
|
278
308
|
-------
|
|
279
309
|
AnnData
|
|
280
310
|
AnnData with normalized gene names in var_names.
|
|
311
|
+
If adata.raw is present, its var_names are updated to stay aligned.
|
|
281
312
|
|
|
282
313
|
Notes
|
|
283
314
|
-----
|
|
@@ -297,22 +328,15 @@ def normalize_gene_names(
|
|
|
297
328
|
if copy:
|
|
298
329
|
adata = adata.copy()
|
|
299
330
|
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
331
|
+
if ensembl_to_hugo is None:
|
|
332
|
+
ensembl_to_hugo = load_ensembl_to_hugo_mapping()
|
|
333
|
+
|
|
334
|
+
converted_names, stats, uses_non_symbol_ids, used_feature_name = _normalize_var_names(
|
|
335
|
+
adata.var_names, adata.var, ensembl_to_hugo
|
|
305
336
|
)
|
|
306
337
|
|
|
307
338
|
if not uses_non_symbol_ids:
|
|
308
339
|
logger.info("Gene names already appear to be HUGO symbols")
|
|
309
|
-
# Still check for any remaining Ensembl IDs and convert them
|
|
310
|
-
if ensembl_to_hugo is None:
|
|
311
|
-
ensembl_to_hugo = load_ensembl_to_hugo_mapping()
|
|
312
|
-
|
|
313
|
-
converted_names, stats = _convert_ensembl_to_hugo(
|
|
314
|
-
adata.var_names.values, ensembl_to_hugo
|
|
315
|
-
)
|
|
316
340
|
if stats["converted_ensembl"] > 0:
|
|
317
341
|
adata.var_names = pd.Index(converted_names)
|
|
318
342
|
adata.var_names_make_unique()
|
|
@@ -324,38 +348,46 @@ def normalize_gene_names(
|
|
|
324
348
|
f"{stats['unmapped_ensembl']:,} Ensembl IDs not found in mapping; "
|
|
325
349
|
"leaving them unchanged"
|
|
326
350
|
)
|
|
327
|
-
|
|
351
|
+
else:
|
|
352
|
+
if used_feature_name:
|
|
353
|
+
logger.info("Using 'feature_name' column as gene names")
|
|
328
354
|
|
|
329
|
-
|
|
330
|
-
if "feature_name" in adata.var.columns:
|
|
331
|
-
feature_names = adata.var["feature_name"].values.astype(str)
|
|
332
|
-
adata.var_names = pd.Index(feature_names)
|
|
333
|
-
logger.info("Using 'feature_name' column as gene names")
|
|
355
|
+
adata.var_names = pd.Index(converted_names)
|
|
334
356
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
357
|
+
if stats["converted_ensembl"] > 0 or stats["unmapped_ensembl"] > 0:
|
|
358
|
+
logger.info(
|
|
359
|
+
f"Gene mapping: {stats['converted_ensembl']:,} converted, "
|
|
360
|
+
f"{stats['already_hugo']:,} already HUGO, "
|
|
361
|
+
f"{stats['unmapped_ensembl']:,} unmapped"
|
|
362
|
+
)
|
|
363
|
+
if stats["unmapped_ensembl"] > 0:
|
|
364
|
+
logger.warning(
|
|
365
|
+
f"{stats['unmapped_ensembl']:,} Ensembl IDs not found in mapping; "
|
|
366
|
+
"leaving them unchanged"
|
|
367
|
+
)
|
|
368
|
+
else:
|
|
369
|
+
logger.info(f"All {stats['already_hugo']:,} genes already HUGO symbols")
|
|
338
370
|
|
|
339
|
-
|
|
340
|
-
adata.var_names.values, ensembl_to_hugo
|
|
341
|
-
)
|
|
342
|
-
adata.var_names = pd.Index(converted_names)
|
|
371
|
+
adata.var_names_make_unique()
|
|
343
372
|
|
|
344
|
-
if
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
f"{stats['already_hugo']:,} already HUGO, "
|
|
348
|
-
f"{stats['unmapped_ensembl']:,} unmapped"
|
|
373
|
+
if adata.raw is not None:
|
|
374
|
+
raw_converted, raw_stats, _, raw_used_feature = _normalize_var_names(
|
|
375
|
+
adata.raw.var_names, adata.raw.var, ensembl_to_hugo
|
|
349
376
|
)
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
377
|
+
raw_converted_index = pd.Index(raw_converted)
|
|
378
|
+
|
|
379
|
+
if raw_used_feature or not raw_converted_index.equals(adata.raw.var_names):
|
|
380
|
+
raw_adata = adata.raw.to_adata()
|
|
381
|
+
raw_adata.var_names = raw_converted_index
|
|
382
|
+
raw_adata.var_names_make_unique()
|
|
383
|
+
adata.raw = raw_adata
|
|
384
|
+
logger.info("Updated adata.raw.var_names to normalized HUGO symbols")
|
|
385
|
+
if raw_stats["unmapped_ensembl"] > 0:
|
|
386
|
+
logger.warning(
|
|
387
|
+
f"{raw_stats['unmapped_ensembl']:,} raw Ensembl IDs not found in mapping; "
|
|
388
|
+
"leaving them unchanged"
|
|
389
|
+
)
|
|
357
390
|
|
|
358
|
-
adata.var_names_make_unique()
|
|
359
391
|
return adata
|
|
360
392
|
|
|
361
393
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv
RENAMED
|
File without changes
|
|
File without changes
|
{spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/data/ontology_mappings/ontology_index.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|