spatialcore 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {spatialcore-0.2.1 → spatialcore-0.2.3}/PKG-INFO +1 -1
  2. {spatialcore-0.2.1 → spatialcore-0.2.3}/pyproject.toml +1 -1
  3. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/__init__.py +1 -1
  4. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/cellxgene.py +37 -0
  5. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/validation.py +74 -0
  6. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/core/utils.py +70 -38
  7. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore.egg-info/PKG-INFO +1 -1
  8. {spatialcore-0.2.1 → spatialcore-0.2.3}/LICENSE +0 -0
  9. {spatialcore-0.2.1 → spatialcore-0.2.3}/README.md +0 -0
  10. {spatialcore-0.2.1 → spatialcore-0.2.3}/setup.cfg +0 -0
  11. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/__init__.py +0 -0
  12. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/acquisition.py +0 -0
  13. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/annotate.py +0 -0
  14. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/confidence.py +0 -0
  15. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/discovery.py +0 -0
  16. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/expression.py +0 -0
  17. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/loading.py +0 -0
  18. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/markers.py +0 -0
  19. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/ontology.py +0 -0
  20. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/patterns.py +0 -0
  21. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/pipeline.py +0 -0
  22. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/synapse.py +0 -0
  23. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/annotation/training.py +0 -0
  24. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/core/__init__.py +0 -0
  25. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/core/cache.py +0 -0
  26. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/core/logging.py +0 -0
  27. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/core/metadata.py +0 -0
  28. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +0 -0
  29. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/data/markers/canonical_markers.json +0 -0
  30. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/data/ontology_mappings/ontology_index.json +0 -0
  31. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/__init__.py +0 -0
  32. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/benchmark.py +0 -0
  33. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/celltype.py +0 -0
  34. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/confidence.py +0 -0
  35. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/spatial.py +0 -0
  36. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/utils.py +0 -0
  37. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/plotting/validation.py +0 -0
  38. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/r_bridge/__init__.py +0 -0
  39. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/r_bridge/subprocess_runner.py +0 -0
  40. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/spatial/__init__.py +0 -0
  41. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/spatial/autocorrelation.py +0 -0
  42. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/spatial/distance.py +0 -0
  43. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/spatial/domains.py +0 -0
  44. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/spatial/neighborhoods.py +0 -0
  45. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/stats/__init__.py +0 -0
  46. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/stats/_thresholding.py +0 -0
  47. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore/stats/classify.py +0 -0
  48. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore.egg-info/SOURCES.txt +0 -0
  49. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore.egg-info/dependency_links.txt +0 -0
  50. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore.egg-info/requires.txt +0 -0
  51. {spatialcore-0.2.1 → spatialcore-0.2.3}/src/spatialcore.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: spatialcore
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: Standardized spatial statistics tools for computational biology
5
5
  Author: SpatialCore Contributors
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "spatialcore"
7
- version = "0.2.1"
7
+ version = "0.2.3"
8
8
  description = "Standardized spatial statistics tools for computational biology"
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -5,7 +5,7 @@ A thin, robust wrapper around standard libraries to ensure Python and R users
5
5
  get the exact same result for the same biological question.
6
6
  """
7
7
 
8
- __version__ = "0.2.1"
8
+ __version__ = "0.2.3"
9
9
 
10
10
  # Track which modules are available in this installation
11
11
  _available_modules: list[str] = []
@@ -201,6 +201,7 @@ def query_cellxgene_census(
201
201
  max_cells: Optional[int] = None,
202
202
  output_path: Optional[Union[str, Path]] = None,
203
203
  random_state: int = 42,
204
+ validate_labels: bool = True,
204
205
  ) -> ad.AnnData:
205
206
  """
206
207
  Query cells from CellxGene Census with flexible filters.
@@ -232,6 +233,9 @@ def query_cellxgene_census(
232
233
  If provided, save result to this h5ad file.
233
234
  random_state : int, default 42
234
235
  Random seed for subsampling (only used when max_cells is specified).
236
+ validate_labels : bool, default True
237
+ If True, check for label-to-ontology inconsistencies in CellxGene
238
+ columns (cell_type vs cell_type_ontology_term_id) and log warnings.
235
239
 
236
240
  Returns
237
241
  -------
@@ -355,6 +359,39 @@ def query_cellxgene_census(
355
359
 
356
360
  logger.info(f" Downloaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes")
357
361
 
362
+ if validate_labels:
363
+ if (
364
+ "cell_type" in adata.obs.columns
365
+ and "cell_type_ontology_term_id" in adata.obs.columns
366
+ ):
367
+ from spatialcore.annotation.validation import (
368
+ check_label_ontology_consistency,
369
+ )
370
+
371
+ consistency = check_label_ontology_consistency(
372
+ adata,
373
+ label_column="cell_type",
374
+ ontology_column="cell_type_ontology_term_id",
375
+ )
376
+
377
+ if consistency.n_labels_with_multiple_ids > 0:
378
+ examples = []
379
+ for label in sorted(consistency.labels_with_multiple_ids.keys())[:5]:
380
+ ids = ", ".join(consistency.labels_with_multiple_ids[label])
381
+ examples.append(f"{label} -> {ids}")
382
+ logger.warning(
383
+ "CellxGene label/ontology mismatch: %d labels map to multiple CL IDs. "
384
+ "Examples: %s",
385
+ consistency.n_labels_with_multiple_ids,
386
+ "; ".join(examples),
387
+ )
388
+
389
+ if consistency.n_hierarchical_labels > 0:
390
+ logger.warning(
391
+ "CellxGene labels look hierarchical (parent/child in one label): %s",
392
+ ", ".join(sorted(consistency.hierarchical_labels)[:5]),
393
+ )
394
+
358
395
  # Save if output path provided
359
396
  if output_path:
360
397
  output_path = Path(output_path)
@@ -347,9 +347,83 @@ def validate_cell_type_column(
347
347
  log_fn = logger.error if issue.severity == "error" else logger.warning
348
348
  log_fn(f" {issue.code}: {issue.message}")
349
349
 
350
+ return result
351
+
350
352
  return result
351
353
 
352
354
 
355
+ @dataclass
356
+ class LabelOntologyConsistencyResult:
357
+ """Result of checking label to ontology ID consistency."""
358
+
359
+ label_column: str
360
+ ontology_column: str
361
+ n_labels: int
362
+ n_labels_with_multiple_ids: int
363
+ labels_with_multiple_ids: Dict[str, List[str]]
364
+ n_hierarchical_labels: int
365
+ hierarchical_labels: List[str]
366
+
367
+
368
+ _HIERARCHY_PATTERN = re.compile(r"(?:\s>\s|\s->\s|;|\|)")
369
+
370
+
371
+ def check_label_ontology_consistency(
372
+ adata: ad.AnnData,
373
+ label_column: str,
374
+ ontology_column: str,
375
+ detect_hierarchy: bool = True,
376
+ ) -> LabelOntologyConsistencyResult:
377
+ """
378
+ Check whether each label maps to a single ontology ID.
379
+
380
+ Flags labels that map to multiple valid CL IDs, which can cause label
381
+ collapsing when IDs are inferred from labels.
382
+ """
383
+ if label_column not in adata.obs.columns:
384
+ raise ValueError(
385
+ f"Label column '{label_column}' not found in adata.obs. "
386
+ f"Available columns: {list(adata.obs.columns)}"
387
+ )
388
+ if ontology_column not in adata.obs.columns:
389
+ raise ValueError(
390
+ f"Ontology column '{ontology_column}' not found in adata.obs. "
391
+ f"Available columns: {list(adata.obs.columns)}"
392
+ )
393
+
394
+ labels = adata.obs[label_column].dropna().astype(str)
395
+ n_labels = int(labels.nunique())
396
+
397
+ pairs = adata.obs[[label_column, ontology_column]].dropna()
398
+ pairs = pairs.drop_duplicates().astype(str)
399
+ valid_mask = pairs[ontology_column].str.startswith("CL:")
400
+ unique_pairs = pairs.loc[valid_mask, [label_column, ontology_column]]
401
+
402
+ labels_with_multiple_ids: Dict[str, List[str]] = {}
403
+ if not unique_pairs.empty:
404
+ grouped = unique_pairs.groupby(label_column)[ontology_column].unique()
405
+ for label, ids in grouped.items():
406
+ unique_ids = sorted(set(ids))
407
+ if len(unique_ids) > 1:
408
+ labels_with_multiple_ids[str(label)] = unique_ids
409
+
410
+ hierarchical_labels: List[str] = []
411
+ if detect_hierarchy:
412
+ for label in labels.unique():
413
+ if _HIERARCHY_PATTERN.search(str(label)):
414
+ hierarchical_labels.append(str(label))
415
+
416
+ return LabelOntologyConsistencyResult(
417
+ label_column=label_column,
418
+ ontology_column=ontology_column,
419
+ n_labels=n_labels,
420
+ n_labels_with_multiple_ids=len(labels_with_multiple_ids),
421
+ labels_with_multiple_ids=labels_with_multiple_ids,
422
+ n_hierarchical_labels=len(hierarchical_labels),
423
+ hierarchical_labels=hierarchical_labels,
424
+ )
425
+
426
+
353
427
  def validate_multiple_columns(
354
428
  adatas: List[ad.AnnData],
355
429
  columns: List[str],
@@ -251,6 +251,36 @@ def _convert_ensembl_to_hugo(
251
251
  return np.array(converted), stats
252
252
 
253
253
 
254
+ def _normalize_var_names(
255
+ var_names: pd.Index,
256
+ var_df: pd.DataFrame,
257
+ ensembl_to_hugo: Dict[str, str],
258
+ ) -> Tuple[np.ndarray, Dict[str, int], bool, bool]:
259
+ """
260
+ Normalize var_names using feature_name and Ensembl -> HUGO mapping.
261
+
262
+ Returns converted names, conversion stats, and flags indicating
263
+ whether non-symbol IDs were detected and feature_name was used.
264
+ """
265
+ first_gene = str(var_names[0])
266
+ uses_non_symbol_ids = (
267
+ first_gene.isdigit() or
268
+ first_gene.startswith("ENSG") or
269
+ first_gene.startswith("ENST")
270
+ )
271
+
272
+ base_names = var_names.values
273
+ used_feature_name = False
274
+ if uses_non_symbol_ids and "feature_name" in var_df.columns:
275
+ base_names = var_df["feature_name"].values.astype(str)
276
+ used_feature_name = True
277
+
278
+ converted_names, stats = _convert_ensembl_to_hugo(
279
+ np.asarray(base_names), ensembl_to_hugo
280
+ )
281
+ return converted_names, stats, uses_non_symbol_ids, used_feature_name
282
+
283
+
254
284
  def normalize_gene_names(
255
285
  adata: ad.AnnData,
256
286
  ensembl_to_hugo: Optional[Dict[str, str]] = None,
@@ -278,6 +308,7 @@ def normalize_gene_names(
278
308
  -------
279
309
  AnnData
280
310
  AnnData with normalized gene names in var_names.
311
+ If adata.raw is present, its var_names are updated to stay aligned.
281
312
 
282
313
  Notes
283
314
  -----
@@ -297,22 +328,15 @@ def normalize_gene_names(
297
328
  if copy:
298
329
  adata = adata.copy()
299
330
 
300
- first_gene = str(adata.var_names[0])
301
- uses_non_symbol_ids = (
302
- first_gene.isdigit() or
303
- first_gene.startswith("ENSG") or
304
- first_gene.startswith("ENST")
331
+ if ensembl_to_hugo is None:
332
+ ensembl_to_hugo = load_ensembl_to_hugo_mapping()
333
+
334
+ converted_names, stats, uses_non_symbol_ids, used_feature_name = _normalize_var_names(
335
+ adata.var_names, adata.var, ensembl_to_hugo
305
336
  )
306
337
 
307
338
  if not uses_non_symbol_ids:
308
339
  logger.info("Gene names already appear to be HUGO symbols")
309
- # Still check for any remaining Ensembl IDs and convert them
310
- if ensembl_to_hugo is None:
311
- ensembl_to_hugo = load_ensembl_to_hugo_mapping()
312
-
313
- converted_names, stats = _convert_ensembl_to_hugo(
314
- adata.var_names.values, ensembl_to_hugo
315
- )
316
340
  if stats["converted_ensembl"] > 0:
317
341
  adata.var_names = pd.Index(converted_names)
318
342
  adata.var_names_make_unique()
@@ -324,38 +348,46 @@ def normalize_gene_names(
324
348
  f"{stats['unmapped_ensembl']:,} Ensembl IDs not found in mapping; "
325
349
  "leaving them unchanged"
326
350
  )
327
- return adata
351
+ else:
352
+ if used_feature_name:
353
+ logger.info("Using 'feature_name' column as gene names")
328
354
 
329
- # Step 1: Use feature_name column if available
330
- if "feature_name" in adata.var.columns:
331
- feature_names = adata.var["feature_name"].values.astype(str)
332
- adata.var_names = pd.Index(feature_names)
333
- logger.info("Using 'feature_name' column as gene names")
355
+ adata.var_names = pd.Index(converted_names)
334
356
 
335
- # Step 2: Apply Ensembl to HUGO mapping for any remaining Ensembl IDs
336
- if ensembl_to_hugo is None:
337
- ensembl_to_hugo = load_ensembl_to_hugo_mapping()
357
+ if stats["converted_ensembl"] > 0 or stats["unmapped_ensembl"] > 0:
358
+ logger.info(
359
+ f"Gene mapping: {stats['converted_ensembl']:,} converted, "
360
+ f"{stats['already_hugo']:,} already HUGO, "
361
+ f"{stats['unmapped_ensembl']:,} unmapped"
362
+ )
363
+ if stats["unmapped_ensembl"] > 0:
364
+ logger.warning(
365
+ f"{stats['unmapped_ensembl']:,} Ensembl IDs not found in mapping; "
366
+ "leaving them unchanged"
367
+ )
368
+ else:
369
+ logger.info(f"All {stats['already_hugo']:,} genes already HUGO symbols")
338
370
 
339
- converted_names, stats = _convert_ensembl_to_hugo(
340
- adata.var_names.values, ensembl_to_hugo
341
- )
342
- adata.var_names = pd.Index(converted_names)
371
+ adata.var_names_make_unique()
343
372
 
344
- if stats["converted_ensembl"] > 0 or stats["unmapped_ensembl"] > 0:
345
- logger.info(
346
- f"Gene mapping: {stats['converted_ensembl']:,} converted, "
347
- f"{stats['already_hugo']:,} already HUGO, "
348
- f"{stats['unmapped_ensembl']:,} unmapped"
373
+ if adata.raw is not None:
374
+ raw_converted, raw_stats, _, raw_used_feature = _normalize_var_names(
375
+ adata.raw.var_names, adata.raw.var, ensembl_to_hugo
349
376
  )
350
- if stats["unmapped_ensembl"] > 0:
351
- logger.warning(
352
- f"{stats['unmapped_ensembl']:,} Ensembl IDs not found in mapping; "
353
- "leaving them unchanged"
354
- )
355
- else:
356
- logger.info(f"All {stats['already_hugo']:,} genes already HUGO symbols")
377
+ raw_converted_index = pd.Index(raw_converted)
378
+
379
+ if raw_used_feature or not raw_converted_index.equals(adata.raw.var_names):
380
+ raw_adata = adata.raw.to_adata()
381
+ raw_adata.var_names = raw_converted_index
382
+ raw_adata.var_names_make_unique()
383
+ adata.raw = raw_adata
384
+ logger.info("Updated adata.raw.var_names to normalized HUGO symbols")
385
+ if raw_stats["unmapped_ensembl"] > 0:
386
+ logger.warning(
387
+ f"{raw_stats['unmapped_ensembl']:,} raw Ensembl IDs not found in mapping; "
388
+ "leaving them unchanged"
389
+ )
357
390
 
358
- adata.var_names_make_unique()
359
391
  return adata
360
392
 
361
393
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: spatialcore
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: Standardized spatial statistics tools for computational biology
5
5
  Author: SpatialCore Contributors
6
6
  License-Expression: Apache-2.0
File without changes
File without changes
File without changes