spatialcore 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {spatialcore-0.2.0 → spatialcore-0.2.2}/PKG-INFO +1 -1
  2. {spatialcore-0.2.0 → spatialcore-0.2.2}/pyproject.toml +1 -1
  3. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/__init__.py +1 -1
  4. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/cellxgene.py +37 -0
  5. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/training.py +36 -3
  6. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/validation.py +73 -1
  7. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/core/utils.py +70 -38
  8. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore.egg-info/PKG-INFO +1 -1
  9. {spatialcore-0.2.0 → spatialcore-0.2.2}/LICENSE +0 -0
  10. {spatialcore-0.2.0 → spatialcore-0.2.2}/README.md +0 -0
  11. {spatialcore-0.2.0 → spatialcore-0.2.2}/setup.cfg +0 -0
  12. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/__init__.py +0 -0
  13. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/acquisition.py +0 -0
  14. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/annotate.py +0 -0
  15. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/confidence.py +0 -0
  16. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/discovery.py +0 -0
  17. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/expression.py +0 -0
  18. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/loading.py +0 -0
  19. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/markers.py +0 -0
  20. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/ontology.py +0 -0
  21. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/patterns.py +0 -0
  22. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/pipeline.py +0 -0
  23. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/synapse.py +0 -0
  24. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/core/__init__.py +0 -0
  25. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/core/cache.py +0 -0
  26. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/core/logging.py +0 -0
  27. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/core/metadata.py +0 -0
  28. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +0 -0
  29. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/data/markers/canonical_markers.json +0 -0
  30. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/data/ontology_mappings/ontology_index.json +0 -0
  31. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/plotting/__init__.py +0 -0
  32. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/plotting/benchmark.py +0 -0
  33. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/plotting/celltype.py +0 -0
  34. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/plotting/confidence.py +0 -0
  35. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/plotting/spatial.py +0 -0
  36. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/plotting/utils.py +0 -0
  37. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/plotting/validation.py +0 -0
  38. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/r_bridge/__init__.py +0 -0
  39. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/r_bridge/subprocess_runner.py +0 -0
  40. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/spatial/__init__.py +0 -0
  41. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/spatial/autocorrelation.py +0 -0
  42. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/spatial/distance.py +0 -0
  43. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/spatial/domains.py +0 -0
  44. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/spatial/neighborhoods.py +0 -0
  45. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/stats/__init__.py +0 -0
  46. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/stats/_thresholding.py +0 -0
  47. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/stats/classify.py +0 -0
  48. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore.egg-info/SOURCES.txt +0 -0
  49. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore.egg-info/dependency_links.txt +0 -0
  50. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore.egg-info/requires.txt +0 -0
  51. {spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: spatialcore
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Standardized spatial statistics tools for computational biology
5
5
  Author: SpatialCore Contributors
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "spatialcore"
7
- version = "0.2.0"
7
+ version = "0.2.2"
8
8
  description = "Standardized spatial statistics tools for computational biology"
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -5,7 +5,7 @@ A thin, robust wrapper around standard libraries to ensure Python and R users
5
5
  get the exact same result for the same biological question.
6
6
  """
7
7
 
8
- __version__ = "0.2.0"
8
+ __version__ = "0.2.2"
9
9
 
10
10
  # Track which modules are available in this installation
11
11
  _available_modules: list[str] = []
@@ -201,6 +201,7 @@ def query_cellxgene_census(
201
201
  max_cells: Optional[int] = None,
202
202
  output_path: Optional[Union[str, Path]] = None,
203
203
  random_state: int = 42,
204
+ validate_labels: bool = True,
204
205
  ) -> ad.AnnData:
205
206
  """
206
207
  Query cells from CellxGene Census with flexible filters.
@@ -232,6 +233,9 @@ def query_cellxgene_census(
232
233
  If provided, save result to this h5ad file.
233
234
  random_state : int, default 42
234
235
  Random seed for subsampling (only used when max_cells is specified).
236
+ validate_labels : bool, default True
237
+ If True, check for label-to-ontology inconsistencies in CellxGene
238
+ columns (cell_type vs cell_type_ontology_term_id) and log warnings.
235
239
 
236
240
  Returns
237
241
  -------
@@ -355,6 +359,39 @@ def query_cellxgene_census(
355
359
 
356
360
  logger.info(f" Downloaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes")
357
361
 
362
+ if validate_labels:
363
+ if (
364
+ "cell_type" in adata.obs.columns
365
+ and "cell_type_ontology_term_id" in adata.obs.columns
366
+ ):
367
+ from spatialcore.annotation.validation import (
368
+ check_label_ontology_consistency,
369
+ )
370
+
371
+ consistency = check_label_ontology_consistency(
372
+ adata,
373
+ label_column="cell_type",
374
+ ontology_column="cell_type_ontology_term_id",
375
+ )
376
+
377
+ if consistency.n_labels_with_multiple_ids > 0:
378
+ examples = []
379
+ for label in sorted(consistency.labels_with_multiple_ids.keys())[:5]:
380
+ ids = ", ".join(consistency.labels_with_multiple_ids[label])
381
+ examples.append(f"{label} -> {ids}")
382
+ logger.warning(
383
+ "CellxGene label/ontology mismatch: %d labels map to multiple CL IDs. "
384
+ "Examples: %s",
385
+ consistency.n_labels_with_multiple_ids,
386
+ "; ".join(examples),
387
+ )
388
+
389
+ if consistency.n_hierarchical_labels > 0:
390
+ logger.warning(
391
+ "CellxGene labels look hierarchical (parent/child in one label): %s",
392
+ ", ".join(sorted(consistency.hierarchical_labels)[:5]),
393
+ )
394
+
358
395
  # Save if output path provided
359
396
  if output_path:
360
397
  output_path = Path(output_path)
@@ -1243,8 +1243,8 @@ def subsample_balanced(
1243
1243
  max_cells_per_type : int, default 5000
1244
1244
  Maximum cells per cell type in output.
1245
1245
  min_cells_per_type : int, default 50
1246
- Cell types with fewer cells than this are kept entirely
1247
- (no subsampling applied).
1246
+ Minimum cells required to keep a cell type. Types with fewer
1247
+ cells are removed before balancing.
1248
1248
  source_column : str, optional, default "reference_source"
1249
1249
  Column identifying which reference each cell came from.
1250
1250
  Set to None to disable source-aware balancing (simple capping).
@@ -1387,8 +1387,41 @@ def subsample_balanced(
1387
1387
  else:
1388
1388
  cell_types = adata.obs[label_column].astype(str)
1389
1389
 
1390
- unique_types = cell_types.unique()
1391
1390
  type_counts = cell_types.value_counts()
1391
+ if min_cells_per_type > 0:
1392
+ low_count_types = type_counts[type_counts < min_cells_per_type].index.tolist()
1393
+ if low_count_types:
1394
+ n_removed = int(type_counts[type_counts < min_cells_per_type].sum())
1395
+ logger.info(
1396
+ f"\nFiltering low-count cell types (<{min_cells_per_type} cells) before balancing:"
1397
+ )
1398
+ logger.info(f" Removing {len(low_count_types)} types, {n_removed:,} cells")
1399
+ for ct in low_count_types[:10]:
1400
+ logger.info(f" {ct}: {type_counts[ct]} cells")
1401
+ if len(low_count_types) > 10:
1402
+ logger.info(f" ... and {len(low_count_types) - 10} more types")
1403
+
1404
+ keep_mask = ~cell_types.isin(low_count_types)
1405
+ adata = adata[keep_mask].copy()
1406
+
1407
+ if group_by_column is not None:
1408
+ cell_types = adata.obs[group_by_column].astype(str)
1409
+ else:
1410
+ cell_types = adata.obs[label_column].astype(str)
1411
+
1412
+ if props:
1413
+ dropped = sorted(set(props) & set(low_count_types))
1414
+ if dropped:
1415
+ for ct in dropped:
1416
+ props.pop(ct, None)
1417
+ logger.warning(
1418
+ "Dropping target_proportions for low-count types: %s",
1419
+ ", ".join(dropped),
1420
+ )
1421
+
1422
+ type_counts = cell_types.value_counts()
1423
+
1424
+ unique_types = cell_types.unique()
1392
1425
  target_totals = _resolve_target_totals(
1393
1426
  type_counts=type_counts,
1394
1427
  min_cells_per_type=min_cells_per_type,
@@ -347,7 +347,79 @@ def validate_cell_type_column(
347
347
  log_fn = logger.error if issue.severity == "error" else logger.warning
348
348
  log_fn(f" {issue.code}: {issue.message}")
349
349
 
350
- return result
350
+ return result
351
+
352
+
353
+ @dataclass
354
+ class LabelOntologyConsistencyResult:
355
+ """Result of checking label to ontology ID consistency."""
356
+
357
+ label_column: str
358
+ ontology_column: str
359
+ n_labels: int
360
+ n_labels_with_multiple_ids: int
361
+ labels_with_multiple_ids: Dict[str, List[str]]
362
+ n_hierarchical_labels: int
363
+ hierarchical_labels: List[str]
364
+
365
+
366
+ _HIERARCHY_PATTERN = re.compile(r"(?:\s>\s|\s->\s|;|\|)")
367
+
368
+
369
+ def check_label_ontology_consistency(
370
+ adata: ad.AnnData,
371
+ label_column: str,
372
+ ontology_column: str,
373
+ detect_hierarchy: bool = True,
374
+ ) -> LabelOntologyConsistencyResult:
375
+ """
376
+ Check whether each label maps to a single ontology ID.
377
+
378
+ Flags labels that map to multiple valid CL IDs, which can cause label
379
+ collapsing when IDs are inferred from labels.
380
+ """
381
+ if label_column not in adata.obs.columns:
382
+ raise ValueError(
383
+ f"Label column '{label_column}' not found in adata.obs. "
384
+ f"Available columns: {list(adata.obs.columns)}"
385
+ )
386
+ if ontology_column not in adata.obs.columns:
387
+ raise ValueError(
388
+ f"Ontology column '{ontology_column}' not found in adata.obs. "
389
+ f"Available columns: {list(adata.obs.columns)}"
390
+ )
391
+
392
+ labels = adata.obs[label_column].dropna().astype(str)
393
+ n_labels = int(labels.nunique())
394
+
395
+ pairs = adata.obs[[label_column, ontology_column]].dropna()
396
+ pairs = pairs.drop_duplicates().astype(str)
397
+ valid_mask = pairs[ontology_column].str.startswith("CL:")
398
+ unique_pairs = pairs.loc[valid_mask, [label_column, ontology_column]]
399
+
400
+ labels_with_multiple_ids: Dict[str, List[str]] = {}
401
+ if not unique_pairs.empty:
402
+ grouped = unique_pairs.groupby(label_column)[ontology_column].unique()
403
+ for label, ids in grouped.items():
404
+ unique_ids = sorted(set(ids))
405
+ if len(unique_ids) > 1:
406
+ labels_with_multiple_ids[str(label)] = unique_ids
407
+
408
+ hierarchical_labels: List[str] = []
409
+ if detect_hierarchy:
410
+ for label in labels.unique():
411
+ if _HIERARCHY_PATTERN.search(str(label)):
412
+ hierarchical_labels.append(str(label))
413
+
414
+ return LabelOntologyConsistencyResult(
415
+ label_column=label_column,
416
+ ontology_column=ontology_column,
417
+ n_labels=n_labels,
418
+ n_labels_with_multiple_ids=len(labels_with_multiple_ids),
419
+ labels_with_multiple_ids=labels_with_multiple_ids,
420
+ n_hierarchical_labels=len(hierarchical_labels),
421
+ hierarchical_labels=hierarchical_labels,
422
+ )
351
423
 
352
424
 
353
425
  def validate_multiple_columns(
@@ -251,6 +251,36 @@ def _convert_ensembl_to_hugo(
251
251
  return np.array(converted), stats
252
252
 
253
253
 
254
+ def _normalize_var_names(
255
+ var_names: pd.Index,
256
+ var_df: pd.DataFrame,
257
+ ensembl_to_hugo: Dict[str, str],
258
+ ) -> Tuple[np.ndarray, Dict[str, int], bool, bool]:
259
+ """
260
+ Normalize var_names using feature_name and Ensembl -> HUGO mapping.
261
+
262
+ Returns converted names, conversion stats, and flags indicating
263
+ whether non-symbol IDs were detected and feature_name was used.
264
+ """
265
+ first_gene = str(var_names[0])
266
+ uses_non_symbol_ids = (
267
+ first_gene.isdigit() or
268
+ first_gene.startswith("ENSG") or
269
+ first_gene.startswith("ENST")
270
+ )
271
+
272
+ base_names = var_names.values
273
+ used_feature_name = False
274
+ if uses_non_symbol_ids and "feature_name" in var_df.columns:
275
+ base_names = var_df["feature_name"].values.astype(str)
276
+ used_feature_name = True
277
+
278
+ converted_names, stats = _convert_ensembl_to_hugo(
279
+ np.asarray(base_names), ensembl_to_hugo
280
+ )
281
+ return converted_names, stats, uses_non_symbol_ids, used_feature_name
282
+
283
+
254
284
  def normalize_gene_names(
255
285
  adata: ad.AnnData,
256
286
  ensembl_to_hugo: Optional[Dict[str, str]] = None,
@@ -278,6 +308,7 @@ def normalize_gene_names(
278
308
  -------
279
309
  AnnData
280
310
  AnnData with normalized gene names in var_names.
311
+ If adata.raw is present, its var_names are updated to stay aligned.
281
312
 
282
313
  Notes
283
314
  -----
@@ -297,22 +328,15 @@ def normalize_gene_names(
297
328
  if copy:
298
329
  adata = adata.copy()
299
330
 
300
- first_gene = str(adata.var_names[0])
301
- uses_non_symbol_ids = (
302
- first_gene.isdigit() or
303
- first_gene.startswith("ENSG") or
304
- first_gene.startswith("ENST")
331
+ if ensembl_to_hugo is None:
332
+ ensembl_to_hugo = load_ensembl_to_hugo_mapping()
333
+
334
+ converted_names, stats, uses_non_symbol_ids, used_feature_name = _normalize_var_names(
335
+ adata.var_names, adata.var, ensembl_to_hugo
305
336
  )
306
337
 
307
338
  if not uses_non_symbol_ids:
308
339
  logger.info("Gene names already appear to be HUGO symbols")
309
- # Still check for any remaining Ensembl IDs and convert them
310
- if ensembl_to_hugo is None:
311
- ensembl_to_hugo = load_ensembl_to_hugo_mapping()
312
-
313
- converted_names, stats = _convert_ensembl_to_hugo(
314
- adata.var_names.values, ensembl_to_hugo
315
- )
316
340
  if stats["converted_ensembl"] > 0:
317
341
  adata.var_names = pd.Index(converted_names)
318
342
  adata.var_names_make_unique()
@@ -324,38 +348,46 @@ def normalize_gene_names(
324
348
  f"{stats['unmapped_ensembl']:,} Ensembl IDs not found in mapping; "
325
349
  "leaving them unchanged"
326
350
  )
327
- return adata
351
+ else:
352
+ if used_feature_name:
353
+ logger.info("Using 'feature_name' column as gene names")
328
354
 
329
- # Step 1: Use feature_name column if available
330
- if "feature_name" in adata.var.columns:
331
- feature_names = adata.var["feature_name"].values.astype(str)
332
- adata.var_names = pd.Index(feature_names)
333
- logger.info("Using 'feature_name' column as gene names")
355
+ adata.var_names = pd.Index(converted_names)
334
356
 
335
- # Step 2: Apply Ensembl to HUGO mapping for any remaining Ensembl IDs
336
- if ensembl_to_hugo is None:
337
- ensembl_to_hugo = load_ensembl_to_hugo_mapping()
357
+ if stats["converted_ensembl"] > 0 or stats["unmapped_ensembl"] > 0:
358
+ logger.info(
359
+ f"Gene mapping: {stats['converted_ensembl']:,} converted, "
360
+ f"{stats['already_hugo']:,} already HUGO, "
361
+ f"{stats['unmapped_ensembl']:,} unmapped"
362
+ )
363
+ if stats["unmapped_ensembl"] > 0:
364
+ logger.warning(
365
+ f"{stats['unmapped_ensembl']:,} Ensembl IDs not found in mapping; "
366
+ "leaving them unchanged"
367
+ )
368
+ else:
369
+ logger.info(f"All {stats['already_hugo']:,} genes already HUGO symbols")
338
370
 
339
- converted_names, stats = _convert_ensembl_to_hugo(
340
- adata.var_names.values, ensembl_to_hugo
341
- )
342
- adata.var_names = pd.Index(converted_names)
371
+ adata.var_names_make_unique()
343
372
 
344
- if stats["converted_ensembl"] > 0 or stats["unmapped_ensembl"] > 0:
345
- logger.info(
346
- f"Gene mapping: {stats['converted_ensembl']:,} converted, "
347
- f"{stats['already_hugo']:,} already HUGO, "
348
- f"{stats['unmapped_ensembl']:,} unmapped"
373
+ if adata.raw is not None:
374
+ raw_converted, raw_stats, _, raw_used_feature = _normalize_var_names(
375
+ adata.raw.var_names, adata.raw.var, ensembl_to_hugo
349
376
  )
350
- if stats["unmapped_ensembl"] > 0:
351
- logger.warning(
352
- f"{stats['unmapped_ensembl']:,} Ensembl IDs not found in mapping; "
353
- "leaving them unchanged"
354
- )
355
- else:
356
- logger.info(f"All {stats['already_hugo']:,} genes already HUGO symbols")
377
+ raw_converted_index = pd.Index(raw_converted)
378
+
379
+ if raw_used_feature or not raw_converted_index.equals(adata.raw.var_names):
380
+ raw_adata = adata.raw.to_adata()
381
+ raw_adata.var_names = raw_converted_index
382
+ raw_adata.var_names_make_unique()
383
+ adata.raw = raw_adata
384
+ logger.info("Updated adata.raw.var_names to normalized HUGO symbols")
385
+ if raw_stats["unmapped_ensembl"] > 0:
386
+ logger.warning(
387
+ f"{raw_stats['unmapped_ensembl']:,} raw Ensembl IDs not found in mapping; "
388
+ "leaving them unchanged"
389
+ )
357
390
 
358
- adata.var_names_make_unique()
359
391
  return adata
360
392
 
361
393
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: spatialcore
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Standardized spatial statistics tools for computational biology
5
5
  Author: SpatialCore Contributors
6
6
  License-Expression: Apache-2.0
File without changes
File without changes
File without changes