PyPI - spatialcore - Versions diffs - 0.2.0__tar.gz → 0.2.2__tar.gz - Mend

spatialcore 0.2.0 (tar.gz) → 0.2.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

{spatialcore-0.2.0 → spatialcore-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spatialcore
-Version: 0.2.0
+Version: 0.2.2
 Summary: Standardized spatial statistics tools for computational biology
 Author: SpatialCore Contributors
 License-Expression: Apache-2.0

{spatialcore-0.2.0 → spatialcore-0.2.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "spatialcore"
-version = "0.2.0"
+version = "0.2.2"
 description = "Standardized spatial statistics tools for computational biology"
 readme = "README.md"
 license = "Apache-2.0"

{spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/__init__.py RENAMED Viewed

@@ -5,7 +5,7 @@ A thin, robust wrapper around standard libraries to ensure Python and R users
 get the exact same result for the same biological question.
 """
-__version__ = "0.2.0"
+__version__ = "0.2.2"
 # Track which modules are available in this installation
 _available_modules: list[str] = []

{spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/cellxgene.py RENAMED Viewed

@@ -201,6 +201,7 @@ def query_cellxgene_census(
     max_cells: Optional[int] = None,
     output_path: Optional[Union[str, Path]] = None,
     random_state: int = 42,
+    validate_labels: bool = True,
 ) -> ad.AnnData:
     """
     Query cells from CellxGene Census with flexible filters.
@@ -232,6 +233,9 @@ def query_cellxgene_census(
         If provided, save result to this h5ad file.
     random_state : int, default 42
         Random seed for subsampling (only used when max_cells is specified).
+    validate_labels : bool, default True
+        If True, check for label-to-ontology inconsistencies in CellxGene
+        columns (cell_type vs cell_type_ontology_term_id) and log warnings.
     Returns
     -------
@@ -355,6 +359,39 @@ def query_cellxgene_census(
     logger.info(f"  Downloaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes")
+    if validate_labels:
+        if (
+            "cell_type" in adata.obs.columns
+            and "cell_type_ontology_term_id" in adata.obs.columns
+        ):
+            from spatialcore.annotation.validation import (
+                check_label_ontology_consistency,
+            )
+            consistency = check_label_ontology_consistency(
+                adata,
+                label_column="cell_type",
+                ontology_column="cell_type_ontology_term_id",
+            )
+            if consistency.n_labels_with_multiple_ids > 0:
+                examples = []
+                for label in sorted(consistency.labels_with_multiple_ids.keys())[:5]:
+                    ids = ", ".join(consistency.labels_with_multiple_ids[label])
+                    examples.append(f"{label} -> {ids}")
+                logger.warning(
+                    "CellxGene label/ontology mismatch: %d labels map to multiple CL IDs. "
+                    "Examples: %s",
+                    consistency.n_labels_with_multiple_ids,
+                    "; ".join(examples),
+                )
+            if consistency.n_hierarchical_labels > 0:
+                logger.warning(
+                    "CellxGene labels look hierarchical (parent/child in one label): %s",
+                    ", ".join(sorted(consistency.hierarchical_labels)[:5]),
+                )
     # Save if output path provided
     if output_path:
         output_path = Path(output_path)

{spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/training.py RENAMED Viewed

@@ -1243,8 +1243,8 @@ def subsample_balanced(
     max_cells_per_type : int, default 5000
         Maximum cells per cell type in output.
     min_cells_per_type : int, default 50
-        Cell types with fewer cells than this are kept entirely
-        (no subsampling applied).
+        Minimum cells required to keep a cell type. Types with fewer
+        cells are removed before balancing.
     source_column : str, optional, default "reference_source"
         Column identifying which reference each cell came from.
         Set to None to disable source-aware balancing (simple capping).
@@ -1387,8 +1387,41 @@ def subsample_balanced(
     else:
         cell_types = adata.obs[label_column].astype(str)
-    unique_types = cell_types.unique()
     type_counts = cell_types.value_counts()
+    if min_cells_per_type > 0:
+        low_count_types = type_counts[type_counts < min_cells_per_type].index.tolist()
+        if low_count_types:
+            n_removed = int(type_counts[type_counts < min_cells_per_type].sum())
+            logger.info(
+                f"\nFiltering low-count cell types (<{min_cells_per_type} cells) before balancing:"
+            )
+            logger.info(f"  Removing {len(low_count_types)} types, {n_removed:,} cells")
+            for ct in low_count_types[:10]:
+                logger.info(f"    {ct}: {type_counts[ct]} cells")
+            if len(low_count_types) > 10:
+                logger.info(f"    ... and {len(low_count_types) - 10} more types")
+            keep_mask = ~cell_types.isin(low_count_types)
+            adata = adata[keep_mask].copy()
+            if group_by_column is not None:
+                cell_types = adata.obs[group_by_column].astype(str)
+            else:
+                cell_types = adata.obs[label_column].astype(str)
+            if props:
+                dropped = sorted(set(props) & set(low_count_types))
+                if dropped:
+                    for ct in dropped:
+                        props.pop(ct, None)
+                    logger.warning(
+                        "Dropping target_proportions for low-count types: %s",
+                        ", ".join(dropped),
+                    )
+            type_counts = cell_types.value_counts()
+    unique_types = cell_types.unique()
     target_totals = _resolve_target_totals(
         type_counts=type_counts,
         min_cells_per_type=min_cells_per_type,

{spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/annotation/validation.py RENAMED Viewed

@@ -347,7 +347,79 @@ def validate_cell_type_column(
             log_fn = logger.error if issue.severity == "error" else logger.warning
             log_fn(f"  {issue.code}: {issue.message}")
-    return result
+        return result
+@dataclass
+class LabelOntologyConsistencyResult:
+    """Result of checking label to ontology ID consistency."""
+    label_column: str
+    ontology_column: str
+    n_labels: int
+    n_labels_with_multiple_ids: int
+    labels_with_multiple_ids: Dict[str, List[str]]
+    n_hierarchical_labels: int
+    hierarchical_labels: List[str]
+_HIERARCHY_PATTERN = re.compile(r"(?:\s>\s|\s->\s|;|\|)")
+def check_label_ontology_consistency(
+    adata: ad.AnnData,
+    label_column: str,
+    ontology_column: str,
+    detect_hierarchy: bool = True,
+) -> LabelOntologyConsistencyResult:
+    """
+    Check whether each label maps to a single ontology ID.
+    Flags labels that map to multiple valid CL IDs, which can cause label
+    collapsing when IDs are inferred from labels.
+    """
+    if label_column not in adata.obs.columns:
+        raise ValueError(
+            f"Label column '{label_column}' not found in adata.obs. "
+            f"Available columns: {list(adata.obs.columns)}"
+        )
+    if ontology_column not in adata.obs.columns:
+        raise ValueError(
+            f"Ontology column '{ontology_column}' not found in adata.obs. "
+            f"Available columns: {list(adata.obs.columns)}"
+        )
+    labels = adata.obs[label_column].dropna().astype(str)
+    n_labels = int(labels.nunique())
+    pairs = adata.obs[[label_column, ontology_column]].dropna()
+    pairs = pairs.drop_duplicates().astype(str)
+    valid_mask = pairs[ontology_column].str.startswith("CL:")
+    unique_pairs = pairs.loc[valid_mask, [label_column, ontology_column]]
+    labels_with_multiple_ids: Dict[str, List[str]] = {}
+    if not unique_pairs.empty:
+        grouped = unique_pairs.groupby(label_column)[ontology_column].unique()
+        for label, ids in grouped.items():
+            unique_ids = sorted(set(ids))
+            if len(unique_ids) > 1:
+                labels_with_multiple_ids[str(label)] = unique_ids
+    hierarchical_labels: List[str] = []
+    if detect_hierarchy:
+        for label in labels.unique():
+            if _HIERARCHY_PATTERN.search(str(label)):
+                hierarchical_labels.append(str(label))
+    return LabelOntologyConsistencyResult(
+        label_column=label_column,
+        ontology_column=ontology_column,
+        n_labels=n_labels,
+        n_labels_with_multiple_ids=len(labels_with_multiple_ids),
+        labels_with_multiple_ids=labels_with_multiple_ids,
+        n_hierarchical_labels=len(hierarchical_labels),
+        hierarchical_labels=hierarchical_labels,
+    )
 def validate_multiple_columns(

{spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore/core/utils.py RENAMED Viewed

@@ -251,6 +251,36 @@ def _convert_ensembl_to_hugo(
     return np.array(converted), stats
+def _normalize_var_names(
+    var_names: pd.Index,
+    var_df: pd.DataFrame,
+    ensembl_to_hugo: Dict[str, str],
+) -> Tuple[np.ndarray, Dict[str, int], bool, bool]:
+    """
+    Normalize var_names using feature_name and Ensembl -> HUGO mapping.
+    Returns converted names, conversion stats, and flags indicating
+    whether non-symbol IDs were detected and feature_name was used.
+    """
+    first_gene = str(var_names[0])
+    uses_non_symbol_ids = (
+        first_gene.isdigit() or
+        first_gene.startswith("ENSG") or
+        first_gene.startswith("ENST")
+    )
+    base_names = var_names.values
+    used_feature_name = False
+    if uses_non_symbol_ids and "feature_name" in var_df.columns:
+        base_names = var_df["feature_name"].values.astype(str)
+        used_feature_name = True
+    converted_names, stats = _convert_ensembl_to_hugo(
+        np.asarray(base_names), ensembl_to_hugo
+    )
+    return converted_names, stats, uses_non_symbol_ids, used_feature_name
 def normalize_gene_names(
     adata: ad.AnnData,
     ensembl_to_hugo: Optional[Dict[str, str]] = None,
@@ -278,6 +308,7 @@ def normalize_gene_names(
     -------
     AnnData
         AnnData with normalized gene names in var_names.
+        If adata.raw is present, its var_names are updated to stay aligned.
     Notes
     -----
@@ -297,22 +328,15 @@ def normalize_gene_names(
     if copy:
         adata = adata.copy()
-    first_gene = str(adata.var_names[0])
-    uses_non_symbol_ids = (
-        first_gene.isdigit() or
-        first_gene.startswith("ENSG") or
-        first_gene.startswith("ENST")
+    if ensembl_to_hugo is None:
+        ensembl_to_hugo = load_ensembl_to_hugo_mapping()
+    converted_names, stats, uses_non_symbol_ids, used_feature_name = _normalize_var_names(
+        adata.var_names, adata.var, ensembl_to_hugo
     )
     if not uses_non_symbol_ids:
         logger.info("Gene names already appear to be HUGO symbols")
-        # Still check for any remaining Ensembl IDs and convert them
-        if ensembl_to_hugo is None:
-            ensembl_to_hugo = load_ensembl_to_hugo_mapping()
-        converted_names, stats = _convert_ensembl_to_hugo(
-            adata.var_names.values, ensembl_to_hugo
-        )
         if stats["converted_ensembl"] > 0:
             adata.var_names = pd.Index(converted_names)
             adata.var_names_make_unique()
@@ -324,38 +348,46 @@ def normalize_gene_names(
                 f"{stats['unmapped_ensembl']:,} Ensembl IDs not found in mapping; "
                 "leaving them unchanged"
             )
-        return adata
+    else:
+        if used_feature_name:
+            logger.info("Using 'feature_name' column as gene names")
-    # Step 1: Use feature_name column if available
-    if "feature_name" in adata.var.columns:
-        feature_names = adata.var["feature_name"].values.astype(str)
-        adata.var_names = pd.Index(feature_names)
-        logger.info("Using 'feature_name' column as gene names")
+        adata.var_names = pd.Index(converted_names)
-    # Step 2: Apply Ensembl to HUGO mapping for any remaining Ensembl IDs
-    if ensembl_to_hugo is None:
-        ensembl_to_hugo = load_ensembl_to_hugo_mapping()
+        if stats["converted_ensembl"] > 0 or stats["unmapped_ensembl"] > 0:
+            logger.info(
+                f"Gene mapping: {stats['converted_ensembl']:,} converted, "
+                f"{stats['already_hugo']:,} already HUGO, "
+                f"{stats['unmapped_ensembl']:,} unmapped"
+            )
+            if stats["unmapped_ensembl"] > 0:
+                logger.warning(
+                    f"{stats['unmapped_ensembl']:,} Ensembl IDs not found in mapping; "
+                    "leaving them unchanged"
+                )
+        else:
+            logger.info(f"All {stats['already_hugo']:,} genes already HUGO symbols")
-    converted_names, stats = _convert_ensembl_to_hugo(
-        adata.var_names.values, ensembl_to_hugo
-    )
-    adata.var_names = pd.Index(converted_names)
+        adata.var_names_make_unique()
-    if stats["converted_ensembl"] > 0 or stats["unmapped_ensembl"] > 0:
-        logger.info(
-            f"Gene mapping: {stats['converted_ensembl']:,} converted, "
-            f"{stats['already_hugo']:,} already HUGO, "
-            f"{stats['unmapped_ensembl']:,} unmapped"
+    if adata.raw is not None:
+        raw_converted, raw_stats, _, raw_used_feature = _normalize_var_names(
+            adata.raw.var_names, adata.raw.var, ensembl_to_hugo
         )
-        if stats["unmapped_ensembl"] > 0:
-            logger.warning(
-                f"{stats['unmapped_ensembl']:,} Ensembl IDs not found in mapping; "
-                "leaving them unchanged"
-            )
-    else:
-        logger.info(f"All {stats['already_hugo']:,} genes already HUGO symbols")
+        raw_converted_index = pd.Index(raw_converted)
+        if raw_used_feature or not raw_converted_index.equals(adata.raw.var_names):
+            raw_adata = adata.raw.to_adata()
+            raw_adata.var_names = raw_converted_index
+            raw_adata.var_names_make_unique()
+            adata.raw = raw_adata
+            logger.info("Updated adata.raw.var_names to normalized HUGO symbols")
+            if raw_stats["unmapped_ensembl"] > 0:
+                logger.warning(
+                    f"{raw_stats['unmapped_ensembl']:,} raw Ensembl IDs not found in mapping; "
+                    "leaving them unchanged"
+                )
-    adata.var_names_make_unique()
     return adata

{spatialcore-0.2.0 → spatialcore-0.2.2}/src/spatialcore.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spatialcore
-Version: 0.2.0
+Version: 0.2.2
 Summary: Standardized spatial statistics tools for computational biology
 Author: SpatialCore Contributors
 License-Expression: Apache-2.0