oncoordinate-0.1.7-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oncoordinate/HallmarkPathGMT/HALLMARK_ADIPOGENESIS.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_ALLOGRAFT_REJECTION.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_ANDROGEN_RESPONSE.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_ANGIOGENESIS.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_APICAL_JUNCTION.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_APICAL_SURFACE.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_APOPTOSIS.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_BILE_ACID_METABOLISM.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_CHOLESTEROL_HOMEOSTASIS.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_COAGULATION.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_COMPLEMENT.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_DNA_REPAIR.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_E2F_TARGETS.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_ESTROGEN_RESPONSE_EARLY.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_ESTROGEN_RESPONSE_LATE.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_FATTY_ACID_METABOLISM.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_G2M_CHECKPOINT.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_GLYCOLYSIS.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_HEDGEHOG_SIGNALING.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_HEME_METABOLISM.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_HYPOXIA.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_IL2_STAT5_SIGNALING.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_IL6_JAK_STAT3_SIGNALING.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_INFLAMMATORY_RESPONSE.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_INTERFERON_ALPHA_RESPONSE.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_INTERFERON_GAMMA_RESPONSE.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_KRAS_SIGNALING_DN.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_KRAS_SIGNALING_UP.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_MITOTIC_SPINDLE.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_MTORC1_SIGNALING.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_MYC_TARGETS_V1.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_MYC_TARGETS_V2.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_MYOGENESIS.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_NOTCH_SIGNALING.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_OXIDATIVE_PHOSPHORYLATION.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_P53_PATHWAY.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_PANCREAS_BETA_CELLS.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_PEROXISOME.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_PI3K_AKT_MTOR_SIGNALING.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_PROTEIN_SECRETION.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_SPERMATOGENESIS.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_TGF_BETA_SIGNALING.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_TNFA_SIGNALING_VIA_NFKB.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_UNFOLDED_PROTEIN_RESPONSE.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_UV_RESPONSE_DN.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_UV_RESPONSE_UP.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_WNT_BETA_CATENIN_SIGNALING.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/HALLMARK_XENOBIOTIC_METABOLISM.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/REACTOME_CELL_CYCLE.v2023.2.Hs.gmt +1 -0
- oncoordinate/HallmarkPathGMT/REACTOME_SIGNALING_BY_EGFR_IN_CANCER.v2023.2.Hs.gmt +1 -0
- oncoordinate/__init__.py +0 -0
- oncoordinate/lt.py +472 -0
- oncoordinate/oncoordinate.joblib +0 -0
- oncoordinate/sc.py +729 -0
- oncoordinate/sp.py +513 -0
- oncoordinate-0.1.7.dist-info/METADATA +93 -0
- oncoordinate-0.1.7.dist-info/RECORD +62 -0
- oncoordinate-0.1.7.dist-info/WHEEL +5 -0
- oncoordinate-0.1.7.dist-info/licenses/LICENSE +21 -0
- oncoordinate-0.1.7.dist-info/top_level.txt +1 -0
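
The HallmarkPathGMT entries above are MSigDB-style GMT gene-set files, which the _read_gmt helper in sc.py parses as tab-separated lines: a set name, a description field (skipped by the parser), then the member gene symbols. A minimal illustrative line follows (hypothetical set and gene names, with <tab> marking the field separators):

    HALLMARK_EXAMPLE_SET<tab>description<tab>GENE1<tab>GENE2<tab>GENE3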
oncoordinate/sc.py
ADDED
@@ -0,0 +1,729 @@
from __future__ import annotations

import logging
from importlib import resources
from pathlib import Path
from typing import Iterable, Dict, Optional, Union, List

import anndata as ad
import joblib
import numpy as np
import pandas as pd
import scanpy as _sc
_sc.settings.verbosity = 0

from scipy import sparse as sp
from sklearn.preprocessing import MinMaxScaler
import os
import inspect

logger = logging.getLogger(__name__)
logging.getLogger().setLevel(logging.ERROR)
ad.settings.allow_write_nullable_strings = True

def _configure_debug_log():
    try:
        import __main__
        main_file = getattr(__main__, "__file__", None)
        if main_file:
            log_dir = Path(main_file).resolve().parent
        else:
            log_dir = Path.cwd()
    except Exception:
        log_dir = Path.cwd()

    log_path = log_dir / "debug_log.txt"
    pkg_logger = logging.getLogger("oncoordinate")
    pkg_logger.setLevel(logging.DEBUG)

    for h in pkg_logger.handlers:
        if isinstance(h, logging.FileHandler):
            try:
                if Path(getattr(h, "baseFilename", "")) == log_path:
                    break
            except Exception:
                continue
    else:
        try:
            handler = logging.FileHandler(log_path, mode="a", encoding="utf-8")
            formatter = logging.Formatter(
                "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
            )
            handler.setFormatter(formatter)
            handler.setLevel(logging.DEBUG)
            pkg_logger.addHandler(handler)
            pkg_logger.debug("Initialized oncoordinate debug logging to %s", log_path)
        except Exception as e:
            logger.warning("Failed to configure debug_log.txt logging: %s", e)

_configure_debug_log()

def _iter_gmt_files() -> Iterable[Path]:
    try:
        pkg_root = resources.files("oncoordinate").joinpath("HallmarkPathGMT")
        for res in pkg_root.iterdir():
            if res.name.endswith(".gmt"):
                with resources.as_file(res) as p:
                    yield Path(p)
        return
    except Exception:
        pass

    local_root = Path(__file__).resolve().parent / "HallmarkPathGMT"
    if local_root.is_dir():
        for p in sorted(local_root.glob("*.gmt")):
            yield p
    else:
        logger.warning("HallmarkPathGMT directory not found; no pathway scores will be computed.")

def _safe_joblib_load(path: Path, device: Optional[str] = None) -> Dict:
    desired_raw = device or "auto"
    desired = desired_raw.strip().lower() if isinstance(desired_raw, str) else "auto"
    if desired == "gpu":
        desired = "cuda"

    try:
        import torch
    except Exception as e:
        logger.info(
            "torch import failed or unavailable (%s); attempting joblib.load without torch mapping.",
            e,
        )
        return joblib.load(path)

    try:
        has_cuda = bool(torch.cuda.is_available())
    except Exception as e:
        logger.info("torch.cuda.is_available() check failed (%s); assuming CUDA unavailable.", e)
        has_cuda = False

    if desired == "cpu":
        use_gpu = False
    elif desired == "cuda":
        use_gpu = has_cuda
    else:
        use_gpu = has_cuda

    if use_gpu:
        logger.info(
            "Loading %s preferring GPU (device=%s, torch.cuda.is_available=%s).",
            path.name,
            desired,
            has_cuda,
        )
        return joblib.load(path)

    logger.info(
        "Forcing CPU load for %s (device=%s). Will patch torch.load to set map_location='cpu' during joblib.load.",
        path.name,
        desired,
    )
    orig_torch_load = getattr(torch, "load", None)

    def _torch_load_cpu(*args, **kwargs):
        kwargs.setdefault("map_location", "cpu")
        return orig_torch_load(*args, **kwargs)

    if orig_torch_load is None:
        logger.info(
            "torch.load not found; falling back to joblib.load() without torch map_location patch."
        )
        return joblib.load(path)

    try:
        torch.load = _torch_load_cpu
        return joblib.load(path)
    finally:
        try:
            torch.load = orig_torch_load
        except Exception:
            logger.warning("Could not restore original torch.load after joblib.load().")

def _load_model_bundle(device: Optional[str] = None) -> Dict:
    try:
        res = resources.files("oncoordinate").joinpath("oncoordinate.joblib")
        with resources.as_file(res) as p:
            path = Path(p)
            if path.is_file():
                return _safe_joblib_load(path, device=device)
    except Exception:
        pass

    local_path = Path(__file__).resolve().parent / "oncoordinate.joblib"
    if not local_path.is_file():
        raise FileNotFoundError(
            "Could not find 'oncoordinate.joblib' in package resources or next to sc.py"
        )

    return _safe_joblib_load(local_path, device=device)

def _read_gmt(path: Path) -> Dict[str, List[str]]:
    gene_sets: Dict[str, List[str]] = {}
    with path.open() as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            parts = line.split("\t")
            if len(parts) < 3:
                continue
            name = parts[0]
            genes = parts[2:]
            gene_sets[name] = genes
    return gene_sets

class sc:
    def __init__(
        self,
        adata: Optional[ad.AnnData] = None,
        *,
        sample_key: str = "sample",
        batch_key: Optional[str] = None,
        user_celltype_vectors: Optional[pd.DataFrame] = None,
        device: Optional[str] = None,
    ):

        self.adata = adata
        self.sample_key = sample_key
        self.batch_key = batch_key
        self.device = (device.strip().lower() if isinstance(device, str) else None) or "auto"

        if self.adata is not None:
            self.adata.obs.index = self.adata.obs.index.astype(str)
            self.adata.var.index = self.adata.var.index.astype(str)

        if self.adata is not None and user_celltype_vectors is not None:
            inter = user_celltype_vectors.loc[
                user_celltype_vectors.index.intersection(self.adata.obs_names)
            ].copy()
            inter = inter.apply(pd.to_numeric, errors="coerce").fillna(0.0)
            inter.columns = [
                c if str(c).startswith("ctv_") else f"ctv_{c}" for c in inter.columns
            ]
            self.adata.obs = self.adata.obs.join(inter, how="left")

    @staticmethod
    def _ensure_cpu_safe_model(model, device: Optional[str] = None):
        try:
            import torch
        except Exception:
            return model

        requested = device or "auto"
        if isinstance(requested, str):
            requested = requested.strip().lower()
        else:
            requested = "auto"

        try:
            has_cuda = bool(torch.cuda.is_available())
        except Exception:
            has_cuda = False

        force_cpu = (requested == "cpu") or (not has_cuda)
        if not force_cpu:
            return model

        cls_name = type(model).__name__.lower()
        if cls_name.startswith("tabnet"):
            try:
                model.device_name = "cpu"
            except Exception:
                pass
            try:
                model.device = torch.device("cpu")
            except Exception:
                pass

            net = getattr(model, "network", None)
            if net is not None:
                try:
                    net.to(torch.device("cpu"))
                except Exception:
                    pass

        return model

    @staticmethod
    def _preprocess(
        adata: ad.AnnData,
        *,
        min_genes: int = 200,
        min_cells: int = 10,
    ) -> ad.AnnData:

        adata = adata.copy()
        adata.obs.index = adata.obs.index.astype(str)
        adata.var.index = adata.var.index.astype(str)
        adata.var_names = adata.var_names.astype(str)
        adata.var["mt"] = adata.var_names.str.upper().str.startswith("MT-")

        _sc.pp.calculate_qc_metrics(
            adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
        )

        _sc.pp.filter_cells(adata, min_genes=min_genes)

        if min_cells is not None and min_cells > 0 and adata.n_obs > 0:
            if sp.issparse(adata.X):
                gene_nonzero = np.asarray((adata.X > 0).sum(axis=0)).ravel()
            else:
                gene_nonzero = np.count_nonzero(adata.X > 0, axis=0)

            survivors = gene_nonzero >= min_cells
            n_survivors = int(survivors.sum())

            if n_survivors == 0:
                relaxed = gene_nonzero >= 1
                n_relaxed = int(relaxed.sum())
                if n_relaxed == 0:
                    logger.warning(
                        "After filter_cells (min_genes=%d), all genes have zero counts "
                        "for this sample (n_obs=%d). Skipping gene filtering.",
                        min_genes,
                        adata.n_obs,
                    )
                else:
                    logger.warning(
                        "filter_genes(min_cells=%d) would remove ALL genes for this "
                        "sample (n_obs=%d). Relaxing to min_cells=1; keeping %d genes.",
                        min_cells,
                        adata.n_obs,
                        n_relaxed,
                    )
                    adata = adata[:, relaxed]
            else:
                adata = adata[:, survivors]

        if adata.n_vars == 0:
            raise ValueError(
                f"After preprocessing (min_genes={min_genes}, min_cells={min_cells}), "
                f"no genes remain for this sample (n_obs={adata.n_obs}). "
                "Try lowering min_cells and/or min_genes."
            )

        if sp.issparse(adata.X):
            if hasattr(adata.X, "data"):
                adata.X.data = np.nan_to_num(adata.X.data)
        else:
            adata.X = np.nan_to_num(adata.X)

        if adata.is_view:
            adata = adata.copy()

        _sc.pp.normalize_total(adata, target_sum=1e4)
        _sc.pp.log1p(adata)

        return adata

    def _run_embeddings(
        self,
        adata: ad.AnnData,
        *,
        pca_n_comps: Optional[int] = None,
        neighbors_n_pcs: Optional[int] = None,
        neighbors_k: Optional[int] = None,
        use_batch_correction: bool = False,
    ) -> ad.AnnData:

        adata = adata.copy()

        max_rank = max(0, min(adata.n_obs, adata.n_vars) - 1)
        n_comps = int(pca_n_comps) if pca_n_comps is not None else max(
            2, min(50, max_rank)
        )
        _sc.tl.pca(adata, n_comps=n_comps, svd_solver="arpack")

        n_pcs_avail = int(adata.obsm["X_pca"].shape[1])
        if n_pcs_avail < 2:
            raise ValueError(
                f"Too few PCs after preprocessing (got {n_pcs_avail}). "
                "Consider lowering filtering thresholds or increasing pca_n_comps."
            )

        n_pcs_use = int(neighbors_n_pcs) if neighbors_n_pcs is not None else min(
            40, n_pcs_avail
        )
        if n_pcs_use > n_pcs_avail:
            raise ValueError(
                f"Requested neighbors_n_pcs={n_pcs_use} > available PCs ({n_pcs_avail})."
            )

        k = int(neighbors_k) if neighbors_k is not None else min(
            15, max(2, adata.n_obs - 1)
        )
        k = max(2, min(k, max(2, adata.n_obs - 1)))

        batch_key = self.batch_key or self.sample_key
        if use_batch_correction and batch_key in adata.obs.columns:
            try:
                import scanpy.external as sce

                logger.info(
                    f"Running BBKNN batch correction with batch_key='{batch_key}'"
                )
                sce.pp.bbknn(adata, batch_key=batch_key, n_pcs=n_pcs_use)
            except Exception as e:
                logger.warning(
                    f"BBKNN batch correction failed ({e}); falling back to standard neighbors."
                )
                _sc.pp.neighbors(
                    adata,
                    n_neighbors=k,
                    n_pcs=n_pcs_use,
                    use_rep="X_pca",
                )
        else:
            _sc.pp.neighbors(
                adata,
                n_neighbors=k,
                n_pcs=n_pcs_use,
                use_rep="X_pca",
            )

        _sc.tl.umap(adata)
        _sc.tl.diffmap(adata)

        return adata

    @staticmethod
    def _score_pathways(adata: ad.AnnData) -> ad.AnnData:
        adata = adata.copy()
        adata.var_names = adata.var_names.astype(str).str.upper()
        adata.var_names_make_unique()
        genes_in_adata = set(adata.var_names)

        score_genes = _sc.tl.score_genes
        try:
            sig = inspect.signature(score_genes)
            supports_ctrl_as_ref = "ctrl_as_ref" in sig.parameters
        except Exception:
            supports_ctrl_as_ref = False

        for gmt_path in _iter_gmt_files():
            logger.info(f"Scoring pathways from {gmt_path.name}")
            gene_sets = _read_gmt(gmt_path)

            for pathway, genes in gene_sets.items():
                if not genes:
                    continue

                genes_upper = [g.upper() for g in genes]
                genes_inter = [g for g in genes_upper if g in genes_in_adata]
                if not genes_inter:
                    continue

                kwargs = dict(
                    gene_list=genes_inter,
                    score_name=pathway,
                    use_raw=False,
                )

                if supports_ctrl_as_ref:
                    kwargs["ctrl_as_ref"] = False

                try:
                    score_genes(adata, **kwargs)
                except RuntimeError as e:
                    msg = str(e)
                    if "No control genes found in any cut" in msg:
                        logger.warning(
                            "score_genes failed for pathway '%s' due to lack of control "
                            "genes (message: %s). Skipping this pathway. "
                            "Consider adjusting gene sets or ctrl_size if needed.",
                            pathway,
                            msg,
                        )
                        continue
                    raise

        return adata

    @staticmethod
    def _build_feature_matrix(
        obs: pd.DataFrame,
        feature_names: List[str],
    ) -> pd.DataFrame:
        X = pd.DataFrame(index=obs.index)

        for feat in feature_names:
            if feat in obs.columns:
                col = obs[feat]
            else:
                col = pd.Series(0.0, index=obs.index)

            col = pd.to_numeric(col, errors="coerce").fillna(0.0).astype(np.float32)
            X[feat] = col

        return X

    def _annotate_with_model(self, adata: ad.AnnData, device: Optional[str] = None) -> ad.AnnData:
        adata = adata.copy()

        bundle = _load_model_bundle(device=device)
        model = bundle["model"]
        feature_list = list(bundle.get("features", []))
        scaler = bundle.get("scaler", None)
        label_map = bundle.get(
            "label_map",
            {0: "normal", 1: "abnormal", 2: "pre-malignant", 3: "malignant"},
        )

        if not feature_list:
            logger.warning(
                "Model bundle has empty 'features'; skipping oncoordinate annotation."
            )
            return adata

        meta = adata.obs.copy()
        X_df = self._build_feature_matrix(meta, feature_list)
        X_df_float = X_df.astype(np.float32)

        if scaler is not None:
            try:
                X_scaled = scaler.transform(X_df_float)
            except Exception as e:
                logger.warning(
                    f"Scaler transform failed ({e}); falling back to MinMaxScaler fit on the fly."
                )
                X_scaled = MinMaxScaler().fit_transform(X_df_float)
        else:
            X_scaled = MinMaxScaler().fit_transform(X_df_float)

        X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=0.0, neginf=0.0)

        base_model = model
        if hasattr(base_model, "best_estimator_"):
            base_model = base_model.best_estimator_

        base_model = self._ensure_cpu_safe_model(base_model, device=device)

        if not hasattr(base_model, "predict_proba"):
            raise RuntimeError(
                "Loaded oncoordinate model does not support predict_proba(). "
                "Cannot annotate cells."
            )

        proba = np.asarray(base_model.predict_proba(X_scaled), dtype=float)

        if proba.ndim != 2:
            raise ValueError(
                f"Model.predict_proba returned array with shape {proba.shape}, expected 2D."
            )

        classes_attr = getattr(base_model, "classes_", None)
        if classes_attr is not None:
            try:
                class_names = [label_map.get(int(c), str(c)) for c in classes_attr]
            except Exception:
                class_names = [str(c) for c in classes_attr]
        else:
            class_names = [
                label_map.get(i, str(i)) for i in range(proba.shape[1])
            ]

        pred_idx = np.argmax(proba, axis=1)
        pred_labels = [class_names[i] for i in pred_idx]

        adata.obs["oncoordinate_stage_idx"] = pred_idx.astype(int)
        adata.obs["oncoordinate_stage"] = pd.Categorical(
            pred_labels,
            categories=class_names,
            ordered=True,
        )

        for j, cname in enumerate(class_names):
            col_name = f"oncoordinate_proba_{cname}"
            adata.obs[col_name] = proba[:, j]

        return adata

    def _process_one_sample(
        self,
        adata_s: ad.AnnData,
        *,
        pca_n_comps: Optional[int],
        neighbors_n_pcs: Optional[int],
        neighbors_k: Optional[int],
        min_cells: int,
        min_genes: int,
        use_batch_correction: bool,
        device: Optional[str] = None,
        run_embeddings: bool = True,
    ) -> ad.AnnData:

        adata_s = self._preprocess(
            adata_s,
            min_cells=min_cells,
            min_genes=min_genes,
        )

        if run_embeddings:
            adata_s = self._run_embeddings(
                adata_s,
                pca_n_comps=pca_n_comps,
                neighbors_n_pcs=neighbors_n_pcs,
                neighbors_k=neighbors_k,
                use_batch_correction=use_batch_correction,
            )

        adata_s = self._score_pathways(adata_s)
        adata_s = self._annotate_with_model(adata_s, device=device)

        adata_s.uns.setdefault("oncoordinate_params", {})
        adata_s.uns["oncoordinate_params"].update(
            dict(
                min_cells=int(min_cells),
                min_genes=int(min_genes),
                device=device or self.device,
            )
        )

        return adata_s

    def annotate(
        self,
        *,
        sample_key: Optional[str] = None,
        pca_n_comps: Optional[int] = None,
        neighbors_n_pcs: Optional[int] = None,
        neighbors_k: Optional[int] = None,
        min_cells: int = 3,
        min_genes: int = 200,
        use_batch_correction: bool = False,
        save_path: Optional[Union[str, Path]] = None,
        device: Optional[str] = None,
    ) -> ad.AnnData:
        if self.adata is None:
            raise AttributeError("sc.adata is None – please provide an AnnData.")

        adata = self.adata.copy()
        adata.obs.index = adata.obs.index.astype(str)
        adata.var.index = adata.var.index.astype(str)

        skey = sample_key or self.sample_key

        if skey not in adata.obs.columns:
            logger.info(
                "'%s' not found in adata.obs – processing as a single sample.",
                skey,
            )
            annotated = self._process_one_sample(
                adata,
                pca_n_comps=pca_n_comps,
                neighbors_n_pcs=neighbors_n_pcs,
                neighbors_k=neighbors_k,
                min_cells=min_cells,
                min_genes=min_genes,
                use_batch_correction=use_batch_correction,
                device=(device or self.device),
            )
        else:
            col = adata.obs[skey].astype(str).str.strip()
            adata.obs[skey] = col

            unique_samples = col.unique().tolist()

            if len(unique_samples) == 1:
                logger.info(
                    "Single sample detected (sample_key='%s'). Processing whole AnnData.",
                    skey,
                )
                annotated = self._process_one_sample(
                    adata,
                    pca_n_comps=pca_n_comps,
                    neighbors_n_pcs=neighbors_n_pcs,
                    neighbors_k=neighbors_k,
                    min_cells=min_cells,
                    min_genes=min_genes,
                    use_batch_correction=use_batch_correction,
                    device=(device or self.device),
                )
            else:
                logger.info(
                    "Multiple samples detected in '%s': %s (use_batch_correction=%s)",
                    skey,
                    unique_samples,
                    use_batch_correction,
                )
                annotated_list: List[ad.AnnData] = []

                for sid in unique_samples:
                    mask = adata.obs[skey] == sid
                    if mask.sum() == 0:
                        continue
                    ad_s = adata[mask, :].copy()

                    if use_batch_correction:
                        ann_s = self._process_one_sample(
                            ad_s,
                            pca_n_comps=pca_n_comps,
                            neighbors_n_pcs=neighbors_n_pcs,
                            neighbors_k=neighbors_k,
                            min_cells=min_cells,
                            min_genes=min_genes,
                            use_batch_correction=False,
                            device=(device or self.device),
                            run_embeddings=False,
                        )
                    else:
                        ann_s = self._process_one_sample(
                            ad_s,
                            pca_n_comps=pca_n_comps,
                            neighbors_n_pcs=neighbors_n_pcs,
                            neighbors_k=neighbors_k,
                            min_cells=min_cells,
                            min_genes=min_genes,
                            use_batch_correction=False,
                            device=(device or self.device),
                        )

                    ann_s.obs[skey] = sid
                    annotated_list.append(ann_s)

                if not annotated_list:
                    raise ValueError(
                        f"No samples contained any cells after filtering (sample_key='{skey}')."
                    )

                annotated = ad.concat(
                    annotated_list,
                    label="__sample__",
                    keys=[str(a.obs[skey].unique()[0]) for a in annotated_list],
                    join="outer",
                    index_unique=None,
                )

                if use_batch_correction:
                    batch_key = self.batch_key or skey
                    logger.info(
                        "Running BBKNN batch correction once on concatenated AnnData "
                        "(n_obs=%d, n_vars=%d, batch_key='%s').",
                        annotated.n_obs,
                        annotated.n_vars,
                        batch_key,
                    )
                    annotated = self._run_embeddings(
                        annotated,
                        pca_n_comps=pca_n_comps,
                        neighbors_n_pcs=neighbors_n_pcs,
                        neighbors_k=neighbors_k,
                        use_batch_correction=True,
                    )

        annotated.uns.setdefault("oncoordinate_params", {})
        annotated.uns["oncoordinate_params"].update(
            dict(
                min_cells=int(min_cells),
                min_genes=int(min_genes),
                device=device or self.device,
            )
        )

        if save_path is not None:
            save_path = Path(save_path)
            save_path.parent.mkdir(parents=True, exist_ok=True)
            annotated.write_h5ad(save_path)
            logger.info("Wrote oncoordinate-annotated AnnData to: %s", save_path)

        return annotated
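
For orientation, a minimal usage sketch of the sc class above, based only on the signatures shown in this diff; the input file name and the 'sample' obs column are illustrative assumptions, not files shipped with the package:

    import anndata
    from oncoordinate.sc import sc

    # Hypothetical input: an AnnData whose .obs has a 'sample' column.
    adata = anndata.read_h5ad("cells.h5ad")

    runner = sc(adata, sample_key="sample", device="cpu")
    annotated = runner.annotate(min_genes=200, min_cells=3, save_path="annotated.h5ad")

    # Predictions land in .obs: 'oncoordinate_stage' (ordered categorical),
    # 'oncoordinate_stage_idx', and one 'oncoordinate_proba_<class>' column per class.
    print(annotated.obs["oncoordinate_stage"].value_counts())

When several sample values are present, annotate() preprocesses, scores, and annotates each sample independently, then concatenates the results, optionally followed by a single BBKNN pass over the concatenated object when use_batch_correction=True.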