univi-0.3.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- univi/__init__.py +120 -0
- univi/__main__.py +5 -0
- univi/cli.py +60 -0
- univi/config.py +340 -0
- univi/data.py +345 -0
- univi/diagnostics.py +130 -0
- univi/evaluation.py +632 -0
- univi/hyperparam_optimization/__init__.py +17 -0
- univi/hyperparam_optimization/common.py +339 -0
- univi/hyperparam_optimization/run_adt_hparam_search.py +109 -0
- univi/hyperparam_optimization/run_atac_hparam_search.py +109 -0
- univi/hyperparam_optimization/run_citeseq_hparam_search.py +137 -0
- univi/hyperparam_optimization/run_multiome_hparam_search.py +145 -0
- univi/hyperparam_optimization/run_rna_hparam_search.py +111 -0
- univi/hyperparam_optimization/run_teaseq_hparam_search.py +146 -0
- univi/interpretability.py +399 -0
- univi/matching.py +394 -0
- univi/models/__init__.py +8 -0
- univi/models/decoders.py +249 -0
- univi/models/encoders.py +848 -0
- univi/models/mlp.py +36 -0
- univi/models/tokenizers.py +376 -0
- univi/models/transformer.py +249 -0
- univi/models/univi.py +1284 -0
- univi/objectives.py +46 -0
- univi/pipeline.py +194 -0
- univi/plotting.py +126 -0
- univi/trainer.py +478 -0
- univi/utils/__init__.py +5 -0
- univi/utils/io.py +621 -0
- univi/utils/logging.py +16 -0
- univi/utils/seed.py +18 -0
- univi/utils/stats.py +23 -0
- univi/utils/torch_utils.py +23 -0
- univi-0.3.4.dist-info/METADATA +908 -0
- univi-0.3.4.dist-info/RECORD +40 -0
- univi-0.3.4.dist-info/WHEEL +5 -0
- univi-0.3.4.dist-info/entry_points.txt +2 -0
- univi-0.3.4.dist-info/licenses/LICENSE +21 -0
- univi-0.3.4.dist-info/top_level.txt +1 -0
univi/matching.py
ADDED
@@ -0,0 +1,394 @@

# univi/matching.py

import warnings
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from scipy.optimize import linear_sum_assignment
from typing import Optional, Dict


def _subsample_indices(n: int, max_cells: int, rng: np.random.Generator) -> np.ndarray:
    """
    Helper to subsample up to max_cells indices from range(n) without replacement.
    """
    idx_full = np.arange(n)
    if n <= max_cells:
        return idx_full
    return rng.choice(idx_full, size=max_cells, replace=False)


# ---------------------------------------------------------------------------
# 1. Basic bipartite matching (Hungarian) in a shared embedding
# ---------------------------------------------------------------------------

def bipartite_match_adata(
    adata_A,
    adata_B,
    emb_key: str = "X_pca",
    metric: str = "euclidean",
    max_cells: int = 20000,
    random_state: int = 0,
):
    """
    Bipartite matching between cells in adata_A and adata_B based on a shared embedding.

    This is the basic "Hungarian in latent space" matcher. It assumes that both
    adata_A and adata_B have a *comparable* embedding in .obsm[emb_key], e.g.:

    - both are in the same PCA space, or
    - both are in a shared latent space (CCA, UniVI encoder, etc.)
    """
    rng = np.random.default_rng(random_state)

    XA = np.asarray(adata_A.obsm[emb_key])
    XB = np.asarray(adata_B.obsm[emb_key])

    na, nb = XA.shape[0], XB.shape[0]
    n = min(na, nb, max_cells)

    idx_A = _subsample_indices(na, n, rng)
    idx_B = _subsample_indices(nb, n, rng)

    XA_sub = XA[idx_A]
    XB_sub = XB[idx_B]

    # cost matrix
    D = pairwise_distances(XA_sub, XB_sub, metric=metric)

    # Hungarian algorithm (min-cost)
    row_ind, col_ind = linear_sum_assignment(D)

    matched_A = idx_A[row_ind]
    matched_B = idx_B[col_ind]

    return matched_A, matched_B


# ---------------------------------------------------------------------------
# 2. Stratified bipartite matching (per group / cell type)
# ---------------------------------------------------------------------------

def stratified_bipartite_match_adata(
    adata_A,
    adata_B,
    group_key_A: str,
    group_key_B: Optional[str] = None,
    group_map: Optional[Dict] = None,
    emb_key: str = "X_pca",
    metric: str = "euclidean",
    max_cells_per_group: int = 20000,
    random_state: int = 0,
    shuffle: bool = True,
):
    """
    Per-group (e.g. per-celltype) bipartite matching in a shared embedding.

    This wraps `bipartite_match_adata` but runs it separately within each group,
    then concatenates the matches.
    """
    rng = np.random.default_rng(random_state)

    if group_key_B is None:
        group_key_B = group_key_A

    if group_key_A not in adata_A.obs.columns:
        raise KeyError(f"{group_key_A!r} not found in adata_A.obs")
    if group_key_B not in adata_B.obs.columns:
        raise KeyError(f"{group_key_B!r} not found in adata_B.obs")

    labels_A = adata_A.obs[group_key_A].astype(str).to_numpy()
    labels_B = adata_B.obs[group_key_B].astype(str).to_numpy()

    unique_A = np.unique(labels_A)

    all_matched_A = []
    all_matched_B = []
    group_counts: Dict[str, int] = {}

    for gA in unique_A:
        # Determine which label in B we should match to
        if group_map is not None:
            gB = group_map.get(gA, None)
            if gB is None:
                # group not mapped; skip
                continue
        else:
            gB = gA

        idx_A_g = np.where(labels_A == gA)[0]
        idx_B_g = np.where(labels_B == gB)[0]

        if (idx_A_g.size == 0) or (idx_B_g.size == 0):
            continue

        # Build small views
        adata_A_g = adata_A[idx_A_g]
        adata_B_g = adata_B[idx_B_g]

        # n per group for bipartite matching
        n_grp = min(idx_A_g.size, idx_B_g.size, max_cells_per_group)
        if n_grp == 0:
            continue

        mA_local, mB_local = bipartite_match_adata(
            adata_A_g,
            adata_B_g,
            emb_key=emb_key,
            metric=metric,
            max_cells=n_grp,
            random_state=random_state,
        )

        if mA_local.size == 0:
            continue

        mA = idx_A_g[mA_local]
        mB = idx_B_g[mB_local]

        all_matched_A.append(mA)
        all_matched_B.append(mB)
        group_counts[gA] = mA.size

    if not all_matched_A:
        raise RuntimeError("No stratified matches were found for any group.")

    matched_A = np.concatenate(all_matched_A)
    matched_B = np.concatenate(all_matched_B)

    if shuffle:
        perm = rng.permutation(matched_A.size)
        matched_A = matched_A[perm]
        matched_B = matched_B[perm]

    return matched_A, matched_B, group_counts


# ---------------------------------------------------------------------------
# 3. Mutual Nearest Neighbor (MNN) anchors
# ---------------------------------------------------------------------------

def mnn_anchors_adata(
    adata_A,
    adata_B,
    emb_key: str = "X_pca",
    k: int = 20,
    max_cells: int = 20000,
    random_state: int = 0,
):
    """
    Mutual Nearest Neighbor (MNN) anchors between adata_A and adata_B.
    """
    rng = np.random.default_rng(random_state)

    XA = np.asarray(adata_A.obsm[emb_key])
    XB = np.asarray(adata_B.obsm[emb_key])

    na, nb = XA.shape[0], XB.shape[0]

    idx_A = _subsample_indices(na, max_cells, rng)
    idx_B = _subsample_indices(nb, max_cells, rng)

    XA_sub = XA[idx_A]
    XB_sub = XB[idx_B]

    k_A = min(k, idx_B.size)
    k_B = min(k, idx_A.size)

    # A -> B neighbors
    nn_B = NearestNeighbors(n_neighbors=k_A)
    nn_B.fit(XB_sub)
    dist_A2B, ind_A2B = nn_B.kneighbors(XA_sub, return_distance=True)

    # B -> A neighbors
    nn_A = NearestNeighbors(n_neighbors=k_B)
    nn_A.fit(XA_sub)
    dist_B2A, ind_B2A = nn_A.kneighbors(XB_sub, return_distance=True)

    # Mutual neighbors
    anchors_A_list = []
    anchors_B_list = []

    # For quick membership tests: for each j, set of neighbors in A
    neighbors_B2A = [set(ind_B2A[j]) for j in range(idx_B.size)]

    for i in range(idx_A.size):
        for j_local in ind_A2B[i]:
            # Check mutuality
            if i in neighbors_B2A[j_local]:
                anchors_A_list.append(idx_A[i])
                anchors_B_list.append(idx_B[j_local])

    if not anchors_A_list:
        warnings.warn("mnn_anchors_adata: no mutual nearest neighbors found.")
        return np.array([], dtype=int), np.array([], dtype=int)

    anchors_A = np.array(anchors_A_list, dtype=int)
    anchors_B = np.array(anchors_B_list, dtype=int)

    return anchors_A, anchors_B


# ---------------------------------------------------------------------------
# 4. Cluster / cell-type centroid matching (for building group maps)
# ---------------------------------------------------------------------------

def cluster_centroid_matching_adata(
    adata_A,
    adata_B,
    group_key_A: str,
    group_key_B: Optional[str] = None,
    emb_key: str = "X_pca",
    metric: str = "euclidean",
):
    """
    Match cluster / cell-type centroids across datasets via Hungarian.
    """
    if group_key_B is None:
        group_key_B = group_key_A

    if group_key_A not in adata_A.obs.columns:
        raise KeyError(f"{group_key_A!r} not found in adata_A.obs")
    if group_key_B not in adata_B.obs.columns:
        raise KeyError(f"{group_key_B!r} not found in adata_B.obs")

    labels_A = adata_A.obs[group_key_A].astype(str).to_numpy()
    labels_B = adata_B.obs[group_key_B].astype(str).to_numpy()

    XA = np.asarray(adata_A.obsm[emb_key])
    XB = np.asarray(adata_B.obsm[emb_key])

    groups_A = np.unique(labels_A)
    groups_B = np.unique(labels_B)

    # Compute centroids
    centroids_A = []
    for g in groups_A:
        idx = np.where(labels_A == g)[0]
        centroids_A.append(XA[idx].mean(axis=0))
    centroids_A = np.vstack(centroids_A)

    centroids_B = []
    for g in groups_B:
        idx = np.where(labels_B == g)[0]
        centroids_B.append(XB[idx].mean(axis=0))
    centroids_B = np.vstack(centroids_B)

    # Cost between centroids
    D = pairwise_distances(centroids_A, centroids_B, metric=metric)
    row_ind, col_ind = linear_sum_assignment(D)

    group_map: Dict[str, str] = {}
    for i, j in zip(row_ind, col_ind):
        gA = groups_A[i]
        gB = groups_B[j]
        group_map[gA] = gB

    return group_map


# ---------------------------------------------------------------------------
# 5. Gromov–Wasserstein OT-based anchors (optional; requires POT)
# ---------------------------------------------------------------------------

def gw_ot_anchors_adata(
    adata_A,
    adata_B,
    emb_key: str = "X_pca",
    max_cells: int = 3000,
    random_state: int = 0,
    normalize_distances: bool = True,
):
    """
    Geometry-aware anchors via Gromov–Wasserstein optimal transport.
    """
    try:
        import ot  # type: ignore
    except ImportError as e:
        raise ImportError(
            "gw_ot_anchors_adata requires the 'pot' package. "
            "Install with: pip install pot"
        ) from e

    rng = np.random.default_rng(random_state)

    XA = np.asarray(adata_A.obsm[emb_key])
    XB = np.asarray(adata_B.obsm[emb_key])

    na, nb = XA.shape[0], XB.shape[0]
    idx_A = _subsample_indices(na, max_cells, rng)
    idx_B = _subsample_indices(nb, max_cells, rng)

    XA_sub = XA[idx_A]
    XB_sub = XB[idx_B]

    # Distance matrices within each dataset
    DA = pairwise_distances(XA_sub, XA_sub, metric="euclidean")
    DB = pairwise_distances(XB_sub, XB_sub, metric="euclidean")

    if normalize_distances:
        if DA.max() > 0:
            DA = DA / DA.max()
        if DB.max() > 0:
            DB = DB / DB.max()

    # Uniform weights
    p = np.ones(DA.shape[0]) / DA.shape[0]
    q = np.ones(DB.shape[0]) / DB.shape[0]

    # Compute GW coupling
    T = ot.gromov.gromov_wasserstein(
        DA, DB, p, q, loss_fun="square_loss", verbose=False
    )

    # For each i in A, pick the j in B with maximum coupling mass
    anchors_A_list = []
    anchors_B_list = []
    for i in range(T.shape[0]):
        j = int(np.argmax(T[i]))
        anchors_A_list.append(idx_A[i])
        anchors_B_list.append(idx_B[j])

    anchors_A = np.array(anchors_A_list, dtype=int)
    anchors_B = np.array(anchors_B_list, dtype=int)

    return anchors_A, anchors_B


# ---------------------------------------------------------------------------
# 6. Group latent statistics (for distribution-level alignment)
# ---------------------------------------------------------------------------

def group_latent_stats_adata(
    adata,
    group_key: str,
    emb_key: str = "X_pca",
):
    """
    Compute simple group-wise latent statistics (mean, covariance) to
    support distribution-level alignment strategies.
    """
    if group_key not in adata.obs.columns:
        raise KeyError(f"{group_key!r} not found in adata.obs")

    labels = adata.obs[group_key].astype(str).to_numpy()
    X = np.asarray(adata.obsm[emb_key])

    groups = np.unique(labels)
    stats = {}

    for g in groups:
        idx = np.where(labels == g)[0]
        if idx.size == 0:
            continue
        Xg = X[idx]
        mu = Xg.mean(axis=0)
        # rowvar=False so that columns are variables
        cov = np.cov(Xg, rowvar=False)
        stats[g] = {
            "mean": mu,
            "cov": cov,
            "n": idx.size,
        }

    return stats
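A minimal usage sketch for the matchers above, assuming two hypothetical AnnData objects (here called `rna` and `atac`) that already share a comparable embedding in `.obsm["X_univi"]` and carry a `"cell_type"` column in `.obs`; the object names, embedding key, and sizes are illustrative, not part of the package:

# Usage sketch (assumed AnnData inputs; keys and sizes are illustrative).
from univi.matching import (
    bipartite_match_adata,
    stratified_bipartite_match_adata,
    cluster_centroid_matching_adata,
    mnn_anchors_adata,
)

# 1) Strict one-to-one pairs via Hungarian matching in the shared latent space
mA, mB = bipartite_match_adata(rna, atac, emb_key="X_univi", max_cells=5000)

# 2) Build a cluster-level correspondence first, then match within matched groups
gmap = cluster_centroid_matching_adata(rna, atac, group_key_A="cell_type", emb_key="X_univi")
mA, mB, counts = stratified_bipartite_match_adata(
    rna, atac, group_key_A="cell_type", group_map=gmap, emb_key="X_univi"
)

# 3) Looser anchors via mutual nearest neighbors
aA, aB = mnn_anchors_adata(rna, atac, emb_key="X_univi", k=15)

# All returned indices are positional row indices into the respective AnnData
paired_rna, paired_atac = rna[mA], atac[mB]

Note the difference in guarantees: the bipartite matchers return strict one-to-one pairs, while the MNN and Gromov–Wasserstein anchor functions can reuse a cell across several pairs.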
univi/models/__init__.py
ADDED
@@ -0,0 +1,8 @@

# univi/models/__init__.py
from __future__ import annotations

from .univi import UniVIMultiModalVAE
from .transformer import TransformerEncoder
from .tokenizers import build_tokenizer

__all__ = ["UniVIMultiModalVAE", "TransformerEncoder", "build_tokenizer"]
univi/models/decoders.py
ADDED
@@ -0,0 +1,249 @@

# univi/models/decoders.py

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, List

import torch
from torch import nn
import torch.nn.functional as F

from .mlp import build_mlp


@dataclass
class DecoderConfig:
    """Generic configuration for feed-forward decoders."""
    output_dim: int
    hidden_dims: List[int]
    dropout: float = 0.0
    batchnorm: bool = False


class GaussianDecoder(nn.Module):
    """z -> mean reconstruction (use with MSE/Gaussian losses)."""

    def __init__(self, cfg: DecoderConfig, latent_dim: int):
        super().__init__()
        self.cfg = cfg
        self.net = build_mlp(
            in_dim=latent_dim,
            hidden_dims=cfg.hidden_dims,
            out_dim=cfg.output_dim,
            dropout=cfg.dropout,
            batchnorm=cfg.batchnorm,
        )

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        return self.net(z)


class GaussianDiagDecoder(nn.Module):
    """z -> {'mean','logvar'} for full diagonal Gaussian likelihoods."""

    def __init__(self, cfg: DecoderConfig, latent_dim: int):
        super().__init__()
        self.cfg = cfg
        self.backbone = build_mlp(
            in_dim=latent_dim,
            hidden_dims=cfg.hidden_dims,
            out_dim=2 * cfg.output_dim,
            dropout=cfg.dropout,
            batchnorm=cfg.batchnorm,
        )

    def forward(self, z: torch.Tensor) -> Dict[str, torch.Tensor]:
        out = self.backbone(z)
        mean, logvar = out.chunk(2, dim=-1)
        return {"mean": mean, "logvar": logvar}


class BernoulliDecoder(nn.Module):
    """z -> {'logits'} for Bernoulli likelihoods."""

    def __init__(self, cfg: DecoderConfig, latent_dim: int):
        super().__init__()
        self.cfg = cfg
        self.net = build_mlp(
            in_dim=latent_dim,
            hidden_dims=cfg.hidden_dims,
            out_dim=cfg.output_dim,
            dropout=cfg.dropout,
            batchnorm=cfg.batchnorm,
        )

    def forward(self, z: torch.Tensor) -> Dict[str, torch.Tensor]:
        logits = self.net(z)
        return {"logits": logits}


class PoissonDecoder(nn.Module):
    """z -> {'log_rate','rate'} for Poisson likelihoods."""

    def __init__(self, cfg: DecoderConfig, latent_dim: int):
        super().__init__()
        self.cfg = cfg
        self.net = build_mlp(
            in_dim=latent_dim,
            hidden_dims=cfg.hidden_dims,
            out_dim=cfg.output_dim,
            dropout=cfg.dropout,
            batchnorm=cfg.batchnorm,
        )

    def forward(self, z: torch.Tensor) -> Dict[str, torch.Tensor]:
        log_rate = self.net(z)
        rate = F.softplus(log_rate)
        return {"log_rate": log_rate, "rate": rate}


class NegativeBinomialDecoder(nn.Module):
    """z -> {'mu','log_theta'} (theta can be global or gene-wise)."""

    def __init__(
        self,
        cfg: DecoderConfig,
        latent_dim: int,
        dispersion: str = "gene",
        init_log_theta: float = 0.0,
        eps: float = 1e-8,
    ):
        super().__init__()
        self.cfg = cfg
        self.dispersion = dispersion
        self.eps = float(eps)

        self.mu_net = build_mlp(
            in_dim=latent_dim,
            hidden_dims=cfg.hidden_dims,
            out_dim=cfg.output_dim,
            dropout=cfg.dropout,
            batchnorm=cfg.batchnorm,
        )

        if dispersion == "global":
            self.log_theta = nn.Parameter(torch.full((1,), float(init_log_theta)))
        elif dispersion == "gene":
            self.log_theta = nn.Parameter(torch.full((cfg.output_dim,), float(init_log_theta)))
        else:
            raise ValueError("Unknown dispersion mode: %r (expected 'global' or 'gene')" % dispersion)

    def forward(self, z: torch.Tensor) -> Dict[str, torch.Tensor]:
        mu = F.softplus(self.mu_net(z)) + self.eps
        return {"mu": mu, "log_theta": self.log_theta}


class ZeroInflatedNegativeBinomialDecoder(nn.Module):
    """z -> {'mu','log_theta','logit_pi'}."""

    def __init__(
        self,
        cfg: DecoderConfig,
        latent_dim: int,
        dispersion: str = "gene",
        init_log_theta: float = 0.0,
        eps: float = 1e-8,
    ):
        super().__init__()
        self.cfg = cfg
        self.dispersion = dispersion
        self.eps = float(eps)

        self.backbone = build_mlp(
            in_dim=latent_dim,
            hidden_dims=cfg.hidden_dims,
            out_dim=2 * cfg.output_dim,
            dropout=cfg.dropout,
            batchnorm=cfg.batchnorm,
        )

        if dispersion == "global":
            self.log_theta = nn.Parameter(torch.full((1,), float(init_log_theta)))
        elif dispersion == "gene":
            self.log_theta = nn.Parameter(torch.full((cfg.output_dim,), float(init_log_theta)))
        else:
            raise ValueError("Unknown dispersion mode: %r (expected 'global' or 'gene')" % dispersion)

    def forward(self, z: torch.Tensor) -> Dict[str, torch.Tensor]:
        out = self.backbone(z)
        mu_logits, logit_pi = out.chunk(2, dim=-1)
        mu = F.softplus(mu_logits) + self.eps
        return {"mu": mu, "log_theta": self.log_theta, "logit_pi": logit_pi}


class LogisticNormalDecoder(nn.Module):
    """z -> {'logits','probs'} for compositions/toy probability vectors."""

    def __init__(self, cfg: DecoderConfig, latent_dim: int):
        super().__init__()
        self.cfg = cfg
        self.net = build_mlp(
            in_dim=latent_dim,
            hidden_dims=cfg.hidden_dims,
            out_dim=cfg.output_dim,
            dropout=cfg.dropout,
            batchnorm=cfg.batchnorm,
        )

    def forward(self, z: torch.Tensor) -> Dict[str, torch.Tensor]:
        logits = self.net(z)
        probs = F.softmax(logits, dim=-1)
        return {"logits": logits, "probs": probs}


class CategoricalDecoder(nn.Module):
    """z -> {'logits','probs'} for discrete labels."""

    def __init__(self, cfg: DecoderConfig, latent_dim: int):
        super().__init__()
        self.cfg = cfg
        self.net = build_mlp(
            in_dim=latent_dim,
            hidden_dims=cfg.hidden_dims,
            out_dim=cfg.output_dim,
            dropout=cfg.dropout,
            batchnorm=cfg.batchnorm,
        )

    def forward(self, z: torch.Tensor) -> Dict[str, torch.Tensor]:
        logits = self.net(z)
        probs = F.softmax(logits, dim=-1)
        return {"logits": logits, "probs": probs}


DECODER_REGISTRY = {
    # gaussian
    "gaussian": GaussianDecoder,
    "normal": GaussianDecoder,

    "gaussian_diag": GaussianDiagDecoder,

    # bernoulli/poisson
    "bernoulli": BernoulliDecoder,
    "poisson": PoissonDecoder,

    # count models
    "nb": NegativeBinomialDecoder,
    "negative_binomial": NegativeBinomialDecoder,
    "zinb": ZeroInflatedNegativeBinomialDecoder,
    "zero_inflated_negative_binomial": ZeroInflatedNegativeBinomialDecoder,

    # compositions / discrete
    "logistic_normal": LogisticNormalDecoder,
    "categorical": CategoricalDecoder,
    "cat": CategoricalDecoder,
    "ce": CategoricalDecoder,
    "cross_entropy": CategoricalDecoder,
}


def build_decoder(kind: str, cfg: DecoderConfig, latent_dim: int, **kwargs: Any) -> nn.Module:
    key = str(kind).lower()
    if key not in DECODER_REGISTRY:
        raise ValueError(
            "Unknown decoder kind: %r. Available: %s" % (kind, list(DECODER_REGISTRY.keys()))
        )
    cls = DECODER_REGISTRY[key]
    return cls(cfg=cfg, latent_dim=latent_dim, **kwargs)
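A minimal sketch of how `DecoderConfig` and `build_decoder` compose, based only on the signatures shown above; the latent, hidden, and output sizes are illustrative assumptions, not package defaults:

# Sketch: build a negative-binomial decoder from the registry (sizes are illustrative).
import torch
from univi.models.decoders import DecoderConfig, build_decoder

latent_dim = 32
cfg = DecoderConfig(output_dim=2000, hidden_dims=[256, 512], dropout=0.1)

# Gene-wise dispersion; forward returns {'mu', 'log_theta'}
dec = build_decoder("nb", cfg, latent_dim=latent_dim, dispersion="gene")

z = torch.randn(8, latent_dim)        # a batch of 8 latent codes
out = dec(z)
print(out["mu"].shape)                # torch.Size([8, 2000])
print(out["log_theta"].shape)         # torch.Size([2000]) for gene-wise dispersion

Several registry keys are aliases for the same class ("nb"/"negative_binomial", "cat"/"ce"/"cross_entropy"), so configuration files can use whichever spelling is convenient.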