PyPI - gengeneeval - Versions diffs - 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

gengeneeval 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

geneval/__init__.py +43 -1
geneval/deg/__init__.py +65 -0
geneval/deg/context.py +271 -0
geneval/deg/detection.py +578 -0
geneval/deg/evaluator.py +538 -0
geneval/deg/visualization.py +376 -0
{gengeneeval-0.3.0.dist-info → gengeneeval-0.4.0.dist-info}/METADATA +90 -3
{gengeneeval-0.3.0.dist-info → gengeneeval-0.4.0.dist-info}/RECORD +11 -6
{gengeneeval-0.3.0.dist-info → gengeneeval-0.4.0.dist-info}/WHEEL +0 -0
{gengeneeval-0.3.0.dist-info → gengeneeval-0.4.0.dist-info}/entry_points.txt +0 -0
{gengeneeval-0.3.0.dist-info → gengeneeval-0.4.0.dist-info}/licenses/LICENSE +0 -0

geneval/deg/detection.py ADDED Viewed

@@ -0,0 +1,578 @@
+"""
+Fast DEG detection with CPU/GPU acceleration.
+This module provides vectorized statistical tests for DEG detection:
+- Welch's t-test (default, robust to unequal variance)
+- Student's t-test
+- Wilcoxon rank-sum test
+- Log-fold change thresholding
+All methods are accelerated using:
+- Vectorized NumPy operations
+- Optional GPU acceleration via PyTorch
+- Parallel computation via joblib
+"""
+from __future__ import annotations
+from typing import Optional, Literal, Dict, Union, Tuple, List
+from dataclasses import dataclass, field
+import numpy as np
+import warnings
+# Optional dependencies
+try:
+    import torch
+    HAS_TORCH = True
+except ImportError:
+    HAS_TORCH = False
+try:
+    from joblib import Parallel, delayed
+    HAS_JOBLIB = True
+except ImportError:
+    HAS_JOBLIB = False
+try:
+    from scipy import stats
+    from scipy.stats import ttest_ind, mannwhitneyu
+    HAS_SCIPY = True
+except ImportError:
+    HAS_SCIPY = False
+# Type alias for DEG methods
+DEGMethod = Literal["welch", "student", "wilcoxon", "logfc"]
+@dataclass
+class DEGResult:
+    """Container for the outcome of one DEG detection run.
+    Attributes
+    ----------
+    gene_names : np.ndarray
+        Identifier of every tested gene
+    pvalues : np.ndarray
+        Raw per-gene p-values (NaN when the "logfc" method is used)
+    pvalues_adj : np.ndarray
+        Benjamini-Hochberg adjusted p-values
+    log_fold_changes : np.ndarray
+        Per-gene log2 fold change of perturbed versus control
+    mean_control : np.ndarray
+        Per-gene mean expression of the control group
+    mean_perturbed : np.ndarray
+        Per-gene mean expression of the perturbed group
+    is_deg : np.ndarray
+        Boolean mask, True where a gene was called significant
+    n_degs : int
+        Count of significant genes
+    method : str
+        Name of the detection method that produced the result
+    pval_threshold : float
+        P-value cutoff that was applied
+    lfc_threshold : float
+        Absolute log fold change cutoff that was applied
+    deg_indices : np.ndarray
+        Positions of the significant genes; filled in from ``is_deg``
+        when left empty
+    """
+    gene_names: np.ndarray
+    pvalues: np.ndarray
+    pvalues_adj: np.ndarray
+    log_fold_changes: np.ndarray
+    mean_control: np.ndarray
+    mean_perturbed: np.ndarray
+    is_deg: np.ndarray
+    n_degs: int
+    method: str
+    pval_threshold: float
+    lfc_threshold: float
+    # Cached positions of the DEGs, handy for fast slicing
+    deg_indices: np.ndarray = field(default_factory=lambda: np.empty(0, dtype=int))
+    def __post_init__(self):
+        """Fill ``deg_indices`` from ``is_deg`` unless indices were supplied."""
+        if len(self.deg_indices) > 0:
+            return
+        # np.nonzero is what single-argument np.where resolves to
+        self.deg_indices = np.nonzero(self.is_deg)[0]
+    def get_deg_names(self) -> np.ndarray:
+        """Return the names of the genes flagged in ``is_deg``."""
+        significant = self.is_deg
+        return self.gene_names[significant]
+    def to_dataframe(self):
+        """Return the per-gene results as a pandas DataFrame indexed by gene."""
+        import pandas as pd
+        columns = {
+            "gene": self.gene_names,
+            "pvalue": self.pvalues,
+            "pvalue_adj": self.pvalues_adj,
+            "log2fc": self.log_fold_changes,
+            "mean_control": self.mean_control,
+            "mean_perturbed": self.mean_perturbed,
+            "is_deg": self.is_deg,
+        }
+        frame = pd.DataFrame(columns)
+        return frame.set_index("gene")
+    def __repr__(self) -> str:
+        counts = f"n_genes={len(self.gene_names)}, n_degs={self.n_degs}"
+        settings = f"method='{self.method}', pval<{self.pval_threshold}, |lfc|>{self.lfc_threshold}"
+        return f"DEGResult({counts}, {settings})"
+def _benjamini_hochberg(pvalues: np.ndarray) -> np.ndarray:
+    """Benjamini-Hochberg FDR adjustment of a vector of p-values.
+    NaN entries are excluded from the correction and stay NaN in the output.
+    Parameters
+    ----------
+    pvalues : np.ndarray
+        Raw p-values, possibly containing NaN
+    Returns
+    -------
+    np.ndarray
+        Adjusted p-values, in the same order as the input
+    """
+    if len(pvalues) == 0:
+        return pvalues
+    # Only non-NaN entries take part in the correction
+    observed = ~np.isnan(pvalues)
+    result = np.full_like(pvalues, np.nan)
+    if not observed.any():
+        return result
+    observed_p = pvalues[observed]
+    order = np.argsort(observed_p)
+    m = observed_p.size
+    # p_(i) * m / i for the i-th smallest p-value
+    scaled = observed_p[order] * m / np.arange(1, m + 1)
+    # Running minimum taken from the largest p-value downwards keeps the
+    # adjusted values non-decreasing in the raw p-value
+    monotone = np.minimum.accumulate(scaled[::-1])[::-1]
+    bounded = np.clip(monotone, 0, 1)
+    # Undo the sort, then scatter back into the NaN-padded output
+    restored = np.empty_like(bounded)
+    restored[order] = bounded
+    result[observed] = restored
+    return result
+def compute_degs_fast(
+    control: np.ndarray,
+    perturbed: np.ndarray,
+    gene_names: Optional[np.ndarray] = None,
+    method: DEGMethod = "welch",
+    pval_threshold: float = 0.05,
+    lfc_threshold: float = 0.5,
+    use_adjusted_pval: bool = True,
+    n_jobs: int = 1,
+) -> DEGResult:
+    """
+    Fast DEG detection using vectorized statistical tests.
+    Parameters
+    ----------
+    control : np.ndarray
+        Control expression matrix (n_samples_control, n_genes)
+    perturbed : np.ndarray
+        Perturbed expression matrix (n_samples_perturbed, n_genes)
+    gene_names : np.ndarray, optional
+        Gene names, one per column. If None, "Gene_<index>" names are generated.
+    method : str
+        Statistical test: "welch", "student", "wilcoxon", "logfc"
+    pval_threshold : float
+        P-value threshold for significance (ignored for "logfc")
+    lfc_threshold : float
+        Absolute log2 fold change threshold
+    use_adjusted_pval : bool
+        If True, use adjusted p-values (BH correction)
+    n_jobs : int
+        Number of parallel jobs (only for wilcoxon)
+    Returns
+    -------
+    DEGResult
+        DEG detection results. Log fold changes are computed as
+        log2((mean_perturbed + 1) / (mean_control + 1)); a gene whose
+        fold change is NaN (a group mean of -1 or below) is never
+        flagged as a DEG.
+    Raises
+    ------
+    ValueError
+        If ``method`` is unknown, if either matrix is not 2-D, if the two
+        matrices have a different number of genes, or if ``gene_names``
+        does not contain exactly one name per gene.
+    Examples
+    --------
+    >>> control = np.random.randn(100, 1000)  # 100 control cells, 1000 genes
+    >>> perturbed = control + np.random.randn(100, 1000) * 0.5  # Add noise
+    >>> perturbed[:, :50] += 2  # Make first 50 genes differentially expressed
+    >>> result = compute_degs_fast(control, perturbed, method="welch")
+    >>> print(f"Found {result.n_degs} DEGs")
+    """
+    # Reject an unknown method before any computation is done
+    if method not in ("welch", "student", "wilcoxon", "logfc"):
+        raise ValueError(f"Unknown method: {method}. Use 'welch', 'student', 'wilcoxon', or 'logfc'")
+    # Shape checks: without them a (n, 1) matrix would silently broadcast
+    # against a (m, n_genes) one and yield meaningless per-gene statistics
+    if control.ndim != 2 or perturbed.ndim != 2:
+        raise ValueError(
+            "control and perturbed must be 2-D (n_samples, n_genes) matrices, "
+            f"got shapes {control.shape} and {perturbed.shape}"
+        )
+    n_genes = control.shape[1]
+    if perturbed.shape[1] != n_genes:
+        raise ValueError(
+            f"control has {n_genes} genes but perturbed has {perturbed.shape[1]}"
+        )
+    # Gene names
+    if gene_names is None:
+        gene_names = np.array([f"Gene_{i}" for i in range(n_genes)])
+    else:
+        # Accept any sequence; DEGResult indexes the names with a boolean mask
+        gene_names = np.asarray(gene_names)
+        if gene_names.ndim != 1 or gene_names.shape[0] != n_genes:
+            raise ValueError(
+                f"gene_names must contain exactly {n_genes} names, "
+                f"got shape {gene_names.shape}"
+            )
+    # Compute means
+    mean_control = np.mean(control, axis=0)
+    mean_perturbed = np.mean(perturbed, axis=0)
+    # Log fold change with a pseudocount of 1 so zero means stay finite
+    eps = 1.0  # pseudocount
+    log_fold_changes = np.log2((mean_perturbed + eps) / (mean_control + eps))
+    # Compute p-values based on method
+    if method == "logfc":
+        # No statistical test, just fold change thresholding
+        pvalues = np.full(n_genes, np.nan)
+        pvalues_adj = pvalues.copy()
+    else:
+        if method == "welch":
+            pvalues = _welch_ttest_vectorized(control, perturbed)
+        elif method == "student":
+            pvalues = _student_ttest_vectorized(control, perturbed)
+        else:  # "wilcoxon" (method was validated above)
+            pvalues = _wilcoxon_vectorized(control, perturbed, n_jobs=n_jobs)
+        pvalues_adj = _benjamini_hochberg(pvalues)
+    # Determine significant DEGs
+    passes_lfc = np.abs(log_fold_changes) > lfc_threshold
+    if method == "logfc":
+        is_deg = passes_lfc
+    else:
+        pval_test = pvalues_adj if use_adjusted_pval else pvalues
+        is_deg = (pval_test < pval_threshold) & passes_lfc
+    return DEGResult(
+        gene_names=gene_names,
+        pvalues=pvalues,
+        pvalues_adj=pvalues_adj,
+        log_fold_changes=log_fold_changes,
+        mean_control=mean_control,
+        mean_perturbed=mean_perturbed,
+        is_deg=is_deg,
+        n_degs=int(np.sum(is_deg)),
+        method=method,
+        pval_threshold=pval_threshold,
+        lfc_threshold=lfc_threshold,
+    )
+def _welch_ttest_vectorized(x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    """
+    Two-sided Welch t-test for every gene (column) at once.
+    Parameters
+    ----------
+    x, y : np.ndarray
+        The two sample matrices, one row per sample and one column per gene
+    Returns
+    -------
+    np.ndarray
+        One p-value per gene; undefined (NaN) results are reported as 1.0
+    """
+    n_x, n_y = x.shape[0], y.shape[0]
+    # Variance of each group mean: unbiased sample variance / group size
+    var_mean_x = np.var(x, axis=0, ddof=1) / n_x
+    var_mean_y = np.var(y, axis=0, ddof=1) / n_y
+    mean_diff = np.mean(x, axis=0) - np.mean(y, axis=0)
+    combined = var_mean_x + var_mean_y
+    std_err = np.sqrt(combined)
+    # Zero-variance genes produce 0/0 or x/0 here; silence the warnings and
+    # deal with the resulting NaN/inf values below
+    with np.errstate(divide='ignore', invalid='ignore'):
+        t_stat = mean_diff / std_err
+        # Welch-Satterthwaite degrees of freedom
+        dof = combined ** 2 / (
+            var_mean_x ** 2 / (n_x - 1) + var_mean_y ** 2 / (n_y - 1)
+        )
+    # Keep dof at 1 or above and replace undefined values with 1
+    dof = np.nan_to_num(np.clip(dof, 1, np.inf), nan=1.0)
+    abs_t = np.abs(t_stat)
+    if HAS_SCIPY:
+        two_sided = 2 * stats.t.sf(abs_t, dof)
+    else:
+        # No scipy: fall back to a normal approximation of the t distribution
+        two_sided = 2 * (1 - _normal_cdf(abs_t))
+    return np.nan_to_num(two_sided, nan=1.0)
+def _student_ttest_vectorized(x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    """
+    Two-sided Student t-test (pooled variance) for every gene at once.
+    Parameters
+    ----------
+    x, y : np.ndarray
+        The two sample matrices, one row per sample and one column per gene
+    Returns
+    -------
+    np.ndarray
+        One p-value per gene; undefined (NaN) results are reported as 1.0
+    """
+    n_x, n_y = x.shape[0], y.shape[0]
+    dof = n_x + n_y - 2
+    mean_diff = np.mean(x, axis=0) - np.mean(y, axis=0)
+    # Pooled variance: the two unbiased variances weighted by their own dof
+    weighted_var = (n_x - 1) * np.var(x, axis=0, ddof=1) + (n_y - 1) * np.var(y, axis=0, ddof=1)
+    pooled = weighted_var / dof
+    std_err = np.sqrt(pooled * (1/n_x + 1/n_y))
+    # A zero standard error gives 0/0 or x/0; handled after the p-value step
+    with np.errstate(divide='ignore', invalid='ignore'):
+        t_stat = mean_diff / std_err
+    abs_t = np.abs(t_stat)
+    if HAS_SCIPY:
+        two_sided = 2 * stats.t.sf(abs_t, dof)
+    else:
+        # No scipy: fall back to a normal approximation of the t distribution
+        two_sided = 2 * (1 - _normal_cdf(abs_t))
+    return np.nan_to_num(two_sided, nan=1.0)
+def _wilcoxon_vectorized(
+    x: np.ndarray,
+    y: np.ndarray,
+    n_jobs: int = 1
+) -> np.ndarray:
+    """
+    Per-gene two-sided Wilcoxon rank-sum (Mann-Whitney U) test.
+    Runs one scipy ``mannwhitneyu`` call per gene, through joblib when it is
+    installed and ``n_jobs`` is not 1, otherwise in a plain loop. A gene for
+    which the test raises is assigned a p-value of 1.0.
+    Parameters
+    ----------
+    x, y : np.ndarray
+        The two sample matrices, one row per sample and one column per gene
+    n_jobs : int
+        Number of joblib workers; 1 selects the sequential path
+    Returns
+    -------
+    np.ndarray
+        One p-value per gene
+    Raises
+    ------
+    ImportError
+        If scipy is not installed
+    """
+    if not HAS_SCIPY:
+        raise ImportError("scipy is required for Wilcoxon test")
+    def _gene_pvalue(col):
+        # Best effort: a failing test counts as "not significant"
+        try:
+            return mannwhitneyu(x[:, col], y[:, col], alternative='two-sided')[1]
+        except Exception:
+            return 1.0
+    gene_columns = range(x.shape[1])
+    run_parallel = HAS_JOBLIB and n_jobs != 1
+    if not run_parallel:
+        return np.array([_gene_pvalue(col) for col in gene_columns], dtype=float)
+    collected = Parallel(n_jobs=n_jobs)(
+        delayed(_gene_pvalue)(col) for col in gene_columns
+    )
+    return np.array(collected)
+def _normal_cdf(x: np.ndarray) -> np.ndarray:
+    """Standard normal CDF, evaluated element-wise without scipy.
+    Uses the exact identity Phi(x) = 0.5 * (1 + erf(x / sqrt(2))) with the
+    standard-library error function. A tanh-based approximation is too
+    inaccurate in the tails for p-values: at |x| = 5 it makes the
+    two-sided tail probability about six times too small.
+    Parameters
+    ----------
+    x : np.ndarray
+        Points at which to evaluate the CDF (any shape, scalars accepted)
+    Returns
+    -------
+    np.ndarray
+        Float64 CDF values with the same shape as ``x``; NaN inputs give NaN
+    """
+    import math  # local import so the module's import block is untouched
+    # otypes lets np.vectorize handle empty inputs without probing math.erf
+    erf = np.vectorize(math.erf, otypes=[float])
+    z = np.asarray(x, dtype=float) / math.sqrt(2.0)
+    return 0.5 * (1.0 + erf(z))
+def compute_degs_gpu(
+    control: np.ndarray,
+    perturbed: np.ndarray,
+    gene_names: Optional[np.ndarray] = None,
+    method: DEGMethod = "welch",
+    pval_threshold: float = 0.05,
+    lfc_threshold: float = 0.5,
+    use_adjusted_pval: bool = True,
+    device: str = "cuda",
+) -> DEGResult:
+    """
+    GPU-accelerated DEG detection using PyTorch.
+    Means, variances and t-statistics are computed with torch in float32 on
+    the chosen device. P-values and the BH correction are computed on the
+    CPU with NumPy (and scipy, when installed).
+    Parameters
+    ----------
+    control : np.ndarray
+        Control expression matrix (n_samples_control, n_genes)
+    perturbed : np.ndarray
+        Perturbed expression matrix (n_samples_perturbed, n_genes)
+    gene_names : np.ndarray, optional
+        Gene names. If None, "Gene_<index>" names are generated.
+    method : str
+        "welch", "student" or "logfc" run on the device. "wilcoxon" is not
+        implemented on GPU and is delegated to ``compute_degs_fast`` with
+        a warning.
+    pval_threshold : float
+        P-value threshold for significance
+    lfc_threshold : float
+        Absolute log2 fold change threshold
+    use_adjusted_pval : bool
+        If True, use adjusted p-values (BH correction)
+    device : str
+        Torch device string: "cuda", "cuda:0", "mps", "cpu", or "auto" to
+        pick CUDA, then MPS, then CPU depending on availability.
+    Returns
+    -------
+    DEGResult
+        DEG detection results (all arrays are returned as NumPy arrays)
+    Raises
+    ------
+    ValueError
+        If ``method`` is not one of "welch", "student", "wilcoxon", "logfc"
+    """
+    # Validate up front: otherwise an unknown method would fall into the
+    # Student branch below and silently return Student's t-test results
+    if method not in ("welch", "student", "wilcoxon", "logfc"):
+        raise ValueError(f"Unknown method: {method}. Use 'welch', 'student', 'wilcoxon', or 'logfc'")
+    if not HAS_TORCH:
+        warnings.warn("PyTorch not available, falling back to CPU")
+        return compute_degs_fast(
+            control, perturbed, gene_names, method,
+            pval_threshold, lfc_threshold, use_adjusted_pval
+        )
+    if method == "wilcoxon":
+        warnings.warn("Wilcoxon test not supported on GPU, falling back to CPU")
+        return compute_degs_fast(
+            control, perturbed, gene_names, method,
+            pval_threshold, lfc_threshold, use_adjusted_pval
+        )
+    # Resolve "auto" to the best available backend
+    if device == "auto":
+        if torch.cuda.is_available():
+            device = "cuda"
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            device = "mps"
+        else:
+            device = "cpu"
+    torch_device = torch.device(device)
+    n_genes = control.shape[1]
+    # Gene names
+    if gene_names is None:
+        gene_names = np.array([f"Gene_{i}" for i in range(n_genes)])
+    # Move data to the device
+    x = torch.tensor(control, dtype=torch.float32, device=torch_device)
+    y = torch.tensor(perturbed, dtype=torch.float32, device=torch_device)
+    n1, n2 = x.shape[0], y.shape[0]
+    # Compute means
+    mean_control = x.mean(dim=0)
+    mean_perturbed = y.mean(dim=0)
+    # Log fold change (pseudocount of 1 keeps zero means finite)
+    eps = 1.0
+    log_fold_changes = torch.log2((mean_perturbed + eps) / (mean_control + eps))
+    if method == "logfc":
+        pvalues = torch.full((n_genes,), float('nan'), device=torch_device)
+        pvalues_adj = pvalues.clone()
+    else:
+        # Unbiased per-gene variances
+        var1 = x.var(dim=0, unbiased=True)
+        var2 = y.var(dim=0, unbiased=True)
+        if method == "welch":
+            # Welch's t-test
+            se = torch.sqrt(var1 / n1 + var2 / n2)
+            # The small offset makes zero-variance genes with equal means
+            # give t = 0 (p = 1) instead of 0/0
+            t_stat = (mean_control - mean_perturbed) / (se + 1e-10)
+            # Welch-Satterthwaite degrees of freedom
+            num = (var1 / n1 + var2 / n2) ** 2
+            denom = (var1 / n1) ** 2 / (n1 - 1) + (var2 / n2) ** 2 / (n2 - 1)
+            # Divide exactly where denom is positive. An additive epsilon
+            # would bias df, because denom is itself around 1e-10 for
+            # ordinary data (e.g. variance 0.01 with 100 samples)
+            df = torch.where(denom > 0, num / denom, torch.ones_like(num))
+            df = torch.clamp(df, min=1.0)
+        else:  # "student" (method was validated above)
+            # Student's t-test with pooled variance
+            pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
+            se = torch.sqrt(pooled_var * (1/n1 + 1/n2))
+            t_stat = (mean_control - mean_perturbed) / (se + 1e-10)
+            df = torch.full((n_genes,), n1 + n2 - 2, device=torch_device, dtype=torch.float32)
+        # P-values are computed on the CPU (scipy provides the t distribution)
+        t_stat_np = torch.abs(t_stat).cpu().numpy()
+        df_np = df.cpu().numpy()
+        if HAS_SCIPY:
+            pvalues_np = 2 * stats.t.sf(t_stat_np, df_np)
+        else:
+            # No scipy: normal approximation of the t distribution
+            pvalues_np = 2 * (1 - _normal_cdf(t_stat_np))
+        pvalues_np = np.nan_to_num(pvalues_np, nan=1.0).astype(np.float32)
+        pvalues_adj_np = _benjamini_hochberg(pvalues_np).astype(np.float32)
+        pvalues = torch.tensor(pvalues_np, device=torch_device, dtype=torch.float32)
+        pvalues_adj = torch.tensor(pvalues_adj_np, device=torch_device, dtype=torch.float32)
+    # Determine significant DEGs
+    lfc_abs = torch.abs(log_fold_changes)
+    if method == "logfc":
+        is_deg = lfc_abs > lfc_threshold
+    else:
+        pval_test = pvalues_adj if use_adjusted_pval else pvalues
+        is_deg = (pval_test < pval_threshold) & (lfc_abs > lfc_threshold)
+    # Move results back to NumPy on the CPU
+    return DEGResult(
+        gene_names=gene_names,
+        pvalues=pvalues.cpu().numpy(),
+        pvalues_adj=pvalues_adj.cpu().numpy(),
+        log_fold_changes=log_fold_changes.cpu().numpy(),
+        mean_control=mean_control.cpu().numpy(),
+        mean_perturbed=mean_perturbed.cpu().numpy(),
+        is_deg=is_deg.cpu().numpy(),
+        n_degs=int(is_deg.sum().item()),
+        method=method,
+        pval_threshold=pval_threshold,
+        lfc_threshold=lfc_threshold,
+    )
+def compute_degs_auto(
+    control: np.ndarray,
+    perturbed: np.ndarray,
+    gene_names: Optional[np.ndarray] = None,
+    method: DEGMethod = "welch",
+    pval_threshold: float = 0.05,
+    lfc_threshold: float = 0.5,
+    use_adjusted_pval: bool = True,
+    n_jobs: int = 1,
+    device: str = "auto",
+) -> DEGResult:
+    """
+    Dispatch DEG detection to the GPU or the CPU implementation.
+    Selection rules:
+    - ``device="cpu"``, or PyTorch not installed: CPU (``compute_degs_fast``).
+    - ``device="auto"``: GPU only when CUDA or MPS is available and the data
+      has more than 1000 genes or more than 1000 samples in total.
+    - any other ``device``: GPU path (``compute_degs_gpu``) on that device.
+    ``n_jobs`` is forwarded only to the CPU path. The remaining arguments
+    are passed through unchanged to whichever implementation is chosen.
+    """
+    n_genes = control.shape[1]
+    n_samples = control.shape[0] + perturbed.shape[0]
+    shared_args = (
+        control, perturbed, gene_names, method,
+        pval_threshold, lfc_threshold, use_adjusted_pval,
+    )
+    gpu_possible = device != "cpu" and HAS_TORCH
+    if gpu_possible and device == "auto":
+        accelerator_found = torch.cuda.is_available() or (
+            hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+        )
+        # GPU worthwhile for >1000 genes or >1000 samples
+        large_enough = n_genes > 1000 or n_samples > 1000
+        use_gpu = accelerator_found and large_enough
+    else:
+        # An explicitly named device is used as-is whenever torch is present
+        use_gpu = gpu_possible
+    if not use_gpu:
+        return compute_degs_fast(*shared_args, n_jobs=n_jobs)
+    return compute_degs_gpu(*shared_args, device=device)

gengeneeval 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

gengeneeval 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl