PyPI - edgepython - Versions diffs - 0.2.0__tar.gz → 0.2.1__tar.gz - Mend

edgepython 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

{edgepython-0.2.0 → edgepython-0.2.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: edgepython
-Version: 0.2.0
+Version: 0.2.1
 Summary: Python port of the edgeR Bioconductor package for differential expression analysis of digital gene expression data.
 Author: Lior Pachter
 License-Expression: GPL-3.0-or-later

{edgepython-0.2.0 → edgepython-0.2.1}/edgepython/__init__.py RENAMED Viewed

@@ -5,7 +5,7 @@ edgePython: Python port of the edgeR Bioconductor package.
 Empirical analysis of digital gene expression data in Python.
 """
-__version__ = "0.1.0"
+__version__ = "0.2.1"
 # --- Classes ---
 from .classes import DGEList, DGEExact, DGEGLM, DGELRT, TopTags
@@ -42,6 +42,7 @@ from .dispersion import (
     estimate_glm_common_disp,
     estimate_glm_trended_disp,
     estimate_glm_tagwise_disp,
+    estimate_glm_robust_disp,
 )
 # --- GLM fitting ---
@@ -112,3 +113,12 @@ from .sc_fit import glm_sc_fit, glm_sc_test, shrink_sc_disp
 # --- limma utilities ---
 from .limma_port import squeeze_var
+# --- voom ---
+from .voom_lmfit import (
+    voom,
+    voom_lmfit,
+    voom_basic,
+    array_weights,
+    duplicate_correlation,
+)

{edgepython-0.2.0 → edgepython-0.2.1}/edgepython/dispersion.py RENAMED Viewed

@@ -918,3 +918,155 @@ def estimate_glm_tagwise_disp(y, design=None, offset=None, dispersion=None,
         ave_log_cpm_vals=ave_log_cpm_vals, weights=weights)
     return tagwise
+def _calc_resid(fit, residual_type='pearson'):
+    """Compute GLM residual matrix for robust dispersion fitting.
+    Parameters
+    ----------
+    fit : mapping
+        Fitted GLM; the entries 'fitted.values', 'counts' and 'dispersion'
+        are read and converted to float64.
+    residual_type : str
+        'pearson', 'anscombe' or 'deviance' (compared case-insensitively).
+    Returns
+    -------
+    ndarray
+        Residuals, forced to 0 wherever the fitted mean is exactly 0.
+    Raises
+    ------
+    ValueError
+        If residual_type is not one of the three supported names.
+    """
+    residual_type = str(residual_type).lower()
+    if residual_type not in ('pearson', 'anscombe', 'deviance'):
+        raise ValueError("residual_type must be one of ('pearson', 'anscombe', 'deviance')")
+    mu = np.asarray(fit['fitted.values'], dtype=np.float64)
+    yi = np.asarray(fit['counts'], dtype=np.float64)
+    # The dispersion is passed through expand_as_matrix (helper defined elsewhere)
+    # together with mu.shape so it can be combined element-wise with mu below.
+    disp = expand_as_matrix(np.asarray(fit['dispersion'], dtype=np.float64), mu.shape)
+    if residual_type == 'pearson':
+        # (y - mu) / sqrt(var) with var = mu * (1 + disp * mu); the variance is
+        # floored at 1e-12 so a zero mean cannot cause a division by zero.
+        res = (yi - mu) / np.sqrt(np.maximum(mu * (1 + disp * mu), 1e-12))
+    elif residual_type == 'deviance':
+        # Counts are shifted by 1e-5 before the logs are taken; y, mu and disp
+        # are additionally floored at 1e-12 inside the expression.
+        y_adj = yi + 1e-5
+        with np.errstate(divide='ignore', invalid='ignore'):
+            r = 2 * (y_adj * np.log(np.maximum(y_adj, 1e-12) / np.maximum(mu, 1e-12)) +
+                     (y_adj + 1 / np.maximum(disp, 1e-12)) *
+                     np.log((mu + 1 / np.maximum(disp, 1e-12)) /
+                            (y_adj + 1 / np.maximum(disp, 1e-12))))
+        # Clip at 0 so the square root below never sees a negative value.
+        r = np.maximum(r, 0)
+        # The sign comes from the unshifted counts.
+        res = np.sign(yi - mu) * np.sqrt(r)
+    else:
+        # Numerical approximation to the Anscombe residual integral used by edgeR.
+        # quad is imported lazily, only when this branch is taken.
+        from scipy.integrate import quad
+        def _anscombe_scalar(yv, muv, dv):
+            # Non-positive mean or count short-circuits to 0 without integrating.
+            # NOTE(review): confirm that returning 0 for zero counts matches the
+            # edgeR routine this is ported from.
+            if muv <= 0 or yv <= 0:
+                return 0.0
+            # Integrand: x -> (x * (1 + dv * x)) ** (-1/3).
+            def ffun(x):
+                return (x * (1 + dv * x)) ** (-1.0 / 3.0)
+            # Scaling constant: square root of the integrand evaluated at the mean.
+            const = ffun(muv) ** 0.5
+            if yv == muv:
+                return 0.0
+            # Integrate from the fitted mean to the observed count.
+            val, _ = quad(ffun, muv, yv, limit=50)
+            return const * val
+        res = np.zeros_like(yi, dtype=np.float64)
+        # One quad call per matrix entry in a Python double loop (yi is indexed
+        # as 2-D here); the cost grows with rows x columns.
+        for g in range(yi.shape[0]):
+            for s in range(yi.shape[1]):
+                res[g, s] = _anscombe_scalar(yi[g, s], mu[g, s], disp[g, s])
+    # Applied for every residual type: a zero fitted mean yields a zero residual.
+    res[mu == 0] = 0
+    return res
+def _psi_huber_matrix(u, k=1.345):
+    """Huber psi weights on a residual matrix.
+    Returns a float64 array shaped like u holding weight 1 where |u| <= k
+    and k / |u| where |u| > k.
+    """
+    u = np.asarray(u, dtype=np.float64)
+    # Start from weight 1 everywhere; only entries beyond the threshold change.
+    out = np.ones_like(u, dtype=np.float64)
+    # NaN residuals compare False here and therefore keep weight 1.
+    mask = np.abs(u) > k
+    out[mask] = k / np.abs(u[mask])
+    # Safety net: any non-finite weight is reset to 1.
+    out[~np.isfinite(out)] = 1.0
+    return out
+def _record_robust_disp_state(y, i, res=None, weights=None, fit=None):
+    """Store per-iteration state for estimate_glm_robust_disp(record=True).
+    Parameters
+    ----------
+    y : mapping
+        Object whose 'AveLogCPM', 'trended.dispersion' and
+        'tagwise.dispersion' entries are snapshotted; its 'record' entry is
+        created on first use and reassigned on every call.
+    i : int
+        Iteration number; snapshots are stored under the key 'iteration_<i>'.
+    res, weights : array-like, optional
+        Residuals and weights to snapshot; skipped when None.
+    fit : mapping, optional
+        When given and it has 'fitted.values', those are stored under 'mu'.
+    Returns
+    -------
+    The same y object, with y['record'] updated.
+    """
+    key = f'iteration_{i}'
+    rec = y.get('record')
+    # First call: create one empty per-iteration dict for each tracked quantity.
+    if rec is None:
+        rec = {
+            'AveLogCPM': {},
+            'trended.dispersion': {},
+            'tagwise.dispersion': {},
+            'weights': {},
+            'res': {},
+            'mu': {}
+        }
+    # Every snapshot is a copy, so later changes to y do not alter the record.
+    if y.get('AveLogCPM') is not None:
+        rec['AveLogCPM'][key] = np.asarray(y['AveLogCPM']).copy()
+    if y.get('trended.dispersion') is not None:
+        rec['trended.dispersion'][key] = np.asarray(y['trended.dispersion']).copy()
+    if y.get('tagwise.dispersion') is not None:
+        rec['tagwise.dispersion'][key] = np.asarray(y['tagwise.dispersion']).copy()
+    if weights is not None:
+        rec['weights'][key] = np.asarray(weights).copy()
+    if res is not None:
+        rec['res'][key] = np.asarray(res).copy()
+    if fit is not None and fit.get('fitted.values') is not None:
+        rec['mu'][key] = np.asarray(fit['fitted.values']).copy()
+    y['record'] = rec
+    return y
+def estimate_glm_robust_disp(y, design=None, prior_df=10, update_trend=True,
+                             trend_method='bin.loess', maxit=6, k=1.345,
+                             residual_type='pearson', verbose=False,
+                             record=False):
+    """Robust GLM dispersion estimation via iterative Huber reweighting.
+    Port of edgeR's estimateGLMRobustDisp.
+    Parameters
+    ----------
+    y : dict
+        DGEList-like dict; must contain 'counts'.
+    design : optional
+        Design matrix, passed through _resolve_design together with y.
+    prior_df : number
+        Forwarded to estimate_glm_tagwise_disp.
+    update_trend : bool
+        When true, the trended dispersion is re-estimated in every iteration.
+    trend_method : str
+        Forwarded as ``method`` to estimate_glm_trended_disp.
+    maxit : int
+        Number of reweighting iterations; converted with int().
+    k : float
+        Huber threshold handed to _psi_huber_matrix.
+    residual_type : str
+        Residual type handed to _calc_resid.
+    verbose : bool
+        Print a progress message for each step.
+    record : bool
+        Store per-iteration state in y['record'].
+    Returns
+    -------
+    The DGEList-like object returned by the last estimation call, with
+    'weights' and 'AveLogCPM' set by this function.
+    Raises
+    ------
+    ValueError
+        If y is not a dict containing 'counts'.
+    """
+    from .utils import _resolve_design
+    # NOTE(review): the design is resolved before y is type-checked below, so an
+    # invalid y reaches _resolve_design first.
+    design = _resolve_design(design, y)
+    if not (isinstance(y, dict) and 'counts' in y):
+        raise ValueError("Input must be a DGEList-like dict with 'counts'.")
+    from .dgelist import valid_dgelist
+    y = valid_dgelist(y)
+    # Observation weights always start at 1; any 'weights' already on y are overwritten.
+    y['weights'] = np.ones_like(np.asarray(y['counts'], dtype=np.float64), dtype=np.float64)
+    # Initial trended / tagwise estimates are computed only when y lacks them.
+    if y.get('trended.dispersion') is None:
+        y = estimate_glm_trended_disp(y, design=design, method=trend_method,
+                                      weights=y['weights'])
+    if y.get('tagwise.dispersion') is None:
+        y = estimate_glm_tagwise_disp(y, design=design, prior_df=prior_df,
+                                      weights=y['weights'])
+    # Iteration 0 records the starting state (no residuals or fit yet).
+    if record:
+        y = _record_robust_disp_state(y, i=0, weights=y['weights'])
+    from .glm_fit import glm_fit
+    # Fixed number of passes: there is no convergence test, the loop always
+    # runs int(maxit) times.
+    for i in range(1, int(maxit) + 1):
+        if verbose:
+            print(f"Iteration {i}: Re-fitting GLM.")
+        # presumably glm_fit picks up the dispersion and weights stored on y —
+        # TODO confirm against glm_fit.
+        fit = glm_fit(y, design=design, prior_count=0)
+        res = _calc_resid(fit, residual_type=residual_type)
+        # Huber weights derived from the residuals replace the previous weights.
+        y['weights'] = _psi_huber_matrix(res, k=k)
+        y['AveLogCPM'] = ave_log_cpm(y, dispersion=y.get('trended.dispersion'))
+        if update_trend:
+            if verbose:
+                print("Re-estimating trended dispersion.")
+            y = estimate_glm_trended_disp(y, design=design, method=trend_method,
+                                          weights=y['weights'])
+        if verbose:
+            print("Re-estimating tagwise dispersion.")
+        y = estimate_glm_tagwise_disp(y, design=design, prior_df=prior_df,
+                                      weights=y['weights'])
+        if record:
+            y = _record_robust_disp_state(y, i=i, res=res,
+                                          weights=y['weights'], fit=fit)
+    return y
+def estimateGLMRobustDisp(*args, **kwargs):
+    """Compatibility alias for edgeR-style camelCase naming.
+    Forwards all positional and keyword arguments unchanged to
+    estimate_glm_robust_disp and returns its result.
+    """
+    return estimate_glm_robust_disp(*args, **kwargs)

{edgepython-0.2.0 → edgepython-0.2.1}/edgepython/dispersion_lowlevel.py RENAMED Viewed

@@ -20,6 +20,27 @@ from .expression import ave_log_cpm
 from .limma_port import is_fullrank
+def _cox_reid_adjust_from_xtwx(XtWX):
+    """Return -0.5 * log|XtWX| using LDL, matching edgeR's C path.
+    Parameters
+    ----------
+    XtWX : array-like
+        Either a single 2-D matrix or a 3-D stack indexed by gene along the
+        first axis; a 2-D input is treated as a stack of one.
+    Returns
+    -------
+    ndarray
+        1-D float64 array with one adjustment value per matrix in the stack.
+    """
+    from scipy.linalg import ldl
+    A = np.asarray(XtWX, dtype=np.float64)
+    # Promote a single matrix to a stack of length one.
+    if A.ndim == 2:
+        A = A[None, :, :]
+    ngenes = A.shape[0]
+    out = np.zeros(ngenes, dtype=np.float64)
+    # One factorization per gene, executed in a Python loop.
+    for g in range(ngenes):
+        # edgeR's C code uses LAPACK Bunch-Kaufman factorization (dsytrf)
+        # and sums half log diagonal terms with clipping; LDL is the same
+        # symmetric-indefinite factorization family.
+        _, dmat, _ = ldl(A[g], lower=True, hermitian=True)
+        # Only the main diagonal of D is used, in absolute value.
+        # NOTE(review): scipy documents D as block diagonal (blocks up to 2x2);
+        # confirm off-diagonal entries cannot occur for the matrices passed here.
+        diag = np.abs(np.diag(dmat))
+        # Floor at 1e-10 so the log stays finite for (near-)singular matrices.
+        diag = np.where(diag > 1e-10, diag, 1e-10)
+        out[g] = -0.5 * np.sum(np.log(diag))
+    return out
 def adjusted_profile_lik_grid(grid_dispersions, y, design, offset, weights=None):
     """Evaluate APL at multiple dispersion grid points efficiently.
@@ -127,20 +148,45 @@ def adjusted_profile_lik_grid(grid_dispersions, y, design, offset, weights=None)
         XtWX = np.einsum('gj,jk,jl->gkl', working_w, design, design)
-        if ncoefs == 1:
-            logdet = np.log(np.maximum(XtWX[:, 0, 0], 1e-300))
-        elif ncoefs == 2:
-            det = XtWX[:, 0, 0] * XtWX[:, 1, 1] - XtWX[:, 0, 1] ** 2
-            logdet = np.log(np.maximum(det, 1e-300))
-        else:
-            sign, logdet = np.linalg.slogdet(XtWX)
-            logdet = np.where(sign > 0, logdet, 0.0)
-        apl[:, gi] = ll - 0.5 * logdet
+        apl[:, gi] = ll + _cox_reid_adjust_from_xtwx(XtWX)
     return apl
+def _apl_sum_oneway_scalar(dispersion, y, design, offset, w, group_cols, lgamma_y1):
+    """Fast sum of Cox-Reid adjusted profile log-likelihood for one-way designs.
+    Parameters
+    ----------
+    dispersion : scalar
+        Single dispersion value; floored at 1e-300.
+    y, offset, w : array-like
+        Counts, offsets and weights, converted to float64; y is unpacked as
+        2-D and the three are sliced with the same column indices.
+    design : ndarray
+        Design matrix used to build XtWX.
+    group_cols : iterable
+        One collection of column indices per group.
+    lgamma_y1 : ndarray
+        Term subtracted in the log-likelihood; presumably gammaln(y + 1)
+        precomputed by the caller — verify against caller.
+    Returns
+    -------
+    float
+        Sum over genes of the weighted log-likelihood plus the Cox-Reid
+        adjustment.
+    """
+    from .glm_fit import mglm_one_group
+    y = np.asarray(y, dtype=np.float64)
+    offset = np.asarray(offset, dtype=np.float64)
+    w = np.asarray(w, dtype=np.float64)
+    # ngenes and ncoefs are assigned here but not used further in this function.
+    ngenes, _ = y.shape
+    ncoefs = design.shape[1]
+    d = float(max(dispersion, 1e-300))
+    mu = np.empty_like(y, dtype=np.float64)
+    # Fit each group independently via one-group Fisher scoring.
+    for cols in group_cols:
+        y_g = y[:, cols]
+        off_g = offset[:, cols]
+        w_g = w[:, cols]
+        disp_g = np.full_like(y_g, d)
+        b = mglm_one_group(y_g, dispersion=disp_g, offset=off_g, weights=w_g)
+        # b holds one coefficient per row; the linear predictor is clipped to
+        # [-500, 500] before exponentiation to avoid overflow.
+        mu[:, cols] = np.exp(np.clip(b[:, None] + off_g, -500, 500))
+    # Floor the means so the logs below stay finite.
+    mu_safe = np.maximum(mu, 1e-300)
+    r = 1.0 / d
+    # Weighted negative binomial log-likelihood with size r = 1 / d, summed
+    # over the columns of each row.
+    ll = np.sum(w * (gammaln(y + r) - gammaln(r) - lgamma_y1
+                + r * np.log(r) + y * np.log(mu_safe)
+                - (r + y) * np.log(r + mu_safe)), axis=1)
+    # Working weights w * mu / (1 + d * mu), floored at 1e-300.
+    working_w = np.maximum(w * mu_safe / (1.0 + d * mu_safe), 1e-300)
+    # XtWX[g, k, l] = sum_j working_w[g, j] * design[j, k] * design[j, l]
+    XtWX = np.einsum('gj,jk,jl->gkl', working_w, design, design)
+    return float(np.sum(ll + _cox_reid_adjust_from_xtwx(XtWX)))
 def adjusted_profile_lik(dispersion, y, design, offset, weights=None,
                          start=None, get_coef=False):
     """Tagwise Cox-Reid adjusted profile log-likelihoods for the dispersion.
@@ -195,12 +241,23 @@ def adjusted_profile_lik(dispersion, y, design, offset, weights=None,
     else:
         w = np.ones_like(y)
-    # Fit GLM to get mu
-    from .glm_fit import glm_fit
-    fit = glm_fit(y, design=design, dispersion=disp, offset=offset,
-                  weights=weights, prior_count=0, start=start)
-    mu = fit['fitted.values']
-    beta = fit.get('unshrunk.coefficients', fit['coefficients'])
+    # Fit GLM to get mu.
+    # Fast path for one-way designs avoids glm_fit bookkeeping overhead.
+    from .glm_fit import glm_fit, mglm_one_way
+    from .utils import design_as_factor
+    group = design_as_factor(design)
+    is_oneway = (len(np.unique(group)) == ncoefs)
+    if is_oneway:
+        fit = mglm_one_way(y, design=design, group=group, dispersion=disp,
+                           offset=offset, weights=weights, coef_start=start)
+        mu = fit['fitted.values']
+        beta = fit['coefficients']
+    else:
+        fit = glm_fit(y, design=design, dispersion=disp, offset=offset,
+                      weights=weights, prior_count=0, start=start)
+        mu = fit['fitted.values']
+        beta = fit.get('unshrunk.coefficients', fit['coefficients'])
     # Compute adjusted profile log-likelihood for all genes (vectorized)
     mu_safe = np.maximum(mu, 1e-300)  # (ngenes, nlibs)
@@ -233,17 +290,7 @@ def adjusted_profile_lik(dispersion, y, design, offset, weights=None,
     # XtWX[g, k, l] = sum_j working_w[g,j] * design[j,k] * design[j,l]
     XtWX = np.einsum('gj,jk,jl->gkl', working_w, design, design)  # (ngenes, ncoefs, ncoefs)
-    # Log determinant for all genes
-    if ncoefs == 1:
-        logdet = np.log(np.maximum(XtWX[:, 0, 0], 1e-300))
-    elif ncoefs == 2:
-        det = XtWX[:, 0, 0] * XtWX[:, 1, 1] - XtWX[:, 0, 1] ** 2
-        logdet = np.log(np.maximum(det, 1e-300))
-    else:
-        sign, logdet = np.linalg.slogdet(XtWX)
-        logdet = np.where(sign > 0, logdet, 0.0)
-    cr_adj = -0.5 * logdet
+    cr_adj = _cox_reid_adjust_from_xtwx(XtWX)
     apl = ll + cr_adj
     if get_coef:
@@ -555,10 +602,31 @@ def disp_cox_reid(y, design=None, offset=None, weights=None, ave_log_cpm_vals=No
         if weights is not None and weights.ndim == 2:
             weights = weights[i]
-    # Function to optimize
-    def fun(par):
-        disp = par ** 4
-        return -np.sum(adjusted_profile_lik(disp, y, design, offset, weights=weights))
+    # Function to optimize.
+    # Fast path: one-way designs can evaluate APL sum without generic glm_fit overhead.
+    from .utils import design_as_factor
+    group = design_as_factor(design)
+    is_oneway = len(np.unique(group)) == design.shape[1]
+    if weights is None:
+        w = np.ones_like(y)
+    else:
+        w = np.asarray(weights, dtype=np.float64)
+        if w.ndim == 1:
+            w = np.tile(w, (y.shape[0], 1))
+    if is_oneway:
+        unique_groups = np.unique(group)
+        group_cols = [np.where(group == grp)[0] for grp in unique_groups]
+        lgamma_y1 = gammaln(y + 1)
+        def fun(par):
+            disp = par ** 4
+            return -_apl_sum_oneway_scalar(disp, y, design, offset, w, group_cols, lgamma_y1)
+    else:
+        def fun(par):
+            disp = par ** 4
+            return -np.sum(adjusted_profile_lik(disp, y, design, offset, weights=weights))
     # Optimize
     lo = interval[0] ** 0.25
@@ -930,12 +998,24 @@ def disp_bin_trend(y, design=None, offset=None, df=5, span=0.3,
             bin_d[i - 1] = 0.1
         bin_a[i - 1] = np.mean(bin_ave)
-    # If few bins, use linear interpolation
+    # If few bins, use linear interpolation with R's approxfun(rule=2, ties=mean)
+    # behavior: average duplicate x values and clamp to boundary values outside range.
     if nbins < 7:
-        from scipy.interpolate import interp1d
-        f = interp1d(bin_a, np.sqrt(np.maximum(bin_d, 0)),
-                     fill_value='extrapolate', kind='linear')
-        dispersion = f(ave_log_cpm_vals) ** 2
+        x = np.asarray(bin_a, dtype=np.float64)
+        yv = np.sqrt(np.maximum(np.asarray(bin_d, dtype=np.float64), 0))
+        order = np.argsort(x)
+        x = x[order]
+        yv = yv[order]
+        # ties=mean
+        xu, inv = np.unique(x, return_inverse=True)
+        yu = np.zeros_like(xu, dtype=np.float64)
+        cnt = np.zeros_like(xu, dtype=np.float64)
+        for i, idx in enumerate(inv):
+            yu[idx] += yv[i]
+            cnt[idx] += 1.0
+        yu = yu / np.maximum(cnt, 1.0)
+        y_interp = np.interp(ave_log_cpm_vals, xu, yu, left=yu[0], right=yu[-1])
+        dispersion = np.maximum(y_interp ** 2, 0)
         return {'AveLogCPM': ave_log_cpm_vals, 'dispersion': dispersion,
                 'bin.AveLogCPM': bin_a, 'bin.dispersion': bin_d}

edgepython 0.2.0__tar.gz → 0.2.1__tar.gz

edgepython 0.2.0__tar.gz → 0.2.1__tar.gz