PyPI - DeConveil - Versions diffs - 0.1.0__py3-none-any.whl - Mend

DeConveil 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

DeConveil/__init__.py +7 -0
DeConveil/dds.py +1279 -0
DeConveil/default_inference.py +284 -0
DeConveil/ds.py +758 -0
DeConveil/grid_search.py +195 -0
DeConveil/inference.py +373 -0
DeConveil/utils_CNaware.py +809 -0
DeConveil-0.1.0.dist-info/LICENSE +21 -0
DeConveil-0.1.0.dist-info/METADATA +35 -0
DeConveil-0.1.0.dist-info/RECORD +12 -0
DeConveil-0.1.0.dist-info/WHEEL +5 -0
DeConveil-0.1.0.dist-info/top_level.txt +1 -0

DeConveil/grid_search.py ADDED Viewed

@@ -0,0 +1,195 @@
+from typing import Optional
+import numpy as np
+from scipy.special import gammaln  # type: ignore
+from deconveil import utils_CNaware
def _vec_nb_nll(counts: np.ndarray, mu: np.ndarray, alpha: float) -> np.ndarray:
    """Return the negative binomial negative log-likelihood per column of ``mu``.

    The dispersion parameterisation is used (variance ``mu + alpha * mu**2``),
    i.e. the NB "size" is ``1 / alpha``.

    Parameters
    ----------
    counts : ndarray
        Observed counts, shape ``(n_samples,)``.
    mu : ndarray
        Candidate means, shape ``(n_samples, n_candidates)``; must be positive.
    alpha : float
        Dispersion shared by all samples.

    Returns
    -------
    ndarray
        Negative log-likelihood summed over samples, shape ``(n_candidates,)``.
    """
    inv_alpha = 1 / alpha
    col_counts = counts[:, None]
    # log of the NB normalising term Gamma(y + r) / (Gamma(y + 1) * Gamma(r)).
    log_binom = (
        gammaln(col_counts + inv_alpha)
        - gammaln(col_counts + 1)
        - gammaln(inv_alpha)
    )
    per_sample = (
        -log_binom
        + (col_counts + inv_alpha) * np.log(mu + inv_alpha)
        - col_counts * np.log(mu)
    )
    # The "- r * log(r)" term is identical for every sample, hence the factor n.
    return len(counts) * inv_alpha * np.log(alpha) + per_sample.sum(0)


def grid_fit_beta(
    counts: np.ndarray,
    size_factors: np.ndarray,
    design_matrix: np.ndarray,
    disp: float,
    cnv: np.ndarray,
    min_mu: float = 0.5,
    grid_length: int = 60,
    min_beta: float = -30,
    max_beta: float = 30,
) -> np.ndarray:
    """Find the best pair of GLM coefficients with a two-stage 2D grid search.

    A coarse ``grid_length x grid_length`` grid over
    ``[min_beta, max_beta]^2`` is scanned first; a second grid of the same
    size, centred on the coarse optimum and spanning one coarse step in each
    direction, refines the estimate. The minimised objective is the negative
    binomial negative log-likelihood of ``counts`` under the mean
    ``max(cnv * size_factors * exp(design_matrix @ beta), min_mu)`` plus a
    tiny ridge penalty ``0.5 * 1e-6 * ||beta||^2``.

    Parameters
    ----------
    counts : ndarray
        Raw counts for a given gene, shape ``(n_samples,)``.
    size_factors : ndarray
        Sample-wise normalization factors, shape ``(n_samples,)``.
    design_matrix : ndarray
        Design matrix. It must have exactly two columns, because the search
        is performed over two coefficients.
    disp : float
        Dispersion of the gene.
    cnv : ndarray
        Copy-number factor multiplying the expected mean. A 1-D array is
        treated as one value per sample; any other shape is used as given and
        must broadcast against an ``(n_samples, 1)`` column.
    min_mu : float
        Lower threshold applied to the fitted means, for numerical stability.
        (default: ``0.5``).
    grid_length : int
        Number of grid points per axis. (default: ``60``).
    min_beta : float
        Lower bound of the coarse grid. (default: ``-30``).
    max_beta : float
        Upper bound of the coarse grid. (default: ``30``).

    Returns
    -------
    ndarray
        The two fitted coefficients, shape ``(2,)``. Because the refinement
        grid extends one coarse step around the coarse optimum, the result
        may lie up to one coarse step outside ``[min_beta, max_beta]``.
    """
    coarse_x = np.linspace(min_beta, max_beta, grid_length)
    coarse_y = np.linspace(min_beta, max_beta, grid_length)

    # A 1-D ``cnv`` multiplied by the column-shaped size factors would
    # broadcast to an (n_samples, n_samples) outer product; make it a column.
    cnv_factor = np.asarray(cnv)[:, None] if np.ndim(cnv) == 1 else cnv
    # Loop-invariant part of the mean, computed once.
    scale = cnv_factor * size_factors[:, None]

    def loss(beta: np.ndarray) -> np.ndarray:
        # ``beta`` holds one candidate per row, shape (n_candidates, 2).
        mu = np.maximum(scale * np.exp(design_matrix @ beta.T), min_mu)
        return _vec_nb_nll(counts, mu, disp) + 0.5 * (1e-6 * beta**2).sum(1)

    def argmin_on_grid(x_values: np.ndarray, y_values: np.ndarray):
        # Evaluate one full row of the grid per call to ``loss``.
        ll_grid = np.empty((grid_length, grid_length))
        for i, x in enumerate(x_values):
            ll_grid[i, :] = loss(np.array([[x, y] for y in y_values]))
        return np.unravel_index(np.argmin(ll_grid, axis=None), ll_grid.shape)

    best_i, best_j = argmin_on_grid(coarse_x, coarse_y)

    delta = coarse_x[1] - coarse_x[0]
    fine_x = np.linspace(
        coarse_x[best_i] - delta, coarse_x[best_i] + delta, grid_length
    )
    fine_y = np.linspace(
        coarse_y[best_j] - delta, coarse_y[best_j] + delta, grid_length
    )

    best_i, best_j = argmin_on_grid(fine_x, fine_y)
    return np.array([fine_x[best_i], fine_y[best_j]])
def grid_fit_shrink_beta(
    counts: np.ndarray,
    cnv: np.ndarray,
    offset: np.ndarray,
    design_matrix: np.ndarray,
    size: np.ndarray,
    prior_no_shrink_scale: float,
    prior_scale: float,
    scale_cnst: float,
    grid_length: int = 60,
    min_beta: float = -30,
    max_beta: float = 30,
) -> np.ndarray:
    """Find the best pair of coefficients for the shrinkage objective.

    Runs a two-stage 2D grid search that minimises
    ``utils_CNaware.nbinomFn(...) / scale_cnst``: a coarse
    ``grid_length x grid_length`` grid over ``[min_beta, max_beta]^2``,
    followed by a grid of the same size spanning one coarse step on each side
    of the coarse optimum.

    Parameters
    ----------
    counts : ndarray
        Raw counts for a given gene.
    cnv : ndarray
        Copy-number values, forwarded unchanged to ``utils_CNaware.nbinomFn``.
    offset : ndarray
        Natural logarithm of size factor.
    design_matrix : ndarray
        Design matrix.
    size : ndarray
        Size parameter of NB family (inverse of dispersion).
    prior_no_shrink_scale : float
        Prior variance for the intercept.
    prior_scale : float
        Prior variance for the LFC coefficient.
    scale_cnst : float
        Scaling factor the objective is divided by.
    grid_length : int
        Number of grid points per axis. (default: ``60``).
    min_beta : float
        Lower bound of the coarse grid. (default: ``-30``).
    max_beta : float
        Upper bound of the coarse grid. (default: ``30``).

    Returns
    -------
    ndarray
        The two coefficients at the optimum of the refined grid, shape ``(2,)``.
    """

    def objective(b0: float, b1: float) -> float:
        # Scaled objective evaluated at a single candidate (b0, b1).
        raw = utils_CNaware.nbinomFn(
            np.array([b0, b1]),
            design_matrix,
            counts,
            cnv,
            size,
            offset,
            prior_no_shrink_scale,
            prior_scale,
        )
        return raw / scale_cnst

    def best_cell(first_axis: np.ndarray, second_axis: np.ndarray):
        # Fill the whole surface in row-major order, then locate its minimum.
        surface = np.zeros((grid_length, grid_length))
        for row, b0 in enumerate(first_axis):
            for col, b1 in enumerate(second_axis):
                surface[row, col] = objective(b0, b1)
        return np.unravel_index(np.argmin(surface, axis=None), surface.shape)

    coarse_first = np.linspace(min_beta, max_beta, grid_length)
    coarse_second = np.linspace(min_beta, max_beta, grid_length)
    row, col = best_cell(coarse_first, coarse_second)

    # The refinement window is one coarse step wide on each side.
    step = coarse_first[1] - coarse_first[0]
    centre_first = coarse_first[row]
    centre_second = coarse_second[col]
    fine_first = np.linspace(centre_first - step, centre_first + step, grid_length)
    fine_second = np.linspace(
        centre_second - step, centre_second + step, grid_length
    )

    row, col = best_cell(fine_first, fine_second)
    return np.array([fine_first[row], fine_second[col]])

DeConveil/inference.py ADDED Viewed

@@ -0,0 +1,373 @@
+from abc import ABC
+from abc import abstractmethod
+from typing import Literal
+from typing import Optional
+from typing import Tuple
+import numpy as np
+import pandas as pd
class Inference(ABC):
    """Abstract interface for the DESeq2-related inference routines.

    Every method is abstract: a concrete backend must implement all of them
    before it can be instantiated. The docstrings describe the contract the
    implementations are expected to honour.
    """

    @abstractmethod
    def lin_reg_mu(
        self,
        counts: np.ndarray,
        size_factors: np.ndarray,
        design_matrix: np.ndarray,
        min_mu: float,
    ) -> np.ndarray:
        """Estimate mean of negative binomial model using a linear regression.

        Used to initialize genewise dispersion models.

        Parameters
        ----------
        counts : ndarray
            Raw counts.
        size_factors : ndarray
            Sample-wise scaling factors (obtained from median-of-ratios).
        design_matrix : ndarray
            Design matrix.
        min_mu : float
            Lower threshold for fitted means, for numerical stability.
            Required: no default is defined at the interface level.

        Returns
        -------
        ndarray
            Estimated mean.
        """

    @abstractmethod
    def irls_glm(
        self,
        counts: np.ndarray,
        size_factors: np.ndarray,
        design_matrix: np.ndarray,
        disp: np.ndarray,
        cnv: np.ndarray,
        min_mu: float,
        beta_tol: float,
        min_beta: float = -30,
        max_beta: float = 30,
        optimizer: Literal["BFGS", "L-BFGS-B"] = "L-BFGS-B",
        maxiter: int = 250,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        r"""Fit a NB GLM with log-link to predict counts from the design matrix.

        See equations (1-2) in the DESeq2 paper.

        Parameters
        ----------
        counts : ndarray
            Raw counts.
        size_factors : ndarray
            Sample-wise scaling factors (obtained from median-of-ratios).
        design_matrix : ndarray
            Design matrix.
        disp : ndarray
            Gene-wise dispersion prior.
        cnv : ndarray
            Copy-number information for the fitted counts.
            NOTE(review): presumably scales the expected mean alongside the
            size factors; confirm against the implementing class.
        min_mu : float
            Lower bound on estimated means, to ensure numerical stability.
            Required: no default is defined at the interface level.
        beta_tol : float
            Stopping criterion for IRWLS:
            :math:`\vert dev - dev_{old}\vert / \vert dev + 0.1 \vert < \beta_{tol}`.
            Required: no default is defined at the interface level.
        min_beta : float
            Lower-bound on LFC. (default: ``-30``).
        max_beta : float
            Upper-bound on LFC. (default: ``30``).
        optimizer : str
            Optimizing method to use in case IRLS starts diverging.
            Accepted values: 'BFGS' or 'L-BFGS-B'.
            NB: only 'L-BFGS-B' ensures that LFCs will
            lie in the [min_beta, max_beta] range. (default: ``'L-BFGS-B'``).
        maxiter : int
            Maximum number of IRLS iterations to perform before switching to
            the fallback optimizer. (default: ``250``).

        Returns
        -------
        beta: ndarray
            Fitted (basemean, lfc) coefficients of negative binomial GLM.
        mu: ndarray
            Means estimated from size factors and beta:
            :math:`\mu = s_{ij} \exp(\beta^t X)`.
        H: ndarray
            Diagonal of the :math:`W^{1/2} X (X^t W X)^{-1} X^t W^{1/2}`
            covariance matrix.
        converged: ndarray
            Whether IRLS or the optimizer converged. If not and if dimension
            allows it, perform grid search.
        """

    @abstractmethod
    def alpha_mle(
        self,
        counts: np.ndarray,
        design_matrix: np.ndarray,
        mu: np.ndarray,
        alpha_hat: np.ndarray,
        min_disp: float,
        max_disp: float,
        prior_disp_var: Optional[float] = None,
        cr_reg: bool = True,
        prior_reg: bool = False,
        optimizer: Literal["BFGS", "L-BFGS-B"] = "L-BFGS-B",
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Estimate the dispersion parameter of a negative binomial GLM.

        Parameters
        ----------
        counts : ndarray
            Raw counts.
        design_matrix : ndarray
            Design matrix.
        mu : ndarray
            Mean estimation for the NB model.
        alpha_hat : ndarray
            Initial dispersion estimate.
        min_disp : float
            Lower threshold for dispersion parameters.
        max_disp : float
            Upper threshold for dispersion parameters.
        prior_disp_var : float, optional
            Prior dispersion variance. (default: ``None``).
        cr_reg : bool
            Whether to use Cox-Reid regularization. (default: ``True``).
        prior_reg : bool
            Whether to use prior log-residual regularization.
            (default: ``False``).
        optimizer : str
            Optimizing method to use. Accepted values: 'BFGS' or 'L-BFGS-B'.
            (default: ``'L-BFGS-B'``).

        Returns
        -------
        ndarray
            Dispersion estimate.
        ndarray
            Whether L-BFGS-B converged. If not, dispersion is estimated
            using grid search.
        """

    @abstractmethod
    def fit_rough_dispersions(
        self, normed_counts: np.ndarray, design_matrix: np.ndarray
    ) -> np.ndarray:
        """'Rough dispersion' estimates from linear model, as per the R code.

        Used as initial estimates in :meth:`DeseqDataSet.fit_genewise_dispersions()
        <pydeseq2.dds.DeseqDataSet.fit_genewise_dispersions>`.

        Parameters
        ----------
        normed_counts : ndarray
            Array of deseq2-normalized read counts. Rows: samples, columns: genes.
        design_matrix : ndarray
            Experiment design information (to split cohorts), *with* intercept.
            NOTE(review): this was previously documented as an unexpanded
            ``pandas.DataFrame`` indexed by sample barcodes while the
            annotation is ``ndarray``; confirm which one implementers expect.

        Returns
        -------
        ndarray
            Estimated dispersion parameter for each gene.
        """

    @abstractmethod
    def fit_moments_dispersions2(
        self, normed_counts: np.ndarray, size_factors: np.ndarray
    ) -> np.ndarray:
        """Dispersion estimates based on moments, as per the R code.

        Used as initial estimates in :meth:`DeseqDataSet.fit_genewise_dispersions()
        <pydeseq2.dds.DeseqDataSet.fit_genewise_dispersions>`.

        Parameters
        ----------
        normed_counts : ndarray
            Array of deseq2-normalized read counts. Rows: samples, columns: genes.
        size_factors : ndarray
            DESeq2 normalization factors.

        Returns
        -------
        ndarray
            Estimated dispersion parameter for each gene.
        """

    @abstractmethod
    def dispersion_trend_gamma_glm(
        self, covariates: pd.Series, targets: pd.Series
    ) -> Tuple[np.ndarray, np.ndarray, bool]:
        """Fit a gamma glm on gene dispersions.

        The intercept should be concatenated in this method
        and the first returned coefficient should be the intercept.

        Parameters
        ----------
        covariates : pd.Series
            Covariates for the regression (num_genes,).
        targets : pd.Series
            Targets for the regression (num_genes,).

        Returns
        -------
        coeffs : ndarray
            Coefficients of the regression.
        predictions : ndarray
            Predictions of the regression.
        converged : bool
            Whether the optimization converged.
        """

    @abstractmethod
    def wald_test(
        self,
        design_matrix: np.ndarray,
        disp: np.ndarray,
        lfc: np.ndarray,
        mu: np.ndarray,
        ridge_factor: np.ndarray,
        contrast: np.ndarray,
        lfc_null: np.ndarray,
        alt_hypothesis: Optional[
            Literal["greaterAbs", "lessAbs", "greater", "less"]
        ] = None,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Run Wald test for differential expression.

        Computes Wald statistics, standard error and p-values from
        dispersion and LFC estimates.

        Parameters
        ----------
        design_matrix : ndarray
            Design matrix.
        disp : ndarray
            Dispersion estimate.
        lfc : ndarray
            Log-fold change estimate (in natural log scale).
        mu : ndarray
            Mean estimation for the NB model.
        ridge_factor : ndarray
            Regularization factors.
        contrast : ndarray
            Vector encoding the contrast that is being tested.
        lfc_null : ndarray
            The (log2) log fold change under the null hypothesis.
        alt_hypothesis : str or None
            The alternative hypothesis for computing wald p-values.
            Accepted values: 'greaterAbs', 'lessAbs', 'greater', 'less' or
            ``None``. (default: ``None``).

        Returns
        -------
        wald_p_value : ndarray
            Estimated p-value.
        wald_statistic : ndarray
            Wald statistic.
        wald_se : ndarray
            Standard error of the Wald statistic.
        """

    @abstractmethod
    def lfc_shrink_nbinom_glm(
        self,
        design_matrix: np.ndarray,
        counts: np.ndarray,
        cnv: np.ndarray,
        size: np.ndarray,
        offset: np.ndarray,
        prior_no_shrink_scale: float,
        prior_scale: float,
        optimizer: str,
        shrink_index: int,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Fit a negative binomial MAP LFC using an apeGLM prior.

        Only the LFC is shrunk, and not the intercept.

        Parameters
        ----------
        design_matrix : ndarray
            Design matrix.
        counts : ndarray
            Raw counts.
        cnv : ndarray
            Copy-number information for the fitted counts.
            NOTE(review): exact shape and role are defined by the
            implementing class; confirm there.
        size : ndarray
            Size parameter of NB family (inverse of dispersion).
        offset : ndarray
            Natural logarithm of size factor.
        prior_no_shrink_scale : float
            Prior variance for the intercept.
        prior_scale : float
            Prior variance for the LFC parameter.
        optimizer : str
            Optimizing method to use.
            Accepted values: 'L-BFGS-B', 'BFGS' or 'Newton-CG'.
        shrink_index : int
            Index of the LFC coordinate to shrink.
            Required: no default is defined at the interface level.

        Returns
        -------
        beta: ndarray
            2-element array, containing the intercept (first) and the LFC (second).
        inv_hessian: ndarray
            Inverse of the Hessian of the objective at the estimated MAP LFC.
        converged: ndarray
            Whether the optimizer converged for each optimization problem.
        """