PyPI - pertpy - Versions diffs - 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl - Mend

pertpy 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

pertpy/__init__.py +1 -1
pertpy/_doc.py +19 -0
pertpy/data/_datasets.py +1 -1
pertpy/metadata/_cell_line.py +18 -8
pertpy/metadata/_compound.py +3 -4
pertpy/metadata/_metadata.py +1 -1
pertpy/preprocessing/_guide_rna.py +114 -13
pertpy/preprocessing/_guide_rna_mixture.py +179 -0
pertpy/tools/__init__.py +1 -1
pertpy/tools/_augur.py +64 -86
pertpy/tools/_cinemaot.py +21 -17
pertpy/tools/_coda/_base_coda.py +90 -117
pertpy/tools/_dialogue.py +32 -40
pertpy/tools/_differential_gene_expression/__init__.py +1 -2
pertpy/tools/_differential_gene_expression/_base.py +486 -112
pertpy/tools/_differential_gene_expression/_edger.py +30 -21
pertpy/tools/_differential_gene_expression/_pydeseq2.py +15 -29
pertpy/tools/_differential_gene_expression/_statsmodels.py +0 -11
pertpy/tools/_distances/_distances.py +71 -56
pertpy/tools/_enrichment.py +16 -8
pertpy/tools/_milo.py +54 -50
pertpy/tools/_mixscape.py +307 -208
pertpy/tools/_perturbation_space/_perturbation_space.py +40 -31
pertpy/tools/_perturbation_space/_simple.py +48 -0
pertpy/tools/_scgen/_scgen.py +35 -27
{pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/METADATA +6 -6
{pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/RECORD +29 -28
{pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/WHEEL +1 -1
pertpy/tools/_differential_gene_expression/_formulaic.py +0 -189
{pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/licenses/LICENSE +0 -0

pertpy/tools/_differential_gene_expression/_edger.py CHANGED Viewed

@@ -2,7 +2,7 @@ from collections.abc import Sequence
 import numpy as np
 import pandas as pd
-from scanpy import logging
+from lamin_utils import logger
 from scipy.sparse import issparse
 from ._base import LinearModelBase
@@ -27,16 +27,11 @@ class EdgeR(LinearModelBase):
         # pandas2ri.activate()
         # rpy2.robjects.numpy2ri.activate()
         try:
-            import rpy2.robjects.numpy2ri
-            import rpy2.robjects.pandas2ri
             from rpy2 import robjects as ro
             from rpy2.robjects import numpy2ri, pandas2ri
-            from rpy2.robjects.conversion import localconverter
+            from rpy2.robjects.conversion import get_conversion, localconverter
             from rpy2.robjects.packages import importr
-            pandas2ri.activate()
-            rpy2.robjects.numpy2ri.activate()
         except ImportError:
             raise ImportError("edger requires rpy2 to be installed.") from None
@@ -49,25 +44,30 @@ class EdgeR(LinearModelBase):
             ) from e
         # Convert dataframe
-        with localconverter(ro.default_converter + numpy2ri.converter):
+        with localconverter(get_conversion() + numpy2ri.converter):
             expr = self.adata.X if self.layer is None else self.adata.layers[self.layer]
             if issparse(expr):
                 expr = expr.T.toarray()
             else:
                 expr = expr.T
-        expr_r = ro.conversion.py2rpy(pd.DataFrame(expr, index=self.adata.var_names, columns=self.adata.obs_names))
+        with localconverter(get_conversion() + pandas2ri.converter):
+            expr_r = ro.conversion.py2rpy(pd.DataFrame(expr, index=self.adata.var_names, columns=self.adata.obs_names))
+            samples_r = ro.conversion.py2rpy(self.adata.obs)
-        dge = edger.DGEList(counts=expr_r, samples=self.adata.obs)
+        dge = edger.DGEList(counts=expr_r, samples=samples_r)
-        logging.info("Calculating NormFactors")
+        logger.info("Calculating NormFactors")
         dge = edger.calcNormFactors(dge)
-        logging.info("Estimating Dispersions")
-        dge = edger.estimateDisp(dge, design=self.design)
+        with localconverter(get_conversion() + numpy2ri.converter):
+            design_r = ro.conversion.py2rpy(self.design.values)
+        logger.info("Estimating Dispersions")
+        dge = edger.estimateDisp(dge, design=design_r)
-        logging.info("Fitting linear model")
-        fit = edger.glmQLFit(dge, design=self.design, **kwargs)
+        logger.info("Fitting linear model")
+        fit = edger.glmQLFit(dge, design=design_r, **kwargs)
         ro.globalenv["fit"] = fit
         self.fit = fit
@@ -88,11 +88,9 @@ class EdgeR(LinearModelBase):
         #  Fix mask for .fit()
         try:
-            import rpy2.robjects.numpy2ri
-            import rpy2.robjects.pandas2ri
             from rpy2 import robjects as ro
             from rpy2.robjects import numpy2ri, pandas2ri
-            from rpy2.robjects.conversion import localconverter
+            from rpy2.robjects.conversion import get_conversion, localconverter
             from rpy2.robjects.packages import importr
         except ImportError:
@@ -106,7 +104,8 @@ class EdgeR(LinearModelBase):
             ) from None
         # Convert vector to R, which drops a category like `self.design_matrix` to use the intercept for the left out.
-        contrast_vec_r = ro.conversion.py2rpy(np.asarray(contrast))
+        with localconverter(get_conversion() + numpy2ri.converter):
+            contrast_vec_r = ro.conversion.py2rpy(np.asarray(contrast))
         ro.globalenv["contrast_vec"] = contrast_vec_r
         # Test contrast with R
@@ -117,8 +116,18 @@ class EdgeR(LinearModelBase):
             """
         )
-        # Convert results to pandas
-        de_res = ro.conversion.rpy2py(ro.globalenv["de_res"])
+        # Retrieve the `de_res` object
+        de_res = ro.globalenv["de_res"]
+        # If already a Pandas DataFrame, return it directly
+        if isinstance(de_res, pd.DataFrame):
+            de_res.index.name = "variable"
+            return de_res.reset_index().rename(columns={"PValue": "p_value", "logFC": "log_fc", "FDR": "adj_p_value"})
+        # Convert to Pandas DataFrame if still an R object
+        with localconverter(get_conversion() + pandas2ri.converter):
+            de_res = ro.conversion.rpy2py(de_res)
         de_res.index.name = "variable"
         de_res = de_res.reset_index()

pertpy/tools/_differential_gene_expression/_pydeseq2.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 import re
 import warnings
+import numpy as np
 import pandas as pd
 from anndata import AnnData
 from numpy import ndarray
@@ -40,33 +41,25 @@ class PyDESeq2(LinearModelBase):
         Args:
             **kwargs: Keyword arguments specific to DeseqDataSet(), except for `n_cpus` which will use all available CPUs minus one if the argument is not passed.
         """
-        inference = DefaultInference(n_cpus=kwargs.pop("n_cpus", os.cpu_count() - 1))
-        covars = self.design.columns.tolist()
-        if "Intercept" not in covars:
-            warnings.warn(
-                "Warning: Pydeseq is hard-coded to use Intercept, please include intercept into the model", stacklevel=2
-            )
-        processed_covars = list({re.sub(r"\[T\.(.*)\]", "", col) for col in covars if col != "Intercept"})
+        try:
+            usable_cpus = len(os.sched_getaffinity(0))
+        except AttributeError:
+            usable_cpus = os.cpu_count()
+        inference = DefaultInference(n_cpus=kwargs.pop("n_cpus", usable_cpus))
         dds = DeseqDataSet(
-            adata=self.adata, design_factors=processed_covars, refit_cooks=True, inference=inference, **kwargs
+            adata=self.adata,
+            design=self.design,  # initialize using design matrix, not formula
+            refit_cooks=True,
+            inference=inference,
+            **kwargs,
         )
-        # workaround code to insert design array
-        des_mtx_cols = dds.obsm["design_matrix"].columns
-        dds.obsm["design_matrix"] = self.design
-        if dds.obsm["design_matrix"].shape[1] == len(des_mtx_cols):
-            dds.obsm["design_matrix"].columns = des_mtx_cols.copy()
         dds.deseq2()
         self.dds = dds
-    # TODO: PyDeseq2 doesn't support arbitrary designs and contrasts yet
-    # see https://github.com/owkin/PyDESeq2/issues/213
-    # Therefore these functions are overridden in a way to make it work with PyDESeq2,
-    # ingoring the inconsistency of function signatures. Once arbitrary design
-    # matrices and contrasts are supported by PyDEseq2, we can fully support the
-    # Linear model interface.
-    def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd.DataFrame:  # type: ignore
+    def _test_single_contrast(self, contrast, alpha=0.05, **kwargs) -> pd.DataFrame:
         """Conduct a specific test and returns a Pandas DataFrame.
         Args:
@@ -74,6 +67,7 @@ class PyDESeq2(LinearModelBase):
             alpha: p value threshold used for controlling fdr with independent hypothesis weighting
             **kwargs: extra arguments to pass to DeseqStats()
         """
+        contrast = np.array(contrast)
         stat_res = DeseqStats(self.dds, contrast=contrast, alpha=alpha, **kwargs)
         # Calling `.summary()` is required to fill the `results_df` data frame
         stat_res.summary()
@@ -85,11 +79,3 @@ class PyDESeq2(LinearModelBase):
         res_df.index.name = "variable"
         res_df = res_df.reset_index()
         return res_df
-    def cond(self, **kwargs) -> ndarray:
-        raise NotImplementedError(
-            "PyDESeq2 currently doesn't support arbitrary contrasts, see https://github.com/owkin/PyDESeq2/issues/213"
-        )
-    def contrast(self, column: str, baseline: str, group_to_compare: str) -> tuple[str, str, str]:  # type: ignore
-        return (column, group_to_compare, baseline)

pertpy/tools/_differential_gene_expression/_statsmodels.py CHANGED Viewed

@@ -59,14 +59,3 @@ class Statsmodels(LinearModelBase):
                 }
             )
         return pd.DataFrame(res).sort_values("p_value")
-    def contrast(self, column: str, baseline: str, group_to_compare: str) -> np.ndarray:
-        """Build a simple contrast for pairwise comparisons.
-        This is equivalent to
-        ```
-        model.cond(<column> = baseline) - model.cond(<column> = group_to_compare)
-        ```
-        """
-        return self.cond(**{column: baseline}) - self.cond(**{column: group_to_compare})

pertpy/tools/_distances/_distances.py CHANGED Viewed

@@ -344,9 +344,9 @@ class Distance:
             else:
                 embedding = adata.obsm[self.obsm_key].copy()
             for index_x, group_x in enumerate(fct(groups)):
-                cells_x = embedding[grouping == group_x].copy()
+                cells_x = embedding[np.asarray(grouping == group_x)].copy()
                 for group_y in groups[index_x:]:  # type: ignore
-                    cells_y = embedding[grouping == group_y].copy()
+                    cells_y = embedding[np.asarray(grouping == group_y)].copy()
                     if not bootstrap:
                         # By distance axiom, the distance between a group and itself is 0
                         dist = 0.0 if group_x == group_y else self(cells_x, cells_y, **kwargs)
@@ -478,9 +478,9 @@ class Distance:
             else:
                 embedding = adata.obsm[self.obsm_key].copy()
             for group_x in fct(groups):
-                cells_x = embedding[grouping == group_x].copy()
+                cells_x = embedding[np.asarray(grouping == group_x)].copy()
                 group_y = selected_group
-                cells_y = embedding[grouping == group_y].copy()
+                cells_y = embedding[np.asarray(grouping == group_y)].copy()
                 if not bootstrap:
                     # By distance axiom, the distance between a group and itself is 0
                     dist = 0.0 if group_x == group_y else self(cells_x, cells_y, **kwargs)
@@ -691,17 +691,18 @@ class MMD(AbstractDistance):
 class WassersteinDistance(AbstractDistance):
-    """Wasserstein distance metric (solved with entropy regularized Sinkhorn)."""
     def __init__(self) -> None:
         super().__init__()
         self.accepts_precomputed = False
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        X = np.asarray(X, dtype=np.float64)
+        Y = np.asarray(Y, dtype=np.float64)
         geom = PointCloud(X, Y)
         return self.solve_ot_problem(geom, **kwargs)
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
+        P = np.asarray(P, dtype=np.float64)
         geom = Geometry(cost_matrix=P[idx, :][:, ~idx])
         return self.solve_ot_problem(geom, **kwargs)
@@ -709,7 +710,13 @@ class WassersteinDistance(AbstractDistance):
         ot_prob = LinearProblem(geom)
         solver = Sinkhorn()
         ot = solver(ot_prob, **kwargs)
-        return ot.reg_ot_cost.item()
+        cost = float(ot.reg_ot_cost)
+        # Check for NaN or invalid cost
+        if not np.isfinite(cost):
+            return 1.0
+        else:
+            return cost
 class EuclideanDistance(AbstractDistance):
@@ -981,7 +988,7 @@ class NBLL(AbstractDistance):
             try:
                 nb_params = NegativeBinomialP(x, np.ones_like(x)).fit(disp=False).params
                 return _compute_nll(y, nb_params, epsilon)
-            except np.linalg.linalg.LinAlgError:
+            except np.linalg.LinAlgError:
                 if x.mean() < 10 and y.mean() < 10:
                     return 0.0
                 else:
@@ -1110,67 +1117,75 @@ class MeanVarDistributionDistance(AbstractDistance):
         super().__init__()
         self.accepts_precomputed = False
+    @staticmethod
+    def _mean_var(x, log: bool = False):
+        mean = np.mean(x, axis=0)
+        var = np.var(x, axis=0)
+        positive = mean > 0
+        mean = mean[positive]
+        var = var[positive]
+        if log:
+            mean = np.log(mean)
+            var = np.log(var)
+        return mean, var
+    @staticmethod
+    def _prep_kde_data(x, y):
+        return np.concatenate([x.reshape(-1, 1), y.reshape(-1, 1)], axis=1)
+    @staticmethod
+    def _grid_points(d, n_points=100):
+        # Make grid, add 1 bin on lower/upper end to get final n_points
+        d_min = d.min()
+        d_max = d.max()
+        # Compute bin size
+        d_bin = (d_max - d_min) / (n_points - 2)
+        d_min = d_min - d_bin
+        d_max = d_max + d_bin
+        return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin)
+    @staticmethod
+    def _kde_eval_both(x_kde, y_kde, grid):
+        n_points = len(grid)
+        chunk_size = 10000
+        result_x = np.zeros(n_points)
+        result_y = np.zeros(n_points)
+        # Process same chunks for both KDEs
+        for start in range(0, n_points, chunk_size):
+            end = min(start + chunk_size, n_points)
+            chunk = grid[start:end]
+            result_x[start:end] = x_kde.score_samples(chunk)
+            result_y[start:end] = y_kde.score_samples(chunk)
+        return result_x, result_y
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         """Difference of mean-var distributions in 2 matrices.
         Args:
             X: Normalized and log transformed cells x genes count matrix.
             Y: Normalized and log transformed cells x genes count matrix.
         """
+        mean_x, var_x = self._mean_var(X, log=True)
+        mean_y, var_y = self._mean_var(Y, log=True)
-        def _mean_var(x, log: bool = False):
-            mean = np.mean(x, axis=0)
-            var = np.var(x, axis=0)
-            positive = mean > 0
-            mean = mean[positive]
-            var = var[positive]
-            if log:
-                mean = np.log(mean)
-                var = np.log(var)
-            return mean, var
-        def _prep_kde_data(x, y):
-            return np.concatenate([x.reshape(-1, 1), y.reshape(-1, 1)], axis=1)
-        def _grid_points(d, n_points=100):
-            # Make grid, add 1 bin on lower/upper end to get final n_points
-            d_min = d.min()
-            d_max = d.max()
-            # Compute bin size
-            d_bin = (d_max - d_min) / (n_points - 2)
-            d_min = d_min - d_bin
-            d_max = d_max + d_bin
-            return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin)
-        def _parallel_score_samples(kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())):
-            # the thread_count is determined using the factor 0.875 as recommended here:
-            # https://stackoverflow.com/questions/32625094/scipy-parallel-computing-in-ipython-notebook
-            with multiprocessing.Pool(thread_count) as p:
-                return np.concatenate(p.map(kde.score_samples, np.array_split(samples, thread_count)))
-        def _kde_eval(d, grid):
-            # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out
-            # can not be compared well on regions further away from the data as they are -inf
-            kde = KernelDensity(bandwidth="silverman", kernel="exponential").fit(d)
-            return _parallel_score_samples(kde, grid)
-        mean_x, var_x = _mean_var(X, log=True)
-        mean_y, var_y = _mean_var(Y, log=True)
-        x = _prep_kde_data(mean_x, var_x)
-        y = _prep_kde_data(mean_y, var_y)
+        x = self._prep_kde_data(mean_x, var_x)
+        y = self._prep_kde_data(mean_y, var_y)
         # Gridpoints to eval KDE on
-        mean_grid = _grid_points(np.concatenate([mean_x, mean_y]))
-        var_grid = _grid_points(np.concatenate([var_x, var_y]))
+        mean_grid = self._grid_points(np.concatenate([mean_x, mean_y]))
+        var_grid = self._grid_points(np.concatenate([var_x, var_y]))
         grid = np.array(np.meshgrid(mean_grid, var_grid)).T.reshape(-1, 2)
-        kde_x = _kde_eval(x, grid)
-        kde_y = _kde_eval(y, grid)
+        # Fit both KDEs first
+        x_kde = KernelDensity(bandwidth="silverman", kernel="exponential").fit(x)
+        y_kde = KernelDensity(bandwidth="silverman", kernel="exponential").fit(y)
-        kde_diff = ((kde_x - kde_y) ** 2).mean()
+        # Evaluate both KDEs on same grid chunks
+        kde_x, kde_y = self._kde_eval_both(x_kde, y_kde, grid)
-        return kde_diff
+        return ((np.exp(kde_x) - np.exp(kde_y)) ** 2).mean()
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("MeanVarDistributionDistance cannot be called on a pairwise distance matrix.")

pertpy/tools/_enrichment.py CHANGED Viewed

@@ -3,6 +3,7 @@ from collections.abc import Sequence
 from typing import Any, Literal
 import blitzgsea
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import scanpy as sc
@@ -14,6 +15,7 @@ from scipy.sparse import issparse
 from scipy.stats import hypergeom
 from statsmodels.stats.multitest import multipletests
+from pertpy._doc import _doc_params, doc_common_plot_args
 from pertpy.metadata import Drug
@@ -290,9 +292,11 @@ class Enrichment:
         return enrichment
+    @_doc_params(common_plot_args=doc_common_plot_args)
     def plot_dotplot(
         self,
         adata: AnnData,
+        *,
         targets: dict[str, dict[str, list[str]]] = None,
         source: Literal["chembl", "dgidb", "pharmgkb"] = "chembl",
         category_name: str = "interaction_type",
@@ -300,10 +304,9 @@ class Enrichment:
         groupby: str = None,
         key: str = "pertpy_enrichment",
         ax: Axes | None = None,
-        save: bool | str | None = None,
-        show: bool | None = None,
+        return_fig: bool = False,
         **kwargs,
-    ) -> DotPlot | dict | None:
+    ) -> DotPlot | None:
         """Plots a dotplot by groupby and categories.
         Wraps scanpy's dotplot but formats it nicely by categories.
@@ -319,11 +322,11 @@ class Enrichment:
             category_name: The name of category used to generate a nested drug target set when `targets=None` and `source=dgidb|pharmgkb`.
             groupby: dotplot groupby such as clusters or cell types.
             key: Prefix key of enrichment results in `uns`.
+            {common_plot_args}
             kwargs: Passed to scanpy dotplot.
         Returns:
-            If `return_fig` is `True`, returns a :class:`~scanpy.pl.DotPlot` object,
-            else if `show` is false, return axes dict.
+            If `return_fig` is `True`, returns the figure, otherwise `None`.
         Examples:
             >>> import pertpy as pt
@@ -403,21 +406,26 @@ class Enrichment:
             "var_group_labels": var_group_labels,
         }
-        return sc.pl.dotplot(
+        fig = sc.pl.dotplot(
             enrichment_score_adata,
             groupby=groupby,
             swap_axes=True,
             ax=ax,
-            save=save,
-            show=show,
+            show=False,
             **plot_args,
             **kwargs,
         )
+        if return_fig:
+            return fig
+        plt.show()
+        return None
     def plot_gsea(
         self,
         adata: AnnData,
         enrichment: dict[str, pd.DataFrame],
+        *,
         n: int = 10,
         key: str = "pertpy_enrichment_gsea",
         interactive_plot: bool = False,

pertpy 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl

pertpy 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl