PyPI - pertpy - Versions diffs - 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl - Mend

pertpy 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl

Files changed (20) hide show

pertpy/__init__.py +1 -1
pertpy/_doc.py +1 -2
pertpy/metadata/_cell_line.py +3 -5
pertpy/preprocessing/_guide_rna.py +98 -10
pertpy/preprocessing/_guide_rna_mixture.py +179 -0
pertpy/tools/_augur.py +32 -44
pertpy/tools/_cinemaot.py +1 -3
pertpy/tools/_coda/_base_coda.py +21 -29
pertpy/tools/_dialogue.py +17 -21
pertpy/tools/_differential_gene_expression/_base.py +4 -12
pertpy/tools/_distances/_distances.py +56 -48
pertpy/tools/_enrichment.py +1 -3
pertpy/tools/_milo.py +4 -12
pertpy/tools/_mixscape.py +215 -127
pertpy/tools/_perturbation_space/_simple.py +1 -3
pertpy/tools/_scgen/_scgen.py +1 -3
{pertpy-0.9.5.dist-info → pertpy-0.10.0.dist-info}/METADATA +2 -2
{pertpy-0.9.5.dist-info → pertpy-0.10.0.dist-info}/RECORD +20 -19
{pertpy-0.9.5.dist-info → pertpy-0.10.0.dist-info}/WHEEL +0 -0
{pertpy-0.9.5.dist-info → pertpy-0.10.0.dist-info}/licenses/LICENSE +0 -0

pertpy/__init__.py CHANGED Viewed

@@ -2,7 +2,7 @@
 __author__ = "Lukas Heumos"
 __email__ = "lukas.heumos@posteo.net"
-__version__ = "0.9.5"
+__version__ = "0.10.0"
 import warnings

pertpy/_doc.py CHANGED Viewed

@@ -15,6 +15,5 @@ def _doc_params(**kwds):  # pragma: no cover
 doc_common_plot_args = """\
-show: if `True`, shows the plot.
-            return_fig: if `True`, returns figure of the plot.\
+return_fig: if `True`, returns figure of the plot, that can be used for saving.\
 """

pertpy/metadata/_cell_line.py CHANGED Viewed

@@ -703,7 +703,6 @@ class CellLine(MetaData):
         metadata_key: str = "bulk_rna_broad",
         category: str = "cell line",
         subset_identifier: str | int | Iterable[str] | Iterable[int] | None = None,
-        show: bool = True,
         return_fig: bool = False,
     ) -> Figure | None:
         """Visualise the correlation of cell lines with annotated metadata.
@@ -747,7 +746,7 @@ class CellLine(MetaData):
                 if all(isinstance(id, str) for id in subset_identifier_list):
                     if set(subset_identifier_list).issubset(adata.obs[identifier].unique()):
                         subset_identifier_list = np.where(
-                            np.in1d(adata.obs[identifier].values, subset_identifier_list)
+                            np.isin(adata.obs[identifier].values, subset_identifier_list)
                         )[0]
                     else:
                         raise ValueError("`Subset_identifier` must be found in adata.obs.`identifier`.")
@@ -798,10 +797,9 @@ class CellLine(MetaData):
                 },
             )
-            if show:
-                plt.show()
             if return_fig:
                 return plt.gcf()
+            plt.show()
             return None
         else:
-            raise NotImplementedError
+            raise NotImplementedError("Only 'cell line' category is supported for correlation comparison.")

pertpy/preprocessing/_guide_rna.py CHANGED Viewed

@@ -1,15 +1,19 @@
 from __future__ import annotations
 import uuid
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
+from warnings import warn
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import scanpy as sc
 import scipy
+from rich.progress import track
+from scipy.sparse import issparse
 from pertpy._doc import _doc_params, doc_common_plot_args
+from pertpy.preprocessing._guide_rna_mixture import PoissonGaussMixture
 if TYPE_CHECKING:
     from anndata import AnnData
@@ -17,7 +21,7 @@ if TYPE_CHECKING:
 class GuideAssignment:
-    """Offers simple guide assigment based on count thresholds."""
+    """Assign cells to guide RNAs."""
     def assign_by_threshold(
         self,
@@ -33,12 +37,12 @@ class GuideAssignment:
         This function expects unnormalized data as input.
         Args:
-            adata: Annotated data matrix containing gRNA values
+            adata: AnnData object containing gRNA values.
             assignment_threshold: The count threshold that is required for an assignment to be viable.
             layer: Key to the layer containing raw count values of the gRNAs.
                    adata.X is used if layer is None. Expects count data.
             output_layer: Assigned guide will be saved on adata.layers[output_key].
-            only_return_results: If True, input AnnData is not modified and the result is returned as an np.ndarray.
+            only_return_results: If True, the input AnnData is not modified and the result is returned as an :class:`np.ndarray`.
         Examples:
             Each cell is assigned to gRNA that occurs at least 5 times in the respective cell.
@@ -67,7 +71,7 @@ class GuideAssignment:
         assignment_threshold: float,
         layer: str | None = None,
         output_key: str = "assigned_guide",
-        no_grna_assigned_key: str = "NT",
+        no_grna_assigned_key: str = "Negative",
         only_return_results: bool = False,
     ) -> np.ndarray | None:
         """Simple threshold based max gRNA assignment function.
@@ -76,13 +80,13 @@ class GuideAssignment:
         This function expects unnormalized data as input.
         Args:
-            adata: Annotated data matrix containing gRNA values
+            adata: AnnData object containing gRNA values.
             assignment_threshold: The count threshold that is required for an assignment to be viable.
             layer: Key to the layer containing raw count values of the gRNAs.
                    adata.X is used if layer is None. Expects count data.
             output_key: Assigned guide will be saved on adata.obs[output_key]. default value is `assigned_guide`.
             no_grna_assigned_key: The key to return if no gRNA is expressed enough.
-            only_return_results: If True, input AnnData is not modified and the result is returned as an np.ndarray.
+            only_return_results: If True, the input AnnData is not modified and the result is returned as an np.ndarray.
         Examples:
             Each cell is assigned to the most expressed gRNA if it has at least 5 counts.
@@ -109,6 +113,92 @@ class GuideAssignment:
         return None
+    def assign_mixture_model(
+        self,
+        adata: AnnData,
+        model: Literal["poisson_gauss_mixture"] = "poisson_gauss_mixture",
+        assigned_guides_key: str = "assigned_guide",
+        no_grna_assigned_key: str = "negative",
+        max_assignments_per_cell: int = 5,
+        multiple_grna_assigned_key: str = "multiple",
+        multiple_grna_assignment_string: str = "+",
+        only_return_results: bool = False,
+        uns_key: str = "guide_assignment_params",
+        show_progress: bool = False,
+        **mixture_model_kwargs,
+    ) -> np.ndarray | None:
+        """Assigns gRNAs to cells using a mixture model.
+        Args:
+            adata: AnnData object containing gRNA values.
+            model: The model to use for the mixture model. Currently only `poisson_gauss_mixture` is supported.
+            assigned_guides_key: Assigned guide will be saved on adata.obs[assigned_guides_key].
+            no_grna_assigned_key: The key to return if a cell is negative for all gRNAs.
+            max_assignments_per_cell: The maximum number of gRNAs that can be assigned to a cell.
+            multiple_grna_assigned_key: The key to return if more than `max_assignments_per_cell` gRNAs are assigned to a cell.
+            multiple_grna_assignment_string: The string to use to join multiple gRNAs assigned to a cell.
+            only_return_results: If True, the input AnnData is not modified and the result is returned as an np.ndarray.
+            show_progress: Whether to show a progress bar.
+            mixture_model_kwargs: Are passed to the mixture model.
+        Examples:
+            >>> import pertpy as pt
+            >>> mdata = pt.dt.papalexi_2021()
+            >>> gdo = mdata.mod["gdo"]
+            >>> ga = pt.pp.GuideAssignment()
+            >>> ga.assign_mixture_model(gdo)
+        """
+        if model == "poisson_gauss_mixture":
+            mixture_model = PoissonGaussMixture(**mixture_model_kwargs)
+        else:
+            raise ValueError("Model not implemented. Please use 'poisson_gauss_mixture'.")
+        if uns_key not in adata.uns:
+            adata.uns[uns_key] = {}
+        elif type(adata.uns[uns_key]) is not dict:
+            raise ValueError(f"adata.uns['{uns_key}'] should be a dictionary. Please remove it or change the key.")
+        res = pd.DataFrame(0, index=adata.obs_names, columns=adata.var_names)
+        fct = track if show_progress else lambda iterable: iterable
+        for gene in fct(adata.var_names):
+            is_nonzero = (
+                np.ravel((adata[:, gene].X != 0).todense()) if issparse(adata.X) else np.ravel(adata[:, gene].X != 0)
+            )
+            if sum(is_nonzero) < 2:
+                warn(f"Skipping {gene} as there are less than 2 cells expressing the guide at all.", stacklevel=2)
+                continue
+            # We are only fitting the model to the non-zero values, the rest is
+            # automatically assigned to the negative class
+            data = adata[is_nonzero, gene].X.todense().A1 if issparse(adata.X) else adata[is_nonzero, gene].X
+            data = np.ravel(data)
+            if np.any(data < 0):
+                raise ValueError(
+                    "Data contains negative values. Please use non-negative data for guide assignment with the Mixture Model."
+                )
+            # Log2 transform the data so positive population is approximately normal
+            data = np.log2(data)
+            assignments = mixture_model.run_model(data)
+            res.loc[adata.obs_names[is_nonzero][assignments == "Positive"], gene] = 1
+            adata.uns[uns_key][gene] = mixture_model.params
+        # Assign guides to cells
+        # Some cells might have multiple guides assigned
+        series = pd.Series(no_grna_assigned_key, index=adata.obs_names)
+        num_guides_assigned = res.sum(1)
+        series.loc[(num_guides_assigned <= max_assignments_per_cell) & (num_guides_assigned != 0)] = res.apply(
+            lambda row: row.index[row == 1].tolist(), axis=1
+        ).str.join(multiple_grna_assignment_string)
+        series.loc[num_guides_assigned > max_assignments_per_cell] = multiple_grna_assigned_key
+        if only_return_results:
+            return series.values
+        adata.obs[assigned_guides_key] = series.values
+        return None
     @_doc_params(common_plot_args=doc_common_plot_args)
     def plot_heatmap(
         self,
@@ -117,7 +207,6 @@ class GuideAssignment:
         layer: str | None = None,
         order_by: np.ndarray | str | None = None,
         key_to_save_order: str = None,
-        show: bool = True,
         return_fig: bool = False,
         **kwargs,
     ) -> Figure | None:
@@ -194,8 +283,7 @@ class GuideAssignment:
         finally:
             del adata.obs[temp_col_name]
-        if show:
-            plt.show()
         if return_fig:
             return fig
+        plt.show()
         return None

pertpy/preprocessing/_guide_rna_mixture.py ADDED Viewed

@@ -0,0 +1,179 @@
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from collections.abc import Mapping
+import jax
+import jax.numpy as jnp
+import numpy as np
+import numpyro
+import numpyro.distributions as dist
+from jax import random
+from numpyro.infer import MCMC, NUTS
+ParamsDict = Mapping[str, jnp.ndarray]
+class MixtureModel(ABC):
+    """Abstract base class for 2-component mixture models.
+    Args:
+        num_warmup: Number of warmup steps for MCMC sampling.
+        num_samples: Number of samples to draw after warmup.
+        fraction_positive_expected: Prior belief about fraction of positive components.
+        poisson_rate_prior: Rate parameter for exponential prior on Poisson component.
+        gaussian_mean_prior: Mean and standard deviation for Gaussian prior on positive component mean.
+        gaussian_std_prior: Scale parameter for half-normal prior on positive component std.
+    """
+    def __init__(
+        self,
+        num_warmup: int = 50,
+        num_samples: int = 100,
+        fraction_positive_expected: float = 0.15,
+        poisson_rate_prior: float = 0.2,
+        gaussian_mean_prior: tuple[float, float] = (3, 2),
+        gaussian_std_prior: float = 1,
+    ) -> None:
+        self.num_warmup = num_warmup
+        self.num_samples = num_samples
+        self.fraction_positive_expected = fraction_positive_expected
+        self.poisson_rate_prior = poisson_rate_prior
+        self.gaussian_mean_prior = gaussian_mean_prior
+        self.gaussian_std_prior = gaussian_std_prior
+    @abstractmethod
+    def initialize_params(self) -> ParamsDict:
+        """Initialize model parameters via sampling from priors.
+        Returns:
+            Dictionary of sampled parameter values.
+        """
+        pass
+    @abstractmethod
+    def log_likelihood(self, data: jnp.ndarray, params: ParamsDict) -> jnp.ndarray:
+        """Calculate log likelihood of data under current parameters.
+        Args:
+            data: Input data array.
+            params: Current parameter values.
+        Returns:
+            Log likelihood values for each datapoint.
+        """
+        pass
+    def fit_model(self, data: jnp.ndarray, seed: int = 0) -> MCMC:
+        """Fit the mixture model using MCMC.
+        Args:
+            data: Input data to fit.
+            seed: Random seed for reproducibility.
+        Returns:
+            Fitted MCMC object containing samples.
+        """
+        nuts_kernel = NUTS(self.mixture_model)
+        mcmc = MCMC(nuts_kernel, num_warmup=self.num_warmup, num_samples=self.num_samples, progress_bar=False)
+        mcmc.run(random.PRNGKey(seed), data=data)
+        return mcmc
+    def run_model(self, data: jnp.ndarray, seed: int = 0) -> np.ndarray:
+        """Run model fitting and assign components.
+        Args:
+            data: Input data array.
+            seed: Random seed.
+        Returns:
+            Array of "Positive"/"Negative" assignments for each datapoint.
+        """
+        self.mcmc = self.fit_model(data, seed)
+        self.samples = self.mcmc.get_samples()
+        self.assignments = self.assignment(self.samples, data)
+        return self.assignments
+    def mixture_model(self, data: jnp.ndarray) -> None:
+        """Define mixture model structure for NumPyro.
+        Args:
+            data: Input data array.
+        """
+        params = self.initialize_params()
+        with numpyro.plate("data", data.shape[0]):
+            log_likelihoods = self.log_likelihood(data, params)
+            log_mixture_likelihood = jax.scipy.special.logsumexp(log_likelihoods, axis=-1)
+            numpyro.sample("obs", dist.Normal(log_mixture_likelihood, 1.0), obs=data)
+    def assignment(self, samples: ParamsDict, data: jnp.ndarray) -> np.ndarray:
+        """Assign data points to mixture components.
+        Args:
+            samples: MCMC samples of parameters.
+            data: Input data array.
+        Returns:
+            Array of component assignments.
+        """
+        params = {key: samples[key].mean(axis=0) for key in samples.keys()}
+        self.params = params
+        log_likelihoods = self.log_likelihood(data, params)
+        guide_assignments = jnp.argmax(log_likelihoods, axis=-1)
+        assignments = ["Negative" if assign == 0 else "Positive" for assign in guide_assignments]
+        return np.array(assignments)
+class PoissonGaussMixture(MixtureModel):
+    """Mixture model combining Poisson and Gaussian distributions."""
+    def log_likelihood(self, data: np.ndarray, params: ParamsDict) -> jnp.ndarray:
+        """Calculate component-wise log likelihoods.
+        Args:
+            data: Input data array.
+            params: Current parameter values.
+        Returns:
+            Log likelihood values for each component.
+        """
+        poisson_rate = params["poisson_rate"]
+        gaussian_mean = params["gaussian_mean"]
+        gaussian_std = params["gaussian_std"]
+        mix_probs = params["mix_probs"]
+        # We penalize the model for positioning the Poisson component to the right of the Gaussian component
+        # by imposing a soft constraint to penalize the Poisson rate being larger than the Gaussian mean
+        # Heuristic regularization term to prevent flipping of the components
+        numpyro.factor("separation_penalty", +10 * jnp.heaviside(-poisson_rate + gaussian_mean, 0))
+        log_likelihoods = jnp.stack(
+            [
+                # Poisson component
+                jnp.log(mix_probs[0]) + dist.Poisson(poisson_rate).log_prob(data),
+                # Gaussian component
+                jnp.log(mix_probs[1]) + dist.Normal(gaussian_mean, gaussian_std).log_prob(data),
+            ],
+            axis=-1,
+        )
+        return log_likelihoods
+    def initialize_params(self) -> ParamsDict:
+        """Initialize model parameters via prior sampling.
+        Returns:
+            Dictionary of sampled parameter values.
+        """
+        params = {}
+        params["poisson_rate"] = numpyro.sample("poisson_rate", dist.Exponential(self.poisson_rate_prior))
+        params["gaussian_mean"] = numpyro.sample("gaussian_mean", dist.Normal(*self.gaussian_mean_prior))
+        params["gaussian_std"] = numpyro.sample("gaussian_std", dist.HalfNormal(self.gaussian_std_prior))
+        params["mix_probs"] = numpyro.sample(
+            "mix_probs",
+            dist.Dirichlet(jnp.array([1 - self.fraction_positive_expected, self.fraction_positive_expected])),
+        )
+        return params

pertpy/tools/_augur.py CHANGED Viewed

@@ -685,7 +685,7 @@ class Augur:
         span: float = 0.75,
         filter_negative_residuals: bool = False,
         n_threads: int = 4,
-        augur_mode: Literal["permute"] | Literal["default"] | Literal["velocity"] = "default",
+        augur_mode: Literal["default", "permute", "velocity"] = "default",
         select_variance_features: bool = True,
         key_added: str = "augurpy_results",
         random_state: int | None = None,
@@ -908,41 +908,39 @@ class Augur:
             .mean()
         )
-        sampled_permuted_cv_augur1 = []
-        sampled_permuted_cv_augur2 = []
+        rng = np.random.default_rng()
+        sampled_data = []
         # draw mean aucs for permute1 and permute2
         for celltype in permuted_cv_augur1["cell_type"].unique():
             df1 = permuted_cv_augur1[permuted_cv_augur1["cell_type"] == celltype]
             df2 = permuted_cv_augur2[permuted_cv_augur2["cell_type"] == celltype]
-            for permutation_idx in range(n_permutations):
-                # subsample
-                sample1 = df1.sample(n=n_subsamples, random_state=permutation_idx, axis="index")
-                sampled_permuted_cv_augur1.append(
-                    pd.DataFrame(
-                        {
-                            "cell_type": [celltype],
-                            "permutation_idx": [permutation_idx],
-                            "mean": [sample1["augur_score"].mean(axis=0)],
-                            "std": [sample1["augur_score"].std(axis=0)],
-                        }
-                    )
-                )
-                sample2 = df2.sample(n=n_subsamples, random_state=permutation_idx, axis="index")
-                sampled_permuted_cv_augur2.append(
-                    pd.DataFrame(
-                        {
-                            "cell_type": [celltype],
-                            "permutation_idx": [permutation_idx],
-                            "mean": [sample2["augur_score"].mean(axis=0)],
-                            "std": [sample2["augur_score"].std(axis=0)],
-                        }
-                    )
+            indices1 = rng.choice(len(df1), size=(n_permutations, n_subsamples), replace=True)
+            indices2 = rng.choice(len(df2), size=(n_permutations, n_subsamples), replace=True)
+            scores1 = df1["augur_score"].values[indices1]
+            scores2 = df2["augur_score"].values[indices2]
+            means1 = scores1.mean(axis=1)
+            means2 = scores2.mean(axis=1)
+            stds1 = scores1.std(axis=1)
+            stds2 = scores2.std(axis=1)
+            sampled_data.append(
+                pd.DataFrame(
+                    {
+                        "cell_type": np.repeat(celltype, n_permutations),
+                        "permutation_idx": np.arange(n_permutations),
+                        "mean1": means1,
+                        "mean2": means2,
+                        "std1": stds1,
+                        "std2": stds2,
+                    }
                 )
+            )
-        permuted_samples1 = pd.concat(sampled_permuted_cv_augur1)
-        permuted_samples2 = pd.concat(sampled_permuted_cv_augur2)
+        sampled_df = pd.concat(sampled_data)
         # delta between augur scores
         delta = augur_score1.merge(augur_score2, on=["cell_type"], suffixes=("1", "2")).assign(
@@ -950,9 +948,7 @@ class Augur:
         )
         # delta between permutation scores
-        delta_rnd = permuted_samples1.merge(
-            permuted_samples2, on=["cell_type", "permutation_idx"], suffixes=("1", "2")
-        ).assign(delta_rnd=lambda x: x.mean2 - x.mean1)
+        delta_rnd = sampled_df.assign(delta_rnd=lambda x: x.mean2 - x.mean1)
         # number of values where permutations are larger than test statistic
         delta["b"] = (
@@ -967,7 +963,7 @@ class Augur:
         delta["z"] = (
             delta["delta_augur"] - delta_rnd.groupby("cell_type", as_index=False).mean()["delta_rnd"]
         ) / delta_rnd.groupby("cell_type", as_index=False).std()["delta_rnd"]
-        # calculate pvalues
         delta["pval"] = np.minimum(
             2 * (delta["b"] + 1) / (delta["m"] + 1), 2 * (delta["m"] - delta["b"] + 1) / (delta["m"] + 1)
         )
@@ -982,7 +978,6 @@ class Augur:
         *,
         top_n: int = None,
         ax: Axes = None,
-        show: bool = True,
         return_fig: bool = False,
     ) -> Figure | None:
         """Plot scatterplot of differential prioritization.
@@ -1041,10 +1036,9 @@ class Augur:
         legend1 = ax.legend(*scatter.legend_elements(), loc="center left", title="z-scores", bbox_to_anchor=(1, 0.5))
         ax.add_artist(legend1)
-        if show:
-            plt.show()
         if return_fig:
             return plt.gcf()
+        plt.show()
         return None
     @_doc_params(common_plot_args=doc_common_plot_args)
@@ -1055,7 +1049,6 @@ class Augur:
         key: str = "augurpy_results",
         top_n: int = 10,
         ax: Axes = None,
-        show: bool = True,
         return_fig: bool = False,
     ) -> Figure | None:
         """Plot a lollipop plot of the n features with largest feature importances.
@@ -1109,10 +1102,9 @@ class Augur:
         plt.ylabel("Gene")
         plt.yticks(y_axes_range, n_features["genes"])
-        if show:
-            plt.show()
         if return_fig:
             return plt.gcf()
+        plt.show()
         return None
     @_doc_params(common_plot_args=doc_common_plot_args)
@@ -1122,7 +1114,6 @@ class Augur:
         *,
         key: str = "augurpy_results",
         ax: Axes = None,
-        show: bool = True,
         return_fig: bool = False,
     ) -> Figure | None:
         """Plot a lollipop plot of the mean augur values.
@@ -1172,10 +1163,9 @@ class Augur:
         plt.ylabel("Cell Type")
         plt.yticks(y_axes_range, results["summary_metrics"].sort_values("mean_augur_score", axis=1).columns)
-        if show:
-            plt.show()
         if return_fig:
             return plt.gcf()
+        plt.show()
         return None
     @_doc_params(common_plot_args=doc_common_plot_args)
@@ -1185,7 +1175,6 @@ class Augur:
         results2: dict[str, Any],
         *,
         top_n: int = None,
-        show: bool = True,
         return_fig: bool = False,
     ) -> Figure | None:
         """Create scatterplot with two augur results.
@@ -1243,8 +1232,7 @@ class Augur:
         plt.xlabel("Augur scores 1")
         plt.ylabel("Augur scores 2")
-        if show:
-            plt.show()
         if return_fig:
             return plt.gcf()
+        plt.show()
         return None

pertpy/tools/_cinemaot.py CHANGED Viewed

@@ -658,7 +658,6 @@ class Cinemaot:
         title: str = "CINEMA-OT matching matrix",
         min_val: float = 0.01,
         ax: Axes | None = None,
-        show: bool = True,
         return_fig: bool = False,
         **kwargs,
     ) -> Figure | None:
@@ -717,10 +716,9 @@ class Cinemaot:
         g = sns.heatmap(df, annot=True, ax=ax, **kwargs)
         plt.title(title)
-        if show:
-            plt.show()
         if return_fig:
             return g
+        plt.show()
         return None

pertpy 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl

pertpy 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl