PyPI - pertpy - Versions diffs - 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

pertpy 0.6.0py3-none-any.whl → 0.7.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

pertpy/__init__.py +3 -2
pertpy/data/__init__.py +5 -1
pertpy/data/_dataloader.py +2 -4
pertpy/data/_datasets.py +203 -92
pertpy/metadata/__init__.py +4 -0
pertpy/metadata/_cell_line.py +826 -0
pertpy/metadata/_compound.py +129 -0
pertpy/metadata/_drug.py +242 -0
pertpy/metadata/_look_up.py +582 -0
pertpy/metadata/_metadata.py +73 -0
pertpy/metadata/_moa.py +129 -0
pertpy/plot/__init__.py +1 -9
pertpy/plot/_augur.py +53 -116
pertpy/plot/_coda.py +277 -677
pertpy/plot/_guide_rna.py +17 -35
pertpy/plot/_milopy.py +59 -134
pertpy/plot/_mixscape.py +152 -391
pertpy/preprocessing/_guide_rna.py +88 -4
pertpy/tools/__init__.py +8 -13
pertpy/tools/_augur.py +315 -17
pertpy/tools/_cinemaot.py +143 -4
pertpy/tools/_coda/_base_coda.py +1210 -65
pertpy/tools/_coda/_sccoda.py +50 -21
pertpy/tools/_coda/_tasccoda.py +27 -19
pertpy/tools/_dialogue.py +164 -56
pertpy/tools/_differential_gene_expression.py +240 -14
pertpy/tools/_distances/_distance_tests.py +8 -8
pertpy/tools/_distances/_distances.py +184 -34
pertpy/tools/_enrichment.py +465 -0
pertpy/tools/_milo.py +345 -11
pertpy/tools/_mixscape.py +668 -50
pertpy/tools/_perturbation_space/_clustering.py +5 -1
pertpy/tools/_perturbation_space/_discriminator_classifiers.py +526 -0
pertpy/tools/_perturbation_space/_perturbation_space.py +135 -43
pertpy/tools/_perturbation_space/_simple.py +51 -10
pertpy/tools/_scgen/__init__.py +1 -1
pertpy/tools/_scgen/_scgen.py +701 -0
pertpy/tools/_scgen/_utils.py +1 -3
pertpy/tools/decoupler_LICENSE +674 -0
{pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/METADATA +31 -12
pertpy-0.7.0.dist-info/RECORD +53 -0
{pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/WHEEL +1 -1
pertpy/plot/_cinemaot.py +0 -81
pertpy/plot/_dialogue.py +0 -91
pertpy/plot/_scgen.py +0 -337
pertpy/tools/_metadata/__init__.py +0 -0
pertpy/tools/_metadata/_cell_line.py +0 -613
pertpy/tools/_metadata/_look_up.py +0 -342
pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
pertpy/tools/_scgen/_jax_scgen.py +0 -370
pertpy-0.6.0.dist-info/RECORD +0 -50
/pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
{pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/licenses/LICENSE +0 -0

pertpy/tools/_distances/_distances.py CHANGED Viewed

@@ -3,24 +3,25 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING
+import numba
 import numpy as np
 import pandas as pd
 from ott.geometry.geometry import Geometry
 from ott.geometry.pointcloud import PointCloud
 from ott.problems.linear.linear_problem import LinearProblem
 from ott.solvers.linear.sinkhorn import Sinkhorn
+from pandas import Series
 from rich.progress import track
 from scipy.sparse import issparse
 from scipy.spatial.distance import cosine
 from scipy.special import gammaln
-from scipy.stats import kendalltau, pearsonr, spearmanr
+from scipy.stats import kendalltau, kstest, pearsonr, spearmanr
+from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import pairwise_distances, r2_score
 from sklearn.metrics.pairwise import polynomial_kernel, rbf_kernel
 from statsmodels.discrete.discrete_model import NegativeBinomialP
 if TYPE_CHECKING:
-    from collections.abc import Iterable
     from anndata import AnnData
@@ -30,6 +31,7 @@ class Distance:
     The distance metric can be specified by the user. This class also provides a
     method to compute the pairwise distances between all groups of cells.
     Currently available metrics:
     - "edistance": Energy distance (Default metric).
         In essence, it is twice the mean pairwise distance between cells of two
         groups minus the mean pairwise distance between cells within each group
@@ -55,8 +57,6 @@ class Distance:
         Coefficient of determination distance between the means of cells from two groups.
     - "mean_pairwise": Mean pairwise distance.
         Mean of the pairwise euclidean distances between cells of two groups.
-    - "mean_pairwise": Mean pairwise distance.
-        Mean of the pairwise euclidean distances between cells of two groups.
     - "mmd": Maximum mean discrepancy
         Maximum mean discrepancy between the cells of two groups.
         Here, uses linear, rbf, and quadratic polynomial MMD. For theory on MMD in single-cell applications, see
@@ -66,14 +66,20 @@ class Distance:
         OTT-JAX implementation of the Sinkhorn algorithm to compute the distance.
         For more information on the optimal transport solver, see
         `Cuturi et al. (2013) <https://proceedings.neurips.cc/paper/2013/file/af21d0c97db2e27e13572cbf59eb343d-Paper.pdf>`__.
-    - "kl_divergence": Kullback–Leibler divergence distance.
+    - "sym_kldiv": symmetrized Kullback–Leibler divergence distance.
         Kullback–Leibler divergence of the gaussian distributions between cells of two groups.
-        Here we fit a gaussian distribution over each group of cells and then calculate the KL divergence
+        Here we fit a gaussian distribution over one group of cells and then calculate the KL divergence on the other, and vice versa.
     - "t_test": t-test statistic.
         T-test statistic measure between cells of two groups.
+    - "ks_test": Kolmogorov-Smirnov test statistic.
+        Kolmogorov-Smirnov test statistic measure between cells of two groups.
     - "nb_ll": log-likelihood over negative binomial
         Average of log-likelihoods of samples of the secondary group after fitting a negative binomial distribution
         over the samples of the first group.
+    - "classifier_proba": probability of a binary classifier
+        Average of the classification probability of the perturbation for a binary classifier.
+    - "classifier_cp": classifier class projection
+        Average, over the cells of the selected group, of 1 - (predicted probability of the compared group's class).
     Attributes:
         metric: Name of distance metric.
@@ -137,12 +143,18 @@ class Distance:
             metric_fct = MMD()
         elif metric == "wasserstein":
             metric_fct = WassersteinDistance()
-        elif metric == "kl_divergence":
-            metric_fct = KLDivergence()
+        elif metric == "sym_kldiv":
+            metric_fct = SymmetricKLDivergence()
         elif metric == "t_test":
             metric_fct = TTestDistance()
+        elif metric == "ks_test":
+            metric_fct = KSTestDistance()
         elif metric == "nb_ll":
             metric_fct = NBLL()
+        elif metric == "classifier_proba":
+            metric_fct = ClassifierProbaDistance()
+        elif metric == "classifier_cp":
+            metric_fct = ClassifierClassProjection()
         else:
             raise ValueError(f"Metric {metric} not recognized.")
         self.metric_fct = metric_fct
@@ -280,7 +292,7 @@ class Distance:
         n_jobs: int = -1,
         **kwargs,
     ) -> pd.DataFrame:
-        """Get pairwise distances between groups of cells.
+        """Get distances between one selected cell group and the remaining other cell groups.
         Args:
             adata: Annotated data matrix.
@@ -301,6 +313,11 @@ class Distance:
             >>> Distance = pt.tools.Distance(metric="edistance")
             >>> pairwise_df = Distance.onesided_distances(adata, groupby="perturbation", selected_group="control")
         """
+        if self.metric == "classifier_cp":
+            return self.metric_fct.onesided_distances(  # type: ignore
+                adata, groupby, selected_group, groups, show_progressbar, n_jobs, **kwargs
+            )
         groups = adata.obs[groupby].unique() if groups is None else groups
         grouping = adata.obs[groupby].copy()
         df = pd.Series(index=groups, dtype=float)
@@ -329,7 +346,10 @@ class Distance:
                     dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
                 df.loc[group_x] = dist
         else:
-            embedding = adata.obsm[self.obsm_key].copy()
+            if self.layer_key:
+                embedding = adata.layers[self.layer_key]
+            else:
+                embedding = adata.obsm[self.obsm_key].copy()
             for group_x in fct(groups):
                 cells_x = embedding[grouping == group_x].copy()
                 group_y = selected_group
@@ -337,7 +357,7 @@ class Distance:
                     dist = 0.0
                 else:
                     cells_y = embedding[grouping == group_y].copy()
-                    dist = self.metric_fct(cells_x, cells_y, **kwargs)
+                    dist = self(cells_x, cells_y, **kwargs)
                 df.loc[group_x] = dist
         df.index.name = groupby
         df.name = f"{self.metric} to {selected_group}"
@@ -471,11 +491,8 @@ class WassersteinDistance(AbstractDistance):
         return self.solve_ot_problem(geom, **kwargs)
     def solve_ot_problem(self, geom: Geometry, **kwargs):
-        # Define a linear problem with that cost structure.
         ot_prob = LinearProblem(geom)
-        # Create a Sinkhorn solver
         solver = Sinkhorn()
-        # Solve OT problem
         ot = solver(ot_prob, **kwargs)
         return ot.reg_ot_cost.item()
@@ -502,7 +519,7 @@ class MeanSquaredDistance(AbstractDistance):
         self.accepts_precomputed = False
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 0.5
+        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1]
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.")
@@ -516,7 +533,7 @@ class MeanAbsoluteDistance(AbstractDistance):
         self.accepts_precomputed = False
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs)
+        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1]
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.")
@@ -614,11 +631,12 @@ class R2ScoreDistance(AbstractDistance):
         raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.")
-class KLDivergence(AbstractDistance):
-    """Average of KL divergence between gene distributions of two groups
+class SymmetricKLDivergence(AbstractDistance):
+    """Average of symmetric KL divergence between gene distributions of two groups
     Assuming a Gaussian distribution for each gene in each group, calculates
-    the KL divergence between them and averages over all genes
+    the KL divergence between them in both directions (X to Y and Y to X), sums the two, and averages over all genes to get a symmetrized distance.
+    See https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence#Symmetrised_divergence.
     """
@@ -632,11 +650,12 @@ class KLDivergence(AbstractDistance):
             x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon
             y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon
             kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2
-            kl_all.append(kl)
+            klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2
+            kl_all.append(kl + klr)
         return sum(kl_all) / len(kl_all)
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("KLDivergence cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.")
 class TTestDistance(AbstractDistance):
@@ -663,6 +682,23 @@ class TTestDistance(AbstractDistance):
         raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.")
+class KSTestDistance(AbstractDistance):
+    """Average of two-sided KS test statistic between two groups"""
+    def __init__(self) -> None:
+        super().__init__()
+        self.accepts_precomputed = False
+    def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        stats = []
+        for i in range(X.shape[1]):
+            stats.append(abs(kstest(X[:, i], Y[:, i])[0]))
+        return sum(stats) / len(stats)
+    def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
+        raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.")
 class NBLL(AbstractDistance):
     """
     Average of Log likelihood (scalar) of group B cells
@@ -683,16 +719,12 @@ class NBLL(AbstractDistance):
         if not _is_count_matrix(matrix=X) or not _is_count_matrix(matrix=Y):
             raise ValueError("NBLL distance only works for raw counts.")
-        nlls = []
-        for i in range(X.shape[1]):
-            x, y = X[:, i], Y[:, i]
-            nb_params = NegativeBinomialP(x, np.ones_like(x)).fit(disp=False).params
-            mu = np.repeat(np.exp(nb_params[0]), y.shape[0])
-            theta = np.repeat(1 / nb_params[1], y.shape[0])
-            if mu[0] == np.nan or theta[0] == np.nan:
-                raise ValueError("Could not fit a negative binomial distribution to the input data")
-            # calculate the nll of y
-            eps = np.repeat(epsilon, y.shape[0])
+        @numba.jit(forceobj=True)
+        def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float:
+            mu = np.exp(nb_params[0])
+            theta = 1 / nb_params[1]
+            eps = epsilon
             log_theta_mu_eps = np.log(theta + mu + eps)
             nll = (
                 theta * (np.log(theta + eps) - log_theta_mu_eps)
@@ -701,9 +733,127 @@ class NBLL(AbstractDistance):
                 - gammaln(theta)
                 - gammaln(y + 1)
             )
-            nlls.append(nll.mean())
+            return nll.mean()
+        def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float:
+            try:
+                nb_params = NegativeBinomialP(x, np.ones_like(x)).fit(disp=False).params
+                return _compute_nll(y, nb_params, epsilon)
+            except np.linalg.linalg.LinAlgError:
+                if x.mean() < 10 and y.mean() < 10:
+                    return 0.0
+                else:
+                    return np.nan  # Use NaN to indicate skipped genes
+        nlls = []
+        genes_skipped = 0
+        for i in range(X.shape[1]):
+            nll = _process_gene(X[:, i], Y[:, i], epsilon)
+            if np.isnan(nll):
+                genes_skipped += 1
+            else:
+                nlls.append(nll)
-        return -sum(nlls) / len(nlls)
+        if genes_skipped > X.shape[1] / 2:
+            raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.")
+        return -np.sum(nlls) / len(nlls)
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.")
+def _sample(X, frac=None, n=None):
+    """Returns subsample of cells in format (train, test)."""
+    if frac and n:
+        raise ValueError("Cannot pass both frac and n.")
+    if frac:
+        n_cells = max(1, int(X.shape[0] * frac))
+    elif n:
+        n_cells = n
+    else:
+        raise ValueError("Must pass either `frac` or `n`.")
+    rng = np.random.default_rng()
+    sampled_indices = rng.choice(X.shape[0], n_cells, replace=False)
+    remaining_indices = np.setdiff1d(np.arange(X.shape[0]), sampled_indices)
+    return X[remaining_indices, :], X[sampled_indices, :]
+class ClassifierProbaDistance(AbstractDistance):
+    """Average of classification probabilities of a binary classifier.
+    Assumes the first condition is control and the second is perturbed.
+    Always holds out 20% of the perturbed condition.
+    """
+    def __init__(self) -> None:
+        super().__init__()
+        self.accepts_precomputed = False
+    def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        Y_train, Y_test = _sample(Y, frac=0.2)
+        label = ["c"] * X.shape[0] + ["p"] * Y_train.shape[0]
+        train = np.concatenate([X, Y_train])
+        reg = LogisticRegression()
+        reg.fit(train, label)
+        test_labels = reg.predict_proba(Y_test)
+        return np.mean(test_labels[:, 1])
+    def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
+        raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.")
+class ClassifierClassProjection(AbstractDistance):
+    """Average of 1-(classification probability of control).
+    Warning: unlike all other distances, this must also take a list of categorical labels the same length as X.
+    """
+    def __init__(self) -> None:
+        super().__init__()
+        self.accepts_precomputed = False
+    def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.")
+    def onesided_distances(
+        self,
+        adata: AnnData,
+        groupby: str,
+        selected_group: str | None = None,
+        groups: list[str] | None = None,
+        show_progressbar: bool = True,
+        n_jobs: int = -1,
+        **kwargs,
+    ) -> Series:
+        """Unlike the parent function, all groups except the selected group are factored into the classifier.
+        Similar to the parent function, the returned Series contains only the specified groups.
+        """
+        groups = adata.obs[groupby].unique() if groups is None else groups
+        X = adata[adata.obs[groupby] != selected_group].X
+        labels = adata[adata.obs[groupby] != selected_group].obs[groupby].values
+        Y = adata[adata.obs[groupby] == selected_group].X
+        reg = LogisticRegression()
+        reg.fit(X, labels)
+        test_probas = reg.predict_proba(Y)
+        df = pd.Series(index=groups, dtype=float)
+        for group in groups:
+            if group == selected_group:
+                df.loc[group] = 0
+            else:
+                class_idx = list(reg.classes_).index(group)
+                df.loc[group] = 1 - np.mean(test_probas[:, class_idx])
+        df.index.name = groupby
+        df.name = f"classifier_cp to {selected_group}"
+        return df
+    def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
+        raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.")

pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

pertpy 0.6.0py3-none-any.whl → 0.7.0py3-none-any.whl