pertpy 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pertpy/__init__.py +4 -2
- pertpy/data/__init__.py +66 -1
- pertpy/data/_dataloader.py +28 -26
- pertpy/data/_datasets.py +261 -92
- pertpy/metadata/__init__.py +6 -0
- pertpy/metadata/_cell_line.py +795 -0
- pertpy/metadata/_compound.py +128 -0
- pertpy/metadata/_drug.py +238 -0
- pertpy/metadata/_look_up.py +569 -0
- pertpy/metadata/_metadata.py +70 -0
- pertpy/metadata/_moa.py +125 -0
- pertpy/plot/__init__.py +0 -13
- pertpy/preprocessing/__init__.py +2 -0
- pertpy/preprocessing/_guide_rna.py +89 -6
- pertpy/tools/__init__.py +48 -15
- pertpy/tools/_augur.py +329 -32
- pertpy/tools/_cinemaot.py +145 -6
- pertpy/tools/_coda/_base_coda.py +1237 -116
- pertpy/tools/_coda/_sccoda.py +66 -36
- pertpy/tools/_coda/_tasccoda.py +46 -39
- pertpy/tools/_dialogue.py +180 -77
- pertpy/tools/_differential_gene_expression/__init__.py +20 -0
- pertpy/tools/_differential_gene_expression/_base.py +657 -0
- pertpy/tools/_differential_gene_expression/_checks.py +41 -0
- pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
- pertpy/tools/_differential_gene_expression/_edger.py +125 -0
- pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
- pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
- pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
- pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
- pertpy/tools/_distances/_distance_tests.py +29 -24
- pertpy/tools/_distances/_distances.py +584 -98
- pertpy/tools/_enrichment.py +460 -0
- pertpy/tools/_kernel_pca.py +1 -1
- pertpy/tools/_milo.py +406 -49
- pertpy/tools/_mixscape.py +677 -55
- pertpy/tools/_perturbation_space/_clustering.py +10 -3
- pertpy/tools/_perturbation_space/_comparison.py +112 -0
- pertpy/tools/_perturbation_space/_discriminator_classifiers.py +524 -0
- pertpy/tools/_perturbation_space/_perturbation_space.py +146 -52
- pertpy/tools/_perturbation_space/_simple.py +52 -11
- pertpy/tools/_scgen/__init__.py +1 -1
- pertpy/tools/_scgen/_base_components.py +2 -3
- pertpy/tools/_scgen/_scgen.py +706 -0
- pertpy/tools/_scgen/_utils.py +3 -5
- pertpy/tools/decoupler_LICENSE +674 -0
- {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/METADATA +48 -20
- pertpy-0.8.0.dist-info/RECORD +57 -0
- {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/WHEEL +1 -1
- pertpy/plot/_augur.py +0 -234
- pertpy/plot/_cinemaot.py +0 -81
- pertpy/plot/_coda.py +0 -1001
- pertpy/plot/_dialogue.py +0 -91
- pertpy/plot/_guide_rna.py +0 -82
- pertpy/plot/_milopy.py +0 -284
- pertpy/plot/_mixscape.py +0 -594
- pertpy/plot/_scgen.py +0 -337
- pertpy/tools/_differential_gene_expression.py +0 -99
- pertpy/tools/_metadata/__init__.py +0 -0
- pertpy/tools/_metadata/_cell_line.py +0 -613
- pertpy/tools/_metadata/_look_up.py +0 -342
- pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
- pertpy/tools/_scgen/_jax_scgen.py +0 -370
- pertpy-0.6.0.dist-info/RECORD +0 -50
- /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
- {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/licenses/LICENSE +0 -0
pertpy/tools/_distances/_distances.py

@@ -1,35 +1,46 @@
 from __future__ import annotations
 
+import multiprocessing
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal, NamedTuple
 
+import numba
 import numpy as np
 import pandas as pd
 from ott.geometry.geometry import Geometry
 from ott.geometry.pointcloud import PointCloud
 from ott.problems.linear.linear_problem import LinearProblem
 from ott.solvers.linear.sinkhorn import Sinkhorn
+from pandas import Series
 from rich.progress import track
 from scipy.sparse import issparse
-from scipy.spatial.distance import cosine
+from scipy.spatial.distance import cosine, mahalanobis
 from scipy.special import gammaln
-from scipy.stats import kendalltau, pearsonr, spearmanr
+from scipy.stats import kendalltau, kstest, pearsonr, spearmanr
+from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import pairwise_distances, r2_score
 from sklearn.metrics.pairwise import polynomial_kernel, rbf_kernel
+from sklearn.neighbors import KernelDensity
 from statsmodels.discrete.discrete_model import NegativeBinomialP
 
 if TYPE_CHECKING:
-    from collections.abc import
+    from collections.abc import Callable
 
     from anndata import AnnData
 
 
+class MeanVar(NamedTuple):
+    mean: float
+    variance: float
+
+
 class Distance:
     """Distance class, used to compute distances between groups of cells.
 
     The distance metric can be specified by the user. This class also provides a
     method to compute the pairwise distances between all groups of cells.
     Currently available metrics:
+
     - "edistance": Energy distance (Default metric).
         In essence, it is twice the mean pairwise distance between cells of two
        groups minus the mean pairwise distance between cells within each group
@@ -55,8 +66,6 @@ class Distance:
         Coefficient of determination distance between the means of cells from two groups.
     - "mean_pairwise": Mean pairwise distance.
         Mean of the pairwise euclidean distances between cells of two groups.
-    - "mean_pairwise": Mean pairwise distance.
-        Mean of the pairwise euclidean distances between cells of two groups.
     - "mmd": Maximum mean discrepancy
         Maximum mean discrepancy between the cells of two groups.
         Here, uses linear, rbf, and quadratic polynomial MMD. For theory on MMD in single-cell applications, see
@@ -66,14 +75,25 @@ class Distance:
         OTT-JAX implementation of the Sinkhorn algorithm to compute the distance.
         For more information on the optimal transport solver, see
         `Cuturi et al. (2013) <https://proceedings.neurips.cc/paper/2013/file/af21d0c97db2e27e13572cbf59eb343d-Paper.pdf>`__.
-    - "kl_divergence": Kullback–Leibler divergence distance.
+    - "sym_kldiv": symmetrized Kullback–Leibler divergence distance.
         Kullback–Leibler divergence of the gaussian distributions between cells of two groups.
-        Here we fit a gaussian distribution over
+        Here we fit a gaussian distribution over one group of cells and then calculate the KL divergence on the other, and vice versa.
     - "t_test": t-test statistic.
         T-test statistic measure between cells of two groups.
+    - "ks_test": Kolmogorov-Smirnov test statistic.
+        Kolmogorov-Smirnov test statistic measure between cells of two groups.
     - "nb_ll": log-likelihood over negative binomial
         Average of log-likelihoods of samples of the secondary group after fitting a negative binomial distribution
         over the samples of the first group.
+    - "classifier_proba": probability of a binary classifier
+        Average of the classification probability of the perturbation for a binary classifier.
+    - "classifier_cp": classifier class projection
+        Average of the class
+    - "mean_var_distribution": Distance between mean-variance distributions between cells of 2 groups.
+        Mean square distance between the mean-variance distributions of cells from 2 groups using Kernel Density Estimation (KDE).
+    - "mahalanobis": Mahalanobis distance between the means of cells from two groups.
+        It is originally used to measure distance between a point and a distribution.
+        In this context, it quantifies the difference between the mean profiles of a target group and a reference group.
 
     Attributes:
         metric: Name of distance metric.
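To make the expanded metric list concrete, here is a minimal usage sketch of selecting metrics by name; it reuses the `distance_example` dataset and `X_pca` embedding that the docstring examples in this diff rely on.

```python
import pertpy as pt

adata = pt.dt.distance_example()

# Each metric name dispatches to one of the AbstractDistance subclasses below
for metric in ["edistance", "sym_kldiv", "ks_test", "mahalanobis"]:
    distance = pt.tools.Distance(metric=metric, obsm_key="X_pca")
    X = adata.obsm["X_pca"][adata.obs["perturbation"] == "p-sgCREB1-2"]
    Y = adata.obsm["X_pca"][adata.obs["perturbation"] == "control"]
    print(metric, distance(X, Y))
```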
@@ -93,6 +113,7 @@ class Distance:
     def __init__(
         self,
         metric: str = "edistance",
+        agg_fct: Callable = np.mean,
         layer_key: str = None,
         obsm_key: str = None,
         cell_wise_metric: str = "euclidean",
@@ -100,57 +121,67 @@ class Distance:
         """Initialize Distance class.
 
         Args:
-            metric: Distance metric to use.
+            metric: Distance metric to use.
+            agg_fct: Aggregation function to generate pseudobulk vectors.
             layer_key: Name of the counts layer containing raw counts to calculate distances for.
                 Mutually exclusive with 'obsm_key'.
-
+                Is not used if `None`.
             obsm_key: Name of embedding in adata.obsm to use.
-                Mutually exclusive with '
-                Defaults to None, but is set to "X_pca" if not set
+                Mutually exclusive with 'layer_key'.
+                Defaults to None, but is set to "X_pca" if not explicitly set internally.
             cell_wise_metric: Metric from scipy.spatial.distance to use for pairwise distances between single cells.
-                Defaults to "euclidean".
         """
         metric_fct: AbstractDistance = None
+        self.aggregation_func = agg_fct
         if metric == "edistance":
             metric_fct = Edistance()
         elif metric == "euclidean":
-            metric_fct = EuclideanDistance()
+            metric_fct = EuclideanDistance(self.aggregation_func)
         elif metric == "root_mean_squared_error":
-            metric_fct = EuclideanDistance()
+            metric_fct = EuclideanDistance(self.aggregation_func)
         elif metric == "mse":
-            metric_fct = MeanSquaredDistance()
+            metric_fct = MeanSquaredDistance(self.aggregation_func)
         elif metric == "mean_absolute_error":
-            metric_fct = MeanAbsoluteDistance()
+            metric_fct = MeanAbsoluteDistance(self.aggregation_func)
         elif metric == "pearson_distance":
-            metric_fct = PearsonDistance()
+            metric_fct = PearsonDistance(self.aggregation_func)
         elif metric == "spearman_distance":
-            metric_fct = SpearmanDistance()
+            metric_fct = SpearmanDistance(self.aggregation_func)
         elif metric == "kendalltau_distance":
-            metric_fct = KendallTauDistance()
+            metric_fct = KendallTauDistance(self.aggregation_func)
         elif metric == "cosine_distance":
-            metric_fct = CosineDistance()
+            metric_fct = CosineDistance(self.aggregation_func)
         elif metric == "r2_distance":
-            metric_fct = R2ScoreDistance()
+            metric_fct = R2ScoreDistance(self.aggregation_func)
         elif metric == "mean_pairwise":
             metric_fct = MeanPairwiseDistance()
         elif metric == "mmd":
             metric_fct = MMD()
         elif metric == "wasserstein":
             metric_fct = WassersteinDistance()
-        elif metric == "kl_divergence":
-            metric_fct = KLDivergence()
+        elif metric == "sym_kldiv":
+            metric_fct = SymmetricKLDivergence()
         elif metric == "t_test":
             metric_fct = TTestDistance()
+        elif metric == "ks_test":
+            metric_fct = KSTestDistance()
         elif metric == "nb_ll":
             metric_fct = NBLL()
+        elif metric == "classifier_proba":
+            metric_fct = ClassifierProbaDistance()
+        elif metric == "classifier_cp":
+            metric_fct = ClassifierClassProjection()
+        elif metric == "mean_var_distribution":
+            metric_fct = MeanVarDistributionDistance()
+        elif metric == "mahalanobis":
+            metric_fct = MahalanobisDistance(self.aggregation_func)
         else:
             raise ValueError(f"Metric {metric} not recognized.")
         self.metric_fct = metric_fct
 
         if layer_key and obsm_key:
             raise ValueError(
-                "Cannot use 'layer_key' and 'obsm_key' at the same time.\n"
-                "Please provide only one of the two keys."
+                "Cannot use 'layer_key' and 'obsm_key' at the same time.\n" "Please provide only one of the two keys."
             )
         if not layer_key and not obsm_key:
             obsm_key = "X_pca"
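The new `agg_fct` argument controls how each group is collapsed into a pseudobulk vector before the pseudobulk metrics above compare them; a minimal sketch with synthetic data:

```python
import numpy as np
import pertpy as pt

rng = np.random.default_rng(0)
X = rng.normal(loc=0.0, size=(100, 10))
Y = rng.normal(loc=1.0, size=(100, 10))

# Default: compare group means; alternative: compare group medians
d_mean = pt.tools.Distance(metric="euclidean")
d_median = pt.tools.Distance(metric="euclidean", agg_fct=np.median)
print(d_mean(X, Y), d_median(X, Y))
```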
@@ -183,37 +214,80 @@ class Distance:
         >>> D = Distance(X, Y)
         """
         if issparse(X):
-            X = X.A
+            X = X.toarray()
         if issparse(Y):
-            Y = Y.A
+            Y = Y.toarray()
 
         if len(X) == 0 or len(Y) == 0:
             raise ValueError("Neither X nor Y can be empty.")
 
         return self.metric_fct(X, Y, **kwargs)
 
+    def bootstrap(
+        self,
+        X: np.ndarray,
+        Y: np.ndarray,
+        *,
+        n_bootstrap: int = 100,
+        random_state: int = 0,
+        **kwargs,
+    ) -> MeanVar:
+        """Bootstrap computation of mean and variance of the distance between vectors X and Y.
+
+        Args:
+            X: First vector of shape (n_samples, n_features).
+            Y: Second vector of shape (n_samples, n_features).
+            n_bootstrap: Number of bootstrap samples.
+            random_state: Random state for bootstrapping.
+
+        Returns:
+            MeanVar: Mean and variance of distance between X and Y.
+
+        Examples:
+            >>> import pertpy as pt
+            >>> adata = pt.dt.distance_example()
+            >>> Distance = pt.tools.Distance(metric="edistance")
+            >>> X = adata.obsm["X_pca"][adata.obs["perturbation"] == "p-sgCREB1-2"]
+            >>> Y = adata.obsm["X_pca"][adata.obs["perturbation"] == "control"]
+            >>> D = Distance.bootstrap(X, Y)
+        """
+        return self._bootstrap_mode(
+            X,
+            Y,
+            n_bootstraps=n_bootstrap,
+            random_state=random_state,
+            **kwargs,
+        )
+
     def pairwise(
         self,
         adata: AnnData,
         groupby: str,
         groups: list[str] | None = None,
+        bootstrap: bool = False,
+        n_bootstrap: int = 100,
+        random_state: int = 0,
         show_progressbar: bool = True,
         n_jobs: int = -1,
         **kwargs,
-    ) -> pd.DataFrame:
+    ) -> pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]:
         """Get pairwise distances between groups of cells.
 
         Args:
             adata: Annotated data matrix.
             groupby: Column name in adata.obs.
             groups: List of groups to compute pairwise distances for.
-                If None, uses all groups.
-
+                If None, uses all groups.
+            bootstrap: Whether to bootstrap the distance.
+            n_bootstrap: Number of bootstrap samples.
+            random_state: Random state for bootstrapping.
+            show_progressbar: Whether to show progress bar.
             n_jobs: Number of cores to use. Defaults to -1 (all).
             kwargs: Additional keyword arguments passed to the metric function.
 
         Returns:
             pd.DataFrame: Dataframe with pairwise distances.
+            tuple[pd.DataFrame, pd.DataFrame]: Two Dataframes, one for the mean and one for the variance of pairwise distances.
 
         Examples:
             >>> import pertpy as pt
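`bootstrap` returns the `MeanVar` named tuple introduced at the top of the file, so results support both attribute access and tuple unpacking; this sketch mirrors the docstring example:

```python
import pertpy as pt

adata = pt.dt.distance_example()
distance = pt.tools.Distance(metric="edistance")

X = adata.obsm["X_pca"][adata.obs["perturbation"] == "p-sgCREB1-2"]
Y = adata.obsm["X_pca"][adata.obs["perturbation"] == "control"]

result = distance.bootstrap(X, Y, n_bootstrap=100, random_state=0)
mean, variance = result              # NamedTuple unpacking
print(result.mean, result.variance)  # or attribute access
```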
@@ -224,6 +298,8 @@ class Distance:
         groups = adata.obs[groupby].unique() if groups is None else groups
         grouping = adata.obs[groupby].copy()
         df = pd.DataFrame(index=groups, columns=groups, dtype=float)
+        if bootstrap:
+            df_var = pd.DataFrame(index=groups, columns=groups, dtype=float)
         fct = track if show_progressbar else lambda iterable: iterable
 
         # Some metrics are able to handle precomputed distances. This means that
@@ -239,16 +315,29 @@ class Distance:
             for index_x, group_x in enumerate(fct(groups)):
                 idx_x = grouping == group_x
                 for group_y in groups[index_x:]:  # type: ignore
-
-
+                    # subset the pairwise distance matrix to the two groups
+                    idx_y = grouping == group_y
+                    sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y]
+                    sub_idx = grouping[idx_x | idx_y] == group_x
+                    if not bootstrap:
+                        if group_x == group_y:
+                            dist = 0.0
+                        else:
+                            dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
+                        df.loc[group_x, group_y] = dist
+                        df.loc[group_y, group_x] = dist
+
                     else:
-
-
-
-
-
-
-
+                        bootstrap_output = self._bootstrap_mode_precomputed(
+                            sub_pwd,
+                            sub_idx,
+                            n_bootstraps=n_bootstrap,
+                            random_state=random_state,
+                            **kwargs,
+                        )
+                        # In the bootstrap case, distance of group to itself is a mean and can be non-zero
+                        df.loc[group_x, group_y] = df.loc[group_y, group_x] = bootstrap_output.mean
+                        df_var.loc[group_x, group_y] = df_var.loc[group_y, group_x] = bootstrap_output.variance
         else:
             if self.layer_key:
                 embedding = adata.layers[self.layer_key]
@@ -257,18 +346,39 @@ class Distance:
             for index_x, group_x in enumerate(fct(groups)):
                 cells_x = embedding[grouping == group_x].copy()
                 for group_y in groups[index_x:]:  # type: ignore
-
-
+                    cells_y = embedding[grouping == group_y].copy()
+                    if not bootstrap:
+                        # By distance axiom, the distance between a group and itself is 0
+                        dist = 0.0 if group_x == group_y else self(cells_x, cells_y, **kwargs)
+
+                        df.loc[group_x, group_y] = dist
+                        df.loc[group_y, group_x] = dist
                    else:
-
-
-
-
+                        bootstrap_output = self.bootstrap(
+                            cells_x,
+                            cells_y,
+                            n_bootstrap=n_bootstrap,
+                            random_state=random_state,
+                            **kwargs,
+                        )
+                        # In the bootstrap case, distance of group to itself is a mean and can be non-zero
+                        df.loc[group_x, group_y] = df.loc[group_y, group_x] = bootstrap_output.mean
+                        df_var.loc[group_x, group_y] = df_var.loc[group_y, group_x] = bootstrap_output.variance
+
         df.index.name = groupby
         df.columns.name = groupby
         df.name = f"pairwise {self.metric}"
 
-        return df
+        if not bootstrap:
+            return df
+        else:
+            df = df.fillna(0)
+            df_var.index.name = groupby
+            df_var.columns.name = groupby
+            df_var = df_var.fillna(0)
+            df_var.name = f"pairwise {self.metric} variance"
+
+            return df, df_var
 
     def onesided_distances(
         self,
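With `bootstrap=True`, `pairwise` switches from a single symmetric DataFrame to a `(mean, variance)` pair of DataFrames, and the diagonal of the mean matrix may be non-zero because it is itself a bootstrap mean; a minimal sketch:

```python
import pertpy as pt

adata = pt.dt.distance_example()
distance = pt.tools.Distance(metric="edistance")

# Point estimates: one symmetric DataFrame with a zero diagonal
df = distance.pairwise(adata, groupby="perturbation")

# Bootstrapped: mean and variance DataFrames
df_mean, df_var = distance.pairwise(
    adata, groupby="perturbation", bootstrap=True, n_bootstrap=50, random_state=0
)
```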
@@ -276,24 +386,32 @@ class Distance:
         groupby: str,
         selected_group: str | None = None,
         groups: list[str] | None = None,
+        bootstrap: bool = False,
+        n_bootstrap: int = 100,
+        random_state: int = 0,
         show_progressbar: bool = True,
         n_jobs: int = -1,
         **kwargs,
-    ) -> pd.DataFrame:
-        """Get
+    ) -> pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]:
+        """Get distances between one selected cell group and the remaining other cell groups.
 
         Args:
             adata: Annotated data matrix.
             groupby: Column name in adata.obs.
             selected_group: Group to compute pairwise distances to all other.
             groups: List of groups to compute distances to selected_group for.
-                If None, uses all groups.
-
+                If None, uses all groups.
+            bootstrap: Whether to bootstrap the distance.
+            n_bootstrap: Number of bootstrap samples.
+            random_state: Random state for bootstrapping.
+            show_progressbar: Whether to show progress bar.
             n_jobs: Number of cores to use. Defaults to -1 (all).
             kwargs: Additional keyword arguments passed to the metric function.
 
         Returns:
             pd.DataFrame: Dataframe with distances of groups to selected_group.
+            tuple[pd.DataFrame, pd.DataFrame]: Two Dataframes, one for the mean and one for the variance of distances of groups to selected_group.
+
 
         Examples:
             >>> import pertpy as pt
@@ -301,16 +419,31 @@ class Distance:
         >>> Distance = pt.tools.Distance(metric="edistance")
         >>> pairwise_df = Distance.onesided_distances(adata, groupby="perturbation", selected_group="control")
         """
+        if self.metric == "classifier_cp":
+            if bootstrap:
+                raise NotImplementedError("Currently, ClassifierClassProjection does not support bootstrapping.")
+            return self.metric_fct.onesided_distances(  # type: ignore
+                adata,
+                groupby,
+                selected_group,
+                groups,
+                show_progressbar,
+                n_jobs,
+                **kwargs,
+            )
+
         groups = adata.obs[groupby].unique() if groups is None else groups
         grouping = adata.obs[groupby].copy()
         df = pd.Series(index=groups, dtype=float)
+        if bootstrap:
+            df_var = pd.Series(index=groups, dtype=float)
         fct = track if show_progressbar else lambda iterable: iterable
 
         # Some metrics are able to handle precomputed distances. This means that
         # the pairwise distances between all cells are computed once and then
         # passed to the metric function. This is much faster than computing the
         # pairwise distances for each group separately. Other metrics are not
-        # able to handle precomputed distances such as the
+        # able to handle precomputed distances such as the PseudobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
             if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
@@ -320,28 +453,59 @@ class Distance:
                 idx_x = grouping == group_x
                 group_y = selected_group
                 if group_x == group_y:
-
+                    df.loc[group_x] = 0.0  # by distance axiom
                 else:
                     idx_y = grouping == group_y
                     # subset the pairwise distance matrix to the two groups
                     sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y]
                     sub_idx = grouping[idx_x | idx_y] == group_x
-
-
+                    if not bootstrap:
+                        dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
+                        df.loc[group_x] = dist
+                    else:
+                        bootstrap_output = self._bootstrap_mode_precomputed(
+                            sub_pwd,
+                            sub_idx,
+                            n_bootstraps=n_bootstrap,
+                            random_state=random_state,
+                            **kwargs,
+                        )
+                        df.loc[group_x] = bootstrap_output.mean
+                        df_var.loc[group_x] = bootstrap_output.variance
         else:
-
+            if self.layer_key:
+                embedding = adata.layers[self.layer_key]
+            else:
+                embedding = adata.obsm[self.obsm_key].copy()
             for group_x in fct(groups):
                 cells_x = embedding[grouping == group_x].copy()
                 group_y = selected_group
-
-
+                cells_y = embedding[grouping == group_y].copy()
+                if not bootstrap:
+                    # By distance axiom, the distance between a group and itself is 0
+                    dist = 0.0 if group_x == group_y else self(cells_x, cells_y, **kwargs)
+                    df.loc[group_x] = dist
                 else:
-
-
-
+                    bootstrap_output = self.bootstrap(
+                        cells_x,
+                        cells_y,
+                        n_bootstrap=n_bootstrap,
+                        random_state=random_state,
+                        **kwargs,
+                    )
+                    # In the bootstrap case, distance of group to itself is a mean and can be non-zero
+                    df.loc[group_x] = bootstrap_output.mean
+                    df_var.loc[group_x] = bootstrap_output.variance
         df.index.name = groupby
         df.name = f"{self.metric} to {selected_group}"
-        return df
+        if not bootstrap:
+            return df
+        else:
+            df_var.index.name = groupby
+            df_var = df_var.fillna(0)
+            df_var.name = f"pairwise {self.metric} variance to {selected_group}"
+
+            return df, df_var
 
     def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None:
         """Precompute pairwise distances between all cells, writes to adata.obsp.
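The one-sided variant gains the same bootstrap switch, plus a special case: for `classifier_cp` it delegates to `ClassifierClassProjection.onesided_distances` (defined further down) and refuses `bootstrap=True`; a minimal sketch:

```python
import pertpy as pt

adata = pt.dt.distance_example()

# Distance of every group to the control group
distance = pt.tools.Distance(metric="edistance")
df = distance.onesided_distances(adata, groupby="perturbation", selected_group="control")

# classifier_cp is routed to the classifier's own onesided implementation;
# requesting bootstrap=True here raises NotImplementedError
ccp = pt.tools.Distance(metric="classifier_cp")
df_ccp = ccp.onesided_distances(adata, groupby="perturbation", selected_group="control")
```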
@@ -367,6 +531,77 @@ class Distance:
         pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs)
         adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd
 
+    def compare_distance(
+        self,
+        pert: np.ndarray,
+        pred: np.ndarray,
+        ctrl: np.ndarray,
+        mode: Literal["simple", "scaled"] = "simple",
+        fit_to_pert_and_ctrl: bool = False,
+        **kwargs,
+    ) -> float:
+        """Compute the score of simulating a perturbation.
+
+        Args:
+            pert: Real perturbed data.
+            pred: Simulated perturbed data.
+            ctrl: Control data.
+            mode: Mode to use.
+            fit_to_pert_and_ctrl: Scales data based on both `pert` and `ctrl` if True, otherwise only on `ctrl`.
+            kwargs: Additional keyword arguments passed to the metric function.
+        """
+        if mode == "simple":
+            pass  # nothing to be done
+        elif mode == "scaled":
+            from sklearn.preprocessing import MinMaxScaler
+
+            scaler = MinMaxScaler().fit(np.vstack((pert, ctrl)) if fit_to_pert_and_ctrl else ctrl)
+            pred = scaler.transform(pred)
+            pert = scaler.transform(pert)
+        else:
+            raise ValueError(f"Unknown mode {mode}. Please choose simple or scaled.")
+
+        d1 = self.metric_fct(pert, pred, **kwargs)
+        d2 = self.metric_fct(ctrl, pred, **kwargs)
+        return d1 / d2
+
+    def _bootstrap_mode(self, X, Y, n_bootstraps=100, random_state=0, **kwargs) -> MeanVar:
+        rng = np.random.default_rng(random_state)
+
+        distances = []
+        for _ in range(n_bootstraps):
+            X_bootstrapped = X[rng.choice(a=X.shape[0], size=X.shape[0], replace=True)]
+            Y_bootstrapped = Y[rng.choice(a=Y.shape[0], size=X.shape[0], replace=True)]
+
+            distance = self(X_bootstrapped, Y_bootstrapped, **kwargs)
+            distances.append(distance)
+
+        mean = np.mean(distances)
+        variance = np.var(distances)
+        return MeanVar(mean=mean, variance=variance)
+
+    def _bootstrap_mode_precomputed(self, sub_pwd, sub_idx, n_bootstraps=100, random_state=0, **kwargs) -> MeanVar:
+        rng = np.random.default_rng(random_state)
+
+        distances = []
+        for _ in range(n_bootstraps):
+            # To maintain the number of cells for both groups (whatever balancing they may have),
+            # we sample the positive and negative indices separately
+            bootstrap_pos_idx = rng.choice(a=sub_idx[sub_idx].index, size=sub_idx[sub_idx].size, replace=True)
+            bootstrap_neg_idx = rng.choice(a=sub_idx[~sub_idx].index, size=sub_idx[~sub_idx].size, replace=True)
+            bootstrap_idx = np.concatenate([bootstrap_pos_idx, bootstrap_neg_idx])
+            bootstrap_idx_nrs = sub_idx.index.get_indexer(bootstrap_idx)
+
+            bootstrap_sub_idx = sub_idx[bootstrap_idx]
+            bootstrap_sub_pwd = sub_pwd[bootstrap_idx_nrs, :][:, bootstrap_idx_nrs]
+
+            distance = self.metric_fct.from_precomputed(bootstrap_sub_pwd, bootstrap_sub_idx, **kwargs)
+            distances.append(distance)
+
+        mean = np.mean(distances)
+        variance = np.var(distances)
+        return MeanVar(mean=mean, variance=variance)
+
 
 class AbstractDistance(ABC):
     """Abstract class of distance metrics between two sets of vectors."""
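`compare_distance` scores a simulated perturbation as the ratio `d(pert, pred) / d(ctrl, pred)`, so values below 1 indicate the prediction lands closer to the real perturbed cells than to control; a minimal synthetic sketch:

```python
import numpy as np
import pertpy as pt

rng = np.random.default_rng(0)
ctrl = rng.normal(loc=0.0, size=(200, 20))  # control cells
pert = rng.normal(loc=2.0, size=(200, 20))  # real perturbed cells
pred = rng.normal(loc=1.8, size=(200, 20))  # simulated perturbation, near pert

distance = pt.tools.Distance(metric="euclidean")
score = distance.compare_distance(pert, pred, ctrl, mode="simple")
print(score)  # well below 1: pred is much closer to pert than to ctrl
```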
@@ -471,11 +706,8 @@ class WassersteinDistance(AbstractDistance):
         return self.solve_ot_problem(geom, **kwargs)
 
     def solve_ot_problem(self, geom: Geometry, **kwargs):
-        # Define a linear problem with that cost structure.
         ot_prob = LinearProblem(geom)
-        # Create a Sinkhorn solver
         solver = Sinkhorn()
-        # Solve OT problem
         ot = solver(ot_prob, **kwargs)
         return ot.reg_ot_cost.item()
@@ -483,12 +715,17 @@ class WassersteinDistance(AbstractDistance):
 class EuclideanDistance(AbstractDistance):
     """Euclidean distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs)
+        return np.linalg.norm(
+            self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0),
+            ord=2,
+            **kwargs,
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.")
@@ -497,12 +734,21 @@ class EuclideanDistance(AbstractDistance):
 class MeanSquaredDistance(AbstractDistance):
     """Mean squared distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1]
+        return (
+            np.linalg.norm(
+                self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0),
+                ord=2,
+                **kwargs,
+            )
+            ** 2
+            / X.shape[1]
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.")
@@ -511,12 +757,20 @@ class MeanSquaredDistance(AbstractDistance):
 class MeanAbsoluteDistance(AbstractDistance):
     """Absolute (Norm-1) distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1]
+        return (
+            np.linalg.norm(
+                self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0),
+                ord=1,
+                **kwargs,
+            )
+            / X.shape[1]
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.")
@@ -541,12 +795,13 @@ class MeanPairwiseDistance(AbstractDistance):
 class PearsonDistance(AbstractDistance):
     """Pearson distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return 1 - pearsonr(X.mean(axis=0), Y.mean(axis=0))[0]
+        return 1 - pearsonr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.")
@@ -555,12 +810,13 @@ class PearsonDistance(AbstractDistance):
 class SpearmanDistance(AbstractDistance):
     """Spearman distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return 1 - spearmanr(X.mean(axis=0), Y.mean(axis=0))[0]
+        return 1 - spearmanr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.")
@@ -569,12 +825,13 @@ class SpearmanDistance(AbstractDistance):
 class KendallTauDistance(AbstractDistance):
     """Kendall-tau distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        x, y = X.mean(axis=0), Y.mean(axis=0)
+        x, y = self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
         n = len(x)
         tau_corr = kendalltau(x, y).statistic
         tau_dist = (1 - tau_corr) * n * (n - 1) / 4
@@ -587,12 +844,13 @@ class KendallTauDistance(AbstractDistance):
 class CosineDistance(AbstractDistance):
     """Cosine distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return cosine(X.mean(axis=0), Y.mean(axis=0))
+        return cosine(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.")
@@ -603,22 +861,24 @@ class R2ScoreDistance(AbstractDistance):
 
     # NOTE: This is not a distance metric but a similarity metric.
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return 1 - r2_score(X.mean(axis=0), Y.mean(axis=0))
+        return 1 - r2_score(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.")
 
 
-class KLDivergence(AbstractDistance):
-    """Average of KL divergence between gene distributions of two groups
+class SymmetricKLDivergence(AbstractDistance):
+    """Average of symmetric KL divergence between gene distributions of two groups
 
     Assuming a Gaussian distribution for each gene in each group, calculates
-    the KL divergence between them and averages over all genes
+    the KL divergence between them and averages over all genes. Repeats this ABBA to get a symmetrized distance.
+    See https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence#Symmetrised_divergence.
 
     """
@@ -632,11 +892,12 @@ class KLDivergence(AbstractDistance):
             x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon
             y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon
             kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2
-            kl_all.append(kl)
+            klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2
+            kl_all.append(kl + klr)
         return sum(kl_all) / len(kl_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("KLDivergence cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.")
 
 
 class TTestDistance(AbstractDistance):
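For reference, the per-gene quantity accumulated in the loop above is the closed-form KL divergence between two univariate Gaussians, evaluated in both directions and summed; this standard identity (not spelled out in the diff itself) is:

```latex
D_{\mathrm{KL}}\!\left(\mathcal{N}(\mu_x,\sigma_x^2)\,\middle\|\,\mathcal{N}(\mu_y,\sigma_y^2)\right)
  = \log\frac{\sigma_y}{\sigma_x}
  + \frac{\sigma_x^2 + (\mu_x - \mu_y)^2}{2\sigma_y^2}
  - \frac{1}{2},
\qquad
D_{\mathrm{sym}} = D_{\mathrm{KL}}(P \| Q) + D_{\mathrm{KL}}(Q \| P)
```

The reported distance is the average of this symmetrized divergence over all genes.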
@@ -663,6 +924,23 @@ class TTestDistance(AbstractDistance):
         raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.")
 
 
+class KSTestDistance(AbstractDistance):
+    """Average of two-sided KS test statistic between two groups"""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.accepts_precomputed = False
+
+    def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        stats = []
+        for i in range(X.shape[1]):
+            stats.append(abs(kstest(X[:, i], Y[:, i])[0]))
+        return sum(stats) / len(stats)
+
+    def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
+        raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.")
+
+
 class NBLL(AbstractDistance):
     """
     Average of Log likelihood (scalar) of group B cells
@@ -683,16 +961,12 @@ class NBLL(AbstractDistance):
         if not _is_count_matrix(matrix=X) or not _is_count_matrix(matrix=Y):
             raise ValueError("NBLL distance only works for raw counts.")
 
-
-
-
-
-
-
-        if mu[0] == np.nan or theta[0] == np.nan:
-            raise ValueError("Could not fit a negative binomial distribution to the input data")
-        # calculate the nll of y
-        eps = np.repeat(epsilon, y.shape[0])
+        @numba.jit(forceobj=True)
+        def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float:
+            mu = np.exp(nb_params[0])
+            theta = 1 / nb_params[1]
+            eps = epsilon
+
             log_theta_mu_eps = np.log(theta + mu + eps)
             nll = (
                 theta * (np.log(theta + eps) - log_theta_mu_eps)
@@ -701,9 +975,221 @@ class NBLL(AbstractDistance):
                 - gammaln(theta)
                 - gammaln(y + 1)
             )
-
+            return nll.mean()
+
+        def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float:
+            try:
+                nb_params = NegativeBinomialP(x, np.ones_like(x)).fit(disp=False).params
+                return _compute_nll(y, nb_params, epsilon)
+            except np.linalg.linalg.LinAlgError:
+                if x.mean() < 10 and y.mean() < 10:
+                    return 0.0
+                else:
+                    return np.nan  # Use NaN to indicate skipped genes
+
+        nlls = []
+        genes_skipped = 0
+
+        for i in range(X.shape[1]):
+            nll = _process_gene(X[:, i], Y[:, i], epsilon)
+            if np.isnan(nll):
+                genes_skipped += 1
+            else:
+                nlls.append(nll)
+
+        if genes_skipped > X.shape[1] / 2:
+            raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.")
 
-        return -sum(nlls) / len(nlls)
+        return -np.sum(nlls) / len(nlls)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.")
+
+
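The visible `gammaln` terms belong to the standard negative-binomial log-likelihood with mean mu and inverse dispersion theta; the middle terms are cut off in this diff view, so the textbook form is given here for orientation:

```latex
\log \mathrm{NB}(y;\mu,\theta)
  = \theta \log\frac{\theta}{\theta+\mu}
  + y \log\frac{\mu}{\theta+\mu}
  + \log\Gamma(y+\theta)
  - \log\Gamma(\theta)
  - \log\Gamma(y+1)
```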
+def _sample(X, frac=None, n=None):
+    """Returns subsample of cells in format (train, test)."""
+    if frac and n:
+        raise ValueError("Cannot pass both frac and n.")
+    if frac:
+        n_cells = max(1, int(X.shape[0] * frac))
+    elif n:
+        n_cells = n
+    else:
+        raise ValueError("Must pass either `frac` or `n`.")
+
+    rng = np.random.default_rng()
+    sampled_indices = rng.choice(X.shape[0], n_cells, replace=False)
+    remaining_indices = np.setdiff1d(np.arange(X.shape[0]), sampled_indices)
+    return X[remaining_indices, :], X[sampled_indices, :]
+
+
+class ClassifierProbaDistance(AbstractDistance):
+    """Average of classification probabilities of a binary classifier.
+
+    Assumes the first condition is control and the second is perturbed.
+    Always holds out 20% of the perturbed condition.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.accepts_precomputed = False
+
+    def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        Y_train, Y_test = _sample(Y, frac=0.2)
+        label = ["c"] * X.shape[0] + ["p"] * Y_train.shape[0]
+        train = np.concatenate([X, Y_train])
+
+        reg = LogisticRegression()
+        reg.fit(train, label)
+        test_labels = reg.predict_proba(Y_test)
+        return np.mean(test_labels[:, 1])
+
+    def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
+        raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.")
+
+
+class ClassifierClassProjection(AbstractDistance):
+    """Average of 1-(classification probability of control).
+
+    Warning: unlike all other distances, this must also take a list of categorical labels the same length as X.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.accepts_precomputed = False
+
+    def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.")
+
+    def onesided_distances(
+        self,
+        adata: AnnData,
+        groupby: str,
+        selected_group: str | None = None,
+        groups: list[str] | None = None,
+        show_progressbar: bool = True,
+        n_jobs: int = -1,
+        **kwargs,
+    ) -> Series:
+        """Unlike the parent function, all groups except the selected group are factored into the classifier.
+
+        Similar to the parent function, the returned dataframe contains only the specified groups.
+        """
+        groups = adata.obs[groupby].unique() if groups is None else groups
+        fct = track if show_progressbar else lambda iterable: iterable
+
+        X = adata[adata.obs[groupby] != selected_group].X
+        labels = adata[adata.obs[groupby] != selected_group].obs[groupby].values
+        Y = adata[adata.obs[groupby] == selected_group].X
+
+        reg = LogisticRegression()
+        reg.fit(X, labels)
+        test_probas = reg.predict_proba(Y)
+
+        df = pd.Series(index=groups, dtype=float)
+
+        for group in fct(groups):
+            if group == selected_group:
+                df.loc[group] = 0
+            else:
+                class_idx = list(reg.classes_).index(group)
+                df.loc[group] = 1 - np.mean(test_probas[:, class_idx])
+        df.index.name = groupby
+        df.name = f"classifier_cp to {selected_group}"
+
+        return df
+
+    def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
+        raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.")
+
+
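The two classifier metrics share the `_sample` helper above, which holds out 20% of the perturbed cells as a test set for `classifier_proba`; a minimal usage sketch on the example data:

```python
import pertpy as pt

adata = pt.dt.distance_example()

# How confidently does logistic regression separate perturbed from control?
# X is treated as control, Y as perturbed; 20% of Y is held out for scoring.
distance = pt.tools.Distance(metric="classifier_proba")
X = adata.obsm["X_pca"][adata.obs["perturbation"] == "control"]
Y = adata.obsm["X_pca"][adata.obs["perturbation"] == "p-sgCREB1-2"]
print(distance(X, Y))
```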
+class MeanVarDistributionDistance(AbstractDistance):
+    """Distance between mean-var distributions of gene expression."""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.accepts_precomputed = False
+
+    def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        """Difference of mean-var distributions in 2 matrices.
+
+        Args:
+            X: Normalized and log transformed cells x genes count matrix.
+            Y: Normalized and log transformed cells x genes count matrix.
+        """
+
+        def _mean_var(x, log: bool = False):
+            mean = np.mean(x, axis=0)
+            var = np.var(x, axis=0)
+            positive = mean > 0
+            mean = mean[positive]
+            var = var[positive]
+            if log:
+                mean = np.log(mean)
+                var = np.log(var)
+            return mean, var
+
+        def _prep_kde_data(x, y):
+            return np.concatenate([x.reshape(-1, 1), y.reshape(-1, 1)], axis=1)
+
+        def _grid_points(d, n_points=100):
+            # Make grid, add 1 bin on lower/upper end to get final n_points
+            d_min = d.min()
+            d_max = d.max()
+            # Compute bin size
+            d_bin = (d_max - d_min) / (n_points - 2)
+            d_min = d_min - d_bin
+            d_max = d_max + d_bin
+            return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin)
+
+        def _parallel_score_samples(kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())):
+            # the thread_count is determined using the factor 0.875 as recommended here:
+            # https://stackoverflow.com/questions/32625094/scipy-parallel-computing-in-ipython-notebook
+            with multiprocessing.Pool(thread_count) as p:
+                return np.concatenate(p.map(kde.score_samples, np.array_split(samples, thread_count)))
+
+        def _kde_eval(d, grid):
+            # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out
+            # can not be compared well on regions further away from the data as they are -inf
+            kde = KernelDensity(bandwidth="silverman", kernel="exponential").fit(d)
+            return _parallel_score_samples(kde, grid)
+
+        mean_x, var_x = _mean_var(X, log=True)
+        mean_y, var_y = _mean_var(Y, log=True)
+
+        x = _prep_kde_data(mean_x, var_x)
+        y = _prep_kde_data(mean_y, var_y)
+
+        # Gridpoints to eval KDE on
+        mean_grid = _grid_points(np.concatenate([mean_x, mean_y]))
+        var_grid = _grid_points(np.concatenate([var_x, var_y]))
+        grid = np.array(np.meshgrid(mean_grid, var_grid)).T.reshape(-1, 2)
+
+        kde_x = _kde_eval(x, grid)
+        kde_y = _kde_eval(y, grid)
+
+        kde_diff = ((kde_x - kde_y) ** 2).mean()
+
+        return kde_diff
+
+    def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
+        raise NotImplementedError("MeanVarDistributionDistance cannot be called on a pairwise distance matrix.")
+
+
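Since this metric builds kernel density estimates over per-gene (mean, variance) pairs, it expects normalized, log-transformed expression rather than an embedding; a minimal sketch, where the layer name "lognorm" is only a hypothetical placeholder for whatever log-normalized layer the data carries:

```python
import pertpy as pt

adata = pt.dt.distance_example()

# Hypothetical layer name: point layer_key at a normalized, log1p-transformed layer
distance = pt.tools.Distance(metric="mean_var_distribution", layer_key="lognorm")
df = distance.pairwise(adata, groupby="perturbation", show_progressbar=False)
```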
+class MahalanobisDistance(AbstractDistance):
+    """Mahalanobis distance between pseudobulk vectors."""
+
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
+        super().__init__()
+        self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
+
+    def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        return mahalanobis(
+            self.aggregation_func(X, axis=0),
+            self.aggregation_func(Y, axis=0),
+            np.linalg.inv(np.cov(X.T)),
+        )
+
+    def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
+        raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.")
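As implemented, the Mahalanobis form uses the covariance of the first group only, so the metric is not symmetric in X and Y, and X must have more cells than features for the covariance matrix to be invertible:

```latex
d(X, Y) = \sqrt{(\bar{x} - \bar{y})^{\top}\, \Sigma_X^{-1}\, (\bar{x} - \bar{y})},
\qquad \bar{x} = \mathrm{agg}(X),\ \ \bar{y} = \mathrm{agg}(Y),\ \ \Sigma_X = \operatorname{cov}(X)
```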