PyPI - pertpy - Versions diffs - 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl - Mend

pertpy 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44)

pertpy/__init__.py +5 -1
pertpy/_doc.py +1 -3
pertpy/_types.py +6 -0
pertpy/data/_dataloader.py +68 -24
pertpy/data/_datasets.py +9 -9
pertpy/metadata/__init__.py +2 -1
pertpy/metadata/_cell_line.py +133 -25
pertpy/metadata/_look_up.py +13 -19
pertpy/metadata/_moa.py +1 -1
pertpy/preprocessing/_guide_rna.py +138 -44
pertpy/preprocessing/_guide_rna_mixture.py +17 -19
pertpy/tools/__init__.py +1 -1
pertpy/tools/_augur.py +106 -98
pertpy/tools/_cinemaot.py +74 -114
pertpy/tools/_coda/_base_coda.py +129 -145
pertpy/tools/_coda/_sccoda.py +66 -69
pertpy/tools/_coda/_tasccoda.py +71 -79
pertpy/tools/_dialogue.py +48 -40
pertpy/tools/_differential_gene_expression/_base.py +21 -31
pertpy/tools/_differential_gene_expression/_checks.py +4 -6
pertpy/tools/_differential_gene_expression/_dge_comparison.py +5 -6
pertpy/tools/_differential_gene_expression/_edger.py +6 -10
pertpy/tools/_differential_gene_expression/_pydeseq2.py +1 -1
pertpy/tools/_differential_gene_expression/_simple_tests.py +3 -3
pertpy/tools/_differential_gene_expression/_statsmodels.py +8 -5
pertpy/tools/_distances/_distance_tests.py +1 -2
pertpy/tools/_distances/_distances.py +31 -45
pertpy/tools/_enrichment.py +7 -22
pertpy/tools/_milo.py +19 -15
pertpy/tools/_mixscape.py +73 -75
pertpy/tools/_perturbation_space/_clustering.py +4 -4
pertpy/tools/_perturbation_space/_comparison.py +4 -4
pertpy/tools/_perturbation_space/_discriminator_classifiers.py +83 -32
pertpy/tools/_perturbation_space/_perturbation_space.py +10 -10
pertpy/tools/_perturbation_space/_simple.py +12 -14
pertpy/tools/_scgen/_scgen.py +16 -17
pertpy/tools/_scgen/_scgenvae.py +2 -2
pertpy/tools/_scgen/_utils.py +3 -1
{pertpy-0.10.0.dist-info → pertpy-0.11.0.dist-info}/METADATA +36 -20
pertpy-0.11.0.dist-info/RECORD +58 -0
{pertpy-0.10.0.dist-info → pertpy-0.11.0.dist-info}/licenses/LICENSE +1 -0
pertpy/tools/_kernel_pca.py +0 -50
pertpy-0.10.0.dist-info/RECORD +0 -58
{pertpy-0.10.0.dist-info → pertpy-0.11.0.dist-info}/WHEEL +0 -0

pertpy/tools/_mixscape.py CHANGED Viewed

@@ -9,15 +9,14 @@ import numpy as np
 import pandas as pd
 import scanpy as sc
 import seaborn as sns
+from fast_array_utils.stats import mean, mean_var
 from scanpy import get
-from scanpy._settings import settings
 from scanpy._utils import _check_use_raw, sanitize_anndata
 from scanpy.plotting import _utils
 from scanpy.tools._utils import _choose_representation
-from scipy.sparse import csr_matrix, issparse, spmatrix
+from scipy.sparse import csr_matrix, spmatrix
 from sklearn.mixture import GaussianMixture
-import pertpy as pt
 from pertpy._doc import _doc_params, doc_common_plot_args
 if TYPE_CHECKING:
@@ -111,7 +110,7 @@ class Mixscape:
             for split in adata.obs[split_by].unique():
                 split_mask = adata.obs[split_by] == split
                 control_mask_group = control_mask & split_mask
-                control_mean_expr = adata.X[control_mask_group].mean(0)
+                control_mean_expr = mean(adata.X[control_mask_group], axis=0)
                 adata.layers["X_pert"][split_mask] = (
                     np.repeat(control_mean_expr.reshape(1, -1), split_mask.sum(), axis=0)
                     - adata.layers["X_pert"][split_mask]
@@ -127,14 +126,14 @@ class Mixscape:
             if n_dims is not None and n_dims < representation.shape[1]:
                 representation = representation[:, :n_dims]
+            from pynndescent import NNDescent
             for split_mask in split_masks:
                 control_mask_split = control_mask & split_mask
                 R_split = representation[split_mask]
                 R_control = representation[np.asarray(control_mask_split)]
-                from pynndescent import NNDescent
                 eps = kwargs.pop("epsilon", 0.1)
                 nn_index = NNDescent(R_control, **kwargs)
                 indices, _ = nn_index.query(R_split, k=n_neighbors, epsilon=eps)
@@ -153,11 +152,10 @@ class Mixscape:
                         shape=(n_split, n_control),
                     )
                     neigh_matrix /= n_neighbors
-                    adata.layers["X_pert"][split_mask] = (
-                        np.log1p(neigh_matrix @ X_control) - adata.layers["X_pert"][split_mask]
+                    adata.layers["X_pert"][np.asarray(split_mask)] = (
+                        sc.pp.log1p(neigh_matrix @ X_control) - adata.layers["X_pert"][np.asarray(split_mask)]
                     )
                 else:
-                    is_sparse = issparse(X_control)
                     split_indices = np.where(split_mask)[0]
                     for i in range(0, n_split, batch_size):
                         size = min(i + batch_size, n_split)
@@ -168,10 +166,9 @@ class Mixscape:
                         size = size - i
-                        # sparse is very slow
                         means_batch = X_control[batch]
-                        means_batch = means_batch.toarray() if is_sparse else means_batch
-                        means_batch = means_batch.reshape(size, n_neighbors, -1).mean(1)
+                        batch_reshaped = means_batch.reshape(size, n_neighbors, -1)
+                        means_batch, _ = mean_var(batch_reshaped, axis=1)
                         adata.layers["X_pert"][split_batch] = (
                             np.log1p(means_batch) - adata.layers["X_pert"][split_batch]
@@ -199,6 +196,7 @@ class Mixscape:
         perturbation_type: str | None = "KO",
         random_state: int | None = 0,
         copy: bool | None = False,
+        **gmmkwargs,
     ):
         """Identify perturbed and non-perturbed gRNA expressing cells that accounts for multiple treatments/conditions/chemical perturbations.
@@ -221,6 +219,7 @@ class Mixscape:
             perturbation_type: specify type of CRISPR perturbation expected for labeling mixscape classifications.
             random_state: Random seed for the GaussianMixture model.
             copy: Determines whether a copy of the `adata` is returned.
+            **gmmkwargs: Passed to custom implementation of scikit-learn Gaussian Mixture Model.
         Returns:
             If `copy=True`, returns the copy of `adata` with the classification result in `.obs`.
@@ -307,10 +306,9 @@ class Mixscape:
                 else:
                     de_genes = perturbation_markers[(category, gene)]
-                    de_genes_indices = self._get_column_indices(adata, list(de_genes))
+                    de_genes_indices = np.where(np.isin(adata.var_names, list(de_genes)))[0]
                     dat = X[np.asarray(all_cells)][:, de_genes_indices]
-                    dat_cells = all_cells[all_cells].index
                     if scale:
                         dat = sc.pp.scale(dat)
@@ -318,6 +316,9 @@ class Mixscape:
                     n_iter = 0
                     old_classes = adata.obs[new_class_name][all_cells]
+                    nt_cells_dat_idx = all_cells[all_cells].index.get_indexer(nt_cells[nt_cells].index)
+                    nt_cells_mean = np.mean(dat[nt_cells_dat_idx], axis=0)
                     while not converged and n_iter < iter_num:
                         # Get all cells in current split&Gene
                         guide_cells = (adata.obs[new_class_name] == gene) & split_mask
@@ -326,12 +327,12 @@ class Mixscape:
                         # all cells in current split&Gene minus all NT cells in current split
                         # Each row is for each cell, each column is for each gene, get mean for each column
                         guide_cells_dat_idx = all_cells[all_cells].index.get_indexer(guide_cells[guide_cells].index)
-                        nt_cells_dat_idx = all_cells[all_cells].index.get_indexer(nt_cells[nt_cells].index)
-                        vec = np.mean(dat[guide_cells_dat_idx], axis=0) - np.mean(dat[nt_cells_dat_idx], axis=0)
+                        guide_cells_mean = np.mean(dat[guide_cells_dat_idx], axis=0)
+                        vec = guide_cells_mean - nt_cells_mean
                         # project cells onto the perturbation vector
                         if isinstance(dat, spmatrix):
-                            pvec = np.dot(dat.toarray(), vec) / np.dot(vec, vec)
+                            pvec = dat.dot(vec) / np.dot(vec, vec)
                         else:
                             pvec = np.dot(dat, vec) / np.dot(vec, vec)
                         pvec = pd.Series(np.asarray(pvec).flatten(), index=list(all_cells.index[all_cells]))
@@ -341,7 +342,7 @@ class Mixscape:
                             gv["pvec"] = pvec
                             gv[labels] = control
                             gv.loc[guide_cells, labels] = gene
-                            if gene not in gv_list.keys():
+                            if gene not in gv_list:
                                 gv_list[gene] = {}
                             gv_list[gene][category] = gv
@@ -351,31 +352,30 @@ class Mixscape:
                             n_components=2,
                             covariance_type="spherical",
                             means_init=means_init,
-                            precisions_init=1 / (std_init ** 2),
+                            precisions_init=1 / (std_init**2),
                             random_state=random_state,
-                            max_iter=5000,
+                            max_iter=100,
                             fixed_means=[pvec[nt_cells].mean(), None],
                             fixed_covariances=[pvec[nt_cells].std() ** 2, None],
+                            **gmmkwargs,
                         ).fit(np.asarray(pvec).reshape(-1, 1))
                         probabilities = mm.predict_proba(np.array(pvec[orig_guide_cells_index]).reshape(-1, 1))
                         lik_ratio = probabilities[:, 0] / probabilities[:, 1]
                         post_prob = 1 / (1 + lik_ratio)
                         # based on the posterior probability, assign cells to the two classes
-                        adata.obs.loc[
-                            [orig_guide_cells_index[cell] for cell in np.where(post_prob > 0.5)[0]], new_class_name
-                        ] = gene
-                        adata.obs.loc[
-                            [orig_guide_cells_index[cell] for cell in np.where(post_prob <= 0.5)[0]], new_class_name
-                        ] = f"{gene} NP"
+                        ko_mask = post_prob > 0.5
+                        adata.obs.loc[np.array(orig_guide_cells_index)[ko_mask], new_class_name] = gene
+                        adata.obs.loc[np.array(orig_guide_cells_index)[~ko_mask], new_class_name] = f"{gene} NP"
                         if sum(adata.obs[new_class_name][split_mask] == gene) < min_de_genes:
                             adata.obs.loc[guide_cells, new_class_name] = "NP"
                             converged = True
-                        if adata.obs[new_class_name][all_cells].equals(old_classes):
+                        current_classes = adata.obs[new_class_name][all_cells]
+                        if (current_classes == old_classes).all():
                             converged = True
+                        old_classes = current_classes
-                        old_classes = adata.obs[new_class_name][all_cells]
                         n_iter += 1
                     adata.obs.loc[(adata.obs[new_class_name] == gene) & split_mask, new_class_name] = (
@@ -414,7 +414,6 @@ class Mixscape:
             control: Control category from the `pert_key` column.
             mixscape_class_global: The column of `.obs` with mixscape global classification result (perturbed, NP or NT).
             layer: Layer to use for identifying differentially expressed genes. If `None`, adata.X is used.
-            control: Control category from the `pert_key` column.
             n_comps: Number of principal components to use.
             min_de_genes: Required number of genes that are differentially expressed for method to separate perturbed and non-perturbed cells.
             logfc_threshold: Limit testing to genes which show, on average, at least X-fold difference (log-scale) between the two groups of cells.
@@ -470,7 +469,8 @@ class Mixscape:
         )
         adata_subset = adata[
             (adata.obs[mixscape_class_global] == perturbation_type) | (adata.obs[mixscape_class_global] == control)
-        ].copy()
+        ]
+        X = adata_subset.X - adata_subset.X.mean(0)
         projected_pcs: dict[str, np.ndarray] = {}
         # performs PCA on each mixscape class separately and projects each subspace onto all cells in the data.
         for _, (key, value) in enumerate(perturbation_markers.items()):
@@ -482,16 +482,10 @@ class Mixscape:
                 ].copy()
                 sc.pp.scale(gene_subset)
                 sc.tl.pca(gene_subset, n_comps=n_comps)
-                sc.pp.neighbors(gene_subset)
-                # projects each subspace onto all cells in the data.
-                sc.tl.ingest(adata=adata_subset, adata_ref=gene_subset, embedding_method="pca")
-                projected_pcs[key[1]] = adata_subset.obsm["X_pca"]
+                # project cells into PCA space of gene_subset
+                projected_pcs[key[1]] = np.asarray(np.dot(X, gene_subset.varm["PCs"]))
         # concatenate all pcs into a single matrix.
-        for index, (_, value) in enumerate(projected_pcs.items()):
-            if index == 0:
-                projected_pcs_array = value
-            else:
-                projected_pcs_array = np.concatenate((projected_pcs_array, value), axis=1)
+        projected_pcs_array = np.concatenate(list(projected_pcs.values()), axis=1)
         clf = LinearDiscriminantAnalysis(n_components=len(np.unique(adata_subset.obs[labels])) - 1)
         clf.fit(projected_pcs_array, adata_subset.obs[labels])
@@ -514,7 +508,7 @@ class Mixscape:
         logfc_threshold: float,
         test_method: str,
     ) -> dict[tuple, np.ndarray]:
-        """Determine gene sets across all splits/groups through differential gene expression
+        """Determine gene sets across all splits/groups through differential gene expression.
         Args:
             adata: :class:`~anndata.AnnData` object
@@ -549,7 +543,9 @@ class Mixscape:
             )
             # get DE genes for each target gene
             for gene in gene_targets:
-                logfc_threshold_mask = np.abs(adata_split.uns["rank_genes_groups"]["logfoldchanges"][gene]) >= logfc_threshold
+                logfc_threshold_mask = (
+                    np.abs(adata_split.uns["rank_genes_groups"]["logfoldchanges"][gene]) >= logfc_threshold
+                )
                 de_genes = adata_split.uns["rank_genes_groups"]["names"][gene][logfc_threshold_mask]
                 pvals_adj = adata_split.uns["rank_genes_groups"]["pvals_adj"][gene][logfc_threshold_mask]
                 de_genes = de_genes[pvals_adj < pval_cutoff]
@@ -559,19 +555,8 @@ class Mixscape:
         return perturbation_markers
-    def _get_column_indices(self, adata, col_names):
-        if isinstance(col_names, str):  # pragma: no cover
-            col_names = [col_names]
-        indices = []
-        for idx, col in enumerate(adata.var_names):
-            if col in col_names:
-                indices.append(idx)
-        return indices
     @_doc_params(common_plot_args=doc_common_plot_args)
-    def plot_barplot(  # pragma: no cover
+    def plot_barplot(  # pragma: no cover # noqa: D417
         self,
         adata: AnnData,
         guide_rna_column: str,
@@ -678,7 +663,7 @@ class Mixscape:
         return None
     @_doc_params(common_plot_args=doc_common_plot_args)
-    def plot_heatmap(  # pragma: no cover
+    def plot_heatmap(  # pragma: no cover # noqa: D417
         self,
         adata: AnnData,
         labels: str,
@@ -748,7 +733,7 @@ class Mixscape:
         return None
     @_doc_params(common_plot_args=doc_common_plot_args)
-    def plot_perturbscore(  # pragma: no cover
+    def plot_perturbscore(  # pragma: no cover # noqa: D417
         self,
         adata: AnnData,
         labels: str,
@@ -801,7 +786,7 @@ class Mixscape:
         if "mixscape" not in adata.uns:
             raise ValueError("Please run the `mixscape` function first.")
         perturbation_score = None
-        for key in adata.uns["mixscape"][target_gene].keys():
+        for key in adata.uns["mixscape"][target_gene]:
             perturbation_score_temp = adata.uns["mixscape"][target_gene][key]
             perturbation_score_temp["name"] = key
             if perturbation_score is None:
@@ -914,7 +899,7 @@ class Mixscape:
         return None
     @_doc_params(common_plot_args=doc_common_plot_args)
-    def plot_violin(  # pragma: no cover
+    def plot_violin(  # pragma: no cover # noqa: D417
         self,
         adata: AnnData,
         target_gene_idents: str | list[str],
@@ -994,7 +979,7 @@ class Mixscape:
             if len(ylabel) != 1:
                 raise ValueError(f"Expected number of y-labels to be `1`, found `{len(ylabel)}`.")
         elif len(ylabel) != len(keys):
-            raise ValueError(f"Expected number of y-labels to be `{len(keys)}`, " f"found `{len(ylabel)}`.")
+            raise ValueError(f"Expected number of y-labels to be `{len(keys)}`, found `{len(ylabel)}`.")
         if groupby is not None:
             if hue is not None:
@@ -1047,7 +1032,7 @@ class Mixscape:
                 g.set(yscale="log")
             g.set_titles(col_template="{col_name}").set_xlabels("")
             if rotation is not None:
-                for ax in g.axes[0]:
+                for ax in g.axes[0]:  # noqa: PLR1704
                     ax.tick_params(axis="x", labelrotation=rotation)
         else:
             # set by default the violin plot cut=0 to limit the extend
@@ -1065,7 +1050,7 @@ class Mixscape:
             else:
                 axs = [ax]
             for ax, y, ylab in zip(axs, ys, ylabel, strict=False):
-                ax = sns.violinplot(
+                ax = sns.violinplot(  # noqa: PLW2901
                     x=x,
                     y=y,
                     data=obs_tidy,
@@ -1079,7 +1064,7 @@ class Mixscape:
                 # Get the handles and labels.
                 handles, labels = ax.get_legend_handles_labels()
                 if stripplot:
-                    ax = sns.stripplot(
+                    ax = sns.stripplot(  # noqa: PLW2901
                         x=x,
                         y=y,
                         data=obs_tidy,
@@ -1116,7 +1101,7 @@ class Mixscape:
         return None
     @_doc_params(common_plot_args=doc_common_plot_args)
-    def plot_lda(  # pragma: no cover
+    def plot_lda(  # pragma: no cover # noqa: D417
         self,
         adata: AnnData,
         control: str,
@@ -1135,13 +1120,16 @@ class Mixscape:
         """Visualizing perturbation responses with Linear Discriminant Analysis. Requires `pt.tl.mixscape()` to be run first.
         Args:
-            adata: The annotated data object.
+            adata: The annotated data objectplot_heatmap.
             control: Control category from the `pert_key` column.
             mixscape_class: The column of `.obs` with the mixscape classification result.
             mixscape_class_global: The column of `.obs` with mixscape global classification result (perturbed, NP or NT).
             perturbation_type: Specify type of CRISPR perturbation expected for labeling mixscape classifications.
-            lda_key: If not specified, lda looks .uns["mixscape_lda"] for the LDA results.
             n_components: The number of dimensions of the embedding.
+            lda_key: If not specified, lda looks .uns["mixscape_lda"] for the LDA results.
+            color_map: Matplotlib color map.
+            palette: Matplotlib palette.
+            ax: Matplotlib axes.
             {common_plot_args}
             **kwds: Additional arguments to `scanpy.pl.umap`.
@@ -1186,13 +1174,14 @@ class Mixscape:
         plt.show()
         return None
 class MixscapeGaussianMixture(GaussianMixture):
     def __init__(
         self,
         n_components: int,
-        fixed_means:  Sequence[float] | None = None,
+        fixed_means: Sequence[float] | None = None,
         fixed_covariances: Sequence[float] | None = None,
-        **kwargs
+        **kwargs,
     ):
         """Custom Gaussian Mixture Model where means and covariances can be fixed for specific components.
@@ -1206,19 +1195,28 @@ class MixscapeGaussianMixture(GaussianMixture):
         self.fixed_means = fixed_means
         self.fixed_covariances = fixed_covariances
+        self.fixed_mean_indices = []
+        self.fixed_mean_values = []
+        if fixed_means is not None:
+            self.fixed_mean_indices = [i for i, m in enumerate(fixed_means) if m is not None]
+            if self.fixed_mean_indices:
+                self.fixed_mean_values = np.array([fixed_means[i] for i in self.fixed_mean_indices])
+        self.fixed_cov_indices = []
+        self.fixed_cov_values = []
+        if fixed_covariances is not None:
+            self.fixed_cov_indices = [i for i, c in enumerate(fixed_covariances) if c is not None]
+            if self.fixed_cov_indices:
+                self.fixed_cov_values = np.array([fixed_covariances[i] for i in self.fixed_cov_indices])
     def _m_step(self, X: np.ndarray, log_resp: np.ndarray):
         """Modified M-step to respect fixed means and covariances."""
         super()._m_step(X, log_resp)
-        if self.fixed_means is not None:
-            for i in range(self.n_components):
-                if self.fixed_means[i] is not None:
-                    self.means_[i] = self.fixed_means[i]
+        if self.fixed_mean_indices:
+            self.means_[self.fixed_mean_indices] = self.fixed_mean_values
-        if self.fixed_covariances is not None:
-            for i in range(self.n_components):
-                if self.fixed_covariances[i] is not None:
-                    self.covariances_[i] = self.fixed_covariances[i]
+        if self.fixed_cov_indices:
+            self.covariances_[self.fixed_cov_indices] = self.fixed_cov_values
         return self

pertpy/tools/_perturbation_space/_clustering.py CHANGED Viewed

@@ -76,13 +76,13 @@ class ClusteringSpace(PerturbationSpace):
             if metric == "asw":
                 from pertpy.tools._perturbation_space._metrics import asw
-                if "metric" not in kwargs.keys():
+                if "metric" not in kwargs:
                     kwargs["metric"] = "euclidean"
-                if "distances" not in kwargs.keys():
+                if "distances" not in kwargs:
                     distances = pairwise_distances(self.X, metric=kwargs["metric"])
-                if "sample_size" not in kwargs.keys():
+                if "sample_size" not in kwargs:
                     kwargs["sample_size"] = None
-                if "random_state" not in kwargs.keys():
+                if "random_state" not in kwargs:
                     kwargs["random_state"] = None
                 asw_score = asw(

pertpy/tools/_perturbation_space/_comparison.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from typing import TYPE_CHECKING
 import numpy as np
-import pynndescent
 from scipy.sparse import issparse
 from scipy.sparse import vstack as sp_vstack
 from sklearn.base import ClassifierMixin
@@ -95,7 +94,9 @@ class PerturbationComparison:
             labels[-control.shape[0] :] = "ctrl"
             label_groups.append("ctrl")
-        index = pynndescent.NNDescent(
+        from pynndescent import NNDescent
+        index = NNDescent(
             index_data,
             n_neighbors=max(50, n_neighbors),
             random_state=random_state,
@@ -106,7 +107,6 @@ class PerturbationComparison:
         uq, uq_counts = np.unique(labels[indices], return_counts=True)
         uq_counts_norm = uq_counts / uq_counts.sum()
         counts = dict(zip(label_groups, [0.0] * len(label_groups), strict=False))
-        for group, count_norm in zip(uq, uq_counts_norm, strict=False):
-            counts[group] = count_norm
+        counts = dict(zip(uq, uq_counts_norm, strict=False))
         return counts

pertpy 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

pertpy 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl