PyPI - pertpy - Versions diffs - 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl - Mend

pertpy 0.9.4py3-none-any.whl → 0.10.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

pertpy/__init__.py +1 -1
pertpy/_doc.py +19 -0
pertpy/data/_datasets.py +1 -1
pertpy/metadata/_cell_line.py +18 -8
pertpy/metadata/_compound.py +3 -4
pertpy/metadata/_metadata.py +1 -1
pertpy/preprocessing/_guide_rna.py +114 -13
pertpy/preprocessing/_guide_rna_mixture.py +179 -0
pertpy/tools/__init__.py +1 -1
pertpy/tools/_augur.py +64 -86
pertpy/tools/_cinemaot.py +21 -17
pertpy/tools/_coda/_base_coda.py +90 -117
pertpy/tools/_dialogue.py +32 -40
pertpy/tools/_differential_gene_expression/__init__.py +1 -2
pertpy/tools/_differential_gene_expression/_base.py +486 -112
pertpy/tools/_differential_gene_expression/_edger.py +30 -21
pertpy/tools/_differential_gene_expression/_pydeseq2.py +15 -29
pertpy/tools/_differential_gene_expression/_statsmodels.py +0 -11
pertpy/tools/_distances/_distances.py +71 -56
pertpy/tools/_enrichment.py +16 -8
pertpy/tools/_milo.py +54 -50
pertpy/tools/_mixscape.py +307 -208
pertpy/tools/_perturbation_space/_perturbation_space.py +40 -31
pertpy/tools/_perturbation_space/_simple.py +48 -0
pertpy/tools/_scgen/_scgen.py +35 -27
{pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/METADATA +6 -6
{pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/RECORD +29 -28
{pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/WHEEL +1 -1
pertpy/tools/_differential_gene_expression/_formulaic.py +0 -189
{pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/licenses/LICENSE +0 -0

pertpy/tools/_mixscape.py CHANGED Viewed

@@ -18,6 +18,7 @@ from scipy.sparse import csr_matrix, issparse, spmatrix
 from sklearn.mixture import GaussianMixture
 import pertpy as pt
+from pertpy._doc import _doc_params, doc_common_plot_args
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -25,6 +26,7 @@ if TYPE_CHECKING:
     from anndata import AnnData
     from matplotlib.axes import Axes
     from matplotlib.colors import Colormap
+    from matplotlib.pyplot import Figure
     from scipy import sparse
@@ -39,9 +41,12 @@ class Mixscape:
         adata: AnnData,
         pert_key: str,
         control: str,
+        *,
+        ref_selection_mode: Literal["nn", "split_by"] = "nn",
         split_by: str | None = None,
         n_neighbors: int = 20,
         use_rep: str | None = None,
+        n_dims: int | None = 15,
         n_pcs: int | None = None,
         batch_size: int | None = None,
         copy: bool = False,
@@ -49,14 +54,18 @@ class Mixscape:
     ):
         """Calculate perturbation signature.
-        For each cell, we identify `n_neighbors` cells from the control pool with the most similar mRNA expression profiles.
-        The perturbation signature is calculated by subtracting the averaged mRNA expression profile of the control
-        neighbors from the mRNA expression profile of each cell.
+        The perturbation signature is calculated by subtracting the mRNA expression profile of each cell from the averaged
+        mRNA expression profile of the control cells (selected according to `ref_selection_mode`).
+        The implementation resembles https://satijalab.org/seurat/reference/runmixscape. Note that in the original implementation, the
+        perturbation signature is calculated on unscaled data by default, and we therefore recommend to do the same.
         Args:
             adata: The annotated data object.
             pert_key: The column  of `.obs` with perturbation categories, should also contain `control`.
-            control: Control category from the `pert_key` column.
+            control: Name of the control category from the `pert_key` column.
+            ref_selection_mode: Method to select reference cells for the perturbation signature calculation. If `nn`,
+                the `n_neighbors` cells from the control pool with the most similar mRNA expression profiles are selected. If `split_by`,
+                the control cells from the same split in `split_by` (e.g. indicating biological replicates) are used to calculate the perturbation signature.
             split_by: Provide the column `.obs` if multiple biological replicates exist to calculate
                 the perturbation signature for every replicate separately.
             n_neighbors: Number of neighbors from the control to use for the perturbation signature.
@@ -64,7 +73,10 @@ class Mixscape:
                 If `None`, the representation is chosen automatically:
                 For `.n_vars` < 50, `.X` is used, otherwise 'X_pca' is used.
                 If 'X_pca' is not present, it’s computed with default parameters.
-            n_pcs: Use this many PCs. If `n_pcs==0` use `.X` if `use_rep is None`.
+            n_dims: Number of dimensions to use from the representation to calculate the perturbation signature.
+                If `None`, use all dimensions.
+            n_pcs: If PCA representation is used, the number of principal components to compute.
+                If `n_pcs==0` use `.X` if `use_rep is None`.
             batch_size: Size of batch to calculate the perturbation signature.
                 If 'None', the perturbation signature is calcuated in the full mode, requiring more memory.
                 The batched mode is very inefficient for sparse data.
@@ -81,8 +93,13 @@ class Mixscape:
             >>> import pertpy as pt
             >>> mdata = pt.dt.papalexi_2021()
             >>> ms_pt = pt.tl.Mixscape()
-            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", "replicate")
+            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", split_by="replicate")
         """
+        if ref_selection_mode not in ["nn", "split_by"]:
+            raise ValueError("ref_selection_mode must be either 'nn' or 'split_by'.")
+        if ref_selection_mode == "split_by" and split_by is None:
+            raise ValueError("split_by must be provided if ref_selection_mode is 'split_by'.")
         if copy:
             adata = adata.copy()
@@ -90,59 +107,75 @@ class Mixscape:
         control_mask = adata.obs[pert_key] == control
-        if split_by is None:
-            split_masks = [np.full(adata.n_obs, True, dtype=bool)]
+        if ref_selection_mode == "split_by":
+            for split in adata.obs[split_by].unique():
+                split_mask = adata.obs[split_by] == split
+                control_mask_group = control_mask & split_mask
+                control_mean_expr = adata.X[control_mask_group].mean(0)
+                adata.layers["X_pert"][split_mask] = (
+                    np.repeat(control_mean_expr.reshape(1, -1), split_mask.sum(), axis=0)
+                    - adata.layers["X_pert"][split_mask]
+                )
         else:
-            split_obs = adata.obs[split_by]
-            split_masks = [split_obs == cat for cat in split_obs.unique()]
+            if split_by is None:
+                split_masks = [np.full(adata.n_obs, True, dtype=bool)]
+            else:
+                split_obs = adata.obs[split_by]
+                split_masks = [split_obs == cat for cat in split_obs.unique()]
-        representation = _choose_representation(adata, use_rep=use_rep, n_pcs=n_pcs)
+            representation = _choose_representation(adata, use_rep=use_rep, n_pcs=n_pcs)
+            if n_dims is not None and n_dims < representation.shape[1]:
+                representation = representation[:, :n_dims]
-        for split_mask in split_masks:
-            control_mask_split = control_mask & split_mask
+            for split_mask in split_masks:
+                control_mask_split = control_mask & split_mask
-            R_split = representation[split_mask]
-            R_control = representation[control_mask_split]
+                R_split = representation[split_mask]
+                R_control = representation[np.asarray(control_mask_split)]
-            from pynndescent import NNDescent
+                from pynndescent import NNDescent
-            eps = kwargs.pop("epsilon", 0.1)
-            nn_index = NNDescent(R_control, **kwargs)
-            indices, _ = nn_index.query(R_split, k=n_neighbors, epsilon=eps)
+                eps = kwargs.pop("epsilon", 0.1)
+                nn_index = NNDescent(R_control, **kwargs)
+                indices, _ = nn_index.query(R_split, k=n_neighbors, epsilon=eps)
-            X_control = np.expm1(adata.X[control_mask_split])
+                X_control = np.expm1(adata.X[np.asarray(control_mask_split)])
-            n_split = split_mask.sum()
-            n_control = X_control.shape[0]
+                n_split = split_mask.sum()
+                n_control = X_control.shape[0]
-            if batch_size is None:
-                col_indices = np.ravel(indices)
-                row_indices = np.repeat(np.arange(n_split), n_neighbors)
+                if batch_size is None:
+                    col_indices = np.ravel(indices)
+                    row_indices = np.repeat(np.arange(n_split), n_neighbors)
-                neigh_matrix = csr_matrix(
-                    (np.ones_like(col_indices, dtype=np.float64), (row_indices, col_indices)),
-                    shape=(n_split, n_control),
-                )
-                neigh_matrix /= n_neighbors
-                adata.layers["X_pert"][split_mask] -= np.log1p(neigh_matrix @ X_control)
-            else:
-                is_sparse = issparse(X_control)
-                split_indices = np.where(split_mask)[0]
-                for i in range(0, n_split, batch_size):
-                    size = min(i + batch_size, n_split)
-                    select = slice(i, size)
+                    neigh_matrix = csr_matrix(
+                        (np.ones_like(col_indices, dtype=np.float64), (row_indices, col_indices)),
+                        shape=(n_split, n_control),
+                    )
+                    neigh_matrix /= n_neighbors
+                    adata.layers["X_pert"][split_mask] = (
+                        np.log1p(neigh_matrix @ X_control) - adata.layers["X_pert"][split_mask]
+                    )
+                else:
+                    is_sparse = issparse(X_control)
+                    split_indices = np.where(split_mask)[0]
+                    for i in range(0, n_split, batch_size):
+                        size = min(i + batch_size, n_split)
+                        select = slice(i, size)
-                    batch = np.ravel(indices[select])
-                    split_batch = split_indices[select]
+                        batch = np.ravel(indices[select])
+                        split_batch = split_indices[select]
-                    size = size - i
+                        size = size - i
-                    # sparse is very slow
-                    means_batch = X_control[batch]
-                    means_batch = means_batch.toarray() if is_sparse else means_batch
-                    means_batch = means_batch.reshape(size, n_neighbors, -1).mean(1)
+                        # sparse is very slow
+                        means_batch = X_control[batch]
+                        means_batch = means_batch.toarray() if is_sparse else means_batch
+                        means_batch = means_batch.reshape(size, n_neighbors, -1).mean(1)
-                    adata.layers["X_pert"][split_batch] -= np.log1p(means_batch)
+                        adata.layers["X_pert"][split_batch] = (
+                            np.log1p(means_batch) - adata.layers["X_pert"][split_batch]
+                        )
         if copy:
             return adata
@@ -152,33 +185,41 @@ class Mixscape:
         adata: AnnData,
         labels: str,
         control: str,
+        *,
         new_class_name: str | None = "mixscape_class",
-        min_de_genes: int | None = 5,
         layer: str | None = None,
+        min_de_genes: int | None = 5,
         logfc_threshold: float | None = 0.25,
+        de_layer: str | None = None,
+        test_method: str | None = "wilcoxon",
         iter_num: int | None = 10,
+        scale: bool | None = True,
         split_by: str | None = None,
         pval_cutoff: float | None = 5e-2,
         perturbation_type: str | None = "KO",
+        random_state: int | None = 0,
         copy: bool | None = False,
     ):
         """Identify perturbed and non-perturbed gRNA expressing cells that accounts for multiple treatments/conditions/chemical perturbations.
-        The implementation resembles https://satijalab.org/seurat/reference/runmixscape
+        The implementation resembles https://satijalab.org/seurat/reference/runmixscape.
         Args:
             adata: The annotated data object.
             labels: The column of `.obs` with target gene labels.
-            control: Control category from the `pert_key` column.
+            control: Control category from the `labels` column.
             new_class_name: Name of mixscape classification to be stored in `.obs`.
-            min_de_genes: Required number of genes that are differentially expressed for method to separate perturbed and non-perturbed cells.
             layer: Key from adata.layers whose value will be used to perform tests on. Default is using `.layers["X_pert"]`.
+            min_de_genes: Required number of genes that are differentially expressed for method to separate perturbed and non-perturbed cells.
             logfc_threshold: Limit testing to genes which show, on average, at least X-fold difference (log-scale) between the two groups of cells (default: 0.25).
+            de_layer: Layer to use for identifying differentially expressed genes. If `None`, adata.X is used.
+            test_method: Method to use for differential expression testing.
             iter_num: Number of normalmixEM iterations to run if convergence does not occur.
-            split_by: Provide the column `.obs` if multiple biological replicates exist to calculate
-                    the perturbation signature for every replicate separately.
+            scale: Scale the data specified in `layer` before running the GaussianMixture model on it.
+            split_by: Provide `.obs` column with experimental condition/cell type annotation, if perturbations are condition/cell type-specific.
             pval_cutoff: P-value cut-off for selection of significantly DE genes.
             perturbation_type: specify type of CRISPR perturbation expected for labeling mixscape classifications.
+            random_state: Random seed for the GaussianMixture model.
             copy: Determines whether a copy of the `adata` is returned.
         Returns:
@@ -201,8 +242,8 @@ class Mixscape:
             >>> import pertpy as pt
             >>> mdata = pt.dt.papalexi_2021()
             >>> ms_pt = pt.tl.Mixscape()
-            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", "replicate")
-            >>> ms_pt.mixscape(adata=mdata["rna"], control="NT", labels="gene_target", layer="X_pert")
+            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", split_by="replicate")
+            >>> ms_pt.mixscape(mdata["rna"], "gene_target", "NT", layer="X_pert")
         """
         if copy:
             adata = adata.copy()
@@ -216,7 +257,16 @@ class Mixscape:
             split_masks = [split_obs == category for category in categories]
         perturbation_markers = self._get_perturbation_markers(
-            adata, split_masks, categories, labels, control, layer, pval_cutoff, min_de_genes, logfc_threshold
+            adata=adata,
+            split_masks=split_masks,
+            categories=categories,
+            labels=labels,
+            control=control,
+            layer=de_layer,
+            pval_cutoff=pval_cutoff,
+            min_de_genes=min_de_genes,
+            logfc_threshold=logfc_threshold,
+            test_method=test_method,
         )
         adata_comp = adata
@@ -229,6 +279,7 @@ class Mixscape:
                 raise KeyError(
                     "No 'X_pert' found in .layers! Please run perturbation_signature first to calculate perturbation signature!"
                 ) from None
         # initialize return variables
         adata.obs[f"{new_class_name}_p_{perturbation_type.lower()}"] = 0
         adata.obs[new_class_name] = adata.obs[labels].astype(str)
@@ -239,10 +290,12 @@ class Mixscape:
             dtype=np.object_,
         )
         gv_list: dict[str, dict] = {}
+        adata.obs[f"{new_class_name}_p_{perturbation_type.lower()}"] = 0.0
         for split, split_mask in enumerate(split_masks):
             category = categories[split]
-            genes = list(set(adata[split_mask].obs[labels]).difference([control]))
-            for gene in genes:
+            gene_targets = list(set(adata[split_mask].obs[labels]).difference([control]))
+            for gene in gene_targets:
                 post_prob = 0
                 orig_guide_cells = (adata.obs[labels] == gene) & split_mask
                 orig_guide_cells_index = list(orig_guide_cells.index[orig_guide_cells])
@@ -251,28 +304,38 @@ class Mixscape:
                 if len(perturbation_markers[(category, gene)]) == 0:
                     adata.obs.loc[orig_guide_cells, new_class_name] = f"{gene} NP"
                 else:
                     de_genes = perturbation_markers[(category, gene)]
                     de_genes_indices = self._get_column_indices(adata, list(de_genes))
-                    dat = X[all_cells][:, de_genes_indices]
+                    dat = X[np.asarray(all_cells)][:, de_genes_indices]
+                    dat_cells = all_cells[all_cells].index
+                    if scale:
+                        dat = sc.pp.scale(dat)
                     converged = False
                     n_iter = 0
-                    old_classes = adata.obs[labels][all_cells]
+                    old_classes = adata.obs[new_class_name][all_cells]
                     while not converged and n_iter < iter_num:
                         # Get all cells in current split&Gene
-                        guide_cells = (adata.obs[labels] == gene) & split_mask
+                        guide_cells = (adata.obs[new_class_name] == gene) & split_mask
                         # get average value for each gene over all selected cells
                         # all cells in current split&Gene minus all NT cells in current split
                         # Each row is for each cell, each column is for each gene, get mean for each column
-                        vec = np.mean(X[guide_cells][:, de_genes_indices], axis=0) - np.mean(
-                            X[nt_cells][:, de_genes_indices], axis=0
-                        )
+                        guide_cells_dat_idx = all_cells[all_cells].index.get_indexer(guide_cells[guide_cells].index)
+                        nt_cells_dat_idx = all_cells[all_cells].index.get_indexer(nt_cells[nt_cells].index)
+                        vec = np.mean(dat[guide_cells_dat_idx], axis=0) - np.mean(dat[nt_cells_dat_idx], axis=0)
                         # project cells onto the perturbation vector
                         if isinstance(dat, spmatrix):
-                            pvec = np.sum(np.multiply(dat.toarray(), vec), axis=1) / np.sum(np.multiply(vec, vec))
+                            pvec = np.dot(dat.toarray(), vec) / np.dot(vec, vec)
                         else:
-                            pvec = np.sum(np.multiply(dat, vec), axis=1) / np.sum(np.multiply(vec, vec))
+                            pvec = np.dot(dat, vec) / np.dot(vec, vec)
                         pvec = pd.Series(np.asarray(pvec).flatten(), index=list(all_cells.index[all_cells]))
                         if n_iter == 0:
                             gv = pd.DataFrame(columns=["pvec", labels])
                             gv["pvec"] = pvec
@@ -282,19 +345,22 @@ class Mixscape:
                                 gv_list[gene] = {}
                             gv_list[gene][category] = gv
-                        guide_norm = self._define_normal_mixscape(pvec[guide_cells])
-                        nt_norm = self._define_normal_mixscape(pvec[nt_cells])
-                        means_init = np.array([[nt_norm[0]], [guide_norm[0]]])
-                        precisions_init = np.array([nt_norm[1], guide_norm[1]])
-                        mm = GaussianMixture(
+                        means_init = np.array([[pvec[nt_cells].mean()], [pvec[guide_cells].mean()]])
+                        std_init = np.array([pvec[nt_cells].std(), pvec[guide_cells].std()])
+                        mm = MixscapeGaussianMixture(
                             n_components=2,
                             covariance_type="spherical",
                             means_init=means_init,
-                            precisions_init=precisions_init,
+                            precisions_init=1 / (std_init ** 2),
+                            random_state=random_state,
+                            max_iter=5000,
+                            fixed_means=[pvec[nt_cells].mean(), None],
+                            fixed_covariances=[pvec[nt_cells].std() ** 2, None],
                         ).fit(np.asarray(pvec).reshape(-1, 1))
                         probabilities = mm.predict_proba(np.array(pvec[orig_guide_cells_index]).reshape(-1, 1))
                         lik_ratio = probabilities[:, 0] / probabilities[:, 1]
                         post_prob = 1 / (1 + lik_ratio)
                         # based on the posterior probability, assign cells to the two classes
                         adata.obs.loc[
                             [orig_guide_cells_index[cell] for cell in np.where(post_prob > 0.5)[0]], new_class_name
@@ -302,11 +368,13 @@ class Mixscape:
                         adata.obs.loc[
                             [orig_guide_cells_index[cell] for cell in np.where(post_prob <= 0.5)[0]], new_class_name
                         ] = f"{gene} NP"
                         if sum(adata.obs[new_class_name][split_mask] == gene) < min_de_genes:
                             adata.obs.loc[guide_cells, new_class_name] = "NP"
                             converged = True
                         if adata.obs[new_class_name][all_cells].equals(old_classes):
                             converged = True
                         old_classes = adata.obs[new_class_name][all_cells]
                         n_iter += 1
@@ -315,9 +383,7 @@ class Mixscape:
                     )
                 adata.obs[f"{new_class_name}_global"] = [a.split(" ")[-1] for a in adata.obs[new_class_name]]
-                adata.obs.loc[orig_guide_cells_index, f"{new_class_name}_p_{perturbation_type.lower()}"] = np.round(
-                    post_prob
-                ).astype("int64")
+                adata.obs.loc[orig_guide_cells_index, f"{new_class_name}_p_{perturbation_type.lower()}"] = post_prob
         adata.uns["mixscape"] = gv_list
         if copy:
@@ -328,11 +394,13 @@ class Mixscape:
         adata: AnnData,
         labels: str,
         control: str,
+        *,
         mixscape_class_global: str | None = "mixscape_class_global",
         layer: str | None = None,
         n_comps: int | None = 10,
         min_de_genes: int | None = 5,
         logfc_threshold: float | None = 0.25,
+        test_method: str | None = "wilcoxon",
         split_by: str | None = None,
         pval_cutoff: float | None = 5e-2,
         perturbation_type: str | None = "KO",
@@ -345,12 +413,13 @@ class Mixscape:
             labels: The column of `.obs` with target gene labels.
             control: Control category from the `pert_key` column.
             mixscape_class_global: The column of `.obs` with mixscape global classification result (perturbed, NP or NT).
-            layer: Key from `adata.layers` whose value will be used to perform tests on.
+            layer: Layer to use for identifying differentially expressed genes. If `None`, adata.X is used.
             control: Control category from the `pert_key` column.
             n_comps: Number of principal components to use.
             min_de_genes: Required number of genes that are differentially expressed for method to separate perturbed and non-perturbed cells.
             logfc_threshold: Limit testing to genes which show, on average, at least X-fold difference (log-scale) between the two groups of cells.
-            split_by: Provide the column `.obs` if multiple biological replicates exist to calculate
+            test_method: Method to use for differential expression testing.
+            split_by: Provide `.obs` column with experimental condition/cell type annotation, if perturbations are condition/cell type-specific.
             pval_cutoff: P-value cut-off for selection of significantly DE genes.
             perturbation_type: Specify type of CRISPR perturbation expected for labeling mixscape classifications.
             copy: Determines whether a copy of the `adata` is returned.
@@ -368,9 +437,9 @@ class Mixscape:
             >>> import pertpy as pt
             >>> mdata = pt.dt.papalexi_2021()
             >>> ms_pt = pt.tl.Mixscape()
-            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", "replicate")
-            >>> ms_pt.mixscape(adata=mdata["rna"], control="NT", labels="gene_target", layer="X_pert")
-            >>> ms_pt.lda(adata=mdata["rna"], control="NT", labels="gene_target", layer="X_pert")
+            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", split_by="replicate")
+            >>> ms_pt.mixscape(mdata["rna"], "gene_target", "NT", layer="X_pert")
+            >>> ms_pt.lda(mdata["rna"], "gene_target", "NT")
         """
         if copy:
             adata = adata.copy()
@@ -386,9 +455,8 @@ class Mixscape:
             categories = split_obs.unique()
             split_masks = [split_obs == category for category in categories]
-        mixscape_identifier = pt.tl.Mixscape()
         # determine gene sets across all splits/groups through differential gene expression
-        perturbation_markers = mixscape_identifier._get_perturbation_markers(
+        perturbation_markers = self._get_perturbation_markers(
             adata=adata,
             split_masks=split_masks,
             categories=categories,
@@ -398,6 +466,7 @@ class Mixscape:
             pval_cutoff=pval_cutoff,
             min_de_genes=min_de_genes,
             logfc_threshold=logfc_threshold,
+            test_method=test_method,
         )
         adata_subset = adata[
             (adata.obs[mixscape_class_global] == perturbation_type) | (adata.obs[mixscape_class_global] == control)
@@ -443,12 +512,21 @@ class Mixscape:
         pval_cutoff: float,
         min_de_genes: float,
         logfc_threshold: float,
+        test_method: str,
     ) -> dict[tuple, np.ndarray]:
         """Determine gene sets across all splits/groups through differential gene expression
         Args:
             adata: :class:`~anndata.AnnData` object
-            col_names: Column names to extract the indices for
+            split_masks: List of boolean masks for each split/group.
+            categories: List of split/group names.
+            labels: The column of `.obs` with target gene labels.
+            control: Control category from the `labels` column.
+            layer: Key from adata.layers whose value will be used to compare gene expression.
+            pval_cutoff: P-value cut-off for selection of significantly DE genes.
+            min_de_genes: Required number of genes that are differentially expressed for method to separate perturbed and non-perturbed cells.
+            logfc_threshold: Limit testing to genes which show, on average, at least X-fold difference (log-scale) between the two groups of cells.
+            test_method: Method to use for differential expression testing.
         Returns:
             Set of column indices.
@@ -457,21 +535,21 @@ class Mixscape:
         for split, split_mask in enumerate(split_masks):
             category = categories[split]
             # get gene sets for each split
-            genes = list(set(adata[split_mask].obs[labels]).difference([control]))
+            gene_targets = list(set(adata[split_mask].obs[labels]).difference([control]))
             adata_split = adata[split_mask].copy()
             # find top DE genes between cells with targeting and non-targeting gRNAs
             sc.tl.rank_genes_groups(
                 adata_split,
                 layer=layer,
                 groupby=labels,
-                groups=genes,
+                groups=gene_targets,
                 reference=control,
-                method="t-test",
+                method=test_method,
                 use_raw=False,
             )
-            # get DE genes for each gene
-            for gene in genes:
-                logfc_threshold_mask = adata_split.uns["rank_genes_groups"]["logfoldchanges"][gene] >= logfc_threshold
+            # get DE genes for each target gene
+            for gene in gene_targets:
+                logfc_threshold_mask = np.abs(adata_split.uns["rank_genes_groups"]["logfoldchanges"][gene]) >= logfc_threshold
                 de_genes = adata_split.uns["rank_genes_groups"]["names"][gene][logfc_threshold_mask]
                 pvals_adj = adata_split.uns["rank_genes_groups"]["pvals_adj"][gene][logfc_threshold_mask]
                 de_genes = de_genes[pvals_adj < pval_cutoff]
@@ -492,35 +570,22 @@ class Mixscape:
         return indices
-    def _define_normal_mixscape(self, X: np.ndarray | sparse.spmatrix | pd.DataFrame | None) -> list[float]:
-        """Calculates the mean and standard deviation of a matrix.
-        Args:
-            X: The matrix to calculate the properties for.
-        Returns:
-            Mean and standard deviation of the matrix.
-        """
-        mu = X.mean()
-        sd = X.std()
-        return [mu, sd]
+    @_doc_params(common_plot_args=doc_common_plot_args)
     def plot_barplot(  # pragma: no cover
         self,
         adata: AnnData,
         guide_rna_column: str,
+        *,
         mixscape_class_global: str = "mixscape_class_global",
         axis_text_x_size: int = 8,
         axis_text_y_size: int = 6,
         axis_title_size: int = 8,
         legend_title_size: int = 8,
         legend_text_size: int = 8,
-        return_fig: bool | None = None,
-        ax: Axes | None = None,
-        show: bool | None = None,
-        save: bool | str | None = None,
-    ):
+        legend_bbox_to_anchor: tuple[float, float] = None,
+        figsize: tuple[float, float] = (25, 25),
+        return_fig: bool = False,
+    ) -> Figure | None:
         """Barplot to visualize perturbation scores calculated by the `mixscape` function.
         Args:
@@ -528,19 +593,24 @@ class Mixscape:
             guide_rna_column: The column of `.obs` with guide RNA labels. The target gene labels.
                               The format must be <gene_target>g<#>. Examples are 'STAT2g1' and 'ATF2g1'.
             mixscape_class_global: The column of `.obs` with mixscape global classification result (perturbed, NP or NT).
-            show: Show the plot, do not return axis.
-            save: If True or a str, save the figure. A string is appended to the default filename.
-                  Infer the filetype if ending on {'.pdf', '.png', '.svg'}.
+            axis_text_x_size: Size of the x-axis text.
+            axis_text_y_size: Size of the y-axis text.
+            axis_title_size: Size of the axis title.
+            legend_title_size: Size of the legend title.
+            legend_text_size: Size of the legend text.
+            legend_bbox_to_anchor: The bbox that the legend will be anchored.
+            figsize: The size of the figure.
+            {common_plot_args}
         Returns:
-            If `show==False`, return a :class:`~matplotlib.axes.Axes.
+            If `return_fig` is `True`, returns the figure, otherwise `None`.
         Examples:
             >>> import pertpy as pt
             >>> mdata = pt.dt.papalexi_2021()
             >>> ms_pt = pt.tl.Mixscape()
-            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", "replicate")
-            >>> ms_pt.mixscape(adata=mdata["rna"], control="NT", labels="gene_target", layer="X_pert")
+            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", split_by="replicate")
+            >>> ms_pt.mixscape(mdata["rna"], "gene_target", "NT", layer="X_pert")
             >>> ms_pt.plot_barplot(mdata["rna"], guide_rna_column="NT")
         Preview:
@@ -565,63 +635,64 @@ class Mixscape:
         all_cells_percentage["guide_number"] = "g" + all_cells_percentage["guide_number"]
         NP_KO_cells = all_cells_percentage[all_cells_percentage["gene"] != "NT"]
-        if show:
-            color_mapping = {"KO": "salmon", "NP": "lightgray", "NT": "grey"}
-            unique_genes = NP_KO_cells["gene"].unique()
-            fig, axs = plt.subplots(int(len(unique_genes) / 5), 5, figsize=(25, 25), sharey=True)
-            for i, gene in enumerate(unique_genes):
-                ax = axs[int(i / 5), i % 5]
-                grouped_df = (
-                    NP_KO_cells[NP_KO_cells["gene"] == gene]
-                    .groupby(["guide_number", "mixscape_class_global"], observed=False)["value"]
-                    .sum()
-                    .unstack()
-                )
-                grouped_df.plot(
-                    kind="bar",
-                    stacked=True,
-                    color=[color_mapping[col] for col in grouped_df.columns],
-                    ax=ax,
-                    width=0.8,
-                    legend=False,
-                )
-                ax.set_title(
-                    gene, bbox={"facecolor": "white", "edgecolor": "black", "pad": 1}, fontsize=axis_title_size
-                )
-                ax.set(xlabel="sgRNA", ylabel="% of cells")
-                sns.despine(ax=ax, top=True, right=True, left=False, bottom=False)
-                ax.set_xticklabels(ax.get_xticklabels(), rotation=0, ha="right", fontsize=axis_text_x_size)
-                ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=axis_text_y_size)
-            fig.subplots_adjust(right=0.8)
-            fig.subplots_adjust(hspace=0.5, wspace=0.5)
+        color_mapping = {"KO": "salmon", "NP": "lightgray", "NT": "grey"}
+        unique_genes = NP_KO_cells["gene"].unique()
+        fig, axs = plt.subplots(int(len(unique_genes) / 5), 5, figsize=figsize, sharey=True)
+        for i, gene in enumerate(unique_genes):
+            ax = axs[int(i / 5), i % 5]
+            grouped_df = (
+                NP_KO_cells[NP_KO_cells["gene"] == gene]
+                .groupby(["guide_number", "mixscape_class_global"], observed=False)["value"]
+                .sum()
+                .unstack()
+            )
+            grouped_df.plot(
+                kind="bar",
+                stacked=True,
+                color=[color_mapping[col] for col in grouped_df.columns],
+                ax=ax,
+                width=0.8,
+                legend=False,
+            )
+            ax.set_title(gene, bbox={"facecolor": "white", "edgecolor": "black", "pad": 1}, fontsize=axis_title_size)
+            ax.set(xlabel="sgRNA", ylabel="% of cells")
+            sns.despine(ax=ax, top=True, right=True, left=False, bottom=False)
+            ax.set_xticks(ax.get_xticks(), ax.get_xticklabels(), rotation=0, ha="right", fontsize=axis_text_x_size)
+            ax.set_yticks(ax.get_yticks(), ax.get_yticklabels(), rotation=0, fontsize=axis_text_y_size)
             ax.legend(
-                title="mixscape_class_global",
+                title="Mixscape Class",
                 loc="center right",
-                bbox_to_anchor=(2.2, 3.5),
+                bbox_to_anchor=legend_bbox_to_anchor,
                 frameon=True,
                 fontsize=legend_text_size,
                 title_fontsize=legend_title_size,
             )
+        fig.subplots_adjust(right=0.8)
+        fig.subplots_adjust(hspace=0.5, wspace=0.5)
         plt.tight_layout()
-        _utils.savefig_or_show("mixscape_barplot", show=show, save=save)
+        if return_fig:
+            return fig
+        plt.show()
+        return None
+    @_doc_params(common_plot_args=doc_common_plot_args)
     def plot_heatmap(  # pragma: no cover
         self,
         adata: AnnData,
         labels: str,
         target_gene: str,
         control: str,
+        *,
         layer: str | None = None,
         method: str | None = "wilcoxon",
         subsample_number: int | None = 900,
         vmin: float | None = -2,
         vmax: float | None = 2,
-        return_fig: bool | None = None,
-        show: bool | None = None,
-        save: bool | str | None = None,
+        return_fig: bool = False,
         **kwds,
-    ) -> Axes | None:
+    ) -> Figure | None:
         """Heatmap plot using mixscape results. Requires `pt.tl.mixscape()` to be run first.
         Args:
@@ -634,21 +705,18 @@ class Mixscape:
             subsample_number: Subsample to this number of observations.
             vmin: The value representing the lower limit of the color scale. Values smaller than vmin are plotted with the same color as vmin.
             vmax: The value representing the upper limit of the color scale. Values larger than vmax are plotted with the same color as vmax.
-            show: Show the plot, do not return axis.
-            save: If `True` or a `str`, save the figure. A string is appended to the default filename.
-                  Infer the filetype if ending on {`'.pdf'`, `'.png'`, `'.svg'`}.
-            ax: A matplotlib axes object. Only works if plotting a single component.
+            {common_plot_args}
             **kwds: Additional arguments to `scanpy.pl.rank_genes_groups_heatmap`.
         Returns:
-            If `show==False`, return a :class:`~matplotlib.axes.Axes`.
+            If `return_fig` is `True`, returns the figure, otherwise `None`.
         Examples:
             >>> import pertpy as pt
             >>> mdata = pt.dt.papalexi_2021()
             >>> ms_pt = pt.tl.Mixscape()
-            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", "replicate")
-            >>> ms_pt.mixscape(adata=mdata["rna"], control="NT", labels="gene_target", layer="X_pert")
+            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", split_by="replicate")
+            >>> ms_pt.mixscape(mdata["rna"], "gene_target", "NT", layer="X_pert")
             >>> ms_pt.plot_heatmap(
             ...     adata=mdata["rna"], labels="gene_target", target_gene="IFNGR2", layer="X_pert", control="NT"
             ... )
@@ -663,35 +731,37 @@ class Mixscape:
         sc.pp.scale(adata_subset, max_value=vmax)
         sc.pp.subsample(adata_subset, n_obs=subsample_number)
-        return sc.pl.rank_genes_groups_heatmap(
+        fig = sc.pl.rank_genes_groups_heatmap(
             adata_subset,
             groupby="mixscape_class",
             vmin=vmin,
             vmax=vmax,
             n_genes=20,
             groups=["NT"],
-            return_fig=return_fig,
-            show=show,
-            save=save,
+            show=False,
             **kwds,
         )
+        if return_fig:
+            return fig
+        plt.show()
+        return None
+    @_doc_params(common_plot_args=doc_common_plot_args)
     def plot_perturbscore(  # pragma: no cover
         self,
         adata: AnnData,
         labels: str,
         target_gene: str,
+        *,
         mixscape_class: str = "mixscape_class",
         color: str = "orange",
         palette: dict[str, str] = None,
         split_by: str = None,
         before_mixscape: bool = False,
         perturbation_type: str = "KO",
-        return_fig: bool | None = None,
-        ax: Axes | None = None,
-        show: bool | None = None,
-        save: bool | str | None = None,
-    ) -> None:
+        return_fig: bool = False,
+    ) -> Figure | None:
         """Density plots to visualize perturbation scores calculated by the `pt.tl.mixscape` function.
         Requires `pt.tl.mixscape` to be run first.
@@ -710,6 +780,10 @@ class Mixscape:
             before_mixscape: Option to split densities based on mixscape classification (default) or original target gene classification.
                              Default is set to NULL and plots cells by original class ID.
             perturbation_type: Specify type of CRISPR perturbation expected for labeling mixscape classifications.
+            {common_plot_args}
+        Returns:
+            If `return_fig` is `True`, returns the figure, otherwise `None`.
         Examples:
             Visualizing the perturbation scores for the cells in a dataset:
@@ -717,8 +791,8 @@ class Mixscape:
             >>> import pertpy as pt
             >>> mdata = pt.dt.papalexi_2021()
             >>> ms_pt = pt.tl.Mixscape()
-            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", "replicate")
-            >>> ms_pt.mixscape(adata=mdata["rna"], control="NT", labels="gene_target", layer="X_pert")
+            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", split_by="replicate")
+            >>> ms_pt.mixscape(mdata["rna"], "gene_target", "NT", layer="X_pert")
             >>> ms_pt.plot_perturbscore(adata=mdata["rna"], labels="gene_target", target_gene="IFNGR2", color="orange")
         Preview:
@@ -778,15 +852,6 @@ class Mixscape:
                 plt.legend(title="gene_target", title_fontsize=14, fontsize=12)
                 sns.despine()
-            if save:
-                plt.savefig(save, bbox_inches="tight")
-            if show:
-                plt.show()
-            if return_fig:
-                return plt.gcf()
-            if not (show or save):
-                return plt.gca()
         # If before_mixscape is False, split densities based on mixscape classifications
         else:
             if palette is None:
@@ -843,19 +908,17 @@ class Mixscape:
                 plt.legend(title="mixscape class", title_fontsize=14, fontsize=12)
                 sns.despine()
-            if save:
-                plt.savefig(save, bbox_inches="tight")
-            if show:
-                plt.show()
-            if return_fig:
-                return plt.gcf()
-            if not (show or save):
-                return plt.gca()
+        if return_fig:
+            return plt.gcf()
+        plt.show()
+        return None
+    @_doc_params(common_plot_args=doc_common_plot_args)
     def plot_violin(  # pragma: no cover
         self,
         adata: AnnData,
         target_gene_idents: str | list[str],
+        *,
         keys: str | Sequence[str] = "mixscape_class_p_ko",
         groupby: str | None = "mixscape_class",
         log: bool = False,
@@ -872,10 +935,9 @@ class Mixscape:
         ylabel: str | Sequence[str] | None = None,
         rotation: float | None = None,
         ax: Axes | None = None,
-        show: bool | None = None,
-        save: bool | str | None = None,
+        return_fig: bool = False,
         **kwargs,
-    ):
+    ) -> Axes | Figure | None:
         """Violin plot using mixscape results.
         Requires `pt.tl.mixscape` to be run first.
@@ -892,21 +954,19 @@ class Mixscape:
             xlabel: Label of the x-axis. Defaults to `groupby` if `rotation` is `None`, otherwise, no label is shown.
             ylabel: Label of the y-axis. If `None` and `groupby` is `None`, defaults to `'value'`.
                     If `None` and `groubpy` is not `None`, defaults to `keys`.
-            show: Show the plot, do not return axis.
-            save: If `True` or a `str`, save the figure. A string is appended to the default filename.
-                  Infer the filetype if ending on {`'.pdf'`, `'.png'`, `'.svg'`}.
             ax: A matplotlib axes object. Only works if plotting a single component.
+            {common_plot_args}
             **kwargs: Additional arguments to `seaborn.violinplot`.
         Returns:
-            A :class:`~matplotlib.axes.Axes` object if `ax` is `None` else `None`.
+            If `return_fig` is `True`, returns the figure (as Axes list if it's a multi-panel plot), otherwise `None`.
         Examples:
             >>> import pertpy as pt
             >>> mdata = pt.dt.papalexi_2021()
             >>> ms_pt = pt.tl.Mixscape()
-            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", "replicate")
-            >>> ms_pt.mixscape(adata=mdata["rna"], control="NT", labels="gene_target", layer="X_pert")
+            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", split_by="replicate")
+            >>> ms_pt.mixscape(mdata["rna"], "gene_target", "NT", layer="X_pert")
             >>> ms_pt.plot_violin(
             ...     adata=mdata["rna"], target_gene_idents=["NT", "IFNGR2 NP", "IFNGR2 KO"], groupby="mixscape_class"
             ... )
@@ -1042,23 +1102,25 @@ class Mixscape:
                 if rotation is not None:
                     ax.tick_params(axis="x", labelrotation=rotation)
-        show = settings.autoshow if show is None else show
         if hue is not None and stripplot is True:
             plt.legend(handles, labels)
-        _utils.savefig_or_show("mixscape_violin", show=show, save=save)
-        if not show:
+        if return_fig:
             if multi_panel and groupby is None and len(ys) == 1:
                 return g
             elif len(axs) == 1:
                 return axs[0]
             else:
                 return axs
+        plt.show()
+        return None
+    @_doc_params(common_plot_args=doc_common_plot_args)
     def plot_lda(  # pragma: no cover
         self,
         adata: AnnData,
         control: str,
+        *,
         mixscape_class: str = "mixscape_class",
         mixscape_class_global: str = "mixscape_class_global",
         perturbation_type: str | None = "KO",
@@ -1066,12 +1128,10 @@ class Mixscape:
         n_components: int | None = None,
         color_map: Colormap | str | None = None,
         palette: str | Sequence[str] | None = None,
-        return_fig: bool | None = None,
         ax: Axes | None = None,
-        show: bool | None = None,
-        save: bool | str | None = None,
+        return_fig: bool = False,
         **kwds,
-    ) -> None:
+    ) -> Figure | None:
         """Visualizing perturbation responses with Linear Discriminant Analysis. Requires `pt.tl.mixscape()` to be run first.
         Args:
@@ -1082,18 +1142,16 @@ class Mixscape:
             perturbation_type: Specify type of CRISPR perturbation expected for labeling mixscape classifications.
             lda_key: If not specified, lda looks .uns["mixscape_lda"] for the LDA results.
             n_components: The number of dimensions of the embedding.
-            show: Show the plot, do not return axis.
-            save: If `True` or a `str`, save the figure. A string is appended to the default filename.
-                  Infer the filetype if ending on {`'.pdf'`, `'.png'`, `'.svg'`}.
+            {common_plot_args}
             **kwds: Additional arguments to `scanpy.pl.umap`.
         Examples:
             >>> import pertpy as pt
             >>> mdata = pt.dt.papalexi_2021()
             >>> ms_pt = pt.tl.Mixscape()
-            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", "replicate")
-            >>> ms_pt.mixscape(adata=mdata["rna"], control="NT", labels="gene_target", layer="X_pert")
-            >>> ms_pt.lda(adata=mdata["rna"], control="NT", labels="gene_target", layer="X_pert")
+            >>> ms_pt.perturbation_signature(mdata["rna"], "perturbation", "NT", split_by="replicate")
+            >>> ms_pt.mixscape(mdata["rna"], "gene_target", "NT", layer="X_pert")
+            >>> ms_pt.lda(mdata["rna"], "gene_target", "NT", split_by="replicate")
             >>> ms_pt.plot_lda(adata=mdata["rna"], control="NT")
         Preview:
@@ -1112,14 +1170,55 @@ class Mixscape:
             n_components = adata_subset.uns[lda_key].shape[1]
         sc.pp.neighbors(adata_subset, use_rep=lda_key)
         sc.tl.umap(adata_subset, n_components=n_components)
-        sc.pl.umap(
+        fig = sc.pl.umap(
             adata_subset,
             color=mixscape_class,
             palette=palette,
             color_map=color_map,
             return_fig=return_fig,
-            show=show,
-            save=save,
+            show=False,
             ax=ax,
             **kwds,
         )
+        if return_fig:
+            return fig
+        plt.show()
+        return None
+class MixscapeGaussianMixture(GaussianMixture):
+    def __init__(
+        self,
+        n_components: int,
+        fixed_means:  Sequence[float] | None = None,
+        fixed_covariances: Sequence[float] | None = None,
+        **kwargs
+    ):
+        """Custom Gaussian Mixture Model where means and covariances can be fixed for specific components.
+        Args:
+            n_components: Number of Gaussian components
+            fixed_means: Means to fix (use None for those that should be estimated)
+            fixed_covariances: Covariances to fix (use None for those that should be estimated)
+            kwargs: Additional arguments passed to scikit-learn's GaussianMixture
+        """
+        super().__init__(n_components=n_components, **kwargs)
+        self.fixed_means = fixed_means
+        self.fixed_covariances = fixed_covariances
+    def _m_step(self, X: np.ndarray, log_resp: np.ndarray):
+        """Modified M-step to respect fixed means and covariances."""
+        super()._m_step(X, log_resp)
+        if self.fixed_means is not None:
+            for i in range(self.n_components):
+                if self.fixed_means[i] is not None:
+                    self.means_[i] = self.fixed_means[i]
+        if self.fixed_covariances is not None:
+            for i in range(self.n_components):
+                if self.fixed_covariances[i] is not None:
+                    self.covariances_[i] = self.fixed_covariances[i]
+        return self

pertpy 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl

pertpy 0.9.4py3-none-any.whl → 0.10.0py3-none-any.whl