PyPI - pertpy - Versions diffs - 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

pertpy/__init__.py +3 -2
pertpy/data/__init__.py +5 -1
pertpy/data/_dataloader.py +2 -4
pertpy/data/_datasets.py +203 -92
pertpy/metadata/__init__.py +4 -0
pertpy/metadata/_cell_line.py +826 -0
pertpy/metadata/_compound.py +129 -0
pertpy/metadata/_drug.py +242 -0
pertpy/metadata/_look_up.py +582 -0
pertpy/metadata/_metadata.py +73 -0
pertpy/metadata/_moa.py +129 -0
pertpy/plot/__init__.py +1 -9
pertpy/plot/_augur.py +53 -116
pertpy/plot/_coda.py +277 -677
pertpy/plot/_guide_rna.py +17 -35
pertpy/plot/_milopy.py +59 -134
pertpy/plot/_mixscape.py +152 -391
pertpy/preprocessing/_guide_rna.py +88 -4
pertpy/tools/__init__.py +8 -13
pertpy/tools/_augur.py +315 -17
pertpy/tools/_cinemaot.py +143 -4
pertpy/tools/_coda/_base_coda.py +1210 -65
pertpy/tools/_coda/_sccoda.py +50 -21
pertpy/tools/_coda/_tasccoda.py +27 -19
pertpy/tools/_dialogue.py +164 -56
pertpy/tools/_differential_gene_expression.py +240 -14
pertpy/tools/_distances/_distance_tests.py +8 -8
pertpy/tools/_distances/_distances.py +184 -34
pertpy/tools/_enrichment.py +465 -0
pertpy/tools/_milo.py +345 -11
pertpy/tools/_mixscape.py +668 -50
pertpy/tools/_perturbation_space/_clustering.py +5 -1
pertpy/tools/_perturbation_space/_discriminator_classifiers.py +526 -0
pertpy/tools/_perturbation_space/_perturbation_space.py +135 -43
pertpy/tools/_perturbation_space/_simple.py +51 -10
pertpy/tools/_scgen/__init__.py +1 -1
pertpy/tools/_scgen/_scgen.py +701 -0
pertpy/tools/_scgen/_utils.py +1 -3
pertpy/tools/decoupler_LICENSE +674 -0
{pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/METADATA +31 -12
pertpy-0.7.0.dist-info/RECORD +53 -0
{pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/WHEEL +1 -1
pertpy/plot/_cinemaot.py +0 -81
pertpy/plot/_dialogue.py +0 -91
pertpy/plot/_scgen.py +0 -337
pertpy/tools/_metadata/__init__.py +0 -0
pertpy/tools/_metadata/_cell_line.py +0 -613
pertpy/tools/_metadata/_look_up.py +0 -342
pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
pertpy/tools/_scgen/_jax_scgen.py +0 -370
pertpy-0.6.0.dist-info/RECORD +0 -50
/pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
{pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/licenses/LICENSE +0 -0

pertpy/tools/_dialogue.py CHANGED Viewed

@@ -2,13 +2,14 @@ from __future__ import annotations
 import itertools
 from collections import defaultdict
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Any, Literal
 import anndata as ad
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import scanpy as sc
-import scipy.sparse as sp
+import seaborn as sns
 import statsmodels.formula.api as smf
 import statsmodels.stats.multitest as ssm
 from anndata import AnnData
@@ -19,10 +20,15 @@ from rich.live import Live
 from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
 from scipy import stats
 from scipy.optimize import nnls
+from seaborn import PairGrid
 from sklearn.linear_model import LinearRegression
 from sparsecca import lp_pmd, multicca_permute, multicca_pmd
 from statsmodels.sandbox.stats.multicomp import multipletests
+if TYPE_CHECKING:
+    from matplotlib.axes import Axes
+    from matplotlib.figure import Figure
 class Dialogue:
     """Python implementation of DIALOGUE"""
@@ -53,8 +59,6 @@ class Dialogue:
         Copied from `https://github.com/schillerlab/sc-toolbox/blob/397e80dc5e8fb8017b75f6c3fa634a1e1213d484/sc_toolbox/tools/__init__.py#L458`
-        # TODO: Replace with decoupler's implementation
         Args:
             groupby: The key to groupby for pseudobulks
             strategy: The pseudobulking strategy. One of "median" or "mean"
@@ -62,14 +66,15 @@ class Dialogue:
         Returns:
             A Pandas DataFrame of pseudobulk counts
         """
+        # TODO: Replace with decoupler's implementation
         pseudobulk = {"Genes": adata.var_names.values}
         for category in adata.obs.loc[:, groupby].cat.categories:
             temp = adata.obs.loc[:, groupby] == category
             if strategy == "median":
-                pseudobulk[category] = adata[temp].X.median(axis=0).A1
+                pseudobulk[category] = adata[temp].X.median(axis=0)
             elif strategy == "mean":
-                pseudobulk[category] = adata[temp].X.mean(axis=0).A1
+                pseudobulk[category] = adata[temp].X.mean(axis=0)
         pseudobulk = pd.DataFrame(pseudobulk).set_index("Genes")
@@ -101,8 +106,6 @@ class Dialogue:
     def _scale_data(self, pseudobulks: pd.DataFrame, normalize: bool = True) -> np.ndarray:
         """Row-wise mean center and scale by the standard deviation.
-        TODO: the `scale` function we implemented to match the R `scale` fn should already contain this functionality.
         Args:
             pseudobulks: The pseudobulk PCA components.
             normalize: Whether to mimic DIALOGUE behavior or not.
@@ -110,9 +113,9 @@ class Dialogue:
         Returns:
             The scaled count matrix.
         """
+        # TODO: the `scale` function we implemented to match the R `scale` fn should already contain this functionality.
         # DIALOGUE doesn't scale the data before passing to multicca, unlike what is recommended by sparsecca.
         # However, performing this scaling _does_ increase overall correlation of the end result
-        # WHEN SAMPLE ORDER AND DIALOGUE2+3 PROCESSING IS IGNORED.
         if normalize:
             return pseudobulks.to_numpy()
         else:
@@ -313,13 +316,13 @@ class Dialogue:
     def _apply_HLM_per_MCP_for_one_pair(
         self,
         mcp_name: str,
-        scores_df: dict,
+        scores_df: pd.DataFrame,
         ct_data: AnnData,
         tme: pd.DataFrame,
         sig: dict,
         n_counts: str,
         formula: str,
-        confounder: str,
+        confounder: str | None,
     ) -> tuple[pd.DataFrame, dict[str, Any]]:
         """Applies hierarchical modeling for a single MCP.
@@ -340,7 +343,7 @@ class Dialogue:
         """
         HLM_result = self._mixed_effects(
             scores=scores_df[[mcp_name]],
-            x_labels=ct_data.obs[[n_counts, confounder]],
+            x_labels=ct_data.obs[[n_counts, confounder]] if confounder else ct_data.obs[[n_counts]],
             tme=tme,
             genes_in_mcp=list(sig[mcp_name]["up"]) + list(sig[mcp_name]["down"]),
             formula=formula,
@@ -367,7 +370,7 @@ class Dialogue:
         return np.array(resid)
     def _iterative_nnls(self, A_orig: np.ndarray, y_orig: np.ndarray, feature_ranks: list[int], n_iter: int = 1000):
-        """Solves non-negative least squares separately for different feature categories.
+        """Solves non-negative least-squares separately for different feature categories.
         Mimics DLG.iterative.nnls.
         Variables are notated according to:
@@ -398,7 +401,7 @@ class Dialogue:
         x_final = np.zeros(A_orig.shape[0])
         Ax = np.zeros(A_orig.shape[1])
-        for _, mask in zip(sig_ranks, masks):
+        for _, mask in zip(sig_ranks, masks, strict=False):
             A = A_orig[mask].T
             coef_nnls, _ = nnls(A, y, maxiter=n_iter)
             y = y - A @ coef_nnls  # residuals
@@ -516,8 +519,8 @@ class Dialogue:
             # TODO: probably format the up and down within get_top_elements
             cca_sig: dict[str, Any] = defaultdict(dict)
             for i in range(0, int(len(cca_sig_unformatted) / 2)):
-                cca_sig[f"MCP{i + 1}"]["up"] = cca_sig_unformatted[i * 2]
-                cca_sig[f"MCP{i + 1}"]["down"] = cca_sig_unformatted[i * 2 + 1]
+                cca_sig[f"MCP{i}"]["up"] = cca_sig_unformatted[i * 2]
+                cca_sig[f"MCP{i}"]["down"] = cca_sig_unformatted[i * 2 + 1]
             cca_sig = dict(cca_sig)
             cca_sig_results[ct] = cca_sig
@@ -555,7 +558,7 @@ class Dialogue:
         return cca_sig_results, new_mcp_scores
-    def load(
+    def _load(
         self,
         adata: AnnData,
         ct_order: list[str],
@@ -574,16 +577,6 @@ class Dialogue:
         Returns:
             A celltype_label:array dictionary.
-        Examples:
-            >>> import pertpy as pt
-            >>> import scanpy as sc
-            >>> adata = pt.dt.dialogue_example()
-            >>> sc.pp.pca(adata)
-            >>> dl = pt.tl.Dialogue(sample_id = "clinical.status", celltype_key = "cell.subtypes", \
-                n_counts_key = "nCount_RNA", n_mpcs = 3)
-            >>> cell_types = adata.obs[dl.celltype_key].astype("category").cat.categories
-            >>> mcca_in, ct_subs = dl.load(adata, ct_order=cell_types)
         """
         ct_subs = {ct: adata[adata.obs[self.celltype_key] == ct].copy() for ct in ct_order}
         fn = self._pseudobulk_pca if agg_pca else self._get_pseudobulks
@@ -631,19 +624,19 @@ class Dialogue:
             >>> import scanpy as sc
             >>> adata = pt.dt.dialogue_example()
             >>> sc.pp.pca(adata)
-            >>> dl = pt.tl.Dialogue(sample_id = "clinical.status", celltype_key = "cell.subtypes", \
-                n_counts_key = "nCount_RNA", n_mpcs = 3)
+            >>> dl = pt.tl.Dialogue(
+            ...     sample_id="clinical.status", celltype_key="cell.subtypes", n_counts_key="nCount_RNA", n_mpcs=3
+            ... )
             >>> adata, mcps, ws, ct_subs = dl.calculate_multifactor_PMD(adata, normalize=True)
         """
-        # IMPORTANT NOTE: the order in which matrices are passed to multicca matters. As such,
-        # it is important here that to obtain the same result as in R, we pass the matrices in
-        # in the same order.
+        # IMPORTANT NOTE: the order in which matrices are passed to multicca matters.
+        # As such, to obtain the same result as in R, it is important that we pass the matrices in the same order.
         if ct_order is not None:
             cell_types = ct_order
         else:
             ct_order = cell_types = adata.obs[self.celltype_key].astype("category").cat.categories
-        mcca_in, ct_subs = self.load(adata, ct_order=cell_types, agg_pca=agg_pca, normalize=normalize)
+        mcca_in, ct_subs = self._load(adata, ct_order=cell_types, agg_pca=agg_pca, normalize=normalize)
         n_samples = mcca_in[0].shape[1]
         if penalties is None:
@@ -685,7 +678,7 @@ class Dialogue:
         ct_subs: dict,
         mcp_scores: dict,
         ws_dict: dict,
-        confounder: str,
+        confounder: str | None,
         formula: str = None,
     ):
         """Runs the multilevel modeling step to match genes to MCPs and generate p-values for MCPs.
@@ -700,7 +693,6 @@ class Dialogue:
             A Pandas DataFrame containing:
             - for each mcp: HLM_result_1, HLM_result_2, sig_genes_1, sig_genes_2
             - merged HLM_result_1, HLM_result_2, sig_genes_1, sig_genes_2 of all mcps
-            TODO: Describe both returns
         Examples:
             >>> import pertpy as pt
@@ -713,7 +705,9 @@ class Dialogue:
             >>> all_results, new_mcps = dl.multilevel_modeling(ct_subs=ct_subs, mcp_scores=mcps, ws_dict=ws, \
                 confounder="gender")
         """
-        # all possible pairs of cell types with out pairing same cell type
+        # TODO: Describe the returns of the function better
+        # all possible pairs of cell types without pairing same cell type
         cell_types = list(ct_subs.keys())
         pairs = list(itertools.combinations(cell_types, 2))
@@ -721,9 +715,9 @@ class Dialogue:
             formula = f"y ~ x + {self.n_counts_key}"
         # Hierarchical modeling expects DataFrames
-        mcp_cell_types = {f"MCP{i + 1}": cell_types for i in range(self.n_mcps)}
+        mcp_cell_types = {f"MCP{i}": cell_types for i in range(self.n_mcps)}
         mcp_scores_df = {
-            ct: pd.DataFrame(v, index=ct_subs[ct].obs.index, columns=mcp_cell_types.keys())
+            ct: pd.DataFrame(v, index=ct_subs[ct].obs.index, columns=list(mcp_cell_types.keys()))
             for ct, v in mcp_scores.items()
         }
@@ -805,7 +799,7 @@ class Dialogue:
                 for mcp in mcps:
                     mixed_model_progress.update(mm_task, description=f"[bold blue]Determining mixed effects for {mcp}")
-                    # TODO Check that the genes in result{sig_genes_1] are different and if so note that somewhere and explain why
+                    # TODO Check whether the genes in result["sig_genes_1"] are different and if so note that somewhere and explain why
                     result = {}
                     result["HLM_result_1"], result["sig_genes_1"] = self._apply_HLM_per_MCP_for_one_pair(
                         mcp_name=mcp,
@@ -875,22 +869,19 @@ class Dialogue:
         sample_label = self.sample_id
         n_mcps = self.n_mcps
-        # create conditions_compare if not supplied
         if conditions_compare is None:
-            conditions_compare = list(adata.obs["path_str"].cat.categories)  # type: ignore
+            conditions_compare = list(adata.obs[condition_label].cat.categories)  # type: ignore
             if len(conditions_compare) != 2:
                 raise ValueError("Please specify conditions to compare or supply an object with only 2 conditions")
-        # create data frames to store results
         pvals = pd.DataFrame(1, adata.obs[celltype_label].unique(), ["mcp_" + str(n) for n in range(0, n_mcps)])
         tstats = pd.DataFrame(1, adata.obs[celltype_label].unique(), ["mcp_" + str(n) for n in range(0, n_mcps)])
         pvals_adj = pd.DataFrame(1, adata.obs[celltype_label].unique(), ["mcp_" + str(n) for n in range(0, n_mcps)])
         response = adata.obs.groupby(sample_label)[condition_label].agg(pd.Series.mode)
         for celltype in adata.obs[celltype_label].unique():
-            # subset data to cell type
             df = adata.obs[adata.obs[celltype_label] == celltype]
-            # run t-test for each MCP
             for mcpnum in ["mcp_" + str(n) for n in range(0, n_mcps)]:
                 mns = df.groupby(sample_label)[mcpnum].mean()
                 mns = pd.concat([mns, response], axis=1)
@@ -900,11 +891,10 @@ class Dialogue:
                 )
                 pvals.loc[celltype, mcpnum] = res[1]
                 tstats.loc[celltype, mcpnum] = res[0]
-                # return(res)
-        # benjamini-hochberg correction for number of cell types (use BH because correlated MCPs)
         for mcpnum in ["mcp_" + str(n) for n in range(0, n_mcps)]:
             pvals_adj[mcpnum] = multipletests(pvals[mcpnum], method="fdr_bh")[1]
         return {"pvals": pvals, "tstats": tstats, "pvals_adj": pvals_adj}
     def get_mlm_mcp_genes(
@@ -921,7 +911,7 @@ class Dialogue:
             celltype: Cell type of interest.
             results: dl.MultilevelModeling result object.
             MCP: MCP key of the result object.
-            threshhold: Number between [0,1]. The fraction of cell types compared against which must have the associated MCP gene.
+            threshold: Number between [0,1]. The fraction of cell types compared against which must have the associated MCP gene.
                         Defaults to 0.70.
             focal_celltypes: None (compare against all cell types) or a list of other cell types which you want to compare against.
                              Defaults to None.
@@ -945,7 +935,6 @@ class Dialogue:
         # REMOVE THIS BLOCK ONCE MLM OUTPUT MATCHES STANDARD
         if MCP.startswith("mcp_"):
             MCP = MCP.replace("mcp_", "MCP")
-            # convert from MCPx to MCPx+1
             MCP = "MCP" + str(int(MCP[3:]) - 1)
         # Extract all comparison keys from the results object
@@ -1014,17 +1003,16 @@ class Dialogue:
             objects containing the results of gene ranking analysis.
         Examples:
-            ct_subs = {
-            "subpop1": anndata_obj1,
-            "subpop2": anndata_obj2,
-            # ... more subpopulations ...
-            }
-            genes_results = _get_extrema_MCP_genes_single(ct_subs, mcp="mcp_4", fraction=0.2)
+            >>> ct_subs = {
+            ...     "subpop1": anndata_obj1,
+            ...     "subpop2": anndata_obj2,
+            ...     # ... more subpopulations ...
+            ... }
+            >>> genes_results = _get_extrema_MCP_genes_single(ct_subs, mcp="mcp_4", fraction=0.2)
         """
         genes = {}
         for ct in ct_subs.keys():
             mini = ct_subs[ct]
-            mini.obs[mcp]
             mini.obs["extrema"] = pd.qcut(
                 mini.obs[mcp],
                 [0, 0 + fraction, 1 - fraction, 1.0],
@@ -1034,6 +1022,7 @@ class Dialogue:
                 mini, "extrema", groups=["high" + mcp + " " + ct], reference="low " + mcp + " " + ct
             )
             genes[ct] = mini  # .uns['rank_genes_groups']
         return genes
     def get_extrema_MCP_genes(self, ct_subs: dict, fraction: float = 0.1):
@@ -1064,7 +1053,7 @@ class Dialogue:
             >>> extrema_mcp_genes = dl.get_extrema_MCP_genes(ct_subs)
         """
         rank_dfs: dict[str, dict[Any, Any]] = {}
-        _, ct_sub = next(iter(ct_subs.items()))
+        ct_sub = next(iter(ct_subs.values()))
         mcps = [col for col in ct_sub.obs.columns if col.startswith("mcp_")]
         for mcp in mcps:
@@ -1072,4 +1061,123 @@ class Dialogue:
             ct_ranked = self._get_extrema_MCP_genes_single(ct_subs, mcp=mcp, fraction=fraction)
             for celltype in ct_ranked.keys():
                 rank_dfs[mcp][celltype] = sc.get.rank_genes_groups_df(ct_ranked[celltype], group=None)
         return rank_dfs
+    def plot_split_violins(
+        self,
+        adata: AnnData,
+        split_key: str,
+        celltype_key: str,
+        split_which: tuple[str, str] | None = None,
+        mcp: str = "mcp_0",
+        return_fig: bool | None = None,
+        ax: Axes | None = None,
+        save: bool | str | None = None,
+        show: bool | None = None,
+    ) -> Axes | Figure | None:
+        """Plots split violin plots for a given MCP and split variable.
+        Any cells with a value for split_key not in split_which are removed from the plot.
+        Args:
+            adata: Annotated data object.
+            split_key: Variable in adata.obs used to split the data.
+            celltype_key: Key for cell type annotations.
+            split_which: Which values of split_key to plot. Required if more than 2 values in split_key.
+                         Defaults to None, in which case every value found in split_key is kept.
+            mcp: Key for MCP data. Defaults to "mcp_0".
+            return_fig: If truthy, return the figure that holds the plot.
+            ax: Axes to draw into. Defaults to None, in which case seaborn draws into the current Axes.
+            save: If truthy, the value is forwarded to `savefig` as the output file name.
+            show: If truthy, `plt.show()` is called after plotting.
+        Returns:
+            The :class:`~matplotlib.figure.Figure` if `return_fig` is set, the :class:`~matplotlib.axes.Axes`
+            if neither `show` nor `save` is set, otherwise None.
+        Examples:
+            >>> import pertpy as pt
+            >>> import scanpy as sc
+            >>> adata = pt.dt.dialogue_example()
+            >>> sc.pp.pca(adata)
+            >>> dl = pt.tl.Dialogue(
+            ...     sample_id="clinical.status", celltype_key="cell.subtypes", n_counts_key="nCount_RNA", n_mpcs=3
+            ... )
+            >>> adata, mcps, ws, ct_subs = dl.calculate_multifactor_PMD(adata, normalize=True)
+            >>> dl.plot_split_violins(adata, split_key="gender", celltype_key="cell.subtypes")
+        Preview:
+            .. image:: /_static/docstring_previews/dialogue_violin.png
+        """
+        df = sc.get.obs_df(adata, [celltype_key, mcp, split_key])
+        if split_which is None:
+            split_which = df[split_key].unique()
+        # Copy the filtered frame so the column assignment below never writes into a view of `df`.
+        df = df[df[split_key].isin(split_which)].copy()
+        # Casting first keeps `.cat` usable when split_key is not stored as a categorical column.
+        df[split_key] = df[split_key].astype("category").cat.remove_unused_categories()
+        # Forward `ax` so a caller-supplied Axes is drawn into instead of being silently replaced.
+        ax = sns.violinplot(data=df, x=celltype_key, y=mcp, hue=split_key, split=True, ax=ax)
+        ax.tick_params(axis="x", labelrotation=90)
+        # Use the Axes' own figure: it need not be pyplot's current figure when `ax` was passed in,
+        # and `plt.gcf()` can hand back a fresh empty figure once `plt.show()` has returned.
+        fig = ax.figure
+        if save:
+            fig.savefig(save, bbox_inches="tight")
+        if show:
+            plt.show()
+        if return_fig:
+            return fig
+        if not (show or save):
+            return ax
+        return None
+    def plot_pairplot(
+        self,
+        adata: AnnData,
+        celltype_key: str,
+        color: str,
+        sample_id: str,
+        mcp: str = "mcp_0",
+        return_fig: bool | None = None,
+        show: bool | None = None,
+        save: bool | str | None = None,
+    ) -> PairGrid | Figure | None:
+        """Generate a pairplot of per-sample mean MCP values across cell types.
+        Computes the mean of a specified MCP feature (mcp) for each combination of sample and cell type,
+        then creates a pairplot to visualize the relationships between these mean MCP values.
+        Args:
+            adata: Annotated data object.
+            celltype_key: Key in `adata.obs` containing cell type annotations.
+            color: Key in `adata.obs` for color annotations. This parameter is used as the hue.
+            sample_id: Key in `adata.obs` for the sample annotations.
+            mcp: Key in `adata.obs` for MCP feature values. Defaults to `"mcp_0"`.
+            return_fig: If truthy, return the figure that holds the plot.
+            show: If truthy, `plt.show()` is called after plotting.
+            save: If truthy, the value is forwarded to `savefig` as the output file name.
+        Returns:
+            The :class:`~matplotlib.figure.Figure` if `return_fig` is set, the seaborn PairGrid object
+            if neither `show` nor `save` is set, otherwise None.
+        Examples:
+            >>> import pertpy as pt
+            >>> import scanpy as sc
+            >>> adata = pt.dt.dialogue_example()
+            >>> sc.pp.pca(adata)
+            >>> dl = pt.tl.Dialogue(
+            ...     sample_id="clinical.status", celltype_key="cell.subtypes", n_counts_key="nCount_RNA", n_mpcs=3
+            ... )
+            >>> adata, mcps, ws, ct_subs = dl.calculate_multifactor_PMD(adata, normalize=True)
+            >>> dl.plot_pairplot(adata, celltype_key="cell.subtypes", color="gender", sample_id="clinical.status")
+        Preview:
+            .. image:: /_static/docstring_previews/dialogue_pairplot.png
+        """
+        mean_mcps = adata.obs.groupby([sample_id, celltype_key])[mcp].mean().reset_index()
+        # One row per sample, one column per cell type, holding the mean MCP value.
+        mcp_pivot = pd.pivot(mean_mcps[[sample_id, celltype_key, mcp]], index=sample_id, columns=celltype_key)[mcp]
+        # Label every sample with the `top` entry of `describe()` for its `color` values, in pivot row order.
+        aggstats = adata.obs.groupby([sample_id])[color].describe()
+        sample_color = aggstats.loc[list(mcp_pivot.index), "top"].rename(color)
+        mcp_pivot = pd.concat([mcp_pivot, sample_color], axis=1)
+        grid = sns.pairplot(mcp_pivot, hue=color, corner=True)
+        # Use the grid's own figure: `plt.gcf()` can hand back a fresh empty figure once `plt.show()` has returned.
+        fig = grid.figure
+        if save:
+            fig.savefig(save, bbox_inches="tight")
+        if show:
+            plt.show()
+        if return_fig:
+            return fig
+        if not (show or save):
+            return grid
+        return None

pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl