PyPI - pertpy - Versions diffs - 0.9.3__py3-none-any.whl → 0.9.5__py3-none-any.whl - Mend

pertpy 0.9.3__py3-none-any.whl → 0.9.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

pertpy/__init__.py +1 -1
pertpy/_doc.py +20 -0
pertpy/data/_dataloader.py +4 -4
pertpy/data/_datasets.py +3 -3
pertpy/metadata/_cell_line.py +19 -7
pertpy/metadata/_compound.py +3 -4
pertpy/metadata/_metadata.py +1 -1
pertpy/preprocessing/_guide_rna.py +19 -6
pertpy/tools/__init__.py +12 -15
pertpy/tools/_augur.py +36 -46
pertpy/tools/_cinemaot.py +24 -18
pertpy/tools/_coda/_base_coda.py +87 -106
pertpy/tools/_dialogue.py +17 -21
pertpy/tools/_differential_gene_expression/__init__.py +1 -2
pertpy/tools/_differential_gene_expression/_base.py +495 -113
pertpy/tools/_differential_gene_expression/_edger.py +30 -21
pertpy/tools/_differential_gene_expression/_pydeseq2.py +15 -29
pertpy/tools/_differential_gene_expression/_statsmodels.py +0 -11
pertpy/tools/_distances/_distances.py +15 -8
pertpy/tools/_enrichment.py +18 -8
pertpy/tools/_milo.py +58 -46
pertpy/tools/_mixscape.py +111 -100
pertpy/tools/_perturbation_space/_perturbation_space.py +40 -31
pertpy/tools/_perturbation_space/_simple.py +50 -0
pertpy/tools/_scgen/_scgen.py +35 -25
{pertpy-0.9.3.dist-info → pertpy-0.9.5.dist-info}/METADATA +5 -4
{pertpy-0.9.3.dist-info → pertpy-0.9.5.dist-info}/RECORD +29 -29
{pertpy-0.9.3.dist-info → pertpy-0.9.5.dist-info}/WHEEL +1 -1
pertpy/tools/_differential_gene_expression/_formulaic.py +0 -189
{pertpy-0.9.3.dist-info → pertpy-0.9.5.dist-info}/licenses/LICENSE +0 -0

pertpy/tools/_differential_gene_expression/_base.py CHANGED Viewed

@@ -1,7 +1,7 @@
-import os
+import math
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from itertools import chain
+from collections.abc import Iterable, Mapping, Sequence
+from itertools import zip_longest
 from types import MappingProxyType
 import adjustText
@@ -11,27 +11,14 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
+from formulaic_contrasts import FormulaicContrasts
+from lamin_utils import logger
+from matplotlib.pyplot import Figure
 from matplotlib.ticker import MaxNLocator
+from pertpy._doc import _doc_params, doc_common_plot_args
+from pertpy.tools import PseudobulkSpace
 from pertpy.tools._differential_gene_expression._checks import check_is_numeric_matrix
-from pertpy.tools._differential_gene_expression._formulaic import (
-    AmbiguousAttributeError,
-    Factor,
-    get_factor_storage_and_materializer,
-    resolve_ambiguous,
-)
-@dataclass
-class Contrast:
-    """Simple contrast for comparison between groups"""
-    column: str
-    baseline: str
-    group_to_compare: str
-ContrastType = Contrast | tuple[str, str, str]
 class MethodBase(ABC):
@@ -58,7 +45,7 @@ class MethodBase(ABC):
         if self.layer is None:
             return self.adata.X
         else:
-            return self.adata.layer[self.layer]
+            return self.adata.layers[self.layer]
     @classmethod
     @abstractmethod
@@ -91,9 +78,28 @@ class MethodBase(ABC):
         Returns:
             Pandas dataframe with results ordered by significance. If multiple comparisons were performed this is indicated in an additional column.
+        Examples:
+            >>> # Example with EdgeR
+            >>> import pertpy as pt
+            >>> adata = pt.dt.zhang_2021()
+            >>> adata.layers["counts"] = adata.X.copy()
+            >>> ps = pt.tl.PseudobulkSpace()
+            >>> pdata = ps.compute(
+            ...     adata,
+            ...     target_col="Patient",
+            ...     groups_col="Cluster",
+            ...     layer_key="counts",
+            ...     mode="sum",
+            ...     min_cells=10,
+            ...     min_counts=1000,
+            ... )
+            >>> edgr = pt.tl.EdgeR(pdata, design="~Efficacy+Treatment")
+            >>> res_df = edgr.compare_groups(pdata, column="Efficacy", baseline="SD", groups_to_compare=["PR", "PD"])
         """
         ...
+    @_doc_params(common_plot_args=doc_common_plot_args)
     def plot_volcano(
         self,
         data: pd.DataFrame | ad.AnnData,
@@ -115,13 +121,14 @@ class MethodBase(ABC):
         figsize: tuple[int, int] = (5, 5),
         legend_pos: tuple[float, float] = (1.6, 1),
         point_sizes: tuple[int, int] = (15, 150),
-        save: bool | str | None = None,
         shapes: list[str] | None = None,
         shape_order: list[str] | None = None,
         x_label: str | None = None,
         y_label: str | None = None,
+        show: bool = True,
+        return_fig: bool = False,
         **kwargs: int,
-    ) -> None:
+    ) -> Figure | None:
         """Creates a volcano plot from a pandas DataFrame or Anndata.
         Args:
@@ -143,12 +150,40 @@ class MethodBase(ABC):
             top_right_frame: Whether to show the top and right frame of the plot.
             figsize: Size of the figure.
             legend_pos: Position of the legend as determined by matplotlib.
-            save: Saves the plot if True or to the path provided.
             shapes: List of matplotlib marker ids.
             shape_order: Order of categories for shapes.
             x_label: Label for the x-axis.
             y_label: Label for the y-axis.
+            {common_plot_args}
             **kwargs: Additional arguments for seaborn.scatterplot.
+        Returns:
+            If `return_fig` is `True`, returns the figure, otherwise `None`.
+        Examples:
+            >>> # Example with EdgeR
+            >>> import pertpy as pt
+            >>> adata = pt.dt.zhang_2021()
+            >>> adata.layers["counts"] = adata.X.copy()
+            >>> ps = pt.tl.PseudobulkSpace()
+            >>> pdata = ps.compute(
+            ...     adata,
+            ...     target_col="Patient",
+            ...     groups_col="Cluster",
+            ...     layer_key="counts",
+            ...     mode="sum",
+            ...     min_cells=10,
+            ...     min_counts=1000,
+            ... )
+            >>> edgr = pt.tl.EdgeR(pdata, design="~Efficacy+Treatment")
+            >>> edgr.fit()
+            >>> res_df = edgr.test_contrasts(
+            ...     edgr.contrast(column="Treatment", baseline="Chemo", group_to_compare="Anti-PD-L1+Chemo")
+            ... )
+            >>> edgr.plot_volcano(res_df, log2fc_thresh=0)
+        Preview:
+            .. image:: /_static/docstring_previews/de_volcano.png
         """
         if colors is None:
             colors = ["gray", "#D62728", "#1F77B4"]
@@ -243,7 +278,7 @@ class MethodBase(ABC):
             if varm_key is None:
                 raise ValueError("Please pass a .varm key to use for plotting")
-            raise NotImplementedError("Anndata not implemented yet")
+            raise NotImplementedError("Anndata not implemented yet")  # TODO: Implement this
             df = data.varm[varm_key].copy()
         df = data.copy(deep=True)
@@ -449,26 +484,412 @@ class MethodBase(ABC):
         plt.legend(loc=1, bbox_to_anchor=legend_pos, frameon=False)
-        # TODO replace with scanpy save style
-        if save:
-            files = os.listdir()
-            for x in range(100):
-                file_pref = "volcano_" + "%02d" % (x,)
-                if len([x for x in files if x.startswith(file_pref)]) == 0:
-                    plt.savefig(file_pref + ".png", dpi=300, bbox_inches="tight")
-                    plt.savefig(file_pref + ".svg", bbox_inches="tight")
-                    break
-        elif isinstance(save, str):
-            plt.savefig(save + ".png", dpi=300, bbox_inches="tight")
-            plt.savefig(save + ".svg", bbox_inches="tight")
+        if show:
+            plt.show()
+        if return_fig:
+            return plt.gcf()
+        return None
+    @_doc_params(common_plot_args=doc_common_plot_args)
+    def plot_paired(
+        self,
+        adata: ad.AnnData,
+        results_df: pd.DataFrame,
+        groupby: str,
+        pairedby: str,
+        *,
+        var_names: Sequence[str] = None,
+        n_top_vars: int = 15,
+        layer: str = None,
+        pvalue_col: str = "adj_p_value",
+        symbol_col: str = "variable",
+        n_cols: int = 4,
+        panel_size: tuple[int, int] = (5, 5),
+        show_legend: bool = True,
+        size: int = 10,
+        y_label: str = "expression",
+        pvalue_template=lambda x: f"p={x:.2e}",
+        boxplot_properties=None,
+        palette=None,
+        show: bool = True,
+        return_fig: bool = False,
+    ) -> Figure | None:
+        """Creates a pairwise expression plot from a Pandas DataFrame or Anndata.
+        Visualizes a panel of paired scatterplots per variable.
+        Args:
+            adata: AnnData object, can be pseudobulked.
+            results_df: DataFrame with results from a differential expression test.
+            groupby: .obs column containing the grouping. Must contain exactly two different values.
+            pairedby: .obs column containing the pairing (e.g. "patient_id"). If None, an independent t-test is performed.
+            var_names: Variables to plot.
+            n_top_vars: Number of top variables to plot.
+            layer: Layer to use for plotting.
+            pvalue_col: Column name of the p values.
+            symbol_col: Column name of gene IDs.
+            n_cols: Number of columns in the plot.
+            panel_size: Size of each panel.
+            show_legend: Whether to show the legend.
+            size: Size of the points.
+            y_label: Label for the y-axis.
+            pvalue_template: Template for the p-value string displayed in the title of each panel.
+            boxplot_properties: Additional properties for the boxplot, passed to seaborn.boxplot.
+            palette: Color palette for the line- and stripplot.
+            {common_plot_args}
+        Returns:
+            If `return_fig` is `True`, returns the figure, otherwise `None`.
+        Examples:
+            >>> # Example with EdgeR
+            >>> import pertpy as pt
+            >>> adata = pt.dt.zhang_2021()
+            >>> adata.layers["counts"] = adata.X.copy()
+            >>> ps = pt.tl.PseudobulkSpace()
+            >>> pdata = ps.compute(
+            ...     adata,
+            ...     target_col="Patient",
+            ...     groups_col="Cluster",
+            ...     layer_key="counts",
+            ...     mode="sum",
+            ...     min_cells=10,
+            ...     min_counts=1000,
+            ... )
+            >>> edgr = pt.tl.EdgeR(pdata, design="~Efficacy+Treatment")
+            >>> edgr.fit()
+            >>> res_df = edgr.test_contrasts(
+            ...     edgr.contrast(column="Treatment", baseline="Chemo", group_to_compare="Anti-PD-L1+Chemo")
+            ... )
+            >>> edgr.plot_paired(pdata, results_df=res_df, n_top_vars=8, groupby="Treatment", pairedby="Efficacy")
+        Preview:
+            .. image:: /_static/docstring_previews/de_paired_expression.png
+        """
+        if boxplot_properties is None:
+            boxplot_properties = {}
+        groups = adata.obs[groupby].unique()
+        if len(groups) != 2:
+            raise ValueError("The number of groups in the group_by column must be exactly 2 to enable paired testing")
+        if var_names is None:
+            var_names = results_df.head(n_top_vars)[symbol_col].tolist()
+        adata = adata[:, var_names]
+        if any(adata.obs[[groupby, pairedby]].value_counts() > 1):
+            logger.info("Performing pseudobulk for paired samples")
+            ps = PseudobulkSpace()
+            adata = ps.compute(
+                adata, target_col=groupby, groups_col=pairedby, layer_key=layer, mode="sum", min_cells=1, min_counts=1
+            )
+        if layer is not None:
+            X = adata.layers[layer]
+        else:
+            X = adata.X
+        try:
+            X = X.toarray()
+        except AttributeError:
+            pass
+        groupby_cols = [pairedby, groupby]
+        df = adata.obs.loc[:, groupby_cols].join(pd.DataFrame(X, index=adata.obs_names, columns=var_names))
+        # remove unpaired samples
+        paired_samples = set(df[df[groupby] == groups[0]][pairedby]) & set(df[df[groupby] == groups[1]][pairedby])
+        df = df[df[pairedby].isin(paired_samples)]
+        removed_samples = adata.obs[pairedby].nunique() - len(df[pairedby].unique())
+        if removed_samples > 0:
+            logger.warning(f"{removed_samples} unpaired samples removed")
+        pvalues = results_df.set_index(symbol_col).loc[var_names, pvalue_col].values
+        df.reset_index(drop=False, inplace=True)
+        # transform data for seaborn
+        df_melt = df.melt(
+            id_vars=groupby_cols,
+            var_name="var",
+            value_name="val",
+        )
+        n_panels = len(var_names)
+        nrows = math.ceil(n_panels / n_cols)
+        ncols = min(n_cols, n_panels)
+        fig, axes = plt.subplots(
+            nrows,
+            ncols,
+            figsize=(ncols * panel_size[0], nrows * panel_size[1]),
+            tight_layout=True,
+            squeeze=False,
+        )
+        axes = axes.flatten()
+        for i, (var, ax) in enumerate(zip_longest(var_names, axes)):
+            if var is not None:
+                sns.boxplot(
+                    x=groupby,
+                    data=df_melt.loc[df_melt["var"] == var],
+                    y="val",
+                    ax=ax,
+                    color="white",
+                    fliersize=0,
+                    **boxplot_properties,
+                )
+                if pairedby is not None:
+                    sns.lineplot(
+                        x=groupby,
+                        data=df_melt.loc[df_melt["var"] == var],
+                        y="val",
+                        ax=ax,
+                        hue=pairedby,
+                        legend=False,
+                        errorbar=None,
+                        palette=palette,
+                    )
+                jitter = 0 if pairedby else True
+                sns.stripplot(
+                    x=groupby,
+                    data=df_melt.loc[df_melt["var"] == var],
+                    y="val",
+                    ax=ax,
+                    hue=pairedby,
+                    jitter=jitter,
+                    size=size,
+                    linewidth=1,
+                    palette=palette,
+                )
+                ax.set_xlabel("")
+                ax.tick_params(
+                    axis="x",
+                    labelsize=15,
+                )
+                ax.legend().set_visible(False)
+                ax.set_ylabel(y_label)
+                ax.set_title(f"{var}\n{pvalue_template(pvalues[i])}")
+            else:
+                ax.set_visible(False)
+        fig.tight_layout()
+        if show_legend is True:
+            axes[n_panels - 1].legend().set_visible(True)
+            axes[n_panels - 1].legend(
+                bbox_to_anchor=(0.5, -0.1), loc="upper center", ncol=adata.obs[pairedby].nunique()
+            )
+        plt.tight_layout()
+        if show:
+            plt.show()
+        if return_fig:
+            return plt.gcf()
+        return None
+    @_doc_params(common_plot_args=doc_common_plot_args)
+    def plot_fold_change(
+        self,
+        results_df: pd.DataFrame,
+        *,
+        var_names: Sequence[str] = None,
+        n_top_vars: int = 15,
+        log2fc_col: str = "log_fc",
+        symbol_col: str = "variable",
+        y_label: str = "Log2 fold change",
+        figsize: tuple[int, int] = (10, 5),
+        show: bool = True,
+        return_fig: bool = False,
+        **barplot_kwargs,
+    ) -> Figure | None:
+        """Plot a metric from the results as a bar chart, optionally with additional information about paired samples in a scatter plot.
+        Args:
+            results_df: DataFrame with results from DE analysis.
+            var_names: Variables to plot. If None, the top n_top_vars variables based on the log2 fold change are plotted.
+            n_top_vars: Number of top variables to plot. The top and bottom n_top_vars variables are plotted, respectively.
+            log2fc_col: Column name of log2 Fold-Change values.
+            symbol_col: Column name of gene IDs.
+            y_label: Label for the y-axis.
+            figsize: Size of the figure.
+            {common_plot_args}
+            **barplot_kwargs: Additional arguments for seaborn.barplot.
+        Returns:
+            If `return_fig` is `True`, returns the figure, otherwise `None`.
+        Examples:
+            >>> # Example with EdgeR
+            >>> import pertpy as pt
+            >>> adata = pt.dt.zhang_2021()
+            >>> adata.layers["counts"] = adata.X.copy()
+            >>> ps = pt.tl.PseudobulkSpace()
+            >>> pdata = ps.compute(
+            ...     adata,
+            ...     target_col="Patient",
+            ...     groups_col="Cluster",
+            ...     layer_key="counts",
+            ...     mode="sum",
+            ...     min_cells=10,
+            ...     min_counts=1000,
+            ... )
+            >>> edgr = pt.tl.EdgeR(pdata, design="~Efficacy+Treatment")
+            >>> edgr.fit()
+            >>> res_df = edgr.test_contrasts(
+            ...     edgr.contrast(column="Treatment", baseline="Chemo", group_to_compare="Anti-PD-L1+Chemo")
+            ... )
+            >>> edgr.plot_fold_change(res_df)
+        Preview:
+            .. image:: /_static/docstring_previews/de_fold_change.png
+        """
+        if var_names is None:
+            var_names = results_df.sort_values(log2fc_col, ascending=False).head(n_top_vars)[symbol_col].tolist()
+            var_names += results_df.sort_values(log2fc_col, ascending=True).head(n_top_vars)[symbol_col].tolist()
+            assert len(var_names) == 2 * n_top_vars
+        df = results_df[results_df[symbol_col].isin(var_names)]
+        df.sort_values(log2fc_col, ascending=False, inplace=True)
+        plt.figure(figsize=figsize)
+        sns.barplot(
+            x=symbol_col,
+            y=log2fc_col,
+            data=df,
+            palette="RdBu",
+            legend=False,
+            **barplot_kwargs,
+        )
+        plt.xticks(rotation=90)
+        plt.xlabel("")
+        plt.ylabel(y_label)
+        if show:
+            plt.show()
+        if return_fig:
+            return plt.gcf()
+        return None
+    @_doc_params(common_plot_args=doc_common_plot_args)
+    def plot_multicomparison_fc(
+        self,
+        results_df: pd.DataFrame,
+        *,
+        n_top_vars=15,
+        contrast_col: str = "contrast",
+        log2fc_col: str = "log_fc",
+        pvalue_col: str = "adj_p_value",
+        symbol_col: str = "variable",
+        marker_size: int = 100,
+        figsize: tuple[int, int] = (10, 2),
+        x_label: str = "Contrast",
+        y_label: str = "Gene",
+        show: bool = True,
+        return_fig: bool = False,
+        **heatmap_kwargs,
+    ) -> Figure | None:
+        """Plot a matrix of log2 fold changes from the results.
+        Args:
+            results_df: DataFrame with results from DE analysis.
+            n_top_vars: Number of top variables to plot per group.
+            contrast_col: Column in results_df containing information about the contrast.
+            log2fc_col: Column in results_df containing the log2 fold change.
+            pvalue_col: Column in results_df containing the p-value. Can be used to switch between adjusted and unadjusted p-values.
+            symbol_col: Column in results_df containing the gene symbol.
+            marker_size: Size of the biggest marker for significant variables.
+            figsize: Size of the figure.
+            x_label: Label for the x-axis.
+            y_label: Label for the y-axis.
+            {common_plot_args}
+            **heatmap_kwargs: Additional arguments for seaborn.heatmap.
+        Returns:
+            If `return_fig` is `True`, returns the figure, otherwise `None`.
+        Examples:
+            >>> # Example with EdgeR
+            >>> import pertpy as pt
+            >>> adata = pt.dt.zhang_2021()
+            >>> adata.layers["counts"] = adata.X.copy()
+            >>> ps = pt.tl.PseudobulkSpace()
+            >>> pdata = ps.compute(
+            ...     adata,
+            ...     target_col="Patient",
+            ...     groups_col="Cluster",
+            ...     layer_key="counts",
+            ...     mode="sum",
+            ...     min_cells=10,
+            ...     min_counts=1000,
+            ... )
+            >>> edgr = pt.tl.EdgeR(pdata, design="~Efficacy+Treatment")
+            >>> res_df = edgr.compare_groups(pdata, column="Efficacy", baseline="SD", groups_to_compare=["PR", "PD"])
+            >>> edgr.plot_multicomparison_fc(res_df)
+        Preview:
+            .. image:: /_static/docstring_previews/de_multicomparison_fc.png
+        """
+        groups = results_df[contrast_col].unique().tolist()
+        results_df["abs_log_fc"] = results_df[log2fc_col].abs()
+        def _get_significance(p_val):
+            if p_val < 0.001:
+                return "< 0.001"
+            elif p_val < 0.01:
+                return "< 0.01"
+            elif p_val < 0.1:
+                return "< 0.1"
+            else:
+                return "n.s."
+        results_df["significance"] = results_df[pvalue_col].apply(_get_significance)
+        var_names = []
+        for group in groups:
+            var_names += (
+                results_df[results_df[contrast_col] == group]
+                .sort_values("abs_log_fc", ascending=False)
+                .head(n_top_vars)[symbol_col]
+                .tolist()
+            )
+        results_df = results_df[results_df[symbol_col].isin(var_names)]
+        df = results_df.pivot(index=contrast_col, columns=symbol_col, values=log2fc_col)[var_names]
-        plt.show()
+        plt.figure(figsize=figsize)
+        sns.heatmap(df, **heatmap_kwargs, cmap="coolwarm", center=0, cbar_kws={"label": "Log2 fold change"})
+        _size = {"< 0.001": marker_size, "< 0.01": math.floor(marker_size / 2), "< 0.1": math.floor(marker_size / 4)}
+        x_locs, x_labels = plt.xticks()[0], [label.get_text() for label in plt.xticks()[1]]
+        y_locs, y_labels = plt.yticks()[0], [label.get_text() for label in plt.yticks()[1]]
+        for _i, row in results_df.iterrows():
+            if row["significance"] != "n.s.":
+                plt.scatter(
+                    x=x_locs[x_labels.index(row[symbol_col])],
+                    y=y_locs[y_labels.index(row[contrast_col])],
+                    s=_size[row["significance"]],
+                    marker="*",
+                    c="white",
+                )
+        plt.scatter([], [], s=marker_size, marker="*", c="black", label="< 0.001")
+        plt.scatter([], [], s=math.floor(marker_size / 2), marker="*", c="black", label="< 0.01")
+        plt.scatter([], [], s=math.floor(marker_size / 4), marker="*", c="black", label="< 0.1")
+        plt.legend(title="Significance", bbox_to_anchor=(1.2, -0.05))
+        plt.xlabel(x_label)
+        plt.ylabel(y_label)
+        if show:
+            plt.show()
+        if return_fig:
+            return plt.gcf()
+        return None
 class LinearModelBase(MethodBase):
     def __init__(self, adata, design, *, mask=None, layer=None, **kwargs):
-        """
-        Initialize the method.
+        """Initialize the method.
         Args:
             adata: AnnData object, usually pseudobulked.
@@ -480,26 +901,24 @@ class LinearModelBase(MethodBase):
         super().__init__(adata, mask=mask, layer=layer)
         self._check_counts()
-        self.factor_storage = None
-        self.variable_to_factors = None
+        self.formulaic_contrasts = None
         if isinstance(design, str):
-            self.factor_storage, self.variable_to_factors, materializer_class = get_factor_storage_and_materializer()
-            self.design = materializer_class(adata.obs, record_factor_metadata=True).get_model_matrix(design)
+            self.formulaic_contrasts = FormulaicContrasts(adata.obs, design)
+            self.design = self.formulaic_contrasts.design_matrix
         else:
             self.design = design
     @classmethod
     def compare_groups(
         cls,
-        adata,
-        column,
-        baseline,
-        groups_to_compare,
+        adata: ad.AnnData,
+        column: str,
+        baseline: str,
+        groups_to_compare: str | Iterable[str],
         *,
-        paired_by=None,
-        mask=None,
-        layer=None,
+        paired_by: str | None = None,
+        mask: pd.Series | None = None,
+        layer: str | None = None,
         fit_kwargs=MappingProxyType({}),
         test_kwargs=MappingProxyType({}),
     ):
@@ -525,17 +944,16 @@ class LinearModelBase(MethodBase):
     @property
     def variables(self):
         """Get the names of the variables used in the model definition."""
-        try:
-            return self.design.model_spec.variables_by_source["data"]
-        except AttributeError:
+        if self.formulaic_contrasts is None:
             raise ValueError(
                 "Retrieving variables is only possible if the model was initialized using a formula."
             ) from None
+        else:
+            return self.formulaic_contrasts.variables
     @abstractmethod
     def _check_counts(self):
-        """
-        Check that counts are valid for the specific method.
+        """Check that counts are valid for the specific method.
         Raises:
             ValueError: if the data matrix does not comply with the expectations.
@@ -544,8 +962,7 @@ class LinearModelBase(MethodBase):
     @abstractmethod
     def fit(self, **kwargs):
-        """
-        Fit the model.
+        """Fit the model.
         Args:
             **kwargs: Additional arguments for fitting the specific method.
@@ -555,9 +972,8 @@ class LinearModelBase(MethodBase):
     @abstractmethod
     def _test_single_contrast(self, contrast, **kwargs): ...
-    def test_contrasts(self, contrasts, **kwargs):
-        """
-        Perform a comparison as specified in a contrast vector.
+    def test_contrasts(self, contrasts: np.ndarray | Mapping[str | None, np.ndarray], **kwargs):
+        """Perform a comparison as specified in a contrast vector.
         Args:
             contrasts: Either a numeric contrast vector, or a dictionary of numeric contrast vectors.
@@ -573,25 +989,25 @@ class LinearModelBase(MethodBase):
             results.append(self._test_single_contrast(contrast, **kwargs).assign(contrast=name))
         results_df = pd.concat(results)
         return results_df
     def test_reduced(self, modelB):
-        """
-        Test against a reduced model.
+        """Test against a reduced model.
         Args:
             modelB: the reduced model against which to test.
         Example:
-            modelA = Model().fit()
-            modelB = Model().fit()
-            modelA.test_reduced(modelB)
+            >>> import pertpy as pt
+            >>> modelA = Model().fit()
+            >>> modelB = Model().fit()
+            >>> modelA.test_reduced(modelB)
         """
         raise NotImplementedError
     def cond(self, **kwargs):
-        """
-        Get a contrast vector representing a specific condition.
+        """Get a contrast vector representing a specific condition.
         Args:
             **kwargs: column/value pairs.
@@ -599,52 +1015,14 @@ class LinearModelBase(MethodBase):
         Returns:
             A contrast vector that aligns to the columns of the design matrix.
         """
-        if self.factor_storage is None:
+        if self.formulaic_contrasts is None:
             raise RuntimeError(
                 "Building contrasts with `cond` only works if you specified the model using a formulaic formula. Please manually provide a contrast vector."
             )
-        cond_dict = kwargs
-        if not set(cond_dict.keys()).issubset(self.variables):
-            raise ValueError(
-                "You specified a variable that is not part of the model. Available variables: "
-                + ",".join(self.variables)
-            )
-        for var in self.variables:
-            if var in cond_dict:
-                self._check_category(var, cond_dict[var])
-            else:
-                cond_dict[var] = self._get_default_value(var)
-        df = pd.DataFrame([kwargs])
-        return self.design.model_spec.get_model_matrix(df).iloc[0]
-    def _get_factor_metadata_for_variable(self, var):
-        factors = self.variable_to_factors[var]
-        return list(chain.from_iterable(self.factor_storage[f] for f in factors))
-    def _get_default_value(self, var):
-        factor_metadata = self._get_factor_metadata_for_variable(var)
-        if resolve_ambiguous(factor_metadata, "kind") == Factor.Kind.CATEGORICAL:
-            try:
-                tmp_base = resolve_ambiguous(factor_metadata, "base")
-            except AmbiguousAttributeError as e:
-                raise ValueError(
-                    f"Could not automatically resolve base category for variable {var}. Please specify it explicity in `model.cond`."
-                ) from e
-            return tmp_base if tmp_base is not None else "\0"
-        else:
-            return 0
-    def _check_category(self, var, value):
-        factor_metadata = self._get_factor_metadata_for_variable(var)
-        tmp_categories = resolve_ambiguous(factor_metadata, "categories")
-        if resolve_ambiguous(factor_metadata, "kind") == Factor.Kind.CATEGORICAL and value not in tmp_categories:
-            raise ValueError(
-                f"You specified a non-existant category for {var}. Possible categories: {', '.join(tmp_categories)}"
-            )
+        return self.formulaic_contrasts.cond(**kwargs)
-    def contrast(self, column, baseline, group_to_compare):
-        """
-        Build a simple contrast for pairwise comparisons.
+    def contrast(self, *args, **kwargs):
+        """Build a simple contrast for pairwise comparisons.
         Args:
             column: column in adata.obs to test on.
@@ -654,4 +1032,8 @@ class LinearModelBase(MethodBase):
         Returns:
             Numeric contrast vector.
         """
-        return self.cond(**{column: group_to_compare}) - self.cond(**{column: baseline})
+        if self.formulaic_contrasts is None:
+            raise RuntimeError(
+                "Building contrasts with `cond` only works if you specified the model using a formulaic formula. Please manually provide a contrast vector."
+            )
+        return self.formulaic_contrasts.contrast(*args, **kwargs)

pertpy 0.9.3__py3-none-any.whl → 0.9.5__py3-none-any.whl

pertpy 0.9.3__py3-none-any.whl → 0.9.5__py3-none-any.whl