PyPI - scdesigner - Versions diffs - 0.0.5__py3-none-any.whl → 0.0.10__py3-none-any.whl - Mend

scdesigner 0.0.5__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (84) hide show

scdesigner/base/__init__.py +8 -0
scdesigner/base/copula.py +416 -0
scdesigner/base/marginal.py +391 -0
scdesigner/base/simulator.py +59 -0
scdesigner/copulas/__init__.py +8 -0
scdesigner/copulas/standard_copula.py +645 -0
scdesigner/datasets/__init__.py +5 -0
scdesigner/datasets/pancreas.py +39 -0
scdesigner/distributions/__init__.py +19 -0
scdesigner/{minimal → distributions}/bernoulli.py +42 -14
scdesigner/distributions/gaussian.py +114 -0
scdesigner/distributions/negbin.py +121 -0
scdesigner/distributions/negbin_irls.py +72 -0
scdesigner/distributions/negbin_irls_funs.py +456 -0
scdesigner/distributions/poisson.py +88 -0
scdesigner/{minimal → distributions}/zero_inflated_negbin.py +39 -10
scdesigner/distributions/zero_inflated_poisson.py +103 -0
scdesigner/simulators/__init__.py +24 -28
scdesigner/simulators/composite.py +239 -0
scdesigner/simulators/positive_nonnegative_matrix_factorization.py +477 -0
scdesigner/simulators/scd3.py +486 -0
scdesigner/transform/__init__.py +8 -6
scdesigner/{minimal → transform}/transform.py +1 -1
scdesigner/{minimal → utils}/kwargs.py +4 -1
{scdesigner-0.0.5.dist-info → scdesigner-0.0.10.dist-info}/METADATA +1 -1
scdesigner-0.0.10.dist-info/RECORD +28 -0
{scdesigner-0.0.5.dist-info → scdesigner-0.0.10.dist-info}/WHEEL +1 -1
scdesigner/data/__init__.py +0 -16
scdesigner/data/formula.py +0 -137
scdesigner/data/group.py +0 -123
scdesigner/data/sparse.py +0 -39
scdesigner/diagnose/__init__.py +0 -65
scdesigner/diagnose/aic_bic.py +0 -119
scdesigner/diagnose/plot.py +0 -242
scdesigner/estimators/__init__.py +0 -32
scdesigner/estimators/bernoulli.py +0 -85
scdesigner/estimators/gaussian.py +0 -121
scdesigner/estimators/gaussian_copula_factory.py +0 -367
scdesigner/estimators/glm_factory.py +0 -75
scdesigner/estimators/negbin.py +0 -153
scdesigner/estimators/pnmf.py +0 -160
scdesigner/estimators/poisson.py +0 -124
scdesigner/estimators/zero_inflated_negbin.py +0 -195
scdesigner/estimators/zero_inflated_poisson.py +0 -85
scdesigner/format/__init__.py +0 -4
scdesigner/format/format.py +0 -20
scdesigner/format/print.py +0 -30
scdesigner/minimal/__init__.py +0 -17
scdesigner/minimal/composite.py +0 -119
scdesigner/minimal/copula.py +0 -205
scdesigner/minimal/formula.py +0 -23
scdesigner/minimal/gaussian.py +0 -65
scdesigner/minimal/loader.py +0 -211
scdesigner/minimal/marginal.py +0 -154
scdesigner/minimal/negbin.py +0 -73
scdesigner/minimal/positive_nonnegative_matrix_factorization.py +0 -231
scdesigner/minimal/scd3.py +0 -96
scdesigner/minimal/scd3_instances.py +0 -50
scdesigner/minimal/simulator.py +0 -25
scdesigner/minimal/standard_copula.py +0 -383
scdesigner/predictors/__init__.py +0 -15
scdesigner/predictors/bernoulli.py +0 -9
scdesigner/predictors/gaussian.py +0 -16
scdesigner/predictors/negbin.py +0 -17
scdesigner/predictors/poisson.py +0 -12
scdesigner/predictors/zero_inflated_negbin.py +0 -18
scdesigner/predictors/zero_inflated_poisson.py +0 -18
scdesigner/samplers/__init__.py +0 -23
scdesigner/samplers/bernoulli.py +0 -27
scdesigner/samplers/gaussian.py +0 -25
scdesigner/samplers/glm_factory.py +0 -103
scdesigner/samplers/negbin.py +0 -25
scdesigner/samplers/poisson.py +0 -25
scdesigner/samplers/zero_inflated_negbin.py +0 -40
scdesigner/samplers/zero_inflated_poisson.py +0 -16
scdesigner/simulators/composite_regressor.py +0 -72
scdesigner/simulators/glm_simulator.py +0 -167
scdesigner/simulators/pnmf_regression.py +0 -61
scdesigner/transform/amplify.py +0 -14
scdesigner/transform/mask.py +0 -33
scdesigner/transform/nullify.py +0 -25
scdesigner/transform/split.py +0 -23
scdesigner/transform/substitute.py +0 -14
scdesigner-0.0.5.dist-info/RECORD +0 -66

scdesigner/base/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""Base classes for scDesigner simulation framework."""
+from .copula import CovarianceStructure
+# Public names re-exported by the package.
+__all__ = ["CovarianceStructure"]

scdesigner/base/copula.py ADDED Viewed

@@ -0,0 +1,416 @@
+from typing import Dict, Callable, Tuple
+import torch
+from anndata import AnnData
+from ..data.loader import adata_loader
+from abc import ABC, abstractmethod
+import numpy as np
+import pandas as pd
+from typing import Optional, Union
+class Copula(ABC):
+    """Abstract base class for copulas.
+
+    The scDesign3 model is built from two components: a collection of marginal
+    models, and a copula to tie them together. This class implements an abstract
+    version of the copula. Subclasses may implement various types of
+    regularization or dependencies on experimental and biological conditions.
+    Despite these differences, every subclass must provide utilities for
+    fitting and for sampling dependent uniform variables.
+
+    Parameters
+    ----------
+    formula : str or dict
+        A description of the dependence of the copula on experimental or
+        biological conditions. We support predictors for categorical variables
+        like cell type; this corresponds to estimating a different covariance
+        for each category.
+    **kwargs
+        Accepted but not used by this base class.
+
+    Attributes
+    ----------
+    formula : str or dict
+        The formula given at construction. ``setup_data`` replaces it with the
+        merge of this formula and the marginal formula.
+    loader : torch.utils.data.DataLoader
+        A data loader object is used to estimate the covariance one batch at a
+        time. This allows estimation of the covariance structure in a streaming
+        way, without having to load all data into memory. ``None`` until
+        ``setup_data`` is called.
+    n_outcomes : int
+        The number of features modeled by this copula. For example, this
+        corresponds to the number of genes being simulated. ``None`` until
+        ``setup_data`` is called.
+    parameters : Dict[str, CovarianceStructure]
+        A dictionary of CovarianceStructure objects. Each key corresponds to a
+        different category specified in the original formula. The covariance
+        structure stores the relationships among genes. It can be a standard
+        covariance matrix, but may also use more memory-efficient approximations
+        like when using CovarianceStructure with a constraint on
+        num_modeled_genes. ``None`` after construction.
+
+    Examples
+    --------
+    >>> import scanpy as sc
+    >>> adata = sc.datasets.pbmc3k()[:, :300]
+    >>>
+    >>> class DummyCopula(Copula):
+    ...     def fit(self):
+    ...         pass
+    ...     def likelihood(self):
+    ...         pass
+    ...     def num_params(self):
+    ...         return 0
+    ...     def pseudo_obs(self, x_dict):
+    ...         return np.random.uniform(size=(x_dict["group"].shape[0], self.n_outcomes))
+    ...
+    >>> model = DummyCopula({"group": "~ 1"})
+    >>> model.setup_data(adata, {"group": "~ 1"})
+    >>> model.fit()
+    """
+
+    def __init__(self, formula: Union[str, dict], **kwargs):
+        self.formula = formula
+        self.loader = None
+        self.n_outcomes = None
+        self.parameters = None  # Should be a dictionary of CovarianceStructure objects
+
+    def setup_data(self, adata: AnnData, marginal_formula: Dict[str, str], batch_size: int = 1024, **kwargs):
+        """
+        Populate the .loader attribute
+
+        Parameters
+        ----------
+        adata : AnnData
+            This is the object on which we want to estimate the simulator. This
+            serves as the template for all downstream fitting.
+        marginal_formula : Dict[str, str]
+            A dictionary specifying the relationship between the columns of an
+            input data frame (adata.obs, adata.var, or similar attributes) and
+            the parameters of the marginal model. Each key should correspond to
+            a parameter. The string values should be in a format that can be
+            parsed by the formulaic package. For example, '~ x' will ensure
+            that the parameter varies linearly with X.
+        batch_size : int, optional
+            Forwarded to ``adata_loader``.
+        **kwargs
+            Additional keyword arguments forwarded to ``adata_loader``.
+
+        Returns
+        -------
+        None
+            This method does not return anything but populates the self.adata,
+            formula, loader, and n_outcomes attributes based on the provided
+            adata input object.
+        """
+        self.adata = adata
+        # Entries of marginal_formula win over entries of self.formula that
+        # share the same key.
+        # NOTE(review): the `|` merge needs both operands to be dicts. A plain
+        # str formula (allowed by the __init__ annotation) raises TypeError
+        # here unless a subclass has converted it to a dict first -- confirm.
+        self.formula = self.formula | marginal_formula
+        self.loader = adata_loader(adata, self.formula, batch_size=batch_size, **kwargs)
+        # Peek at one batch to learn how many outcome columns the loader yields.
+        X_batch, _ = next(iter(self.loader))
+        self.n_outcomes = X_batch.shape[1]
+
+    def _groups_to_modify(self, group: Union[str, list, None]) -> list:
+        """Resolve the ``group`` argument into a list of keys of self.parameters."""
+        if isinstance(group, str):
+            return [group]
+        if group is not None:
+            return list(group)
+        # This base class never assigns `self.groups`; use it when a subclass
+        # has set it, otherwise fall back to every key in `self.parameters`.
+        groups = getattr(self, "groups", None)
+        if groups is None:
+            if self.parameters is None:
+                raise RuntimeError(
+                    "Copula has no covariance parameters to modify: "
+                    "`parameters` is None."
+                )
+            groups = self.parameters
+        return list(groups)
+
+    def decorrelate(self, row_pattern: str, col_pattern: str, group: Union[str, list, None] = None):
+        """
+        Decorrelate the covariance matrix for the given row and column patterns.
+
+        This method can be used to generate synthetic null data where particular
+        pairs of features are forced to be uncorrelated with one another. The
+        work is delegated to the ``decorrelate`` method of each selected entry
+        of ``self.parameters``, which decides which entries the patterns select.
+
+        Parameters
+        ----------
+        row_pattern : str
+            The regex pattern for the row names to match.
+        col_pattern : str
+            The regex pattern for the column names to match.
+        group : Union[str, list, None], optional
+            The group or groups to apply the transformation to. If None, the
+            transformation is applied to all groups.
+
+        Returns
+        -------
+        None
+            This method does not return anything but modifies self parameters as
+            a side effect.
+        """
+        for g in self._groups_to_modify(group):
+            self.parameters[g].decorrelate(row_pattern, col_pattern)
+
+    def correlate(self, factor: float, row_pattern: str, col_pattern: str, group: Union[str, list, None] = None):
+        """
+        Multiply selected off-diagonal entries by factor.
+
+        To adjust the signal strength in a power analysis, we may want to
+        rescale the correlation for specific entries in the covariance matrix.
+        This function is used to apply a multiplicative factor to selected
+        entries, allowing targeted modification of correlation strength.
+
+        Parameters
+        ----------
+        factor : float
+            The factor to multiply the off-diagonal entries by.
+        row_pattern : str
+            The regex pattern for the row names to match.
+        col_pattern : str
+            The regex pattern for the column names to match.
+        group : Union[str, list, None], optional
+            The group or groups to apply the transformation to. If None, the
+            transformation is applied to all groups.
+
+        Returns
+        -------
+        None
+            This method does not return anything but modifies self parameters as
+            a side effect.
+        """
+        # The per-group method takes `factor` last, unlike this wrapper.
+        for g in self._groups_to_modify(group):
+            self.parameters[g].correlate(row_pattern, col_pattern, factor)
+
+    @abstractmethod
+    def fit(self, uniformizer: Callable, **kwargs):
+        """
+        Fit a Copula
+
+        Copula models are estimated by transforming the observed data onto the
+        [0, 1] space of percentiles. See the .invert() method within class
+        Marginal.
+
+        Parameters
+        ----------
+        uniformizer : Callable
+            Function to transform data to uniform marginals. See .invert()
+            within class Marginal for an example.
+        **kwargs
+            Additional keyword arguments.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def pseudo_obs(self, x_dict: Dict):
+        """
+        Sample from a Copula
+
+        Dependent uniform variables can be sampled from the copula conditional
+        on a specific design matrix X (encoding biological and experimental
+        covariates). For example, this will sample uniform variables with
+        dependence reflecting the cell type specified by X.
+
+        Parameters
+        ----------
+        x_dict : Dict
+            A dictionary of tensors, with one key/value pair per parameter.
+            These tensors are the conditioning information to pass to the
+            .predict() function of this distribution class. They are the
+            numerical design matrices implied by the initializing formulas.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def likelihood(self, uniformizer: Callable, batch: Tuple[torch.Tensor, Dict[str, torch.Tensor]]):
+        """
+        Evaluate the copula likelihood on one batch (implemented by subclasses).
+
+        Parameters
+        ----------
+        uniformizer : Callable
+            Function to transform data to uniform marginals.
+        batch : Tuple[torch.Tensor, Dict[str, torch.Tensor]]
+            Batch of data.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def num_params(self, **kwargs):
+        """
+        Covariance Parameters
+
+        This returns the number of free parameters in the overall copula model.
+        This is useful for assessing model complexity.
+
+        Parameters
+        ----------
+        **kwargs
+            Additional keyword arguments.
+        """
+        raise NotImplementedError
+class CovarianceStructure:
+    """
+    Efficient storage for covariance matrices in copula-based gene expression modeling.
+
+    This class provides memory-efficient storage for covariance information by storing
+    either a full covariance matrix or a block matrix with diagonal variances for
+    remaining genes. This enables fast copula estimation and sampling for large
+    gene expression datasets.
+
+    Attributes
+    ----------
+    cov : pd.DataFrame
+        Covariance matrix for modeled genes with gene names as index/columns
+    modeled_indices : np.ndarray
+        Indices of modeled genes in original ordering
+    remaining_var : pd.Series or None
+        Diagonal variances for remaining genes, None if full matrix stored
+    remaining_indices : np.ndarray or None
+        Indices of remaining genes in original ordering
+    num_modeled_genes : int
+        Number of modeled genes
+    num_remaining_genes : int
+        Number of remaining genes (0 if full matrix stored)
+    total_genes : int
+        Total number of genes
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import pandas as pd
+    >>>
+    >>> sigma = np.random.uniform(size=(5, 5))
+    >>> modeled_names = ["A", "B", "C", "D", "E"]
+    >>> sigma = pd.DataFrame(sigma, columns=modeled_names, index=modeled_names)
+    >>> covariance = CovarianceStructure(sigma, modeled_names)
+    """
+
+    def __init__(self, cov: Union[np.ndarray, pd.DataFrame],
+                 modeled_names: pd.Index,
+                 modeled_indices: Optional[np.ndarray] = None,
+                 remaining_var: Optional[np.ndarray] = None,
+                 remaining_indices: Optional[np.ndarray] = None,
+                 remaining_names: Optional[pd.Index] = None):
+        """
+        Initialize a CovarianceStructure object.
+
+        Parameters
+        ----------
+        cov : np.ndarray or pd.DataFrame
+            Covariance matrix for modeled genes, shape (n_modeled_genes, n_modeled_genes).
+            A DataFrame is aligned to ``modeled_names`` by label rather than by
+            position.
+        modeled_names : pd.Index
+            Gene names for the modeled genes
+        modeled_indices : Optional[np.ndarray], optional
+            Indices of modeled genes in original ordering. Defaults to sequential indices.
+        remaining_var : Optional[np.ndarray], optional
+            Diagonal variances for remaining genes, shape (n_remaining_genes,)
+        remaining_indices : Optional[np.ndarray], optional
+            Indices of remaining genes in original ordering
+        remaining_names : Optional[pd.Index], optional
+            Gene names for remaining genes
+        """
+        self.cov = pd.DataFrame(cov, index=modeled_names, columns=modeled_names)
+        if modeled_indices is not None:
+            self.modeled_indices = modeled_indices
+        else:
+            self.modeled_indices = np.arange(len(modeled_names))
+        if remaining_var is not None:
+            self.remaining_var = pd.Series(remaining_var, index=remaining_names)
+        else:
+            self.remaining_var = None
+        self.remaining_indices = remaining_indices
+        self.num_modeled_genes = len(modeled_names)
+        # The remaining-gene count comes from remaining_indices, not remaining_var.
+        self.num_remaining_genes = len(remaining_indices) if remaining_indices is not None else 0
+        self.total_genes = self.num_modeled_genes + self.num_remaining_genes
+
+    def __repr__(self):
+        if self.remaining_var is None:
+            return self.cov.__repr__()
+        # Adjacent f-string literals keep the source indentation out of the
+        # text, which a backslash continuation inside the literal would embed.
+        return (
+            f"CovarianceStructure(modeled_genes={self.num_modeled_genes}, "
+            f"total_genes={self.total_genes})"
+        )
+
+    def _repr_html_(self):
+        """
+        Jupyter Notebook display
+
+        Returns
+        -------
+        str
+            HTML representation of the object.
+        """
+        if self.remaining_var is None:
+            return self.cov._repr_html_()
+        else:
+            html = f"<b>CovarianceStructure:</b> {self.num_modeled_genes} modeled genes, {self.total_genes} total<br>"
+            html += "<h4>Modeled Covariance Matrix</h4>" + self.cov._repr_html_()
+            html += "<h4>Remaining Gene Variances</h4>" + self.remaining_var.to_frame("variance").T._repr_html_()
+            return html
+
+    def _pattern_mask(self, row_pattern: str, col_pattern: str):
+        """Return the boolean mask of off-diagonal entries of self.cov to modify."""
+        # Imported here rather than at module level, as in the original code.
+        from ..transform.transform import data_frame_mask
+        col_hits = data_frame_mask(self.cov, ".", col_pattern)
+        row_hits = data_frame_mask(self.cov, row_pattern, ".")
+        # NOTE(review): this is the UNION of the two masks, i.e. presumably
+        # every entry whose row matches row_pattern OR whose column matches
+        # col_pattern. Earlier documentation described the intersection --
+        # confirm which is intended against data_frame_mask.
+        mask = (col_hits | row_hits)
+        # Never touch the diagonal (the variances).
+        np.fill_diagonal(mask, False)
+        return mask
+
+    def _replace_cov_values(self, values):
+        """Rebuild self.cov from ``values`` while keeping its row/column labels."""
+        self.cov = pd.DataFrame(values, index=self.cov.index, columns=self.cov.columns)
+
+    def decorrelate(self, row_pattern: str, col_pattern: str):
+        """
+        Decorrelate the covariance matrix for the given row and column patterns.
+
+        This method can be used to generate synthetic null data where particular
+        pairs of features are forced to be uncorrelated with one another. The
+        off-diagonal entries selected by the patterns are set to zero.
+
+        Parameters
+        ----------
+        row_pattern : str
+            The regex pattern for the row names to match.
+        col_pattern : str
+            The regex pattern for the column names to match.
+        """
+        mask = self._pattern_mask(row_pattern, col_pattern)
+        # Edit an explicit copy and store it back: writing through
+        # `self.cov.values` only works when pandas returns a writable view,
+        # which is not the case under Copy-on-Write or for mixed dtypes.
+        values = self.cov.to_numpy(copy=True)
+        values[mask] = 0
+        self._replace_cov_values(values)
+
+    def correlate(self, row_pattern: str, col_pattern: str, factor: float):
+        """
+        Multiply selected off-diagonal entries by factor.
+
+        To adjust the signal strength in a power analysis, we may want to
+        rescale the correlation for specific entries in the covariance matrix.
+        This function is used to apply a multiplicative factor to selected
+        entries, allowing targeted modification of correlation strength.
+
+        Parameters
+        ----------
+        row_pattern : str
+            The regex pattern for the row names to match.
+        col_pattern : str
+            The regex pattern for the column names to match.
+        factor : float
+            The factor to multiply the off-diagonal entries by.
+        """
+        mask = self._pattern_mask(row_pattern, col_pattern)
+        # Same copy-then-store approach as decorrelate (see the comment there).
+        values = self.cov.to_numpy(copy=True)
+        values[mask] = values[mask] * factor
+        self._replace_cov_values(values)
+
+    @property
+    def shape(self):
+        """Shape of the full covariance matrix, (total_genes, total_genes)."""
+        return (self.total_genes, self.total_genes)
+
+    def to_full_matrix(self):
+        """
+        Convert to full covariance matrix for compatibility and debugging.
+
+        Returns
+        -------
+        np.ndarray
+            Full covariance matrix with shape (total_genes, total_genes). When
+            no remaining variances are stored this is ``self.cov.values``
+            itself, not a fresh array.
+        """
+        if self.remaining_var is None:
+            return self.cov.values
+        else:
+            full_cov = np.zeros((self.total_genes, self.total_genes))
+            # Fill in top-k block
+            ix_modeled = np.ix_(self.modeled_indices, self.modeled_indices)
+            full_cov[ix_modeled] = self.cov.values
+            # Fill in diagonal for remaining genes
+            # (assumes remaining_indices was supplied together with remaining_var)
+            full_cov[self.remaining_indices, self.remaining_indices] = self.remaining_var.values
+        return full_cov

scdesigner 0.0.5__py3-none-any.whl → 0.0.10__py3-none-any.whl

scdesigner 0.0.5__py3-none-any.whl → 0.0.10__py3-none-any.whl