PyPI - scdesigner - Versions diffs - 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl - Mend

scdesigner 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

scdesigner/estimators/__init__.py +8 -3
scdesigner/estimators/gaussian_copula_factory.py +222 -7
scdesigner/estimators/negbin.py +24 -0
scdesigner/estimators/poisson.py +24 -0
scdesigner/minimal/composite.py +2 -2
scdesigner/minimal/copula.py +178 -6
scdesigner/minimal/loader.py +85 -40
scdesigner/minimal/marginal.py +53 -39
scdesigner/minimal/negbin.py +1 -1
scdesigner/minimal/scd3.py +1 -0
scdesigner/minimal/scd3_instances.py +5 -5
scdesigner/minimal/standard_copula.py +383 -0
scdesigner/minimal/transform.py +27 -30
scdesigner/samplers/glm_factory.py +66 -4
scdesigner/transform/nullify.py +1 -1
{scdesigner-0.0.3.dist-info → scdesigner-0.0.5.dist-info}/METADATA +1 -2
{scdesigner-0.0.3.dist-info → scdesigner-0.0.5.dist-info}/RECORD +18 -18
scdesigner/minimal/standard_covariance.py +0 -124
{scdesigner-0.0.3.dist-info → scdesigner-0.0.5.dist-info}/WHEEL +0 -0

scdesigner/estimators/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
-from .negbin import negbin_regression, negbin_copula
-from .gaussian_copula_factory import group_indices
-from .poisson import poisson_regression, poisson_copula
+from .negbin import negbin_regression, negbin_copula, fast_negbin_copula_factory
+from .gaussian_copula_factory import group_indices, fast_copula_covariance, FastCovarianceStructure, fast_gaussian_copula_array_factory
+from .poisson import poisson_regression, poisson_copula, fast_poisson_copula_factory
 from .bernoulli import bernoulli_regression, bernoulli_copula
 from .gaussian import gaussian_regression, gaussian_copula
 from .zero_inflated_negbin import (
@@ -24,4 +24,9 @@ __all__ = [
     "zero_inflated_negbin_regression",
     "zero_inflated_poisson_regression",
     "multiple_formula_regression_factory",
+    "fast_copula_covariance",
+    "FastCovarianceStructure",
+    "fast_gaussian_copula_array_factory",
+    "fast_negbin_copula_factory",
+    "fast_poisson_copula_factory",
 ]

scdesigner/estimators/gaussian_copula_factory.py CHANGED Viewed

@@ -31,6 +31,28 @@ def gaussian_copula_array_factory(marginal_model: Callable, uniformizer: Callabl
     return copula_fun
+def fast_gaussian_copula_array_factory(marginal_model: Callable, uniformizer: Callable, top_k: int):
+    """
+    Factory function for fast Gaussian copula array computation using top-k gene modeling.
+    """
+    def copula_fun(loaders: dict[str, DataLoader], lr: float = 0.1, epochs: int = 40, **kwargs):
+        # for the marginal model, ignore the groupings
+        # Strip all dataloaders and create a dictionary to pass to marginal_model
+        formula_loaders = {}
+        for key in loaders.keys():
+            formula_loaders[key] = strip_dataloader(loaders[key], pop="Stack" in type(loaders[key].dataset).__name__)
+        # Call marginal_model with the dictionary of stripped dataloaders
+        parameters = marginal_model(formula_loaders, lr=lr, epochs=epochs, **kwargs)
+        # estimate covariance using fast method, allowing for different groups
+        parameters["covariance"] = fast_copula_covariance(parameters, loaders, uniformizer, top_k)
+        return parameters
+    return copula_fun
 def gaussian_copula_factory(copula_array_fun: Callable,
                             parameter_formatter: Callable,
                             param_name: list = None):
@@ -65,7 +87,10 @@ def gaussian_copula_factory(copula_array_fun: Callable,
     return copula_fun
 def copula_covariance(parameters: dict, loaders: dict[str, DataLoader], uniformizer: Callable):
     first_loader = next(iter(loaders.values()))
     D = next(iter(first_loader))[1].shape[1] #dimension of y
     groups = first_loader.dataset.groups # a list of strings of group names
@@ -98,7 +123,183 @@ def copula_covariance(parameters: dict, loaders: dict[str, DataLoader], uniformi
     if len(groups) == 1:
         return list(result.values())[0]
-    return result
+    return result
+def fast_copula_covariance(parameters: dict, loaders: dict[str, DataLoader], uniformizer: Callable, top_k: int):
+    """
+    Compute an efficient approximation of copula covariance by modeling only the top-k most prevalent genes
+    with full covariance and approximating the rest with diagonal covariance.
+    Parameters:
+    -----------
+    parameters : dict
+        Model parameters dictionary
+    loaders : dict[str, DataLoader]
+        Dictionary of data loaders
+    uniformizer : Callable
+        Function to convert to uniform distribution
+    top_k : int
+        Number of top genes to model with full covariance
+    Returns:
+    --------
+    dict or FastCovarianceStructure:
+        - If single group: FastCovarianceStructure containing:
+          * top_k_cov: (top_k, top_k) full covariance matrix for top genes
+          * remaining_var: (remaining_genes,) diagonal variances for remaining genes
+          * top_k_indices: indices of top-k genes
+          * remaining_indices: indices of remaining genes
+          * gene_total_expression: total expression levels for gene selection
+        - If multiple groups: dict mapping group names to FastCovarianceStructure objects
+    """
+    first_loader = next(iter(loaders.values()))
+    D = next(iter(first_loader))[1].shape[1] #dimension of y
+    groups = first_loader.dataset.groups # a list of strings of group names
+    # Validate top_k parameter
+    if top_k <= 0:
+        raise ValueError("top_k must be a positive integer")
+    if top_k >= D:
+        # If top_k is larger than total genes, fall back to regular covariance
+        return copula_covariance(parameters, loaders, uniformizer)
+    # Step 1: Calculate total expression for each gene to determine prevalence
+    gene_total_expression = np.zeros(D)
+    keys = list(loaders.keys())
+    loaders_list = list(loaders.values())
+    num_keys = len(keys)
+    # Calculate total expression across all batches
+    for batches in zip(*loaders_list):
+        y_batch = batches[0][1].cpu().numpy()
+        gene_total_expression += y_batch.sum(axis=0)
+    # Step 2: Select top-k most prevalent genes
+    top_k_indices = np.argsort(gene_total_expression)[-top_k:]
+    remaining_indices = np.argsort(gene_total_expression)[:-top_k]
+    # Step 3: Compute statistics for both top-k and remaining genes
+    sums_top_k = {g: np.zeros(top_k) for g in groups}
+    second_moments_top_k = {g: np.zeros((top_k, top_k)) for g in groups}
+    sums_remaining = {g: np.zeros(len(remaining_indices)) for g in groups}
+    second_moments_remaining = {g: np.zeros(len(remaining_indices)) for g in groups}
+    Ng = {g: 0 for g in groups}
+    # Reset loaders for second pass
+    loaders_list = list(loaders.values())
+    for batches in zip(*loaders_list):
+        x_batch_dict = {
+            keys[i]: batches[i][0].cpu().numpy() for i in range(num_keys)
+        }
+        y_batch = batches[0][1].cpu().numpy()
+        memberships = batches[0][2] # should be identical for all keys
+        u = uniformizer(parameters, x_batch_dict, y_batch)
+        for g in groups:
+            ix = np.where(np.array(memberships) == g)
+            if len(ix[0]) == 0:
+                continue
+            z = norm().ppf(u[ix])
+            # Process top-k genes with full covariance
+            z_top_k = z[:, top_k_indices]
+            second_moments_top_k[g] += z_top_k.T @ z_top_k
+            sums_top_k[g] += z_top_k.sum(axis=0)
+            # Process remaining genes with diagonal covariance only
+            z_remaining = z[:, remaining_indices]
+            second_moments_remaining[g] += (z_remaining ** 2).sum(axis=0)
+            sums_remaining[g] += z_remaining.sum(axis=0)
+            Ng[g] += len(ix[0])
+    # Step 4: Compute final covariance structures
+    result = {}
+    for g in groups:
+        if Ng[g] == 0:
+            continue
+        # Full covariance for top-k genes
+        mean_top_k = sums_top_k[g] / Ng[g]
+        cov_top_k = second_moments_top_k[g] / Ng[g] - np.outer(mean_top_k, mean_top_k)
+        # Diagonal variance for remaining genes
+        mean_remaining = sums_remaining[g] / Ng[g]
+        var_remaining = second_moments_remaining[g] / Ng[g] - mean_remaining ** 2
+        # Create FastCovarianceStructure
+        result[g] = FastCovarianceStructure(
+            top_k_cov=cov_top_k,
+            remaining_var=var_remaining,
+            top_k_indices=top_k_indices,
+            remaining_indices=remaining_indices,
+            gene_total_expression=gene_total_expression
+        )
+    if len(groups) == 1:
+        return list(result.values())[0]
+    return result
+class FastCovarianceStructure:
+    """
+    Data structure to efficiently store and access covariance information for fast copula sampling.
+    Attributes:
+    -----------
+    top_k_cov : np.ndarray
+        Full covariance matrix for top-k most prevalent genes, shape (top_k, top_k)
+    remaining_var : np.ndarray
+        Diagonal variances for remaining genes, shape (remaining_genes,)
+    top_k_indices : np.ndarray
+        Indices of the top-k genes in the original gene ordering
+    remaining_indices : np.ndarray
+        Indices of the remaining genes in the original gene ordering
+    gene_total_expression : np.ndarray
+        Total expression levels used for gene selection, shape (total_genes,)
+    """
+    def __init__(self, top_k_cov, remaining_var, top_k_indices, remaining_indices, gene_total_expression):
+        self.top_k_cov = top_k_cov
+        self.remaining_var = remaining_var
+        self.top_k_indices = top_k_indices
+        self.remaining_indices = remaining_indices
+        self.gene_total_expression = gene_total_expression
+        self.top_k = len(top_k_indices)
+        self.total_genes = len(top_k_indices) + len(remaining_indices)
+    def __repr__(self):
+        return (f"FastCovarianceStructure(top_k={self.top_k}, "
+                f"remaining_genes={len(self.remaining_indices)}, "
+                f"total_genes={self.total_genes})")
+    def to_full_matrix(self):
+        """
+        Convert to full covariance matrix for compatibility/debugging.
+        Returns:
+        --------
+        np.ndarray : Full covariance matrix with shape (total_genes, total_genes)
+        """
+        full_cov = np.zeros((self.total_genes, self.total_genes))
+        # Fill in top-k block
+        ix_top = np.ix_(self.top_k_indices, self.top_k_indices)
+        full_cov[ix_top] = self.top_k_cov
+        # Fill in diagonal for remaining genes
+        full_cov[self.remaining_indices, self.remaining_indices] = self.remaining_var
+        return full_cov
 ###############################################################################
@@ -128,18 +329,32 @@ def clip(u: np.array, min: float = 1e-5, max: float = 1 - 1e-5) -> np.array:
 def format_copula_parameters(parameters: dict, var_names: list):
+    '''
+    Format the copula parameters into a dictionary of covariance matrices in pandas dataframe format.
+    If the covariance is a FastCovarianceStructure, return it as is.
+    If the covariance is a dictionary of FastCovarianceStructure objects, return it as is.
+    Otherwise, return a dictionary of covariance matrices in pandas dataframe format.
+    '''
     covariance = parameters["covariance"]
-    if type(covariance) is not dict:
+    # Handle FastCovarianceStructure - keep it as is since it has efficient methods
+    if isinstance(covariance, FastCovarianceStructure):
+        return covariance
+    elif isinstance(covariance, dict) and any(isinstance(v, FastCovarianceStructure) for v in covariance.values()):
+        # If it's a dict containing FastCovarianceStructure objects, keep as is
+        return covariance
+    elif type(covariance) is not dict:
         covariance = pd.DataFrame(
             parameters["covariance"], columns=list(var_names), index=list(var_names)
         )
     else:
         for group in covariance.keys():
-            covariance[group] = pd.DataFrame(
-                parameters["covariance"][group],
-                columns=list(var_names),
-                index=list(var_names),
-            )
+            if not isinstance(covariance[group], FastCovarianceStructure):
+                covariance[group] = pd.DataFrame(
+                    parameters["covariance"][group],
+                    columns=list(var_names),
+                    index=list(var_names),
+                )
     return covariance

scdesigner/estimators/negbin.py CHANGED Viewed

@@ -127,3 +127,27 @@ negbin_copula = gcf.gaussian_copula_factory(
     negbin_copula_array, format_negbin_parameters_with_loaders,
     param_name=['mean', 'dispersion']
 )
+###############################################################################
+## Fast copula versions for negative binomial regression
+###############################################################################
+def fast_negbin_copula_array_factory(top_k: int):
+    """
+    top_k: int
+        Number of top genes to model with full covariance
+    """
+    return gcf.fast_gaussian_copula_array_factory(
+        negbin_regression_array, negbin_uniformizer, top_k
+    )
+def fast_negbin_copula_factory(top_k: int):
+    """
+    top_k: int
+        Number of top genes to model with full covariance
+    """
+    fast_copula_array = fast_negbin_copula_array_factory(top_k)
+    return gcf.gaussian_copula_factory(
+        fast_copula_array, format_negbin_parameters_with_loaders,
+        param_name=['mean', 'dispersion']
+    )

scdesigner/estimators/poisson.py CHANGED Viewed

@@ -98,3 +98,27 @@ poisson_copula_array = gcf.gaussian_copula_array_factory(
 poisson_copula = gcf.gaussian_copula_factory(
     poisson_copula_array, format_poisson_parameters_with_loaders, ['mean']
 )
+###############################################################################
+## Fast copula versions for poisson regression
+###############################################################################
+def fast_poisson_copula_array_factory(top_k: int):
+    """
+    top_k: int
+        Number of top genes to model with full covariance
+    """
+    return gcf.fast_gaussian_copula_array_factory(
+        poisson_regression_array, poisson_uniformizer, top_k
+    )
+def fast_poisson_copula_factory(top_k: int):
+    """
+    top_k: int
+        Number of top genes to model with full covariance
+    """
+    fast_copula_array = fast_poisson_copula_array_factory(top_k)
+    return gcf.gaussian_copula_factory(
+        fast_copula_array, format_poisson_parameters_with_loaders,
+        param_name=['mean']
+    )

scdesigner/minimal/composite.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from .loader import obs_loader
 from .scd3 import SCD3Simulator
-from .standard_covariance import StandardCovariance
+from .standard_copula import StandardCopula
 from anndata import AnnData
 from typing import Dict, Optional, List
 import numpy as np
@@ -10,7 +10,7 @@ class CompositeCopula(SCD3Simulator):
     def __init__(self, marginals: List,
                  copula_formula: Optional[str] = None) -> None:
         self.marginals = marginals
-        self.copula = StandardCovariance(copula_formula)
+        self.copula = StandardCopula(copula_formula)
         self.template = None
         self.parameters = None
         self.merged_formula = None

scdesigner/minimal/copula.py CHANGED Viewed

@@ -2,13 +2,16 @@ from typing import Dict, Callable, Tuple
 import torch
 from anndata import AnnData
 from .loader import adata_loader
-class Copula:
+from abc import ABC, abstractmethod
+import numpy as np
+import pandas as pd
+from typing import Optional, Union
+class Copula(ABC):
     def __init__(self, formula: str, **kwargs):
         self.formula = formula
         self.loader = None
         self.n_outcomes = None
-        self.parameters = None
+        self.parameters = None # Should be a dictionary of CovarianceStructure objects
     def setup_data(self, adata: AnnData, marginal_formula: Dict[str, str], batch_size: int = 1024, **kwargs):
         self.adata = adata
@@ -16,18 +19,187 @@ class Copula:
         self.loader = adata_loader(adata, self.formula, batch_size=batch_size, **kwargs)
         X_batch, _ = next(iter(self.loader))
         self.n_outcomes = X_batch.shape[1]
+    def decorrelate(self, row_pattern: str, col_pattern: str, group: Union[str, list, None] = None):
+        """Decorrelate the covariance matrix for the given row and column patterns.
+        Args:
+            row_pattern (str): The regex pattern for the row names to match.
+            col_pattern (str): The regex pattern for the column names to match.
+            group (Union[str, list, None]): The group or groups to apply the transformation to. If None, the transformation is applied to all groups.
+        """
+        if group is None:
+            for g in self.groups:
+                self.parameters[g].decorrelate(row_pattern, col_pattern)
+        elif isinstance(group, str):
+            self.parameters[group].decorrelate(row_pattern, col_pattern)
+        else:
+            for g in group:
+                self.parameters[g].decorrelate(row_pattern, col_pattern)
+    def correlate(self, factor: float, row_pattern: str, col_pattern: str, group: Union[str, list, None] = None):
+        """Multiply selected off-diagonal entries by factor.
+        Args:
+            row_pattern (str): The regex pattern for the row names to match.
+            col_pattern (str): The regex pattern for the column names to match.
+            factor (float): The factor to multiply the off-diagonal entries by.
+            group (Union[str, list, None]): The group or groups to apply the transformation to. If None, the transformation is applied to all groups.
+        """
+        if group is None:
+            for g in self.groups:
+                self.parameters[g].correlate(row_pattern, col_pattern, factor)
+        elif isinstance(group, str):
+            self.parameters[group].correlate(row_pattern, col_pattern, factor)
+        else:
+            for g in group:
+                self.parameters[g].correlate(row_pattern, col_pattern, factor)
+    @abstractmethod
     def fit(self, uniformizer: Callable, **kwargs):
         raise NotImplementedError
+    @abstractmethod
     def pseudo_obs(self, x_dict: Dict):
         raise NotImplementedError
+    @abstractmethod
     def likelihood(self, uniformizer: Callable, batch: Tuple[torch.Tensor, Dict[str, torch.Tensor]]):
         raise NotImplementedError
+    @abstractmethod
     def num_params(self, **kwargs):
         raise NotImplementedError
-    def format_parameters(self):
-        raise NotImplementedError
+    # @abstractmethod
+    # def format_parameters(self):
+    #     raise NotImplementedError
+class CovarianceStructure:
+    """
+    Efficient storage for covariance matrices in copula-based gene expression modeling.
+    This class provides memory-efficient storage for covariance information by storing
+    either a full covariance matrix or a block matrix with diagonal variances for
+    remaining genes. This enables fast copula estimation and sampling for large
+    gene expression datasets.
+    Attributes
+    ----------
+    cov : pd.DataFrame
+        Covariance matrix for modeled genes with gene names as index/columns
+    modeled_indices : np.ndarray
+        Indices of modeled genes in original ordering
+    remaining_var : pd.Series or None
+        Diagonal variances for remaining genes, None if full matrix stored
+    remaining_indices : np.ndarray or None
+        Indices of remaining genes in original ordering
+    num_modeled_genes : int
+        Number of modeled genes
+    num_remaining_genes : int
+        Number of remaining genes (0 if full matrix stored)
+    total_genes : int
+        Total number of genes
+    """
+    def __init__(self, cov: np.ndarray,
+                 modeled_names: pd.Index,
+                 modeled_indices: Optional[np.ndarray] = None,
+                 remaining_var: Optional[np.ndarray] = None,
+                 remaining_indices: Optional[np.ndarray] = None,
+                 remaining_names: Optional[pd.Index] = None):
+        """initialize a CovarianceStructure object.
+        Args:
+            cov (np.ndarray): Covariance matrix for modeled genes, shape (n_modeled_genes, n_modeled_genes)
+            modeled_names (pd.Index): Gene names for the modeled genes
+            modeled_indices (Optional[np.ndarray], optional): Indices of modeled genes in original ordering. Defaults to sequential indices.
+            remaining_var (Optional[np.ndarray], optional): Diagonal variances for remaining genes, shape (n_remaining_genes,)
+            remaining_indices (Optional[np.ndarray], optional): Indices of remaining genes in original ordering
+            remaining_names (Optional[pd.Index], optional): Gene names for remaining genes
+        """
+        self.cov = pd.DataFrame(cov, index=modeled_names, columns=modeled_names)
+        if modeled_indices is not None:
+            self.modeled_indices = modeled_indices
+        else:
+            self.modeled_indices = np.arange(len(modeled_names))
+        if remaining_var is not None:
+            self.remaining_var = pd.Series(remaining_var, index=remaining_names)
+        else:
+            self.remaining_var = None
+        self.remaining_indices = remaining_indices
+        self.num_modeled_genes = len(modeled_names)
+        self.num_remaining_genes = len(remaining_indices) if remaining_indices is not None else 0
+        self.total_genes = self.num_modeled_genes + self.num_remaining_genes
+    def __repr__(self):
+        if self.remaining_var is None:
+            return self.cov.__repr__()
+        else:
+            return f"CovarianceStructure(modeled_genes={self.num_modeled_genes}, \
+                total_genes={self.total_genes})"
+    def _repr_html_(self):
+        """Jupyter Notebook display"""
+        if self.remaining_var is None:
+            return self.cov._repr_html_()
+        else:
+            html = f"<b>CovarianceStructure:</b> {self.num_modeled_genes} modeled genes, {self.total_genes} total<br>"
+            html += "<h4>Modeled Covariance Matrix</h4>" + self.cov._repr_html_()
+            html += "<h4>Remaining Gene Variances</h4>" + self.remaining_var.to_frame("variance").T._repr_html_()
+            return html
+    def decorrelate(self, row_pattern: str, col_pattern: str):
+        """Decorrelate the covariance matrix for the given row and column patterns.
+        """
+        from .transform import data_frame_mask
+        m1 = data_frame_mask(self.cov, ".", col_pattern)
+        m2 = data_frame_mask(self.cov, row_pattern, ".")
+        mask = (m1 | m2)
+        np.fill_diagonal(mask, False)
+        self.cov.values[mask] = 0
+    def correlate(self, row_pattern: str, col_pattern: str, factor: float):
+        """Multiply selected off-diagonal entries by factor.
+        Args:
+            row_pattern (str): The regex pattern for the row names to match.
+            col_pattern (str): The regex pattern for the column names to match.
+            factor (float): The factor to multiply the off-diagonal entries by.
+        """
+        from .transform import data_frame_mask
+        m1 = data_frame_mask(self.cov, ".", col_pattern)
+        m2 = data_frame_mask(self.cov, row_pattern, ".")
+        mask = (m1 | m2)
+        np.fill_diagonal(mask, False)
+        self.cov.values[mask] = self.cov.values[mask] * factor
+    @property
+    def shape(self):
+        return (self.total_genes, self.total_genes)
+    def to_full_matrix(self):
+        """
+        Convert to full covariance matrix for compatibility/debugging.
+        Returns:
+        --------
+        np.ndarray : Full covariance matrix with shape (total_genes, total_genes)
+        """
+        if self.remaining_var is None:
+            return self.cov.values
+        else:
+            full_cov = np.zeros((self.total_genes, self.total_genes))
+            # Fill in top-k block
+            ix_modeled = np.ix_(self.modeled_indices, self.modeled_indices)
+            full_cov[ix_modeled] = self.cov.values
+            # Fill in diagonal for remaining genes
+            full_cov[self.remaining_indices, self.remaining_indices] = self.remaining_var.values
+        return full_cov

scdesigner 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

scdesigner 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl