scdesigner 0.0.4__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of scdesigner might be problematic.
- {scdesigner-0.0.4 → scdesigner-0.0.5}/PKG-INFO +1 -2
- {scdesigner-0.0.4 → scdesigner-0.0.5}/pyproject.toml +1 -2
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/loader.py +85 -40
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/marginal.py +33 -24
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/standard_copula.py +49 -49
- {scdesigner-0.0.4 → scdesigner-0.0.5}/.gitignore +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/README.md +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/__init__.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/data/__init__.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/data/formula.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/data/group.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/data/sparse.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/diagnose/__init__.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/diagnose/aic_bic.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/diagnose/plot.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/estimators/__init__.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/estimators/bernoulli.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/estimators/gaussian.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/estimators/gaussian_copula_factory.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/estimators/glm_factory.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/estimators/negbin.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/estimators/pnmf.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/estimators/poisson.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/estimators/zero_inflated_negbin.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/estimators/zero_inflated_poisson.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/format/__init__.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/format/format.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/format/print.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/__init__.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/bernoulli.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/composite.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/copula.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/formula.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/gaussian.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/kwargs.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/negbin.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/positive_nonnegative_matrix_factorization.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/scd3.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/scd3_instances.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/simulator.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/transform.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/zero_inflated_negbin.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/predictors/__init__.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/predictors/bernoulli.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/predictors/gaussian.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/predictors/negbin.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/predictors/poisson.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/predictors/zero_inflated_negbin.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/predictors/zero_inflated_poisson.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/samplers/__init__.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/samplers/bernoulli.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/samplers/gaussian.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/samplers/glm_factory.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/samplers/negbin.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/samplers/poisson.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/samplers/zero_inflated_negbin.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/samplers/zero_inflated_poisson.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/simulators/__init__.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/simulators/composite_regressor.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/simulators/glm_simulator.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/simulators/pnmf_regression.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/transform/__init__.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/transform/amplify.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/transform/mask.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/transform/nullify.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/transform/split.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/transform/substitute.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/tests/__init__.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/tests/test_negative_binomial.py +0 -0
- {scdesigner-0.0.4 → scdesigner-0.0.5}/tests/test_simulator.py +0 -0
{scdesigner-0.0.4 → scdesigner-0.0.5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scdesigner
-Version: 0.0.4
+Version: 0.0.5
 Summary: Interactive simulation for rigorous and transparent multi-omics analysis.
 Project-URL: Homepage, https://github.com/krisrs1128/scDesigner/
 Project-URL: Issues, https://github.com/krisrs1128/scDesigner/Issues/
@@ -11,7 +11,6 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.8
 Requires-Dist: anndata
 Requires-Dist: formulaic
-Requires-Dist: lightning
 Requires-Dist: numpy
 Requires-Dist: pandas
 Requires-Dist: rich
{scdesigner-0.0.4 → scdesigner-0.0.5}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "scdesigner"
-version = "0.0.4"
+version = "0.0.5"
 authors = [
   { name="Kris Sankaran", email="ksankaran@wisc.edu" },
 ]
@@ -15,7 +15,6 @@ classifiers = [
 dependencies = [
     "anndata",
     "formulaic",
-    "lightning",
     "numpy",
     "pandas",
     "rich",
{scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/loader.py

@@ -5,21 +5,45 @@ from torch.utils.data import Dataset, DataLoader
 from typing import Dict
 import numpy as np
 import pandas as pd
+import scipy.sparse
 import torch
 
+def get_device():
+    """Detect and return the best available device (MPS, CUDA, or CPU)."""
+    if torch.backends.mps.is_available():
+        return torch.device("mps")
+    elif torch.cuda.is_available():
+        return torch.device("cuda")
+    else:
+        return torch.device("cpu")
+
+
+class PreloadedDataset(Dataset):
+    """Dataset that assumes x and y are both fully in memory."""
+    def __init__(self, y_tensor, x_tensors, predictor_names):
+        self.y = y_tensor
+        self.x = x_tensors
+        self.predictor_names = predictor_names
+
+    def __len__(self):
+        return len(self.y)
+
+    def __getitem__(self, idx):
+        return self.y[idx], {k: v[idx] for k, v in self.x.items()}
+
 class AnnDataDataset(Dataset):
     """Simple PyTorch Dataset for AnnData objects.
 
     Supports optional chunked loading for backed AnnData objects. When
     `chunk_size` is provided, the dataset will load contiguous slices
     of rows (of size `chunk_size`) into memory once and serve individual
-    rows from that cached chunk.
-    a per-row basis which is expensive for large backed files.
+    rows from that cached chunk. Chunks are moved to device for faster access.
     """
     def __init__(self, adata: AnnData, formula: Dict[str, str], chunk_size: int):
         self.adata = adata
         self.formula = formula
         self.chunk_size = chunk_size
+        self.device = get_device()
 
         # keeping track of covariate-related information
         self.obs_levels = categories(self.adata.obs)
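The `PreloadedDataset` added above returns a `(y, {name: x})` pair per index, and PyTorch's default collate stacks `y` and batches each entry of the dict. A minimal sketch of that contract with toy tensors (the class body is copied from the diff; the shapes and column names are made up):

import torch
from torch.utils.data import DataLoader, Dataset

class PreloadedDataset(Dataset):
    """Dataset that assumes x and y are both fully in memory (as in the diff)."""
    def __init__(self, y_tensor, x_tensors, predictor_names):
        self.y = y_tensor
        self.x = x_tensors
        self.predictor_names = predictor_names

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.y[idx], {k: v[idx] for k, v in self.x.items()}

# Toy data: 100 cells, 10 genes, one 3-column model matrix named "mu".
y = torch.randn(100, 10)
x = {"mu": torch.randn(100, 3)}
ds = PreloadedDataset(y, x, predictor_names={"mu": ["intercept", "b1", "b2"]})

# Default collate batches the tensor and the dict entries together.
batch_y, batch_x = next(iter(DataLoader(ds, batch_size=16)))
print(batch_y.shape, batch_x["mu"].shape)  # torch.Size([16, 10]) torch.Size([16, 3])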
@@ -28,6 +52,7 @@ class AnnDataDataset(Dataset):
 
         # Internal cache for the currently loaded chunk
         self._chunk: AnnData | None = None
+        self._chunk_X = None
         self._chunk_start = 0
 
     def __len__(self):
@@ -42,19 +67,12 @@ class AnnDataDataset(Dataset):
         """
         self._ensure_chunk_loaded(idx)
         local_idx = idx - self._chunk_start
-        adata_slice = self._chunk[local_idx]
-
-        # Get X data, accounting for potential sparse matrices
-        X = adata_slice.X
-        if hasattr(X, 'toarray'):
-            X = X.toarray()
 
-        # Get obs data
+        # Get obs data from GPU-cached matrices
         obs_dict = {}
         for key in self.formula.keys():
-
-
-        return to_tensor(X), obs_dict
+            obs_dict[key] = self.obs_matrices[key][local_idx: local_idx + 1]
+        return self._chunk_X[local_idx], obs_dict
 
     def _ensure_chunk_loaded(self, idx: int) -> None:
         """Load the chunk that contains `idx` into the internal cache."""
@@ -69,36 +87,45 @@ class AnnDataDataset(Dataset):
         self._chunk = chunk
         self._chunk_start = start
 
-        #
-
+        # Move chunk to GPU
+        X = chunk.X
+        if hasattr(X, 'toarray'):
+            X = X.toarray()
+        self._chunk_X = torch.tensor(X, dtype=torch.float32).to(self.device)
+
+        # Compute model matrices for this chunk's `obs` and move to GPU
         obs_coded_chunk = code_levels(self._chunk.obs.copy(), self.obs_levels)
         self.obs_matrices = {}
+        predictor_names = {}
         for key, f in self.formula.items():
-
+            mat = model_matrix(f, obs_coded_chunk)
+            predictor_names[key] = list(mat.columns)
+            self.obs_matrices[key] = torch.tensor(mat.values, dtype=torch.float32).to(self.device)
 
         # Capture predictor (column) names from the model matrices once.
         if self.predictor_names is None:
-            self.predictor_names =
-
-
-def adata_loader(
-
-
-
-
-
-
-
-    """
+            self.predictor_names = predictor_names
+
+
+def adata_loader(
+    adata: AnnData,
+    formula: Dict[str, str],
+    chunk_size: int = None,
+    batch_size: int = 1024,
+    shuffle: bool = False,
+    num_workers: int = 0,
+    **kwargs
+) -> DataLoader:
+    """Create a DataLoader from AnnData that returns batches of (X, obs)."""
     data_kwargs = _filter_kwargs(kwargs, DEFAULT_ALLOWED_KWARGS['data'])
-
-
-
-
+    device = get_device()
+
+    # separate chunked from non-chunked cases
+    if not getattr(adata, 'isbacked', False):
+        dataset = _preloaded_adata(adata, formula, device)
+    else:
+        dataset = AnnDataDataset(adata, formula, chunk_size or 5000)
 
-    dataset = AnnDataDataset(adata, formula, chunk_size)
     return DataLoader(
         dataset,
         batch_size=batch_size,
@@ -109,12 +136,30 @@ def adata_loader(adata: AnnData,
     )
 
 def obs_loader(obs: pd.DataFrame, marginal_formula, **kwargs):
-
-
-
-
-
-
+    adata = AnnData(X=np.zeros((len(obs), 1)), obs=obs)
+    return adata_loader(
+        adata,
+        marginal_formula,
+        **kwargs
+    )
+
+################################################################################
+## Extraction of in-memory AnnData to PreloadedDataset
+################################################################################
+
+def _preloaded_adata(adata: AnnData, formula: Dict[str, str], device: torch.device) -> PreloadedDataset:
+    X = adata.X
+    if scipy.sparse.issparse(X):
+        X = X.toarray()
+    y = torch.tensor(X, dtype=torch.float32).to(device)
+
+    obs = code_levels(adata.obs.copy(), categories(adata.obs))
+    x = {
+        k: torch.tensor(model_matrix(f, obs).values, dtype=torch.float32).to(device)
+        for k, f in formula.items()
+    }
+    predictor_names = {k: list(model_matrix(f, obs).columns) for k, f in formula.items()}
+    return PreloadedDataset(y, x, predictor_names)
 
 ################################################################################
 ## Helper functions
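Taken together, the loader changes route in-memory AnnData through `_preloaded_adata`/`PreloadedDataset` and backed AnnData through the chunked `AnnDataDataset`. A hypothetical usage sketch, assuming scdesigner 0.0.5 is installed and importable as shown in the diff; the covariate name and formula string here are illustrative, not scdesigner's documented API:

import anndata
import numpy as np
import pandas as pd
from scdesigner.minimal.loader import adata_loader

# Toy AnnData: 100 cells, 10 genes, one categorical covariate.
obs = pd.DataFrame({"group": pd.Categorical(["a", "b"] * 50)})
adata = anndata.AnnData(X=np.random.poisson(2.0, (100, 10)).astype(np.float32), obs=obs)

# In-memory AnnData (isbacked == False) takes the PreloadedDataset path:
# X and all model matrices are built once and moved to get_device()'s device.
# A backed AnnData would instead use AnnDataDataset with chunk_size or 5000.
loader = adata_loader(adata, formula={"mu": "~ group"}, batch_size=32)
y, x = next(iter(loader))
print(y.shape, {k: v.shape for k, v in x.items()})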
{scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/marginal.py

@@ -1,9 +1,8 @@
 from .kwargs import DEFAULT_ALLOWED_KWARGS, _filter_kwargs
-from .loader import adata_loader
+from .loader import adata_loader, get_device
 from anndata import AnnData
 from typing import Union, Dict, Optional, Tuple
 import pandas as pd
-import pytorch_lightning as pl
 import torch
 import torch.nn as nn
 from abc import ABC, abstractmethod
@@ -18,6 +17,7 @@ class Marginal(ABC):
         self.predict = None
         self.predictor_names = None
         self.parameters = None
+        self.device = get_device()
 
     def setup_data(self, adata: AnnData, batch_size: int = 1024, **kwargs):
         """Set up the dataloader for the AnnData object."""
@@ -29,13 +29,30 @@ class Marginal(ABC):
         self.feature_dims = {k: v.shape[1] for k, v in obs_batch.items()}
         self.predictor_names = self.loader.dataset.predictor_names
 
-    def fit(self, **kwargs):
-        """Fit the marginal predictor"""
+    def fit(self, max_epochs: int = 100, **kwargs):
+        """Fit the marginal predictor using vanilla PyTorch training loop."""
         if self.predict is None:
             self.setup_optimizer(**kwargs)
-
-
-
+
+        for epoch in range(max_epochs):
+            epoch_loss, n_batches = 0.0, 0
+
+            for batch in self.loader:
+                y, x = batch
+                if y.device != self.device:
+                    y = y.to(self.device)
+                    x = {k: v.to(self.device) for k, v in x.items()}
+
+                self.predict.optimizer.zero_grad()
+                loss = self.predict.loss_fn((y, x))
+                loss.backward()
+                self.predict.optimizer.step()
+
+                epoch_loss += loss.item()
+                n_batches += 1
+
+            avg_loss = epoch_loss / n_batches
+            print(f"Epoch {epoch}/{max_epochs}, Loss: {avg_loss:.4f}", end='\r')
         self.parameters = self.format_parameters()
 
     def format_parameters(self):
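With the `lightning` dependency dropped, `Marginal.fit` now runs the standard PyTorch loop shown above: zero the gradients, compute the loss, backpropagate, step the optimizer, and track a running epoch loss. A self-contained sketch of the same pattern, with a toy linear model and MSE loss standing in for scdesigner's `predict` and `loss_fn`:

import torch
import torch.nn as nn

model = nn.Linear(3, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
batches = [(torch.randn(8, 3), torch.randn(8, 1)) for _ in range(10)]  # toy batches

for epoch in range(5):
    epoch_loss, n_batches = 0.0, 0
    for x, y in batches:
        optimizer.zero_grad()                       # reset accumulated gradients
        loss = nn.functional.mse_loss(model(x), y)  # stand-in for predict.loss_fn
        loss.backward()                             # backpropagate
        optimizer.step()                            # update parameters
        epoch_loss += loss.item()
        n_batches += 1
    print(f"Epoch {epoch}, Loss: {epoch_loss / n_batches:.4f}", end="\r")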
@@ -85,7 +102,7 @@ class Marginal(ABC):
         raise NotImplementedError
 
 
-class GLMPredictor(pl.LightningModule):
+class GLMPredictor(nn.Module):
     """GLM-style predictor with arbitrary named parameters.
 
     Args:
@@ -111,21 +128,22 @@ class GLMPredictor(pl.LightningModule):
         self.feature_dims = dict(feature_dims)
         self.param_names = list(self.feature_dims.keys())
 
-        # create default link functions and parameter matrices
         self.link_fns = link_fns or {k: torch.exp for k in self.param_names}
         self.coefs = nn.ParameterDict()
         for key, dim in self.feature_dims.items():
             self.coefs[key] = nn.Parameter(torch.zeros(dim, self.n_outcomes))
-
-        # optimization parameters
         self.reset_parameters()
+
         self.loss_fn = loss_fn
-        self.
-
+        self.to(get_device())
+
+        optimizer_kwargs = optimizer_kwargs or {}
+        filtered_kwargs = _filter_kwargs(optimizer_kwargs, DEFAULT_ALLOWED_KWARGS['optimizer'])
+        self.optimizer = optimizer_class(self.parameters(), **filtered_kwargs)
 
     def reset_parameters(self):
         for p in self.coefs.values():
-            nn.init.normal_(p, mean=0.0, std=1e-
+            nn.init.normal_(p, mean=0.0, std=1e-4)
 
     def forward(self, obs_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
         out = {}
@@ -133,13 +151,4 @@ class GLMPredictor(pl.LightningModule):
             x_beta = obs_dict[name] @ self.coefs[name]
             link = self.link_fns.get(name, torch.exp)
             out[name] = link(x_beta)
-        return out
-
-    def training_step(self, batch):
-        loss = self.loss_fn(batch)
-        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
-        return loss
-
-    def configure_optimizers(self, **kwargs):
-        optimizer_kwargs = _filter_kwargs(self.optimizer_kwargs, DEFAULT_ALLOWED_KWARGS['optimizer'])
-        return self.optimizer_class(self.parameters(), **optimizer_kwargs)
+        return out
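`GLMPredictor` is now a plain `nn.Module` that owns its optimizer directly instead of relying on Lightning's `training_step`/`configure_optimizers` hooks. It holds one coefficient matrix per named GLM parameter in an `nn.ParameterDict` and applies a link function (default `torch.exp`) to each linear predictor. A standalone sketch of that forward pattern, with hypothetical dimensions:

import torch
import torch.nn as nn

# Hypothetical setup: two named GLM parameters over 5 outcomes (genes).
feature_dims = {"mu": 3, "theta": 2}   # covariate columns per parameter
n_outcomes = 5

coefs = nn.ParameterDict({
    k: nn.Parameter(torch.zeros(d, n_outcomes)) for k, d in feature_dims.items()
})
link_fns = {k: torch.exp for k in feature_dims}  # the diff's default link

# Forward pass: per-parameter linear predictor followed by its link function.
obs_dict = {"mu": torch.randn(8, 3), "theta": torch.randn(8, 2)}
out = {k: link_fns[k](obs_dict[k] @ coefs[k]) for k in feature_dims}
print({k: tuple(v.shape) for k, v in out.items()})  # {'mu': (8, 5), 'theta': (8, 5)}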
{scdesigner-0.0.4 → scdesigner-0.0.5}/src/scdesigner/minimal/standard_copula.py

@@ -51,16 +51,16 @@ class StandardCopula(Copula):
     def fit(self, uniformizer: Callable, **kwargs):
         """
         Fit the copula covariance model.
-
+
         Args:
             uniformizer (Callable): Function to convert data to uniform distribution
             **kwargs: Additional arguments
                 top_k (int, optional): Use only top-k most expressed genes for covariance estimation.
                     If None, estimates full covariance for all genes.
-
+
         Returns:
             None: Stores fitted parameters in self.parameters as dict of CovarianceStructure objects.
-
+
         Raises:
             ValueError: If top_k is not a positive integer or exceeds n_outcomes
         """
@@ -76,11 +76,11 @@ class StandardCopula(Copula):
             sorted_indices = np.argsort(gene_total_expression)
             top_k_indices = sorted_indices[-top_k:]
             remaining_indices = sorted_indices[:-top_k]
-            covariances = self._compute_block_covariance(uniformizer, top_k_indices, 
+            covariances = self._compute_block_covariance(uniformizer, top_k_indices,
                                                          remaining_indices, top_k)
         else:
             covariances = self._compute_full_covariance(uniformizer)
-
+
         self.parameters = covariances
 
     def pseudo_obs(self, x_dict: Dict):
@@ -88,7 +88,7 @@ class StandardCopula(Copula):
         # {"group1": [indices of group 1], "group2": [indices of group 2]}
         # The initialization method ensures that x_dict will always have a "group" key.
         group_data = x_dict.get("group")
-        memberships = group_data.numpy()
+        memberships = group_data.cpu().numpy()
         group_ix = {g: np.where(memberships[:, self.group_col[g] == 1])[0] for g in self.groups}
 
         # initialize the result
@@ -106,14 +106,14 @@ class StandardCopula(Copula):
     def likelihood(self, uniformizer: Callable, batch: Tuple[torch.Tensor, Dict[str, torch.Tensor]]):
         """
         Compute likelihood of data given the copula model.
-
+
         Args:
             uniformizer (Callable): Function to convert expression data to uniform distribution
             batch (Tuple[torch.Tensor, Dict[str, torch.Tensor]]): Data batch containing:
                 - Y (torch.Tensor): Expression data of shape (n_cells, n_genes)
                 - X_dict (Dict[str, torch.Tensor]): Covariates dict with keys as parameter names
                   and values as tensors of shape (n_cells, n_covariates)
-
+
         Returns:
             np.ndarray: Log-likelihood for each cell, shape (n_cells,)
         """
@@ -132,19 +132,19 @@ class StandardCopula(Copula):
         group_ix = {g: np.where(memberships[:, self.group_col[g] == 1])[0] for g in self.groups}
 
         ll = np.zeros(len(z))
-
+
         for group, cov_struct in parameters.items():
             ix = group_ix[group]
             if len(ix) > 0:
                 z_modeled = z[ix][:, cov_struct.modeled_indices]
-
+
                 ll_modeled = multivariate_normal.logpdf(z_modeled,
-                                                        np.zeros(cov_struct.num_modeled_genes),
+                                                        np.zeros(cov_struct.num_modeled_genes),
                                                         cov_struct.cov.values)
                 if cov_struct.num_remaining_genes > 0:
                     z_remaining = z[ix][:, cov_struct.remaining_indices]
                     ll_remaining = norm.logpdf(z_remaining,
-                                               loc=0,
+                                               loc=0,
                                                scale = np.sqrt(cov_struct.remaining_var.values))
                 else:
                     ll_remaining = 0
@@ -155,7 +155,7 @@ class StandardCopula(Copula):
         S = self.parameters
         per_group = [((S[g].num_modeled_genes * (S[g].num_modeled_genes - 1)) / 2) for g in self.groups]
         return sum(per_group)
-
+
     def _validate_parameters(self, **kwargs):
         top_k = kwargs.get("top_k", None)
         if top_k is not None:
@@ -166,14 +166,14 @@ class StandardCopula(Copula):
         if top_k > self.n_outcomes:
             raise ValueError(f"top_k ({top_k}) cannot exceed number of outcomes ({self.n_outcomes})")
         return top_k
-
-
+
+
 
     def _accumulate_top_k_stats(self, uniformizer:Callable, top_k_idx, rem_idx, top_k) \
-        -> Tuple[Dict[Union[str, int], np.ndarray],
-                 Dict[Union[str, int], np.ndarray],
-                 Dict[Union[str, int], np.ndarray],
-                 Dict[Union[str, int], np.ndarray],
+        -> Tuple[Dict[Union[str, int], np.ndarray],
+                 Dict[Union[str, int], np.ndarray],
+                 Dict[Union[str, int], np.ndarray],
+                 Dict[Union[str, int], np.ndarray],
                  Dict[Union[str, int], int]]:
         """Accumulate sufficient statistics for top-k covariance estimation.
@@ -198,7 +198,7 @@ class StandardCopula(Copula):
 
         for y, x_dict in tqdm(self.loader, desc="Estimating top-k copula covariance"):
             group_data = x_dict.get("group")
-            memberships = group_data.numpy()
+            memberships = group_data.cpu().numpy()
             u = uniformizer(y, x_dict)
             z = norm.ppf(u)
 
@@ -211,20 +211,20 @@ class StandardCopula(Copula):
             n_g = mask.sum()
 
             top_k_z, rem_z = z_g[:, top_k_idx], z_g[:, rem_idx]
-
+
             top_k_sums[g] += top_k_z.sum(axis=0)
             top_k_second_moments[g] += top_k_z.T @ top_k_z
-
+
             rem_sums[g] += rem_z.sum(axis=0)
             rem_second_moments[g] += (rem_z ** 2).sum(axis=0)
-
+
             Ng[g] += n_g
 
         return top_k_sums, top_k_second_moments, rem_sums, rem_second_moments, Ng
-
+
     def _accumulate_full_stats(self, uniformizer:Callable) \
-        -> Tuple[Dict[Union[str, int], np.ndarray],
-                 Dict[Union[str, int], np.ndarray],
+        -> Tuple[Dict[Union[str, int], np.ndarray],
+                 Dict[Union[str, int], np.ndarray],
                  Dict[Union[str, int], int]]:
         """Accumulate sufficient statistics for full covariance estimation.
 
@@ -242,14 +242,14 @@ class StandardCopula(Copula):
 
         for y, x_dict in tqdm(self.loader, desc="Estimating copula covariance"):
             group_data = x_dict.get("group")
-            memberships = group_data.numpy()
-
+            memberships = group_data.cpu().numpy()
+
             u = uniformizer(y, x_dict)
             z = norm.ppf(u)
 
             for g in self.groups:
                 mask = memberships[:, self.group_col[g]] == 1
-
+
                 if not np.any(mask):
                     continue
 
@@ -258,12 +258,12 @@ class StandardCopula(Copula):
 
                 second_moments[g] += z_g.T @ z_g
                 sums[g] += z_g.sum(axis=0)
-
+
                 Ng[g] += n_g
 
         return sums, second_moments, Ng
-
-    def _compute_block_covariance(self, uniformizer:Callable,
+
+    def _compute_block_covariance(self, uniformizer:Callable,
                                   top_k_idx: np.ndarray, rem_idx: np.ndarray, top_k: int) \
         -> Dict[Union[str, int], CovarianceStructure]:
         """Compute the covariance matrix for the top-k and remaining genes.
@@ -300,7 +300,7 @@ class StandardCopula(Copula):
             remaining_names=remaining_names
         )
         return covariance
-
+
     def _compute_full_covariance(self, uniformizer:Callable) -> Dict[Union[str, int], CovarianceStructure]:
         """Compute the covariance matrix for the full genes.
 
@@ -327,7 +327,7 @@ class StandardCopula(Copula):
             remaining_names=None
         )
         return covariance
-
+
     def _fast_normal_pseudo_obs(self, n_samples: int, cov_struct: CovarianceStructure) -> np.ndarray:
         """Sample pseudo-observations from the covariance structure.
 
@@ -339,28 +339,28 @@ class StandardCopula(Copula):
             np.ndarray: Pseudo-observations with shape (n_samples, total_genes)
         """
         u = np.zeros((n_samples, cov_struct.total_genes))
-
+
         z_modeled = np.random.multivariate_normal(
-            mean=np.zeros(cov_struct.num_modeled_genes),
-            cov=cov_struct.cov.values,
+            mean=np.zeros(cov_struct.num_modeled_genes),
+            cov=cov_struct.cov.values,
            size=n_samples
         )
-
+
         z_remaining = np.random.normal(
-            loc=0,
-            scale=cov_struct.remaining_var.values ** 0.5,
+            loc=0,
+            scale=cov_struct.remaining_var.values ** 0.5,
            size=(n_samples, cov_struct.num_remaining_genes)
         )
-
+
         normal_distn_modeled = norm(0, np.diag(cov_struct.cov.values) ** 0.5)
         u[:, cov_struct.modeled_indices] = normal_distn_modeled.cdf(z_modeled)
-
+
         normal_distn_remaining = norm(0, cov_struct.remaining_var.values ** 0.5)
         u[:, cov_struct.remaining_indices] = normal_distn_remaining.cdf(z_remaining)
-
+
         return u
-
-    def _normal_pseudo_obs(self, n_samples: int, cov_struct: CovarianceStructure) -> np.ndarray:
+
+    def _normal_pseudo_obs(self, n_samples: int, cov_struct: CovarianceStructure) -> np.ndarray:
         """Sample pseudo-observations from the covariance structure.
 
         Args:
@@ -372,12 +372,12 @@ class StandardCopula(Copula):
         """
         u = np.zeros((n_samples, cov_struct.total_genes))
         z = np.random.multivariate_normal(
-            mean=np.zeros(cov_struct.total_genes),
-            cov=cov_struct.cov.values,
+            mean=np.zeros(cov_struct.total_genes),
+            cov=cov_struct.cov.values,
             size=n_samples
         )
-
+
         normal_distn = norm(0, np.diag(cov_struct.cov.values) ** 0.5)
         u = normal_distn.cdf(z)
-
+
         return u
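The repeated `.numpy()` → `.cpu().numpy()` edits in this file follow from the device changes above: `Tensor.numpy()` only works on CPU tensors and raises a TypeError for CUDA or MPS tensors, so the group-membership tensor must be copied to host memory first. A minimal illustration:

import torch

t = torch.arange(4, dtype=torch.float32)
if torch.backends.mps.is_available():
    t = t.to("mps")
elif torch.cuda.is_available():
    t = t.to("cuda")

# t.numpy() would raise TypeError on a non-CPU tensor; .cpu() copies to host
# first (and is a no-op if the tensor is already on the CPU).
arr = t.cpu().numpy()
print(arr)  # [0. 1. 2. 3.]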