pertpy 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,18 +1,21 @@
  from __future__ import annotations

- import anndata
+ from typing import Any
+
+ import flax.linen as nn
+ import jax
+ import jax.numpy as jnp
  import numpy as np
+ import optax
+ import pandas as pd
  import scipy
- import torch
  from anndata import AnnData
  from fast_array_utils.conv import to_dense
- from pytorch_lightning import LightningModule, Trainer
- from pytorch_lightning.callbacks import EarlyStopping
+ from flax.training import train_state
+ from jax import random
  from sklearn.linear_model import LogisticRegression
  from sklearn.model_selection import train_test_split
  from sklearn.preprocessing import OneHotEncoder
- from torch import optim
- from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler

  from pertpy.tools._perturbation_space._perturbation_space import PerturbationSpace

@@ -74,13 +77,11 @@ class LRClassifierSpace(PerturbationSpace):

          regression_labels = adata.obs[target_col]

-         # Save adata observations for embedding annotations in get_embeddings
          adata_obs = adata.obs.reset_index(drop=True)
          adata_obs = adata_obs.groupby(target_col).agg(
              lambda pert_group: np.nan if len(set(pert_group)) != 1 else list(set(pert_group))[0]
          )

-         # Fit a logistic regression model for each perturbation
          regression_model = LogisticRegression(max_iter=max_iter, class_weight="balanced")
          regression_embeddings = {}
          regression_scores = {}
@@ -95,12 +96,10 @@ class LRClassifierSpace(PerturbationSpace):
              regression_embeddings[perturbation] = regression_model.coef_
              regression_scores[perturbation] = regression_model.score(X_test, y_test)

-         # Save the regression embeddings and scores in an AnnData object
          pert_adata = AnnData(X=np.array(list(regression_embeddings.values())).squeeze())
          pert_adata.obs["perturbations"] = list(regression_embeddings.keys())
          pert_adata.obs["classifier_score"] = list(regression_scores.values())

-         # Save adata observations for embedding annotations
          for obs_name in adata_obs.columns:
              if not adata_obs[obs_name].isnull().values.any():
                  pert_adata.obs[obs_name] = pert_adata.obs["perturbations"].map(
@@ -110,6 +109,174 @@ class LRClassifierSpace(PerturbationSpace):
          return pert_adata


+ class MLP(nn.Module):
+     """A multilayer perceptron with ReLU activations, optional Dropout and optional BatchNorm."""
+
+     sizes: list[int]
+     dropout: float = 0.0
+     batch_norm: bool = True
+     layer_norm: bool = False
+     last_layer_act: str = "linear"
+
+     @nn.compact
+     def __call__(self, x: jnp.ndarray, training: bool = True) -> jnp.ndarray:
+         for i in range(len(self.sizes) - 1):
+             x = nn.Dense(self.sizes[i + 1])(x)
+
+             if i < len(self.sizes) - 2:
+                 if self.batch_norm:
+                     x = nn.BatchNorm(use_running_average=not training)(x)
+                 elif self.layer_norm:
+                     x = nn.LayerNorm()(x)
+
+                 x = nn.relu(x)
+
+                 if self.dropout > 0 and training:
+                     x = nn.Dropout(rate=self.dropout, deterministic=not training)(x)
+
+         if self.last_layer_act == "ReLU":
+             x = nn.relu(x)
+
+         return x
+
+     @nn.compact
+     def embedding(self, x: jnp.ndarray, training: bool = False) -> jnp.ndarray:
+         for i in range(len(self.sizes) - 2):
+             x = nn.Dense(self.sizes[i + 1])(x)
+
+             if self.batch_norm:
+                 x = nn.BatchNorm(use_running_average=True)(x)
+             elif self.layer_norm:
+                 x = nn.LayerNorm()(x)
+
+             x = nn.relu(x)
+
+             if self.dropout > 0 and training:
+                 x = nn.Dropout(rate=self.dropout, deterministic=True)(x)
+
+         return x
+
+
+ class TrainState(train_state.TrainState):
+     batch_stats: Any
+
+
+ def create_train_state(rng: jnp.ndarray, model: nn.Module, input_shape: tuple[int, ...], lr: float) -> TrainState:
+     dummy_input = jnp.ones((1,) + input_shape)
+     rng, init_rng, dropout_rng = random.split(rng, 3)
+     variables = model.init({"params": init_rng, "dropout": dropout_rng}, dummy_input, training=True)
+     params = variables["params"]
+     batch_stats = variables.get("batch_stats", {})
+
+     tx = optax.adamw(learning_rate=lr, weight_decay=0.1)
+
+     return TrainState.create(apply_fn=model.apply, params=params, tx=tx, batch_stats=batch_stats)
+
+
+ @jax.jit
+ def train_step(state: TrainState, batch: tuple[jnp.ndarray, jnp.ndarray], rng: jnp.ndarray) -> tuple[TrainState, float]:
+     def loss_fn(params):
+         x, y = batch
+         variables = {"params": params, "batch_stats": state.batch_stats}
+         logits, new_batch_stats = state.apply_fn(
+             variables, x, training=True, mutable=["batch_stats"], rngs={"dropout": rng}
+         )
+
+         y_indices = jnp.argmax(y, axis=1)
+         loss = optax.softmax_cross_entropy_with_integer_labels(logits, y_indices).mean()
+         return loss, new_batch_stats
+
+     (loss, new_batch_stats), grads = jax.value_and_grad(loss_fn, has_aux=True)(state.params)
+     state = state.apply_gradients(grads=grads)
+     state = state.replace(batch_stats=new_batch_stats["batch_stats"])
+
+     return state, loss
+
+
+ @jax.jit
+ def val_step(state: TrainState, batch: tuple[jnp.ndarray, jnp.ndarray]) -> float:
+     x, y = batch
+     variables = {"params": state.params, "batch_stats": state.batch_stats}
+     logits = state.apply_fn(variables, x, training=False)
+
+     y_indices = jnp.argmax(y, axis=1)
+     loss = optax.softmax_cross_entropy_with_integer_labels(logits, y_indices).mean()
+     return loss
+
+
+ @jax.jit
+ def get_embeddings(state: TrainState, x: jnp.ndarray) -> jnp.ndarray:
+     variables = {"params": state.params, "batch_stats": state.batch_stats}
+     return state.apply_fn(variables, x, training=False, method="embedding")
+
+
+ class JAXDataset:
+     """Dataset for perturbation classification.
+
+     Needed for training a model that classifies the perturbed cells and takes as perturbation embedding the second to last layer.
+     """
+
+     def __init__(
+         self,
+         adata: AnnData,
+         target_col: str = "perturbations",
+         label_col: str = "perturbations",
+         layer_key: str = None,
+     ):
+         """JAX Dataset for perturbation classification.
+
+         Args:
+             adata: AnnData object with observations and labels.
+             target_col: key with the perturbation labels numerically encoded.
+             label_col: key with the perturbation labels.
+             layer_key: key of the layer to be used as data, otherwise .X.
+         """
+         if layer_key:
+             self.data = adata.layers[layer_key]
+         else:
+             self.data = adata.X
+
+         if target_col in adata.obs.columns:
+             self.labels = adata.obs[target_col].values
+         elif target_col in adata.obsm:
+             self.labels = adata.obsm[target_col]
+         else:
+             raise ValueError(f"Target column {target_col} not found in obs or obsm")
+
+         self.pert_labels = adata.obs[label_col].values
+
+         if scipy.sparse.issparse(self.data):
+             self.data = to_dense(self.data)
+
+         self.data = jnp.array(self.data, dtype=jnp.float32)
+         self.labels = jnp.array(self.labels, dtype=jnp.float32)
+
+     def __len__(self):
+         return self.data.shape[0]
+
+     def get_batch(self, indices: jnp.ndarray) -> tuple[jnp.ndarray, jnp.ndarray, list]:
+         """Returns a batch of samples and corresponding perturbations applied (labels)."""
+         batch_data = self.data[indices]
+         batch_labels = self.labels[indices]
+         batch_pert_labels = [self.pert_labels[i] for i in indices]
+         return batch_data, batch_labels, batch_pert_labels
+
+
+ def create_batched_indices(
+     dataset_size: int, rng: jnp.ndarray, batch_size: int, n_batches: int, weights: jnp.ndarray | None = None
+ ) -> list:
+     """Create batched indices for training, optionally with weighted sampling."""
+     batches = []
+     for _ in range(n_batches):
+         rng, batch_rng = random.split(rng)
+         if weights is not None:
+             batch_indices = random.choice(batch_rng, dataset_size, shape=(batch_size,), p=weights)
+         else:
+             batch_indices = random.choice(batch_rng, dataset_size, shape=(batch_size,), replace=False)
+         batches.append(batch_indices)
+     return batches
+
+
  class MLPClassifierSpace(PerturbationSpace):
      """Fits an ANN classifier to the data and takes the feature space (weights in the last layer) as embedding.

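The helpers added in this hunk follow the usual Flax/optax training pattern: a `TrainState` subclass that carries BatchNorm `batch_stats` next to the parameters, and jitted step functions that return an updated state. The sketch below is a minimal, self-contained illustration of that pattern with a made-up two-layer model and random data; it is not part of pertpy and assumes only standard flax/optax APIs.

from typing import Any

import flax.linen as nn
import jax
import jax.numpy as jnp
import optax
from flax.training import train_state
from jax import random


class TinyClassifier(nn.Module):
    """Hypothetical two-layer classifier, used only to illustrate the pattern."""

    hidden: int = 32
    n_classes: int = 3

    @nn.compact
    def __call__(self, x: jnp.ndarray, training: bool = True) -> jnp.ndarray:
        x = nn.Dense(self.hidden)(x)
        x = nn.BatchNorm(use_running_average=not training)(x)
        x = nn.relu(x)
        return nn.Dense(self.n_classes)(x)


class TrainState(train_state.TrainState):
    batch_stats: Any


model = TinyClassifier()
variables = model.init(random.PRNGKey(0), jnp.ones((1, 8)), training=False)
state = TrainState.create(
    apply_fn=model.apply,
    params=variables["params"],
    tx=optax.adamw(learning_rate=1e-3),
    batch_stats=variables["batch_stats"],
)


@jax.jit
def step(state: TrainState, x: jnp.ndarray, y: jnp.ndarray):
    def loss_fn(params):
        # Run with mutable batch_stats so the BatchNorm running statistics are updated.
        logits, updates = state.apply_fn(
            {"params": params, "batch_stats": state.batch_stats}, x, training=True, mutable=["batch_stats"]
        )
        return optax.softmax_cross_entropy_with_integer_labels(logits, y).mean(), updates

    (loss, updates), grads = jax.value_and_grad(loss_fn, has_aux=True)(state.params)
    return state.apply_gradients(grads=grads).replace(batch_stats=updates["batch_stats"]), loss


x = random.normal(random.PRNGKey(1), (16, 8))
y = jnp.zeros((16,), dtype=jnp.int32)
state, loss = step(state, x, y)

Jitting the step keeps the whole update on device; the state, a pytree, is threaded through functionally rather than mutated in place, which is also how the training loop inside compute below works.
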
@@ -120,7 +287,7 @@ class MLPClassifierSpace(PerturbationSpace):
      See here https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7289078/ (Dose-response analysis) and Sup 17-19.
      """

-     def compute( # type: ignore
+     def compute(
          self,
          adata: AnnData,
          target_col: str = "perturbations",
@@ -128,12 +295,14 @@ class MLPClassifierSpace(PerturbationSpace):
          hidden_dim: list[int] = None,
          dropout: float = 0.0,
          batch_norm: bool = True,
-         batch_size: int = 256,
+         batch_size: int = 128,
          test_split_size: float = 0.2,
          validation_split_size: float = 0.25,
          max_epochs: int = 20,
          val_epochs_check: int = 2,
          patience: int = 2,
+         lr: float = 1e-4,
+         seed: int = 42,
      ) -> AnnData:
          """Creates cell embeddings by training a MLP classifier model to distinguish between perturbations.

@@ -148,21 +317,21 @@ class MLPClassifierSpace(PerturbationSpace):
              adata: AnnData object of size cells x genes
              target_col: .obs column that stores the perturbations.
              layer_key: Layer in adata to use.
-             hidden_dim: List of number of neurons in each hidden layers of the neural network. For instance, [512, 256]
-                 will create a neural network with two hidden layers, the first with 512 neurons and the second with 256 neurons.
+             hidden_dim: List of number of neurons in each hidden layers of the neural network.
+                 For instance, [512, 256] will create a neural network with two hidden layers, the first with 512 neurons and the second with 256 neurons.
              dropout: Amount of dropout applied, constant for all layers.
              batch_norm: Whether to apply batch normalization.
              batch_size: The batch size, i.e. the number of datapoints to use in one forward/backward pass.
              test_split_size: Fraction of data to put in the test set. Default to 0.2.
              validation_split_size: Fraction of data to put in the validation set of the resultant train set.
-                 E.g. a test_split_size of 0.2 and a validation_split_size of 0.25 means that 25% of 80% of the data
-                 will be used for validation.
+                 E.g. a test_split_size of 0.2 and a validation_split_size of 0.25 means that 25% of 80% of the data will be used for validation.
              max_epochs: Maximum number of epochs for training.
              val_epochs_check: Test performance on validation dataset after every val_epochs_check training epochs.
-                 Note that this affects early stopping, as the model will be stopped if the validation performance does not
-                 improve for patience epochs.
+                 Note that this affects early stopping, as the model will be stopped if the validation performance does not improve for patience epochs.
              patience: Number of validation performance checks without improvement, after which the early stopping flag
                  is activated and training is therefore stopped.
+             lr: Learning rate for training.
+             seed: Random seed for reproducibility.

          Returns:
              AnnData whose `X` attribute is the perturbation embedding and whose .obs['perturbations'] are the names of the perturbations.
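Under the updated signature documented above, a call might look like the following sketch. It is purely illustrative: the toy AnnData is made up, and the class is assumed to remain exposed as pt.tl.MLPClassifierSpace as in earlier releases.

import numpy as np
import pandas as pd
from anndata import AnnData
import pertpy as pt

# Toy AnnData with one perturbation label per cell (illustrative only).
rng = np.random.default_rng(0)
adata = AnnData(X=rng.normal(size=(300, 50)).astype(np.float32))
adata.obs["perturbations"] = pd.Categorical(rng.choice(["control", "drug_a", "drug_b"], size=300))

mlp_space = pt.tl.MLPClassifierSpace()
cell_embeddings = mlp_space.compute(
    adata,
    target_col="perturbations",
    hidden_dim=[64, 32],
    batch_size=128,  # default changed from 256 to 128 in this version
    max_epochs=4,
    lr=1e-4,  # newly exposed parameter
    seed=42,  # newly exposed parameter
)
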
@@ -188,9 +357,9 @@ class MLPClassifierSpace(PerturbationSpace):
          labels = adata.obs[target_col].values.reshape(-1, 1)
          encoder = OneHotEncoder()
          encoded_labels = encoder.fit_transform(labels).toarray()
+         adata = adata.copy()
          adata.obsm["encoded_perturbations"] = encoded_labels.astype(np.float32)

-         # Split the data in train, test and validation
          X = list(range(adata.n_obs))
          y = adata.obs[target_col]

@@ -199,368 +368,107 @@ class MLPClassifierSpace(PerturbationSpace):
              X_train, y_train, test_size=validation_split_size, stratify=y_train
          )

-         train_dataset = PLDataset(
+         train_dataset = JAXDataset(
              adata=adata[X_train], target_col="encoded_perturbations", label_col=target_col, layer_key=layer_key
          )
-         val_dataset = PLDataset(
+         val_dataset = JAXDataset(
              adata=adata[X_val], target_col="encoded_perturbations", label_col=target_col, layer_key=layer_key
          )
-         test_dataset = PLDataset(
+         test_dataset = JAXDataset(
              adata=adata[X_test], target_col="encoded_perturbations", label_col=target_col, layer_key=layer_key
-         ) # we don't need to pass y_test since the label selection is done inside
-
-         # Fix class unbalance (likely to happen in perturbation datasets)
-         # Usually control cells are overrepresented such that predicting control all time would give good results
-         # Cells with rare perturbations are sampled more
-         train_weights = 1 / (1 + torch.sum(torch.tensor(train_dataset.labels), dim=1))
-         train_sampler = WeightedRandomSampler(train_weights, len(train_weights))
-
-         self.train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, num_workers=4)
-         self.test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
-         self.valid_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
-
-         # Define the network
-         sizes = [adata.n_vars] + hidden_dim + [n_classes]
-         self.net = MLP(sizes=sizes, dropout=dropout, batch_norm=batch_norm)
-
-         # Define a dataset that gathers all the data and dataloader for getting embeddings
-         total_dataset = PLDataset(
-             adata=adata, target_col="encoded_perturbations", label_col=target_col, layer_key=layer_key
-         )
-         self.entire_dataset = DataLoader(total_dataset, batch_size=batch_size * 2, shuffle=False, num_workers=0)
-
-         # Save adata observations for embedding annotations in get_embeddings
-         self.adata_obs = adata.obs.reset_index(drop=True)
-
-         self.trainer = Trainer(
-             min_epochs=1,
-             max_epochs=max_epochs,
-             check_val_every_n_epoch=val_epochs_check,
-             callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=patience)],
-             devices="auto",
-             accelerator="auto",
-         )
-
-         self.mlp = PerturbationClassifier(model=self.net, batch_size=self.train_dataloader.batch_size)
-
-         self.trainer.fit(model=self.mlp, train_dataloaders=self.train_dataloader, val_dataloaders=self.valid_dataloader)
-         self.trainer.test(model=self.mlp, dataloaders=self.test_dataloader)
-
-         # Obtain cell embeddings
-         with torch.no_grad():
-             self.mlp.eval()
-             for dataset_count, batch in enumerate(self.entire_dataset):
-                 emb, y = self.mlp.get_embeddings(batch)
-                 emb = torch.squeeze(emb)
-                 batch_adata = AnnData(X=emb.cpu().numpy())
-                 batch_adata.obs["perturbations"] = y
-                 if dataset_count == 0:
-                     pert_adata = batch_adata
-                 else:
-                     pert_adata = batch_adata if dataset_count == 0 else anndata.concat([pert_adata, batch_adata])
-
-         # Add .obs annotations to the pert_adata. Because shuffle=False and num_workers=0, the order of the data is stable
-         # and we can just add the annotations from the original AnnData object
-         pert_adata.obs = pert_adata.obs.reset_index(drop=True)
-         if "perturbations" in self.adata_obs.columns:
-             self.adata_obs = self.adata_obs.drop("perturbations", axis=1)
-         obs_subset = self.adata_obs.iloc[: len(pert_adata.obs)].copy()
-         for col in obs_subset.columns:
-             if col not in ["perturbations", "encoded_perturbations"]:
-                 pert_adata.obs[col] = obs_subset[col].values
-
-         return pert_adata
-
-     def load(self, adata, **kwargs):
-         """This method is deprecated and will be removed in the future. Please use the compute method instead."""
-         raise DeprecationWarning(
-             "The load method is deprecated and will be removed in the future. Please use the compute method instead."
-         )
-
-     def train(self, **kwargs):
-         """This method is deprecated and will be removed in the future. Please use the compute method instead."""
-         raise DeprecationWarning(
-             "The train method is deprecated and will be removed in the future. Please use the compute method instead."
          )
-
-     def get_embeddings(self, **kwargs):
-         """This method is deprecated and will be removed in the future. Please use the compute method instead."""
-         raise DeprecationWarning(
-             "The get_embeddings method is deprecated and will be removed in the future. Please use the compute method instead."
+         total_dataset = JAXDataset(
+             adata=adata, target_col="encoded_perturbations", label_col=target_col, layer_key=layer_key
          )

+         rng = random.PRNGKey(seed)
+         rng, init_rng, train_rng = random.split(rng, 3)

- class MLP(torch.nn.Module):
-     """A multilayer perceptron with ReLU activations, optional Dropout and optional BatchNorm."""
-
-     def __init__(
-         self,
-         sizes: list[int],
-         dropout: float = 0.0,
-         batch_norm: bool = True,
-         layer_norm: bool = False,
-         last_layer_act: str = "linear",
-     ) -> None:
-         """Multilayer perceptron with ReLU activations, optional Dropout and optional BatchNorm.
-
-         Args:
-             sizes: size of layers.
-             dropout: Dropout probability.
-             batch_norm: specifies if batch norm should be applied.
-             layer_norm: specifies if layer norm should be applied, as commonly used in Transformers.
-             last_layer_act: activation function of last layer.
-         """
-         super().__init__()
-         layers = []
-         for s in range(len(sizes) - 1):
-             layers += [
-                 torch.nn.Linear(sizes[s], sizes[s + 1]),
-                 torch.nn.BatchNorm1d(sizes[s + 1]) if batch_norm and s < len(sizes) - 2 else None,
-                 torch.nn.LayerNorm(sizes[s + 1]) if layer_norm and s < len(sizes) - 2 and not batch_norm else None,
-                 torch.nn.ReLU(),
-                 torch.nn.Dropout(dropout) if s < len(sizes) - 2 else None,
-             ]
-
-         layers = [layer for layer in layers if layer is not None][:-1]
-         self.activation = last_layer_act
-         if self.activation == "linear":
-             pass
-         elif self.activation == "ReLU":
-             self.relu = torch.nn.ReLU()
-         else:
-             raise ValueError("last_layer_act must be one of 'linear' or 'ReLU'")
-
-         self.network = torch.nn.Sequential(*layers)
-
-         self.network.apply(init_weights)
-
-         self.sizes = sizes
-         self.batch_norm = batch_norm
-         self.layer_norm = layer_norm
-         self.last_layer_act = last_layer_act
-
-     def forward(self, x) -> torch.Tensor:
-         if self.activation == "ReLU":
-             return self.relu(self.network(x))
-         return self.network(x)
-
-     def embedding(self, x) -> torch.Tensor:
-         for layer in self.network[:-1]:
-             x = layer(x)
-         return x
-
-
- def init_weights(m):
-     if isinstance(m, torch.nn.Linear):
-         torch.nn.init.kaiming_uniform_(m.weight)
-         m.bias.data.fill_(0.01)
-
-
- class PLDataset(Dataset):
-     """Dataset for perturbation classification.
-
-     Needed for training a model that classifies the perturbed cells and takes as perturbation embedding the second to last layer.
-     """
-
-     def __init__(
-         self,
-         adata: np.array,
-         target_col: str = "perturbations",
-         label_col: str = "perturbations",
-         layer_key: str = None,
-     ):
-         """PyTorch lightning Dataset for perturbation classification.
-
-         Args:
-             adata: AnnData object with observations and labels.
-             target_col: key with the perturbation labels numerically encoded.
-             label_col: key with the perturbation labels.
-             layer_key: key of the layer to be used as data, otherwise .X.
-         """
-         if layer_key:
-             self.data = adata.layers[layer_key]
-         else:
-             self.data = adata.X
-
-         if target_col in adata.obs.columns:
-             self.labels = adata.obs[target_col]
-         elif target_col in adata.obsm:
-             self.labels = adata.obsm[target_col]
-         else:
-             raise ValueError(f"Target column {target_col} not found in obs or obsm")
-
-         self.pert_labels = adata.obs[label_col]
-
-     def __len__(self):
-         return self.data.shape[0]
-
-     def __getitem__(self, idx):
-         """Returns a sample and corresponding perturbations applied (labels)."""
-         sample = to_dense(self.data[idx]).squeeze() if scipy.sparse.issparse(self.data) else self.data[idx]
-         num_label = self.labels.iloc[idx] if hasattr(self.labels, "iloc") else self.labels[idx]
-         str_label = self.pert_labels.iloc[idx]
-
-         return sample, num_label, str_label
+         sizes = [adata.n_vars] + hidden_dim + [n_classes]
+         model = MLP(sizes=sizes, dropout=dropout, batch_norm=batch_norm)

+         state = create_train_state(init_rng, model, (adata.n_vars,), lr)

- class PerturbationClassifier(LightningModule):
-     def __init__(
-         self,
-         model: torch.nn.Module,
-         batch_size: int,
-         layers: list = [512], # noqa
-         dropout: float = 0.0,
-         batch_norm: bool = True,
-         layer_norm: bool = False,
-         last_layer_act: str = "linear",
-         lr=1e-4,
-         seed=42,
-     ):
-         """Perturbation Classifier.
+         # Create weighted sampling for class imbalance
+         weights = 1.0 / (1.0 + jnp.sum(train_dataset.labels, axis=1))
+         weights = weights / jnp.sum(weights)

-         Args:
-             model: model to be trained
-             batch_size: batch size
-             layers: list of layers of the MLP
-             dropout: dropout probability
-             batch_norm: whether to apply batch norm
-             layer_norm: whether to apply layer norm
-             last_layer_act: activation function of last layer
-             lr: learning rate
-             seed: random seed.
-         """
-         super().__init__()
-         self.batch_size = batch_size
-         self.save_hyperparameters()
-         if model:
-             self.net = model
-         else:
-             self._create_model()
-
-     def _create_model(self):
-         self.net = MLP(
-             sizes=self.hparams.layers,
-             dropout=self.hparams.dropout,
-             batch_norm=self.hparams.batch_norm,
-             layer_norm=self.hparams.layer_norm,
-             last_layer_act=self.hparams.last_layer_act,
+         n_batches_per_epoch = len(train_dataset) // batch_size
+         train_batches = create_batched_indices(
+             len(train_dataset), train_rng, batch_size, max_epochs * n_batches_per_epoch, weights
          )

-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         """Forward pass through the network.
+         best_val_loss = float("inf")
+         patience_counter = 0

-         Args:
-             x: Input tensor
+         for epoch in range(max_epochs):
+             epoch_train_loss = 0

-         Returns:
-             Network output tensor
-         """
-         x = self.net(x)
-         return x
+             epoch_start = epoch * n_batches_per_epoch
+             epoch_end = (epoch + 1) * n_batches_per_epoch
+             epoch_batches = train_batches[epoch_start:epoch_end]

-     def configure_optimizers(self) -> optim.Adam:
-         """Configure optimizer for the model.
+             for _n_train_batches, batch_indices in enumerate(epoch_batches, 1):
+                 rng, step_rng = random.split(rng)
+                 batch_data, batch_labels, *_ = train_dataset.get_batch(batch_indices)
+                 state, loss = train_step(state, (batch_data, batch_labels), step_rng)
+                 epoch_train_loss += loss

-         Returns:
-             Adam optimizer with weight decay
-         """
-         optimizer = optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=0.1)
-         return optimizer
-
-     def training_step(self, batch: tuple[torch.Tensor, torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
-         """Perform a training step.
-
-         Args:
-             batch: Tuple of (input, target, metadata)
-             batch_idx: Index of the current batch
-
-         Returns:
-             Loss value
-         """
-         x, y, _ = batch
-         x = x.to(torch.float32)
-
-         y_hat = self.forward(x)
-
-         y = torch.argmax(y, dim=1)
-         y_hat = y_hat.squeeze()
+             if (epoch + 1) % val_epochs_check == 0:
+                 val_losses = []
+                 for i in range(0, len(val_dataset), batch_size):
+                     val_indices = jnp.arange(i, min(i + batch_size, len(val_dataset)))
+                     val_batch_data, val_batch_labels, _ = val_dataset.get_batch(val_indices)
+                     val_loss = val_step(state, (val_batch_data, val_batch_labels))
+                     val_losses.append(val_loss)

-         loss = torch.nn.functional.cross_entropy(y_hat, y)
-         self.log("train_loss", loss, prog_bar=True, batch_size=self.batch_size)
+                 avg_val_loss = jnp.mean(jnp.array(val_losses))

-         return loss
-
-     def validation_step(self, batch: tuple[torch.Tensor, torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
-         """Perform a validation step.
-
-         Args:
-             batch: Tuple of (input, target, metadata)
-             batch_idx: Index of the current batch
-
-         Returns:
-             Loss value
-         """
-         x, y, _ = batch
-         x = x.to(torch.float32)
-
-         y_hat = self.forward(x)
-
-         y = torch.argmax(y, dim=1)
-         y_hat = y_hat.squeeze()
-
-         loss = torch.nn.functional.cross_entropy(y_hat, y)
-         self.log("val_loss", loss, prog_bar=True, batch_size=self.batch_size)
-
-         return loss
-
-     def test_step(self, batch: tuple[torch.Tensor, torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
-         """Perform a test step.
-
-         Args:
-             batch: Tuple of (input, target, metadata)
-             batch_idx: Index of the current batch
-
-         Returns:
-             Loss value
-         """
-         x, y, _ = batch
-         x = x.to(torch.float32)
+                 if avg_val_loss < best_val_loss:
+                     best_val_loss = avg_val_loss
+                     patience_counter = 0
+                 else:
+                     patience_counter += 1

-         y_hat = self.forward(x)
+                 if patience_counter >= patience:
+                     break

-         y = torch.argmax(y, dim=1)
-         y_hat = y_hat.squeeze()
+         # Test evaluation
+         test_losses = []
+         for i in range(0, len(test_dataset), batch_size):
+             test_indices = jnp.arange(i, min(i + batch_size, len(test_dataset)))
+             test_batch_data, test_batch_labels, _ = test_dataset.get_batch(test_indices)
+             test_loss = val_step(state, (test_batch_data, test_batch_labels))
+             test_losses.append(test_loss)

-         loss = torch.nn.functional.cross_entropy(y_hat, y)
-         self.log("test_loss", loss, prog_bar=True, batch_size=self.batch_size)
+         # Extract embeddings
+         embeddings_list = []
+         labels_list = []

-         return loss
+         for i in range(0, len(total_dataset), batch_size * 2):
+             indices = jnp.arange(i, min(i + batch_size * 2, len(total_dataset)))
+             batch_data, _, batch_pert_labels = total_dataset.get_batch(indices)
+             batch_embeddings = get_embeddings(state, batch_data)

-     def embedding(self, x: torch.Tensor) -> torch.Tensor:
-         """Extract embeddings from input features.
+             embeddings_list.append(batch_embeddings)
+             labels_list.extend(batch_pert_labels)

-         Args:
-             x: Input tensor of shape [Batch, SeqLen, 1]
+         all_embeddings = jnp.concatenate(embeddings_list, axis=0)

-         Returns:
-             Embedded representation of the input
-         """
-         x = self.net.embedding(x)
-         return x
+         pert_adata = AnnData(X=np.array(all_embeddings))
+         pert_adata.obs["perturbations"] = labels_list

-     def get_embeddings(
-         self, batch: tuple[torch.Tensor, torch.Tensor, torch.Tensor]
-     ) -> tuple[torch.Tensor, torch.Tensor]:
-         """Extract embeddings from a batch.
+         adata_obs = adata.obs.reset_index(drop=True)
+         if "perturbations" in adata_obs.columns:
+             adata_obs = adata_obs.drop("perturbations", axis=1)

-         Args:
-             batch: Tuple of (input, target, metadata)
+         obs_subset = adata_obs.iloc[: len(pert_adata.obs)].copy()
+         cols_to_add = [col for col in obs_subset.columns if col not in ["perturbations", "encoded_perturbations"]]
+         new_cols_data = {col: obs_subset[col].values for col in cols_to_add}

-         Returns:
-             Tuple of (embeddings, metadata)
-         """
-         x, _, y = batch
-         x = x.to(torch.float32)
+         if new_cols_data:
+             pert_adata.obs = pd.concat(
+                 [pert_adata.obs, pd.DataFrame(new_cols_data, index=pert_adata.obs.index)], axis=1
+             )

-         embedding = self.embedding(x)
-         return embedding, y
+         return pert_adata
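
The removed WeightedRandomSampler is replaced by create_batched_indices, which draws indices with jax.random.choice and an optional probability vector. The short self-contained sketch below illustrates that sampling idea with a made-up inverse-class-frequency weighting, not the exact weights computed in compute:

import jax.numpy as jnp
from jax import random

# Hypothetical labels: class 0 makes up 90% of the cells.
labels = jnp.array([0] * 90 + [1] * 10)

# One common weighting: inverse class frequency, normalised to sum to 1.
class_counts = jnp.bincount(labels, length=2)
weights = 1.0 / class_counts[labels]
weights = weights / weights.sum()

# Each draw returns a batch of indices; rare-class cells are picked far more
# often than their 10% share of the data (labels[idx].mean() is roughly 0.5).
idx = random.choice(random.PRNGKey(0), labels.shape[0], shape=(64,), p=weights)

create_batched_indices applies the same idea per batch, splitting the PRNG key each time so that the sequence of batches is reproducible for a fixed seed.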