PyPI - flaxdiff - Versions diffs - 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

flaxdiff 0.1.5 (py3-none-any.whl) → 0.1.6 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

flaxdiff/models/attention.py +12 -11
flaxdiff/models/autoencoder/__init__.py +2 -0
flaxdiff/models/autoencoder/autoencoder.py +7 -2
flaxdiff/models/autoencoder/diffusers.py +3 -0
flaxdiff/models/autoencoder/simple_autoenc.py +26 -0
flaxdiff/models/common.py +89 -10
flaxdiff/models/simple_unet.py +4 -75
flaxdiff/trainer/__init__.py +1 -183
flaxdiff/trainer/autoencoder_trainer.py +182 -0
flaxdiff/trainer/diffusion_trainer.py +202 -0
{flaxdiff-0.1.5.dist-info → flaxdiff-0.1.6.dist-info}/METADATA +3 -1
{flaxdiff-0.1.5.dist-info → flaxdiff-0.1.6.dist-info}/RECORD +14 -11
{flaxdiff-0.1.5.dist-info → flaxdiff-0.1.6.dist-info}/WHEEL +0 -0
{flaxdiff-0.1.5.dist-info → flaxdiff-0.1.6.dist-info}/top_level.txt +0 -0

flaxdiff/models/attention.py CHANGED Viewed

@@ -5,7 +5,8 @@ Some Code ported from https://github.com/huggingface/diffusers/blob/main/src/dif
 import jax
 import jax.numpy as jnp
 from flax import linen as nn
-from typing import Dict, Callable, Sequence, Any, Union
+from typing import Dict, Callable, Sequence, Any, Union, Tuple, Optional
+from flax.typing import Dtype, PrecisionLike
 import einops
 import functools
 import math
@@ -18,8 +19,8 @@ class EfficientAttention(nn.Module):
     query_dim: int
     heads: int = 4
     dim_head: int = 64
-    dtype: Any = jnp.float32
-    precision: Any = jax.lax.Precision.HIGHEST
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     use_bias: bool = True
     kernel_init: Callable = lambda : kernel_init(1.0)
@@ -109,8 +110,8 @@ class NormalAttention(nn.Module):
     query_dim: int
     heads: int = 4
     dim_head: int = 64
-    dtype: Any = jnp.float32
-    precision: Any = jax.lax.Precision.HIGHEST
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     use_bias: bool = True
     kernel_init: Callable = lambda : kernel_init(1.0)
@@ -166,8 +167,8 @@ class BasicTransformerBlock(nn.Module):
     query_dim: int
     heads: int = 4
     dim_head: int = 64
-    dtype: Any = jnp.float32
-    precision: Any = jax.lax.Precision.HIGHEST
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     use_bias: bool = True
     kernel_init: Callable = lambda : kernel_init(1.0)
     use_flash_attention:bool = False
@@ -286,8 +287,8 @@ class BasicTransformerBlock(nn.Module):
     query_dim: int
     heads: int = 4
     dim_head: int = 64
-    dtype: Any = jnp.float32
-    precision: Any = jax.lax.Precision.HIGHEST
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     use_bias: bool = True
     kernel_init: Callable = lambda : kernel_init(1.0)
     use_flash_attention:bool = False
@@ -346,8 +347,8 @@ class TransformerBlock(nn.Module):
     heads: int = 4
     dim_head: int = 32
     use_linear_attention: bool = True
-    dtype: Any = jnp.float32
-    precision: Any = jax.lax.Precision.HIGH
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     use_projection: bool = False
     use_flash_attention:bool = True
     use_self_and_cross:bool = False

flaxdiff/models/autoencoder/__init__.py CHANGED Viewed

@@ -0,0 +1,2 @@
+from .autoencoder import AutoEncoder
+from .diffusers import StableDiffusionVAE

flaxdiff/models/autoencoder/autoencoder.py CHANGED Viewed

@@ -6,9 +6,14 @@ import einops
 from ..common import kernel_init, ConvLayer, Upsample, Downsample, PixelShuffle
-class AutoEncoder:
+class AutoEncoder():
     def encode(self, x: jnp.ndarray, **kwargs) -> jnp.ndarray:
         raise NotImplementedError
     def decode(self, z: jnp.ndarray, **kwargs) -> jnp.ndarray:
-        raise NotImplementedError
+        raise NotImplementedError
+    def __call__(self, x: jnp.ndarray):
+        latents = self.encode(x)
+        reconstructions = self.decode(latents)
+        return reconstructions

flaxdiff/models/autoencoder/diffusers.py CHANGED Viewed

@@ -5,6 +5,9 @@ from .autoencoder import AutoEncoder
 """
 This module contains an Autoencoder implementation which uses the Stable Diffusion VAE model from the HuggingFace Diffusers library.
+The actual model was not trained by me, but was taken from the HuggingFace model hub.
+I have only implemented the wrapper around the diffusers pipeline to make it compatible with our library
+All credits for the model go to the developers of Stable Diffusion VAE and all credits for the pipeline go to the developers of the Diffusers library.
 """
 class StableDiffusionVAE(AutoEncoder):

flaxdiff/models/autoencoder/simple_autoenc.py ADDED Viewed

@@ -0,0 +1,26 @@
+from typing import Any, List, Optional, Callable
+import jax
+import flax.linen as nn
+from jax import numpy as jnp
+from flax.typing import Dtype, PrecisionLike
+from .autoencoder import AutoEncoder
+class SimpleAutoEncoder(AutoEncoder):
+    """Configuration holder for a simple autoencoder built on the AutoEncoder interface.
+    This class does not define `encode`/`decode` itself; `__call__` relies on whatever
+    the class hierarchy provides for them.
+    """
+    # Number of channels in the latent representation.
+    latent_channels: int
+    # Feature width per resolution level (one entry per level).
+    feature_depths: List[int]=[64, 128, 256, 512]
+    # Per-level attention settings. No trailing comma here: a trailing comma would
+    # wrap the list in a one-element tuple and contradict the `list` annotation.
+    attention_configs:list=[{"heads":8}, {"heads":8}, {"heads":8}, {"heads":8}]
+    num_res_blocks: int=2
+    # Plain int default (a trailing comma previously turned this into the tuple `(1,)`).
+    num_middle_res_blocks:int=1
+    activation:Callable = jax.nn.swish
+    norm_groups:int=8
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
+    # def encode(self, x: jnp.ndarray):
+    @nn.compact
+    def __call__(self, x: jnp.ndarray):
+        """Encode `x` to latents, decode them again, and return the reconstruction."""
+        latents = self.encode(x)
+        reconstructions = self.decode(latents)
+        return reconstructions

flaxdiff/models/common.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import jax.numpy as jnp
 import jax
 from flax import linen as nn
+from typing import Optional, Any, Callable, Sequence, Union
+from flax.typing import Dtype, PrecisionLike
 from typing import Dict, Callable, Sequence, Any, Union
 import einops
@@ -18,8 +20,9 @@ class WeightStandardizedConv(nn.Module):
     kernel_size: Sequence[int] = 3
     strides: Union[None, int, Sequence[int]] = 1
     padding: Any = 1
-    dtype: Any = jnp.float32
-    param_dtype: Any = jnp.float32
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
+    param_dtype: Optional[Dtype] = None
     @nn.compact
     def __call__(self, x):
@@ -120,8 +123,8 @@ class SeparableConv(nn.Module):
     use_bias:bool=False
     kernel_init:Callable=kernel_init(1.0)
     padding:str="SAME"
-    dtype: Any = jnp.bfloat16
-    precision: Any = jax.lax.Precision.HIGH
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     @nn.compact
     def __call__(self, x):
@@ -149,8 +152,8 @@ class ConvLayer(nn.Module):
     kernel_size:tuple=(3, 3)
     strides:tuple=(1, 1)
     kernel_init:Callable=kernel_init(1.0)
-    dtype: Any = jnp.bfloat16
-    precision: Any = jax.lax.Precision.HIGH
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     def setup(self):
         # conv_type can be "conv", "separable", "conv_transpose"
@@ -199,8 +202,8 @@ class Upsample(nn.Module):
     features:int
     scale:int
     activation:Callable=jax.nn.swish
-    dtype: Any = jnp.bfloat16
-    precision: Any = jax.lax.Precision.HIGH
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     @nn.compact
     def __call__(self, x, residual=None):
@@ -224,8 +227,8 @@ class Downsample(nn.Module):
     features:int
     scale:int
     activation:Callable=jax.nn.swish
-    dtype: Any = jnp.bfloat16
-    precision: Any = jax.lax.Precision.HIGH
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     @nn.compact
     def __call__(self, x, residual=None):
@@ -248,3 +251,79 @@ def l2norm(t, axis=1, eps=1e-12):
     denom = jnp.clip(jnp.linalg.norm(t, ord=2, axis=axis, keepdims=True), eps)
     out = t/denom
     return (out)
+# Residual block: two (RMSNorm -> activation -> ConvLayer) stages with a projected
+# time embedding added between them, plus a skip connection from the block input.
+# NOTE(review): `padding`, `direction`, `res` and `norm_groups` are declared below but
+# are not read anywhere in `__call__` (norm_groups only appears in commented-out code).
+class ResidualBlock(nn.Module):
+    conv_type:str
+    features:int
+    kernel_size:tuple=(3, 3)
+    strides:tuple=(1, 1)
+    padding:str="SAME"
+    activation:Callable=jax.nn.swish
+    direction:str=None
+    res:int=2
+    norm_groups:int=8
+    kernel_init:Callable=kernel_init(1.0)
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
+    # Args:
+    #   x: input feature map (assumed channels-last, e.g. (batch, H, W, C) — TODO confirm).
+    #   temb: time embedding, projected to `features` and added after the first conv.
+    #   textemb: accepted but not used in this method.
+    #   extra_features: optional array concatenated onto the output along the last axis.
+    # Returns the block output (with `extra_features` appended when given).
+    @nn.compact
+    def __call__(self, x:jax.Array, temb:jax.Array, textemb:jax.Array=None, extra_features:jax.Array=None):
+        # Keep the unmodified input for the skip connection at the end.
+        residual = x
+        # out = nn.GroupNorm(self.norm_groups)(x)
+        out = nn.RMSNorm()(x)
+        out = self.activation(out)
+        out = ConvLayer(
+            self.conv_type,
+            features=self.features,
+            kernel_size=self.kernel_size,
+            strides=self.strides,
+            kernel_init=self.kernel_init,
+            name="conv1",
+            dtype=self.dtype,
+            precision=self.precision
+        )(out)
+        # Project the time embedding to the conv's channel count so it can be added to `out`.
+        temb = nn.DenseGeneral(
+            features=self.features,
+            name="temb_projection",
+            dtype=self.dtype,
+            precision=self.precision)(temb)
+        # Insert two singleton axes at position 1 so the embedding broadcasts over them
+        # (presumably the spatial axes — confirm against callers).
+        temb = jnp.expand_dims(jnp.expand_dims(temb, 1), 1)
+        # scale, shift = jnp.split(temb, 2, axis=-1)
+        # out = out * (1 + scale) + shift
+        out = out + temb
+        # out = nn.GroupNorm(self.norm_groups)(out)
+        out = nn.RMSNorm()(out)
+        out = self.activation(out)
+        out = ConvLayer(
+            self.conv_type,
+            features=self.features,
+            kernel_size=self.kernel_size,
+            strides=self.strides,
+            kernel_init=self.kernel_init,
+            name="conv2",
+            dtype=self.dtype,
+            precision=self.precision
+        )(out)
+        # Only when the input and output shapes differ: pass the skip path through a
+        # 1x1 ConvLayer producing `features` channels before the addition below.
+        if residual.shape != out.shape:
+            residual = ConvLayer(
+                self.conv_type,
+                features=self.features,
+                kernel_size=(1, 1),
+                strides=1,
+                kernel_init=self.kernel_init,
+                name="residual_conv",
+                dtype=self.dtype,
+                precision=self.precision
+            )(residual)
+        out = out + residual
+        # Optionally append caller-supplied features along the last axis.
+        out = jnp.concatenate([out, extra_features], axis=-1) if extra_features is not None else out
+        return out

flaxdiff/models/simple_unet.py CHANGED Viewed

@@ -1,83 +1,12 @@
 import jax
 import jax.numpy as jnp
 from flax import linen as nn
-from typing import Dict, Callable, Sequence, Any, Union
+from flax.typing import Dtype, PrecisionLike
+from typing import Dict, Callable, Sequence, Any, Union, Optional
 import einops
 from .common import kernel_init, ConvLayer, Downsample, Upsample, FourierEmbedding, TimeProjection
 from .attention import TransformerBlock
-class ResidualBlock(nn.Module):
-    conv_type:str
-    features:int
-    kernel_size:tuple=(3, 3)
-    strides:tuple=(1, 1)
-    padding:str="SAME"
-    activation:Callable=jax.nn.swish
-    direction:str=None
-    res:int=2
-    norm_groups:int=8
-    kernel_init:Callable=kernel_init(1.0)
-    dtype: Any = jnp.float32
-    precision: Any = jax.lax.Precision.HIGHEST
-    @nn.compact
-    def __call__(self, x:jax.Array, temb:jax.Array, textemb:jax.Array=None, extra_features:jax.Array=None):
-        residual = x
-        out = nn.GroupNorm(self.norm_groups)(x)
-        out = self.activation(out)
-        out = ConvLayer(
-            self.conv_type,
-            features=self.features,
-            kernel_size=self.kernel_size,
-            strides=self.strides,
-            kernel_init=self.kernel_init,
-            name="conv1",
-            dtype=self.dtype,
-            precision=self.precision
-        )(out)
-        temb = nn.DenseGeneral(
-            features=self.features,
-            name="temb_projection",
-            dtype=self.dtype,
-            precision=self.precision)(temb)
-        temb = jnp.expand_dims(jnp.expand_dims(temb, 1), 1)
-        # scale, shift = jnp.split(temb, 2, axis=-1)
-        # out = out * (1 + scale) + shift
-        out = out + temb
-        out = nn.GroupNorm(self.norm_groups)(out)
-        out = self.activation(out)
-        out = ConvLayer(
-            self.conv_type,
-            features=self.features,
-            kernel_size=self.kernel_size,
-            strides=self.strides,
-            kernel_init=self.kernel_init,
-            name="conv2",
-            dtype=self.dtype,
-            precision=self.precision
-        )(out)
-        if residual.shape != out.shape:
-            residual = ConvLayer(
-                self.conv_type,
-                features=self.features,
-                kernel_size=(1, 1),
-                strides=1,
-                kernel_init=self.kernel_init,
-                name="residual_conv",
-                dtype=self.dtype,
-                precision=self.precision
-            )(residual)
-        out = out + residual
-        out = jnp.concatenate([out, extra_features], axis=-1) if extra_features is not None else out
-        return out
 class Unet(nn.Module):
     output_channels:int=3
     emb_features:int=64*4,
@@ -87,8 +16,8 @@ class Unet(nn.Module):
     num_middle_res_blocks:int=1,
     activation:Callable = jax.nn.swish
     norm_groups:int=8
-    dtype: Any = jnp.bfloat16
-    precision: Any = jax.lax.Precision.HIGH
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     @nn.compact
     def __call__(self, x, temb, textcontext):

flaxdiff/trainer/__init__.py CHANGED Viewed

@@ -1,184 +1,2 @@
-from flax import linen as nn
-import jax
-from typing import Callable
-from dataclasses import field
-import jax.numpy as jnp
-import optax
-from jax.sharding import Mesh, PartitionSpec as P
-from jax.experimental.shard_map import shard_map
-from typing import Dict, Callable, Sequence, Any, Union, Tuple
-from ..schedulers import NoiseScheduler
-from ..predictors import DiffusionPredictionTransform, EpsilonPredictionTransform
-from flaxdiff.utils import RandomMarkovState
 from .simple_trainer import SimpleTrainer, SimpleTrainState, Metrics
-class TrainState(SimpleTrainState):
-    rngs: jax.random.PRNGKey
-    ema_params: dict
-    def apply_ema(self, decay: float = 0.999):
-        new_ema_params = jax.tree_util.tree_map(
-            lambda ema, param: decay * ema + (1 - decay) * param,
-            self.ema_params,
-            self.params,
-        )
-        return self.replace(ema_params=new_ema_params)
-class DiffusionTrainer(SimpleTrainer):
-    noise_schedule: NoiseScheduler
-    model_output_transform: DiffusionPredictionTransform
-    ema_decay: float = 0.999
-    def __init__(self,
-                 model: nn.Module,
-                 input_shapes: Dict[str, Tuple[int]],
-                 optimizer: optax.GradientTransformation,
-                 noise_schedule: NoiseScheduler,
-                 rngs: jax.random.PRNGKey,
-                 unconditional_prob: float = 0.2,
-                 name: str = "Diffusion",
-                 model_output_transform: DiffusionPredictionTransform = EpsilonPredictionTransform(),
-                 **kwargs
-                 ):
-        super().__init__(
-            model=model,
-            input_shapes=input_shapes,
-            optimizer=optimizer,
-            rngs=rngs,
-            name=name,
-            **kwargs
-        )
-        self.noise_schedule = noise_schedule
-        self.model_output_transform = model_output_transform
-        self.unconditional_prob = unconditional_prob
-    def generate_states(
-        self,
-        optimizer: optax.GradientTransformation,
-        rngs: jax.random.PRNGKey,
-        existing_state: dict = None,
-        existing_best_state: dict = None,
-        model: nn.Module = None,
-        param_transforms: Callable = None
-    ) -> Tuple[TrainState, TrainState]:
-        print("Generating states for DiffusionTrainer")
-        rngs, subkey = jax.random.split(rngs)
-        if existing_state == None:
-            input_vars = self.get_input_ones()
-            params = model.init(subkey, **input_vars)
-            new_state = {"params": params, "ema_params": params}
-        else:
-            new_state = existing_state
-        if param_transforms is not None:
-            params = param_transforms(params)
-        state = TrainState.create(
-            apply_fn=model.apply,
-            params=new_state['params'],
-            ema_params=new_state['ema_params'],
-            tx=optimizer,
-            rngs=rngs,
-            metrics=Metrics.empty()
-        )
-        if existing_best_state is not None:
-            best_state = state.replace(
-                params=existing_best_state['params'], ema_params=existing_best_state['ema_params'])
-        else:
-            best_state = state
-        return state, best_state
-    def _define_train_step(self, batch_size, null_labels_seq, text_embedder):
-        noise_schedule: NoiseScheduler = self.noise_schedule
-        model = self.model
-        model_output_transform = self.model_output_transform
-        loss_fn = self.loss_fn
-        unconditional_prob = self.unconditional_prob
-        # Determine the number of unconditional samples
-        num_unconditional = int(batch_size * unconditional_prob)
-        nS, nC = null_labels_seq.shape
-        null_labels_seq = jnp.broadcast_to(
-            null_labels_seq, (batch_size, nS, nC))
-        distributed_training = self.distributed_training
-        # @jax.jit
-        def train_step(train_state: TrainState, rng_state: RandomMarkovState, batch, local_device_index):
-            """Train for a single step."""
-            rng_state, subkey = rng_state.get_random_key()
-            subkey = jax.random.fold_in(subkey, local_device_index.reshape())
-            local_rng_state = RandomMarkovState(subkey)
-            images = batch['image']
-            # normalize image
-            images = (images - 127.5) / 127.5
-            output = text_embedder(
-                input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
-            label_seq = output.last_hidden_state
-            # Generate random probabilities to decide how much of this batch will be unconditional
-            label_seq = jnp.concat(
-                [null_labels_seq[:num_unconditional], label_seq[num_unconditional:]], axis=0)
-            noise_level, local_rng_state = noise_schedule.generate_timesteps(images.shape[0], local_rng_state)
-            local_rng_state, rngs = local_rng_state.get_random_key()
-            noise: jax.Array = jax.random.normal(rngs, shape=images.shape)
-            rates = noise_schedule.get_rates(noise_level)
-            noisy_images, c_in, expected_output = model_output_transform.forward_diffusion(
-                images, noise, rates)
-            def model_loss(params):
-                preds = model.apply(
-                    params, *noise_schedule.transform_inputs(noisy_images*c_in, noise_level), label_seq)
-                preds = model_output_transform.pred_transform(
-                    noisy_images, preds, rates)
-                nloss = loss_fn(preds, expected_output)
-                # nloss = jnp.mean(nloss, axis=1)
-                nloss *= noise_schedule.get_weights(noise_level)
-                nloss = jnp.mean(nloss)
-                loss = nloss
-                return loss
-            loss, grads = jax.value_and_grad(model_loss)(train_state.params)
-            if distributed_training:
-                grads = jax.lax.pmean(grads, "data")
-                loss = jax.lax.pmean(loss, "data")
-            train_state = train_state.apply_gradients(grads=grads)
-            train_state = train_state.apply_ema(self.ema_decay)
-            return train_state, loss, rng_state
-        if distributed_training:
-            train_step = shard_map(train_step, mesh=self.mesh, in_specs=(P(), P(), P('data'), P('data')),
-                                   out_specs=(P(), P(), P()))
-            train_step = jax.jit(train_step)
-        return train_step
-    def _define_compute_metrics(self):
-        @jax.jit
-        def compute_metrics(state: TrainState, expected, pred):
-            loss = jnp.mean(jnp.square(pred - expected))
-            metric_updates = state.metrics.single_from_model_output(loss=loss)
-            metrics = state.metrics.merge(metric_updates)
-            state = state.replace(metrics=metrics)
-            return state
-        return compute_metrics
-    def fit(self, data, steps_per_epoch, epochs):
-        null_labels_full = data['null_labels_full']
-        local_batch_size = data['local_batch_size']
-        text_embedder = data['model']
-        super().fit(data, steps_per_epoch, epochs, {
-            "batch_size": local_batch_size, "null_labels_seq": null_labels_full, "text_embedder": text_embedder})
+from .diffusion_trainer import DiffusionTrainer, TrainState

flaxdiff/trainer/autoencoder_trainer.py ADDED Viewed

@@ -0,0 +1,182 @@
+from flax import linen as nn
+import jax
+from typing import Callable
+from dataclasses import field
+import jax.numpy as jnp
+import optax
+from jax.sharding import Mesh, PartitionSpec as P
+from jax.experimental.shard_map import shard_map
+from typing import Dict, Callable, Sequence, Any, Union, Tuple
+from ..schedulers import NoiseScheduler
+from ..predictors import DiffusionPredictionTransform, EpsilonPredictionTransform
+from flaxdiff.utils import RandomMarkovState
+from .simple_trainer import SimpleTrainer, SimpleTrainState, Metrics
+from flaxdiff.models.autoencoder.autoencoder import AutoEncoder
+class AutoEncoderTrainer(SimpleTrainer):
+    """Trainer scaffold for autoencoder models, built on SimpleTrainer.
+    NOTE(review): `_define_train_step` still follows the diffusion recipe. It reads
+    `self.noise_schedule`, `self.model_output_transform`, `self.unconditional_prob`,
+    `self.autoencoder` and `self.ema_decay`, none of which this class assigns; confirm
+    whether SimpleTrainer provides them before relying on `fit`.
+    """
+    def __init__(self,
+                 model: nn.Module,
+                 input_shape: Tuple[int, int, int],
+                 latent_dim: int,
+                 spatial_scale: int,
+                 optimizer: optax.GradientTransformation,
+                 rngs: jax.random.PRNGKey,
+                 name: str = "Autoencoder",
+                 **kwargs
+                 ):
+        """Set up the base trainer with a single "image" input and record latent settings.
+        Args:
+            model: module to train.
+            input_shape: shape registered under the "image" key of `input_shapes`.
+            latent_dim: stored on the instance as `self.latent_dim`.
+            spatial_scale: stored on the instance as `self.spatial_scale`.
+            optimizer: optax gradient transformation passed to SimpleTrainer.
+            rngs: PRNG key passed to SimpleTrainer.
+            name: trainer name passed to SimpleTrainer.
+            **kwargs: forwarded unchanged to SimpleTrainer.
+        """
+        super().__init__(
+            model=model,
+            input_shapes={"image": input_shape},
+            optimizer=optimizer,
+            rngs=rngs,
+            name=name,
+            **kwargs
+        )
+        self.latent_dim = latent_dim
+        self.spatial_scale = spatial_scale
+    def generate_states(
+        self,
+        optimizer: optax.GradientTransformation,
+        rngs: jax.random.PRNGKey,
+        existing_state: dict = None,
+        existing_best_state: dict = None,
+        model: nn.Module = None,
+        param_transforms: Callable = None
+    ) -> Tuple["TrainState", "TrainState"]:
+        """Build the current and best train states.
+        Parameters are freshly initialised from `model` unless `existing_state` (a dict
+        with "params" and "ema_params") is given. Returns `(state, best_state)`.
+        """
+        # TrainState is not among this module's top-level imports, so import it here;
+        # the annotations in this class are quoted for the same reason.
+        from .diffusion_trainer import TrainState
+        print("Generating states for AutoEncoderTrainer")
+        rngs, subkey = jax.random.split(rngs)
+        if existing_state is None:
+            input_vars = self.get_input_ones()
+            params = model.init(subkey, **input_vars)
+            new_state = {"params": params, "ema_params": params}
+        else:
+            new_state = existing_state
+        if param_transforms is not None:
+            # Apply the transform to the params that are actually used below. Previously
+            # the result was discarded (and `params` was undefined for an existing state).
+            # NOTE(review): `ema_params` is left untransformed — confirm that is intended.
+            new_state = {**new_state, "params": param_transforms(new_state["params"])}
+        state = TrainState.create(
+            apply_fn=model.apply,
+            params=new_state['params'],
+            ema_params=new_state['ema_params'],
+            tx=optimizer,
+            rngs=rngs,
+            metrics=Metrics.empty()
+        )
+        if existing_best_state is not None:
+            best_state = state.replace(
+                params=existing_best_state['params'], ema_params=existing_best_state['ema_params'])
+        else:
+            best_state = state
+        return state, best_state
+    def _define_train_step(self, batch_size, null_labels_seq, text_embedder):
+        """Build and return the per-step training function.
+        Args:
+            batch_size: batch size used to size the unconditional slice and to broadcast
+                `null_labels_seq`.
+            null_labels_seq: 2-D null-label array, broadcast to `(batch_size, nS, nC)`.
+            text_embedder: callable taking `input_ids`/`attention_mask` whose result
+                exposes `last_hidden_state`.
+        """
+        # NOTE(review): the attributes read below are not assigned by this class.
+        noise_schedule: NoiseScheduler = self.noise_schedule
+        model = self.model
+        model_output_transform = self.model_output_transform
+        loss_fn = self.loss_fn
+        unconditional_prob = self.unconditional_prob
+        # Determine the number of unconditional samples
+        num_unconditional = int(batch_size * unconditional_prob)
+        nS, nC = null_labels_seq.shape
+        null_labels_seq = jnp.broadcast_to(
+            null_labels_seq, (batch_size, nS, nC))
+        distributed_training = self.distributed_training
+        autoencoder = self.autoencoder
+        # @jax.jit
+        def train_step(train_state: "TrainState", rng_state: RandomMarkovState, batch, local_device_index):
+            """Train for a single step."""
+            rng_state, subkey = rng_state.get_random_key()
+            # Fold the device index into the key so each device draws different randomness.
+            subkey = jax.random.fold_in(subkey, local_device_index.reshape())
+            local_rng_state = RandomMarkovState(subkey)
+            images = batch['image']
+            if autoencoder is not None:
+                # Convert the images to latent space
+                local_rng_state, rngs = local_rng_state.get_random_key()
+                images = autoencoder.encode(images, rngs)
+            else:
+                # normalize image
+                images = (images - 127.5) / 127.5
+            output = text_embedder(
+                input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
+            label_seq = output.last_hidden_state
+            # Replace the first `num_unconditional` label sequences with the null labels
+            label_seq = jnp.concat(
+                [null_labels_seq[:num_unconditional], label_seq[num_unconditional:]], axis=0)
+            noise_level, local_rng_state = noise_schedule.generate_timesteps(images.shape[0], local_rng_state)
+            local_rng_state, rngs = local_rng_state.get_random_key()
+            noise: jax.Array = jax.random.normal(rngs, shape=images.shape)
+            rates = noise_schedule.get_rates(noise_level)
+            noisy_images, c_in, expected_output = model_output_transform.forward_diffusion(
+                images, noise, rates)
+            def model_loss(params):
+                preds = model.apply(params, *noise_schedule.transform_inputs(noisy_images*c_in, noise_level), label_seq)
+                preds = model_output_transform.pred_transform(
+                    noisy_images, preds, rates)
+                nloss = loss_fn(preds, expected_output)
+                # nloss = jnp.mean(nloss, axis=1)
+                nloss *= noise_schedule.get_weights(noise_level)
+                nloss = jnp.mean(nloss)
+                loss = nloss
+                return loss
+            loss, grads = jax.value_and_grad(model_loss)(train_state.params)
+            if distributed_training:
+                # Average gradients and loss across the "data" mesh axis.
+                grads = jax.lax.pmean(grads, "data")
+                loss = jax.lax.pmean(loss, "data")
+            train_state = train_state.apply_gradients(grads=grads)
+            train_state = train_state.apply_ema(self.ema_decay)
+            return train_state, loss, rng_state
+        if distributed_training:
+            train_step = shard_map(train_step, mesh=self.mesh, in_specs=(P(), P(), P('data'), P('data')),
+                                   out_specs=(P(), P(), P()))
+            train_step = jax.jit(train_step)
+        return train_step
+    def _define_compute_metrics(self):
+        """Return a jitted function that merges the mean squared error into `state.metrics`."""
+        @jax.jit
+        def compute_metrics(state: "TrainState", expected, pred):
+            loss = jnp.mean(jnp.square(pred - expected))
+            metric_updates = state.metrics.single_from_model_output(loss=loss)
+            metrics = state.metrics.merge(metric_updates)
+            state = state.replace(metrics=metrics)
+            return state
+        return compute_metrics
+    def fit(self, data, steps_per_epoch, epochs):
+        """Run SimpleTrainer.fit, passing the train-step arguments taken from `data`.
+        `data` must contain 'null_labels_full', 'local_batch_size' and 'model' (the
+        text embedder).
+        """
+        null_labels_full = data['null_labels_full']
+        local_batch_size = data['local_batch_size']
+        text_embedder = data['model']
+        super().fit(data, steps_per_epoch, epochs, {
+            "batch_size": local_batch_size, "null_labels_seq": null_labels_full, "text_embedder": text_embedder})
+def boolean_string(s):
+    """Return `s` unchanged if it is a bool; otherwise return whether it equals 'True'."""
+    if isinstance(s, bool):
+        return s
+    # Only the exact, case-sensitive string 'True' counts as true.
+    return s == 'True'

flaxdiff/trainer/diffusion_trainer.py ADDED Viewed

@@ -0,0 +1,202 @@
+from flax import linen as nn
+import jax
+from typing import Callable
+from dataclasses import field
+import jax.numpy as jnp
+import optax
+from jax.sharding import Mesh, PartitionSpec as P
+from jax.experimental.shard_map import shard_map
+from typing import Dict, Callable, Sequence, Any, Union, Tuple
+from ..schedulers import NoiseScheduler
+from ..predictors import DiffusionPredictionTransform, EpsilonPredictionTransform
+from flaxdiff.utils import RandomMarkovState
+from .simple_trainer import SimpleTrainer, SimpleTrainState, Metrics
+from flaxdiff.models.autoencoder.autoencoder import AutoEncoder
+class TrainState(SimpleTrainState):
+    """Train state that additionally carries an `rngs` key and an EMA copy of the parameters."""
+    rngs: jax.random.PRNGKey
+    ema_params: dict
+    def apply_ema(self, decay: float = 0.999):
+        """Return a copy of this state whose `ema_params` are blended with the current `params`."""
+        def blend(averaged, current):
+            # Keep `decay` of the running average and take the remainder from `current`.
+            return decay * averaged + (1 - decay) * current
+        blended = jax.tree_util.tree_map(blend, self.ema_params, self.params)
+        return self.replace(ema_params=blended)
+class DiffusionTrainer(SimpleTrainer):
+    noise_schedule: NoiseScheduler
+    model_output_transform: DiffusionPredictionTransform
+    ema_decay: float = 0.999
+    def __init__(self,
+                 model: nn.Module,
+                 input_shapes: Dict[str, Tuple[int]],
+                 optimizer: optax.GradientTransformation,
+                 noise_schedule: NoiseScheduler,
+                 rngs: jax.random.PRNGKey,
+                 unconditional_prob: float = 0.2,
+                 name: str = "Diffusion",
+                 model_output_transform: DiffusionPredictionTransform = EpsilonPredictionTransform(),
+                 autoencoder: AutoEncoder = None,
+                 **kwargs
+                 ):
+        super().__init__(
+            model=model,
+            input_shapes=input_shapes,
+            optimizer=optimizer,
+            rngs=rngs,
+            name=name,
+            **kwargs
+        )
+        self.noise_schedule = noise_schedule
+        self.model_output_transform = model_output_transform
+        self.unconditional_prob = unconditional_prob
+        self.autoencoder = autoencoder
+    def generate_states(
+        self,
+        optimizer: optax.GradientTransformation,
+        rngs: jax.random.PRNGKey,
+        existing_state: dict = None,
+        existing_best_state: dict = None,
+        model: nn.Module = None,
+        param_transforms: Callable = None
+    ) -> Tuple[TrainState, TrainState]:
+        print("Generating states for DiffusionTrainer")
+        rngs, subkey = jax.random.split(rngs)
+        if existing_state == None:
+            input_vars = self.get_input_ones()
+            params = model.init(subkey, **input_vars)
+            new_state = {"params": params, "ema_params": params}
+        else:
+            new_state = existing_state
+        if param_transforms is not None:
+            params = param_transforms(params)
+        state = TrainState.create(
+            apply_fn=model.apply,
+            params=new_state['params'],
+            ema_params=new_state['ema_params'],
+            tx=optimizer,
+            rngs=rngs,
+            metrics=Metrics.empty()
+        )
+        if existing_best_state is not None:
+            best_state = state.replace(
+                params=existing_best_state['params'], ema_params=existing_best_state['ema_params'])
+        else:
+            best_state = state
+        return state, best_state
+    def _define_train_step(self, batch_size, null_labels_seq, text_embedder):
+        """Build the function that performs one training step.
+
+        Args:
+            batch_size: number of rows the null labels are broadcast to; also
+                multiplied by ``self.unconditional_prob`` to get the number of
+                unconditional rows.
+            null_labels_seq: 2-D array (its shape is unpacked as ``nS, nC``)
+                broadcast to ``(batch_size, nS, nC)``.
+            text_embedder: callable invoked with ``input_ids`` and
+                ``attention_mask``; its ``last_hidden_state`` is used as the
+                conditioning sequence.
+
+        Returns:
+            ``train_step(train_state, rng_state, batch, local_device_index)``.
+            It is wrapped in ``shard_map`` and ``jax.jit`` only when
+            ``self.distributed_training`` is truthy; otherwise the plain
+            Python function is returned (the ``@jax.jit`` decorator below is
+            commented out).
+        """
+        # Copy everything the step needs into locals so the closure does not
+        # look these attributes up on `self` again.
+        noise_schedule: NoiseScheduler = self.noise_schedule
+        model = self.model
+        model_output_transform = self.model_output_transform
+        loss_fn = self.loss_fn
+        unconditional_prob = self.unconditional_prob
+        # Determine the number of unconditional samples
+        # (int() truncates, so the count is rounded down).
+        num_unconditional = int(batch_size * unconditional_prob)
+        nS, nC = null_labels_seq.shape
+        null_labels_seq = jnp.broadcast_to(
+            null_labels_seq, (batch_size, nS, nC))
+        distributed_training = self.distributed_training
+        autoencoder = self.autoencoder
+        # @jax.jit
+        def train_step(train_state: TrainState, rng_state: RandomMarkovState, batch, local_device_index):
+            """Run a single optimisation step.
+
+            Args:
+                train_state: state whose ``params`` are differentiated and
+                    then updated via ``apply_gradients`` and ``apply_ema``.
+                rng_state: random state; advanced once here and returned.
+                batch: mapping read at ``'image'``, ``'input_ids'`` and
+                    ``'attention_mask'``.
+                local_device_index: array folded into the step's PRNG key.
+
+            Returns:
+                ``(train_state, loss, rng_state)``.
+            """
+            rng_state, subkey = rng_state.get_random_key()
+            # reshape() with no arguments gives a 0-d array for fold_in;
+            # presumably this makes each shard draw different randomness -- TODO confirm.
+            subkey = jax.random.fold_in(subkey, local_device_index.reshape())
+            # All randomness below comes from this step-local state; only the
+            # outer `rng_state` is returned to the caller.
+            local_rng_state = RandomMarkovState(subkey)
+            images = batch['image']
+            if autoencoder is not None:
+                # Convert the images to latent space
+                local_rng_state, rngs = local_rng_state.get_random_key()
+                images = autoencoder.encode(images, rngs)
+            else:
+                # normalize image
+                # (assumes pixel values in 0..255, mapping them to [-1, 1] -- TODO confirm)
+                images = (images - 127.5) / 127.5
+            output = text_embedder(
+                input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
+            label_seq = output.last_hidden_state
+            # The first `num_unconditional` rows take the null labels and the
+            # remaining rows keep their text embeddings. The split is a fixed
+            # slice computed above, not a per-sample random draw.
+            label_seq = jnp.concat(
+                [null_labels_seq[:num_unconditional], label_seq[num_unconditional:]], axis=0)
+            noise_level, local_rng_state = noise_schedule.generate_timesteps(images.shape[0], local_rng_state)
+            local_rng_state, rngs = local_rng_state.get_random_key()
+            noise: jax.Array = jax.random.normal(rngs, shape=images.shape)
+            rates = noise_schedule.get_rates(noise_level)
+            noisy_images, c_in, expected_output = model_output_transform.forward_diffusion(
+                images, noise, rates)
+            def model_loss(params):
+                """Return the scalar loss for `params` on this batch."""
+                # The model sees noisy_images scaled by c_in; pred_transform
+                # receives the unscaled noisy_images.
+                preds = model.apply(params, *noise_schedule.transform_inputs(noisy_images*c_in, noise_level), label_seq)
+                preds = model_output_transform.pred_transform(
+                    noisy_images, preds, rates)
+                nloss = loss_fn(preds, expected_output)
+                # nloss = jnp.mean(nloss, axis=1)
+                # Weight by the schedule's per-noise-level weights before averaging.
+                nloss *= noise_schedule.get_weights(noise_level)
+                nloss = jnp.mean(nloss)
+                loss = nloss
+                return loss
+            loss, grads = jax.value_and_grad(model_loss)(train_state.params)
+            if distributed_training:
+                # Average gradients and loss over the "data" mesh axis.
+                grads = jax.lax.pmean(grads, "data")
+                loss = jax.lax.pmean(loss, "data")
+            train_state = train_state.apply_gradients(grads=grads)
+            # ema_decay is read from `self` through the closure.
+            train_state = train_state.apply_ema(self.ema_decay)
+            return train_state, loss, rng_state
+        if distributed_training:
+            # in_specs: P() for train_state and rng_state, P('data') for batch
+            # and local_device_index; all three outputs use P().
+            train_step = shard_map(train_step, mesh=self.mesh, in_specs=(P(), P(), P('data'), P('data')),
+                                   out_specs=(P(), P(), P()))
+            train_step = jax.jit(train_step)
+        return train_step
+    def _define_compute_metrics(self):
+        """Return a jitted function that merges a mean-squared-error loss into a state's metrics."""
+        def compute_metrics(state: TrainState, expected, pred):
+            """Return `state` with the MSE between `pred` and `expected` merged into its metrics."""
+            squared_error = jnp.square(pred - expected)
+            mse = jnp.mean(squared_error)
+            merged = state.metrics.merge(
+                state.metrics.single_from_model_output(loss=mse))
+            return state.replace(metrics=merged)
+        return jax.jit(compute_metrics)
+    def fit(self, data, steps_per_epoch, epochs):
+        """Run the parent class's `fit`, passing the train-step arguments read from `data`.
+
+        `data` is read at 'null_labels_full', 'local_batch_size' and 'model';
+        these become the `null_labels_seq`, `batch_size` and `text_embedder`
+        entries of the mapping passed as the fourth argument to the parent's
+        `fit`.
+        """
+        # Look the keys up in this order so a missing key raises the same
+        # KeyError first.
+        null_labels = data['null_labels_full']
+        batch_size = data['local_batch_size']
+        embedder = data['model']
+        train_step_args = dict(
+            batch_size=batch_size,
+            null_labels_seq=null_labels,
+            text_embedder=embedder,
+        )
+        super().fit(data, steps_per_epoch, epochs, train_step_args)
+def boolean_string(s):
+    """Interpret `s` as a boolean.
+
+    A ``bool`` is returned unchanged. Any other value is compared with the
+    exact string ``'True'``; the comparison is case-sensitive, so ``'true'``
+    and ``'1'`` yield ``False``.
+    """
+    if isinstance(s, bool):
+        return s
+    return s == 'True'

{flaxdiff-0.1.5.dist-info → flaxdiff-0.1.6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flaxdiff
-Version: 0.1.5
+Version: 0.1.6
 Summary: A versatile and easy to understand Diffusion library
 Author: Ashish Kumar Singh
 Author-email: ashishkmr472@gmail.com
@@ -13,6 +13,8 @@ Requires-Dist: clu
 # ![](images/logo.jpeg "FlaxDiff")
+**This project is partially supported by [Google TPU Research Cloud](https://sites.research.google/trc/about/). I would like to thank the Google Cloud TPU team for providing me with the resources to train the bigger text-conditional models in multi-host distributed settings.**
 ## A Versatile and simple Diffusion Library
 In recent years, diffusion and score-based multi-step models have revolutionized the generative AI domain. However, the latest research in this field has become highly math-intensive, making it challenging to understand how state-of-the-art diffusion models work and generate such impressive images. Replicating this research in code can be daunting.

{flaxdiff-0.1.5.dist-info → flaxdiff-0.1.6.dist-info}/RECORD RENAMED Viewed

@@ -1,14 +1,15 @@
 flaxdiff/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 flaxdiff/utils.py,sha256=B0GcHlzlVYDNEIdh2v5qmP4u0neIT-FqexNohuyuCvg,2452
 flaxdiff/models/__init__.py,sha256=FAivVYXxM2JrCFIXf-C3374RB2Hth25dBrzOeNFhH1U,26
-flaxdiff/models/attention.py,sha256=KiAUyfujGpUZR13aJR6RVnL6pBXk5UcyM62VIXhojMg,14468
-flaxdiff/models/common.py,sha256=jlyRB4uF7BmeuExor1YHaqEbBjSuyaDZ4mDsSW3rWKE,7948
+flaxdiff/models/attention.py,sha256=OhpKQXdxWbf8K2_yotLfS0DYdHb-zNpL2p8--ql_FAg,14503
+flaxdiff/models/common.py,sha256=RYNxX9K19hvwSWaB9Wtv7MIZLhcacdugDgD9uZDh8XM,10358
 flaxdiff/models/favor_fastattn.py,sha256=79Ew1nqarsNLPzZaBSd1ILORzJr74CupYeqGiCQK5E4,27689
-flaxdiff/models/simple_unet.py,sha256=o1DCa9yvqarEGTiUKsTqE70q-h6bRU6HcU0lZpb65jc,11418
+flaxdiff/models/simple_unet.py,sha256=hAcz074E9NVdUtECPMi1c1Kw-52Dc6l_ME-5FqIg-n8,9255
 flaxdiff/models/simple_vit.py,sha256=vTu2CQRoSOxetBHTrnCWddm-vxrZDkMe8EpdNxtpJMk,4015
-flaxdiff/models/autoencoder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-flaxdiff/models/autoencoder/autoencoder.py,sha256=At-DhcmrZ0Gao4PUa4l9D25FTdTPwbE4gu6LKcFKzUQ,433
-flaxdiff/models/autoencoder/diffusers.py,sha256=gwyD98277vQGKVPFbyd6w6CupoxMsNgKlN67AtzLCtg,3267
+flaxdiff/models/autoencoder/__init__.py,sha256=qY-7MldZpsfkF-_T2LqlRK7VHbqfmosz0NmvzDlBkOk,78
+flaxdiff/models/autoencoder/autoencoder.py,sha256=27_hYl0yXAdH9Mx4Xu9J79mSNo-FEKr9SxhVaS3ffn4,591
+flaxdiff/models/autoencoder/diffusers.py,sha256=kwlKwHBSAegtTiEkGju_1Trltegj-e47hXFN9jCKmgY,3609
+flaxdiff/models/autoencoder/simple_autoenc.py,sha256=UXHPgDmwGTnv3Uts6Zj3p9R9nJXnEiEXbllgarwDfXM,805
 flaxdiff/predictors/__init__.py,sha256=SKkYYRF9Wfgk2zhtZw4vCXOdOeRlrm2Mk6cvuaEvAzc,4403
 flaxdiff/samplers/__init__.py,sha256=_S-9TwDeshrI0VmapV-J2hqjTByOa0-oOeUs_IdovjU,285
 flaxdiff/samplers/common.py,sha256=_an5h5Niz9Joz_-ppridLrGHpu8X0VVvhNGknPu6AUY,5272
@@ -27,9 +28,11 @@ flaxdiff/schedulers/exp.py,sha256=cPTnUJpYdzJRRZqMLYQz0rRUCpEmaP2tXhRumLx94jA,60
 flaxdiff/schedulers/karras.py,sha256=4GN120kGwdxxU-h2mVdhBVy9IORkUMm_vvz3XjthBcI,3355
 flaxdiff/schedulers/linear.py,sha256=6003F5ISq1Wc0h6UAzY95MJgsDIKGMhBzbiVALpea0k,581
 flaxdiff/schedulers/sqrt.py,sha256=1F84ZgQPuoNMhe6yxGTR2G0h7dPOZtm4UDQOakbSsEU,445
-flaxdiff/trainer/__init__.py,sha256=17qKQFITCfaXQFKYElMzkE-c-EPrv5iUL66gY1gKOsQ,7243
+flaxdiff/trainer/__init__.py,sha256=T-vUVq4zHcMK6kpCsG4Gu8vn71q6lZD-lg-Ul7yKfEk,128
+flaxdiff/trainer/autoencoder_trainer.py,sha256=al7AsZ7yeDMEiDD-gbcXf0ADq_xfk1VMxvg24GfA-XQ,7008
+flaxdiff/trainer/diffusion_trainer.py,sha256=h5YxIMjBI553xDNeapzLDGF0_4y0MfGRMuHume5sPtM,7785
 flaxdiff/trainer/simple_trainer.py,sha256=f4g2KGuGM__d9v_4Ip3ng8wQubmenWZUW60VEu2ANOg,16774
-flaxdiff-0.1.5.dist-info/METADATA,sha256=tGKayFhkYSJJnLY_sHiaCJ60kJZqnO-kcLM3uH3JSN4,19811
-flaxdiff-0.1.5.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
-flaxdiff-0.1.5.dist-info/top_level.txt,sha256=-2-nXnfkJgSfkki1tjm5Faw6Dso7vhtdn2szwCdX5CQ,9
-flaxdiff-0.1.5.dist-info/RECORD,,
+flaxdiff-0.1.6.dist-info/METADATA,sha256=sWY_oQgQhhuyW89KyRwIBrpVHBPJjRMmsk5twfgIBlo,20090
+flaxdiff-0.1.6.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+flaxdiff-0.1.6.dist-info/top_level.txt,sha256=-2-nXnfkJgSfkki1tjm5Faw6Dso7vhtdn2szwCdX5CQ,9
+flaxdiff-0.1.6.dist-info/RECORD,,

{flaxdiff-0.1.5.dist-info → flaxdiff-0.1.6.dist-info}/WHEEL RENAMED Viewed

File without changes

{flaxdiff-0.1.5.dist-info → flaxdiff-0.1.6.dist-info}/top_level.txt RENAMED Viewed

File without changes

flaxdiff 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

flaxdiff 0.1.5py3-none-any.whl → 0.1.6py3-none-any.whl