flaxdiff 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flaxdiff/models/attention.py +8 -65
- flaxdiff/models/autoencoder/diffusers.py +1 -1
- flaxdiff/models/common.py +14 -4
- flaxdiff/models/simple_unet.py +20 -10
- flaxdiff/models/simple_vit.py +13 -16
- flaxdiff/trainer/diffusion_trainer.py +41 -11
- flaxdiff/trainer/simple_trainer.py +80 -60
- {flaxdiff-0.1.7.dist-info → flaxdiff-0.1.9.dist-info}/METADATA +18 -1
- {flaxdiff-0.1.7.dist-info → flaxdiff-0.1.9.dist-info}/RECORD +11 -11
- {flaxdiff-0.1.7.dist-info → flaxdiff-0.1.9.dist-info}/WHEEL +0 -0
- {flaxdiff-0.1.7.dist-info → flaxdiff-0.1.9.dist-info}/top_level.txt +0 -0
flaxdiff/models/attention.py
CHANGED
@@ -156,71 +156,14 @@ class NormalAttention(nn.Module):
         value = self.value(context)

         hidden_states = nn.dot_product_attention(
-            query, key, value, dtype=self.dtype, broadcast_dropout=False,
+            query, key, value, dtype=self.dtype, broadcast_dropout=False,
+            dropout_rng=None, precision=self.precision, force_fp32_for_softmax=True,
+            deterministic=True
         )
         proj = self.proj_attn(hidden_states)
         proj = proj.reshape(orig_x_shape)
         return proj

-class BasicTransformerBlock(nn.Module):
-    # Has self and cross attention
-    query_dim: int
-    heads: int = 4
-    dim_head: int = 64
-    dtype: Optional[Dtype] = None
-    precision: PrecisionLike = None
-    use_bias: bool = True
-    kernel_init: Callable = lambda : kernel_init(1.0)
-    use_flash_attention:bool = False
-    use_cross_only:bool = False
-
-    def setup(self):
-        if self.use_flash_attention:
-            attenBlock = EfficientAttention
-        else:
-            attenBlock = NormalAttention
-
-        self.attention1 = attenBlock(
-            query_dim=self.query_dim,
-            heads=self.heads,
-            dim_head=self.dim_head,
-            name=f'Attention1',
-            precision=self.precision,
-            use_bias=self.use_bias,
-            dtype=self.dtype,
-            kernel_init=self.kernel_init
-        )
-        self.attention2 = attenBlock(
-            query_dim=self.query_dim,
-            heads=self.heads,
-            dim_head=self.dim_head,
-            name=f'Attention2',
-            precision=self.precision,
-            use_bias=self.use_bias,
-            dtype=self.dtype,
-            kernel_init=self.kernel_init
-        )
-
-        self.ff = FlaxFeedForward(dim=self.query_dim)
-        self.norm1 = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)
-        self.norm2 = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)
-        self.norm3 = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)
-
-    @nn.compact
-    def __call__(self, hidden_states, context=None):
-        # self attention
-        if not self.use_cross_only:
-            print("Using self attention")
-            hidden_states = hidden_states + self.attention1(self.norm1(hidden_states))
-
-        # cross attention
-        hidden_states = hidden_states + self.attention2(self.norm2(hidden_states), context)
-
-        # feed forward
-        hidden_states = hidden_states + self.ff(self.norm3(hidden_states))
-
-        return hidden_states
-
 class FlaxGEGLU(nn.Module):
     r"""
     Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
@@ -246,7 +189,7 @@ class FlaxGEGLU(nn.Module):

     def __call__(self, hidden_states):
         hidden_states = self.proj(hidden_states)
-        hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis
+        hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=-1)
         return hidden_linear * nn.gelu(hidden_gelu)

 class FlaxFeedForward(nn.Module):
@@ -330,7 +273,7 @@ class BasicTransformerBlock(nn.Module):
     @nn.compact
     def __call__(self, hidden_states, context=None):
         if self.only_pure_attention:
-            return self.attention2(
+            return self.attention2(hidden_states, context)

         # self attention
         if not self.use_cross_only:
@@ -350,14 +293,14 @@ class TransformerBlock(nn.Module):
     dtype: Optional[Dtype] = None
     precision: PrecisionLike = None
     use_projection: bool = False
-    use_flash_attention:bool =
-    use_self_and_cross:bool =
+    use_flash_attention:bool = False
+    use_self_and_cross:bool = True
     only_pure_attention:bool = False

     @nn.compact
     def __call__(self, x, context=None):
         inner_dim = self.heads * self.dim_head
-
+        C = x.shape[-1]
         normed_x = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)(x)
         if self.use_projection == True:
             if self.use_linear_attention:
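The `axis=-1` fix in FlaxGEGLU above is load-bearing: the projection doubles the feature width, and the gate must be split off along that last axis. A minimal, self-contained sketch of the gated-GELU pattern the fix restores (the `GEGLU` module name here is illustrative, not flaxdiff's class):

```python
import jax
import jax.numpy as jnp
from flax import linen as nn

class GEGLU(nn.Module):
    """Gated GELU: project to twice the width, gate one half with GELU of the other."""
    dim_out: int

    @nn.compact
    def __call__(self, x):
        proj = nn.Dense(self.dim_out * 2)(x)
        # Split along the feature axis; the default axis=0 would split the
        # batch dimension of a (batch, seq, features) input instead.
        hidden_linear, hidden_gelu = jnp.split(proj, 2, axis=-1)
        return hidden_linear * nn.gelu(hidden_gelu)

x = jnp.ones((1, 16, 64))
module = GEGLU(dim_out=128)
y = module.apply(module.init(jax.random.PRNGKey(0), x), x)  # (1, 16, 128)
```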
flaxdiff/models/autoencoder/diffusers.py
CHANGED
@@ -78,7 +78,7 @@ class StableDiffusionVAE(AutoEncoder):
             log_std = jnp.clip(log_std, -30, 20)
             std = jnp.exp(0.5 * log_std)
             latents = mean + std * jax.random.normal(rngkey, mean.shape, dtype=mean.dtype)
-            print("Sampled")
+            # print("Sampled")
         else:
             # return the mean
             latents, _ = jnp.split(latents, 2, axis=-1)
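For context, the silenced `print` sits in the VAE's stochastic encode path, which is the standard reparameterization trick over the encoder's concatenated (mean, log-variance) moments. A hedged sketch of that computation (`sample_latents` is a hypothetical helper, not flaxdiff API):

```python
import jax
import jax.numpy as jnp

def sample_latents(moments: jax.Array, rngkey: jax.Array) -> jax.Array:
    # The encoder packs mean and log-variance along the channel axis.
    mean, log_std = jnp.split(moments, 2, axis=-1)
    log_std = jnp.clip(log_std, -30, 20)   # keep exp() numerically safe
    std = jnp.exp(0.5 * log_std)
    noise = jax.random.normal(rngkey, mean.shape, dtype=mean.dtype)
    return mean + std * noise

latents = sample_latents(jnp.zeros((1, 16, 16, 8)), jax.random.PRNGKey(0))
```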
flaxdiff/models/common.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Optional, Any, Callable, Sequence, Union
 from flax.typing import Dtype, PrecisionLike
 from typing import Dict, Callable, Sequence, Any, Union
 import einops
+from functools import partial

 # Kernel initializer to use
 def kernel_init(scale, dtype=jnp.float32):
@@ -266,12 +267,21 @@ class ResidualBlock(nn.Module):
     kernel_init:Callable=kernel_init(1.0)
     dtype: Optional[Dtype] = None
     precision: PrecisionLike = None
+
+    def setup(self):
+        if self.norm_groups > 0:
+            norm = partial(nn.GroupNorm, self.norm_groups)
+        else:
+            norm = partial(nn.RMSNorm, 1e-5)
+
+        self.norm1 = norm()
+        self.norm2 = norm()

     @nn.compact
     def __call__(self, x:jax.Array, temb:jax.Array, textemb:jax.Array=None, extra_features:jax.Array=None):
         residual = x
-
-        out = nn.RMSNorm()(x)
+        out = self.norm1(x)
+        # out = nn.RMSNorm()(x)
         out = self.activation(out)

         out = ConvLayer(
@@ -295,8 +305,8 @@ class ResidualBlock(nn.Module):
         # out = out * (1 + scale) + shift
         out = out + temb

-
-        out = nn.RMSNorm()(out)
+        out = self.norm2(out)
+        # out = nn.RMSNorm()(out)
         out = self.activation(out)

         out = ConvLayer(
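The new `setup` replaces the hard-coded `nn.RMSNorm()` calls with a configurable norm factory: `norm_groups > 0` selects `GroupNorm`, anything else falls back to `RMSNorm`. A runnable sketch of the pattern, assuming a module with a `norm_groups` field like `ResidualBlock` (the `NormDemo` name is illustrative):

```python
from functools import partial
import jax
import jax.numpy as jnp
from flax import linen as nn

class NormDemo(nn.Module):
    norm_groups: int = 8  # > 0 -> GroupNorm(num_groups); else RMSNorm(epsilon=1e-5)

    def setup(self):
        if self.norm_groups > 0:
            norm = partial(nn.GroupNorm, self.norm_groups)
        else:
            norm = partial(nn.RMSNorm, 1e-5)
        self.norm1 = norm()
        self.norm2 = norm()

    def __call__(self, x):
        return self.norm2(self.norm1(x))

x = jnp.ones((2, 16, 16, 32))
m = NormDemo()
y = m.apply(m.init(jax.random.PRNGKey(0), x), x)
```

Binding the choice once in `setup` also keeps both norm call sites in sync.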
flaxdiff/models/simple_unet.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Dict, Callable, Sequence, Any, Union, Optional
 import einops
 from .common import kernel_init, ConvLayer, Downsample, Upsample, FourierEmbedding, TimeProjection, ResidualBlock
 from .attention import TransformerBlock
+from functools import partial

 class Unet(nn.Module):
     output_channels:int=3
@@ -19,6 +20,15 @@ class Unet(nn.Module):
     dtype: Optional[Dtype] = None
     precision: PrecisionLike = None

+    def setup(self):
+        if self.norm_groups > 0:
+            norm = partial(nn.GroupNorm, self.norm_groups)
+        else:
+            norm = partial(nn.RMSNorm, 1e-5)
+
+        # self.last_up_norm = norm()
+        self.conv_out_norm = norm()
+
     @nn.compact
     def __call__(self, x, temb, textcontext):
         # print("embedding features", self.emb_features)
@@ -65,11 +75,11 @@ class Unet(nn.Module):
             if attention_config is not None and j == self.num_res_blocks - 1:   # Apply attention only on the last block
                 x = TransformerBlock(heads=attention_config['heads'], dtype=attention_config.get('dtype', jnp.float32),
                                      dim_head=dim_in // attention_config['heads'],
-                                     use_flash_attention=attention_config.get("flash_attention",
+                                     use_flash_attention=attention_config.get("flash_attention", False),
                                      use_projection=attention_config.get("use_projection", False),
                                      use_self_and_cross=attention_config.get("use_self_and_cross", True),
                                      precision=attention_config.get("precision", self.precision),
-                                     only_pure_attention=True,
+                                     only_pure_attention=attention_config.get("only_pure_attention", True),
                                      name=f"down_{i}_attention_{j}")(x, textcontext)
             # print("down residual for feature level", i, "is of shape", x.shape, "features", dim_in)
             downs.append(x)
@@ -103,12 +113,12 @@ class Unet(nn.Module):
             if middle_attention is not None and j == self.num_middle_res_blocks - 1:   # Apply attention only on the last block
                 x = TransformerBlock(heads=middle_attention['heads'], dtype=middle_attention.get('dtype', jnp.float32),
                                      dim_head=middle_dim_out // middle_attention['heads'],
-                                     use_flash_attention=middle_attention.get("flash_attention",
+                                     use_flash_attention=middle_attention.get("flash_attention", False),
                                      use_linear_attention=False,
                                      use_projection=middle_attention.get("use_projection", False),
                                      use_self_and_cross=False,
-                                     precision=
-                                     only_pure_attention=True,
+                                     precision=middle_attention.get("precision", self.precision),
+                                     only_pure_attention=middle_attention.get("only_pure_attention", True),
                                      name=f"middle_attention_{j}")(x, textcontext)
             x = ResidualBlock(
                 middle_conv_type,
@@ -146,11 +156,11 @@ class Unet(nn.Module):
             if attention_config is not None and j == self.num_res_blocks - 1:   # Apply attention only on the last block
                 x = TransformerBlock(heads=attention_config['heads'], dtype=attention_config.get('dtype', jnp.float32),
                                      dim_head=dim_out // attention_config['heads'],
-                                     use_flash_attention=attention_config.get("flash_attention",
+                                     use_flash_attention=attention_config.get("flash_attention", False),
                                      use_projection=attention_config.get("use_projection", False),
                                      use_self_and_cross=attention_config.get("use_self_and_cross", True),
                                      precision=attention_config.get("precision", self.precision),
-                                     only_pure_attention=True,
+                                     only_pure_attention=attention_config.get("only_pure_attention", True),
                                      name=f"up_{i}_attention_{j}")(x, textcontext)
             # print("Upscaling ", i, x.shape)
             if i != len(feature_depths) - 1:
@@ -163,13 +173,13 @@ class Unet(nn.Module):
             precision=self.precision
         )(x)

-        # x =
+        # x = self.last_up_norm(x)
         x = ConvLayer(
             conv_type,
             features=self.feature_depths[0],
             kernel_size=(3, 3),
             strides=(1, 1),
-            kernel_init=kernel_init(
+            kernel_init=kernel_init(1.0),
             dtype=self.dtype,
             precision=self.precision
         )(x)
@@ -189,7 +199,7 @@ class Unet(nn.Module):
             precision=self.precision
         )(x, temb)

-        x =
+        x = self.conv_out_norm(x)
         x = self.activation(x)

         noise_out = ConvLayer(
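All three attention call sites now read their knobs from the config dict with explicit defaults, so `only_pure_attention` becomes overridable instead of hard-coded. A sketch of a per-level config that exercises those defaults; the keys mirror the `.get(...)` calls in the diff, while the overall list-of-dicts shape is an assumption about flaxdiff's config format:

```python
# One entry per feature level; None disables attention at that level.
attention_configs = [
    None,
    {
        "heads": 8,
        "flash_attention": False,        # -> use_flash_attention
        "use_projection": False,
        "use_self_and_cross": True,
        "only_pure_attention": True,     # previously always True, now a choice
    },
]
cfg = attention_configs[1]
use_flash = cfg.get("flash_attention", False)
only_pure = cfg.get("only_pure_attention", True)
```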
flaxdiff/models/simple_vit.py
CHANGED
@@ -4,7 +4,7 @@ import jax
 import jax.numpy as jnp
 from flax import linen as nn
 from typing import Callable, Any
-from .
+from .simple_unet import FourierEmbedding, TimeProjection, ConvLayer, kernel_init
 from .attention import TransformerBlock

 class PatchEmbedding(nn.Module):
@@ -40,22 +40,23 @@ class PositionalEncoding(nn.Module):
 class TransformerEncoder(nn.Module):
     num_layers: int
     num_heads: int
-    mlp_dim: int
     dropout_rate: float = 0.1
     dtype: Any = jnp.float32
     precision: Any = jax.lax.Precision.HIGH
+    use_projection: bool = False

     @nn.compact
-    def __call__(self, x,
+    def __call__(self, x, context=None):
         for _ in range(self.num_layers):
             x = TransformerBlock(
                 heads=self.num_heads,
                 dim_head=x.shape[-1] // self.num_heads,
-                mlp_dim=self.mlp_dim,
                 dropout_rate=self.dropout_rate,
                 dtype=self.dtype,
-                precision=self.precision
-
+                precision=self.precision,
+                use_self_and_cross=True,
+                use_projection=self.use_projection,
+            )(x, context)
         return x

 class VisionTransformer(nn.Module):
@@ -63,11 +64,11 @@ class VisionTransformer(nn.Module):
     embedding_dim: int = 768
     num_layers: int = 12
     num_heads: int = 12
-    mlp_dim: int = 3072
     emb_features: int = 256
     dropout_rate: float = 0.1
     dtype: Any = jnp.float32
     precision: Any = jax.lax.Precision.HIGH
+    use_projection: bool = False

     @nn.compact
     def __call__(self, x, temb, textcontext=None):
@@ -81,27 +82,23 @@ class VisionTransformer(nn.Module):

         # Add positional encoding
         x = PositionalEncoding(max_len=x.shape[1], embedding_dim=self.embedding_dim)(x)
+
+        num_patches = x.shape[1]

         # Add time embedding
         temb = jnp.expand_dims(temb, axis=1)
         x = jnp.concatenate([x, temb], axis=1)

-        # Add text context
-        if textcontext is not None:
-            x = jnp.concatenate([x, textcontext], axis=1)
-
         # Transformer encoder
         x = TransformerEncoder(
             num_layers=self.num_layers,
             num_heads=self.num_heads,
-            mlp_dim=self.mlp_dim,
             dropout_rate=self.dropout_rate,
             dtype=self.dtype,
-            precision=self.precision
-
+            precision=self.precision,
+            use_projection=self.use_projection
+        )(x, textcontext)

-        # Extract the image tokens (exclude time and text embeddings)
-        num_patches = (x.shape[1] - 1 - (0 if textcontext is None else textcontext.shape[1]))
         x = x[:, :num_patches, :]

         # Reshape to image dimensions
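The ViT change reroutes text conditioning: instead of concatenating text tokens into the sequence and slicing them off afterwards, the encoder now receives `textcontext` as a cross-attention `context`. The patch-count bookkeeping becomes simpler too; a minimal sketch with illustrative shapes:

```python
import jax.numpy as jnp

patches = jnp.ones((2, 64, 768))   # (batch, num_patches, embedding_dim)
temb = jnp.ones((2, 1, 768))       # time embedding as one extra token

num_patches = patches.shape[1]     # recorded before appending the time token
x = jnp.concatenate([patches, temb], axis=1)   # (2, 65, 768)
# ... TransformerEncoder(x, context=textcontext) runs here; text enters via
# cross-attention rather than as extra sequence tokens ...
x = x[:, :num_patches, :]          # recover the patch tokens: (2, 64, 768)
```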
flaxdiff/trainer/diffusion_trainer.py
CHANGED
@@ -29,6 +29,8 @@ class TrainState(SimpleTrainState):
         )
         return self.replace(ema_params=new_ema_params)

+from flaxdiff.models.autoencoder.autoencoder import AutoEncoder
+
 class DiffusionTrainer(SimpleTrainer):
     noise_schedule: NoiseScheduler
     model_output_transform: DiffusionPredictionTransform
@@ -40,7 +42,7 @@ class DiffusionTrainer(SimpleTrainer):
         optimizer: optax.GradientTransformation,
         noise_schedule: NoiseScheduler,
         rngs: jax.random.PRNGKey,
-        unconditional_prob: float = 0.
+        unconditional_prob: float = 0.12,
         name: str = "Diffusion",
         model_output_transform: DiffusionPredictionTransform = EpsilonPredictionTransform(),
         autoencoder: AutoEncoder = None,
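`unconditional_prob` is the fraction of training samples whose conditioning is dropped, which is what later enables classifier-free guidance at sampling time. A hedged sketch of that dropout; the masking details and the `drop_conditioning` helper are illustrative, not flaxdiff's exact code:

```python
import jax
import jax.numpy as jnp

def drop_conditioning(rng, text_emb, null_emb, unconditional_prob=0.12):
    # With probability unconditional_prob per sample, swap in the null embedding.
    mask = jax.random.bernoulli(rng, unconditional_prob, (text_emb.shape[0],))
    return jnp.where(mask[:, None, None], null_emb, text_emb)

text_emb = jnp.ones((4, 77, 768))
null_emb = jnp.zeros((1, 77, 768))
out = drop_conditioning(jax.random.PRNGKey(0), text_emb, null_emb)
```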
@@ -67,7 +69,8 @@ class DiffusionTrainer(SimpleTrainer):
         existing_state: dict = None,
         existing_best_state: dict = None,
         model: nn.Module = None,
-        param_transforms: Callable = None
+        param_transforms: Callable = None,
+        use_dynamic_scale: bool = False
     ) -> Tuple[TrainState, TrainState]:
         print("Generating states for DiffusionTrainer")
         rngs, subkey = jax.random.split(rngs)
@@ -88,7 +91,8 @@ class DiffusionTrainer(SimpleTrainer):
             ema_params=new_state['ema_params'],
             tx=optimizer,
             rngs=rngs,
-            metrics=Metrics.empty()
+            metrics=Metrics.empty(),
+            dynamic_scale = flax.training.dynamic_scale.DynamicScale() if use_dynamic_scale else None
         )

         if existing_best_state is not None:
@@ -125,14 +129,14 @@ class DiffusionTrainer(SimpleTrainer):
             local_rng_state = RandomMarkovState(subkey)

             images = batch['image']
+            images = jnp.array(images, dtype=jnp.float32)
+            # normalize image
+            images = (images - 127.5) / 127.5

             if autoencoder is not None:
                 # Convert the images to latent space
                 local_rng_state, rngs = local_rng_state.get_random_key()
                 images = autoencoder.encode(images, rngs)
-            else:
-                # normalize image
-                images = (images - 127.5) / 127.5

             output = text_embedder(
                 input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
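The reordering above is subtle but important: pixels are now normalized to [-1, 1] before any autoencoder encode, rather than only in the pixel-space branch. The arithmetic for uint8 inputs, taken directly from the changed lines:

```python
import numpy as np
import jax.numpy as jnp

batch = {'image': np.full((4, 128, 128, 3), 255, dtype=np.uint8)}
images = jnp.array(batch['image'], dtype=jnp.float32)
images = (images - 127.5) / 127.5   # [0, 255] -> [-1.0, 1.0]
```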
@@ -163,12 +167,39 @@ class DiffusionTrainer(SimpleTrainer):
                 loss = nloss
                 return loss

-
+
+            if train_state.dynamic_scale is not None:
+                # dynamic scale takes care of averaging gradients across replicas
+                grad_fn = train_state.dynamic_scale.value_and_grad(
+                    model_loss, axis_name="data"
+                )
+                dynamic_scale, is_fin, loss, grads = grad_fn(train_state.params)
+                train_state = train_state.replace(dynamic_scale=dynamic_scale)
+            else:
+                grad_fn = jax.value_and_grad(model_loss)
+                loss, grads = grad_fn(train_state.params)
+                if distributed_training:
+                    grads = jax.lax.pmean(grads, "data")
+
+            new_state = train_state.apply_gradients(grads=grads)
+
+            if train_state.dynamic_scale:
+                # if is_fin == False the gradients contain Inf/NaNs and optimizer state and
+                # params should be restored (= skip this step).
+                select_fn = functools.partial(jnp.where, is_fin)
+                new_state = train_state.replace(
+                    opt_state=jax.tree_util.tree_map(
+                        select_fn, new_state.opt_state, train_state.opt_state
+                    ),
+                    params=jax.tree_util.tree_map(
+                        select_fn, new_state.params, train_state.params
+                    ),
+                )
+
+            train_state = new_state.apply_ema(self.ema_decay)
+
             if distributed_training:
-                grads = jax.lax.pmean(grads, "data")
                 loss = jax.lax.pmean(loss, "data")
-            train_state = train_state.apply_gradients(grads=grads)
-            train_state = train_state.apply_ema(self.ema_decay)
             return train_state, loss, rng_state

         if distributed_training:
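The new branch implements mixed-precision loss scaling with `flax.training.dynamic_scale`: scale the loss before differentiation, unscale the grads, and skip the optimizer update whenever the grads are non-finite. A self-contained sketch of the same pattern under a toy loss (names like `loss_fn` and the sgd optimizer are illustrative):

```python
import functools
from typing import Optional

import jax
import jax.numpy as jnp
import optax
from flax.training import train_state
from flax.training import dynamic_scale as dynamic_scale_lib

class TrainState(train_state.TrainState):
    dynamic_scale: Optional[dynamic_scale_lib.DynamicScale] = None

def loss_fn(params):
    # Stand-in loss; the trainer computes the diffusion loss here.
    return jnp.sum(params['w'] ** 2)

def train_step(state: TrainState):
    if state.dynamic_scale is not None:
        # Scales the loss before differentiation, unscales the grads,
        # and reports whether every gradient entry is finite.
        grad_fn = state.dynamic_scale.value_and_grad(loss_fn)
        dyn_scale, is_fin, loss, grads = grad_fn(state.params)
        state = state.replace(dynamic_scale=dyn_scale)
    else:
        loss, grads = jax.value_and_grad(loss_fn)(state.params)
        is_fin = jnp.bool_(True)

    new_state = state.apply_gradients(grads=grads)
    if state.dynamic_scale is not None:
        # Non-finite grads: keep the previous params and optimizer state,
        # effectively skipping this step while the scale shrinks.
        select = functools.partial(jnp.where, is_fin)
        new_state = new_state.replace(
            opt_state=jax.tree_util.tree_map(select, new_state.opt_state, state.opt_state),
            params=jax.tree_util.tree_map(select, new_state.params, state.params),
        )
    return new_state, loss

state = TrainState.create(
    apply_fn=None, params={'w': jnp.ones((4,))}, tx=optax.sgd(1e-2),
    dynamic_scale=dynamic_scale_lib.DynamicScale(),
)
state, loss = train_step(state)
```

Note that `DynamicScale.value_and_grad` already averages gradients across replicas when given an `axis_name`, which is why the explicit `pmean` lives only in the unscaled branch of the diff.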
@@ -199,4 +230,3 @@ def boolean_string(s):
     if type(s) == bool:
         return s
     return s == 'True'
-
flaxdiff/trainer/simple_trainer.py
CHANGED
@@ -39,23 +39,23 @@ PROCESS_COLOR_MAP = {
 def _build_global_shape_and_sharding(
     local_shape: tuple[int, ...], global_mesh: Mesh
 ) -> tuple[tuple[int, ...], jax.sharding.NamedSharding]:
-
-
-
+    sharding = jax.sharding.NamedSharding(global_mesh, P(global_mesh.axis_names))
+    global_shape = (jax.process_count() * local_shape[0],) + local_shape[1:]
+    return global_shape, sharding


 def form_global_array(path, array: np.ndarray, global_mesh: Mesh) -> jax.Array:
-
-
-
-
-
-
-
-
-
-
-
+    """Put local sharded array into local devices"""
+    global_shape, sharding = _build_global_shape_and_sharding(np.shape(array), global_mesh)
+    try:
+        local_device_arrays = np.split(array, len(global_mesh.local_devices), axis=0)
+    except ValueError as array_split_error:
+        raise ValueError(
+            f"Unable to put to devices shape {array.shape} with "
+            f"local device count {len(global_mesh.local_devices)} "
+        ) from array_split_error
+    local_device_buffers = jax.device_put(local_device_arrays, global_mesh.local_devices)
+    return jax.make_array_from_single_device_arrays(global_shape, sharding, local_device_buffers)

 def convert_to_global_tree(global_mesh, pytree):
     return jax.tree_util.tree_map_with_path(partial(form_global_array, global_mesh=global_mesh), pytree)
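These helpers assemble one logically global batch from per-process shards. A single-host sketch of the same steps, with illustrative shapes (on one process, `global_shape` equals the local shape):

```python
import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

mesh = Mesh(np.array(jax.devices()), ('data',))
array = np.arange(len(jax.devices()) * 4, dtype=np.float32).reshape(len(jax.devices()), 4)

# Same recipe as form_global_array: one shard per local device along axis 0,
# stitched back into a single global jax.Array sharded over the 'data' axis.
sharding = NamedSharding(mesh, P(mesh.axis_names))
global_shape = (jax.process_count() * array.shape[0],) + array.shape[1:]
shards = np.split(array, len(mesh.local_devices), axis=0)
buffers = jax.device_put(shards, mesh.local_devices)
global_array = jax.make_array_from_single_device_arrays(global_shape, sharding, buffers)
```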
@@ -67,12 +67,8 @@ class Metrics(metrics.Collection):

 # Define the TrainState
 class SimpleTrainState(train_state.TrainState):
-    rngs: jax.random.PRNGKey
     metrics: Metrics
-
-    def get_random_key(self):
-        rngs, subkey = jax.random.split(self.rngs)
-        return self.replace(rngs=rngs), subkey
+    dynamic_scale: flax.training.dynamic_scale.DynamicScale

 class SimpleTrainer:
     state: SimpleTrainState
@@ -88,20 +84,22 @@ class SimpleTrainer:
         rngs: jax.random.PRNGKey,
         train_state: SimpleTrainState = None,
         name: str = "Simple",
-        load_from_checkpoint:
+        load_from_checkpoint: str = None,
         checkpoint_suffix: str = "",
-        checkpoint_id: str = None,
         loss_fn=optax.l2_loss,
         param_transforms: Callable = None,
         wandb_config: Dict[str, Any] = None,
         distributed_training: bool = None,
         checkpoint_base_path: str = "./checkpoints",
+        checkpoint_step: int = None,
+        use_dynamic_scale: bool = False,
     ):
         if distributed_training is None or distributed_training is True:
             # Auto-detect if we are running on multiple devices
             distributed_training = jax.device_count() > 1
             self.mesh = jax.sharding.Mesh(jax.devices(), 'data')
-
+        else:
+            self.mesh = None

         self.distributed_training = distributed_training
         self.model = model
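The constructor now always leaves `self.mesh` defined: a 1-D `'data'` mesh when more than one device is visible, `None` otherwise. The detection itself is two lines, lifted straight from the hunk above:

```python
import jax
from jax.sharding import Mesh

distributed_training = jax.device_count() > 1
mesh = Mesh(jax.devices(), 'data') if distributed_training else None
```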
@@ -112,7 +110,6 @@ class SimpleTrainer:


         if wandb_config is not None and jax.process_index() == 0:
-            import wandb
             run = wandb.init(**wandb_config)
             self.wandb = run

@@ -126,11 +123,6 @@ class SimpleTrainer:
             self.wandb.define_metric("train/avg_time_per_step", step_metric="train/epoch")
             self.wandb.define_metric("train/avg_loss", step_metric="train/epoch")
             self.wandb.define_metric("train/best_loss", step_metric="train/epoch")
-
-        if checkpoint_id is None:
-            self.checkpoint_id = name.replace(' ', '_').replace('-', '_').lower()
-        else:
-            self.checkpoint_id = checkpoint_id

         # checkpointer = orbax.checkpoint.PyTreeCheckpointer()
         async_checkpointer = orbax.checkpoint.AsyncCheckpointer(orbax.checkpoint.PyTreeCheckpointHandler(), timeout_secs=60)
@@ -140,12 +132,12 @@ class SimpleTrainer:
         self.checkpointer = orbax.checkpoint.CheckpointManager(
             self.checkpoint_path() + checkpoint_suffix, async_checkpointer, options)

-        if load_from_checkpoint:
-            latest_epoch, old_state, old_best_state, rngstate = self.load()
+        if load_from_checkpoint is not None:
+            latest_epoch, latest_step, old_state, old_best_state, rngstate = self.load(load_from_checkpoint, checkpoint_step)
         else:
-            latest_epoch, old_state, old_best_state, rngstate = 0, None, None, None
+            latest_epoch, latest_step, old_state, old_best_state, rngstate = 0, 0, None, None, None

-        self.
+        self.latest_step = latest_step

         if rngstate:
             self.rngstate = RandomMarkovState(**rngstate)
@@ -156,7 +148,7 @@ class SimpleTrainer:

         if train_state == None:
             state, best_state = self.generate_states(
-                optimizer, subkey, old_state, old_best_state, model, param_transforms
+                optimizer, subkey, old_state, old_best_state, model, param_transforms, use_dynamic_scale
             )
             self.init_state(state, best_state)
         else:
@@ -174,7 +166,8 @@ class SimpleTrainer:
         existing_state: dict = None,
         existing_best_state: dict = None,
         model: nn.Module = None,
-        param_transforms: Callable = None
+        param_transforms: Callable = None,
+        use_dynamic_scale: bool = False
     ) -> Tuple[SimpleTrainState, SimpleTrainState]:
         print("Generating states for SimpleTrainer")
         rngs, subkey = jax.random.split(rngs)
@@ -189,7 +182,8 @@ class SimpleTrainer:
             apply_fn=model.apply,
             params=params,
             tx=optimizer,
-            metrics=Metrics.empty()
+            metrics=Metrics.empty(),
+            dynamic_scale = flax.training.dynamic_scale.DynamicScale() if use_dynamic_scale else None
         )
         if existing_best_state is not None:
             best_state = state.replace(
@@ -222,7 +216,7 @@ class SimpleTrainer:
         return jax.tree_util.tree_map(lambda x : np.array(x), self.rngstate)

     def checkpoint_path(self):
-        path = os.path.join(self.checkpoint_base_path, self.
+        path = os.path.join(self.checkpoint_base_path, self.name.replace(' ', '_').lower())
         if not os.path.exists(path):
             os.makedirs(path)
         return path
@@ -234,31 +228,46 @@ class SimpleTrainer:
             os.makedirs(path)
         return path

-    def load(self):
-
-
-
+    def load(self, checkpoint_path=None, checkpoint_step=None):
+        if checkpoint_path is None:
+            checkpointer = self.checkpointer
+        else:
+            checkpointer = orbax.checkpoint.PyTreeCheckpointer()
+            options = orbax.checkpoint.CheckpointManagerOptions(
+                max_to_keep=4, create=False)
+            checkpointer = orbax.checkpoint.CheckpointManager(
+                checkpoint_path, checkpointer, options)
+
+        if checkpoint_step is None:
+            step = checkpointer.latest_step()
+        else:
+            step = checkpoint_step
+
+        print("Loading model from checkpoint at step ", step)
+        ckpt = checkpointer.restore(step)
         state = ckpt['state']
         best_state = ckpt['best_state']
         rngstate = ckpt['rngs']
         # Convert the state to a TrainState
         self.best_loss = ckpt['best_loss']
+        current_epoch = ckpt.get('epoch', step) # Must be a checkpoint from an older version which used epochs instead of steps
         print(
-            f"Loaded model from checkpoint at epoch {
-        return
+            f"Loaded model from checkpoint at epoch {current_epoch} step {step}", ckpt['best_loss'])
+        return current_epoch, step, state, best_state, rngstate

-    def save(self, epoch=0):
-        print(f"Saving model at epoch {epoch}")
+    def save(self, epoch=0, step=0):
+        print(f"Saving model at epoch {epoch} step {step}")
         ckpt = {
             # 'model': self.model,
             'rngs': self.get_rngstate(),
             'state': self.get_state(),
             'best_state': self.get_best_state(),
             'best_loss': np.array(self.best_loss),
+            'epoch': epoch,
         }
         try:
             save_args = orbax_utils.save_args_from_target(ckpt)
-            self.checkpointer.save(
+            self.checkpointer.save(step, ckpt, save_kwargs={
                 'save_args': save_args}, force=True)
             self.checkpointer.wait_until_finished()
             pass
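`load` can now open an arbitrary checkpoint directory at an arbitrary step, not just the trainer's own manager at the latest step. A condensed sketch of that path using the same legacy orbax API the trainer uses (the `load_checkpoint` wrapper is hypothetical):

```python
import orbax.checkpoint

def load_checkpoint(checkpoint_path: str, checkpoint_step: int = None):
    # Mirrors the trainer's load(): open a manager over the given directory,
    # pick an explicit step or fall back to the newest one.
    checkpointer = orbax.checkpoint.PyTreeCheckpointer()
    options = orbax.checkpoint.CheckpointManagerOptions(max_to_keep=4, create=False)
    manager = orbax.checkpoint.CheckpointManager(checkpoint_path, checkpointer, options)
    step = checkpoint_step if checkpoint_step is not None else manager.latest_step()
    ckpt = manager.restore(step)
    # Older checkpoints stored only an epoch counter; fall back to the step.
    return ckpt.get('epoch', step), step, ckpt
```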
@@ -350,9 +359,10 @@ class SimpleTrainer:
         else:
             global_device_indexes = 0

-        def train_loop(
+        def train_loop(current_step, pbar: tqdm.tqdm, train_state, rng_state):
             epoch_loss = 0
-
+            current_epoch = current_step // steps_per_epoch
+            last_save_time = time.time()
             for i in range(steps_per_epoch):
                 batch = next(train_ds)
                 if self.distributed_training and global_device_count > 1:
@@ -363,36 +373,46 @@ class SimpleTrainer:
                 if self.distributed_training:
                     loss = jax.experimental.multihost_utils.process_allgather(loss)
                     loss = jnp.mean(loss) # Just to make sure its a scaler value
+
+                if loss <= 1e-6:
+                    # If the loss is too low, we can assume the model has diverged
+                    print(colored(f"Loss too low at step {current_step} => {loss}", 'red'))
+                    # Exit the training loop
+                    exit(1)

                 epoch_loss += loss
-
-                if
-                if
+                current_step += 1
+                if i % 100 == 0:
+                    if pbar is not None:
                         pbar.set_postfix(loss=f'{loss:.4f}')
                         pbar.update(100)
-                    current_step = current_epoch*steps_per_epoch + i
                     if self.wandb is not None:
                         self.wandb.log({
                             "train/step" : current_step,
                             "train/loss": loss,
                         }, step=current_step)
+                    # Save the model every 40 minutes
+                    if time.time() - last_save_time > 40 * 60:
+                        print(f"Saving model after 40 minutes at step {current_step}")
+                        self.save(current_epoch, current_step)
+                        last_save_time = time.time()
             print(colored(f"Epoch done on index {process_index} => {current_epoch} Loss: {epoch_loss/steps_per_epoch}", 'green'))
             return epoch_loss, current_step, train_state, rng_state

-        while self.
-            current_epoch = self.
-            self.latest_epoch += 1
+        while self.latest_step < epochs * steps_per_epoch:
+            current_epoch = self.latest_step // steps_per_epoch
             print(f"\nEpoch {current_epoch}/{epochs}")
             start_time = time.time()
             epoch_loss = 0

             if process_index == 0:
                 with tqdm.tqdm(total=steps_per_epoch, desc=f'\t\tEpoch {current_epoch}', ncols=100, unit='step') as pbar:
-                    epoch_loss, current_step, train_state, rng_state = train_loop(
+                    epoch_loss, current_step, train_state, rng_state = train_loop(self.latest_step, pbar, train_state, rng_state)
             else:
-                epoch_loss, current_step, train_state, rng_state = train_loop(
-                print(colored(f"Epoch done on process index {process_index}", PROCESS_COLOR_MAP
-
+                epoch_loss, current_step, train_state, rng_state = train_loop(self.latest_step, None, train_state, rng_state)
+            print(colored(f"Epoch done on process index {process_index}", PROCESS_COLOR_MAP[process_index]))
+
+            self.latest_step = current_step
             end_time = time.time()
             self.state = train_state
             self.rngstate = rng_state
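The loop's bookkeeping is now step-based: the epoch is derived from the global step, which makes resuming mid-training trivial. The arithmetic, with illustrative numbers:

```python
steps_per_epoch = 1000
epochs = 5
latest_step = 2500                               # e.g. restored from a checkpoint
current_epoch = latest_step // steps_per_epoch   # -> 2, resume in epoch 2
total_steps = epochs * steps_per_epoch           # loop runs while latest_step < total_steps
```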
@@ -402,7 +422,7 @@ class SimpleTrainer:
             if avg_loss < self.best_loss:
                 self.best_loss = avg_loss
                 self.best_state = train_state
-                self.save(current_epoch)
+                self.save(current_epoch, current_step)

             if process_index == 0:
                 if self.wandb is not None:
@@ -415,4 +435,4 @@ class SimpleTrainer:
                 }, step=current_step)
                 print(colored(f"\n\tEpoch {current_epoch} completed. Avg Loss: {avg_loss}, Time: {total_time:.2f}s, Best Loss: {self.best_loss}", 'green'))
         self.save(epochs)
-        return self.state
+        return self.state
{flaxdiff-0.1.7.dist-info → flaxdiff-0.1.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flaxdiff
-Version: 0.1.7
+Version: 0.1.9
 Summary: A versatile and easy to understand Diffusion library
 Author: Ashish Kumar Singh
 Author-email: ashishkmr472@gmail.com
@@ -234,6 +234,23 @@ plotImages(samples, dpi=300)

 ## Gallery

+### Images generated by Euler Ancestral Sampler in 200 Steps [text2image with CFG]
+Model trained on Laion-Aesthetics 12M + CC12M + MS COCO + 1M aesthetic 6+ subset of COYO-700M on TPU-v4-32:
+`a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a big mansion with a garden, a big mansion with a garden, a big mansion with a garden, a big mansion with a garden, a big mansion with a garden, a big mansion with a garden, a big mansion with a garden, a big mansion with a garden`
+
+**Params**:
+`Dataset: Laion-Aesthetics 12M + CC12M + MS COCO + 1M aesthetic 6+ subset of COYO-700M`
+`Batch size: 256`
+`Image Size: 128`
+`Training Epochs: 5`
+`Steps per epoch: 74573`
+`Model Configurations: feature_depths=[128, 256, 512, 1024]`
+
+`Training Noise Schedule: EDMNoiseScheduler`
+`Inference Noise Schedule: KarrasEDMPredictor`
+
+![EulerA with CFG](images/medium_epoch5.png)
+
 ### Images generated by Euler Ancestral Sampler in 200 Steps [text2image with CFG]
 Images generated by the following prompts using classifier free guidance with guidance factor = 2:
 `'water tulip, a water lily, a water lily, a water lily, a photo of a marigold, a water lily, a water lily, a photo of a lotus, a photo of a lotus, a photo of a lotus, a photo of a rose, a photo of a rose, a photo of a rose, a photo of a rose, a photo of a rose'`
{flaxdiff-0.1.7.dist-info → flaxdiff-0.1.9.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 flaxdiff/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 flaxdiff/utils.py,sha256=B0GcHlzlVYDNEIdh2v5qmP4u0neIT-FqexNohuyuCvg,2452
 flaxdiff/models/__init__.py,sha256=FAivVYXxM2JrCFIXf-C3374RB2Hth25dBrzOeNFhH1U,26
-flaxdiff/models/attention.py,sha256=
-flaxdiff/models/common.py,sha256=
+flaxdiff/models/attention.py,sha256=YyVI3dTAMB8cS8VWHgtIigr2YY-MYfFTlaNDfjNJOCk,12596
+flaxdiff/models/common.py,sha256=nh32GIfgT_vVab4DEFiRAns5WGKbv6L5xNhzzfKKyBs,10590
 flaxdiff/models/favor_fastattn.py,sha256=79Ew1nqarsNLPzZaBSd1ILORzJr74CupYeqGiCQK5E4,27689
-flaxdiff/models/simple_unet.py,sha256=
-flaxdiff/models/simple_vit.py,sha256=
+flaxdiff/models/simple_unet.py,sha256=_elSWNaB3EG-DwnrdIPVPF4OkU0xaa2IJk6OVITOwWM,9691
+flaxdiff/models/simple_vit.py,sha256=xD23i1b7WEvoH4tUMsLyCe9ebDcv-PpaV0Nso38Jlb8,3887
 flaxdiff/models/autoencoder/__init__.py,sha256=qY-7MldZpsfkF-_T2LqlRK7VHbqfmosz0NmvzDlBkOk,78
 flaxdiff/models/autoencoder/autoencoder.py,sha256=27_hYl0yXAdH9Mx4Xu9J79mSNo-FEKr9SxhVaS3ffn4,591
-flaxdiff/models/autoencoder/diffusers.py,sha256=
+flaxdiff/models/autoencoder/diffusers.py,sha256=l4teVksXd9XCCQWcVn9eB820xJyLT8hpg1CXQ_aHZ6M,3611
 flaxdiff/models/autoencoder/simple_autoenc.py,sha256=UXHPgDmwGTnv3Uts6Zj3p9R9nJXnEiEXbllgarwDfXM,805
 flaxdiff/predictors/__init__.py,sha256=SKkYYRF9Wfgk2zhtZw4vCXOdOeRlrm2Mk6cvuaEvAzc,4403
 flaxdiff/samplers/__init__.py,sha256=_S-9TwDeshrI0VmapV-J2hqjTByOa0-oOeUs_IdovjU,285
@@ -30,9 +30,9 @@ flaxdiff/schedulers/linear.py,sha256=6003F5ISq1Wc0h6UAzY95MJgsDIKGMhBzbiVALpea0k
 flaxdiff/schedulers/sqrt.py,sha256=1F84ZgQPuoNMhe6yxGTR2G0h7dPOZtm4UDQOakbSsEU,445
 flaxdiff/trainer/__init__.py,sha256=T-vUVq4zHcMK6kpCsG4Gu8vn71q6lZD-lg-Ul7yKfEk,128
 flaxdiff/trainer/autoencoder_trainer.py,sha256=al7AsZ7yeDMEiDD-gbcXf0ADq_xfk1VMxvg24GfA-XQ,7008
-flaxdiff/trainer/diffusion_trainer.py,sha256=
-flaxdiff/trainer/simple_trainer.py,sha256=
-flaxdiff-0.1.
-flaxdiff-0.1.
-flaxdiff-0.1.
-flaxdiff-0.1.
+flaxdiff/trainer/diffusion_trainer.py,sha256=z-ERdPt8mB6drXXlLjbGpbPreDIQlGmJFPRJhaoEZ1M,9242
+flaxdiff/trainer/simple_trainer.py,sha256=Dv2F7e2PQS_2b972iRr66odCcPPdJ9cZAD5t9LguOiw,18110
+flaxdiff-0.1.9.dist-info/METADATA,sha256=HhZlM5rBZrOSpNhS8KpeBCoXSmbsHy8ZAKY7gj10P0c,22082
+flaxdiff-0.1.9.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+flaxdiff-0.1.9.dist-info/top_level.txt,sha256=-2-nXnfkJgSfkki1tjm5Faw6Dso7vhtdn2szwCdX5CQ,9
+flaxdiff-0.1.9.dist-info/RECORD,,
{flaxdiff-0.1.7.dist-info → flaxdiff-0.1.9.dist-info}/WHEEL
File without changes
{flaxdiff-0.1.7.dist-info → flaxdiff-0.1.9.dist-info}/top_level.txt
File without changes