flaxdiff 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flaxdiff/models/attention.py +57 -115
- flaxdiff/models/common.py +2 -2
- flaxdiff/models/simple_unet.py +7 -15
- flaxdiff/models/simple_vit.py +123 -0
- flaxdiff/trainer/__init__.py +113 -128
- flaxdiff/trainer/simple_trainer.py +323 -0
- {flaxdiff-0.1.1.dist-info → flaxdiff-0.1.4.dist-info}/METADATA +1 -1
- {flaxdiff-0.1.1.dist-info → flaxdiff-0.1.4.dist-info}/RECORD +10 -8
- {flaxdiff-0.1.1.dist-info → flaxdiff-0.1.4.dist-info}/WHEEL +0 -0
- {flaxdiff-0.1.1.dist-info → flaxdiff-0.1.4.dist-info}/top_level.txt +0 -0
flaxdiff/models/attention.py
CHANGED
@@ -11,105 +11,6 @@ import functools
 import math
 from .common import kernel_init
 
-def _query_chunk_attention(query, key, value, precision, key_chunk_size: int = 4096):
-    """Multi-head dot product attention with a limited number of queries."""
-    num_kv, num_heads, k_features = key.shape[-3:]
-    v_features = value.shape[-1]
-    key_chunk_size = min(key_chunk_size, num_kv)
-    query = query / jnp.sqrt(k_features)
-
-    @functools.partial(jax.checkpoint, prevent_cse=False)
-    def summarize_chunk(query, key, value):
-        attn_weights = jnp.einsum("...qhd,...khd->...qhk", query, key, precision=precision)
-
-        max_score = jnp.max(attn_weights, axis=-1, keepdims=True)
-        max_score = jax.lax.stop_gradient(max_score)
-        exp_weights = jnp.exp(attn_weights - max_score)
-
-        exp_values = jnp.einsum("...vhf,...qhv->...qhf", value, exp_weights, precision=precision)
-        max_score = jnp.einsum("...qhk->...qh", max_score)
-
-        return (exp_values, exp_weights.sum(axis=-1), max_score)
-
-    def chunk_scanner(chunk_idx):
-        # julienne key array
-        key_chunk = jax.lax.dynamic_slice(
-            operand=key,
-            start_indices=[0] * (key.ndim - 3) + [chunk_idx, 0, 0],  # [...,k,h,d]
-            slice_sizes=list(key.shape[:-3]) + [key_chunk_size, num_heads, k_features],  # [...,k,h,d]
-        )
-
-        # julienne value array
-        value_chunk = jax.lax.dynamic_slice(
-            operand=value,
-            start_indices=[0] * (value.ndim - 3) + [chunk_idx, 0, 0],  # [...,v,h,d]
-            slice_sizes=list(value.shape[:-3]) + [key_chunk_size, num_heads, v_features],  # [...,v,h,d]
-        )
-
-        return summarize_chunk(query, key_chunk, value_chunk)
-
-    chunk_values, chunk_weights, chunk_max = jax.lax.map(f=chunk_scanner, xs=jnp.arange(0, num_kv, key_chunk_size))
-
-    global_max = jnp.max(chunk_max, axis=0, keepdims=True)
-    max_diffs = jnp.exp(chunk_max - global_max)
-
-    chunk_values *= jnp.expand_dims(max_diffs, axis=-1)
-    chunk_weights *= max_diffs
-
-    all_values = chunk_values.sum(axis=0)
-    all_weights = jnp.expand_dims(chunk_weights, -1).sum(axis=0)
-
-    return all_values / all_weights
-
-
-def jax_memory_efficient_attention(
-    query, key, value, precision=jax.lax.Precision.HIGHEST, query_chunk_size: int = 1024, key_chunk_size: int = 4096
-):
-    r"""
-    Flax Memory-efficient multi-head dot product attention. https://arxiv.org/abs/2112.05682v2
-    https://github.com/AminRezaei0x443/memory-efficient-attention
-
-    Args:
-        query (`jnp.ndarray`): (batch..., query_length, head, query_key_depth_per_head)
-        key (`jnp.ndarray`): (batch..., key_value_length, head, query_key_depth_per_head)
-        value (`jnp.ndarray`): (batch..., key_value_length, head, value_depth_per_head)
-        precision (`jax.lax.Precision`, *optional*, defaults to `jax.lax.Precision.HIGHEST`):
-            numerical precision for computation
-        query_chunk_size (`int`, *optional*, defaults to 1024):
-            chunk size to divide query array value must divide query_length equally without remainder
-        key_chunk_size (`int`, *optional*, defaults to 4096):
-            chunk size to divide key and value array value must divide key_value_length equally without remainder
-
-    Returns:
-        (`jnp.ndarray`) with shape of (batch..., query_length, head, value_depth_per_head)
-    """
-    num_q, num_heads, q_features = query.shape[-3:]
-
-    def chunk_scanner(chunk_idx, _):
-        # julienne query array
-        query_chunk = jax.lax.dynamic_slice(
-            operand=query,
-            start_indices=([0] * (query.ndim - 3)) + [chunk_idx, 0, 0],  # [...,q,h,d]
-            slice_sizes=list(query.shape[:-3]) + [min(query_chunk_size, num_q), num_heads, q_features],  # [...,q,h,d]
-        )
-
-        return (
-            chunk_idx + query_chunk_size,  # unused ignore it
-            _query_chunk_attention(
-                query=query_chunk, key=key, value=value, precision=precision, key_chunk_size=key_chunk_size
-            ),
-        )
-
-    _, res = jax.lax.scan(
-        f=chunk_scanner,
-        init=0,
-        xs=None,
-        length=math.ceil(num_q / query_chunk_size),  # start counter  # stop counter
-    )
-
-    return jnp.concatenate(res, axis=-3)  # fuse the chunked result back
-
-
 class EfficientAttention(nn.Module):
     """
     Based on the pallas attention implementation.
@@ -125,41 +26,77 @@ class EfficientAttention(nn.Module):
     def setup(self):
         inner_dim = self.dim_head * self.heads
         # Weights were exported with old names {to_q, to_k, to_v, to_out}
-
-
-
-
-
-
+        dense = functools.partial(
+            nn.Dense,
+            self.heads * self.dim_head,
+            precision=self.precision,
+            use_bias=self.use_bias,
+            kernel_init=self.kernel_init(),
+            dtype=self.dtype
+        )
+        self.query = dense(name="to_q")
+        self.key = dense(name="to_k")
+        self.value = dense(name="to_v")
+
         self.proj_attn = nn.DenseGeneral(self.query_dim, use_bias=False, precision=self.precision,
                                          kernel_init=self.kernel_init(), dtype=self.dtype, name="to_out_0")
         # self.attnfn = make_fast_generalized_attention(qkv_dim=inner_dim, lax_scan_unroll=16)
+
+    def _reshape_tensor_to_head_dim(self, tensor):
+        batch_size, _, seq_len, dim = tensor.shape
+        head_size = self.heads
+        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
+        tensor = jnp.transpose(tensor, (0, 2, 1, 3))
+        return tensor
+
+    def _reshape_tensor_from_head_dim(self, tensor):
+        batch_size, _, seq_len, dim = tensor.shape
+        head_size = self.heads
+        tensor = jnp.transpose(tensor, (0, 2, 1, 3))
+        tensor = tensor.reshape(batch_size, 1, seq_len, dim * head_size)
+        return tensor
 
     @nn.compact
     def __call__(self, x:jax.Array, context=None):
+        # print(x.shape)
         # x has shape [B, H * W, C]
         context = x if context is None else context
+
+        B, H, W, C = x.shape
+        x = x.reshape((B, 1, H * W, C))
+
+        if len(context.shape) == 4:
+            B, _H, _W, _C = context.shape
+            context = context.reshape((B, 1, _H * _W, _C))
+        else:
+            B, SEQ, _C = context.shape
+            context = context.reshape((B, 1, SEQ, _C))
+
         query = self.query(x)
         key = self.key(context)
         value = self.value(context)
 
-
+        query = self._reshape_tensor_to_head_dim(query)
+        key = self._reshape_tensor_to_head_dim(key)
+        value = self._reshape_tensor_to_head_dim(value)
 
-
-
-        # )
-
-        hidden_states = nn.dot_product_attention(
-            query, key, value, dtype=self.dtype, broadcast_dropout=False, dropout_rng=None, precision=self.precision
+        hidden_states = jax.experimental.pallas.ops.tpu.flash_attention.flash_attention(
+            query, key, value, None
         )
-
-
+
+        hidden_states = self._reshape_tensor_from_head_dim(hidden_states)
+
+
+        # hidden_states = nn.dot_product_attention(
+        #     query, key, value, dtype=self.dtype, broadcast_dropout=False, dropout_rng=None, precision=self.precision
         # )
 
         proj = self.proj_attn(hidden_states)
+
+        proj = proj.reshape((B, H, W, C))
+
         return proj
 
-
 class NormalAttention(nn.Module):
     """
     Simple implementation of the normal attention.
@@ -201,7 +138,11 @@ class NormalAttention(nn.Module):
     @nn.compact
     def __call__(self, x, context=None):
         # x has shape [B, H, W, C]
+        B, H, W, C = x.shape
+        x = x.reshape((B, H*W, C))
         context = x if context is None else context
+        if len(context.shape) == 4:
+            context = context.reshape((B, H*W, C))
         query = self.query(x)
         key = self.key(context)
         value = self.value(context)
@@ -210,6 +151,7 @@ class NormalAttention(nn.Module):
             query, key, value, dtype=self.dtype, broadcast_dropout=False, dropout_rng=None, precision=self.precision
         )
         proj = self.proj_attn(hidden_states)
+        proj = proj.reshape((B, H, W, C))
         return proj
 
 class AttentionBlock(nn.Module):
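Note: the reworked attention modules now accept feature maps in [B, H, W, C] layout and perform the sequence and head reshapes internally before calling the TPU flash-attention kernel. A minimal standalone sketch of that reshape round-trip in plain jax.numpy; the shapes and head count here are illustrative, not values from the package:

    import jax.numpy as jnp

    B, H, W, C, heads = 2, 16, 16, 64, 8
    x = jnp.ones((B, H, W, C))

    # __call__ first flattens the spatial grid into a sequence: [B, 1, H*W, C]
    seq = x.reshape((B, 1, H * W, C))

    # _reshape_tensor_to_head_dim: [B, 1, L, C] -> [B, heads, L, C // heads],
    # the layout handed to the flash_attention call in the diff
    t = seq.reshape(B, H * W, heads, C // heads)
    t = jnp.transpose(t, (0, 2, 1, 3))

    # _reshape_tensor_from_head_dim inverts it back to [B, 1, L, C]
    t = jnp.transpose(t, (0, 2, 1, 3))
    t = t.reshape(B, 1, H * W, C)

    # and the projected output is restored to the feature-map layout
    out = t.reshape((B, H, W, C))
    assert out.shape == x.shape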
flaxdiff/models/common.py
CHANGED
@@ -2,6 +2,6 @@ import jax.numpy as jnp
 from flax import linen as nn
 
 # Kernel initializer to use
-def kernel_init(scale):
+def kernel_init(scale, dtype=jnp.float32):
     scale = max(scale, 1e-10)
-    return nn.initializers.variance_scaling(scale=scale, mode="
+    return nn.initializers.variance_scaling(scale=scale, mode="fan_avg", distribution="truncated_normal", dtype=dtype)
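Note: kernel_init now takes a dtype and forwards it to the variance-scaling initializer, so mixed-precision models can request weights in their compute dtype. A usage sketch; the call-site values are illustrative, only the helper itself comes from the diff:

    import jax
    import jax.numpy as jnp
    from flax import linen as nn

    # the updated helper, as added in this diff
    def kernel_init(scale, dtype=jnp.float32):
        scale = max(scale, 1e-10)
        return nn.initializers.variance_scaling(scale=scale, mode="fan_avg", distribution="truncated_normal", dtype=dtype)

    init = kernel_init(1.0, dtype=jnp.bfloat16)
    w = init(jax.random.key(0), (3, 3, 64, 64))  # e.g. a 3x3 conv kernel
    assert w.dtype == jnp.bfloat16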
flaxdiff/models/simple_unet.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Dict, Callable, Sequence, Any, Union
 import einops
 from .common import kernel_init
 from .attention import TransformerBlock
+
 class WeightStandardizedConv(nn.Module):
     """
     apply weight standardization https://arxiv.org/abs/1903.10520
@@ -243,6 +244,7 @@ def l2norm(t, axis=1, eps=1e-12):
     denom = jnp.clip(jnp.linalg.norm(t, ord=2, axis=axis, keepdims=True), eps)
     out = t/denom
     return (out)
+
 class ResidualBlock(nn.Module):
     conv_type:str
     features:int
@@ -327,7 +329,7 @@ class Unet(nn.Module):
     precision: Any = jax.lax.Precision.HIGH
 
     @nn.compact
-    def __call__(self, x, temb, textcontext
+    def __call__(self, x, temb, textcontext):
         # print("embedding features", self.emb_features)
         temb = FourierEmbedding(features=self.emb_features)(temb)
         temb = TimeProjection(features=self.emb_features)(temb)
@@ -340,7 +342,7 @@ class Unet(nn.Module):
 
         conv_type = up_conv_type = down_conv_type = middle_conv_type = "conv"
         # middle_conv_type = "separable"
-
+
         x = ConvLayer(
             conv_type,
             features=self.feature_depths[0],
@@ -370,18 +372,13 @@ class Unet(nn.Module):
                     precision=self.precision
                 )(x, temb)
                 if attention_config is not None and j == self.num_res_blocks - 1:  # Apply attention only on the last block
-                    B, H, W, _ = x.shape
-                    if H > TS:
-                        padded_context = jnp.pad(textcontext, ((0, 0), (0, H - TS), (0, 0)), mode='constant', constant_values=0).reshape((B, 1, H, TC))
-                    else:
-                        padded_context = None
                     x = TransformerBlock(heads=attention_config['heads'], dtype=attention_config.get('dtype', jnp.float32),
                                          dim_head=dim_in // attention_config['heads'],
                                          use_flash_attention=attention_config.get("flash_attention", True),
                                          use_projection=attention_config.get("use_projection", False),
                                          use_self_and_cross=attention_config.get("use_self_and_cross", True),
                                          precision=attention_config.get("precision", self.precision),
-                                         name=f"down_{i}_attention_{j}")(x,
+                                         name=f"down_{i}_attention_{j}")(x, textcontext)
                 # print("down residual for feature level", i, "is of shape", x.shape, "features", dim_in)
                 downs.append(x)
             if i != len(feature_depths) - 1:
@@ -419,7 +416,7 @@ class Unet(nn.Module):
                                      use_projection=middle_attention.get("use_projection", False),
                                      use_self_and_cross=False,
                                      precision=attention_config.get("precision", self.precision),
-                                     name=f"middle_attention_{j}")(x)
+                                     name=f"middle_attention_{j}")(x, textcontext)
             x = ResidualBlock(
                 middle_conv_type,
                 name=f"middle_res2_{j}",
@@ -454,18 +451,13 @@ class Unet(nn.Module):
                     precision=self.precision
                 )(x, temb)
                 if attention_config is not None and j == self.num_res_blocks - 1:  # Apply attention only on the last block
-                    B, H, W, _ = x.shape
-                    if H > TS:
-                        padded_context = jnp.pad(textcontext, ((0, 0), (0, H - TS), (0, 0)), mode='constant', constant_values=0).reshape((B, 1, H, TC))
-                    else:
-                        padded_context = None
                     x = TransformerBlock(heads=attention_config['heads'], dtype=attention_config.get('dtype', jnp.float32),
                                          dim_head=dim_out // attention_config['heads'],
                                          use_flash_attention=attention_config.get("flash_attention", True),
                                          use_projection=attention_config.get("use_projection", False),
                                          use_self_and_cross=attention_config.get("use_self_and_cross", True),
                                          precision=attention_config.get("precision", self.precision),
-                                         name=f"up_{i}_attention_{j}")(x,
+                                         name=f"up_{i}_attention_{j}")(x, textcontext)
                 # print("Upscaling ", i, x.shape)
             if i != len(feature_depths) - 1:
                 x = Upsample(
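Note: the down, middle, and up attention blocks now receive textcontext directly instead of a locally padded copy. For reference, an illustrative attention_config dict covering the keys the Unet reads in this diff; the values are examples, not package defaults:

    import jax
    import jax.numpy as jnp

    attention_config = {
        "heads": 8,                           # attention_config['heads']
        "dtype": jnp.float32,                 # .get('dtype', jnp.float32)
        "flash_attention": True,              # .get("flash_attention", True)
        "use_projection": False,              # .get("use_projection", False)
        "use_self_and_cross": True,           # .get("use_self_and_cross", True)
        "precision": jax.lax.Precision.HIGH,  # .get("precision", self.precision)
    }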
flaxdiff/models/simple_vit.py
ADDED
@@ -0,0 +1,123 @@
+# simple_vit.py
+
+import jax
+import jax.numpy as jnp
+from flax import linen as nn
+from typing import Callable, Any
+from .simply_unet import FourierEmbedding, TimeProjection, ConvLayer, kernel_init
+from .attention import TransformerBlock
+
+class PatchEmbedding(nn.Module):
+    patch_size: int
+    embedding_dim: int
+    dtype: Any = jnp.float32
+    precision: Any = jax.lax.Precision.HIGH
+
+    @nn.compact
+    def __call__(self, x):
+        batch, height, width, channels = x.shape
+        assert height % self.patch_size == 0 and width % self.patch_size == 0, "Image dimensions must be divisible by patch size"
+
+        x = nn.Conv(features=self.embedding_dim,
+                    kernel_size=(self.patch_size, self.patch_size),
+                    strides=(self.patch_size, self.patch_size),
+                    dtype=self.dtype,
+                    precision=self.precision)(x)
+        x = jnp.reshape(x, (batch, -1, self.embedding_dim))
+        return x
+
+class PositionalEncoding(nn.Module):
+    max_len: int
+    embedding_dim: int
+
+    @nn.compact
+    def __call__(self, x):
+        pe = self.param('pos_encoding',
+                        jax.nn.initializers.zeros,
+                        (1, self.max_len, self.embedding_dim))
+        return x + pe[:, :x.shape[1], :]
+
+class TransformerEncoder(nn.Module):
+    num_layers: int
+    num_heads: int
+    mlp_dim: int
+    dropout_rate: float = 0.1
+    dtype: Any = jnp.float32
+    precision: Any = jax.lax.Precision.HIGH
+
+    @nn.compact
+    def __call__(self, x, training=True):
+        for _ in range(self.num_layers):
+            x = TransformerBlock(
+                heads=self.num_heads,
+                dim_head=x.shape[-1] // self.num_heads,
+                mlp_dim=self.mlp_dim,
+                dropout_rate=self.dropout_rate,
+                dtype=self.dtype,
+                precision=self.precision
+            )(x)
+        return x
+
+class VisionTransformer(nn.Module):
+    patch_size: int = 16
+    embedding_dim: int = 768
+    num_layers: int = 12
+    num_heads: int = 12
+    mlp_dim: int = 3072
+    emb_features: int = 256
+    dropout_rate: float = 0.1
+    dtype: Any = jnp.float32
+    precision: Any = jax.lax.Precision.HIGH
+
+    @nn.compact
+    def __call__(self, x, temb, textcontext=None):
+        # Time embedding
+        temb = FourierEmbedding(features=self.emb_features)(temb)
+        temb = TimeProjection(features=self.emb_features)(temb)
+
+        # Patch embedding
+        x = PatchEmbedding(patch_size=self.patch_size, embedding_dim=self.embedding_dim,
+                           dtype=self.dtype, precision=self.precision)(x)
+
+        # Add positional encoding
+        x = PositionalEncoding(max_len=x.shape[1], embedding_dim=self.embedding_dim)(x)
+
+        # Add time embedding
+        temb = jnp.expand_dims(temb, axis=1)
+        x = jnp.concatenate([x, temb], axis=1)
+
+        # Add text context
+        if textcontext is not None:
+            x = jnp.concatenate([x, textcontext], axis=1)
+
+        # Transformer encoder
+        x = TransformerEncoder(
+            num_layers=self.num_layers,
+            num_heads=self.num_heads,
+            mlp_dim=self.mlp_dim,
+            dropout_rate=self.dropout_rate,
+            dtype=self.dtype,
+            precision=self.precision
+        )(x)
+
+        # Extract the image tokens (exclude time and text embeddings)
+        num_patches = (x.shape[1] - 1 - (0 if textcontext is None else textcontext.shape[1]))
+        x = x[:, :num_patches, :]
+
+        # Reshape to image dimensions
+        batch, _, _ = x.shape
+        height = width = int((num_patches) ** 0.5)
+        x = jnp.reshape(x, (batch, height, width, self.embedding_dim))
+
+        # Final convolution to get the desired output channels
+        x = ConvLayer(
+            conv_type="conv",
+            features=3,
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            kernel_init=kernel_init(0.0),
+            dtype=self.dtype,
+            precision=self.precision
+        )(x)
+
+        return x
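Note: VisionTransformer.__call__ appends one time token and the text tokens to the patch sequence, runs the encoder, then strips the extra tokens before reshaping back to a grid. A pure-jax.numpy sketch of that token bookkeeping; all sizes are illustrative:

    import jax.numpy as jnp

    batch, image_size, patch_size, embed_dim, text_len = 2, 64, 16, 768, 77
    num_patches = (image_size // patch_size) ** 2     # 16 patch tokens

    patches = jnp.ones((batch, num_patches, embed_dim))
    time_token = jnp.ones((batch, 1, embed_dim))      # expand_dims(temb, axis=1)
    text_tokens = jnp.ones((batch, text_len, embed_dim))

    tokens = jnp.concatenate([patches, time_token, text_tokens], axis=1)

    # after the encoder, only the patch tokens are kept
    kept = tokens.shape[1] - 1 - text_len
    image_tokens = tokens[:, :kept, :]
    side = int(kept ** 0.5)                           # 4
    grid = jnp.reshape(image_tokens, (batch, side, side, embed_dim))
    assert grid.shape == (batch, 4, 4, embed_dim)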
flaxdiff/trainer/__init__.py
CHANGED
@@ -17,18 +17,9 @@ from flax.training import orbax_utils
 from ..schedulers import NoiseScheduler
 from ..predictors import DiffusionPredictionTransform, EpsilonPredictionTransform
 
-
-class Metrics(metrics.Collection):
-    loss: metrics.Average.from_output('loss')  # type: ignore
+from .simple_trainer import SimpleTrainer, SimpleTrainState
 
-class
-    model: nn.Module
-    params: dict
-    noise_schedule: NoiseScheduler
-    model_output_transform: DiffusionPredictionTransform
-
-# Define the TrainState with EMA parameters
-class TrainState(train_state.TrainState):
+class TrainState(SimpleTrainState):
     rngs: jax.random.PRNGKey
     ema_params: dict
 
@@ -36,7 +27,7 @@ class TrainState(train_state.TrainState):
         rngs, subkey = jax.random.split(self.rngs)
         return self.replace(rngs=rngs), subkey
 
-    def apply_ema(self, decay: float=0.999):
+    def apply_ema(self, decay: float = 0.999):
         new_ema_params = jax.tree_util.tree_map(
             lambda ema, param: decay * ema + (1 - decay) * param,
             self.ema_params,
@@ -44,141 +35,142 @@ class TrainState(train_state.TrainState):
         )
         return self.replace(ema_params=new_ema_params)
 
-class DiffusionTrainer:
-
-
-
-
-
-
-
-
-    def __init__(self,
-                 model:nn.Module,
+class DiffusionTrainer(SimpleTrainer):
+    noise_schedule: NoiseScheduler
+    model_output_transform: DiffusionPredictionTransform
+    ema_decay: float = 0.999
+
+    def __init__(self,
+                 model: nn.Module,
+                 input_shapes: Dict[str, Tuple[int]],
                  optimizer: optax.GradientTransformation,
-                 noise_schedule:NoiseScheduler,
-                 rngs:jax.random.PRNGKey,
-
-                 name:str="Diffusion",
-
-
-                 model_output_transform:DiffusionPredictionTransform=EpsilonPredictionTransform(),
-                 loss_fn=optax.l2_loss,
+                 noise_schedule: NoiseScheduler,
+                 rngs: jax.random.PRNGKey,
+                 unconditional_prob: float = 0.2,
+                 name: str = "Diffusion",
+                 model_output_transform: DiffusionPredictionTransform = EpsilonPredictionTransform(),
+                 **kwargs
                  ):
-
+        super().__init__(
+            model=model,
+            input_shapes=input_shapes,
+            optimizer=optimizer,
+            rngs=rngs,
+            name=name,
+            **kwargs
+        )
         self.noise_schedule = noise_schedule
-        self.name = name
         self.model_output_transform = model_output_transform
-        self.
-
-
-
+        self.unconditional_prob = unconditional_prob
+
+    def __init_fn(
+        self,
+        optimizer: optax.GradientTransformation,
+        rngs: jax.random.PRNGKey,
+        existing_state: dict = None,
+        existing_best_state: dict = None,
+        model: nn.Module = None,
+        param_transforms: Callable = None
+    ) -> Tuple[TrainState, TrainState]:
+        rngs, subkey = jax.random.split(rngs)
 
-        if
-
+        if existing_state == None:
+            input_vars = self.get_input_ones()
+            params = model.init(subkey, **input_vars)
+            new_state = {"params": params, "ema_params": params}
         else:
-
+            new_state = existing_state
 
-        if train_state == None:
-            self.init_state(optimizer, rngs, params=params, model=model, param_transforms=param_transforms)
-        else:
-            self.state = train_state
-            self.best_state = train_state
-            self.best_loss = 1e9
-
-    def init_state(self,
-                   optimizer: optax.GradientTransformation,
-                   rngs:jax.random.PRNGKey,
-                   params:dict=None,
-                   model:nn.Module=None,
-                   param_transforms:Callable=None,
-                   batch_size=16,
-                   image_size=64
-                   ):
-        inp = jnp.ones((batch_size, image_size, image_size, 3))
-        temb = jnp.ones((batch_size,))
-        rngs, subkey = jax.random.split(rngs)
-        if params == None:
-            params = model.init(subkey, inp, temb)
         if param_transforms is not None:
             params = param_transforms(params)
-
-
+
+        state = TrainState.create(
             apply_fn=model.apply,
-            params=params,
-            ema_params=
+            params=new_state['params'],
+            ema_params=new_state['ema_params'],
             tx=optimizer,
             rngs=rngs,
+            metrics=Metrics.empty()
         )
-
-
-
-
-
-
-
-        return
-
-    def
-        step = self.checkpointer.latest_step()
-        print("Loading model from checkpoint", step)
-        ckpt = self.checkpointer.restore(step)
-        state = ckpt['state']
-        # Convert the state to a TrainState
-        self.best_loss = ckpt['best_loss']
-        print(f"Loaded model from checkpoint at step {step}", ckpt['best_loss'])
-        return state.get('params', None)#, ckpt.get('model', None)
-
-    def save(self, epoch=0, best=False):
-        print(f"Saving model at epoch {epoch}")
-        state = self.best_state if best else self.state
-        # filename = os.path.join(self.checkpoint_path(), f'model_{epoch}' if not best else 'best_model')
-        ckpt = {
-            'model': self.model,
-            'state': state,
-            'best_loss': self.best_loss
-        }
-        save_args = orbax_utils.save_args_from_target(ckpt)
-        self.checkpointer.save(epoch, ckpt, save_kwargs={'save_args': save_args})
-
-    def summary(self, image_size=64):
-        inp = jnp.ones((1, image_size, image_size, 3))
-        temb = jnp.ones((1,))
-        print(self.model.tabulate(jax.random.key(0), inp, temb, console_kwargs={"width": 200, "force_jupyter":True, }))
-
-    def _define_train_step(self):
+
+        if existing_best_state is not None:
+            best_state = state.replace(
+                params=existing_best_state['params'], ema_params=existing_best_state['ema_params'])
+        else:
+            best_state = state
+
+        return state, best_state
+
+    def _define_train_step(self, batch_size, null_labels_seq, text_embedder):
         noise_schedule = self.noise_schedule
         model = self.model
         model_output_transform = self.model_output_transform
         loss_fn = self.loss_fn
-
-
+        unconditional_prob = self.unconditional_prob
+
+        # Determine the number of unconditional samples
+        num_unconditional = int(batch_size * unconditional_prob)
+
+        nS, nC = null_labels_seq.shape
+        null_labels_seq = jnp.broadcast_to(
+            null_labels_seq, (batch_size, nS, nC))
+
+        distributed_training = self.distributed_training
+
+        def train_step(state: TrainState, batch):
             """Train for a single step."""
-            images = batch
-
+            images = batch['image']
+            # normalize image
+            images = (images - 127.5) / 127.5
+
+            output = text_embedder(
+                input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
+            # output = infer(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
+
+            label_seq = output.last_hidden_state
+
+            # Generate random probabilities to decide how much of this batch will be unconditional
+
+            label_seq = jnp.concat(
+                [null_labels_seq[:num_unconditional], label_seq[num_unconditional:]], axis=0)
+
+            noise_level, state = noise_schedule.generate_timesteps(
+                images.shape[0], state)
             state, rngs = state.get_random_key()
-            noise:jax.Array = jax.random.normal(rngs, shape=images.shape)
+            noise: jax.Array = jax.random.normal(rngs, shape=images.shape)
             rates = noise_schedule.get_rates(noise_level)
-            noisy_images, c_in, expected_output = model_output_transform.forward_diffusion(
+            noisy_images, c_in, expected_output = model_output_transform.forward_diffusion(
+                images, noise, rates)
+
             def model_loss(params):
-                preds = model.apply(
-
+                preds = model.apply(
+                    params, *noise_schedule.transform_inputs(noisy_images*c_in, noise_level), label_seq)
+                preds = model_output_transform.pred_transform(
+                    noisy_images, preds, rates)
                 nloss = loss_fn(preds, expected_output)
                 # nloss = jnp.mean(nloss, axis=1)
                 nloss *= noise_schedule.get_weights(noise_level)
                 nloss = jnp.mean(nloss)
                 loss = nloss
                 return loss
+
             loss, grads = jax.value_and_grad(model_loss)(state.params)
-
+            if distributed_training:
+                grads = jax.lax.pmean(grads, "device")
+            state = state.apply_gradients(grads=grads)
             state = state.apply_ema(self.ema_decay)
             return state, loss
+
+        if distributed_training:
+            train_step = jax.pmap(axis_name="device")(train_step)
+        else:
+            train_step = jax.jit(train_step)
+
         return train_step
-
+
     def _define_compute_metrics(self):
         @jax.jit
-        def compute_metrics(state:TrainState, expected, pred):
+        def compute_metrics(state: TrainState, expected, pred):
             loss = jnp.mean(jnp.square(pred - expected))
             metric_updates = state.metrics.single_from_model_output(loss=loss)
             metrics = state.metrics.merge(metric_updates)
@@ -187,20 +179,13 @@ class DiffusionTrainer:
         return compute_metrics
 
     def fit(self, data, steps_per_epoch, epochs):
-
-
-
-
-
-
-
-        epoch_loss = 0
-        with tqdm.tqdm(total=steps_per_epoch, desc=f'\t\tEpoch {epoch+1}', ncols=100, unit='step') as pbar:
-            for i in range(steps_per_epoch):
-                batch = next(data)
-                state, loss = train_step(state, batch)
-                epoch_loss += loss
-                if i % 100 == 0:
+        null_labels_full = data['null_labels_full']
+        local_batch_size = data['local_batch_size']
+        text_embedder = data['model']
+        super().fit(data, steps_per_epoch, epochs, {
+            "batch_size": local_batch_size, "null_labels_seq": null_labels_full, "text_embedder": text_embedder})
+
+
                     pbar.set_postfix(loss=f'{loss:.4f}')
                     pbar.update(100)
         end_time = time.time()
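Note: the new unconditional_prob path replaces the first int(batch_size * unconditional_prob) text embeddings in each batch with a broadcast null embedding, the usual classifier-free-guidance-style label dropping. A standalone sketch of that mixing; the sizes are illustrative, and jnp.concatenate is used here in place of the jnp.concat alias the diff calls:

    import jax.numpy as jnp

    batch_size, seq_len, channels = 8, 77, 768
    unconditional_prob = 0.2
    num_unconditional = int(batch_size * unconditional_prob)  # 1 of 8 samples

    label_seq = jnp.ones((batch_size, seq_len, channels))     # real embeddings
    null_labels_seq = jnp.zeros((seq_len, channels))          # single null row
    null_labels_seq = jnp.broadcast_to(null_labels_seq, (batch_size, seq_len, channels))

    # null rows replace the head of the batch; the rest stay conditional
    mixed = jnp.concatenate(
        [null_labels_seq[:num_unconditional], label_seq[num_unconditional:]], axis=0)
    assert mixed.shape == label_seq.shape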
flaxdiff/trainer/simple_trainer.py
ADDED
@@ -0,0 +1,323 @@
+import orbax.checkpoint
+import tqdm
+from flax import linen as nn
+import jax
+from typing import Callable
+from dataclasses import field
+import jax.numpy as jnp
+from clu import metrics
+from flax.training import train_state  # Useful dataclass to keep train state
+import optax
+from flax import struct                # Flax dataclasses
+import time
+import os
+import orbax
+from flax.training import orbax_utils
+
+@struct.dataclass
+class Metrics(metrics.Collection):
+    accuracy: metrics.Accuracy
+    loss: metrics.Average.from_output('loss')
+
+# Define the TrainState
+class SimpleTrainState(train_state.TrainState):
+    rngs: jax.random.PRNGKey
+    metrics: Metrics
+
+    def get_random_key(self):
+        rngs, subkey = jax.random.split(self.rngs)
+        return self.replace(rngs=rngs), subkey
+
+class SimpleTrainer:
+    state: SimpleTrainState
+    best_state: SimpleTrainState
+    best_loss: float
+    model: nn.Module
+    ema_decay: float = 0.999
+
+    def __init__(self,
+                 model: nn.Module,
+                 input_shapes: Dict[str, Tuple[int]],
+                 optimizer: optax.GradientTransformation,
+                 rngs: jax.random.PRNGKey,
+                 train_state: SimpleTrainState = None,
+                 name: str = "Simple",
+                 load_from_checkpoint: bool = False,
+                 checkpoint_suffix: str = "",
+                 loss_fn=optax.l2_loss,
+                 param_transforms: Callable = None,
+                 wandb_config: Dict[str, Any] = None,
+                 distributed_training: bool = None,
+                 ):
+        if distributed_training is None or distributed_training is True:
+            # Auto-detect if we are running on multiple devices
+            distributed_training = jax.device_count() > 1
+
+        self.distributed_training = distributed_training
+        self.model = model
+        self.name = name
+        self.loss_fn = loss_fn
+        self.input_shapes = input_shapes
+
+        if wandb_config is not None:
+            run = wandb.init(**wandb_config)
+            self.wandb = run
+
+        checkpointer = orbax.checkpoint.PyTreeCheckpointer()
+        options = orbax.checkpoint.CheckpointManagerOptions(
+            max_to_keep=4, create=True)
+        self.checkpointer = orbax.checkpoint.CheckpointManager(
+            self.checkpoint_path() + checkpoint_suffix, checkpointer, options)
+
+        if load_from_checkpoint:
+            latest_epoch, old_state, old_best_state = self.load()
+        else:
+            latest_epoch, old_state, old_best_state = 0, None, None
+
+        self.latest_epoch = latest_epoch
+
+        if train_state == None:
+            self.init_state(optimizer, rngs, existing_state=old_state,
+                            existing_best_state=old_best_state, model=model, param_transforms=param_transforms)
+        else:
+            self.state = train_state
+            self.best_state = train_state
+            self.best_loss = 1e9
+
+    def get_input_ones(self):
+        return {k: jnp.ones((1, *v)) for k, v in self.input_shapes.items()}
+
+    def __init_fn(
+        self,
+        optimizer: optax.GradientTransformation,
+        rngs: jax.random.PRNGKey,
+        existing_state: dict = None,
+        existing_best_state: dict = None,
+        model: nn.Module = None,
+        param_transforms: Callable = None
+    ) -> Tuple[SimpleTrainState, SimpleTrainState]:
+        rngs, subkey = jax.random.split(rngs)
+
+        if existing_state == None:
+            input_vars = self.get_input_ones()
+            params = model.init(subkey, **input_vars)
+
+        state = SimpleTrainState.create(
+            apply_fn=model.apply,
+            params=params,
+            tx=optimizer,
+            rngs=rngs,
+            metrics=Metrics.empty()
+        )
+        if existing_best_state is not None:
+            best_state = state.replace(
+                params=existing_best_state['params'])
+        else:
+            best_state = state
+
+        return state, best_state
+
+    def init_state(
+        self,
+        optimizer: optax.GradientTransformation,
+        rngs: jax.random.PRNGKey,
+        existing_state: dict = None,
+        existing_best_state: dict = None,
+        model: nn.Module = None,
+        param_transforms: Callable = None
+    ):
+
+        state, best_state = self.__init_fn(
+            optimizer, rngs, existing_state, existing_best_state, model, param_transforms
+        )
+        self.best_loss = 1e9
+
+        if self.distributed_training:
+            devices = jax.local_devices()
+            if len(devices) > 1:
+                print("Replicating state across devices ", devices)
+                state = flax.jax_utils.replicate(state, devices)
+                best_state = flax.jax_utils.replicate(best_state, devices)
+            else:
+                print("Not replicating any state, Only single device connected to the process")
+
+        self.state = state
+        self.best_state = best_state
+
+    def get_state(self):
+        return flax.jax_utils.unreplicate(self.state)
+
+    def get_best_state(self):
+        return flax.jax_utils.unreplicate(self.best_state)
+
+    def checkpoint_path(self):
+        experiment_name = self.name
+        path = os.path.join(os.path.abspath('./checkpoints'), experiment_name)
+        if not os.path.exists(path):
+            os.makedirs(path)
+        return path
+
+    def tensorboard_path(self):
+        experiment_name = self.name
+        path = os.path.join(os.path.abspath('./tensorboard'), experiment_name)
+        if not os.path.exists(path):
+            os.makedirs(path)
+        return path
+
+    def load(self):
+        epoch = self.checkpointer.latest_step()
+        print("Loading model from checkpoint", epoch)
+        ckpt = self.checkpointer.restore(epoch)
+        state = ckpt['state']
+        best_state = ckpt['best_state']
+        # Convert the state to a TrainState
+        self.best_loss = ckpt['best_loss']
+        print(
+            f"Loaded model from checkpoint at epoch {epoch}", ckpt['best_loss'])
+        return epoch, state, best_state
+
+    def save(self, epoch=0):
+        print(f"Saving model at epoch {epoch}")
+        ckpt = {
+            # 'model': self.model,
+            'state': self.get_state(),
+            'best_state': self.get_best_state(),
+            'best_loss': self.best_loss
+        }
+        try:
+            save_args = orbax_utils.save_args_from_target(ckpt)
+            self.checkpointer.save(epoch, ckpt, save_kwargs={
+                                   'save_args': save_args}, force=True)
+            pass
+        except Exception as e:
+            print("Error saving checkpoint", e)
+
+    def _define_train_step(self, **kwargs):
+        model = self.model
+        loss_fn = self.loss_fn
+        distributed_training = self.distributed_training
+
+        def train_step(state: SimpleTrainState, batch):
+            """Train for a single step."""
+            images = batch['image']
+            labels = batch['label']
+
+            def model_loss(params):
+                preds = model.apply(params, images)
+                expected_output = labels
+                nloss = loss_fn(preds, expected_output)
+                loss = jnp.mean(nloss)
+                return loss
+            loss, grads = jax.value_and_grad(model_loss)(state.params)
+            if distributed_training:
+                grads = jax.lax.pmean(grads, "device")
+            state = state.apply_gradients(grads=grads)
+            return state, loss
+
+        if distributed_training:
+            train_step = jax.pmap(axis_name="device")(train_step)
+        else:
+            train_step = jax.jit(train_step)
+
+        return train_step
+
+    def _define_compute_metrics(self):
+        model = self.model
+        loss_fn = self.loss_fn
+
+        @jax.jit
+        def compute_metrics(state: SimpleTrainState, batch):
+            preds = model.apply(state.params, batch['image'])
+            expected_output = batch['label']
+            loss = jnp.mean(loss_fn(preds, expected_output))
+            metric_updates = state.metrics.single_from_model_output(
+                loss=loss, logits=preds, labels=expected_output)
+            metrics = state.metrics.merge(metric_updates)
+            state = state.replace(metrics=metrics)
+            return state
+        return compute_metrics
+
+    def summary(self):
+        input_vars = self.get_input_ones()
+        print(self.model.tabulate(jax.random.key(0), **input_vars,
+                                  console_kwargs={"width": 200, "force_jupyter": True, }))
+
+    def config(self):
+        return {
+            "model": self.model,
+            "state": self.state,
+            "name": self.name,
+            "input_shapes": self.input_shapes
+        }
+
+    def init_tensorboard(self, batch_size, steps_per_epoch, epochs):
+        summary_writer = tensorboard.SummaryWriter(self.tensorboard_path())
+        summary_writer.hparams({
+            **self.config(),
+            "steps_per_epoch": steps_per_epoch,
+            "epochs": epochs,
+            "batch_size": batch_size
+        })
+        return summary_writer
+
+    def fit(self, data, steps_per_epoch, epochs, train_step_args={}):
+        train_ds = iter(data['train']())
+        if 'test' in data:
+            test_ds = data['test']
+        else:
+            test_ds = None
+        train_step = self._define_train_step(**train_step_args)
+        compute_metrics = self._define_compute_metrics()
+        state = self.state
+        device_count = jax.local_device_count()
+        # train_ds = flax.jax_utils.prefetch_to_device(train_ds, jax.devices())
+
+        summary_writer = self.init_tensorboard(
+            data['global_batch_size'], steps_per_epoch, epochs)
+
+        while self.latest_epoch <= epochs:
+            self.latest_epoch += 1
+            current_epoch = self.latest_epoch
+            print(f"\nEpoch {current_epoch}/{epochs}")
+            start_time = time.time()
+            epoch_loss = 0
+
+            with tqdm.tqdm(total=steps_per_epoch, desc=f'\t\tEpoch {current_epoch}', ncols=100, unit='step') as pbar:
+                for i in range(steps_per_epoch):
+                    batch = next(train_ds)
+                    if self.distributed_training and device_count > 1:
+                        batch = jax.tree.map(lambda x: x.reshape(
+                            (device_count, -1, *x.shape[1:])), batch)
+
+                    state, loss = train_step(state, batch)
+                    loss = jnp.mean(loss)
+
+                    epoch_loss += loss
+                    if i % 100 == 0:
+                        pbar.set_postfix(loss=f'{loss:.4f}')
+                        pbar.update(100)
+                        current_step = current_epoch*steps_per_epoch + i
+                        summary_writer.scalar(
+                            'Train Loss', loss, step=current_step)
+                        if self.wandb is not None:
+                            self.wandb.log({"train/loss": loss})
+
+            print(f"\n\tEpoch done")
+            end_time = time.time()
+            self.state = state
+            total_time = end_time - start_time
+            avg_time_per_step = total_time / steps_per_epoch
+            avg_loss = epoch_loss / steps_per_epoch
+            if avg_loss < self.best_loss:
+                self.best_loss = avg_loss
+                self.best_state = state
+                self.save(current_epoch)
+
+            # Compute Metrics
+            metrics_str = ''
+
+            print(
+                f"\n\tEpoch {current_epoch} completed. Avg Loss: {avg_loss}, Time: {total_time:.2f}s, Best Loss: {self.best_loss} {metrics_str}")
+
+        self.save(epochs)
+        return self.state
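Note: when SimpleTrainer runs distributed, fit reshapes each batch so the leading axis becomes (device_count, per_device_batch) before handing it to the pmap'ed train step. A standalone sketch of that sharding; the batch contents here are illustrative:

    import jax
    import jax.numpy as jnp

    device_count = jax.local_device_count()
    global_batch = device_count * 4
    batch = {"image": jnp.ones((global_batch, 32, 32, 3)),
             "label": jnp.ones((global_batch, 10))}

    # split the leading batch axis across devices, as fit does before pmap
    sharded = jax.tree.map(
        lambda x: x.reshape((device_count, -1, *x.shape[1:])), batch)
    assert sharded["image"].shape == (device_count, 4, 32, 32, 3)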
{flaxdiff-0.1.1.dist-info → flaxdiff-0.1.4.dist-info}/RECORD
CHANGED
@@ -1,10 +1,11 @@
 flaxdiff/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 flaxdiff/utils.py,sha256=B0GcHlzlVYDNEIdh2v5qmP4u0neIT-FqexNohuyuCvg,2452
 flaxdiff/models/__init__.py,sha256=FAivVYXxM2JrCFIXf-C3374RB2Hth25dBrzOeNFhH1U,26
-flaxdiff/models/attention.py,sha256=
-flaxdiff/models/common.py,sha256=
+flaxdiff/models/attention.py,sha256=SL9cvINjmabW1LPvXLAFZNHv-FF1Ez_d3J7n5uHBTyQ,15301
+flaxdiff/models/common.py,sha256=CjC4iRLjkF3oQ0f6rAqfiLaiHllZGtCOwN3rXDUndbE,274
 flaxdiff/models/favor_fastattn.py,sha256=79Ew1nqarsNLPzZaBSd1ILORzJr74CupYeqGiCQK5E4,27689
-flaxdiff/models/simple_unet.py,sha256=
+flaxdiff/models/simple_unet.py,sha256=WlLry6v18syHBzcN8zAJ-zIVtq6ItMEIBWbeCcX0MLU,18693
+flaxdiff/models/simple_vit.py,sha256=vTu2CQRoSOxetBHTrnCWddm-vxrZDkMe8EpdNxtpJMk,4015
 flaxdiff/predictors/__init__.py,sha256=SKkYYRF9Wfgk2zhtZw4vCXOdOeRlrm2Mk6cvuaEvAzc,4403
 flaxdiff/samplers/__init__.py,sha256=_S-9TwDeshrI0VmapV-J2hqjTByOa0-oOeUs_IdovjU,285
 flaxdiff/samplers/common.py,sha256=_an5h5Niz9Joz_-ppridLrGHpu8X0VVvhNGknPu6AUY,5272
@@ -23,8 +24,9 @@ flaxdiff/schedulers/exp.py,sha256=cPTnUJpYdzJRRZqMLYQz0rRUCpEmaP2tXhRumLx94jA,60
 flaxdiff/schedulers/karras.py,sha256=4GN120kGwdxxU-h2mVdhBVy9IORkUMm_vvz3XjthBcI,3355
 flaxdiff/schedulers/linear.py,sha256=6003F5ISq1Wc0h6UAzY95MJgsDIKGMhBzbiVALpea0k,581
 flaxdiff/schedulers/sqrt.py,sha256=1F84ZgQPuoNMhe6yxGTR2G0h7dPOZtm4UDQOakbSsEU,445
-flaxdiff/trainer/__init__.py,sha256=
-flaxdiff
-flaxdiff-0.1.
-flaxdiff-0.1.
-flaxdiff-0.1.
+flaxdiff/trainer/__init__.py,sha256=kwzkm-BD97hffFIXZUP1Hb3_D85fZ4SRNO7bviEwHU8,7591
+flaxdiff/trainer/simple_trainer.py,sha256=jafxr-yZ6FXn0Qi-iTSnlf275QWnIO4GnSvNAeB3H-Q,11651
+flaxdiff-0.1.4.dist-info/METADATA,sha256=G8OijdrrYWuKyAfCNtD_dKwdfBmdME56vpR-EYIZKXg,19229
+flaxdiff-0.1.4.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+flaxdiff-0.1.4.dist-info/top_level.txt,sha256=-2-nXnfkJgSfkki1tjm5Faw6Dso7vhtdn2szwCdX5CQ,9
+flaxdiff-0.1.4.dist-info/RECORD,,
{flaxdiff-0.1.1.dist-info → flaxdiff-0.1.4.dist-info}/WHEEL
File without changes
{flaxdiff-0.1.1.dist-info → flaxdiff-0.1.4.dist-info}/top_level.txt
File without changes