flaxdiff 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,8 @@ Some Code ported from https://github.com/huggingface/diffusers/blob/main/src/dif
 import jax
 import jax.numpy as jnp
 from flax import linen as nn
-from typing import Dict, Callable, Sequence, Any, Union
+from typing import Dict, Callable, Sequence, Any, Union, Tuple, Optional
+from flax.typing import Dtype, PrecisionLike
 import einops
 import functools
 import math
@@ -18,8 +19,8 @@ class EfficientAttention(nn.Module):
     query_dim: int
     heads: int = 4
     dim_head: int = 64
-    dtype: Any = jnp.float32
-    precision: Any = jax.lax.Precision.HIGHEST
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     use_bias: bool = True
     kernel_init: Callable = lambda : kernel_init(1.0)
 
@@ -62,8 +63,13 @@ class EfficientAttention(nn.Module):
         # x has shape [B, H * W, C]
         context = x if context is None else context
 
-        B, H, W, C = x.shape
-        x = x.reshape((B, 1, H * W, C))
+        orig_x_shape = x.shape
+        if len(x.shape) == 4:
+            B, H, W, C = x.shape
+            x = x.reshape((B, 1, H * W, C))
+        else:
+            B, SEQ, C = x.shape
+            x = x.reshape((B, 1, SEQ, C))
 
         if len(context.shape) == 4:
             B, _H, _W, _C = context.shape
@@ -93,7 +99,7 @@ class EfficientAttention(nn.Module):
 
         proj = self.proj_attn(hidden_states)
 
-        proj = proj.reshape((B, H, W, C))
+        proj = proj.reshape(orig_x_shape)
 
         return proj
 
@@ -104,8 +110,8 @@ class NormalAttention(nn.Module):
     query_dim: int
     heads: int = 4
     dim_head: int = 64
-    dtype: Any = jnp.float32
-    precision: Any = jax.lax.Precision.HIGHEST
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     use_bias: bool = True
     kernel_init: Callable = lambda : kernel_init(1.0)
 
@@ -138,8 +144,10 @@ class NormalAttention(nn.Module):
     @nn.compact
     def __call__(self, x, context=None):
         # x has shape [B, H, W, C]
-        B, H, W, C = x.shape
-        x = x.reshape((B, H*W, C))
+        orig_x_shape = x.shape
+        if len(x.shape) == 4:
+            B, H, W, C = x.shape
+            x = x.reshape((B, H*W, C))
         context = x if context is None else context
         if len(context.shape) == 4:
             context = context.reshape((B, H*W, C))
@@ -151,16 +159,16 @@ class NormalAttention(nn.Module):
             query, key, value, dtype=self.dtype, broadcast_dropout=False, dropout_rng=None, precision=self.precision
         )
         proj = self.proj_attn(hidden_states)
-        proj = proj.reshape((B, H, W, C))
+        proj = proj.reshape(orig_x_shape)
         return proj
-
-class AttentionBlock(nn.Module):
+
+class BasicTransformerBlock(nn.Module):
     # Has self and cross attention
     query_dim: int
     heads: int = 4
     dim_head: int = 64
-    dtype: Any = jnp.float32
-    precision: Any = jax.lax.Precision.HIGHEST
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
     use_bias: bool = True
     kernel_init: Callable = lambda : kernel_init(1.0)
     use_flash_attention:bool = False
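
Note on the hunks above: the attention modules' dtype and precision defaults move from hard-coded jnp.float32 / jax.lax.Precision.HIGHEST to None, so callers now choose them explicitly (None falls back to the Flax/JAX defaults), and inputs may be either [B, H, W, C] or [B, SEQ, C]. A minimal usage sketch follows; the flaxdiff.models.attention import path, shapes, and parameter values are assumptions for illustration, not taken from this diff.

# Hypothetical usage sketch; import path, shapes, and values are assumptions.
import jax
import jax.numpy as jnp
from flaxdiff.models.attention import NormalAttention

attn = NormalAttention(
    query_dim=64,
    heads=4,
    dim_head=16,
    dtype=jnp.bfloat16,                   # 0.1.4 fixed this to jnp.float32
    precision=jax.lax.Precision.HIGH,     # 0.1.4 fixed this to Precision.HIGHEST
)
x = jnp.ones((1, 16, 16, 64))             # [B, H, W, C]; 0.1.6 also accepts [B, SEQ, C]
params = attn.init(jax.random.PRNGKey(0), x)
y = attn.apply(params, x)                 # output is reshaped back to x's original shape
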
@@ -193,129 +201,26 @@ class AttentionBlock(nn.Module):
             kernel_init=self.kernel_init
         )
 
-        self.ff = nn.DenseGeneral(
-            features=self.query_dim,
-            use_bias=self.use_bias,
-            precision=self.precision,
-            dtype=self.dtype,
-            kernel_init=self.kernel_init(),
-            name="ff"
-        )
+        self.ff = FlaxFeedForward(dim=self.query_dim)
         self.norm1 = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)
         self.norm2 = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)
         self.norm3 = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)
-        self.norm4 = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)
 
     @nn.compact
     def __call__(self, hidden_states, context=None):
         # self attention
-        residual = hidden_states
-        hidden_states = self.norm1(hidden_states)
-        if self.use_cross_only:
-            hidden_states = self.attention1(hidden_states, context)
-        else:
-            hidden_states = self.attention1(hidden_states)
-        hidden_states = hidden_states + residual
+        if not self.use_cross_only:
+            print("Using self attention")
+            hidden_states = hidden_states + self.attention1(self.norm1(hidden_states))
 
         # cross attention
-        residual = hidden_states
-        hidden_states = self.norm2(hidden_states)
-        hidden_states = self.attention2(hidden_states, context)
-        hidden_states = hidden_states + residual
+        hidden_states = hidden_states + self.attention2(self.norm2(hidden_states), context)
 
         # feed forward
-        residual = hidden_states
-        hidden_states = self.norm3(hidden_states)
-        hidden_states = nn.gelu(hidden_states)
-        hidden_states = self.ff(hidden_states)
-        hidden_states = hidden_states + residual
+        hidden_states = hidden_states + self.ff(self.norm3(hidden_states))
 
         return hidden_states
 
-class TransformerBlock(nn.Module):
-    heads: int = 4
-    dim_head: int = 32
-    use_linear_attention: bool = True
-    dtype: Any = jnp.float32
-    precision: Any = jax.lax.Precision.HIGH
-    use_projection: bool = False
-    use_flash_attention:bool = True
-    use_self_and_cross:bool = False
-
-    @nn.compact
-    def __call__(self, x, context=None):
-        inner_dim = self.heads * self.dim_head
-        B, H, W, C = x.shape
-        normed_x = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)(x)
-        if self.use_projection == True:
-            if self.use_linear_attention:
-                projected_x = nn.Dense(features=inner_dim,
-                                       use_bias=False, precision=self.precision,
-                                       kernel_init=kernel_init(1.0),
-                                       dtype=self.dtype, name=f'project_in')(normed_x)
-            else:
-                projected_x = nn.Conv(
-                    features=inner_dim, kernel_size=(1, 1),
-                    kernel_init=kernel_init(1.0),
-                    strides=(1, 1), padding='VALID', use_bias=False, dtype=self.dtype,
-                    precision=self.precision, name=f'project_in_conv',
-                )(normed_x)
-        else:
-            projected_x = normed_x
-            inner_dim = C
-
-        context = projected_x if context is None else context
-
-        if self.use_self_and_cross:
-            projected_x = AttentionBlock(
-                query_dim=inner_dim,
-                heads=self.heads,
-                dim_head=self.dim_head,
-                name=f'Attention',
-                precision=self.precision,
-                use_bias=False,
-                dtype=self.dtype,
-                use_flash_attention=self.use_flash_attention,
-                use_cross_only=False
-            )(projected_x, context)
-        elif self.use_flash_attention == True:
-            projected_x = EfficientAttention(
-                query_dim=inner_dim,
-                heads=self.heads,
-                dim_head=self.dim_head,
-                name=f'Attention',
-                precision=self.precision,
-                use_bias=False,
-                dtype=self.dtype,
-            )(projected_x, context)
-        else:
-            projected_x = NormalAttention(
-                query_dim=inner_dim,
-                heads=self.heads,
-                dim_head=self.dim_head,
-                name=f'Attention',
-                precision=self.precision,
-                use_bias=False,
-            )(projected_x, context)
-
-
-        if self.use_projection == True:
-            if self.use_linear_attention:
-                projected_x = nn.Dense(features=C, precision=self.precision,
-                                       dtype=self.dtype, use_bias=False,
-                                       kernel_init=kernel_init(1.0),
-                                       name=f'project_out')(projected_x)
-            else:
-                projected_x = nn.Conv(
-                    features=C, kernel_size=(1, 1),
-                    kernel_init=kernel_init(1.0),
-                    strides=(1, 1), padding='VALID', use_bias=False, dtype=self.dtype,
-                    precision=self.precision, name=f'project_out_conv',
-                )(projected_x)
-
-        out = x + projected_x
-        return out
-
 class FlaxGEGLU(nn.Module):
     r"""
     Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
@@ -333,10 +238,11 @@ class FlaxGEGLU(nn.Module):
     dim: int
     dropout: float = 0.0
     dtype: jnp.dtype = jnp.float32
+    precision: Any = jax.lax.Precision.DEFAULT
 
     def setup(self):
        inner_dim = self.dim * 4
-        self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype, precision=jax.lax.Precision.DEFAULT)
+        self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype, precision=self.precision)
 
     def __call__(self, hidden_states):
         hidden_states = self.proj(hidden_states)
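
The only functional change to FlaxGEGLU is that the projection's precision is now configurable through the new module field. For context, GEGLU projects the hidden states to twice the inner width and gates one half with GELU; the reference sketch below illustrates that computation. The function name and the split axis are assumptions for illustration, not code from this package.

# Reference GEGLU sketch (illustration only, not the package's exact implementation).
import jax.numpy as jnp
from flax import linen as nn

def geglu(projected):
    # projected: [..., 2 * inner_dim], i.e. the output of self.proj above
    linear_part, gate = jnp.split(projected, 2, axis=-1)
    return linear_part * nn.gelu(gate)
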
@@ -362,14 +268,14 @@ class FlaxFeedForward(nn.Module):
     """
 
     dim: int
-    dropout: float = 0.0
     dtype: jnp.dtype = jnp.float32
+    precision: Any = jax.lax.Precision.DEFAULT
 
     def setup(self):
         # The second linear layer needs to be called
         # net_2 for now to match the index of the Sequential layer
-        self.net_0 = FlaxGEGLU(self.dim, self.dtype)
-        self.net_2 = nn.Dense(self.dim, dtype=self.dtype, precision=jax.lax.Precision.DEFAULT)
+        self.net_0 = FlaxGEGLU(self.dim, self.dtype, precision=self.precision)
+        self.net_2 = nn.Dense(self.dim, dtype=self.dtype, precision=self.precision)
 
     def __call__(self, hidden_states):
         hidden_states = self.net_0(hidden_states)
@@ -377,55 +283,127 @@ class FlaxFeedForward(nn.Module):
         return hidden_states
 
 class BasicTransformerBlock(nn.Module):
+    # Has self and cross attention
     query_dim: int
-    heads: int
-    dim_head: int
-    dropout: float = 0.0
-    only_cross_attention: bool = False
-    dtype: jnp.dtype = jnp.float32
-    use_memory_efficient_attention: bool = False
-    split_head_dim: bool = False
-    precision: Any = jax.lax.Precision.DEFAULT
-
+    heads: int = 4
+    dim_head: int = 64
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
+    use_bias: bool = True
+    kernel_init: Callable = lambda : kernel_init(1.0)
+    use_flash_attention:bool = False
+    use_cross_only:bool = False
+    only_pure_attention:bool = False
+
     def setup(self):
-        # self attention (or cross_attention if only_cross_attention is True)
-        self.attn1 = NormalAttention(
-            query_dim=self.query_dim,
+        if self.use_flash_attention:
+            attenBlock = EfficientAttention
+        else:
+            attenBlock = NormalAttention
+
+        self.attention1 = attenBlock(
+            query_dim=self.query_dim,
             heads=self.heads,
             dim_head=self.dim_head,
-            dtype=self.dtype,
+            name=f'Attention1',
             precision=self.precision,
+            use_bias=self.use_bias,
+            dtype=self.dtype,
+            kernel_init=self.kernel_init
         )
-        # cross attention
-        self.attn2 = NormalAttention(
+        self.attention2 = attenBlock(
             query_dim=self.query_dim,
             heads=self.heads,
             dim_head=self.dim_head,
-            dtype=self.dtype,
+            name=f'Attention2',
             precision=self.precision,
+            use_bias=self.use_bias,
+            dtype=self.dtype,
+            kernel_init=self.kernel_init
         )
-        self.ff = FlaxFeedForward(dim=self.query_dim, dropout=self.dropout, dtype=self.dtype)
+
+        self.ff = FlaxFeedForward(dim=self.query_dim)
         self.norm1 = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)
         self.norm2 = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)
         self.norm3 = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)
-
-    def __call__(self, hidden_states, context, deterministic=True):
+
+    @nn.compact
+    def __call__(self, hidden_states, context=None):
+        if self.only_pure_attention:
+            return self.attention2(self.norm2(hidden_states), context)
+
         # self attention
-        residual = hidden_states
-        if self.only_cross_attention:
-            hidden_states = self.attn1(self.norm1(hidden_states), context)
-        else:
-            hidden_states = self.attn1(self.norm1(hidden_states))
-        hidden_states = hidden_states + residual
-
+        if not self.use_cross_only:
+            hidden_states = hidden_states + self.attention1(self.norm1(hidden_states))
+
         # cross attention
-        residual = hidden_states
-        hidden_states = self.attn2(self.norm2(hidden_states), context)
-        hidden_states = hidden_states + residual
-
+        hidden_states = hidden_states + self.attention2(self.norm2(hidden_states), context)
         # feed forward
-        residual = hidden_states
-        hidden_states = self.ff(self.norm3(hidden_states))
-        hidden_states = hidden_states + residual
+        hidden_states = hidden_states + self.ff(self.norm3(hidden_states))
+
+        return hidden_states
+
+class TransformerBlock(nn.Module):
+    heads: int = 4
+    dim_head: int = 32
+    use_linear_attention: bool = True
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
+    use_projection: bool = False
+    use_flash_attention:bool = True
+    use_self_and_cross:bool = False
+    only_pure_attention:bool = False
+
+    @nn.compact
+    def __call__(self, x, context=None):
+        inner_dim = self.heads * self.dim_head
+        B, H, W, C = x.shape
+        normed_x = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)(x)
+        if self.use_projection == True:
+            if self.use_linear_attention:
+                projected_x = nn.Dense(features=inner_dim,
+                                       use_bias=False, precision=self.precision,
+                                       kernel_init=kernel_init(1.0),
+                                       dtype=self.dtype, name=f'project_in')(normed_x)
+            else:
+                projected_x = nn.Conv(
+                    features=inner_dim, kernel_size=(1, 1),
+                    kernel_init=kernel_init(1.0),
+                    strides=(1, 1), padding='VALID', use_bias=False, dtype=self.dtype,
+                    precision=self.precision, name=f'project_in_conv',
+                )(normed_x)
+        else:
+            projected_x = normed_x
+            inner_dim = C
+
+        context = projected_x if context is None else context
 
-        return hidden_states
+        projected_x = BasicTransformerBlock(
+            query_dim=inner_dim,
+            heads=self.heads,
+            dim_head=self.dim_head,
+            name=f'Attention',
+            precision=self.precision,
+            use_bias=False,
+            dtype=self.dtype,
+            use_flash_attention=self.use_flash_attention,
+            use_cross_only=(not self.use_self_and_cross),
+            only_pure_attention=self.only_pure_attention
+        )(projected_x, context)
+
+        if self.use_projection == True:
+            if self.use_linear_attention:
+                projected_x = nn.Dense(features=C, precision=self.precision,
+                                       dtype=self.dtype, use_bias=False,
+                                       kernel_init=kernel_init(1.0),
+                                       name=f'project_out')(projected_x)
+            else:
+                projected_x = nn.Conv(
+                    features=C, kernel_size=(1, 1),
+                    kernel_init=kernel_init(1.0),
+                    strides=(1, 1), padding='VALID', use_bias=False, dtype=self.dtype,
+                    precision=self.precision, name=f'project_out_conv',
+                )(projected_x)
+
+        out = x + projected_x
+        return out
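
The rewritten TransformerBlock above delegates every attention variant to BasicTransformerBlock, mapping use_self_and_cross to use_cross_only=(not use_self_and_cross) and forwarding the new only_pure_attention flag. A hedged usage sketch of the 0.1.6 API follows; the import path, shapes, and context width are assumptions for illustration.

# Hypothetical usage sketch; import path, shapes, and context width are assumptions.
import jax
import jax.numpy as jnp
from flaxdiff.models.attention import TransformerBlock

block = TransformerBlock(
    heads=8,
    dim_head=32,
    use_projection=True,         # project C -> heads * dim_head and back
    use_flash_attention=False,   # selects NormalAttention inside BasicTransformerBlock
    use_self_and_cross=True,     # becomes use_cross_only=False internally
    only_pure_attention=False,
    dtype=jnp.float32,
)
x = jnp.ones((1, 16, 16, 256))         # [B, H, W, C]
context = jnp.ones((1, 77, 256))       # e.g. text embeddings [B, SEQ, C]
params = block.init(jax.random.PRNGKey(0), x, context)
y = block.apply(params, x, context)    # residual output, same shape as x
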
@@ -0,0 +1,2 @@
+from .autoencoder import AutoEncoder
+from .diffusers import StableDiffusionVAE
@@ -0,0 +1,19 @@
+import jax
+import jax.numpy as jnp
+from flax import linen as nn
+from typing import Dict, Callable, Sequence, Any, Union
+import einops
+from ..common import kernel_init, ConvLayer, Upsample, Downsample, PixelShuffle
+
+
+class AutoEncoder():
+    def encode(self, x: jnp.ndarray, **kwargs) -> jnp.ndarray:
+        raise NotImplementedError
+
+    def decode(self, z: jnp.ndarray, **kwargs) -> jnp.ndarray:
+        raise NotImplementedError
+
+    def __call__(self, x: jnp.ndarray):
+        latents = self.encode(x)
+        reconstructions = self.decode(latents)
+        return reconstructions
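
The new AutoEncoder base class added above is a plain Python interface rather than an nn.Module: subclasses implement encode and decode, and __call__ composes them. A minimal sketch of a subclass follows; the IdentityAutoEncoder name and the import path are hypothetical, for illustration only.

# Hypothetical subclass for illustration only; not part of the package.
import jax.numpy as jnp
from flaxdiff.models.autoencoder import AutoEncoder  # import path assumed from the new __init__ above

class IdentityAutoEncoder(AutoEncoder):
    def encode(self, x: jnp.ndarray, **kwargs) -> jnp.ndarray:
        return x  # pass-through "latents"

    def decode(self, z: jnp.ndarray, **kwargs) -> jnp.ndarray:
        return z

ae = IdentityAutoEncoder()
reconstruction = ae(jnp.ones((1, 64, 64, 3)))  # __call__ runs encode then decode
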
@@ -0,0 +1,91 @@
+import jax
+import jax.numpy as jnp
+from flax import linen as nn
+from .autoencoder import AutoEncoder
+
+"""
+This module contains an Autoencoder implementation which uses the Stable Diffusion VAE model from the HuggingFace Diffusers library.
+The actual model was not trained by me, but was taken from the HuggingFace model hub.
+I have only implemented the wrapper around the diffusers pipeline to make it compatible with our library
+All credits for the model go to the developers of Stable Diffusion VAE and all credits for the pipeline go to the developers of the Diffusers library.
+"""
+
+class StableDiffusionVAE(AutoEncoder):
+    def __init__(self, modelname = "CompVis/stable-diffusion-v1-4"):
+
+        from diffusers.models.vae_flax import FlaxEncoder, FlaxDecoder
+        from diffusers import FlaxStableDiffusionPipeline
+
+        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+            modelname,
+            revision="bf16",
+            dtype=jnp.bfloat16,
+        )
+
+        vae = pipeline.vae
+
+        enc = FlaxEncoder(
+            in_channels=vae.config.in_channels,
+            out_channels=vae.config.latent_channels,
+            down_block_types=vae.config.down_block_types,
+            block_out_channels=vae.config.block_out_channels,
+            layers_per_block=vae.config.layers_per_block,
+            act_fn=vae.config.act_fn,
+            norm_num_groups=vae.config.norm_num_groups,
+            double_z=True,
+            dtype=vae.dtype,
+        )
+
+        dec = FlaxDecoder(
+            in_channels=vae.config.latent_channels,
+            out_channels=vae.config.out_channels,
+            up_block_types=vae.config.up_block_types,
+            block_out_channels=vae.config.block_out_channels,
+            layers_per_block=vae.config.layers_per_block,
+            norm_num_groups=vae.config.norm_num_groups,
+            act_fn=vae.config.act_fn,
+            dtype=vae.dtype,
+        )
+
+        quant_conv = nn.Conv(
+            2 * vae.config.latent_channels,
+            kernel_size=(1, 1),
+            strides=(1, 1),
+            padding="VALID",
+            dtype=vae.dtype,
+        )
+
+        post_quant_conv = nn.Conv(
+            vae.config.latent_channels,
+            kernel_size=(1, 1),
+            strides=(1, 1),
+            padding="VALID",
+            dtype=vae.dtype,
+        )
+
+        self.enc = enc
+        self.dec = dec
+        self.post_quant_conv = post_quant_conv
+        self.quant_conv = quant_conv
+        self.params = params
+        self.scaling_factor = vae.scaling_factor
+
+    def encode(self, images, rngkey: jax.random.PRNGKey = None):
+        latents = self.enc.apply({"params": self.params["vae"]['encoder']}, images, deterministic=True)
+        latents = self.quant_conv.apply({"params": self.params["vae"]['quant_conv']}, latents)
+        if rngkey is not None:
+            mean, log_std = jnp.split(latents, 2, axis=-1)
+            log_std = jnp.clip(log_std, -30, 20)
+            std = jnp.exp(0.5 * log_std)
+            latents = mean + std * jax.random.normal(rngkey, mean.shape, dtype=mean.dtype)
+            print("Sampled")
+        else:
+            # return the mean
+            latents, _ = jnp.split(latents, 2, axis=-1)
+        latents *= self.scaling_factor
+        return latents
+
+    def decode(self, latents):
+        latents = (1.0 / self.scaling_factor) * latents
+        latents = self.post_quant_conv.apply({"params": self.params["vae"]['post_quant_conv']}, latents)
+        return self.dec.apply({"params": self.params["vae"]['decoder']}, latents)
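
Because StableDiffusionVAE assembles the encoder, decoder, and quant convolutions in __init__ from the pretrained Diffusers pipeline, use is a plain encode/decode round trip; passing a PRNG key samples the latent via the reparameterization above instead of returning the scaled mean. A hedged sketch follows; the import path and image shape are assumptions, and running it requires the diffusers dependency plus access to the pretrained weights.

# Hypothetical round-trip sketch; import path and shapes are assumptions.
import jax
import jax.numpy as jnp
from flaxdiff.models.autoencoder import StableDiffusionVAE  # path assumed from the new __init__ above

vae = StableDiffusionVAE()                               # wraps CompVis/stable-diffusion-v1-4
images = jnp.ones((1, 256, 256, 3), dtype=jnp.bfloat16)  # [B, H, W, C]

latents_mean = vae.encode(images)                        # deterministic: scaled posterior mean
latents_sample = vae.encode(images, rngkey=jax.random.PRNGKey(0))  # reparameterized sample
reconstruction = vae.decode(latents_sample)              # back to image space
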
@@ -0,0 +1,26 @@
+from typing import Any, List, Optional, Callable
+import jax
+import flax.linen as nn
+from jax import numpy as jnp
+from flax.typing import Dtype, PrecisionLike
+from .autoencoder import AutoEncoder
+
+class SimpleAutoEncoder(AutoEncoder):
+    latent_channels: int
+    feature_depths: List[int]=[64, 128, 256, 512]
+    attention_configs:list=[{"heads":8}, {"heads":8}, {"heads":8}, {"heads":8}],
+    num_res_blocks: int=2
+    num_middle_res_blocks:int=1,
+    activation:Callable = jax.nn.swish
+    norm_groups:int=8
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
+
+    # def encode(self, x: jnp.ndarray):
+
+
+    @nn.compact
+    def __call__(self, x: jnp.ndarray):
+        latents = self.encode(x)
+        reconstructions = self.decode(latents)
+        return reconstructions