flaxdiff 0.1.9__tar.gz → 0.1.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/PKG-INFO +1 -1
  2. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/attention.py +17 -8
  3. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/common.py +5 -3
  4. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/simple_unet.py +28 -16
  5. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/trainer/diffusion_trainer.py +4 -2
  6. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/trainer/simple_trainer.py +6 -3
  7. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff.egg-info/PKG-INFO +1 -1
  8. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/setup.py +1 -1
  9. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/README.md +0 -0
  10. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/__init__.py +0 -0
  11. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/__init__.py +0 -0
  12. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/autoencoder/__init__.py +0 -0
  13. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/autoencoder/autoencoder.py +0 -0
  14. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/autoencoder/diffusers.py +0 -0
  15. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/autoencoder/simple_autoenc.py +0 -0
  16. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/favor_fastattn.py +0 -0
  17. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/simple_vit.py +0 -0
  18. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/predictors/__init__.py +0 -0
  19. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/samplers/__init__.py +0 -0
  20. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/samplers/common.py +0 -0
  21. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/samplers/ddim.py +0 -0
  22. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/samplers/ddpm.py +0 -0
  23. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/samplers/euler.py +0 -0
  24. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/samplers/heun_sampler.py +0 -0
  25. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/samplers/multistep_dpm.py +0 -0
  26. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/samplers/rk4_sampler.py +0 -0
  27. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/schedulers/__init__.py +0 -0
  28. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/schedulers/common.py +0 -0
  29. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/schedulers/continuous.py +0 -0
  30. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/schedulers/cosine.py +0 -0
  31. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/schedulers/discrete.py +0 -0
  32. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/schedulers/exp.py +0 -0
  33. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/schedulers/karras.py +0 -0
  34. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/schedulers/linear.py +0 -0
  35. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/schedulers/sqrt.py +0 -0
  36. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/trainer/__init__.py +0 -0
  37. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/trainer/autoencoder_trainer.py +0 -0
  38. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/utils.py +0 -0
  39. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff.egg-info/SOURCES.txt +0 -0
  40. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff.egg-info/dependency_links.txt +0 -0
  41. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff.egg-info/requires.txt +0 -0
  42. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff.egg-info/top_level.txt +0 -0
  43. {flaxdiff-0.1.9 → flaxdiff-0.1.11}/setup.cfg +0 -0
{flaxdiff-0.1.9 → flaxdiff-0.1.11}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flaxdiff
-Version: 0.1.9
+Version: 0.1.11
 Summary: A versatile and easy to understand Diffusion library
 Author: Ashish Kumar Singh
 Author-email: ashishkmr472@gmail.com
{flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/attention.py
@@ -23,6 +23,7 @@ class EfficientAttention(nn.Module):
     precision: PrecisionLike = None
     use_bias: bool = True
     kernel_init: Callable = lambda : kernel_init(1.0)
+    force_fp32_for_softmax: bool = True
 
     def setup(self):
         inner_dim = self.dim_head * self.heads
@@ -114,6 +115,7 @@ class NormalAttention(nn.Module):
     precision: PrecisionLike = None
     use_bias: bool = True
     kernel_init: Callable = lambda : kernel_init(1.0)
+    force_fp32_for_softmax: bool = True
 
     def setup(self):
         inner_dim = self.dim_head * self.heads
@@ -157,7 +159,7 @@ class NormalAttention(nn.Module):
 
         hidden_states = nn.dot_product_attention(
             query, key, value, dtype=self.dtype, broadcast_dropout=False,
-            dropout_rng=None, precision=self.precision, force_fp32_for_softmax=True,
+            dropout_rng=None, precision=self.precision, force_fp32_for_softmax=self.force_fp32_for_softmax,
             deterministic=True
         )
         proj = self.proj_attn(hidden_states)
@@ -237,6 +239,7 @@ class BasicTransformerBlock(nn.Module):
     use_flash_attention:bool = False
     use_cross_only:bool = False
     only_pure_attention:bool = False
+    force_fp32_for_softmax: bool = True
 
     def setup(self):
         if self.use_flash_attention:
@@ -252,7 +255,8 @@ class BasicTransformerBlock(nn.Module):
             precision=self.precision,
             use_bias=self.use_bias,
             dtype=self.dtype,
-            kernel_init=self.kernel_init
+            kernel_init=self.kernel_init,
+            force_fp32_for_softmax=self.force_fp32_for_softmax
         )
         self.attention2 = attenBlock(
             query_dim=self.query_dim,
@@ -262,7 +266,8 @@ class BasicTransformerBlock(nn.Module):
             precision=self.precision,
             use_bias=self.use_bias,
             dtype=self.dtype,
-            kernel_init=self.kernel_init
+            kernel_init=self.kernel_init,
+            force_fp32_for_softmax=self.force_fp32_for_softmax
         )
 
         self.ff = FlaxFeedForward(dim=self.query_dim)
@@ -296,6 +301,8 @@ class TransformerBlock(nn.Module):
     use_flash_attention:bool = False
     use_self_and_cross:bool = True
     only_pure_attention:bool = False
+    force_fp32_for_softmax: bool = True
+    kernel_init: Callable = lambda : kernel_init(1.0)
 
     @nn.compact
     def __call__(self, x, context=None):
@@ -306,12 +313,12 @@ class TransformerBlock(nn.Module):
         if self.use_linear_attention:
             projected_x = nn.Dense(features=inner_dim,
                                    use_bias=False, precision=self.precision,
-                                   kernel_init=kernel_init(1.0),
+                                   kernel_init=self.kernel_init(),
                                    dtype=self.dtype, name=f'project_in')(normed_x)
         else:
             projected_x = nn.Conv(
                 features=inner_dim, kernel_size=(1, 1),
-                kernel_init=kernel_init(1.0),
+                kernel_init=self.kernel_init(),
                 strides=(1, 1), padding='VALID', use_bias=False, dtype=self.dtype,
                 precision=self.precision, name=f'project_in_conv',
             )(normed_x)
@@ -331,19 +338,21 @@ class TransformerBlock(nn.Module):
             dtype=self.dtype,
             use_flash_attention=self.use_flash_attention,
             use_cross_only=(not self.use_self_and_cross),
-            only_pure_attention=self.only_pure_attention
+            only_pure_attention=self.only_pure_attention,
+            force_fp32_for_softmax=self.force_fp32_for_softmax,
+            kernel_init=self.kernel_init
         )(projected_x, context)
 
         if self.use_projection == True:
             if self.use_linear_attention:
                 projected_x = nn.Dense(features=C, precision=self.precision,
                                        dtype=self.dtype, use_bias=False,
-                                       kernel_init=kernel_init(1.0),
+                                       kernel_init=self.kernel_init(),
                                        name=f'project_out')(projected_x)
             else:
                 projected_x = nn.Conv(
                     features=C, kernel_size=(1, 1),
-                    kernel_init=kernel_init(1.0),
+                    kernel_init=self.kernel_init(),
                     strides=(1, 1), padding='VALID', use_bias=False, dtype=self.dtype,
                     precision=self.precision, name=f'project_out_conv',
                 )(projected_x)
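These hunks make the attention softmax dtype configurable from the caller instead of always forcing float32. A minimal construction sketch, using only module fields visible in the hunks above; the hyperparameter values are illustrative, not taken from the package, and other required fields may exist:

import jax.numpy as jnp
from flaxdiff.models.attention import TransformerBlock

# Illustrative only: run the attention softmax in the compute dtype
# (e.g. bfloat16) instead of forcing float32. force_fp32_for_softmax
# defaults to True, which matches the pre-0.1.11 behaviour.
block = TransformerBlock(
    heads=8,
    dtype=jnp.bfloat16,
    use_self_and_cross=True,
    only_pure_attention=True,
    force_fp32_for_softmax=False,
)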
{flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/common.py
@@ -267,15 +267,17 @@ class ResidualBlock(nn.Module):
     kernel_init:Callable=kernel_init(1.0)
     dtype: Optional[Dtype] = None
     precision: PrecisionLike = None
+    named_norms:bool=False
 
     def setup(self):
         if self.norm_groups > 0:
             norm = partial(nn.GroupNorm, self.norm_groups)
+            self.norm1 = norm(name="GroupNorm_0") if self.named_norms else norm()
+            self.norm2 = norm(name="GroupNorm_1") if self.named_norms else norm()
         else:
             norm = partial(nn.RMSNorm, 1e-5)
-
-        self.norm1 = norm()
-        self.norm2 = norm()
+            self.norm1 = norm()
+            self.norm2 = norm()
 
     @nn.compact
     def __call__(self, x:jax.Array, temb:jax.Array, textemb:jax.Array=None, extra_features:jax.Array=None):
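The new named_norms flag exists so the GroupNorm layers keep the explicit parameter names (GroupNorm_0, GroupNorm_1) that older flaxdiff checkpoints were saved with. A hedged sketch of constructing the block for checkpoint compatibility; the conv-type string and all hyperparameter values are assumptions for illustration:

from flaxdiff.models.common import ResidualBlock

# Hypothetical hyperparameters. named_norms=True recreates the
# GroupNorm_0 / GroupNorm_1 parameter names that pre-0.1.11 checkpoints
# expect; named_norms=False (the default) uses Flax's automatic names.
block = ResidualBlock(
    "conv",            # conv_type; "conv" is an assumed value
    features=64,
    kernel_size=(3, 3),
    strides=(1, 1),
    norm_groups=8,
    named_norms=True,
)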
{flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/models/simple_unet.py
@@ -19,15 +19,16 @@ class Unet(nn.Module):
     norm_groups:int=8
     dtype: Optional[Dtype] = None
     precision: PrecisionLike = None
+    named_norms: bool = False # This is for backward compatibility reasons; older checkpoints have named norms
+    kernel_init: Callable = partial(kernel_init, dtype=jnp.float32)
 
     def setup(self):
         if self.norm_groups > 0:
             norm = partial(nn.GroupNorm, self.norm_groups)
+            self.conv_out_norm = norm(name="GroupNorm_0") if self.named_norms else norm()
         else:
             norm = partial(nn.RMSNorm, 1e-5)
-
-        # self.last_up_norm = norm()
-        self.conv_out_norm = norm()
+            self.conv_out_norm = norm()
 
     @nn.compact
     def __call__(self, x, temb, textcontext):
@@ -49,7 +50,7 @@
             features=self.feature_depths[0],
             kernel_size=(3, 3),
             strides=(1, 1),
-            kernel_init=kernel_init(1.0),
+            kernel_init=self.kernel_init(1.0),
             dtype=self.dtype,
             precision=self.precision
         )(x)
@@ -64,13 +65,14 @@
             down_conv_type,
             name=f"down_{i}_residual_{j}",
             features=dim_in,
-            kernel_init=kernel_init(1.0),
+            kernel_init=self.kernel_init(1.0),
             kernel_size=(3, 3),
             strides=(1, 1),
             activation=self.activation,
             norm_groups=self.norm_groups,
             dtype=self.dtype,
-            precision=self.precision
+            precision=self.precision,
+            named_norms=self.named_norms
         )(x, temb)
         if attention_config is not None and j == self.num_res_blocks - 1:   # Apply attention only on the last block
             x = TransformerBlock(heads=attention_config['heads'], dtype=attention_config.get('dtype', jnp.float32),
@@ -80,6 +82,8 @@
                 use_self_and_cross=attention_config.get("use_self_and_cross", True),
                 precision=attention_config.get("precision", self.precision),
                 only_pure_attention=attention_config.get("only_pure_attention", True),
+                force_fp32_for_softmax=attention_config.get("force_fp32_for_softmax", False),
+                kernel_init=self.kernel_init(1.0),
                 name=f"down_{i}_attention_{j}")(x, textcontext)
             # print("down residual for feature level", i, "is of shape", x.shape, "features", dim_in)
             downs.append(x)
@@ -102,13 +106,14 @@
             middle_conv_type,
             name=f"middle_res1_{j}",
             features=middle_dim_out,
-            kernel_init=kernel_init(1.0),
+            kernel_init=self.kernel_init(1.0),
             kernel_size=(3, 3),
             strides=(1, 1),
             activation=self.activation,
             norm_groups=self.norm_groups,
             dtype=self.dtype,
-            precision=self.precision
+            precision=self.precision,
+            named_norms=self.named_norms
         )(x, temb)
         if middle_attention is not None and j == self.num_middle_res_blocks - 1:   # Apply attention only on the last block
             x = TransformerBlock(heads=middle_attention['heads'], dtype=middle_attention.get('dtype', jnp.float32),
@@ -119,18 +124,21 @@
                 use_self_and_cross=False,
                 precision=middle_attention.get("precision", self.precision),
                 only_pure_attention=middle_attention.get("only_pure_attention", True),
+                force_fp32_for_softmax=middle_attention.get("force_fp32_for_softmax", False),
+                kernel_init=self.kernel_init(1.0),
                 name=f"middle_attention_{j}")(x, textcontext)
         x = ResidualBlock(
             middle_conv_type,
             name=f"middle_res2_{j}",
             features=middle_dim_out,
-            kernel_init=kernel_init(1.0),
+            kernel_init=self.kernel_init(1.0),
             kernel_size=(3, 3),
             strides=(1, 1),
             activation=self.activation,
             norm_groups=self.norm_groups,
             dtype=self.dtype,
-            precision=self.precision
+            precision=self.precision,
+            named_norms=self.named_norms
         )(x, temb)
 
         # Upscaling Blocks
@@ -145,13 +153,14 @@
                 up_conv_type,# if j == 0 else "separable",
                 name=f"up_{i}_residual_{j}",
                 features=dim_out,
-                kernel_init=kernel_init(1.0),
+                kernel_init=self.kernel_init(1.0),
                 kernel_size=kernel_size,
                 strides=(1, 1),
                 activation=self.activation,
                 norm_groups=self.norm_groups,
                 dtype=self.dtype,
-                precision=self.precision
+                precision=self.precision,
+                named_norms=self.named_norms
             )(x, temb)
             if attention_config is not None and j == self.num_res_blocks - 1:   # Apply attention only on the last block
                 x = TransformerBlock(heads=attention_config['heads'], dtype=attention_config.get('dtype', jnp.float32),
@@ -161,6 +170,8 @@
                     use_self_and_cross=attention_config.get("use_self_and_cross", True),
                     precision=attention_config.get("precision", self.precision),
                     only_pure_attention=attention_config.get("only_pure_attention", True),
+                    force_fp32_for_softmax=middle_attention.get("force_fp32_for_softmax", False),
+                    kernel_init=self.kernel_init(1.0),
                     name=f"up_{i}_attention_{j}")(x, textcontext)
             # print("Upscaling ", i, x.shape)
             if i != len(feature_depths) - 1:
@@ -179,7 +190,7 @@
             features=self.feature_depths[0],
             kernel_size=(3, 3),
             strides=(1, 1),
-            kernel_init=kernel_init(1.0),
+            kernel_init=self.kernel_init(1.0),
             dtype=self.dtype,
             precision=self.precision
         )(x)
@@ -190,13 +201,14 @@
             conv_type,
             name="final_residual",
             features=self.feature_depths[0],
-            kernel_init=kernel_init(1.0),
+            kernel_init=self.kernel_init(1.0),
             kernel_size=(3,3),
             strides=(1, 1),
             activation=self.activation,
             norm_groups=self.norm_groups,
             dtype=self.dtype,
-            precision=self.precision
+            precision=self.precision,
+            named_norms=self.named_norms
         )(x, temb)
 
         x = self.conv_out_norm(x)
@@ -208,7 +220,7 @@
             kernel_size=(3, 3),
             strides=(1, 1),
             # activation=jax.nn.mish
-            kernel_init=kernel_init(0.0),
+            kernel_init=self.kernel_init(0.0),
             dtype=self.dtype,
             precision=self.precision
         )(x)
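At the model level, Unet now exposes named_norms (to reproduce the named GroupNorm parameters of older checkpoints) and a kernel_init factory, and it forwards force_fp32_for_softmax from each attention config. A minimal construction sketch; every value is illustrative, only fields visible in the hunks above are used, and the class may have further required fields not shown here:

import jax.numpy as jnp
from flaxdiff.models.simple_unet import Unet

# Illustrative values only. named_norms=True keeps the GroupNorm_0-style
# parameter names so pre-0.1.11 checkpoints can still be restored;
# force_fp32_for_softmax is read per attention config and defaults to
# False there, per the .get(...) calls above.
unet = Unet(
    feature_depths=(64, 128, 256),   # field name inferred from self.feature_depths
    norm_groups=8,
    dtype=jnp.bfloat16,
    named_norms=True,
)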
{flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/trainer/diffusion_trainer.py
@@ -16,6 +16,7 @@ from flaxdiff.utils import RandomMarkovState
 from .simple_trainer import SimpleTrainer, SimpleTrainState, Metrics
 
 from flaxdiff.models.autoencoder.autoencoder import AutoEncoder
+from flax.training.dynamic_scale import DynamicScale
 
 class TrainState(SimpleTrainState):
     rngs: jax.random.PRNGKey
@@ -83,7 +84,8 @@ class DiffusionTrainer(SimpleTrainer):
             new_state = existing_state
 
         if param_transforms is not None:
-            params = param_transforms(params)
+            new_state['params'] = param_transforms(new_state['params'])
+            new_state['ema_params'] = param_transforms(new_state['ema_params'])
 
         state = TrainState.create(
             apply_fn=model.apply,
@@ -92,7 +94,7 @@ class DiffusionTrainer(SimpleTrainer):
             tx=optimizer,
             rngs=rngs,
             metrics=Metrics.empty(),
-            dynamic_scale = flax.training.dynamic_scale.DynamicScale() if use_dynamic_scale else None
+            dynamic_scale = DynamicScale() if use_dynamic_scale else None
         )
 
         if existing_best_state is not None:
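Note that param_transforms is now applied to both the restored params and the restored ema_params, so a transform written for checkpoint migration keeps the EMA weights consistent with the training weights. A hypothetical example of such a callable (not part of the package; it simply casts every leaf):

import jax
import jax.numpy as jnp

def param_transforms(params):
    # Hypothetical transform for illustration: cast every restored weight
    # to bfloat16. As of 0.1.11 this runs on both 'params' and 'ema_params'
    # before the new TrainState is created.
    return jax.tree_util.tree_map(lambda p: p.astype(jnp.bfloat16), params)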
{flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff/trainer/simple_trainer.py
@@ -22,7 +22,7 @@ from jax.experimental.shard_map import shard_map
 from orbax.checkpoint.utils import fully_replicated_host_local_array_to_global_array
 from termcolor import colored
 from typing import Dict, Callable, Sequence, Any, Union, Tuple
-
+from flax.training.dynamic_scale import DynamicScale
 from flaxdiff.utils import RandomMarkovState
 
 PROCESS_COLOR_MAP = {
@@ -68,7 +68,7 @@ class Metrics(metrics.Collection):
 # Define the TrainState
 class SimpleTrainState(train_state.TrainState):
     metrics: Metrics
-    dynamic_scale: flax.training.dynamic_scale.DynamicScale
+    dynamic_scale: DynamicScale
 
 class SimpleTrainer:
     state: SimpleTrainState
@@ -177,13 +177,16 @@ class SimpleTrainer:
             params = model.init(subkey, **input_vars)
         else:
             params = existing_state['params']
+
+        if param_transforms is not None:
+            params = param_transforms(params)
 
         state = SimpleTrainState.create(
             apply_fn=model.apply,
             params=params,
             tx=optimizer,
             metrics=Metrics.empty(),
-            dynamic_scale = flax.training.dynamic_scale.DynamicScale() if use_dynamic_scale else None
+            dynamic_scale = DynamicScale() if use_dynamic_scale else None
         )
         if existing_best_state is not None:
             best_state = state.replace(
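Both trainers now import DynamicScale directly from flax.training.dynamic_scale instead of reaching through the flax.training namespace. For context, a small self-contained example of the standard Flax loss-scaling pattern this class supports (general Flax usage, not flaxdiff-specific code):

import jax.numpy as jnp
from flax.training.dynamic_scale import DynamicScale

def loss_fn(params):
    # Toy float16 loss so the dynamic scale has something to protect.
    return jnp.sum(params.astype(jnp.float16) ** 2)

params = jnp.ones((4,), jnp.float32)
dyn_scale = DynamicScale()
# value_and_grad scales the loss internally and reports, via is_finite,
# whether the unscaled gradients overflowed, so the caller can skip
# the optimizer step for that batch.
dyn_scale, is_finite, loss, grads = dyn_scale.value_and_grad(loss_fn)(params)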
{flaxdiff-0.1.9 → flaxdiff-0.1.11}/flaxdiff.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flaxdiff
-Version: 0.1.9
+Version: 0.1.11
 Summary: A versatile and easy to understand Diffusion library
 Author: Ashish Kumar Singh
 Author-email: ashishkmr472@gmail.com
{flaxdiff-0.1.9 → flaxdiff-0.1.11}/setup.py
@@ -11,7 +11,7 @@ required_packages=[
 setup(
     name='flaxdiff',
     packages=find_packages(),
-    version='0.1.9',
+    version='0.1.11',
     description='A versatile and easy to understand Diffusion library',
     long_description=open('README.md').read(),
     long_description_content_type='text/markdown',