flaxdiff 0.2.4__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/PKG-INFO +1 -1
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/sources/images.py +7 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/inference/pipeline.py +11 -4
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/inference/utils.py +3 -2
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/metrics/images.py +1 -1
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/attention.py +7 -4
- flaxdiff-0.2.6/flaxdiff/models/better_uvit.py +380 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/common.py +75 -4
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/simple_vit.py +26 -16
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/trainer/general_diffusion_trainer.py +34 -5
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff.egg-info/PKG-INFO +1 -1
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff.egg-info/SOURCES.txt +1 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/pyproject.toml +1 -1
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/README.md +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/__init__.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/__init__.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/benchmark_decord.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/dataloaders.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/dataset_map.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/online_loader.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/sources/audio_utils.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/sources/av_example.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/sources/av_utils.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/sources/base.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/sources/utils.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/sources/videos.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/sources/voxceleb2.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/inference/__init__.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/inputs/__init__.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/inputs/encoders.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/metrics/__init__.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/metrics/common.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/metrics/inception.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/metrics/psnr.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/metrics/ssim.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/metrics/utils.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/__init__.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/autoencoder/__init__.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/autoencoder/autoencoder.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/autoencoder/diffusers.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/autoencoder/simple_autoenc.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/favor_fastattn.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/general.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/simple_unet.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/unet_3d.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/unet_3d_blocks.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/predictors/__init__.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/samplers/__init__.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/samplers/common.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/samplers/ddim.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/samplers/ddpm.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/samplers/euler.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/samplers/heun_sampler.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/samplers/multistep_dpm.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/samplers/rk4_sampler.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/schedulers/__init__.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/schedulers/common.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/schedulers/continuous.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/schedulers/cosine.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/schedulers/discrete.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/schedulers/exp.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/schedulers/karras.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/schedulers/linear.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/schedulers/sqrt.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/trainer/__init__.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/trainer/autoencoder_trainer.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/trainer/diffusion_trainer.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/trainer/simple_trainer.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/utils.py +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff.egg-info/dependency_links.txt +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff.egg-info/requires.txt +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff.egg-info/top_level.txt +0 -0
- {flaxdiff-0.2.4 → flaxdiff-0.2.6}/setup.cfg +0 -0
{flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/data/sources/images.py

@@ -266,6 +266,12 @@ class ImageGCSAugmenter(DataAugmenter):

        print(f"Using method: {method}")

+        from torchvision.transforms import v2
+        augments = v2.Compose([
+            v2.RandomHorizontalFlip(p=0.5),
+            v2.ColorJitter(brightness=0.2, contrast=0.05, saturation=0.2)
+        ])
+
        class GCSTransform(pygrain.MapTransform):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)

@@ -277,6 +283,7 @@ class ImageGCSAugmenter(DataAugmenter):
                image = np.asarray(bytearray(element['jpg']), dtype="uint8")
                image = cv2.imdecode(image, cv2.IMREAD_UNCHANGED)
                image = self.image_augmenter(image)
+                image = augments(image)
                caption = labelizer(element).decode('utf-8')
                results = self.auto_tokenize(caption)
                return {
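Note: the new augmentation chain applies a random horizontal flip and mild color jitter to each decoded image before tokenization. A minimal standalone sketch of the same chain; the HWC-to-CHW tensor conversion is an assumption, since torchvision v2 transforms operate on tensors or PIL images rather than raw numpy arrays:

import numpy as np
import torch
from torchvision.transforms import v2

# Same augmentation chain as added in ImageGCSAugmenter above
augments = v2.Compose([
    v2.RandomHorizontalFlip(p=0.5),
    v2.ColorJitter(brightness=0.2, contrast=0.05, saturation=0.2),
])

image = np.zeros((256, 256, 3), dtype=np.uint8)        # stand-in for a decoded JPEG (HWC)
tensor = torch.from_numpy(image).permute(2, 0, 1)      # HWC -> CHW, as torchvision expects
augmented = augments(tensor).permute(1, 2, 0).numpy()  # back to HWC numpy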
{flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/inference/pipeline.py

@@ -53,6 +53,7 @@ class DiffusionInferencePipeline(InferencePipeline):
    input_config: DiffusionInputConfig = None
    samplers: Dict[Type[DiffusionSampler], Dict[float, DiffusionSampler]] = field(default_factory=dict)
    config: Dict[str, Any] = field(default_factory=dict)
+    wandb_run = None

    @classmethod
    def from_wandb_run(

@@ -75,7 +76,7 @@ class DiffusionInferencePipeline(InferencePipeline):
        Returns:
            DiffusionInferencePipeline instance
        """
-        states, config = load_from_wandb_run(
+        states, config, run = load_from_wandb_run(
            wandb_run,
            project=project,
            entity=entity,

@@ -93,6 +94,7 @@ class DiffusionInferencePipeline(InferencePipeline):
            state=state,
            best_state=best_state,
            rngstate=RandomMarkovState(jax.random.PRNGKey(42)),
+            run=run,
        )
        return pipeline

@@ -117,7 +119,7 @@ class DiffusionInferencePipeline(InferencePipeline):
        Returns:
            DiffusionInferencePipeline instance
        """
-        states, config = load_from_wandb_registry(
+        states, config, run = load_from_wandb_registry(
            modelname=modelname,
            project=project,
            entity=entity,

@@ -137,6 +139,7 @@ class DiffusionInferencePipeline(InferencePipeline):
            state=state,
            best_state=best_state,
            rngstate=RandomMarkovState(jax.random.PRNGKey(42)),
+            run=run,
        )
        return pipeline

@@ -147,6 +150,7 @@ class DiffusionInferencePipeline(InferencePipeline):
        state: Dict[str, Any],
        best_state: Optional[Dict[str, Any]] = None,
        rngstate: Optional[RandomMarkovState] = None,
+        run=None,
    ):
        if rngstate is None:
            rngstate = RandomMarkovState(jax.random.PRNGKey(42))

@@ -161,6 +165,7 @@ class DiffusionInferencePipeline(InferencePipeline):
            autoencoder=config['autoencoder'],
            input_config=config['input_config'],
            config=config,
+            wandb_run=run,
        )

    def get_sampler(

@@ -208,7 +213,8 @@ class DiffusionInferencePipeline(InferencePipeline):
        self,
        num_samples: int,
        resolution: int,
-        conditioning_data:
+        conditioning_data: List[Union[Tuple, Dict]] = None,
+        conditioning_data_tokens: Tuple = None,
        sequence_length: Optional[int] = None,
        diffusion_steps: int = 50,
        guidance_scale: float = 1.0,

@@ -256,5 +262,6 @@ class DiffusionInferencePipeline(InferencePipeline):
            steps_override=steps_override,
            priors=priors,
            rngstate=rngstate,
-            conditioning=conditioning_data
+            conditioning=conditioning_data,
+            model_conditioning_inputs=conditioning_data_tokens,
        )
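Note: the sampling entry point can now accept pre-tokenized conditioning via `conditioning_data_tokens`, bypassing internal tokenization of `conditioning_data`. A hypothetical call sketch; the method name `generate_samples` and the run path are assumptions (the hunk above does not show the method name), only the parameters shown in this diff are taken as given:

# Hypothetical usage sketch, assuming a wandb run path and method name.
pipeline = DiffusionInferencePipeline.from_wandb_run("entity/project/run_id")
samples = pipeline.generate_samples(
    num_samples=4,
    resolution=256,
    conditioning_data=["a photo of a cat"] * 4,  # raw conditioning, tokenized internally
    diffusion_steps=50,
    guidance_scale=3.0,
)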
{flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/inference/utils.py

@@ -292,7 +292,7 @@ def load_from_wandb_run(
        config = run.config
    except Exception as e:
        print(f"Warning: Failed to load model from wandb: {e}")
-    return states, config
+    return states, config, run

def load_from_wandb_registry(
    modelname: str,

@@ -307,6 +307,7 @@ def load_from_wandb_registry(
    # Get the model version from wandb
    states = None
    config = None
+    run = None
    try:
        artifact = wandb.Api().artifact(f"{registry}/{modelname}:{version}")
        ckpt_dir = artifact.download()

@@ -317,4 +318,4 @@ def load_from_wandb_registry(
        config = run.config
    except Exception as e:
        print(f"Warning: Failed to load model from wandb: {e}")
-    return states, config
+    return states, config, run
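Note: both loaders change their return signature from a 2-tuple to a 3-tuple, so every call site must now unpack the run handle (`run` is pre-initialized to `None` in `load_from_wandb_registry` and stays `None` when the wandb lookup fails):

# Callers must now unpack three values instead of two.
states, config, run = load_from_wandb_run(wandb_run, project=project, entity=entity)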
{flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/metrics/images.py

@@ -7,7 +7,7 @@ def get_clip_metric(
):
    from transformers import AutoProcessor, FlaxCLIPModel
    model = FlaxCLIPModel.from_pretrained(modelname, dtype=jnp.float16)
-    processor = AutoProcessor.from_pretrained(modelname, use_fast=
+    processor = AutoProcessor.from_pretrained(modelname, use_fast=False, dtype=jnp.float16)

    @jax.jit
    def calc(pixel_values, input_ids, attention_mask):
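Note: an independent sketch of the kind of CLIP image-text scoring the jitted `calc` presumably wraps (an assumption; only the model/processor setup above is from the diff). It uses only public transformers APIs, and the checkpoint name and dummy image are placeholders:

from transformers import AutoProcessor, FlaxCLIPModel
import jax.numpy as jnp
import numpy as np

model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32", dtype=jnp.float16)
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32", use_fast=False)

image = np.zeros((224, 224, 3), dtype=np.uint8)  # placeholder image
inputs = processor(text=["a cat"], images=image, return_tensors="np", padding=True)
outputs = model(**inputs)
score = outputs.logits_per_image  # image-text similarity logits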
{flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/attention.py

@@ -247,6 +247,7 @@ class BasicTransformerBlock(nn.Module):
    use_cross_only:bool = False
    only_pure_attention:bool = False
    force_fp32_for_softmax: bool = True
+    norm_epsilon: float = 1e-4

    def setup(self):
        if self.use_flash_attention:

@@ -278,9 +279,9 @@ class BasicTransformerBlock(nn.Module):
        )

        self.ff = FlaxFeedForward(dim=self.query_dim)
-        self.norm1 = nn.RMSNorm(epsilon=
-        self.norm2 = nn.RMSNorm(epsilon=
-        self.norm3 = nn.RMSNorm(epsilon=
+        self.norm1 = nn.RMSNorm(epsilon=self.norm_epsilon, dtype=self.dtype)
+        self.norm2 = nn.RMSNorm(epsilon=self.norm_epsilon, dtype=self.dtype)
+        self.norm3 = nn.RMSNorm(epsilon=self.norm_epsilon, dtype=self.dtype)

    @nn.compact
    def __call__(self, hidden_states, context=None):

@@ -312,13 +313,14 @@ class TransformerBlock(nn.Module):
    # kernel_init: Callable = kernel_init(1.0)
    norm_inputs: bool = True
    explicitly_add_residual: bool = True
+    norm_epsilon: float = 1e-4

    @nn.compact
    def __call__(self, x, context=None):
        inner_dim = self.heads * self.dim_head
        C = x.shape[-1]
        if self.norm_inputs:
-            x = nn.RMSNorm(epsilon=
+            x = nn.RMSNorm(epsilon=self.norm_epsilon, dtype=self.dtype)(x)
        if self.use_projection == True:
            if self.use_linear_attention:
                projected_x = nn.Dense(features=inner_dim,

@@ -350,6 +352,7 @@ class TransformerBlock(nn.Module):
                use_cross_only=(not self.use_self_and_cross),
                only_pure_attention=self.only_pure_attention,
                force_fp32_for_softmax=self.force_fp32_for_softmax,
+                norm_epsilon=self.norm_epsilon
                # kernel_init=self.kernel_init
            )(projected_x, context)

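Note: these changes thread a configurable `norm_epsilon` (default 1e-4) through every RMSNorm. A minimal sketch of the normalization nn.RMSNorm applies (learned scale omitted), showing where the epsilon sits; a larger epsilon keeps the rsqrt stable when activations are tiny, which matters under fp16/bf16:

import jax
import jax.numpy as jnp

def rms_norm(x, eps=1e-4):
    # x / sqrt(mean(x^2) + eps) along the feature axis.
    return x * jax.lax.rsqrt(jnp.mean(jnp.square(x), axis=-1, keepdims=True) + eps)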
flaxdiff-0.2.6/flaxdiff/models/better_uvit.py (new file)

@@ -0,0 +1,380 @@
+# flaxdiff/models/better_uvit.py
+import jax
+import jax.numpy as jnp
+from flax import linen as nn
+from typing import Callable, Any, Optional, Tuple, Sequence, Union
+import einops
+from functools import partial
+
+# Re-use existing components if they are suitable
+from .common import kernel_init, FourierEmbedding, TimeProjection, hilbert_indices, inverse_permutation
+from .attention import NormalAttention # Using NormalAttention for RoPE integration
+from flax.typing import Dtype, PrecisionLike
+
+# --- Rotary Positional Embedding (RoPE) ---
+# Adapted from https://github.com/google-deepmind/ring_attention/blob/main/ring_attention/layers/rotary.py
+
+def _rotate_half(x: jax.Array) -> jax.Array:
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return jnp.concatenate((-x2, x1), axis=-1)
+
+def apply_rotary_embedding(
+    x: jax.Array, freqs_cis: jax.Array
+) -> jax.Array:
+    """Applies rotary embedding to the input tensor using rotate_half method."""
+    # x shape: [..., Sequence, Dimension] e.g. [B, H, S, D] or [B, S, D]
+    # freqs_cis shape: complex [Sequence, Dimension / 2]
+
+    # Extract cos and sin from the complex freqs_cis
+    cos_freqs = jnp.real(freqs_cis) # Shape [S, D/2]
+    sin_freqs = jnp.imag(freqs_cis) # Shape [S, D/2]
+
+    # Expand dims for broadcasting: [1, 1, S, D/2] or [1, S, D/2]
+    if x.ndim == 4: # [B, H, S, D]
+        cos_freqs = jnp.expand_dims(cos_freqs, axis=(0, 1))
+        sin_freqs = jnp.expand_dims(sin_freqs, axis=(0, 1))
+    elif x.ndim == 3: # [B, S, D]
+        cos_freqs = jnp.expand_dims(cos_freqs, axis=0)
+        sin_freqs = jnp.expand_dims(sin_freqs, axis=0)
+
+    # Duplicate cos and sin for the full dimension D
+    # Shape becomes [..., S, D]
+    cos_freqs = jnp.concatenate([cos_freqs, cos_freqs], axis=-1)
+    sin_freqs = jnp.concatenate([sin_freqs, sin_freqs], axis=-1)
+
+    # Apply rotation: x * cos + rotate_half(x) * sin
+    x_rotated = x * cos_freqs + _rotate_half(x) * sin_freqs
+    return x_rotated.astype(x.dtype)
+
+
+class RotaryEmbedding(nn.Module):
+    dim: int # Dimension of the head
+    max_seq_len: int = 2048
+    base: int = 10000
+    dtype: Dtype = jnp.float32
+
+    def setup(self):
+        inv_freq = 1.0 / (
+            self.base ** (jnp.arange(0, self.dim, 2, dtype=jnp.float32) / self.dim)
+        )
+        t = jnp.arange(self.max_seq_len, dtype=jnp.float32)
+        freqs = jnp.outer(t, inv_freq) # Shape: [max_seq_len, dim / 2]
+
+        # Precompute the complex form: cos(theta) + i * sin(theta)
+        self.freqs_cis_complex = jnp.cos(freqs) + 1j * jnp.sin(freqs)
+        # Shape: [max_seq_len, dim / 2]
+
+    def __call__(self, seq_len: int):
+        if seq_len > self.max_seq_len:
+            raise ValueError(f"Sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}")
+        # Return complex shape [seq_len, dim / 2]
+        return self.freqs_cis_complex[:seq_len, :]
+
+# --- Attention with RoPE ---
+
+class RoPEAttention(NormalAttention):
+    rope_emb: RotaryEmbedding
+
+    @nn.compact
+    def __call__(self, x, context=None, freqs_cis=None):
+        # x has shape [B, H, W, C] or [B, S, C]
+        orig_x_shape = x.shape
+        is_4d = len(orig_x_shape) == 4
+        if is_4d:
+            B, H, W, C = x.shape
+            seq_len = H * W
+            x = x.reshape((B, seq_len, C))
+        else:
+            B, seq_len, C = x.shape
+
+        context = x if context is None else context
+        if len(context.shape) == 4:
+            _B, _H, _W, _C = context.shape
+            context_seq_len = _H * _W
+            context = context.reshape((B, context_seq_len, _C))
+        else:
+            _B, context_seq_len, _C = context.shape
+
+        query = self.query(x) # [B, S, H, D]
+        key = self.key(context) # [B, S_ctx, H, D]
+        value = self.value(context) # [B, S_ctx, H, D]
+
+        # Apply RoPE to query and key
+        if freqs_cis is not None:
+            # Permute to [B, H, S, D] for RoPE application if needed by apply_rotary_embedding
+            query = einops.rearrange(query, 'b s h d -> b h s d')
+            key = einops.rearrange(key, 'b s h d -> b h s d')
+
+            query = apply_rotary_embedding(query, freqs_cis)
+            key = apply_rotary_embedding(key, freqs_cis) # Apply to key as well
+
+            # Permute back to [B, S, H, D] for dot_product_attention
+            query = einops.rearrange(query, 'b h s d -> b s h d')
+            key = einops.rearrange(key, 'b h s d -> b s h d')
+
+        hidden_states = nn.dot_product_attention(
+            query, key, value, dtype=self.dtype, broadcast_dropout=False,
+            dropout_rng=None, precision=self.precision, force_fp32_for_softmax=self.force_fp32_for_softmax,
+            deterministic=True
+        ) # Output shape [B, S, H, D]
+
+        proj = self.proj_attn(hidden_states) # Output shape [B, S, C]
+
+        if is_4d:
+            proj = proj.reshape(orig_x_shape) # Reshape back if input was 4D
+
+        return proj
+
+# --- adaLN-Zero ---
+
+class AdaLNZero(nn.Module):
+    features: int
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
+    norm_epsilon: float = 1e-5 # Standard LayerNorm epsilon
+
+    @nn.compact
+    def __call__(self, x, conditioning):
+        # Project conditioning signal to get scale and shift parameters
+        # Conditioning shape: [B, D_cond] -> [B, 1, ..., 1, 6 * features] for broadcasting
+        # Or [B, 1, 6*features] if x is [B, S, F]
+
+        # Ensure conditioning has seq dim if x does
+        if x.ndim == 3 and conditioning.ndim == 2: # x=[B,S,F], cond=[B,D_cond]
+            conditioning = jnp.expand_dims(conditioning, axis=1) # cond=[B,1,D_cond]
+
+        # Project conditioning to get 6 params per feature (scale_mlp, shift_mlp, gate_mlp, scale_attn, shift_attn, gate_attn)
+        # Using nn.DenseGeneral for flexibility if needed, but nn.Dense is fine if cond is [B, D_cond] or [B, 1, D_cond]
+        ada_params = nn.Dense(
+            features=6 * self.features,
+            dtype=self.dtype,
+            precision=self.precision,
+            kernel_init=nn.initializers.zeros, # Initialize projection to zero (Zero init)
+            name="ada_proj"
+        )(conditioning)
+
+        # Split into scale, shift, gate for MLP and Attention
+        scale_mlp, shift_mlp, gate_mlp, scale_attn, shift_attn, gate_attn = jnp.split(ada_params, 6, axis=-1)
+
+        # Apply Layer Normalization
+        norm = nn.LayerNorm(epsilon=self.norm_epsilon, use_scale=False, use_bias=False, dtype=self.dtype)
+        # norm = nn.RMSNorm(epsilon=self.norm_epsilon, dtype=self.dtype) # Alternative: RMSNorm
+
+        norm_x = norm(x)
+
+        # Modulate for Attention path
+        x_attn = norm_x * (1 + scale_attn) + shift_attn
+
+        # Modulate for MLP path
+        x_mlp = norm_x * (1 + scale_mlp) + shift_mlp
+
+        # Return modulated outputs and gates
+        return x_attn, gate_attn, x_mlp, gate_mlp
+
+
+# --- DiT Block ---
+
+class DiTBlock(nn.Module):
+    features: int
+    num_heads: int
+    mlp_ratio: int = 4
+    dropout_rate: float = 0.0 # Typically dropout is not used in diffusion models
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
+    use_flash_attention: bool = False # Keep option, but RoPEAttention uses NormalAttention base
+    force_fp32_for_softmax: bool = True
+    norm_epsilon: float = 1e-5
+    rope_emb: RotaryEmbedding # Pass RoPE module
+
+    def setup(self):
+        hidden_features = int(self.features * self.mlp_ratio)
+        self.ada_ln_zero = AdaLNZero(self.features, dtype=self.dtype, precision=self.precision, norm_epsilon=self.norm_epsilon)
+
+        # Use RoPEAttention
+        self.attention = RoPEAttention(
+            query_dim=self.features,
+            heads=self.num_heads,
+            dim_head=self.features // self.num_heads,
+            dtype=self.dtype,
+            precision=self.precision,
+            use_bias=True, # Bias is common in DiT attention proj
+            force_fp32_for_softmax=self.force_fp32_for_softmax,
+            rope_emb=self.rope_emb # Pass RoPE module instance
+        )
+
+        # Standard MLP block
+        self.mlp = nn.Sequential([
+            nn.Dense(features=hidden_features, dtype=self.dtype, precision=self.precision),
+            nn.gelu,
+            nn.Dense(features=self.features, dtype=self.dtype, precision=self.precision)
+        ])
+
+    @nn.compact
+    def __call__(self, x, conditioning, freqs_cis):
+        # x shape: [B, S, F]
+        # conditioning shape: [B, D_cond]
+
+        residual = x
+
+        # Apply adaLN-Zero to get modulated inputs and gates
+        x_attn, gate_attn, x_mlp, gate_mlp = self.ada_ln_zero(x, conditioning)
+
+        # Attention block
+        attn_output = self.attention(x_attn, context=None, freqs_cis=freqs_cis) # Self-attention only
+        x = residual + gate_attn * attn_output
+
+        # MLP block
+        mlp_output = self.mlp(x_mlp)
+        x = x + gate_mlp * mlp_output
+
+        return x
+
+# --- Patch Embedding (reuse or define if needed) ---
+# Assuming PatchEmbedding exists in simple_vit.py and is suitable
+from .simple_vit import PatchEmbedding, unpatchify
+
+# --- Better UViT (DiT Style) ---
+
+class BetterUViT(nn.Module):
+    output_channels: int = 3
+    patch_size: int = 16
+    emb_features: int = 768
+    num_layers: int = 12
+    num_heads: int = 12
+    mlp_ratio: int = 4
+    dropout_rate: float = 0.0 # Typically 0 for diffusion
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
+    use_flash_attention: bool = False # Passed down, but RoPEAttention uses NormalAttention
+    force_fp32_for_softmax: bool = True
+    norm_epsilon: float = 1e-5
+    learn_sigma: bool = False # Option to predict sigma like in DiT paper
+    use_hilbert: bool = False # Toggle Hilbert patch reorder
+
+    def setup(self):
+        self.patch_embed = PatchEmbedding(
+            patch_size=self.patch_size,
+            embedding_dim=self.emb_features,
+            dtype=self.dtype,
+            precision=self.precision
+        )
+
+        # Time embedding projection
+        self.time_embed = nn.Sequential([
+            FourierEmbedding(features=self.emb_features),
+            TimeProjection(features=self.emb_features * self.mlp_ratio), # Project to MLP dim
+            nn.Dense(features=self.emb_features, dtype=self.dtype, precision=self.precision) # Final projection
+        ])
+
+        # Text context projection (if used)
+        # Assuming textcontext is already projected to some dimension, project it to match emb_features
+        # This might need adjustment based on how text context is provided
+        self.text_proj = nn.Dense(features=self.emb_features, dtype=self.dtype, precision=self.precision, name="text_context_proj")
+
+        # Rotary Positional Embedding
+        # Max length needs to be estimated or set large enough.
+        # For images, seq len = (H/P) * (W/P). Example: 256/16 * 256/16 = 16*16 = 256
+        # Add 1 if a class token is used, or more for text tokens if concatenated.
+        # Let's assume max seq len accommodates patches + time + text tokens if needed, or just patches.
+        # If only patches use RoPE, max_len = max_image_tokens
+        # If time/text are concatenated *before* blocks, max_len needs to include them.
+        # DiT typically applies PE only to patch tokens. Let's follow that.
+        # max_len should be max number of patches.
+        # Example: max image size 512x512, patch 16 -> (512/16)^2 = 32^2 = 1024 patches
+        self.rope = RotaryEmbedding(dim=self.emb_features // self.num_heads, max_seq_len=4096, dtype=self.dtype) # Dim per head
+
+        # Transformer Blocks
+        self.blocks = [
+            DiTBlock(
+                features=self.emb_features,
+                num_heads=self.num_heads,
+                mlp_ratio=self.mlp_ratio,
+                dropout_rate=self.dropout_rate,
+                dtype=self.dtype,
+                precision=self.precision,
+                use_flash_attention=self.use_flash_attention,
+                force_fp32_for_softmax=self.force_fp32_for_softmax,
+                norm_epsilon=self.norm_epsilon,
+                rope_emb=self.rope, # Pass RoPE instance
+                name=f"dit_block_{i}"
+            ) for i in range(self.num_layers)
+        ]
+
+        # Final Layer (Normalization + Linear Projection)
+        self.final_norm = nn.LayerNorm(epsilon=self.norm_epsilon, dtype=self.dtype, name="final_norm")
+        # self.final_norm = nn.RMSNorm(epsilon=self.norm_epsilon, dtype=self.dtype, name="final_norm")
+
+        # Predict patch pixels + potentially sigma
+        output_dim = self.patch_size * self.patch_size * self.output_channels
+        if self.learn_sigma:
+            output_dim *= 2 # Predict both mean and variance (or log_variance)
+
+        self.final_proj = nn.Dense(
+            features=output_dim,
+            dtype=self.dtype,
+            precision=self.precision,
+            kernel_init=nn.initializers.zeros, # Initialize final layer to zero
+            name="final_proj"
+        )
+
+    @nn.compact
+    def __call__(self, x, temb, textcontext=None):
+        B, H, W, C = x.shape
+        assert H % self.patch_size == 0 and W % self.patch_size == 0, "Image dimensions must be divisible by patch size"
+
+        # 1. Patch Embedding
+        patches = self.patch_embed(x) # Shape: [B, num_patches, emb_features]
+        num_patches = patches.shape[1]
+
+        # Optional Hilbert reorder
+        if self.use_hilbert:
+            idx = hilbert_indices(H // self.patch_size, W // self.patch_size)
+            inv_idx = inverse_permutation(idx)
+            patches = patches[:, idx, :]
+
+        # replace x with patches
+        x_seq = patches
+
+        # 2. Prepare Conditioning Signal (Time + Text Context)
+        t_emb = self.time_embed(temb) # Shape: [B, emb_features]
+
+        cond_emb = t_emb
+        if textcontext is not None:
+            text_emb = self.text_proj(textcontext) # Shape: [B, num_text_tokens, emb_features]
+            # Pool or select text embedding (e.g., mean pool or use CLS token)
+            # Assuming mean pooling for simplicity
+            text_emb_pooled = jnp.mean(text_emb, axis=1) # Shape: [B, emb_features]
+            cond_emb = cond_emb + text_emb_pooled # Combine time and text embeddings
+
+        # 3. Apply RoPE
+        # Get RoPE frequencies for the sequence length (number of patches)
+        freqs_cis = self.rope(seq_len=num_patches) # Shape [num_patches, D_head/2]
+
+        # 4. Apply Transformer Blocks with adaLN-Zero conditioning
+        for block in self.blocks:
+            x_seq = block(x_seq, conditioning=cond_emb, freqs_cis=freqs_cis)
+
+        # 5. Final Layer
+        x_out = self.final_norm(x_seq)
+        x_out = self.final_proj(x_out) # Shape: [B, num_patches, patch_pixels (*2 if learn_sigma)]
+
+        # Optional Hilbert inverse reorder
+        if self.use_hilbert:
+            x_out = x_out[:, inv_idx, :]
+
+        # 6. Unpatchify
+        if self.learn_sigma:
+            # Split into mean and variance predictions
+            x_mean, x_logvar = jnp.split(x_out, 2, axis=-1)
+            x = unpatchify(x_mean, channels=self.output_channels)
+            # Return both mean and logvar if needed by the loss function
+            # For now, just returning the mean prediction like standard diffusion models
+            # logvar = unpatchify(x_logvar, channels=self.output_channels)
+            # return x, logvar
+            return x
+        else:
+            x = unpatchify(x_out, channels=self.output_channels) # Shape: [B, H, W, C]
+            return x
+
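Note: a minimal smoke-test sketch for the new model, not from the package itself; the input shapes (in particular `temb` as one scalar timestep per sample and a 77-token text context) are assumptions about the surrounding training code:

import jax
import jax.numpy as jnp
from flaxdiff.models.better_uvit import BetterUViT

model = BetterUViT(patch_size=16, emb_features=768, num_layers=12, num_heads=12)
x = jnp.zeros((1, 256, 256, 3))        # [B, H, W, C]
temb = jnp.zeros((1,))                 # diffusion timestep per sample (assumed shape)
textcontext = jnp.zeros((1, 77, 768))  # mean-pooled inside the model
params = model.init(jax.random.PRNGKey(0), x, temb, textcontext)
out = model.apply(params, x, temb, textcontext)  # expected [1, 256, 256, 3]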
{flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/common.py

@@ -6,6 +6,8 @@ from flax.typing import Dtype, PrecisionLike
from typing import Dict, Callable, Sequence, Any, Union
import einops
from functools import partial
+import math
+from einops import rearrange

# Kernel initializer to use
def kernel_init(scale=1.0, dtype=jnp.float32):

@@ -247,7 +249,7 @@ class Downsample(nn.Module):
        return out


-def l2norm(t, axis=1, eps=1e-12):
+def l2norm(t, axis=1, eps=1e-6): # Increased epsilon from 1e-12
    denom = jnp.clip(jnp.linalg.norm(t, ord=2, axis=axis, keepdims=True), eps)
    out = t/denom
    return (out)

@@ -266,14 +268,15 @@ class ResidualBlock(nn.Module):
    dtype: Optional[Dtype] = None
    precision: PrecisionLike = None
    named_norms:bool=False
+    norm_epsilon: float = 1e-4 # Added epsilon parameter, increased default

    def setup(self):
        if self.norm_groups > 0:
-            norm = partial(nn.GroupNorm, self.norm_groups)
+            norm = partial(nn.GroupNorm, self.norm_groups, epsilon=self.norm_epsilon)
            self.norm1 = norm(name="GroupNorm_0") if self.named_norms else norm()
            self.norm2 = norm(name="GroupNorm_1") if self.named_norms else norm()
        else:
-            norm = partial(nn.RMSNorm,
+            norm = partial(nn.RMSNorm, epsilon=self.norm_epsilon)
            self.norm1 = norm()
            self.norm2 = norm()

@@ -333,4 +336,72 @@ class ResidualBlock(nn.Module):
        out = jnp.concatenate([out, extra_features], axis=-1) if extra_features is not None else out

        return out
-
+
+# Convert Hilbert index d to 2D coordinates (x, y) for an n x n grid
+def _d2xy(n, d):
+    x = 0
+    y = 0
+    t = d
+    s = 1
+    while s < n:
+        rx = (t // 2) & 1
+        ry = (t ^ rx) & 1
+        if ry == 0:
+            if rx == 1:
+                x = n - 1 - x
+                y = n - 1 - y
+            x, y = y, x
+        x += s * rx
+        y += s * ry
+        t //= 4
+        s *= 2
+    return x, y
+
+# Hilbert index mapping for a rectangular grid of patches H_P x W_P
+
+def hilbert_indices(H_P, W_P):
+    size = max(H_P, W_P)
+    order = math.ceil(math.log2(size))
+    n = 1 << order
+    coords = []
+    for d in range(n * n):
+        x, y = _d2xy(n, d)
+        # x is column index, y is row index
+        if x < W_P and y < H_P:
+            coords.append((y, x))  # (row, col)
+        if len(coords) == H_P * W_P:
+            break
+    # Convert (row, col) to linear indices row-major
+    indices = [r * W_P + c for r, c in coords]
+    return jnp.array(indices, dtype=jnp.int32)
+
+# Inverse permutation: given idx where idx[i] = new position of element i, return inv such that inv[idx[i]] = i
+
+def inverse_permutation(idx):
+    inv = jnp.zeros_like(idx)
+    inv = inv.at[idx].set(jnp.arange(idx.shape[0], dtype=idx.dtype))
+    return inv
+
+# Patchify using Hilbert ordering: extract patches and reorder sequence
+
+def hilbert_patchify(x, patch_size):
+    B, H, W, C = x.shape
+    H_P = H // patch_size
+    W_P = W // patch_size
+    # Extract patches in row-major
+    patches = rearrange(x, 'b (h p1) (w p2) c -> b (h w) (p1 p2 c)', p1=patch_size, p2=patch_size)
+    idx = hilbert_indices(H_P, W_P)
+    return patches[:, idx, :]
+
+# Unpatchify from Hilbert ordering: reorder sequence back and reconstruct image
+
+def hilbert_unpatchify(patches, patch_size, H, W, C):
+    B, N, D = patches.shape
+    H_P = H // patch_size
+    W_P = W // patch_size
+    inv = inverse_permutation(hilbert_indices(H_P, W_P))
+    # Reorder back to row-major
+    linear = patches[:, inv, :]
+    # Reconstruct image
+    x = rearrange(linear, 'b (h w) (p1 p2 c) -> b (h p1) (w p2) c', h=H_P, w=W_P, p1=patch_size, p2=patch_size, c=C)
+    return x
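Note: a sanity-check sketch for the new Hilbert helpers; patchifying into Hilbert order and unpatchifying back should be an exact roundtrip (the 32x32 image with 8x8 patches below is an assumed example, giving a 4x4 patch grid and 16 patches of dimension 8*8*3 = 192):

import jax
import jax.numpy as jnp
from flaxdiff.models.common import hilbert_patchify, hilbert_unpatchify

x = jax.random.normal(jax.random.PRNGKey(0), (2, 32, 32, 3))
patches = hilbert_patchify(x, patch_size=8)  # [2, 16, 192], Hilbert-curve order
x_back = hilbert_unpatchify(patches, patch_size=8, H=32, W=32, C=3)
assert jnp.allclose(x, x_back)  # exact inverse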
{flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/models/simple_vit.py

@@ -10,6 +10,7 @@ from flaxdiff.models.simple_unet import FourierEmbedding, TimeProjection, ConvLa
import einops
from flax.typing import Dtype, PrecisionLike
from functools import partial
+from .common import hilbert_indices, inverse_permutation

def unpatchify(x, channels=3):
    patch_size = int((x.shape[2] // channels) ** 0.5)

@@ -55,8 +56,6 @@ class UViT(nn.Module):
    num_layers: int = 12
    num_heads: int = 12
    dropout_rate: float = 0.1
-    dtype: Any = jnp.float32
-    precision: Any = jax.lax.Precision.HIGH
    use_projection: bool = False
    use_flash_attention: bool = False
    use_self_and_cross: bool = False

@@ -65,16 +64,17 @@ class UViT(nn.Module):
    norm_groups:int=8
    dtype: Optional[Dtype] = None
    precision: PrecisionLike = None
-    # kernel_init: Callable = partial(kernel_init, scale=1.0)
    add_residualblock_output: bool = False
    norm_inputs: bool = False
    explicitly_add_residual: bool = True
+    norm_epsilon: float = 1e-4 # Added epsilon parameter, increased default
+    use_hilbert: bool = False # Toggle Hilbert patch reorder

    def setup(self):
        if self.norm_groups > 0:
-            self.norm = partial(nn.GroupNorm, self.norm_groups)
+            self.norm = partial(nn.GroupNorm, self.norm_groups, epsilon=self.norm_epsilon)
        else:
-            self.norm = partial(nn.RMSNorm,
+            self.norm = partial(nn.RMSNorm, epsilon=self.norm_epsilon)

    @nn.compact
    def __call__(self, x, temb, textcontext=None):

@@ -83,28 +83,32 @@ class UViT(nn.Module):
        temb = TimeProjection(features=self.emb_features)(temb)

        original_img = x
+        B, H, W, C = original_img.shape
+        H_P = H // self.patch_size
+        W_P = W // self.patch_size

        # Patch embedding
        x = PatchEmbedding(patch_size=self.patch_size, embedding_dim=self.emb_features,
                           dtype=self.dtype, precision=self.precision)(x)
        num_patches = x.shape[1]
-
+
+        # Optional Hilbert reorder
+        if self.use_hilbert:
+            idx = hilbert_indices(H_P, W_P)
+            inv_idx = inverse_permutation(idx)
+            x = x[:, idx, :]
+
        context_emb = nn.DenseGeneral(features=self.emb_features,
                                      dtype=self.dtype, precision=self.precision)(textcontext)
        num_text_tokens = textcontext.shape[1]

-        # print(f'Shape of x after patch embedding: {x.shape}, numPatches: {num_patches}, temb: {temb.shape}, context_emb: {context_emb.shape}')
-
        # Add time embedding
        temb = jnp.expand_dims(temb, axis=1)
        x = jnp.concatenate([x, temb, context_emb], axis=1)
-
-
+
        # Add positional encoding
        x = PositionalEncoding(max_len=x.shape[1], embedding_dim=self.emb_features)(x)

-        # print(f'Shape of x after positional encoding: {x.shape}')
-
        skips = []
        # In blocks
        for i in range(self.num_layers // 2):

@@ -114,6 +118,7 @@ class UViT(nn.Module):
                only_pure_attention=False,
                norm_inputs=self.norm_inputs,
                explicitly_add_residual=self.explicitly_add_residual,
+                norm_epsilon=self.norm_epsilon, # Pass epsilon
            )(x)
            skips.append(x)

@@ -124,9 +129,10 @@ class UViT(nn.Module):
            only_pure_attention=False,
            norm_inputs=self.norm_inputs,
            explicitly_add_residual=self.explicitly_add_residual,
+            norm_epsilon=self.norm_epsilon, # Pass epsilon
        )(x)

-        #
+        # Out blocks
        for i in range(self.num_layers // 2):
            x = jnp.concatenate([x, skips.pop()], axis=-1)
            x = nn.DenseGeneral(features=self.emb_features,

@@ -137,14 +143,18 @@ class UViT(nn.Module):
                only_pure_attention=False,
                norm_inputs=self.norm_inputs,
                explicitly_add_residual=self.explicitly_add_residual,
+                norm_epsilon=self.norm_epsilon, # Pass epsilon
            )(x)

-
-        x = self.norm()(x)
+        x = self.norm()(x) # Uses norm_epsilon defined in setup

        patch_dim = self.patch_size ** 2 * self.output_channels
        x = nn.Dense(features=patch_dim, dtype=self.dtype, precision=self.precision)(x)
-
+        # If Hilbert, restore original patch order
+        if self.use_hilbert:
+            x = x[:, inv_idx, :]
+        # Extract only the image patch tokens (first num_patches tokens)
+        x = x[:, :num_patches, :]
        x = unpatchify(x, channels=self.output_channels)

        if self.add_residualblock_output:
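Note: the existing UViT gains two opt-in fields. A hedged usage sketch (all other constructor arguments are assumed to keep their defaults):

# Enable Hilbert-curve patch ordering and the new, larger norm epsilon.
model = UViT(patch_size=16, emb_features=768, num_layers=12, num_heads=12,
             use_hilbert=True,     # reorder patch tokens along a Hilbert curve
             norm_epsilon=1e-4)    # the raised default shown in this diff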
{flaxdiff-0.2.4 → flaxdiff-0.2.6}/flaxdiff/trainer/general_diffusion_trainer.py

@@ -578,6 +578,7 @@ class GeneralDiffusionTrainer(DiffusionTrainer):
        if not hasattr(self, "wandb_sweep"):
            raise ValueError("Wandb sweep is not initialized. Cannot get best runs.")

+        print(f"Getting best runs from sweep {self.wandb_sweep.id}...")
        # Get the sweep runs
        runs = sorted(self.wandb_sweep.runs, key=lambda x: x.summary.get(metric, float('inf')))
        best_runs = runs[:top_k]

@@ -588,18 +589,46 @@ class GeneralDiffusionTrainer(DiffusionTrainer):
            print(f"\t\tRun ID: {run.id}, Metric: {run.summary.get(metric, float('inf'))}")
        return best_runs, (min(lower_bound, upper_bound), max(lower_bound, upper_bound))

-    def
+    def __get_best_general_runs__(
+        self,
+        metric: str = "train/best_loss",
+        top_k: int = 5,
+    ):
        """
-
+        Get the best runs from wandb.
+        Args:
+            metric: Metric to sort by.
+            top_k: Number of top runs to return.
+        """
+        if self.wandb is None:
+            raise ValueError("Wandb is not initialized. Cannot get best runs.")
+
+        # Get the sweep runs
+        runs = sorted(self.wandb.runs, key=lambda x: x.summary.get(metric, float('inf')))
+        best_runs = runs[:top_k]
+        lower_bound = best_runs[-1].summary.get(metric, float('inf'))
+        upper_bound = best_runs[0].summary.get(metric, float('inf'))
+        print(f"Best runs from wandb {self.wandb.id}:")
+        for run in best_runs:
+            print(f"\t\tRun ID: {run.id}, Metric: {run.summary.get(metric, float('inf'))}")
+        return best_runs, (min(lower_bound, upper_bound), max(lower_bound, upper_bound))
+
+    def __compare_run_against_best__(self, top_k=2, metric="train/best_loss", from_sweeps=False):
+        """
+        Compare the current run against the best runs from wandb.
        Args:
            top_k: Number of top runs to consider.
            metric: Metric to compare against.
+            from_sweeps: Whether to consider runs from sweeps.
        Returns:
            is_good: Whether the current run is among the best.
            is_best: Whether the current run is the best.
        """
        # Get best runs
-
+        if from_sweeps:
+            best_runs, bounds = self.__get_best_sweep_runs__(metric=metric, top_k=top_k)
+        else:
+            best_runs, bounds = self.__get_best_general_runs__(metric=metric, top_k=top_k)

        # Determine if lower or higher values are better (for loss, lower is better)
        is_lower_better = "loss" in metric.lower()

@@ -621,10 +650,10 @@ class GeneralDiffusionTrainer(DiffusionTrainer):
    def save(self, epoch=0, step=0, state=None, rngstate=None):
        super().save(epoch=epoch, step=step, state=state, rngstate=rngstate)

-        if self.wandb is not None
+        if self.wandb is not None:
            checkpoint = get_latest_checkpoint(self.checkpoint_path())
            try:
-                is_good, is_best = self.__compare_run_against_best__(top_k=5, metric="train/best_loss")
+                is_good, is_best = self.__compare_run_against_best__(top_k=5, metric="train/best_loss", from_sweeps=hasattr(self, "wandb_sweep"))
                if is_good:
                    # Push to registry with appropriate aliases
                    aliases = []
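Note: `save` now routes the ranking through sweeps only when the trainer was launched from one, falling back to all runs on the wandb handle otherwise. A hedged usage sketch of the comparison helper (the trainer instance and its wandb setup are assumed):

is_good, is_best = trainer.__compare_run_against_best__(
    top_k=5, metric="train/best_loss", from_sweeps=hasattr(trainer, "wandb_sweep")
)
if is_good:
    print("Run ranks among the top 5; the checkpoint will be pushed to the registry.")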