flaxdiff 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -292,14 +292,12 @@ def get_dataset_grain(
         Dictionary with train dataset function and metadata.
     """
     dataset = datasetMap[data_name]
-    train_source = dataset["source"](dataset_source, split="train")
-    # val_source = dataset["source"](dataset_source, split="val")
+    data_source = dataset["source"](dataset_source)
     augmenter = dataset["augmenter"](image_scale, method)
-
     local_batch_size = batch_size // jax.process_count()
 
     train_sampler = pygrain.IndexSampler(
-        num_records=len(train_source) if count is None else count,
+        num_records=len(data_source) if count is None else count,
         shuffle=True,
         seed=seed,
         num_epochs=num_epochs,
@@ -307,7 +305,7 @@ def get_dataset_grain(
     )
 
     # val_sampler = pygrain.IndexSampler(
-    #     num_records=len(val_source) if count is None else count,
+    #     num_records=len(data_source) if count is None else count,
     #     shuffle=False,
     #     seed=seed,
     #     num_epochs=num_epochs,
@@ -318,16 +316,10 @@ def get_dataset_grain(
         transformations = [
             augmenter(),
         ]
-
-        # if filters:
-        #     print("Adding filters to transformations")
-        #     transformations.append(filters())
-
-        # transformations.append(CaptionDeletionTransform())
         transformations.append(pygrain.Batch(local_batch_size, drop_remainder=True))
 
         loader = pygrain.DataLoader(
-            data_source=train_source,
+            data_source=data_source,
             sampler=train_sampler,
             operations=transformations,
             worker_count=worker_count,
@@ -341,26 +333,26 @@ def get_dataset_grain(
     def get_valset():
         transformations = [
             augmenter(),
-            pygrain.Batch(local_batch_size, drop_remainder=True),
+            pygrain.Batch(32, drop_remainder=True),
         ]
 
         loader = pygrain.DataLoader(
-            data_source=train_source,
+            data_source=data_source,
             sampler=train_sampler,
             operations=transformations,
-            worker_count=2,
+            worker_count=8,
             read_options=pygrain.ReadOptions(
-                read_thread_count, read_buffer_size
+                32, 128
             ),
-            worker_buffer_size=2,
+            worker_buffer_size=32,
         )
         return loader
 
     return {
         "train": get_trainset,
-        "train_len": len(train_source),
+        "train_len": len(data_source),
         "val": get_valset,
-        "val_len": len(train_source),
+        "val_len": len(data_source),
         "local_batch_size": local_batch_size,
         "global_batch_size": batch_size,
     }
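
With this hunk, both the train and validation loaders are built from the single data_source, and the validation loader's batch size (32), worker count (8), and read options (32 threads, 128-record buffer) are now hardcoded. A minimal usage sketch; the argument names are inferred from the hunks above and may differ from the released wheel:

    data = get_dataset_grain(
        data_name="laiona_coco",
        dataset_source="/home/mrwhite0racle/gcs_mount",  # mount path from the GCS defaults below
        batch_size=256,
        image_scale=256,
        method="bilinear",  # illustrative resize method
    )
    train_loader = data["train"]()   # pygrain.DataLoader over data_source
    val_loader = data["val"]()       # same source; batch size fixed at 32
    steps_per_epoch = data["train_len"] // data["global_batch_size"]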
@@ -21,8 +21,9 @@ datasetMap = {
         "augmenter": gcs_augmenters,
     },
     "laiona_coco": {
-        "source": data_source_gcs('datasets/laion12m+mscoco_filtered-new'),
+        "source": data_source_gcs('datasets/laion12m+mscoco'),
         "augmenter": gcs_augmenters,
+        "filter": gcs_filters,
     },
     "aesthetic_coyo": {
         "source": data_source_gcs('arrayrecords/aestheticCoyo_0.25clip_6aesthetic'),
@@ -11,7 +11,7 @@ import struct as st
 from functools import partial
 import numpy as np
 from .base import DataSource, DataAugmenter
-
+import traceback
 
 # ----------------------------------------------------------------------------------
 # Utility functions
@@ -79,10 +79,28 @@ def labelizer_oxford_flowers102(path):
 # TFDS Image Source
 # ----------------------------------------------------------------------------------
 
+def get_oxford_valset(text_encoder):
+    # Construct a validation set from a fixed list of prompts for consistency
+    val_prompts = ['water tulip', ' a water lily', ' a water lily', ' a photo of a rose', ' a photo of a rose', ' a water lily', ' a water lily', ' a photo of a marigold', ' a photo of a marigold', ' a photo of a marigold', ' a water lily', ' a photo of a sunflower', ' a photo of a lotus', ' columbine', ' columbine', ' an orchid', ' an orchid', ' an orchid', ' a water lily', ' a water lily', ' a water lily', ' columbine', ' columbine', ' a photo of a sunflower', ' a photo of a sunflower', ' a photo of a sunflower', ' a photo of a lotus', ' a photo of a lotus', ' a photo of a marigold', ' a photo of a marigold', ' a photo of a rose', ' a photo of a rose', ' a photo of a rose', ' orange dahlia', ' orange dahlia', ' a lenten rose', ' a lenten rose', ' a water lily', ' a water lily', ' a water lily', ' a water lily', ' an orchid', ' an orchid', ' an orchid', ' hard-leaved pocket orchid', ' bird of paradise', ' bird of paradise', ' a photo of a lovely rose', ' a photo of a lovely rose', ' a photo of a globe-flower', ' a photo of a globe-flower', ' a photo of a lovely rose', ' a photo of a lovely rose', ' a photo of a ruby-lipped cattleya', ' a photo of a ruby-lipped cattleya', ' a photo of a lovely rose', ' a water lily', ' a osteospermum', ' a osteospermum', ' a water lily', ' a water lily', ' a water lily', ' a red rose', ' a red rose']
+    val_prompts *= 100
+
+    def get_val_dataset(batch_size=128):
+        for i in range(0, len(val_prompts), batch_size):
+            try:
+                prompts = val_prompts[i:i + batch_size]
+                tokens = text_encoder.tokenize(prompts)
+                yield {"text": tokens}
+            except Exception as e:
+                print(f"Error in get_val_dataset: {e}")
+                traceback.print_exc()
+                continue
+
+    return get_val_dataset, len(val_prompts)
+
 class ImageTFDSSource(DataSource):
     """Data source for TensorFlow Datasets (TFDS) image datasets."""
 
-    def __init__(self, name: str, use_tf: bool = True):
+    def __init__(self, name: str, use_tf: bool = True, split: str = "all"):
         """Initialize a TFDS image data source.
 
         Args:
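
A hedged usage sketch for the new helper; text_encoder is assumed to expose a tokenize() method that returns token arrays (for example, something like the defaultTextEncodeModel imported in the training config):

    get_val_dataset, val_len = get_oxford_valset(text_encoder)
    for batch in get_val_dataset(batch_size=128):
        tokens = batch["text"]  # tokenized prompts, ready for conditioning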
@@ -92,8 +110,9 @@ class ImageTFDSSource(DataSource):
         """
         self.name = name
         self.use_tf = use_tf
+        self.split = split
 
-    def get_source(self, path_override: str, split: str = "all") -> Any:
+    def get_source(self, path_override: str) -> Any:
         """Get the TFDS data source.
 
         Args:
@@ -104,9 +123,9 @@ class ImageTFDSSource(DataSource):
         """
         import tensorflow_datasets as tfds
         if self.use_tf:
-            return tfds.load(self.name, split=split, shuffle_files=True)
+            return tfds.load(self.name, split=self.split, shuffle_files=True)
         else:
-            return tfds.data_source(self.name, split=split, try_gcs=False)
+            return tfds.data_source(self.name, split=self.split, try_gcs=False)
 
 
 class ImageTFDSAugmenter(DataAugmenter):
@@ -198,7 +217,7 @@ class ImageGCSSource(DataSource):
         """
         self.source = source
 
-    def get_source(self, path_override: str = "/home/mrwhite0racle/gcs_mount", split: str = "train") -> Any:
+    def get_source(self, path_override: str = "/home/mrwhite0racle/gcs_mount") -> Any:
         """Get the GCS data source.
 
         Args:
@@ -210,8 +229,6 @@ class ImageGCSSource(DataSource):
         """
         records_path = os.path.join(path_override, self.source)
         records = [os.path.join(records_path, i) for i in os.listdir(
             records_path) if 'array_record' in i]
-        if split == "val":
-            records = records[:1]
         return pygrain.ArrayRecordDataSource(records)
 
 
@@ -226,7 +243,7 @@ class CombinedImageGCSSource(DataSource):
         """
         self.sources = sources
 
-    def get_source(self, path_override: str = "/home/mrwhite0racle/gcs_mount", split: str = "train") -> Any:
+    def get_source(self, path_override: str = "/home/mrwhite0racle/gcs_mount") -> Any:
         """Get the combined GCS data source.
 
         Args:
@@ -240,8 +257,6 @@ class CombinedImageGCSSource(DataSource):
         for records_path in records_paths:
             records += [os.path.join(records_path, i) for i in os.listdir(
                 records_path) if 'array_record' in i]
-        if split == "val":
-            records = records[:1]
         return pygrain.ArrayRecordDataSource(records)
 
 class ImageGCSAugmenter(DataAugmenter):
@@ -357,9 +372,9 @@ class ImageGCSAugmenter(DataAugmenter):
 
 # These functions maintain backward compatibility with existing code
 
-def data_source_tfds(name, use_tf=True):
+def data_source_tfds(name, use_tf=True, split="all"):
     """Legacy function for TFDS data sources."""
-    source = ImageTFDSSource(name=name, use_tf=use_tf)
+    source = ImageTFDSSource(name=name, use_tf=use_tf, split=split)
     return source.get_source
 
 
@@ -389,4 +404,4 @@ def gcs_augmenters(image_scale, method):
 def gcs_filters(image_scale):
     """Legacy function for GCS Filters."""
     augmenter = ImageGCSAugmenter()
-    return augmenter.create_filter(image_scale=image_scale)
\ No newline at end of file
+    return augmenter.create_filter(image_scale=image_scale)
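
Across this file, the split moves from a per-call get_source argument to a constructor argument, and the GCS sources drop their one-record "val" shortcut entirely. A minimal sketch of the updated legacy helper; the dataset name is illustrative, and path_override is not referenced in the TFDS branch shown above:

    source_fn = data_source_tfds("oxford_flowers102", use_tf=False, split="train")
    data_source = source_fn("")  # returns tfds.data_source(...) for the chosen split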
@@ -25,6 +25,9 @@ from flaxdiff.models.autoencoder.diffusers import StableDiffusionVAE
 from flaxdiff.inputs import DiffusionInputConfig, ConditionalInputConfig
 from flaxdiff.utils import defaultTextEncodeModel
 
+from flaxdiff.models.simple_vit import UViT, SimpleUDiT
+from flaxdiff.models.simple_dit import SimpleDiT
+from flaxdiff.models.simple_mmdit import SimpleMMDiT, HierarchicalMMDiT
 from orbax.checkpoint import CheckpointManager, CheckpointManagerOptions, PyTreeCheckpointer
 import os
 
@@ -116,7 +119,10 @@ def parse_config(config, overrides=None):
     MODEL_CLASSES = {
         'unet': Unet,
         'uvit': UViT,
-        'diffusers_unet_simple': FlaxUNet2DConditionModel
+        'diffusers_unet_simple': FlaxUNet2DConditionModel,
+        'simple_dit': SimpleDiT,
+        'simple_uvit': SimpleUDiT,
+        'simple_mmdit': SimpleMMDiT,
     }
 
     # Map all the leaves of the model config, converting strings to appropriate types
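
The three new architectures become selectable by name alongside the existing ones (HierarchicalMMDiT is imported above but not registered in this map). A sketch of the lookup this enables; the config key is an assumption, not taken from parse_config:

    architecture = "simple_mmdit"  # e.g. read from the experiment config
    model_class = MODEL_CLASSES[architecture]  # -> SimpleMMDiT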
@@ -1,4 +1,3 @@
-from .simple_vit import PatchEmbedding, unpatchify
 import jax
 import jax.numpy as jnp
 from flax import linen as nn
@@ -7,6 +6,7 @@ import einops
 from functools import partial
 
 # Re-use existing components if they are suitable
+from .vit_common import PatchEmbedding, unpatchify, RotaryEmbedding, RoPEAttention, AdaLNParams
 from .common import kernel_init, FourierEmbedding, TimeProjection
 # Using NormalAttention for RoPE integration
 from .attention import NormalAttention
@@ -15,207 +15,6 @@ from flax.typing import Dtype, PrecisionLike
 # Use our improved Hilbert implementation
 from .hilbert import hilbert_indices, inverse_permutation, hilbert_patchify, hilbert_unpatchify
 
-# --- Rotary Positional Embedding (RoPE) ---
-# Adapted from https://github.com/google-deepmind/ring_attention/blob/main/ring_attention/layers/rotary.py
-
-
-def _rotate_half(x: jax.Array) -> jax.Array:
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2:]
-    return jnp.concatenate((-x2, x1), axis=-1)
-
-def apply_rotary_embedding(
-    x: jax.Array, freqs_cos: jax.Array, freqs_sin: jax.Array
-) -> jax.Array:
-    """Applies rotary embedding to the input tensor using rotate_half method."""
-    # x shape: [..., Sequence, Dimension] e.g. [B, H, S, D] or [B, S, D]
-    # freqs_cos/sin shape: [Sequence, Dimension / 2]
-
-    # Expand dims for broadcasting: [1, 1, S, D/2] or [1, S, D/2]
-    if x.ndim == 4:  # [B, H, S, D]
-        cos_freqs = jnp.expand_dims(freqs_cos, axis=(0, 1))
-        sin_freqs = jnp.expand_dims(freqs_sin, axis=(0, 1))
-    elif x.ndim == 3:  # [B, S, D]
-        cos_freqs = jnp.expand_dims(freqs_cos, axis=0)
-        sin_freqs = jnp.expand_dims(freqs_sin, axis=0)
-
-    # Duplicate cos and sin for the full dimension D
-    # Shape becomes [..., S, D]
-    cos_freqs = jnp.concatenate([cos_freqs, cos_freqs], axis=-1)
-    sin_freqs = jnp.concatenate([sin_freqs, sin_freqs], axis=-1)
-
-    # Apply rotation: x * cos + rotate_half(x) * sin
-    x_rotated = x * cos_freqs + _rotate_half(x) * sin_freqs
-    return x_rotated.astype(x.dtype)
-
-class RotaryEmbedding(nn.Module):
-    dim: int  # Dimension of the head
-    max_seq_len: int = 2048
-    base: int = 10000
-    dtype: Dtype = jnp.float32
-
-    def setup(self):
-        inv_freq = 1.0 / (
-            self.base ** (jnp.arange(0, self.dim, 2,
-                                     dtype=jnp.float32) / self.dim)
-        )
-        t = jnp.arange(self.max_seq_len, dtype=jnp.float32)
-        freqs = jnp.outer(t, inv_freq)  # Shape: [max_seq_len, dim / 2]
-
-        # Store cosine and sine separately instead of as complex numbers
-        self.freqs_cos = jnp.cos(freqs)  # Shape: [max_seq_len, dim / 2]
-        self.freqs_sin = jnp.sin(freqs)  # Shape: [max_seq_len, dim / 2]
-
-    def __call__(self, seq_len: int):
-        if seq_len > self.max_seq_len:
-            raise ValueError(
-                f"Sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}")
-        # Return separate cos and sin components
-        return self.freqs_cos[:seq_len, :], self.freqs_sin[:seq_len, :]
-
-# --- Attention with RoPE ---
-
-
-class RoPEAttention(NormalAttention):
-    rope_emb: RotaryEmbedding = None  # Instance of RotaryEmbedding
-
-    @nn.compact
-    def __call__(self, x, context=None, freqs_cis=None):
-        # x has shape [B, H, W, C] or [B, S, C]
-        orig_x_shape = x.shape
-        is_4d = len(orig_x_shape) == 4
-        if is_4d:
-            B, H, W, C = x.shape
-            seq_len = H * W
-            x = x.reshape((B, seq_len, C))
-        else:
-            B, seq_len, C = x.shape
-
-        context = x if context is None else context
-        if len(context.shape) == 4:
-            _B, _H, _W, _C = context.shape
-            context_seq_len = _H * _W
-            context = context.reshape((B, context_seq_len, _C))
-        # else: context is already [B, S_ctx, C]
-
-        query = self.query(x)  # [B, S, H, D]
-        key = self.key(context)  # [B, S_ctx, H, D]
-        value = self.value(context)  # [B, S_ctx, H, D]
-
-        # Apply RoPE to query and key
-        if freqs_cis is None:
-            # Generate frequencies using the rope_emb instance
-            seq_len_q = query.shape[1]  # Use query's sequence length
-            freqs_cos, freqs_sin = self.rope_emb(seq_len_q)
-        else:
-            # If freqs_cis is passed in as a tuple
-            freqs_cos, freqs_sin = freqs_cis
-
-        # Apply RoPE to query and key
-        # Permute to [B, H, S, D] for RoPE application
-        query = einops.rearrange(query, 'b s h d -> b h s d')
-        key = einops.rearrange(key, 'b s h d -> b h s d')
-
-        # Apply RoPE only up to the context sequence length for keys if different
-        # Assuming self-attention or context has same seq len for simplicity here
-        query = apply_rotary_embedding(query, freqs_cos, freqs_sin)
-        key = apply_rotary_embedding(key, freqs_cos, freqs_sin)  # Apply same freqs to key
-
-        # Permute back to [B, S, H, D] for dot_product_attention
-        query = einops.rearrange(query, 'b h s d -> b s h d')
-        key = einops.rearrange(key, 'b h s d -> b s h d')
-
-        hidden_states = nn.dot_product_attention(
-            query, key, value, dtype=self.dtype, broadcast_dropout=False,
-            dropout_rng=None, precision=self.precision, force_fp32_for_softmax=self.force_fp32_for_softmax,
-            deterministic=True
-        )  # Output shape [B, S, H, D]
-
-        # Use the proj_attn from NormalAttention which expects [B, S, H, D]
-        proj = self.proj_attn(hidden_states)  # Output shape [B, S, C]
-
-        if is_4d:
-            proj = proj.reshape(orig_x_shape)  # Reshape back if input was 4D
-
-        return proj
-
-# --- adaLN-Zero ---
-
-
-class AdaLNZero(nn.Module):
-    features: int
-    dtype: Optional[Dtype] = None
-    precision: PrecisionLike = None
-    norm_epsilon: float = 1e-5  # Standard LayerNorm epsilon
-
-    @nn.compact
-    def __call__(self, x, conditioning):
-        # Project conditioning signal to get scale and shift parameters
-        # Conditioning shape: [B, D_cond] -> [B, 1, ..., 1, 6 * features] for broadcasting
-        # Or [B, 1, 6*features] if x is [B, S, F]
-
-        # Ensure conditioning has seq dim if x does
-        # x=[B,S,F], cond=[B,D_cond]
-        if x.ndim == 3 and conditioning.ndim == 2:
-            conditioning = jnp.expand_dims(
-                conditioning, axis=1)  # cond=[B,1,D_cond]
-
-        # Project conditioning to get 6 params per feature (scale_mlp, shift_mlp, gate_mlp, scale_attn, shift_attn, gate_attn)
-        # Using nn.DenseGeneral for flexibility if needed, but nn.Dense is fine if cond is [B, D_cond] or [B, 1, D_cond]
-        ada_params = nn.Dense(
-            features=6 * self.features,
-            dtype=self.dtype,
-            precision=self.precision,
-            # Initialize projection to zero (Zero init)
-            kernel_init=nn.initializers.zeros,
-            name="ada_proj"
-        )(conditioning)
-
-        # Split into scale, shift, gate for MLP and Attention
-        scale_mlp, shift_mlp, gate_mlp, scale_attn, shift_attn, gate_attn = jnp.split(
-            ada_params, 6, axis=-1)
-
-        scale_mlp = jnp.clip(scale_mlp, -10.0, 10.0)
-        shift_mlp = jnp.clip(shift_mlp, -10.0, 10.0)
-        # Apply Layer Normalization
-        norm = nn.LayerNorm(epsilon=self.norm_epsilon,
-                            use_scale=False, use_bias=False, dtype=self.dtype)
-        # norm = nn.RMSNorm(epsilon=self.norm_epsilon, dtype=self.dtype)  # Alternative: RMSNorm
-
-        norm_x = norm(x)
-
-        # Modulate for Attention path
-        x_attn = norm_x * (1 + scale_attn) + shift_attn
-
-        # Modulate for MLP path
-        x_mlp = norm_x * (1 + scale_mlp) + shift_mlp
-
-        # Return modulated outputs and gates
-        return x_attn, gate_attn, x_mlp, gate_mlp
-
-class AdaLNParams(nn.Module):  # Renamed for clarity
-    features: int
-    dtype: Optional[Dtype] = None
-    precision: PrecisionLike = None
-
-    @nn.compact
-    def __call__(self, conditioning):
-        # Ensure conditioning is broadcastable if needed (e.g., [B, 1, D_cond])
-        if conditioning.ndim == 2:
-            conditioning = jnp.expand_dims(conditioning, axis=1)
-
-        # Project conditioning to get 6 params per feature
-        ada_params = nn.Dense(
-            features=6 * self.features,
-            dtype=self.dtype,
-            precision=self.precision,
-            kernel_init=nn.initializers.zeros,
-            name="ada_proj"
-        )(conditioning)
-        # Return all params (or split if preferred, but maybe return tuple/dict)
-        # Shape: [B, 1, 6*F]
-        return ada_params  # Or split and return tuple: jnp.split(ada_params, 6, axis=-1)
-
 # --- DiT Block ---
 class DiTBlock(nn.Module):
     features: int
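
The RoPE and AdaLN helpers deleted here (and their duplicates in simple_mmdit, next hunk) are now imported from .vit_common, per the import hunk above. A small self-contained sketch of the rotate-half property the removed code implemented: query/key dot products depend only on the relative position, not the absolute positions (function names here are illustrative, not the .vit_common API):

    import jax.numpy as jnp

    def rotate_half(v):
        v1, v2 = v[..., : v.shape[-1] // 2], v[..., v.shape[-1] // 2:]
        return jnp.concatenate((-v2, v1), axis=-1)

    def rope(v, pos, base=10000):
        d = v.shape[-1]
        inv_freq = 1.0 / (base ** (jnp.arange(0, d, 2, dtype=jnp.float32) / d))
        angles = pos * inv_freq
        cos = jnp.concatenate([jnp.cos(angles)] * 2, axis=-1)
        sin = jnp.concatenate([jnp.sin(angles)] * 2, axis=-1)
        return v * cos + rotate_half(v) * sin

    q, k = jnp.ones(8), jnp.arange(8.0)
    # Same relative offset (2) at different absolute positions -> same score:
    print(jnp.dot(rope(q, 3.0), rope(k, 1.0)))
    print(jnp.dot(rope(q, 7.0), rope(k, 5.0)))  # matches the line above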
@@ -7,143 +7,12 @@ from functools import partial
 from flax.typing import Dtype, PrecisionLike
 
 # Imports from local modules
-from .simple_vit import PatchEmbedding, unpatchify
+from .vit_common import PatchEmbedding, unpatchify, RotaryEmbedding, RoPEAttention
 from .common import kernel_init, FourierEmbedding, TimeProjection
 from .attention import NormalAttention  # Base for RoPEAttention
 # Replace common.hilbert_indices with improved implementation from hilbert.py
 from .hilbert import hilbert_indices, inverse_permutation, hilbert_patchify, hilbert_unpatchify
 
-# --- Rotary Positional Embedding (RoPE) ---
-# Re-used from simple_dit.py
-
-
-def _rotate_half(x: jax.Array) -> jax.Array:
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2:]
-    return jnp.concatenate((-x2, x1), axis=-1)
-
-
-def apply_rotary_embedding(
-    x: jax.Array, freqs_cos: jax.Array, freqs_sin: jax.Array
-) -> jax.Array:
-    """Applies rotary embedding to the input tensor using rotate_half method."""
-    if x.ndim == 4:  # [B, H, S, D]
-        cos_freqs = jnp.expand_dims(freqs_cos, axis=(0, 1))
-        sin_freqs = jnp.expand_dims(freqs_sin, axis=(0, 1))
-    elif x.ndim == 3:  # [B, S, D]
-        cos_freqs = jnp.expand_dims(freqs_cos, axis=0)
-        sin_freqs = jnp.expand_dims(freqs_sin, axis=0)
-    else:
-        raise ValueError(f"Unsupported input dimension: {x.ndim}")
-
-    cos_freqs = jnp.concatenate([cos_freqs, cos_freqs], axis=-1)
-    sin_freqs = jnp.concatenate([sin_freqs, sin_freqs], axis=-1)
-
-    x_rotated = x * cos_freqs + _rotate_half(x) * sin_freqs
-    return x_rotated.astype(x.dtype)
-
-
-class RotaryEmbedding(nn.Module):
-    dim: int
-    max_seq_len: int = 4096  # Increased default based on SimpleDiT
-    base: int = 10000
-    dtype: Dtype = jnp.float32
-
-    def setup(self):
-        inv_freq = 1.0 / (
-            self.base ** (jnp.arange(0, self.dim, 2,
-                                     dtype=jnp.float32) / self.dim)
-        )
-        t = jnp.arange(self.max_seq_len, dtype=jnp.float32)
-        freqs = jnp.outer(t, inv_freq)
-        self.freqs_cos = jnp.cos(freqs)
-        self.freqs_sin = jnp.sin(freqs)
-
-    def __call__(self, seq_len: int):
-        if seq_len > self.max_seq_len:
-            # Dynamically extend frequencies if needed (more robust)
-            t = jnp.arange(seq_len, dtype=jnp.float32)
-            inv_freq = 1.0 / (
-                self.base ** (jnp.arange(0, self.dim, 2,
-                                         dtype=jnp.float32) / self.dim)
-            )
-            freqs = jnp.outer(t, inv_freq)
-            freqs_cos = jnp.cos(freqs)
-            freqs_sin = jnp.sin(freqs)
-            # Consider caching extended freqs if this happens often
-            return freqs_cos, freqs_sin
-            # Or raise error like before:
-            # raise ValueError(f"Sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}")
-        return self.freqs_cos[:seq_len, :], self.freqs_sin[:seq_len, :]
-
-# --- Attention with RoPE ---
-# Re-used from simple_dit.py
-
-
-class RoPEAttention(NormalAttention):
-    rope_emb: RotaryEmbedding = None
-
-    @nn.compact
-    def __call__(self, x, context=None, freqs_cis=None):
-        orig_x_shape = x.shape
-        is_4d = len(orig_x_shape) == 4
-        if is_4d:
-            B, H, W, C = x.shape
-            seq_len = H * W
-            x = x.reshape((B, seq_len, C))
-        else:
-            B, seq_len, C = x.shape
-
-        context = x if context is None else context
-        if len(context.shape) == 4:
-            _B, _H, _W, _C = context.shape
-            context_seq_len = _H * _W
-            context = context.reshape((B, context_seq_len, _C))
-        # else: # context is already [B, S_ctx, C]
-
-        query = self.query(x)  # [B, S, H, D]
-        key = self.key(context)  # [B, S_ctx, H, D]
-        value = self.value(context)  # [B, S_ctx, H, D]
-
-        if freqs_cis is None and self.rope_emb is not None:
-            seq_len_q = query.shape[1]  # Use query's sequence length
-            freqs_cos, freqs_sin = self.rope_emb(seq_len_q)
-        elif freqs_cis is not None:
-            freqs_cos, freqs_sin = freqs_cis
-        else:
-            # Should not happen if rope_emb is provided or freqs_cis are passed
-            raise ValueError("RoPE frequencies not provided.")
-
-        # Apply RoPE to query and key
-        # Permute to [B, H, S, D] for RoPE application
-        query = einops.rearrange(query, 'b s h d -> b h s d')
-        key = einops.rearrange(key, 'b s h d -> b h s d')
-
-        # Apply RoPE only up to the context sequence length for keys if different
-        # Assuming self-attention or context has same seq len for simplicity here
-        query = apply_rotary_embedding(query, freqs_cos, freqs_sin)
-        key = apply_rotary_embedding(
-            key, freqs_cos, freqs_sin)  # Apply same freqs to key
-
-        # Permute back to [B, S, H, D] for dot_product_attention
-        query = einops.rearrange(query, 'b h s d -> b s h d')
-        key = einops.rearrange(key, 'b h s d -> b s h d')
-
-        hidden_states = nn.dot_product_attention(
-            query, key, value, dtype=self.dtype, broadcast_dropout=False,
-            dropout_rng=None, precision=self.precision, force_fp32_for_softmax=self.force_fp32_for_softmax,
-            deterministic=True
-        )
-
-        proj = self.proj_attn(hidden_states)
-
-        if is_4d:
-            proj = proj.reshape(orig_x_shape)
-
-        return proj
-
-
 # --- MM-DiT AdaLN-Zero ---
 class MMAdaLNZero(nn.Module):
     """