PyPI - flaxdiff - Versions diffs - 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl - Mend

flaxdiff 0.2.8 (py3-none-any.whl) → 0.2.9 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

flaxdiff/data/dataloaders.py +11 -19
flaxdiff/data/dataset_map.py +2 -1
flaxdiff/data/sources/images.py +29 -14
flaxdiff/inference/utils.py +7 -1
flaxdiff/models/simple_dit.py +1 -202
flaxdiff/models/simple_mmdit.py +1 -132
flaxdiff/models/simple_vit.py +217 -118
flaxdiff/models/vit_common.py +262 -0
flaxdiff/trainer/general_diffusion_trainer.py +2 -1
{flaxdiff-0.2.8.dist-info → flaxdiff-0.2.9.dist-info}/METADATA +1 -1
{flaxdiff-0.2.8.dist-info → flaxdiff-0.2.9.dist-info}/RECORD +13 -12
{flaxdiff-0.2.8.dist-info → flaxdiff-0.2.9.dist-info}/WHEEL +0 -0
{flaxdiff-0.2.8.dist-info → flaxdiff-0.2.9.dist-info}/top_level.txt +0 -0

flaxdiff/models/vit_common.py ADDED Viewed

@@ -0,0 +1,262 @@
+import jax
+import jax.numpy as jnp
+from flax import linen as nn
+from typing import Any, Optional
+import einops
+from flax.typing import Dtype, PrecisionLike
+from .attention import NormalAttention
+def unpatchify(x, channels=3):
+    """Reassemble a sequence of flattened square patches into an image grid.
+
+    Args:
+        x: Array whose axis 1 indexes patches and whose axis 2 holds one
+            flattened patch of ``patch_size ** 2 * channels`` values. The
+            number of patches must be a perfect square.
+        channels: Number of channels packed into each flattened patch.
+
+    Returns:
+        ``x`` rearranged from ``B (h w) (p1 p2 C)`` to
+        ``B (h p1) (w p2) C``, where ``h = w = sqrt(number of patches)``.
+
+    Raises:
+        AssertionError: If the patch count is not a perfect square or the
+            patch axis is not ``patch_size ** 2 * channels`` long.
+    """
+    num_patches, patch_dim = x.shape[1], x.shape[2]
+    # Both side lengths are recovered by a truncated float square root and
+    # then validated below, so non-square inputs are rejected.
+    patch_size = int((patch_dim // channels) ** 0.5)
+    h = w = int(num_patches ** .5)
+    shape_ok = (h * w == num_patches) and (patch_size ** 2 * channels == patch_dim)
+    assert shape_ok, f"Invalid shape: {x.shape}, should be {h*w}, {patch_size**2*channels}"
+    return einops.rearrange(
+        x, 'B (h w) (p1 p2 C) -> B (h p1) (w p2) C', h=h, p1=patch_size, p2=patch_size)
+class PatchEmbedding(nn.Module):
+    """Embed an image batch as a sequence of patch tokens via a strided convolution."""
+    patch_size: int      # side length of each square patch, in pixels
+    embedding_dim: int   # number of features produced per patch
+    dtype: Any = jnp.float32
+    precision: Any = jax.lax.Precision.HIGH
+    @nn.compact
+    def __call__(self, x):
+        """Project ``x`` of shape (batch, height, width, channels) to patch tokens.
+
+        Returns:
+            Array of shape (batch, -1, embedding_dim): the convolution output
+            with its spatial axes flattened into one sequence axis.
+
+        Raises:
+            AssertionError: If height or width is not a multiple of ``patch_size``.
+        """
+        batch, height, width, _ = x.shape
+        fits_evenly = height % self.patch_size == 0 and width % self.patch_size == 0
+        assert fits_evenly, "Image dimensions must be divisible by patch size"
+        # Kernel size equals stride, so each output position sees exactly one patch.
+        window = (self.patch_size, self.patch_size)
+        projector = nn.Conv(features=self.embedding_dim,
+                            kernel_size=window,
+                            strides=window,
+                            dtype=self.dtype,
+                            precision=self.precision)
+        tokens = projector(x)
+        return jnp.reshape(tokens, (batch, -1, self.embedding_dim))
+class PositionalEncoding(nn.Module):
+    """Learned additive positional embedding, initialised to zeros."""
+    max_len: int         # number of positions stored in the table
+    embedding_dim: int   # feature size of each position vector
+    @nn.compact
+    def __call__(self, x):
+        """Add the first ``x.shape[1]`` rows of the position table to ``x``."""
+        table_shape = (1, self.max_len, self.embedding_dim)
+        pe = self.param('pos_encoding', jax.nn.initializers.zeros, table_shape)
+        seq_len = x.shape[1]
+        # The leading axis of size 1 broadcasts the table over the batch.
+        return x + pe[:, :seq_len, :]
+# --- Rotary Positional Embedding (RoPE) ---
+# Adapted from https://github.com/google-deepmind/ring_attention/blob/main/ring_attention/layers/rotary.py
+def _rotate_half(x: jax.Array) -> jax.Array:
+    """Swap the two halves of the last axis, negating the half that moves to the front.
+
+    For ``x = [a, b]`` split at ``x.shape[-1] // 2`` along the last axis,
+    the result is ``[-b, a]``.
+    """
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return jnp.concatenate((-back, front), axis=-1)
+def apply_rotary_embedding(
+    x: jax.Array, freqs_cos: jax.Array, freqs_sin: jax.Array
+) -> jax.Array:
+    """Apply rotary position embedding to ``x`` using the rotate-half formulation.
+
+    Args:
+        x: Input of rank 4 (e.g. [B, H, S, D]) or rank 3 (e.g. [B, S, D]).
+        freqs_cos: Cosine table, expected as [S, D / 2].
+        freqs_sin: Sine table, expected as [S, D / 2].
+
+    Returns:
+        ``x * cos + rotate_half(x) * sin``, cast back to ``x.dtype``.
+
+    Raises:
+        ValueError: If ``x`` is neither rank 3 nor rank 4. Previously this
+            case fell through both branches and failed later with an
+            unbound-variable error.
+    """
+    # Pick the leading axes to insert so the [S, D/2] tables broadcast
+    # against x: two axes for [B, H, S, D], one for [B, S, D].
+    if x.ndim == 4:
+        lead_axes = (0, 1)
+    elif x.ndim == 3:
+        lead_axes = 0
+    else:
+        raise ValueError(
+            "apply_rotary_embedding expects a rank-3 [B, S, D] or rank-4 "
+            f"[B, H, S, D] input, got shape {x.shape}")
+    cos_freqs = jnp.expand_dims(freqs_cos, axis=lead_axes)
+    sin_freqs = jnp.expand_dims(freqs_sin, axis=lead_axes)
+    # Duplicate cos and sin along the last axis so they span the full dimension D.
+    cos_freqs = jnp.concatenate([cos_freqs, cos_freqs], axis=-1)
+    sin_freqs = jnp.concatenate([sin_freqs, sin_freqs], axis=-1)
+    # Apply rotation: x * cos + rotate_half(x) * sin
+    x_rotated = x * cos_freqs + _rotate_half(x) * sin_freqs
+    return x_rotated.astype(x.dtype)
+def _rope_frequencies(dim: int, base: int, seq_len: int):
+    """Return the (cos, sin) rotary tables for positions ``0 .. seq_len - 1``.
+
+    Each table is the cosine / sine of ``outer(position, inv_freq)``, where
+    ``inv_freq = 1 / base ** (arange(0, dim, 2) / dim)``; for even ``dim``
+    that gives shape [seq_len, dim / 2].
+    """
+    inv_freq = 1.0 / (
+        base ** (jnp.arange(0, dim, 2, dtype=jnp.float32) / dim)
+    )
+    t = jnp.arange(seq_len, dtype=jnp.float32)
+    freqs = jnp.outer(t, inv_freq)
+    return jnp.cos(freqs), jnp.sin(freqs)
+class RotaryEmbedding(nn.Module):
+    """Rotary position-embedding tables, precomputed up to ``max_seq_len`` positions."""
+    dim: int                 # feature dimension the rotation is applied over
+    max_seq_len: int = 4096  # Increased default based on SimpleDiT
+    base: int = 10000        # base of the geometric frequency progression
+    dtype: Dtype = jnp.float32  # not read by this module; tables are computed in float32
+    def setup(self):
+        """Precompute the cos/sin tables for ``max_seq_len`` positions."""
+        self.freqs_cos, self.freqs_sin = _rope_frequencies(
+            self.dim, self.base, self.max_seq_len)
+    def __call__(self, seq_len: int):
+        """Return ``(freqs_cos, freqs_sin)`` covering the first ``seq_len`` positions.
+
+        Requests longer than ``max_seq_len`` are served by recomputing the
+        tables on the fly rather than raising; the result is not cached.
+        """
+        if seq_len > self.max_seq_len:
+            return _rope_frequencies(self.dim, self.base, seq_len)
+        return self.freqs_cos[:seq_len, :], self.freqs_sin[:seq_len, :]
+# --- Attention with RoPE ---
+class RoPEAttention(NormalAttention):
+    """Attention layer that applies rotary position embedding to query and key.
+
+    The projections ``self.query``, ``self.key``, ``self.value`` and
+    ``self.proj_attn``, plus ``self.dtype``, ``self.precision`` and
+    ``self.force_fp32_for_softmax``, are not defined in this class;
+    presumably they come from ``NormalAttention`` (not visible here; confirm).
+    """
+    # Optional module that produces (cos, sin) tables when ``freqs_cis`` is not passed.
+    rope_emb: RotaryEmbedding = None
+    @nn.compact
+    def __call__(self, x, context=None, freqs_cis=None):
+        """Attend from ``x`` to ``context`` (or to ``x`` itself) with RoPE applied.
+
+        Args:
+            x: Rank-4 input [B, H, W, C] (flattened to [B, H*W, C] and
+                restored on output) or rank-3 input [B, S, C].
+            context: Optional key/value source; defaults to ``x``. A rank-4
+                context is flattened the same way as ``x``.
+            freqs_cis: Optional ``(freqs_cos, freqs_sin)`` tuple. When given it
+                is used instead of ``self.rope_emb``.
+
+        Returns:
+            Output of ``self.proj_attn``, reshaped to the original shape of
+            ``x`` when ``x`` was rank 4.
+
+        Raises:
+            ValueError: If neither ``freqs_cis`` nor ``self.rope_emb`` is available.
+        """
+        orig_x_shape = x.shape
+        is_4d = len(orig_x_shape) == 4
+        if is_4d:
+            B, H, W, C = x.shape
+            seq_len = H * W
+            x = x.reshape((B, seq_len, C))
+        else:
+            B, seq_len, C = x.shape
+        context = x if context is None else context
+        if len(context.shape) == 4:
+            _B, _H, _W, _C = context.shape
+            context_seq_len = _H * _W
+            # Uses the batch size of x (B), not the context's own _B.
+            context = context.reshape((B, context_seq_len, _C))
+        # else: # context is already [B, S_ctx, C]
+        query = self.query(x)      # [B, S, H, D]
+        key = self.key(context)    # [B, S_ctx, H, D]
+        value = self.value(context)  # [B, S_ctx, H, D]
+        # Explicitly passed freqs_cis takes precedence over self.rope_emb.
+        if freqs_cis is None and self.rope_emb is not None:
+            seq_len_q = query.shape[1]  # Use query's sequence length
+            freqs_cos, freqs_sin = self.rope_emb(seq_len_q)
+        elif freqs_cis is not None:
+            freqs_cos, freqs_sin = freqs_cis
+        else:
+            # Should not happen if rope_emb is provided or freqs_cis are passed
+            raise ValueError("RoPE frequencies not provided.")
+        # Apply RoPE to query and key
+        # Permute to [B, H, S, D] for RoPE application
+        query = einops.rearrange(query, 'b s h d -> b h s d')
+        key = einops.rearrange(key, 'b s h d -> b h s d')
+        # NOTE(review): query and key share the same cos/sin tables, which are
+        # sized by the query length when they come from rope_emb. A context
+        # whose sequence length differs from the query's would presumably
+        # fail to broadcast here; confirm before relying on cross-attention.
+        query = apply_rotary_embedding(query, freqs_cos, freqs_sin)
+        key = apply_rotary_embedding(
+            key, freqs_cos, freqs_sin)  # Apply same freqs to key
+        # Permute back to [B, S, H, D] for dot_product_attention
+        query = einops.rearrange(query, 'b h s d -> b s h d')
+        key = einops.rearrange(key, 'b h s d -> b s h d')
+        # Dropout is disabled: deterministic=True and no dropout RNG is passed.
+        hidden_states = nn.dot_product_attention(
+            query, key, value, dtype=self.dtype, broadcast_dropout=False,
+            dropout_rng=None, precision=self.precision, force_fp32_for_softmax=self.force_fp32_for_softmax,
+            deterministic=True
+        )
+        proj = self.proj_attn(hidden_states)
+        if is_4d:
+            # Restore the [B, H, W, C] layout the caller passed in.
+            proj = proj.reshape(orig_x_shape)
+        return proj
+# --- adaLN-Zero ---
+class AdaLNZero(nn.Module):
+    """Adaptive LayerNorm block producing modulated inputs and gates for attention and MLP paths.
+
+    A single dense projection of the conditioning signal (kernel initialised
+    to zeros) yields six chunks along the last axis, each ``features`` wide:
+    scale, shift and gate for the MLP path, then scale, shift and gate for
+    the attention path.
+    """
+    features: int
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
+    norm_epsilon: float = 1e-5  # epsilon passed to the LayerNorm
+    @nn.compact
+    def __call__(self, x, conditioning):
+        """Normalise ``x`` and modulate it separately for attention and MLP.
+
+        Args:
+            x: Input features, e.g. [B, S, F].
+            conditioning: Conditioning signal; a rank-2 [B, D_cond] input gets
+                a length-1 axis inserted at position 1 when ``x`` is rank 3,
+                so the derived parameters broadcast over the sequence.
+
+        Returns:
+            Tuple ``(x_attn, gate_attn, x_mlp, gate_mlp)``.
+        """
+        needs_seq_axis = x.ndim == 3 and conditioning.ndim == 2
+        if needs_seq_axis:
+            conditioning = jnp.expand_dims(conditioning, axis=1)  # [B, 1, D_cond]
+        projection = nn.Dense(
+            features=6 * self.features,
+            dtype=self.dtype,
+            precision=self.precision,
+            kernel_init=nn.initializers.zeros,  # zero-initialised projection kernel
+            name="ada_proj"
+        )
+        chunks = jnp.split(projection(conditioning), 6, axis=-1)
+        scale_mlp, shift_mlp, gate_mlp, scale_attn, shift_attn, gate_attn = chunks
+        # Only the MLP scale and shift are clipped; the attention parameters
+        # and both gates are used as projected.
+        scale_mlp = jnp.clip(scale_mlp, -10.0, 10.0)
+        shift_mlp = jnp.clip(shift_mlp, -10.0, 10.0)
+        # LayerNorm without learned scale/bias: all modulation comes from the conditioning.
+        layer_norm = nn.LayerNorm(epsilon=self.norm_epsilon,
+                                  use_scale=False, use_bias=False, dtype=self.dtype)
+        normalized = layer_norm(x)
+        x_attn = normalized * (1 + scale_attn) + shift_attn
+        x_mlp = normalized * (1 + scale_mlp) + shift_mlp
+        return x_attn, gate_attn, x_mlp, gate_mlp
+class AdaLNParams(nn.Module): # Renamed for clarity
+    """Project a conditioning signal to ``6 * features`` adaptive-LayerNorm parameters.
+
+    The projection kernel is initialised to zeros. The result is returned
+    unsplit; callers can divide it into six chunks along the last axis.
+    """
+    features: int
+    dtype: Optional[Dtype] = None
+    precision: PrecisionLike = None
+    @nn.compact
+    def __call__(self, conditioning):
+        """Return the projected parameters, e.g. [B, 1, 6 * features] for a [B, D_cond] input."""
+        # A rank-2 conditioning gets a length-1 axis at position 1 so the
+        # output can broadcast over a sequence axis.
+        cond = conditioning if conditioning.ndim != 2 else jnp.expand_dims(conditioning, axis=1)
+        projection = nn.Dense(
+            features=6 * self.features,
+            dtype=self.dtype,
+            precision=self.precision,
+            kernel_init=nn.initializers.zeros,
+            name="ada_proj"
+        )
+        return projection(cond)

flaxdiff/trainer/general_diffusion_trainer.py CHANGED Viewed

@@ -428,6 +428,7 @@ class GeneralDiffusionTrainer(DiffusionTrainer):
         generate_samples = val_step_fn
         val_ds = iter(val_ds) if val_ds else None
+        print(f"Validation loop started for process index {process_index} with {global_device_count} devices.")
         # Evaluation step
         try:
             metrics = {metric.name: [] for metric in self.eval_metrics} if self.eval_metrics else {}
@@ -487,7 +488,7 @@ class GeneralDiffusionTrainer(DiffusionTrainer):
                         self.wandb.log({
                             f"val/{key}": value,
                         }, step=current_step)
+                print(f"Validation metrics for process index {process_index}: {metrics}")
             # Close validation dataset iterator
             del val_ds

{flaxdiff-0.2.8.dist-info → flaxdiff-0.2.9.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: flaxdiff
-Version: 0.2.8
+Version: 0.2.9
 Summary: A versatile and easy to understand Diffusion library
 Author-email: Ashish Kumar Singh <ashishkmr472@gmail.com>
 License-Expression: MIT

{flaxdiff-0.2.8.dist-info → flaxdiff-0.2.9.dist-info}/RECORD RENAMED Viewed

@@ -2,20 +2,20 @@ flaxdiff/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 flaxdiff/utils.py,sha256=DmlWUY1FGz4ESxIHaPQJf92CHjsdMjyDd651wFUtyNg,8838
 flaxdiff/data/__init__.py,sha256=8W5y7NyAOWtpLi8WRawk4VYeE3DMDnM3B_jKPD8BoFQ,143
 flaxdiff/data/benchmark_decord.py,sha256=x56Db1VPmziv_9KJvWdfS0O7cffsYkF5tt5WvldOKc0,13720
-flaxdiff/data/dataloaders.py,sha256=HQR0rsLNYXRPBmdOBKFCc3UfWsmSbSO_-dOQHCbu_VA,23966
-flaxdiff/data/dataset_map.py,sha256=Dz_suGz23Cy7RfWt0FDRX7Q3NTB5SAw2UNHO_-p0qiM,5098
+flaxdiff/data/dataloaders.py,sha256=k_3YGJhiY2Wt_-7qK0Yjl4pmF2QJjX_-BlSFuXbH5-M,23628
+flaxdiff/data/dataset_map.py,sha256=p30U23RkfgMbR8kfPBDIjrjfzDBszWQ9Q1ff2BvDYZk,5116
 flaxdiff/data/online_loader.py,sha256=t1jEhdB6gWTlwx68ehj1ol_PrImbwXYiRlrJPCmNgCM,35701
 flaxdiff/data/sources/audio_utils.py,sha256=X27gG1yQt_abVOYgMtruYmZD7-8_uQCRhhTSpn4clkI,4514
 flaxdiff/data/sources/av_example.py,sha256=RIcbVKqckFqbfnV65NQotzIBxjdDuM67kD1nY8fqw5Q,3826
 flaxdiff/data/sources/av_utils.py,sha256=LCr9MJNurOaoxY-sjzkLqJS_MlX0x3gRSlKAVIglAU0,24045
 flaxdiff/data/sources/base.py,sha256=4Rm9pCtXxzoB8FO0lkDHsrX3ULoU_PNNcid978e6ir0,4610
-flaxdiff/data/sources/images.py,sha256=71TzTVbPzV-Md3-1Lk4eWfb11w6aaO01OClwK_SiCSM,14708
+flaxdiff/data/sources/images.py,sha256=ZHBmZ2fnPN75Hc2kiog-Wcs_NZJZOiqw4WcSH5WZJHA,16572
 flaxdiff/data/sources/utils.py,sha256=kFzM4_kPoThbAu54ulABmEDAR33tR50NgzXIpC0Dzjk,7316
 flaxdiff/data/sources/videos.py,sha256=NkxwEruNpAwDCM53q4WurQ802gSjQMOqjNLxYOqjoNE,9545
 flaxdiff/data/sources/voxceleb2.py,sha256=BoKfat_hsw6ObDyyaiQmPbBzuFiqgCGlgAZmf-t5Iz8,18621
 flaxdiff/inference/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 flaxdiff/inference/pipeline.py,sha256=8S30FAlXEjvrDd87H-qdD6biySQZ3cJUflU8gdmPxig,9223
-flaxdiff/inference/utils.py,sha256=MVnWl0LnC-1ILk0SsLd1YFu6igaQFR7mGhzo0jE797E,12323
+flaxdiff/inference/utils.py,sha256=JEBZYSgj-0DLJTV-TNmIAllAqqVJMn0KfryHwFO-MFs,12606
 flaxdiff/inputs/__init__.py,sha256=ybPjQsFAf5sqRVZG1sRiOl99EnwpI-NQ8HE3y7UbXmU,7197
 flaxdiff/inputs/encoders.py,sha256=pjfbx4Rk7bLoE80MOfThZDm6YtsDncRekmn0Bmg_CwI,2963
 flaxdiff/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -31,12 +31,13 @@ flaxdiff/models/common.py,sha256=QpciwuJldvLUwyAyWBQqiPPGVI-c9qLR7h7C1YoRX7w,105
 flaxdiff/models/favor_fastattn.py,sha256=79Ew1nqarsNLPzZaBSd1ILORzJr74CupYeqGiCQK5E4,27689
 flaxdiff/models/general.py,sha256=7xMME6KVKQY8sScyHYH4f-Kek4j1pRfplKShFXwVZd4,587
 flaxdiff/models/hilbert.py,sha256=AjlAv49dL6UAYWslMJfCMLiFqY4kTgpiUWr2nc1mk34,24823
-flaxdiff/models/simple_dit.py,sha256=Hc2jLOZCYSDm6x88m3bGYu-OKge1TukiQPSdlaO68rE,19667
-flaxdiff/models/simple_mmdit.py,sha256=RmOq6LbfDBUUEib6MSAURujxn9iHgdh77a6ntNsWI2w,36210
+flaxdiff/models/simple_dit.py,sha256=l238MYHRTArv_pS57aY24C2PTfxeL8EmzJ24iQqdoWI,11702
+flaxdiff/models/simple_mmdit.py,sha256=ARk0juopn2k7giln5BAUrnYD1pTFwgTJoSzrhozQ0A8,31356
 flaxdiff/models/simple_unet.py,sha256=pjeixszG_6gEY5PNFbQ7KbOyg4z5bfn4RUbINCJexOM,10758
-flaxdiff/models/simple_vit.py,sha256=QEHPyaQIYhqSYrD6eb65X70jQL-y09nRT8Yc4b5Jq6Q,15181
+flaxdiff/models/simple_vit.py,sha256=J9s3hBF87_iVrJDBe2cs9a56N7ect6pux_f_ge07XXc,17357
 flaxdiff/models/unet_3d.py,sha256=LF0PMxBKGU-_lAMtO_Coxy1yRE02yKKdgb7i6YZxI_4,20163
 flaxdiff/models/unet_3d_blocks.py,sha256=lRYDc9X1VEu54Kg7xEEphXYiQ09tabPXKi-hEcKFYug,19687
+flaxdiff/models/vit_common.py,sha256=1OGu4ezY3uzKinTnw3p8YkQAslHDqEbN78JheXnTleI,9831
 flaxdiff/models/autoencoder/__init__.py,sha256=qY-7MldZpsfkF-_T2LqlRK7VHbqfmosz0NmvzDlBkOk,78
 flaxdiff/models/autoencoder/autoencoder.py,sha256=8XWdsWvsPsyWGtzpCT8w0KXi_ZLGpRuQpn4oXo1gHKw,6039
 flaxdiff/models/autoencoder/diffusers.py,sha256=tPz77YuctrT--jF2AOL8G6vr0NiIr3RXANNrZCxe0bg,5921
@@ -62,9 +63,9 @@ flaxdiff/schedulers/sqrt.py,sha256=mCd_szmOqF6vqQKiAiEOqV_3eBIPGYrW3VxK0o4rBuo,4
 flaxdiff/trainer/__init__.py,sha256=xSoierfi26gxfgxlNnwvyyPmuPAJ--5i3mEHxt3S-AE,215
 flaxdiff/trainer/autoencoder_trainer.py,sha256=2FP2P-k9c0n_k3eT0trkq73dQrHRdBj9ObK1idcyhSw,6996
 flaxdiff/trainer/diffusion_trainer.py,sha256=reQEVWKTqKAeyCMQ-curPOfSRmBKxKooK8EVtUuorcM,14599
-flaxdiff/trainer/general_diffusion_trainer.py,sha256=FUvc--3ibRAjrYiKbA-FyLqKhusakxeNOa6UJZaK4SU,29307
+flaxdiff/trainer/general_diffusion_trainer.py,sha256=OtE2spZIBFPpY6q-ijYol5Y-CaP2UHJYIDX3PFBiPtg,29492
 flaxdiff/trainer/simple_trainer.py,sha256=Hdltuo3lgF61N04Lxc7L3z6NLveW4_h1ff7_5mu3Wbg,28730
-flaxdiff-0.2.8.dist-info/METADATA,sha256=y2jLjsEkR-GKvLWuGzlyBrk1SNM6tCPT0Oc7vRZC7_I,24057
-flaxdiff-0.2.8.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
-flaxdiff-0.2.8.dist-info/top_level.txt,sha256=-2-nXnfkJgSfkki1tjm5Faw6Dso7vhtdn2szwCdX5CQ,9
-flaxdiff-0.2.8.dist-info/RECORD,,
+flaxdiff-0.2.9.dist-info/METADATA,sha256=a8btxHRkAZVieuZfTyXgPkJbEG9fZRknEhq2Ti3_7m4,24057
+flaxdiff-0.2.9.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+flaxdiff-0.2.9.dist-info/top_level.txt,sha256=-2-nXnfkJgSfkki1tjm5Faw6Dso7vhtdn2szwCdX5CQ,9
+flaxdiff-0.2.9.dist-info/RECORD,,

{flaxdiff-0.2.8.dist-info → flaxdiff-0.2.9.dist-info}/WHEEL RENAMED Viewed

File without changes

{flaxdiff-0.2.8.dist-info → flaxdiff-0.2.9.dist-info}/top_level.txt RENAMED Viewed

File without changes

flaxdiff 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl

flaxdiff 0.2.8 (py3-none-any.whl) → 0.2.9 (py3-none-any.whl)