flaxdiff 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- flaxdiff/data/__init__.py +5 -1
- flaxdiff/data/benchmark_decord.py +443 -0
- flaxdiff/data/dataloaders.py +608 -0
- flaxdiff/data/dataset_map.py +61 -6
- flaxdiff/data/online_loader.py +779 -150
- flaxdiff/data/sources/audio_utils.py +142 -0
- flaxdiff/data/sources/av_example.py +125 -0
- flaxdiff/data/sources/av_utils.py +590 -0
- flaxdiff/data/sources/base.py +129 -0
- flaxdiff/data/sources/images.py +309 -0
- flaxdiff/data/sources/utils.py +158 -0
- flaxdiff/data/sources/videos.py +250 -0
- flaxdiff/data/sources/voxceleb2.py +412 -0
- flaxdiff/inference/__init__.py +0 -0
- flaxdiff/inference/pipeline.py +260 -0
- flaxdiff/inference/utils.py +320 -0
- flaxdiff/inputs/__init__.py +173 -0
- flaxdiff/inputs/encoders.py +98 -0
- flaxdiff/models/__init__.py +2 -1
- flaxdiff/models/attention.py +22 -16
- flaxdiff/models/autoencoder/autoencoder.py +141 -9
- flaxdiff/models/autoencoder/diffusers.py +88 -25
- flaxdiff/models/autoencoder/simple_autoenc.py +40 -8
- flaxdiff/models/common.py +8 -18
- flaxdiff/models/simple_unet.py +6 -17
- flaxdiff/models/simple_vit.py +9 -13
- flaxdiff/models/unet_3d.py +446 -0
- flaxdiff/models/unet_3d_blocks.py +505 -0
- flaxdiff/samplers/common.py +358 -96
- flaxdiff/samplers/ddim.py +44 -5
- flaxdiff/schedulers/karras.py +20 -12
- flaxdiff/trainer/__init__.py +2 -1
- flaxdiff/trainer/autoencoder_trainer.py +1 -2
- flaxdiff/trainer/diffusion_trainer.py +35 -29
- flaxdiff/trainer/general_diffusion_trainer.py +583 -0
- flaxdiff/trainer/simple_trainer.py +51 -16
- flaxdiff/utils.py +128 -57
- {flaxdiff-0.1.38.dist-info → flaxdiff-0.2.0.dist-info}/METADATA +1 -1
- flaxdiff-0.2.0.dist-info/RECORD +64 -0
- {flaxdiff-0.1.38.dist-info → flaxdiff-0.2.0.dist-info}/WHEEL +1 -1
- flaxdiff/data/datasets.py +0 -169
- flaxdiff/data/sources/gcs.py +0 -81
- flaxdiff/data/sources/tfds.py +0 -79
- flaxdiff/trainer/video_diffusion_trainer.py +0 -62
- flaxdiff-0.1.38.dist-info/RECORD +0 -50
- {flaxdiff-0.1.38.dist-info → flaxdiff-0.2.0.dist-info}/top_level.txt +0 -0
flaxdiff/inputs/__init__.py
ADDED
```diff
@@ -0,0 +1,173 @@
+import jax
+import jax.numpy as jnp
+import flax.struct as struct
+import flax.linen as nn
+from typing import Any, Callable, Dict, List, Optional, Tuple
+from dataclasses import dataclass
+from functools import partial
+import numpy as np
+from jax.sharding import Mesh, PartitionSpec as P
+from abc import ABC, abstractmethod
+
+from flaxdiff.models.autoencoder import AutoEncoder
+from .encoders import *
+
+@dataclass
+class ConditionalInputConfig:
+    """Class representing a conditional input for the model."""
+    encoder: ConditioningEncoder
+    conditioning_data_key: str = None  # Key in the batch for this conditioning input
+    pretokenized: bool = False
+    unconditional_input: Any = None
+    model_key_override: Optional[str] = None  # Optional key override for the model
+
+    __uncond_cache__ = None  # Cache for unconditional input
+
+    def __post_init__(self):
+        if self.unconditional_input is not None:
+            uncond = self.encoder([self.unconditional_input])
+        else:
+            uncond = self.encoder([""])  # Default empty text
+        self.__uncond_cache__ = uncond  # Cache the unconditional input
+
+    def __call__(self, batch_data):
+        """Process batch data to produce conditioning."""
+        key = self.conditioning_data_key if self.conditioning_data_key else self.encoder.key
+        if self.pretokenized:
+            return self.encoder.encode_from_tokens(batch_data[key])
+        return self.encoder(batch_data[key])
+
+    def get_unconditional(self):
+        """Get unconditional version of this input."""
+        return self.__uncond_cache__
+
+    def serialize(self):
+        """Serialize the configuration."""
+        serialized_config = {
+            "encoder": self.encoder.serialize(),
+            "encoder_key": self.encoder.key,
+            "conditioning_data_key": self.conditioning_data_key,
+            "unconditional_input": self.unconditional_input,
+            "model_key_override": self.model_key_override,
+        }
+        return serialized_config
+
+    @staticmethod
+    def deserialize(serialized_config):
+        """Deserialize the configuration."""
+        encoder_key = serialized_config["encoder_key"]
+        encoder_class = CONDITIONAL_ENCODERS_REGISTRY.get(encoder_key)
+        if encoder_class is None:
+            raise ValueError(f"Unknown encoder type: {encoder_key}")
+
+        # Create the encoder instance
+        encoder = encoder_class.deserialize(serialized_config["encoder"])
+        # Deserialize the rest of the configuration
+        conditioning_data_key = serialized_config.get("conditioning_data_key")
+        unconditional_input = serialized_config.get("unconditional_input")
+        model_key_override = serialized_config.get("model_key_override")
+        return ConditionalInputConfig(
+            encoder=encoder,
+            conditioning_data_key=conditioning_data_key,
+            unconditional_input=unconditional_input,
+            model_key_override=model_key_override,
+        )
+
+@dataclass
+class DiffusionInputConfig:
+    """Configuration for the input data."""
+    sample_data_key: str  # Key in the batch for the sample data
+    sample_data_shape: Tuple[int, ...]
+    conditions: List[ConditionalInputConfig]
+
+    def get_input_shapes(
+        self,
+        autoencoder: AutoEncoder = None,
+        sample_model_key: str = 'x',
+        time_embeddings_model_key: str = 'temb',
+    ) -> Dict[str, Tuple[int, ...]]:
+        """Get the shapes of the input data."""
+        if len(self.sample_data_shape) == 3:
+            H, W, C = self.sample_data_shape
+        elif len(self.sample_data_shape) == 4:
+            T, H, W, C = self.sample_data_shape
+        else:
+            raise ValueError(f"Unsupported shape for sample data {self.sample_data_shape}")
+        if autoencoder is not None:
+            downscale_factor = autoencoder.downscale_factor
+            H = H // downscale_factor
+            W = W // downscale_factor
+            C = autoencoder.latent_channels
+
+        input_shapes = {
+            sample_model_key: (H, W, C),
+            time_embeddings_model_key: (),
+        }
+        for cond in self.conditions:
+            # Get the shape of the conditioning data by calling the get_unconditional method
+            unconditional = cond.get_unconditional()
+            key = cond.model_key_override if cond.model_key_override else cond.encoder.key
+            input_shapes[key] = unconditional[0].shape
+
+        print(f"Calculated input shapes: {input_shapes}")
+        return input_shapes
+
+    def get_unconditionals(self):
+        """Get unconditional inputs for all conditions."""
+        unconditionals = []
+        for cond in self.conditions:
+            uncond = cond.get_unconditional()
+            unconditionals.append(uncond)
+        return unconditionals
+
+    def process_conditioning(self, batch_data, uncond_mask: Optional[jnp.ndarray] = None):
+        """Process the conditioning data."""
+        results = []
+
+        for cond in self.conditions:
+            cond_embeddings = cond(batch_data)
+            if uncond_mask is not None:
+                assert len(uncond_mask) == len(cond_embeddings), "Unconditional mask length must match the batch size."
+                uncond_embedding = cond.get_unconditional()
+
+                # Reshape uncond_mask to be broadcastable with the conditioning embeddings
+                # If cond_embeddings has shape (B, T, D), reshape uncond_mask to (B, 1, 1)
+                broadcast_shape = [len(uncond_mask)] + [1] * (cond_embeddings.ndim - 1)
+                reshaped_mask = jnp.reshape(uncond_mask, broadcast_shape)
+
+                # Repeat uncond_embedding to match batch size
+                batch_size = len(cond_embeddings)
+                repeated_uncond = jnp.repeat(uncond_embedding, batch_size, axis=0)
+
+                # Apply unconditional embedding based on the mask
+                cond_embeddings = jnp.where(reshaped_mask, repeated_uncond, cond_embeddings)
+
+            results.append(cond_embeddings)
+        return results
+
+    def serialize(self):
+        """Serialize the configuration."""
+        serialized_config = {
+            "sample_data_key": self.sample_data_key,
+            "sample_data_shape": self.sample_data_shape,
+            "conditions": [cond.serialize() for cond in self.conditions],
+        }
+        return serialized_config
+
+    @staticmethod
+    def deserialize(serialized_config):
+        """Deserialize the configuration."""
+        sample_data_key = serialized_config["sample_data_key"]
+        sample_data_shape = tuple(serialized_config["sample_data_shape"])
+        conditions = serialized_config["conditions"]
+
+        # Deserialize each condition
+        deserialized_conditions = []
+        for cond in conditions:
+            deserialized_conditions.append(ConditionalInputConfig.deserialize(cond))
+
+        return DiffusionInputConfig(
+            sample_data_key=sample_data_key,
+            sample_data_shape=sample_data_shape,
+            conditions=deserialized_conditions,
+        )
```
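The new `flaxdiff.inputs` package ties conditioning into the trainers: each `ConditionalInputConfig` caches an unconditional embedding at construction, and `DiffusionInputConfig.process_conditioning` substitutes that cached embedding wherever `uncond_mask` is true, which is the usual classifier-free-guidance dropout. A minimal usage sketch against the API added above; the batch contents are invented, and the `(77, 768)` shape comment assumes the default CLIP-L text encoder:

```python
import jax.numpy as jnp
from flaxdiff.inputs import ConditionalInputConfig, DiffusionInputConfig
from flaxdiff.inputs.encoders import CLIPTextEncoder

# One text condition backed by CLIP; the empty-string unconditional
# embedding is computed once in __post_init__ and cached.
text_cond = ConditionalInputConfig(
    encoder=CLIPTextEncoder.from_modelname(backend="jax"),
    unconditional_input="",
)

# A 256x256 RGB image diffusion task with a single condition.
input_config = DiffusionInputConfig(
    sample_data_key="image",
    sample_data_shape=(256, 256, 3),
    conditions=[text_cond],
)

batch = {
    "image": jnp.zeros((2, 256, 256, 3)),
    "text": ["a photo of a cat", "a photo of a dog"],  # key comes from encoder.key
}

# Drop conditioning for the first sample only (CFG-style masking).
uncond_mask = jnp.array([True, False])
(text_embeddings,) = input_config.process_conditioning(batch, uncond_mask=uncond_mask)

# Shapes for initializing the denoiser,
# e.g. {'x': (256, 256, 3), 'temb': (), 'text': (77, 768)}.
input_shapes = input_config.get_input_shapes()
```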
flaxdiff/inputs/encoders.py
ADDED
```diff
@@ -0,0 +1,98 @@
+import jax.numpy as jnp
+import flax.linen as nn
+from typing import Callable
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
+
+@dataclass
+class ConditioningEncoder(ABC):
+    model: nn.Module
+    tokenizer: Callable
+
+    @property
+    def key(self):
+        name = self.tokenizer.__name__
+        # Remove the 'Encoder' suffix from the name and lowercase it
+        if name.endswith("Encoder"):
+            name = name[:-7].lower()
+        return name
+
+    def __call__(self, data):
+        tokens = self.tokenize(data)
+        outputs = self.encode_from_tokens(tokens)
+        return outputs
+
+    def encode_from_tokens(self, tokens):
+        outputs = self.model(input_ids=tokens['input_ids'],
+                             attention_mask=tokens['attention_mask'])
+        last_hidden_state = outputs.last_hidden_state
+        return last_hidden_state
+
+    def tokenize(self, data):
+        tokens = self.tokenizer(data, padding="max_length",
+                                max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="np")
+        return tokens
+
+    @abstractmethod
+    def serialize(self):
+        """Serialize the encoder configuration."""
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def deserialize(serialized_config):
+        """Deserialize the encoder configuration."""
+        pass
+
+@dataclass
+class TextEncoder(ConditioningEncoder):
+    """Text Encoder."""
+    @property
+    def key(self):
+        return "text"
+
+@dataclass
+class CLIPTextEncoder(TextEncoder):
+    """CLIP Text Encoder."""
+    modelname: str
+    backend: str
+
+    @staticmethod
+    def from_modelname(modelname: str = "openai/clip-vit-large-patch14", backend: str = "jax"):
+        from transformers import (
+            CLIPTextModel,
+            FlaxCLIPTextModel,
+            AutoTokenizer,
+        )
+        modelname = "openai/clip-vit-large-patch14"
+        if backend == "jax":
+            model = FlaxCLIPTextModel.from_pretrained(
+                modelname, dtype=jnp.bfloat16)
+        else:
+            model = CLIPTextModel.from_pretrained(modelname)
+        tokenizer = AutoTokenizer.from_pretrained(modelname, dtype=jnp.float16)
+        return CLIPTextEncoder(
+            model=model,
+            tokenizer=tokenizer,
+            modelname=modelname,
+            backend=backend
+        )
+
+    def serialize(self):
+        """Serialize the encoder configuration."""
+        serialized_config = {
+            "modelname": self.modelname,
+            "backend": self.backend,
+        }
+        return serialized_config
+
+    @staticmethod
+    def deserialize(serialized_config):
+        """Deserialize the encoder configuration."""
+        modelname = serialized_config["modelname"]
+        backend = serialized_config["backend"]
+        return CLIPTextEncoder.from_modelname(modelname=modelname, backend=backend)
+
+CONDITIONAL_ENCODERS_REGISTRY = {
+    "text": CLIPTextEncoder,
+}
```
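`ConditioningEncoder` pairs a Hugging Face model with its tokenizer and returns the model's `last_hidden_state` as the conditioning signal. One caveat visible in the code above: `from_modelname` reassigns `modelname` to the CLIP-L default before loading, so a custom model name is currently ignored, and the `dtype` kwarg passed to `AutoTokenizer.from_pretrained` has no effect on a tokenizer. A usage sketch under those assumptions (hub access required; the printed shape assumes openai/clip-vit-large-patch14):

```python
from flaxdiff.inputs.encoders import CLIPTextEncoder

# Load the Flax CLIP text tower and its tokenizer from the Hugging Face hub.
encoder = CLIPTextEncoder.from_modelname(backend="jax")

# Tokenize and encode in one call; returns the model's last_hidden_state.
embeddings = encoder(["a watercolor painting of a fox"])
print(embeddings.shape)  # (1, 77, 768) for CLIP-L/14

# serialize() stores only the lightweight config; deserialize()
# re-fetches the weights via from_modelname.
config = encoder.serialize()  # {'modelname': 'openai/clip-vit-large-patch14', 'backend': 'jax'}
restored = CLIPTextEncoder.deserialize(config)
```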
flaxdiff/models/__init__.py
CHANGED
```diff
@@ -1 +1,2 @@
-from .simple_unet import *
+from .simple_unet import *
+# from .video_unet import FlaxUNet3DConditionModel, BCHWModelWrapper, FlaxTemporalConvLayer
```
flaxdiff/models/attention.py
CHANGED
```diff
@@ -23,7 +23,7 @@ class EfficientAttention(nn.Module):
     dtype: Optional[Dtype] = None
     precision: PrecisionLike = None
     use_bias: bool = True
-    kernel_init: Callable = kernel_init(1.0)
+    # kernel_init: Callable = kernel_init(1.0)
     force_fp32_for_softmax: bool = True
 
     def setup(self):
@@ -34,15 +34,21 @@ class EfficientAttention(nn.Module):
             self.heads * self.dim_head,
             precision=self.precision,
             use_bias=self.use_bias,
-            kernel_init=self.kernel_init,
+            # kernel_init=self.kernel_init,
             dtype=self.dtype
         )
         self.query = dense(name="to_q")
         self.key = dense(name="to_k")
         self.value = dense(name="to_v")
 
-        self.proj_attn = nn.DenseGeneral(
-
+        self.proj_attn = nn.DenseGeneral(
+            self.query_dim,
+            use_bias=False,
+            precision=self.precision,
+            # kernel_init=self.kernel_init,
+            dtype=self.dtype,
+            name="to_out_0"
+        )
         # self.attnfn = make_fast_generalized_attention(qkv_dim=inner_dim, lax_scan_unroll=16)
 
     def _reshape_tensor_to_head_dim(self, tensor):
@@ -115,7 +121,7 @@ class NormalAttention(nn.Module):
     dtype: Optional[Dtype] = None
     precision: PrecisionLike = None
     use_bias: bool = True
-    kernel_init: Callable = kernel_init(1.0)
+    # kernel_init: Callable = kernel_init(1.0)
     force_fp32_for_softmax: bool = True
 
     def setup(self):
@@ -126,7 +132,7 @@ class NormalAttention(nn.Module):
             axis=-1,
             precision=self.precision,
             use_bias=self.use_bias,
-            kernel_init=self.kernel_init,
+            # kernel_init=self.kernel_init,
             dtype=self.dtype
         )
         self.query = dense(name="to_q")
@@ -140,7 +146,7 @@ class NormalAttention(nn.Module):
             use_bias=self.use_bias,
             dtype=self.dtype,
             name="to_out_0",
-            kernel_init=self.kernel_init
+            # kernel_init=self.kernel_init
             # kernel_init=jax.nn.initializers.xavier_uniform()
         )
 
@@ -236,7 +242,7 @@ class BasicTransformerBlock(nn.Module):
     dtype: Optional[Dtype] = None
     precision: PrecisionLike = None
     use_bias: bool = True
-    kernel_init: Callable = kernel_init(1.0)
+    # kernel_init: Callable = kernel_init(1.0)
     use_flash_attention:bool = False
     use_cross_only:bool = False
     only_pure_attention:bool = False
@@ -256,7 +262,7 @@ class BasicTransformerBlock(nn.Module):
             precision=self.precision,
             use_bias=self.use_bias,
             dtype=self.dtype,
-            kernel_init=self.kernel_init,
+            # kernel_init=self.kernel_init,
             force_fp32_for_softmax=self.force_fp32_for_softmax
         )
         self.attention2 = attenBlock(
@@ -267,7 +273,7 @@ class BasicTransformerBlock(nn.Module):
             precision=self.precision,
             use_bias=self.use_bias,
             dtype=self.dtype,
-            kernel_init=self.kernel_init,
+            # kernel_init=self.kernel_init,
             force_fp32_for_softmax=self.force_fp32_for_softmax
         )
 
@@ -303,7 +309,7 @@ class TransformerBlock(nn.Module):
    use_self_and_cross:bool = True
    only_pure_attention:bool = False
    force_fp32_for_softmax: bool = True
-   kernel_init: Callable = kernel_init(1.0)
+   # kernel_init: Callable = kernel_init(1.0)
    norm_inputs: bool = True
    explicitly_add_residual: bool = True
 
@@ -317,12 +323,12 @@ class TransformerBlock(nn.Module):
         if self.use_linear_attention:
             projected_x = nn.Dense(features=inner_dim,
                                    use_bias=False, precision=self.precision,
-                                   kernel_init=self.kernel_init,
+                                   # kernel_init=self.kernel_init,
                                    dtype=self.dtype, name=f'project_in')(x)
         else:
             projected_x = nn.Conv(
                 features=inner_dim, kernel_size=(1, 1),
-                kernel_init=self.kernel_init,
+                # kernel_init=self.kernel_init,
                 strides=(1, 1), padding='VALID', use_bias=False, dtype=self.dtype,
                 precision=self.precision, name=f'project_in_conv',
             )(x)
@@ -344,19 +350,19 @@ class TransformerBlock(nn.Module):
             use_cross_only=(not self.use_self_and_cross),
             only_pure_attention=self.only_pure_attention,
             force_fp32_for_softmax=self.force_fp32_for_softmax,
-            kernel_init=self.kernel_init
+            # kernel_init=self.kernel_init
         )(projected_x, context)
 
         if self.use_projection == True:
             if self.use_linear_attention:
                 projected_x = nn.Dense(features=C, precision=self.precision,
                                        dtype=self.dtype, use_bias=False,
-                                       kernel_init=self.kernel_init,
+                                       # kernel_init=self.kernel_init,
                                        name=f'project_out')(projected_x)
             else:
                 projected_x = nn.Conv(
                     features=C, kernel_size=(1, 1),
-                    kernel_init=self.kernel_init,
+                    # kernel_init=self.kernel_init,
                     strides=(1, 1), padding='VALID', use_bias=False, dtype=self.dtype,
                     precision=self.precision, name=f'project_out_conv',
                 )(projected_x)
```
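Every change in this file comments out the custom `kernel_init` default and each place it was forwarded, so the affected `nn.Dense`, `nn.DenseGeneral`, and `nn.Conv` layers now fall back to Flax's built-in default, `lecun_normal()`. A standalone sketch of what that swap means; the variance-scaling initializer below is a stand-in for flaxdiff's `kernel_init(1.0)` helper from `models.common`, whose exact scaling is not shown in this diff:

```python
import jax
import jax.numpy as jnp
import flax.linen as nn

x = jnp.ones((1, 8))

# Before: an explicit initializer was forwarded to every projection.
explicit = nn.Dense(
    16,
    kernel_init=nn.initializers.variance_scaling(1.0, mode="fan_in", distribution="normal"),
)

# After: with kernel_init commented out, nn.Dense uses its default lecun_normal().
default = nn.Dense(16)

# Identical parameter structure either way; only the initial weight statistics differ.
p_explicit = explicit.init(jax.random.PRNGKey(0), x)
p_default = default.init(jax.random.PRNGKey(0), x)
print(jax.tree_util.tree_map(jnp.shape, p_default))
```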
flaxdiff/models/autoencoder/autoencoder.py
CHANGED
```diff
@@ -1,19 +1,151 @@
 import jax
 import jax.numpy as jnp
 from flax import linen as nn
-from typing import Dict, Callable, Sequence, Any, Union
+from typing import Dict, Callable, Sequence, Any, Union, Optional
 import einops
 from ..common import kernel_init, ConvLayer, Upsample, Downsample, PixelShuffle
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
 
-
-class AutoEncoder():
-
+@dataclass
+class AutoEncoder(ABC):
+    """Base class for autoencoder models with video support.
+
+    This class defines the interface for autoencoders and provides
+    video handling functionality, allowing child classes to focus
+    on implementing the core encoding/decoding for individual frames.
+    """
+    @abstractmethod
+    def __encode__(self, x: jnp.ndarray, **kwargs) -> jnp.ndarray:
+        """Abstract method for encoding a batch of images.
+
+        Child classes must implement this method to perform the actual encoding.
+
+        Args:
+            x: Input tensor of shape [B, H, W, C] (batch of images)
+            **kwargs: Additional arguments for the encoding process
+
+        Returns:
+            Encoded latent representation
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def __decode__(self, z: jnp.ndarray, **kwargs) -> jnp.ndarray:
+        """Abstract method for decoding a batch of latents.
+
+        Child classes must implement this method to perform the actual decoding.
+
+        Args:
+            z: Latent tensor of shape [B, h, w, c] (encoded representation)
+            **kwargs: Additional arguments for the decoding process
+
+        Returns:
+            Decoded images
+        """
         raise NotImplementedError
 
-    def
+    def encode(self, x: jnp.ndarray, key: Optional[jax.random.PRNGKey] = None, **kwargs) -> jnp.ndarray:
+        """Encode input data, with special handling for video data.
+
+        This method handles both standard image batches and video data (5D tensors).
+        For videos, it reshapes the input, processes each frame, and then restores
+        the temporal dimension.
+
+        Args:
+            x: Input tensor, either [B, H, W, C] for images or [B, T, H, W, C] for videos
+            key: Optional random key for stochastic encoding
+            **kwargs: Additional arguments passed to __encode__
+
+        Returns:
+            Encoded representation with the same batch and temporal dimensions as input
+        """
+        # Check for video data (5D tensor)
+        is_video = len(x.shape) == 5
+
+        if is_video:
+            # Extract dimensions for reshaping
+            batch_size, seq_len, height, width, channels = x.shape
+
+            # Reshape to [B*T, H, W, C] to process as regular images
+            x_reshaped = x.reshape(-1, height, width, channels)
+
+            # Encode all frames
+            latent = self.__encode__(x_reshaped, key=key, **kwargs)
+
+            # Reshape back to include temporal dimension [B, T, h, w, c]
+            latent_shape = latent.shape
+            return latent.reshape(batch_size, seq_len, *latent_shape[1:])
+        else:
+            # Standard image processing
+            return self.__encode__(x, key=key, **kwargs)
+
+    def decode(self, z: jnp.ndarray, key: Optional[jax.random.PRNGKey] = None, **kwargs) -> jnp.ndarray:
+        """Decode latent representations, with special handling for video data.
+
+        This method handles both standard image latents and video latents (5D tensors).
+        For videos, it reshapes the input, processes each frame, and then restores
+        the temporal dimension.
+
+        Args:
+            z: Latent tensor, either [B, h, w, c] for images or [B, T, h, w, c] for videos
+            key: Optional random key for stochastic decoding
+            **kwargs: Additional arguments passed to __decode__
+
+        Returns:
+            Decoded output with the same batch and temporal dimensions as input
+        """
+        # Check for video data (5D tensor)
+        is_video = len(z.shape) == 5
+
+        if is_video:
+            # Extract dimensions for reshaping
+            batch_size, seq_len, height, width, channels = z.shape
+
+            # Reshape to [B*T, h, w, c] to process as regular latents
+            z_reshaped = z.reshape(-1, height, width, channels)
+
+            # Decode all frames
+            decoded = self.__decode__(z_reshaped, key=key, **kwargs)
+
+            # Reshape back to include temporal dimension [B, T, H, W, C]
+            decoded_shape = decoded.shape
+            return decoded.reshape(batch_size, seq_len, *decoded_shape[1:])
+        else:
+            # Standard latent processing
+            return self.__decode__(z, key=key, **kwargs)
+
+    def __call__(self, x: jnp.ndarray, key: Optional[jax.random.PRNGKey] = None, **kwargs):
+        """Encode and then decode the input (autoencoder).
+
+        Args:
+            x: Input tensor, either [B, H, W, C] for images or [B, T, H, W, C] for videos
+            key: Optional random key for stochastic encoding/decoding
+            **kwargs: Additional arguments for encoding and decoding
+
+        Returns:
+            Reconstructed output with the same dimensions as input
+        """
+        if key is not None:
+            encode_key, decode_key = jax.random.split(key)
+        else:
+            encode_key = decode_key = None
+
+        # Encode then decode
+        z = self.encode(x, key=encode_key, **kwargs)
+        return self.decode(z, key=decode_key, **kwargs)
+
+    @property
+    def spatial_scale(self) -> int:
+        """Get the spatial scale factor between input and latent spaces."""
+        return getattr(self, "_spatial_scale", None)
+
+    @property
+    def name(self) -> str:
+        """Get the name of the autoencoder model."""
         raise NotImplementedError
 
-
-
-
-
+    @abstractmethod
+    def serialize(self) -> Dict[str, Any]:
+        """Serialize the model parameters and configuration."""
+        raise NotImplementedError
```
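The rewritten base class folds the time axis of a 5D video batch into the batch axis, runs the per-frame `__encode__`/`__decode__` that subclasses provide, and unfolds the result. A toy subclass to illustrate the contract; the 2x subsample/repeat "codec" below is invented purely for demonstration and is not part of flaxdiff:

```python
import jax
import jax.numpy as jnp
from flaxdiff.models.autoencoder import AutoEncoder

class ToyAutoEncoder(AutoEncoder):
    """Illustrative only: 2x subsampling as 'encode', nearest-neighbor repeat as 'decode'."""

    def __encode__(self, x: jnp.ndarray, **kwargs) -> jnp.ndarray:
        return x[:, ::2, ::2, :]  # [B, H, W, C] -> [B, H/2, W/2, C]

    def __decode__(self, z: jnp.ndarray, **kwargs) -> jnp.ndarray:
        return jnp.repeat(jnp.repeat(z, 2, axis=1), 2, axis=2)

    def serialize(self):
        return {}

video = jnp.zeros((2, 8, 64, 64, 3))  # [B, T, H, W, C]
ae = ToyAutoEncoder()

# The base class reshapes to [B*T, H, W, C], encodes, and restores [B, T, ...].
latents = ae.encode(video)
assert latents.shape == (2, 8, 32, 32, 3)

# __call__ splits the key and runs encode + decode back to the input shape.
reconstruction = ae(video, key=jax.random.PRNGKey(0))
assert reconstruction.shape == video.shape
```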