diffsynth_engine-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. diffsynth_engine/__init__.py +25 -0
  2. diffsynth_engine/algorithm/__init__.py +0 -0
  3. diffsynth_engine/algorithm/noise_scheduler/__init__.py +21 -0
  4. diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +10 -0
  5. diffsynth_engine/algorithm/noise_scheduler/flow_match/__init__.py +5 -0
  6. diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_beta.py +28 -0
  7. diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_ddim.py +25 -0
  8. diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +48 -0
  9. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/__init__.py +0 -0
  10. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/beta.py +26 -0
  11. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/ddim.py +28 -0
  12. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/exponential.py +19 -0
  13. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/karras.py +21 -0
  14. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/linear.py +77 -0
  15. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/sgm_uniform.py +20 -0
  16. diffsynth_engine/algorithm/sampler/__init__.py +19 -0
  17. diffsynth_engine/algorithm/sampler/flow_match/__init__.py +0 -0
  18. diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py +22 -0
  19. diffsynth_engine/algorithm/sampler/stable_diffusion/__init__.py +0 -0
  20. diffsynth_engine/algorithm/sampler/stable_diffusion/brownian_tree.py +54 -0
  21. diffsynth_engine/algorithm/sampler/stable_diffusion/ddpm.py +32 -0
  22. diffsynth_engine/algorithm/sampler/stable_diffusion/deis.py +125 -0
  23. diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m.py +29 -0
  24. diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m_sde.py +53 -0
  25. diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_3m_sde.py +59 -0
  26. diffsynth_engine/algorithm/sampler/stable_diffusion/epsilon.py +29 -0
  27. diffsynth_engine/algorithm/sampler/stable_diffusion/euler.py +12 -0
  28. diffsynth_engine/algorithm/sampler/stable_diffusion/euler_ancestral.py +30 -0
  29. diffsynth_engine/conf/tokenizers/flux/tokenizer_1/merges.txt +48895 -0
  30. diffsynth_engine/conf/tokenizers/flux/tokenizer_1/special_tokens_map.json +30 -0
  31. diffsynth_engine/conf/tokenizers/flux/tokenizer_1/tokenizer_config.json +30 -0
  32. diffsynth_engine/conf/tokenizers/flux/tokenizer_1/vocab.json +49410 -0
  33. diffsynth_engine/conf/tokenizers/flux/tokenizer_2/special_tokens_map.json +125 -0
  34. diffsynth_engine/conf/tokenizers/flux/tokenizer_2/spiece.model +0 -0
  35. diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer.json +129428 -0
  36. diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer_config.json +940 -0
  37. diffsynth_engine/conf/tokenizers/sdxl/tokenizer/merges.txt +48895 -0
  38. diffsynth_engine/conf/tokenizers/sdxl/tokenizer/special_tokens_map.json +24 -0
  39. diffsynth_engine/conf/tokenizers/sdxl/tokenizer/tokenizer_config.json +30 -0
  40. diffsynth_engine/conf/tokenizers/sdxl/tokenizer/vocab.json +49410 -0
  41. diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/merges.txt +40213 -0
  42. diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/special_tokens_map.json +24 -0
  43. diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/tokenizer_config.json +38 -0
  44. diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/vocab.json +49411 -0
  45. diffsynth_engine/conf/tokenizers/wan/umt5-xxl/special_tokens_map.json +308 -0
  46. diffsynth_engine/conf/tokenizers/wan/umt5-xxl/spiece.model +0 -0
  47. diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json +1028026 -0
  48. diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json +2748 -0
  49. diffsynth_engine/models/__init__.py +0 -0
  50. diffsynth_engine/models/base.py +55 -0
  51. diffsynth_engine/models/basic/__init__.py +0 -0
  52. diffsynth_engine/models/basic/attention.py +137 -0
  53. diffsynth_engine/models/basic/lora.py +293 -0
  54. diffsynth_engine/models/basic/relative_position_emb.py +56 -0
  55. diffsynth_engine/models/basic/timestep.py +81 -0
  56. diffsynth_engine/models/basic/transformer_helper.py +88 -0
  57. diffsynth_engine/models/basic/unet_helper.py +244 -0
  58. diffsynth_engine/models/components/__init__.py +0 -0
  59. diffsynth_engine/models/components/clip.py +56 -0
  60. diffsynth_engine/models/components/t5.py +222 -0
  61. diffsynth_engine/models/components/vae.py +393 -0
  62. diffsynth_engine/models/flux/__init__.py +14 -0
  63. diffsynth_engine/models/flux/flux_dit.py +504 -0
  64. diffsynth_engine/models/flux/flux_text_encoder.py +90 -0
  65. diffsynth_engine/models/flux/flux_vae.py +78 -0
  66. diffsynth_engine/models/sd/__init__.py +12 -0
  67. diffsynth_engine/models/sd/sd_text_encoder.py +142 -0
  68. diffsynth_engine/models/sd/sd_unet.py +293 -0
  69. diffsynth_engine/models/sd/sd_vae.py +38 -0
  70. diffsynth_engine/models/sd3/__init__.py +14 -0
  71. diffsynth_engine/models/sd3/sd3_dit.py +302 -0
  72. diffsynth_engine/models/sd3/sd3_text_encoder.py +163 -0
  73. diffsynth_engine/models/sd3/sd3_vae.py +43 -0
  74. diffsynth_engine/models/sdxl/__init__.py +13 -0
  75. diffsynth_engine/models/sdxl/sdxl_text_encoder.py +307 -0
  76. diffsynth_engine/models/sdxl/sdxl_unet.py +306 -0
  77. diffsynth_engine/models/sdxl/sdxl_vae.py +38 -0
  78. diffsynth_engine/models/utils.py +54 -0
  79. diffsynth_engine/models/wan/__init__.py +0 -0
  80. diffsynth_engine/models/wan/attention.py +200 -0
  81. diffsynth_engine/models/wan/wan_dit.py +431 -0
  82. diffsynth_engine/models/wan/wan_image_encoder.py +495 -0
  83. diffsynth_engine/models/wan/wan_text_encoder.py +264 -0
  84. diffsynth_engine/models/wan/wan_vae.py +771 -0
  85. diffsynth_engine/pipelines/__init__.py +17 -0
  86. diffsynth_engine/pipelines/base.py +216 -0
  87. diffsynth_engine/pipelines/flux_image.py +548 -0
  88. diffsynth_engine/pipelines/sd_image.py +386 -0
  89. diffsynth_engine/pipelines/sdxl_image.py +430 -0
  90. diffsynth_engine/pipelines/wan_video.py +481 -0
  91. diffsynth_engine/tokenizers/__init__.py +4 -0
  92. diffsynth_engine/tokenizers/base.py +157 -0
  93. diffsynth_engine/tokenizers/clip.py +288 -0
  94. diffsynth_engine/tokenizers/t5.py +194 -0
  95. diffsynth_engine/tokenizers/wan.py +79 -0
  96. diffsynth_engine/utils/__init__.py +0 -0
  97. diffsynth_engine/utils/constants.py +34 -0
  98. diffsynth_engine/utils/download.py +139 -0
  99. diffsynth_engine/utils/env.py +7 -0
  100. diffsynth_engine/utils/fp8_linear.py +64 -0
  101. diffsynth_engine/utils/gguf.py +415 -0
  102. diffsynth_engine/utils/loader.py +14 -0
  103. diffsynth_engine/utils/lock.py +56 -0
  104. diffsynth_engine/utils/logging.py +12 -0
  105. diffsynth_engine/utils/offload.py +44 -0
  106. diffsynth_engine/utils/parallel.py +191 -0
  107. diffsynth_engine/utils/prompt.py +9 -0
  108. diffsynth_engine/utils/video.py +40 -0
  109. diffsynth_engine-0.1.0.dist-info/LICENSE +201 -0
  110. diffsynth_engine-0.1.0.dist-info/METADATA +237 -0
  111. diffsynth_engine-0.1.0.dist-info/RECORD +113 -0
  112. diffsynth_engine-0.1.0.dist-info/WHEEL +5 -0
  113. diffsynth_engine-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,54 @@
+ import torch
+ import torch.nn as nn
+ from contextlib import contextmanager
+
+
+ # modified from transformers.modeling_utils
+ TORCH_INIT_FUNCTIONS = {
+     "uniform_": nn.init.uniform_,
+     "normal_": nn.init.normal_,
+     "trunc_normal_": nn.init.trunc_normal_,
+     "constant_": nn.init.constant_,
+     "xavier_uniform_": nn.init.xavier_uniform_,
+     "xavier_normal_": nn.init.xavier_normal_,
+     "kaiming_uniform_": nn.init.kaiming_uniform_,
+     "kaiming_normal_": nn.init.kaiming_normal_,
+     "uniform": nn.init.uniform,
+     "normal": nn.init.normal,
+     "xavier_uniform": nn.init.xavier_uniform,
+     "xavier_normal": nn.init.xavier_normal,
+     "kaiming_uniform": nn.init.kaiming_uniform,
+     "kaiming_normal": nn.init.kaiming_normal,
+ }
+
+ _init_weights = True
+
+
+ @contextmanager
+ def no_init_weights():
+     """
+     Context manager to globally disable weight initialization to speed up loading large models.
+     """
+     global _init_weights
+     old_init_weights = _init_weights
+
+     def _skip_init(*args, **kwargs):
+         pass
+
+     _init_weights = False
+     # Replace every initialization function with a no-op
+     for name, init_func in TORCH_INIT_FUNCTIONS.items():
+         setattr(torch.nn.init, name, _skip_init)
+     try:
+         yield
+     finally:
+         _init_weights = old_init_weights
+         # Restore the original initialization functions
+         for name, init_func in TORCH_INIT_FUNCTIONS.items():
+             setattr(torch.nn.init, name, init_func)
+
+
+ def zero_module(module: nn.Module):
+     for p in module.parameters():
+         nn.init.zeros_(p)
+     return module
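Note: `no_init_weights` works by monkey-patching every function listed in `TORCH_INIT_FUNCTIONS` on `torch.nn.init`, so any module constructed inside the context skips random initialization that a subsequent checkpoint load would overwrite anyway. A minimal usage sketch; the model class and checkpoint path below are hypothetical, not part of the package:

import torch
import torch.nn as nn

from diffsynth_engine.models.utils import no_init_weights


class TinyNet(nn.Module):  # hypothetical example module
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(1024, 1024)


with no_init_weights():
    # nn.init.* are no-ops here, so construction leaves weights
    # uninitialized instead of spending time on random init.
    model = TinyNet()

# All weights then come from the checkpoint ("tiny.pt" is a placeholder path).
model.load_state_dict(torch.load("tiny.pt"))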
File without changes
@@ -0,0 +1,200 @@
+ import torch
+ import warnings
+
+ try:
+     import flash_attn_interface
+
+     FLASH_ATTN_3_AVAILABLE = True
+ except ModuleNotFoundError:
+     FLASH_ATTN_3_AVAILABLE = False
+
+ try:
+     import flash_attn
+
+     FLASH_ATTN_2_AVAILABLE = True
+ except ModuleNotFoundError:
+     FLASH_ATTN_2_AVAILABLE = False
+
+
+ def flash_attention(
+     q,
+     k,
+     v,
+     q_lens=None,
+     k_lens=None,
+     dropout_p=0.0,
+     softmax_scale=None,
+     q_scale=None,
+     causal=False,
+     window_size=(-1, -1),
+     deterministic=False,
+     dtype=torch.bfloat16,
+     version=None,
+ ):
+     """
+     q: [B, Lq, Nq, C1].
+     k: [B, Lk, Nk, C1].
+     v: [B, Lk, Nk, C2]. Nq must be divisible by Nk.
+     q_lens: [B].
+     k_lens: [B].
+     dropout_p: float. Dropout probability.
+     softmax_scale: float. The scaling of QK^T before applying softmax.
+     causal: bool. Whether to apply a causal attention mask.
+     window_size: (left, right). If not (-1, -1), apply sliding window local attention.
+     deterministic: bool. If True, slightly slower and uses more memory.
+     dtype: torch.dtype. Applied when the dtype of q/k/v is not float16/bfloat16.
+     """
+     half_dtypes = (torch.float16, torch.bfloat16)
+     assert dtype in half_dtypes
+     assert q.device.type == "cuda" and q.size(-1) <= 256
+
+     # params
+     b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
+
+     def half(x):
+         return x if x.dtype in half_dtypes else x.to(dtype)
+
+     # preprocess query
+     if q_lens is None:
+         q = half(q.flatten(0, 1))
+         q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(device=q.device, non_blocking=True)
+     else:
+         q = half(torch.cat([u[:v] for u, v in zip(q, q_lens)]))
+
+     # preprocess key, value
+     if k_lens is None:
+         k = half(k.flatten(0, 1))
+         v = half(v.flatten(0, 1))
+         k_lens = torch.tensor([lk] * b, dtype=torch.int32).to(device=k.device, non_blocking=True)
+     else:
+         k = half(torch.cat([u[:v] for u, v in zip(k, k_lens)]))
+         v = half(torch.cat([u[:v] for u, v in zip(v, k_lens)]))
+
+     q = q.to(v.dtype)
+     k = k.to(v.dtype)
+
+     if q_scale is not None:
+         q = q * q_scale
+
+     if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE:
+         warnings.warn("Flash attention 3 is not available, falling back to flash attention 2.")
+
+     # apply attention
+     if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE:
+         # Note: dropout_p, window_size are not supported in FA3 now.
+         x = flash_attn_interface.flash_attn_varlen_func(
+             q=q,
+             k=k,
+             v=v,
+             cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens])
+             .cumsum(0, dtype=torch.int32)
+             .to(q.device, non_blocking=True),
+             cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens])
+             .cumsum(0, dtype=torch.int32)
+             .to(q.device, non_blocking=True),
+             seqused_q=None,
+             seqused_k=None,
+             max_seqlen_q=lq,
+             max_seqlen_k=lk,
+             softmax_scale=softmax_scale,
+             causal=causal,
+             deterministic=deterministic,
+         )[0].unflatten(0, (b, lq))
+     elif FLASH_ATTN_2_AVAILABLE:
+         x = flash_attn.flash_attn_varlen_func(
+             q=q,
+             k=k,
+             v=v,
+             cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens])
+             .cumsum(0, dtype=torch.int32)
+             .to(q.device, non_blocking=True),
+             cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens])
+             .cumsum(0, dtype=torch.int32)
+             .to(q.device, non_blocking=True),
+             max_seqlen_q=lq,
+             max_seqlen_k=lk,
+             dropout_p=dropout_p,
+             softmax_scale=softmax_scale,
+             causal=causal,
+             window_size=window_size,
+             deterministic=deterministic,
+         ).unflatten(0, (b, lq))
+     else:
+         q = q.unsqueeze(0).transpose(1, 2).to(dtype)
+         k = k.unsqueeze(0).transpose(1, 2).to(dtype)
+         v = v.unsqueeze(0).transpose(1, 2).to(dtype)
+         x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
+         x = x.transpose(1, 2).contiguous()
+
+     # output
+     return x.type(out_dtype)
+
+
+ def create_sdpa_mask(q, k, q_lens, k_lens, causal=False):
+     b, lq, lk = q.size(0), q.size(1), k.size(1)
+     if q_lens is None:
+         q_lens = torch.tensor([lq] * b, dtype=torch.int32)
+     if k_lens is None:
+         k_lens = torch.tensor([lk] * b, dtype=torch.int32)
+     attn_mask = torch.zeros((b, lq, lk), dtype=torch.bool)
+     for i in range(b):
+         q_len, k_len = q_lens[i], k_lens[i]
+         attn_mask[i, q_len:, :] = True
+         attn_mask[i, :, k_len:] = True
+
+         if causal:
+             causal_mask = torch.triu(torch.ones((lq, lk), dtype=torch.bool), diagonal=1)
+             attn_mask[i, :, :] = torch.logical_or(attn_mask[i, :, :], causal_mask)
+
+     attn_mask = attn_mask.logical_not().to(q.device, non_blocking=True)
+     return attn_mask
+
+
+ def attention(
+     q,
+     k,
+     v,
+     q_lens=None,
+     k_lens=None,
+     dropout_p=0.0,
+     softmax_scale=None,
+     q_scale=None,
+     causal=False,
+     window_size=(-1, -1),
+     deterministic=False,
+     dtype=torch.bfloat16,
+     fa_version=None,
+ ):
+     if FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE:
+         return flash_attention(
+             q=q,
+             k=k,
+             v=v,
+             q_lens=q_lens,
+             k_lens=k_lens,
+             dropout_p=dropout_p,
+             softmax_scale=softmax_scale,
+             q_scale=q_scale,
+             causal=causal,
+             window_size=window_size,
+             deterministic=deterministic,
+             dtype=dtype,
+             version=fa_version,
+         )
+     else:
+         if q_lens is not None or k_lens is not None:
+             warnings.warn(
+                 "Padding mask is disabled when using scaled_dot_product_attention; this can have a significant impact on the results."
+             )
+         attn_mask = None
+
+         q = q.transpose(1, 2).to(dtype)
+         k = k.transpose(1, 2).to(dtype)
+         v = v.transpose(1, 2).to(dtype)
+
+         out = torch.nn.functional.scaled_dot_product_attention(
+             q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p
+         )
+
+         out = out.transpose(1, 2).contiguous()
+         return out
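The `attention` entry point above dispatches to the FlashAttention varlen kernels when `flash_attn` or `flash_attn_interface` is importable, and otherwise falls back to `torch.nn.functional.scaled_dot_product_attention` without a padding mask. A minimal call sketch under assumed shapes (all sizes are arbitrary; `k_lens` marks per-sample key/value padding, which only the FlashAttention path honors):

import torch

from diffsynth_engine.models.wan.attention import attention

b, lq, lk, num_heads, head_dim = 2, 16, 24, 8, 64
q = torch.randn(b, lq, num_heads, head_dim, device="cuda", dtype=torch.bfloat16)
k = torch.randn(b, lk, num_heads, head_dim, device="cuda", dtype=torch.bfloat16)
v = torch.randn(b, lk, num_heads, head_dim, device="cuda", dtype=torch.bfloat16)

# Valid key/value length per sample; trailing positions are padding.
k_lens = torch.tensor([24, 18], dtype=torch.int32)

out = attention(q, k, v, k_lens=k_lens)  # [B, Lq, N, C] on the FlashAttention path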
@@ -0,0 +1,431 @@
+ import math
+ import json
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from typing import Dict, Tuple, Optional
+ from einops import rearrange
+
+ from diffsynth_engine.models.base import StateDictConverter, PreTrainedModel
+ from diffsynth_engine.models.utils import no_init_weights
+ from diffsynth_engine.utils.constants import (
+     WAN_DIT_1_3B_T2V_CONFIG_FILE,
+     WAN_DIT_14B_I2V_CONFIG_FILE,
+     WAN_DIT_14B_T2V_CONFIG_FILE,
+ )
+
+
+ def attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, num_heads: int):
+     q, k, v = (rearrange(t, "b s (n d) -> b n s d", n=num_heads) for t in (q, k, v))
+     x = F.scaled_dot_product_attention(q, k, v)
+     x = rearrange(x, "b n s d -> b s (n d)", n=num_heads)
+     return x
+
+
+ def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor):
+     return x * (1 + scale) + shift
+
+
+ def sinusoidal_embedding_1d(dim, position):
+     sinusoid = torch.outer(
+         position.type(torch.float64),
+         torch.pow(10000, -torch.arange(dim // 2, dtype=torch.float64, device=position.device).div(dim // 2)),
+     )
+     x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
+     return x.to(position.dtype)
+
+
+ def precompute_freqs_cis_3d(dim: int, end: int = 1024, theta: float = 10000.0):
+     # 3D RoPE precompute: split head channels across frame/height/width axes
+     f_freqs_cis = precompute_freqs_cis(dim - 2 * (dim // 3), end, theta)
+     h_freqs_cis = precompute_freqs_cis(dim // 3, end, theta)
+     w_freqs_cis = precompute_freqs_cis(dim // 3, end, theta)
+     return f_freqs_cis, h_freqs_cis, w_freqs_cis
+
+
+ def precompute_freqs_cis(dim: int, end: int = 1024, theta: float = 10000.0):
+     # 1D RoPE precompute
+     freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].double() / dim))
+     freqs = torch.outer(torch.arange(end, device=freqs.device), freqs)
+     freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex128
+     return freqs_cis
+
+
+ def rope_apply(x, freqs, num_heads):
+     x = rearrange(x, "b s (n d) -> b s n d", n=num_heads)
+     x_out = torch.view_as_complex(x.to(torch.float64).reshape(x.shape[0], x.shape[1], x.shape[2], -1, 2))
+     x_out = torch.view_as_real(x_out * freqs).flatten(2)
+     return x_out.to(x.dtype)
+
+
+ class RMSNorm(nn.Module):
+     def __init__(
+         self,
+         dim,
+         eps=1e-5,
+         device: str = "cuda:0",
+         dtype: torch.dtype = torch.bfloat16,
+     ):
+         super().__init__()
+         self.eps = eps
+         self.dim = dim
+         self.weight = nn.Parameter(torch.ones(dim, device=device, dtype=dtype))
+
+     def norm(self, x):
+         return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
+
+     def forward(self, x):
+         return self.norm(x.float()).to(x.dtype) * self.weight
+
+
+ class SelfAttention(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         num_heads: int,
+         eps: float = 1e-6,
+         device: str = "cuda:0",
+         dtype: torch.dtype = torch.bfloat16,
+     ):
+         super().__init__()
+         self.dim = dim
+         self.head_dim = dim // num_heads
+
+         self.q = nn.Linear(dim, dim, device=device, dtype=dtype)
+         self.k = nn.Linear(dim, dim, device=device, dtype=dtype)
+         self.v = nn.Linear(dim, dim, device=device, dtype=dtype)
+         self.o = nn.Linear(dim, dim, device=device, dtype=dtype)
+         self.norm_q = RMSNorm(dim, eps=eps, device=device, dtype=dtype)
+         self.norm_k = RMSNorm(dim, eps=eps, device=device, dtype=dtype)
+
+     def forward(self, x, freqs):
+         q = self.norm_q(self.q(x))
+         k = self.norm_k(self.k(x))
+         v = self.v(x)
+         num_heads = q.shape[2] // self.head_dim
+         x = attention(q=rope_apply(q, freqs, num_heads), k=rope_apply(k, freqs, num_heads), v=v, num_heads=num_heads)
+         return self.o(x)
+
+
+ class CrossAttention(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         num_heads: int,
+         eps: float = 1e-6,
+         has_image_input: bool = False,
+         device: str = "cuda:0",
+         dtype: torch.dtype = torch.bfloat16,
+     ):
+         super().__init__()
+         self.dim = dim
+         self.head_dim = dim // num_heads
+
+         self.q = nn.Linear(dim, dim, device=device, dtype=dtype)
+         self.k = nn.Linear(dim, dim, device=device, dtype=dtype)
+         self.v = nn.Linear(dim, dim, device=device, dtype=dtype)
+         self.o = nn.Linear(dim, dim, device=device, dtype=dtype)
+         self.norm_q = RMSNorm(dim, eps=eps, device=device, dtype=dtype)
+         self.norm_k = RMSNorm(dim, eps=eps, device=device, dtype=dtype)
+         self.has_image_input = has_image_input
+         if has_image_input:
+             self.k_img = nn.Linear(dim, dim, device=device, dtype=dtype)
+             self.v_img = nn.Linear(dim, dim, device=device, dtype=dtype)
+             self.norm_k_img = RMSNorm(dim, eps=eps, device=device, dtype=dtype)
+
+     def forward(self, x: torch.Tensor, y: torch.Tensor):
+         if self.has_image_input:
+             img = y[:, :257]
+             ctx = y[:, 257:]
+         else:
+             ctx = y
+         q = self.norm_q(self.q(x))
+         k = self.norm_k(self.k(ctx))
+         v = self.v(ctx)
+         num_heads = q.shape[2] // self.head_dim
+         x = attention(q, k, v, num_heads=num_heads)
+         if self.has_image_input:
+             k_img = self.norm_k_img(self.k_img(img))
+             v_img = self.v_img(img)
+             y = attention(q, k_img, v_img, num_heads=num_heads)
+             x = x + y
+         return self.o(x)
+
+
+ class DiTBlock(nn.Module):
+     def __init__(
+         self,
+         has_image_input: bool,
+         dim: int,
+         num_heads: int,
+         ffn_dim: int,
+         eps: float = 1e-6,
+         device: str = "cuda:0",
+         dtype: torch.dtype = torch.bfloat16,
+     ):
+         super().__init__()
+         self.dim = dim
+         self.num_heads = num_heads
+         self.ffn_dim = ffn_dim
+
+         self.self_attn = SelfAttention(dim, num_heads, eps, device=device, dtype=dtype)
+         self.cross_attn = CrossAttention(
+             dim, num_heads, eps, has_image_input=has_image_input, device=device, dtype=dtype
+         )
+         self.norm1 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False, device=device, dtype=dtype)
+         self.norm2 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False, device=device, dtype=dtype)
+         self.norm3 = nn.LayerNorm(dim, eps=eps, device=device, dtype=dtype)
+         self.ffn = nn.Sequential(
+             nn.Linear(dim, ffn_dim, device=device, dtype=dtype),
+             nn.GELU(approximate="tanh"),
+             nn.Linear(ffn_dim, dim, device=device, dtype=dtype),
+         )
+         self.modulation = nn.Parameter(torch.randn(1, 6, dim, device=device, dtype=dtype) / dim**0.5)
+
+     def forward(self, x, context, t_mod, freqs):
+         # msa: multi-head self-attention; mlp: multi-layer perceptron
+         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.modulation + t_mod).chunk(6, dim=1)
+         input_x = modulate(self.norm1(x), shift_msa, scale_msa)
+         x = x + gate_msa * self.self_attn(input_x, freqs)
+         x = x + self.cross_attn(self.norm3(x), context)
+         input_x = modulate(self.norm2(x), shift_mlp, scale_mlp)
+         x = x + gate_mlp * self.ffn(input_x)
+         return x
+
+
+ class MLP(torch.nn.Module):
+     def __init__(
+         self,
+         in_dim,
+         out_dim,
+         device: str = "cuda:0",
+         dtype: torch.dtype = torch.bfloat16,
+     ):
+         super().__init__()
+         self.proj = torch.nn.Sequential(
+             nn.LayerNorm(in_dim, device=device, dtype=dtype),
+             nn.Linear(in_dim, in_dim, device=device, dtype=dtype),
+             nn.GELU(),
+             nn.Linear(in_dim, out_dim, device=device, dtype=dtype),
+             nn.LayerNorm(out_dim, device=device, dtype=dtype),
+         )
+
+     def forward(self, x):
+         return self.proj(x)
+
+
+ class Head(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         out_dim: int,
+         patch_size: Tuple[int, int, int],
+         eps: float,
+         device: str = "cuda:0",
+         dtype: torch.dtype = torch.bfloat16,
+     ):
+         super().__init__()
+         self.dim = dim
+         self.patch_size = patch_size
+         self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=False, device=device, dtype=dtype)
+         self.head = nn.Linear(dim, out_dim * math.prod(patch_size), device=device, dtype=dtype)
+         self.modulation = nn.Parameter(torch.randn(1, 2, dim, device=device, dtype=dtype) / dim**0.5)
+
+     def forward(self, x, t_mod):
+         shift, scale = (self.modulation + t_mod).chunk(2, dim=1)
+         x = self.head(self.norm(x) * (1 + scale) + shift)
+         return x
+
+
+ class WanDiTStateDictConverter(StateDictConverter):
+     def convert(self, state_dict):
+         return state_dict
+
+
+ class WanDiT(PreTrainedModel):
+     converter = WanDiTStateDictConverter()
+
+     def __init__(
+         self,
+         dim: int,
+         in_dim: int,
+         ffn_dim: int,
+         out_dim: int,
+         text_dim: int,
+         freq_dim: int,
+         eps: float,
+         patch_size: Tuple[int, int, int],
+         num_heads: int,
+         num_layers: int,
+         has_image_input: bool,
+         device: str = "cuda:0",
+         dtype: torch.dtype = torch.bfloat16,
+     ):
+         super().__init__()
+
+         self.dim = dim
+         self.freq_dim = freq_dim
+         self.has_image_input = has_image_input
+         self.patch_size = patch_size
+
+         self.patch_embedding = nn.Conv3d(
+             in_dim, dim, kernel_size=patch_size, stride=patch_size, device=device, dtype=dtype
+         )
+         self.text_embedding = nn.Sequential(
+             nn.Linear(text_dim, dim, device=device, dtype=dtype),
+             nn.GELU(approximate="tanh"),
+             nn.Linear(dim, dim, device=device, dtype=dtype),
+         )
+         self.time_embedding = nn.Sequential(
+             nn.Linear(freq_dim, dim, device=device, dtype=dtype),
+             nn.SiLU(),
+             nn.Linear(dim, dim, device=device, dtype=dtype),
+         )
+         self.time_projection = nn.Sequential(
+             nn.SiLU(),
+             nn.Linear(dim, dim * 6, device=device, dtype=dtype),
+         )
+         self.blocks = nn.ModuleList(
+             [
+                 DiTBlock(has_image_input, dim, num_heads, ffn_dim, eps, device=device, dtype=dtype)
+                 for _ in range(num_layers)
+             ]
+         )
+         self.head = Head(dim, out_dim, patch_size, eps, device=device, dtype=dtype)
+         head_dim = dim // num_heads
+         self.freqs = precompute_freqs_cis_3d(head_dim)
+
+         if has_image_input:
+             self.img_emb = MLP(1280, dim, device=device, dtype=dtype)  # clip_feature_dim = 1280
+
+     def patchify(self, x: torch.Tensor):
+         x = self.patch_embedding(x)  # (b, c, f, h, w) -> (b, dim, f', h', w')
+         grid_size = x.shape[2:]
+         x = rearrange(x, "b c f h w -> b (f h w) c").contiguous()
+         return x, grid_size  # grid_size: (f', h', w')
+
+     def unpatchify(self, x: torch.Tensor, grid_size: Tuple[int, int, int]):
+         return rearrange(
+             x,
+             "b (f h w) (x y z c) -> b c (f x) (h y) (w z)",
+             f=grid_size[0],
+             h=grid_size[1],
+             w=grid_size[2],
+             x=self.patch_size[0],
+             y=self.patch_size[1],
+             z=self.patch_size[2],
+         )
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         context: torch.Tensor,
+         timestep: torch.Tensor,
+         clip_feature: Optional[torch.Tensor] = None,  # clip_vision_encoder(img)
+         y: Optional[torch.Tensor] = None,  # vae_encoder(img)
+     ):
+         t = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, timestep))
+         t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
+         context = self.text_embedding(context)
+         if self.has_image_input:
+             x = torch.cat([x, y], dim=1)  # (b, c_x + c_y, f, h, w)
+             clip_embedding = self.img_emb(clip_feature)
+             context = torch.cat([clip_embedding, context], dim=1)  # (b, s1 + s2, d)
+         x, (f, h, w) = self.patchify(x)
+         freqs = (
+             torch.cat(
+                 [
+                     self.freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
+                     self.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
+                     self.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
+                 ],
+                 dim=-1,
+             )
+             .reshape(f * h * w, 1, -1)
+             .to(x.device)
+         )
+         for block in self.blocks:
+             x = block(x, context, t_mod, freqs)
+         x = self.head(x, t)
+         x = self.unpatchify(x, (f, h, w))
+         return x
+
+     @classmethod
+     def from_state_dict(
+         cls,
+         state_dict: Dict[str, torch.Tensor],
+         device: str,
+         dtype: torch.dtype,
+         model_type: str = "1.3b-t2v",
+     ):
+         if model_type == "1.3b-t2v":
+             config = json.load(open(WAN_DIT_1_3B_T2V_CONFIG_FILE, "r"))
+         elif model_type == "14b-t2v":
+             config = json.load(open(WAN_DIT_14B_T2V_CONFIG_FILE, "r"))
+         elif model_type == "14b-i2v":
+             config = json.load(open(WAN_DIT_14B_I2V_CONFIG_FILE, "r"))
+         else:
+             raise ValueError(f"Unsupported model type: {model_type}")
+         with no_init_weights():
+             model = torch.nn.utils.skip_init(cls, **config, device=device, dtype=dtype)
+             model.load_state_dict(state_dict, assign=True)
+             model.to(device=device, dtype=dtype, non_blocking=True)
+         return model
+
+     def get_tp_plan(self):
+         from torch.distributed.tensor.parallel import (
+             ColwiseParallel,
+             RowwiseParallel,
+             SequenceParallel,
+             PrepareModuleOutput,
+         )
+         from torch.distributed.tensor import Replicate, Shard
+
+         tp_plan = {
+             "text_embedding.0": ColwiseParallel(),
+             "text_embedding.2": RowwiseParallel(),
+             "time_embedding.0": ColwiseParallel(),
+             "time_embedding.2": RowwiseParallel(),
+             "time_projection.1": ColwiseParallel(output_layouts=Replicate()),
+         }
+         for idx in range(len(self.blocks)):
+             tp_plan.update(
+                 {
+                     f"blocks.{idx}.norm1": SequenceParallel(use_local_output=True),
+                     f"blocks.{idx}.norm2": SequenceParallel(use_local_output=True),
+                     f"blocks.{idx}.norm3": SequenceParallel(use_local_output=True),
+                     f"blocks.{idx}.ffn.0": ColwiseParallel(),
+                     f"blocks.{idx}.ffn.2": RowwiseParallel(),
+                     f"blocks.{idx}.self_attn.q": ColwiseParallel(output_layouts=Replicate()),
+                     f"blocks.{idx}.self_attn.k": ColwiseParallel(output_layouts=Replicate()),
+                     f"blocks.{idx}.self_attn.v": ColwiseParallel(),
+                     f"blocks.{idx}.self_attn.o": RowwiseParallel(),
+                     f"blocks.{idx}.self_attn.norm_q": PrepareModuleOutput(
+                         output_layouts=Replicate(),
+                         desired_output_layouts=Shard(-1),
+                     ),
+                     f"blocks.{idx}.self_attn.norm_k": PrepareModuleOutput(
+                         output_layouts=Replicate(),
+                         desired_output_layouts=Shard(-1),
+                     ),
+                     f"blocks.{idx}.cross_attn.q": ColwiseParallel(output_layouts=Replicate()),
+                     f"blocks.{idx}.cross_attn.k": ColwiseParallel(output_layouts=Replicate()),
+                     f"blocks.{idx}.cross_attn.v": ColwiseParallel(),
+                     f"blocks.{idx}.cross_attn.o": RowwiseParallel(),
+                     f"blocks.{idx}.cross_attn.norm_q": PrepareModuleOutput(
+                         output_layouts=Replicate(),
+                         desired_output_layouts=Shard(-1),
+                     ),
+                     f"blocks.{idx}.cross_attn.norm_k": PrepareModuleOutput(
+                         output_layouts=Replicate(),
+                         desired_output_layouts=Shard(-1),
+                     ),
+                     f"blocks.{idx}.cross_attn.k_img": ColwiseParallel(output_layouts=Replicate()),
+                     f"blocks.{idx}.cross_attn.v_img": ColwiseParallel(),
+                     f"blocks.{idx}.cross_attn.norm_k_img": PrepareModuleOutput(
+                         output_layouts=Replicate(),
+                         desired_output_layouts=Shard(-1),
+                     ),
+                 }
+             )
+         return tp_plan
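For reference, `precompute_freqs_cis_3d` splits each attention head's channels into frame/height/width parts of size `dim - 2 * (dim // 3)`, `dim // 3`, and `dim // 3`, and `WanDiT.forward` broadcasts one table row per (f, h, w) grid position before `rope_apply` rotates q/k. A small shape sketch under assumed sizes (all numbers arbitrary):

import torch

from diffsynth_engine.models.wan.wan_dit import precompute_freqs_cis_3d, rope_apply

head_dim, num_heads = 128, 2
f, h, w = 4, 6, 6  # latent grid after patchify

f_cis, h_cis, w_cis = precompute_freqs_cis_3d(head_dim)
# complex pairs per position: 22 + 21 + 21 = 64 = head_dim // 2
assert f_cis.shape[1] + h_cis.shape[1] + w_cis.shape[1] == head_dim // 2

# Same broadcast-and-concat as WanDiT.forward: one row per (f, h, w) position.
freqs = torch.cat(
    [
        f_cis[:f].view(f, 1, 1, -1).expand(f, h, w, -1),
        h_cis[:h].view(1, h, 1, -1).expand(f, h, w, -1),
        w_cis[:w].view(1, 1, w, -1).expand(f, h, w, -1),
    ],
    dim=-1,
).reshape(f * h * w, 1, -1)

x = torch.randn(1, f * h * w, num_heads * head_dim)
assert rope_apply(x, freqs, num_heads).shape == x.shape  # rotation preserves shape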