diffsynth-engine 0.6.1.dev14__py3-none-any.whl → 0.6.1.dev25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffsynth_engine/__init__.py +6 -2
- diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json +41 -0
- diffsynth_engine/configs/__init__.py +10 -6
- diffsynth_engine/configs/pipeline.py +17 -10
- diffsynth_engine/models/base.py +1 -1
- diffsynth_engine/models/basic/attention.py +59 -20
- diffsynth_engine/models/basic/transformer_helper.py +36 -2
- diffsynth_engine/models/basic/video_sparse_attention.py +238 -0
- diffsynth_engine/models/flux/flux_controlnet.py +7 -19
- diffsynth_engine/models/flux/flux_dit.py +27 -38
- diffsynth_engine/models/flux/flux_dit_fbcache.py +9 -7
- diffsynth_engine/models/flux/flux_ipadapter.py +5 -5
- diffsynth_engine/models/qwen_image/qwen2_5_vl.py +5 -0
- diffsynth_engine/models/qwen_image/qwen_image_dit.py +28 -34
- diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py +14 -6
- diffsynth_engine/models/wan/wan_audio_encoder.py +0 -1
- diffsynth_engine/models/wan/wan_dit.py +64 -27
- diffsynth_engine/pipelines/base.py +36 -4
- diffsynth_engine/pipelines/flux_image.py +19 -17
- diffsynth_engine/pipelines/qwen_image.py +45 -36
- diffsynth_engine/pipelines/sdxl_image.py +1 -1
- diffsynth_engine/pipelines/utils.py +52 -0
- diffsynth_engine/pipelines/wan_s2v.py +4 -9
- diffsynth_engine/pipelines/wan_video.py +43 -19
- diffsynth_engine/tokenizers/base.py +6 -0
- diffsynth_engine/tokenizers/qwen2.py +12 -4
- diffsynth_engine/utils/constants.py +13 -12
- diffsynth_engine/utils/flag.py +6 -0
- diffsynth_engine/utils/parallel.py +62 -29
- {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/METADATA +1 -1
- {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/RECORD +45 -43
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-flf2v-14b.json → wan2.1_flf2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-i2v-14b.json → wan2.1_i2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-1.3b.json → wan2.1_t2v_1.3b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-14b.json → wan2.1_t2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-i2v-a14b.json → wan2.2_i2v_a14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-s2v-14b.json → wan2.2_s2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-t2v-a14b.json → wan2.2_t2v_a14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-ti2v-5b.json → wan2.2_ti2v_5b.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan2.1-vae.json → wan2.1_vae.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan2.2-vae.json → wan2.2_vae.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan-vae-keymap.json → wan_vae_keymap.json} +0 -0
- {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/WHEEL +0 -0
- {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/licenses/LICENSE +0 -0
- {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/top_level.txt +0 -0
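
The recurring change across the Flux and Qwen-Image attention modules in the hunks below is that attn_kwargs is no longer fixed at construction time; each forward call now accepts an optional attn_kwargs dict and threads it down to attention_ops.attention. A minimal sketch of the resulting call pattern at the transformer-block level; the tensor names are placeholders and the empty dict stands in for backend-specific options, which this diff does not enumerate:

    # Sketch only: whatever keys attn_kwargs carries are forwarded verbatim to
    # attention_ops.attention, so valid contents depend on the selected backend.
    attn_kwargs = {}
    image, text = double_block(image, text, t_emb, rope_emb, image_emb=None, attn_kwargs=attn_kwargs)
    x = single_block(x, t_emb, rope_emb, image_emb=None, attn_kwargs=attn_kwargs)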

diffsynth_engine/models/flux/flux_dit.py

@@ -176,7 +176,6 @@ class FluxDoubleAttention(nn.Module):
         dim_b,
         num_heads,
         head_dim,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -194,19 +193,20 @@ class FluxDoubleAttention(nn.Module):

         self.a_to_out = nn.Linear(dim_a, dim_a, device=device, dtype=dtype)
         self.b_to_out = nn.Linear(dim_b, dim_b, device=device, dtype=dtype)
-        self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}

     def attention_callback(self, attn_out_a, attn_out_b, x_a, x_b, q_a, q_b, k_a, k_b, v_a, v_b, rope_emb, image_emb):
         return attn_out_a, attn_out_b

-    def forward(self, image, text, rope_emb, image_emb):
+    def forward(self, image, text, rope_emb, image_emb, attn_kwargs=None):
         q_a, k_a, v_a = rearrange(self.a_to_qkv(image), "b s (h d) -> b s h d", h=(3 * self.num_heads)).chunk(3, dim=2)
         q_b, k_b, v_b = rearrange(self.b_to_qkv(text), "b s (h d) -> b s h d", h=(3 * self.num_heads)).chunk(3, dim=2)
         q = torch.cat([self.norm_q_b(q_b), self.norm_q_a(q_a)], dim=1)
         k = torch.cat([self.norm_k_b(k_b), self.norm_k_a(k_a)], dim=1)
         v = torch.cat([v_b, v_a], dim=1)
         q, k = apply_rope(q, k, rope_emb)
-        attn_out = attention_ops.attention(q, k, v, **self.attn_kwargs)
+
+        attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
+        attn_out = attention_ops.attention(q, k, v, **attn_kwargs)
         attn_out = rearrange(attn_out, "b s h d -> b s (h d)").to(q.dtype)
         text_out, image_out = attn_out[:, : text.shape[1]], attn_out[:, text.shape[1] :]
         image_out, text_out = self.attention_callback(
@@ -231,14 +231,11 @@ class FluxDoubleTransformerBlock(nn.Module):
         self,
         dim,
         num_heads,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
         super().__init__()
-        self.attn = FluxDoubleAttention(
-            dim, dim, num_heads, dim // num_heads, attn_kwargs=attn_kwargs, device=device, dtype=dtype
-        )
+        self.attn = FluxDoubleAttention(dim, dim, num_heads, dim // num_heads, device=device, dtype=dtype)
         # Image
         self.norm_msa_a = AdaLayerNormZero(dim, device=device, dtype=dtype)
         self.norm_mlp_a = AdaLayerNormZero(dim, device=device, dtype=dtype)
@@ -256,11 +253,11 @@ class FluxDoubleTransformerBlock(nn.Module):
             nn.Linear(dim * 4, dim, device=device, dtype=dtype),
         )

-    def forward(self, image, text, t_emb, rope_emb, image_emb=None):
+    def forward(self, image, text, t_emb, rope_emb, image_emb=None, attn_kwargs=None):
         # AdaLayerNorm-Zero for Image and Text MSA
         image_in, gate_a = self.norm_msa_a(image, t_emb)
         text_in, gate_b = self.norm_msa_b(text, t_emb)
-        image_out, text_out = self.attn(image_in, text_in, rope_emb, image_emb)
+        image_out, text_out = self.attn(image_in, text_in, rope_emb, image_emb, attn_kwargs)
         image = image + gate_a * image_out
         text = text + gate_b * text_out

@@ -279,7 +276,6 @@ class FluxSingleAttention(nn.Module):
         self,
         dim,
         num_heads,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -288,15 +284,16 @@ class FluxSingleAttention(nn.Module):
         self.to_qkv = nn.Linear(dim, dim * 3, device=device, dtype=dtype)
         self.norm_q_a = RMSNorm(dim // num_heads, eps=1e-6, device=device, dtype=dtype)
         self.norm_k_a = RMSNorm(dim // num_heads, eps=1e-6, device=device, dtype=dtype)
-        self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}

     def attention_callback(self, attn_out, x, q, k, v, rope_emb, image_emb):
         return attn_out

-    def forward(self, x, rope_emb, image_emb):
+    def forward(self, x, rope_emb, image_emb, attn_kwargs=None):
         q, k, v = rearrange(self.to_qkv(x), "b s (h d) -> b s h d", h=(3 * self.num_heads)).chunk(3, dim=2)
         q, k = apply_rope(self.norm_q_a(q), self.norm_k_a(k), rope_emb)
-        attn_out = attention_ops.attention(q, k, v, **self.attn_kwargs)
+
+        attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
+        attn_out = attention_ops.attention(q, k, v, **attn_kwargs)
         attn_out = rearrange(attn_out, "b s h d -> b s (h d)").to(q.dtype)
         return self.attention_callback(attn_out=attn_out, x=x, q=q, k=k, v=v, rope_emb=rope_emb, image_emb=image_emb)

@@ -306,23 +303,22 @@ class FluxSingleTransformerBlock(nn.Module):
         self,
         dim,
         num_heads,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
         super().__init__()
         self.dim = dim
         self.norm = AdaLayerNormZero(dim, device=device, dtype=dtype)
-        self.attn = FluxSingleAttention(dim, num_heads,
+        self.attn = FluxSingleAttention(dim, num_heads, device=device, dtype=dtype)
         self.mlp = nn.Sequential(
             nn.Linear(dim, dim * 4, device=device, dtype=dtype),
             nn.GELU(approximate="tanh"),
         )
         self.proj_out = nn.Linear(dim * 5, dim, device=device, dtype=dtype)

-    def forward(self, x, t_emb, rope_emb, image_emb=None):
+    def forward(self, x, t_emb, rope_emb, image_emb=None, attn_kwargs=None):
         h, gate = self.norm(x, emb=t_emb)
-        attn_output = self.attn(h, rope_emb, image_emb)
+        attn_output = self.attn(h, rope_emb, image_emb, attn_kwargs)
         mlp_output = self.mlp(h)
         return x + gate * self.proj_out(torch.cat([attn_output, mlp_output], dim=2))

@@ -334,7 +330,6 @@ class FluxDiT(PreTrainedModel):
     def __init__(
         self,
         in_channel: int = 64,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -352,16 +347,10 @@ class FluxDiT(PreTrainedModel):
         self.x_embedder = nn.Linear(in_channel, 3072, device=device, dtype=dtype)

         self.blocks = nn.ModuleList(
-            [
-                FluxDoubleTransformerBlock(3072, 24, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
-                for _ in range(19)
-            ]
+            [FluxDoubleTransformerBlock(3072, 24, device=device, dtype=dtype) for _ in range(19)]
         )
         self.single_blocks = nn.ModuleList(
-            [
-                FluxSingleTransformerBlock(3072, 24, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
-                for _ in range(38)
-            ]
+            [FluxSingleTransformerBlock(3072, 24, device=device, dtype=dtype) for _ in range(38)]
         )
         self.final_norm_out = AdaLayerNorm(3072, device=device, dtype=dtype)
         self.final_proj_out = nn.Linear(3072, 64, device=device, dtype=dtype)
@@ -403,6 +392,7 @@ class FluxDiT(PreTrainedModel):
         text_ids: torch.Tensor,
         guidance: torch.Tensor,
         image_emb: torch.Tensor | None = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
         controlnet_double_block_output: List[torch.Tensor] | None = None,
         controlnet_single_block_output: List[torch.Tensor] | None = None,
         **kwargs,
@@ -470,14 +460,16 @@ class FluxDiT(PreTrainedModel):
         rope_emb = torch.cat((text_rope_emb, image_rope_emb), dim=2)

         for i, block in enumerate(self.blocks):
-            hidden_states, prompt_emb = block(
+            hidden_states, prompt_emb = block(
+                hidden_states, prompt_emb, conditioning, rope_emb, image_emb, attn_kwargs
+            )
             if len(controlnet_double_block_output) > 0:
                 interval_control = len(self.blocks) / len(controlnet_double_block_output)
                 interval_control = int(np.ceil(interval_control))
                 hidden_states = hidden_states + controlnet_double_block_output[i // interval_control]
         hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
         for i, block in enumerate(self.single_blocks):
-            hidden_states = block(hidden_states, conditioning, rope_emb, image_emb)
+            hidden_states = block(hidden_states, conditioning, rope_emb, image_emb, attn_kwargs)
             if len(controlnet_single_block_output) > 0:
                 interval_control = len(self.single_blocks) / len(controlnet_double_block_output)
                 interval_control = int(np.ceil(interval_control))
@@ -498,14 +490,8 @@ class FluxDiT(PreTrainedModel):
         device: str,
         dtype: torch.dtype,
         in_channel: int = 64,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
-        model = cls(
-            device="meta",
-            dtype=dtype,
-            in_channel=in_channel,
-            attn_kwargs=attn_kwargs,
-        )
+        model = cls(device="meta", dtype=dtype, in_channel=in_channel)
         model = model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
@@ -515,5 +501,8 @@ class FluxDiT(PreTrainedModel):
         for block in self.blocks:
             block.compile(*args, **kwargs)

-
-
+        for block in self.single_blocks:
+            block.compile(*args, **kwargs)
+
+    def get_fsdp_module_cls(self):
+        return {FluxDoubleTransformerBlock, FluxSingleTransformerBlock}
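
FluxDiT now exposes get_fsdp_module_cls, returning the block classes to treat as sharding units (QwenImageDiT and Qwen2_5_VLForConditionalGeneration gain the same hook further down). A sketch of how such a hook could feed PyTorch FSDP's transformer auto-wrap policy; this consumer-side wiring is an assumption for illustration, not diffsynth_engine's own parallel setup:

    # Assumed usage sketch; requires an initialized torch.distributed process group.
    import functools
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy

    policy = functools.partial(
        transformer_auto_wrap_policy,
        transformer_layer_cls=dit.get_fsdp_module_cls(),  # {FluxDoubleTransformerBlock, FluxSingleTransformerBlock}
    )
    dit = FSDP(dit, auto_wrap_policy=policy)  # each listed block class becomes its own FSDP unit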

diffsynth_engine/models/flux/flux_dit_fbcache.py

@@ -20,12 +20,11 @@ class FluxDiTFBCache(FluxDiT):
     def __init__(
         self,
         in_channel: int = 64,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
         relative_l1_threshold: float = 0.05,
     ):
-        super().__init__(in_channel=in_channel,
+        super().__init__(in_channel=in_channel, device=device, dtype=dtype)
         self.relative_l1_threshold = relative_l1_threshold
         self.step_count = 0
         self.num_inference_steps = 0
@@ -56,6 +55,7 @@ class FluxDiTFBCache(FluxDiT):
         text_ids: torch.Tensor,
         guidance: torch.Tensor,
         image_emb: torch.Tensor | None = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
         controlnet_double_block_output: List[torch.Tensor] | None = None,
         controlnet_single_block_output: List[torch.Tensor] | None = None,
         **kwargs,
@@ -124,7 +124,9 @@ class FluxDiTFBCache(FluxDiT):

         # first block
         original_hidden_states = hidden_states
-        hidden_states, prompt_emb = self.blocks[0](
+        hidden_states, prompt_emb = self.blocks[0](
+            hidden_states, prompt_emb, conditioning, rope_emb, image_emb, attn_kwargs
+        )
         first_hidden_states_residual = hidden_states - original_hidden_states

         (first_hidden_states_residual,) = sequence_parallel_unshard(
@@ -149,14 +151,16 @@ class FluxDiTFBCache(FluxDiT):

         first_hidden_states = hidden_states.clone()
         for i, block in enumerate(self.blocks[1:]):
-            hidden_states, prompt_emb = block(
+            hidden_states, prompt_emb = block(
+                hidden_states, prompt_emb, conditioning, rope_emb, image_emb, attn_kwargs
+            )
             if len(controlnet_double_block_output) > 0:
                 interval_control = len(self.blocks) / len(controlnet_double_block_output)
                 interval_control = int(np.ceil(interval_control))
                 hidden_states = hidden_states + controlnet_double_block_output[i // interval_control]
         hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
         for i, block in enumerate(self.single_blocks):
-            hidden_states = block(hidden_states, conditioning, rope_emb, image_emb)
+            hidden_states = block(hidden_states, conditioning, rope_emb, image_emb, attn_kwargs)
             if len(controlnet_single_block_output) > 0:
                 interval_control = len(self.single_blocks) / len(controlnet_double_block_output)
                 interval_control = int(np.ceil(interval_control))
@@ -182,14 +186,12 @@ class FluxDiTFBCache(FluxDiT):
         device: str,
         dtype: torch.dtype,
         in_channel: int = 64,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         relative_l1_threshold: float = 0.05,
     ):
         model = cls(
             device="meta",
             dtype=dtype,
             in_channel=in_channel,
-            attn_kwargs=attn_kwargs,
             relative_l1_threshold=relative_l1_threshold,
         )
         model = model.requires_grad_(False)

diffsynth_engine/models/flux/flux_ipadapter.py

@@ -2,7 +2,7 @@ import torch
 from einops import rearrange
 from torch import nn
 from PIL import Image
-from typing import
+from typing import Dict, List
 from functools import partial
 from diffsynth_engine.models.text_encoder.siglip import SiglipImageEncoder
 from diffsynth_engine.models.basic.transformer_helper import RMSNorm
@@ -18,7 +18,6 @@ class FluxIPAdapterAttention(nn.Module):
         dim: int = 3072,
         head_num: int = 24,
         scale: float = 1.0,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -28,12 +27,13 @@ class FluxIPAdapterAttention(nn.Module):
         self.to_v_ip = nn.Linear(image_emb_dim, dim, device=device, dtype=dtype, bias=False)
         self.head_num = head_num
         self.scale = scale
-        self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}

-    def forward(self, query: torch.Tensor, image_emb: torch.Tensor):
+    def forward(self, query: torch.Tensor, image_emb: torch.Tensor, attn_kwargs=None):
         key = rearrange(self.norm_k(self.to_k_ip(image_emb)), "b s (h d) -> b s h d", h=self.head_num)
         value = rearrange(self.to_v_ip(image_emb), "b s (h d) -> b s h d", h=self.head_num)
-        attn_out = attention(query, key, value, **self.attn_kwargs)
+
+        attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
+        attn_out = attention(query, key, value, **attn_kwargs)
         return self.scale * rearrange(attn_out, "b s h d -> b s (h d)")

     @classmethod

diffsynth_engine/models/qwen_image/qwen2_5_vl.py

@@ -942,6 +942,8 @@ class Qwen2_5_VLModel(nn.Module):


 class Qwen2_5_VLForConditionalGeneration(PreTrainedModel):
+    _supports_parallelization = True
+
     def __init__(
         self,
         vision_config: Qwen2_5_VLVisionConfig,
@@ -1173,6 +1175,9 @@ class Qwen2_5_VLForConditionalGeneration(PreTrainedModel):

         return position_ids, mrope_position_deltas

+    def get_fsdp_module_cls(self):
+        return {Qwen2_5_VisionBlock, Qwen2_5_VLDecoderLayer}
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

diffsynth_engine/models/qwen_image/qwen_image_dit.py

@@ -6,7 +6,7 @@ from einops import rearrange
 from diffsynth_engine.models.base import StateDictConverter, PreTrainedModel
 from diffsynth_engine.models.basic import attention as attention_ops
 from diffsynth_engine.models.basic.timestep import TimestepEmbeddings
-from diffsynth_engine.models.basic.transformer_helper import AdaLayerNorm,
+from diffsynth_engine.models.basic.transformer_helper import AdaLayerNorm, GELU, RMSNorm
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.fp8_linear import fp8_inference
 from diffsynth_engine.utils.parallel import (
@@ -144,7 +144,7 @@ class QwenFeedForward(nn.Module):
         super().__init__()
         inner_dim = int(dim * 4)
         self.net = nn.ModuleList([])
-        self.net.append(
+        self.net.append(GELU(dim, inner_dim, approximate="tanh", device=device, dtype=dtype))
         self.net.append(nn.Dropout(dropout))
         self.net.append(nn.Linear(inner_dim, dim_out, device=device, dtype=dtype))

@@ -155,8 +155,8 @@ class QwenFeedForward(nn.Module):


 def apply_rotary_emb_qwen(x: torch.Tensor, freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]]):
-    x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
-    x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
+    x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))  # (b, s, h, d) -> (b, s, h, d/2, 2)
+    x_out = torch.view_as_real(x_rotated * freqs_cis.unsqueeze(1)).flatten(3)  # (b, s, h, d/2, 2) -> (b, s, h, d)
     return x_out.type_as(x)

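
The unsqueeze(1) added to apply_rotary_emb_qwen above broadcasts the complex rotary factors over the head axis now that q/k/v are laid out as (b, s, h, d) (see the QwenDoubleStreamAttention hunks below). A standalone shape check of that broadcast, assuming freqs_cis is a complex tensor of shape (seq_len, head_dim // 2); the concrete sizes are arbitrary:

    import torch

    b, s, h, d = 2, 8, 4, 16
    x = torch.randn(b, s, h, d)
    freqs_cis = torch.polar(torch.ones(s, d // 2), torch.rand(s, d // 2))  # complex, (s, d/2)

    x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))  # (b, s, h, d/2) complex
    x_out = torch.view_as_real(x_rotated * freqs_cis.unsqueeze(1)).flatten(3)  # back to (b, s, h, d)
    assert x_out.shape == (b, s, h, d)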

@@ -167,7 +167,6 @@ class QwenDoubleStreamAttention(nn.Module):
         dim_b,
         num_heads,
         head_dim,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -189,7 +188,6 @@ class QwenDoubleStreamAttention(nn.Module):

         self.to_out = nn.Linear(dim_a, dim_a, device=device, dtype=dtype)
         self.to_add_out = nn.Linear(dim_b, dim_b, device=device, dtype=dtype)
-        self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}

     def forward(
         self,
@@ -197,17 +195,18 @@ class QwenDoubleStreamAttention(nn.Module):
         text: torch.FloatTensor,
         rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         attn_mask: Optional[torch.Tensor] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
         img_q, img_k, img_v = self.to_q(image), self.to_k(image), self.to_v(image)
         txt_q, txt_k, txt_v = self.add_q_proj(text), self.add_k_proj(text), self.add_v_proj(text)

-        img_q = rearrange(img_q, "b s (h d) -> b h
-        img_k = rearrange(img_k, "b s (h d) -> b h
-        img_v = rearrange(img_v, "b s (h d) -> b h
+        img_q = rearrange(img_q, "b s (h d) -> b s h d", h=self.num_heads)
+        img_k = rearrange(img_k, "b s (h d) -> b s h d", h=self.num_heads)
+        img_v = rearrange(img_v, "b s (h d) -> b s h d", h=self.num_heads)

-        txt_q = rearrange(txt_q, "b s (h d) -> b h
-        txt_k = rearrange(txt_k, "b s (h d) -> b h
-        txt_v = rearrange(txt_v, "b s (h d) -> b h
+        txt_q = rearrange(txt_q, "b s (h d) -> b s h d", h=self.num_heads)
+        txt_k = rearrange(txt_k, "b s (h d) -> b s h d", h=self.num_heads)
+        txt_v = rearrange(txt_v, "b s (h d) -> b s h d", h=self.num_heads)

         img_q, img_k = self.norm_q(img_q), self.norm_k(img_k)
         txt_q, txt_k = self.norm_added_q(txt_q), self.norm_added_k(txt_k)
@@ -219,15 +218,12 @@ class QwenDoubleStreamAttention(nn.Module):
         txt_q = apply_rotary_emb_qwen(txt_q, txt_freqs)
         txt_k = apply_rotary_emb_qwen(txt_k, txt_freqs)

-        joint_q = torch.cat([txt_q, img_q], dim=
-        joint_k = torch.cat([txt_k, img_k], dim=
-        joint_v = torch.cat([txt_v, img_v], dim=
+        joint_q = torch.cat([txt_q, img_q], dim=1)
+        joint_k = torch.cat([txt_k, img_k], dim=1)
+        joint_v = torch.cat([txt_v, img_v], dim=1)

-        joint_q = joint_q.transpose(1, 2)
-        joint_k = joint_k.transpose(1, 2)
-        joint_v = joint_v.transpose(1, 2)
-
-        joint_attn_out = attention_ops.attention(joint_q, joint_k, joint_v, attn_mask=attn_mask, **self.attn_kwargs)
+        attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
+        joint_attn_out = attention_ops.attention(joint_q, joint_k, joint_v, attn_mask=attn_mask, **attn_kwargs)

         joint_attn_out = rearrange(joint_attn_out, "b s h d -> b s (h d)").to(joint_q.dtype)

@@ -247,7 +243,6 @@ class QwenImageTransformerBlock(nn.Module):
         num_attention_heads: int,
         attention_head_dim: int,
         eps: float = 1e-6,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -267,7 +262,6 @@ class QwenImageTransformerBlock(nn.Module):
             dim_b=dim,
             num_heads=num_attention_heads,
             head_dim=attention_head_dim,
-            attn_kwargs=attn_kwargs,
             device=device,
             dtype=dtype,
         )
@@ -293,6 +287,7 @@ class QwenImageTransformerBlock(nn.Module):
         temb: torch.Tensor,
         rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         attn_mask: Optional[torch.Tensor] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         img_mod_attn, img_mod_mlp = self.img_mod(temb).chunk(2, dim=-1)  # [B, 3*dim] each
         txt_mod_attn, txt_mod_mlp = self.txt_mod(temb).chunk(2, dim=-1)  # [B, 3*dim] each
@@ -308,6 +303,7 @@ class QwenImageTransformerBlock(nn.Module):
             text=txt_modulated,
             rotary_emb=rotary_emb,
             attn_mask=attn_mask,
+            attn_kwargs=attn_kwargs,
         )

         image = image + img_gate * img_attn_out
@@ -335,7 +331,6 @@ class QwenImageDiT(PreTrainedModel):
     def __init__(
         self,
         num_layers: int = 60,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -356,7 +351,6 @@ class QwenImageDiT(PreTrainedModel):
                 dim=3072,
                 num_attention_heads=24,
                 attention_head_dim=128,
-                attn_kwargs=attn_kwargs,
                 device=device,
                 dtype=dtype,
             )
@@ -444,6 +438,7 @@ class QwenImageDiT(PreTrainedModel):
         entity_text: Optional[List[torch.Tensor]] = None,
         entity_seq_lens: Optional[List[torch.LongTensor]] = None,
         entity_masks: Optional[List[torch.Tensor]] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
         h, w = image.shape[-2:]
         fp8_linear_enabled = getattr(self, "fp8_linear_enabled", False)
@@ -509,7 +504,12 @@ class QwenImageDiT(PreTrainedModel):
         rotary_emb = (img_freqs, txt_freqs)
         for block in self.transformer_blocks:
             text, image = block(
-                image=image,
+                image=image,
+                text=text,
+                temb=conditioning,
+                rotary_emb=rotary_emb,
+                attn_mask=attn_mask,
+                attn_kwargs=attn_kwargs,
             )
         image = self.norm_out(image, conditioning)
         image = self.proj_out(image)
@@ -527,14 +527,8 @@ class QwenImageDiT(PreTrainedModel):
         device: str,
         dtype: torch.dtype,
         num_layers: int = 60,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
-        model = cls(
-            device="meta",
-            dtype=dtype,
-            num_layers=num_layers,
-            attn_kwargs=attn_kwargs,
-        )
+        model = cls(device="meta", dtype=dtype, num_layers=num_layers)
         model = model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
@@ -544,5 +538,5 @@ class QwenImageDiT(PreTrainedModel):
         for block in self.transformer_blocks:
             block.compile(*args, **kwargs)

-    def
-        return
+    def get_fsdp_module_cls(self):
+        return {QwenImageTransformerBlock}

diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py

@@ -11,12 +11,11 @@ class QwenImageDiTFBCache(QwenImageDiT):
     def __init__(
         self,
         num_layers: int = 60,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
         relative_l1_threshold: float = 0.05,
     ):
-        super().__init__(num_layers=num_layers,
+        super().__init__(num_layers=num_layers, device=device, dtype=dtype)
         self.relative_l1_threshold = relative_l1_threshold
         self.step_count = 0
         self.num_inference_steps = 0
@@ -43,6 +42,7 @@ class QwenImageDiTFBCache(QwenImageDiT):
         text: torch.Tensor = None,
         timestep: torch.LongTensor = None,
         txt_seq_lens: torch.LongTensor = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
         h, w = image.shape[-2:]
         fp8_linear_enabled = getattr(self, "fp8_linear_enabled", False)
@@ -72,7 +72,11 @@ class QwenImageDiTFBCache(QwenImageDiT):
         # first block
         original_hidden_states = image
         text, image = self.transformer_blocks[0](
-            image=image,
+            image=image,
+            text=text,
+            temb=conditioning,
+            image_rotary_emb=image_rotary_emb,
+            attn_kwargs=attn_kwargs,
         )
         first_hidden_states_residual = image - original_hidden_states

@@ -94,7 +98,13 @@ class QwenImageDiTFBCache(QwenImageDiT):
         first_hidden_states = image.clone()

         for block in self.transformer_blocks[1:]:
-            text, image = block(
+            text, image = block(
+                image=image,
+                text=text,
+                temb=conditioning,
+                image_rotary_emb=image_rotary_emb,
+                attn_kwargs=attn_kwargs,
+            )

         previous_residual = image - first_hidden_states
         self.previous_residual = previous_residual
@@ -114,14 +124,12 @@ class QwenImageDiTFBCache(QwenImageDiT):
         device: str,
         dtype: torch.dtype,
         num_layers: int = 60,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         relative_l1_threshold: float = 0.05,
     ):
         model = cls(
             device="meta",
             dtype=dtype,
             num_layers=num_layers,
-            attn_kwargs=attn_kwargs,
             relative_l1_threshold=relative_l1_threshold,
         )
         model = model.requires_grad_(False)

diffsynth_engine/models/wan/wan_audio_encoder.py

@@ -223,7 +223,6 @@ class Wav2Vec2StateDictConverter:

 class Wav2Vec2Model(PreTrainedModel):
     converter = Wav2Vec2StateDictConverter()
-    _supports_parallelization = False

     def __init__(self, config: Wav2Vec2Config, device: str = "cuda:0", dtype: torch.dtype = torch.bfloat16):
         super().__init__()