diffsynth-engine 0.5.1.dev4__py3-none-any.whl → 0.6.1.dev25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. diffsynth_engine/__init__.py +12 -0
  2. diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +19 -0
  3. diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +22 -6
  4. diffsynth_engine/conf/models/flux/flux_dit.json +20 -1
  5. diffsynth_engine/conf/models/flux/flux_vae.json +253 -5
  6. diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json +41 -0
  7. diffsynth_engine/configs/__init__.py +16 -1
  8. diffsynth_engine/configs/controlnet.py +13 -0
  9. diffsynth_engine/configs/pipeline.py +37 -11
  10. diffsynth_engine/models/base.py +1 -1
  11. diffsynth_engine/models/basic/attention.py +105 -43
  12. diffsynth_engine/models/basic/transformer_helper.py +36 -2
  13. diffsynth_engine/models/basic/video_sparse_attention.py +238 -0
  14. diffsynth_engine/models/flux/flux_controlnet.py +16 -30
  15. diffsynth_engine/models/flux/flux_dit.py +49 -62
  16. diffsynth_engine/models/flux/flux_dit_fbcache.py +26 -28
  17. diffsynth_engine/models/flux/flux_ipadapter.py +5 -5
  18. diffsynth_engine/models/flux/flux_text_encoder.py +1 -1
  19. diffsynth_engine/models/flux/flux_vae.py +20 -2
  20. diffsynth_engine/models/hunyuan3d/dino_image_encoder.py +4 -2
  21. diffsynth_engine/models/qwen_image/qwen2_5_vl.py +5 -0
  22. diffsynth_engine/models/qwen_image/qwen_image_dit.py +151 -58
  23. diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py +14 -6
  24. diffsynth_engine/models/qwen_image/qwen_image_vae.py +1 -1
  25. diffsynth_engine/models/sd/sd_text_encoder.py +1 -1
  26. diffsynth_engine/models/sd/sd_unet.py +1 -1
  27. diffsynth_engine/models/sd3/sd3_dit.py +1 -1
  28. diffsynth_engine/models/sd3/sd3_text_encoder.py +1 -1
  29. diffsynth_engine/models/sdxl/sdxl_text_encoder.py +1 -1
  30. diffsynth_engine/models/sdxl/sdxl_unet.py +1 -1
  31. diffsynth_engine/models/vae/vae.py +1 -1
  32. diffsynth_engine/models/wan/wan_audio_encoder.py +6 -3
  33. diffsynth_engine/models/wan/wan_dit.py +65 -28
  34. diffsynth_engine/models/wan/wan_s2v_dit.py +1 -1
  35. diffsynth_engine/models/wan/wan_text_encoder.py +13 -13
  36. diffsynth_engine/models/wan/wan_vae.py +2 -2
  37. diffsynth_engine/pipelines/base.py +73 -7
  38. diffsynth_engine/pipelines/flux_image.py +139 -120
  39. diffsynth_engine/pipelines/hunyuan3d_shape.py +4 -0
  40. diffsynth_engine/pipelines/qwen_image.py +272 -87
  41. diffsynth_engine/pipelines/sdxl_image.py +1 -1
  42. diffsynth_engine/pipelines/utils.py +52 -0
  43. diffsynth_engine/pipelines/wan_s2v.py +25 -14
  44. diffsynth_engine/pipelines/wan_video.py +43 -19
  45. diffsynth_engine/tokenizers/base.py +6 -0
  46. diffsynth_engine/tokenizers/qwen2.py +12 -4
  47. diffsynth_engine/utils/constants.py +13 -12
  48. diffsynth_engine/utils/download.py +4 -2
  49. diffsynth_engine/utils/env.py +2 -0
  50. diffsynth_engine/utils/flag.py +6 -0
  51. diffsynth_engine/utils/loader.py +25 -6
  52. diffsynth_engine/utils/parallel.py +62 -29
  53. diffsynth_engine/utils/video.py +3 -1
  54. {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/METADATA +1 -1
  55. {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/RECORD +69 -67
  56. /diffsynth_engine/conf/models/wan/dit/{wan2.1-flf2v-14b.json → wan2.1_flf2v_14b.json} +0 -0
  57. /diffsynth_engine/conf/models/wan/dit/{wan2.1-i2v-14b.json → wan2.1_i2v_14b.json} +0 -0
  58. /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-1.3b.json → wan2.1_t2v_1.3b.json} +0 -0
  59. /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-14b.json → wan2.1_t2v_14b.json} +0 -0
  60. /diffsynth_engine/conf/models/wan/dit/{wan2.2-i2v-a14b.json → wan2.2_i2v_a14b.json} +0 -0
  61. /diffsynth_engine/conf/models/wan/dit/{wan2.2-s2v-14b.json → wan2.2_s2v_14b.json} +0 -0
  62. /diffsynth_engine/conf/models/wan/dit/{wan2.2-t2v-a14b.json → wan2.2_t2v_a14b.json} +0 -0
  63. /diffsynth_engine/conf/models/wan/dit/{wan2.2-ti2v-5b.json → wan2.2_ti2v_5b.json} +0 -0
  64. /diffsynth_engine/conf/models/wan/vae/{wan2.1-vae.json → wan2.1_vae.json} +0 -0
  65. /diffsynth_engine/conf/models/wan/vae/{wan2.2-vae.json → wan2.2_vae.json} +0 -0
  66. /diffsynth_engine/conf/models/wan/vae/{wan-vae-keymap.json → wan_vae_keymap.json} +0 -0
  67. {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/WHEEL +0 -0
  68. {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/licenses/LICENSE +0 -0
  69. {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/top_level.txt +0 -0

diffsynth_engine/models/flux/flux_dit.py
@@ -2,7 +2,7 @@ import json
  import torch
  import torch.nn as nn
  import numpy as np
- from typing import Any, Dict, Optional
+ from typing import Any, Dict, List, Optional
  from einops import rearrange

  from diffsynth_engine.models.basic.transformer_helper import (
@@ -28,7 +28,7 @@ from diffsynth_engine.utils import logging

  logger = logging.get_logger(__name__)

- with open(FLUX_DIT_CONFIG_FILE, "r") as f:
+ with open(FLUX_DIT_CONFIG_FILE, "r", encoding="utf-8") as f:
  config = json.load(f)


@@ -176,7 +176,6 @@ class FluxDoubleAttention(nn.Module):
  dim_b,
  num_heads,
  head_dim,
- attn_kwargs: Optional[Dict[str, Any]] = None,
  device: str = "cuda:0",
  dtype: torch.dtype = torch.bfloat16,
  ):
@@ -194,19 +193,20 @@ class FluxDoubleAttention(nn.Module):

  self.a_to_out = nn.Linear(dim_a, dim_a, device=device, dtype=dtype)
  self.b_to_out = nn.Linear(dim_b, dim_b, device=device, dtype=dtype)
- self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}

  def attention_callback(self, attn_out_a, attn_out_b, x_a, x_b, q_a, q_b, k_a, k_b, v_a, v_b, rope_emb, image_emb):
  return attn_out_a, attn_out_b

- def forward(self, image, text, rope_emb, image_emb):
+ def forward(self, image, text, rope_emb, image_emb, attn_kwargs=None):
  q_a, k_a, v_a = rearrange(self.a_to_qkv(image), "b s (h d) -> b s h d", h=(3 * self.num_heads)).chunk(3, dim=2)
  q_b, k_b, v_b = rearrange(self.b_to_qkv(text), "b s (h d) -> b s h d", h=(3 * self.num_heads)).chunk(3, dim=2)
  q = torch.cat([self.norm_q_b(q_b), self.norm_q_a(q_a)], dim=1)
  k = torch.cat([self.norm_k_b(k_b), self.norm_k_a(k_a)], dim=1)
  v = torch.cat([v_b, v_a], dim=1)
  q, k = apply_rope(q, k, rope_emb)
- attn_out = attention_ops.attention(q, k, v, **self.attn_kwargs)
+
+ attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
+ attn_out = attention_ops.attention(q, k, v, **attn_kwargs)
  attn_out = rearrange(attn_out, "b s h d -> b s (h d)").to(q.dtype)
  text_out, image_out = attn_out[:, : text.shape[1]], attn_out[:, text.shape[1] :]
  image_out, text_out = self.attention_callback(
@@ -231,19 +231,18 @@ class FluxDoubleTransformerBlock(nn.Module):
  self,
  dim,
  num_heads,
- attn_kwargs: Optional[Dict[str, Any]] = None,
  device: str = "cuda:0",
  dtype: torch.dtype = torch.bfloat16,
  ):
  super().__init__()
- self.attn = FluxDoubleAttention(
- dim, dim, num_heads, dim // num_heads, attn_kwargs=attn_kwargs, device=device, dtype=dtype
- )
+ self.attn = FluxDoubleAttention(dim, dim, num_heads, dim // num_heads, device=device, dtype=dtype)
  # Image
  self.norm_msa_a = AdaLayerNormZero(dim, device=device, dtype=dtype)
  self.norm_mlp_a = AdaLayerNormZero(dim, device=device, dtype=dtype)
  self.ff_a = nn.Sequential(
- nn.Linear(dim, dim * 4), nn.GELU(approximate="tanh"), nn.Linear(dim * 4, dim, device=device, dtype=dtype)
+ nn.Linear(dim, dim * 4, device=device, dtype=dtype),
+ nn.GELU(approximate="tanh"),
+ nn.Linear(dim * 4, dim, device=device, dtype=dtype),
  )
  # Text
  self.norm_msa_b = AdaLayerNormZero(dim, device=device, dtype=dtype)
@@ -254,11 +253,11 @@ class FluxDoubleTransformerBlock(nn.Module):
  nn.Linear(dim * 4, dim, device=device, dtype=dtype),
  )

- def forward(self, image, text, t_emb, rope_emb, image_emb=None):
+ def forward(self, image, text, t_emb, rope_emb, image_emb=None, attn_kwargs=None):
  # AdaLayerNorm-Zero for Image and Text MSA
  image_in, gate_a = self.norm_msa_a(image, t_emb)
  text_in, gate_b = self.norm_msa_b(text, t_emb)
- image_out, text_out = self.attn(image_in, text_in, rope_emb, image_emb)
+ image_out, text_out = self.attn(image_in, text_in, rope_emb, image_emb, attn_kwargs)
  image = image + gate_a * image_out
  text = text + gate_b * text_out

@@ -277,7 +276,6 @@ class FluxSingleAttention(nn.Module):
  self,
  dim,
  num_heads,
- attn_kwargs: Optional[Dict[str, Any]] = None,
  device: str = "cuda:0",
  dtype: torch.dtype = torch.bfloat16,
  ):
@@ -286,15 +284,16 @@ class FluxSingleAttention(nn.Module):
  self.to_qkv = nn.Linear(dim, dim * 3, device=device, dtype=dtype)
  self.norm_q_a = RMSNorm(dim // num_heads, eps=1e-6, device=device, dtype=dtype)
  self.norm_k_a = RMSNorm(dim // num_heads, eps=1e-6, device=device, dtype=dtype)
- self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}

  def attention_callback(self, attn_out, x, q, k, v, rope_emb, image_emb):
  return attn_out

- def forward(self, x, rope_emb, image_emb):
+ def forward(self, x, rope_emb, image_emb, attn_kwargs=None):
  q, k, v = rearrange(self.to_qkv(x), "b s (h d) -> b s h d", h=(3 * self.num_heads)).chunk(3, dim=2)
  q, k = apply_rope(self.norm_q_a(q), self.norm_k_a(k), rope_emb)
- attn_out = attention_ops.attention(q, k, v, **self.attn_kwargs)
+
+ attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
+ attn_out = attention_ops.attention(q, k, v, **attn_kwargs)
  attn_out = rearrange(attn_out, "b s h d -> b s (h d)").to(q.dtype)
  return self.attention_callback(attn_out=attn_out, x=x, q=q, k=k, v=v, rope_emb=rope_emb, image_emb=image_emb)

@@ -304,23 +303,22 @@ class FluxSingleTransformerBlock(nn.Module):
  self,
  dim,
  num_heads,
- attn_kwargs: Optional[Dict[str, Any]] = None,
  device: str = "cuda:0",
  dtype: torch.dtype = torch.bfloat16,
  ):
  super().__init__()
  self.dim = dim
  self.norm = AdaLayerNormZero(dim, device=device, dtype=dtype)
- self.attn = FluxSingleAttention(dim, num_heads, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
+ self.attn = FluxSingleAttention(dim, num_heads, device=device, dtype=dtype)
  self.mlp = nn.Sequential(
- nn.Linear(dim, dim * 4),
+ nn.Linear(dim, dim * 4, device=device, dtype=dtype),
  nn.GELU(approximate="tanh"),
  )
- self.proj_out = nn.Linear(dim * 5, dim)
+ self.proj_out = nn.Linear(dim * 5, dim, device=device, dtype=dtype)

- def forward(self, x, t_emb, rope_emb, image_emb=None):
+ def forward(self, x, t_emb, rope_emb, image_emb=None, attn_kwargs=None):
  h, gate = self.norm(x, emb=t_emb)
- attn_output = self.attn(h, rope_emb, image_emb)
+ attn_output = self.attn(h, rope_emb, image_emb, attn_kwargs)
  mlp_output = self.mlp(h)
  return x + gate * self.proj_out(torch.cat([attn_output, mlp_output], dim=2))

@@ -332,7 +330,6 @@ class FluxDiT(PreTrainedModel):
  def __init__(
  self,
  in_channel: int = 64,
- attn_kwargs: Optional[Dict[str, Any]] = None,
  device: str = "cuda:0",
  dtype: torch.dtype = torch.bfloat16,
  ):
@@ -350,16 +347,10 @@ class FluxDiT(PreTrainedModel):
  self.x_embedder = nn.Linear(in_channel, 3072, device=device, dtype=dtype)

  self.blocks = nn.ModuleList(
- [
- FluxDoubleTransformerBlock(3072, 24, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
- for _ in range(19)
- ]
+ [FluxDoubleTransformerBlock(3072, 24, device=device, dtype=dtype) for _ in range(19)]
  )
  self.single_blocks = nn.ModuleList(
- [
- FluxSingleTransformerBlock(3072, 24, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
- for _ in range(38)
- ]
+ [FluxSingleTransformerBlock(3072, 24, device=device, dtype=dtype) for _ in range(38)]
  )
  self.final_norm_out = AdaLayerNorm(3072, device=device, dtype=dtype)
  self.final_proj_out = nn.Linear(3072, 64, device=device, dtype=dtype)
@@ -393,21 +384,20 @@ class FluxDiT(PreTrainedModel):

  def forward(
  self,
- hidden_states,
- timestep,
- prompt_emb,
- pooled_prompt_emb,
- image_emb,
- guidance,
- text_ids,
- image_ids=None,
- controlnet_double_block_output=None,
- controlnet_single_block_output=None,
+ hidden_states: torch.Tensor,
+ timestep: torch.Tensor,
+ prompt_emb: torch.Tensor,
+ pooled_prompt_emb: torch.Tensor,
+ image_ids: torch.Tensor,
+ text_ids: torch.Tensor,
+ guidance: torch.Tensor,
+ image_emb: torch.Tensor | None = None,
+ attn_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_double_block_output: List[torch.Tensor] | None = None,
+ controlnet_single_block_output: List[torch.Tensor] | None = None,
  **kwargs,
  ):
- h, w = hidden_states.shape[-2:]
- if image_ids is None:
- image_ids = self.prepare_image_ids(hidden_states)
+ image_seq_len = hidden_states.shape[1]
  controlnet_double_block_output = (
  controlnet_double_block_output if controlnet_double_block_output is not None else ()
  )
@@ -426,10 +416,10 @@ class FluxDiT(PreTrainedModel):
  timestep,
  prompt_emb,
  pooled_prompt_emb,
- image_emb,
- guidance,
- text_ids,
  image_ids,
+ text_ids,
+ guidance,
+ image_emb,
  *controlnet_double_block_output,
  *controlnet_single_block_output,
  ),
@@ -446,7 +436,6 @@ class FluxDiT(PreTrainedModel):
  rope_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
  text_rope_emb = rope_emb[:, :, : text_ids.size(1)]
  image_rope_emb = rope_emb[:, :, text_ids.size(1) :]
- hidden_states = self.patchify(hidden_states)

  with sequence_parallel(
  (
@@ -471,14 +460,16 @@ class FluxDiT(PreTrainedModel):
  rope_emb = torch.cat((text_rope_emb, image_rope_emb), dim=2)

  for i, block in enumerate(self.blocks):
- hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, rope_emb, image_emb)
+ hidden_states, prompt_emb = block(
+ hidden_states, prompt_emb, conditioning, rope_emb, image_emb, attn_kwargs
+ )
  if len(controlnet_double_block_output) > 0:
  interval_control = len(self.blocks) / len(controlnet_double_block_output)
  interval_control = int(np.ceil(interval_control))
  hidden_states = hidden_states + controlnet_double_block_output[i // interval_control]
  hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
  for i, block in enumerate(self.single_blocks):
- hidden_states = block(hidden_states, conditioning, rope_emb, image_emb)
+ hidden_states = block(hidden_states, conditioning, rope_emb, image_emb, attn_kwargs)
  if len(controlnet_single_block_output) > 0:
  interval_control = len(self.single_blocks) / len(controlnet_double_block_output)
  interval_control = int(np.ceil(interval_control))
@@ -487,9 +478,8 @@ class FluxDiT(PreTrainedModel):
  hidden_states = hidden_states[:, prompt_emb.shape[1] :]
  hidden_states = self.final_norm_out(hidden_states, conditioning)
  hidden_states = self.final_proj_out(hidden_states)
- (hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(h * w // 4,))
+ (hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(image_seq_len,))

- hidden_states = self.unpatchify(hidden_states, h, w)
  (hidden_states,) = cfg_parallel_unshard((hidden_states,), use_cfg=use_cfg)
  return hidden_states

@@ -500,14 +490,8 @@ class FluxDiT(PreTrainedModel):
  device: str,
  dtype: torch.dtype,
  in_channel: int = 64,
- attn_kwargs: Optional[Dict[str, Any]] = None,
  ):
- model = cls(
- device="meta",
- dtype=dtype,
- in_channel=in_channel,
- attn_kwargs=attn_kwargs,
- )
+ model = cls(device="meta", dtype=dtype, in_channel=in_channel)
  model = model.requires_grad_(False)
  model.load_state_dict(state_dict, assign=True)
  model.to(device=device, dtype=dtype, non_blocking=True)
@@ -517,5 +501,8 @@ class FluxDiT(PreTrainedModel):
  for block in self.blocks:
  block.compile(*args, **kwargs)

- def get_fsdp_modules(self):
- return ["blocks", "single_blocks"]
+ for block in self.single_blocks:
+ block.compile(*args, **kwargs)
+
+ def get_fsdp_module_cls(self):
+ return {FluxDoubleTransformerBlock, FluxSingleTransformerBlock}
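
Note: the net effect of the flux_dit.py changes above is that attn_kwargs is no longer frozen into FluxDoubleAttention / FluxSingleAttention at construction time; each forward call now threads its own attn_kwargs down to attention_ops.attention. Below is a minimal toy sketch of that calling pattern, illustrative only: it is not a diffsynth_engine class, and torch.nn.functional.scaled_dot_product_attention stands in for the package's attention_ops.attention.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyAttention(nn.Module):
    # Toy module (not from the package): attention options arrive per call.
    def __init__(self, dim: int, num_heads: int):
        super().__init__()
        self.num_heads = num_heads
        self.to_qkv = nn.Linear(dim, dim * 3)

    def forward(self, x: torch.Tensor, attn_kwargs: dict | None = None):
        attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
        b, s, _ = x.shape
        q, k, v = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = (t.view(b, s, self.num_heads, -1).transpose(1, 2) for t in (q, k, v))
        out = F.scaled_dot_product_attention(q, k, v, **attn_kwargs)  # kwargs resolved per call
        return out.transpose(1, 2).reshape(b, s, -1)

attn = ToyAttention(dim=32, num_heads=4)
x = torch.randn(1, 8, 32)
y_default = attn(x)                                  # no per-call overrides
y_causal = attn(x, attn_kwargs={"is_causal": True})  # override for this call only

This mirrors why the constructors above dropped their attn_kwargs parameter: a single module instance can now serve different attention options across calls.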

diffsynth_engine/models/flux/flux_dit_fbcache.py
@@ -1,6 +1,6 @@
  import torch
  import numpy as np
- from typing import Any, Dict, Optional
+ from typing import Any, Dict, List, Optional

  from diffsynth_engine.utils.gguf import gguf_inference
  from diffsynth_engine.utils.fp8_linear import fp8_inference
@@ -20,12 +20,11 @@ class FluxDiTFBCache(FluxDiT):
  def __init__(
  self,
  in_channel: int = 64,
- attn_kwargs: Optional[Dict[str, Any]] = None,
  device: str = "cuda:0",
  dtype: torch.dtype = torch.bfloat16,
  relative_l1_threshold: float = 0.05,
  ):
- super().__init__(in_channel=in_channel, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
+ super().__init__(in_channel=in_channel, device=device, dtype=dtype)
  self.relative_l1_threshold = relative_l1_threshold
  self.step_count = 0
  self.num_inference_steps = 0
@@ -48,21 +47,20 @@ class FluxDiTFBCache(FluxDiT):

  def forward(
  self,
- hidden_states,
- timestep,
- prompt_emb,
- pooled_prompt_emb,
- image_emb,
- guidance,
- text_ids,
- image_ids=None,
- controlnet_double_block_output=None,
- controlnet_single_block_output=None,
+ hidden_states: torch.Tensor,
+ timestep: torch.Tensor,
+ prompt_emb: torch.Tensor,
+ pooled_prompt_emb: torch.Tensor,
+ image_ids: torch.Tensor,
+ text_ids: torch.Tensor,
+ guidance: torch.Tensor,
+ image_emb: torch.Tensor | None = None,
+ attn_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_double_block_output: List[torch.Tensor] | None = None,
+ controlnet_single_block_output: List[torch.Tensor] | None = None,
  **kwargs,
  ):
- h, w = hidden_states.shape[-2:]
- if image_ids is None:
- image_ids = self.prepare_image_ids(hidden_states)
+ image_seq_len = hidden_states.shape[1]
  controlnet_double_block_output = (
  controlnet_double_block_output if controlnet_double_block_output is not None else ()
  )
@@ -81,10 +79,10 @@ class FluxDiTFBCache(FluxDiT):
  timestep,
  prompt_emb,
  pooled_prompt_emb,
- image_emb,
- guidance,
- text_ids,
  image_ids,
+ text_ids,
+ guidance,
+ image_emb,
  *controlnet_double_block_output,
  *controlnet_single_block_output,
  ),
@@ -101,7 +99,6 @@ class FluxDiTFBCache(FluxDiT):
  rope_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
  text_rope_emb = rope_emb[:, :, : text_ids.size(1)]
  image_rope_emb = rope_emb[:, :, text_ids.size(1) :]
- hidden_states = self.patchify(hidden_states)

  with sequence_parallel(
  (
@@ -127,11 +124,13 @@ class FluxDiTFBCache(FluxDiT):

  # first block
  original_hidden_states = hidden_states
- hidden_states, prompt_emb = self.blocks[0](hidden_states, prompt_emb, conditioning, rope_emb, image_emb)
+ hidden_states, prompt_emb = self.blocks[0](
+ hidden_states, prompt_emb, conditioning, rope_emb, image_emb, attn_kwargs
+ )
  first_hidden_states_residual = hidden_states - original_hidden_states

  (first_hidden_states_residual,) = sequence_parallel_unshard(
- (first_hidden_states_residual,), seq_dims=(1,), seq_lens=(h * w // 4,)
+ (first_hidden_states_residual,), seq_dims=(1,), seq_lens=(image_seq_len,)
  )

  if self.step_count == 0 or self.step_count == (self.num_inference_steps - 1):
@@ -152,14 +151,16 @@ class FluxDiTFBCache(FluxDiT):

  first_hidden_states = hidden_states.clone()
  for i, block in enumerate(self.blocks[1:]):
- hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, rope_emb, image_emb)
+ hidden_states, prompt_emb = block(
+ hidden_states, prompt_emb, conditioning, rope_emb, image_emb, attn_kwargs
+ )
  if len(controlnet_double_block_output) > 0:
  interval_control = len(self.blocks) / len(controlnet_double_block_output)
  interval_control = int(np.ceil(interval_control))
  hidden_states = hidden_states + controlnet_double_block_output[i // interval_control]
  hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
  for i, block in enumerate(self.single_blocks):
- hidden_states = block(hidden_states, conditioning, rope_emb, image_emb)
+ hidden_states = block(hidden_states, conditioning, rope_emb, image_emb, attn_kwargs)
  if len(controlnet_single_block_output) > 0:
  interval_control = len(self.single_blocks) / len(controlnet_double_block_output)
  interval_control = int(np.ceil(interval_control))
@@ -172,9 +173,8 @@ class FluxDiTFBCache(FluxDiT):

  hidden_states = self.final_norm_out(hidden_states, conditioning)
  hidden_states = self.final_proj_out(hidden_states)
- (hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(h * w // 4,))
+ (hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(image_seq_len,))

- hidden_states = self.unpatchify(hidden_states, h, w)
  (hidden_states,) = cfg_parallel_unshard((hidden_states,), use_cfg=use_cfg)

  return hidden_states
@@ -186,14 +186,12 @@ class FluxDiTFBCache(FluxDiT):
  device: str,
  dtype: torch.dtype,
  in_channel: int = 64,
- attn_kwargs: Optional[Dict[str, Any]] = None,
  relative_l1_threshold: float = 0.05,
  ):
  model = cls(
  device="meta",
  dtype=dtype,
  in_channel=in_channel,
- attn_kwargs=attn_kwargs,
  relative_l1_threshold=relative_l1_threshold,
  )
  model = model.requires_grad_(False)
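
Note: relative_l1_threshold in flux_dit_fbcache.py governs first-block caching: the residual produced by the first double block is compared against the residual cached from the previous step, and the remaining blocks are skipped when the change is small. The comparison itself lies outside the hunks shown here; the sketch below is a hedged reconstruction of the usual relative-L1 test with hypothetical names, not code from this package.

import torch

def residual_is_similar(curr: torch.Tensor, prev: torch.Tensor, threshold: float = 0.05) -> bool:
    # Relative L1 distance between the current and cached first-block residuals.
    rel_l1 = (curr - prev).abs().mean() / prev.abs().mean()
    return rel_l1.item() < threshold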

diffsynth_engine/models/flux/flux_ipadapter.py
@@ -2,7 +2,7 @@ import torch
  from einops import rearrange
  from torch import nn
  from PIL import Image
- from typing import Any, Dict, List, Optional
+ from typing import Dict, List
  from functools import partial
  from diffsynth_engine.models.text_encoder.siglip import SiglipImageEncoder
  from diffsynth_engine.models.basic.transformer_helper import RMSNorm
@@ -18,7 +18,6 @@ class FluxIPAdapterAttention(nn.Module):
  dim: int = 3072,
  head_num: int = 24,
  scale: float = 1.0,
- attn_kwargs: Optional[Dict[str, Any]] = None,
  device: str = "cuda:0",
  dtype: torch.dtype = torch.bfloat16,
  ):
@@ -28,12 +27,13 @@ class FluxIPAdapterAttention(nn.Module):
  self.to_v_ip = nn.Linear(image_emb_dim, dim, device=device, dtype=dtype, bias=False)
  self.head_num = head_num
  self.scale = scale
- self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}

- def forward(self, query: torch.Tensor, image_emb: torch.Tensor):
+ def forward(self, query: torch.Tensor, image_emb: torch.Tensor, attn_kwargs=None):
  key = rearrange(self.norm_k(self.to_k_ip(image_emb)), "b s (h d) -> b s h d", h=self.head_num)
  value = rearrange(self.to_v_ip(image_emb), "b s (h d) -> b s h d", h=self.head_num)
- attn_out = attention(query, key, value, **self.attn_kwargs)
+
+ attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
+ attn_out = attention(query, key, value, **attn_kwargs)
  return self.scale * rearrange(attn_out, "b s h d -> b s (h d)")

  @classmethod

diffsynth_engine/models/flux/flux_text_encoder.py
@@ -10,7 +10,7 @@ from diffsynth_engine.utils import logging

  logger = logging.get_logger(__name__)

- with open(FLUX_TEXT_ENCODER_CONFIG_FILE, "r") as f:
+ with open(FLUX_TEXT_ENCODER_CONFIG_FILE, "r", encoding="utf-8") as f:
  config = json.load(f)



diffsynth_engine/models/flux/flux_vae.py
@@ -8,7 +8,7 @@ from diffsynth_engine.utils import logging

  logger = logging.get_logger(__name__)

- with open(FLUX_VAE_CONFIG_FILE, "r") as f:
+ with open(FLUX_VAE_CONFIG_FILE, "r", encoding="utf-8") as f:
  config = json.load(f)


@@ -25,11 +25,29 @@ class FluxVAEStateDictConverter(VAEStateDictConverter):
  new_state_dict[name_] = param
  return new_state_dict

+ def _from_diffusers(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+ rename_dict = config["diffusers"]["rename_dict"]
+ new_state_dict = {}
+ for name, param in state_dict.items():
+ if name not in rename_dict:
+ continue
+ name_ = rename_dict[name]
+ if "transformer_blocks" in name_:
+ param = param.squeeze()
+ new_state_dict[name_] = param
+ return new_state_dict
+
  def convert(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
  assert self.has_decoder or self.has_encoder, "Either decoder or encoder must be present"
- if "decoder.conv_in.weight" in state_dict or "encoder.conv_in.weight" in state_dict:
+ if "decoder.up.0.block.0.conv1.weight" in state_dict or "encoder.down.0.block.0.conv1.weight" in state_dict:
  state_dict = self._from_civitai(state_dict)
  logger.info("use civitai format state dict")
+ elif (
+ "decoder.up_blocks.0.resnets.0.conv1.weight" in state_dict
+ or "encoder.down_blocks.0.resnets.0.conv1.weight" in state_dict
+ ):
+ state_dict = self._from_diffusers(state_dict)
+ logger.info("use diffusers format state dict")
  else:
  logger.info("use diffsynth format state dict")
  return self._filter(state_dict)
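
Note: the convert() change above swaps the VAE format probe from decoder.conv_in.weight / encoder.conv_in.weight to block-level keys and adds a diffusers branch. The detection order can be summarized with the standalone helper below; the key names are copied from the hunk, but the helper itself is illustrative and not part of the package.

import torch
from typing import Dict

def detect_flux_vae_format(state_dict: Dict[str, torch.Tensor]) -> str:
    # Probe characteristic parameter names to pick the right key-renaming path.
    if ("decoder.up.0.block.0.conv1.weight" in state_dict
            or "encoder.down.0.block.0.conv1.weight" in state_dict):
        return "civitai"
    if ("decoder.up_blocks.0.resnets.0.conv1.weight" in state_dict
            or "encoder.down_blocks.0.resnets.0.conv1.weight" in state_dict):
        return "diffusers"
    return "diffsynth"  # fall through: keys are already in diffsynth format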

diffsynth_engine/models/hunyuan3d/dino_image_encoder.py
@@ -2,7 +2,7 @@ import torch.nn as nn
  import torchvision.transforms as transforms
  import collections.abc
  import math
- from typing import Optional, Tuple, Dict
+ from typing import Optional, Dict

  import torch
  from diffsynth_engine.models.base import PreTrainedModel, StateDictConverter
@@ -112,7 +112,9 @@ class Dinov2SelfAttention(nn.Module):
  def __init__(self, hidden_size: int, num_attention_heads: int, qkv_bias: bool) -> None:
  super().__init__()
  if hidden_size % num_attention_heads != 0:
- raise ValueError(f"hidden_size {hidden_size} is not a multiple of num_attention_heads {num_attention_heads}.")
+ raise ValueError(
+ f"hidden_size {hidden_size} is not a multiple of num_attention_heads {num_attention_heads}."
+ )

  self.num_attention_heads = num_attention_heads
  self.attention_head_size = int(hidden_size / num_attention_heads)

diffsynth_engine/models/qwen_image/qwen2_5_vl.py
@@ -942,6 +942,8 @@ class Qwen2_5_VLModel(nn.Module):


  class Qwen2_5_VLForConditionalGeneration(PreTrainedModel):
+ _supports_parallelization = True
+
  def __init__(
  self,
  vision_config: Qwen2_5_VLVisionConfig,
@@ -1173,6 +1175,9 @@ class Qwen2_5_VLForConditionalGeneration(PreTrainedModel):

  return position_ids, mrope_position_deltas

+ def get_fsdp_module_cls(self):
+ return {Qwen2_5_VisionBlock, Qwen2_5_VLDecoderLayer}
+
  def forward(
  self,
  input_ids: Optional[torch.LongTensor] = None,
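
Note: both FluxDiT and Qwen2_5_VLForConditionalGeneration now expose get_fsdp_module_cls(), returning a set of transformer block classes instead of the old attribute-name list from get_fsdp_modules(). Below is a hedged sketch of how such a class set is typically plugged into PyTorch FSDP's class-based auto-wrap policy; the wiring is an assumption for illustration, not code taken from this package.

import functools
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy

def build_auto_wrap_policy(model):
    # e.g. {FluxDoubleTransformerBlock, FluxSingleTransformerBlock}
    layer_cls = model.get_fsdp_module_cls()
    return functools.partial(transformer_auto_wrap_policy, transformer_layer_cls=layer_cls)

The resulting callable would then be passed as auto_wrap_policy when wrapping the model with FSDP inside an initialized distributed process group.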