diffsynth-engine 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232) hide show
  1. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/.gitignore +3 -1
  2. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/PKG-INFO +3 -3
  3. diffsynth_engine-0.2.1/assets/dingtalk.png +0 -0
  4. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/__init__.py +7 -0
  5. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/base.py +5 -5
  6. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/basic/attention.py +22 -6
  7. diffsynth_engine-0.2.1/diffsynth_engine/models/components/siglip.py +169 -0
  8. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/flux/__init__.py +2 -0
  9. diffsynth_engine-0.2.1/diffsynth_engine/models/flux/flux_controlnet.py +160 -0
  10. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/flux/flux_dit.py +16 -17
  11. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/sd3/sd3_dit.py +1 -7
  12. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/sdxl/sdxl_unet.py +1 -7
  13. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/wan/wan_dit.py +1 -0
  14. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/pipelines/__init__.py +2 -1
  15. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/pipelines/base.py +26 -28
  16. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/pipelines/flux_image.py +179 -32
  17. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/pipelines/sd_image.py +32 -7
  18. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/pipelines/sdxl_image.py +32 -7
  19. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/pipelines/wan_video.py +51 -27
  20. diffsynth_engine-0.2.1/diffsynth_engine/tools/__init__.py +4 -0
  21. diffsynth_engine-0.2.1/diffsynth_engine/tools/flux_inpainting.py +50 -0
  22. diffsynth_engine-0.2.1/diffsynth_engine/tools/flux_outpainting.py +58 -0
  23. diffsynth_engine-0.2.1/diffsynth_engine/utils/env.py +10 -0
  24. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/utils/flag.py +2 -2
  25. diffsynth_engine-0.2.1/diffsynth_engine/utils/image.py +25 -0
  26. diffsynth_engine-0.2.1/diffsynth_engine/utils/loader.py +32 -0
  27. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/utils/parallel.py +15 -4
  28. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine.egg-info/PKG-INFO +3 -3
  29. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine.egg-info/SOURCES.txt +17 -1
  30. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine.egg-info/requires.txt +2 -2
  31. diffsynth_engine-0.2.1/examples/i2v_input.jpg +0 -0
  32. diffsynth_engine-0.2.1/examples/wan_image_to_video.py +35 -0
  33. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/examples/wan_text_to_video.py +1 -1
  34. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/pyproject.toml +2 -2
  35. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/common/test_case.py +1 -1
  36. diffsynth_engine-0.2.1/tests/data/expect/flux/flux_inpainting.png +0 -0
  37. diffsynth_engine-0.2.1/tests/data/expect/flux/flux_outpainting.png +0 -0
  38. diffsynth_engine-0.2.1/tests/data/expect/flux/flux_union_pro_canny.png +0 -0
  39. diffsynth_engine-0.2.1/tests/data/expect/test_siglip_image_encoder.safetensors +0 -0
  40. diffsynth_engine-0.2.1/tests/data/input/canny.png +0 -0
  41. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/flux/test_flux_dit.py +1 -1
  42. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/flux/test_flux_text_encoder.py +1 -2
  43. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/flux/test_flux_vae.py +1 -2
  44. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/sd/test_sd_text_encoder.py +1 -2
  45. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/sd/test_sd_vae.py +1 -2
  46. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/sdxl/test_sdxl_text_encoder.py +1 -1
  47. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/sdxl/test_sdxl_vae.py +1 -2
  48. diffsynth_engine-0.2.1/tests/test_models/test_siglip.py +17 -0
  49. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/wan/test_wan_vae.py +1 -2
  50. diffsynth_engine-0.2.1/tests/test_pipelines/test_flux_controlnet.py +32 -0
  51. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_pipelines/test_flux_image.py +0 -13
  52. diffsynth_engine-0.2.1/tests/test_tools/__init__.py +0 -0
  53. diffsynth_engine-0.2.1/tests/test_tools/test_flux_tools.py +31 -0
  54. diffsynth_engine-0.2.0/assets/dingtalk.png +0 -0
  55. diffsynth_engine-0.2.0/diffsynth_engine/utils/env.py +0 -7
  56. diffsynth_engine-0.2.0/diffsynth_engine/utils/loader.py +0 -17
  57. diffsynth_engine-0.2.0/tests/data/expect/flux/flux_inpainting.png +0 -0
  58. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/.github/workflows/python-publish.yml +0 -0
  59. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/.pre-commit-config.yaml +0 -0
  60. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/LICENSE +0 -0
  61. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/README.md +0 -0
  62. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/assets/showcase.jpeg +0 -0
  63. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/__init__.py +0 -0
  64. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/__init__.py +0 -0
  65. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +0 -0
  66. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/flow_match/__init__.py +0 -0
  67. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_beta.py +0 -0
  68. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_ddim.py +0 -0
  69. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +0 -0
  70. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/__init__.py +0 -0
  71. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/beta.py +0 -0
  72. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/ddim.py +0 -0
  73. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/exponential.py +0 -0
  74. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/karras.py +0 -0
  75. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/linear.py +0 -0
  76. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/sgm_uniform.py +0 -0
  77. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/__init__.py +0 -0
  78. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/flow_match/__init__.py +0 -0
  79. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py +0 -0
  80. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/stable_diffusion/__init__.py +0 -0
  81. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/stable_diffusion/brownian_tree.py +0 -0
  82. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/stable_diffusion/ddpm.py +0 -0
  83. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/stable_diffusion/deis.py +0 -0
  84. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m.py +0 -0
  85. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m_sde.py +0 -0
  86. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_3m_sde.py +0 -0
  87. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/stable_diffusion/epsilon.py +0 -0
  88. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/stable_diffusion/euler.py +0 -0
  89. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/algorithm/sampler/stable_diffusion/euler_ancestral.py +0 -0
  90. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/components/vae.json +0 -0
  91. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/flux/flux_dit.json +0 -0
  92. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/flux/flux_text_encoder.json +0 -0
  93. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/flux/flux_vae.json +0 -0
  94. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/sd/sd_text_encoder.json +0 -0
  95. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/sd/sd_unet.json +0 -0
  96. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/sd3/sd3_dit.json +0 -0
  97. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/sd3/sd3_text_encoder.json +0 -0
  98. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/sdxl/sdxl_text_encoder.json +0 -0
  99. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/sdxl/sdxl_unet.json +0 -0
  100. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/wan/dit/1.3b-t2v.json +0 -0
  101. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/wan/dit/14b-i2v.json +0 -0
  102. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/models/wan/dit/14b-t2v.json +0 -0
  103. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/merges.txt +0 -0
  104. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/special_tokens_map.json +0 -0
  105. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/tokenizer_config.json +0 -0
  106. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/vocab.json +0 -0
  107. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/special_tokens_map.json +0 -0
  108. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/spiece.model +0 -0
  109. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer.json +0 -0
  110. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer_config.json +0 -0
  111. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/merges.txt +0 -0
  112. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/special_tokens_map.json +0 -0
  113. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/tokenizer_config.json +0 -0
  114. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/vocab.json +0 -0
  115. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/merges.txt +0 -0
  116. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/special_tokens_map.json +0 -0
  117. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/tokenizer_config.json +0 -0
  118. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/vocab.json +0 -0
  119. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/special_tokens_map.json +0 -0
  120. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/spiece.model +0 -0
  121. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json +0 -0
  122. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json +0 -0
  123. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/kernels/__init__.py +0 -0
  124. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/__init__.py +0 -0
  125. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/basic/__init__.py +0 -0
  126. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/basic/lora.py +0 -0
  127. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/basic/relative_position_emb.py +0 -0
  128. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/basic/timestep.py +0 -0
  129. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/basic/transformer_helper.py +0 -0
  130. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/basic/unet_helper.py +0 -0
  131. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/components/__init__.py +0 -0
  132. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/components/clip.py +0 -0
  133. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/components/t5.py +0 -0
  134. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/components/vae.py +0 -0
  135. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/flux/flux_text_encoder.py +0 -0
  136. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/flux/flux_vae.py +0 -0
  137. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/sd/__init__.py +0 -0
  138. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/sd/sd_text_encoder.py +0 -0
  139. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/sd/sd_unet.py +0 -0
  140. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/sd/sd_vae.py +0 -0
  141. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/sd3/__init__.py +0 -0
  142. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/sd3/sd3_text_encoder.py +0 -0
  143. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/sd3/sd3_vae.py +0 -0
  144. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/sdxl/__init__.py +0 -0
  145. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/sdxl/sdxl_text_encoder.py +0 -0
  146. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/sdxl/sdxl_vae.py +0 -0
  147. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/utils.py +0 -0
  148. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/wan/__init__.py +0 -0
  149. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/wan/wan_image_encoder.py +0 -0
  150. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/wan/wan_text_encoder.py +0 -0
  151. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/models/wan/wan_vae.py +0 -0
  152. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/tokenizers/__init__.py +0 -0
  153. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/tokenizers/base.py +0 -0
  154. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/tokenizers/clip.py +0 -0
  155. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/tokenizers/t5.py +0 -0
  156. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/tokenizers/wan.py +0 -0
  157. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/utils/__init__.py +0 -0
  158. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/utils/constants.py +0 -0
  159. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/utils/download.py +0 -0
  160. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/utils/fp8_linear.py +0 -0
  161. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/utils/gguf.py +0 -0
  162. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/utils/lock.py +0 -0
  163. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/utils/logging.py +0 -0
  164. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/utils/offload.py +0 -0
  165. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/utils/prompt.py +0 -0
  166. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine/utils/video.py +0 -0
  167. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine.egg-info/dependency_links.txt +0 -0
  168. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/diffsynth_engine.egg-info/top_level.txt +0 -0
  169. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/docs/tutorial.md +0 -0
  170. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/docs/tutorial_zh.md +0 -0
  171. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/examples/flux_lora.py +0 -0
  172. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/examples/flux_text_to_image.py +0 -0
  173. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/examples/sdxl_text_to_image.py +0 -0
  174. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/examples/wan_lora.py +0 -0
  175. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/setup.cfg +0 -0
  176. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/setup.py +0 -0
  177. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/__init__.py +0 -0
  178. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/common/__init__.py +0 -0
  179. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/common/utils.py +0 -0
  180. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/algorithm/beta_20steps.safetensors +0 -0
  181. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/algorithm/ddim_20steps.safetensors +0 -0
  182. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/algorithm/euler_i10.safetensors +0 -0
  183. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/algorithm/exponential_20steps.safetensors +0 -0
  184. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/algorithm/flow_match_euler_i10.safetensors +0 -0
  185. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/algorithm/karras_20steps.safetensors +0 -0
  186. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/algorithm/output.safetensors +0 -0
  187. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/algorithm/recifited_flow_20steps_flux.safetensors +0 -0
  188. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/algorithm/scaled_linear_20steps.safetensors +0 -0
  189. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/algorithm/sgm_uniform_20steps.safetensors +0 -0
  190. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/flux/flux_dit.safetensors +0 -0
  191. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/flux/flux_lora.png +0 -0
  192. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/flux/flux_text_encoder_1.safetensors +0 -0
  193. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/flux/flux_text_encoder_2.safetensors +0 -0
  194. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/flux/flux_txt2img.png +0 -0
  195. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/flux/flux_vae.safetensors +0 -0
  196. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sd/sd_inpainting.png +0 -0
  197. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sd/sd_lora.png +0 -0
  198. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sd/sd_text_encoder.safetensors +0 -0
  199. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sd/sd_txt2img.png +0 -0
  200. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sd/sd_unet.safetensors +0 -0
  201. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sd/sd_vae.safetensors +0 -0
  202. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sdxl/sdxl_inpainting.png +0 -0
  203. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sdxl/sdxl_lora.png +0 -0
  204. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sdxl/sdxl_text_encoder_1.safetensors +0 -0
  205. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sdxl/sdxl_text_encoder_2.safetensors +0 -0
  206. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sdxl/sdxl_txt2img.png +0 -0
  207. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sdxl/sdxl_unet.safetensors +0 -0
  208. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/sdxl/sdxl_vae.safetensors +0 -0
  209. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/expect/wan/wan_vae.safetensors +0 -0
  210. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/input/astronaut_320_320.mp4 +0 -0
  211. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/input/mask_image.png +0 -0
  212. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/input/test_image.png +0 -0
  213. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/input/wukong_1024_1024.png +0 -0
  214. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/data/input/wukong_480_480.png +0 -0
  215. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_algorithm/__init__.py +0 -0
  216. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_algorithm/test_sampler.py +0 -0
  217. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_algorithm/test_scheduler.py +0 -0
  218. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/__init__.py +0 -0
  219. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/flux/__init__.py +0 -0
  220. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/sd/__init__.py +0 -0
  221. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/sd/test_sd_unet.py +0 -0
  222. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/sdxl/__init__.py +0 -0
  223. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_models/sdxl/test_sdxl_unet.py +0 -0
  224. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_pipelines/__init__.py +0 -0
  225. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_pipelines/test_sd_image.py +0 -0
  226. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_pipelines/test_sdxl_image.py +0 -0
  227. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_pipelines/test_wan_video.py +0 -0
  228. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_pipelines/test_wan_video_gguf.py +0 -0
  229. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_pipelines/test_wan_video_tp.py +0 -0
  230. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_tokenizers/__init__.py +0 -0
  231. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_tokenizers/test_clip.py +0 -0
  232. {diffsynth_engine-0.2.0 → diffsynth_engine-0.2.1}/tests/test_tokenizers/test_t5.py +0 -0
@@ -6,4 +6,6 @@ tmp/
6
6
  build/
7
7
  dist/
8
8
  *.egg-info/
9
- .DS_Store/
9
+ .DS_Store/
10
+ .pytest_cache/
11
+ .ruff_cache/
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffsynth_engine
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Author: MuseAI x ModelScope
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Operating System :: OS Independent
7
7
  Requires-Python: >=3.10
8
8
  License-File: LICENSE
9
- Requires-Dist: torch>=2.4.1
9
+ Requires-Dist: torch>=2.6
10
10
  Requires-Dist: torchvision
11
11
  Requires-Dist: xformers; sys_platform == "linux"
12
12
  Requires-Dist: safetensors
@@ -22,7 +22,7 @@ Requires-Dist: scipy
22
22
  Requires-Dist: torchsde
23
23
  Requires-Dist: pillow
24
24
  Requires-Dist: imageio[ffmpeg]
25
- Requires-Dist: yunchang
25
+ Requires-Dist: yunchang; sys_platform == "linux"
26
26
  Provides-Extra: dev
27
27
  Requires-Dist: diffusers==0.31.0; extra == "dev"
28
28
  Requires-Dist: transformers==4.45.2; extra == "dev"
@@ -7,12 +7,16 @@ from .pipelines import (
7
7
  SDXLModelConfig,
8
8
  SDModelConfig,
9
9
  WanModelConfig,
10
+ ControlNetParams,
10
11
  )
12
+ from .models.flux import FluxControlNet
11
13
  from .utils.download import fetch_model, fetch_modelscope_model, fetch_civitai_model
12
14
  from .utils.video import load_video, save_video
15
+ from .tools import FluxInpaintingTool, FluxOutpaintingTool
13
16
 
14
17
  __all__ = [
15
18
  "FluxImagePipeline",
19
+ "FluxControlNet",
16
20
  "SDXLImagePipeline",
17
21
  "SDImagePipeline",
18
22
  "WanVideoPipeline",
@@ -20,6 +24,9 @@ __all__ = [
20
24
  "SDXLModelConfig",
21
25
  "SDModelConfig",
22
26
  "WanModelConfig",
27
+ "FluxInpaintingTool",
28
+ "FluxOutpaintingTool",
29
+ "ControlNetParams",
23
30
  "fetch_model",
24
31
  "fetch_modelscope_model",
25
32
  "fetch_civitai_model",
@@ -1,9 +1,8 @@
1
1
  import os
2
2
  import torch
3
3
  import torch.nn as nn
4
- from typing import Dict, List, Union
5
- from safetensors.torch import load_file
6
-
4
+ from typing import Dict, Union, List, Any
5
+ from diffsynth_engine.utils.loader import load_file
7
6
  from diffsynth_engine.models.basic.lora import LoRALinear, LoRAConv2d
8
7
  from diffsynth_engine.models.utils import no_init_weights
9
8
 
@@ -22,18 +21,19 @@ class PreTrainedModel(nn.Module):
22
21
 
23
22
  @classmethod
24
23
  def from_pretrained(cls, pretrained_model_path: Union[str, os.PathLike], device: str, dtype: torch.dtype, **kwargs):
25
- state_dict = load_file(pretrained_model_path, device=device)
24
+ state_dict = load_file(pretrained_model_path)
26
25
  return cls.from_state_dict(state_dict, device=device, dtype=dtype, **kwargs)
27
26
 
28
27
  @classmethod
29
28
  def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype, **kwargs):
30
29
  with no_init_weights():
31
30
  model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype, **kwargs)
31
+ model.to_empty(device=device)
32
32
  model.load_state_dict(state_dict)
33
33
  model.to(device=device, dtype=dtype, non_blocking=True)
34
34
  return model
35
35
 
36
- def load_loras(self, lora_args: List[Dict[str, any]], fused: bool = True):
36
+ def load_loras(self, lora_args: List[Dict[str, Any]], fused: bool = True):
37
37
  for args in lora_args:
38
38
  key = args["name"]
39
39
  module = self.get_submodule(key)
@@ -1,10 +1,9 @@
1
1
  import torch
2
2
  import torch.nn as nn
3
- from einops import rearrange
3
+ from einops import rearrange, repeat
4
4
  from typing import Optional
5
- from yunchang import LongContextAttention
6
- from yunchang.kernels import AttnType
7
5
 
6
+ import torch.nn.functional as F
8
7
  from diffsynth_engine.utils import logging
9
8
  from diffsynth_engine.utils.flag import (
10
9
  FLASH_ATTN_3_AVAILABLE,
@@ -18,12 +17,26 @@ from diffsynth_engine.utils.flag import (
18
17
  logger = logging.get_logger(__name__)
19
18
 
20
19
 
20
+ def memory_align(x: torch.Tensor, dim=-1, alignment: int = 8):
21
+ padding_size = (alignment - x.shape[dim] % alignment) % alignment
22
+ padded_x = F.pad(x, (0, padding_size), "constant", 0)
23
+ return padded_x[..., : x.shape[dim]]
24
+
25
+
21
26
  if FLASH_ATTN_3_AVAILABLE:
22
27
  from flash_attn_interface import flash_attn_func as flash_attn3
23
28
  if FLASH_ATTN_2_AVAILABLE:
24
29
  from flash_attn import flash_attn_func as flash_attn2
25
30
  if XFORMERS_AVAILABLE:
26
- from xformers.ops import memory_efficient_attention as xformers_attn
31
+ from xformers.ops import memory_efficient_attention
32
+
33
+ def xformers_attn(q, k, v, attn_mask=None, scale=None):
34
+ if attn_mask is not None:
35
+ attn_mask = repeat(attn_mask, "S L -> B H S L", B=q.shape[0], H=q.shape[2])
36
+ attn_mask = memory_align(attn_mask)
37
+ return memory_efficient_attention(q, k, v, attn_bias=attn_mask, scale=scale)
38
+
39
+
27
40
  if SDPA_AVAILABLE:
28
41
 
29
42
  def sdpa_attn(q, k, v, attn_mask=None, scale=None):
@@ -100,7 +113,7 @@ def attention(
100
113
  elif FLASH_ATTN_2_AVAILABLE:
101
114
  return flash_attn2(q, k, v, softmax_scale=scale)
102
115
  elif XFORMERS_AVAILABLE:
103
- return xformers_attn(q, k, v, attn_bias=attn_mask, scale=scale)
116
+ return xformers_attn(q, k, v, attn_mask=attn_mask, scale=scale)
104
117
  elif SDPA_AVAILABLE:
105
118
  return sdpa_attn(q, k, v, attn_mask=attn_mask, scale=scale)
106
119
  else:
@@ -113,7 +126,7 @@ def attention(
113
126
  elif attn_impl == "flash_attn_2":
114
127
  return flash_attn2(q, k, v, softmax_scale=scale)
115
128
  elif attn_impl == "xformers":
116
- return xformers_attn(q, k, v, attn_bias=attn_mask, scale=scale)
129
+ return xformers_attn(q, k, v, attn_mask=attn_mask, scale=scale)
117
130
  elif attn_impl == "sdpa":
118
131
  return sdpa_attn(q, k, v, attn_mask=attn_mask, scale=scale)
119
132
  elif attn_impl == "sage_attn":
@@ -181,6 +194,9 @@ def long_context_attention(
181
194
  k: [B, Lk, Nk, C1]
182
195
  v: [B, Lk, Nk, C2]
183
196
  """
197
+ from yunchang import LongContextAttention
198
+ from yunchang.kernels import AttnType
199
+
184
200
  assert attn_impl in [
185
201
  None,
186
202
  "auto",
@@ -0,0 +1,169 @@
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import numpy as np
6
+ from einops import rearrange
7
+ from typing import Union, List
8
+ from PIL import Image
9
+ from diffsynth_engine.models.basic.attention import Attention
10
+ from diffsynth_engine.utils.loader import load_file
11
+ from diffsynth_engine.models.base import PreTrainedModel, StateDictConverter
12
+
13
+
14
class SiglipVisionEmbeddings(nn.Module):
    """Convolutional patch embedding plus learned position embeddings.

    Splits the image into non-overlapping ``patch_size`` x ``patch_size``
    patches with a strided conv, flattens them into a token sequence, and
    adds a learned per-position embedding.
    """

    def __init__(
        self, num_channels: int, num_positions: int, hidden_size: int, patch_size: int, device: str, dtype: torch.dtype
    ):
        super().__init__()
        self.patch_embedding = nn.Conv2d(
            in_channels=num_channels,
            out_channels=hidden_size,
            kernel_size=patch_size,
            stride=patch_size,
            padding="valid",
            device=device,
            dtype=dtype,
        )
        self.position_embedding = nn.Embedding(num_positions, hidden_size, device=device, dtype=dtype)
        # Constant index tensor; kept as a plain attribute (not a buffer) and
        # moved to the weight's device lazily in forward.
        self.position_ids = torch.arange(num_positions).expand((1, -1))

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        """Return patch tokens of shape [B, num_positions, hidden_size].

        NOTE(review): ``interpolate_pos_encoding`` is accepted but ignored;
        inputs are assumed to match the configured image size — confirm.
        """
        weight = self.patch_embedding.weight
        self.position_ids = self.position_ids.to(weight.device)
        patches = self.patch_embedding(pixel_values.to(dtype=weight.dtype))  # [B, hidden, grid, grid]
        batch, channels = patches.shape[:2]
        tokens = patches.reshape(batch, channels, -1).permute(0, 2, 1)  # [B, grid*grid, hidden]
        return tokens + self.position_embedding(self.position_ids)
39
+
40
+
41
class SiglipMLP(nn.Module):
    """Two-layer feed-forward block with tanh-approximated GELU activation."""

    def __init__(self, hidden_size, inner_dim, device, dtype):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, inner_dim, device=device, dtype=dtype)
        self.fc2 = nn.Linear(inner_dim, hidden_size, device=device, dtype=dtype)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Expand to inner_dim, apply GELU (tanh approximation), project back.
        expanded = self.fc1(hidden_states)
        activated = F.gelu(expanded, approximate="tanh")
        return self.fc2(activated)
52
+
53
+
54
class SiglipEncoderLayer(nn.Module):
    """Pre-norm transformer encoder layer: residual self-attention + residual MLP."""

    def __init__(self, hidden_size: int, inner_dim: int, num_heads: int, eps: float, device: str, dtype: torch.dtype):
        super().__init__()
        # Pass device/dtype to the LayerNorms as well — every sibling module
        # places its parameters explicitly; without this the norms were built
        # in default fp32 on CPU regardless of the requested placement.
        self.layer_norm1 = nn.LayerNorm(hidden_size, eps=eps, device=device, dtype=dtype)
        # NOTE(review): Attention does not receive device/dtype here — confirm
        # it handles parameter placement internally.
        self.self_attn = Attention(
            q_dim=hidden_size,
            num_heads=num_heads,
            head_dim=hidden_size // num_heads,
            bias_q=True,
            bias_kv=True,
            bias_out=True,
        )
        self.layer_norm2 = nn.LayerNorm(hidden_size, eps=eps, device=device, dtype=dtype)
        self.mlp = SiglipMLP(hidden_size=hidden_size, inner_dim=inner_dim, device=device, dtype=dtype)

    def forward(self, x):
        # Pre-norm ordering: normalize, transform, then add the residual.
        x = self.self_attn(self.layer_norm1(x)) + x
        x = self.mlp(self.layer_norm2(x)) + x
        return x
73
+
74
+
75
class SiglipMultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling.

    A learnable probe query attends over the encoder tokens; a residual MLP
    block follows, and the single probe position is returned as the pooled
    representation.
    """

    def __init__(self, hidden_size, inner_dim, num_heads, eps, device, dtype) -> None:
        super().__init__()
        # device/dtype were missing on the probe — every sibling parameter is
        # placed explicitly, so create the probe on the requested device too.
        self.probe = nn.Parameter(data=torch.randn(1, 1, hidden_size, device=device, dtype=dtype))
        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_size, num_heads=num_heads, batch_first=True, device=device, dtype=dtype
        )
        self.layernorm = nn.LayerNorm(normalized_shape=hidden_size, eps=eps, device=device, dtype=dtype)
        self.mlp = SiglipMLP(hidden_size=hidden_size, inner_dim=inner_dim, device=device, dtype=dtype)

    def forward(self, hidden_state) -> torch.Tensor:
        """Pool [B, L, hidden] tokens into a [B, hidden] vector."""
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)
        # Probe attends over the token sequence (query=probe, key=value=tokens).
        hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)
        # Only one probe token exists; return it.
        return hidden_state[:, 0]
96
+
97
+
98
class SiglipVisionTransformer(nn.Module):
    """SigLIP vision tower: patch embedding, encoder stack, final norm, pooled head."""

    def __init__(
        self,
        hidden_size: int = 1152,
        num_channels: int = 3,
        image_size: int = 384,
        patch_size: int = 14,
        layer_num: int = 27,
        inner_dim: int = 4304,
        num_heads: int = 16,
        eps: float = 1e-06,
        device: str = "cpu",
        dtype: torch.dtype = torch.bfloat16,
    ):
        super().__init__()
        grid = image_size // patch_size
        self.embeddings = SiglipVisionEmbeddings(
            num_channels=num_channels,
            num_positions=grid * grid,
            hidden_size=hidden_size,
            patch_size=patch_size,
            device=device,
            dtype=dtype,
        )
        encoder_layers = [
            SiglipEncoderLayer(hidden_size, inner_dim, num_heads, eps, device, dtype) for _ in range(layer_num)
        ]
        self.layers = nn.ModuleList(encoder_layers)
        self.post_layernorm = nn.LayerNorm(hidden_size, eps=eps, device=device, dtype=dtype)
        self.head = SiglipMultiheadAttentionPoolingHead(
            hidden_size, inner_dim=inner_dim, num_heads=num_heads, eps=eps, device=device, dtype=dtype
        )

    def forward(self, x):
        """Encode [B, C, H, W] pixels into a pooled [B, hidden_size] embedding."""
        hidden = self.embeddings(x)
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden)
        hidden = self.post_layernorm(hidden)
        return self.head(hidden)
136
+
137
+
138
class SiglipImageEncoderConverter(StateDictConverter):
    """State-dict converter for the SigLIP image encoder.

    Checkpoint keys already match this module's parameter names, so the
    conversion is the identity.
    """

    def convert(self, state_dict: dict) -> dict:
        # No key remapping needed; return the state dict unchanged.
        return state_dict
141
+
142
+
143
class SiglipImageEncoder(PreTrainedModel):
    """Wraps SiglipVisionTransformer with PIL preprocessing.

    Accepts one PIL image or a list of them, resizes to 384x384, rescales to
    [-1, 1], and returns the pooled SigLIP embedding.
    """

    converter = SiglipImageEncoderConverter()

    def __init__(self, device: str, dtype: torch.dtype) -> None:
        super().__init__()
        self.image_encoder = SiglipVisionTransformer(device=device, dtype=dtype)

    def image_preprocess(self, images: List[Image.Image]) -> torch.Tensor:
        """Convert PIL images to a normalized [B, C, H, W] tensor on the model's device/dtype."""
        # Force 3-channel RGB first, so grayscale / RGBA / palette inputs do
        # not change the channel count expected by the conv patch embedding.
        images = [image.convert("RGB").resize(size=(384, 384), resample=3) for image in images]  # 3 == BICUBIC
        rescaled_images = [np.array(image) / 255 for image in images]
        # Normalize to [-1, 1] (mean 0.5, std 0.5 per channel).
        normalized_images = [(image - 0.5) / 0.5 for image in rescaled_images]
        image_tensor = torch.stack([torch.tensor(image) for image in normalized_images])
        param = next(self.parameters())
        image_tensor = image_tensor.to(param.device, param.dtype)
        return rearrange(image_tensor, "b h w c -> b c h w")

    @torch.no_grad()
    def forward(self, images: Union[List[Image.Image], Image.Image]):
        """Encode image(s); returns a [B, hidden_size] pooled embedding.

        Uses typing.Union instead of `X | Y` so the annotation also evaluates
        on Python < 3.10.
        """
        if isinstance(images, Image.Image):
            images = [images]
        image_input = self.image_preprocess(images)
        return self.image_encoder(image_input)

    @classmethod
    def from_pretrained(cls, pretrained_model_path: Union[str, os.PathLike], device: str, dtype: torch.dtype, **kwargs):
        """Load a checkpoint file from disk and build the encoder from it."""
        state_dict = load_file(str(pretrained_model_path))
        return cls.from_state_dict(state_dict, device=device, dtype=dtype, **kwargs)
@@ -1,9 +1,11 @@
1
1
  from .flux_dit import FluxDiT, config as flux_dit_config
2
2
  from .flux_text_encoder import FluxTextEncoder1, FluxTextEncoder2, config as flux_text_encoder_config
3
3
  from .flux_vae import FluxVAEDecoder, FluxVAEEncoder, config as flux_vae_config
4
+ from .flux_controlnet import FluxControlNet
4
5
 
5
6
  __all__ = [
6
7
  "FluxDiT",
8
+ "FluxControlNet",
7
9
  "FluxTextEncoder1",
8
10
  "FluxTextEncoder2",
9
11
  "FluxVAEDecoder",
@@ -0,0 +1,160 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from typing import Optional, Dict
4
+ from einops import rearrange
5
+ from diffsynth_engine.models.base import PreTrainedModel, StateDictConverter
6
+ from diffsynth_engine.models.flux.flux_dit import (
7
+ FluxJointTransformerBlock,
8
+ RoPEEmbedding,
9
+ TimestepEmbeddings,
10
+ )
11
+ from diffsynth_engine.models.utils import no_init_weights
12
+
13
+
14
class FluxControlNetStateDictConverter(StateDictConverter):
    """Converts diffusers-format Flux ControlNet checkpoints to this repo's layout.

    Fuses the separate q/k/v projection tensors into single qkv weights and
    renames blocks/embedders to the local module names.
    """

    def __init__(self):
        super().__init__()

    def _from_diffusers(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        new_state_dict = {}
        for key, value in state_dict.items():
            new_key = key
            if "attn.to_q" in new_key:
                # Fuse the image-stream q/k/v projections into one qkv tensor.
                q = state_dict[new_key]
                k = state_dict[new_key.replace("attn.to_q", "attn.to_k")]
                v = state_dict[new_key.replace("attn.to_q", "attn.to_v")]
                new_key = new_key.replace("transformer_blocks", "blocks")
                new_key = new_key.replace("attn.to_q", "attn.a_to_qkv")
                new_state_dict[new_key] = torch.cat((q, k, v), dim=0)
            elif "attn.add_q_proj" in new_key:
                # Fuse the text-stream q/k/v projections into one qkv tensor.
                # (The original applied the add_q_proj replace twice; the
                # second one was a no-op and has been removed.)
                q = state_dict[new_key]
                k = state_dict[new_key.replace("attn.add_q_proj", "attn.add_k_proj")]
                v = state_dict[new_key.replace("attn.add_q_proj", "attn.add_v_proj")]
                new_key = new_key.replace("transformer_blocks", "blocks")
                new_key = new_key.replace("attn.add_q_proj", "attn.b_to_qkv")
                new_state_dict[new_key] = torch.cat((q, k, v), dim=0)
            elif (
                "attn.to_k" in new_key
                or "attn.to_v" in new_key
                or "attn.add_k_proj" in new_key
                or "attn.add_v_proj" in new_key
            ):
                # k/v tensors are consumed when their matching q key is seen.
                continue
            else:
                new_key = new_key.replace("transformer_blocks", "blocks")
                new_key = new_key.replace("controlnet_blocks", "blocks_proj")
                new_key = new_key.replace("time_text_embed.guidance_embedder", "guidance_embedder")
                new_key = new_key.replace("time_text_embed.timestep_embedder", "time_embedder")
                new_key = new_key.replace("time_text_embed.text_embedder.linear_1", "pooled_text_embedder.0")
                new_key = new_key.replace("time_text_embed.text_embedder.linear_2", "pooled_text_embedder.2")
                new_key = new_key.replace("time_embedder.linear_1", "time_embedder.timestep_embedder.0")
                new_key = new_key.replace("time_embedder.linear_2", "time_embedder.timestep_embedder.2")
                new_key = new_key.replace("guidance_embedder.linear_1", "guidance_embedder.timestep_embedder.0")
                new_key = new_key.replace("guidance_embedder.linear_2", "guidance_embedder.timestep_embedder.2")
                # joint block
                new_key = new_key.replace("norm1.linear", "norm1_a.linear")
                new_key = new_key.replace("norm1_context.linear", "norm1_b.linear")
                new_key = new_key.replace("attn.to_out.0", "attn.a_to_out")
                new_key = new_key.replace("attn.to_add_out", "attn.b_to_out")
                new_key = new_key.replace("attn.norm_q", "attn.norm_q_a")
                new_key = new_key.replace("attn.norm_k", "attn.norm_k_a")
                new_key = new_key.replace("attn.norm_added_q", "attn.norm_q_b")
                new_key = new_key.replace("attn.norm_added_k", "attn.norm_k_b")
                new_key = new_key.replace("ff.net", "ff_a")
                new_key = new_key.replace("ff_context.net", "ff_b")
                new_key = new_key.replace("0.proj", "0")
                new_state_dict[new_key] = value
        return new_state_dict

    def convert(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Entry point: currently only diffusers-format checkpoints are handled."""
        return self._from_diffusers(state_dict)
72
+
73
+
74
class FluxControlNet(PreTrainedModel):
    """Flux ControlNet: a short stack of double-stream DiT blocks producing per-block residuals.

    ``forward`` returns ``(double_block_outputs, None)``: one scaled residual
    per controlnet block, to be added to the main FluxDiT double-stream
    hidden states; the second element is a placeholder for single-block
    outputs (not produced by this model).
    """

    converter = FluxControlNetStateDictConverter()

    def __init__(
        self,
        condition_channels: int = 64,
        attn_impl: Optional[str] = None,
        device: str = "cuda:0",
        dtype: torch.dtype = torch.bfloat16,
    ):
        super().__init__()
        self.pos_embedder = RoPEEmbedding(3072, 10000, [16, 56, 56])
        self.time_embedder = TimestepEmbeddings(256, 3072, device=device, dtype=dtype)
        self.guidance_embedder = TimestepEmbeddings(256, 3072, device=device, dtype=dtype)
        self.pooled_text_embedder = nn.Sequential(
            nn.Linear(768, 3072, device=device, dtype=dtype),
            nn.SiLU(),
            nn.Linear(3072, 3072, device=device, dtype=dtype),
        )
        self.context_embedder = nn.Linear(4096, 3072, device=device, dtype=dtype)
        self.x_embedder = nn.Linear(64, 3072, device=device, dtype=dtype)
        # device/dtype were missing on this layer only; without them it was
        # fully initialized in the default dtype on the default device (even
        # under skip_init) and converted later by .to() — place it like its
        # siblings instead.
        self.controlnet_x_embedder = nn.Linear(condition_channels, 3072, device=device, dtype=dtype)
        self.blocks = nn.ModuleList(
            [FluxJointTransformerBlock(3072, 24, attn_impl=attn_impl, device=device, dtype=dtype) for _ in range(6)]
        )
        # One output projection per controlnet block.
        self.blocks_proj = nn.ModuleList(
            [nn.Linear(3072, 3072, device=device, dtype=dtype) for _ in range(len(self.blocks))]
        )

    def patchify(self, hidden_states):
        # [B, C, 2H, 2W] -> [B, H*W, 4C]: each 2x2 patch becomes one token.
        hidden_states = rearrange(hidden_states, "B C (H P) (W Q) -> B (H W) (C P Q)", P=2, Q=2)
        return hidden_states

    def forward(
        self,
        hidden_states,
        control_condition,
        control_scale,
        timestep,
        prompt_emb,
        pooled_prompt_emb,
        guidance,
        image_ids,
        text_ids,
    ):
        """Run the controlnet blocks; returns (double_block_outputs, None)."""
        hidden_states = self.patchify(hidden_states)
        control_condition = self.patchify(control_condition)
        # Inject the control signal additively at the embedding stage.
        hidden_states = self.x_embedder(hidden_states) + self.controlnet_x_embedder(control_condition)
        condition = (
            self.time_embedder(timestep, hidden_states.dtype)
            + self.guidance_embedder(guidance * 1000, hidden_states.dtype)
            + self.pooled_text_embedder(pooled_prompt_emb)
        )
        prompt_emb = self.context_embedder(prompt_emb)
        # RoPE positions cover text tokens followed by image tokens.
        image_rotary_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))

        # double block
        double_block_outputs = []
        for i, block in enumerate(self.blocks):
            hidden_states, prompt_emb = block(hidden_states, prompt_emb, condition, image_rotary_emb)
            double_block_outputs.append(self.blocks_proj[i](hidden_states))

        # apply control scale
        double_block_outputs = [control_scale * output for output in double_block_outputs]
        return double_block_outputs, None

    @classmethod
    def from_state_dict(
        cls,
        state_dict: Dict[str, torch.Tensor],
        device: str,
        dtype: torch.dtype,
        attn_impl: Optional[str] = None,
    ):
        """Build the model without weight init and load the given state dict."""
        # Infer the conditioning width from the checkpoint when present.
        if "controlnet_x_embedder.weight" in state_dict:
            condition_channels = state_dict["controlnet_x_embedder.weight"].shape[1]
        else:
            condition_channels = 64

        with no_init_weights():
            model = torch.nn.utils.skip_init(
                cls, condition_channels=condition_channels, attn_impl=attn_impl, device=device, dtype=dtype
            )
        model.load_state_dict(state_dict)
        model.to(device=device, dtype=dtype, non_blocking=True)
        return model
@@ -1,6 +1,7 @@
1
1
  import json
2
2
  import torch
3
3
  import torch.nn as nn
4
+ import numpy as np
4
5
  from typing import Dict, Optional
5
6
  from einops import rearrange
6
7
 
@@ -327,7 +328,6 @@ class FluxDiT(PreTrainedModel):
327
328
 
328
329
  def __init__(
329
330
  self,
330
- disable_guidance_embedder=False,
331
331
  attn_impl: Optional[str] = None,
332
332
  device: str = "cuda:0",
333
333
  dtype: torch.dtype = torch.bfloat16,
@@ -335,9 +335,7 @@ class FluxDiT(PreTrainedModel):
335
335
  super().__init__()
336
336
  self.pos_embedder = RoPEEmbedding(3072, 10000, [16, 56, 56])
337
337
  self.time_embedder = TimestepEmbeddings(256, 3072, device=device, dtype=dtype)
338
- self.guidance_embedder = (
339
- None if disable_guidance_embedder else TimestepEmbeddings(256, 3072, device=device, dtype=dtype)
340
- )
338
+ self.guidance_embedder = TimestepEmbeddings(256, 3072, device=device, dtype=dtype)
341
339
  self.pooled_text_embedder = nn.Sequential(
342
340
  nn.Linear(768, 3072, device=device, dtype=dtype),
343
341
  nn.SiLU(),
@@ -392,6 +390,8 @@ class FluxDiT(PreTrainedModel):
392
390
  text_ids,
393
391
  image_ids=None,
394
392
  use_gradient_checkpointing=False,
393
+ controlnet_double_block_output=None,
394
+ controlnet_single_block_output=None,
395
395
  **kwargs,
396
396
  ):
397
397
  fp8_linear_enabled = getattr(self, "fp8_linear_enabled", False)
@@ -413,16 +413,10 @@ class FluxDiT(PreTrainedModel):
413
413
  hidden_states = self.patchify(hidden_states)
414
414
  hidden_states = self.x_embedder(hidden_states)
415
415
 
416
- def create_custom_forward(module):
417
- def custom_forward(*inputs):
418
- return module(*inputs)
419
-
420
- return custom_forward
421
-
422
- for block in self.blocks:
416
+ for i, block in enumerate(self.blocks):
423
417
  if self.training and use_gradient_checkpointing:
424
418
  hidden_states, prompt_emb = torch.utils.checkpoint.checkpoint(
425
- create_custom_forward(block),
419
+ block,
426
420
  hidden_states,
427
421
  prompt_emb,
428
422
  conditioning,
@@ -431,12 +425,16 @@ class FluxDiT(PreTrainedModel):
431
425
  )
432
426
  else:
433
427
  hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb)
428
+ if controlnet_double_block_output is not None:
429
+ interval_control = len(self.blocks) / len(controlnet_double_block_output)
430
+ interval_control = int(np.ceil(interval_control))
431
+ hidden_states = hidden_states + controlnet_double_block_output[i // interval_control]
434
432
 
435
433
  hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
436
434
  for block in self.single_blocks:
437
435
  if self.training and use_gradient_checkpointing:
438
436
  hidden_states, prompt_emb = torch.utils.checkpoint.checkpoint(
439
- create_custom_forward(block),
437
+ block,
440
438
  hidden_states,
441
439
  prompt_emb,
442
440
  conditioning,
@@ -445,12 +443,15 @@ class FluxDiT(PreTrainedModel):
445
443
  )
446
444
  else:
447
445
  hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb)
448
- hidden_states = hidden_states[:, prompt_emb.shape[1] :]
446
+ if controlnet_single_block_output is not None:
447
+ interval_control = len(self.single_blocks) / len(controlnet_double_block_output)
448
+ interval_control = int(np.ceil(interval_control))
449
+ hidden_states = hidden_states + controlnet_single_block_output[i // interval_control]
449
450
 
451
+ hidden_states = hidden_states[:, prompt_emb.shape[1] :]
450
452
  hidden_states = self.final_norm_out(hidden_states, conditioning)
451
453
  hidden_states = self.final_proj_out(hidden_states)
452
454
  hidden_states = self.unpatchify(hidden_states, height, width)
453
-
454
455
  return hidden_states
455
456
 
456
457
  @classmethod
@@ -459,7 +460,6 @@ class FluxDiT(PreTrainedModel):
459
460
  state_dict: Dict[str, torch.Tensor],
460
461
  device: str,
461
462
  dtype: torch.dtype,
462
- disable_guidance_embedder: bool = False,
463
463
  attn_impl: Optional[str] = None,
464
464
  ):
465
465
  with no_init_weights():
@@ -467,7 +467,6 @@ class FluxDiT(PreTrainedModel):
467
467
  cls,
468
468
  device=device,
469
469
  dtype=dtype,
470
- disable_guidance_embedder=disable_guidance_embedder,
471
470
  attn_impl=attn_impl,
472
471
  )
473
472
  model = model.requires_grad_(False) # for loading gguf
@@ -268,16 +268,10 @@ class SD3DiT(PreTrainedModel):
268
268
  height, width = hidden_states.shape[-2:]
269
269
  hidden_states = self.pos_embedder(hidden_states)
270
270
 
271
- def create_custom_forward(module):
272
- def custom_forward(*inputs):
273
- return module(*inputs)
274
-
275
- return custom_forward
276
-
277
271
  for block in self.blocks:
278
272
  if self.training and use_gradient_checkpointing:
279
273
  hidden_states, prompt_emb = torch.utils.checkpoint.checkpoint(
280
- create_custom_forward(block),
274
+ block,
281
275
  hidden_states,
282
276
  prompt_emb,
283
277
  conditioning,
@@ -260,12 +260,6 @@ class SDXLUNet(PreTrainedModel):
260
260
  res_stack = [hidden_states]
261
261
 
262
262
  # 3. blocks
263
- def create_custom_forward(module):
264
- def custom_forward(*inputs):
265
- return module(*inputs)
266
-
267
- return custom_forward
268
-
269
263
  for i, block in enumerate(self.blocks):
270
264
  if (
271
265
  self.training
@@ -273,7 +267,7 @@ class SDXLUNet(PreTrainedModel):
273
267
  and not (isinstance(block, PushBlock) or isinstance(block, PopBlock))
274
268
  ):
275
269
  hidden_states, time_emb, text_emb, res_stack = torch.utils.checkpoint.checkpoint(
276
- create_custom_forward(block),
270
+ block,
277
271
  hidden_states,
278
272
  time_emb,
279
273
  text_emb,
@@ -166,6 +166,7 @@ class CrossAttention(nn.Module):
166
166
  if self.has_image_input:
167
167
  k_img, v_img = self.norm_k_img(self.k_img(img)), self.v_img(img)
168
168
  k_img = rearrange(k_img, "b s (n d) -> b s n d", n=num_heads)
169
+ v_img = rearrange(v_img, "b s (n d) -> b s n d", n=num_heads)
169
170
  y = attention(q, k_img, v_img, attn_impl=self.attn_impl).flatten(2)
170
171
  x = x + y
171
172
  return self.o(x)
@@ -1,5 +1,5 @@
1
1
  from .base import BasePipeline, LoRAStateDictConverter
2
- from .flux_image import FluxImagePipeline, FluxModelConfig
2
+ from .flux_image import FluxImagePipeline, FluxModelConfig, ControlNetParams
3
3
  from .sdxl_image import SDXLImagePipeline, SDXLModelConfig
4
4
  from .sd_image import SDImagePipeline, SDModelConfig
5
5
  from .wan_video import WanVideoPipeline, WanModelConfig
@@ -15,4 +15,5 @@ __all__ = [
15
15
  "SDModelConfig",
16
16
  "WanVideoPipeline",
17
17
  "WanModelConfig",
18
+ "ControlNetParams",
18
19
  ]