diffsynth-engine 0.5.1.dev4__py3-none-any.whl → 0.6.1.dev25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffsynth_engine/__init__.py +12 -0
- diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +19 -0
- diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +22 -6
- diffsynth_engine/conf/models/flux/flux_dit.json +20 -1
- diffsynth_engine/conf/models/flux/flux_vae.json +253 -5
- diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json +41 -0
- diffsynth_engine/configs/__init__.py +16 -1
- diffsynth_engine/configs/controlnet.py +13 -0
- diffsynth_engine/configs/pipeline.py +37 -11
- diffsynth_engine/models/base.py +1 -1
- diffsynth_engine/models/basic/attention.py +105 -43
- diffsynth_engine/models/basic/transformer_helper.py +36 -2
- diffsynth_engine/models/basic/video_sparse_attention.py +238 -0
- diffsynth_engine/models/flux/flux_controlnet.py +16 -30
- diffsynth_engine/models/flux/flux_dit.py +49 -62
- diffsynth_engine/models/flux/flux_dit_fbcache.py +26 -28
- diffsynth_engine/models/flux/flux_ipadapter.py +5 -5
- diffsynth_engine/models/flux/flux_text_encoder.py +1 -1
- diffsynth_engine/models/flux/flux_vae.py +20 -2
- diffsynth_engine/models/hunyuan3d/dino_image_encoder.py +4 -2
- diffsynth_engine/models/qwen_image/qwen2_5_vl.py +5 -0
- diffsynth_engine/models/qwen_image/qwen_image_dit.py +151 -58
- diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py +14 -6
- diffsynth_engine/models/qwen_image/qwen_image_vae.py +1 -1
- diffsynth_engine/models/sd/sd_text_encoder.py +1 -1
- diffsynth_engine/models/sd/sd_unet.py +1 -1
- diffsynth_engine/models/sd3/sd3_dit.py +1 -1
- diffsynth_engine/models/sd3/sd3_text_encoder.py +1 -1
- diffsynth_engine/models/sdxl/sdxl_text_encoder.py +1 -1
- diffsynth_engine/models/sdxl/sdxl_unet.py +1 -1
- diffsynth_engine/models/vae/vae.py +1 -1
- diffsynth_engine/models/wan/wan_audio_encoder.py +6 -3
- diffsynth_engine/models/wan/wan_dit.py +65 -28
- diffsynth_engine/models/wan/wan_s2v_dit.py +1 -1
- diffsynth_engine/models/wan/wan_text_encoder.py +13 -13
- diffsynth_engine/models/wan/wan_vae.py +2 -2
- diffsynth_engine/pipelines/base.py +73 -7
- diffsynth_engine/pipelines/flux_image.py +139 -120
- diffsynth_engine/pipelines/hunyuan3d_shape.py +4 -0
- diffsynth_engine/pipelines/qwen_image.py +272 -87
- diffsynth_engine/pipelines/sdxl_image.py +1 -1
- diffsynth_engine/pipelines/utils.py +52 -0
- diffsynth_engine/pipelines/wan_s2v.py +25 -14
- diffsynth_engine/pipelines/wan_video.py +43 -19
- diffsynth_engine/tokenizers/base.py +6 -0
- diffsynth_engine/tokenizers/qwen2.py +12 -4
- diffsynth_engine/utils/constants.py +13 -12
- diffsynth_engine/utils/download.py +4 -2
- diffsynth_engine/utils/env.py +2 -0
- diffsynth_engine/utils/flag.py +6 -0
- diffsynth_engine/utils/loader.py +25 -6
- diffsynth_engine/utils/parallel.py +62 -29
- diffsynth_engine/utils/video.py +3 -1
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/METADATA +1 -1
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/RECORD +69 -67
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-flf2v-14b.json → wan2.1_flf2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-i2v-14b.json → wan2.1_i2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-1.3b.json → wan2.1_t2v_1.3b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-14b.json → wan2.1_t2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-i2v-a14b.json → wan2.2_i2v_a14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-s2v-14b.json → wan2.2_s2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-t2v-a14b.json → wan2.2_t2v_a14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-ti2v-5b.json → wan2.2_ti2v_5b.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan2.1-vae.json → wan2.1_vae.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan2.2-vae.json → wan2.2_vae.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan-vae-keymap.json → wan_vae_keymap.json} +0 -0
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/WHEEL +0 -0
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/licenses/LICENSE +0 -0
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/top_level.txt +0 -0
diffsynth_engine/models/basic/attention.py

@@ -12,19 +12,15 @@ from diffsynth_engine.utils.flag import (
     SDPA_AVAILABLE,
     SAGE_ATTN_AVAILABLE,
     SPARGE_ATTN_AVAILABLE,
+    VIDEO_SPARSE_ATTN_AVAILABLE,
 )
+from diffsynth_engine.utils.platform import DTYPE_FP8
 
 FA3_MAX_HEADDIM = 256
 
 logger = logging.get_logger(__name__)
 
 
-def memory_align(x: torch.Tensor, dim=-1, alignment: int = 8):
-    padding_size = (alignment - x.shape[dim] % alignment) % alignment
-    padded_x = F.pad(x, (0, padding_size), "constant", 0)
-    return padded_x[..., : x.shape[dim]]
-
-
 if FLASH_ATTN_3_AVAILABLE:
     from flash_attn_interface import flash_attn_func as flash_attn3
 if FLASH_ATTN_2_AVAILABLE:

@@ -32,6 +28,11 @@ if FLASH_ATTN_2_AVAILABLE:
 if XFORMERS_AVAILABLE:
     from xformers.ops import memory_efficient_attention
 
+def memory_align(x: torch.Tensor, dim=-1, alignment: int = 8):
+    padding_size = (alignment - x.shape[dim] % alignment) % alignment
+    padded_x = F.pad(x, (0, padding_size), "constant", 0)
+    return padded_x[..., : x.shape[dim]]
+
 def xformers_attn(q, k, v, attn_mask=None, scale=None):
     if attn_mask is not None:
         if attn_mask.ndim == 2:
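For context, `memory_align` (relocated below the backend imports, behavior unchanged) pads the last dimension up to a multiple of `alignment` and then slices back to the original width, so the result keeps the original shape but sits in alignment-friendly padded storage. A quick self-contained check (the function body is copied from the hunk above; the shapes are illustrative):

```python
import torch
import torch.nn.functional as F

def memory_align(x: torch.Tensor, dim=-1, alignment: int = 8):
    padding_size = (alignment - x.shape[dim] % alignment) % alignment
    padded_x = F.pad(x, (0, padding_size), "constant", 0)
    return padded_x[..., : x.shape[dim]]

x = torch.randn(2, 5, 70)         # last dim 70 is not a multiple of 8
y = memory_align(x)
print(y.shape)                    # torch.Size([2, 5, 70]) -- logical shape unchanged
print(x.stride(), y.stride())     # (350, 70, 1) vs (360, 72, 1): rows now start on 8-element boundaries
print(torch.equal(x, y))          # True -- values are untouched
```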
@@ -93,6 +94,13 @@ if SPARGE_ATTN_AVAILABLE:
     return out.transpose(1, 2)
 
 
+if VIDEO_SPARSE_ATTN_AVAILABLE:
+    from diffsynth_engine.models.basic.video_sparse_attention import (
+        video_sparse_attn,
+        distributed_video_sparse_attn,
+    )
+
+
 def eager_attn(q, k, v, attn_mask=None, scale=None):
     q = q.transpose(1, 2)
     k = k.transpose(1, 2)

@@ -108,9 +116,10 @@ def eager_attn(q, k, v, attn_mask=None, scale=None):
 
 
 def attention(
-    q,
-    k,
-    v,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: Optional[torch.Tensor] = None,
     attn_impl: Optional[str] = "auto",
     attn_mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,

@@ -125,12 +134,14 @@ def attention(
         None,
         "auto",
         "eager",
-        "
-        "
+        "fa2",
+        "fa3",
+        "fa3_fp8",
         "xformers",
         "sdpa",
-        "
-        "
+        "sage",
+        "sparge",
+        "vsa",
     ]
     flash_attn3_compatible = q.shape[-1] <= FA3_MAX_HEADDIM
     if attn_impl is None or attn_impl == "auto":

@@ -139,9 +150,13 @@
             return flash_attn3(q, k, v, softmax_scale=scale)
         else:
             if not flash_attn3_compatible:
-                logger.warning(
+                logger.warning(
+                    f"head_dim={q.shape[-1]}, but flash_attn_3 only supports head dimension at most {FA3_MAX_HEADDIM}, will use fallback attention implementation"
+                )
             else:
-                logger.debug(
+                logger.debug(
+                    "flash_attn_3 does not support attention mask, will use fallback attention implementation"
+                )
     if XFORMERS_AVAILABLE:
         return xformers_attn(q, k, v, attn_mask=attn_mask, scale=scale)
     if SDPA_AVAILABLE:

@@ -152,33 +167,55 @@
     else:
         if attn_impl == "eager":
             return eager_attn(q, k, v, attn_mask=attn_mask, scale=scale)
-        if attn_impl == "
+        if attn_impl == "fa3" or attn_impl == "fa3_fp8":
             if not flash_attn3_compatible:
                 raise RuntimeError(
                     f"head_dim={q.shape[-1]}, but flash_attn_3 only supports head dimension at most {FA3_MAX_HEADDIM}"
                 )
             if attn_mask is not None:
                 raise RuntimeError("flash_attn_3 does not support attention mask")
-
-
+            if attn_impl == "fa3":
+                return flash_attn3(q, k, v, softmax_scale=scale)
+            else:
+                origin_dtype = q.dtype
+                q = q.to(dtype=DTYPE_FP8)
+                k = k.to(dtype=DTYPE_FP8)
+                v = v.to(dtype=DTYPE_FP8)
+                out = flash_attn3(q, k, v, softmax_scale=scale)
+                return out.to(dtype=origin_dtype)
+        if attn_impl == "fa2":
             return flash_attn2(q, k, v, softmax_scale=scale)
         if attn_impl == "xformers":
             return xformers_attn(q, k, v, attn_mask=attn_mask, scale=scale)
         if attn_impl == "sdpa":
             return sdpa_attn(q, k, v, attn_mask=attn_mask, scale=scale)
-        if attn_impl == "
+        if attn_impl == "sage":
             return sage_attn(q, k, v, attn_mask=attn_mask, scale=scale)
-        if attn_impl == "
+        if attn_impl == "sparge":
             return sparge_attn(
                 q,
                 k,
                 v,
                 attn_mask=attn_mask,
                 scale=scale,
-                smooth_k=kwargs.get("
-                simthreshd1=kwargs.get("
-                cdfthreshd=kwargs.get("
-                pvthreshd=kwargs.get("
+                smooth_k=kwargs.get("smooth_k", True),
+                simthreshd1=kwargs.get("simthreshd1", 0.6),
+                cdfthreshd=kwargs.get("cdfthreshd", 0.98),
+                pvthreshd=kwargs.get("pvthreshd", 50),
+            )
+        if attn_impl == "vsa":
+            return video_sparse_attn(
+                q,
+                k,
+                v,
+                g,
+                sparsity=kwargs.get("sparsity"),
+                num_tiles=kwargs.get("num_tiles"),
+                total_seq_length=kwargs.get("total_seq_length"),
+                tile_partition_indices=kwargs.get("tile_partition_indices"),
+                reverse_tile_partition_indices=kwargs.get("reverse_tile_partition_indices"),
+                variable_block_sizes=kwargs.get("variable_block_sizes"),
+                non_pad_index=kwargs.get("non_pad_index"),
             )
     raise ValueError(f"Invalid attention implementation: {attn_impl}")
 
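The new `fa3_fp8` branch is a cast-and-restore wrapper around FlashAttention-3: inputs are downcast to `DTYPE_FP8` for the kernel call and the output is cast back to the caller's dtype. A minimal sketch of the same pattern, assuming `torch.float8_e4m3fn` as the FP8 dtype and using a toy SDPA-based stand-in for the real `flash_attn3` kernel (the helper names here are illustrative, not part of the library):

```python
import torch
import torch.nn.functional as F


def fp8_cast_attention(q, k, v, attn_fn, scale=None, fp8_dtype=torch.float8_e4m3fn):
    """Cast-and-restore wrapper, mirroring the new fa3_fp8 branch above."""
    origin_dtype = q.dtype                                 # remember the caller's dtype (e.g. bf16)
    q, k, v = (t.to(dtype=fp8_dtype) for t in (q, k, v))   # FP8 inputs for the kernel
    out = attn_fn(q, k, v, softmax_scale=scale)
    return out.to(dtype=origin_dtype)                      # hand activations back in the original dtype


def _toy_kernel(q, k, v, softmax_scale=None):
    """Stand-in for flash_attn3: upcasts and runs SDPA so the sketch runs anywhere."""
    q, k, v = (t.to(torch.bfloat16).transpose(1, 2) for t in (q, k, v))
    return F.scaled_dot_product_attention(q, k, v, scale=softmax_scale).transpose(1, 2)


q = torch.randn(1, 16, 4, 64, dtype=torch.bfloat16)  # [batch, seq, heads, head_dim]
out = fp8_cast_attention(q, q, q, _toy_kernel)
assert out.dtype == torch.bfloat16
```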
@@ -228,9 +265,10 @@ class Attention(nn.Module):
 
 
 def long_context_attention(
-    q,
-    k,
-    v,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: Optional[torch.Tensor] = None,
     attn_impl: Optional[str] = None,
     attn_mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,

@@ -247,12 +285,15 @@ def long_context_attention(
     assert attn_impl in [
         None,
         "auto",
-        "
-        "
+        "fa2",
+        "fa3",
+        "fa3_fp8",
         "sdpa",
-        "
-        "
+        "sage",
+        "sparge",
+        "vsa",
     ]
+    assert attn_mask is None, "long context attention does not support attention mask"
     flash_attn3_compatible = q.shape[-1] <= FA3_MAX_HEADDIM
     if attn_impl is None or attn_impl == "auto":
         if FLASH_ATTN_3_AVAILABLE:

@@ -268,27 +309,48 @@ def long_context_attention(
             return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
         raise ValueError("No available long context attention implementation")
     else:
-        if attn_impl == "
-            if flash_attn3_compatible:
-                return LongContextAttention(attn_type=AttnType.FA3)(q, k, v, softmax_scale=scale)
-            else:
+        if attn_impl == "fa3" or attn_impl == "fa3_fp8":
+            if not flash_attn3_compatible:
                 raise RuntimeError(
                     f"head_dim={q.shape[-1]}, but flash_attn_3 only supports head dimension at most {FA3_MAX_HEADDIM}"
                 )
-
+            if attn_impl == "fa3":
+                return LongContextAttention(attn_type=AttnType.FA3)(q, k, v, softmax_scale=scale)
+
+            origin_dtype = q.dtype
+            q = q.to(dtype=DTYPE_FP8)
+            k = k.to(dtype=DTYPE_FP8)
+            v = v.to(dtype=DTYPE_FP8)
+            out = LongContextAttention(attn_type=AttnType.FA3)(q, k, v, softmax_scale=scale)
+            return out.to(dtype=origin_dtype)
+        if attn_impl == "fa2":
             return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
         if attn_impl == "sdpa":
             return LongContextAttention(attn_type=AttnType.TORCH)(q, k, v, softmax_scale=scale)
-        if attn_impl == "
-            return LongContextAttention(attn_type=AttnType.
-        if attn_impl == "
+        if attn_impl == "sage":
+            return LongContextAttention(attn_type=AttnType.SAGE_AUTO)(q, k, v, softmax_scale=scale)
+        if attn_impl == "sparge":
             attn_processor = SparseAttentionMeansim()
             # default args from spas_sage2_attn_meansim_cuda
-            attn_processor.smooth_k = torch.tensor(kwargs.get("
-            attn_processor.simthreshd1 = torch.tensor(kwargs.get("
-            attn_processor.cdfthreshd = torch.tensor(kwargs.get("
-            attn_processor.pvthreshd = torch.tensor(kwargs.get("
+            attn_processor.smooth_k = torch.tensor(kwargs.get("smooth_k", True))
+            attn_processor.simthreshd1 = torch.tensor(kwargs.get("simthreshd1", 0.6))
+            attn_processor.cdfthreshd = torch.tensor(kwargs.get("cdfthreshd", 0.98))
+            attn_processor.pvthreshd = torch.tensor(kwargs.get("pvthreshd", 50))
             return LongContextAttention(attn_type=AttnType.SPARSE_SAGE, attn_processor=attn_processor)(
                 q, k, v, softmax_scale=scale
             )
+        if attn_impl == "vsa":
+            return distributed_video_sparse_attn(
+                q,
+                k,
+                v,
+                g,
+                sparsity=kwargs.get("sparsity"),
+                num_tiles=kwargs.get("num_tiles"),
+                total_seq_length=kwargs.get("total_seq_length"),
+                tile_partition_indices=kwargs.get("tile_partition_indices"),
+                reverse_tile_partition_indices=kwargs.get("reverse_tile_partition_indices"),
+                variable_block_sizes=kwargs.get("variable_block_sizes"),
+                non_pad_index=kwargs.get("non_pad_index"),
+            )
     raise ValueError(f"Invalid long context attention implementation: {attn_impl}")
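Callers now select backends by the short names above and pass tuning parameters through `**kwargs`. A hedged usage sketch, assuming the `[batch, seq_len, num_heads, head_dim]` layout used by the kernels in this file, a CUDA device, and that the optional kernels (e.g. SpargeAttn) are installed:

```python
import torch
from diffsynth_engine.models.basic.attention import attention

# Assumed layout: [batch, seq_len, num_heads, head_dim]
q = torch.randn(1, 128, 8, 64, device="cuda", dtype=torch.bfloat16)
k = torch.randn_like(q)
v = torch.randn_like(q)

# "auto" picks the best available kernel; explicit choices are now
# "eager", "fa2", "fa3", "fa3_fp8", "xformers", "sdpa", "sage", "sparge", "vsa".
out = attention(q, k, v, attn_impl="sdpa")

# Sparse-attention tuning knobs travel through **kwargs.
out = attention(q, k, v, attn_impl="sparge", simthreshd1=0.6, cdfthreshd=0.98)
```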
diffsynth_engine/models/basic/transformer_helper.py

@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import math
 
 

@@ -91,8 +92,8 @@ class NewGELUActivation(nn.Module):
     the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """
 
-    def forward(self,
-        return 0.5 *
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
 
 
 class ApproximateGELU(nn.Module):

@@ -115,3 +116,36 @@ class ApproximateGELU(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
         return x * torch.sigmoid(1.702 * x)
+
+
+class GELU(nn.Module):
+    r"""
+    GELU activation function with tanh approximation support with `approximate="tanh"`.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+        approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(
+        self,
+        dim_in: int,
+        dim_out: int,
+        approximate: str = "none",
+        bias: bool = True,
+        device: str = "cuda:0",
+        dtype: torch.dtype = torch.float16,
+    ):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias, device=device, dtype=dtype)
+        self.approximate = approximate
+
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        return F.gelu(gate, approximate=self.approximate)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        x = self.gelu(x)
+        return x
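The added `GELU` module is a linear projection followed by `F.gelu`, with `approximate="tanh"` switching to the tanh approximation. A small usage sketch (the dimensions, CPU device, and float32 dtype are illustrative overrides of the defaults):

```python
import torch
from diffsynth_engine.models.basic.transformer_helper import GELU

# Project 3072 -> 12288 channels, then apply tanh-approximated GELU.
ff_in = GELU(dim_in=3072, dim_out=12288, approximate="tanh", device="cpu", dtype=torch.float32)
x = torch.randn(1, 16, 3072)
print(ff_in(x).shape)  # torch.Size([1, 16, 12288])
```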
diffsynth_engine/models/basic/video_sparse_attention.py

@@ -0,0 +1,238 @@
+import torch
+import math
+import functools
+
+from diffsynth_engine.utils.flag import VIDEO_SPARSE_ATTN_AVAILABLE
+from diffsynth_engine.utils.parallel import get_sp_ulysses_group, get_sp_ring_world_size
+
+if VIDEO_SPARSE_ATTN_AVAILABLE:
+    from vsa import video_sparse_attn as vsa_core
+
+VSA_TILE_SIZE = (4, 4, 4)
+
+
+@functools.lru_cache(maxsize=10)
+def get_tile_partition_indices(
+    dit_seq_shape: tuple[int, int, int],
+    tile_size: tuple[int, int, int],
+    device: torch.device,
+) -> torch.LongTensor:
+    T, H, W = dit_seq_shape
+    ts, hs, ws = tile_size
+    indices = torch.arange(T * H * W, device=device, dtype=torch.long).reshape(T, H, W)
+    ls = []
+    for t in range(math.ceil(T / ts)):
+        for h in range(math.ceil(H / hs)):
+            for w in range(math.ceil(W / ws)):
+                ls.append(
+                    indices[
+                        t * ts : min(t * ts + ts, T), h * hs : min(h * hs + hs, H), w * ws : min(w * ws + ws, W)
+                    ].flatten()
+                )
+    index = torch.cat(ls, dim=0)
+    return index
+
+
+@functools.lru_cache(maxsize=10)
+def get_reverse_tile_partition_indices(
+    dit_seq_shape: tuple[int, int, int],
+    tile_size: tuple[int, int, int],
+    device: torch.device,
+) -> torch.LongTensor:
+    return torch.argsort(get_tile_partition_indices(dit_seq_shape, tile_size, device))
+
+
+@functools.lru_cache(maxsize=10)
+def construct_variable_block_sizes(
+    dit_seq_shape: tuple[int, int, int],
+    num_tiles: tuple[int, int, int],
+    device: torch.device,
+) -> torch.LongTensor:
+    """
+    Compute the number of valid (non-padded) tokens inside every
+    (ts_t x ts_h x ts_w) tile after padding -- flattened in the order
+    (t-tile, h-tile, w-tile) that `rearrange` uses.
+
+    Returns
+    -------
+    torch.LongTensor  # shape: [∏ full_window_size]
+    """
+    # unpack
+    t, h, w = dit_seq_shape
+    ts_t, ts_h, ts_w = VSA_TILE_SIZE
+    n_t, n_h, n_w = num_tiles
+
+    def _sizes(dim_len: int, tile: int, n_tiles: int) -> torch.LongTensor:
+        """Vector with the size of each tile along one dimension."""
+        sizes = torch.full((n_tiles,), tile, dtype=torch.int, device=device)
+        # size of last (possibly partial) tile
+        remainder = dim_len - (n_tiles - 1) * tile
+        sizes[-1] = remainder if remainder > 0 else tile
+        return sizes
+
+    t_sizes = _sizes(t, ts_t, n_t)  # [n_t]
+    h_sizes = _sizes(h, ts_h, n_h)  # [n_h]
+    w_sizes = _sizes(w, ts_w, n_w)  # [n_w]
+
+    # broadcast‑multiply to get voxels per tile, then flatten
+    block_sizes = (
+        t_sizes[:, None, None]  # [n_t, 1, 1]
+        * h_sizes[None, :, None]  # [1, n_h, 1]
+        * w_sizes[None, None, :]  # [1, 1, n_w]
+    ).reshape(-1)  # [n_t * n_h * n_w]
+
+    return block_sizes
+
+
+@functools.lru_cache(maxsize=10)
+def get_non_pad_index(
+    variable_block_sizes: torch.LongTensor,
+    max_block_size: int,
+):
+    n_win = variable_block_sizes.shape[0]
+    device = variable_block_sizes.device
+    starts_pad = torch.arange(n_win, device=device) * max_block_size
+    index_pad = starts_pad[:, None] + torch.arange(max_block_size, device=device)[None, :]
+    index_mask = torch.arange(max_block_size, device=device)[None, :] < variable_block_sizes[:, None]
+    return index_pad[index_mask]
+
+
+def get_vsa_kwargs(
+    latent_shape: tuple[int, int, int],
+    patch_size: tuple[int, int, int],
+    sparsity: float,
+    device: torch.device,
+):
+    dit_seq_shape = (
+        latent_shape[0] // patch_size[0],
+        latent_shape[1] // patch_size[1],
+        latent_shape[2] // patch_size[2],
+    )
+
+    num_tiles = (
+        math.ceil(dit_seq_shape[0] / VSA_TILE_SIZE[0]),
+        math.ceil(dit_seq_shape[1] / VSA_TILE_SIZE[1]),
+        math.ceil(dit_seq_shape[2] / VSA_TILE_SIZE[2]),
+    )
+    total_seq_length = math.prod(dit_seq_shape)
+
+    tile_partition_indices = get_tile_partition_indices(dit_seq_shape, VSA_TILE_SIZE, device)
+    reverse_tile_partition_indices = get_reverse_tile_partition_indices(dit_seq_shape, VSA_TILE_SIZE, device)
+    variable_block_sizes = construct_variable_block_sizes(dit_seq_shape, num_tiles, device)
+    non_pad_index = get_non_pad_index(variable_block_sizes, math.prod(VSA_TILE_SIZE))
+
+    return {
+        "sparsity": sparsity,
+        "num_tiles": num_tiles,
+        "total_seq_length": total_seq_length,
+        "tile_partition_indices": tile_partition_indices,
+        "reverse_tile_partition_indices": reverse_tile_partition_indices,
+        "variable_block_sizes": variable_block_sizes,
+        "non_pad_index": non_pad_index,
+    }
+
+
+def tile(
+    x: torch.Tensor,
+    num_tiles: tuple[int, int, int],
+    tile_partition_indices: torch.LongTensor,
+    non_pad_index: torch.LongTensor,
+) -> torch.Tensor:
+    t_padded_size = num_tiles[0] * VSA_TILE_SIZE[0]
+    h_padded_size = num_tiles[1] * VSA_TILE_SIZE[1]
+    w_padded_size = num_tiles[2] * VSA_TILE_SIZE[2]
+
+    x_padded = torch.zeros(
+        (x.shape[0], t_padded_size * h_padded_size * w_padded_size, x.shape[-2], x.shape[-1]),
+        device=x.device,
+        dtype=x.dtype,
+    )
+    x_padded[:, non_pad_index] = x[:, tile_partition_indices]
+    return x_padded
+
+
+def untile(
+    x: torch.Tensor, reverse_tile_partition_indices: torch.LongTensor, non_pad_index: torch.LongTensor
+) -> torch.Tensor:
+    x = x[:, non_pad_index][:, reverse_tile_partition_indices]
+    return x
+
+
+def video_sparse_attn(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    sparsity: float,
+    num_tiles: tuple[int, int, int],
+    total_seq_length: int,
+    tile_partition_indices: torch.LongTensor,
+    reverse_tile_partition_indices: torch.LongTensor,
+    variable_block_sizes: torch.LongTensor,
+    non_pad_index: torch.LongTensor,
+):
+    q = tile(q, num_tiles, tile_partition_indices, non_pad_index)
+    k = tile(k, num_tiles, tile_partition_indices, non_pad_index)
+    v = tile(v, num_tiles, tile_partition_indices, non_pad_index)
+    g = tile(g, num_tiles, tile_partition_indices, non_pad_index)
+
+    q = q.transpose(1, 2).contiguous()
+    k = k.transpose(1, 2).contiguous()
+    v = v.transpose(1, 2).contiguous()
+    g = g.transpose(1, 2).contiguous()
+
+    topk = math.ceil((1 - sparsity) * (total_seq_length / math.prod(VSA_TILE_SIZE)))
+    out = vsa_core(
+        q,
+        k,
+        v,
+        variable_block_sizes=variable_block_sizes,
+        topk=topk,
+        block_size=VSA_TILE_SIZE,
+        compress_attn_weight=g,
+    ).transpose(1, 2)
+    out = untile(out, reverse_tile_partition_indices, non_pad_index)
+    return out
+
+
+def distributed_video_sparse_attn(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    sparsity: float,
+    num_tiles: tuple[int, int, int],
+    total_seq_length: int,
+    tile_partition_indices: torch.LongTensor,
+    reverse_tile_partition_indices: torch.LongTensor,
+    variable_block_sizes: torch.LongTensor,
+    non_pad_index: torch.LongTensor,
+    scatter_idx: int = 2,
+    gather_idx: int = 1,
+):
+    from yunchang.comm.all_to_all import SeqAllToAll4D
+
+    assert get_sp_ring_world_size() == 1, "distributed video sparse attention requires ring degree to be 1"
+    sp_ulysses_group = get_sp_ulysses_group()
+
+    q = SeqAllToAll4D.apply(sp_ulysses_group, q, scatter_idx, gather_idx)
+    k = SeqAllToAll4D.apply(sp_ulysses_group, k, scatter_idx, gather_idx)
+    v = SeqAllToAll4D.apply(sp_ulysses_group, v, scatter_idx, gather_idx)
+    g = SeqAllToAll4D.apply(sp_ulysses_group, g, scatter_idx, gather_idx)
+
+    out = video_sparse_attn(
+        q,
+        k,
+        v,
+        g,
+        sparsity,
+        num_tiles,
+        total_seq_length,
+        tile_partition_indices,
+        reverse_tile_partition_indices,
+        variable_block_sizes,
+        non_pad_index,
+    )
+
+    out = SeqAllToAll4D.apply(sp_ulysses_group, out, gather_idx, scatter_idx)
+    return out
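Tying the new module to the attention entry points above: `get_vsa_kwargs` precomputes the tile bookkeeping for a latent grid once, and the resulting dict is what the `attn_impl="vsa"` branches forward into `video_sparse_attn`. A hedged sketch (latent/patch sizes, head count, and the random gate tensor `g` are illustrative; it needs a CUDA device and the `vsa` kernel package installed):

```python
import torch
from diffsynth_engine.models.basic.attention import attention
from diffsynth_engine.models.basic.video_sparse_attention import get_vsa_kwargs

device = torch.device("cuda")
latent_shape, patch_size = (16, 60, 104), (1, 2, 2)      # illustrative Wan-style sizes
vsa_kwargs = get_vsa_kwargs(latent_shape, patch_size, sparsity=0.9, device=device)

seq_len = vsa_kwargs["total_seq_length"]                 # 16 * 30 * 52 tokens after patchify
q = torch.randn(1, seq_len, 12, 128, device=device, dtype=torch.bfloat16)
k, v, g = torch.randn_like(q), torch.randn_like(q), torch.randn_like(q)

# g feeds vsa_core's compress_attn_weight; the tiling metadata rides along as kwargs.
out = attention(q, k, v, g, attn_impl="vsa", **vsa_kwargs)
```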
diffsynth_engine/models/flux/flux_controlnet.py

@@ -86,7 +86,6 @@ class FluxControlNet(PreTrainedModel):
     def __init__(
         self,
         condition_channels: int = 64,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):

@@ -103,10 +102,7 @@ class FluxControlNet(PreTrainedModel):
         self.x_embedder = nn.Linear(64, 3072, device=device, dtype=dtype)
         self.controlnet_x_embedder = nn.Linear(condition_channels, 3072)
         self.blocks = nn.ModuleList(
-            [
-                FluxDoubleTransformerBlock(3072, 24, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
-                for _ in range(6)
-            ]
+            [FluxDoubleTransformerBlock(3072, 24, device=device, dtype=dtype) for _ in range(6)]
         )
         # controlnet projection
         self.blocks_proj = nn.ModuleList(

@@ -119,18 +115,17 @@ class FluxControlNet(PreTrainedModel):
 
     def forward(
         self,
-        hidden_states,
-        control_condition,
-        control_scale,
-        timestep,
-        prompt_emb,
-        pooled_prompt_emb,
-
-
-
+        hidden_states: torch.Tensor,
+        control_condition: torch.Tensor,
+        control_scale: float,
+        timestep: torch.Tensor,
+        prompt_emb: torch.Tensor,
+        pooled_prompt_emb: torch.Tensor,
+        image_ids: torch.Tensor,
+        text_ids: torch.Tensor,
+        guidance: torch.Tensor,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
-        hidden_states = self.patchify(hidden_states)
-        control_condition = self.patchify(control_condition)
         hidden_states = self.x_embedder(hidden_states) + self.controlnet_x_embedder(control_condition)
         condition = (
             self.time_embedder(timestep, hidden_states.dtype)

@@ -143,7 +138,9 @@ class FluxControlNet(PreTrainedModel):
         # double block
         double_block_outputs = []
         for i, block in enumerate(self.blocks):
-            hidden_states, prompt_emb = block(
+            hidden_states, prompt_emb = block(
+                hidden_states, prompt_emb, condition, image_rotary_emb, attn_kwargs=attn_kwargs
+            )
             double_block_outputs.append(self.blocks_proj[i](hidden_states))
 
         # apply control scale

@@ -151,24 +148,13 @@ class FluxControlNet(PreTrainedModel):
         return double_block_outputs, None
 
     @classmethod
-    def from_state_dict(
-        cls,
-        state_dict: Dict[str, torch.Tensor],
-        device: str,
-        dtype: torch.dtype,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
-    ):
+    def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype):
         if "controlnet_x_embedder.weight" in state_dict:
             condition_channels = state_dict["controlnet_x_embedder.weight"].shape[1]
         else:
             condition_channels = 64
 
-        model = cls(
-            condition_channels=condition_channels,
-            attn_kwargs=attn_kwargs,
-            device="meta",
-            dtype=dtype,
-        )
+        model = cls(condition_channels=condition_channels, device="meta", dtype=dtype)
         model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)