diffsynth-engine 0.6.1.dev14__py3-none-any.whl → 0.6.1.dev25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. diffsynth_engine/__init__.py +6 -2
  2. diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json +41 -0
  3. diffsynth_engine/configs/__init__.py +10 -6
  4. diffsynth_engine/configs/pipeline.py +17 -10
  5. diffsynth_engine/models/base.py +1 -1
  6. diffsynth_engine/models/basic/attention.py +59 -20
  7. diffsynth_engine/models/basic/transformer_helper.py +36 -2
  8. diffsynth_engine/models/basic/video_sparse_attention.py +238 -0
  9. diffsynth_engine/models/flux/flux_controlnet.py +7 -19
  10. diffsynth_engine/models/flux/flux_dit.py +27 -38
  11. diffsynth_engine/models/flux/flux_dit_fbcache.py +9 -7
  12. diffsynth_engine/models/flux/flux_ipadapter.py +5 -5
  13. diffsynth_engine/models/qwen_image/qwen2_5_vl.py +5 -0
  14. diffsynth_engine/models/qwen_image/qwen_image_dit.py +28 -34
  15. diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py +14 -6
  16. diffsynth_engine/models/wan/wan_audio_encoder.py +0 -1
  17. diffsynth_engine/models/wan/wan_dit.py +64 -27
  18. diffsynth_engine/pipelines/base.py +36 -4
  19. diffsynth_engine/pipelines/flux_image.py +19 -17
  20. diffsynth_engine/pipelines/qwen_image.py +45 -36
  21. diffsynth_engine/pipelines/sdxl_image.py +1 -1
  22. diffsynth_engine/pipelines/utils.py +52 -0
  23. diffsynth_engine/pipelines/wan_s2v.py +4 -9
  24. diffsynth_engine/pipelines/wan_video.py +43 -19
  25. diffsynth_engine/tokenizers/base.py +6 -0
  26. diffsynth_engine/tokenizers/qwen2.py +12 -4
  27. diffsynth_engine/utils/constants.py +13 -12
  28. diffsynth_engine/utils/flag.py +6 -0
  29. diffsynth_engine/utils/parallel.py +62 -29
  30. {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/METADATA +1 -1
  31. {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/RECORD +45 -43
  32. /diffsynth_engine/conf/models/wan/dit/{wan2.1-flf2v-14b.json → wan2.1_flf2v_14b.json} +0 -0
  33. /diffsynth_engine/conf/models/wan/dit/{wan2.1-i2v-14b.json → wan2.1_i2v_14b.json} +0 -0
  34. /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-1.3b.json → wan2.1_t2v_1.3b.json} +0 -0
  35. /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-14b.json → wan2.1_t2v_14b.json} +0 -0
  36. /diffsynth_engine/conf/models/wan/dit/{wan2.2-i2v-a14b.json → wan2.2_i2v_a14b.json} +0 -0
  37. /diffsynth_engine/conf/models/wan/dit/{wan2.2-s2v-14b.json → wan2.2_s2v_14b.json} +0 -0
  38. /diffsynth_engine/conf/models/wan/dit/{wan2.2-t2v-a14b.json → wan2.2_t2v_a14b.json} +0 -0
  39. /diffsynth_engine/conf/models/wan/dit/{wan2.2-ti2v-5b.json → wan2.2_ti2v_5b.json} +0 -0
  40. /diffsynth_engine/conf/models/wan/vae/{wan2.1-vae.json → wan2.1_vae.json} +0 -0
  41. /diffsynth_engine/conf/models/wan/vae/{wan2.2-vae.json → wan2.2_vae.json} +0 -0
  42. /diffsynth_engine/conf/models/wan/vae/{wan-vae-keymap.json → wan_vae_keymap.json} +0 -0
  43. {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/WHEEL +0 -0
  44. {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/licenses/LICENSE +0 -0
  45. {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/top_level.txt +0 -0
diffsynth_engine/__init__.py

@@ -12,11 +12,13 @@ from .configs import (
     WanStateDicts,
     QwenImageStateDicts,
     AttnImpl,
+    SpargeAttentionParams,
+    VideoSparseAttentionParams,
+    LoraConfig,
     ControlNetParams,
     ControlType,
     QwenImageControlNetParams,
     QwenImageControlType,
-    LoraConfig,
 )
 from .pipelines import (
     SDImagePipeline,
@@ -59,6 +61,9 @@ __all__ = [
     "WanStateDicts",
     "QwenImageStateDicts",
     "AttnImpl",
+    "SpargeAttentionParams",
+    "VideoSparseAttentionParams",
+    "LoraConfig",
     "ControlNetParams",
     "ControlType",
     "QwenImageControlNetParams",
@@ -79,7 +84,6 @@ __all__ = [
     "FluxIPAdapterRefTool",
     "FluxReplaceByControlTool",
     "FluxReduxRefTool",
-    "LoraConfig",
     "fetch_model",
     "fetch_modelscope_model",
     "register_fetch_modelscope_model",
diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json

@@ -0,0 +1,41 @@
+{
+    "diffusers": {
+        "global_rename_dict": {
+            "patch_embedding": "patch_embedding",
+            "condition_embedder.text_embedder.linear_1": "text_embedding.0",
+            "condition_embedder.text_embedder.linear_2": "text_embedding.2",
+            "condition_embedder.time_embedder.linear_1": "time_embedding.0",
+            "condition_embedder.time_embedder.linear_2": "time_embedding.2",
+            "condition_embedder.time_proj": "time_projection.1",
+            "condition_embedder.image_embedder.norm1": "img_emb.proj.0",
+            "condition_embedder.image_embedder.ff.net.0.proj": "img_emb.proj.1",
+            "condition_embedder.image_embedder.ff.net.2": "img_emb.proj.3",
+            "condition_embedder.image_embedder.norm2": "img_emb.proj.4",
+            "condition_embedder.image_embedder.pos_embed": "img_emb.emb_pos",
+            "proj_out": "head.head",
+            "scale_shift_table": "head.modulation"
+        },
+        "rename_dict": {
+            "attn1.to_q": "self_attn.q",
+            "attn1.to_k": "self_attn.k",
+            "attn1.to_v": "self_attn.v",
+            "attn1.to_out.0": "self_attn.o",
+            "attn1.norm_q": "self_attn.norm_q",
+            "attn1.norm_k": "self_attn.norm_k",
+            "to_gate_compress": "self_attn.gate_compress",
+            "attn2.to_q": "cross_attn.q",
+            "attn2.to_k": "cross_attn.k",
+            "attn2.to_v": "cross_attn.v",
+            "attn2.to_out.0": "cross_attn.o",
+            "attn2.norm_q": "cross_attn.norm_q",
+            "attn2.norm_k": "cross_attn.norm_k",
+            "attn2.add_k_proj": "cross_attn.k_img",
+            "attn2.add_v_proj": "cross_attn.v_img",
+            "attn2.norm_added_k": "cross_attn.norm_k_img",
+            "norm2": "norm3",
+            "ffn.net.0.proj": "ffn.0",
+            "ffn.net.2": "ffn.2",
+            "scale_shift_table": "modulation"
+        }
+    }
+}
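
The new `wan_dit_keymap.json` maps diffusers-style Wan DiT parameter names onto diffsynth-engine's internal names: `global_rename_dict` covers top-level modules, `rename_dict` covers per-block modules. A hypothetical sketch of how such a two-level keymap could be applied to a diffusers state dict key (the `convert_key` helper and the `blocks.<i>.` key layout are assumptions for illustration, not this package's actual converter):

import json

def convert_key(name: str, keymap: dict) -> str:
    # Hypothetical helper; diffsynth-engine's real key conversion may differ.
    for src, dst in keymap["global_rename_dict"].items():
        if name == src or name.startswith(src + "."):
            return dst + name[len(src):]
    if name.startswith("blocks."):
        _, idx, suffix = name.split(".", 2)  # "blocks", block index, module path
        for src, dst in keymap["rename_dict"].items():
            if suffix == src or suffix.startswith(src + "."):
                return f"blocks.{idx}.{dst}{suffix[len(src):]}"
    return name

with open("diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json") as f:
    keymap = json.load(f)["diffusers"]

print(convert_key("blocks.0.attn1.to_q.weight", keymap))            # -> blocks.0.self_attn.q.weight
print(convert_key("condition_embedder.time_proj.weight", keymap))   # -> time_projection.1.weight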
diffsynth_engine/configs/__init__.py

@@ -17,14 +17,16 @@ from .pipeline import (
     WanStateDicts,
     WanS2VStateDicts,
     QwenImageStateDicts,
-    LoraConfig,
     AttnImpl,
+    SpargeAttentionParams,
+    VideoSparseAttentionParams,
+    LoraConfig,
 )
 from .controlnet import (
     ControlType,
     ControlNetParams,
-    QwenImageControlNetParams,
     QwenImageControlType,
+    QwenImageControlNetParams,
 )

 __all__ = [
@@ -46,10 +48,12 @@ __all__ = [
     "WanStateDicts",
     "WanS2VStateDicts",
     "QwenImageStateDicts",
-    "QwenImageControlType",
-    "QwenImageControlNetParams",
+    "AttnImpl",
+    "SpargeAttentionParams",
+    "VideoSparseAttentionParams",
+    "LoraConfig",
     "ControlType",
     "ControlNetParams",
-    "LoraConfig",
-    "AttnImpl",
+    "QwenImageControlType",
+    "QwenImageControlNetParams",
 ]
diffsynth_engine/configs/pipeline.py

@@ -30,16 +30,26 @@ class AttnImpl(Enum):
     SDPA = "sdpa"  # Scaled Dot Product Attention
     SAGE = "sage"  # Sage Attention
     SPARGE = "sparge"  # Sparge Attention
+    VSA = "vsa"  # Video Sparse Attention
+
+
+@dataclass
+class SpargeAttentionParams:
+    smooth_k: bool = True
+    cdfthreshd: float = 0.6
+    simthreshd1: float = 0.98
+    pvthreshd: float = 50.0
+
+
+@dataclass
+class VideoSparseAttentionParams:
+    sparsity: float = 0.9


 @dataclass
 class AttentionConfig:
     dit_attn_impl: AttnImpl = AttnImpl.AUTO
-    # Sparge Attention
-    sparge_smooth_k: bool = True
-    sparge_cdfthreshd: float = 0.6
-    sparge_simthreshd1: float = 0.98
-    sparge_pvthreshd: float = 50.0
+    attn_params: Optional[SpargeAttentionParams | VideoSparseAttentionParams] = None


 @dataclass
@@ -234,14 +244,11 @@ class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfi
     encoder_dtype: torch.dtype = torch.bfloat16
     vae_dtype: torch.dtype = torch.float32

+    load_encoder: bool = True
+
     # override OptimizationConfig
     fbcache_relative_l1_threshold = 0.009

-    # override BaseConfig
-    vae_tiled: bool = True
-    vae_tile_size: Tuple[int, int] = (34, 34)
-    vae_tile_stride: Tuple[int, int] = (18, 16)
-
     @classmethod
     def basic_config(
         cls,
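
The first `configs/pipeline.py` hunk above adds a `VSA` member to `AttnImpl` and moves the backend-specific knobs out of flat `sparge_*` fields on `AttentionConfig` into a single `attn_params` object. An illustrative sketch of selecting a backend through these fields (field names come from the diff; the surrounding dicts only stand in for a pipeline config call):

from diffsynth_engine import AttnImpl, SpargeAttentionParams, VideoSparseAttentionParams

# Video Sparse Attention: keep roughly 10% of key/value tiles (sparsity=0.9).
vsa_attention = dict(dit_attn_impl=AttnImpl.VSA, attn_params=VideoSparseAttentionParams(sparsity=0.9))

# Sparge Attention: the former sparge_* fields become dataclass fields without the prefix.
sparge_attention = dict(
    dit_attn_impl=AttnImpl.SPARGE,
    attn_params=SpargeAttentionParams(smooth_k=True, cdfthreshd=0.6, simthreshd1=0.98, pvthreshd=50.0),
)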
diffsynth_engine/models/base.py

@@ -57,7 +57,7 @@ class PreTrainedModel(nn.Module):
     def get_tp_plan(self):
         raise NotImplementedError(f"{self.__class__.__name__} does not support TP")

-    def get_fsdp_modules(self):
+    def get_fsdp_module_cls(self):
         raise NotImplementedError(f"{self.__class__.__name__} does not support FSDP")


diffsynth_engine/models/basic/attention.py

@@ -12,6 +12,7 @@ from diffsynth_engine.utils.flag import (
     SDPA_AVAILABLE,
     SAGE_ATTN_AVAILABLE,
     SPARGE_ATTN_AVAILABLE,
+    VIDEO_SPARSE_ATTN_AVAILABLE,
 )
 from diffsynth_engine.utils.platform import DTYPE_FP8

@@ -20,12 +21,6 @@ FA3_MAX_HEADDIM = 256
 logger = logging.get_logger(__name__)


-def memory_align(x: torch.Tensor, dim=-1, alignment: int = 8):
-    padding_size = (alignment - x.shape[dim] % alignment) % alignment
-    padded_x = F.pad(x, (0, padding_size), "constant", 0)
-    return padded_x[..., : x.shape[dim]]
-
-
 if FLASH_ATTN_3_AVAILABLE:
     from flash_attn_interface import flash_attn_func as flash_attn3
 if FLASH_ATTN_2_AVAILABLE:
@@ -33,6 +28,11 @@ if FLASH_ATTN_2_AVAILABLE:
 if XFORMERS_AVAILABLE:
     from xformers.ops import memory_efficient_attention

+    def memory_align(x: torch.Tensor, dim=-1, alignment: int = 8):
+        padding_size = (alignment - x.shape[dim] % alignment) % alignment
+        padded_x = F.pad(x, (0, padding_size), "constant", 0)
+        return padded_x[..., : x.shape[dim]]
+
     def xformers_attn(q, k, v, attn_mask=None, scale=None):
         if attn_mask is not None:
             if attn_mask.ndim == 2:
@@ -94,6 +94,13 @@ if SPARGE_ATTN_AVAILABLE:
         return out.transpose(1, 2)


+if VIDEO_SPARSE_ATTN_AVAILABLE:
+    from diffsynth_engine.models.basic.video_sparse_attention import (
+        video_sparse_attn,
+        distributed_video_sparse_attn,
+    )
+
+
 def eager_attn(q, k, v, attn_mask=None, scale=None):
     q = q.transpose(1, 2)
     k = k.transpose(1, 2)
@@ -109,9 +116,10 @@ def eager_attn(q, k, v, attn_mask=None, scale=None):


 def attention(
-    q,
-    k,
-    v,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: Optional[torch.Tensor] = None,
     attn_impl: Optional[str] = "auto",
     attn_mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,
@@ -133,6 +141,7 @@
         "sdpa",
         "sage",
         "sparge",
+        "vsa",
     ]
     flash_attn3_compatible = q.shape[-1] <= FA3_MAX_HEADDIM
     if attn_impl is None or attn_impl == "auto":
@@ -189,10 +198,24 @@
             v,
             attn_mask=attn_mask,
             scale=scale,
-            smooth_k=kwargs.get("sparge_smooth_k", True),
-            simthreshd1=kwargs.get("sparge_simthreshd1", 0.6),
-            cdfthreshd=kwargs.get("sparge_cdfthreshd", 0.98),
-            pvthreshd=kwargs.get("sparge_pvthreshd", 50),
+            smooth_k=kwargs.get("smooth_k", True),
+            simthreshd1=kwargs.get("simthreshd1", 0.6),
+            cdfthreshd=kwargs.get("cdfthreshd", 0.98),
+            pvthreshd=kwargs.get("pvthreshd", 50),
+        )
+    if attn_impl == "vsa":
+        return video_sparse_attn(
+            q,
+            k,
+            v,
+            g,
+            sparsity=kwargs.get("sparsity"),
+            num_tiles=kwargs.get("num_tiles"),
+            total_seq_length=kwargs.get("total_seq_length"),
+            tile_partition_indices=kwargs.get("tile_partition_indices"),
+            reverse_tile_partition_indices=kwargs.get("reverse_tile_partition_indices"),
+            variable_block_sizes=kwargs.get("variable_block_sizes"),
+            non_pad_index=kwargs.get("non_pad_index"),
         )
     raise ValueError(f"Invalid attention implementation: {attn_impl}")

@@ -242,9 +265,10 @@ class Attention(nn.Module):


 def long_context_attention(
-    q,
-    k,
-    v,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: Optional[torch.Tensor] = None,
     attn_impl: Optional[str] = None,
     attn_mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,
@@ -267,6 +291,7 @@ def long_context_attention(
         "sdpa",
         "sage",
         "sparge",
+        "vsa",
     ]
     assert attn_mask is None, "long context attention does not support attention mask"
     flash_attn3_compatible = q.shape[-1] <= FA3_MAX_HEADDIM
@@ -307,11 +332,25 @@
     if attn_impl == "sparge":
         attn_processor = SparseAttentionMeansim()
         # default args from spas_sage2_attn_meansim_cuda
-        attn_processor.smooth_k = torch.tensor(kwargs.get("sparge_smooth_k", True))
-        attn_processor.simthreshd1 = torch.tensor(kwargs.get("sparge_simthreshd1", 0.6))
-        attn_processor.cdfthreshd = torch.tensor(kwargs.get("sparge_cdfthreshd", 0.98))
-        attn_processor.pvthreshd = torch.tensor(kwargs.get("sparge_pvthreshd", 50))
+        attn_processor.smooth_k = torch.tensor(kwargs.get("smooth_k", True))
+        attn_processor.simthreshd1 = torch.tensor(kwargs.get("simthreshd1", 0.6))
+        attn_processor.cdfthreshd = torch.tensor(kwargs.get("cdfthreshd", 0.98))
+        attn_processor.pvthreshd = torch.tensor(kwargs.get("pvthreshd", 50))
         return LongContextAttention(attn_type=AttnType.SPARSE_SAGE, attn_processor=attn_processor)(
             q, k, v, softmax_scale=scale
         )
+    if attn_impl == "vsa":
+        return distributed_video_sparse_attn(
+            q,
+            k,
+            v,
+            g,
+            sparsity=kwargs.get("sparsity"),
+            num_tiles=kwargs.get("num_tiles"),
+            total_seq_length=kwargs.get("total_seq_length"),
+            tile_partition_indices=kwargs.get("tile_partition_indices"),
+            reverse_tile_partition_indices=kwargs.get("reverse_tile_partition_indices"),
+            variable_block_sizes=kwargs.get("variable_block_sizes"),
+            non_pad_index=kwargs.get("non_pad_index"),
+        )
     raise ValueError(f"Invalid long context attention implementation: {attn_impl}")
diffsynth_engine/models/basic/transformer_helper.py

@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import math


@@ -91,8 +92,8 @@ class NewGELUActivation(nn.Module):
     the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """

-    def forward(self, input: "torch.Tensor") -> "torch.Tensor":
-        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


 class ApproximateGELU(nn.Module):
@@ -115,3 +116,36 @@ class ApproximateGELU(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
         return x * torch.sigmoid(1.702 * x)
+
+
+class GELU(nn.Module):
+    r"""
+    GELU activation function with tanh approximation support with `approximate="tanh"`.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+        approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(
+        self,
+        dim_in: int,
+        dim_out: int,
+        approximate: str = "none",
+        bias: bool = True,
+        device: str = "cuda:0",
+        dtype: torch.dtype = torch.float16,
+    ):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias, device=device, dtype=dtype)
+        self.approximate = approximate
+
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        return F.gelu(gate, approximate=self.approximate)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        x = self.gelu(x)
+        return x
diffsynth_engine/models/basic/video_sparse_attention.py

@@ -0,0 +1,238 @@
+import torch
+import math
+import functools
+
+from diffsynth_engine.utils.flag import VIDEO_SPARSE_ATTN_AVAILABLE
+from diffsynth_engine.utils.parallel import get_sp_ulysses_group, get_sp_ring_world_size
+
+if VIDEO_SPARSE_ATTN_AVAILABLE:
+    from vsa import video_sparse_attn as vsa_core
+
+VSA_TILE_SIZE = (4, 4, 4)
+
+
+@functools.lru_cache(maxsize=10)
+def get_tile_partition_indices(
+    dit_seq_shape: tuple[int, int, int],
+    tile_size: tuple[int, int, int],
+    device: torch.device,
+) -> torch.LongTensor:
+    T, H, W = dit_seq_shape
+    ts, hs, ws = tile_size
+    indices = torch.arange(T * H * W, device=device, dtype=torch.long).reshape(T, H, W)
+    ls = []
+    for t in range(math.ceil(T / ts)):
+        for h in range(math.ceil(H / hs)):
+            for w in range(math.ceil(W / ws)):
+                ls.append(
+                    indices[
+                        t * ts : min(t * ts + ts, T), h * hs : min(h * hs + hs, H), w * ws : min(w * ws + ws, W)
+                    ].flatten()
+                )
+    index = torch.cat(ls, dim=0)
+    return index
+
+
+@functools.lru_cache(maxsize=10)
+def get_reverse_tile_partition_indices(
+    dit_seq_shape: tuple[int, int, int],
+    tile_size: tuple[int, int, int],
+    device: torch.device,
+) -> torch.LongTensor:
+    return torch.argsort(get_tile_partition_indices(dit_seq_shape, tile_size, device))
+
+
+@functools.lru_cache(maxsize=10)
+def construct_variable_block_sizes(
+    dit_seq_shape: tuple[int, int, int],
+    num_tiles: tuple[int, int, int],
+    device: torch.device,
+) -> torch.LongTensor:
+    """
+    Compute the number of valid (non-padded) tokens inside every
+    (ts_t x ts_h x ts_w) tile after padding -- flattened in the order
+    (t-tile, h-tile, w-tile) that `rearrange` uses.
+
+    Returns
+    -------
+    torch.LongTensor  # shape: [∏ full_window_size]
+    """
+    # unpack
+    t, h, w = dit_seq_shape
+    ts_t, ts_h, ts_w = VSA_TILE_SIZE
+    n_t, n_h, n_w = num_tiles
+
+    def _sizes(dim_len: int, tile: int, n_tiles: int) -> torch.LongTensor:
+        """Vector with the size of each tile along one dimension."""
+        sizes = torch.full((n_tiles,), tile, dtype=torch.int, device=device)
+        # size of last (possibly partial) tile
+        remainder = dim_len - (n_tiles - 1) * tile
+        sizes[-1] = remainder if remainder > 0 else tile
+        return sizes
+
+    t_sizes = _sizes(t, ts_t, n_t)  # [n_t]
+    h_sizes = _sizes(h, ts_h, n_h)  # [n_h]
+    w_sizes = _sizes(w, ts_w, n_w)  # [n_w]
+
+    # broadcast‑multiply to get voxels per tile, then flatten
+    block_sizes = (
+        t_sizes[:, None, None]  # [n_t, 1, 1]
+        * h_sizes[None, :, None]  # [1, n_h, 1]
+        * w_sizes[None, None, :]  # [1, 1, n_w]
+    ).reshape(-1)  # [n_t * n_h * n_w]
+
+    return block_sizes
+
+
+@functools.lru_cache(maxsize=10)
+def get_non_pad_index(
+    variable_block_sizes: torch.LongTensor,
+    max_block_size: int,
+):
+    n_win = variable_block_sizes.shape[0]
+    device = variable_block_sizes.device
+    starts_pad = torch.arange(n_win, device=device) * max_block_size
+    index_pad = starts_pad[:, None] + torch.arange(max_block_size, device=device)[None, :]
+    index_mask = torch.arange(max_block_size, device=device)[None, :] < variable_block_sizes[:, None]
+    return index_pad[index_mask]
+
+
+def get_vsa_kwargs(
+    latent_shape: tuple[int, int, int],
+    patch_size: tuple[int, int, int],
+    sparsity: float,
+    device: torch.device,
+):
+    dit_seq_shape = (
+        latent_shape[0] // patch_size[0],
+        latent_shape[1] // patch_size[1],
+        latent_shape[2] // patch_size[2],
+    )
+
+    num_tiles = (
+        math.ceil(dit_seq_shape[0] / VSA_TILE_SIZE[0]),
+        math.ceil(dit_seq_shape[1] / VSA_TILE_SIZE[1]),
+        math.ceil(dit_seq_shape[2] / VSA_TILE_SIZE[2]),
+    )
+    total_seq_length = math.prod(dit_seq_shape)
+
+    tile_partition_indices = get_tile_partition_indices(dit_seq_shape, VSA_TILE_SIZE, device)
+    reverse_tile_partition_indices = get_reverse_tile_partition_indices(dit_seq_shape, VSA_TILE_SIZE, device)
+    variable_block_sizes = construct_variable_block_sizes(dit_seq_shape, num_tiles, device)
+    non_pad_index = get_non_pad_index(variable_block_sizes, math.prod(VSA_TILE_SIZE))
+
+    return {
+        "sparsity": sparsity,
+        "num_tiles": num_tiles,
+        "total_seq_length": total_seq_length,
+        "tile_partition_indices": tile_partition_indices,
+        "reverse_tile_partition_indices": reverse_tile_partition_indices,
+        "variable_block_sizes": variable_block_sizes,
+        "non_pad_index": non_pad_index,
+    }
+
+
+def tile(
+    x: torch.Tensor,
+    num_tiles: tuple[int, int, int],
+    tile_partition_indices: torch.LongTensor,
+    non_pad_index: torch.LongTensor,
+) -> torch.Tensor:
+    t_padded_size = num_tiles[0] * VSA_TILE_SIZE[0]
+    h_padded_size = num_tiles[1] * VSA_TILE_SIZE[1]
+    w_padded_size = num_tiles[2] * VSA_TILE_SIZE[2]
+
+    x_padded = torch.zeros(
+        (x.shape[0], t_padded_size * h_padded_size * w_padded_size, x.shape[-2], x.shape[-1]),
+        device=x.device,
+        dtype=x.dtype,
+    )
+    x_padded[:, non_pad_index] = x[:, tile_partition_indices]
+    return x_padded
+
+
+def untile(
+    x: torch.Tensor, reverse_tile_partition_indices: torch.LongTensor, non_pad_index: torch.LongTensor
+) -> torch.Tensor:
+    x = x[:, non_pad_index][:, reverse_tile_partition_indices]
+    return x
+
+
+def video_sparse_attn(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    sparsity: float,
+    num_tiles: tuple[int, int, int],
+    total_seq_length: int,
+    tile_partition_indices: torch.LongTensor,
+    reverse_tile_partition_indices: torch.LongTensor,
+    variable_block_sizes: torch.LongTensor,
+    non_pad_index: torch.LongTensor,
+):
+    q = tile(q, num_tiles, tile_partition_indices, non_pad_index)
+    k = tile(k, num_tiles, tile_partition_indices, non_pad_index)
+    v = tile(v, num_tiles, tile_partition_indices, non_pad_index)
+    g = tile(g, num_tiles, tile_partition_indices, non_pad_index)
+
+    q = q.transpose(1, 2).contiguous()
+    k = k.transpose(1, 2).contiguous()
+    v = v.transpose(1, 2).contiguous()
+    g = g.transpose(1, 2).contiguous()
+
+    topk = math.ceil((1 - sparsity) * (total_seq_length / math.prod(VSA_TILE_SIZE)))
+    out = vsa_core(
+        q,
+        k,
+        v,
+        variable_block_sizes=variable_block_sizes,
+        topk=topk,
+        block_size=VSA_TILE_SIZE,
+        compress_attn_weight=g,
+    ).transpose(1, 2)
+    out = untile(out, reverse_tile_partition_indices, non_pad_index)
+    return out
+
+
+def distributed_video_sparse_attn(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    sparsity: float,
+    num_tiles: tuple[int, int, int],
+    total_seq_length: int,
+    tile_partition_indices: torch.LongTensor,
+    reverse_tile_partition_indices: torch.LongTensor,
+    variable_block_sizes: torch.LongTensor,
+    non_pad_index: torch.LongTensor,
+    scatter_idx: int = 2,
+    gather_idx: int = 1,
+):
+    from yunchang.comm.all_to_all import SeqAllToAll4D
+
+    assert get_sp_ring_world_size() == 1, "distributed video sparse attention requires ring degree to be 1"
+    sp_ulysses_group = get_sp_ulysses_group()
+
+    q = SeqAllToAll4D.apply(sp_ulysses_group, q, scatter_idx, gather_idx)
+    k = SeqAllToAll4D.apply(sp_ulysses_group, k, scatter_idx, gather_idx)
+    v = SeqAllToAll4D.apply(sp_ulysses_group, v, scatter_idx, gather_idx)
+    g = SeqAllToAll4D.apply(sp_ulysses_group, g, scatter_idx, gather_idx)
+
+    out = video_sparse_attn(
+        q,
+        k,
+        v,
+        g,
+        sparsity,
+        num_tiles,
+        total_seq_length,
+        tile_partition_indices,
+        reverse_tile_partition_indices,
+        variable_block_sizes,
+        non_pad_index,
+    )
+
+    out = SeqAllToAll4D.apply(sp_ulysses_group, out, gather_idx, scatter_idx)
+    return out
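
The new module carries the bookkeeping for Video Sparse Attention: tokens are permuted into (4, 4, 4) tile-major order, padded per tile, run through `vsa_core` with a top-k tile budget, and permuted back. A small worked example of the budget math and the tile/untile round trip, runnable on CPU since it never calls `vsa_core` (the 16x64x64 latent and (1, 2, 2) patch size are assumptions for illustration):

import math
import torch
from diffsynth_engine.models.basic.video_sparse_attention import (
    VSA_TILE_SIZE, get_vsa_kwargs, tile, untile,
)

kwargs = get_vsa_kwargs(latent_shape=(16, 64, 64), patch_size=(1, 2, 2), sparsity=0.9, device=torch.device("cpu"))
num_kv_tiles = kwargs["total_seq_length"] / math.prod(VSA_TILE_SIZE)   # 16384 tokens / 64 = 256 tiles
topk = math.ceil((1 - kwargs["sparsity"]) * num_kv_tiles)              # ceil(25.6) = 26: the top-k tile budget passed to vsa_core

# tile() pads and reorders tokens into tile-major order; untile() inverts it exactly.
x = torch.randn(1, kwargs["total_seq_length"], 12, 128)
x_tiled = tile(x, kwargs["num_tiles"], kwargs["tile_partition_indices"], kwargs["non_pad_index"])
assert torch.equal(untile(x_tiled, kwargs["reverse_tile_partition_indices"], kwargs["non_pad_index"]), x)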
diffsynth_engine/models/flux/flux_controlnet.py

@@ -86,7 +86,6 @@ class FluxControlNet(PreTrainedModel):
     def __init__(
         self,
         condition_channels: int = 64,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -103,10 +102,7 @@
         self.x_embedder = nn.Linear(64, 3072, device=device, dtype=dtype)
         self.controlnet_x_embedder = nn.Linear(condition_channels, 3072)
         self.blocks = nn.ModuleList(
-            [
-                FluxDoubleTransformerBlock(3072, 24, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
-                for _ in range(6)
-            ]
+            [FluxDoubleTransformerBlock(3072, 24, device=device, dtype=dtype) for _ in range(6)]
         )
         # controlnet projection
         self.blocks_proj = nn.ModuleList(
@@ -128,6 +124,7 @@
         image_ids: torch.Tensor,
         text_ids: torch.Tensor,
         guidance: torch.Tensor,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
         hidden_states = self.x_embedder(hidden_states) + self.controlnet_x_embedder(control_condition)
         condition = (
@@ -141,7 +138,9 @@
         # double block
         double_block_outputs = []
         for i, block in enumerate(self.blocks):
-            hidden_states, prompt_emb = block(hidden_states, prompt_emb, condition, image_rotary_emb)
+            hidden_states, prompt_emb = block(
+                hidden_states, prompt_emb, condition, image_rotary_emb, attn_kwargs=attn_kwargs
+            )
             double_block_outputs.append(self.blocks_proj[i](hidden_states))

         # apply control scale
@@ -149,24 +148,13 @@
         return double_block_outputs, None

     @classmethod
-    def from_state_dict(
-        cls,
-        state_dict: Dict[str, torch.Tensor],
-        device: str,
-        dtype: torch.dtype,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
-    ):
+    def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype):
         if "controlnet_x_embedder.weight" in state_dict:
             condition_channels = state_dict["controlnet_x_embedder.weight"].shape[1]
         else:
             condition_channels = 64

-        model = cls(
-            condition_channels=condition_channels,
-            attn_kwargs=attn_kwargs,
-            device="meta",
-            dtype=dtype,
-        )
+        model = cls(condition_channels=condition_channels, device="meta", dtype=dtype)
         model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)