diffsynth-engine 0.6.1.dev22__py3-none-any.whl → 0.6.1.dev23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json +41 -0
- diffsynth_engine/configs/pipeline.py +33 -5
- diffsynth_engine/models/basic/attention.py +59 -20
- diffsynth_engine/models/basic/video_sparse_attention.py +235 -0
- diffsynth_engine/models/flux/flux_controlnet.py +7 -19
- diffsynth_engine/models/flux/flux_dit.py +22 -36
- diffsynth_engine/models/flux/flux_dit_fbcache.py +9 -7
- diffsynth_engine/models/flux/flux_ipadapter.py +5 -5
- diffsynth_engine/models/qwen_image/qwen_image_dit.py +13 -15
- diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py +14 -6
- diffsynth_engine/models/wan/wan_dit.py +62 -22
- diffsynth_engine/pipelines/flux_image.py +11 -10
- diffsynth_engine/pipelines/qwen_image.py +3 -10
- diffsynth_engine/pipelines/wan_s2v.py +3 -8
- diffsynth_engine/pipelines/wan_video.py +11 -13
- diffsynth_engine/utils/constants.py +13 -12
- diffsynth_engine/utils/flag.py +6 -0
- diffsynth_engine/utils/parallel.py +51 -6
- {diffsynth_engine-0.6.1.dev22.dist-info → diffsynth_engine-0.6.1.dev23.dist-info}/METADATA +1 -1
- {diffsynth_engine-0.6.1.dev22.dist-info → diffsynth_engine-0.6.1.dev23.dist-info}/RECORD +34 -32
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-flf2v-14b.json → wan2.1_flf2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-i2v-14b.json → wan2.1_i2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-1.3b.json → wan2.1_t2v_1.3b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-14b.json → wan2.1_t2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-i2v-a14b.json → wan2.2_i2v_a14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-s2v-14b.json → wan2.2_s2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-t2v-a14b.json → wan2.2_t2v_a14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-ti2v-5b.json → wan2.2_ti2v_5b.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan2.1-vae.json → wan2.1_vae.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan2.2-vae.json → wan2.2_vae.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan-vae-keymap.json → wan_vae_keymap.json} +0 -0
- {diffsynth_engine-0.6.1.dev22.dist-info → diffsynth_engine-0.6.1.dev23.dist-info}/WHEEL +0 -0
- {diffsynth_engine-0.6.1.dev22.dist-info → diffsynth_engine-0.6.1.dev23.dist-info}/licenses/LICENSE +0 -0
- {diffsynth_engine-0.6.1.dev22.dist-info → diffsynth_engine-0.6.1.dev23.dist-info}/top_level.txt +0 -0

diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json
@@ -0,0 +1,41 @@
+{
+    "diffusers": {
+        "global_rename_dict": {
+            "patch_embedding": "patch_embedding",
+            "condition_embedder.text_embedder.linear_1": "text_embedding.0",
+            "condition_embedder.text_embedder.linear_2": "text_embedding.2",
+            "condition_embedder.time_embedder.linear_1": "time_embedding.0",
+            "condition_embedder.time_embedder.linear_2": "time_embedding.2",
+            "condition_embedder.time_proj": "time_projection.1",
+            "condition_embedder.image_embedder.norm1": "img_emb.proj.0",
+            "condition_embedder.image_embedder.ff.net.0.proj": "img_emb.proj.1",
+            "condition_embedder.image_embedder.ff.net.2": "img_emb.proj.3",
+            "condition_embedder.image_embedder.norm2": "img_emb.proj.4",
+            "condition_embedder.image_embedder.pos_embed": "img_emb.emb_pos",
+            "proj_out": "head.head",
+            "scale_shift_table": "head.modulation"
+        },
+        "rename_dict": {
+            "attn1.to_q": "self_attn.q",
+            "attn1.to_k": "self_attn.k",
+            "attn1.to_v": "self_attn.v",
+            "attn1.to_out.0": "self_attn.o",
+            "attn1.norm_q": "self_attn.norm_q",
+            "attn1.norm_k": "self_attn.norm_k",
+            "to_gate_compress": "self_attn.gate_compress",
+            "attn2.to_q": "cross_attn.q",
+            "attn2.to_k": "cross_attn.k",
+            "attn2.to_v": "cross_attn.v",
+            "attn2.to_out.0": "cross_attn.o",
+            "attn2.norm_q": "cross_attn.norm_q",
+            "attn2.norm_k": "cross_attn.norm_k",
+            "attn2.add_k_proj": "cross_attn.k_img",
+            "attn2.add_v_proj": "cross_attn.v_img",
+            "attn2.norm_added_k": "cross_attn.norm_k_img",
+            "norm2": "norm3",
+            "ffn.net.0.proj": "ffn.0",
+            "ffn.net.2": "ffn.2",
+            "scale_shift_table": "modulation"
+        }
+    }
+}
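
For reference, the keymap above maps Diffusers-style WanTransformer parameter names onto diffsynth-engine's WanDiT names. Below is a minimal sketch of how such a keymap could be applied to a checkpoint; the `convert_wan_dit_key` helper and the `blocks.<i>.` prefix handling are illustrative assumptions, not the loader the engine actually uses.

```python
import json

def convert_wan_dit_key(key: str, global_map: dict, block_map: dict) -> str:
    # Per-block weights are assumed to look like "blocks.<i>.<module>.<param>".
    parts = key.split(".")
    if parts[0] == "blocks" and len(parts) > 2:
        block_prefix = ".".join(parts[:2])      # e.g. "blocks.3"
        rest = ".".join(parts[2:])              # e.g. "attn1.to_q.weight"
        for old, new in block_map.items():
            if rest == old or rest.startswith(old + "."):
                return f"{block_prefix}.{new}{rest[len(old):]}"
        return key
    # Top-level weights are matched against the global rename dict.
    for old, new in global_map.items():
        if key == old or key.startswith(old + "."):
            return new + key[len(old):]
    return key

with open("diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json") as f:
    keymap = json.load(f)["diffusers"]

print(convert_wan_dit_key("blocks.3.attn1.to_q.weight",
                          keymap["global_rename_dict"], keymap["rename_dict"]))
# -> "blocks.3.self_attn.q.weight" under the assumptions above
```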

diffsynth_engine/configs/pipeline.py
@@ -5,6 +5,7 @@ from dataclasses import dataclass, field
 from typing import List, Dict, Tuple, Optional
 
 from diffsynth_engine.configs.controlnet import ControlType
+from diffsynth_engine.models.basic.video_sparse_attention import get_vsa_kwargs
 
 
 @dataclass
@@ -30,16 +31,43 @@ class AttnImpl(Enum):
     SDPA = "sdpa"  # Scaled Dot Product Attention
     SAGE = "sage"  # Sage Attention
     SPARGE = "sparge"  # Sparge Attention
+    VSA = "vsa"  # Video Sparse Attention
+
+
+@dataclass
+class SpargeAttentionParams:
+    smooth_k: bool = True
+    cdfthreshd: float = 0.6
+    simthreshd1: float = 0.98
+    pvthreshd: float = 50.0
+
+
+@dataclass
+class VideoSparseAttentionParams:
+    sparsity: float = 0.9
 
 
 @dataclass
 class AttentionConfig:
     dit_attn_impl: AttnImpl = AttnImpl.AUTO
-
-
-
-
-
+    attn_params: Optional[SpargeAttentionParams | VideoSparseAttentionParams] = None
+
+    def get_attn_kwargs(self, latents: torch.Tensor, device: str) -> Dict:
+        attn_kwargs = {"attn_impl": self.dit_attn_impl.value}
+        if isinstance(self.attn_params, SpargeAttentionParams):
+            assert self.dit_attn_impl == AttnImpl.SPARGE
+            attn_kwargs.update(
+                {
+                    "smooth_k": self.attn_params.smooth_k,
+                    "simthreshd1": self.attn_params.simthreshd1,
+                    "cdfthreshd": self.attn_params.cdfthreshd,
+                    "pvthreshd": self.attn_params.pvthreshd,
+                }
+            )
+        elif isinstance(self.attn_params, VideoSparseAttentionParams):
+            assert self.dit_attn_impl == AttnImpl.VSA
+            attn_kwargs.update(get_vsa_kwargs(latents.shape[2:], (1, 2, 2), self.attn_params.sparsity, device=device))
+        return attn_kwargs
 
 
 @dataclass
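
The new `attn_params` field and `get_attn_kwargs` method give a single place to turn the configured attention backend into per-call keyword arguments. A minimal usage sketch, assuming the optional `vsa` package is installed (video_sparse_attention imports it at module scope, so this import chain requires it) and an illustrative Wan-style latent of shape [batch, channels, frames, height, width]:

```python
import torch
from diffsynth_engine.configs.pipeline import (
    AttentionConfig,
    AttnImpl,
    VideoSparseAttentionParams,
)

# Illustrative latent; the concrete sizes are assumptions for the example.
latents = torch.randn(1, 16, 21, 60, 104)

config = AttentionConfig(
    dit_attn_impl=AttnImpl.VSA,
    attn_params=VideoSparseAttentionParams(sparsity=0.9),
)

# get_attn_kwargs derives the VSA tiling metadata from latents.shape[2:]
# using the (1, 2, 2) patch size hard-coded in the method above.
attn_kwargs = config.get_attn_kwargs(latents, device="cuda")
print(attn_kwargs["attn_impl"], attn_kwargs["num_tiles"])
```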

diffsynth_engine/models/basic/attention.py
@@ -12,6 +12,7 @@ from diffsynth_engine.utils.flag import (
     SDPA_AVAILABLE,
     SAGE_ATTN_AVAILABLE,
     SPARGE_ATTN_AVAILABLE,
+    VIDEO_SPARSE_ATTN_AVAILABLE,
 )
 from diffsynth_engine.utils.platform import DTYPE_FP8
 
@@ -20,12 +21,6 @@ FA3_MAX_HEADDIM = 256
 logger = logging.get_logger(__name__)
 
 
-def memory_align(x: torch.Tensor, dim=-1, alignment: int = 8):
-    padding_size = (alignment - x.shape[dim] % alignment) % alignment
-    padded_x = F.pad(x, (0, padding_size), "constant", 0)
-    return padded_x[..., : x.shape[dim]]
-
-
 if FLASH_ATTN_3_AVAILABLE:
     from flash_attn_interface import flash_attn_func as flash_attn3
 if FLASH_ATTN_2_AVAILABLE:
@@ -33,6 +28,11 @@ if FLASH_ATTN_2_AVAILABLE:
 if XFORMERS_AVAILABLE:
     from xformers.ops import memory_efficient_attention
 
+def memory_align(x: torch.Tensor, dim=-1, alignment: int = 8):
+    padding_size = (alignment - x.shape[dim] % alignment) % alignment
+    padded_x = F.pad(x, (0, padding_size), "constant", 0)
+    return padded_x[..., : x.shape[dim]]
+
 def xformers_attn(q, k, v, attn_mask=None, scale=None):
     if attn_mask is not None:
         if attn_mask.ndim == 2:
@@ -94,6 +94,13 @@ if SPARGE_ATTN_AVAILABLE:
         return out.transpose(1, 2)
 
 
+if VIDEO_SPARSE_ATTN_AVAILABLE:
+    from diffsynth_engine.models.basic.video_sparse_attention import (
+        video_sparse_attn,
+        distributed_video_sparse_attn,
+    )
+
+
 def eager_attn(q, k, v, attn_mask=None, scale=None):
     q = q.transpose(1, 2)
     k = k.transpose(1, 2)
@@ -109,9 +116,10 @@ def eager_attn(q, k, v, attn_mask=None, scale=None):
 
 
 def attention(
-    q,
-    k,
-    v,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: Optional[torch.Tensor] = None,
     attn_impl: Optional[str] = "auto",
     attn_mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,
@@ -133,6 +141,7 @@ def attention(
         "sdpa",
         "sage",
         "sparge",
+        "vsa",
     ]
     flash_attn3_compatible = q.shape[-1] <= FA3_MAX_HEADDIM
     if attn_impl is None or attn_impl == "auto":
@@ -189,10 +198,24 @@
             v,
             attn_mask=attn_mask,
             scale=scale,
-            smooth_k=kwargs.get("
-            simthreshd1=kwargs.get("
-            cdfthreshd=kwargs.get("
-            pvthreshd=kwargs.get("
+            smooth_k=kwargs.get("smooth_k", True),
+            simthreshd1=kwargs.get("simthreshd1", 0.6),
+            cdfthreshd=kwargs.get("cdfthreshd", 0.98),
+            pvthreshd=kwargs.get("pvthreshd", 50),
+        )
+    if attn_impl == "vsa":
+        return video_sparse_attn(
+            q,
+            k,
+            v,
+            g,
+            sparsity=kwargs.get("sparsity"),
+            num_tiles=kwargs.get("num_tiles"),
+            total_seq_length=kwargs.get("total_seq_length"),
+            tile_partition_indices=kwargs.get("tile_partition_indices"),
+            reverse_tile_partition_indices=kwargs.get("reverse_tile_partition_indices"),
+            variable_block_sizes=kwargs.get("variable_block_sizes"),
+            non_pad_index=kwargs.get("non_pad_index"),
         )
     raise ValueError(f"Invalid attention implementation: {attn_impl}")
 
@@ -242,9 +265,10 @@ class Attention(nn.Module):
 
 
 def long_context_attention(
-    q,
-    k,
-    v,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: Optional[torch.Tensor] = None,
     attn_impl: Optional[str] = None,
     attn_mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,
@@ -267,6 +291,7 @@ def long_context_attention(
         "sdpa",
         "sage",
         "sparge",
+        "vsa",
     ]
     assert attn_mask is None, "long context attention does not support attention mask"
     flash_attn3_compatible = q.shape[-1] <= FA3_MAX_HEADDIM
@@ -307,11 +332,25 @@
     if attn_impl == "sparge":
         attn_processor = SparseAttentionMeansim()
         # default args from spas_sage2_attn_meansim_cuda
-        attn_processor.smooth_k = torch.tensor(kwargs.get("
-        attn_processor.simthreshd1 = torch.tensor(kwargs.get("
-        attn_processor.cdfthreshd = torch.tensor(kwargs.get("
-        attn_processor.pvthreshd = torch.tensor(kwargs.get("
+        attn_processor.smooth_k = torch.tensor(kwargs.get("smooth_k", True))
+        attn_processor.simthreshd1 = torch.tensor(kwargs.get("simthreshd1", 0.6))
+        attn_processor.cdfthreshd = torch.tensor(kwargs.get("cdfthreshd", 0.98))
+        attn_processor.pvthreshd = torch.tensor(kwargs.get("pvthreshd", 50))
         return LongContextAttention(attn_type=AttnType.SPARSE_SAGE, attn_processor=attn_processor)(
             q, k, v, softmax_scale=scale
         )
+    if attn_impl == "vsa":
+        return distributed_video_sparse_attn(
+            q,
+            k,
+            v,
+            g,
+            sparsity=kwargs.get("sparsity"),
+            num_tiles=kwargs.get("num_tiles"),
+            total_seq_length=kwargs.get("total_seq_length"),
+            tile_partition_indices=kwargs.get("tile_partition_indices"),
+            reverse_tile_partition_indices=kwargs.get("reverse_tile_partition_indices"),
+            variable_block_sizes=kwargs.get("variable_block_sizes"),
+            non_pad_index=kwargs.get("non_pad_index"),
+        )
     raise ValueError(f"Invalid long context attention implementation: {attn_impl}")
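
With the new `g` gate tensor and the `"vsa"` branch, the VSA path can be driven entirely through the kwargs produced by `get_vsa_kwargs`. A rough sketch of that wiring, assuming the `vsa` kernel is installed, a CUDA device, that `attention()` forwards extra keyword arguments (it reads them via `kwargs.get` in the vsa branch), and illustrative shapes where q/k/v/g are laid out as [batch, seq_len, heads, head_dim] with seq_len equal to the patched latent grid size:

```python
import torch
from diffsynth_engine.models.basic.attention import attention
from diffsynth_engine.models.basic.video_sparse_attention import get_vsa_kwargs

# Illustrative latent grid (frames, height, width) and the Wan patch size.
latent_shape, patch_size = (16, 60, 104), (1, 2, 2)
vsa_kwargs = get_vsa_kwargs(latent_shape, patch_size, sparsity=0.9, device="cuda")

seq_len = vsa_kwargs["total_seq_length"]  # 16 * 30 * 52 tokens after patching
q = torch.randn(1, seq_len, 12, 128, device="cuda", dtype=torch.bfloat16)
k, v, g = torch.randn_like(q), torch.randn_like(q), torch.randn_like(q)

out = attention(q, k, v, g, attn_impl="vsa", **vsa_kwargs)
print(out.shape)  # expected to match q's shape
```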

diffsynth_engine/models/basic/video_sparse_attention.py
@@ -0,0 +1,235 @@
+import torch
+import math
+import functools
+
+from vsa import video_sparse_attn as vsa_core
+from diffsynth_engine.utils.parallel import get_sp_ulysses_group, get_sp_ring_world_size
+
+VSA_TILE_SIZE = (4, 4, 4)
+
+
+@functools.lru_cache(maxsize=10)
+def get_tile_partition_indices(
+    dit_seq_shape: tuple[int, int, int],
+    tile_size: tuple[int, int, int],
+    device: torch.device,
+) -> torch.LongTensor:
+    T, H, W = dit_seq_shape
+    ts, hs, ws = tile_size
+    indices = torch.arange(T * H * W, device=device, dtype=torch.long).reshape(T, H, W)
+    ls = []
+    for t in range(math.ceil(T / ts)):
+        for h in range(math.ceil(H / hs)):
+            for w in range(math.ceil(W / ws)):
+                ls.append(
+                    indices[
+                        t * ts : min(t * ts + ts, T), h * hs : min(h * hs + hs, H), w * ws : min(w * ws + ws, W)
+                    ].flatten()
+                )
+    index = torch.cat(ls, dim=0)
+    return index
+
+
+@functools.lru_cache(maxsize=10)
+def get_reverse_tile_partition_indices(
+    dit_seq_shape: tuple[int, int, int],
+    tile_size: tuple[int, int, int],
+    device: torch.device,
+) -> torch.LongTensor:
+    return torch.argsort(get_tile_partition_indices(dit_seq_shape, tile_size, device))
+
+
+@functools.lru_cache(maxsize=10)
+def construct_variable_block_sizes(
+    dit_seq_shape: tuple[int, int, int],
+    num_tiles: tuple[int, int, int],
+    device: torch.device,
+) -> torch.LongTensor:
+    """
+    Compute the number of valid (non-padded) tokens inside every
+    (ts_t x ts_h x ts_w) tile after padding -- flattened in the order
+    (t-tile, h-tile, w-tile) that `rearrange` uses.
+
+    Returns
+    -------
+    torch.LongTensor  # shape: [∏ full_window_size]
+    """
+    # unpack
+    t, h, w = dit_seq_shape
+    ts_t, ts_h, ts_w = VSA_TILE_SIZE
+    n_t, n_h, n_w = num_tiles
+
+    def _sizes(dim_len: int, tile: int, n_tiles: int) -> torch.LongTensor:
+        """Vector with the size of each tile along one dimension."""
+        sizes = torch.full((n_tiles,), tile, dtype=torch.int, device=device)
+        # size of last (possibly partial) tile
+        remainder = dim_len - (n_tiles - 1) * tile
+        sizes[-1] = remainder if remainder > 0 else tile
+        return sizes
+
+    t_sizes = _sizes(t, ts_t, n_t)  # [n_t]
+    h_sizes = _sizes(h, ts_h, n_h)  # [n_h]
+    w_sizes = _sizes(w, ts_w, n_w)  # [n_w]
+
+    # broadcast‑multiply to get voxels per tile, then flatten
+    block_sizes = (
+        t_sizes[:, None, None]  # [n_t, 1, 1]
+        * h_sizes[None, :, None]  # [1, n_h, 1]
+        * w_sizes[None, None, :]  # [1, 1, n_w]
+    ).reshape(-1)  # [n_t * n_h * n_w]
+
+    return block_sizes
+
+
+@functools.lru_cache(maxsize=10)
+def get_non_pad_index(
+    variable_block_sizes: torch.LongTensor,
+    max_block_size: int,
+):
+    n_win = variable_block_sizes.shape[0]
+    device = variable_block_sizes.device
+    starts_pad = torch.arange(n_win, device=device) * max_block_size
+    index_pad = starts_pad[:, None] + torch.arange(max_block_size, device=device)[None, :]
+    index_mask = torch.arange(max_block_size, device=device)[None, :] < variable_block_sizes[:, None]
+    return index_pad[index_mask]
+
+
+def get_vsa_kwargs(
+    latent_shape: tuple[int, int, int],
+    patch_size: tuple[int, int, int],
+    sparsity: float,
+    device: torch.device,
+):
+    dit_seq_shape = (
+        latent_shape[0] // patch_size[0],
+        latent_shape[1] // patch_size[1],
+        latent_shape[2] // patch_size[2],
+    )
+
+    num_tiles = (
+        math.ceil(dit_seq_shape[0] / VSA_TILE_SIZE[0]),
+        math.ceil(dit_seq_shape[1] / VSA_TILE_SIZE[1]),
+        math.ceil(dit_seq_shape[2] / VSA_TILE_SIZE[2]),
+    )
+    total_seq_length = math.prod(dit_seq_shape)
+
+    tile_partition_indices = get_tile_partition_indices(dit_seq_shape, VSA_TILE_SIZE, device)
+    reverse_tile_partition_indices = get_reverse_tile_partition_indices(dit_seq_shape, VSA_TILE_SIZE, device)
+    variable_block_sizes = construct_variable_block_sizes(dit_seq_shape, num_tiles, device)
+    non_pad_index = get_non_pad_index(variable_block_sizes, math.prod(VSA_TILE_SIZE))
+
+    return {
+        "sparsity": sparsity,
+        "num_tiles": num_tiles,
+        "total_seq_length": total_seq_length,
+        "tile_partition_indices": tile_partition_indices,
+        "reverse_tile_partition_indices": reverse_tile_partition_indices,
+        "variable_block_sizes": variable_block_sizes,
+        "non_pad_index": non_pad_index,
+    }
+
+
+def tile(
+    x: torch.Tensor,
+    num_tiles: tuple[int, int, int],
+    tile_partition_indices: torch.LongTensor,
+    non_pad_index: torch.LongTensor,
+) -> torch.Tensor:
+    t_padded_size = num_tiles[0] * VSA_TILE_SIZE[0]
+    h_padded_size = num_tiles[1] * VSA_TILE_SIZE[1]
+    w_padded_size = num_tiles[2] * VSA_TILE_SIZE[2]
+
+    x_padded = torch.zeros(
+        (x.shape[0], t_padded_size * h_padded_size * w_padded_size, x.shape[-2], x.shape[-1]),
+        device=x.device,
+        dtype=x.dtype,
+    )
+    x_padded[:, non_pad_index] = x[:, tile_partition_indices]
+    return x_padded
+
+
+def untile(
+    x: torch.Tensor, reverse_tile_partition_indices: torch.LongTensor, non_pad_index: torch.LongTensor
+) -> torch.Tensor:
+    x = x[:, non_pad_index][:, reverse_tile_partition_indices]
+    return x
+
+
+def video_sparse_attn(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    sparsity: float,
+    num_tiles: tuple[int, int, int],
+    total_seq_length: int,
+    tile_partition_indices: torch.LongTensor,
+    reverse_tile_partition_indices: torch.LongTensor,
+    variable_block_sizes: torch.LongTensor,
+    non_pad_index: torch.LongTensor,
+):
+    q = tile(q, num_tiles, tile_partition_indices, non_pad_index)
+    k = tile(k, num_tiles, tile_partition_indices, non_pad_index)
+    v = tile(v, num_tiles, tile_partition_indices, non_pad_index)
+    g = tile(g, num_tiles, tile_partition_indices, non_pad_index)
+
+    q = q.transpose(1, 2).contiguous()
+    k = k.transpose(1, 2).contiguous()
+    v = v.transpose(1, 2).contiguous()
+    g = g.transpose(1, 2).contiguous()
+
+    topk = math.ceil((1 - sparsity) * (total_seq_length / math.prod(VSA_TILE_SIZE)))
+    out = vsa_core(
+        q,
+        k,
+        v,
+        variable_block_sizes=variable_block_sizes,
+        topk=topk,
+        block_size=VSA_TILE_SIZE,
+        compress_attn_weight=g,
+    ).transpose(1, 2)
+    out = untile(out, reverse_tile_partition_indices, non_pad_index)
+    return out
+
+
+def distributed_video_sparse_attn(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    sparsity: float,
+    num_tiles: tuple[int, int, int],
+    total_seq_length: int,
+    tile_partition_indices: torch.LongTensor,
+    reverse_tile_partition_indices: torch.LongTensor,
+    variable_block_sizes: torch.LongTensor,
+    non_pad_index: torch.LongTensor,
+    scatter_idx: int = 2,
+    gather_idx: int = 1,
+):
+    from yunchang.comm.all_to_all import SeqAllToAll4D
+
+    assert get_sp_ring_world_size() == 1, "distributed video sparse attention requires ring degree to be 1"
+    sp_ulysses_group = get_sp_ulysses_group()
+
+    q = SeqAllToAll4D.apply(sp_ulysses_group, q, scatter_idx, gather_idx)
+    k = SeqAllToAll4D.apply(sp_ulysses_group, k, scatter_idx, gather_idx)
+    v = SeqAllToAll4D.apply(sp_ulysses_group, v, scatter_idx, gather_idx)
+    g = SeqAllToAll4D.apply(sp_ulysses_group, g, scatter_idx, gather_idx)
+
+    out = video_sparse_attn(
+        q,
+        k,
+        v,
+        g,
+        sparsity,
+        num_tiles,
+        total_seq_length,
+        tile_partition_indices,
+        reverse_tile_partition_indices,
+        variable_block_sizes,
+        non_pad_index,
+    )
+
+    out = SeqAllToAll4D.apply(sp_ulysses_group, out, gather_idx, scatter_idx)
+    return out
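
As a quick sanity check on the arithmetic in this new module: the fixed (4, 4, 4) tile holds 64 tokens, `get_vsa_kwargs` pads the patched token grid up to whole tiles, and `video_sparse_attn` keeps roughly a `(1 - sparsity)` fraction of key tiles via `topk`. A worked example with an illustrative 16 x 30 x 52 patched grid:

```python
import math

# Mirrors the tile/top-k arithmetic in video_sparse_attention.py;
# the grid size and sparsity below are illustrative.
VSA_TILE_SIZE = (4, 4, 4)                    # 64 tokens per tile
dit_seq_shape = (16, 30, 52)
sparsity = 0.9

num_tiles = tuple(math.ceil(d / t) for d, t in zip(dit_seq_shape, VSA_TILE_SIZE))
total_seq_length = math.prod(dit_seq_shape)  # 24960 tokens
total_tiles = total_seq_length / math.prod(VSA_TILE_SIZE)

# topk tiles are retained out of ~total_tiles, i.e. roughly (1 - sparsity) of them.
topk = math.ceil((1 - sparsity) * total_tiles)

print(num_tiles)         # (4, 8, 13)
print(total_seq_length)  # 24960
print(topk)              # ceil(0.1 * 390) = 39
```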

diffsynth_engine/models/flux/flux_controlnet.py
@@ -86,7 +86,6 @@ class FluxControlNet(PreTrainedModel):
     def __init__(
         self,
         condition_channels: int = 64,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -103,10 +102,7 @@ class FluxControlNet(PreTrainedModel):
         self.x_embedder = nn.Linear(64, 3072, device=device, dtype=dtype)
         self.controlnet_x_embedder = nn.Linear(condition_channels, 3072)
         self.blocks = nn.ModuleList(
-            [
-                FluxDoubleTransformerBlock(3072, 24, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
-                for _ in range(6)
-            ]
+            [FluxDoubleTransformerBlock(3072, 24, device=device, dtype=dtype) for _ in range(6)]
         )
         # controlnet projection
         self.blocks_proj = nn.ModuleList(
@@ -128,6 +124,7 @@ class FluxControlNet(PreTrainedModel):
         image_ids: torch.Tensor,
         text_ids: torch.Tensor,
         guidance: torch.Tensor,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
         hidden_states = self.x_embedder(hidden_states) + self.controlnet_x_embedder(control_condition)
         condition = (
@@ -141,7 +138,9 @@ class FluxControlNet(PreTrainedModel):
         # double block
         double_block_outputs = []
         for i, block in enumerate(self.blocks):
-            hidden_states, prompt_emb = block(
+            hidden_states, prompt_emb = block(
+                hidden_states, prompt_emb, condition, image_rotary_emb, attn_kwargs=attn_kwargs
+            )
             double_block_outputs.append(self.blocks_proj[i](hidden_states))
 
         # apply control scale
@@ -149,24 +148,13 @@ class FluxControlNet(PreTrainedModel):
         return double_block_outputs, None
 
     @classmethod
-    def from_state_dict(
-        cls,
-        state_dict: Dict[str, torch.Tensor],
-        device: str,
-        dtype: torch.dtype,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
-    ):
+    def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype):
         if "controlnet_x_embedder.weight" in state_dict:
            condition_channels = state_dict["controlnet_x_embedder.weight"].shape[1]
         else:
            condition_channels = 64
 
-        model = cls(
-            condition_channels=condition_channels,
-            attn_kwargs=attn_kwargs,
-            device="meta",
-            dtype=dtype,
-        )
+        model = cls(condition_channels=condition_channels, device="meta", dtype=dtype)
         model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)