diffsynth-engine 0.6.1.dev22__py3-none-any.whl → 0.6.1.dev23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json +41 -0
- diffsynth_engine/configs/pipeline.py +33 -5
- diffsynth_engine/models/basic/attention.py +59 -20
- diffsynth_engine/models/basic/video_sparse_attention.py +235 -0
- diffsynth_engine/models/flux/flux_controlnet.py +7 -19
- diffsynth_engine/models/flux/flux_dit.py +22 -36
- diffsynth_engine/models/flux/flux_dit_fbcache.py +9 -7
- diffsynth_engine/models/flux/flux_ipadapter.py +5 -5
- diffsynth_engine/models/qwen_image/qwen_image_dit.py +13 -15
- diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py +14 -6
- diffsynth_engine/models/wan/wan_dit.py +62 -22
- diffsynth_engine/pipelines/flux_image.py +11 -10
- diffsynth_engine/pipelines/qwen_image.py +3 -10
- diffsynth_engine/pipelines/wan_s2v.py +3 -8
- diffsynth_engine/pipelines/wan_video.py +11 -13
- diffsynth_engine/utils/constants.py +13 -12
- diffsynth_engine/utils/flag.py +6 -0
- diffsynth_engine/utils/parallel.py +51 -6
- {diffsynth_engine-0.6.1.dev22.dist-info → diffsynth_engine-0.6.1.dev23.dist-info}/METADATA +1 -1
- {diffsynth_engine-0.6.1.dev22.dist-info → diffsynth_engine-0.6.1.dev23.dist-info}/RECORD +34 -32
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-flf2v-14b.json → wan2.1_flf2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-i2v-14b.json → wan2.1_i2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-1.3b.json → wan2.1_t2v_1.3b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-14b.json → wan2.1_t2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-i2v-a14b.json → wan2.2_i2v_a14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-s2v-14b.json → wan2.2_s2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-t2v-a14b.json → wan2.2_t2v_a14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-ti2v-5b.json → wan2.2_ti2v_5b.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan2.1-vae.json → wan2.1_vae.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan2.2-vae.json → wan2.2_vae.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan-vae-keymap.json → wan_vae_keymap.json} +0 -0
- {diffsynth_engine-0.6.1.dev22.dist-info → diffsynth_engine-0.6.1.dev23.dist-info}/WHEEL +0 -0
- {diffsynth_engine-0.6.1.dev22.dist-info → diffsynth_engine-0.6.1.dev23.dist-info}/licenses/LICENSE +0 -0
- {diffsynth_engine-0.6.1.dev22.dist-info → diffsynth_engine-0.6.1.dev23.dist-info}/top_level.txt +0 -0
diffsynth_engine/models/wan/wan_dit.py
CHANGED

@@ -17,6 +17,7 @@ from diffsynth_engine.utils.constants import (
     WAN2_2_DIT_TI2V_5B_CONFIG_FILE,
     WAN2_2_DIT_I2V_A14B_CONFIG_FILE,
     WAN2_2_DIT_T2V_A14B_CONFIG_FILE,
+    WAN_DIT_KEYMAP_FILE,
 )
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.fp8_linear import fp8_inference
@@ -30,6 +31,9 @@ from diffsynth_engine.utils.parallel import (
 T5_TOKEN_NUM = 512
 FLF_TOKEN_NUM = 257 * 2
 
+with open(WAN_DIT_KEYMAP_FILE, "r", encoding="utf-8") as f:
+    config = json.load(f)
+
 
 def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor):
     return x * (1 + scale) + shift
@@ -73,7 +77,7 @@ class SelfAttention(nn.Module):
         dim: int,
         num_heads: int,
         eps: float = 1e-6,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
+        use_vsa: bool = False,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -86,19 +90,25 @@ class SelfAttention(nn.Module):
         self.o = nn.Linear(dim, dim, device=device, dtype=dtype)
         self.norm_q = RMSNorm(dim, eps=eps, device=device, dtype=dtype)
         self.norm_k = RMSNorm(dim, eps=eps, device=device, dtype=dtype)
-        self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
+        self.gate_compress = nn.Linear(dim, dim, device=device, dtype=dtype) if use_vsa else None
 
-    def forward(self, x, freqs):
+    def forward(self, x, freqs, attn_kwargs=None):
         q, k, v = self.norm_q(self.q(x)), self.norm_k(self.k(x)), self.v(x)
+        g = self.gate_compress(x) if self.gate_compress is not None else None
+
         num_heads = q.shape[2] // self.head_dim
         q = rearrange(q, "b s (n d) -> b s n d", n=num_heads)
         k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
         v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)
+        g = rearrange(g, "b s (n d) -> b s n d", n=num_heads) if g is not None else None
+
+        attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
         x = attention_ops.attention(
             q=rope_apply(q, freqs),
             k=rope_apply(k, freqs),
             v=v,
-            **self.attn_kwargs,
+            g=g,
+            **attn_kwargs,
         )
         x = x.flatten(2)
         return self.o(x)
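The notable addition here is the optional `gate_compress` projection: when `use_vsa` is set, self-attention computes a per-head gate tensor `g` alongside q/k/v and forwards it to `attention_ops.attention`. The actual kernel lives in the new `diffsynth_engine/models/basic/video_sparse_attention.py`, which is not shown in this extract; the sketch below only illustrates the shapes and the gating idea, with a sigmoid-gated SDPA standing in for the real VSA kernel.

```python
import torch
import torch.nn.functional as F
from einops import rearrange

def gated_attention(q, k, v, g=None):
    # q, k, v, g: (batch, seq, heads, head_dim); sigmoid-gating the output is
    # only a shape-accurate placeholder for the real VSA kernel, which uses g
    # to mix coarse (block-compressed) and fine (sparse) attention branches.
    out = F.scaled_dot_product_attention(
        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
    ).transpose(1, 2)
    if g is not None:
        out = out * torch.sigmoid(g)
    return out

b, s, n, d = 1, 16, 2, 8
x = torch.randn(b, s, n * d)
gate_compress = torch.nn.Linear(n * d, n * d)  # mirrors self.gate_compress
g = rearrange(gate_compress(x), "b s (n d) -> b s n d", n=n)
q = k = v = rearrange(x, "b s (n d) -> b s n d", n=n)
print(gated_attention(q, k, v, g).shape)  # torch.Size([1, 16, 2, 8])
```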
@@ -111,7 +121,6 @@ class CrossAttention(nn.Module):
         num_heads: int,
         eps: float = 1e-6,
         has_image_input: bool = False,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -130,9 +139,8 @@ class CrossAttention(nn.Module):
             self.k_img = nn.Linear(dim, dim, device=device, dtype=dtype)
             self.v_img = nn.Linear(dim, dim, device=device, dtype=dtype)
             self.norm_k_img = RMSNorm(dim, eps=eps, device=device, dtype=dtype)
-        self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
 
-    def forward(self, x: torch.Tensor, y: torch.Tensor):
+    def forward(self, x: torch.Tensor, y: torch.Tensor, attn_kwargs=None):
         if self.has_image_input:
             img = y[:, :-T5_TOKEN_NUM]
             ctx = y[:, -T5_TOKEN_NUM:]
@@ -144,12 +152,16 @@ class CrossAttention(nn.Module):
         k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
         v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)
 
-        x = attention(q, k, v, **self.attn_kwargs).flatten(2)
+        attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
+        if attn_kwargs.get("attn_impl", None) == "vsa":
+            attn_kwargs = attn_kwargs.copy()
+            attn_kwargs["attn_impl"] = "sdpa"
+        x = attention(q, k, v, **attn_kwargs).flatten(2)
         if self.has_image_input:
             k_img, v_img = self.norm_k_img(self.k_img(img)), self.v_img(img)
             k_img = rearrange(k_img, "b s (n d) -> b s n d", n=num_heads)
             v_img = rearrange(v_img, "b s (n d) -> b s n d", n=num_heads)
-            y = attention(q, k_img, v_img, **self.attn_kwargs).flatten(2)
+            y = attention(q, k_img, v_img, **attn_kwargs).flatten(2)
             x = x + y
         return self.o(x)
 
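Cross-attention attends text (and optional image) tokens, which do not form the 3D video grid that video sparse attention needs, so a `vsa` request is downgraded to `sdpa` for this call only. Copying the dict before mutating it keeps the caller's `attn_kwargs` intact for the self-attention path; a minimal sketch of the same pattern:

```python
# Per-call override pattern from the hunk above: downgrade "vsa" to "sdpa"
# for cross-attention without mutating the dict the caller still needs.
def resolve_cross_attn_kwargs(attn_kwargs=None):
    attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
    if attn_kwargs.get("attn_impl", None) == "vsa":
        attn_kwargs = attn_kwargs.copy()  # self-attention still sees "vsa"
        attn_kwargs["attn_impl"] = "sdpa"
    return attn_kwargs

print(resolve_cross_attn_kwargs({"attn_impl": "vsa"}))  # {'attn_impl': 'sdpa'}
```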
@@ -162,7 +174,7 @@ class DiTBlock(nn.Module):
         num_heads: int,
         ffn_dim: int,
         eps: float = 1e-6,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
+        use_vsa: bool = False,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -170,9 +182,9 @@ class DiTBlock(nn.Module):
         self.dim = dim
         self.num_heads = num_heads
         self.ffn_dim = ffn_dim
-        self.self_attn = SelfAttention(dim, num_heads, eps, attn_kwargs, device=device, dtype=dtype)
+        self.self_attn = SelfAttention(dim, num_heads, eps, use_vsa=use_vsa, device=device, dtype=dtype)
         self.cross_attn = CrossAttention(
-            dim, num_heads, eps, has_image_input=has_image_input, attn_kwargs=attn_kwargs, device=device, dtype=dtype
+            dim, num_heads, eps, has_image_input=has_image_input, device=device, dtype=dtype
         )
         self.norm1 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False, device=device, dtype=dtype)
         self.norm2 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False, device=device, dtype=dtype)
@@ -184,14 +196,14 @@ class DiTBlock(nn.Module):
         )
         self.modulation = nn.Parameter(torch.randn(1, 6, dim, device=device, dtype=dtype) / dim**0.5)
 
-    def forward(self, x, context, t_mod, freqs):
+    def forward(self, x, context, t_mod, freqs, attn_kwargs=None):
         # msa: multi-head self-attention mlp: multi-layer perceptron
         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [
             t.squeeze(1) for t in (self.modulation + t_mod).chunk(6, dim=1)
         ]
         input_x = modulate(self.norm1(x), shift_msa, scale_msa)
-        x = x + gate_msa * self.self_attn(input_x, freqs)
-        x = x + self.cross_attn(self.norm3(x), context)
+        x = x + gate_msa * self.self_attn(input_x, freqs, attn_kwargs)
+        x = x + self.cross_attn(self.norm3(x), context, attn_kwargs)
         input_x = modulate(self.norm2(x), shift_mlp, scale_mlp)
         x = x + gate_mlp * self.ffn(input_x)
         return x
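`DiTBlock.forward` now threads `attn_kwargs` through both attention calls; the modulation logic is unchanged. For readers new to this block, the shape sketch below (assumed shapes, illustrative only) shows how the learned `(1, 6, dim)` table plus the timestep embedding splits into shift/scale/gate triples for the attention and MLP sub-blocks.

```python
import torch

dim = 8
modulation = torch.randn(1, 6, dim) / dim**0.5  # learned table, as above
t_mod = torch.randn(2, 6, dim)  # per-timestep embedding, (batch, 6, dim) assumed
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [
    t.squeeze(1) for t in (modulation + t_mod).chunk(6, dim=1)
]
x = torch.randn(2, 16, dim)  # (batch, seq, dim)
# modulate(x, shift, scale) == x * (1 + scale) + shift, broadcast over seq
out = x * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)
print(out.shape)  # torch.Size([2, 16, 8])
```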
@@ -249,7 +261,26 @@ class Head(nn.Module):
 
 
 class WanDiTStateDictConverter(StateDictConverter):
+    def _from_diffusers(self, state_dict):
+        global_rename_dict = config["diffusers"]["global_rename_dict"]
+        rename_dict = config["diffusers"]["rename_dict"]
+        state_dict_ = {}
+        for name, param in state_dict.items():
+            suffix = ""
+            suffix = ".weight" if name.endswith(".weight") else suffix
+            suffix = ".bias" if name.endswith(".bias") else suffix
+            prefix = name[: -len(suffix)] if suffix else name
+            if prefix in global_rename_dict:
+                state_dict_[f"{global_rename_dict[prefix]}{suffix}"] = param
+            if prefix.startswith("blocks."):
+                _, idx, middle = prefix.split(".", 2)
+                if middle in rename_dict:
+                    state_dict_[f"blocks.{idx}.{rename_dict[middle]}{suffix}"] = param
+        return state_dict_
+
     def convert(self, state_dict):
+        if "condition_embedder.time_proj.weight" in state_dict:
+            return self._from_diffusers(state_dict)
         return state_dict
 
 
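`convert` now detects Diffusers-format checkpoints by the `condition_embedder.time_proj.weight` key and renames them via the tables in the new `wan_dit_keymap.json` (loaded into the module-level `config` earlier in this diff). The snippet below mirrors that renaming walk against an illustrative keymap; the entry values are invented for the example, not the shipped file's contents.

```python
import json

# Illustrative keymap in the structure _from_diffusers expects.
config = json.loads("""
{
  "diffusers": {
    "global_rename_dict": {"condition_embedder.time_proj": "time_embedding.0"},
    "rename_dict": {"attn1.norm_q": "self_attn.norm_q"}
  }
}
""")

def rename(name):
    # Same suffix-splitting walk as _from_diffusers above.
    suffix = ".weight" if name.endswith(".weight") else (".bias" if name.endswith(".bias") else "")
    prefix = name[: -len(suffix)] if suffix else name
    if prefix in config["diffusers"]["global_rename_dict"]:
        return config["diffusers"]["global_rename_dict"][prefix] + suffix
    if prefix.startswith("blocks."):
        _, idx, middle = prefix.split(".", 2)
        if middle in config["diffusers"]["rename_dict"]:
            return f'blocks.{idx}.{config["diffusers"]["rename_dict"][middle]}{suffix}'
    return None

print(rename("blocks.0.attn1.norm_q.weight"))        # blocks.0.self_attn.norm_q.weight
print(rename("condition_embedder.time_proj.weight"))  # time_embedding.0.weight
```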
@@ -273,7 +304,7 @@ class WanDiT(PreTrainedModel):
         has_vae_feature: bool = False,
         fuse_image_latents: bool = False,
         flf_pos_emb: bool = False,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
+        use_vsa: bool = False,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -307,7 +338,16 @@ class WanDiT(PreTrainedModel):
         )
         self.blocks = nn.ModuleList(
             [
-                DiTBlock(has_clip_feature, dim, num_heads, ffn_dim, eps, attn_kwargs, device=device, dtype=dtype)
+                DiTBlock(
+                    has_clip_feature,
+                    dim,
+                    num_heads,
+                    ffn_dim,
+                    eps,
+                    use_vsa,
+                    device=device,
+                    dtype=dtype,
+                )
                 for _ in range(num_layers)
             ]
         )
@@ -344,6 +384,7 @@ class WanDiT(PreTrainedModel):
         timestep: torch.Tensor,
         clip_feature: Optional[torch.Tensor] = None,  # clip_vision_encoder(img)
         y: Optional[torch.Tensor] = None,  # vae_encoder(img)
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
         fp8_linear_enabled = getattr(self, "fp8_linear_enabled", False)
         use_cfg = x.shape[0] > 1
@@ -376,7 +417,7 @@ class WanDiT(PreTrainedModel):
 
         with sequence_parallel((x, t, t_mod, freqs), seq_dims=(1, 0, 0, 0)):
             for block in self.blocks:
-                x = block(x, context, t_mod, freqs)
+                x = block(x, context, t_mod, freqs, attn_kwargs)
             x = self.head(x, t)
         (x,) = sequence_parallel_unshard((x,), seq_dims=(1,), seq_lens=(f * h * w,))
         x = self.unpatchify(x, (f, h, w))
@@ -409,12 +450,11 @@ class WanDiT(PreTrainedModel):
         config: Dict[str, Any],
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
-        assign: bool = True,
+        use_vsa: bool = False,
     ):
-        model = cls(**config, device="meta", dtype=dtype, attn_kwargs=attn_kwargs)
+        model = cls(**config, device="meta", dtype=dtype, use_vsa=use_vsa)
         model = model.requires_grad_(False)
-        model.load_state_dict(state_dict, assign=assign)
+        model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model
 
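`from_state_dict` drops the `attn_kwargs` and `assign` parameters: attention options are now supplied per forward call, and `assign=True` is always used with the meta-device construction. A minimal, runnable sketch of that loading pattern, with a generic `nn.Linear` standing in for the WanDiT class:

```python
import torch
import torch.nn as nn

# Construct on the meta device (no real allocations), then let
# load_state_dict adopt the checkpoint tensors directly via assign=True.
with torch.device("meta"):
    model = nn.Linear(4, 4)
model = model.requires_grad_(False)
state_dict = {"weight": torch.randn(4, 4), "bias": torch.randn(4)}
model.load_state_dict(state_dict, assign=True)  # replaces the meta params
model.to(device="cpu", dtype=torch.bfloat16, non_blocking=True)
print(model.weight.device, model.weight.dtype)  # cpu torch.bfloat16
```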
diffsynth_engine/pipelines/flux_image.py
CHANGED

@@ -17,7 +17,12 @@ from diffsynth_engine.models.flux import (
     flux_dit_config,
     flux_text_encoder_config,
 )
-from diffsynth_engine.configs import
+from diffsynth_engine.configs import (
+    FluxPipelineConfig,
+    FluxStateDicts,
+    ControlType,
+    ControlNetParams,
+)
 from diffsynth_engine.models.basic.lora import LoRAContext
 from diffsynth_engine.pipelines import BasePipeline, LoRAStateDictConverter
 from diffsynth_engine.pipelines.utils import accumulate, calculate_shift
@@ -507,20 +512,12 @@ class FluxImagePipeline(BasePipeline):
         vae_encoder = FluxVAEEncoder.from_state_dict(state_dicts.vae, device=init_device, dtype=config.vae_dtype)
 
         with LoRAContext():
-            attn_kwargs = {
-                "attn_impl": config.dit_attn_impl.value,
-                "sparge_smooth_k": config.sparge_smooth_k,
-                "sparge_cdfthreshd": config.sparge_cdfthreshd,
-                "sparge_simthreshd1": config.sparge_simthreshd1,
-                "sparge_pvthreshd": config.sparge_pvthreshd,
-            }
             if config.use_fbcache:
                 dit = FluxDiTFBCache.from_state_dict(
                     state_dicts.model,
                     device=("cpu" if config.use_fsdp else init_device),
                     dtype=config.model_dtype,
                     in_channel=config.control_type.get_in_channel(),
-                    attn_kwargs=attn_kwargs,
                     relative_l1_threshold=config.fbcache_relative_l1_threshold,
                 )
             else:
@@ -529,7 +526,6 @@ class FluxImagePipeline(BasePipeline):
                     device=("cpu" if config.use_fsdp else init_device),
                     dtype=config.model_dtype,
                     in_channel=config.control_type.get_in_channel(),
-                    attn_kwargs=attn_kwargs,
                 )
             if config.use_fp8_linear:
                 enable_fp8_linear(dit)
@@ -755,6 +751,7 @@ class FluxImagePipeline(BasePipeline):
         latents = latents.to(self.dtype)
         self.load_models_to_device(["dit"])
 
+        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
         noise_pred = self.dit(
             hidden_states=latents,
             timestep=timestep,
@@ -766,6 +763,7 @@ class FluxImagePipeline(BasePipeline):
             image_ids=image_ids,
             controlnet_double_block_output=double_block_output,
             controlnet_single_block_output=single_block_output,
+            attn_kwargs=attn_kwargs,
         )
         noise_pred = noise_pred[:, :image_seq_len]
         noise_pred = self.dit.unpatchify(noise_pred, height, width)
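Across all pipelines in this release, the constructor-time `attn_kwargs` dict (visible in the removed lines above) is replaced by a per-denoising-step `self.config.get_attn_kwargs(latents, self.device)` call; the method itself belongs to the `configs/pipeline.py` changes (+33 -5) not shown in this extract. A hypothetical sketch of what such a builder assembles, based only on the fields this diff exposes:

```python
from types import SimpleNamespace

# Hypothetical sketch, not the real implementation in
# diffsynth_engine/configs/pipeline.py, which may differ.
def get_attn_kwargs(config, latents=None, device=None):
    kwargs = {"attn_impl": config.dit_attn_impl.value}
    if kwargs["attn_impl"] == "sparge":
        kwargs.update(
            sparge_smooth_k=config.sparge_smooth_k,
            sparge_cdfthreshd=config.sparge_cdfthreshd,
            sparge_simthreshd1=config.sparge_simthreshd1,
            sparge_pvthreshd=config.sparge_pvthreshd,
        )
    # Building per call (with latents and device at hand) is what lets VSA
    # use runtime shape information a constructor-time dict could not see.
    return kwargs

cfg = SimpleNamespace(dit_attn_impl=SimpleNamespace(value="sdpa"))
print(get_attn_kwargs(cfg))  # {'attn_impl': 'sdpa'}
```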
@@ -887,6 +885,8 @@ class FluxImagePipeline(BasePipeline):
         if self.offload_mode is not None:
             empty_cache()
             param.model.to(self.device)
+
+        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
         double_block_output, single_block_output = param.model(
             hidden_states=latents,
             control_condition=control_condition,
@@ -897,6 +897,7 @@ class FluxImagePipeline(BasePipeline):
             image_ids=image_ids,
             text_ids=text_ids,
             guidance=guidance,
+            attn_kwargs=attn_kwargs,
         )
         if self.offload_mode is not None:
             param.model.to("cpu")
diffsynth_engine/pipelines/qwen_image.py
CHANGED

@@ -91,7 +91,7 @@ class QwenImageLoRAConverter(LoRAStateDictConverter):
             if "lora_A.weight" in key:
                 lora_a_suffix = "lora_A.weight"
                 lora_b_suffix = "lora_B.weight"
-
+
             if lora_a_suffix is None:
                 continue
 
@@ -253,19 +253,11 @@ class QwenImagePipeline(BasePipeline):
         )
 
         with LoRAContext():
-            attn_kwargs = {
-                "attn_impl": config.dit_attn_impl.value,
-                "sparge_smooth_k": config.sparge_smooth_k,
-                "sparge_cdfthreshd": config.sparge_cdfthreshd,
-                "sparge_simthreshd1": config.sparge_simthreshd1,
-                "sparge_pvthreshd": config.sparge_pvthreshd,
-            }
             if config.use_fbcache:
                 dit = QwenImageDiTFBCache.from_state_dict(
                     state_dicts.model,
                     device=("cpu" if config.use_fsdp else init_device),
                     dtype=config.model_dtype,
-                    attn_kwargs=attn_kwargs,
                     relative_l1_threshold=config.fbcache_relative_l1_threshold,
                 )
             else:
@@ -273,7 +265,6 @@ class QwenImagePipeline(BasePipeline):
                     state_dicts.model,
                     device=("cpu" if config.use_fsdp else init_device),
                     dtype=config.model_dtype,
-                    attn_kwargs=attn_kwargs,
                 )
             if config.use_fp8_linear:
                 enable_fp8_linear(dit)
@@ -548,6 +539,7 @@ class QwenImagePipeline(BasePipeline):
         entity_masks: Optional[List[torch.Tensor]] = None,
     ):
         self.load_models_to_device(["dit"])
+        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
         noise_pred = self.dit(
             image=latents,
             edit=image_latents,
@@ -558,6 +550,7 @@ class QwenImagePipeline(BasePipeline):
             entity_text=entity_prompt_embs,
             entity_seq_lens=[mask.sum(dim=1) for mask in entity_prompt_emb_masks] if entity_prompt_emb_masks else None,
             entity_masks=entity_masks,
+            attn_kwargs=attn_kwargs,
         )
         return noise_pred
 
diffsynth_engine/pipelines/wan_s2v.py
CHANGED

@@ -394,6 +394,7 @@ class WanSpeech2VideoPipeline(WanVideoPipeline):
         void_audio_input: torch.Tensor | None = None,
     ):
         latents = latents.to(dtype=self.config.model_dtype, device=self.device)
+        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
 
         noise_pred = model(
             x=latents,
@@ -408,6 +409,7 @@ class WanSpeech2VideoPipeline(WanVideoPipeline):
             drop_motion_frames=drop_motion_frames,
             audio_mask=audio_mask,
             void_audio_input=void_audio_input,
+            attn_kwargs=attn_kwargs,
         )
         return noise_pred
 
@@ -654,19 +656,12 @@ class WanSpeech2VideoPipeline(WanVideoPipeline):
         )
 
         with LoRAContext():
-            attn_kwargs = {
-                "attn_impl": config.dit_attn_impl.value,
-                "sparge_smooth_k": config.sparge_smooth_k,
-                "sparge_cdfthreshd": config.sparge_cdfthreshd,
-                "sparge_simthreshd1": config.sparge_simthreshd1,
-                "sparge_pvthreshd": config.sparge_pvthreshd,
-            }
             dit = WanS2VDiT.from_state_dict(
                 state_dicts.model,
                 config=model_config,
                 device=("cpu" if config.use_fsdp else init_device),
                 dtype=config.model_dtype,
-                attn_kwargs=attn_kwargs,
+                use_vsa=(config.dit_attn_impl.value == "vsa"),
             )
             if config.use_fp8_linear:
                 enable_fp8_linear(dit)
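Note the asymmetry: sparge thresholds were plain runtime kwargs, but VSA changes the module structure (the `gate_compress` projections), so it must be decided at construction via `use_vsa`, derived from the configured attention implementation. A small sketch of that wiring, where `AttnImpl` is a stand-in for the real config enum in `diffsynth_engine.configs`:

```python
from enum import Enum

class AttnImpl(Enum):  # illustrative stand-in, not the package's enum
    SDPA = "sdpa"
    SPARGE = "sparge"
    VSA = "vsa"

dit_attn_impl = AttnImpl.VSA
use_vsa = dit_attn_impl.value == "vsa"  # decides whether gate_compress exists
print(use_vsa)  # True
```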
diffsynth_engine/pipelines/wan_video.py
CHANGED

@@ -323,6 +323,7 @@ class WanVideoPipeline(BasePipeline):
 
     def predict_noise(self, model, latents, image_clip_feature, image_y, timestep, context):
         latents = latents.to(dtype=self.config.model_dtype, device=self.device)
+        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
 
         noise_pred = model(
             x=latents,
@@ -330,6 +331,7 @@ class WanVideoPipeline(BasePipeline):
             context=context,
             clip_feature=image_clip_feature,
             y=image_y,
+            attn_kwargs=attn_kwargs,
         )
         return noise_pred
 
@@ -578,19 +580,12 @@ class WanVideoPipeline(BasePipeline):
         dit_state_dict = state_dicts.model
 
         with LoRAContext():
-            attn_kwargs = {
-                "attn_impl": config.dit_attn_impl.value,
-                "sparge_smooth_k": config.sparge_smooth_k,
-                "sparge_cdfthreshd": config.sparge_cdfthreshd,
-                "sparge_simthreshd1": config.sparge_simthreshd1,
-                "sparge_pvthreshd": config.sparge_pvthreshd,
-            }
             dit = WanDiT.from_state_dict(
                 dit_state_dict,
                 config=dit_config,
                 device=("cpu" if config.use_fsdp else init_device),
                 dtype=config.model_dtype,
-                attn_kwargs=attn_kwargs,
+                use_vsa=(config.dit_attn_impl.value == "vsa"),
            )
             if config.use_fp8_linear:
                 enable_fp8_linear(dit)
@@ -602,7 +597,7 @@ class WanVideoPipeline(BasePipeline):
                 config=dit_config,
                 device=("cpu" if config.use_fsdp else init_device),
                 dtype=config.model_dtype,
-                attn_kwargs=attn_kwargs,
+                use_vsa=(config.dit_attn_impl.value == "vsa"),
             )
             if config.use_fp8_linear:
                 enable_fp8_linear(dit2)
@@ -640,19 +635,22 @@ class WanVideoPipeline(BasePipeline):
     @staticmethod
     def _get_dit_type(model_state_dict: Dict[str, torch.Tensor] | Dict[str, Dict[str, torch.Tensor]]) -> str:
         # determine wan dit type by model params
+        def has_any_key(*xs):
+            return any(x in model_state_dict for x in xs)
+
         dit_type = None
-        if "high_noise_model" in model_state_dict:
+        if has_any_key("high_noise_model"):
             if model_state_dict["high_noise_model"]["patch_embedding.weight"].shape[1] == 36:
                 dit_type = "wan2.2-i2v-a14b"
             elif model_state_dict["high_noise_model"]["patch_embedding.weight"].shape[1] == 16:
                 dit_type = "wan2.2-t2v-a14b"
         elif model_state_dict["patch_embedding.weight"].shape[1] == 48:
             dit_type = "wan2.2-ti2v-5b"
-        elif "img_emb.emb_pos" in model_state_dict:
+        elif has_any_key("img_emb.emb_pos", "condition_embedder.image_embedder.pos_embed"):
             dit_type = "wan2.1-flf2v-14b"
-        elif "img_emb.proj.0.weight" in model_state_dict:
+        elif has_any_key("img_emb.proj.0.weight", "condition_embedder.image_embedder.norm1"):
             dit_type = "wan2.1-i2v-14b"
-        elif "blocks.39.self_attn.norm_q.weight" in model_state_dict:
+        elif has_any_key("blocks.39.self_attn.norm_q.weight", "blocks.39.attn1.norm_q.weight"):
             dit_type = "wan2.1-t2v-14b"
         else:
             dit_type = "wan2.1-t2v-1.3b"
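`_get_dit_type` now recognizes both native Wan key names and their Diffusers-style aliases through the `has_any_key` helper, which is what lets Diffusers checkpoints flow into the keymap conversion added in `wan_dit.py`. A runnable sketch of the probe:

```python
# Dual-format detection as in the hunk above: native Wan names and
# Diffusers-style aliases are checked with the same helper.
model_state_dict = {
    "patch_embedding.weight": None,
    "blocks.39.attn1.norm_q.weight": None,  # Diffusers-style alias
}

def has_any_key(*xs):
    return any(x in model_state_dict for x in xs)

if has_any_key("blocks.39.self_attn.norm_q.weight", "blocks.39.attn1.norm_q.weight"):
    print("wan2.1-t2v-14b")  # matched via the alias
```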
diffsynth_engine/utils/constants.py
CHANGED

@@ -27,18 +27,19 @@ SD3_TEXT_ENCODER_CONFIG_FILE = os.path.join(CONF_PATH, "models", "sd3", "sd3_tex
 SDXL_TEXT_ENCODER_CONFIG_FILE = os.path.join(CONF_PATH, "models", "sdxl", "sdxl_text_encoder.json")
 SDXL_UNET_CONFIG_FILE = os.path.join(CONF_PATH, "models", "sdxl", "sdxl_unet.json")
 
-WAN2_1_DIT_T2V_1_3B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.1-t2v-1.3b.json")
-WAN2_1_DIT_T2V_14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.1-t2v-14b.json")
-WAN2_1_DIT_I2V_14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.1-i2v-14b.json")
-WAN2_1_DIT_FLF2V_14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.1-flf2v-14b.json")
-WAN2_2_DIT_TI2V_5B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.2-ti2v-5b.json")
-WAN2_2_DIT_T2V_A14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.2-t2v-a14b.json")
-WAN2_2_DIT_I2V_A14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.2-i2v-a14b.json")
-WAN2_2_DIT_S2V_14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.2-s2v-14b.json")
-
-WAN2_1_VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "vae", "wan2.1-vae.json")
-WAN2_2_VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "vae", "wan2.2-vae.json")
-WAN_VAE_KEYMAP_FILE = os.path.join(CONF_PATH, "models", "wan", "vae", "wan-vae-keymap.json")
+WAN2_1_DIT_T2V_1_3B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.1_t2v_1.3b.json")
+WAN2_1_DIT_T2V_14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.1_t2v_14b.json")
+WAN2_1_DIT_I2V_14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.1_i2v_14b.json")
+WAN2_1_DIT_FLF2V_14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.1_flf2v_14b.json")
+WAN2_2_DIT_TI2V_5B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.2_ti2v_5b.json")
+WAN2_2_DIT_T2V_A14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.2_t2v_a14b.json")
+WAN2_2_DIT_I2V_A14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.2_i2v_a14b.json")
+WAN2_2_DIT_S2V_14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.2_s2v_14b.json")
+WAN_DIT_KEYMAP_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan_dit_keymap.json")
+
+WAN2_1_VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "vae", "wan2.1_vae.json")
+WAN2_2_VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "vae", "wan2.2_vae.json")
+WAN_VAE_KEYMAP_FILE = os.path.join(CONF_PATH, "models", "wan", "vae", "wan_vae_keymap.json")
 
 QWEN_IMAGE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "qwen_image", "qwen2_5_vl_config.json")
 QWEN_IMAGE_VISION_CONFIG_FILE = os.path.join(CONF_PATH, "models", "qwen_image", "qwen2_5_vl_vision_config.json")
diffsynth_engine/utils/flag.py
CHANGED
@@ -44,3 +44,9 @@ if SPARGE_ATTN_AVAILABLE:
     logger.info("Sparge attention is available")
 else:
     logger.info("Sparge attention is not available")
+
+VIDEO_SPARSE_ATTN_AVAILABLE = importlib.util.find_spec("vsa") is not None
+if VIDEO_SPARSE_ATTN_AVAILABLE:
+    logger.info("Video sparse attention is available")
+else:
+    logger.info("Video sparse attention is not available")
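The flag follows the same pattern as the existing sparge check: `importlib.util.find_spec` probes whether the optional `vsa` kernel package is installed without importing it. One way a caller might consume the flag (the fallback function here is hypothetical, not part of the package):

```python
import importlib.util

# find_spec reports importability without importing, so the probe is cheap
# and side-effect free.
VIDEO_SPARSE_ATTN_AVAILABLE = importlib.util.find_spec("vsa") is not None

def pick_attn_impl(requested: str) -> str:
    # Hypothetical guard: degrade gracefully when the kernels are absent.
    if requested == "vsa" and not VIDEO_SPARSE_ATTN_AVAILABLE:
        return "sdpa"
    return requested

print(pick_attn_impl("vsa"))
```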
diffsynth_engine/utils/parallel.py
CHANGED

@@ -40,10 +40,14 @@ class ProcessGroupSingleton(Singleton):
     def __init__(self):
         self.CFG_GROUP: Optional[dist.ProcessGroup] = None
         self.SP_GROUP: Optional[dist.ProcessGroup] = None
+        self.SP_ULYSSUES_GROUP: Optional[dist.ProcessGroup] = None
+        self.SP_RING_GROUP: Optional[dist.ProcessGroup] = None
         self.TP_GROUP: Optional[dist.ProcessGroup] = None
 
         self.CFG_RANKS: List[int] = []
         self.SP_RANKS: List[int] = []
+        self.SP_ULYSSUES_RANKS: List[int] = []
+        self.SP_RING_RANKS: List[int] = []
         self.TP_RANKS: List[int] = []
 
 
@@ -82,6 +86,38 @@ def get_sp_ranks():
     return PROCESS_GROUP.SP_RANKS
 
 
+def get_sp_ulysses_group():
+    return PROCESS_GROUP.SP_ULYSSUES_GROUP
+
+
+def get_sp_ulysses_world_size():
+    return PROCESS_GROUP.SP_ULYSSUES_GROUP.size() if PROCESS_GROUP.SP_ULYSSUES_GROUP is not None else 1
+
+
+def get_sp_ulysses_rank():
+    return PROCESS_GROUP.SP_ULYSSUES_GROUP.rank() if PROCESS_GROUP.SP_ULYSSUES_GROUP is not None else 0
+
+
+def get_sp_ulysses_ranks():
+    return PROCESS_GROUP.SP_ULYSSUES_RANKS
+
+
+def get_sp_ring_group():
+    return PROCESS_GROUP.SP_RING_GROUP
+
+
+def get_sp_ring_world_size():
+    return PROCESS_GROUP.SP_RING_GROUP.size() if PROCESS_GROUP.SP_RING_GROUP is not None else 1
+
+
+def get_sp_ring_rank():
+    return PROCESS_GROUP.SP_RING_GROUP.rank() if PROCESS_GROUP.SP_RING_GROUP is not None else 0
+
+
+def get_sp_ring_ranks():
+    return PROCESS_GROUP.SP_RING_RANKS
+
+
 def get_tp_group():
     return PROCESS_GROUP.TP_GROUP
 
@@ -127,23 +163,32 @@ def init_parallel_pgs(
     blocks = [list(range(world_size))]
     cfg_groups, cfg_blocks = make_parallel_groups(blocks, cfg_degree)
     for cfg_ranks in cfg_groups:
-        cfg_group = dist.new_group(cfg_ranks)
         if rank in cfg_ranks:
-            PROCESS_GROUP.CFG_GROUP = cfg_group
+            PROCESS_GROUP.CFG_GROUP = dist.new_group(cfg_ranks)
             PROCESS_GROUP.CFG_RANKS = cfg_ranks
 
     sp_groups, sp_blocks = make_parallel_groups(cfg_blocks, sp_degree)
     for sp_ranks in sp_groups:
-        group = dist.new_group(sp_ranks)
         if rank in sp_ranks:
-            PROCESS_GROUP.SP_GROUP = group
+            PROCESS_GROUP.SP_GROUP = dist.new_group(sp_ranks)
             PROCESS_GROUP.SP_RANKS = sp_ranks
 
+    sp_ulysses_groups, sp_ulysses_blocks = make_parallel_groups(cfg_blocks, sp_ulysses_degree)
+    for sp_ulysses_ranks in sp_ulysses_groups:
+        if rank in sp_ulysses_ranks:
+            PROCESS_GROUP.SP_ULYSSUES_GROUP = dist.new_group(sp_ulysses_ranks)
+            PROCESS_GROUP.SP_ULYSSUES_RANKS = sp_ulysses_ranks
+
+    sp_ring_groups, _ = make_parallel_groups(sp_ulysses_blocks, sp_ring_degree)
+    for sp_ring_ranks in sp_ring_groups:
+        if rank in sp_ring_ranks:
+            PROCESS_GROUP.SP_RING_GROUP = dist.new_group(sp_ring_ranks)
+            PROCESS_GROUP.SP_RING_RANKS = sp_ring_ranks
+
     tp_groups, _ = make_parallel_groups(sp_blocks, tp_degree)
     for tp_ranks in tp_groups:
-        group = dist.new_group(tp_ranks)
         if rank in tp_ranks:
-            PROCESS_GROUP.TP_GROUP = group
+            PROCESS_GROUP.TP_GROUP = dist.new_group(tp_ranks)
             PROCESS_GROUP.TP_RANKS = tp_ranks
 
     set_seq_parallel_pg(sp_ulysses_degree, sp_ring_degree, rank, world_size)