diffsynth-engine 0.6.1.dev14__py3-none-any.whl → 0.6.1.dev25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. diffsynth_engine/__init__.py +6 -2
  2. diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json +41 -0
  3. diffsynth_engine/configs/__init__.py +10 -6
  4. diffsynth_engine/configs/pipeline.py +17 -10
  5. diffsynth_engine/models/base.py +1 -1
  6. diffsynth_engine/models/basic/attention.py +59 -20
  7. diffsynth_engine/models/basic/transformer_helper.py +36 -2
  8. diffsynth_engine/models/basic/video_sparse_attention.py +238 -0
  9. diffsynth_engine/models/flux/flux_controlnet.py +7 -19
  10. diffsynth_engine/models/flux/flux_dit.py +27 -38
  11. diffsynth_engine/models/flux/flux_dit_fbcache.py +9 -7
  12. diffsynth_engine/models/flux/flux_ipadapter.py +5 -5
  13. diffsynth_engine/models/qwen_image/qwen2_5_vl.py +5 -0
  14. diffsynth_engine/models/qwen_image/qwen_image_dit.py +28 -34
  15. diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py +14 -6
  16. diffsynth_engine/models/wan/wan_audio_encoder.py +0 -1
  17. diffsynth_engine/models/wan/wan_dit.py +64 -27
  18. diffsynth_engine/pipelines/base.py +36 -4
  19. diffsynth_engine/pipelines/flux_image.py +19 -17
  20. diffsynth_engine/pipelines/qwen_image.py +45 -36
  21. diffsynth_engine/pipelines/sdxl_image.py +1 -1
  22. diffsynth_engine/pipelines/utils.py +52 -0
  23. diffsynth_engine/pipelines/wan_s2v.py +4 -9
  24. diffsynth_engine/pipelines/wan_video.py +43 -19
  25. diffsynth_engine/tokenizers/base.py +6 -0
  26. diffsynth_engine/tokenizers/qwen2.py +12 -4
  27. diffsynth_engine/utils/constants.py +13 -12
  28. diffsynth_engine/utils/flag.py +6 -0
  29. diffsynth_engine/utils/parallel.py +62 -29
  30. {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/METADATA +1 -1
  31. {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/RECORD +45 -43
  32. /diffsynth_engine/conf/models/wan/dit/{wan2.1-flf2v-14b.json → wan2.1_flf2v_14b.json} +0 -0
  33. /diffsynth_engine/conf/models/wan/dit/{wan2.1-i2v-14b.json → wan2.1_i2v_14b.json} +0 -0
  34. /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-1.3b.json → wan2.1_t2v_1.3b.json} +0 -0
  35. /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-14b.json → wan2.1_t2v_14b.json} +0 -0
  36. /diffsynth_engine/conf/models/wan/dit/{wan2.2-i2v-a14b.json → wan2.2_i2v_a14b.json} +0 -0
  37. /diffsynth_engine/conf/models/wan/dit/{wan2.2-s2v-14b.json → wan2.2_s2v_14b.json} +0 -0
  38. /diffsynth_engine/conf/models/wan/dit/{wan2.2-t2v-a14b.json → wan2.2_t2v_a14b.json} +0 -0
  39. /diffsynth_engine/conf/models/wan/dit/{wan2.2-ti2v-5b.json → wan2.2_ti2v_5b.json} +0 -0
  40. /diffsynth_engine/conf/models/wan/vae/{wan2.1-vae.json → wan2.1_vae.json} +0 -0
  41. /diffsynth_engine/conf/models/wan/vae/{wan2.2-vae.json → wan2.2_vae.json} +0 -0
  42. /diffsynth_engine/conf/models/wan/vae/{wan-vae-keymap.json → wan_vae_keymap.json} +0 -0
  43. {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/WHEEL +0 -0
  44. {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/licenses/LICENSE +0 -0
  45. {diffsynth_engine-0.6.1.dev14.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/top_level.txt +0 -0
diffsynth_engine/models/wan/wan_dit.py

@@ -17,6 +17,7 @@ from diffsynth_engine.utils.constants import (
     WAN2_2_DIT_TI2V_5B_CONFIG_FILE,
     WAN2_2_DIT_I2V_A14B_CONFIG_FILE,
     WAN2_2_DIT_T2V_A14B_CONFIG_FILE,
+    WAN_DIT_KEYMAP_FILE,
 )
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.fp8_linear import fp8_inference
@@ -30,6 +31,9 @@ from diffsynth_engine.utils.parallel import (
 T5_TOKEN_NUM = 512
 FLF_TOKEN_NUM = 257 * 2

+with open(WAN_DIT_KEYMAP_FILE, "r", encoding="utf-8") as f:
+    config = json.load(f)
+

 def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor):
     return x * (1 + scale) + shift
@@ -73,7 +77,7 @@ class SelfAttention(nn.Module):
         dim: int,
         num_heads: int,
         eps: float = 1e-6,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
+        use_vsa: bool = False,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -86,19 +90,25 @@ class SelfAttention(nn.Module):
         self.o = nn.Linear(dim, dim, device=device, dtype=dtype)
         self.norm_q = RMSNorm(dim, eps=eps, device=device, dtype=dtype)
         self.norm_k = RMSNorm(dim, eps=eps, device=device, dtype=dtype)
-        self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
+        self.gate_compress = nn.Linear(dim, dim, device=device, dtype=dtype) if use_vsa else None

-    def forward(self, x, freqs):
+    def forward(self, x, freqs, attn_kwargs=None):
         q, k, v = self.norm_q(self.q(x)), self.norm_k(self.k(x)), self.v(x)
+        g = self.gate_compress(x) if self.gate_compress is not None else None
+
         num_heads = q.shape[2] // self.head_dim
         q = rearrange(q, "b s (n d) -> b s n d", n=num_heads)
         k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
         v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)
+        g = rearrange(g, "b s (n d) -> b s n d", n=num_heads) if g is not None else None
+
+        attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
         x = attention_ops.attention(
             q=rope_apply(q, freqs),
             k=rope_apply(k, freqs),
             v=v,
-            **self.attn_kwargs,
+            g=g,
+            **attn_kwargs,
         )
         x = x.flatten(2)
         return self.o(x)
@@ -111,7 +121,6 @@ class CrossAttention(nn.Module):
         num_heads: int,
         eps: float = 1e-6,
         has_image_input: bool = False,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -130,9 +139,8 @@ class CrossAttention(nn.Module):
             self.k_img = nn.Linear(dim, dim, device=device, dtype=dtype)
             self.v_img = nn.Linear(dim, dim, device=device, dtype=dtype)
             self.norm_k_img = RMSNorm(dim, eps=eps, device=device, dtype=dtype)
-        self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}

-    def forward(self, x: torch.Tensor, y: torch.Tensor):
+    def forward(self, x: torch.Tensor, y: torch.Tensor, attn_kwargs=None):
         if self.has_image_input:
             img = y[:, :-T5_TOKEN_NUM]
             ctx = y[:, -T5_TOKEN_NUM:]
@@ -144,12 +152,16 @@ class CrossAttention(nn.Module):
         k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
         v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)

-        x = attention(q, k, v, **self.attn_kwargs).flatten(2)
+        attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
+        if attn_kwargs.get("attn_impl", None) == "vsa":
+            attn_kwargs = attn_kwargs.copy()
+            attn_kwargs["attn_impl"] = "sdpa"
+        x = attention(q, k, v, **attn_kwargs).flatten(2)
         if self.has_image_input:
             k_img, v_img = self.norm_k_img(self.k_img(img)), self.v_img(img)
             k_img = rearrange(k_img, "b s (n d) -> b s n d", n=num_heads)
             v_img = rearrange(v_img, "b s (n d) -> b s n d", n=num_heads)
-            y = attention(q, k_img, v_img, **self.attn_kwargs).flatten(2)
+            y = attention(q, k_img, v_img, **attn_kwargs).flatten(2)
             x = x + y
         return self.o(x)

@@ -162,7 +174,7 @@ class DiTBlock(nn.Module):
         num_heads: int,
         ffn_dim: int,
         eps: float = 1e-6,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
+        use_vsa: bool = False,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -170,9 +182,9 @@ class DiTBlock(nn.Module):
         self.dim = dim
         self.num_heads = num_heads
         self.ffn_dim = ffn_dim
-        self.self_attn = SelfAttention(dim, num_heads, eps, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
+        self.self_attn = SelfAttention(dim, num_heads, eps, use_vsa=use_vsa, device=device, dtype=dtype)
         self.cross_attn = CrossAttention(
-            dim, num_heads, eps, has_image_input=has_image_input, attn_kwargs=attn_kwargs, device=device, dtype=dtype
+            dim, num_heads, eps, has_image_input=has_image_input, device=device, dtype=dtype
         )
         self.norm1 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False, device=device, dtype=dtype)
         self.norm2 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False, device=device, dtype=dtype)
@@ -184,14 +196,14 @@ class DiTBlock(nn.Module):
         )
         self.modulation = nn.Parameter(torch.randn(1, 6, dim, device=device, dtype=dtype) / dim**0.5)

-    def forward(self, x, context, t_mod, freqs):
+    def forward(self, x, context, t_mod, freqs, attn_kwargs=None):
         # msa: multi-head self-attention  mlp: multi-layer perceptron
         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [
             t.squeeze(1) for t in (self.modulation + t_mod).chunk(6, dim=1)
         ]
         input_x = modulate(self.norm1(x), shift_msa, scale_msa)
-        x = x + gate_msa * self.self_attn(input_x, freqs)
-        x = x + self.cross_attn(self.norm3(x), context)
+        x = x + gate_msa * self.self_attn(input_x, freqs, attn_kwargs)
+        x = x + self.cross_attn(self.norm3(x), context, attn_kwargs)
         input_x = modulate(self.norm2(x), shift_mlp, scale_mlp)
         x = x + gate_mlp * self.ffn(input_x)
         return x
@@ -249,7 +261,26 @@ class Head(nn.Module):


 class WanDiTStateDictConverter(StateDictConverter):
+    def _from_diffusers(self, state_dict):
+        global_rename_dict = config["diffusers"]["global_rename_dict"]
+        rename_dict = config["diffusers"]["rename_dict"]
+        state_dict_ = {}
+        for name, param in state_dict.items():
+            suffix = ""
+            suffix = ".weight" if name.endswith(".weight") else suffix
+            suffix = ".bias" if name.endswith(".bias") else suffix
+            prefix = name[: -len(suffix)] if suffix else name
+            if prefix in global_rename_dict:
+                state_dict_[f"{global_rename_dict[prefix]}{suffix}"] = param
+            if prefix.startswith("blocks."):
+                _, idx, middle = prefix.split(".", 2)
+                if middle in rename_dict:
+                    state_dict_[f"blocks.{idx}.{rename_dict[middle]}{suffix}"] = param
+        return state_dict_
+
     def convert(self, state_dict):
+        if "condition_embedder.time_proj.weight" in state_dict:
+            return self._from_diffusers(state_dict)
         return state_dict


@@ -273,7 +304,7 @@ class WanDiT(PreTrainedModel):
         has_vae_feature: bool = False,
         fuse_image_latents: bool = False,
         flf_pos_emb: bool = False,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
+        use_vsa: bool = False,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -307,7 +338,16 @@ class WanDiT(PreTrainedModel):
         )
         self.blocks = nn.ModuleList(
             [
-                DiTBlock(has_clip_feature, dim, num_heads, ffn_dim, eps, attn_kwargs, device=device, dtype=dtype)
+                DiTBlock(
+                    has_clip_feature,
+                    dim,
+                    num_heads,
+                    ffn_dim,
+                    eps,
+                    use_vsa,
+                    device=device,
+                    dtype=dtype,
+                )
                 for _ in range(num_layers)
             ]
         )
@@ -344,6 +384,7 @@ class WanDiT(PreTrainedModel):
         timestep: torch.Tensor,
         clip_feature: Optional[torch.Tensor] = None,  # clip_vision_encoder(img)
         y: Optional[torch.Tensor] = None,  # vae_encoder(img)
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
         fp8_linear_enabled = getattr(self, "fp8_linear_enabled", False)
         use_cfg = x.shape[0] > 1
@@ -376,7 +417,7 @@ class WanDiT(PreTrainedModel):

         with sequence_parallel((x, t, t_mod, freqs), seq_dims=(1, 0, 0, 0)):
             for block in self.blocks:
-                x = block(x, context, t_mod, freqs)
+                x = block(x, context, t_mod, freqs, attn_kwargs)
             x = self.head(x, t)
             (x,) = sequence_parallel_unshard((x,), seq_dims=(1,), seq_lens=(f * h * w,))
             x = self.unpatchify(x, (f, h, w))
@@ -409,12 +450,11 @@ class WanDiT(PreTrainedModel):
         config: Dict[str, Any],
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
-        assign: bool = True,
+        use_vsa: bool = False,
     ):
-        model = cls(**config, device="meta", dtype=dtype, attn_kwargs=attn_kwargs)
+        model = cls(**config, device="meta", dtype=dtype, use_vsa=use_vsa)
         model = model.requires_grad_(False)
-        model.load_state_dict(state_dict, assign=assign)
+        model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model

@@ -499,8 +539,5 @@ class WanDiT(PreTrainedModel):
         for block in self.blocks:
             block.compile(*args, **kwargs)

-        for block in self.single_blocks:
-            block.compile(*args, **kwargs)
-
-    def get_fsdp_modules(self):
-        return ["blocks"]
+    def get_fsdp_module_cls(self):
+        return {DiTBlock}
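Net effect of the wan_dit.py hunks: the removed attn_kwargs constructor argument becomes a per-call dictionary that WanDiT.forward threads through every DiTBlock into SelfAttention and CrossAttention (cross-attention downgrades "vsa" to "sdpa", since sparse video attention only applies to the spatio-temporal self-attention). A rough call sketch follows; the positional names x and context are taken from the block-level call above, not from the full forward signature, which this diff does not show, so treat everything except timestep and attn_kwargs as an assumption:

    # Sketch only, assuming a constructed WanDiT instance `dit` and prepared inputs.
    # Only timestep and attn_kwargs are keywords confirmed by the new forward signature.
    attn_kwargs = {"attn_impl": "sdpa"}  # or the sparge / VSA dict a pipeline builds per step
    noise_pred = dit(x, context, timestep=timestep, attn_kwargs=attn_kwargs)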
diffsynth_engine/pipelines/base.py

@@ -2,10 +2,18 @@ import os
 import torch
 import numpy as np
 from einops import rearrange
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Tuple, Union, Optional
 from PIL import Image

-from diffsynth_engine.configs import BaseConfig, BaseStateDicts, LoraConfig
+from diffsynth_engine.configs import (
+    BaseConfig,
+    BaseStateDicts,
+    LoraConfig,
+    AttnImpl,
+    SpargeAttentionParams,
+    VideoSparseAttentionParams,
+)
+from diffsynth_engine.models.basic.video_sparse_attention import get_vsa_kwargs
 from diffsynth_engine.utils.offload import enable_sequential_cpu_offload, offload_model_to_dict, restore_model_from_dict
 from diffsynth_engine.utils.fp8_linear import enable_fp8_autocast
 from diffsynth_engine.utils.gguf import load_gguf_checkpoint
@@ -33,6 +41,7 @@ class BasePipeline:
         dtype=torch.float16,
     ):
         super().__init__()
+        self.config = None
         self.vae_tiled = vae_tiled
         self.vae_tile_size = vae_tile_size
         self.vae_tile_stride = vae_tile_stride
@@ -48,7 +57,7 @@ class BasePipeline:
         raise NotImplementedError()

     @classmethod
-    def from_state_dict(cls, state_dicts: BaseStateDicts, pipeline_config: BaseConfig) -> "BasePipeline":
+    def from_state_dict(cls, state_dicts: BaseStateDicts, config: BaseConfig) -> "BasePipeline":
         raise NotImplementedError()

     def update_weights(self, state_dicts: BaseStateDicts) -> None:
@@ -70,7 +79,11 @@ class BasePipeline:
         lora_list: List[Tuple[str, Union[float, LoraConfig]]],
         fused: bool = True,
         save_original_weight: bool = False,
+        lora_converter: Optional[LoRAStateDictConverter] = None,
     ):
+        if not lora_converter:
+            lora_converter = self.lora_converter
+
         for lora_path, lora_item in lora_list:
             if isinstance(lora_item, float):
                 lora_scale = lora_item
@@ -86,7 +99,7 @@ class BasePipeline:
                 self.apply_scheduler_config(scheduler_config)
                 logger.info(f"Applied scheduler args from LoraConfig: {scheduler_config}")

-            lora_state_dict = self.lora_converter.convert(state_dict)
+            lora_state_dict = lora_converter.convert(state_dict)
             for model_name, state_dict in lora_state_dict.items():
                 model = getattr(self, model_name)
                 lora_args = []
@@ -256,6 +269,25 @@ class BasePipeline:
         )
         return init_latents, latents, sigmas, timesteps

+    def get_attn_kwargs(self, latents: torch.Tensor) -> Dict:
+        attn_kwargs = {"attn_impl": self.config.dit_attn_impl.value}
+        if isinstance(self.config.attn_params, SpargeAttentionParams):
+            assert self.config.dit_attn_impl == AttnImpl.SPARGE
+            attn_kwargs.update(
+                {
+                    "smooth_k": self.config.attn_params.smooth_k,
+                    "simthreshd1": self.config.attn_params.simthreshd1,
+                    "cdfthreshd": self.config.attn_params.cdfthreshd,
+                    "pvthreshd": self.config.attn_params.pvthreshd,
+                }
+            )
+        elif isinstance(self.config.attn_params, VideoSparseAttentionParams):
+            assert self.config.dit_attn_impl == AttnImpl.VSA
+            attn_kwargs.update(
+                get_vsa_kwargs(latents.shape[2:], (1, 2, 2), self.config.attn_params.sparsity, device=self.device)
+            )
+        return attn_kwargs
+
     def eval(self):
         for model_name in self.model_names:
             model = getattr(self, model_name)
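The new BasePipeline.get_attn_kwargs replaces the per-model attn_kwargs constructor arguments removed elsewhere in this diff. A hedged usage sketch of how a pipeline config would select the VSA path: only the field names dit_attn_impl, attn_params, and sparsity appear in the hunk above; the constructor call below is an assumption, and the sparsity value is illustrative.

    from diffsynth_engine.configs import AttnImpl, VideoSparseAttentionParams

    # Assumed configuration style; the real config classes may take these fields differently.
    config.dit_attn_impl = AttnImpl.VSA
    config.attn_params = VideoSparseAttentionParams(sparsity=0.9)

    # During denoising the pipeline then calls, once per step:
    #   attn_kwargs = self.get_attn_kwargs(latents)
    # and forwards the resulting dict into the DiT, e.g. dit(..., attn_kwargs=attn_kwargs).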
diffsynth_engine/pipelines/flux_image.py

@@ -17,7 +17,12 @@ from diffsynth_engine.models.flux import (
     flux_dit_config,
     flux_text_encoder_config,
 )
-from diffsynth_engine.configs import FluxPipelineConfig, FluxStateDicts, ControlType, ControlNetParams
+from diffsynth_engine.configs import (
+    FluxPipelineConfig,
+    FluxStateDicts,
+    ControlType,
+    ControlNetParams,
+)
 from diffsynth_engine.models.basic.lora import LoRAContext
 from diffsynth_engine.pipelines import BasePipeline, LoRAStateDictConverter
 from diffsynth_engine.pipelines.utils import accumulate, calculate_shift
@@ -143,7 +148,7 @@ class FluxLoRAConverter(LoRAStateDictConverter):
             layer_id, layer_type = name.split("_", 1)
             layer_type = layer_type.replace("self_attn_", "self_attn.").replace("mlp_", "mlp.")
             rename = ".".join(["encoders", layer_id, clip_attn_rename_dict[layer_type]])
-
+
             lora_args = {}
             lora_args["alpha"] = param
             lora_args["up"] = lora_state_dict[origin_key.replace(".alpha", ".lora_up.weight")]
@@ -507,29 +512,20 @@ class FluxImagePipeline(BasePipeline):
         vae_encoder = FluxVAEEncoder.from_state_dict(state_dicts.vae, device=init_device, dtype=config.vae_dtype)

         with LoRAContext():
-            attn_kwargs = {
-                "attn_impl": config.dit_attn_impl.value,
-                "sparge_smooth_k": config.sparge_smooth_k,
-                "sparge_cdfthreshd": config.sparge_cdfthreshd,
-                "sparge_simthreshd1": config.sparge_simthreshd1,
-                "sparge_pvthreshd": config.sparge_pvthreshd,
-            }
             if config.use_fbcache:
                 dit = FluxDiTFBCache.from_state_dict(
                     state_dicts.model,
-                    device=init_device,
+                    device=("cpu" if config.use_fsdp else init_device),
                     dtype=config.model_dtype,
                     in_channel=config.control_type.get_in_channel(),
-                    attn_kwargs=attn_kwargs,
                     relative_l1_threshold=config.fbcache_relative_l1_threshold,
                 )
             else:
                 dit = FluxDiT.from_state_dict(
                     state_dicts.model,
-                    device=init_device,
+                    device=("cpu" if config.use_fsdp else init_device),
                     dtype=config.model_dtype,
                     in_channel=config.control_type.get_in_channel(),
-                    attn_kwargs=attn_kwargs,
                 )
             if config.use_fp8_linear:
                 enable_fp8_linear(dit)
@@ -573,7 +569,7 @@ class FluxImagePipeline(BasePipeline):
         self.update_component(self.vae_encoder, state_dicts.vae, self.config.device, self.config.vae_dtype)

     def compile(self):
-        self.dit.compile_repeated_blocks(dynamic=True)
+        self.dit.compile_repeated_blocks()

     def load_loras(self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False):
         assert self.config.tp_degree is None or self.config.tp_degree == 1, (
@@ -755,6 +751,7 @@ class FluxImagePipeline(BasePipeline):
         latents = latents.to(self.dtype)
         self.load_models_to_device(["dit"])

+        attn_kwargs = self.get_attn_kwargs(latents)
         noise_pred = self.dit(
             hidden_states=latents,
             timestep=timestep,
@@ -766,6 +763,7 @@ class FluxImagePipeline(BasePipeline):
             image_ids=image_ids,
             controlnet_double_block_output=double_block_output,
             controlnet_single_block_output=single_block_output,
+            attn_kwargs=attn_kwargs,
         )
         noise_pred = noise_pred[:, :image_seq_len]
         noise_pred = self.dit.unpatchify(noise_pred, height, width)
@@ -830,7 +828,7 @@ class FluxImagePipeline(BasePipeline):
             masked_image = image.clone()
             masked_image[(mask > 0.5).repeat(1, 3, 1, 1)] = -1
             latent = self.encode_image(masked_image)
-            mask = torch.nn.functional.interpolate(mask, size=(latent.shape[2], latent.shape[3]))
+            mask = torch.nn.functional.interpolate(mask, size=(latent.shape[2], latent.shape[3])).to(latent.dtype)
             mask = 1 - mask
             latent = torch.cat([latent, mask], dim=1)
         elif self.config.control_type == ControlType.bfl_fill:
@@ -887,6 +885,8 @@ class FluxImagePipeline(BasePipeline):
             if self.offload_mode is not None:
                 empty_cache()
                 param.model.to(self.device)
+
+            attn_kwargs = self.get_attn_kwargs(latents)
             double_block_output, single_block_output = param.model(
                 hidden_states=latents,
                 control_condition=control_condition,
@@ -897,6 +897,7 @@ class FluxImagePipeline(BasePipeline):
                 image_ids=image_ids,
                 text_ids=text_ids,
                 guidance=guidance,
+                attn_kwargs=attn_kwargs,
             )
             if self.offload_mode is not None:
                 param.model.to("cpu")
@@ -983,8 +984,9 @@ class FluxImagePipeline(BasePipeline):
         elif self.ip_adapter is not None:
             image_emb = self.ip_adapter.encode_image(ref_image)
         elif self.redux is not None:
-            image_prompt_embeds = self.redux(ref_image)
-            positive_prompt_emb = torch.cat([positive_prompt_emb, image_prompt_embeds], dim=1)
+            ref_prompt_embeds = self.redux(ref_image)
+            flattened_ref_emb = ref_prompt_embeds.view(1, -1, ref_prompt_embeds.size(-1))
+            positive_prompt_emb = torch.cat([positive_prompt_emb, flattened_ref_emb], dim=1)

         # Extra input
         image_ids, text_ids, guidance = self.prepare_extra_input(
diffsynth_engine/pipelines/qwen_image.py

@@ -24,7 +24,7 @@ from diffsynth_engine.models.qwen_image import (
 from diffsynth_engine.models.qwen_image import QwenImageVAE
 from diffsynth_engine.tokenizers import Qwen2TokenizerFast, Qwen2VLProcessor
 from diffsynth_engine.pipelines import BasePipeline, LoRAStateDictConverter
-from diffsynth_engine.pipelines.utils import calculate_shift
+from diffsynth_engine.pipelines.utils import calculate_shift, pad_and_concat
 from diffsynth_engine.algorithm.noise_scheduler import RecifitedFlowScheduler
 from diffsynth_engine.algorithm.sampler import FlowMatchEulerSampler
 from diffsynth_engine.utils.constants import (
@@ -91,7 +91,7 @@ class QwenImageLoRAConverter(LoRAStateDictConverter):
             if "lora_A.weight" in key:
                 lora_a_suffix = "lora_A.weight"
                 lora_b_suffix = "lora_B.weight"
-
+
             if lora_a_suffix is None:
                 continue

@@ -147,9 +147,18 @@ class QwenImagePipeline(BasePipeline):
         self.prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
         self.prompt_template_encode_start_idx = 34
         # qwen image edit
-        self.edit_prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
+        self.edit_system_prompt = "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate."
+        self.edit_prompt_template_encode = (
+            "<|im_start|>system\n"
+            + self.edit_system_prompt
+            + "<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
+        )
         # qwen image edit plus
-        self.edit_plus_prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+        self.edit_plus_prompt_template_encode = (
+            "<|im_start|>system\n"
+            + self.edit_system_prompt
+            + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+        )

         self.edit_prompt_template_encode_start_idx = 64

@@ -185,6 +194,7 @@ class QwenImagePipeline(BasePipeline):
         logger.info(f"loading state dict from {config.vae_path} ...")
         vae_state_dict = cls.load_model_checkpoint(config.vae_path, device="cpu", dtype=config.vae_dtype)

+        encoder_state_dict = None
         if config.encoder_path is None:
             config.encoder_path = fetch_model(
                 "MusePublic/Qwen-image",
@@ -196,8 +206,11 @@ class QwenImagePipeline(BasePipeline):
                     "text_encoder/model-00004-of-00004.safetensors",
                 ],
             )
-        logger.info(f"loading state dict from {config.encoder_path} ...")
-        encoder_state_dict = cls.load_model_checkpoint(config.encoder_path, device="cpu", dtype=config.encoder_dtype)
+        if config.load_encoder:
+            logger.info(f"loading state dict from {config.encoder_path} ...")
+            encoder_state_dict = cls.load_model_checkpoint(
+                config.encoder_path, device="cpu", dtype=config.encoder_dtype
+            )

         state_dicts = QwenImageStateDicts(
             model=model_state_dict,
@@ -224,22 +237,25 @@ class QwenImagePipeline(BasePipeline):
     @classmethod
     def _from_state_dict(cls, state_dicts: QwenImageStateDicts, config: QwenImagePipelineConfig) -> "QwenImagePipeline":
         init_device = "cpu" if config.offload_mode is not None else config.device
-        tokenizer = Qwen2TokenizerFast.from_pretrained(QWEN_IMAGE_TOKENIZER_CONF_PATH)
-        processor = Qwen2VLProcessor.from_pretrained(
-            tokenizer_config_path=QWEN_IMAGE_TOKENIZER_CONF_PATH,
-            image_processor_config_path=QWEN_IMAGE_PROCESSOR_CONFIG_FILE,
-        )
-        with open(QWEN_IMAGE_VISION_CONFIG_FILE, "r", encoding="utf-8") as f:
-            vision_config = Qwen2_5_VLVisionConfig(**json.load(f))
-        with open(QWEN_IMAGE_CONFIG_FILE, "r", encoding="utf-8") as f:
-            text_config = Qwen2_5_VLConfig(**json.load(f))
-        encoder = Qwen2_5_VLForConditionalGeneration.from_state_dict(
-            state_dicts.encoder,
-            vision_config=vision_config,
-            config=text_config,
-            device=init_device,
-            dtype=config.encoder_dtype,
-        )
+        tokenizer, processor, encoder = None, None, None
+        if config.load_encoder:
+            tokenizer = Qwen2TokenizerFast.from_pretrained(QWEN_IMAGE_TOKENIZER_CONF_PATH)
+            processor = Qwen2VLProcessor.from_pretrained(
+                tokenizer_config_path=QWEN_IMAGE_TOKENIZER_CONF_PATH,
+                image_processor_config_path=QWEN_IMAGE_PROCESSOR_CONFIG_FILE,
+            )
+            with open(QWEN_IMAGE_VISION_CONFIG_FILE, "r", encoding="utf-8") as f:
+                vision_config = Qwen2_5_VLVisionConfig(**json.load(f))
+            with open(QWEN_IMAGE_CONFIG_FILE, "r", encoding="utf-8") as f:
+                text_config = Qwen2_5_VLConfig(**json.load(f))
+            encoder = Qwen2_5_VLForConditionalGeneration.from_state_dict(
+                state_dicts.encoder,
+                vision_config=vision_config,
+                config=text_config,
+                device=("cpu" if config.use_fsdp else init_device),
+                dtype=config.encoder_dtype,
+            )
+
         with open(QWEN_IMAGE_VAE_CONFIG_FILE, "r", encoding="utf-8") as f:
             vae_config = json.load(f)
         vae = QwenImageVAE.from_state_dict(
@@ -247,27 +263,18 @@ class QwenImagePipeline(BasePipeline):
         )

         with LoRAContext():
-            attn_kwargs = {
-                "attn_impl": config.dit_attn_impl.value,
-                "sparge_smooth_k": config.sparge_smooth_k,
-                "sparge_cdfthreshd": config.sparge_cdfthreshd,
-                "sparge_simthreshd1": config.sparge_simthreshd1,
-                "sparge_pvthreshd": config.sparge_pvthreshd,
-            }
             if config.use_fbcache:
                 dit = QwenImageDiTFBCache.from_state_dict(
                     state_dicts.model,
-                    device=init_device,
+                    device=("cpu" if config.use_fsdp else init_device),
                     dtype=config.model_dtype,
-                    attn_kwargs=attn_kwargs,
                     relative_l1_threshold=config.fbcache_relative_l1_threshold,
                 )
             else:
                 dit = QwenImageDiT.from_state_dict(
                     state_dicts.model,
-                    device=init_device,
+                    device=("cpu" if config.use_fsdp else init_device),
                     dtype=config.model_dtype,
-                    attn_kwargs=attn_kwargs,
                 )
             if config.use_fp8_linear:
                 enable_fp8_linear(dit)
@@ -307,7 +314,7 @@ class QwenImagePipeline(BasePipeline):
         self.update_component(self.vae, state_dicts.vae, self.config.device, self.config.vae_dtype)

     def compile(self):
-        self.dit.compile_repeated_blocks(dynamic=True)
+        self.dit.compile_repeated_blocks()

     def load_loras(self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False):
         assert self.config.tp_degree is None or self.config.tp_degree == 1, (
@@ -493,8 +500,8 @@ class QwenImagePipeline(BasePipeline):
         else:
             # cfg by predict noise in one batch
             bs, _, h, w = latents.shape
-            prompt_emb = torch.cat([prompt_emb, negative_prompt_emb], dim=0)
-            prompt_emb_mask = torch.cat([prompt_emb_mask, negative_prompt_emb_mask], dim=0)
+            prompt_emb = pad_and_concat(prompt_emb, negative_prompt_emb)
+            prompt_emb_mask = pad_and_concat(prompt_emb_mask, negative_prompt_emb_mask)
             if entity_prompt_embs is not None:
                 entity_prompt_embs = [
                     torch.cat([x, y], dim=0) for x, y in zip(entity_prompt_embs, negative_entity_prompt_embs)
@@ -542,6 +549,7 @@ class QwenImagePipeline(BasePipeline):
         entity_masks: Optional[List[torch.Tensor]] = None,
     ):
         self.load_models_to_device(["dit"])
+        attn_kwargs = self.get_attn_kwargs(latents)
         noise_pred = self.dit(
             image=latents,
             edit=image_latents,
@@ -552,6 +560,7 @@ class QwenImagePipeline(BasePipeline):
             entity_text=entity_prompt_embs,
             entity_seq_lens=[mask.sum(dim=1) for mask in entity_prompt_emb_masks] if entity_prompt_emb_masks else None,
             entity_masks=entity_masks,
+            attn_kwargs=attn_kwargs,
         )
         return noise_pred

diffsynth_engine/pipelines/sdxl_image.py

@@ -181,7 +181,7 @@ class SDXLImagePipeline(BasePipeline):

     @classmethod
     def from_state_dict(cls, state_dicts: SDXLStateDicts, config: SDXLPipelineConfig) -> "SDXLImagePipeline":
-        init_device = "cpu" if config.offload_mode else config.device
+        init_device = "cpu" if config.offload_mode is not None else config.device
         tokenizer = CLIPTokenizer.from_pretrained(SDXL_TOKENIZER_CONF_PATH)
         tokenizer_2 = CLIPTokenizer.from_pretrained(SDXL_TOKENIZER_2_CONF_PATH)
         with LoRAContext():
diffsynth_engine/pipelines/utils.py

@@ -1,3 +1,7 @@
+import torch
+import torch.nn.functional as F
+
+
 def accumulate(result, new_item):
     if result is None:
         return new_item
@@ -17,3 +21,51 @@ def calculate_shift(
     b = base_shift - m * base_seq_len
     mu = image_seq_len * m + b
     return mu
+
+
+def pad_and_concat(
+    tensor1: torch.Tensor,
+    tensor2: torch.Tensor,
+    concat_dim: int = 0,
+    pad_dim: int = 1,
+) -> torch.Tensor:
+    """
+    Concatenate two tensors along a specified dimension after padding along another dimension.
+
+    Assumes input tensors have shape (b, s, d), where:
+    - b: batch dimension
+    - s: sequence dimension (may differ)
+    - d: feature dimension
+
+    Args:
+        tensor1: First tensor with shape (b1, s1, d)
+        tensor2: Second tensor with shape (b2, s2, d)
+        concat_dim: Dimension to concatenate along, default is 0 (batch dimension)
+        pad_dim: Dimension to pad along, default is 1 (sequence dimension)
+
+    Returns:
+        Concatenated tensor, shape depends on concat_dim and pad_dim choices
+    """
+    assert tensor1.dim() == tensor2.dim(), "Both tensors must have the same number of dimensions"
+    assert concat_dim != pad_dim, "concat_dim and pad_dim cannot be the same"
+
+    len1, len2 = tensor1.shape[pad_dim], tensor2.shape[pad_dim]
+    max_len = max(len1, len2)
+
+    # Calculate the position of pad_dim in the padding list
+    # Padding format: from the last dimension, each pair represents (dim_n_left, dim_n_right, ..., dim_0_left, dim_0_right)
+    ndim = tensor1.dim()
+    padding = [0] * (2 * ndim)
+    pad_right_idx = -2 * pad_dim - 1
+
+    if len1 < max_len:
+        pad_len = max_len - len1
+        padding[pad_right_idx] = pad_len
+        tensor1 = F.pad(tensor1, padding, mode="constant", value=0)
+    elif len2 < max_len:
+        pad_len = max_len - len2
+        padding[pad_right_idx] = pad_len
+        tensor2 = F.pad(tensor2, padding, mode="constant", value=0)
+
+    # Concatenate along the specified dimension
+    return torch.cat([tensor1, tensor2], dim=concat_dim)
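The new pad_and_concat is used by qwen_image.py above to batch a positive and a negative prompt embedding whose sequence lengths differ. A small self-contained check of the shape behavior, with tensor sizes chosen only for illustration:

    import torch
    from diffsynth_engine.pipelines.utils import pad_and_concat

    pos = torch.randn(1, 77, 3584)   # positive prompt embedding (sizes illustrative)
    neg = torch.randn(1, 50, 3584)   # shorter negative prompt embedding
    out = pad_and_concat(pos, neg)   # neg is zero-padded to length 77 along dim 1,
                                     # then the two are concatenated along dim 0
    assert out.shape == (2, 77, 3584)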