diffsynth-engine 0.6.1.dev32__py3-none-any.whl → 0.6.1.dev33__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
diffsynth_engine/configs/pipeline.py

@@ -251,11 +251,14 @@ class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfi
     # override OptimizationConfig
     fbcache_relative_l1_threshold = 0.009
 
-    # svd
-    use_nunchaku: Optional[bool] = field(default=None, init=False)
-    use_nunchaku_awq: Optional[bool] = field(default=None, init=False)
-    use_nunchaku_attn: Optional[bool] = field(default=None, init=False)
-
+    # svd
+    use_nunchaku: Optional[bool] = field(default=None, init=False)
+    use_nunchaku_awq: Optional[bool] = field(default=None, init=False)
+    use_nunchaku_attn: Optional[bool] = field(default=None, init=False)
+
+    # for 2511
+    use_zero_cond_t: bool = False
+
     @classmethod
     def basic_config(
         cls,
@@ -266,6 +269,7 @@ class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfi
         parallelism: int = 1,
         offload_mode: Optional[str] = None,
         offload_to_disk: bool = False,
+        use_zero_cond_t: bool = False,
    ) -> "QwenImagePipelineConfig":
        return cls(
            model_path=model_path,
@@ -277,6 +281,7 @@ class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfi
            use_fsdp=True if parallelism > 1 else False,
            offload_mode=offload_mode,
            offload_to_disk=offload_to_disk,
+           use_zero_cond_t=use_zero_cond_t,
        )
 
    def __post_init__(self):
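Taken together, these three hunks add a `use_zero_cond_t` dataclass field and thread it through `basic_config`. A minimal usage sketch follows; the model path is a placeholder and the import path is inferred from the RECORD entry for `diffsynth_engine/configs/pipeline.py`, so treat both as assumptions:

```python
# Hypothetical usage sketch: only the use_zero_cond_t kwarg is new in this release.
from diffsynth_engine.configs.pipeline import QwenImagePipelineConfig  # path assumed from RECORD

config = QwenImagePipelineConfig.basic_config(
    model_path="/path/to/qwen-image-dit.safetensors",  # placeholder path
    offload_mode=None,
    use_zero_cond_t=True,  # opt in to the "2511" zero-conditioning-timestep behavior
)
assert config.use_zero_cond_t
```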
diffsynth_engine/models/basic/attention.py

@@ -94,6 +94,7 @@ if SPARGE_ATTN_AVAILABLE:
        )
        return out.transpose(1, 2)
 
+
 if AITER_AVAILABLE:
     from aiter import flash_attn_func as aiter_flash_attn
     from aiter import flash_attn_fp8_pertensor_func as aiter_flash_attn_fp8
@@ -203,7 +204,7 @@ def attention(
        )
    if attn_mask is not None:
        raise RuntimeError("aiter_flash_attn does not support attention mask")
-   if attn_impl == "aiter" :
+   if attn_impl == "aiter":
        return aiter_flash_attn(q, k, v, softmax_scale=scale)
    else:
        origin_dtype = q.dtype
@@ -211,7 +212,7 @@ def attention(
        k = k.to(dtype=DTYPE_FP8)
        v = v.to(dtype=DTYPE_FP8)
        out = aiter_flash_attn_fp8(q, k, v, softmax_scale=scale)
-   return out.to(dtype=origin_dtype)
+       return out.to(dtype=origin_dtype)
    if attn_impl == "fa2":
        return flash_attn2(q, k, v, softmax_scale=scale)
    if attn_impl == "xformers":
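The second change here is whitespace-only in the rendered diff; the visible effect is that `return out.to(dtype=origin_dtype)` now sits inside the fp8 `else` branch. That branch follows a common downcast/compute/upcast pattern. Below is a standalone sketch of the pattern with plain torch casts standing in for the aiter kernel; the fp8 dtype choice is an assumption, since the library defines its own `DTYPE_FP8`:

```python
import torch

DTYPE_FP8 = torch.float8_e4m3fn  # assumed fp8 variant

def fp8_attention_roundtrip(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    origin_dtype = q.dtype
    q, k, v = (t.to(dtype=DTYPE_FP8) for t in (q, k, v))  # lossy downcast for the fast kernel
    out = q.to(torch.float32)  # stand-in for aiter_flash_attn_fp8(q, k, v, ...)
    return out.to(dtype=origin_dtype)  # restore the caller-visible dtype
```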
diffsynth_engine/models/qwen_image/qwen_image_dit.py

@@ -2,6 +2,7 @@ import torch
 import torch.nn as nn
 from typing import Any, Dict, List, Tuple, Union, Optional
 from einops import rearrange
+from math import prod
 
 from diffsynth_engine.models.base import StateDictConverter, PreTrainedModel
 from diffsynth_engine.models.basic import attention as attention_ops
@@ -243,6 +244,7 @@ class QwenImageTransformerBlock(nn.Module):
        num_attention_heads: int,
        attention_head_dim: int,
        eps: float = 1e-6,
+       zero_cond_t: bool = False,
        device: str = "cuda:0",
        dtype: torch.dtype = torch.bfloat16,
    ):
@@ -275,10 +277,30 @@ class QwenImageTransformerBlock(nn.Module):
        self.txt_norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps, device=device, dtype=dtype)
        self.txt_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps, device=device, dtype=dtype)
        self.txt_mlp = QwenFeedForward(dim=dim, dim_out=dim, device=device, dtype=dtype)
+       self.zero_cond_t = zero_cond_t
 
-   def _modulate(self, x, mod_params):
+   def _modulate(self, x, mod_params, index=None):
        shift, scale, gate = mod_params.chunk(3, dim=-1)
-       return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1)
+       if index is not None:
+           actual_batch = shift.size(0) // 2
+           shift_0, shift_1 = shift[:actual_batch], shift[actual_batch:]
+           scale_0, scale_1 = scale[:actual_batch], scale[actual_batch:]
+           gate_0, gate_1 = gate[:actual_batch], gate[actual_batch:]
+           index_expanded = index.unsqueeze(-1)
+           shift_0_exp = shift_0.unsqueeze(1)
+           shift_1_exp = shift_1.unsqueeze(1)
+           scale_0_exp = scale_0.unsqueeze(1)
+           scale_1_exp = scale_1.unsqueeze(1)
+           gate_0_exp = gate_0.unsqueeze(1)
+           gate_1_exp = gate_1.unsqueeze(1)
+           shift_result = torch.where(index_expanded == 0, shift_0_exp, shift_1_exp)
+           scale_result = torch.where(index_expanded == 0, scale_0_exp, scale_1_exp)
+           gate_result = torch.where(index_expanded == 0, gate_0_exp, gate_1_exp)
+       else:
+           shift_result = shift.unsqueeze(1)
+           scale_result = scale.unsqueeze(1)
+           gate_result = gate.unsqueeze(1)
+       return x * (1 + scale_result) + shift_result, gate_result
 
    def forward(
        self,
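The indexed branch above selects, per image token, between two modulation parameter sets stacked along the batch dimension: the first half comes from the real timestep, the second half from a zeroed timestep (see the `timestep = torch.cat([timestep, timestep * 0], dim=0)` hunk later in this diff). A toy-shape demo of that selection, not library code:

```python
import torch

B, L, D = 1, 5, 4
shift = torch.randn(2 * B, D)            # rows: [real-t params | zero-t params]
index = torch.tensor([[0, 0, 0, 1, 1]])  # per-token choice: 0 = real t, 1 = zero t

shift_0, shift_1 = shift[:B], shift[B:]
picked = torch.where(
    index.unsqueeze(-1) == 0,  # [B, L, 1], broadcasts over the channel dim
    shift_0.unsqueeze(1),      # [B, 1, D]
    shift_1.unsqueeze(1),      # [B, 1, D]
)
assert picked.shape == (B, L, D)
assert torch.equal(picked[0, 0], shift_0[0]) and torch.equal(picked[0, 4], shift_1[0])
```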
@@ -288,12 +310,15 @@ class QwenImageTransformerBlock(nn.Module):
        rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        attn_mask: Optional[torch.Tensor] = None,
        attn_kwargs: Optional[Dict[str, Any]] = None,
+       modulate_index: Optional[List[int]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        img_mod_attn, img_mod_mlp = self.img_mod(temb).chunk(2, dim=-1)  # [B, 3*dim] each
+       if self.zero_cond_t:
+           temb = torch.chunk(temb, 2, dim=0)[0]
        txt_mod_attn, txt_mod_mlp = self.txt_mod(temb).chunk(2, dim=-1)  # [B, 3*dim] each
 
        img_normed = self.img_norm1(image)
-       img_modulated, img_gate = self._modulate(img_normed, img_mod_attn)
+       img_modulated, img_gate = self._modulate(img_normed, img_mod_attn, modulate_index)
 
        txt_normed = self.txt_norm1(text)
        txt_modulated, txt_gate = self._modulate(txt_normed, txt_mod_attn)
@@ -305,12 +330,11 @@ class QwenImageTransformerBlock(nn.Module):
            attn_mask=attn_mask,
            attn_kwargs=attn_kwargs,
        )
-
        image = image + img_gate * img_attn_out
        text = text + txt_gate * txt_attn_out
 
        img_normed_2 = self.img_norm2(image)
-       img_modulated_2, img_gate_2 = self._modulate(img_normed_2, img_mod_mlp)
+       img_modulated_2, img_gate_2 = self._modulate(img_normed_2, img_mod_mlp, modulate_index)
 
        txt_normed_2 = self.txt_norm2(text)
        txt_modulated_2, txt_gate_2 = self._modulate(txt_normed_2, txt_mod_mlp)
@@ -331,6 +355,7 @@ class QwenImageDiT(PreTrainedModel):
    def __init__(
        self,
        num_layers: int = 60,
+       zero_cond_t: bool = False,
        device: str = "cuda:0",
        dtype: torch.dtype = torch.bfloat16,
    ):
@@ -351,6 +376,7 @@ class QwenImageDiT(PreTrainedModel):
                dim=3072,
                num_attention_heads=24,
                attention_head_dim=128,
+               zero_cond_t=zero_cond_t,
                device=device,
                dtype=dtype,
            )
@@ -359,6 +385,7 @@ class QwenImageDiT(PreTrainedModel):
        )
        self.norm_out = AdaLayerNorm(3072, device=device, dtype=dtype)
        self.proj_out = nn.Linear(3072, 64, device=device, dtype=dtype)
+       self.zero_cond_t = zero_cond_t
 
    def patchify(self, hidden_states):
        hidden_states = rearrange(hidden_states, "B C (H P) (W Q) -> B (H W) (C P Q)", P=2, Q=2)
@@ -461,6 +488,9 @@ class QwenImageDiT(PreTrainedModel):
                use_cfg=use_cfg,
            ),
        ):
+           if self.zero_cond_t:
+               timestep = torch.cat([timestep, timestep * 0], dim=0)
+           modulate_index = None
            conditioning = self.time_text_embed(timestep, image.dtype)
            video_fhw = [(1, h // 2, w // 2)]  # frame, height, width
            text_seq_len = text_seq_lens.max().item()
@@ -478,7 +508,12 @@ class QwenImageDiT(PreTrainedModel):
            img = self.patchify(img)
            image = torch.cat([image, img], dim=1)
            video_fhw += [(1, edit_h // 2, edit_w // 2)]
-
+       if self.zero_cond_t:
+           modulate_index = torch.tensor(
+               [[0] * prod(sample[0]) + [1] * sum([prod(s) for s in sample[1:]]) for sample in [video_fhw]],
+               device=timestep.device,
+               dtype=torch.int,
+           )
        rotary_emb = self.pos_embed(video_fhw, text_seq_len, image.device)
 
        image = self.img_in(image)
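The comprehension above is easier to read with concrete numbers. With one noisy latent plus one edit-condition image, `modulate_index` marks each token of the concatenated sequence with the modulation half it should use. A worked example with made-up sizes:

```python
from math import prod

# (frame, H/2, W/2) per image: the noisy latent, then one edit-condition image
video_fhw = [(1, 32, 32), (1, 24, 24)]
index_row = [0] * prod(video_fhw[0]) + [1] * sum(prod(s) for s in video_fhw[1:])
assert len(index_row) == 32 * 32 + 24 * 24 == 1600
# Latent tokens (index 0) get the real timestep's modulation; edit-condition
# tokens (index 1) get the zero-timestep modulation, matching
# timestep = torch.cat([timestep, timestep * 0], dim=0) earlier in forward().
```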
@@ -510,7 +545,10 @@ class QwenImageDiT(PreTrainedModel):
                rotary_emb=rotary_emb,
                attn_mask=attn_mask,
                attn_kwargs=attn_kwargs,
+               modulate_index=modulate_index,
            )
+       if self.zero_cond_t:
+           conditioning = conditioning.chunk(2, dim=0)[0]
        image = self.norm_out(image, conditioning)
        image = self.proj_out(image)
        (image,) = sequence_parallel_unshard((image,), seq_dims=(1,), seq_lens=(image_seq_len,))
@@ -527,8 +565,9 @@ class QwenImageDiT(PreTrainedModel):
        device: str,
        dtype: torch.dtype,
        num_layers: int = 60,
+       use_zero_cond_t: bool = False,
    ):
-       model = cls(device="meta", dtype=dtype, num_layers=num_layers)
+       model = cls(device="meta", dtype=dtype, num_layers=num_layers, zero_cond_t=use_zero_cond_t)
        model = model.requires_grad_(False)
        model.load_state_dict(state_dict, assign=True)
        model.to(device=device, dtype=dtype, non_blocking=True)
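`from_state_dict` builds the model on the meta device and then adopts the loaded tensors with `assign=True`, which avoids allocating uninitialized weights only to overwrite them. A self-contained sketch of that pattern, using a context manager where the library passes `device="meta"` explicitly:

```python
import torch
import torch.nn as nn

with torch.device("meta"):
    layer = nn.Linear(3072, 64)  # structure only; no parameter storage allocated

state_dict = {"weight": torch.zeros(64, 3072), "bias": torch.zeros(64)}
layer.load_state_dict(state_dict, assign=True)  # adopt tensors instead of copying
assert layer.weight.device.type == "cpu"  # parameters now live where the loaded tensors do
```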
diffsynth_engine/pipelines/qwen_image.py

@@ -2,7 +2,6 @@ import json
 import torch
 import torch.distributed as dist
 import math
-import sys
 from typing import Callable, List, Dict, Tuple, Optional, Union
 from tqdm import tqdm
 from einops import rearrange
@@ -45,7 +44,6 @@ from diffsynth_engine.utils.flag import NUNCHAKU_AVAILABLE
 logger = logging.get_logger(__name__)
 
 
-
 class QwenImageLoRAConverter(LoRAStateDictConverter):
    def _from_diffsynth(self, lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, torch.Tensor]]:
        dit_dict = {}
@@ -205,7 +203,7 @@ class QwenImagePipeline(BasePipeline):
            else:
                config.use_nunchaku_attn = False
                logger.info("Disable nunchaku attention quantization.")
-
+
        else:
            config.use_nunchaku = False
 
@@ -318,6 +316,7 @@ class QwenImagePipeline(BasePipeline):
        elif config.use_nunchaku:
            if not NUNCHAKU_AVAILABLE:
                from diffsynth_engine.utils.flag import NUNCHAKU_IMPORT_ERROR
+
                raise ImportError(NUNCHAKU_IMPORT_ERROR)
 
            from diffsynth_engine.models.qwen_image import QwenImageDiTNunchaku
@@ -337,6 +336,7 @@ class QwenImagePipeline(BasePipeline):
            state_dicts.model,
            device=("cpu" if config.use_fsdp else init_device),
            dtype=config.model_dtype,
+           use_zero_cond_t=config.use_zero_cond_t,
        )
        if config.use_fp8_linear and not config.use_nunchaku:
            enable_fp8_linear(dit)
@@ -704,7 +704,7 @@ class QwenImagePipeline(BasePipeline):
 
        context_latents = None
        for param in controlnet_params:
-           self.load_lora(param.model, param.scale, fused=False, save_original_weight=False)
+           self.load_lora(param.model, param.scale, fused=True, save_original_weight=False)
            if param.control_type == QwenImageControlType.in_context:
                width, height = param.image.size
                self.validate_image_size(height, width, minimum=64, multiple_of=16)
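The only behavioral change in this file is `fused=False` -> `fused=True` when loading ControlNet LoRA weights. Fusing merges the low-rank factors into the base weight once instead of applying them on every forward pass; the two are mathematically equivalent, as this sketch shows (a conceptual demo, not this library's `load_lora` internals):

```python
import torch

torch.manual_seed(0)
W = torch.randn(8, 8)                        # base weight
A, B = torch.randn(4, 8), torch.randn(8, 4)  # LoRA down/up factors, rank 4
x, scale = torch.randn(8), 0.8

unfused = W @ x + scale * (B @ (A @ x))  # factors applied at runtime
W_fused = W + scale * (B @ A)            # factors merged into the weight once
assert torch.allclose(unfused, W_fused @ x, atol=1e-5)
```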
diffsynth_engine-0.6.1.dev33.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.6.1.dev32
+Version: 0.6.1.dev33
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent
diffsynth_engine-0.6.1.dev33.dist-info/RECORD

@@ -81,12 +81,12 @@ diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json,sha256=bhl7TT29cdoU
 diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json,sha256=7Zo6iw-qcacKMoR-BDX-A25uES1N9O23u0ipIeNE3AU,61728
 diffsynth_engine/configs/__init__.py,sha256=vSjJToEdq3JX7t81_z4nwNwIdD4bYnFjxnMZH7PXMKo,1309
 diffsynth_engine/configs/controlnet.py,sha256=f3vclyP3lcAjxDGD9C1vevhqqQ7W2LL_c6Wye0uxk3Q,1180
-diffsynth_engine/configs/pipeline.py,sha256=7duSdoD0LIROtepsLW9PxYsK59p7qSv34BVz0k29vu4,13633
+diffsynth_engine/configs/pipeline.py,sha256=SLaxFd9mKuJgromrkXpJrsNGAGzMl51Twomc4Qo83Wc,13759
 diffsynth_engine/kernels/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/models/__init__.py,sha256=8Ze7cSE8InetgXWTNb0neVA2Q44K7WlE-h7O-02m2sY,119
 diffsynth_engine/models/base.py,sha256=svao__9WH8VNcyXz5o5dzywYXDcGV0YV9IfkLzDKews,2558
 diffsynth_engine/models/basic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-diffsynth_engine/models/basic/attention.py,sha256=mvgk8LTqFwgtPdBeRv797IZNg9k7--X9wD92Hcr188c,15682
+diffsynth_engine/models/basic/attention.py,sha256=62Ar8_ydnn28F1qH9ueXtvISgNszQK3q8k14gCIXGEs,15681
 diffsynth_engine/models/basic/lora.py,sha256=Y6cBgrBsuDAP9FZz_fgK8vBi_EMg23saFIUSAsPIG-M,10670
 diffsynth_engine/models/basic/lora_nunchaku.py,sha256=7qhzGCzUIfDrwtWG0nspwdyZ7YUkaM4vMqzxZby2Zds,7510
 diffsynth_engine/models/basic/relative_position_emb.py,sha256=rCXOweZMcayVnNUVvBcYXMdhHS257B_PC8PZSWxvhNQ,2540
@@ -111,7 +111,7 @@ diffsynth_engine/models/hunyuan3d/surface_extractor.py,sha256=b15mb1N4PYwAvDk1Gu
 diffsynth_engine/models/hunyuan3d/volume_decoder.py,sha256=sgflj1a8sIerqGSalBAVQOlyiIihkLOLXYysNbulCoQ,2355
 diffsynth_engine/models/qwen_image/__init__.py,sha256=_6f0LWaoLdDvD2CsjK2OzEIQryt9efge8DFS4_GUnHQ,582
 diffsynth_engine/models/qwen_image/qwen2_5_vl.py,sha256=Eu-r-c42t_q74Qpwz21ToCGHpvSi7VND4B1EI0e-ePA,57748
-diffsynth_engine/models/qwen_image/qwen_image_dit.py,sha256=iJ-FinDyXa982Uao1is37bxUttyPu0Eldyd7qPJO_XQ,22582
+diffsynth_engine/models/qwen_image/qwen_image_dit.py,sha256=JEyK_yOa0A5xaqlmxI3nfD7NdCaHuvLDA10aWVbnac4,24635
 diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py,sha256=LIv9X_BohKk5rcEzyl3ATLwd8MSoFX43wjkArQ68nq8,4828
 diffsynth_engine/models/qwen_image/qwen_image_dit_nunchaku.py,sha256=1y1BkPRrX4_RioKjM09D9f9PK9neug1nSGJka0D9bvM,13516
 diffsynth_engine/models/qwen_image/qwen_image_vae.py,sha256=eO7f4YqiYXfw7NncBNFTu-xEvdJ5uKY-SnfP15QY0tE,38443
@@ -146,7 +146,7 @@ diffsynth_engine/pipelines/__init__.py,sha256=jh-4LSJ0vqlXiT8BgFgRIQxuAr2atEPyHr
 diffsynth_engine/pipelines/base.py,sha256=ShRiX5MY6bUkRKfuGrA1aalAqeHyeZxhzT87Mwc30b4,17231
 diffsynth_engine/pipelines/flux_image.py,sha256=L0ggxpthLD8a5-zdPHu9z668uWBei9YzPb4PFVypDNU,50707
 diffsynth_engine/pipelines/hunyuan3d_shape.py,sha256=TNV0Wr09Dj2bzzlpua9WioCClOj3YiLfE6utI9aWL8A,8164
-diffsynth_engine/pipelines/qwen_image.py,sha256=ktOirdU2ljgb6vHhXosC0tWgXI3gwvsoAtrYKYvMwzI,35719
+diffsynth_engine/pipelines/qwen_image.py,sha256=lrqwF3fikgQouifb-8KwWCxQhNVZard_7buoJqxHD7s,35759
 diffsynth_engine/pipelines/sd_image.py,sha256=nr-Nhsnomq8CsUqhTM3i2l2zG01YjwXdfRXgr_bC3F0,17891
 diffsynth_engine/pipelines/sdxl_image.py,sha256=v7ZACGPb6EcBunL6e5E9jynSQjE7GQx8etEV-ZLP91g,21704
 diffsynth_engine/pipelines/utils.py,sha256=HZbJHErNJS1DhlwJKvZ9dY7Kh8Zdlsw3zE2e88TYGRY,2277
@@ -190,8 +190,8 @@ diffsynth_engine/utils/video.py,sha256=8FCaeqIdUsWMgWI_6SO9SPynsToGcLCQAVYFTc4CD
 diffsynth_engine/utils/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/utils/memory/linear_regression.py,sha256=oW_EQEw13oPoyUrxiL8A7Ksa5AuJ2ynI2qhCbfAuZbg,3930
 diffsynth_engine/utils/memory/memory_predcit_model.py,sha256=EXprSl_zlVjgfMWNXP-iw83Ot3hyMcgYaRPv-dvyL84,3943
-diffsynth_engine-0.6.1.dev32.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
-diffsynth_engine-0.6.1.dev32.dist-info/METADATA,sha256=ZEH2_1Zmgmk30J31qY1S0Ul9dD4rchav5AS3UclyCVg,1164
-diffsynth_engine-0.6.1.dev32.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-diffsynth_engine-0.6.1.dev32.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
-diffsynth_engine-0.6.1.dev32.dist-info/RECORD,,
+diffsynth_engine-0.6.1.dev33.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
+diffsynth_engine-0.6.1.dev33.dist-info/METADATA,sha256=pgyNkuwU3lMQA66waiIU3BVtw-7zN3s8pEvinWC_LpI,1164
+diffsynth_engine-0.6.1.dev33.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+diffsynth_engine-0.6.1.dev33.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
+diffsynth_engine-0.6.1.dev33.dist-info/RECORD,,