diffsynth-engine 0.6.1.dev33__py3-none-any.whl → 0.6.1.dev35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffsynth_engine/__init__.py +4 -0
- diffsynth_engine/conf/models/z_image/qwen3_config.json +30 -0
- diffsynth_engine/conf/tokenizers/z_image/tokenizer/merges.txt +151388 -0
- diffsynth_engine/conf/tokenizers/z_image/tokenizer/tokenizer.json +757480 -0
- diffsynth_engine/conf/tokenizers/z_image/tokenizer/tokenizer_config.json +239 -0
- diffsynth_engine/conf/tokenizers/z_image/tokenizer/vocab.json +1 -0
- diffsynth_engine/configs/__init__.py +4 -0
- diffsynth_engine/configs/pipeline.py +44 -1
- diffsynth_engine/models/basic/attention.py +2 -2
- diffsynth_engine/models/qwen_image/qwen_image_dit.py +5 -5
- diffsynth_engine/models/qwen_image/qwen_image_vae.py +0 -1
- diffsynth_engine/models/z_image/__init__.py +11 -0
- diffsynth_engine/models/z_image/qwen3.py +124 -0
- diffsynth_engine/models/z_image/z_image_dit.py +602 -0
- diffsynth_engine/pipelines/__init__.py +2 -0
- diffsynth_engine/pipelines/qwen_image.py +4 -3
- diffsynth_engine/pipelines/z_image.py +377 -0
- diffsynth_engine/utils/constants.py +3 -0
- diffsynth_engine/utils/process_group.py +1 -1
- {diffsynth_engine-0.6.1.dev33.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/METADATA +1 -1
- {diffsynth_engine-0.6.1.dev33.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/RECORD +24 -15
- {diffsynth_engine-0.6.1.dev33.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/WHEEL +0 -0
- {diffsynth_engine-0.6.1.dev33.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/licenses/LICENSE +0 -0
- {diffsynth_engine-0.6.1.dev33.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/top_level.txt +0 -0
diffsynth_engine/configs/__init__.py

```diff
@@ -10,6 +10,7 @@ from .pipeline import (
     WanSpeech2VideoPipelineConfig,
     QwenImagePipelineConfig,
     HunyuanPipelineConfig,
+    ZImagePipelineConfig,
     BaseStateDicts,
     SDStateDicts,
     SDXLStateDicts,
@@ -17,6 +18,7 @@ from .pipeline import (
     WanStateDicts,
     WanS2VStateDicts,
     QwenImageStateDicts,
+    ZImageStateDicts,
     AttnImpl,
     SpargeAttentionParams,
     VideoSparseAttentionParams,
@@ -41,6 +43,7 @@ __all__ = [
     "WanSpeech2VideoPipelineConfig",
     "QwenImagePipelineConfig",
     "HunyuanPipelineConfig",
+    "ZImagePipelineConfig",
     "BaseStateDicts",
     "SDStateDicts",
     "SDXLStateDicts",
@@ -48,6 +51,7 @@ __all__ = [
     "WanStateDicts",
     "WanS2VStateDicts",
     "QwenImageStateDicts",
+    "ZImageStateDicts",
     "AttnImpl",
     "SpargeAttentionParams",
     "VideoSparseAttentionParams",
```
diffsynth_engine/configs/pipeline.py

```diff
@@ -298,6 +298,42 @@ class HunyuanPipelineConfig(BaseConfig):
     image_encoder_dtype: torch.dtype = torch.float16
 
 
+@dataclass
+class ZImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, BaseConfig):
+    model_path: str | os.PathLike | List[str | os.PathLike]
+    model_dtype: torch.dtype = torch.float16
+    vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    vae_dtype: torch.dtype = torch.float16
+    encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    encoder_dtype: torch.dtype = torch.float16
+
+    @classmethod
+    def basic_config(
+        cls,
+        model_path: str | os.PathLike | List[str | os.PathLike],
+        encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None,
+        vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None,
+        device: str = "cuda",
+        parallelism: int = 1,
+        offload_mode: Optional[str] = None,
+        offload_to_disk: bool = False,
+    ) -> "ZImagePipelineConfig":
+        return cls(
+            model_path=model_path,
+            device=device,
+            encoder_path=encoder_path,
+            vae_path=vae_path,
+            parallelism=parallelism,
+            use_cfg_parallel=True if parallelism > 1 else False,
+            use_fsdp=True if parallelism > 1 else False,
+            offload_mode=offload_mode,
+            offload_to_disk=offload_to_disk,
+        )
+
+    def __post_init__(self):
+        init_parallel_config(self)
+
+
 @dataclass
 class BaseStateDicts:
     pass
@@ -349,7 +385,14 @@ class QwenImageStateDicts:
     vae: Dict[str, torch.Tensor]
 
 
-def init_parallel_config(config: FluxPipelineConfig | QwenImagePipelineConfig | WanPipelineConfig):
+@dataclass
+class ZImageStateDicts:
+    model: Dict[str, torch.Tensor]
+    encoder: Dict[str, torch.Tensor]
+    vae: Dict[str, torch.Tensor]
+
+
+def init_parallel_config(config: FluxPipelineConfig | QwenImagePipelineConfig | WanPipelineConfig | ZImagePipelineConfig):
     assert config.parallelism in (1, 2, 4, 8), "parallelism must be 1, 2, 4 or 8"
     config.batch_cfg = True if config.parallelism > 1 and config.use_cfg_parallel else config.batch_cfg
 
```
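For reference, the new `ZImagePipelineConfig.basic_config` helper mirrors the other pipeline configs: it derives `use_cfg_parallel` and `use_fsdp` from `parallelism`, and `__post_init__` runs `init_parallel_config`, which asserts that `parallelism` is 1, 2, 4, or 8. A minimal usage sketch based on the fields above; the checkpoint paths are placeholders, not files shipped with the wheel:

```python
from diffsynth_engine.configs.pipeline import ZImagePipelineConfig

# Placeholder paths for illustration only; point these at real Z-Image checkpoints.
config = ZImagePipelineConfig.basic_config(
    model_path="checkpoints/z_image_dit.safetensors",
    encoder_path="checkpoints/z_image_qwen3_encoder.safetensors",
    vae_path="checkpoints/z_image_vae.safetensors",
    device="cuda",
    parallelism=2,      # values other than 1, 2, 4, 8 fail the assert in init_parallel_config
    offload_mode=None,
)
print(config.use_cfg_parallel, config.use_fsdp)  # True True, because parallelism > 1
```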
diffsynth_engine/models/basic/attention.py

```diff
@@ -343,7 +343,7 @@ def long_context_attention(
             f"head_dim={q.shape[-1]}, but aiter_flash_attn only supports head dimension at most {FA3_MAX_HEADDIM}, will use fallback attention implementation"
         )
     if SDPA_AVAILABLE:
-        return LongContextAttention(attn_type=AttnType.
+        return LongContextAttention(attn_type=AttnType.TORCH_EFFICIENT)(q, k, v, softmax_scale=scale)
     if FLASH_ATTN_2_AVAILABLE:
         return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
     raise ValueError("No available long context attention implementation")
@@ -379,7 +379,7 @@ def long_context_attention(
     if attn_impl == "fa2":
         return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
     if attn_impl == "sdpa":
-        return LongContextAttention(attn_type=AttnType.
+        return LongContextAttention(attn_type=AttnType.TORCH_EFFICIENT)(q, k, v, softmax_scale=scale)
     if attn_impl == "sage":
         return LongContextAttention(attn_type=AttnType.SAGE_AUTO)(q, k, v, softmax_scale=scale)
     if attn_impl == "sparge":
```
diffsynth_engine/models/qwen_image/qwen_image_dit.py

```diff
@@ -286,16 +286,15 @@ class QwenImageTransformerBlock(nn.Module):
             shift_0, shift_1 = shift[:actual_batch], shift[actual_batch:]
             scale_0, scale_1 = scale[:actual_batch], scale[actual_batch:]
             gate_0, gate_1 = gate[:actual_batch], gate[actual_batch:]
-            index_expanded = index.unsqueeze(-1)
             shift_0_exp = shift_0.unsqueeze(1)
             shift_1_exp = shift_1.unsqueeze(1)
             scale_0_exp = scale_0.unsqueeze(1)
             scale_1_exp = scale_1.unsqueeze(1)
             gate_0_exp = gate_0.unsqueeze(1)
             gate_1_exp = gate_1.unsqueeze(1)
-            shift_result = torch.where(
-            scale_result = torch.where(
-            gate_result = torch.where(
+            shift_result = torch.where(index == 0, shift_0_exp, shift_1_exp)
+            scale_result = torch.where(index == 0, scale_0_exp, scale_1_exp)
+            gate_result = torch.where(index == 0, gate_0_exp, gate_1_exp)
         else:
             shift_result = shift.unsqueeze(1)
             scale_result = scale.unsqueeze(1)
@@ -514,6 +513,7 @@ class QwenImageDiT(PreTrainedModel):
             device=timestep.device,
             dtype=torch.int,
         )
+        modulate_index = modulate_index.unsqueeze(-1)
         rotary_emb = self.pos_embed(video_fhw, text_seq_len, image.device)
 
         image = self.img_in(image)
@@ -535,7 +535,7 @@ class QwenImageDiT(PreTrainedModel):
 
         # warning: Eligen does not work with sequence parallel because long context attention does not support attention masks
         img_freqs, txt_freqs = rotary_emb
-        with sequence_parallel((image, text, img_freqs, txt_freqs), seq_dims=(1, 1, 0, 0)):
+        with sequence_parallel((image, text, img_freqs, txt_freqs, modulate_index), seq_dims=(1, 1, 0, 0, 1)):
             rotary_emb = (img_freqs, txt_freqs)
             for block in self.transformer_blocks:
                 text, image = block(
```
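The block-level change above drops the per-block `index_expanded = index.unsqueeze(-1)`: `QwenImageDiT.forward` now unsqueezes `modulate_index` once and passes it through `sequence_parallel` with seq_dim 1, so each rank receives the slice of the index that matches its shard of the sequence. A small shape sketch of the resulting `torch.where` broadcast; the sizes are illustrative only, not the model's real dimensions:

```python
import torch

batch, seq_len, dim = 2, 6, 4
modulate_index = torch.tensor([[0, 0, 0, 1, 1, 1]] * batch)       # (batch, seq_len)
index = modulate_index.unsqueeze(-1)                               # (batch, seq_len, 1), done once in forward()
shift_0_exp = torch.zeros(batch, 1, dim)                           # modulation params for the first branch
shift_1_exp = torch.ones(batch, 1, dim)                            # modulation params for the second branch
shift_result = torch.where(index == 0, shift_0_exp, shift_1_exp)   # broadcasts to (batch, seq_len, dim)
print(shift_result.shape)                                          # torch.Size([2, 6, 4])
```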
diffsynth_engine/models/z_image/qwen3.py (new file)

```diff
@@ -0,0 +1,124 @@
+# modified from transformers.models.qwen3.modeling_qwen3
+import torch
+import torch.nn as nn
+from typing import Dict, Tuple, Optional
+
+from diffsynth_engine.models.base import StateDictConverter, PreTrainedModel
+from diffsynth_engine.utils.cache import Cache, DynamicCache
+from diffsynth_engine.utils import logging
+
+from transformers.models.qwen3.modeling_qwen3 import Qwen3DecoderLayer, Qwen3RMSNorm, Qwen3RotaryEmbedding
+from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
+from transformers.masking_utils import create_causal_mask
+
+logger = logging.get_logger(__name__)
+
+
+class Qwen3ModelStateDictConverter(StateDictConverter):
+    def __init__(self):
+        super().__init__()
+
+    def _from_diffusers(self, state_dict):
+        new_state_dict = {}
+        for key, param in state_dict.items():
+            if key.startswith("model."):
+                key = key[len("model.") :]
+            new_state_dict[key] = param
+        return new_state_dict
+
+    def convert(self, state_dict):
+        return self._from_diffusers(state_dict)
+
+
+class Qwen3Model(PreTrainedModel):
+    converter = Qwen3ModelStateDictConverter()
+
+    def __init__(self, config: Qwen3Config, device: str = "cuda:0", dtype: torch.dtype = torch.bfloat16):
+        super().__init__()
+        # for causal_mask
+        config._attn_implementation = "sdpa"
+        self.config = config
+
+        self.embed_tokens = nn.Embedding(
+            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id, device=device, dtype=dtype
+        )
+        self.layers = nn.ModuleList(
+            [
+                Qwen3DecoderLayer(layer_idx=layer_idx, config=config).to(device=device, dtype=dtype)
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = Qwen3RMSNorm(config.hidden_size, config.rms_norm_eps).to(device=device, dtype=dtype)
+        self.rotary_emb = Qwen3RotaryEmbedding(config=config)
+
+    @classmethod
+    def from_state_dict(
+        cls,
+        state_dict: Dict[str, torch.Tensor],
+        config: Qwen3Config,
+        device: str = "cuda:0",
+        dtype: torch.dtype = torch.bfloat16,
+    ):
+        model = cls(config=config, device="meta", dtype=dtype)
+        model.requires_grad_(False)
+        model.load_state_dict(state_dict, assign=True)
+        model.to(device=device, dtype=dtype, non_blocking=True)
+        return model
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[Cache]]:
+        all_hidden_states = []
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache()
+
+        if cache_position is None:
+            seq_len = inputs_embeds.size(1)
+            cache_position = torch.arange(seq_len, device=inputs_embeds.device)
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=None,
+            position_ids=position_ids,
+        )
+        hidden_states = inputs_embeds
+
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        all_hidden_states.append(hidden_states)
+        for decoder_layer in self.layers:
+            hidden_states = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                position_ids=position_ids,
+                attention_mask=causal_mask,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+            )
+            all_hidden_states.append(hidden_states)
+
+        hidden_states = self.norm(hidden_states)
+        return {
+            "last_hidden_state": hidden_states,
+            "past_key_values": past_key_values,
+            "hidden_states": all_hidden_states,
+        }
```
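The new `Qwen3Model` wraps the Hugging Face `Qwen3DecoderLayer` stack and returns every intermediate hidden state, which the Z-Image pipeline presumably consumes as text-encoder features. A toy instantiation sketch, assuming a transformers version that provides `transformers.masking_utils.create_causal_mask`; the configuration values below are illustrative, not the shipped `conf/models/z_image/qwen3_config.json`:

```python
import torch
from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
from diffsynth_engine.models.z_image.qwen3 import Qwen3Model

# Tiny toy config so the example runs on CPU; the real encoder is loaded from
# qwen3_config.json plus checkpoint weights via Qwen3Model.from_state_dict.
config = Qwen3Config(
    vocab_size=1000,
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=2,
    head_dim=16,
)
model = Qwen3Model(config, device="cpu", dtype=torch.float32)

input_ids = torch.randint(0, 1000, (1, 8))
out = model(input_ids=input_ids)
print(out["last_hidden_state"].shape)  # torch.Size([1, 8, 64])
print(len(out["hidden_states"]))       # embedding output plus one entry per decoder layer
```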