diffsynth-engine 0.4.3.dev9__py3-none-any.whl → 0.4.3.dev10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffsynth_engine/conf/models/qwen_image/qwen2_5_vl_config.json +2 -1
- diffsynth_engine/conf/tokenizers/qwen_image/qwen2_vl_image_processor.json +29 -0
- diffsynth_engine/models/basic/attention.py +3 -3
- diffsynth_engine/models/qwen_image/qwen2_5_vl.py +41 -57
- diffsynth_engine/models/qwen_image/qwen_image_dit.py +45 -28
- diffsynth_engine/pipelines/base.py +1 -1
- diffsynth_engine/pipelines/qwen_image.py +125 -13
- diffsynth_engine/pipelines/sd_image.py +3 -3
- diffsynth_engine/pipelines/sdxl_image.py +10 -6
- diffsynth_engine/tokenizers/__init__.py +4 -0
- diffsynth_engine/tokenizers/qwen2_vl_image_processor.py +157 -0
- diffsynth_engine/tokenizers/qwen2_vl_processor.py +100 -0
- diffsynth_engine/utils/constants.py +6 -0
- diffsynth_engine/utils/image.py +213 -0
- diffsynth_engine/utils/offload.py +6 -5
- {diffsynth_engine-0.4.3.dev9.dist-info → diffsynth_engine-0.4.3.dev10.dist-info}/METADATA +2 -2
- {diffsynth_engine-0.4.3.dev9.dist-info → diffsynth_engine-0.4.3.dev10.dist-info}/RECORD +20 -17
- {diffsynth_engine-0.4.3.dev9.dist-info → diffsynth_engine-0.4.3.dev10.dist-info}/WHEEL +0 -0
- {diffsynth_engine-0.4.3.dev9.dist-info → diffsynth_engine-0.4.3.dev10.dist-info}/licenses/LICENSE +0 -0
- {diffsynth_engine-0.4.3.dev9.dist-info → diffsynth_engine-0.4.3.dev10.dist-info}/top_level.txt +0 -0
diffsynth_engine/conf/tokenizers/qwen_image/qwen2_vl_image_processor.json
@@ -0,0 +1,29 @@
+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2VLImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "patch_size": 14,
+  "processor_class": "Qwen2_5_VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 2
+}
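Note: this config mirrors the reference Qwen2-VL preprocessing: min_pixels = 3136 (56x56) and max_pixels = 12845056 bound the resized area, and both edges are snapped to multiples of patch_size * merge_size = 28. A minimal sketch of that resize rule (function name and rounding details are illustrative, not necessarily the package's exact helper):

    import math

    def smart_resize(height, width, factor=28, min_pixels=3136, max_pixels=12845056):
        # Snap both edges to a multiple of patch_size * merge_size (14 * 2 = 28),
        # then rescale so the total pixel count stays inside [min_pixels, max_pixels].
        h = round(height / factor) * factor
        w = round(width / factor) * factor
        if h * w > max_pixels:
            beta = math.sqrt((height * width) / max_pixels)
            h = math.floor(height / beta / factor) * factor
            w = math.floor(width / beta / factor) * factor
        elif h * w < min_pixels:
            beta = math.sqrt(min_pixels / (height * width))
            h = math.ceil(height * beta / factor) * factor
            w = math.ceil(width * beta / factor) * factor
        return h, w

    smart_resize(1080, 1920)  # -> (1092, 1932), area within [3136, 12845056]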
diffsynth_engine/models/basic/attention.py
@@ -1,9 +1,9 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from einops import rearrange, repeat
 from typing import Optional
 
-import torch.nn.functional as F
 from diffsynth_engine.utils import logging
 from diffsynth_engine.utils.flag import (
     FLASH_ATTN_3_AVAILABLE,
@@ -42,11 +42,11 @@ if XFORMERS_AVAILABLE:
 
 if SDPA_AVAILABLE:
 
-    def sdpa_attn(q, k, v, attn_mask=None, scale=None):
+    def sdpa_attn(q, k, v, attn_mask=None, is_causal=False, scale=None):
         q = q.transpose(1, 2)
         k = k.transpose(1, 2)
         v = v.transpose(1, 2)
-        out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, scale=scale)
+        out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, is_causal=is_causal, scale=scale)
         return out.transpose(1, 2)
 
 
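Note: F.scaled_dot_product_attention builds the causal mask internally when is_causal=True, so callers no longer need to materialize an explicit attn_mask. A small usage sketch of the updated wrapper (inputs follow the (batch, seq, heads, head_dim) convention the transposes imply):

    import torch

    q = k = v = torch.randn(1, 16, 8, 64)      # (batch, seq, heads, head_dim)
    out = sdpa_attn(q, k, v, is_causal=True)   # causal mask applied inside SDPA
    assert out.shape == (1, 16, 8, 64)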
diffsynth_engine/models/qwen_image/qwen2_5_vl.py
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Tuple, Optional
 
 from diffsynth_engine.models.base import PreTrainedModel
 from diffsynth_engine.models.basic.transformer_helper import RMSNorm
-from diffsynth_engine.models.basic.attention import attention
+from diffsynth_engine.models.basic import attention as attention_ops
 from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.cache import Cache, DynamicCache
 from diffsynth_engine.utils import logging
@@ -152,17 +152,15 @@ class Qwen2_5_VisionRotaryEmbedding(nn.Module):
         self,
         dim: int = 80,
         theta: float = 10000.0,
-        device: str = "cuda:0",
-        dtype: torch.dtype = torch.bfloat16,
     ):
         super().__init__()
-        with torch.device(device):
-            inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
-            self.register_buffer("inv_freq", inv_freq, persistent=False)
+        with torch.device("cpu"):
+            self.inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
 
-    def forward(self, seqlen: int) -> torch.Tensor:
-        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-        freqs = torch.outer(seq, self.inv_freq)
+    def forward(self, seqlen: int, device: str) -> torch.Tensor:
+        inv_freq = self.inv_freq.to(device=device)
+        seq = torch.arange(seqlen, device=inv_freq.device, dtype=inv_freq.dtype)
+        freqs = torch.outer(seq, inv_freq)
         return freqs
 
 
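Note: after this refactor inv_freq lives on CPU as a plain attribute rather than a registered buffer, so meta-device initialization and offloading never have to materialize it; each forward copies it to the caller's device on demand. An illustrative call (shapes follow from dim=80, i.e. dim // 2 = 40 frequencies):

    emb = Qwen2_5_VisionRotaryEmbedding(dim=80)
    freqs = emb(seqlen=32, device="cuda:0")  # inv_freq moved to CUDA on demand
    assert freqs.shape == (32, 40)           # one row per position, dim // 2 frequencies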
@@ -222,7 +220,7 @@ class Qwen2_5_VisionAttention(nn.Module):
         q = rearrange(q, "s n d -> 1 s n d")
         k = rearrange(k, "s n d -> 1 s n d")
         v = rearrange(v, "s n d -> 1 s n d")
-        out = attention(q, k, v, attn_impl=self.attn_impl, attn_mask=attention_mask)
+        out = attention_ops.attention(q, k, v, attn_impl=self.attn_impl, attn_mask=attention_mask)
         out = rearrange(out, "1 s n d -> s (n d)")
         out = self.proj(out)
         return out
@@ -301,7 +299,7 @@ class Qwen2_5_VisionTransformer(nn.Module):
             dtype=dtype,
         )
         head_dim = config.hidden_size // config.num_heads
-        self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2, device=device, dtype=dtype)
+        self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)
         self.blocks = nn.ModuleList(
             [
                 Qwen2_5_VisionBlock(
@@ -348,7 +346,7 @@ class Qwen2_5_VisionTransformer(nn.Module):
             pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
         pos_ids = torch.cat(pos_ids, dim=0)
         max_grid_size = grid_thw[:, 1:].max()
-        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size, device=grid_thw.device)
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         return rotary_pos_emb
 
@@ -488,7 +486,6 @@ class Qwen2_5_Attention(nn.Module):
         hidden_size: int = 3584,
         num_attention_heads: int = 28,
         num_key_value_heads: int = 4,
-        # dropout: float = 0.0,
         mrope_section: List[int] = [16, 24, 24],
         attn_impl: Optional[str] = None,
         device: str = "cuda:0",
@@ -501,7 +498,6 @@ class Qwen2_5_Attention(nn.Module):
         self.head_dim = hidden_size // num_attention_heads
         self.num_key_value_heads = num_key_value_heads
         self.num_key_value_groups = num_attention_heads // num_key_value_heads
-        # self.dropout = dropout
         self.mrope_section = mrope_section
         self.attn_impl = attn_impl
 
@@ -521,8 +517,6 @@ class Qwen2_5_Attention(nn.Module):
             self.num_attention_heads * self.head_dim, self.hidden_size, bias=False, device=device, dtype=dtype
         )
 
-        self.rotary_emb = Qwen2_5_VLRotaryEmbedding(dim=self.head_dim, device=device, dtype=dtype)
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -556,14 +550,18 @@
         if attention_mask is not None:  # no matter the length, we just slice it
             causal_mask = attention_mask[:, :, :, : key_states.shape[1]]
 
-        # TODO:
-        out = attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_impl=self.attn_impl,
-            attn_mask=causal_mask,
-        )
+        # TODO: use is_causal when attention mask is causal
+        if self.attn_impl == "sdpa":
+            out = attention_ops.sdpa_attn(query_states, key_states, value_states, is_causal=True)
+        else:
+            # TODO: attention_mask for flash attention 2
+            out = attention_ops.attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_impl=self.attn_impl,
+                attn_mask=causal_mask,
+            )
         out = rearrange(out, "b s n d -> b s (n d)")
         out = self.o_proj(out)
         return out, past_key_values
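Note: for a standard lower-triangular mask, is_causal=True is equivalent to passing the mask explicitly, just cheaper because nothing is materialized. A quick check of that assumption with plain PyTorch:

    import torch
    import torch.nn.functional as F

    q = k = v = torch.randn(1, 8, 16, 64)               # (batch, heads, seq, head_dim)
    mask = torch.ones(16, 16, dtype=torch.bool).tril()  # explicit causal mask (True = attend)
    a = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
    b = F.scaled_dot_product_attention(q, k, v, is_causal=True)
    assert torch.allclose(a, b, atol=1e-5)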
@@ -647,29 +645,29 @@ class Qwen2_5_VLDecoderLayer(nn.Module):
 
 
 class Qwen2_5_VLRotaryEmbedding(nn.Module):
-    def __init__(self, dim: int = 128, device: str = "cuda:0", dtype: torch.dtype = torch.bfloat16):
+    def __init__(self, dim: int = 128):
         super().__init__()
-        with torch.device(device):
-            inv_freq = self.compute_rope(dim)  # default rope without dynamic frequency
-            self.register_buffer("inv_freq", inv_freq, persistent=False)
+        with torch.device("cpu"):
+            self.inv_freq = self.compute_rope(dim)  # default rope without dynamic frequency
 
     def compute_rope(self, dim: int, theta: float = 1000000.0):
         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
         return inv_freq
 
     @torch.no_grad()
-    def forward(self, x: torch.Tensor, position_ids: torch.LongTensor):
+    def forward(self, position_ids: torch.LongTensor, device: str, dtype: torch.dtype):
         # In contrast to other models, Qwen2_5_VL has different position ids for the grids
         # So we expand the inv_freq to shape (3, ...)
-        inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
+        inv_freq = self.inv_freq.to(device=device)
+        inv_freq_expanded = inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
         position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)
 
-        freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
+        freqs = (inv_freq_expanded @ position_ids_expanded).transpose(2, 3)
         emb = torch.cat((freqs, freqs), dim=-1)
         cos = emb.cos()
         sin = emb.sin()
 
-        return cos.to(device=x.device, dtype=x.dtype), sin.to(device=x.device, dtype=x.dtype)
+        return cos.to(device=device, dtype=dtype), sin.to(device=device, dtype=dtype)
 
 
 class Qwen2_5_VLModel(nn.Module):
@@ -702,7 +700,7 @@ class Qwen2_5_VLModel(nn.Module):
         )
         self.norm = Qwen2_5_RMSNorm(config.hidden_size, config.rms_norm_eps, device=device, dtype=dtype)
         head_dim = config.hidden_size // config.num_attention_heads
-        self.rotary_emb = Qwen2_5_VLRotaryEmbedding(dim=head_dim, device=device, dtype=dtype)
+        self.rotary_emb = Qwen2_5_VLRotaryEmbedding(dim=head_dim)
 
     def get_input_embeddings(self):
         return self.embed_tokens
@@ -749,7 +747,7 @@ class Qwen2_5_VLModel(nn.Module):
         hidden_states = inputs_embeds
 
         # create position embeddings to be shared across the decoder layers
-        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        position_embeddings = self.rotary_emb(position_ids, device=hidden_states.device, dtype=hidden_states.dtype)
 
         # decoder layers
         for decoder_layer in self.layers:
@@ -940,8 +938,7 @@ class Qwen2_5_VLForConditionalGeneration(PreTrainedModel):
         with torch.device("meta"), no_init_weights():
             model = cls(vision_config=vision_config, config=config, device=device, dtype=dtype)
         model.load_state_dict(state_dict, assign=True)
-        for param in model.parameters():
-            param.data = param.data.to(device=device, dtype=dtype, non_blocking=True)
+        model.to(device=device, dtype=dtype, non_blocking=True)
         return model
 
     def get_input_embeddings(self):
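Note: the replaced per-parameter loop and the new model.to(...) realize the same meta-device loading idiom: load_state_dict(..., assign=True) swaps the meta tensors for the real ones, after which a single .to() handles device and dtype. A minimal sketch with a toy module (standard PyTorch, not the package's code):

    import torch
    import torch.nn as nn

    with torch.device("meta"):
        model = nn.Linear(1024, 1024)               # skeleton only, no memory allocated

    state_dict = {"weight": torch.randn(1024, 1024), "bias": torch.zeros(1024)}
    model.load_state_dict(state_dict, assign=True)  # attach real tensors in place of meta ones
    model.to(device="cpu", dtype=torch.bfloat16, non_blocking=True)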
@@ -1202,27 +1199,14 @@ class Qwen2_5_VLForConditionalGeneration(PreTrainedModel):
         if position_ids is None:
             assert attention_mask is None or attention_mask.ndim == 2, "attention mask must be 2D"
             # calculate RoPE index once per generation in the pre-fill stage only
-            if cache_position is None or (cache_position is not None and cache_position[0] == 0):
-                position_ids, rope_deltas = self.get_rope_index(
-                    input_ids,
-                    image_grid_thw,
-                    video_grid_thw,
-                    second_per_grid_ts,
-                    attention_mask,
-                )
-                self.rope_deltas = rope_deltas
-            # then use the prev pre-calculated rope-deltas to get the correct position ids
-            else:
-                batch_size, seq_length, _ = inputs_embeds.shape
-                delta = (
-                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device) if cache_position is not None else 0
-                )
-                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
-                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
-                if cache_position is not None:  # otherwise `deltas` is an int `0`
-                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
-                position_ids = position_ids.add(delta)
-                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
+            position_ids, rope_deltas = self.get_rope_index(
+                input_ids,
+                image_grid_thw,
+                video_grid_thw,
+                second_per_grid_ts,
+                attention_mask,
+            )
+            self.rope_deltas = rope_deltas
 
         hidden_states, present_key_values = self.model(
             input_ids=None,
diffsynth_engine/models/qwen_image/qwen_image_dit.py
@@ -81,41 +81,47 @@ class QwenEmbedRope(nn.Module):
 
     def forward(self, video_fhw, txt_length, device):
         """
-        Args:
-            video_fhw (Tuple[int, int, int]): The (frame, height, width) shape of the video/image
+        Args:
+            video_fhw (List[Tuple[int, int, int]]): A list of (frame, height, width) tuples for each video/image
+            txt_length (int): The maximum length of the text sequences
         """
         if self.pos_freqs.device != device:
             self.pos_freqs = self.pos_freqs.to(device)
             self.neg_freqs = self.neg_freqs.to(device)
 
-        frame, height, width = video_fhw
-        rope_key = f"{frame}_{height}_{width}"
-
-        if rope_key not in self.rope_cache:
-            seq_lens = frame * height * width
-            freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
-            freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
-            freqs_frame = freqs_pos[0][:frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
+        vid_freqs = []
+        max_vid_index = 0
+        for idx, fhw in enumerate(video_fhw):
+            frame, height, width = fhw
+            rope_key = f"{idx}_{height}_{width}"
+
+            if rope_key not in self.rope_cache:
+                seq_lens = frame * height * width
+                freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
+                freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
+                freqs_frame = freqs_pos[0][idx : idx + frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
+                if self.scale_rope:
+                    freqs_height = torch.cat(
+                        [freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0
+                    )
+                    freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
+                    freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
+                    freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
+
+                else:
+                    freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
+                    freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
+
+                freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
+                self.rope_cache[rope_key] = freqs.clone().contiguous()
+            vid_freqs.append(self.rope_cache[rope_key])
         if self.scale_rope:
-                freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0)
-                freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
-                freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
-                freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
-
+            max_vid_index = max(height // 2, width // 2, max_vid_index)
         else:
-                freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
-                freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
-
-            freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
-            self.rope_cache[rope_key] = freqs.clone().contiguous()
-        vid_freqs = self.rope_cache[rope_key]
-
-        if self.scale_rope:
-            max_vid_index = max(height // 2, width // 2)
-        else:
-            max_vid_index = max(height, width)
+            max_vid_index = max(height, width, max_vid_index)
 
         txt_freqs = self.pos_freqs[max_vid_index : max_vid_index + txt_length, ...]
+        vid_freqs = torch.cat(vid_freqs, dim=0)
 
         return vid_freqs, txt_freqs
 
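Note: QwenEmbedRope.forward now takes a list of (frame, height, width) tuples so edit conditioning can reuse the positional machinery: each entry's frequencies are memoized in rope_cache under an "{idx}_{height}_{width}" key and concatenated at the end. A hedged usage sketch (constructor args elided; shapes assume two 32x32-patch latents):

    rope = QwenEmbedRope(...)  # hypothetical construction, real args elided
    vid_freqs, txt_freqs = rope([(1, 32, 32), (1, 32, 32)], txt_length=77, device="cuda:0")
    # vid_freqs has 1*32*32 + 1*32*32 = 2048 rows; txt_freqs has 77 rows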
@@ -364,6 +370,7 @@ class QwenImageDiT(PreTrainedModel):
     def forward(
         self,
         image: torch.Tensor,
+        edit: torch.Tensor = None,
         text: torch.Tensor = None,
         timestep: torch.LongTensor = None,
         txt_seq_lens: torch.LongTensor = None,
@@ -377,6 +384,7 @@ class QwenImageDiT(PreTrainedModel):
         cfg_parallel(
             (
                 image,
+                edit,
                 text,
                 timestep,
                 txt_seq_lens,
@@ -385,11 +393,18 @@ class QwenImageDiT(PreTrainedModel):
             ),
         ):
             conditioning = self.time_text_embed(timestep, image.dtype)
-            video_fhw = (1, h // 2, w // 2)  # frame, height, width
+            video_fhw = [(1, h // 2, w // 2)]  # frame, height, width
             max_length = txt_seq_lens.max().item()
+            image = self.patchify(image)
+            image_seq_len = image.shape[1]
+            if edit is not None:
+                edit = edit.to(dtype=image.dtype)
+                edit = self.patchify(edit)
+                image = torch.cat([image, edit], dim=1)
+                video_fhw += video_fhw
+
             image_rotary_emb = self.pos_embed(video_fhw, max_length, image.device)
 
-            image = self.patchify(image)
             image = self.img_in(image)
             text = self.txt_in(self.txt_norm(text[:, :max_length]))
 
@@ -397,6 +412,8 @@ class QwenImageDiT(PreTrainedModel):
                 text, image = block(image=image, text=text, temb=conditioning, image_rotary_emb=image_rotary_emb)
             image = self.norm_out(image, conditioning)
             image = self.proj_out(image)
+            if edit is not None:
+                image = image[:, :image_seq_len]
 
             image = self.unpatchify(image, h, w)
 
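Note: taken together, the DiT hunks implement edit conditioning by sequence concatenation: the edit latent is patchified and appended after the image tokens, video_fhw is doubled so each sub-sequence gets its own RoPE block, and only the first image_seq_len output tokens are unpatchified. A minimal shape sketch (dimensions illustrative):

    import torch

    B, N, D = 1, 1024, 64                    # batch, tokens per latent, channels
    image, edit = torch.randn(B, N, D), torch.randn(B, N, D)
    joint = torch.cat([image, edit], dim=1)  # (B, 2N, D): denoised + reference tokens
    out = joint                              # stand-in for the transformer stack
    out = out[:, :N]                         # keep only the denoised-image tokens
    assert out.shape == (B, N, D)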
diffsynth_engine/pipelines/base.py
@@ -164,7 +164,7 @@ class BasePipeline:
     @staticmethod
     def generate_noise(shape, seed=None, device="cpu", dtype=torch.float16):
         generator = None if seed is None else torch.Generator(device).manual_seed(seed)
-        noise = torch.randn(shape, generator=generator, device=device)
+        noise = torch.randn(shape, generator=generator, device=device, dtype=dtype)
         return noise
 
     def encode_image(
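Note: the fix threads dtype through to torch.randn, so noise is sampled directly in the requested precision instead of the global default (float32). With a fixed seed the draw is reproducible for a given device and dtype:

    n1 = BasePipeline.generate_noise((1, 4, 64, 64), seed=42, device="cpu", dtype=torch.bfloat16)
    n2 = BasePipeline.generate_noise((1, 4, 64, 64), seed=42, device="cpu", dtype=torch.bfloat16)
    assert torch.equal(n1, n2) and n1.dtype == torch.bfloat16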