diffsynth-engine 0.6.1.dev23__tar.gz → 0.6.1.dev25__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210)
  1. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/PKG-INFO +1 -1
  2. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/__init__.py +6 -2
  3. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/configs/__init__.py +10 -6
  4. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/configs/pipeline.py +2 -25
  5. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/basic/transformer_helper.py +36 -2
  6. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/basic/video_sparse_attention.py +4 -1
  7. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/qwen_image/qwen_image_dit.py +13 -17
  8. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/pipelines/base.py +30 -2
  9. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/pipelines/flux_image.py +2 -2
  10. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/pipelines/qwen_image.py +17 -7
  11. diffsynth_engine-0.6.1.dev25/diffsynth_engine/pipelines/utils.py +71 -0
  12. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/pipelines/wan_s2v.py +1 -1
  13. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/pipelines/wan_video.py +8 -4
  14. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tokenizers/base.py +6 -0
  15. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tokenizers/qwen2.py +12 -4
  16. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/parallel.py +6 -7
  17. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine.egg-info/PKG-INFO +1 -1
  18. diffsynth_engine-0.6.1.dev23/diffsynth_engine/pipelines/utils.py +0 -19
  19. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/.gitattributes +0 -0
  20. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/.gitignore +0 -0
  21. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/.pre-commit-config.yaml +0 -0
  22. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/LICENSE +0 -0
  23. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/MANIFEST.in +0 -0
  24. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/README.md +0 -0
  25. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/assets/dingtalk.png +0 -0
  26. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/assets/showcase.jpeg +0 -0
  27. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/assets/tongyi.svg +0 -0
  28. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/__init__.py +0 -0
  29. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/__init__.py +0 -0
  30. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +0 -0
  31. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/flow_match/__init__.py +0 -0
  32. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_beta.py +0 -0
  33. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_ddim.py +0 -0
  34. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +0 -0
  35. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/__init__.py +0 -0
  36. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/beta.py +0 -0
  37. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/ddim.py +0 -0
  38. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/exponential.py +0 -0
  39. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/karras.py +0 -0
  40. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/linear.py +0 -0
  41. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/sgm_uniform.py +0 -0
  42. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/__init__.py +0 -0
  43. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/flow_match/__init__.py +0 -0
  44. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py +0 -0
  45. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/stable_diffusion/__init__.py +0 -0
  46. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/stable_diffusion/brownian_tree.py +0 -0
  47. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/stable_diffusion/ddpm.py +0 -0
  48. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/stable_diffusion/deis.py +0 -0
  49. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m.py +0 -0
  50. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m_sde.py +0 -0
  51. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_3m_sde.py +0 -0
  52. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/stable_diffusion/epsilon.py +0 -0
  53. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/stable_diffusion/euler.py +0 -0
  54. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/algorithm/sampler/stable_diffusion/euler_ancestral.py +0 -0
  55. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/components/vae.json +0 -0
  56. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/flux/flux_dit.json +0 -0
  57. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/flux/flux_text_encoder.json +0 -0
  58. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/flux/flux_vae.json +0 -0
  59. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/qwen_image/qwen2_5_vl_config.json +0 -0
  60. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/qwen_image/qwen2_5_vl_vision_config.json +0 -0
  61. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/qwen_image/qwen_image_vae.json +0 -0
  62. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/qwen_image/qwen_image_vae_keymap.json +0 -0
  63. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/sd/sd_text_encoder.json +0 -0
  64. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/sd/sd_unet.json +0 -0
  65. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/sd3/sd3_dit.json +0 -0
  66. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/sd3/sd3_text_encoder.json +0 -0
  67. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/sdxl/sdxl_text_encoder.json +0 -0
  68. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/sdxl/sdxl_unet.json +0 -0
  69. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/wan/dit/wan2.1_flf2v_14b.json +0 -0
  70. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/wan/dit/wan2.1_i2v_14b.json +0 -0
  71. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/wan/dit/wan2.1_t2v_1.3b.json +0 -0
  72. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/wan/dit/wan2.1_t2v_14b.json +0 -0
  73. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/wan/dit/wan2.2_i2v_a14b.json +0 -0
  74. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/wan/dit/wan2.2_s2v_14b.json +0 -0
  75. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/wan/dit/wan2.2_t2v_a14b.json +0 -0
  76. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/wan/dit/wan2.2_ti2v_5b.json +0 -0
  77. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json +0 -0
  78. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/wan/vae/wan2.1_vae.json +0 -0
  79. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/wan/vae/wan2.2_vae.json +0 -0
  80. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/models/wan/vae/wan_vae_keymap.json +0 -0
  81. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/merges.txt +0 -0
  82. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/special_tokens_map.json +0 -0
  83. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/tokenizer_config.json +0 -0
  84. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/vocab.json +0 -0
  85. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/special_tokens_map.json +0 -0
  86. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/spiece.model +0 -0
  87. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer.json +0 -0
  88. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer_config.json +0 -0
  89. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/qwen_image/qwen2_vl_image_processor.json +0 -0
  90. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/added_tokens.json +0 -0
  91. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/merges.txt +0 -0
  92. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/special_tokens_map.json +0 -0
  93. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/tokenizer.json +0 -0
  94. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/tokenizer_config.json +0 -0
  95. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/vocab.json +0 -0
  96. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/merges.txt +0 -0
  97. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/special_tokens_map.json +0 -0
  98. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/tokenizer_config.json +0 -0
  99. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/vocab.json +0 -0
  100. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/merges.txt +0 -0
  101. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/special_tokens_map.json +0 -0
  102. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/tokenizer_config.json +0 -0
  103. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/vocab.json +0 -0
  104. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/special_tokens_map.json +0 -0
  105. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/spiece.model +0 -0
  106. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json +0 -0
  107. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json +0 -0
  108. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/configs/controlnet.py +0 -0
  109. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/kernels/__init__.py +0 -0
  110. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/__init__.py +0 -0
  111. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/base.py +0 -0
  112. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/basic/__init__.py +0 -0
  113. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/basic/attention.py +0 -0
  114. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/basic/lora.py +0 -0
  115. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/basic/relative_position_emb.py +0 -0
  116. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/basic/timestep.py +0 -0
  117. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/basic/unet_helper.py +0 -0
  118. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/flux/__init__.py +0 -0
  119. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/flux/flux_controlnet.py +0 -0
  120. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/flux/flux_dit.py +0 -0
  121. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/flux/flux_dit_fbcache.py +0 -0
  122. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/flux/flux_ipadapter.py +0 -0
  123. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/flux/flux_redux.py +0 -0
  124. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/flux/flux_text_encoder.py +0 -0
  125. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/flux/flux_vae.py +0 -0
  126. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/hunyuan3d/__init__.py +0 -0
  127. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/hunyuan3d/dino_image_encoder.py +0 -0
  128. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/hunyuan3d/hunyuan3d_dit.py +0 -0
  129. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/hunyuan3d/hunyuan3d_vae.py +0 -0
  130. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/hunyuan3d/moe.py +0 -0
  131. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/hunyuan3d/surface_extractor.py +0 -0
  132. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/hunyuan3d/volume_decoder.py +0 -0
  133. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/qwen_image/__init__.py +0 -0
  134. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/qwen_image/qwen2_5_vl.py +0 -0
  135. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py +0 -0
  136. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/qwen_image/qwen_image_vae.py +0 -0
  137. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sd/__init__.py +0 -0
  138. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sd/sd_controlnet.py +0 -0
  139. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sd/sd_text_encoder.py +0 -0
  140. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sd/sd_unet.py +0 -0
  141. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sd/sd_vae.py +0 -0
  142. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sd3/__init__.py +0 -0
  143. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sd3/sd3_dit.py +0 -0
  144. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sd3/sd3_text_encoder.py +0 -0
  145. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sd3/sd3_vae.py +0 -0
  146. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sdxl/__init__.py +0 -0
  147. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sdxl/sdxl_controlnet.py +0 -0
  148. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sdxl/sdxl_text_encoder.py +0 -0
  149. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sdxl/sdxl_unet.py +0 -0
  150. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/sdxl/sdxl_vae.py +0 -0
  151. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/text_encoder/__init__.py +0 -0
  152. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/text_encoder/clip.py +0 -0
  153. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/text_encoder/siglip.py +0 -0
  154. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/text_encoder/t5.py +0 -0
  155. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/vae/__init__.py +0 -0
  156. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/vae/vae.py +0 -0
  157. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/wan/__init__.py +0 -0
  158. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/wan/wan_audio_encoder.py +0 -0
  159. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/wan/wan_dit.py +0 -0
  160. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/wan/wan_image_encoder.py +0 -0
  161. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/wan/wan_s2v_dit.py +0 -0
  162. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/wan/wan_text_encoder.py +0 -0
  163. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/models/wan/wan_vae.py +0 -0
  164. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/pipelines/__init__.py +0 -0
  165. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/pipelines/hunyuan3d_shape.py +0 -0
  166. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/pipelines/sd_image.py +0 -0
  167. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/pipelines/sdxl_image.py +0 -0
  168. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/processor/__init__.py +0 -0
  169. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/processor/canny_processor.py +0 -0
  170. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/processor/depth_processor.py +0 -0
  171. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tokenizers/__init__.py +0 -0
  172. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tokenizers/clip.py +0 -0
  173. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tokenizers/qwen2_vl_image_processor.py +0 -0
  174. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tokenizers/qwen2_vl_processor.py +0 -0
  175. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tokenizers/t5.py +0 -0
  176. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tokenizers/wan.py +0 -0
  177. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tools/__init__.py +0 -0
  178. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tools/flux_inpainting_tool.py +0 -0
  179. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tools/flux_outpainting_tool.py +0 -0
  180. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tools/flux_reference_tool.py +0 -0
  181. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/tools/flux_replace_tool.py +0 -0
  182. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/__init__.py +0 -0
  183. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/cache.py +0 -0
  184. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/constants.py +0 -0
  185. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/download.py +0 -0
  186. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/env.py +0 -0
  187. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/flag.py +0 -0
  188. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/fp8_linear.py +0 -0
  189. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/gguf.py +0 -0
  190. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/image.py +0 -0
  191. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/loader.py +0 -0
  192. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/lock.py +0 -0
  193. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/logging.py +0 -0
  194. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/memory/__init__.py +0 -0
  195. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/memory/linear_regression.py +0 -0
  196. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/memory/memory_predcit_model.py +0 -0
  197. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/offload.py +0 -0
  198. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/onnx.py +0 -0
  199. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/platform.py +0 -0
  200. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/prompt.py +0 -0
  201. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine/utils/video.py +0 -0
  202. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine.egg-info/SOURCES.txt +0 -0
  203. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine.egg-info/dependency_links.txt +0 -0
  204. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine.egg-info/requires.txt +0 -0
  205. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/diffsynth_engine.egg-info/top_level.txt +0 -0
  206. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/docs/tutorial.md +0 -0
  207. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/docs/tutorial_zh.md +0 -0
  208. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/pyproject.toml +0 -0
  209. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/setup.cfg +0 -0
  210. {diffsynth_engine-0.6.1.dev23 → diffsynth_engine-0.6.1.dev25}/setup.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.6.1.dev23
+Version: 0.6.1.dev25
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent

diffsynth_engine/__init__.py
@@ -12,11 +12,13 @@ from .configs import (
     WanStateDicts,
     QwenImageStateDicts,
     AttnImpl,
+    SpargeAttentionParams,
+    VideoSparseAttentionParams,
+    LoraConfig,
     ControlNetParams,
     ControlType,
     QwenImageControlNetParams,
     QwenImageControlType,
-    LoraConfig,
 )
 from .pipelines import (
     SDImagePipeline,
@@ -59,6 +61,9 @@ __all__ = [
     "WanStateDicts",
     "QwenImageStateDicts",
     "AttnImpl",
+    "SpargeAttentionParams",
+    "VideoSparseAttentionParams",
+    "LoraConfig",
     "ControlNetParams",
     "ControlType",
     "QwenImageControlNetParams",
@@ -79,7 +84,6 @@ __all__ = [
     "FluxIPAdapterRefTool",
     "FluxReplaceByControlTool",
     "FluxReduxRefTool",
-    "LoraConfig",
     "fetch_model",
     "fetch_modelscope_model",
     "register_fetch_modelscope_model",

diffsynth_engine/configs/__init__.py
@@ -17,14 +17,16 @@ from .pipeline import (
     WanStateDicts,
     WanS2VStateDicts,
     QwenImageStateDicts,
-    LoraConfig,
     AttnImpl,
+    SpargeAttentionParams,
+    VideoSparseAttentionParams,
+    LoraConfig,
 )
 from .controlnet import (
     ControlType,
     ControlNetParams,
-    QwenImageControlNetParams,
     QwenImageControlType,
+    QwenImageControlNetParams,
 )

 __all__ = [
@@ -46,10 +48,12 @@ __all__ = [
     "WanStateDicts",
     "WanS2VStateDicts",
     "QwenImageStateDicts",
-    "QwenImageControlType",
-    "QwenImageControlNetParams",
+    "AttnImpl",
+    "SpargeAttentionParams",
+    "VideoSparseAttentionParams",
+    "LoraConfig",
     "ControlType",
     "ControlNetParams",
-    "LoraConfig",
-    "AttnImpl",
+    "QwenImageControlType",
+    "QwenImageControlNetParams",
 ]

diffsynth_engine/configs/pipeline.py
@@ -5,7 +5,6 @@ from dataclasses import dataclass, field
 from typing import List, Dict, Tuple, Optional

 from diffsynth_engine.configs.controlnet import ControlType
-from diffsynth_engine.models.basic.video_sparse_attention import get_vsa_kwargs


 @dataclass
@@ -52,23 +51,6 @@ class AttentionConfig:
     dit_attn_impl: AttnImpl = AttnImpl.AUTO
     attn_params: Optional[SpargeAttentionParams | VideoSparseAttentionParams] = None

-    def get_attn_kwargs(self, latents: torch.Tensor, device: str) -> Dict:
-        attn_kwargs = {"attn_impl": self.dit_attn_impl.value}
-        if isinstance(self.attn_params, SpargeAttentionParams):
-            assert self.dit_attn_impl == AttnImpl.SPARGE
-            attn_kwargs.update(
-                {
-                    "smooth_k": self.attn_params.smooth_k,
-                    "simthreshd1": self.attn_params.simthreshd1,
-                    "cdfthreshd": self.attn_params.cdfthreshd,
-                    "pvthreshd": self.attn_params.pvthreshd,
-                }
-            )
-        elif isinstance(self.attn_params, VideoSparseAttentionParams):
-            assert self.dit_attn_impl == AttnImpl.VSA
-            attn_kwargs.update(get_vsa_kwargs(latents.shape[2:], (1, 2, 2), self.attn_params.sparsity, device=device))
-        return attn_kwargs
-

 @dataclass
 class OptimizationConfig:
@@ -262,16 +244,11 @@ class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfi
     encoder_dtype: torch.dtype = torch.bfloat16
     vae_dtype: torch.dtype = torch.float32

+    load_encoder: bool = True
+
     # override OptimizationConfig
     fbcache_relative_l1_threshold = 0.009

-    # override BaseConfig
-    vae_tiled: bool = True
-    vae_tile_size: Tuple[int, int] = (34, 34)
-    vae_tile_stride: Tuple[int, int] = (18, 16)
-
-    load_encoder: bool = True
-
     @classmethod
     def basic_config(
         cls,

diffsynth_engine/models/basic/transformer_helper.py
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import math


@@ -91,8 +92,8 @@ class NewGELUActivation(nn.Module):
     the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """

-    def forward(self, input: "torch.Tensor") -> "torch.Tensor":
-        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


 class ApproximateGELU(nn.Module):
@@ -115,3 +116,36 @@ class ApproximateGELU(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
         return x * torch.sigmoid(1.702 * x)
+
+
+class GELU(nn.Module):
+    r"""
+    GELU activation function with tanh approximation support with `approximate="tanh"`.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+        approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(
+        self,
+        dim_in: int,
+        dim_out: int,
+        approximate: str = "none",
+        bias: bool = True,
+        device: str = "cuda:0",
+        dtype: torch.dtype = torch.float16,
+    ):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias, device=device, dtype=dtype)
+        self.approximate = approximate
+
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        return F.gelu(gate, approximate=self.approximate)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        x = self.gelu(x)
+        return x
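
Note: the new GELU module with approximate="tanh" delegates to PyTorch's built-in tanh approximation, which is the same formula NewGELUActivation writes out by hand. A quick stand-alone check (illustrative only, not part of the package):

import math
import torch
import torch.nn.functional as F

# The tanh-approximated GELU used by GELU(..., approximate="tanh") matches the
# explicit formula from NewGELUActivation.
x = torch.randn(4, 8)
manual = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
builtin = F.gelu(x, approximate="tanh")
assert torch.allclose(manual, builtin, atol=1e-5)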

diffsynth_engine/models/basic/video_sparse_attention.py
@@ -2,9 +2,12 @@ import torch
 import math
 import functools

-from vsa import video_sparse_attn as vsa_core
+from diffsynth_engine.utils.flag import VIDEO_SPARSE_ATTN_AVAILABLE
 from diffsynth_engine.utils.parallel import get_sp_ulysses_group, get_sp_ring_world_size

+if VIDEO_SPARSE_ATTN_AVAILABLE:
+    from vsa import video_sparse_attn as vsa_core
+
 VSA_TILE_SIZE = (4, 4, 4)

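
Note: guarding the `vsa` import behind VIDEO_SPARSE_ATTN_AVAILABLE makes the module importable on machines without the optional `vsa` package. The flag's definition is not part of this diff; a minimal sketch of how such a flag is commonly derived (an assumption, not the package's actual code):

import importlib.util

# Hypothetical availability flag; the real definition lives in
# diffsynth_engine/utils/flag.py and may differ.
VIDEO_SPARSE_ATTN_AVAILABLE = importlib.util.find_spec("vsa") is not None

if VIDEO_SPARSE_ATTN_AVAILABLE:
    from vsa import video_sparse_attn as vsa_core  # imported only when vsa is installed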

diffsynth_engine/models/qwen_image/qwen_image_dit.py
@@ -6,7 +6,7 @@ from einops import rearrange
 from diffsynth_engine.models.base import StateDictConverter, PreTrainedModel
 from diffsynth_engine.models.basic import attention as attention_ops
 from diffsynth_engine.models.basic.timestep import TimestepEmbeddings
-from diffsynth_engine.models.basic.transformer_helper import AdaLayerNorm, ApproximateGELU, RMSNorm
+from diffsynth_engine.models.basic.transformer_helper import AdaLayerNorm, GELU, RMSNorm
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.fp8_linear import fp8_inference
 from diffsynth_engine.utils.parallel import (
@@ -144,7 +144,7 @@ class QwenFeedForward(nn.Module):
         super().__init__()
         inner_dim = int(dim * 4)
         self.net = nn.ModuleList([])
-        self.net.append(ApproximateGELU(dim, inner_dim, device=device, dtype=dtype))
+        self.net.append(GELU(dim, inner_dim, approximate="tanh", device=device, dtype=dtype))
         self.net.append(nn.Dropout(dropout))
         self.net.append(nn.Linear(inner_dim, dim_out, device=device, dtype=dtype))

@@ -155,8 +155,8 @@


 def apply_rotary_emb_qwen(x: torch.Tensor, freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]]):
-    x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
-    x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
+    x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))  # (b, s, h, d) -> (b, s, h, d/2, 2)
+    x_out = torch.view_as_real(x_rotated * freqs_cis.unsqueeze(1)).flatten(3)  # (b, s, h, d/2, 2) -> (b, s, h, d)
     return x_out.type_as(x)


@@ -200,13 +200,13 @@ class QwenDoubleStreamAttention(nn.Module):
         img_q, img_k, img_v = self.to_q(image), self.to_k(image), self.to_v(image)
         txt_q, txt_k, txt_v = self.add_q_proj(text), self.add_k_proj(text), self.add_v_proj(text)

-        img_q = rearrange(img_q, "b s (h d) -> b h s d", h=self.num_heads)
-        img_k = rearrange(img_k, "b s (h d) -> b h s d", h=self.num_heads)
-        img_v = rearrange(img_v, "b s (h d) -> b h s d", h=self.num_heads)
+        img_q = rearrange(img_q, "b s (h d) -> b s h d", h=self.num_heads)
+        img_k = rearrange(img_k, "b s (h d) -> b s h d", h=self.num_heads)
+        img_v = rearrange(img_v, "b s (h d) -> b s h d", h=self.num_heads)

-        txt_q = rearrange(txt_q, "b s (h d) -> b h s d", h=self.num_heads)
-        txt_k = rearrange(txt_k, "b s (h d) -> b h s d", h=self.num_heads)
-        txt_v = rearrange(txt_v, "b s (h d) -> b h s d", h=self.num_heads)
+        txt_q = rearrange(txt_q, "b s (h d) -> b s h d", h=self.num_heads)
+        txt_k = rearrange(txt_k, "b s (h d) -> b s h d", h=self.num_heads)
+        txt_v = rearrange(txt_v, "b s (h d) -> b s h d", h=self.num_heads)

         img_q, img_k = self.norm_q(img_q), self.norm_k(img_k)
         txt_q, txt_k = self.norm_added_q(txt_q), self.norm_added_k(txt_k)
@@ -218,13 +218,9 @@
         txt_q = apply_rotary_emb_qwen(txt_q, txt_freqs)
         txt_k = apply_rotary_emb_qwen(txt_k, txt_freqs)

-        joint_q = torch.cat([txt_q, img_q], dim=2)
-        joint_k = torch.cat([txt_k, img_k], dim=2)
-        joint_v = torch.cat([txt_v, img_v], dim=2)
-
-        joint_q = joint_q.transpose(1, 2)
-        joint_k = joint_k.transpose(1, 2)
-        joint_v = joint_v.transpose(1, 2)
+        joint_q = torch.cat([txt_q, img_q], dim=1)
+        joint_k = torch.cat([txt_k, img_k], dim=1)
+        joint_v = torch.cat([txt_v, img_v], dim=1)

         attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
         joint_attn_out = attention_ops.attention(joint_q, joint_k, joint_v, attn_mask=attn_mask, **attn_kwargs)
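
Note: after this refactor QwenDoubleStreamAttention keeps q/k/v in (batch, sequence, heads, head_dim) layout end to end, so text and image tokens are concatenated along the sequence axis (dim=1) and the extra transposes before the attention call are gone; the rotary frequencies are broadcast over the head axis via unsqueeze(1). A small shape check with made-up sizes (illustrative only):

import torch
from einops import rearrange

b, s_img, s_txt, h, d = 2, 16, 8, 4, 32
img_q = rearrange(torch.randn(b, s_img, h * d), "b s (h d) -> b s h d", h=h)
txt_q = rearrange(torch.randn(b, s_txt, h * d), "b s (h d) -> b s h d", h=h)
# Text and image tokens now share the sequence axis, so concatenation happens on dim=1.
joint_q = torch.cat([txt_q, img_q], dim=1)
assert joint_q.shape == (b, s_txt + s_img, h, d)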

diffsynth_engine/pipelines/base.py
@@ -5,7 +5,15 @@ from einops import rearrange
 from typing import Dict, List, Tuple, Union, Optional
 from PIL import Image

-from diffsynth_engine.configs import BaseConfig, BaseStateDicts, LoraConfig
+from diffsynth_engine.configs import (
+    BaseConfig,
+    BaseStateDicts,
+    LoraConfig,
+    AttnImpl,
+    SpargeAttentionParams,
+    VideoSparseAttentionParams,
+)
+from diffsynth_engine.models.basic.video_sparse_attention import get_vsa_kwargs
 from diffsynth_engine.utils.offload import enable_sequential_cpu_offload, offload_model_to_dict, restore_model_from_dict
 from diffsynth_engine.utils.fp8_linear import enable_fp8_autocast
 from diffsynth_engine.utils.gguf import load_gguf_checkpoint
@@ -33,6 +41,7 @@ class BasePipeline:
         dtype=torch.float16,
     ):
         super().__init__()
+        self.config = None
         self.vae_tiled = vae_tiled
         self.vae_tile_size = vae_tile_size
         self.vae_tile_stride = vae_tile_stride
@@ -48,7 +57,7 @@ class BasePipeline:
         raise NotImplementedError()

     @classmethod
-    def from_state_dict(cls, state_dicts: BaseStateDicts, pipeline_config: BaseConfig) -> "BasePipeline":
+    def from_state_dict(cls, state_dicts: BaseStateDicts, config: BaseConfig) -> "BasePipeline":
         raise NotImplementedError()

     def update_weights(self, state_dicts: BaseStateDicts) -> None:
@@ -260,6 +269,25 @@ class BasePipeline:
         )
         return init_latents, latents, sigmas, timesteps

+    def get_attn_kwargs(self, latents: torch.Tensor) -> Dict:
+        attn_kwargs = {"attn_impl": self.config.dit_attn_impl.value}
+        if isinstance(self.config.attn_params, SpargeAttentionParams):
+            assert self.config.dit_attn_impl == AttnImpl.SPARGE
+            attn_kwargs.update(
+                {
+                    "smooth_k": self.config.attn_params.smooth_k,
+                    "simthreshd1": self.config.attn_params.simthreshd1,
+                    "cdfthreshd": self.config.attn_params.cdfthreshd,
+                    "pvthreshd": self.config.attn_params.pvthreshd,
+                }
+            )
+        elif isinstance(self.config.attn_params, VideoSparseAttentionParams):
+            assert self.config.dit_attn_impl == AttnImpl.VSA
+            attn_kwargs.update(
+                get_vsa_kwargs(latents.shape[2:], (1, 2, 2), self.config.attn_params.sparsity, device=self.device)
+            )
+        return attn_kwargs
+
     def eval(self):
         for model_name in self.model_names:
             model = getattr(self, model_name)

diffsynth_engine/pipelines/flux_image.py
@@ -751,7 +751,7 @@ class FluxImagePipeline(BasePipeline):
         latents = latents.to(self.dtype)
         self.load_models_to_device(["dit"])

-        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
+        attn_kwargs = self.get_attn_kwargs(latents)
         noise_pred = self.dit(
             hidden_states=latents,
             timestep=timestep,
@@ -886,7 +886,7 @@
             empty_cache()
             param.model.to(self.device)

-            attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
+            attn_kwargs = self.get_attn_kwargs(latents)
             double_block_output, single_block_output = param.model(
                 hidden_states=latents,
                 control_condition=control_condition,

diffsynth_engine/pipelines/qwen_image.py
@@ -24,7 +24,7 @@ from diffsynth_engine.models.qwen_image import (
 from diffsynth_engine.models.qwen_image import QwenImageVAE
 from diffsynth_engine.tokenizers import Qwen2TokenizerFast, Qwen2VLProcessor
 from diffsynth_engine.pipelines import BasePipeline, LoRAStateDictConverter
-from diffsynth_engine.pipelines.utils import calculate_shift
+from diffsynth_engine.pipelines.utils import calculate_shift, pad_and_concat
 from diffsynth_engine.algorithm.noise_scheduler import RecifitedFlowScheduler
 from diffsynth_engine.algorithm.sampler import FlowMatchEulerSampler
 from diffsynth_engine.utils.constants import (
@@ -148,9 +148,17 @@ class QwenImagePipeline(BasePipeline):
         self.prompt_template_encode_start_idx = 34
         # qwen image edit
         self.edit_system_prompt = "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate."
-        self.edit_prompt_template_encode = "<|im_start|>system\n" + self.edit_system_prompt + "<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
+        self.edit_prompt_template_encode = (
+            "<|im_start|>system\n"
+            + self.edit_system_prompt
+            + "<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
+        )
         # qwen image edit plus
-        self.edit_plus_prompt_template_encode = "<|im_start|>system\n" + self.edit_system_prompt + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+        self.edit_plus_prompt_template_encode = (
+            "<|im_start|>system\n"
+            + self.edit_system_prompt
+            + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+        )

         self.edit_prompt_template_encode_start_idx = 64

@@ -200,7 +208,9 @@
         )
         if config.load_encoder:
             logger.info(f"loading state dict from {config.encoder_path} ...")
-            encoder_state_dict = cls.load_model_checkpoint(config.encoder_path, device="cpu", dtype=config.encoder_dtype)
+            encoder_state_dict = cls.load_model_checkpoint(
+                config.encoder_path, device="cpu", dtype=config.encoder_dtype
+            )

         state_dicts = QwenImageStateDicts(
             model=model_state_dict,
@@ -490,8 +500,8 @@
         else:
             # cfg by predict noise in one batch
             bs, _, h, w = latents.shape
-            prompt_emb = torch.cat([prompt_emb, negative_prompt_emb], dim=0)
-            prompt_emb_mask = torch.cat([prompt_emb_mask, negative_prompt_emb_mask], dim=0)
+            prompt_emb = pad_and_concat(prompt_emb, negative_prompt_emb)
+            prompt_emb_mask = pad_and_concat(prompt_emb_mask, negative_prompt_emb_mask)
             if entity_prompt_embs is not None:
                 entity_prompt_embs = [
                     torch.cat([x, y], dim=0) for x, y in zip(entity_prompt_embs, negative_entity_prompt_embs)
@@ -539,7 +549,7 @@
         entity_masks: Optional[List[torch.Tensor]] = None,
     ):
         self.load_models_to_device(["dit"])
-        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
+        attn_kwargs = self.get_attn_kwargs(latents)
         noise_pred = self.dit(
             image=latents,
             edit=image_latents,

diffsynth_engine-0.6.1.dev25/diffsynth_engine/pipelines/utils.py (new file)
@@ -0,0 +1,71 @@
+import torch
+import torch.nn.functional as F
+
+
+def accumulate(result, new_item):
+    if result is None:
+        return new_item
+    for i, item in enumerate(new_item):
+        result[i] += item
+    return result
+
+
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.15,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+def pad_and_concat(
+    tensor1: torch.Tensor,
+    tensor2: torch.Tensor,
+    concat_dim: int = 0,
+    pad_dim: int = 1,
+) -> torch.Tensor:
+    """
+    Concatenate two tensors along a specified dimension after padding along another dimension.
+
+    Assumes input tensors have shape (b, s, d), where:
+    - b: batch dimension
+    - s: sequence dimension (may differ)
+    - d: feature dimension
+
+    Args:
+        tensor1: First tensor with shape (b1, s1, d)
+        tensor2: Second tensor with shape (b2, s2, d)
+        concat_dim: Dimension to concatenate along, default is 0 (batch dimension)
+        pad_dim: Dimension to pad along, default is 1 (sequence dimension)
+
+    Returns:
+        Concatenated tensor, shape depends on concat_dim and pad_dim choices
+    """
+    assert tensor1.dim() == tensor2.dim(), "Both tensors must have the same number of dimensions"
+    assert concat_dim != pad_dim, "concat_dim and pad_dim cannot be the same"
+
+    len1, len2 = tensor1.shape[pad_dim], tensor2.shape[pad_dim]
+    max_len = max(len1, len2)
+
+    # Calculate the position of pad_dim in the padding list
+    # Padding format: from the last dimension, each pair represents (dim_n_left, dim_n_right, ..., dim_0_left, dim_0_right)
+    ndim = tensor1.dim()
+    padding = [0] * (2 * ndim)
+    pad_right_idx = -2 * pad_dim - 1
+
+    if len1 < max_len:
+        pad_len = max_len - len1
+        padding[pad_right_idx] = pad_len
+        tensor1 = F.pad(tensor1, padding, mode="constant", value=0)
+    elif len2 < max_len:
+        pad_len = max_len - len2
+        padding[pad_right_idx] = pad_len
+        tensor2 = F.pad(tensor2, padding, mode="constant", value=0)
+
+    # Concatenate along the specified dimension
+    return torch.cat([tensor1, tensor2], dim=concat_dim)
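
Note: pad_and_concat is what lets the QwenImagePipeline CFG branch batch positive and negative prompt embeddings whose sequence lengths differ: the shorter tensor is zero-padded on the right along dim=1, then the two are stacked along dim=0. Illustrative usage with made-up shapes:

import torch
from diffsynth_engine.pipelines.utils import pad_and_concat

prompt_emb = torch.randn(1, 20, 3584)           # hypothetical (batch, seq, dim)
negative_prompt_emb = torch.randn(1, 13, 3584)  # shorter sequence
merged = pad_and_concat(prompt_emb, negative_prompt_emb)  # default pad_dim=1, concat_dim=0
assert merged.shape == (2, 20, 3584)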

diffsynth_engine/pipelines/wan_s2v.py
@@ -394,7 +394,7 @@ class WanSpeech2VideoPipeline(WanVideoPipeline):
         void_audio_input: torch.Tensor | None = None,
     ):
         latents = latents.to(dtype=self.config.model_dtype, device=self.device)
-        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
+        attn_kwargs = self.get_attn_kwargs(latents)

         noise_pred = model(
             x=latents,

diffsynth_engine/pipelines/wan_video.py
@@ -144,7 +144,7 @@ class WanVideoPipeline(BasePipeline):
         lora_list: List[Tuple[str, float]],
         fused: bool = True,
         save_original_weight: bool = False,
-        lora_converter: Optional[WanLoRAConverter] = None
+        lora_converter: Optional[WanLoRAConverter] = None,
     ):
         assert self.config.tp_degree is None or self.config.tp_degree == 1, (
             "load LoRA is not allowed when tensor parallel is enabled; "
@@ -156,11 +156,15 @@
         )
         super().load_loras(lora_list, fused, save_original_weight, lora_converter)

-    def load_loras_low_noise(self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False):
+    def load_loras_low_noise(
+        self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False
+    ):
         assert self.dit2 is not None, "low noise LoRA can only be applied to Wan2.2"
         self.load_loras(lora_list, fused, save_original_weight, self.low_noise_lora_converter)

-    def load_loras_high_noise(self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False):
+    def load_loras_high_noise(
+        self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False
+    ):
         assert self.dit2 is not None, "high noise LoRA can only be applied to Wan2.2"
         self.load_loras(lora_list, fused, save_original_weight)

@@ -323,7 +327,7 @@

     def predict_noise(self, model, latents, image_clip_feature, image_y, timestep, context):
         latents = latents.to(dtype=self.config.model_dtype, device=self.device)
-        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
+        attn_kwargs = self.get_attn_kwargs(latents)

         noise_pred = model(
             x=latents,

diffsynth_engine/tokenizers/base.py
@@ -1,10 +1,16 @@
 # Modified from transformers.tokenization_utils_base
 from typing import Dict, List, Union, overload
+from enum import Enum


 TOKENIZER_CONFIG_FILE = "tokenizer_config.json"


+class PaddingStrategy(str, Enum):
+    LONGEST = "longest"
+    MAX_LENGTH = "max_length"
+
+
 class BaseTokenizer:
     SPECIAL_TOKENS_ATTRIBUTES = [
         "bos_token",

diffsynth_engine/tokenizers/qwen2.py
@@ -4,7 +4,7 @@ import torch
 from typing import Dict, List, Union, Optional
 from tokenizers import Tokenizer as TokenizerFast, AddedToken

-from diffsynth_engine.tokenizers.base import BaseTokenizer, TOKENIZER_CONFIG_FILE
+from diffsynth_engine.tokenizers.base import BaseTokenizer, PaddingStrategy, TOKENIZER_CONFIG_FILE


 VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
@@ -165,22 +165,28 @@ class Qwen2TokenizerFast(BaseTokenizer):
         texts: Union[str, List[str]],
         max_length: Optional[int] = None,
         padding_side: Optional[str] = None,
+        padding_strategy: Union[PaddingStrategy, str] = "longest",
         **kwargs,
     ) -> Dict[str, "torch.Tensor"]:
         """
         Tokenize text and prepare for model inputs.

         Args:
-            text (`str`, `List[str]`, *optional*):
+            texts (`str`, `List[str]`):
                 The sequence or batch of sequences to be encoded.

             max_length (`int`, *optional*):
-                Each encoded sequence will be truncated or padded to max_length.
+                Maximum length of the encoded sequences.

             padding_side (`str`, *optional*):
                 The side on which the padding should be applied. Should be selected between `"right"` and `"left"`.
                 Defaults to `"right"`.

+            padding_strategy (`PaddingStrategy`, `str`, *optional*):
+                If `"longest"`, will pad the sequences to the longest sequence in the batch.
+                If `"max_length"`, will pad the sequences to the `max_length` argument.
+                Defaults to `"longest"`.
+
         Returns:
             `Dict[str, "torch.Tensor"]`: tensor dict compatible with model_input_names.
         """
@@ -190,7 +196,9 @@ class Qwen2TokenizerFast(BaseTokenizer):

         batch_ids = self.batch_encode(texts)
         ids_lens = [len(ids_) for ids_ in batch_ids]
-        max_length = max_length if max_length is not None else min(max(ids_lens), self.model_max_length)
+        max_length = max_length if max_length is not None else self.model_max_length
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = min(max(ids_lens), max_length)
         padding_side = padding_side if padding_side is not None else self.padding_side

         encoded = torch.zeros(len(texts), max_length, dtype=torch.long)
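
Note: with the new padding_strategy argument, "longest" (the default) keeps the old behavior of padding to the longest sequence in the batch, capped by model_max_length, while "max_length" pads every batch to the same fixed length. A stand-alone sketch of the length resolution, mirroring the lines above rather than importing the tokenizer:

def resolve_pad_length(ids_lens, model_max_length, max_length=None, padding_strategy="longest"):
    # max_length falls back to the tokenizer's model_max_length
    max_length = max_length if max_length is not None else model_max_length
    if padding_strategy == "longest":
        # pad only as far as the longest sequence in this batch
        max_length = min(max(ids_lens), max_length)
    return max_length

assert resolve_pad_length([7, 12, 9], model_max_length=32768) == 12
assert resolve_pad_length([7, 12, 9], model_max_length=32768, padding_strategy="max_length") == 32768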

diffsynth_engine/utils/parallel.py
@@ -19,8 +19,6 @@ from typing import Dict, List, Set, Type, Union, Optional
 from queue import Empty

 import diffsynth_engine.models.basic.attention as attention_ops
-from diffsynth_engine.models import PreTrainedModel
-from diffsynth_engine.pipelines import BasePipeline
 from diffsynth_engine.utils.platform import empty_cache
 from diffsynth_engine.utils import logging

@@ -300,14 +298,15 @@ def _worker_loop(
         world_size=world_size,
     )

-    def wrap_for_parallel(module: Union[PreTrainedModel, BasePipeline]):
-        if isinstance(module, BasePipeline):
-            for model_name in module.model_names:
-                if isinstance(submodule := getattr(module, model_name), PreTrainedModel):
+    def wrap_for_parallel(module):
+        if hasattr(module, "model_names"):
+            for model_name in getattr(module, "model_names"):
+                submodule = getattr(module, model_name)
+                if getattr(submodule, "_supports_parallelization", False):
                     setattr(module, model_name, wrap_for_parallel(submodule))
             return module

-        if not module._supports_parallelization:
+        if not getattr(module, "_supports_parallelization", False):
             return module

         if tp_degree > 1:

diffsynth_engine.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.6.1.dev23
+Version: 0.6.1.dev25
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent

diffsynth_engine-0.6.1.dev23/diffsynth_engine/pipelines/utils.py (old file, removed)
@@ -1,19 +0,0 @@
-def accumulate(result, new_item):
-    if result is None:
-        return new_item
-    for i, item in enumerate(new_item):
-        result[i] += item
-    return result
-
-
-def calculate_shift(
-    image_seq_len,
-    base_seq_len: int = 256,
-    max_seq_len: int = 4096,
-    base_shift: float = 0.5,
-    max_shift: float = 1.15,
-):
-    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
-    b = base_shift - m * base_seq_len
-    mu = image_seq_len * m + b
-    return mu