PyPI - diffsynth - Version diff - 1.1.3 (tar.gz) → 1.1.7 (tar.gz) - Mend

diffsynth 1.1.3 (tar.gz) → 1.1.7 (tar.gz)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (222)

{diffsynth-1.1.3 → diffsynth-1.1.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: diffsynth
-Version: 1.1.3
+Version: 1.1.7
 Summary: Enjoy the magic of Diffusion models!
 Author: Artiprocher
 Classifier: Programming Language :: Python :: 3

{diffsynth-1.1.3 → diffsynth-1.1.7}/diffsynth/configs/model_config.py RENAMED Viewed

@@ -59,6 +59,7 @@ from ..models.wan_video_dit import WanModel
 from ..models.wan_video_text_encoder import WanTextEncoder
 from ..models.wan_video_image_encoder import WanImageEncoder
 from ..models.wan_video_vae import WanVideoVAE
+from ..models.wan_video_motion_controller import WanMotionControllerModel
 model_loader_configs = [
@@ -120,11 +121,16 @@ model_loader_configs = [
     (None, "9269f8db9040a9d860eaca435be61814", ["wan_video_dit"], [WanModel], "civitai"),
     (None, "aafcfd9672c3a2456dc46e1cb6e52c70", ["wan_video_dit"], [WanModel], "civitai"),
     (None, "6bfcfb3b342cb286ce886889d519a77e", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "6d6ccde6845b95ad9114ab993d917893", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "6bfcfb3b342cb286ce886889d519a77e", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "349723183fc063b2bfc10bb2835cf677", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "efa44cddf936c70abd0ea28b6cbe946c", ["wan_video_dit"], [WanModel], "civitai"),
     (None, "cb104773c6c2cb6df4f9529ad5c60d0b", ["wan_video_dit"], [WanModel], "diffusers"),
     (None, "9c8818c2cbea55eca56c7b447df170da", ["wan_video_text_encoder"], [WanTextEncoder], "civitai"),
     (None, "5941c53e207d62f20f9025686193c40b", ["wan_video_image_encoder"], [WanImageEncoder], "civitai"),
     (None, "1378ea763357eea97acdef78e65d6d96", ["wan_video_vae"], [WanVideoVAE], "civitai"),
     (None, "ccc42284ea13e1ad04693284c7a09be6", ["wan_video_vae"], [WanVideoVAE], "civitai"),
+    (None, "dbd5ec76bbf977983f972c151d545389", ["wan_video_motion_controller"], [WanMotionControllerModel], "civitai"),
 ]
 huggingface_model_loader_configs = [
     # These configs are provided for detecting model type automatically.

{diffsynth-1.1.3 → diffsynth-1.1.7}/diffsynth/models/wan_video_dit.py RENAMED Viewed

@@ -493,6 +493,62 @@ class WanModelStateDictConverter:
                 "num_layers": 40,
                 "eps": 1e-6
             }
+        elif hash_state_dict_keys(state_dict) == "6d6ccde6845b95ad9114ab993d917893":
+            config = {
+                "has_image_input": True,
+                "patch_size": [1, 2, 2],
+                "in_dim": 36,
+                "dim": 1536,
+                "ffn_dim": 8960,
+                "freq_dim": 256,
+                "text_dim": 4096,
+                "out_dim": 16,
+                "num_heads": 12,
+                "num_layers": 30,
+                "eps": 1e-6
+            }
+        elif hash_state_dict_keys(state_dict) == "6bfcfb3b342cb286ce886889d519a77e":
+            config = {
+                "has_image_input": True,
+                "patch_size": [1, 2, 2],
+                "in_dim": 36,
+                "dim": 5120,
+                "ffn_dim": 13824,
+                "freq_dim": 256,
+                "text_dim": 4096,
+                "out_dim": 16,
+                "num_heads": 40,
+                "num_layers": 40,
+                "eps": 1e-6
+            }
+        elif hash_state_dict_keys(state_dict) == "349723183fc063b2bfc10bb2835cf677":
+            config = {
+                "has_image_input": True,
+                "patch_size": [1, 2, 2],
+                "in_dim": 48,
+                "dim": 1536,
+                "ffn_dim": 8960,
+                "freq_dim": 256,
+                "text_dim": 4096,
+                "out_dim": 16,
+                "num_heads": 12,
+                "num_layers": 30,
+                "eps": 1e-6
+            }
+        elif hash_state_dict_keys(state_dict) == "efa44cddf936c70abd0ea28b6cbe946c":
+            config = {
+                "has_image_input": True,
+                "patch_size": [1, 2, 2],
+                "in_dim": 48,
+                "dim": 5120,
+                "ffn_dim": 13824,
+                "freq_dim": 256,
+                "text_dim": 4096,
+                "out_dim": 16,
+                "num_heads": 40,
+                "num_layers": 40,
+                "eps": 1e-6
+            }
         else:
             config = {}
         return state_dict, config

diffsynth-1.1.7/diffsynth/models/wan_video_motion_controller.py ADDED Viewed

@@ -0,0 +1,44 @@
+import torch
+import torch.nn as nn
+from .wan_video_dit import sinusoidal_embedding_1d
+class WanMotionControllerModel(torch.nn.Module):
+    def __init__(self, freq_dim=256, dim=1536):
+        super().__init__()
+        self.freq_dim = freq_dim
+        self.linear = nn.Sequential(
+            nn.Linear(freq_dim, dim),
+            nn.SiLU(),
+            nn.Linear(dim, dim),
+            nn.SiLU(),
+            nn.Linear(dim, dim * 6),
+        )
+    def forward(self, motion_bucket_id):
+        emb = sinusoidal_embedding_1d(self.freq_dim, motion_bucket_id * 10)
+        emb = self.linear(emb)
+        return emb
+    def init(self):
+        state_dict = self.linear[-1].state_dict()
+        state_dict = {i: state_dict[i] * 0 for i in state_dict}
+        self.linear[-1].load_state_dict(state_dict)
+    @staticmethod
+    def state_dict_converter():
+        return WanMotionControllerModelDictConverter()
+class WanMotionControllerModelDictConverter:
+    def __init__(self):
+        pass
+    def from_diffusers(self, state_dict):
+        return state_dict
+    def from_civitai(self, state_dict):
+        return state_dict

{diffsynth-1.1.3 → diffsynth-1.1.7}/diffsynth/pipelines/wan_video.py RENAMED Viewed

@@ -18,6 +18,7 @@ from ..vram_management import enable_vram_management, AutoWrappedModule, AutoWra
 from ..models.wan_video_text_encoder import T5RelativeEmbedding, T5LayerNorm
 from ..models.wan_video_dit import RMSNorm, sinusoidal_embedding_1d
 from ..models.wan_video_vae import RMS_norm, CausalConv3d, Upsample
+from ..models.wan_video_motion_controller import WanMotionControllerModel
@@ -31,7 +32,8 @@ class WanVideoPipeline(BasePipeline):
         self.image_encoder: WanImageEncoder = None
         self.dit: WanModel = None
         self.vae: WanVideoVAE = None
-        self.model_names = ['text_encoder', 'dit', 'vae', 'image_encoder']
+        self.motion_controller: WanMotionControllerModel = None
+        self.model_names = ['text_encoder', 'dit', 'vae', 'image_encoder', 'motion_controller']
         self.height_division_factor = 16
         self.width_division_factor = 16
         self.use_unified_sequence_parallel = False
@@ -122,6 +124,22 @@ class WanVideoPipeline(BasePipeline):
                     computation_device=self.device,
                 ),
             )
+        if self.motion_controller is not None:
+            dtype = next(iter(self.motion_controller.parameters())).dtype
+            enable_vram_management(
+                self.motion_controller,
+                module_map = {
+                    torch.nn.Linear: AutoWrappedLinear,
+                },
+                module_config = dict(
+                    offload_dtype=dtype,
+                    offload_device="cpu",
+                    onload_dtype=dtype,
+                    onload_device="cpu",
+                    computation_dtype=dtype,
+                    computation_device=self.device,
+                ),
+            )
         self.enable_cpu_offload()
@@ -134,6 +152,7 @@ class WanVideoPipeline(BasePipeline):
         self.dit = model_manager.fetch_model("wan_video_dit")
         self.vae = model_manager.fetch_model("wan_video_vae")
         self.image_encoder = model_manager.fetch_model("wan_video_image_encoder")
+        self.motion_controller = model_manager.fetch_model("wan_video_motion_controller")
     @staticmethod
@@ -163,22 +182,47 @@ class WanVideoPipeline(BasePipeline):
         return {"context": prompt_emb}
-    def encode_image(self, image, num_frames, height, width):
+    def encode_image(self, image, end_image, num_frames, height, width):
         image = self.preprocess_image(image.resize((width, height))).to(self.device)
         clip_context = self.image_encoder.encode_image([image])
         msk = torch.ones(1, num_frames, height//8, width//8, device=self.device)
         msk[:, 1:] = 0
+        if end_image is not None:
+            end_image = self.preprocess_image(end_image.resize((width, height))).to(self.device)
+            vae_input = torch.concat([image.transpose(0,1), torch.zeros(3, num_frames-2, height, width).to(image.device), end_image.transpose(0,1)],dim=1)
+            msk[:, -1:] = 1
+        else:
+            vae_input = torch.concat([image.transpose(0, 1), torch.zeros(3, num_frames-1, height, width).to(image.device)], dim=1)
         msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
         msk = msk.view(1, msk.shape[1] // 4, 4, height//8, width//8)
         msk = msk.transpose(1, 2)[0]
-        vae_input = torch.concat([image.transpose(0, 1), torch.zeros(3, num_frames-1, height, width).to(image.device)], dim=1)
         y = self.vae.encode([vae_input.to(dtype=self.torch_dtype, device=self.device)], device=self.device)[0]
         y = torch.concat([msk, y])
         y = y.unsqueeze(0)
         clip_context = clip_context.to(dtype=self.torch_dtype, device=self.device)
         y = y.to(dtype=self.torch_dtype, device=self.device)
         return {"clip_feature": clip_context, "y": y}
+    def encode_control_video(self, control_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
+        control_video = self.preprocess_images(control_video)
+        control_video = torch.stack(control_video, dim=2).to(dtype=self.torch_dtype, device=self.device)
+        latents = self.encode_video(control_video, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride).to(dtype=self.torch_dtype, device=self.device)
+        return latents
+    def prepare_controlnet_kwargs(self, control_video, num_frames, height, width, clip_feature=None, y=None, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
+        if control_video is not None:
+            control_latents = self.encode_control_video(control_video, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
+            if clip_feature is None or y is None:
+                clip_feature = torch.zeros((1, 257, 1280), dtype=self.torch_dtype, device=self.device)
+                y = torch.zeros((1, 16, (num_frames - 1) // 4 + 1, height//8, width//8), dtype=self.torch_dtype, device=self.device)
+            else:
+                y = y[:, -16:]
+            y = torch.concat([control_latents, y], dim=1)
+        return {"clip_feature": clip_feature, "y": y}
     def tensor2video(self, frames):
@@ -204,6 +248,11 @@ class WanVideoPipeline(BasePipeline):
     def prepare_unified_sequence_parallel(self):
         return {"use_unified_sequence_parallel": self.use_unified_sequence_parallel}
+    def prepare_motion_bucket_id(self, motion_bucket_id):
+        motion_bucket_id = torch.Tensor((motion_bucket_id,)).to(dtype=self.torch_dtype, device=self.device)
+        return {"motion_bucket_id": motion_bucket_id}
     @torch.no_grad()
@@ -212,7 +261,9 @@ class WanVideoPipeline(BasePipeline):
         prompt,
         negative_prompt="",
         input_image=None,
+        end_image=None,
         input_video=None,
+        control_video=None,
         denoising_strength=1.0,
         seed=None,
         rand_device="cpu",
@@ -222,6 +273,7 @@ class WanVideoPipeline(BasePipeline):
         cfg_scale=5.0,
         num_inference_steps=50,
         sigma_shift=5.0,
+        motion_bucket_id=None,
         tiled=True,
         tile_size=(30, 52),
         tile_stride=(15, 26),
@@ -263,10 +315,21 @@ class WanVideoPipeline(BasePipeline):
         # Encode image
         if input_image is not None and self.image_encoder is not None:
             self.load_models_to_device(["image_encoder", "vae"])
-            image_emb = self.encode_image(input_image, num_frames, height, width)
+            image_emb = self.encode_image(input_image, end_image, num_frames, height, width)
         else:
             image_emb = {}
+        # ControlNet
+        if control_video is not None:
+            self.load_models_to_device(["image_encoder", "vae"])
+            image_emb = self.prepare_controlnet_kwargs(control_video, num_frames, height, width, **image_emb, **tiler_kwargs)
+        # Motion Controller
+        if self.motion_controller is not None and motion_bucket_id is not None:
+            motion_kwargs = self.prepare_motion_bucket_id(motion_bucket_id)
+        else:
+            motion_kwargs = {}
         # Extra input
         extra_input = self.prepare_extra_input(latents)
@@ -278,14 +341,24 @@ class WanVideoPipeline(BasePipeline):
         usp_kwargs = self.prepare_unified_sequence_parallel()
         # Denoise
-        self.load_models_to_device(["dit"])
+        self.load_models_to_device(["dit", "motion_controller"])
         for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)):
             timestep = timestep.unsqueeze(0).to(dtype=self.torch_dtype, device=self.device)
             # Inference
-            noise_pred_posi = model_fn_wan_video(self.dit, latents, timestep=timestep, **prompt_emb_posi, **image_emb, **extra_input, **tea_cache_posi, **usp_kwargs)
+            noise_pred_posi = model_fn_wan_video(
+                self.dit, motion_controller=self.motion_controller,
+                x=latents, timestep=timestep,
+                **prompt_emb_posi, **image_emb, **extra_input,
+                **tea_cache_posi, **usp_kwargs, **motion_kwargs
+            )
             if cfg_scale != 1.0:
-                noise_pred_nega = model_fn_wan_video(self.dit, latents, timestep=timestep, **prompt_emb_nega, **image_emb, **extra_input, **tea_cache_nega, **usp_kwargs)
+                noise_pred_nega = model_fn_wan_video(
+                    self.dit, motion_controller=self.motion_controller,
+                    x=latents, timestep=timestep,
+                    **prompt_emb_nega, **image_emb, **extra_input,
+                    **tea_cache_nega, **usp_kwargs, **motion_kwargs
+                )
                 noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
             else:
                 noise_pred = noise_pred_posi
@@ -358,13 +431,15 @@ class TeaCache:
 def model_fn_wan_video(
     dit: WanModel,
-    x: torch.Tensor,
-    timestep: torch.Tensor,
-    context: torch.Tensor,
+    motion_controller: WanMotionControllerModel = None,
+    x: torch.Tensor = None,
+    timestep: torch.Tensor = None,
+    context: torch.Tensor = None,
     clip_feature: Optional[torch.Tensor] = None,
     y: Optional[torch.Tensor] = None,
     tea_cache: TeaCache = None,
     use_unified_sequence_parallel: bool = False,
+    motion_bucket_id: Optional[torch.Tensor] = None,
     **kwargs,
 ):
     if use_unified_sequence_parallel:
@@ -375,6 +450,8 @@ def model_fn_wan_video(
     t = dit.time_embedding(sinusoidal_embedding_1d(dit.freq_dim, timestep))
     t_mod = dit.time_projection(t).unflatten(1, (6, dit.dim))
+    if motion_bucket_id is not None and motion_controller is not None:
+        t_mod = t_mod + motion_controller(motion_bucket_id).unflatten(1, (6, dit.dim))
     context = dit.text_embedding(context)
     if dit.has_image_input:

{diffsynth-1.1.3 → diffsynth-1.1.7}/diffsynth.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: diffsynth
-Version: 1.1.3
+Version: 1.1.7
 Summary: Enjoy the magic of Diffusion models!
 Author: Artiprocher
 Classifier: Programming Language :: Python :: 3

{diffsynth-1.1.3 → diffsynth-1.1.7}/diffsynth.egg-info/SOURCES.txt RENAMED Viewed

@@ -117,6 +117,7 @@ diffsynth/models/tiler.py
 diffsynth/models/utils.py
 diffsynth/models/wan_video_dit.py
 diffsynth/models/wan_video_image_encoder.py
+diffsynth/models/wan_video_motion_controller.py
 diffsynth/models/wan_video_text_encoder.py
 diffsynth/models/wan_video_vae.py
 diffsynth/pipelines/__init__.py

{diffsynth-1.1.3 → diffsynth-1.1.7}/setup.py RENAMED Viewed

@@ -14,7 +14,7 @@ else:
 setup(
     name="diffsynth",
-    version="1.1.3",
+    version="1.1.7",
     description="Enjoy the magic of Diffusion models!",
     author="Artiprocher",
     packages=find_packages(),