diffsynth-engine 0.6.1.dev37__py3-none-any.whl → 0.6.1.dev39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffsynth_engine/__init__.py +2 -0
- diffsynth_engine/pipelines/__init__.py +2 -0
- diffsynth_engine/pipelines/base.py +1 -1
- diffsynth_engine/pipelines/qwen_image.py +0 -3
- diffsynth_engine/pipelines/wan_dmd.py +111 -0
- diffsynth_engine/pipelines/wan_video.py +35 -7
- diffsynth_engine/pipelines/z_image.py +44 -9
- {diffsynth_engine-0.6.1.dev37.dist-info → diffsynth_engine-0.6.1.dev39.dist-info}/METADATA +1 -1
- {diffsynth_engine-0.6.1.dev37.dist-info → diffsynth_engine-0.6.1.dev39.dist-info}/RECORD +12 -11
- {diffsynth_engine-0.6.1.dev37.dist-info → diffsynth_engine-0.6.1.dev39.dist-info}/WHEEL +0 -0
- {diffsynth_engine-0.6.1.dev37.dist-info → diffsynth_engine-0.6.1.dev39.dist-info}/licenses/LICENSE +0 -0
- {diffsynth_engine-0.6.1.dev37.dist-info → diffsynth_engine-0.6.1.dev39.dist-info}/top_level.txt +0 -0
diffsynth_engine/__init__.py
CHANGED
@@ -27,6 +27,7 @@ from .pipelines import (
     SDXLImagePipeline,
     FluxImagePipeline,
     WanVideoPipeline,
+    WanDMDPipeline,
     QwenImagePipeline,
     Hunyuan3DShapePipeline,
 )
@@ -81,6 +82,7 @@ __all__ = [
     "FluxIPAdapter",
     "FluxRedux",
     "WanVideoPipeline",
+    "WanDMDPipeline",
     "QwenImagePipeline",
     "Hunyuan3DShapePipeline",
     "FluxInpaintingTool",
diffsynth_engine/pipelines/__init__.py
CHANGED
@@ -4,6 +4,7 @@ from .sdxl_image import SDXLImagePipeline
 from .sd_image import SDImagePipeline
 from .wan_video import WanVideoPipeline
 from .wan_s2v import WanSpeech2VideoPipeline
+from .wan_dmd import WanDMDPipeline
 from .qwen_image import QwenImagePipeline
 from .hunyuan3d_shape import Hunyuan3DShapePipeline
 from .z_image import ZImagePipeline
@@ -16,6 +17,7 @@ __all__ = [
     "SDImagePipeline",
     "WanVideoPipeline",
     "WanSpeech2VideoPipeline",
+    "WanDMDPipeline",
     "QwenImagePipeline",
     "Hunyuan3DShapePipeline",
     "ZImagePipeline",
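With both modules updated, the new pipeline is importable from the package root as well as from the pipelines subpackage:

# Both import paths resolve to the same class after this change.
from diffsynth_engine import WanDMDPipeline
from diffsynth_engine.pipelines import WanDMDPipeline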
diffsynth_engine/pipelines/base.py
CHANGED
@@ -145,7 +145,7 @@ class BasePipeline:
         self.load_loras([(path, scale)], fused, save_original_weight)

     def apply_scheduler_config(self, scheduler_config: Dict):
-
+        self.noise_scheduler.update_config(scheduler_config)

     def unload_loras(self):
         raise NotImplementedError()
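With the concrete implementation moved onto BasePipeline, every pipeline shares the same scheduler-override entry point. A minimal sketch, assuming pipe is an already-constructed pipeline; the "shift" key is an illustrative assumption, not a documented config key:

# Forwards the dict to self.noise_scheduler.update_config.
pipe.apply_scheduler_config({"shift": 5.0})  # "shift" is a hypothetical key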
diffsynth_engine/pipelines/qwen_image.py
CHANGED
@@ -393,9 +393,6 @@ class QwenImagePipeline(BasePipeline):
         self.dit.unload_loras()
         self.noise_scheduler.restore_config()

-    def apply_scheduler_config(self, scheduler_config: Dict):
-        self.noise_scheduler.update_config(scheduler_config)
-
     def prepare_latents(
         self,
         latents: torch.Tensor,
diffsynth_engine/pipelines/wan_dmd.py
ADDED
@@ -0,0 +1,111 @@
+import torch
+import torch.distributed as dist
+from typing import Callable, List, Optional
+from tqdm import tqdm
+from PIL import Image
+
+from diffsynth_engine.pipelines.wan_video import WanVideoPipeline
+
+
+class WanDMDPipeline(WanVideoPipeline):
+    def prepare_latents(
+        self,
+        latents,
+        denoising_step_list,
+    ):
+        height, width = latents.shape[-2:]
+        height, width = height * self.upsampling_factor, width * self.upsampling_factor
+        sigmas, timesteps = self.noise_scheduler.schedule(num_inference_steps=1000)
+        sigmas = sigmas[[1000 - t for t in denoising_step_list] + [-1]]
+        timesteps = timesteps[[1000 - t for t in denoising_step_list]]
+        init_latents = latents.clone()
+
+        return init_latents, latents, sigmas, timesteps
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt,
+        input_image: Image.Image | None = None,
+        seed=None,
+        height=480,
+        width=832,
+        num_frames=81,
+        denoising_step_list: List[int] = None,
+        progress_callback: Optional[Callable] = None,  # def progress_callback(current, total, status)
+    ):
+        denoising_step_list = [1000, 750, 500, 250] if denoising_step_list is None else denoising_step_list
+        divisor = 32 if self.vae.z_dim == 48 else 16  # 32 for wan2.2 vae, 16 for wan2.1 vae
+        assert height % divisor == 0 and width % divisor == 0, f"height and width must be divisible by {divisor}"
+        assert (num_frames - 1) % 4 == 0, "num_frames must be 4X+1"
+
+        # Initialize noise
+        if dist.is_initialized() and seed is None:
+            raise ValueError("must provide a seed when parallelism is enabled")
+        noise = self.generate_noise(
+            (
+                1,
+                self.vae.z_dim,
+                (num_frames - 1) // 4 + 1,
+                height // self.upsampling_factor,
+                width // self.upsampling_factor,
+            ),
+            seed=seed,
+            device="cpu",
+            dtype=torch.float32,
+        ).to(self.device)
+        init_latents, latents, sigmas, timesteps = self.prepare_latents(noise, denoising_step_list)
+        mask = torch.ones((1, 1, *latents.shape[2:]), dtype=latents.dtype, device=latents.device)
+
+        # Encode prompts
+        self.load_models_to_device(["text_encoder"])
+        prompt_emb_posi = self.encode_prompt(prompt)
+        prompt_emb_nega = None
+
+        # Encode image
+        image_clip_feature = self.encode_clip_feature(input_image, height, width)
+        image_y = self.encode_vae_feature(input_image, num_frames, height, width)
+        image_latents = self.encode_image_latents(input_image, height, width)
+        if image_latents is not None:
+            latents[:, :, : image_latents.shape[2], :, :] = image_latents
+            init_latents = latents.clone()
+            mask[:, :, : image_latents.shape[2], :, :] = 0
+
+        # Initialize sampler
+        self.sampler.initialize(sigmas=sigmas)
+
+        # Denoise
+        hide_progress = dist.is_initialized() and dist.get_rank() != 0
+        for i, timestep in enumerate(tqdm(timesteps, disable=hide_progress)):
+            if timestep.item() / 1000 >= self.config.boundary:
+                self.load_models_to_device(["dit"])
+                model = self.dit
+            else:
+                self.load_models_to_device(["dit2"])
+                model = self.dit2
+
+            timestep = timestep * mask[:, :, :, ::2, ::2].flatten()  # seq_len
+            timestep = timestep.to(dtype=self.dtype, device=self.device)
+            # Classifier-free guidance
+            noise_pred = self.predict_noise_with_cfg(
+                model=model,
+                latents=latents,
+                timestep=timestep,
+                positive_prompt_emb=prompt_emb_posi,
+                negative_prompt_emb=prompt_emb_nega,
+                image_clip_feature=image_clip_feature,
+                image_y=image_y,
+                cfg_scale=1.0,
+                batch_cfg=self.config.batch_cfg,
+            )
+            # Scheduler
+            latents = self.sampler.step(latents, noise_pred, i)
+            latents = latents * mask + init_latents * (1 - mask)
+            if progress_callback is not None:
+                progress_callback(i + 1, len(timesteps), "DENOISING")
+
+        # Decode
+        self.load_models_to_device(["vae"])
+        frames = self.decode_video(latents, progress_callback=progress_callback)
+        frames = self.vae_output_to_image(frames)
+        return frames
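The pipeline samples a full 1000-step schedule and keeps only the entries named by denoising_step_list. A worked sketch of that indexing with the default step list, following directly from prepare_latents above:

# Default 4-step DMD schedule and the indices it selects.
denoising_step_list = [1000, 750, 500, 250]
sigma_indices = [1000 - t for t in denoising_step_list] + [-1]
timestep_indices = [1000 - t for t in denoising_step_list]
print(sigma_indices)     # [0, 250, 500, 750, -1]  (the extra -1 selects the terminal sigma)
print(timestep_indices)  # [0, 250, 500, 750]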
diffsynth_engine/pipelines/wan_video.py
CHANGED
@@ -43,6 +43,24 @@ class WanLoRAConverter(LoRAStateDictConverter):
             dit_dict[key] = lora_args
         return {"dit": dit_dict}

+    def _from_diffusers(self, state_dict):
+        dit_dict = {}
+        for key, param in state_dict.items():
+            if ".lora_down.weight" not in key:
+                continue
+
+            lora_args = {}
+            lora_args["up"] = state_dict[key.replace(".lora_down.weight", ".lora_up.weight")]
+            lora_args["down"] = param
+            lora_args["rank"] = lora_args["up"].shape[1]
+            if key.replace(".lora_down.weight", ".alpha") in state_dict:
+                lora_args["alpha"] = state_dict[key.replace(".lora_down.weight", ".alpha")]
+            else:
+                lora_args["alpha"] = lora_args["rank"]
+            key = key.replace("diffusion_model.", "").replace(".lora_down.weight", "")
+            dit_dict[key] = lora_args
+        return {"dit": dit_dict}
+
     def _from_civitai(self, state_dict):
         dit_dict = {}
         for key, param in state_dict.items():
@@ -86,6 +104,9 @@ class WanLoRAConverter(LoRAStateDictConverter):
         if "lora_unet_blocks_0_cross_attn_k.lora_down.weight" in state_dict:
            state_dict = self._from_fun(state_dict)
            logger.info("use fun format state dict")
+        elif "diffusion_model.blocks.0.cross_attn.k.lora_down.weight" in state_dict:
+            state_dict = self._from_diffusers(state_dict)
+            logger.info("use diffusers format state dict")
         elif "diffusion_model.blocks.0.cross_attn.k.lora_A.weight" in state_dict:
            state_dict = self._from_civitai(state_dict)
            logger.info("use civitai format state dict")
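A hedged sketch of what the new _from_diffusers branch consumes: the key suffixes, the diffusion_model. prefix stripping, and the alpha-falls-back-to-rank behavior come from the code above, while the tensor widths are illustrative assumptions:

import torch

# Diffusers-style Wan LoRA keys; rank 16 and width 5120 are assumed for illustration.
state_dict = {
    "diffusion_model.blocks.0.cross_attn.k.lora_down.weight": torch.zeros(16, 5120),
    "diffusion_model.blocks.0.cross_attn.k.lora_up.weight": torch.zeros(5120, 16),
}
# _from_diffusers maps this to:
# {"dit": {"blocks.0.cross_attn.k": {"down": <16x5120>, "up": <5120x16>,
#                                    "rank": 16, "alpha": 16}}}  # alpha falls back to rank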
@@ -480,8 +501,8 @@ class WanVideoPipeline(BasePipeline):

         dit_state_dict, dit2_state_dict = None, None
         if isinstance(config.model_path, list):
-            high_noise_model_ckpt = [path for path in config.model_path if "
-            low_noise_model_ckpt = [path for path in config.model_path if "
+            high_noise_model_ckpt = [path for path in config.model_path if "high_noise" in path]
+            low_noise_model_ckpt = [path for path in config.model_path if "low_noise" in path]
             if high_noise_model_ckpt and low_noise_model_ckpt:
                 logger.info(f"loading high noise model state dict from {high_noise_model_ckpt} ...")
                 dit_state_dict = cls.load_model_checkpoint(
@@ -681,8 +702,9 @@ class WanVideoPipeline(BasePipeline):
             config.attn_params = VideoSparseAttentionParams(sparsity=0.9)

     def update_weights(self, state_dicts: WanStateDicts) -> None:
-        is_dual_model_state_dict =
-
+        is_dual_model_state_dict = isinstance(state_dicts.model, dict) and (
+            "high_noise_model" in state_dicts.model or "low_noise_model" in state_dicts.model
+        )
         is_dual_model_pipeline = self.dit2 is not None

         if is_dual_model_state_dict != is_dual_model_pipeline:
@@ -694,15 +716,21 @@ class WanVideoPipeline(BasePipeline):

         if is_dual_model_state_dict:
             if "high_noise_model" in state_dicts.model:
-                self.update_component(
+                self.update_component(
+                    self.dit, state_dicts.model["high_noise_model"], self.config.device, self.config.model_dtype
+                )
             if "low_noise_model" in state_dicts.model:
-                self.update_component(
+                self.update_component(
+                    self.dit2, state_dicts.model["low_noise_model"], self.config.device, self.config.model_dtype
+                )
         else:
             self.update_component(self.dit, state_dicts.model, self.config.device, self.config.model_dtype)

         self.update_component(self.text_encoder, state_dicts.t5, self.config.device, self.config.t5_dtype)
         self.update_component(self.vae, state_dicts.vae, self.config.device, self.config.vae_dtype)
-        self.update_component(
+        self.update_component(
+            self.image_encoder, state_dicts.image_encoder, self.config.device, self.config.image_encoder_dtype
+        )

     def compile(self):
         self.dit.compile_repeated_blocks()
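update_weights now infers the dual-model case by inspecting the incoming state dict. A sketch of the structure the dual-model branch expects (tensor contents elided):

# Dual-model checkpoint: a dict keyed by expert name, per the membership checks above.
# A single-model checkpoint is passed as a flat state dict instead.
dual_state_dict = {
    "high_noise_model": {},  # parameters loaded into self.dit
    "low_noise_model": {},   # parameters loaded into self.dit2
}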
diffsynth_engine/pipelines/z_image.py
CHANGED
@@ -40,20 +40,54 @@ class ZImageLoRAConverter(LoRAStateDictConverter):
         for key, param in lora_state_dict.items():
             if "lora_A.weight" in key:
                 lora_b_key = key.replace("lora_A.weight", "lora_B.weight")
-                target_key = key.replace(".lora_A.weight", "").replace("
+                target_key = key.replace(".lora_A.weight", "").replace("diffusion_model.", "")

-                if "
-                    target_key = target_key.replace("
+                if "attention.to_out.0" in target_key:
+                    target_key = target_key.replace("attention.to_out.0", "attention.to_out")
+                if "adaLN_modulation.0" in target_key:
+                    target_key = target_key.replace("adaLN_modulation.0", "adaLN_modulation")
+
+                up = lora_state_dict[lora_b_key]
+                rank = up.shape[1]

                 dit_dict[target_key] = {
                     "down": param,
-                    "up":
-                    "
+                    "up": up,
+                    "rank": rank,
+                    "alpha": lora_state_dict.get(key.replace("lora_A.weight", "alpha"), rank),
                 }
+

         return {"dit": dit_dict}

+    def _from_diffsynth(self, lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, torch.Tensor]]:
+        dit_dict = {}
+        for key, param in lora_state_dict.items():
+            if "lora_A.default.weight" in key:
+                lora_b_key = key.replace("lora_A.default.weight", "lora_B.default.weight")
+                target_key = key.replace(".lora_A.default.weight", "")
+
+                if "attention.to_out.0" in target_key:
+                    target_key = target_key.replace("attention.to_out.0", "attention.to_out")
+
+                up = lora_state_dict[lora_b_key]
+                rank = up.shape[1]
+
+                dit_dict[target_key] = {
+                    "down": param,
+                    "up": up,
+                    "rank": rank,
+                    "alpha": lora_state_dict.get(key.replace("lora_A.default.weight", "alpha"), rank),
+                }
+
+        return {"dit": dit_dict}
+
+
     def convert(self, lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, torch.Tensor]]:
-
+        key = list(lora_state_dict.keys())[0]
+        if key.startswith("diffusion_model."):
+            return self._from_diffusers(lora_state_dict)
+        else:
+            return self._from_diffsynth(lora_state_dict)


 class ZImagePipeline(BasePipeline):
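convert dispatches on the first key's prefix; a sketch of the two key styles it distinguishes. The layers.0.attention.to_q module path is an illustrative assumption, while the diffusion_model. prefix and the .default. infix come from the code above:

# Routed to _from_diffusers ("diffusion_model." prefix):
diffusers_key = "diffusion_model.layers.0.attention.to_q.lora_A.weight"
# Routed to _from_diffsynth (PEFT-style ".default." infix, no prefix):
diffsynth_key = "layers.0.attention.to_q.lora_A.default.weight"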
@@ -180,7 +214,7 @@ class ZImagePipeline(BasePipeline):
     def update_weights(self, state_dicts: ZImageStateDicts) -> None:
         self.update_component(self.dit, state_dicts.model, self.config.device, self.config.model_dtype)
         self.update_component(
-            self.text_encoder, state_dicts.
+            self.text_encoder, state_dicts.encoder, self.config.device, self.config.encoder_dtype
         )
         self.update_component(self.vae_decoder, state_dicts.vae, self.config.device, self.config.vae_dtype)

@@ -276,8 +310,8 @@ class ZImagePipeline(BasePipeline):
             comb_pred = self.predict_noise(latents, t, prompt_emb)[0]
         else:
             if not batch_cfg:
-                positive_noise_pred = self.predict_noise(latents, t, prompt_emb)
-                negative_noise_pred = self.predict_noise(latents, t, negative_prompt_emb)
+                positive_noise_pred = self.predict_noise(latents, t, prompt_emb)[0]
+                negative_noise_pred = self.predict_noise(latents, t, negative_prompt_emb)[0]
             else:
                 latents_input = torch.cat([latents, latents], dim=0)
                 t = torch.cat([t, t], dim=0)
@@ -360,6 +394,7 @@ class ZImagePipeline(BasePipeline):
             prompt_emb=prompt_embeds,
             negative_prompt_emb=negative_prompt_embeds,
             batch_cfg=self.config.batch_cfg,
+            cfg_scale=cfg_scale,
             cfg_truncation=cfg_truncation,
             cfg_normalization=cfg_normalization,
         )
{diffsynth_engine-0.6.1.dev37.dist-info → diffsynth_engine-0.6.1.dev39.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-diffsynth_engine/__init__.py,sha256=
+diffsynth_engine/__init__.py,sha256=um2Vh4BgmAAG66LafdcTXPiJ6dFtBU85xwPSKZOswFE,2432
 diffsynth_engine/algorithm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/algorithm/noise_scheduler/__init__.py,sha256=YvcwE2tCNua-OAX9GEPm0EXsINNWH4XvJMNZb-uaZMM,745
 diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py,sha256=3ve4bYxGyfuERynvoNYdFYSk0agdBgXKCeIOS6O6wgI,819
@@ -150,17 +150,18 @@ diffsynth_engine/models/wan/wan_vae.py,sha256=dC7MoUFeXRL7SIY0LG1OOUiZW-pp9IbXCg
 diffsynth_engine/models/z_image/__init__.py,sha256=d1ztBNgM8GR2_uGwlxOE1Jf5URTq1g-WnmJH7nrMoaY,160
 diffsynth_engine/models/z_image/qwen3.py,sha256=PmT6m46Fc7KZXNzG7ig23Mzj6QfHnMmrpX_MM0UuuYg,4580
 diffsynth_engine/models/z_image/z_image_dit.py,sha256=kGtYzmfzk_FDe7KWfXpJagN7k7ROXl5J01IhRRs-Bsk,23806
-diffsynth_engine/pipelines/__init__.py,sha256=
-diffsynth_engine/pipelines/base.py,sha256=
+diffsynth_engine/pipelines/__init__.py,sha256=44odpJm_Jnkzdbl1GDq9XVu4LN0_SICsK5ubjYKWeg4,720
+diffsynth_engine/pipelines/base.py,sha256=h6xOqT1LMFGrJYoTD68_VoHcfRX04je8KUE_y3BUZfM,17279
 diffsynth_engine/pipelines/flux_image.py,sha256=L0ggxpthLD8a5-zdPHu9z668uWBei9YzPb4PFVypDNU,50707
 diffsynth_engine/pipelines/hunyuan3d_shape.py,sha256=TNV0Wr09Dj2bzzlpua9WioCClOj3YiLfE6utI9aWL8A,8164
-diffsynth_engine/pipelines/qwen_image.py,sha256=
+diffsynth_engine/pipelines/qwen_image.py,sha256=Xc3H5LiQj2MUdi2KgFD2G2VqDwUa2ehqj4H35sr8iro,35627
 diffsynth_engine/pipelines/sd_image.py,sha256=nr-Nhsnomq8CsUqhTM3i2l2zG01YjwXdfRXgr_bC3F0,17891
 diffsynth_engine/pipelines/sdxl_image.py,sha256=v7ZACGPb6EcBunL6e5E9jynSQjE7GQx8etEV-ZLP91g,21704
 diffsynth_engine/pipelines/utils.py,sha256=HZbJHErNJS1DhlwJKvZ9dY7Kh8Zdlsw3zE2e88TYGRY,2277
+diffsynth_engine/pipelines/wan_dmd.py,sha256=T_i4xp_tASFSaKZxg50FEAk5TOn89JSNv-4y5Os6Q6E,4508
 diffsynth_engine/pipelines/wan_s2v.py,sha256=QHlCLMqlmnp55iYm2mzg4qCq4jceRAP3Zt5Mubz3mAM,29384
-diffsynth_engine/pipelines/wan_video.py,sha256=
-diffsynth_engine/pipelines/z_image.py,sha256=
+diffsynth_engine/pipelines/wan_video.py,sha256=9nUV6h2zBbGu3gvVSM_oqdoruCdBWoa7t6vrJYJt8QY,32391
+diffsynth_engine/pipelines/z_image.py,sha256=VvqjxsKRsmP2tfWg9nDlcQu5oEzIRFa2wtuArzjQAlk,16151
 diffsynth_engine/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/processor/canny_processor.py,sha256=hV30NlblTkEFUAmF_O-LJrNlGVM2SFrqq6okfF8VpOo,602
 diffsynth_engine/processor/depth_processor.py,sha256=dQvs3JsnyMbz4dyI9QoR8oO-mMFBFAgNvgqeCoaU5jk,1532
@@ -199,8 +200,8 @@ diffsynth_engine/utils/video.py,sha256=8FCaeqIdUsWMgWI_6SO9SPynsToGcLCQAVYFTc4CD
 diffsynth_engine/utils/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/utils/memory/linear_regression.py,sha256=oW_EQEw13oPoyUrxiL8A7Ksa5AuJ2ynI2qhCbfAuZbg,3930
 diffsynth_engine/utils/memory/memory_predcit_model.py,sha256=EXprSl_zlVjgfMWNXP-iw83Ot3hyMcgYaRPv-dvyL84,3943
-diffsynth_engine-0.6.1.
-diffsynth_engine-0.6.1.
-diffsynth_engine-0.6.1.
-diffsynth_engine-0.6.1.
-diffsynth_engine-0.6.1.
+diffsynth_engine-0.6.1.dev39.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
+diffsynth_engine-0.6.1.dev39.dist-info/METADATA,sha256=f_qU_vp4RcHSOgW3Agm428engf8v7TKRCt8DuxAOEi8,1164
+diffsynth_engine-0.6.1.dev39.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+diffsynth_engine-0.6.1.dev39.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
+diffsynth_engine-0.6.1.dev39.dist-info/RECORD,,
{diffsynth_engine-0.6.1.dev37.dist-info → diffsynth_engine-0.6.1.dev39.dist-info}/WHEEL
RENAMED
File without changes
{diffsynth_engine-0.6.1.dev37.dist-info → diffsynth_engine-0.6.1.dev39.dist-info}/licenses/LICENSE
RENAMED
File without changes
{diffsynth_engine-0.6.1.dev37.dist-info → diffsynth_engine-0.6.1.dev39.dist-info}/top_level.txt
RENAMED
File without changes