PyPI - diffsynth-engine - Versions diffs - 0.3.6.dev13__py3-none-any.whl → 0.3.6.dev14__py3-none-any.whl - Mend

diffsynth-engine 0.3.6.dev13__py3-none-any.whl → 0.3.6.dev14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

diffsynth_engine/pipelines/wan_video.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import torch
 import torch.distributed as dist
-import numpy as np
-from einops import rearrange
 from typing import Callable, List, Tuple, Optional
 from tqdm import tqdm
 from PIL import Image
@@ -97,14 +95,6 @@ class WanLoRAConverter(LoRAStateDictConverter):
         return state_dict
-SHIFT_FACTORS = {
-    "1.3b-t2v": 5.0,
-    "14b-t2v": 5.0,
-    "14b-i2v": 5.0,
-    "14b-flf2v": 16.0,
-}
 class WanVideoPipeline(BasePipeline):
     lora_converter = WanLoRAConverter()
@@ -114,6 +104,7 @@ class WanVideoPipeline(BasePipeline):
         tokenizer: WanT5Tokenizer,
         text_encoder: WanTextEncoder,
         dit: WanDiT,
+        dit2: WanDiT | None,
         vae: WanVideoVAE,
         image_encoder: WanImageEncoder,
     ):
@@ -125,6 +116,7 @@ class WanVideoPipeline(BasePipeline):
             dtype=config.model_dtype,
         )
         self.config = config
+        self.upsampling_factor = vae.upsampling_factor
         # sampler
         self.noise_scheduler = RecifitedFlowScheduler(
             shift=config.shift if config.shift is not None else 5.0,
@@ -135,10 +127,11 @@ class WanVideoPipeline(BasePipeline):
         # models
         self.tokenizer = tokenizer
         self.text_encoder = text_encoder
-        self.dit = dit
+        self.dit = dit  # high noise model
+        self.dit2 = dit2  # low noise model
         self.vae = vae
         self.image_encoder = image_encoder
-        self.model_names = ["text_encoder", "dit", "vae", "image_encoder"]
+        self.model_names = ["text_encoder", "dit", "dit2", "vae", "image_encoder"]
     def load_loras(self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False):
         assert self.config.tp_degree is None or self.config.tp_degree == 1, (
@@ -166,40 +159,62 @@ class WanVideoPipeline(BasePipeline):
         prompt_emb = prompt_emb.masked_fill(mask.unsqueeze(-1).expand_as(prompt_emb) == 0, 0)
         return prompt_emb
-    def encode_image(self, images: Image.Image | List[Image.Image], num_frames, height, width):
+    def encode_clip_feature(self, images: Image.Image | List[Image.Image], height, width):
+        if not images or not self.dit.has_clip_feature:
+            return None
+        self.load_models_to_device(["image_encoder"])
         if isinstance(images, Image.Image):
             images = [images]
-        images = [
-            self.preprocess_image(image.resize((width, height), Image.Resampling.LANCZOS)).to(
-                device=self.device, dtype=self.config.image_encoder_dtype
-            )
-            for image in images
-        ]
+        images = [self.preprocess_image(img.resize((width, height), Image.Resampling.LANCZOS)) for img in images]
+        images = [img.to(device=self.device, dtype=self.config.image_encoder_dtype) for img in images]
         clip_context = self.image_encoder.encode_image(images).to(self.dtype)
+        return clip_context
+    def encode_vae_feature(self, images: Image.Image | List[Image.Image], num_frames, height, width):
+        if not images or not self.dit.has_vae_feature:
+            return None
+        self.load_models_to_device(["vae"])
+        if isinstance(images, Image.Image):
+            images = [images]
+        images = [self.preprocess_image(img.resize((width, height), Image.Resampling.LANCZOS)) for img in images]
         indices = torch.linspace(0, num_frames - 1, len(images), dtype=torch.long)
-        msk = torch.zeros(1, num_frames, height // 8, width // 8, device=self.device, dtype=self.config.vae_dtype)
+        msk = torch.zeros(
+            1,
+            num_frames,
+            height // self.upsampling_factor,
+            width // self.upsampling_factor,
+            device=self.device,
+            dtype=self.config.vae_dtype,
+        )
         msk[:, indices] = 1
         msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
-        msk = msk.view(1, msk.shape[1] // 4, 4, height // 8, width // 8)
+        msk = msk.view(1, msk.shape[1] // 4, 4, height // self.upsampling_factor, width // self.upsampling_factor)
         msk = msk.transpose(1, 2).squeeze(0)
         video = torch.zeros(3, num_frames, height, width).to(device=self.device, dtype=self.config.vae_dtype)
-        video[:, indices] = torch.concat([image.transpose(0, 1) for image in images], dim=1).to(
-            dtype=self.config.vae_dtype
+        video[:, indices] = torch.concat([img.transpose(0, 1) for img in images], dim=1).to(
+            device=self.device, dtype=self.config.vae_dtype
         )
         y = self.vae.encode([video], device=self.device)[0]
         y = torch.concat([msk, y]).to(dtype=self.dtype)
-        return clip_context, y.unsqueeze(0)
+        return y.unsqueeze(0)
-    def tensor2video(self, frames):
-        frames = rearrange(frames, "C T H W -> T H W C")
-        frames = ((frames.float() + 1) * 127.5).clip(0, 255).cpu().numpy().astype(np.uint8)
-        frames = [Image.fromarray(frame) for frame in frames]
-        return frames
+    def encode_image_latents(self, images: Image.Image | List[Image.Image], height, width):
+        if not images or not self.dit.fuse_image_latents:
+            return
-    def encode_video(self, videos: torch.Tensor):
-        videos = videos.to(dtype=self.config.vae_dtype, device=self.device)
+        self.load_models_to_device(["vae"])
+        if isinstance(images, Image.Image):
+            images = [images]
+        frames = [self.preprocess_image(img.resize((width, height), Image.Resampling.LANCZOS)) for img in images]
+        video = torch.stack(frames, dim=2).squeeze(0)
+        latents = self.encode_video([video]).to(dtype=self.dtype, device=self.device)
+        return latents
+    def encode_video(self, videos: List[torch.Tensor]) -> torch.Tensor:
+        videos = [video.to(dtype=self.config.vae_dtype, device=self.device) for video in videos]
         latents = self.vae.encode(
             videos,
             device=self.device,
@@ -210,7 +225,7 @@ class WanVideoPipeline(BasePipeline):
         latents = latents.to(dtype=self.config.model_dtype, device=self.device)
         return latents
-    def decode_video(self, latents, progress_callback=None) -> List[torch.Tensor]:
+    def decode_video(self, latents: torch.Tensor, progress_callback=None) -> List[torch.Tensor]:
         latents = latents.to(dtype=self.config.vae_dtype, device=self.device)
         videos = self.vae.decode(
             latents,
@@ -225,6 +240,7 @@ class WanVideoPipeline(BasePipeline):
     def predict_noise_with_cfg(
         self,
+        model: WanDiT,
         latents: torch.Tensor,
         image_clip_feature: torch.Tensor,
         image_y: torch.Tensor,
@@ -236,6 +252,7 @@ class WanVideoPipeline(BasePipeline):
     ):
         if cfg_scale <= 1.0:
             return self.predict_noise(
+                model=model,
                 latents=latents,
                 image_clip_feature=image_clip_feature,
                 image_y=image_y,
@@ -245,6 +262,7 @@ class WanVideoPipeline(BasePipeline):
         if not batch_cfg:
             # cfg by predict noise one by one
             positive_noise_pred = self.predict_noise(
+                model=model,
                 latents=latents,
                 image_clip_feature=image_clip_feature,
                 image_y=image_y,
@@ -252,6 +270,7 @@ class WanVideoPipeline(BasePipeline):
                 context=positive_prompt_emb,
             )
             negative_noise_pred = self.predict_noise(
+                model=model,
                 latents=latents,
                 image_clip_feature=image_clip_feature,
                 image_y=image_y,
@@ -270,6 +289,7 @@ class WanVideoPipeline(BasePipeline):
             if image_clip_feature is not None:
                 image_clip_feature = torch.cat([image_clip_feature, image_clip_feature], dim=0)
             positive_noise_pred, negative_noise_pred = self.predict_noise(
+                model=model,
                 latents=latents,
                 image_clip_feature=image_clip_feature,
                 image_y=image_y,
@@ -279,10 +299,10 @@ class WanVideoPipeline(BasePipeline):
             noise_pred = negative_noise_pred + cfg_scale * (positive_noise_pred - negative_noise_pred)
             return noise_pred
-    def predict_noise(self, latents, image_clip_feature, image_y, timestep, context):
+    def predict_noise(self, model, latents, image_clip_feature, image_y, timestep, context):
         latents = latents.to(dtype=self.config.model_dtype, device=self.device)
-        noise_pred = self.dit(
+        noise_pred = model(
             x=latents,
             timestep=timestep,
             context=context,
@@ -298,17 +318,23 @@ class WanVideoPipeline(BasePipeline):
         denoising_strength,
         num_inference_steps,
     ):
-        if input_video is not None:
+        height, width = latents.shape[-2:]
+        height, width = height * self.upsampling_factor, width * self.upsampling_factor
+        if input_video is not None:  # video to video
             total_steps = num_inference_steps
             sigmas, timesteps = self.noise_scheduler.schedule(total_steps)
             t_start = max(total_steps - int(num_inference_steps * denoising_strength), 1)
             sigma_start, sigmas = sigmas[t_start - 1], sigmas[t_start - 1 :]
             timesteps = timesteps[t_start - 1 :]
+            self.load_models_to_device(["vae"])
             noise = latents
-            input_video = self.preprocess_images(input_video)
-            input_video = torch.stack(input_video, dim=2)
-            latents = self.encode_video(input_video).to(dtype=latents.dtype, device=latents.device)
+            frames = [
+                self.preprocess_image(frame.resize((width, height), Image.Resampling.LANCZOS)) for frame in input_video
+            ]
+            video = torch.stack(frames, dim=2).squeeze(0)
+            video = video.to(dtype=self.config.vae_dtype, device=self.device)
+            latents = self.encode_video([video]).to(dtype=latents.dtype, device=latents.device)
             init_latents = latents.clone()
             latents = self.sampler.add_noise(latents, noise, sigma_start)
         else:
@@ -329,18 +355,29 @@ class WanVideoPipeline(BasePipeline):
         height=480,
         width=832,
         num_frames=81,
-        cfg_scale=5.0,
-        num_inference_steps=50,
+        cfg_scale=None,
+        num_inference_steps=None,
         progress_callback: Optional[Callable] = None,  # def progress_callback(current, total, status)
     ):
         assert height % 16 == 0 and width % 16 == 0, "height and width must be divisible by 16"
         assert (num_frames - 1) % 4 == 0, "num_frames must be 4X+1"
+        cfg_scale = self.config.cfg_scale if cfg_scale is None else cfg_scale
+        num_inference_steps = self.config.num_inference_steps if num_inference_steps is None else num_inference_steps
         # Initialize noise
         if dist.is_initialized() and seed is None:
             raise ValueError("must provide a seed when parallelism is enabled")
         noise = self.generate_noise(
-            (1, 16, (num_frames - 1) // 4 + 1, height // 8, width // 8), seed=seed, device="cpu", dtype=torch.float32
+            (
+                1,
+                self.vae.z_dim,
+                (num_frames - 1) // 4 + 1,
+                height // self.upsampling_factor,
+                width // self.upsampling_factor,
+            ),
+            seed=seed,
+            device="cpu",
+            dtype=torch.float32,
         ).to(self.device)
         init_latents, latents, sigmas, timesteps = self.prepare_latents(
             noise,
@@ -348,33 +385,49 @@ class WanVideoPipeline(BasePipeline):
             denoising_strength,
             num_inference_steps,
         )
-        self.sampler.initialize(init_latents=init_latents, timesteps=timesteps, sigmas=sigmas)
+        mask = torch.ones((1, 1, *latents.shape[2:]), dtype=latents.dtype, device=latents.device)
         # Encode prompts
         self.load_models_to_device(["text_encoder"])
         prompt_emb_posi = self.encode_prompt(prompt)
-        prompt_emb_nega = None if cfg_scale <= 1.0 else self.encode_prompt(negative_prompt)
+        prompt_emb_nega = self.encode_prompt(negative_prompt)
         # Encode image
-        if input_image is not None and self.image_encoder is not None:
-            self.load_models_to_device(["image_encoder", "vae"])
-            image_clip_feature, image_y = self.encode_image(input_image, num_frames, height, width)
-        else:
-            image_clip_feature, image_y = None, None
+        image_clip_feature = self.encode_clip_feature(input_image, height, width)
+        image_y = self.encode_vae_feature(input_image, num_frames, height, width)
+        image_latents = self.encode_image_latents(input_image, height, width)
+        if image_latents is not None:
+            latents[:, :, : image_latents.shape[2], :, :] = image_latents
+            init_latents = latents.clone()
+            mask[:, :, : image_latents.shape[2], :, :] = 0
+        # Initialize sampler
+        self.sampler.initialize(init_latents=init_latents, timesteps=timesteps, sigmas=sigmas, mask=mask)
         # Denoise
-        self.load_models_to_device(["dit"])
         hide_progress = dist.is_initialized() and dist.get_rank() != 0
         for i, timestep in enumerate(tqdm(timesteps, disable=hide_progress)):
-            timestep = timestep.unsqueeze(0).to(dtype=self.config.model_dtype, device=self.device)
+            if timestep.item() / 1000 >= self.config.boundary:
+                self.load_models_to_device(["dit"])
+                model = self.dit
+                cfg_scale_ = cfg_scale if isinstance(cfg_scale, float) else cfg_scale[1]
+            else:
+                self.load_models_to_device(["dit2"])
+                model = self.dit2
+                cfg_scale_ = cfg_scale if isinstance(cfg_scale, float) else cfg_scale[0]
+            timestep = timestep * mask[:, :, :, ::2, ::2].flatten()  # seq_len
+            timestep = timestep.to(dtype=self.config.model_dtype, device=self.device)
             # Classifier-free guidance
             noise_pred = self.predict_noise_with_cfg(
+                model=model,
                 latents=latents,
                 timestep=timestep,
                 positive_prompt_emb=prompt_emb_posi,
                 negative_prompt_emb=prompt_emb_nega,
                 image_clip_feature=image_clip_feature,
                 image_y=image_y,
-                cfg_scale=cfg_scale,
+                cfg_scale=cfg_scale_,
                 batch_cfg=self.config.batch_cfg,
             )
             # Scheduler
@@ -385,7 +438,7 @@ class WanVideoPipeline(BasePipeline):
         # Decode
         self.load_models_to_device(["vae"])
         frames = self.decode_video(latents, progress_callback=progress_callback)
-        frames = self.tensor2video(frames[0])
+        frames = self.vae_output_to_image(frames)
         return frames
     @classmethod
@@ -395,24 +448,73 @@ class WanVideoPipeline(BasePipeline):
         else:
             config = model_path_or_config
+        dit_state_dict, dit2_state_dict = None, None
+        if isinstance(config.model_path, list):
+            high_noise_model_ckpt = [path for path in config.model_path if "high_noise_model" in path]
+            low_noise_model_ckpt = [path for path in config.model_path if "low_noise_model" in path]
+            if high_noise_model_ckpt and low_noise_model_ckpt:
+                logger.info(f"loading high noise model state dict from {high_noise_model_ckpt} ...")
+                dit_state_dict = cls.load_model_checkpoint(
+                    high_noise_model_ckpt, device="cpu", dtype=config.model_dtype
+                )
+                logger.info(f"loading low noise model state dict from {low_noise_model_ckpt} ...")
+                dit2_state_dict = cls.load_model_checkpoint(
+                    low_noise_model_ckpt, device="cpu", dtype=config.model_dtype
+                )
+        if dit_state_dict is None:
+            logger.info(f"loading dit state dict from {config.model_path} ...")
+            dit_state_dict = cls.load_model_checkpoint(config.model_path, device="cpu", dtype=config.model_dtype)
+        # determine wan dit type by model params
+        dit_type = None
+        if dit2_state_dict is not None and dit2_state_dict["patch_embedding.weight"].shape[1] == 36:
+            dit_type = "wan2.2-i2v-a14b"
+        elif dit2_state_dict is not None and dit2_state_dict["patch_embedding.weight"].shape[1] == 16:
+            dit_type = "wan2.2-t2v-a14b"
+        elif dit_state_dict["patch_embedding.weight"].shape[1] == 48:
+            dit_type = "wan2.2-ti2v-5b"
+        elif "img_emb.emb_pos" in dit_state_dict:
+            dit_type = "wan2.1-flf2v-14b"
+        elif "img_emb.proj.0.weight" in dit_state_dict:
+            dit_type = "wan2.1-i2v-14b"
+        elif "blocks.39.self_attn.norm_q.weight" in dit_state_dict:
+            dit_type = "wan2.1-t2v-14b"
+        else:
+            dit_type = "wan2.1-t2v-1.3b"
         if config.t5_path is None:
             config.t5_path = fetch_model("muse/wan2.1-umt5", path="umt5.safetensors")
         if config.vae_path is None:
-            config.vae_path = fetch_model("muse/wan2.1-vae", path="vae.safetensors")
-        logger.info(f"loading state dict from {config.model_path} ...")
-        dit_state_dict = cls.load_model_checkpoint(config.model_path, device="cpu", dtype=config.model_dtype)
+            config.vae_path = (
+                fetch_model("muse/wan2.2-vae", path="vae.safetensors")
+                if dit_type == "wan2.2-ti2v-5b"
+                else fetch_model("muse/wan2.1-vae", path="vae.safetensors")
+            )
-        logger.info(f"loading state dict from {config.t5_path} ...")
+        logger.info(f"loading t5 state dict from {config.t5_path} ...")
         t5_state_dict = cls.load_model_checkpoint(config.t5_path, device="cpu", dtype=config.t5_dtype)
-        logger.info(f"loading state dict from {config.vae_path} ...")
+        logger.info(f"loading vae state dict from {config.vae_path} ...")
         vae_state_dict = cls.load_model_checkpoint(config.vae_path, device="cpu", dtype=config.vae_dtype)
+        # determine wan vae type by model params
+        vae_type = "wan2.1-vae"
+        if vae_state_dict["encoder.conv1.weight"].shape[1] == 12:  # in_channels
+            vae_type = "wan2.2-vae"
+        # default params from model config
+        vae_config: dict = WanVideoVAE.get_model_config(vae_type)
+        model_config: dict = WanDiT.get_model_config(dit_type)
+        config.boundary = model_config.pop("boundary", -1.0)
+        config.shift = model_config.pop("shift", 5.0)
+        config.cfg_scale = model_config.pop("cfg_scale", 5.0)
+        config.num_inference_steps = model_config.pop("num_inference_steps", 50)
+        config.fps = model_config.pop("fps", 16)
         init_device = "cpu" if config.parallelism > 1 or config.offload_mode is not None else config.device
         tokenizer = WanT5Tokenizer(WAN_TOKENIZER_CONF_PATH, seq_len=512, clean="whitespace")
         text_encoder = WanTextEncoder.from_state_dict(t5_state_dict, device=init_device, dtype=config.t5_dtype)
-        vae = WanVideoVAE.from_state_dict(vae_state_dict, device=init_device, dtype=config.vae_dtype)
+        vae = WanVideoVAE.from_state_dict(vae_state_dict, config=vae_config, device=init_device, dtype=config.vae_dtype)
         image_encoder = None
         if config.image_encoder_path is not None:
@@ -428,20 +530,6 @@ class WanVideoPipeline(BasePipeline):
                 dtype=config.image_encoder_dtype,
             )
-        # determine wan video model type by dit params
-        model_type = None
-        if "img_emb.emb_pos" in dit_state_dict:
-            model_type = "14b-flf2v"
-        elif "img_emb.proj.0.weight" in dit_state_dict:
-            model_type = "14b-i2v"
-        elif "blocks.39.self_attn.norm_q.weight" in dit_state_dict:
-            model_type = "14b-t2v"
-        else:
-            model_type = "1.3b-t2v"
-        # shift for different model_type
-        config.shift = SHIFT_FACTORS[model_type] if config.shift is None else config.shift
         with LoRAContext():
             attn_kwargs = {
                 "attn_impl": config.dit_attn_impl,
@@ -452,7 +540,7 @@ class WanVideoPipeline(BasePipeline):
             }
             dit = WanDiT.from_state_dict(
                 dit_state_dict,
-                model_type=model_type,
+                config=model_config,
                 device=init_device,
                 dtype=config.model_dtype,
                 attn_kwargs=attn_kwargs,
@@ -460,11 +548,24 @@ class WanVideoPipeline(BasePipeline):
             if config.use_fp8_linear:
                 enable_fp8_linear(dit)
+            dit2 = None
+            if dit2_state_dict is not None:
+                dit2 = WanDiT.from_state_dict(
+                    dit2_state_dict,
+                    config=model_config,
+                    device=init_device,
+                    dtype=config.model_dtype,
+                    attn_kwargs=attn_kwargs,
+                )
+                if config.use_fp8_linear:
+                    enable_fp8_linear(dit2)
         pipe = cls(
             config=config,
             tokenizer=tokenizer,
             text_encoder=text_encoder,
             dit=dit,
+            dit2=dit2,
             vae=vae,
             image_encoder=image_encoder,
         )

diffsynth_engine/utils/constants.py CHANGED Viewed

@@ -23,10 +23,16 @@ SD3_TEXT_ENCODER_CONFIG_FILE = os.path.join(CONF_PATH, "models", "sd3", "sd3_tex
 SDXL_TEXT_ENCODER_CONFIG_FILE = os.path.join(CONF_PATH, "models", "sdxl", "sdxl_text_encoder.json")
 SDXL_UNET_CONFIG_FILE = os.path.join(CONF_PATH, "models", "sdxl", "sdxl_unet.json")
-WAN_DIT_1_3B_T2V_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "1.3b-t2v.json")
-WAN_DIT_14B_I2V_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "14b-i2v.json")
-WAN_DIT_14B_T2V_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "14b-t2v.json")
-WAN_DIT_14B_FLF2V_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "14b-flf2v.json")
+WAN2_1_DIT_T2V_1_3B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.1-t2v-1.3b.json")
+WAN2_1_DIT_T2V_14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.1-t2v-14b.json")
+WAN2_1_DIT_I2V_14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.1-i2v-14b.json")
+WAN2_1_DIT_FLF2V_14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.1-flf2v-14b.json")
+WAN2_2_DIT_TI2V_5B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.2-ti2v-5b.json")
+WAN2_2_DIT_T2V_A14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.2-t2v-a14b.json")
+WAN2_2_DIT_I2V_A14B_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "wan2.2-i2v-a14b.json")
+WAN2_1_VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "vae", "wan2.1-vae.json")
+WAN2_2_VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "vae", "wan2.2-vae.json")
 # data size
 KB = 1024

diffsynth_engine/utils/parallel.py CHANGED Viewed

@@ -336,6 +336,9 @@ class ParallelWrapper:
             except RuntimeError as e:
                 raise RuntimeError("Failed to set start method to spawn:", e)
         super().__init__()
+        self.config = module.config if isinstance(module, BasePipeline) else None
+        self._module_name = module.__class__.__name__
         self.world_size = cfg_degree * sp_ulysses_degree * sp_ring_degree * tp_degree
         self.queue_in = mp.Queue()
         self.queue_out = mp.Queue()
@@ -357,7 +360,6 @@ class ParallelWrapper:
             nprocs=self.world_size,
             join=False,
         )
-        self._module_name = module.__class__.__name__
     def __call__(self, *args, **kwargs):
         data = ["__call__", args, kwargs]

{diffsynth_engine-0.3.6.dev13.dist-info → diffsynth_engine-0.3.6.dev14.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.3.6.dev13
+Version: 0.3.6.dev14
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent

{diffsynth_engine-0.3.6.dev13.dist-info → diffsynth_engine-0.3.6.dev14.dist-info}/RECORD RENAMED Viewed

@@ -15,7 +15,7 @@ diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/linear.py,sha256=QDz
 diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/sgm_uniform.py,sha256=ZQ5OLY6_CMmV0V2MtUzHxcXyVpanhMopWYiRr2CtFTk,683
 diffsynth_engine/algorithm/sampler/__init__.py,sha256=Ow07B9JeQbgCjDtaxYPeU_p2k76CUOuHGDGvoAyD1SU,725
 diffsynth_engine/algorithm/sampler/flow_match/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py,sha256=wJVanlk6R075fBNGHwA3BJENxRyPgOslIjo0VGRXgKQ,746
+diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py,sha256=7wnI0MjgaTZnkK-JWAHIStloCnQNgpZx4JVKjw2SAEE,733
 diffsynth_engine/algorithm/sampler/stable_diffusion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/algorithm/sampler/stable_diffusion/brownian_tree.py,sha256=Y2yTAp_aMWb7z8V7GO48jwPv1LhEkxVJTcsljq0qHqg,2106
 diffsynth_engine/algorithm/sampler/stable_diffusion/ddpm.py,sha256=H5BkzNAUpp_RhTmHRyYWekCYjChX7uuuxFSgE5ZBAlc,1288
@@ -36,10 +36,15 @@ diffsynth_engine/conf/models/sd3/sd3_dit.json,sha256=RyJeCKjd4UPRf2Qbicd8Oxlioxg
 diffsynth_engine/conf/models/sd3/sd3_text_encoder.json,sha256=1yXwzKbbIIVg1QPhQJxjdwvbFkA1mJ6NR6dw2vrN-1A,91415
 diffsynth_engine/conf/models/sdxl/sdxl_text_encoder.json,sha256=cBN3mIm4BjJYbSpL2gz4yeb1aP0BvGt9na4hmuafyJo,35642
 diffsynth_engine/conf/models/sdxl/sdxl_unet.json,sha256=9f9ca1qYQALaDkA5KTCfVP9mKFvhM2xFP5e042Ryppw,129779
-diffsynth_engine/conf/models/wan/dit/1.3b-t2v.json,sha256=mtBk_lj4R18wadhaIu-EDSErhQik3GK2xHSUkGWW6BM,239
-diffsynth_engine/conf/models/wan/dit/14b-flf2v.json,sha256=xsONv_2NgDmcBKsNtjShN2Llp2gniPdKHAfgEOdTtHQ,264
-diffsynth_engine/conf/models/wan/dit/14b-i2v.json,sha256=5xnvVzevKep0xQcbNkuIlskF1jS6co9y8WsZV2BqV9Q,239
-diffsynth_engine/conf/models/wan/dit/14b-t2v.json,sha256=NkggFTpaKb2pOXRLpEi3xv3uXrJdsmY-mve_UjAoVR4,240
+diffsynth_engine/conf/models/wan/dit/wan2.1-flf2v-14b.json,sha256=s7yoVErSiuSlGwwqfrvhvmzz6MD4oAqBKg7iZfL1vX8,313
+diffsynth_engine/conf/models/wan/dit/wan2.1-i2v-14b.json,sha256=BkDV80TkA-_vTRR_1AWpGIzwlgtuKbh-gezW2Q20dlQ,269
+diffsynth_engine/conf/models/wan/dit/wan2.1-t2v-1.3b.json,sha256=M_h55-mMhpgXUuY85sBK6-_f4fg3bfCa6T7n1CyMP3s,209
+diffsynth_engine/conf/models/wan/dit/wan2.1-t2v-14b.json,sha256=7i2Hq8BRH4kDVYBKcIBt8m3vCl_HGZZPFY5fmFw4xgs,210
+diffsynth_engine/conf/models/wan/dit/wan2.2-i2v-a14b.json,sha256=7OmPEfreIu8Ex6NDr1IW69zmKRp21hZkmg_9yg6sUg8,322
+diffsynth_engine/conf/models/wan/dit/wan2.2-t2v-a14b.json,sha256=MqxjGwq8VqD-1RwbPocbkKx0JzsMgwn18hfVK7M0d4k,312
+diffsynth_engine/conf/models/wan/dit/wan2.2-ti2v-5b.json,sha256=tO7nymyqQgBIgxlswITnIc_MsRr1RRPhZbbhJ-1gHow,257
+diffsynth_engine/conf/models/wan/vae/wan2.1-vae.json,sha256=eVLTSRqbXm3JD8QDkLbM6vFfCdynlS-8QxqCfi4BzrI,815
+diffsynth_engine/conf/models/wan/vae/wan2.2-vae.json,sha256=pdnYEEZ_GcZHM_iH1y5ASdf_qZUGCOuDEaFmjdg9RKY,1860
 diffsynth_engine/conf/tokenizers/flux/tokenizer_1/merges.txt,sha256=n9aR98gDkhDg_O0VhlRmxlgg0JtjmIsBdL_iXeKZBRo,524619
 diffsynth_engine/conf/tokenizers/flux/tokenizer_1/special_tokens_map.json,sha256=LNs7gzGmDJL8HlWhPp_WH9IpPFpRJ1_czNYreABSUw4,588
 diffsynth_engine/conf/tokenizers/flux/tokenizer_1/tokenizer_config.json,sha256=a9zunMzioWyitMDF7QC0LFDqIl9EcqjEweljopAsKIE,705
@@ -62,7 +67,7 @@ diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json,sha256=bhl7TT29cdoU
 diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json,sha256=7Zo6iw-qcacKMoR-BDX-A25uES1N9O23u0ipIeNE3AU,61728
 diffsynth_engine/configs/__init__.py,sha256=qvfbnHf3wK9THPU_mFr1Qx_lU80BaUp5HpxUmjoNy60,502
 diffsynth_engine/configs/controlnet.py,sha256=EpUkCdRNk2G5uo56syaOzPFdR9g0sDHRXckagmMsgaQ,948
-diffsynth_engine/configs/pipeline.py,sha256=NPQlNz-AOpi8qFzRob0RNnOqSc8C-vCdHbstLyUugeo,7703
+diffsynth_engine/configs/pipeline.py,sha256=ltTn4tRETeS-v7IMm3fkokza2PKluVb2DWGk8mPOghU,8079
 diffsynth_engine/kernels/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/models/__init__.py,sha256=8Ze7cSE8InetgXWTNb0neVA2Q44K7WlE-h7O-02m2sY,119
 diffsynth_engine/models/base.py,sha256=sbyyGP-ENnqicr6cxjEmXRf6dWrmKjCu6k5yamuJ518,2665
@@ -103,17 +108,17 @@ diffsynth_engine/models/text_encoder/t5.py,sha256=iSYyYQF4DUU0zpN65V_slWoftBTDVw
 diffsynth_engine/models/vae/__init__.py,sha256=TFSIXZ-UyRaZbEr5KUXm1d4koS5gbgsCi7Soh6jDV0Y,140
 diffsynth_engine/models/vae/vae.py,sha256=FWMVqahY1BdnIkzLi8ykCp_VWHs05l0JF21wk7763LI,15844
 diffsynth_engine/models/wan/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-diffsynth_engine/models/wan/wan_dit.py,sha256=gUd9KeMl7y_VPLntGoGtT2Io94opPiKlrrfo8E8O-8o,18869
+diffsynth_engine/models/wan/wan_dit.py,sha256=05UG5B3wMu565HGCkfTMHjUHxT18xZ_lz0rvNqVoMqM,19753
 diffsynth_engine/models/wan/wan_image_encoder.py,sha256=LYwcfCcQmXf9FP08DGaU2bfaPgFfdpJ23OpJP8UCggo,14397
 diffsynth_engine/models/wan/wan_text_encoder.py,sha256=bkphxtqNNwXcEA_OaUrwV9CvICV-s16awu5Z9gjjzsM,10912
-diffsynth_engine/models/wan/wan_vae.py,sha256=RxyuHExQmRjGBAqhZdIbtwZFdCibTzh__U4-Sa00zdI,29004
+diffsynth_engine/models/wan/wan_vae.py,sha256=bYXW-7FdLAi7391y9nQbYpKPYFDYyTxWbY_0TBrn2Yw,38444
 diffsynth_engine/pipelines/__init__.py,sha256=kTvANqHcMPrHqiJVg-XohfqRdW6Cj4aElfItTb1B7Vs,380
-diffsynth_engine/pipelines/base.py,sha256=yVp4hSPCqk98azzy3ykKBfPAufvq_ncTFOURN95z7d0,12178
+diffsynth_engine/pipelines/base.py,sha256=o31tD_iFobNbEPsl_d8ih9-GL023-qqb55r06i0SvAw,12050
 diffsynth_engine/pipelines/flux_image.py,sha256=MtQqTnCqQjIFovhA3lzBXpnkS4DkZH2PtFUwNZdl42M,48839
 diffsynth_engine/pipelines/sd_image.py,sha256=5dGIa6crtklO7xPd1eeBVkqj54Pe89Uo3bMyXVEaXxM,17822
 diffsynth_engine/pipelines/sdxl_image.py,sha256=Ns4bCSO3BtCXdjGJEQ0s5oY0S3jrp5yE5lhfon-iNiw,21575
 diffsynth_engine/pipelines/utils.py,sha256=VfSTwRejSVSKXIa7w0VhObmvaBFRvDP-uiYsHHkPAgs,165
-diffsynth_engine/pipelines/wan_video.py,sha256=vi_xW-jU4PeMtZzjkfQbnj8eOymJrTZMrOQau6tx6ks,20187
+diffsynth_engine/pipelines/wan_video.py,sha256=wbCHPDgs4BmyX1DsawaXqxeCoVAcISNcXNnFr2qcTx0,25424
 diffsynth_engine/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/processor/canny_processor.py,sha256=hV30NlblTkEFUAmF_O-LJrNlGVM2SFrqq6okfF8VpOo,602
 diffsynth_engine/processor/depth_processor.py,sha256=dQvs3JsnyMbz4dyI9QoR8oO-mMFBFAgNvgqeCoaU5jk,1532
@@ -128,7 +133,7 @@ diffsynth_engine/tools/flux_outpainting_tool.py,sha256=sxGRAiht27he9CT_dL9KkXVvM
 diffsynth_engine/tools/flux_reference_tool.py,sha256=BJlXQxH8j3AhEhlymIlE6OnIH2gU_l_qv5k10JDZKng,3705
 diffsynth_engine/tools/flux_replace_tool.py,sha256=M_q8KnsBEwNi4w8NOK-F2Bmj7cKUNcA9QMzwrp3zm6E,3336
 diffsynth_engine/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-diffsynth_engine/utils/constants.py,sha256=L7sIxGNMfCvcZG66ul7GIT6fDctkcwhePAjMjG6WXx8,1969
+diffsynth_engine/utils/constants.py,sha256=9N0BuLmDeHgiKAlu1vaCTb9-tPClbCA8nTu916_UumM,2510
 diffsynth_engine/utils/download.py,sha256=NCgfL9tUca-sOhT41k6w4o__Ktbw-1aDwFTR4JDkT28,5639
 diffsynth_engine/utils/env.py,sha256=43x-kBjt5zI2cwZ9G4BOeTbedi2k6TuBzHGOBeFbFvU,280
 diffsynth_engine/utils/flag.py,sha256=6zQLnoEaU69pBEyhavCgydQfP0khw5ppCU7sue4yRqg,1370
@@ -140,12 +145,12 @@ diffsynth_engine/utils/lock.py,sha256=1Ipgst9eEFfFdViAvD5bxdB6HnHHBcqWYOb__fGaPU
 diffsynth_engine/utils/logging.py,sha256=XB0xTT8PBN6btkOjFtOvjlrOCRVgDGT8PFAp1vmse28,467
 diffsynth_engine/utils/offload.py,sha256=jUR4u7J60o4KZIRxHhMCwaeDkiXJvBa0KJkYKKT6mrg,1587
 diffsynth_engine/utils/onnx.py,sha256=jeWUudJHnESjuiEAHyUZYUZz7dCj34O9aGjHCe8yjWo,1149
-diffsynth_engine/utils/parallel.py,sha256=gbIeilfOYsqeDcgkaP68TfLjIXxvD0KfLiAsR_8gJco,16917
+diffsynth_engine/utils/parallel.py,sha256=eXFglYH2w478oYusktpllm0v4IC8CABGmy0HsE-zE_8,17000
 diffsynth_engine/utils/platform.py,sha256=2lXdw6YkqcRONCeT98n4cyg1Ii8Ybbyj2Ns72Se9tlk,496
 diffsynth_engine/utils/prompt.py,sha256=YItMchoVzsG6y-LB4vzzDUWrkhKRVlt1HfVhxZjSxMQ,280
 diffsynth_engine/utils/video.py,sha256=Ne0rd2lb59UT1q5EotpjlY7OT8F9oTCFDyo1ST77uoQ,1004
-diffsynth_engine-0.3.6.dev13.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
-diffsynth_engine-0.3.6.dev13.dist-info/METADATA,sha256=2jH1jlJdbUga4JOoDHfRyKEn6E4xQ1w9wRhLKVYaqRk,1069
-diffsynth_engine-0.3.6.dev13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-diffsynth_engine-0.3.6.dev13.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
-diffsynth_engine-0.3.6.dev13.dist-info/RECORD,,
+diffsynth_engine-0.3.6.dev14.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
+diffsynth_engine-0.3.6.dev14.dist-info/METADATA,sha256=qRZwaOSJBZh1MNJLITijv3WWBJEWHoN0Dyx-CIPdd2w,1069
+diffsynth_engine-0.3.6.dev14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+diffsynth_engine-0.3.6.dev14.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
+diffsynth_engine-0.3.6.dev14.dist-info/RECORD,,

{diffsynth_engine-0.3.6.dev13.dist-info → diffsynth_engine-0.3.6.dev14.dist-info}/WHEEL RENAMED Viewed

File without changes

{diffsynth_engine-0.3.6.dev13.dist-info → diffsynth_engine-0.3.6.dev14.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{diffsynth_engine-0.3.6.dev13.dist-info → diffsynth_engine-0.3.6.dev14.dist-info}/top_level.txt RENAMED Viewed

File without changes

diffsynth-engine 0.3.6.dev13__py3-none-any.whl → 0.3.6.dev14__py3-none-any.whl

diffsynth-engine 0.3.6.dev13__py3-none-any.whl → 0.3.6.dev14__py3-none-any.whl