diffsynth-engine 0.4.1.dev1__tar.gz → 0.4.1.post2.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/PKG-INFO +1 -1
  2. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/configs/pipeline.py +11 -0
  3. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/qwen_image/qwen_image_dit.py +4 -0
  4. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/pipelines/base.py +41 -18
  5. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/pipelines/qwen_image.py +36 -10
  6. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/pipelines/wan_video.py +7 -0
  7. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/tokenizers/qwen2.py +2 -2
  8. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/offload.py +23 -0
  9. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/parallel.py +6 -4
  10. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine.egg-info/PKG-INFO +1 -1
  11. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/docs/tutorial.md +66 -21
  12. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/docs/tutorial_zh.md +71 -25
  13. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/.gitignore +0 -0
  14. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/.pre-commit-config.yaml +0 -0
  15. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/LICENSE +0 -0
  16. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/MANIFEST.in +0 -0
  17. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/README.md +0 -0
  18. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/assets/dingtalk.png +0 -0
  19. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/assets/showcase.jpeg +0 -0
  20. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/__init__.py +0 -0
  21. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/__init__.py +0 -0
  22. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/__init__.py +0 -0
  23. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +0 -0
  24. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/flow_match/__init__.py +0 -0
  25. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_beta.py +0 -0
  26. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_ddim.py +0 -0
  27. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +0 -0
  28. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/__init__.py +0 -0
  29. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/beta.py +0 -0
  30. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/ddim.py +0 -0
  31. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/exponential.py +0 -0
  32. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/karras.py +0 -0
  33. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/linear.py +0 -0
  34. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/sgm_uniform.py +0 -0
  35. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/__init__.py +0 -0
  36. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/flow_match/__init__.py +0 -0
  37. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py +0 -0
  38. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/stable_diffusion/__init__.py +0 -0
  39. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/stable_diffusion/brownian_tree.py +0 -0
  40. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/stable_diffusion/ddpm.py +0 -0
  41. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/stable_diffusion/deis.py +0 -0
  42. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m.py +0 -0
  43. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m_sde.py +0 -0
  44. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_3m_sde.py +0 -0
  45. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/stable_diffusion/epsilon.py +0 -0
  46. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/stable_diffusion/euler.py +0 -0
  47. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/algorithm/sampler/stable_diffusion/euler_ancestral.py +0 -0
  48. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/components/vae.json +0 -0
  49. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/flux/flux_dit.json +0 -0
  50. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/flux/flux_text_encoder.json +0 -0
  51. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/flux/flux_vae.json +0 -0
  52. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/qwen_image/qwen2_5_vl_config.json +0 -0
  53. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/qwen_image/qwen2_5_vl_vision_config.json +0 -0
  54. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/qwen_image/qwen_image_vae.json +0 -0
  55. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/qwen_image/qwen_image_vae_keymap.json +0 -0
  56. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/sd/sd_text_encoder.json +0 -0
  57. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/sd/sd_unet.json +0 -0
  58. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/sd3/sd3_dit.json +0 -0
  59. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/sd3/sd3_text_encoder.json +0 -0
  60. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/sdxl/sdxl_text_encoder.json +0 -0
  61. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/sdxl/sdxl_unet.json +0 -0
  62. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/wan/dit/wan2.1-flf2v-14b.json +0 -0
  63. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/wan/dit/wan2.1-i2v-14b.json +0 -0
  64. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/wan/dit/wan2.1-t2v-1.3b.json +0 -0
  65. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/wan/dit/wan2.1-t2v-14b.json +0 -0
  66. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/wan/dit/wan2.2-i2v-a14b.json +0 -0
  67. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/wan/dit/wan2.2-t2v-a14b.json +0 -0
  68. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/wan/dit/wan2.2-ti2v-5b.json +0 -0
  69. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/wan/vae/wan-vae-keymap.json +0 -0
  70. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/wan/vae/wan2.1-vae.json +0 -0
  71. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/models/wan/vae/wan2.2-vae.json +0 -0
  72. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/merges.txt +0 -0
  73. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/special_tokens_map.json +0 -0
  74. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/tokenizer_config.json +0 -0
  75. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/vocab.json +0 -0
  76. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/special_tokens_map.json +0 -0
  77. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/spiece.model +0 -0
  78. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer.json +0 -0
  79. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer_config.json +0 -0
  80. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/added_tokens.json +0 -0
  81. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/merges.txt +0 -0
  82. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/special_tokens_map.json +0 -0
  83. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/tokenizer.json +0 -0
  84. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/tokenizer_config.json +0 -0
  85. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/vocab.json +0 -0
  86. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/merges.txt +0 -0
  87. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/special_tokens_map.json +0 -0
  88. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/tokenizer_config.json +0 -0
  89. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/vocab.json +0 -0
  90. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/merges.txt +0 -0
  91. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/special_tokens_map.json +0 -0
  92. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/tokenizer_config.json +0 -0
  93. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/vocab.json +0 -0
  94. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/special_tokens_map.json +0 -0
  95. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/spiece.model +0 -0
  96. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json +0 -0
  97. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json +0 -0
  98. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/configs/__init__.py +0 -0
  99. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/configs/controlnet.py +0 -0
  100. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/kernels/__init__.py +0 -0
  101. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/__init__.py +0 -0
  102. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/base.py +0 -0
  103. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/basic/__init__.py +0 -0
  104. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/basic/attention.py +0 -0
  105. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/basic/lora.py +0 -0
  106. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/basic/relative_position_emb.py +0 -0
  107. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/basic/timestep.py +0 -0
  108. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/basic/transformer_helper.py +0 -0
  109. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/basic/unet_helper.py +0 -0
  110. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/flux/__init__.py +0 -0
  111. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/flux/flux_controlnet.py +0 -0
  112. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/flux/flux_dit.py +0 -0
  113. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/flux/flux_dit_fbcache.py +0 -0
  114. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/flux/flux_ipadapter.py +0 -0
  115. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/flux/flux_redux.py +0 -0
  116. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/flux/flux_text_encoder.py +0 -0
  117. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/flux/flux_vae.py +0 -0
  118. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/qwen_image/__init__.py +0 -0
  119. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/qwen_image/qwen2_5_vl.py +0 -0
  120. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py +0 -0
  121. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/qwen_image/qwen_image_vae.py +0 -0
  122. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sd/__init__.py +0 -0
  123. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sd/sd_controlnet.py +0 -0
  124. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sd/sd_text_encoder.py +0 -0
  125. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sd/sd_unet.py +0 -0
  126. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sd/sd_vae.py +0 -0
  127. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sd3/__init__.py +0 -0
  128. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sd3/sd3_dit.py +0 -0
  129. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sd3/sd3_text_encoder.py +0 -0
  130. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sd3/sd3_vae.py +0 -0
  131. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sdxl/__init__.py +0 -0
  132. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sdxl/sdxl_controlnet.py +0 -0
  133. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sdxl/sdxl_text_encoder.py +0 -0
  134. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sdxl/sdxl_unet.py +0 -0
  135. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/sdxl/sdxl_vae.py +0 -0
  136. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/text_encoder/__init__.py +0 -0
  137. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/text_encoder/clip.py +0 -0
  138. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/text_encoder/siglip.py +0 -0
  139. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/text_encoder/t5.py +0 -0
  140. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/utils.py +0 -0
  141. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/vae/__init__.py +0 -0
  142. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/vae/vae.py +0 -0
  143. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/wan/__init__.py +0 -0
  144. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/wan/wan_dit.py +0 -0
  145. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/wan/wan_image_encoder.py +0 -0
  146. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/wan/wan_text_encoder.py +0 -0
  147. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/models/wan/wan_vae.py +0 -0
  148. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/pipelines/__init__.py +0 -0
  149. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/pipelines/flux_image.py +0 -0
  150. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/pipelines/sd_image.py +0 -0
  151. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/pipelines/sdxl_image.py +0 -0
  152. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/pipelines/utils.py +0 -0
  153. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/processor/__init__.py +0 -0
  154. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/processor/canny_processor.py +0 -0
  155. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/processor/depth_processor.py +0 -0
  156. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/tokenizers/__init__.py +0 -0
  157. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/tokenizers/base.py +0 -0
  158. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/tokenizers/clip.py +0 -0
  159. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/tokenizers/t5.py +0 -0
  160. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/tokenizers/wan.py +0 -0
  161. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/tools/__init__.py +0 -0
  162. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/tools/flux_inpainting_tool.py +0 -0
  163. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/tools/flux_outpainting_tool.py +0 -0
  164. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/tools/flux_reference_tool.py +0 -0
  165. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/tools/flux_replace_tool.py +0 -0
  166. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/__init__.py +0 -0
  167. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/cache.py +0 -0
  168. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/constants.py +0 -0
  169. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/download.py +0 -0
  170. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/env.py +0 -0
  171. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/flag.py +0 -0
  172. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/fp8_linear.py +0 -0
  173. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/gguf.py +0 -0
  174. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/image.py +0 -0
  175. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/loader.py +0 -0
  176. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/lock.py +0 -0
  177. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/logging.py +0 -0
  178. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/onnx.py +0 -0
  179. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/platform.py +0 -0
  180. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/prompt.py +0 -0
  181. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine/utils/video.py +0 -0
  182. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine.egg-info/SOURCES.txt +0 -0
  183. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine.egg-info/dependency_links.txt +0 -0
  184. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine.egg-info/requires.txt +0 -0
  185. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/diffsynth_engine.egg-info/top_level.txt +0 -0
  186. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/pyproject.toml +0 -0
  187. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/setup.cfg +0 -0
  188. {diffsynth_engine-0.4.1.dev1 → diffsynth_engine-0.4.1.post2.dev1}/setup.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.4.1.dev1
+Version: 0.4.1.post2.dev1
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent
diffsynth_engine/configs/pipeline.py
@@ -16,6 +16,7 @@ class BaseConfig:
     vae_tile_stride: int | Tuple[int, int] = 256
     device: str = "cuda"
     offload_mode: Optional[str] = None
+    offload_to_disk: bool = False


 @dataclass
@@ -62,11 +63,13 @@ class SDPipelineConfig(BaseConfig):
         model_path: str | os.PathLike | List[str | os.PathLike],
         device: str = "cuda",
         offload_mode: Optional[str] = None,
+        offload_to_disk: bool = False,
     ) -> "SDPipelineConfig":
         return cls(
             model_path=model_path,
             device=device,
             offload_mode=offload_mode,
+            offload_to_disk=offload_to_disk,
         )


@@ -87,11 +90,13 @@ class SDXLPipelineConfig(BaseConfig):
         model_path: str | os.PathLike | List[str | os.PathLike],
         device: str = "cuda",
         offload_mode: Optional[str] = None,
+        offload_to_disk: bool = False,
     ) -> "SDXLPipelineConfig":
         return cls(
             model_path=model_path,
             device=device,
             offload_mode=offload_mode,
+            offload_to_disk=offload_to_disk,
         )


@@ -116,6 +121,7 @@ class FluxPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, Ba
         device: str = "cuda",
         parallelism: int = 1,
         offload_mode: Optional[str] = None,
+        offload_to_disk: bool = False,
     ) -> "FluxPipelineConfig":
         return cls(
             model_path=model_path,
@@ -123,6 +129,7 @@ class FluxPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, Ba
             parallelism=parallelism,
             use_fsdp=True,
             offload_mode=offload_mode,
+            offload_to_disk=offload_to_disk,
         )

     def __post_init__(self):
@@ -160,6 +167,7 @@ class WanPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, Bas
         device: str = "cuda",
         parallelism: int = 1,
         offload_mode: Optional[str] = None,
+        offload_to_disk: bool = False,
     ) -> "WanPipelineConfig":
         return cls(
             model_path=model_path,
@@ -169,6 +177,7 @@ class WanPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, Bas
             use_cfg_parallel=True,
             use_fsdp=True,
             offload_mode=offload_mode,
+            offload_to_disk=offload_to_disk,
         )

     def __post_init__(self):
@@ -196,6 +205,7 @@ class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfi
         device: str = "cuda",
         parallelism: int = 1,
         offload_mode: Optional[str] = None,
+        offload_to_disk: bool = False,
     ) -> "QwenImagePipelineConfig":
         return cls(
             model_path=model_path,
@@ -206,6 +216,7 @@ class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfi
             use_cfg_parallel=True,
             use_fsdp=True,
             offload_mode=offload_mode,
+            offload_to_disk=offload_to_disk,
         )

     def __post_init__(self):
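The net effect of these hunks is uniform: every pipeline config's factory classmethod gains an `offload_to_disk` parameter and forwards it into the shared `BaseConfig` dataclass field. A minimal stdlib-only sketch of that pattern (the factory name `basic_config` and the exact field sets are assumptions, since the hunks don't show the `def` lines):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class BaseConfig:
    # Fields mirroring the BaseConfig hunk above
    device: str = "cuda"
    offload_mode: Optional[str] = None
    offload_to_disk: bool = False  # new in 0.4.1.post2.dev1


@dataclass
class WanPipelineConfig(BaseConfig):
    model_path: str = ""
    parallelism: int = 1
    use_fsdp: bool = False

    @classmethod
    def basic_config(cls, model_path, device="cuda", parallelism=1,
                     offload_mode=None, offload_to_disk=False):
        # The factory simply threads the new flag through to the dataclass,
        # exactly as the "+" lines in each hunk do.
        return cls(model_path=model_path, device=device,
                   parallelism=parallelism, use_fsdp=True,
                   offload_mode=offload_mode, offload_to_disk=offload_to_disk)


cfg = WanPipelineConfig.basic_config(
    "Wan2.2-T2V-A14B", offload_mode="cpu_offload", offload_to_disk=True
)
```

Because the flag defaults to `False` at both the field and the factory level, existing callers are unaffected.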
diffsynth_engine/models/qwen_image/qwen_image_dit.py
@@ -315,6 +315,7 @@ class QwenImageTransformerBlock(nn.Module):

 class QwenImageDiT(PreTrainedModel):
     converter = QwenImageDiTStateDictConverter()
+    _supports_parallelization = True

     def __init__(
         self,
@@ -423,3 +424,6 @@ class QwenImageDiT(PreTrainedModel):
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model
+
+    def get_fsdp_modules(self):
+        return ["transformer_blocks"]
diffsynth_engine/pipelines/base.py
@@ -6,7 +6,7 @@ from typing import Dict, List, Tuple
 from PIL import Image

 from diffsynth_engine.configs import BaseConfig, BaseStateDicts
-from diffsynth_engine.utils.offload import enable_sequential_cpu_offload
+from diffsynth_engine.utils.offload import enable_sequential_cpu_offload, offload_model_to_dict, restore_model_from_dict
 from diffsynth_engine.utils.fp8_linear import enable_fp8_autocast
 from diffsynth_engine.utils.gguf import load_gguf_checkpoint
 from diffsynth_engine.utils import logging
@@ -40,6 +40,8 @@ class BasePipeline:
         self.dtype = dtype
         self.offload_mode = None
         self.model_names = []
+        self._offload_param_dict = {}
+        self.offload_to_disk = False

     @classmethod
     def from_pretrained(cls, model_path_or_config: str | BaseConfig) -> "BasePipeline":
@@ -227,32 +229,44 @@ class BasePipeline:
         model.eval()
         return self

-    def enable_cpu_offload(self, offload_mode: str):
-        valid_offload_mode = ("cpu_offload", "sequential_cpu_offload")
+    def enable_cpu_offload(self, offload_mode: str | None, offload_to_disk:bool = False):
+        valid_offload_mode = ("cpu_offload", "sequential_cpu_offload", "disable", None)
         if offload_mode not in valid_offload_mode:
             raise ValueError(f"offload_mode must be one of {valid_offload_mode}, but got {offload_mode}")
         if self.device == "cpu" or self.device == "mps":
             logger.warning("must set an non cpu device for pipeline before calling enable_cpu_offload")
             return
-        if offload_mode == "cpu_offload":
+        if offload_mode is None or offload_mode == "disable":
+            self._disable_offload()
+        elif offload_mode == "cpu_offload":
             self._enable_model_cpu_offload()
         elif offload_mode == "sequential_cpu_offload":
             self._enable_sequential_cpu_offload()
+        self.offload_to_disk = offload_to_disk

-    def _enable_model_cpu_offload(self):
+
+    def _enable_model_cpu_offload(self):
         for model_name in self.model_names:
             model = getattr(self, model_name)
             if model is not None:
-                model.to("cpu")
+                self._offload_param_dict[model_name] = offload_model_to_dict(model)
         self.offload_mode = "cpu_offload"

     def _enable_sequential_cpu_offload(self):
         for model_name in self.model_names:
             model = getattr(self, model_name)
             if model is not None:
-                model.to("cpu")
                 enable_sequential_cpu_offload(model, self.device)
         self.offload_mode = "sequential_cpu_offload"
+
+    def _disable_offload(self):
+        self.offload_mode = None
+        self._offload_param_dict = {}
+        for model_name in self.model_names:
+            model = getattr(self, model_name)
+            if model is not None:
+                model.to(self.device)
+

     def enable_fp8_autocast(
         self, model_names: List[str], compute_dtype: torch.dtype = torch.bfloat16, use_fp8_linear: bool = False
@@ -260,6 +274,7 @@ class BasePipeline:
         for model_name in model_names:
             model = getattr(self, model_name)
             if model is not None:
+                model.to(device=self.device, dtype=torch.float8_e4m3fn)
                 enable_fp8_autocast(model, compute_dtype, use_fp8_linear)
         self.fp8_autocast_enabled = True
@@ -277,23 +292,31 @@ class BasePipeline:
         for model_name in self.model_names:
             if model_name not in load_model_names:
                 model = getattr(self, model_name)
-                if (
-                    model is not None
-                    and (p := next(model.parameters(), None)) is not None
-                    and p.device != torch.device("cpu")
-                ):
-                    model.to("cpu")
+                if model is not None and (p := next(model.parameters(), None)) is not None and p.device.type != "cpu":
+                    restore_model_from_dict(model, self._offload_param_dict[model_name])
         # load the needed models to device
         for model_name in load_model_names:
             model = getattr(self, model_name)
-            if (
-                model is not None
-                and (p := next(model.parameters(), None)) is not None
-                and p.device != torch.device(self.device)
-            ):
+            if model is None:
+                raise ValueError(f"model {model_name} is not loaded, maybe this model has been destroyed by model_lifecycle_finish function with offload_to_disk=True")
+            if model is not None and (p := next(model.parameters(), None)) is not None and p.device.type != self.device:
                 model.to(self.device)
         # fresh the cuda cache
         empty_cache()

+    def model_lifecycle_finish(self, model_names: List[str] | None = None):
+        if not self.offload_to_disk or self.offload_mode is None:
+            return
+        for model_name in model_names:
+            model = getattr(self, model_name)
+            del model
+            if model_name in self._offload_param_dict:
+                del self._offload_param_dict[model_name]
+            setattr(self, model_name, None)
+            print(f"model {model_name} has been deleted from memory")
+            logger.info(f"model {model_name} has been deleted from memory")
+        empty_cache()
+
+
     def compile(self):
         raise NotImplementedError(f"{self.__class__.__name__} does not support compile")
@@ -41,19 +41,32 @@ class QwenImageLoRAConverter(LoRAStateDictConverter):
         dit_dict = {}
         for key, param in lora_state_dict.items():
             origin_key = key
-            if "lora_A.default.weight" not in key:
+            lora_a_suffix = None
+            if "lora_A.default.weight" in key:
+                lora_a_suffix = "lora_A.default.weight"
+            elif "lora_A.weight" in key:
+                lora_a_suffix = "lora_A.weight"
+
+            if lora_a_suffix is None:
                 continue
+
             lora_args = {}
             lora_args["down"] = param
-            lora_args["up"] = lora_state_dict[origin_key.replace("lora_A.default.weight", "lora_B.default.weight")]
+
+            lora_b_suffix = lora_a_suffix.replace("lora_A", "lora_B")
+            lora_args["up"] = lora_state_dict[origin_key.replace(lora_a_suffix, lora_b_suffix)]
+
             lora_args["rank"] = lora_args["up"].shape[1]
-            alpha_key = origin_key.replace("lora_A.default.weight", "alpha").replace("lora_up.default.weight", "alpha")
+            alpha_key = origin_key.replace("lora_up", "lora_A").replace(lora_a_suffix, "alpha")
+
             if alpha_key in lora_state_dict:
                 alpha = lora_state_dict[alpha_key]
             else:
                 alpha = lora_args["rank"]
             lora_args["alpha"] = alpha
-            key = key.replace(".lora_A.default.weight", "")
+
+            key = key.replace(f".{lora_a_suffix}", "")
+
             if key.startswith("transformer") and "attn.to_out.0" in key:
                 key = key.replace("attn.to_out.0", "attn.to_out")
             dit_dict[key] = lora_args
@@ -82,10 +95,8 @@ class QwenImagePipeline(BasePipeline):
             dtype=config.model_dtype,
         )
         self.config = config
-        self.tokenizer_max_length = 1024
         self.prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
         self.prompt_template_encode_start_idx = 34
-        self.default_sample_size = 128
         # sampler
         self.noise_scheduler = RecifitedFlowScheduler(shift=3.0, use_dynamic_shifting=True)
         self.sampler = FlowMatchEulerSampler()
@@ -197,7 +208,19 @@ class QwenImagePipeline(BasePipeline):
         pipe.eval()
 
         if config.offload_mode is not None:
-            pipe.enable_cpu_offload(config.offload_mode)
+            pipe.enable_cpu_offload(config.offload_mode, config.offload_to_disk)
+
+        if config.model_dtype == torch.float8_e4m3fn:
+            pipe.dtype = torch.bfloat16  # compute dtype
+            pipe.enable_fp8_autocast(
+                model_names=["dit"], compute_dtype=pipe.dtype, use_fp8_linear=config.use_fp8_linear
+            )
+
+        if config.encoder_dtype == torch.float8_e4m3fn:
+            pipe.dtype = torch.bfloat16  # compute dtype
+            pipe.enable_fp8_autocast(
+                model_names=["encoder"], compute_dtype=pipe.dtype, use_fp8_linear=config.use_fp8_linear
+            )
 
         if config.parallelism > 1:
             pipe = ParallelWrapper(
@@ -262,7 +285,7 @@ class QwenImagePipeline(BasePipeline):
         template = self.prompt_template_encode
         drop_idx = self.prompt_template_encode_start_idx
         texts = [template.format(txt) for txt in prompt]
-        outputs = self.tokenizer(texts, max_length=min(max_sequence_length, self.tokenizer_max_length) + drop_idx)
+        outputs = self.tokenizer(texts, max_length=max_sequence_length + drop_idx)
         input_ids, attention_mask = outputs["input_ids"].to(self.device), outputs["attention_mask"].to(self.device)
         outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
         hidden_states = outputs["hidden_states"]
@@ -377,11 +400,12 @@ class QwenImagePipeline(BasePipeline):
         self.sampler.initialize(init_latents=init_latents, timesteps=timesteps, sigmas=sigmas)
 
         self.load_models_to_device(["encoder"])
-        prompt_embeds, prompt_embeds_mask = self.encode_prompt(prompt, 1, 512)
+        prompt_embeds, prompt_embeds_mask = self.encode_prompt(prompt, 1, 4096)
         if cfg_scale > 1.0 and negative_prompt != "":
-            negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(negative_prompt, 1, 512)
+            negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(negative_prompt, 1, 4096)
         else:
             negative_prompt_embeds, negative_prompt_embeds_mask = None, None
+        self.model_lifecycle_finish(["encoder"])
 
         hide_progress = dist.is_initialized() and dist.get_rank() != 0
         for i, timestep in enumerate(tqdm(timesteps, disable=hide_progress)):
@@ -401,6 +425,7 @@ class QwenImagePipeline(BasePipeline):
             # UI
             if progress_callback is not None:
                 progress_callback(i, len(timesteps), "DENOISING")
+        self.model_lifecycle_finish(["dit"])
         # Decode image
         self.load_models_to_device(["vae"])
         latents = rearrange(latents, "B C H W -> B C 1 H W")
@@ -412,5 +437,6 @@ class QwenImagePipeline(BasePipeline):
         )
         image = self.vae_output_to_image(vae_output)
         # Offload all models
+        self.model_lifecycle_finish(["vae"])
         self.load_models_to_device([])
         return image
@@ -584,4 +584,11 @@ class WanVideoPipeline(BasePipeline):
             use_fsdp=config.use_fsdp,
             device="cuda",
         )
+        if config.use_torch_compile:
+            pipe.compile()
         return pipe
+
+    def compile(self):
+        self.dit.compile()
+        if self.dit2 is not None:
+            self.dit2.compile()
@@ -197,8 +197,8 @@ class Qwen2TokenizerFast(BaseTokenizer):
         encoded.fill_(self.pad_token_id)
         attention_mask = torch.zeros(len(texts), max_length, dtype=torch.long)
         for i, ids in enumerate(batch_ids):
-            if len(ids) > self.model_max_length:
-                ids = ids[: self.model_max_length]
+            if len(ids) > max_length:
+                ids = ids[:max_length]
             ids[-1] = self.eos_token_id
             if padding_side == "right":
                 encoded[i, : len(ids)] = torch.tensor(ids)
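The fix above makes truncation honor the caller-supplied `max_length` rather than the tokenizer's global `model_max_length`. A minimal pure-Python sketch of the same pad/truncate/EOS logic (the `PAD_ID`/`EOS_ID` values and the `pad_and_truncate` helper are made up for illustration, not the library's API):

```python
# Hypothetical ids chosen for illustration only
PAD_ID, EOS_ID = 2, 1

def pad_and_truncate(batch_ids, max_length):
    # Pre-fill with padding, then copy each (possibly truncated) sequence in
    encoded = [[PAD_ID] * max_length for _ in batch_ids]
    attention_mask = [[0] * max_length for _ in batch_ids]
    for i, ids in enumerate(batch_ids):
        ids = list(ids)
        if len(ids) > max_length:
            # Truncate to the per-call max_length, not a global tokenizer limit
            ids = ids[:max_length]
        # Mirror the source: the last kept token always becomes EOS
        ids[-1] = EOS_ID
        encoded[i][: len(ids)] = ids
        attention_mask[i][: len(ids)] = [1] * len(ids)
    return encoded, attention_mask

rows, mask = pad_and_truncate([[5, 6, 7, 8, 9]], max_length=3)
```

With the old behavior, a `max_length` smaller than `model_max_length` would silently be ignored and the padded tensor and ids could disagree in length.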
@@ -1,8 +1,10 @@
 import torch
 import torch.nn as nn
+from typing import Dict
 
 
 def enable_sequential_cpu_offload(module: nn.Module, device: str = "cuda"):
+    module = module.to("cpu")
     if len(list(module.children())) == 0:
         if len(list(module.parameters())) > 0 or len(list(module.buffers())) > 0:
             # leaf module with parameters or buffers
@@ -50,3 +52,24 @@ def add_cpu_offload_hook(module: nn.Module, device: str = "cuda", recurse: bool
     module.register_forward_pre_hook(_forward_pre_hook)
     module.register_forward_hook(_forward_hook)
     setattr(module, "_cpu_offload_enabled", True)
+
+
+def offload_model_to_dict(module: nn.Module) -> Dict[str, torch.Tensor]:
+    module = module.to("cpu")
+    offload_param_dict = {}
+    for name, param in module.named_parameters(recurse=True):
+        param.data = param.data.pin_memory()
+        offload_param_dict[name] = param.data
+    for name, buffer in module.named_buffers(recurse=True):
+        buffer.data = buffer.data.pin_memory()
+        offload_param_dict[name] = buffer.data
+    return offload_param_dict
+
+
+def restore_model_from_dict(module: nn.Module, offload_param_dict: Dict[str, torch.Tensor]):
+    for name, param in module.named_parameters(recurse=True):
+        if name in offload_param_dict:
+            param.data = offload_param_dict[name]
+    for name, buffer in module.named_buffers(recurse=True):
+        if name in offload_param_dict:
+            buffer.data = offload_param_dict[name]
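The new helpers work by swapping `Tensor.data` pointers: the CPU copies are stashed by name on offload and re-attached on restore without reallocating the module. A simplified sketch of that pattern (the `stash_to_cpu`/`restore` names here are stand-ins, not the library functions; `.pin_memory()` is omitted because it requires a CUDA-enabled build):

```python
import torch
import torch.nn as nn

def stash_to_cpu(module: nn.Module):
    # Move weights to CPU and remember each tensor by parameter name.
    # The real helper additionally pins the CPU memory so later host-to-device
    # copies can be asynchronous.
    module.to("cpu")
    return {name: p.data for name, p in module.named_parameters()}

def restore(module: nn.Module, stash):
    # Re-attach the stashed CPU tensors in place of whatever the module holds
    for name, p in module.named_parameters():
        if name in stash:
            p.data = stash[name]

m = nn.Linear(4, 4)
stash = stash_to_cpu(m)
m.weight.data = torch.zeros(4, 4)  # simulate the weights being clobbered
restore(m, stash)
```

Because only the `data` attribute is swapped, optimizer references, hooks, and the module graph itself remain untouched.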
@@ -304,12 +304,14 @@ def _worker_loop(
             if rank == 0:
                 queue_out.put(res)
             dist.barrier()
-    except Exception as e:
+    except Exception:
         import traceback
 
-        traceback.print_exc()
-        logger.error(f"Error in worker loop (rank {rank}): {e}")
-        queue_out.put(e)  # any exception caught in the worker will be raised to the main process
+        msg = traceback.format_exc()
+        err = RuntimeError(msg)
+        logger.error(f"Error in worker loop (rank {rank}): {msg}")
+        if rank == 0:
+            queue_out.put(err)  # any exception caught in the worker will be raised to the main process
     finally:
         del module
         torch.cuda.synchronize()
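The change above wraps the formatted traceback in a plain `RuntimeError` (strings always pickle cleanly across process boundaries, unlike arbitrary exception objects) and lets only rank 0 report, so the main process receives exactly one error. A single-process sketch of the pattern, using `queue.Queue` in place of the inter-process queue:

```python
import queue
import traceback

def worker(task, rank, queue_out):
    try:
        queue_out.put(task())
    except Exception:
        # format_exc() captures the full worker-side traceback as a string,
        # so no unpicklable exception state crosses the queue
        err = RuntimeError(traceback.format_exc())
        if rank == 0:  # only one rank reports to avoid duplicate errors
            queue_out.put(err)

q = queue.Queue()
worker(lambda: 1 / 0, rank=0, queue_out=q)
result = q.get()  # the main process re-raises whatever RuntimeError it receives
```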
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.4.1.dev1
+Version: 0.4.1.post2.dev1
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent
@@ -88,6 +88,51 @@ We will continuously update DiffSynth-Engine to support more models. (Wan2.2 LoR
 
 After the model is downloaded, load the model with the corresponding pipeline and perform inference.
 
+### Image Generation (Qwen-Image)
+
+The following code calls `QwenImagePipeline` to load the [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image) model and generate an image. Recommended resolutions are 928×1664, 1104×1472, 1328×1328, 1472×1104, and 1664×928, with a suggested `cfg_scale` of 4. If no `negative_prompt` is provided, it defaults to a single space character (not an empty string). For multi-GPU inference, only CFG parallelism (`parallelism=2`) is currently supported; further optimizations are underway.
+
+```python
+from diffsynth_engine import fetch_model, QwenImagePipeline, QwenImagePipelineConfig
+
+config = QwenImagePipelineConfig.basic_config(
+    model_path=fetch_model("MusePublic/Qwen-image", revision="v1", path="transformer/*.safetensors"),
+    encoder_path=fetch_model("MusePublic/Qwen-image", revision="v1", path="text_encoder/*.safetensors"),
+    vae_path=fetch_model("MusePublic/Qwen-image", revision="v1", path="vae/*.safetensors"),
+    parallelism=2,
+)
+pipe = QwenImagePipeline.from_pretrained(config)
+
+prompt = """
+一副典雅庄重的对联悬挂于厅堂之中,房间是个安静古典的中式布置,桌子上放着一些青花瓷,对联上左书“思涌如泉万类灵感皆可触”,右书“智启于问千机代码自天成”,横批“AI脑洞力”,字体飘逸灵动,兼具传统笔意与未来感。中间挂着一幅中国风的画作,内容是岳阳楼,云雾缭绕间似有数据流光隐现,古今交融,意境深远。
+"""
+negative_prompt = " "
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    cfg_scale=4.0,
+    width=1104,
+    height=1472,
+    num_inference_steps=30,
+    seed=42,
+)
+image.save("image.png")
+```
+
+Please note that if some necessary modules, like text encoders, are missing from a model repository, the pipeline will automatically download the required files.
+
+### Detailed Parameters (Qwen-Image)
+
+In the image generation pipeline `pipe`, we can use the following parameters for fine-grained control:
+
+* `prompt`: The prompt, used to describe the content of the generated image. It supports multiple languages (Chinese, English, Japanese, etc.), e.g., “一只猫” (Chinese), "a cat" (English), or "庭を走る猫" (Japanese).
+* `negative_prompt`: The negative prompt, used to describe content you do not want in the image, e.g., "ugly". It defaults to a single space character (not an empty string).
+* `cfg_scale`: The guidance scale for [Classifier-Free Guidance](https://arxiv.org/abs/2207.12598). A larger value usually results in stronger correlation between the text and the image but reduces the diversity of the generated content.
+* `height`: Image height.
+* `width`: Image width.
+* `num_inference_steps`: The number of inference steps. Generally, more steps lead to longer computation time but higher image quality.
+* `seed`: The random seed. A fixed seed ensures reproducible results.
+
 ### Image Generation
 
 The following code calls `FluxImagePipeline` to load the [MajicFlus](https://www.modelscope.cn/models/MAILAND/majicflus_v1/summary?version=v1.0) model and generate an image. To load other types of models, replace `FluxImagePipeline` and `FluxPipelineConfig` in the code with the corresponding pipeline and config.
@@ -109,16 +154,16 @@ Please note that if some necessary modules, like text encoders, are missing from
 
 In the image generation pipeline `pipe`, we can use the following parameters for fine-grained control:
 
-* `prompt`: The prompt, used to describe the content of the generated image, e.g., "a cat".
-* `negative_prompt`: The negative prompt, used to describe content you do not want in the image, e.g., "ugly".
-* `cfg_scale`: The guidance scale for [Classifier-Free Guidance](https://arxiv.org/abs/2207.12598). A larger value usually results in stronger correlation between the text and the image but reduces the diversity of the generated content.
-* `clip_skip`: The number of layers to skip in the [CLIP](https://arxiv.org/abs/2103.00020) text encoder. The more layers skipped, the lower the text-image correlation, but this can lead to interesting variations in the generated content.
-* `input_image`: Input image, used for image-to-image generation.
-* `denoising_strength`: The denoising strength. When set to 1, a full generation process is performed. When set to a value between 0 and 1, some information from the input image is preserved.
-* `height`: Image height.
-* `width`: Image width.
-* `num_inference_steps`: The number of inference steps. Generally, more steps lead to longer computation time but higher image quality.
-* `seed`: The random seed. A fixed seed ensures reproducible results.
+* `prompt`: The prompt, used to describe the content of the generated image, e.g., "a cat".
+* `negative_prompt`: The negative prompt, used to describe content you do not want in the image, e.g., "ugly".
+* `cfg_scale`: The guidance scale for [Classifier-Free Guidance](https://arxiv.org/abs/2207.12598). A larger value usually results in stronger correlation between the text and the image but reduces the diversity of the generated content.
+* `clip_skip`: The number of layers to skip in the [CLIP](https://arxiv.org/abs/2103.00020) text encoder. The more layers skipped, the lower the text-image correlation, but this can lead to interesting variations in the generated content.
+* `input_image`: Input image, used for image-to-image generation.
+* `denoising_strength`: The denoising strength. When set to 1, a full generation process is performed. When set to a value between 0 and 1, some information from the input image is preserved.
+* `height`: Image height.
+* `width`: Image width.
+* `num_inference_steps`: The number of inference steps. Generally, more steps lead to longer computation time but higher image quality.
+* `seed`: The random seed. A fixed seed ensures reproducible results.
 
 #### Loading LoRA
@@ -177,17 +222,17 @@ save_video(video, "video.mp4")
 
 In the video generation pipeline `pipe`, we can use the following parameters for fine-grained control:
 
-* `prompt`: The prompt, used to describe the content of the generated video, e.g., "a cat".
-* `negative_prompt`: The negative prompt, used to describe content you do not want in the video, e.g., "ugly".
-* `cfg_scale`: The guidance scale for [Classifier-Free Guidance](https://arxiv.org/abs/2207.12598). A larger value usually results in stronger correlation between the text and the video but reduces the diversity of the generated content.
-* `input_image`: Input image, only effective in image-to-video models, such as [Wan-AI/Wan2.1-I2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P).
-* `input_video`: Input video, used for video-to-video generation.
-* `denoising_strength`: The denoising strength. When set to 1, a full generation process is performed. When set to a value between 0 and 1, some information from the input video is preserved.
-* `height`: Video frame height.
-* `width`: Video frame width.
-* `num_frames`: Number of video frames.
-* `num_inference_steps`: The number of inference steps. Generally, more steps lead to longer computation time but higher video quality.
-* `seed`: The random seed. A fixed seed ensures reproducible results.
+* `prompt`: The prompt, used to describe the content of the generated video, e.g., "a cat".
+* `negative_prompt`: The negative prompt, used to describe content you do not want in the video, e.g., "ugly".
+* `cfg_scale`: The guidance scale for [Classifier-Free Guidance](https://arxiv.org/abs/2207.12598). A larger value usually results in stronger correlation between the text and the video but reduces the diversity of the generated content.
+* `input_image`: Input image, only effective in image-to-video models, such as [Wan-AI/Wan2.1-I2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P).
+* `input_video`: Input video, used for video-to-video generation.
+* `denoising_strength`: The denoising strength. When set to 1, a full generation process is performed. When set to a value between 0 and 1, some information from the input video is preserved.
+* `height`: Video frame height.
+* `width`: Video frame width.
+* `num_frames`: Number of video frames.
+* `num_inference_steps`: The number of inference steps. Generally, more steps lead to longer computation time but higher video quality.
+* `seed`: The random seed. A fixed seed ensures reproducible results.
 
 #### Loading LoRA