diffsynth-engine 0.3.6.dev2__tar.gz → 0.3.6.dev5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/PKG-INFO +2 -1
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/wan/wan_dit.py +3 -2
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/wan/wan_vae.py +14 -15
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/pipelines/base.py +14 -8
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/pipelines/controlnet_helper.py +1 -1
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/pipelines/flux_image.py +11 -2
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/download.py +20 -15
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine.egg-info/PKG-INFO +2 -1
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine.egg-info/requires.txt +1 -0
- diffsynth_engine-0.3.6.dev5/docs/tutorial.md +241 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/docs/tutorial_zh.md +25 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/pyproject.toml +2 -1
- diffsynth_engine-0.3.6.dev2/docs/tutorial.md +0 -1
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/.gitignore +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/.pre-commit-config.yaml +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/LICENSE +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/MANIFEST.in +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/README.md +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/assets/dingtalk.png +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/assets/showcase.jpeg +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/flow_match/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_beta.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_ddim.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/beta.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/ddim.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/exponential.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/karras.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/linear.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/sgm_uniform.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/flow_match/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/stable_diffusion/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/stable_diffusion/brownian_tree.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/stable_diffusion/ddpm.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/stable_diffusion/deis.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m_sde.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_3m_sde.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/stable_diffusion/epsilon.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/stable_diffusion/euler.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/sampler/stable_diffusion/euler_ancestral.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/components/vae.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/flux/flux_dit.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/flux/flux_text_encoder.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/flux/flux_vae.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/sd/sd_text_encoder.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/sd/sd_unet.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/sd3/sd3_dit.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/sd3/sd3_text_encoder.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/sdxl/sdxl_text_encoder.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/sdxl/sdxl_unet.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/wan/dit/1.3b-t2v.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/wan/dit/14b-flf2v.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/wan/dit/14b-i2v.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/models/wan/dit/14b-t2v.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/merges.txt +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/special_tokens_map.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/tokenizer_config.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/vocab.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/special_tokens_map.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/spiece.model +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer_config.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/merges.txt +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/special_tokens_map.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/tokenizer_config.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/vocab.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/merges.txt +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/special_tokens_map.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/tokenizer_config.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/vocab.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/special_tokens_map.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/spiece.model +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/kernels/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/base.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/basic/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/basic/attention.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/basic/lora.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/basic/relative_position_emb.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/basic/timestep.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/basic/transformer_helper.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/basic/unet_helper.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/flux/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/flux/flux_controlnet.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/flux/flux_dit.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/flux/flux_ipadapter.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/flux/flux_redux.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/flux/flux_text_encoder.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/flux/flux_vae.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sd/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sd/sd_controlnet.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sd/sd_text_encoder.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sd/sd_unet.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sd/sd_vae.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sd3/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sd3/sd3_dit.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sd3/sd3_text_encoder.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sd3/sd3_vae.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sdxl/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sdxl/sdxl_controlnet.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sdxl/sdxl_text_encoder.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sdxl/sdxl_unet.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/sdxl/sdxl_vae.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/text_encoder/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/text_encoder/clip.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/text_encoder/siglip.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/text_encoder/t5.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/utils.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/vae/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/vae/vae.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/wan/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/wan/wan_image_encoder.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/wan/wan_text_encoder.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/pipelines/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/pipelines/sd_image.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/pipelines/sdxl_image.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/pipelines/wan_video.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/processor/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/processor/canny_processor.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/processor/depth_processor.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/tokenizers/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/tokenizers/base.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/tokenizers/clip.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/tokenizers/t5.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/tokenizers/wan.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/tools/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/tools/flux_inpainting_tool.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/tools/flux_outpainting_tool.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/tools/flux_reference_tool.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/tools/flux_replace_tool.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/__init__.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/constants.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/env.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/flag.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/fp8_linear.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/gguf.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/image.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/loader.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/lock.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/logging.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/offload.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/onnx.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/parallel.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/platform.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/prompt.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/video.py +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine.egg-info/SOURCES.txt +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine.egg-info/dependency_links.txt +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine.egg-info/top_level.txt +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/setup.cfg +0 -0
- {diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/setup.py +0 -0
{diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.3.6.dev2
+Version: 0.3.6.dev5
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent
@@ -24,6 +24,7 @@ Requires-Dist: pillow
 Requires-Dist: imageio[ffmpeg]
 Requires-Dist: yunchang; sys_platform == "linux"
 Requires-Dist: onnxruntime
+Requires-Dist: opencv-python
 Provides-Extra: dev
 Requires-Dist: diffusers==0.31.0; extra == "dev"
 Requires-Dist: transformers==4.45.2; extra == "dev"
{diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/wan/wan_dit.py
RENAMED
@@ -334,9 +334,10 @@ class WanDiT(PreTrainedModel):
         clip_feature: Optional[torch.Tensor] = None,  # clip_vision_encoder(img)
         y: Optional[torch.Tensor] = None,  # vae_encoder(img)
     ):
+        use_cfg = x.shape[0] > 1
         with (
             gguf_inference(),
-            cfg_parallel((x, context, timestep, clip_feature, y)),
+            cfg_parallel((x, context, timestep, clip_feature, y), use_cfg=use_cfg),
         ):
             t = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, timestep))
             t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
@@ -365,7 +366,7 @@ class WanDiT(PreTrainedModel):
         x = self.head(x, t)
         (x,) = sequence_parallel_unshard((x,), seq_dims=(1,), seq_lens=(f * h * w,))
         x = self.unpatchify(x, (f, h, w))
-        (x,) = cfg_parallel_unshard((x,))
+        (x,) = cfg_parallel_unshard((x,), use_cfg=use_cfg)
         return x

     @classmethod
{diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/models/wan/wan_vae.py
RENAMED
@@ -515,7 +515,7 @@ class WanVideoVAEStateDictConverter(StateDictConverter):
 class WanVideoVAE(PreTrainedModel):
     converter = WanVideoVAEStateDictConverter()

-    def __init__(self, z_dim=16, …
+    def __init__(self, z_dim=16, device: str = "cuda:0", dtype: torch.dtype = torch.float32):
         super().__init__()

         mean = [
@@ -561,12 +561,11 @@ class WanVideoVAE(PreTrainedModel):
         # init model
         self.model = VideoVAE(z_dim=z_dim).eval().requires_grad_(False)
         self.upsampling_factor = 8
-        self.parallelism = parallelism

     @classmethod
-    def from_state_dict(cls, state_dict, …
+    def from_state_dict(cls, state_dict, device="cuda:0", dtype=torch.float32) -> "WanVideoVAE":
         with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, …
+            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model
@@ -607,7 +606,7 @@ class WanVideoVAE(PreTrainedModel):
                 h_, w_ = h + size_h, w + size_w
                 tasks.append((h, h_, w, w_))

-        data_device = device if …
+        data_device = device if dist.is_initialized() else "cpu"
         computation_device = device

         out_T = T * 4 - 3
@@ -622,9 +621,9 @@ class WanVideoVAE(PreTrainedModel):
             device=data_device,
         )

-        …
-        for i, (h, h_, w, w_) in enumerate(tqdm(tasks, desc="VAE DECODING", disable=…
-            if …
+        hide_progress = dist.is_initialized() and dist.get_rank() != 0
+        for i, (h, h_, w, w_) in enumerate(tqdm(tasks, desc="VAE DECODING", disable=hide_progress)):
+            if dist.is_initialized() and (i % dist.get_world_size() != dist.get_rank()):
                 continue
             hidden_states_batch = hidden_states[:, :, :, h:h_, w:w_].to(computation_device)
             hidden_states_batch = self.model.decode(hidden_states_batch, self.scale).to(data_device)
@@ -654,11 +653,11 @@ class WanVideoVAE(PreTrainedModel):
                 target_h : target_h + hidden_states_batch.shape[3],
                 target_w : target_w + hidden_states_batch.shape[4],
             ] += mask
-            if progress_callback is not None and not …
+            if progress_callback is not None and not hide_progress:
                 progress_callback(i + 1, len(tasks), "VAE DECODING")
-        if progress_callback is not None and not …
+        if progress_callback is not None and not hide_progress:
             progress_callback(len(tasks), len(tasks), "VAE DECODING")
-        if …
+        if dist.is_initialized():
             dist.all_reduce(values)
             dist.all_reduce(weight)
         values = values / weight
@@ -681,7 +680,7 @@ class WanVideoVAE(PreTrainedModel):
             h_, w_ = h + size_h, w + size_w
             tasks.append((h, h_, w, w_))

-        data_device = device if …
+        data_device = device if dist.is_initialized() else "cpu"
         computation_device = device

         out_T = (T + 3) // 4
@@ -696,9 +695,9 @@ class WanVideoVAE(PreTrainedModel):
             device=data_device,
         )

-        hide_progress_bar = …
+        hide_progress_bar = dist.is_initialized() and dist.get_rank() != 0
         for i, (h, h_, w, w_) in enumerate(tqdm(tasks, desc="VAE ENCODING", disable=hide_progress_bar)):
-            if …
+            if dist.is_initialized() and (i % dist.get_world_size() != dist.get_rank()):
                 continue
             hidden_states_batch = video[:, :, :, h:h_, w:w_].to(computation_device)
             hidden_states_batch = self.model.encode(hidden_states_batch, self.scale).to(data_device)
@@ -732,7 +731,7 @@ class WanVideoVAE(PreTrainedModel):
             progress_callback(i + 1, len(tasks), "VAE ENCODING")
         if progress_callback is not None and not hide_progress_bar:
             progress_callback(len(tasks), len(tasks), "VAE ENCODING")
-        if …
+        if dist.is_initialized():
             dist.all_reduce(values)
             dist.all_reduce(weight)
         values = values / weight
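The recurring edit in this file replaces a stored `parallelism` attribute with runtime `torch.distributed` checks: each rank processes every `world_size`-th tile, and the per-rank partial results are merged with `all_reduce`. A minimal standalone sketch of that pattern (with a hypothetical `process_tile` standing in for the real encode/decode call):

```python
import torch
import torch.distributed as dist

def run_tiled(tasks, out: torch.Tensor, process_tile):
    # Rank r handles tiles i where i % world_size == r; the other tiles stay
    # zero locally and are filled in by the all_reduce below.
    for i, task in enumerate(tasks):
        if dist.is_initialized() and (i % dist.get_world_size() != dist.get_rank()):
            continue
        out += process_tile(task)  # hypothetical per-tile computation
    if dist.is_initialized():
        dist.all_reduce(out)  # sum the per-rank partial results
    return out
```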
{diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/pipelines/base.py
RENAMED
@@ -91,15 +91,21 @@ class BasePipeline:

     @staticmethod
     def load_model_checkpoint(
-        checkpoint_path: str, device: str = "cpu", dtype: torch.dtype = torch.float16
+        checkpoint_path: str | List[str], device: str = "cpu", dtype: torch.dtype = torch.float16
     ) -> Dict[str, torch.Tensor]:
-        if …
-        …
-        …
-        …
-        …
-        …
-        …
+        if isinstance(checkpoint_path, str):
+            checkpoint_path = [checkpoint_path]
+        state_dict = {}
+        for path in checkpoint_path:
+            if not os.path.isfile(path):
+                raise FileNotFoundError(f"{path} is not a file")
+            elif path.endswith(".safetensors"):
+                state_dict.update(**load_file(path, device=device))
+            elif path.endswith(".gguf"):
+                state_dict.update(**load_gguf_checkpoint(path, device=device, dtype=dtype))
+            else:
+                raise ValueError(f"{path} is not a .safetensors or .gguf file")
+        return state_dict

     @staticmethod
     def validate_image_size(
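In effect, `load_model_checkpoint` now accepts either a single checkpoint path or a list of shard paths and merges them into one state dict. A sketch of the call (hypothetical file names):

```python
from diffsynth_engine.pipelines.base import BasePipeline

# Single file, as before:
state_dict = BasePipeline.load_model_checkpoint("model.safetensors")

# New in this version: sharded checkpoints merged into a single state dict.
state_dict = BasePipeline.load_model_checkpoint([
    "model-00001-of-00002.safetensors",
    "model-00002-of-00002.safetensors",
])
```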
{diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/pipelines/controlnet_helper.py
RENAMED
@@ -8,8 +8,8 @@ ImageType = Union[Image.Image, torch.Tensor, List[Image.Image], List[torch.Tensor]]

 @dataclass
 class ControlNetParams:
-    scale: float
     image: ImageType
+    scale: float = 1.0
     model: Optional[nn.Module] = None
     mask: Optional[ImageType] = None
     control_start: float = 0
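With `scale` given a default, `ControlNetParams` can now be constructed from just an image. A sketch (assuming a PIL control image on disk):

```python
from PIL import Image
from diffsynth_engine.pipelines.controlnet_helper import ControlNetParams

params = ControlNetParams(image=Image.open("control.png"))  # scale defaults to 1.0
```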
{diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/pipelines/flux_image.py
RENAMED
@@ -419,9 +419,10 @@ class ControlType(Enum):
     normal = "normal"
     bfl_control = "bfl_control"
     bfl_fill = "bfl_fill"
+    bfl_kontext = "bfl_kontext"

     def get_in_channel(self):
-        if self == ControlType.normal:
+        if self in [ControlType.normal, ControlType.bfl_kontext]:
             return 64
         elif self == ControlType.bfl_control:
             return 128
@@ -764,9 +765,15 @@ class FluxImagePipeline(BasePipeline):
         current_step: int,
         total_step: int,
     ):
+        origin_latents_shape = latents.shape
         if self.control_type != ControlType.normal:
             controlnet_param = controlnet_params[0]
-            …
+            if self.control_type == ControlType.bfl_kontext:
+                latents = torch.cat((latents, controlnet_param.image * controlnet_param.scale), dim=2)
+                image_ids = image_ids.repeat(1, 2, 1)
+                image_ids[:, image_ids.shape[1] // 2 :, 0] += 1
+            else:
+                latents = torch.cat((latents, controlnet_param.image * controlnet_param.scale), dim=1)
             latents = latents.to(self.dtype)
             controlnet_params = []

@@ -797,6 +804,8 @@ class FluxImagePipeline(BasePipeline):
             controlnet_double_block_output=double_block_output,
             controlnet_single_block_output=single_block_output,
         )
+        if self.control_type == ControlType.bfl_kontext:
+            noise_pred = noise_pred[:, :, : origin_latents_shape[2], : origin_latents_shape[3]]
         return noise_pred

     def prepare_latents(
{diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/utils/download.py
RENAMED
@@ -2,10 +2,11 @@ import os
 import shutil
 import tqdm
 import tempfile
-from typing import Optional
+from typing import List, Optional
 from pathlib import Path
 from urllib.parse import urlparse
 import requests
+import glob

 from modelscope import snapshot_download
 from modelscope.hub.api import HubApi
@@ -23,11 +24,11 @@ MODEL_SOURCES = ["modelscope", "civitai"]
 def fetch_model(
     model_uri: str,
     revision: Optional[str] = None,
-    path: Optional[str] = None,
+    path: Optional[str | List[str]] = None,
     access_token: Optional[str] = None,
     source: str = "modelscope",
-    fetch_safetensors: bool = True,
-) -> str:
+    fetch_safetensors: bool = True,  # TODO: supports other formats like GGUF
+) -> str | List[str]:
     if source == "modelscope":
         return fetch_modelscope_model(model_uri, revision, path, access_token, fetch_safetensors)
     if source == "civitai":
@@ -38,7 +39,7 @@ def fetch_model(
 def fetch_modelscope_model(
     model_id: str,
     revision: Optional[str] = None,
-    path: Optional[str] = None,
+    path: Optional[str | List[str]] = None,
     access_token: Optional[str] = None,
     fetch_safetensors: bool = True,
 ) -> str:
@@ -52,12 +53,15 @@
     directory = os.path.join(DIFFSYNTH_CACHE, "modelscope", model_id, revision if revision else "__version")
     dirpath = snapshot_download(model_id, revision=revision, local_dir=directory, allow_patterns=path)

-    if path is not None:
-        path = os.path.join(dirpath, path)
+    if isinstance(path, str):
+        path = glob.glob(os.path.join(dirpath, path))
+        path = path[0] if len(path) == 1 else path
+    elif isinstance(path, list):
+        path = [os.path.join(dirpath, p) for p in path]
     else:
         path = dirpath

-    if os.path.isdir(path) and fetch_safetensors:
+    if isinstance(path, str) and os.path.isdir(path) and fetch_safetensors:
         return _fetch_safetensors(path)
     return path

@@ -122,16 +126,17 @@ def ensure_directory_exists(filename: str):
     Path(filename).parent.mkdir(parents=True, exist_ok=True)


-def _fetch_safetensors(dirpath: str) -> str:
+def _fetch_safetensors(dirpath: str) -> str | List[str]:
     all_safetensors = []
     for filename in os.listdir(dirpath):
         if filename.endswith(".safetensors"):
             all_safetensors.append(os.path.join(dirpath, filename))
-    if len(all_safetensors) == 1:
-        logger.info(f"Fetch safetensors file {all_safetensors[0]}")
-        return all_safetensors[0]
-    elif len(all_safetensors) == 0:
+    if len(all_safetensors) == 0:
         logger.error(f"No safetensors file found in {dirpath}")
+        return dirpath
+    elif len(all_safetensors) == 1:
+        all_safetensors = all_safetensors[0]
+        logger.info(f"Fetch safetensors file {all_safetensors}")
     else:
-        logger.…
-        return …
+        logger.info(f"Fetch safetensors files {all_safetensors}")
+    return all_safetensors
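The net effect on `fetch_model`: `path` may now be a list of files or a glob pattern, and the return value is a single local path or a list of paths accordingly. A sketch (a hypothetical subset of shard names):

```python
from diffsynth_engine import fetch_model

# A list of shard files resolves to a list of local paths; a glob pattern
# resolves to a str for one match or a list for several matches.
paths = fetch_model("Wan-AI/Wan2.1-T2V-14B", path=[
    "diffusion_pytorch_model-00001-of-00006.safetensors",
    "diffusion_pytorch_model-00002-of-00006.safetensors",
])
```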
{diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine.egg-info/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.3.6.dev2
+Version: 0.3.6.dev5
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent
@@ -24,6 +24,7 @@ Requires-Dist: pillow
 Requires-Dist: imageio[ffmpeg]
 Requires-Dist: yunchang; sys_platform == "linux"
 Requires-Dist: onnxruntime
+Requires-Dist: opencv-python
 Provides-Extra: dev
 Requires-Dist: diffusers==0.31.0; extra == "dev"
 Requires-Dist: transformers==4.45.2; extra == "dev"
diffsynth_engine-0.3.6.dev5/docs/tutorial.md
ADDED
@@ -0,0 +1,241 @@
# DiffSynth-Engine User Guide

## Installation

Before using DiffSynth-Engine, please ensure your device meets the following requirements:

* NVIDIA GPU with CUDA Compute Capability 8.6+ (e.g., RTX 50 Series, RTX 40 Series, RTX 30 Series, see [NVIDIA documentation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities) for details) or Apple Silicon M-series chips.

Python environment requirements: Python 3.10+.

Use `pip3` to install DiffSynth-Engine from PyPI:

```shell
pip3 install diffsynth-engine
```

DiffSynth-Engine also supports installation from source, which provides access to the latest features but might come with stability issues. We recommend installing the stable version via `pip3`.

```shell
git clone https://github.com/modelscope/diffsynth-engine.git && cd diffsynth-engine
pip3 install -e .
```

## Model Download

DiffSynth-Engine supports loading models from the [ModelScope Model Hub](https://www.modelscope.cn/aigc/models) by model ID. For example, on the [MajicFlus model page](https://www.modelscope.cn/models/MAILAND/majicflus_v1/summary?version=v1.0), we can find the model ID and the corresponding model filename in the image below.



Next, download the MajicFlus model with the following code.

```python
from diffsynth_engine import fetch_model

model_path = fetch_model("MAILAND/majicflus_v1", path="majicflus_v134.safetensors")
```



For sharded models, specify multiple files using the `path` parameter.

```python
from diffsynth_engine import fetch_model

model_path = fetch_model("Wan-AI/Wan2.1-T2V-14B", path=[
    "diffusion_pytorch_model-00001-of-00006.safetensors",
    "diffusion_pytorch_model-00002-of-00006.safetensors",
    "diffusion_pytorch_model-00003-of-00006.safetensors",
    "diffusion_pytorch_model-00004-of-00006.safetensors",
    "diffusion_pytorch_model-00005-of-00006.safetensors",
    "diffusion_pytorch_model-00006-of-00006.safetensors",
])
```

It also supports using wildcards to match multiple files.

```python
from diffsynth_engine import fetch_model

model_path = fetch_model("Wan-AI/Wan2.1-T2V-14B", path="diffusion_pytorch_model*.safetensors")
```

The file path `model_path` returned by the `fetch_model` function is the path to the downloaded file(s).
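Because `path` may match several files, `model_path` is not always a single string; per the `fetch_model` behavior in this release, a single match yields a `str` and multiple matches yield a list. A quick check:

```python
from diffsynth_engine import fetch_model

model_path = fetch_model("Wan-AI/Wan2.1-T2V-14B", path="diffusion_pytorch_model*.safetensors")
print(type(model_path))  # str for a single match, list of str for multiple matches
```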
## Model Types

Diffusion models come in a wide variety of architectures. Each model is loaded and run for inference by a corresponding pipeline. The model types we currently support include:

| Model Architecture | Example | Pipeline |
| :----------------- | :----------------------------------------------------------- | :-------------------- |
| SD1.5 | [DreamShaper](https://www.modelscope.cn/models/MusePublic/DreamShaper_SD_1_5) | `SDImagePipeline` |
| SDXL | [RealVisXL](https://www.modelscope.cn/models/MusePublic/42_ckpt_SD_XL) | `SDXLImagePipeline` |
| FLUX | [MajicFlus](https://www.modelscope.cn/models/MAILAND/majicflus_v1/summary?version=v1.0) | `FluxImagePipeline` |
| Wan2.1 | [Wan2.1-T2V-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B) | `WanVideoPipeline` |
| SD1.5 LoRA | [Detail Tweaker](https://www.modelscope.cn/models/MusePublic/Detail_Tweaker_LoRA_xijietiaozheng_LoRA_SD_1_5) | `SDImagePipeline` |
| SDXL LoRA | [Aesthetic Anime](https://www.modelscope.cn/models/MusePublic/100_lora_SD_XL) | `SDXLImagePipeline` |
| FLUX LoRA | [ArtAug](https://www.modelscope.cn/models/DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1) | `FluxImagePipeline` |
| Wan2.1 LoRA | [Highres-fix](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-lora-highresfix-v1) | `WanVideoPipeline` |

Among these, SD1.5, SDXL, and FLUX are base models for image generation, while Wan2.1 is a base model for video generation. Base models can generate content independently. SD1.5 LoRA, SDXL LoRA, FLUX LoRA, and Wan2.1 LoRA are [LoRA](https://arxiv.org/abs/2106.09685) models. LoRA models are trained as "additional branches" on top of base models to enhance specific capabilities. They must be combined with a base model to be used for generation.

We will continuously update DiffSynth-Engine to support more models.

## Model Inference

After the model is downloaded, load the model with the corresponding pipeline and perform inference.

### Image Generation

The following code calls `FluxImagePipeline` to load the [MajicFlus](https://www.modelscope.cn/models/MAILAND/majicflus_v1/summary?version=v1.0) model and generate an image. To load other types of models, replace `FluxImagePipeline` in the code with the corresponding pipeline.

```python
from diffsynth_engine import fetch_model, FluxImagePipeline

model_path = fetch_model("MAILAND/majicflus_v1", path="majicflus_v134.safetensors")
pipe = FluxImagePipeline.from_pretrained(model_path, device='cuda:0')
image = pipe(prompt="a cat")
image.save("image.png")
```

Please note that if some necessary modules, like text encoders, are missing from a model repository, the pipeline will automatically download the required files.

#### Detailed Parameters

In the image generation pipeline `pipe`, we can use the following parameters for fine-grained control (a combined example follows the list):

* `prompt`: The prompt, used to describe the content of the generated image, e.g., "a cat".
* `negative_prompt`: The negative prompt, used to describe content you do not want in the image, e.g., "ugly".
* `cfg_scale`: The guidance scale for [Classifier-Free Guidance](https://arxiv.org/abs/2207.12598). A larger value usually results in stronger correlation between the text and the image but reduces the diversity of the generated content.
* `clip_skip`: The number of layers to skip in the [CLIP](https://arxiv.org/abs/2103.00020) text encoder. The more layers skipped, the lower the text-image correlation, but this can lead to interesting variations in the generated content.
* `input_image`: Input image, used for image-to-image generation.
* `mask_image`: Mask image, used for image inpainting.
* `denoising_strength`: The denoising strength. When set to 1, a full generation process is performed. When set to a value between 0 and 1, some information from the input image is preserved.
* `height`: Image height.
* `width`: Image width.
* `num_inference_steps`: The number of inference steps. Generally, more steps lead to longer computation time but higher image quality.
* `tiled`: Whether to enable tiled processing for the VAE. This option is disabled by default. Enabling it can reduce VRAM usage.
* `tile_size`: The window size for tiled VAE processing.
* `tile_stride`: The stride for tiled VAE processing.
* `seed`: The random seed. A fixed seed ensures reproducible results.
* `progress_bar_cmd`: The progress bar module. [`tqdm`](https://github.com/tqdm/tqdm) is enabled by default. To disable the progress bar, set it to `lambda x: x`.
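As an illustrative combination of these parameters, here is a sketch of an image-to-image call; it assumes a local `input.png` to edit, and the specific values are illustrative rather than recommended defaults:

```python
from PIL import Image
from diffsynth_engine import fetch_model, FluxImagePipeline

model_path = fetch_model("MAILAND/majicflus_v1", path="majicflus_v134.safetensors")
pipe = FluxImagePipeline.from_pretrained(model_path, device='cuda:0')
image = pipe(
    prompt="a cat",
    negative_prompt="ugly",
    input_image=Image.open("input.png").convert("RGB"),  # image-to-image source
    denoising_strength=0.6,  # keep some of the input image's information
    height=1024,
    width=1024,
    num_inference_steps=30,
    seed=42,     # fixed seed for reproducibility
    tiled=True,  # tiled VAE to reduce VRAM usage
)
image.save("image.png")
```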
#### Loading LoRA

We support loading LoRA on top of the base model. For example, the following code loads a [Cheongsam LoRA](https://www.modelscope.cn/models/DonRat/MAJICFLUS_SuperChinesestyleheongsam) based on the [MajicFlus](https://www.modelscope.cn/models/MAILAND/majicflus_v1/summary?version=v1.0) model to generate images of cheongsams, which the base model might struggle to create.

```python
from diffsynth_engine import fetch_model, FluxImagePipeline

model_path = fetch_model("MAILAND/majicflus_v1", path="majicflus_v134.safetensors")
lora_path = fetch_model("DonRat/MAJICFLUS_SuperChinesestyleheongsam", path="麦橘超国风旗袍.safetensors")

pipe = FluxImagePipeline.from_pretrained(model_path, device='cuda:0')
pipe.load_lora(path=lora_path, scale=1.0)
image = pipe(prompt="a girl, qipao")
image.save("image.png")
```

The `scale` parameter in the code controls the degree of influence the LoRA model has on the base model. A value of 1.0 is usually sufficient. When set to a value greater than 1, the LoRA's effect will be stronger, but this may cause artifacts or degradation in the image content. Please adjust this parameter with caution.

#### VRAM Optimization

DiffSynth-Engine supports various levels of VRAM optimization, allowing models to run on GPUs with low VRAM. For example, at `bfloat16` precision and with no optimization options enabled, the FLUX model requires 35.84GB of VRAM for inference. By adding the parameter `offload_mode="cpu_offload"`, the VRAM requirement drops to 22.83GB. Furthermore, using `offload_mode="sequential_cpu_offload"` reduces the requirement to just 4.30GB, although this comes with an increase in inference time.

```python
from diffsynth_engine import fetch_model, FluxImagePipeline

model_path = fetch_model("MAILAND/majicflus_v1", path="majicflus_v134.safetensors")
pipe = FluxImagePipeline.from_pretrained(model_path, offload_mode="sequential_cpu_offload")
image = pipe(prompt="a cat")
image.save("image.png")
```
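The intermediate `cpu_offload` mode mentioned above is selected the same way; only the `offload_mode` string changes:

```python
from diffsynth_engine import fetch_model, FluxImagePipeline

model_path = fetch_model("MAILAND/majicflus_v1", path="majicflus_v134.safetensors")
pipe = FluxImagePipeline.from_pretrained(model_path, offload_mode="cpu_offload")  # ~22.83GB VRAM
```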
### Video Generation

DiffSynth-Engine also supports video generation. The following code loads the [Wan Video Generation Model](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B) and generates a video.

```python
from diffsynth_engine.pipelines.wan_video import WanVideoPipeline, WanModelConfig
from diffsynth_engine.utils.video import save_video
from diffsynth_engine import fetch_model

config = WanModelConfig(
    model_path=fetch_model("MusePublic/wan2.1-1.3b", path="dit.safetensors"),
    vae_path=fetch_model("muse/wan2.1-vae", path="vae.safetensors"),
    t5_path=fetch_model("muse/wan2.1-umt5", path="umt5.safetensors"),
)
pipe = WanVideoPipeline.from_pretrained(config, device="cuda")
# The prompt translates to: "A lively puppy runs quickly on a green lawn. The puppy has brownish-yellow fur,
# its two ears are perked up, and it looks focused and cheerful. Sunlight shines on it,
# making its fur look especially soft and shiny."
video = pipe(prompt="一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。")
save_video(video, "video.mp4")
```

#### Detailed Parameters

In the video generation pipeline `pipe`, we can use the following parameters for fine-grained control (a combined example follows the list):

* `prompt`: The prompt, used to describe the content of the generated video, e.g., "a cat".
* `negative_prompt`: The negative prompt, used to describe content you do not want in the video, e.g., "ugly".
* `cfg_scale`: The guidance scale for [Classifier-Free Guidance](https://arxiv.org/abs/2207.12598). A larger value usually results in stronger correlation between the text and the video but reduces the diversity of the generated content.
* `input_image`: Input image, only effective in image-to-video models, such as [Wan-AI/Wan2.1-I2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P).
* `input_video`: Input video, used for video-to-video generation.
* `denoising_strength`: The denoising strength. When set to 1, a full generation process is performed. When set to a value between 0 and 1, some information from the input video is preserved.
* `height`: Video frame height.
* `width`: Video frame width.
* `num_frames`: Number of video frames.
* `num_inference_steps`: The number of inference steps. Generally, more steps lead to longer computation time but higher video quality.
* `tiled`: Whether to enable tiled processing for the VAE. This option is disabled by default. Enabling it can reduce VRAM usage.
* `tile_size`: The window size for tiled VAE processing.
* `tile_stride`: The stride for tiled VAE processing.
* `seed`: The random seed. A fixed seed ensures reproducible results.
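An illustrative combination of these parameters, reusing the `pipe` and `save_video` from the example above; the resolution and frame count here are placeholder values, not model-specific recommendations:

```python
video = pipe(
    prompt="a cat",
    negative_prompt="ugly",
    height=480,
    width=832,
    num_frames=81,
    num_inference_steps=30,
    seed=42,
    tiled=True,  # tiled VAE to reduce VRAM usage
)
save_video(video, "video.mp4")
```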
#### Loading LoRA

We support loading LoRA on top of the base model. For example, the following code loads a [High-Resolution Fix LoRA](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-lora-highresfix-v1) on top of the [Wan2.1-T2V-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B) model to improve the generation quality at high resolutions.

```python
from diffsynth_engine.pipelines.wan_video import WanVideoPipeline, WanModelConfig
from diffsynth_engine.utils.video import save_video
from diffsynth_engine import fetch_model

config = WanModelConfig(
    model_path=fetch_model("MusePublic/wan2.1-1.3b", path="dit.safetensors"),
    vae_path=fetch_model("muse/wan2.1-vae", path="vae.safetensors"),
    t5_path=fetch_model("muse/wan2.1-umt5", path="umt5.safetensors"),
)
lora_path = fetch_model("DiffSynth-Studio/Wan2.1-1.3b-lora-highresfix-v1", path="model.safetensors")
pipe = WanVideoPipeline.from_pretrained(config, device="cuda")
pipe.load_lora(path=lora_path, scale=1.0)
# The prompt translates to: "A lively puppy runs quickly on a green lawn. The puppy has brownish-yellow fur,
# its two ears are perked up, and it looks focused and cheerful. Sunlight shines on it,
# making its fur look especially soft and shiny."
video = pipe(prompt="一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。")
save_video(video, "video.mp4")
```

The `scale` parameter in the code controls the degree of influence the LoRA model has on the base model. A value of 1.0 is usually sufficient. When set to a value greater than 1, the LoRA's effect will be stronger, but this may cause artifacts or degradation in the video content. Please adjust this parameter with caution.

#### Multi-GPU Parallelism

We support multi-GPU parallel inference of the Wan2.1 model for faster video generation. Add the parameters `parallelism=4` (the number of GPUs to use) and `use_cfg_parallel=True` to the code to enable parallelism.

```python
from diffsynth_engine.pipelines.wan_video import WanVideoPipeline, WanModelConfig
from diffsynth_engine.utils.video import save_video
from diffsynth_engine import fetch_model

config = WanModelConfig(
    model_path=fetch_model("MusePublic/wan2.1-1.3b", path="dit.safetensors"),
    vae_path=fetch_model("muse/wan2.1-vae", path="vae.safetensors"),
    t5_path=fetch_model("muse/wan2.1-umt5", path="umt5.safetensors"),
)
pipe = WanVideoPipeline.from_pretrained(config, device="cuda", parallelism=4, use_cfg_parallel=True)
# The prompt translates to: "A lively puppy runs quickly on a green lawn. The puppy has brownish-yellow fur,
# its two ears are perked up, and it looks focused and cheerful. Sunlight shines on it,
# making its fur look especially soft and shiny."
video = pipe(prompt="一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。")
save_video(video, "video.mp4")
```
{diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/docs/tutorial_zh.md
RENAMED
@@ -35,6 +35,31 @@ from diffsynth_engine import fetch_model
 model_path = fetch_model("MAILAND/majicflus_v1", path="majicflus_v134.safetensors")
 ```

+
+
+For sharded models, you can specify multiple files via the `path` parameter.
+
+```python
+from diffsynth_engine import fetch_model
+
+model_path = fetch_model("Wan-AI/Wan2.1-T2V-14B", path=[
+    "diffusion_pytorch_model-00001-of-00006.safetensors",
+    "diffusion_pytorch_model-00002-of-00006.safetensors",
+    "diffusion_pytorch_model-00003-of-00006.safetensors",
+    "diffusion_pytorch_model-00004-of-00006.safetensors",
+    "diffusion_pytorch_model-00005-of-00006.safetensors",
+    "diffusion_pytorch_model-00006-of-00006.safetensors",
+])
+```
+
+Wildcards are also supported for matching multiple files.
+
+```python
+from diffsynth_engine import fetch_model
+
+model_path = fetch_model("Wan-AI/Wan2.1-T2V-14B", path="diffusion_pytorch_model*.safetensors")
+```
+
 The file path `model_path` returned by the `fetch_model` function is the path to the downloaded file(s).

 ## Model Types
diffsynth_engine-0.3.6.dev2/docs/tutorial.md
DELETED
@@ -1 +0,0 @@
-# ToDo
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{diffsynth_engine-0.3.6.dev2 → diffsynth_engine-0.3.6.dev5}/diffsynth_engine/algorithm/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|