diffsynth 2.0.6__tar.gz → 2.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffsynth-2.0.6 → diffsynth-2.0.8}/PKG-INFO +1 -1
- {diffsynth-2.0.6 → diffsynth-2.0.8}/README.md +110 -39
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/configs/model_configs.py +31 -1
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/configs/vram_management_module_maps.py +12 -0
- diffsynth-2.0.8/diffsynth/core/gradient/gradient_checkpoint.py +65 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/base_pipeline.py +32 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/flow_match.py +15 -2
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/runner.py +17 -1
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/anima_dit.py +3 -0
- diffsynth-2.0.8/diffsynth/models/ernie_image_dit.py +362 -0
- diffsynth-2.0.8/diffsynth/models/ernie_image_text_encoder.py +76 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux2_dit.py +3 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_dit.py +3 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/ltx2_dit.py +1 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/qwen_image_dit.py +3 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_dit.py +145 -3
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_vae.py +16 -0
- diffsynth-2.0.8/diffsynth/models/wantodance.py +209 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/z_image_dit.py +1 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/anima_image.py +1 -0
- diffsynth-2.0.8/diffsynth/pipelines/ernie_image.py +266 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/flux2_image.py +1 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/flux_image.py +1 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/ltx2_audio_video.py +1 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/mova_audio_video.py +1 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/qwen_image.py +1 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/wan_video.py +238 -19
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/z_image.py +1 -0
- diffsynth-2.0.8/diffsynth/utils/state_dict_converters/ernie_image_text_encoder.py +21 -0
- diffsynth-2.0.8/diffsynth/utils/state_dict_converters/z_image_dit.py +3 -0
- diffsynth-2.0.8/diffsynth/utils/xfuser/__init__.py +1 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/xfuser/xdit_context_parallel.py +33 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth.egg-info/PKG-INFO +1 -1
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth.egg-info/SOURCES.txt +6 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/pyproject.toml +1 -1
- diffsynth-2.0.6/diffsynth/core/gradient/gradient_checkpoint.py +0 -34
- diffsynth-2.0.6/diffsynth/utils/xfuser/__init__.py +0 -1
- {diffsynth-2.0.6 → diffsynth-2.0.8}/LICENSE +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/configs/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/attention/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/attention/attention.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/data/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/data/operators.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/data/unified_dataset.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/device/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/device/npu_compatible_device.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/gradient/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/loader/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/loader/config.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/loader/file.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/loader/model.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/vram/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/vram/disk_map.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/vram/initialization.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/vram/layers.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/logger.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/loss.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/parsers.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/training_module.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/dinov3_image_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux2_vae.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_controlnet.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_ipadapter.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_lora_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_lora_patcher.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_vae.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_value_control.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/general_modules.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/longcat_video_dit.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/ltx2_audio_vae.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/ltx2_common.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/ltx2_text_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/ltx2_upsampler.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/ltx2_video_vae.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/model_loader.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/mova_audio_dit.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/mova_audio_vae.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/mova_dual_tower_bridge.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/nexus_gen.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/nexus_gen_ar_model.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/qwen_image_controlnet.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/qwen_image_image2lora.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/qwen_image_vae.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/sd_text_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/siglip2_image_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/step1x_connector.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/step1x_text_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_camera_controller.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_dit_s2v.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_mot.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_motion_controller.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_text_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_vace.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wav2vec.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/z_image_controlnet.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/z_image_image2lora.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/controlnet/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/controlnet/annotator.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/data/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/data/audio.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/data/audio_video.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/data/media_io_ltx2.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/lora/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/lora/flux.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/lora/general.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/lora/merge.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/lora/reset_rank.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/ses/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/ses/ses.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/version.py +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth.egg-info/dependency_links.txt +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth.egg-info/requires.txt +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth.egg-info/top_level.txt +0 -0
- {diffsynth-2.0.6 → diffsynth-2.0.8}/setup.cfg +0 -0
@@ -7,6 +7,7 @@
 [](https://github.com/modelscope/DiffSynth-Studio/issues)
 [](https://GitHub.com/modelscope/DiffSynth-Studio/pull/)
 [](https://GitHub.com/modelscope/DiffSynth-Studio/commit/)
+[](https://discord.gg/Mm9suEeUDc)
 
 [切换到中文版](./README_zh.md)
 
@@ -31,8 +32,9 @@ We believe that a well-developed open-source code framework can lower the thresh
 
 > DiffSynth-Studio has undergone major version updates, and some old features are no longer maintained. If you need to use old features, please switch to the [last historical version](https://github.com/modelscope/DiffSynth-Studio/tree/afd101f3452c9ecae0c87b79adfa2e22d65ffdc3) before the major version update.
 
-> Currently, the development personnel of this project are limited, with most of the work handled by [Artiprocher](https://github.com/Artiprocher). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand.
-
+> Currently, the development personnel of this project are limited, with most of the work handled by [Artiprocher](https://github.com/Artiprocher) and [mi804](https://github.com/mi804). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand.
+
+- **March 19, 2026**: Added support for [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) and [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/Wan.md) and [example code](/examples/mova/) are now available.
 
 - **March 12, 2026**: We have added support for the [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) audio-video generation model. The features includes text-to-audio/video, image-to-audio/video, IC-LoRA control, audio-to-video, and audio-video inpainting. We have supported the complete inference and training functionalities. For details, please refer to the [documentation](/docs/en/Model_Details/LTX-2.md) and [code](/examples/ltx2/).
 
@@ -40,6 +42,9 @@ We believe that a well-developed open-source code framework can lower the thresh
 
 - **March 2, 2026** Added support for [Anima](https://modelscope.cn/models/circlestone-labs/Anima). For details, please refer to the [documentation](docs/en/Model_Details/Anima.md). This is an interesting anime-style image generation model. We look forward to its future updates.
 
+<details>
+<summary>More</summary>
+
 - **February 26, 2026** Added full and lora training support for the LTX-2 audio-video generation model. See the [documentation](/docs/en/Model_Details/LTX-2.md) for details.
 
 - **February 10, 2026** Added inference support for the LTX-2 audio-video generation model. See the [documentation](/docs/en/Model_Details/LTX-2.md) for details. Support for model training will be implemented in the future.
@@ -67,9 +72,6 @@ We believe that a well-developed open-source code framework can lower the thresh
 - [Differential LoRA Training](/docs/zh/Training/Differential_LoRA.md): This is a training technique we used in [ArtAug](https://www.modelscope.cn/models/DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1), now available for LoRA training of any model.
 - [FP8 Training](/docs/zh/Training/FP8_Precision.md): FP8 can be applied to any non-training model during training, i.e., models with gradients turned off or gradients that only affect LoRA weights.
 
-<details>
-<summary>More</summary>
-
 - **November 4, 2025** Supported the [ByteDance/Video-As-Prompt-Wan2.1-14B](https://modelscope.cn/models/ByteDance/Video-As-Prompt-Wan2.1-14B) model, which is trained based on Wan 2.1 and supports generating corresponding actions based on reference videos.
 
 - **October 30, 2025** Supported the [meituan-longcat/LongCat-Video](https://www.modelscope.cn/models/meituan-longcat/LongCat-Video) model, which supports text-to-video, image-to-video, and video continuation. This model uses the Wan framework for inference and training in this project.
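The FP8 Training entry in the hunk above notes that FP8 is applied only to modules that are not being trained (gradients off, or gradients flowing only into LoRA weights). As a rough illustration of that idea, and not DiffSynth-Studio's actual implementation, the hypothetical `FP8Linear` wrapper below stores a frozen linear layer's weight in `torch.float8_e4m3fn` and upcasts it to `bfloat16` only at compute time (assumes PyTorch ≥ 2.1):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class FP8Linear(nn.Module):
    """Hypothetical sketch: keep a frozen nn.Linear's weight in FP8 storage, compute in bf16."""
    def __init__(self, linear: nn.Linear):
        super().__init__()
        # FP8 storage roughly halves the memory of a bf16 weight; no gradients are kept.
        # (Real FP8 schemes usually add per-tensor scaling; omitted here for brevity.)
        self.register_buffer("weight_fp8", linear.weight.detach().to(torch.float8_e4m3fn))
        bias = None if linear.bias is None else linear.bias.detach().to(torch.bfloat16)
        self.register_buffer("bias", bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Upcast only for the matmul; trainable LoRA branches would run alongside in bf16.
        weight = self.weight_fp8.to(torch.bfloat16)
        return F.linear(x.to(torch.bfloat16), weight, self.bias)

# Example: wrap every frozen Linear in a (hypothetical) base model.
# for name, module in base_model.named_children():
#     if isinstance(module, nn.Linear) and not module.weight.requires_grad:
#         setattr(base_model, name, FP8Linear(module))
```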
@@ -835,41 +837,104 @@ graph LR;
|
|
|
835
837
|
|
|
836
838
|
Example code for Wan is available at: [/examples/wanvideo/](/examples/wanvideo/)
|
|
837
839
|
|
|
838
|
-
| Model ID | Extra
|
|
840
|
+
| Model ID | Extra Inputs | Inference | Low VRAM Inference | Full Training | Validation After Full Training | LoRA Training | Validation After LoRA Training |
|
|
841
|
+
|-|-|-|-|-|-|-|-|
|
|
842
|
+
|[Wan-AI/Wan2.1-T2V-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-T2V-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-T2V-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-T2V-1.3B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-T2V-1.3B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-1.3B.py)|
|
|
843
|
+
|[Wan-AI/Wan2.1-T2V-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-14B)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-T2V-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-T2V-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-T2V-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-T2V-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-14B.py)|
|
|
844
|
+
|[Wan-AI/Wan2.1-I2V-14B-480P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-480P)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-I2V-14B-480P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-I2V-14B-480P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-480P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-480P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-480P.py)|
|
|
845
|
+
|[Wan-AI/Wan2.1-I2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-I2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-I2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-720P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-720P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-720P.py)|
|
|
846
|
+
|[Wan-AI/Wan2.1-FLF2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-FLF2V-14B-720P)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-FLF2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-FLF2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-FLF2V-14B-720P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-FLF2V-14B-720P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py)|
|
|
847
|
+
|[iic/VACE-Wan2.1-1.3B-Preview](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview)|`vace_control_video`, `vace_reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B-Preview.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-VACE-1.3B-Preview.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B-Preview.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B-Preview.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py)|
|
|
848
|
+
|[Wan-AI/Wan2.1-VACE-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-1.3B)|`vace_control_video`, `vace_reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-VACE-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B.py)|
|
|
849
|
+
|[Wan-AI/Wan2.1-VACE-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B)|`vace_control_video`, `vace_reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-VACE-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-VACE-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-VACE-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-VACE-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-14B.py)|
|
|
850
|
+
|[PAI/Wan2.1-Fun-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py)|
|
|
851
|
+
|[PAI/Wan2.1-Fun-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control)|`control_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py)|
|
|
852
|
+
|[PAI/Wan2.1-Fun-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-InP.py)|
|
|
853
|
+
|[PAI/Wan2.1-Fun-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-Control)|`control_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-Control.py)|
|
|
854
|
+
|[PAI/Wan2.1-Fun-V1.1-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control)|`control_video`, `reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py)|
|
|
855
|
+
|[PAI/Wan2.1-Fun-V1.1-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control)|`control_video`, `reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py)|
|
|
856
|
+
|[PAI/Wan2.1-Fun-V1.1-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py)|
|
|
857
|
+
|[PAI/Wan2.1-Fun-V1.1-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py)|
|
|
858
|
+
|[PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera)|`control_camera_video`, `input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|
|
|
859
|
+
|[PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera)|`control_camera_video`, `input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|
|
|
860
|
+
|[DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1)|`motion_bucket_id`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-1.3b-speedcontrol-v1.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-1.3b-speedcontrol-v1.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py)|
|
|
861
|
+
|[krea/krea-realtime-video](https://www.modelscope.cn/models/krea/krea-realtime-video)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/krea-realtime-video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/krea-realtime-video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/krea-realtime-video.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/krea-realtime-video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/krea-realtime-video.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/krea-realtime-video.py)|
|
|
862
|
+
|[meituan-longcat/LongCat-Video](https://www.modelscope.cn/models/meituan-longcat/LongCat-Video)|`longcat_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/LongCat-Video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/LongCat-Video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/LongCat-Video.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/LongCat-Video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/LongCat-Video.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/LongCat-Video.py)|
|
|
863
|
+
|[ByteDance/Video-As-Prompt-Wan2.1-14B](https://modelscope.cn/models/ByteDance/Video-As-Prompt-Wan2.1-14B)|`vap_video`, `vap_prompt`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Video-As-Prompt-Wan2.1-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Video-As-Prompt-Wan2.1-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Video-As-Prompt-Wan2.1-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Video-As-Prompt-Wan2.1-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Video-As-Prompt-Wan2.1-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Video-As-Prompt-Wan2.1-14B.py)|
|
|
864
|
+
|[Wan-AI/Wan2.2-T2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-T2V-A14B)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-T2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-T2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-T2V-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-T2V-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py)|
|
|
865
|
+
|[Wan-AI/Wan2.2-I2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-I2V-A14B)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-I2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-I2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-I2V-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py)|
|
|
866
|
+
|[Wan-AI/Wan2.2-TI2V-5B](https://modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-TI2V-5B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-TI2V-5B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-TI2V-5B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-TI2V-5B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py)|
|
|
867
|
+
|[Wan-AI/Wan2.2-Animate-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-Animate-14B)|`input_image`, `animate_pose_video`, `animate_face_video`, `animate_inpaint_video`, `animate_mask_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Animate-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Animate-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py)|
|
|
868
|
+
|[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-S2V-14B_multi_clips.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-S2V-14B_multi_clips.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-S2V-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-S2V-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py)|
|
|
869
|
+
|[PAI/Wan2.2-VACE-Fun-A14B](https://www.modelscope.cn/models/PAI/Wan2.2-VACE-Fun-A14B)|`vace_control_video`, `vace_reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-VACE-Fun-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-VACE-Fun-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-VACE-Fun-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-VACE-Fun-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-VACE-Fun-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-VACE-Fun-A14B.py)|
|
|
870
|
+
|[PAI/Wan2.2-Fun-A14B-InP](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py)|
|
|
871
|
+
|[PAI/Wan2.2-Fun-A14B-Control](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control)|`control_video`, `reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py)|
|
|
872
|
+
|[PAI/Wan2.2-Fun-A14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control-Camera)|`control_camera_video`, `input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py)|
|
|
873
|
+
|[openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference/MOVA-360p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference_low_vram/MOVA-360p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/full/MOVA-360P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_full/MOVA-360p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/lora/MOVA-360P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_lora/MOVA-360p-I2AV.py)|
|
|
874
|
+
|[openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference/MOVA-720p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference_low_vram/MOVA-720p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/full/MOVA-720P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_full/MOVA-720p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/lora/MOVA-720P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_lora/MOVA-720p-I2AV.py)|
|
|
875
|
+
|[Wan-AI/WanToDance-14B (global model)](https://modelscope.cn/models/Wan-AI/WanToDance-14B)|`wantodance_music_path`, `wantodance_reference_image`, `wantodance_fps`, `wantodance_keyframes`, `wantodance_keyframes_mask`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/WanToDance-14B-global.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/WanToDance-14B-global.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/WanToDance-14B-global.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/WanToDance-14B-global.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/WanToDance-14B-global.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/WanToDance-14B-global.py)|
|
|
876
|
+
|[Wan-AI/WanToDance-14B (local model)](https://modelscope.cn/models/Wan-AI/WanToDance-14B)|`wantodance_music_path`, `wantodance_reference_image`, `wantodance_fps`, `wantodance_keyframes`, `wantodance_keyframes_mask`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/WanToDance-14B-local.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/WanToDance-14B-local.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/WanToDance-14B-local.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/WanToDance-14B-local.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/WanToDance-14B-local.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/WanToDance-14B-local.py)|
|
|
877
|
+
|
|
878
|
+
</details>
|
|
879
|
+
|
|
880
|
+
#### ERNIE-Image: [/docs/en/Model_Details/ERNIE-Image.md](/docs/en/Model_Details/ERNIE-Image.md)
|
|
881
|
+
|
|
882
|
+
<details>
|
|
883
|
+
|
|
884
|
+
<summary>Quick Start</summary>
|
|
885
|
+
|
|
886
|
+
Running the following code will quickly load the [PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.
|
|
887
|
+
|
|
888
|
+
```python
|
|
889
|
+
from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig
|
|
890
|
+
import torch
|
|
891
|
+
|
|
892
|
+
vram_config = {
|
|
893
|
+
"offload_dtype": torch.bfloat16,
|
|
894
|
+
"offload_device": "cpu",
|
|
895
|
+
"onload_dtype": torch.bfloat16,
|
|
896
|
+
"onload_device": "cpu",
|
|
897
|
+
"preparing_dtype": torch.bfloat16,
|
|
898
|
+
"preparing_device": "cuda",
|
|
899
|
+
"computation_dtype": torch.bfloat16,
|
|
900
|
+
"computation_device": "cuda",
|
|
901
|
+
}
|
|
902
|
+
pipe = ErnieImagePipeline.from_pretrained(
|
|
903
|
+
torch_dtype=torch.bfloat16,
|
|
904
|
+
device='cuda',
|
|
905
|
+
model_configs=[
|
|
906
|
+
ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config),
|
|
907
|
+
ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
|
|
908
|
+
ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
|
|
909
|
+
],
|
|
910
|
+
tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"),
|
|
911
|
+
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
|
|
912
|
+
)
|
|
913
|
+
|
|
914
|
+
image = pipe(
|
|
915
|
+
prompt="一只黑白相间的中华田园犬",
|
|
916
|
+
negative_prompt="",
|
|
917
|
+
height=1024,
|
|
918
|
+
width=1024,
|
|
919
|
+
seed=42,
|
|
920
|
+
num_inference_steps=50,
|
|
921
|
+
cfg_scale=4.0,
|
|
922
|
+
)
|
|
923
|
+
image.save("output.jpg")
|
|
924
|
+
```
|
|
925
|
+
|
|
926
|
+
</details>
|
|
927
|
+
|
|
928
|
+
<details>
|
|
929
|
+
|
|
930
|
+
<summary>Examples</summary>
|
|
931
|
+
|
|
932
|
+
Example code for ERNIE-Image is available at: [/examples/ernie_image/](/examples/ernie_image/)
|
|
933
|
+
|
|
934
|
+
| Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
839
935
|
|-|-|-|-|-|-|-|
|
|
840
|
-
|[
|
|
841
|
-
|[
|
|
842
|
-
|[Wan-AI/Wan2.1-I2V-14B-480P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-480P)|`input_image`|[code](/examples/wanvideo/model_inference/Wan2.1-I2V-14B-480P.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-480P.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-480P.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-480P.py)|
|
|
843
|
-
|[Wan-AI/Wan2.1-I2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P)|`input_image`|[code](/examples/wanvideo/model_inference/Wan2.1-I2V-14B-720P.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-720P.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-720P.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-720P.py)|
-|[Wan-AI/Wan2.1-FLF2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-FLF2V-14B-720P)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.1-FLF2V-14B-720P.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-FLF2V-14B-720P.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-FLF2V-14B-720P.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py)|
-|[iic/VACE-Wan2.1-1.3B-Preview](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview)|`vace_control_video`, `vace_reference_image`|[code](/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B-Preview.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B-Preview.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B-Preview.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py)|
-|[Wan-AI/Wan2.1-VACE-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-1.3B)|`vace_control_video`, `vace_reference_image`|[code](/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B.py)|
-|[Wan-AI/Wan2.1-VACE-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B)|`vace_control_video`, `vace_reference_image`|[code](/examples/wanvideo/model_inference/Wan2.1-VACE-14B.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-VACE-14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-VACE-14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-14B.py)|
-|[PAI/Wan2.1-Fun-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-InP)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-InP.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-InP.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-InP.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py)|
-|[PAI/Wan2.1-Fun-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control)|`control_video`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-Control.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-Control.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-Control.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py)|
-|[PAI/Wan2.1-Fun-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-InP)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-14B-InP.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-InP.py)|
-|[PAI/Wan2.1-Fun-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-Control)|`control_video`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-14B-Control.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-Control.py)|
-|[PAI/Wan2.1-Fun-V1.1-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control)|`control_video`, `reference_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py)|
-|[PAI/Wan2.1-Fun-V1.1-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control)|`control_video`, `reference_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py)|
-|[PAI/Wan2.1-Fun-V1.1-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-InP)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py)|
-|[PAI/Wan2.1-Fun-V1.1-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-InP)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-InP.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py)|
-|[PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera)|`control_camera_video`, `input_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|
-|[PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera)|`control_camera_video`, `input_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|
-|[DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1)|`motion_bucket_id`|[code](/examples/wanvideo/model_inference/Wan2.1-1.3b-speedcontrol-v1.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py)|
-|[krea/krea-realtime-video](https://www.modelscope.cn/models/krea/krea-realtime-video)||[code](/examples/wanvideo/model_inference/krea-realtime-video.py)|[code](/examples/wanvideo/model_training/full/krea-realtime-video.sh)|[code](/examples/wanvideo/model_training/validate_full/krea-realtime-video.py)|[code](/examples/wanvideo/model_training/lora/krea-realtime-video.sh)|[code](/examples/wanvideo/model_training/validate_lora/krea-realtime-video.py)|
-|[meituan-longcat/LongCat-Video](https://www.modelscope.cn/models/meituan-longcat/LongCat-Video)|`longcat_video`|[code](/examples/wanvideo/model_inference/LongCat-Video.py)|[code](/examples/wanvideo/model_training/full/LongCat-Video.sh)|[code](/examples/wanvideo/model_training/validate_full/LongCat-Video.py)|[code](/examples/wanvideo/model_training/lora/LongCat-Video.sh)|[code](/examples/wanvideo/model_training/validate_lora/LongCat-Video.py)|
-|[ByteDance/Video-As-Prompt-Wan2.1-14B](https://modelscope.cn/models/ByteDance/Video-As-Prompt-Wan2.1-14B)|`vap_video`, `vap_prompt`|[code](/examples/wanvideo/model_inference/Video-As-Prompt-Wan2.1-14B.py)|[code](/examples/wanvideo/model_training/full/Video-As-Prompt-Wan2.1-14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Video-As-Prompt-Wan2.1-14B.py)|[code](/examples/wanvideo/model_training/lora/Video-As-Prompt-Wan2.1-14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Video-As-Prompt-Wan2.1-14B.py)|
-|[Wan-AI/Wan2.2-T2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-T2V-A14B)||[code](/examples/wanvideo/model_inference/Wan2.2-T2V-A14B.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-T2V-A14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-T2V-A14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py)|
-|[Wan-AI/Wan2.2-I2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-I2V-A14B)|`input_image`|[code](/examples/wanvideo/model_inference/Wan2.2-I2V-A14B.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-I2V-A14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py)|
-|[Wan-AI/Wan2.2-TI2V-5B](https://modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B)|`input_image`|[code](/examples/wanvideo/model_inference/Wan2.2-TI2V-5B.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-TI2V-5B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-TI2V-5B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py)|
-|[Wan-AI/Wan2.2-Animate-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-Animate-14B)|`input_image`, `animate_pose_video`, `animate_face_video`, `animate_inpaint_video`, `animate_mask_video`|[code](/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Animate-14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Animate-14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py)|
-|[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](/examples/wanvideo/model_inference/Wan2.2-S2V-14B_multi_clips.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-S2V-14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-S2V-14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py)|
-|[PAI/Wan2.2-VACE-Fun-A14B](https://www.modelscope.cn/models/PAI/Wan2.2-VACE-Fun-A14B)|`vace_control_video`, `vace_reference_image`|[code](/examples/wanvideo/model_inference/Wan2.2-VACE-Fun-A14B.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-VACE-Fun-A14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-VACE-Fun-A14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-VACE-Fun-A14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-VACE-Fun-A14B.py)|
-|[PAI/Wan2.2-Fun-A14B-InP](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-InP)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-InP.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py)|
-|[PAI/Wan2.2-Fun-A14B-Control](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control)|`control_video`, `reference_image`|[code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py)|
-|[PAI/Wan2.2-Fun-A14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control-Camera)|`control_camera_video`, `input_image`|[code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control-Camera.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py)|
-| [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) | `input_image` | [code](/examples/mova/model_inference/MOVA-360p-I2AV.py) | [code](/examples/mova/model_training/full/MOVA-360P-I2AV.sh) | [code](/examples/mova/model_training/validate_full/MOVA-360p-I2AV.py) | [code](/examples/mova/model_training/lora/MOVA-360P-I2AV.sh) | [code](/examples/mova/model_training/validate_lora/MOVA-360p-I2AV.py) |
-| [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) | `input_image` | [code](/examples/mova/model_inference/MOVA-720p-I2AV.py) | [code](/examples/mova/model_training/full/MOVA-720P-I2AV.sh) | [code](/examples/mova/model_training/validate_full/MOVA-720p-I2AV.py) | [code](/examples/mova/model_training/lora/MOVA-720P-I2AV.sh) | [code](/examples/mova/model_training/validate_lora/MOVA-720p-I2AV.py) |
+|[PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image)|[code](/examples/ernie_image/model_inference/ERNIE-Image.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/full/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_full/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/lora/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py)|
+|[PaddlePaddle/ERNIE-Image-Turbo](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image-Turbo)|[code](/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py)|—|—|—|—|

</details>

@@ -1027,3 +1092,9 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-47
https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-4481-b79f-0c3a7361a1ea

</details>
+
+## Contact Us
+
+|Discord:https://discord.gg/Mm9suEeUDc|
+|-|
+|<img width="160" height="160" alt="Image" src="https://github.com/user-attachments/assets/29bdc97b-e35d-4fea-88d6-32e35182e458" />|
@@ -307,6 +307,13 @@ wan_series = [
        "model_class": "diffsynth.models.wav2vec.WanS2VAudioEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wans2v_audio_encoder.WanS2VAudioEncoderStateDictConverter",
    },
+    {
+        # Example: ModelConfig(model_id="Wan-AI/WanToDance-14B", origin_file_pattern="global_model.safetensors")
+        "model_hash": "eb18873fc0ba77b541eb7b62dbcd2059",
+        "model_name": "wan_video_dit",
+        "model_class": "diffsynth.models.wan_video_dit.WanModel",
+        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'wantodance_enable_music_inject': True, 'wantodance_music_inject_layers': [0, 4, 8, 12, 16, 20, 24, 27], 'wantodance_enable_refimage': True, 'has_ref_conv': True, 'wantodance_enable_refface': False, 'wantodance_enable_global': True, 'wantodance_enable_dynamicfps': True, 'wantodance_enable_unimodel': True}
+    },
]

flux_series = [
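The `# Example:` comment above gives the intended `ModelConfig` call for the new WanToDance checkpoint. As a hedged sketch of how such a config entry could resolve into a model instance, assuming `extra_kwargs` is simply forwarded to the constructor named in `model_class` (the `resolve_model` helper and the abbreviated entry below are illustrative, not part of diffsynth):

```python
import importlib

# Abbreviated copy of the config entry from the hunk above; not the full kwargs set.
entry = {
    "model_name": "wan_video_dit",
    "model_class": "diffsynth.models.wan_video_dit.WanModel",
    "extra_kwargs": {"has_image_input": True, "dim": 5120, "num_layers": 40},
}

def resolve_model(entry: dict):
    # Split "package.module.Class" into an importable module path and a class name.
    module_path, class_name = entry["model_class"].rsplit(".", 1)
    model_cls = getattr(importlib.import_module(module_path), class_name)
    return model_cls(**entry.get("extra_kwargs", {}))

# model = resolve_model(entry)  # would build WanModel with the WanToDance-specific kwargs
```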
@@ -534,6 +541,22 @@ flux2_series = [
    },
]

+ernie_image_series = [
+    {
+        # Example: ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors")
+        "model_hash": "584c13713849f1af4e03d5f1858b8b7b",
+        "model_name": "ernie_image_dit",
+        "model_class": "diffsynth.models.ernie_image_dit.ErnieImageDiT",
+    },
+    {
+        # Example: ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors")
+        "model_hash": "404ed9f40796a38dd34c1620f1920207",
+        "model_name": "ernie_image_text_encoder",
+        "model_class": "diffsynth.models.ernie_image_text_encoder.ErnieImageTextEncoder",
+        "state_dict_converter": "diffsynth.utils.state_dict_converters.ernie_image_text_encoder.ErnieImageTextEncoderStateDictConverter",
+    },
+]
+
z_image_series = [
    {
        # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="transformer/*.safetensors")
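The `# Example:` comments in the new `ernie_image_series` entries give the `ModelConfig` arguments for the ERNIE-Image DiT and text encoder. A hedged loading sketch built on those arguments; the `ErnieImagePipeline.from_pretrained` call pattern, the `ModelConfig` import location, and the `prompt`/`seed` keywords are assumed from the package's other pipelines and have not been checked against the new `diffsynth/pipelines/ernie_image.py` (any VAE entry is also omitted here):

```python
import torch
from diffsynth.configs import ModelConfig  # import location assumed
from diffsynth.pipelines.ernie_image import ErnieImagePipeline  # module added in 2.0.8

# Sketch only: mirrors the loading style of the other DiffSynth pipelines.
pipe = ErnieImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors"),
    ],
)
image = pipe(prompt="a watercolor lighthouse at dusk", seed=0)
image.save("ernie_image.jpg")
```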
@@ -597,6 +620,13 @@ z_image_series = [
        "extra_kwargs": {"model_size": "0.6B"},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_text_encoder.ZImageTextEncoderStateDictConverter",
    },
+    {
+        # To ensure compatibility with the `model.diffusion_model` prefix introduced by other frameworks.
+        "model_hash": "8cf241a0d32f93d5de368502a086852f",
+        "model_name": "z_image_dit",
+        "model_class": "diffsynth.models.z_image_dit.ZImageDiT",
+        "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_dit.ZImageDiTStateDictConverter",
+    },
]
"""
Offical model repo: https://www.modelscope.cn/models/Lightricks/LTX-2
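The comment on the new entry explains its purpose: checkpoints exported by other frameworks store the same Z-Image weights under a `model.diffusion_model.` key prefix, and the new `ZImageDiTStateDictConverter` (a three-line file in this release) maps them back onto `ZImageDiT`. A guess at the kind of key rewrite involved, written as a standalone helper rather than the real converter:

```python
# Hypothetical illustration of prefix stripping; not the actual ZImageDiTStateDictConverter.
def strip_diffusion_model_prefix(state_dict: dict) -> dict:
    prefix = "model.diffusion_model."
    return {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }

example = {"model.diffusion_model.blocks.0.attn.to_q.weight": "..."}
assert strip_diffusion_model_prefix(example) == {"blocks.0.attn.to_q.weight": "..."}
```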
@@ -870,4 +900,4 @@ mova_series = [
        "model_class": "diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge",
    },
]
-MODEL_CONFIGS = qwen_image_series + wan_series + flux_series + flux2_series + z_image_series + ltx2_series + anima_series + mova_series
+MODEL_CONFIGS = qwen_image_series + wan_series + flux_series + flux2_series + ernie_image_series + z_image_series + ltx2_series + anima_series + mova_series
@@ -267,6 +267,18 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
        "torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.ConvTranspose1d": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
+    "diffsynth.models.ernie_image_dit.ErnieImageDiT": {
+        "diffsynth.models.ernie_image_dit.ErnieImageRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
+        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
+        "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
+        "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
+        "torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
+    },
+    "diffsynth.models.ernie_image_text_encoder.ErnieImageTextEncoder": {
+        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
+        "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
+        "transformers.models.ministral3.modeling_ministral3.Ministral3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
+    },
}

def QwenImageTextEncoder_Module_Map_Updater():
@@ -0,0 +1,65 @@
+import torch
+
+
+try:
+    import deepspeed
+    _HAS_DEEPSPEED = True
+except ModuleNotFoundError:
+    _HAS_DEEPSPEED = False
+
+
+def create_custom_forward(module):
+    def custom_forward(*inputs, **kwargs):
+        return module(*inputs, **kwargs)
+    return custom_forward
+
+
+def create_custom_forward_use_reentrant(module):
+    def custom_forward(*inputs):
+        return module(*inputs)
+    return custom_forward
+
+
+def judge_args_requires_grad(*args):
+    for arg in args:
+        if isinstance(arg, torch.Tensor) and arg.requires_grad:
+            return True
+    return False
+
+
+def gradient_checkpoint_forward(
+    model,
+    use_gradient_checkpointing,
+    use_gradient_checkpointing_offload,
+    *args,
+    **kwargs,
+):
+    if use_gradient_checkpointing and _HAS_DEEPSPEED and deepspeed.checkpointing.is_configured():
+        all_args = args + tuple(kwargs.values())
+        if not judge_args_requires_grad(*all_args):
+            # get the first grad_enabled tensor from un_checkpointed forward
+            model_output = model(*args, **kwargs)
+        else:
+            model_output = deepspeed.checkpointing.checkpoint(
+                create_custom_forward_use_reentrant(model),
+                *all_args,
+            )
+        return model_output
+    if use_gradient_checkpointing_offload:
+        with torch.autograd.graph.save_on_cpu():
+            model_output = torch.utils.checkpoint.checkpoint(
+                create_custom_forward(model),
+                *args,
+                **kwargs,
+                use_reentrant=False,
+            )
+    elif use_gradient_checkpointing:
+        model_output = torch.utils.checkpoint.checkpoint(
+            create_custom_forward(model),
+            *args,
+            **kwargs,
+            use_reentrant=False,
+        )
+    else:
+        model_output = model(*args, **kwargs)
+    return model_output
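The rewritten `gradient_checkpoint_forward` now covers three paths: DeepSpeed activation checkpointing when `deepspeed.checkpointing` has been configured, non-reentrant PyTorch checkpointing with saved activations offloaded to CPU, and plain non-reentrant PyTorch checkpointing. A minimal usage sketch with a toy block; the toy module and tensor are illustrative, while the helper name and its module path come from the file above:

```python
import torch
from diffsynth.core.gradient.gradient_checkpoint import gradient_checkpoint_forward

block = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.GELU(), torch.nn.Linear(64, 64))
x = torch.randn(2, 64, requires_grad=True)

# checkpointing on, offload off -> torch.utils.checkpoint.checkpoint(..., use_reentrant=False)
y = gradient_checkpoint_forward(block, True, False, x)
y.sum().backward()  # activations are recomputed during this backward pass

# offload on -> the same call wrapped in torch.autograd.graph.save_on_cpu()
y_offload = gradient_checkpoint_forward(block, True, True, x.detach().requires_grad_())
```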
@@ -339,6 +339,38 @@ class BasePipeline(torch.nn.Module):
            noise_pred = noise_pred_posi
        return noise_pred

+    def compile_pipeline(self, mode: str = "default", dynamic: bool = True, fullgraph: bool = False, compile_models: list = None, **kwargs):
+        """
+        compile the pipeline with torch.compile. The models that will be compiled are determined by the `compilable_models` attribute of the pipeline.
+        If a model has `_repeated_blocks` attribute, we will compile these blocks with regional compilation. Otherwise, we will compile the whole model.
+        See https://docs.pytorch.org/docs/stable/generated/torch.compile.html#torch.compile for details about compilation arguments.
+        Args:
+            mode: The compilation mode, which will be passed to `torch.compile`, options are "default", "reduce-overhead", "max-autotune" and "max-autotune-no-cudagraphs. Default to "default".
+            dynamic: Whether to enable dynamic graph compilation to support dynamic input shapes, which will be passed to `torch.compile`. Default to True (recommended).
+            fullgraph: Whether to use full graph compilation, which will be passed to `torch.compile`. Default to False (recommended).
+            compile_models: The list of model names to be compiled. If None, we will compile the models in `pipeline.compilable_models`. Default to None.
+            **kwargs: Other arguments for `torch.compile`.
+        """
+        compile_models = compile_models or getattr(self, "compilable_models", [])
+        if len(compile_models) == 0:
+            print("No compilable models in the pipeline. Skip compilation.")
+            return
+        for name in compile_models:
+            model = getattr(self, name, None)
+            if model is None:
+                print(f"Model '{name}' not found in the pipeline.")
+                continue
+            repeated_blocks = getattr(model, "_repeated_blocks", None)
+            # regional compilation for repeated blocks.
+            if repeated_blocks is not None:
+                for submod in model.modules():
+                    if submod.__class__.__name__ in repeated_blocks:
+                        submod.compile(mode=mode, dynamic=dynamic, fullgraph=fullgraph, **kwargs)
+            # compile the whole model.
+            else:
+                model.compile(mode=mode, dynamic=dynamic, fullgraph=fullgraph, **kwargs)
+            print(f"{name} is compiled with mode={mode}, dynamic={dynamic}, fullgraph={fullgraph}.")
+

class PipelineUnitGraph:
    def __init__(self):
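On a loaded pipeline the whole feature is one call, e.g. `pipe.compile_pipeline(mode="default", dynamic=True)`. The interesting branch is the regional one: when a model class declares `_repeated_blocks` (as `AnimaDiT` now does later in this diff), each matching submodule is compiled on its own instead of compiling the whole network. A standalone sketch of that branch using a toy block rather than a DiffSynth model; it assumes a PyTorch build where `nn.Module.compile` and a working `torch.compile` backend are available:

```python
import torch

class Block(torch.nn.Module):
    # Toy stand-in for a transformer block; only the class name matters for the lookup.
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(16, 16)

    def forward(self, x):
        return torch.nn.functional.gelu(self.proj(x))

model = torch.nn.Sequential(*[Block() for _ in range(4)])
repeated_blocks = ["Block"]  # mirrors AnimaDiT._repeated_blocks

# Same loop as the regional-compilation branch of compile_pipeline above.
for submod in model.modules():
    if submod.__class__.__name__ in repeated_blocks:
        submod.compile(mode="default", dynamic=True, fullgraph=False)

print(model(torch.randn(2, 16)).shape)  # torch.Size([2, 16])
```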
@@ -4,7 +4,7 @@ from typing_extensions import Literal

class FlowMatchScheduler():

-    def __init__(self, template: Literal["FLUX.1", "Wan", "Qwen-Image", "FLUX.2", "Z-Image", "LTX-2", "Qwen-Image-Lightning"] = "FLUX.1"):
+    def __init__(self, template: Literal["FLUX.1", "Wan", "Qwen-Image", "FLUX.2", "Z-Image", "LTX-2", "Qwen-Image-Lightning", "ERNIE-Image"] = "FLUX.1"):
        self.set_timesteps_fn = {
            "FLUX.1": FlowMatchScheduler.set_timesteps_flux,
            "Wan": FlowMatchScheduler.set_timesteps_wan,
@@ -13,6 +13,7 @@ class FlowMatchScheduler():
            "Z-Image": FlowMatchScheduler.set_timesteps_z_image,
            "LTX-2": FlowMatchScheduler.set_timesteps_ltx2,
            "Qwen-Image-Lightning": FlowMatchScheduler.set_timesteps_qwen_image_lightning,
+            "ERNIE-Image": FlowMatchScheduler.set_timesteps_ernie_image,
        }.get(template, FlowMatchScheduler.set_timesteps_flux)
        self.num_train_timesteps = 1000

@@ -129,6 +130,18 @@ class FlowMatchScheduler():
        timesteps = sigmas * num_train_timesteps
        return sigmas, timesteps

+    @staticmethod
+    def set_timesteps_ernie_image(num_inference_steps=50, denoising_strength=1.0, shift=3.0):
+        sigma_min = 0.0
+        sigma_max = 1.0
+        num_train_timesteps = 1000
+        sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
+        sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
+        if shift is not None and shift != 1.0:
+            sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
+        timesteps = sigmas * num_train_timesteps
+        return sigmas, timesteps
+
    @staticmethod
    def set_timesteps_z_image(num_inference_steps=100, denoising_strength=1.0, shift=None, target_timesteps=None):
        sigma_min = 0.0
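`set_timesteps_ernie_image` is a linear sigma ramp from 1.0 down to 0.0, warped by the standard flow-matching shift sigma' = shift * sigma / (1 + (shift - 1) * sigma) with shift = 3.0 by default, then scaled by 1000 to produce timesteps. A quick standalone re-computation of a 4-step schedule, using the same arithmetic as the static method above (the helper name is just for this illustration):

```python
import torch

def ernie_image_sigmas(num_inference_steps=50, denoising_strength=1.0, shift=3.0):
    # Same arithmetic as FlowMatchScheduler.set_timesteps_ernie_image above.
    sigma_min, sigma_max, num_train_timesteps = 0.0, 1.0, 1000
    sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
    sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
    if shift is not None and shift != 1.0:
        sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
    return sigmas, sigmas * num_train_timesteps

sigmas, timesteps = ernie_image_sigmas(num_inference_steps=4)
# sigmas    -> 1.00, 0.90, 0.75, 0.50   (linear 1.00, 0.75, 0.50, 0.25 before the shift)
# timesteps -> 1000, 900, 750, 500
```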
@@ -185,7 +198,7 @@ class FlowMatchScheduler():
        bsmntw_weighing = bsmntw_weighing * (len(self.timesteps) / steps)
        bsmntw_weighing = bsmntw_weighing + bsmntw_weighing[1]
        self.linear_timesteps_weights = bsmntw_weighing
-
+
    def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, training=False, **kwargs):
        self.sigmas, self.timesteps = self.set_timesteps_fn(
            num_inference_steps=num_inference_steps,
@@ -29,7 +29,7 @@ def launch_training_task(
    dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, collate_fn=lambda x: x[0], num_workers=num_workers)
    model.to(device=accelerator.device)
    model, optimizer, dataloader, scheduler = accelerator.prepare(model, optimizer, dataloader, scheduler)
-
+    initialize_deepspeed_gradient_checkpointing(accelerator)
    for epoch_id in range(num_epochs):
        for data in tqdm(dataloader):
            with accelerator.accumulate(model):
@@ -70,3 +70,19 @@ def launch_data_process_task(
        save_path = os.path.join(model_logger.output_path, str(accelerator.process_index), f"{data_id}.pth")
        data = model(data)
        torch.save(data, save_path)
+
+
+def initialize_deepspeed_gradient_checkpointing(accelerator: Accelerator):
+    if getattr(accelerator.state, "deepspeed_plugin", None) is not None:
+        ds_config = accelerator.state.deepspeed_plugin.deepspeed_config
+        if "activation_checkpointing" in ds_config:
+            import deepspeed
+            act_config = ds_config["activation_checkpointing"]
+            deepspeed.checkpointing.configure(
+                mpu_=None,
+                partition_activations=act_config.get("partition_activations", False),
+                checkpoint_in_cpu=act_config.get("cpu_checkpointing", False),
+                contiguous_checkpointing=act_config.get("contiguous_memory_optimization", False)
+            )
+        else:
+            print("Do not find activation_checkpointing config in deepspeed config, skip initializing deepspeed gradient checkpointing.")
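`initialize_deepspeed_gradient_checkpointing` only does something when the Accelerate DeepSpeed plugin's config carries an `activation_checkpointing` block; it then forwards three of its flags to `deepspeed.checkpointing.configure`, which in turn enables the DeepSpeed branch of `gradient_checkpoint_forward` shown earlier in this diff. The relevant config fragment, written as a Python dict; these are standard DeepSpeed options and the values are only illustrative:

```python
# Fragment of a DeepSpeed config; only the keys read by the new helper are shown.
deepspeed_config_fragment = {
    "activation_checkpointing": {
        "partition_activations": True,            # forwarded as partition_activations=
        "cpu_checkpointing": True,                # forwarded as checkpoint_in_cpu=
        "contiguous_memory_optimization": False,  # forwarded as contiguous_checkpointing=
    },
}
```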
@@ -1270,6 +1270,9 @@ class LLMAdapter(nn.Module):


class AnimaDiT(MiniTrainDIT):
+
+    _repeated_blocks = ["Block"]
+
    def __init__(self):
        kwargs = {'image_model': 'anima', 'max_img_h': 240, 'max_img_w': 240, 'max_frames': 128, 'in_channels': 16, 'out_channels': 16, 'patch_spatial': 2, 'patch_temporal': 1, 'model_channels': 2048, 'concat_padding_mask': True, 'crossattn_emb_channels': 1024, 'pos_emb_cls': 'rope3d', 'pos_emb_learnable': True, 'pos_emb_interpolation': 'crop', 'min_fps': 1, 'max_fps': 30, 'use_adaln_lora': True, 'adaln_lora_dim': 256, 'num_blocks': 28, 'num_heads': 16, 'extra_per_block_abs_pos_emb': False, 'rope_h_extrapolation_ratio': 4.0, 'rope_w_extrapolation_ratio': 4.0, 'rope_t_extrapolation_ratio': 1.0, 'extra_h_extrapolation_ratio': 1.0, 'extra_w_extrapolation_ratio': 1.0, 'extra_t_extrapolation_ratio': 1.0, 'rope_enable_fps_modulation': False, 'dtype': torch.bfloat16, 'device': None, 'operations': torch.nn}
        super().__init__(**kwargs)