diffsynth 2.0.6.tar.gz → 2.0.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
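For readers who want to inspect the changes themselves, a comparison like this can be reproduced locally. The sketch below is illustrative only and is not the tooling used to generate this page; it assumes both source distributions are published on PyPI as `.tar.gz` files and relies on the `https://pypi.org/pypi/<name>/<version>/json` metadata endpoint to locate them. It downloads the two sdists and prints a unified diff of their text files.

```python
import difflib
import io
import json
import tarfile
import urllib.request


def fetch_sdist_files(name: str, version: str) -> dict:
    """Download the sdist for name==version from PyPI and return {relative_path: text} for its text files."""
    meta = json.load(urllib.request.urlopen(f"https://pypi.org/pypi/{name}/{version}/json"))
    sdist_url = next(f["url"] for f in meta["urls"] if f["filename"].endswith(".tar.gz"))
    archive = urllib.request.urlopen(sdist_url).read()
    files = {}
    with tarfile.open(fileobj=io.BytesIO(archive), mode="r:gz") as tar:
        for member in tar.getmembers():
            if not member.isfile():
                continue
            raw = tar.extractfile(member).read()
            try:
                # Strip the leading "diffsynth-x.y.z/" directory so paths line up across versions.
                files[member.name.split("/", 1)[1]] = raw.decode("utf-8")
            except UnicodeDecodeError:
                pass  # skip binary members
    return files


old_files = fetch_sdist_files("diffsynth", "2.0.6")
new_files = fetch_sdist_files("diffsynth", "2.0.8")
for path in sorted(set(old_files) | set(new_files)):
    old_text, new_text = old_files.get(path, ""), new_files.get(path, "")
    if old_text == new_text:
        continue
    diff = difflib.unified_diff(
        old_text.splitlines(), new_text.splitlines(),
        fromfile=f"diffsynth-2.0.6/{path}", tofile=f"diffsynth-2.0.8/{path}", lineterm="",
    )
    print("\n".join(diff))
```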
Files changed (154)
  1. {diffsynth-2.0.6 → diffsynth-2.0.8}/PKG-INFO +1 -1
  2. {diffsynth-2.0.6 → diffsynth-2.0.8}/README.md +110 -39
  3. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/configs/model_configs.py +31 -1
  4. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/configs/vram_management_module_maps.py +12 -0
  5. diffsynth-2.0.8/diffsynth/core/gradient/gradient_checkpoint.py +65 -0
  6. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/base_pipeline.py +32 -0
  7. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/flow_match.py +15 -2
  8. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/runner.py +17 -1
  9. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/anima_dit.py +3 -0
  10. diffsynth-2.0.8/diffsynth/models/ernie_image_dit.py +362 -0
  11. diffsynth-2.0.8/diffsynth/models/ernie_image_text_encoder.py +76 -0
  12. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux2_dit.py +3 -0
  13. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_dit.py +3 -0
  14. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/ltx2_dit.py +1 -0
  15. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/qwen_image_dit.py +3 -0
  16. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_dit.py +145 -3
  17. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_vae.py +16 -0
  18. diffsynth-2.0.8/diffsynth/models/wantodance.py +209 -0
  19. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/z_image_dit.py +1 -0
  20. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/anima_image.py +1 -0
  21. diffsynth-2.0.8/diffsynth/pipelines/ernie_image.py +266 -0
  22. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/flux2_image.py +1 -0
  23. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/flux_image.py +1 -0
  24. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/ltx2_audio_video.py +1 -0
  25. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/mova_audio_video.py +1 -0
  26. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/qwen_image.py +1 -0
  27. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/wan_video.py +238 -19
  28. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/pipelines/z_image.py +1 -0
  29. diffsynth-2.0.8/diffsynth/utils/state_dict_converters/ernie_image_text_encoder.py +21 -0
  30. diffsynth-2.0.8/diffsynth/utils/state_dict_converters/z_image_dit.py +3 -0
  31. diffsynth-2.0.8/diffsynth/utils/xfuser/__init__.py +1 -0
  32. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/xfuser/xdit_context_parallel.py +33 -0
  33. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth.egg-info/PKG-INFO +1 -1
  34. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth.egg-info/SOURCES.txt +6 -0
  35. {diffsynth-2.0.6 → diffsynth-2.0.8}/pyproject.toml +1 -1
  36. diffsynth-2.0.6/diffsynth/core/gradient/gradient_checkpoint.py +0 -34
  37. diffsynth-2.0.6/diffsynth/utils/xfuser/__init__.py +0 -1
  38. {diffsynth-2.0.6 → diffsynth-2.0.8}/LICENSE +0 -0
  39. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/__init__.py +0 -0
  40. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/configs/__init__.py +0 -0
  41. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/__init__.py +0 -0
  42. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/attention/__init__.py +0 -0
  43. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/attention/attention.py +0 -0
  44. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/data/__init__.py +0 -0
  45. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/data/operators.py +0 -0
  46. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/data/unified_dataset.py +0 -0
  47. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/device/__init__.py +0 -0
  48. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/device/npu_compatible_device.py +0 -0
  49. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/gradient/__init__.py +0 -0
  50. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/loader/__init__.py +0 -0
  51. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/loader/config.py +0 -0
  52. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/loader/file.py +0 -0
  53. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/loader/model.py +0 -0
  54. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
  55. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/vram/__init__.py +0 -0
  56. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/vram/disk_map.py +0 -0
  57. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/vram/initialization.py +0 -0
  58. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/core/vram/layers.py +0 -0
  59. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/__init__.py +0 -0
  60. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/logger.py +0 -0
  61. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/loss.py +0 -0
  62. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/parsers.py +0 -0
  63. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/diffusion/training_module.py +0 -0
  64. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/dinov3_image_encoder.py +0 -0
  65. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux2_text_encoder.py +0 -0
  66. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux2_vae.py +0 -0
  67. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_controlnet.py +0 -0
  68. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_infiniteyou.py +0 -0
  69. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_ipadapter.py +0 -0
  70. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_lora_encoder.py +0 -0
  71. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_lora_patcher.py +0 -0
  72. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_text_encoder_clip.py +0 -0
  73. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_text_encoder_t5.py +0 -0
  74. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_vae.py +0 -0
  75. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/flux_value_control.py +0 -0
  76. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/general_modules.py +0 -0
  77. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/longcat_video_dit.py +0 -0
  78. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/ltx2_audio_vae.py +0 -0
  79. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/ltx2_common.py +0 -0
  80. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/ltx2_text_encoder.py +0 -0
  81. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/ltx2_upsampler.py +0 -0
  82. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/ltx2_video_vae.py +0 -0
  83. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/model_loader.py +0 -0
  84. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/mova_audio_dit.py +0 -0
  85. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/mova_audio_vae.py +0 -0
  86. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/mova_dual_tower_bridge.py +0 -0
  87. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/nexus_gen.py +0 -0
  88. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/nexus_gen_ar_model.py +0 -0
  89. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/nexus_gen_projector.py +0 -0
  90. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/qwen_image_controlnet.py +0 -0
  91. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/qwen_image_image2lora.py +0 -0
  92. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/qwen_image_text_encoder.py +0 -0
  93. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/qwen_image_vae.py +0 -0
  94. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/sd_text_encoder.py +0 -0
  95. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/siglip2_image_encoder.py +0 -0
  96. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/step1x_connector.py +0 -0
  97. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/step1x_text_encoder.py +0 -0
  98. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_animate_adapter.py +0 -0
  99. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_camera_controller.py +0 -0
  100. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_dit_s2v.py +0 -0
  101. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_image_encoder.py +0 -0
  102. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_mot.py +0 -0
  103. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_motion_controller.py +0 -0
  104. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_text_encoder.py +0 -0
  105. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wan_video_vace.py +0 -0
  106. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/wav2vec.py +0 -0
  107. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/z_image_controlnet.py +0 -0
  108. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/z_image_image2lora.py +0 -0
  109. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/models/z_image_text_encoder.py +0 -0
  110. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/controlnet/__init__.py +0 -0
  111. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/controlnet/annotator.py +0 -0
  112. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
  113. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/data/__init__.py +0 -0
  114. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/data/audio.py +0 -0
  115. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/data/audio_video.py +0 -0
  116. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/data/media_io_ltx2.py +0 -0
  117. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/lora/__init__.py +0 -0
  118. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/lora/flux.py +0 -0
  119. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/lora/general.py +0 -0
  120. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/lora/merge.py +0 -0
  121. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/lora/reset_rank.py +0 -0
  122. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/ses/__init__.py +0 -0
  123. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/ses/ses.py +0 -0
  124. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
  125. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
  126. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
  127. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
  128. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
  129. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
  130. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
  131. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
  132. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
  133. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
  134. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +0 -0
  135. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
  136. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +0 -0
  137. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +0 -0
  138. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
  139. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
  140. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
  141. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
  142. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
  143. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
  144. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
  145. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
  146. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
  147. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
  148. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
  149. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
  150. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth/version.py +0 -0
  151. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth.egg-info/dependency_links.txt +0 -0
  152. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth.egg-info/requires.txt +0 -0
  153. {diffsynth-2.0.6 → diffsynth-2.0.8}/diffsynth.egg-info/top_level.txt +0 -0
  154. {diffsynth-2.0.6 → diffsynth-2.0.8}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: diffsynth
- Version: 2.0.6
+ Version: 2.0.8
  Summary: Enjoy the magic of Diffusion models!
  Author: ModelScope Team
  License: Apache-2.0
@@ -7,6 +7,7 @@
  [![open issues](https://isitmaintained.com/badge/open/modelscope/DiffSynth-Studio.svg)](https://github.com/modelscope/DiffSynth-Studio/issues)
  [![GitHub pull-requests](https://img.shields.io/github/issues-pr/modelscope/DiffSynth-Studio.svg)](https://GitHub.com/modelscope/DiffSynth-Studio/pull/)
  [![GitHub latest commit](https://badgen.net/github/last-commit/modelscope/DiffSynth-Studio)](https://GitHub.com/modelscope/DiffSynth-Studio/commit/)
+ [![Discord](https://badgen.net//discord/members/Mm9suEeUDc)](https://discord.gg/Mm9suEeUDc)
 
  [切换到中文版](./README_zh.md)
 
@@ -31,8 +32,9 @@ We believe that a well-developed open-source code framework can lower the thresh
 
  > DiffSynth-Studio has undergone major version updates, and some old features are no longer maintained. If you need to use old features, please switch to the [last historical version](https://github.com/modelscope/DiffSynth-Studio/tree/afd101f3452c9ecae0c87b79adfa2e22d65ffdc3) before the major version update.
 
- > Currently, the development personnel of this project are limited, with most of the work handled by [Artiprocher](https://github.com/Artiprocher). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand.
- - **January 19, 2026**: Added support for [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) and [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/Wan.md) and [example code](/examples/mova/) are now available.
+ > Currently, the development personnel of this project are limited, with most of the work handled by [Artiprocher](https://github.com/Artiprocher) and [mi804](https://github.com/mi804). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand.
+
+ - **March 19, 2026**: Added support for [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) and [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/Wan.md) and [example code](/examples/mova/) are now available.
 
  - **March 12, 2026**: We have added support for the [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) audio-video generation model. The features includes text-to-audio/video, image-to-audio/video, IC-LoRA control, audio-to-video, and audio-video inpainting. We have supported the complete inference and training functionalities. For details, please refer to the [documentation](/docs/en/Model_Details/LTX-2.md) and [code](/examples/ltx2/).
 
@@ -40,6 +42,9 @@ We believe that a well-developed open-source code framework can lower the thresh
 
  - **March 2, 2026** Added support for [Anima](https://modelscope.cn/models/circlestone-labs/Anima). For details, please refer to the [documentation](docs/en/Model_Details/Anima.md). This is an interesting anime-style image generation model. We look forward to its future updates.
 
+ <details>
+ <summary>More</summary>
+
  - **February 26, 2026** Added full and lora training support for the LTX-2 audio-video generation model. See the [documentation](/docs/en/Model_Details/LTX-2.md) for details.
 
  - **February 10, 2026** Added inference support for the LTX-2 audio-video generation model. See the [documentation](/docs/en/Model_Details/LTX-2.md) for details. Support for model training will be implemented in the future.
@@ -67,9 +72,6 @@ We believe that a well-developed open-source code framework can lower the thresh
  - [Differential LoRA Training](/docs/zh/Training/Differential_LoRA.md): This is a training technique we used in [ArtAug](https://www.modelscope.cn/models/DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1), now available for LoRA training of any model.
  - [FP8 Training](/docs/zh/Training/FP8_Precision.md): FP8 can be applied to any non-training model during training, i.e., models with gradients turned off or gradients that only affect LoRA weights.
 
- <details>
- <summary>More</summary>
-
  - **November 4, 2025** Supported the [ByteDance/Video-As-Prompt-Wan2.1-14B](https://modelscope.cn/models/ByteDance/Video-As-Prompt-Wan2.1-14B) model, which is trained based on Wan 2.1 and supports generating corresponding actions based on reference videos.
 
  - **October 30, 2025** Supported the [meituan-longcat/LongCat-Video](https://www.modelscope.cn/models/meituan-longcat/LongCat-Video) model, which supports text-to-video, image-to-video, and video continuation. This model uses the Wan framework for inference and training in this project.
@@ -835,41 +837,104 @@ graph LR;
 
  Example code for Wan is available at: [/examples/wanvideo/](/examples/wanvideo/)
 
- | Model ID | Extra Args | Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
+ | Model ID | Extra Inputs | Inference | Low VRAM Inference | Full Training | Validation After Full Training | LoRA Training | Validation After LoRA Training |
+ |-|-|-|-|-|-|-|-|
+ |[Wan-AI/Wan2.1-T2V-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-T2V-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-T2V-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-T2V-1.3B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-T2V-1.3B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-1.3B.py)|
+ |[Wan-AI/Wan2.1-T2V-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-14B)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-T2V-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-T2V-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-T2V-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-T2V-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-14B.py)|
+ |[Wan-AI/Wan2.1-I2V-14B-480P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-480P)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-I2V-14B-480P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-I2V-14B-480P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-480P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-480P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-480P.py)|
+ |[Wan-AI/Wan2.1-I2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-I2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-I2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-720P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-720P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-720P.py)|
+ |[Wan-AI/Wan2.1-FLF2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-FLF2V-14B-720P)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-FLF2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-FLF2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-FLF2V-14B-720P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-FLF2V-14B-720P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py)|
+ |[iic/VACE-Wan2.1-1.3B-Preview](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview)|`vace_control_video`, `vace_reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B-Preview.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-VACE-1.3B-Preview.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B-Preview.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B-Preview.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py)|
+ |[Wan-AI/Wan2.1-VACE-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-1.3B)|`vace_control_video`, `vace_reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-VACE-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B.py)|
+ |[Wan-AI/Wan2.1-VACE-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B)|`vace_control_video`, `vace_reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-VACE-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-VACE-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-VACE-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-VACE-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-14B.py)|
+ |[PAI/Wan2.1-Fun-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py)|
+ |[PAI/Wan2.1-Fun-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control)|`control_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py)|
+ |[PAI/Wan2.1-Fun-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-InP.py)|
+ |[PAI/Wan2.1-Fun-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-Control)|`control_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-Control.py)|
+ |[PAI/Wan2.1-Fun-V1.1-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control)|`control_video`, `reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py)|
+ |[PAI/Wan2.1-Fun-V1.1-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control)|`control_video`, `reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py)|
+ |[PAI/Wan2.1-Fun-V1.1-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py)|
+ |[PAI/Wan2.1-Fun-V1.1-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py)|
+ |[PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera)|`control_camera_video`, `input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|
+ |[PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera)|`control_camera_video`, `input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|
+ |[DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1)|`motion_bucket_id`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-1.3b-speedcontrol-v1.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-1.3b-speedcontrol-v1.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py)|
+ |[krea/krea-realtime-video](https://www.modelscope.cn/models/krea/krea-realtime-video)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/krea-realtime-video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/krea-realtime-video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/krea-realtime-video.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/krea-realtime-video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/krea-realtime-video.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/krea-realtime-video.py)|
+ |[meituan-longcat/LongCat-Video](https://www.modelscope.cn/models/meituan-longcat/LongCat-Video)|`longcat_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/LongCat-Video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/LongCat-Video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/LongCat-Video.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/LongCat-Video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/LongCat-Video.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/LongCat-Video.py)|
+ |[ByteDance/Video-As-Prompt-Wan2.1-14B](https://modelscope.cn/models/ByteDance/Video-As-Prompt-Wan2.1-14B)|`vap_video`, `vap_prompt`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Video-As-Prompt-Wan2.1-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Video-As-Prompt-Wan2.1-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Video-As-Prompt-Wan2.1-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Video-As-Prompt-Wan2.1-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Video-As-Prompt-Wan2.1-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Video-As-Prompt-Wan2.1-14B.py)|
+ |[Wan-AI/Wan2.2-T2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-T2V-A14B)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-T2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-T2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-T2V-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-T2V-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py)|
+ |[Wan-AI/Wan2.2-I2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-I2V-A14B)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-I2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-I2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-I2V-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py)|
+ |[Wan-AI/Wan2.2-TI2V-5B](https://modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-TI2V-5B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-TI2V-5B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-TI2V-5B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-TI2V-5B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py)|
+ |[Wan-AI/Wan2.2-Animate-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-Animate-14B)|`input_image`, `animate_pose_video`, `animate_face_video`, `animate_inpaint_video`, `animate_mask_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Animate-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Animate-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py)|
+ |[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-S2V-14B_multi_clips.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-S2V-14B_multi_clips.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-S2V-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-S2V-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py)|
+ |[PAI/Wan2.2-VACE-Fun-A14B](https://www.modelscope.cn/models/PAI/Wan2.2-VACE-Fun-A14B)|`vace_control_video`, `vace_reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-VACE-Fun-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-VACE-Fun-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-VACE-Fun-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-VACE-Fun-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-VACE-Fun-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-VACE-Fun-A14B.py)|
+ |[PAI/Wan2.2-Fun-A14B-InP](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py)|
+ |[PAI/Wan2.2-Fun-A14B-Control](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control)|`control_video`, `reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py)|
+ |[PAI/Wan2.2-Fun-A14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control-Camera)|`control_camera_video`, `input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py)|
+ |[openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference/MOVA-360p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference_low_vram/MOVA-360p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/full/MOVA-360P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_full/MOVA-360p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/lora/MOVA-360P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_lora/MOVA-360p-I2AV.py)|
+ |[openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference/MOVA-720p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference_low_vram/MOVA-720p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/full/MOVA-720P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_full/MOVA-720p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/lora/MOVA-720P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_lora/MOVA-720p-I2AV.py)|
+ |[Wan-AI/WanToDance-14B (global model)](https://modelscope.cn/models/Wan-AI/WanToDance-14B)|`wantodance_music_path`, `wantodance_reference_image`, `wantodance_fps`, `wantodance_keyframes`, `wantodance_keyframes_mask`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/WanToDance-14B-global.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/WanToDance-14B-global.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/WanToDance-14B-global.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/WanToDance-14B-global.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/WanToDance-14B-global.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/WanToDance-14B-global.py)|
+ |[Wan-AI/WanToDance-14B (local model)](https://modelscope.cn/models/Wan-AI/WanToDance-14B)|`wantodance_music_path`, `wantodance_reference_image`, `wantodance_fps`, `wantodance_keyframes`, `wantodance_keyframes_mask`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/WanToDance-14B-local.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/WanToDance-14B-local.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/WanToDance-14B-local.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/WanToDance-14B-local.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/WanToDance-14B-local.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/WanToDance-14B-local.py)|
+
+ </details>
+
+ #### ERNIE-Image: [/docs/en/Model_Details/ERNIE-Image.md](/docs/en/Model_Details/ERNIE-Image.md)
+
+ <details>
+
+ <summary>Quick Start</summary>
+
+ Running the following code will quickly load the [PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.
+
+ ```python
+ from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig
+ import torch
+
+ vram_config = {
+     "offload_dtype": torch.bfloat16,
+     "offload_device": "cpu",
+     "onload_dtype": torch.bfloat16,
+     "onload_device": "cpu",
+     "preparing_dtype": torch.bfloat16,
+     "preparing_device": "cuda",
+     "computation_dtype": torch.bfloat16,
+     "computation_device": "cuda",
+ }
+ pipe = ErnieImagePipeline.from_pretrained(
+     torch_dtype=torch.bfloat16,
+     device='cuda',
+     model_configs=[
+         ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config),
+         ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
+         ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
+     ],
+     tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"),
+     vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+ )
+
+ image = pipe(
+     prompt="一只黑白相间的中华田园犬",
+     negative_prompt="",
+     height=1024,
+     width=1024,
+     seed=42,
+     num_inference_steps=50,
+     cfg_scale=4.0,
+ )
+ image.save("output.jpg")
+ ```
+
+ </details>
+
+ <details>
+
+ <summary>Examples</summary>
+
+ Example code for ERNIE-Image is available at: [/examples/ernie_image/](/examples/ernie_image/)
+
+ | Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
  |-|-|-|-|-|-|-|
- |[Wan-AI/Wan2.1-T2V-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B)||[code](/examples/wanvideo/model_inference/Wan2.1-T2V-1.3B.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-T2V-1.3B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-1.3B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-T2V-1.3B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-1.3B.py)|
- |[Wan-AI/Wan2.1-T2V-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-14B)||[code](/examples/wanvideo/model_inference/Wan2.1-T2V-14B.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-T2V-14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-T2V-14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-14B.py)|
- |[Wan-AI/Wan2.1-I2V-14B-480P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-480P)|`input_image`|[code](/examples/wanvideo/model_inference/Wan2.1-I2V-14B-480P.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-480P.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-480P.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-480P.py)|
- |[Wan-AI/Wan2.1-I2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P)|`input_image`|[code](/examples/wanvideo/model_inference/Wan2.1-I2V-14B-720P.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-720P.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-720P.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-720P.py)|
- |[Wan-AI/Wan2.1-FLF2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-FLF2V-14B-720P)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.1-FLF2V-14B-720P.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-FLF2V-14B-720P.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-FLF2V-14B-720P.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py)|
- |[iic/VACE-Wan2.1-1.3B-Preview](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview)|`vace_control_video`, `vace_reference_image`|[code](/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B-Preview.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B-Preview.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B-Preview.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py)|
- |[Wan-AI/Wan2.1-VACE-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-1.3B)|`vace_control_video`, `vace_reference_image`|[code](/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B.py)|
847
- |[Wan-AI/Wan2.1-VACE-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B)|`vace_control_video`, `vace_reference_image`|[code](/examples/wanvideo/model_inference/Wan2.1-VACE-14B.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-VACE-14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-VACE-14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-14B.py)|
848
- |[PAI/Wan2.1-Fun-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-InP)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-InP.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-InP.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-InP.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py)|
849
- |[PAI/Wan2.1-Fun-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control)|`control_video`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-Control.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-Control.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-Control.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py)|
850
- |[PAI/Wan2.1-Fun-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-InP)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-14B-InP.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-InP.py)|
851
- |[PAI/Wan2.1-Fun-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-Control)|`control_video`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-14B-Control.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-Control.py)|
852
- |[PAI/Wan2.1-Fun-V1.1-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control)|`control_video`, `reference_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py)|
853
- |[PAI/Wan2.1-Fun-V1.1-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control)|`control_video`, `reference_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py)|
854
- |[PAI/Wan2.1-Fun-V1.1-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-InP)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py)|
855
- |[PAI/Wan2.1-Fun-V1.1-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-InP)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-InP.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py)|
856
- |[PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera)|`control_camera_video`, `input_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|
857
- |[PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera)|`control_camera_video`, `input_image`|[code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|
858
- |[DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1)|`motion_bucket_id`|[code](/examples/wanvideo/model_inference/Wan2.1-1.3b-speedcontrol-v1.py)|[code](/examples/wanvideo/model_training/full/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py)|[code](/examples/wanvideo/model_training/lora/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py)|
859
- |[krea/krea-realtime-video](https://www.modelscope.cn/models/krea/krea-realtime-video)||[code](/examples/wanvideo/model_inference/krea-realtime-video.py)|[code](/examples/wanvideo/model_training/full/krea-realtime-video.sh)|[code](/examples/wanvideo/model_training/validate_full/krea-realtime-video.py)|[code](/examples/wanvideo/model_training/lora/krea-realtime-video.sh)|[code](/examples/wanvideo/model_training/validate_lora/krea-realtime-video.py)|
860
- |[meituan-longcat/LongCat-Video](https://www.modelscope.cn/models/meituan-longcat/LongCat-Video)|`longcat_video`|[code](/examples/wanvideo/model_inference/LongCat-Video.py)|[code](/examples/wanvideo/model_training/full/LongCat-Video.sh)|[code](/examples/wanvideo/model_training/validate_full/LongCat-Video.py)|[code](/examples/wanvideo/model_training/lora/LongCat-Video.sh)|[code](/examples/wanvideo/model_training/validate_lora/LongCat-Video.py)|
861
- |[ByteDance/Video-As-Prompt-Wan2.1-14B](https://modelscope.cn/models/ByteDance/Video-As-Prompt-Wan2.1-14B)|`vap_video`, `vap_prompt`|[code](/examples/wanvideo/model_inference/Video-As-Prompt-Wan2.1-14B.py)|[code](/examples/wanvideo/model_training/full/Video-As-Prompt-Wan2.1-14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Video-As-Prompt-Wan2.1-14B.py)|[code](/examples/wanvideo/model_training/lora/Video-As-Prompt-Wan2.1-14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Video-As-Prompt-Wan2.1-14B.py)|
862
- |[Wan-AI/Wan2.2-T2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-T2V-A14B)||[code](/examples/wanvideo/model_inference/Wan2.2-T2V-A14B.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-T2V-A14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-T2V-A14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py)|
863
- |[Wan-AI/Wan2.2-I2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-I2V-A14B)|`input_image`|[code](/examples/wanvideo/model_inference/Wan2.2-I2V-A14B.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-I2V-A14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py)|
864
- |[Wan-AI/Wan2.2-TI2V-5B](https://modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B)|`input_image`|[code](/examples/wanvideo/model_inference/Wan2.2-TI2V-5B.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-TI2V-5B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-TI2V-5B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py)|
865
- |[Wan-AI/Wan2.2-Animate-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-Animate-14B)|`input_image`, `animate_pose_video`, `animate_face_video`, `animate_inpaint_video`, `animate_mask_video`|[code](/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Animate-14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Animate-14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py)|
866
- |[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](/examples/wanvideo/model_inference/Wan2.2-S2V-14B_multi_clips.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-S2V-14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-S2V-14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py)|
867
- |[PAI/Wan2.2-VACE-Fun-A14B](https://www.modelscope.cn/models/PAI/Wan2.2-VACE-Fun-A14B)|`vace_control_video`, `vace_reference_image`|[code](/examples/wanvideo/model_inference/Wan2.2-VACE-Fun-A14B.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-VACE-Fun-A14B.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-VACE-Fun-A14B.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-VACE-Fun-A14B.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-VACE-Fun-A14B.py)|
868
- |[PAI/Wan2.2-Fun-A14B-InP](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-InP)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-InP.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py)|
869
- |[PAI/Wan2.2-Fun-A14B-Control](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control)|`control_video`, `reference_image`|[code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py)|
870
- |[PAI/Wan2.2-Fun-A14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control-Camera)|`control_camera_video`, `input_image`|[code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control-Camera.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py)|
871
- | [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) | `input_image` | [code](/examples/mova/model_inference/MOVA-360p-I2AV.py) | [code](/examples/mova/model_training/full/MOVA-360P-I2AV.sh) | [code](/examples/mova/model_training/validate_full/MOVA-360p-I2AV.py) | [code](/examples/mova/model_training/lora/MOVA-360P-I2AV.sh) | [code](/examples/mova/model_training/validate_lora/MOVA-360p-I2AV.py) |
872
- | [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) | `input_image` | [code](/examples/mova/model_inference/MOVA-720p-I2AV.py) | [code](/examples/mova/model_training/full/MOVA-720P-I2AV.sh) | [code](/examples/mova/model_training/validate_full/MOVA-720p-I2AV.py) | [code](/examples/mova/model_training/lora/MOVA-720P-I2AV.sh) | [code](/examples/mova/model_training/validate_lora/MOVA-720p-I2AV.py) |
936
+ |[PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image)|[code](/examples/ernie_image/model_inference/ERNIE-Image.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/full/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_full/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/lora/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py)|
937
+ |[PaddlePaddle/ERNIE-Image-Turbo](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image-Turbo)|[code](/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py)|—|—|—|—|
873
938
 
874
939
  </details>
875
940
 
@@ -1027,3 +1092,9 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-47
1027
1092
  https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-4481-b79f-0c3a7361a1ea
1028
1093
 
1029
1094
  </details>
1095
+
1096
+ ## Contact Us
1097
+
1098
+ |Discord: https://discord.gg/Mm9suEeUDc|
1099
+ |-|
1100
+ |<img width="160" height="160" alt="Image" src="https://github.com/user-attachments/assets/29bdc97b-e35d-4fea-88d6-32e35182e458" />|
@@ -307,6 +307,13 @@ wan_series = [
307
307
  "model_class": "diffsynth.models.wav2vec.WanS2VAudioEncoder",
308
308
  "state_dict_converter": "diffsynth.utils.state_dict_converters.wans2v_audio_encoder.WanS2VAudioEncoderStateDictConverter",
309
309
  },
310
+ {
311
+ # Example: ModelConfig(model_id="Wan-AI/WanToDance-14B", origin_file_pattern="global_model.safetensors")
312
+ "model_hash": "eb18873fc0ba77b541eb7b62dbcd2059",
313
+ "model_name": "wan_video_dit",
314
+ "model_class": "diffsynth.models.wan_video_dit.WanModel",
315
+ "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'wantodance_enable_music_inject': True, 'wantodance_music_inject_layers': [0, 4, 8, 12, 16, 20, 24, 27], 'wantodance_enable_refimage': True, 'has_ref_conv': True, 'wantodance_enable_refface': False, 'wantodance_enable_global': True, 'wantodance_enable_dynamicfps': True, 'wantodance_enable_unimodel': True}
316
+ },
310
317
  ]
311
318
 
312
319
  flux_series = [
@@ -534,6 +541,22 @@ flux2_series = [
534
541
  },
535
542
  ]
536
543
 
544
+ ernie_image_series = [
545
+ {
546
+ # Example: ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors")
547
+ "model_hash": "584c13713849f1af4e03d5f1858b8b7b",
548
+ "model_name": "ernie_image_dit",
549
+ "model_class": "diffsynth.models.ernie_image_dit.ErnieImageDiT",
550
+ },
551
+ {
552
+ # Example: ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors")
553
+ "model_hash": "404ed9f40796a38dd34c1620f1920207",
554
+ "model_name": "ernie_image_text_encoder",
555
+ "model_class": "diffsynth.models.ernie_image_text_encoder.ErnieImageTextEncoder",
556
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ernie_image_text_encoder.ErnieImageTextEncoderStateDictConverter",
557
+ },
558
+ ]
559
+
537
560
  z_image_series = [
538
561
  {
539
562
  # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="transformer/*.safetensors")
@@ -597,6 +620,13 @@ z_image_series = [
597
620
  "extra_kwargs": {"model_size": "0.6B"},
598
621
  "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_text_encoder.ZImageTextEncoderStateDictConverter",
599
622
  },
623
+ {
624
+ # To ensure compatibility with the `model.diffusion_model` prefix introduced by other frameworks.
625
+ "model_hash": "8cf241a0d32f93d5de368502a086852f",
626
+ "model_name": "z_image_dit",
627
+ "model_class": "diffsynth.models.z_image_dit.ZImageDiT",
628
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_dit.ZImageDiTStateDictConverter",
629
+ },
600
630
  ]
601
631
  """
602
632
  Official model repo: https://www.modelscope.cn/models/Lightricks/LTX-2
@@ -870,4 +900,4 @@ mova_series = [
870
900
  "model_class": "diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge",
871
901
  },
872
902
  ]
873
- MODEL_CONFIGS = qwen_image_series + wan_series + flux_series + flux2_series + z_image_series + ltx2_series + anima_series + mova_series
903
+ MODEL_CONFIGS = qwen_image_series + wan_series + flux_series + flux2_series + ernie_image_series + z_image_series + ltx2_series + anima_series + mova_series
@@ -267,6 +267,18 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
267
267
  "torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
268
268
  "torch.nn.ConvTranspose1d": "diffsynth.core.vram.layers.AutoWrappedModule",
269
269
  },
270
+ "diffsynth.models.ernie_image_dit.ErnieImageDiT": {
271
+ "diffsynth.models.ernie_image_dit.ErnieImageRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
272
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
273
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
274
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
275
+ "torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
276
+ },
277
+ "diffsynth.models.ernie_image_text_encoder.ErnieImageTextEncoder": {
278
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
279
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
280
+ "transformers.models.ministral3.modeling_ministral3.Ministral3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
281
+ },
270
282
  }
271
283
 
272
284
  def QwenImageTextEncoder_Module_Map_Updater():
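The `ErnieImageDiT` and `ErnieImageTextEncoder` maps added above tell the VRAM manager which wrapper class to substitute for each layer type so weights can live off the GPU between calls. The snippet below is a deliberately simplified illustration of that idea; it is not DiffSynth's `AutoWrappedLinear` implementation, just a toy model of the wrap-and-offload pattern.

```python
import torch

class OffloadedLinear(torch.nn.Module):
    """Toy stand-in for an auto-wrapped linear layer: parameters stay on the
    offload device and are moved to the compute device only for the forward pass."""

    def __init__(self, linear: torch.nn.Linear, compute_device="cuda", offload_device="cpu"):
        super().__init__()
        self.linear = linear.to(offload_device)
        self.compute_device = compute_device
        self.offload_device = offload_device

    def forward(self, x):
        self.linear.to(self.compute_device)   # upload weights just in time
        y = self.linear(x.to(self.compute_device))
        self.linear.to(self.offload_device)   # release VRAM again
        return y

# Usage: wrap an existing layer; forward passes now pay a transfer cost instead of holding VRAM.
layer = OffloadedLinear(torch.nn.Linear(4096, 4096))
```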
@@ -0,0 +1,65 @@
1
+ import torch
2
+
3
+
4
+ try:
5
+ import deepspeed
6
+ _HAS_DEEPSPEED = True
7
+ except ModuleNotFoundError:
8
+ _HAS_DEEPSPEED = False
9
+
10
+
11
+ def create_custom_forward(module):
12
+ def custom_forward(*inputs, **kwargs):
13
+ return module(*inputs, **kwargs)
14
+ return custom_forward
15
+
16
+
17
+ def create_custom_forward_use_reentrant(module):
18
+ def custom_forward(*inputs):
19
+ return module(*inputs)
20
+ return custom_forward
21
+
22
+
23
+ def judge_args_requires_grad(*args):
24
+ for arg in args:
25
+ if isinstance(arg, torch.Tensor) and arg.requires_grad:
26
+ return True
27
+ return False
28
+
29
+
30
+ def gradient_checkpoint_forward(
31
+ model,
32
+ use_gradient_checkpointing,
33
+ use_gradient_checkpointing_offload,
34
+ *args,
35
+ **kwargs,
36
+ ):
37
+ if use_gradient_checkpointing and _HAS_DEEPSPEED and deepspeed.checkpointing.is_configured():
38
+ all_args = args + tuple(kwargs.values())
39
+ if not judge_args_requires_grad(*all_args):
40
+ # get the first grad_enabled tensor from un_checkpointed forward
41
+ model_output = model(*args, **kwargs)
42
+ else:
43
+ model_output = deepspeed.checkpointing.checkpoint(
44
+ create_custom_forward_use_reentrant(model),
45
+ *all_args,
46
+ )
47
+ return model_output
48
+ if use_gradient_checkpointing_offload:
49
+ with torch.autograd.graph.save_on_cpu():
50
+ model_output = torch.utils.checkpoint.checkpoint(
51
+ create_custom_forward(model),
52
+ *args,
53
+ **kwargs,
54
+ use_reentrant=False,
55
+ )
56
+ elif use_gradient_checkpointing:
57
+ model_output = torch.utils.checkpoint.checkpoint(
58
+ create_custom_forward(model),
59
+ *args,
60
+ **kwargs,
61
+ use_reentrant=False,
62
+ )
63
+ else:
64
+ model_output = model(*args, **kwargs)
65
+ return model_output
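A minimal sketch of how a block-structured model might route its per-block calls through `gradient_checkpoint_forward` during training. The `ToyBlock`/`ToyModel` classes and the import path are assumptions for illustration; only the helper's signature and behavior come from the file above.

```python
import torch
from diffsynth.core.gradient.gradient_checkpoint import gradient_checkpoint_forward  # assumed module path

class ToyBlock(torch.nn.Module):
    def __init__(self, dim=64):
        super().__init__()
        self.ff = torch.nn.Linear(dim, dim)

    def forward(self, x):
        return x + torch.relu(self.ff(x))

class ToyModel(torch.nn.Module):
    def __init__(self, num_blocks=4, dim=64):
        super().__init__()
        self.blocks = torch.nn.ModuleList(ToyBlock(dim) for _ in range(num_blocks))

    def forward(self, x, use_gradient_checkpointing=True, use_gradient_checkpointing_offload=False):
        for block in self.blocks:
            # Each block is recomputed during backward instead of storing activations;
            # with the offload flag, saved tensors are kept on CPU via save_on_cpu().
            x = gradient_checkpoint_forward(
                block,
                use_gradient_checkpointing,
                use_gradient_checkpointing_offload,
                x,
            )
        return x

model = ToyModel().train()
x = torch.randn(2, 16, 64, requires_grad=True)
model(x).sum().backward()
```

When DeepSpeed activation checkpointing has been configured (see the runner changes later in this diff), the same call transparently switches to `deepspeed.checkpointing.checkpoint`.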
@@ -339,6 +339,38 @@ class BasePipeline(torch.nn.Module):
339
339
  noise_pred = noise_pred_posi
340
340
  return noise_pred
341
341
 
342
+ def compile_pipeline(self, mode: str = "default", dynamic: bool = True, fullgraph: bool = False, compile_models: list = None, **kwargs):
343
+ """
344
+ Compile the pipeline with torch.compile. The models to be compiled are determined by the pipeline's `compilable_models` attribute.
345
+ If a model has a `_repeated_blocks` attribute, its repeated blocks are compiled individually (regional compilation); otherwise, the whole model is compiled.
346
+ See https://docs.pytorch.org/docs/stable/generated/torch.compile.html#torch.compile for details about compilation arguments.
347
+ Args:
348
+ mode: The compilation mode passed to `torch.compile`; options are "default", "reduce-overhead", "max-autotune", and "max-autotune-no-cudagraphs". Defaults to "default".
349
+ dynamic: Whether to enable dynamic graph compilation to support dynamic input shapes, passed to `torch.compile`. Defaults to True (recommended).
350
+ fullgraph: Whether to use full graph compilation, passed to `torch.compile`. Defaults to False (recommended).
351
+ compile_models: The list of model names to compile. If None, the models in `pipeline.compilable_models` are compiled. Defaults to None.
352
+ **kwargs: Other arguments for `torch.compile`.
353
+ """
354
+ compile_models = compile_models or getattr(self, "compilable_models", [])
355
+ if len(compile_models) == 0:
356
+ print("No compilable models in the pipeline. Skip compilation.")
357
+ return
358
+ for name in compile_models:
359
+ model = getattr(self, name, None)
360
+ if model is None:
361
+ print(f"Model '{name}' not found in the pipeline.")
362
+ continue
363
+ repeated_blocks = getattr(model, "_repeated_blocks", None)
364
+ # regional compilation for repeated blocks.
365
+ if repeated_blocks is not None:
366
+ for submod in model.modules():
367
+ if submod.__class__.__name__ in repeated_blocks:
368
+ submod.compile(mode=mode, dynamic=dynamic, fullgraph=fullgraph, **kwargs)
369
+ # compile the whole model.
370
+ else:
371
+ model.compile(mode=mode, dynamic=dynamic, fullgraph=fullgraph, **kwargs)
372
+ print(f"{name} is compiled with mode={mode}, dynamic={dynamic}, fullgraph={fullgraph}.")
373
+
342
374
 
343
375
  class PipelineUnitGraph:
344
376
  def __init__(self):
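A short usage sketch for the new `compile_pipeline` method. Pipeline construction is omitted, and the `"dit"` entry in `compile_models` is a hypothetical attribute name; the keyword arguments themselves come from the method above.

```python
# `pipe` is any pipeline built on BasePipeline (construction omitted here).

# Compile every model listed in pipe.compilable_models with the default settings.
pipe.compile_pipeline()

# Or restrict compilation to specific sub-models and pick a more aggressive mode.
pipe.compile_pipeline(
    mode="max-autotune-no-cudagraphs",
    dynamic=True,            # keep dynamic shapes enabled (recommended default)
    fullgraph=False,         # allow graph breaks (recommended default)
    compile_models=["dit"],  # hypothetical model attribute name
)
```

Models that define `_repeated_blocks` (for example, the `_repeated_blocks = ["Block"]` added to `AnimaDiT` later in this diff) are compiled block by block rather than as one graph, which keeps compile times and recompilations down.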
@@ -4,7 +4,7 @@ from typing_extensions import Literal
4
4
 
5
5
  class FlowMatchScheduler():
6
6
 
7
- def __init__(self, template: Literal["FLUX.1", "Wan", "Qwen-Image", "FLUX.2", "Z-Image", "LTX-2", "Qwen-Image-Lightning"] = "FLUX.1"):
7
+ def __init__(self, template: Literal["FLUX.1", "Wan", "Qwen-Image", "FLUX.2", "Z-Image", "LTX-2", "Qwen-Image-Lightning", "ERNIE-Image"] = "FLUX.1"):
8
8
  self.set_timesteps_fn = {
9
9
  "FLUX.1": FlowMatchScheduler.set_timesteps_flux,
10
10
  "Wan": FlowMatchScheduler.set_timesteps_wan,
@@ -13,6 +13,7 @@ class FlowMatchScheduler():
13
13
  "Z-Image": FlowMatchScheduler.set_timesteps_z_image,
14
14
  "LTX-2": FlowMatchScheduler.set_timesteps_ltx2,
15
15
  "Qwen-Image-Lightning": FlowMatchScheduler.set_timesteps_qwen_image_lightning,
16
+ "ERNIE-Image": FlowMatchScheduler.set_timesteps_ernie_image,
16
17
  }.get(template, FlowMatchScheduler.set_timesteps_flux)
17
18
  self.num_train_timesteps = 1000
18
19
 
@@ -129,6 +130,18 @@ class FlowMatchScheduler():
129
130
  timesteps = sigmas * num_train_timesteps
130
131
  return sigmas, timesteps
131
132
 
133
+ @staticmethod
134
+ def set_timesteps_ernie_image(num_inference_steps=50, denoising_strength=1.0, shift=3.0):
135
+ sigma_min = 0.0
136
+ sigma_max = 1.0
137
+ num_train_timesteps = 1000
138
+ sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
139
+ sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
140
+ if shift is not None and shift != 1.0:
141
+ sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
142
+ timesteps = sigmas * num_train_timesteps
143
+ return sigmas, timesteps
144
+
132
145
  @staticmethod
133
146
  def set_timesteps_z_image(num_inference_steps=100, denoising_strength=1.0, shift=None, target_timesteps=None):
134
147
  sigma_min = 0.0
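To make the shift formula in `set_timesteps_ernie_image` above concrete, here is a small self-contained reproduction showing the first few sigmas for the default `shift=3.0`. The printed values are approximate; the method above remains the authoritative schedule.

```python
import torch

def ernie_image_sigmas(num_inference_steps=50, denoising_strength=1.0, shift=3.0):
    # Mirrors set_timesteps_ernie_image: linear sigmas from denoising_strength down to 0,
    # then the flow-matching shift sigma' = shift * sigma / (1 + (shift - 1) * sigma).
    sigma_start = denoising_strength  # sigma_min = 0.0, sigma_max = 1.0
    sigmas = torch.linspace(sigma_start, 0.0, num_inference_steps + 1)[:-1]
    if shift is not None and shift != 1.0:
        sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
    timesteps = sigmas * 1000  # num_train_timesteps = 1000
    return sigmas, timesteps

sigmas, timesteps = ernie_image_sigmas(num_inference_steps=5)
print(sigmas)     # approx. [1.000, 0.923, 0.818, 0.667, 0.429]
print(timesteps)  # approx. [1000.0, 923.1, 818.2, 666.7, 428.6]
```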
@@ -185,7 +198,7 @@ class FlowMatchScheduler():
185
198
  bsmntw_weighing = bsmntw_weighing * (len(self.timesteps) / steps)
186
199
  bsmntw_weighing = bsmntw_weighing + bsmntw_weighing[1]
187
200
  self.linear_timesteps_weights = bsmntw_weighing
188
-
201
+
189
202
  def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, training=False, **kwargs):
190
203
  self.sigmas, self.timesteps = self.set_timesteps_fn(
191
204
  num_inference_steps=num_inference_steps,
@@ -29,7 +29,7 @@ def launch_training_task(
29
29
  dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, collate_fn=lambda x: x[0], num_workers=num_workers)
30
30
  model.to(device=accelerator.device)
31
31
  model, optimizer, dataloader, scheduler = accelerator.prepare(model, optimizer, dataloader, scheduler)
32
-
32
+ initialize_deepspeed_gradient_checkpointing(accelerator)
33
33
  for epoch_id in range(num_epochs):
34
34
  for data in tqdm(dataloader):
35
35
  with accelerator.accumulate(model):
@@ -70,3 +70,19 @@ def launch_data_process_task(
70
70
  save_path = os.path.join(model_logger.output_path, str(accelerator.process_index), f"{data_id}.pth")
71
71
  data = model(data)
72
72
  torch.save(data, save_path)
73
+
74
+
75
+ def initialize_deepspeed_gradient_checkpointing(accelerator: Accelerator):
76
+ if getattr(accelerator.state, "deepspeed_plugin", None) is not None:
77
+ ds_config = accelerator.state.deepspeed_plugin.deepspeed_config
78
+ if "activation_checkpointing" in ds_config:
79
+ import deepspeed
80
+ act_config = ds_config["activation_checkpointing"]
81
+ deepspeed.checkpointing.configure(
82
+ mpu_=None,
83
+ partition_activations=act_config.get("partition_activations", False),
84
+ checkpoint_in_cpu=act_config.get("cpu_checkpointing", False),
85
+ contiguous_checkpointing=act_config.get("contiguous_memory_optimization", False)
86
+ )
87
+ else:
88
+ print("Do not find activation_checkpointing config in deepspeed config, skip initializing deepspeed gradient checkpointing.")
@@ -1270,6 +1270,9 @@ class LLMAdapter(nn.Module):
1270
1270
 
1271
1271
 
1272
1272
  class AnimaDiT(MiniTrainDIT):
1273
+
1274
+ _repeated_blocks = ["Block"]
1275
+
1273
1276
  def __init__(self):
1274
1277
  kwargs = {'image_model': 'anima', 'max_img_h': 240, 'max_img_w': 240, 'max_frames': 128, 'in_channels': 16, 'out_channels': 16, 'patch_spatial': 2, 'patch_temporal': 1, 'model_channels': 2048, 'concat_padding_mask': True, 'crossattn_emb_channels': 1024, 'pos_emb_cls': 'rope3d', 'pos_emb_learnable': True, 'pos_emb_interpolation': 'crop', 'min_fps': 1, 'max_fps': 30, 'use_adaln_lora': True, 'adaln_lora_dim': 256, 'num_blocks': 28, 'num_heads': 16, 'extra_per_block_abs_pos_emb': False, 'rope_h_extrapolation_ratio': 4.0, 'rope_w_extrapolation_ratio': 4.0, 'rope_t_extrapolation_ratio': 1.0, 'extra_h_extrapolation_ratio': 1.0, 'extra_w_extrapolation_ratio': 1.0, 'extra_t_extrapolation_ratio': 1.0, 'rope_enable_fps_modulation': False, 'dtype': torch.bfloat16, 'device': None, 'operations': torch.nn}
1275
1278
  super().__init__(**kwargs)