diffsynth 2.0.9__tar.gz → 2.0.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179)
  1. {diffsynth-2.0.9 → diffsynth-2.0.11}/PKG-INFO +1 -1
  2. {diffsynth-2.0.9 → diffsynth-2.0.11}/README.md +327 -22
  3. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/configs/model_configs.py +144 -1
  4. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/configs/vram_management_module_maps.py +93 -0
  5. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/data/operators.py +25 -0
  6. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/base_pipeline.py +38 -5
  7. diffsynth-2.0.11/diffsynth/diffusion/ddim_scheduler.py +107 -0
  8. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/flow_match.py +24 -1
  9. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/loss.py +5 -0
  10. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/parsers.py +6 -0
  11. diffsynth-2.0.11/diffsynth/diffusion/template.py +203 -0
  12. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/training_module.py +67 -8
  13. diffsynth-2.0.11/diffsynth/models/ace_step_conditioner.py +695 -0
  14. diffsynth-2.0.11/diffsynth/models/ace_step_dit.py +901 -0
  15. diffsynth-2.0.11/diffsynth/models/ace_step_text_encoder.py +53 -0
  16. diffsynth-2.0.11/diffsynth/models/ace_step_tokenizer.py +722 -0
  17. diffsynth-2.0.11/diffsynth/models/ace_step_vae.py +281 -0
  18. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/dinov3_image_encoder.py +11 -7
  19. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux2_dit.py +82 -137
  20. diffsynth-2.0.11/diffsynth/models/joyai_image_dit.py +636 -0
  21. diffsynth-2.0.11/diffsynth/models/joyai_image_text_encoder.py +82 -0
  22. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/siglip2_image_encoder.py +13 -7
  23. diffsynth-2.0.11/diffsynth/models/stable_diffusion_text_encoder.py +216 -0
  24. diffsynth-2.0.11/diffsynth/models/stable_diffusion_unet.py +912 -0
  25. diffsynth-2.0.11/diffsynth/models/stable_diffusion_vae.py +642 -0
  26. diffsynth-2.0.11/diffsynth/models/stable_diffusion_xl_text_encoder.py +69 -0
  27. diffsynth-2.0.11/diffsynth/models/stable_diffusion_xl_unet.py +922 -0
  28. diffsynth-2.0.11/diffsynth/pipelines/ace_step.py +582 -0
  29. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/anima_image.py +1 -1
  30. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/flux2_image.py +51 -2
  31. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/flux_image.py +1 -6
  32. diffsynth-2.0.11/diffsynth/pipelines/joyai_image.py +282 -0
  33. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/ltx2_audio_video.py +29 -29
  34. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/mova_audio_video.py +18 -18
  35. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/qwen_image.py +1 -1
  36. diffsynth-2.0.11/diffsynth/pipelines/stable_diffusion.py +230 -0
  37. diffsynth-2.0.11/diffsynth/pipelines/stable_diffusion_xl.py +331 -0
  38. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/wan_video.py +54 -54
  39. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/z_image.py +2 -2
  40. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/data/audio.py +1 -0
  41. diffsynth-2.0.11/diffsynth/utils/state_dict_converters/ace_step_conditioner.py +13 -0
  42. diffsynth-2.0.11/diffsynth/utils/state_dict_converters/ace_step_dit.py +10 -0
  43. diffsynth-2.0.11/diffsynth/utils/state_dict_converters/ace_step_text_encoder.py +15 -0
  44. diffsynth-2.0.11/diffsynth/utils/state_dict_converters/ace_step_tokenizer.py +8 -0
  45. diffsynth-2.0.11/diffsynth/utils/state_dict_converters/dino_v3.py +9 -0
  46. diffsynth-2.0.11/diffsynth/utils/state_dict_converters/joyai_image_text_encoder.py +20 -0
  47. diffsynth-2.0.11/diffsynth/utils/state_dict_converters/stable_diffusion_text_encoder.py +7 -0
  48. diffsynth-2.0.11/diffsynth/utils/state_dict_converters/stable_diffusion_vae.py +18 -0
  49. diffsynth-2.0.11/diffsynth/utils/state_dict_converters/stable_diffusion_xl_text_encoder.py +13 -0
  50. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth.egg-info/PKG-INFO +1 -1
  51. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth.egg-info/SOURCES.txt +27 -0
  52. {diffsynth-2.0.9 → diffsynth-2.0.11}/pyproject.toml +1 -1
  53. {diffsynth-2.0.9 → diffsynth-2.0.11}/LICENSE +0 -0
  54. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/__init__.py +0 -0
  55. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/configs/__init__.py +0 -0
  56. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/__init__.py +0 -0
  57. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/attention/__init__.py +0 -0
  58. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/attention/attention.py +0 -0
  59. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/data/__init__.py +0 -0
  60. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/data/unified_dataset.py +0 -0
  61. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/device/__init__.py +0 -0
  62. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/device/npu_compatible_device.py +0 -0
  63. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/gradient/__init__.py +0 -0
  64. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/gradient/gradient_checkpoint.py +0 -0
  65. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/loader/__init__.py +0 -0
  66. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/loader/config.py +0 -0
  67. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/loader/file.py +0 -0
  68. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/loader/model.py +0 -0
  69. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
  70. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/vram/__init__.py +0 -0
  71. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/vram/disk_map.py +0 -0
  72. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/vram/initialization.py +0 -0
  73. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/vram/layers.py +0 -0
  74. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/__init__.py +0 -0
  75. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/logger.py +0 -0
  76. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/runner.py +2 -2
  77. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/anima_dit.py +0 -0
  78. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ernie_image_dit.py +0 -0
  79. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ernie_image_text_encoder.py +0 -0
  80. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux2_text_encoder.py +0 -0
  81. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux2_vae.py +0 -0
  82. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_controlnet.py +0 -0
  83. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_dit.py +0 -0
  84. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_infiniteyou.py +0 -0
  85. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_ipadapter.py +0 -0
  86. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_lora_encoder.py +0 -0
  87. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_lora_patcher.py +0 -0
  88. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_text_encoder_clip.py +0 -0
  89. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_text_encoder_t5.py +0 -0
  90. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_vae.py +0 -0
  91. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_value_control.py +0 -0
  92. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/general_modules.py +0 -0
  93. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/longcat_video_dit.py +0 -0
  94. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ltx2_audio_vae.py +0 -0
  95. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ltx2_common.py +0 -0
  96. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ltx2_dit.py +0 -0
  97. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ltx2_text_encoder.py +0 -0
  98. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ltx2_upsampler.py +0 -0
  99. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ltx2_video_vae.py +0 -0
  100. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/model_loader.py +0 -0
  101. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/mova_audio_dit.py +0 -0
  102. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/mova_audio_vae.py +0 -0
  103. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/mova_dual_tower_bridge.py +0 -0
  104. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/nexus_gen.py +0 -0
  105. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/nexus_gen_ar_model.py +0 -0
  106. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/nexus_gen_projector.py +0 -0
  107. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/qwen_image_controlnet.py +0 -0
  108. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/qwen_image_dit.py +0 -0
  109. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/qwen_image_image2lora.py +0 -0
  110. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/qwen_image_text_encoder.py +0 -0
  111. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/qwen_image_vae.py +0 -0
  112. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/sd_text_encoder.py +0 -0
  113. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/step1x_connector.py +0 -0
  114. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/step1x_text_encoder.py +0 -0
  115. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_animate_adapter.py +0 -0
  116. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_camera_controller.py +0 -0
  117. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_dit.py +0 -0
  118. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_dit_s2v.py +0 -0
  119. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_image_encoder.py +0 -0
  120. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_mot.py +0 -0
  121. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_motion_controller.py +0 -0
  122. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_text_encoder.py +0 -0
  123. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_vace.py +0 -0
  124. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_vae.py +0 -0
  125. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wantodance.py +0 -0
  126. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wav2vec.py +0 -0
  127. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/z_image_controlnet.py +0 -0
  128. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/z_image_dit.py +0 -0
  129. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/z_image_image2lora.py +0 -0
  130. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/z_image_text_encoder.py +0 -0
  131. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/ernie_image.py +0 -0
  132. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/controlnet/__init__.py +0 -0
  133. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/controlnet/annotator.py +0 -0
  134. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
  135. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/data/__init__.py +0 -0
  136. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/data/audio_video.py +0 -0
  137. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/data/media_io_ltx2.py +0 -0
  138. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/lora/__init__.py +0 -0
  139. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/lora/flux.py +0 -0
  140. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/lora/general.py +0 -0
  141. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/lora/merge.py +0 -0
  142. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/lora/reset_rank.py +0 -0
  143. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/ses/__init__.py +0 -0
  144. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/ses/ses.py +0 -0
  145. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
  146. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
  147. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/ernie_image_text_encoder.py +0 -0
  148. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
  149. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
  150. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
  151. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
  152. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
  153. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
  154. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
  155. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
  156. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +0 -0
  157. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
  158. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +0 -0
  159. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +0 -0
  160. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
  161. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
  162. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
  163. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
  164. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
  165. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
  166. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
  167. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
  168. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
  169. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
  170. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
  171. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/z_image_dit.py +0 -0
  172. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
  173. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/xfuser/__init__.py +0 -0
  174. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/xfuser/xdit_context_parallel.py +0 -0
  175. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/version.py +0 -0
  176. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth.egg-info/dependency_links.txt +0 -0
  177. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth.egg-info/requires.txt +0 -0
  178. {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth.egg-info/top_level.txt +0 -0
  179. {diffsynth-2.0.9 → diffsynth-2.0.11}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: diffsynth
- Version: 2.0.9
+ Version: 2.0.11
  Summary: Enjoy the magic of Diffusion models!
  Author: ModelScope Team
  License: Apache-2.0
@@ -34,6 +34,21 @@ We believe that a well-developed open-source code framework can lower the thresh

  > Currently, the development personnel of this project are limited, with most of the work handled by [Artiprocher](https://github.com/Artiprocher) and [mi804](https://github.com/mi804). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand.

+ - **April 28, 2026** 🔥 We are excited to announce the release of **Diffusion Templates**, a plugin framework designed for Diffusion models that significantly lowers the barrier to training controllable generative models. Let's explore this cutting-edge technology together!
+   * Open-source code: [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio)
+   * Technical report: [arXiv](https://arxiv.org/abs/2604.24351)
+   * Project homepage: [GitHub](https://modelscope.github.io/diffusion-templates-web/)
+   * Documentation: [English Version](https://diffsynth-studio-doc.readthedocs.io/en/latest/Diffusion_Templates/Introducing_Diffusion_Templates.html) | [Chinese Version](https://diffsynth-studio-doc.readthedocs.io/zh-cn/latest/Diffusion_Templates/Introducing_Diffusion_Templates.html)
+   * Online demo: [ModelScope](https://modelscope.cn/studios/DiffSynth-Studio/Diffusion-Templates)
+   * Model collections: [ModelScope](https://modelscope.cn/collections/DiffSynth-Studio/KleinBase4B-Templates) | [ModelScope International](https://modelscope.ai/collections/DiffSynth-Studio/KleinBase4B-Templates) | [HuggingFace](https://huggingface.co/collections/DiffSynth-Studio/kleinbase4b-templates)
+   * Datasets: [ModelScope](https://modelscope.cn/collections/DiffSynth-Studio/ImagePulseV2) | [ModelScope International](https://modelscope.ai/collections/DiffSynth-Studio/ImagePulseV2) | [HuggingFace](https://huggingface.co/collections/DiffSynth-Studio/imagepulsev2)
+
+ - **April 27, 2026** We support ACE-Step-1.5! Support includes text-to-music generation, low VRAM inference, and LoRA training capabilities. For details, please refer to the [documentation](/docs/en/Model_Details/ACE-Step.md) and [example code](/examples/ace_step/).
+
+ - **April 27, 2026**: We have reinstated support for the Stable Diffusion v1.5 and SDXL models, providing academic research support exclusively for these two model types.
+
+ - **April 14, 2026** JoyAI-Image open-sourced, welcome a new member to the image editing model family! Support includes instruction-guided image editing, low VRAM inference, and training capabilities. For details, please refer to the [documentation](/docs/en/Model_Details/JoyAI-Image.md) and [example code](/examples/joyai_image/).
+
  - **March 19, 2026**: Added support for [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) and [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/Wan.md) and [example code](/examples/mova/) are now available.

  - **March 12, 2026**: We have added support for the [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) audio-video generation model. The features include text-to-audio/video, image-to-audio/video, IC-LoRA control, audio-to-video, and audio-video inpainting. We support the complete inference and training functionalities. For details, please refer to the [documentation](/docs/en/Model_Details/LTX-2.md) and [code](/examples/ltx2/).
@@ -90,7 +105,7 @@ We believe that a well-developed open-source code framework can lower the thresh

  - **August 20, 2025** We open-sourced the [DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix) model, improving the editing effect of Qwen-Image-Edit on low-resolution image inputs. Please refer to [our sample code](./examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py)

- - **August 19, 2025** 🔥 Qwen-Image-Edit open-sourced, welcome a new member to the image editing model family!
+ - **August 19, 2025** Qwen-Image-Edit open-sourced, welcome a new member to the image editing model family!

  - **August 18, 2025** We trained and open-sourced the Qwen-Image inpainting ControlNet model [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint). The model structure adopts a lightweight design. Please refer to [our sample code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Inpaint.py).

@@ -106,7 +121,7 @@ We believe that a well-developed open-source code framework can lower the thresh

  - **August 5, 2025** We open-sourced the distilled acceleration model [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full) for Qwen-Image, achieving approximately 5x acceleration.

- - **August 4, 2025** 🔥 Qwen-Image open-sourced, welcome a new member to the image generation model family!
+ - **August 4, 2025** Qwen-Image open-sourced, welcome a new member to the image generation model family!

  - **August 1, 2025** [FLUX.1-Krea-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-Krea-dev) open-sourced, a text-to-image model focused on aesthetic photography. We provided comprehensive support in a timely manner, including low VRAM layer-by-layer offload, LoRA training, and full training. For more details, please refer to [./examples/flux/](./examples/flux/).

@@ -297,6 +312,129 @@ Example code for Z-Image is available at: [/examples/z_image/](/examples/z_image

  </details>

+ #### Stable Diffusion: [/docs/en/Model_Details/Stable-Diffusion.md](/docs/en/Model_Details/Stable-Diffusion.md)
+
+ <details>
+
+ <summary>Quick Start</summary>
+
+ Running the following code will quickly load the [AI-ModelScope/stable-diffusion-v1-5](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5) model for inference. VRAM management is enabled, and the framework automatically controls parameter loading based on available VRAM, requiring a minimum of 2GB VRAM.
+
+ ```python
+ import torch
+ from diffsynth.core import ModelConfig
+ from diffsynth.pipelines.stable_diffusion import StableDiffusionPipeline
+
+ vram_config = {
+     "offload_dtype": torch.float32,
+     "offload_device": "cpu",
+     "onload_dtype": torch.float32,
+     "onload_device": "cpu",
+     "preparing_dtype": torch.float32,
+     "preparing_device": "cuda",
+     "computation_dtype": torch.float32,
+     "computation_device": "cuda",
+ }
+ pipe = StableDiffusionPipeline.from_pretrained(
+     torch_dtype=torch.float32,
+     model_configs=[
+         ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
+         ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="unet/diffusion_pytorch_model.safetensors", **vram_config),
+         ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
+     ],
+     tokenizer_config=ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="tokenizer/"),
+     vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+ )
+
+ image = pipe(
+     prompt="a photo of an astronaut riding a horse on mars, high quality, detailed",
+     negative_prompt="blurry, low quality, deformed",
+     cfg_scale=7.5,
+     height=512,
+     width=512,
+     seed=42,
+     rand_device="cuda",
+     num_inference_steps=50,
+ )
+ image.save("image.jpg")
+ ```
+
+ </details>
+
+ <details>
+
+ <summary>Examples</summary>
+
+ Example code for Stable Diffusion is available at: [/examples/stable_diffusion/](/examples/stable_diffusion/)
+
+ |Model ID|Inference|Low VRAM Inference|Full Training|Full Training Validation|LoRA Training|LoRA Training Validation|
+ |-|-|-|-|-|-|-|
+ |[AI-ModelScope/stable-diffusion-v1-5](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5)|[code](/examples/stable_diffusion/model_inference/stable-diffusion-v1-5.py)|[code](/examples/stable_diffusion/model_inference_low_vram/stable-diffusion-v1-5.py)|[code](/examples/stable_diffusion/model_training/full/stable-diffusion-v1-5.sh)|[code](/examples/stable_diffusion/model_training/validate_full/stable-diffusion-v1-5.py)|[code](/examples/stable_diffusion/model_training/lora/stable-diffusion-v1-5.sh)|[code](/examples/stable_diffusion/model_training/validate_lora/stable-diffusion-v1-5.py)|
+
+ </details>
+
+ #### Stable Diffusion XL: [/docs/en/Model_Details/Stable-Diffusion-XL.md](/docs/en/Model_Details/Stable-Diffusion-XL.md)
+
+ <details>
+
+ <summary>Quick Start</summary>
+
+ Running the following code will quickly load the [stabilityai/stable-diffusion-xl-base-1.0](https://www.modelscope.cn/models/stabilityai/stable-diffusion-xl-base-1.0) model for inference. VRAM management is enabled, and the framework automatically controls parameter loading based on available VRAM, requiring a minimum of 6GB VRAM.
+
+ ```python
+ import torch
+ from diffsynth.core import ModelConfig
+ from diffsynth.pipelines.stable_diffusion_xl import StableDiffusionXLPipeline
+
+ vram_config = {
+     "offload_dtype": torch.float32,
+     "offload_device": "cpu",
+     "onload_dtype": torch.float32,
+     "onload_device": "cpu",
+     "preparing_dtype": torch.float32,
+     "preparing_device": "cuda",
+     "computation_dtype": torch.float32,
+     "computation_device": "cuda",
+ }
+ pipe = StableDiffusionXLPipeline.from_pretrained(
+     torch_dtype=torch.float32,
+     model_configs=[
+         ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
+         ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder_2/model.safetensors", **vram_config),
+         ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="unet/diffusion_pytorch_model.safetensors", **vram_config),
+         ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
+     ],
+     tokenizer_config=ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="tokenizer/"),
+     tokenizer_2_config=ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="tokenizer_2/"),
+     vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+ )
+
+ image = pipe(
+     prompt="a photo of an astronaut riding a horse on mars",
+     negative_prompt="",
+     cfg_scale=5.0,
+     height=1024,
+     width=1024,
+     seed=42,
+     num_inference_steps=50,
+ )
+ image.save("image.jpg")
+ ```
+
+ </details>
+
+ <details>
+
+ <summary>Examples</summary>
+
+ Example code for Stable Diffusion XL is available at: [/examples/stable_diffusion_xl/](/examples/stable_diffusion_xl/)
+
+ |Model ID|Inference|Low VRAM Inference|Full Training|Full Training Validation|LoRA Training|LoRA Training Validation|
+ |-|-|-|-|-|-|-|
+ |[stabilityai/stable-diffusion-xl-base-1.0](https://www.modelscope.cn/models/stabilityai/stable-diffusion-xl-base-1.0)|[code](/examples/stable_diffusion_xl/model_inference/stable-diffusion-xl-base-1.0.py)|[code](/examples/stable_diffusion_xl/model_inference_low_vram/stable-diffusion-xl-base-1.0.py)|[code](/examples/stable_diffusion_xl/model_training/full/stable-diffusion-xl-base-1.0.sh)|[code](/examples/stable_diffusion_xl/model_training/validate_full/stable-diffusion-xl-base-1.0.py)|[code](/examples/stable_diffusion_xl/model_training/lora/stable-diffusion-xl-base-1.0.sh)|[code](/examples/stable_diffusion_xl/model_training/validate_lora/stable-diffusion-xl-base-1.0.py)|
+
+ </details>
+
  #### FLUX.2: [/docs/en/Model_Details/FLUX2.md](/docs/en/Model_Details/FLUX2.md)

  <details>
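Both Stable Diffusion quick starts added above size the `vram_limit` argument from the GPU's total memory. A minimal sketch of that calculation, assuming a CUDA device is available (the 0.5 GiB margin simply mirrors the expression used in the examples):

```python
import torch

# torch.cuda.mem_get_info returns (free_bytes, total_bytes) for the given device.
# The quick-start examples take total memory in GiB and keep a 0.5 GiB safety margin.
free_bytes, total_bytes = torch.cuda.mem_get_info("cuda")
vram_limit = total_bytes / (1024 ** 3) - 0.5
print(f"vram_limit = {vram_limit:.2f} GiB")
```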
@@ -350,6 +488,17 @@ Example code for FLUX.2 is available at: [/examples/flux2/](/examples/flux2/)
  |[black-forest-labs/FLUX.2-klein-9B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-9B)|[code](/examples/flux2/model_inference/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-9B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-9B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-9B.py)|
  |[black-forest-labs/FLUX.2-klein-base-4B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-base-4B)|[code](/examples/flux2/model_inference/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-base-4B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-base-4B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-base-4B.py)|
  |[black-forest-labs/FLUX.2-klein-base-9B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-base-9B)|[code](/examples/flux2/model_inference/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-base-9B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-base-9B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-base-9B.py)|
+ |[DiffSynth-Studio/Template-KleinBase4B-Aesthetic](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Aesthetic)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Aesthetic.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Aesthetic.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Aesthetic.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Aesthetic.py)|-|-|
+ |[DiffSynth-Studio/Template-KleinBase4B-Brightness](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Brightness)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Brightness.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Brightness.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Brightness.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Brightness.py)|-|-|
+ |[DiffSynth-Studio/Template-KleinBase4B-Age](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Age)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Age.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Age.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Age.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Age.py)|-|-|
+ |[DiffSynth-Studio/Template-KleinBase4B-ControlNet](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-ControlNet)|[code](/examples/flux2/model_inference/Template-KleinBase4B-ControlNet.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-ControlNet.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-ControlNet.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-ControlNet.py)|-|-|
+ |[DiffSynth-Studio/Template-KleinBase4B-Edit](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Edit)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Edit.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Edit.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Edit.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Edit.py)|-|-|
+ |[DiffSynth-Studio/Template-KleinBase4B-Inpaint](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Inpaint)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Inpaint.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Inpaint.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Inpaint.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Inpaint.py)|-|-|
+ |[DiffSynth-Studio/Template-KleinBase4B-PandaMeme](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-PandaMeme)|[code](/examples/flux2/model_inference/Template-KleinBase4B-PandaMeme.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-PandaMeme.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-PandaMeme.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-PandaMeme.py)|-|-|
+ |[DiffSynth-Studio/Template-KleinBase4B-Sharpness](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Sharpness)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Sharpness.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Sharpness.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Sharpness.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Sharpness.py)|-|-|
+ |[DiffSynth-Studio/Template-KleinBase4B-SoftRGB](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-SoftRGB)|[code](/examples/flux2/model_inference/Template-KleinBase4B-SoftRGB.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-SoftRGB.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-SoftRGB.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-SoftRGB.py)|-|-|
+ |[DiffSynth-Studio/Template-KleinBase4B-Upscaler](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Upscaler)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Upscaler.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Upscaler.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Upscaler.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Upscaler.py)|-|-|
+ |[DiffSynth-Studio/Template-KleinBase4B-ContentRef](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-ContentRef)|[code](/examples/flux2/model_inference/Template-KleinBase4B-ContentRef.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-ContentRef.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-ContentRef.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-ContentRef.py)|-|-|

  </details>

@@ -598,6 +747,143 @@ Example code for FLUX.1 is available at: [/examples/flux/](/examples/flux/)

  </details>

+ #### ERNIE-Image: [/docs/en/Model_Details/ERNIE-Image.md](/docs/en/Model_Details/ERNIE-Image.md)
+
+ <details>
+
+ <summary>Quick Start</summary>
+
+ Running the following code will quickly load the [PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.
+
+ ```python
+ from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig
+ import torch
+
+ vram_config = {
+     "offload_dtype": torch.bfloat16,
+     "offload_device": "cpu",
+     "onload_dtype": torch.bfloat16,
+     "onload_device": "cpu",
+     "preparing_dtype": torch.bfloat16,
+     "preparing_device": "cuda",
+     "computation_dtype": torch.bfloat16,
+     "computation_device": "cuda",
+ }
+ pipe = ErnieImagePipeline.from_pretrained(
+     torch_dtype=torch.bfloat16,
+     device='cuda',
+     model_configs=[
+         ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config),
+         ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
+         ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
+     ],
+     tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"),
+     vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+ )
+
+ image = pipe(
+     prompt="一只黑白相间的中华田园犬",
+     negative_prompt="",
+     height=1024,
+     width=1024,
+     seed=42,
+     num_inference_steps=50,
+     cfg_scale=4.0,
+ )
+ image.save("output.jpg")
+ ```
+
+ </details>
+
+ <details>
+
+ <summary>Examples</summary>
+
+ Example code for ERNIE-Image is available at: [/examples/ernie_image/](/examples/ernie_image/)
+
+ | Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
+ |-|-|-|-|-|-|-|
+ |[PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image)|[code](/examples/ernie_image/model_inference/ERNIE-Image.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/full/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_full/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/lora/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py)|
+ |[PaddlePaddle/ERNIE-Image-Turbo](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image-Turbo)|[code](/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py)|—|—|—|—|
+
+ </details>
+
+ #### JoyAI-Image: [/docs/en/Model_Details/JoyAI-Image.md](/docs/en/Model_Details/JoyAI-Image.md)
+
+ <details>
+
+ <summary>Quick Start</summary>
+
+ Running the following code will quickly load the [jd-opensource/JoyAI-Image-Edit](https://modelscope.cn/models/jd-opensource/JoyAI-Image-Edit) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 4GB VRAM.
+
+ ```python
+ from diffsynth.pipelines.joyai_image import JoyAIImagePipeline, ModelConfig
+ import torch
+ from PIL import Image
+ from modelscope import dataset_snapshot_download
+
+ # Download dataset
+ dataset_snapshot_download(
+     dataset_id="DiffSynth-Studio/diffsynth_example_dataset",
+     local_dir="data/diffsynth_example_dataset",
+     allow_file_pattern="joyai_image/JoyAI-Image-Edit/*"
+ )
+
+ vram_config = {
+     "offload_dtype": torch.bfloat16,
+     "offload_device": "cpu",
+     "onload_dtype": torch.bfloat16,
+     "onload_device": "cpu",
+     "preparing_dtype": torch.bfloat16,
+     "preparing_device": "cuda",
+     "computation_dtype": torch.bfloat16,
+     "computation_device": "cuda",
+ }
+
+ pipe = JoyAIImagePipeline.from_pretrained(
+     torch_dtype=torch.bfloat16,
+     device="cuda",
+     model_configs=[
+         ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="transformer/transformer.pth", **vram_config),
+         ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/model*.safetensors", **vram_config),
+         ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="vae/Wan2.1_VAE.pth", **vram_config),
+     ],
+     processor_config=ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/"),
+     vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+ )
+
+ # Use first sample from dataset
+ dataset_base_path = "data/diffsynth_example_dataset/joyai_image/JoyAI-Image-Edit"
+ prompt = "将裙子改为粉色"
+ edit_image = Image.open(f"{dataset_base_path}/edit/image1.jpg").convert("RGB")
+
+ output = pipe(
+     prompt=prompt,
+     edit_image=edit_image,
+     height=1024,
+     width=1024,
+     seed=0,
+     num_inference_steps=30,
+     cfg_scale=5.0,
+ )
+
+ output.save("output_joyai_edit_low_vram.png")
+ ```
+
+ </details>
+
+ <details>
+
+ <summary>Examples</summary>
+
+ Example code for JoyAI-Image is available at: [/examples/joyai_image/](/examples/joyai_image/)
+
+ | Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
+ |-|-|-|-|-|-|-|
+ |[jd-opensource/JoyAI-Image-Edit](https://modelscope.cn/models/jd-opensource/JoyAI-Image-Edit)|[code](/examples/joyai_image/model_inference/JoyAI-Image-Edit.py)|[code](/examples/joyai_image/model_inference_low_vram/JoyAI-Image-Edit.py)|[code](/examples/joyai_image/model_training/full/JoyAI-Image-Edit.sh)|[code](/examples/joyai_image/model_training/validate_full/JoyAI-Image-Edit.py)|[code](/examples/joyai_image/model_training/lora/JoyAI-Image-Edit.sh)|[code](/examples/joyai_image/model_training/validate_lora/JoyAI-Image-Edit.py)|
+
+ </details>
+
  ### Video Synthesis

  https://github.com/user-attachments/assets/1d66ae74-3b02-40a9-acc3-ea95fc039314
@@ -877,18 +1163,22 @@ Example code for Wan is available at: [/examples/wanvideo/](/examples/wanvideo/)

  </details>

- #### ERNIE-Image: [/docs/en/Model_Details/ERNIE-Image.md](/docs/en/Model_Details/ERNIE-Image.md)
+ ### Audio Synthesis
+
+ #### ACE-Step: [/docs/en/Model_Details/ACE-Step.md](/docs/en/Model_Details/ACE-Step.md)

  <details>

  <summary>Quick Start</summary>

- Running the following code will quickly load the [PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.
+ Running the following code will quickly load the [ACE-Step/Ace-Step1.5](https://www.modelscope.cn/models/ACE-Step/Ace-Step1.5) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.

  ```python
- from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig
+ from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+ from diffsynth.utils.data.audio import save_audio
  import torch

+
  vram_config = {
      "offload_dtype": torch.bfloat16,
      "offload_device": "cpu",
@@ -899,28 +1189,34 @@ vram_config = {
      "computation_dtype": torch.bfloat16,
      "computation_device": "cuda",
  }
- pipe = ErnieImagePipeline.from_pretrained(
+
+
+ pipe = AceStepPipeline.from_pretrained(
      torch_dtype=torch.bfloat16,
-     device='cuda',
+     device="cuda",
      model_configs=[
-         ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config),
-         ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
-         ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
+         ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors", **vram_config),
+         ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config),
+         ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
      ],
-     tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"),
+     text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
      vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
  )

- image = pipe(
-     prompt="一只黑白相间的中华田园犬",
-     negative_prompt="",
-     height=1024,
-     width=1024,
+ prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+ lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+ audio = pipe(
+     prompt=prompt,
+     lyrics=lyrics,
+     duration=160,
+     bpm=100,
+     keyscale="B minor",
+     timesignature="4",
+     vocal_language="zh",
      seed=42,
-     num_inference_steps=50,
-     cfg_scale=4.0,
  )
- image.save("output.jpg")
+
+ save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo.wav")
  ```

  </details>
@@ -929,12 +1225,21 @@ image.save("output.jpg")

  <summary>Examples</summary>

- Example code for ERNIE-Image is available at: [/examples/ernie_image/](/examples/ernie_image/)
+ Example code for ACE-Step is available at: [/examples/ace_step/](/examples/ace_step/)

  | Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
  |-|-|-|-|-|-|-|
- |[PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image)|[code](/examples/ernie_image/model_inference/ERNIE-Image.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/full/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_full/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/lora/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py)|
- |[PaddlePaddle/ERNIE-Image-Turbo](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image-Turbo)|[code](/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py)|—|—|—|—|
+ |[ACE-Step/Ace-Step1.5](https://www.modelscope.cn/models/ACE-Step/Ace-Step1.5)|[code](/examples/ace_step/model_inference/Ace-Step1.5.py)|[code](/examples/ace_step/model_inference_low_vram/Ace-Step1.5.py)|[code](/examples/ace_step/model_training/full/Ace-Step1.5.sh)|[code](/examples/ace_step/model_training/validate_full/Ace-Step1.5.py)|[code](/examples/ace_step/model_training/lora/Ace-Step1.5.sh)|[code](/examples/ace_step/model_training/validate_lora/Ace-Step1.5.py)|
+ |[ACE-Step/acestep-v15-turbo-shift1](https://www.modelscope.cn/models/ACE-Step/acestep-v15-turbo-shift1)|[code](/examples/ace_step/model_inference/acestep-v15-turbo-shift1.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift1.py)|[code](/examples/ace_step/model_training/full/acestep-v15-turbo-shift1.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-turbo-shift1.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-turbo-shift1.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-shift1.py)|
+ |[ACE-Step/acestep-v15-turbo-shift3](https://www.modelscope.cn/models/ACE-Step/acestep-v15-turbo-shift3)|[code](/examples/ace_step/model_inference/acestep-v15-turbo-shift3.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift3.py)|[code](/examples/ace_step/model_training/full/acestep-v15-turbo-shift3.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-turbo-shift3.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-turbo-shift3.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-shift3.py)|
+ |[ACE-Step/acestep-v15-turbo-continuous](https://www.modelscope.cn/models/ACE-Step/acestep-v15-turbo-continuous)|[code](/examples/ace_step/model_inference/acestep-v15-turbo-continuous.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-continuous.py)|[code](/examples/ace_step/model_training/full/acestep-v15-turbo-continuous.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-turbo-continuous.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-turbo-continuous.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-continuous.py)|
+ |[ACE-Step/acestep-v15-base](https://www.modelscope.cn/models/ACE-Step/acestep-v15-base)|[code](/examples/ace_step/model_inference/acestep-v15-base.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-base.py)|[code](/examples/ace_step/model_training/full/acestep-v15-base.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-base.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-base.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-base.py)|
+ |[ACE-Step/acestep-v15-base: CoverTask](https://www.modelscope.cn/models/ACE-Step/acestep-v15-base)|[code](/examples/ace_step/model_inference/acestep-v15-base-CoverTask.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-base-CoverTask.py)|—|—|—|—|
+ |[ACE-Step/acestep-v15-base: RepaintTask](https://www.modelscope.cn/models/ACE-Step/acestep-v15-base)|[code](/examples/ace_step/model_inference/acestep-v15-base-RepaintTask.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-base-RepaintTask.py)|—|—|—|—|
+ |[ACE-Step/acestep-v15-sft](https://www.modelscope.cn/models/ACE-Step/acestep-v15-sft)|[code](/examples/ace_step/model_inference/acestep-v15-sft.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-sft.py)|[code](/examples/ace_step/model_training/full/acestep-v15-sft.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-sft.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-sft.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-sft.py)|
+ |[ACE-Step/acestep-v15-xl-base](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-base)|[code](/examples/ace_step/model_inference/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-base.py)|
+ |[ACE-Step/acestep-v15-xl-sft](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-sft)|[code](/examples/ace_step/model_inference/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-sft.py)|
+ |[ACE-Step/acestep-v15-xl-turbo](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-turbo)|[code](/examples/ace_step/model_inference/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-turbo.py)|

  </details>

@@ -42,6 +42,7 @@ qwen_image_series = [
          "model_hash": "5722b5c873720009de96422993b15682",
          "model_name": "dinov3_image_encoder",
          "model_class": "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.dino_v3.DINOv3StateDictConverter",
      },
      {
          # Example:
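This hunk wires a `state_dict_converter` into the DINOv3 entry, and 2.0.11 registers similar converters for the Stable Diffusion, JoyAI-Image, and ACE-Step modules added below. As a rough illustration only (the class name and key mapping here are hypothetical, not the actual `DINOv3StateDictConverter`), such converters typically just remap checkpoint keys to the names the DiffSynth module expects:

```python
# Hypothetical sketch of a state-dict converter; the real converters in
# diffsynth.utils.state_dict_converters may differ substantially.
class ExampleStateDictConverter:
    # Map checkpoint prefixes to the prefixes the target module uses (made-up names).
    key_map = {"backbone.": "encoder.", "head.": "projection."}

    def convert(self, state_dict):
        converted = {}
        for name, tensor in state_dict.items():
            for old, new in self.key_map.items():
                if name.startswith(old):
                    name = new + name[len(old):]
                    break
            converted[name] = tensor
        return converted
```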
@@ -900,4 +901,146 @@ mova_series = [
          "model_class": "diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge",
      },
  ]
- MODEL_CONFIGS = qwen_image_series + wan_series + flux_series + flux2_series + ernie_image_series + z_image_series + ltx2_series + anima_series + mova_series
+ stable_diffusion_xl_series = [
+     {
+         # Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="unet/diffusion_pytorch_model.safetensors")
+         "model_hash": "142b114f67f5ab3a6d83fb5788f12ded",
+         "model_name": "stable_diffusion_xl_unet",
+         "model_class": "diffsynth.models.stable_diffusion_xl_unet.SDXLUNet2DConditionModel",
+         "extra_kwargs": {"attention_head_dim": [5, 10, 20], "transformer_layers_per_block": [1, 2, 10], "use_linear_projection": True, "addition_embed_type": "text_time", "addition_time_embed_dim": 256, "projection_class_embeddings_input_dim": 2816},
+     },
+     {
+         # Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder_2/model.safetensors")
+         "model_hash": "98cc34ccc5b54ae0e56bdea8688dcd5a",
+         "model_name": "stable_diffusion_xl_text_encoder",
+         "model_class": "diffsynth.models.stable_diffusion_xl_text_encoder.SDXLTextEncoder2",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_xl_text_encoder.SDXLTextEncoder2StateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder/model.safetensors")
+         "model_hash": "94eefa3dac9cec93cb1ebaf1747d7b78",
+         "model_name": "stable_diffusion_text_encoder",
+         "model_class": "diffsynth.models.stable_diffusion_text_encoder.SDTextEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_text_encoder.SDTextEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
+         "model_hash": "13115dd45a6e1c39860f91ab073b8a78",
+         "model_name": "stable_diffusion_xl_vae",
+         "model_class": "diffsynth.models.stable_diffusion_vae.StableDiffusionVAE",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_vae.SDVAEStateDictConverter",
+         "extra_kwargs": {"scaling_factor": 0.13025, "sample_size": 1024, "force_upcast": True},
+     },
+ ]
+
+ stable_diffusion_series = [
+     {
+         # Example: ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="text_encoder/model.safetensors")
+         "model_hash": "ffd1737ae9df7fd43f5fbed653bdad67",
+         "model_name": "stable_diffusion_text_encoder",
+         "model_class": "diffsynth.models.stable_diffusion_text_encoder.SDTextEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_text_encoder.SDTextEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
+         "model_hash": "f86d5683ed32433be8ca69969c67ba69",
+         "model_name": "stable_diffusion_vae",
+         "model_class": "diffsynth.models.stable_diffusion_vae.StableDiffusionVAE",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_vae.SDVAEStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="unet/diffusion_pytorch_model.safetensors")
+         "model_hash": "025a4b86a84829399d89f613e580757b",
+         "model_name": "stable_diffusion_unet",
+         "model_class": "diffsynth.models.stable_diffusion_unet.UNet2DConditionModel",
+     },
+ ]
+
+ joyai_image_series = [
+     {
+         # Example: ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="transformer/transformer.pth")
+         "model_hash": "56592ddfd7d0249d3aa527d24161a863",
+         "model_name": "joyai_image_dit",
+         "model_class": "diffsynth.models.joyai_image_dit.JoyAIImageDiT",
+     },
+     {
+         # Example: ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/model-*.safetensors")
+         "model_hash": "2d11bf14bba8b4e87477c8199a895403",
+         "model_name": "joyai_image_text_encoder",
+         "model_class": "diffsynth.models.joyai_image_text_encoder.JoyAIImageTextEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.joyai_image_text_encoder.JoyAIImageTextEncoderStateDictConverter",
+     },
+ ]
+
+ ace_step_series = [
+     {
+         # Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors")
+         "model_hash": "ba29d8bddbb6ace65675f6a757a13c00",
+         "model_name": "ace_step_dit",
+         "model_class": "diffsynth.models.ace_step_dit.AceStepDiTModel",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_dit.AceStepDiTModelStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors")
+         "model_hash": "3a28a410c2246f125153ef792d8bc828",
+         "model_name": "ace_step_dit",
+         "model_class": "diffsynth.models.ace_step_dit.AceStepDiTModel",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_dit.AceStepDiTModelStateDictConverter",
+         "extra_kwargs": {
+             "hidden_size": 2560,
+             "intermediate_size": 9728,
+             "num_hidden_layers": 32,
+             "num_attention_heads": 32,
+             "num_key_value_heads": 8,
+             "head_dim": 128,
+             "encoder_hidden_size": 2048,
+             "layer_types": ["sliding_attention", "full_attention"] * 16,
+         },
+     },
+     {
+         # Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors")
+         "model_hash": "ba29d8bddbb6ace65675f6a757a13c00",
+         "model_name": "ace_step_conditioner",
+         "model_class": "diffsynth.models.ace_step_conditioner.AceStepConditionEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_conditioner.AceStepConditionEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors")
+         "model_hash": "3a28a410c2246f125153ef792d8bc828",
+         "model_name": "ace_step_conditioner",
+         "model_class": "diffsynth.models.ace_step_conditioner.AceStepConditionEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_conditioner.AceStepConditionEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors")
+         "model_hash": "3509bea17b0e8cffc3dd4a15cc7899d0",
+         "model_name": "ace_step_text_encoder",
+         "model_class": "diffsynth.models.ace_step_text_encoder.AceStepTextEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_text_encoder.AceStepTextEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
+         "model_hash": "51420834e54474986a7f4be0e4d6f687",
+         "model_name": "ace_step_vae",
+         "model_class": "diffsynth.models.ace_step_vae.AceStepVAE",
+     },
+     {
+         # Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors")
+         "model_hash": "ba29d8bddbb6ace65675f6a757a13c00",
+         "model_name": "ace_step_tokenizer",
+         "model_class": "diffsynth.models.ace_step_tokenizer.AceStepTokenizer",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_tokenizer.AceStepTokenizerStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors")
+         "model_hash": "3a28a410c2246f125153ef792d8bc828",
+         "model_name": "ace_step_tokenizer",
+         "model_class": "diffsynth.models.ace_step_tokenizer.AceStepTokenizer",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_tokenizer.AceStepTokenizerStateDictConverter",
+     },
+ ]
+
+ MODEL_CONFIGS = (
+     stable_diffusion_xl_series + stable_diffusion_series + qwen_image_series + wan_series + flux_series + flux2_series + ernie_image_series
+     + z_image_series + ltx2_series + anima_series + mova_series + joyai_image_series + ace_step_series
+ )
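The final hunk extends `MODEL_CONFIGS`, the flat registry that maps checkpoint hashes to the module class, optional state-dict converter, and extra constructor kwargs to use when loading a file. A minimal sketch of querying such a hash-keyed registry (illustrative only: `MODEL_CONFIGS` comes from the diff above, but the lookup helper below is hypothetical and not DiffSynth's actual loader logic):

```python
from diffsynth.configs.model_configs import MODEL_CONFIGS

def find_configs_by_hash(model_hash):
    # Several entries may share one hash (e.g. the ACE-Step DiT, conditioner, and
    # tokenizer all load from acestep-v15-turbo/model.safetensors), so return all matches.
    return [cfg for cfg in MODEL_CONFIGS if cfg["model_hash"] == model_hash]

for cfg in find_configs_by_hash("ba29d8bddbb6ace65675f6a757a13c00"):
    print(cfg["model_name"], "->", cfg["model_class"])
```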