diffsynth 2.0.9__tar.gz → 2.0.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. {diffsynth-2.0.9 → diffsynth-2.0.10}/PKG-INFO +1 -1
  2. {diffsynth-2.0.9 → diffsynth-2.0.10}/README.md +305 -20
  3. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/configs/model_configs.py +144 -1
  4. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/configs/vram_management_module_maps.py +93 -0
  5. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/data/operators.py +25 -0
  6. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/base_pipeline.py +1 -1
  7. diffsynth-2.0.10/diffsynth/diffusion/ddim_scheduler.py +107 -0
  8. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/flow_match.py +24 -1
  9. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/training_module.py +15 -8
  10. diffsynth-2.0.10/diffsynth/models/ace_step_conditioner.py +695 -0
  11. diffsynth-2.0.10/diffsynth/models/ace_step_dit.py +901 -0
  12. diffsynth-2.0.10/diffsynth/models/ace_step_text_encoder.py +53 -0
  13. diffsynth-2.0.10/diffsynth/models/ace_step_tokenizer.py +722 -0
  14. diffsynth-2.0.10/diffsynth/models/ace_step_vae.py +281 -0
  15. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/dinov3_image_encoder.py +5 -5
  16. diffsynth-2.0.10/diffsynth/models/joyai_image_dit.py +636 -0
  17. diffsynth-2.0.10/diffsynth/models/joyai_image_text_encoder.py +82 -0
  18. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/siglip2_image_encoder.py +5 -5
  19. diffsynth-2.0.10/diffsynth/models/stable_diffusion_text_encoder.py +216 -0
  20. diffsynth-2.0.10/diffsynth/models/stable_diffusion_unet.py +912 -0
  21. diffsynth-2.0.10/diffsynth/models/stable_diffusion_vae.py +642 -0
  22. diffsynth-2.0.10/diffsynth/models/stable_diffusion_xl_text_encoder.py +69 -0
  23. diffsynth-2.0.10/diffsynth/models/stable_diffusion_xl_unet.py +922 -0
  24. diffsynth-2.0.10/diffsynth/pipelines/ace_step.py +582 -0
  25. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/anima_image.py +1 -1
  26. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/flux2_image.py +2 -2
  27. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/flux_image.py +1 -6
  28. diffsynth-2.0.10/diffsynth/pipelines/joyai_image.py +282 -0
  29. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/ltx2_audio_video.py +29 -29
  30. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/mova_audio_video.py +18 -18
  31. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/qwen_image.py +1 -1
  32. diffsynth-2.0.10/diffsynth/pipelines/stable_diffusion.py +230 -0
  33. diffsynth-2.0.10/diffsynth/pipelines/stable_diffusion_xl.py +331 -0
  34. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/wan_video.py +53 -53
  35. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/z_image.py +2 -2
  36. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/data/audio.py +1 -0
  37. diffsynth-2.0.10/diffsynth/utils/state_dict_converters/ace_step_conditioner.py +13 -0
  38. diffsynth-2.0.10/diffsynth/utils/state_dict_converters/ace_step_dit.py +10 -0
  39. diffsynth-2.0.10/diffsynth/utils/state_dict_converters/ace_step_text_encoder.py +15 -0
  40. diffsynth-2.0.10/diffsynth/utils/state_dict_converters/ace_step_tokenizer.py +8 -0
  41. diffsynth-2.0.10/diffsynth/utils/state_dict_converters/dino_v3.py +9 -0
  42. diffsynth-2.0.10/diffsynth/utils/state_dict_converters/joyai_image_text_encoder.py +20 -0
  43. diffsynth-2.0.10/diffsynth/utils/state_dict_converters/stable_diffusion_text_encoder.py +7 -0
  44. diffsynth-2.0.10/diffsynth/utils/state_dict_converters/stable_diffusion_vae.py +18 -0
  45. diffsynth-2.0.10/diffsynth/utils/state_dict_converters/stable_diffusion_xl_text_encoder.py +13 -0
  46. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth.egg-info/PKG-INFO +1 -1
  47. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth.egg-info/SOURCES.txt +26 -0
  48. {diffsynth-2.0.9 → diffsynth-2.0.10}/pyproject.toml +1 -1
  49. {diffsynth-2.0.9 → diffsynth-2.0.10}/LICENSE +0 -0
  50. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/__init__.py +0 -0
  51. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/configs/__init__.py +0 -0
  52. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/__init__.py +0 -0
  53. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/attention/__init__.py +0 -0
  54. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/attention/attention.py +0 -0
  55. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/data/__init__.py +0 -0
  56. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/data/unified_dataset.py +0 -0
  57. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/device/__init__.py +0 -0
  58. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/device/npu_compatible_device.py +0 -0
  59. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/gradient/__init__.py +0 -0
  60. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/gradient/gradient_checkpoint.py +0 -0
  61. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/loader/__init__.py +0 -0
  62. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/loader/config.py +0 -0
  63. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/loader/file.py +0 -0
  64. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/loader/model.py +0 -0
  65. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
  66. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/vram/__init__.py +0 -0
  67. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/vram/disk_map.py +0 -0
  68. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/vram/initialization.py +0 -0
  69. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/vram/layers.py +0 -0
  70. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/__init__.py +0 -0
  71. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/logger.py +0 -0
  72. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/loss.py +0 -0
  73. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/parsers.py +0 -0
  74. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/runner.py +2 -2
  75. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/anima_dit.py +0 -0
  76. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ernie_image_dit.py +0 -0
  77. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ernie_image_text_encoder.py +0 -0
  78. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux2_dit.py +0 -0
  79. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux2_text_encoder.py +0 -0
  80. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux2_vae.py +0 -0
  81. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_controlnet.py +0 -0
  82. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_dit.py +0 -0
  83. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_infiniteyou.py +0 -0
  84. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_ipadapter.py +0 -0
  85. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_lora_encoder.py +0 -0
  86. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_lora_patcher.py +0 -0
  87. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_text_encoder_clip.py +0 -0
  88. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_text_encoder_t5.py +0 -0
  89. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_vae.py +0 -0
  90. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_value_control.py +0 -0
  91. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/general_modules.py +0 -0
  92. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/longcat_video_dit.py +0 -0
  93. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ltx2_audio_vae.py +0 -0
  94. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ltx2_common.py +0 -0
  95. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ltx2_dit.py +0 -0
  96. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ltx2_text_encoder.py +0 -0
  97. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ltx2_upsampler.py +0 -0
  98. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ltx2_video_vae.py +0 -0
  99. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/model_loader.py +0 -0
  100. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/mova_audio_dit.py +0 -0
  101. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/mova_audio_vae.py +0 -0
  102. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/mova_dual_tower_bridge.py +0 -0
  103. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/nexus_gen.py +0 -0
  104. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/nexus_gen_ar_model.py +0 -0
  105. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/nexus_gen_projector.py +0 -0
  106. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/qwen_image_controlnet.py +0 -0
  107. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/qwen_image_dit.py +0 -0
  108. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/qwen_image_image2lora.py +0 -0
  109. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/qwen_image_text_encoder.py +0 -0
  110. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/qwen_image_vae.py +0 -0
  111. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/sd_text_encoder.py +0 -0
  112. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/step1x_connector.py +0 -0
  113. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/step1x_text_encoder.py +0 -0
  114. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_animate_adapter.py +0 -0
  115. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_camera_controller.py +0 -0
  116. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_dit.py +0 -0
  117. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_dit_s2v.py +0 -0
  118. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_image_encoder.py +0 -0
  119. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_mot.py +0 -0
  120. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_motion_controller.py +0 -0
  121. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_text_encoder.py +0 -0
  122. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_vace.py +0 -0
  123. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_vae.py +0 -0
  124. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wantodance.py +0 -0
  125. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wav2vec.py +0 -0
  126. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/z_image_controlnet.py +0 -0
  127. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/z_image_dit.py +0 -0
  128. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/z_image_image2lora.py +0 -0
  129. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/z_image_text_encoder.py +0 -0
  130. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/ernie_image.py +0 -0
  131. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/controlnet/__init__.py +0 -0
  132. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/controlnet/annotator.py +0 -0
  133. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
  134. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/data/__init__.py +0 -0
  135. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/data/audio_video.py +0 -0
  136. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/data/media_io_ltx2.py +0 -0
  137. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/lora/__init__.py +0 -0
  138. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/lora/flux.py +0 -0
  139. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/lora/general.py +0 -0
  140. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/lora/merge.py +0 -0
  141. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/lora/reset_rank.py +0 -0
  142. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/ses/__init__.py +0 -0
  143. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/ses/ses.py +0 -0
  144. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
  145. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
  146. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/ernie_image_text_encoder.py +0 -0
  147. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
  148. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
  149. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
  150. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
  151. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
  152. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
  153. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
  154. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
  155. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +0 -0
  156. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
  157. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +0 -0
  158. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +0 -0
  159. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
  160. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
  161. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
  162. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
  163. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
  164. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
  165. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
  166. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
  167. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
  168. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
  169. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
  170. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/z_image_dit.py +0 -0
  171. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
  172. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/xfuser/__init__.py +0 -0
  173. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/xfuser/xdit_context_parallel.py +0 -0
  174. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/version.py +0 -0
  175. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth.egg-info/dependency_links.txt +0 -0
  176. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth.egg-info/requires.txt +0 -0
  177. {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth.egg-info/top_level.txt +0 -0
  178. {diffsynth-2.0.9 → diffsynth-2.0.10}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffsynth
3
- Version: 2.0.9
3
+ Version: 2.0.10
4
4
  Summary: Enjoy the magic of Diffusion models!
5
5
  Author: ModelScope Team
6
6
  License: Apache-2.0
@@ -34,6 +34,12 @@ We believe that a well-developed open-source code framework can lower the thresh
34
34
 
35
35
  > Currently, this project has only a small development team, with most of the work handled by [Artiprocher](https://github.com/Artiprocher) and [mi804](https://github.com/mi804). As a result, new features will be developed relatively slowly, and our capacity to respond to and resolve issues is limited. We apologize for this and ask for developers' understanding.
36
36
 
37
+ - **April 27, 2026**: We now support ACE-Step-1.5! Support includes text-to-music generation, low VRAM inference, and LoRA training capabilities. For details, please refer to the [documentation](/docs/en/Model_Details/ACE-Step.md) and [example code](/examples/ace_step/).
38
+
39
+ - **April 27, 2026**: We have reinstated support for the Stable Diffusion v1.5 and SDXL models, providing academic research support exclusively for these two model types.
40
+
41
+ - **April 14, 2026**: JoyAI-Image has been open-sourced. Welcome a new member to the image editing model family! Support includes instruction-guided image editing, low VRAM inference, and training capabilities. For details, please refer to the [documentation](/docs/en/Model_Details/JoyAI-Image.md) and [example code](/examples/joyai_image/).
42
+
37
43
  - **March 19, 2026**: Added support for [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) and [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/Wan.md) and [example code](/examples/mova/) are now available.
38
44
 
39
45
  - **March 12, 2026**: We have added support for the [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) audio-video generation model. Supported features include text-to-audio/video, image-to-audio/video, IC-LoRA control, audio-to-video, and audio-video inpainting. Complete inference and training functionality is supported. For details, please refer to the [documentation](/docs/en/Model_Details/LTX-2.md) and [code](/examples/ltx2/).
@@ -297,6 +303,129 @@ Example code for Z-Image is available at: [/examples/z_image/](/examples/z_image
297
303
 
298
304
  </details>
299
305
 
306
+ #### Stable Diffusion: [/docs/en/Model_Details/Stable-Diffusion.md](/docs/en/Model_Details/Stable-Diffusion.md)
307
+
308
+ <details>
309
+
310
+ <summary>Quick Start</summary>
311
+
312
+ Running the following code will quickly load the [AI-ModelScope/stable-diffusion-v1-5](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5) model for inference. VRAM management is enabled, and the framework automatically controls parameter loading based on available VRAM. The model can run with a minimum of 2GB VRAM.
313
+
314
+ ```python
315
+ import torch
316
+ from diffsynth.core import ModelConfig
317
+ from diffsynth.pipelines.stable_diffusion import StableDiffusionPipeline
318
+
319
+ vram_config = {
320
+ "offload_dtype": torch.float32,
321
+ "offload_device": "cpu",
322
+ "onload_dtype": torch.float32,
323
+ "onload_device": "cpu",
324
+ "preparing_dtype": torch.float32,
325
+ "preparing_device": "cuda",
326
+ "computation_dtype": torch.float32,
327
+ "computation_device": "cuda",
328
+ }
329
+ pipe = StableDiffusionPipeline.from_pretrained(
330
+ torch_dtype=torch.float32,
331
+ model_configs=[
332
+ ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
333
+ ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="unet/diffusion_pytorch_model.safetensors", **vram_config),
334
+ ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
335
+ ],
336
+ tokenizer_config=ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="tokenizer/"),
337
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
338
+ )
339
+
340
+ image = pipe(
341
+ prompt="a photo of an astronaut riding a horse on mars, high quality, detailed",
342
+ negative_prompt="blurry, low quality, deformed",
343
+ cfg_scale=7.5,
344
+ height=512,
345
+ width=512,
346
+ seed=42,
347
+ rand_device="cuda",
348
+ num_inference_steps=50,
349
+ )
350
+ image.save("image.jpg")
351
+ ```
352
+
353
+ </details>
354
+
355
+ <details>
356
+
357
+ <summary>Examples</summary>
358
+
359
+ Example code for Stable Diffusion is available at: [/examples/stable_diffusion/](/examples/stable_diffusion/)
360
+
361
+ |Model ID|Inference|Low VRAM Inference|Full Training|Full Training Validation|LoRA Training|LoRA Training Validation|
362
+ |-|-|-|-|-|-|-|
363
+ |[AI-ModelScope/stable-diffusion-v1-5](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5)|[code](/examples/stable_diffusion/model_inference/stable-diffusion-v1-5.py)|[code](/examples/stable_diffusion/model_inference_low_vram/stable-diffusion-v1-5.py)|[code](/examples/stable_diffusion/model_training/full/stable-diffusion-v1-5.sh)|[code](/examples/stable_diffusion/model_training/validate_full/stable-diffusion-v1-5.py)|[code](/examples/stable_diffusion/model_training/lora/stable-diffusion-v1-5.sh)|[code](/examples/stable_diffusion/model_training/validate_lora/stable-diffusion-v1-5.py)|
364
+
365
+ </details>
366
+
367
+ #### Stable Diffusion XL: [/docs/en/Model_Details/Stable-Diffusion-XL.md](/docs/en/Model_Details/Stable-Diffusion-XL.md)
368
+
369
+ <details>
370
+
371
+ <summary>Quick Start</summary>
372
+
373
+ Running the following code will quickly load the [stabilityai/stable-diffusion-xl-base-1.0](https://www.modelscope.cn/models/stabilityai/stable-diffusion-xl-base-1.0) model for inference. VRAM management is enabled, and the framework automatically controls parameter loading based on available VRAM. The model can run with a minimum of 6GB VRAM.
374
+
375
+ ```python
376
+ import torch
377
+ from diffsynth.core import ModelConfig
378
+ from diffsynth.pipelines.stable_diffusion_xl import StableDiffusionXLPipeline
379
+
380
+ vram_config = {
381
+ "offload_dtype": torch.float32,
382
+ "offload_device": "cpu",
383
+ "onload_dtype": torch.float32,
384
+ "onload_device": "cpu",
385
+ "preparing_dtype": torch.float32,
386
+ "preparing_device": "cuda",
387
+ "computation_dtype": torch.float32,
388
+ "computation_device": "cuda",
389
+ }
390
+ pipe = StableDiffusionXLPipeline.from_pretrained(
391
+ torch_dtype=torch.float32,
392
+ model_configs=[
393
+ ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
394
+ ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder_2/model.safetensors", **vram_config),
395
+ ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="unet/diffusion_pytorch_model.safetensors", **vram_config),
396
+ ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
397
+ ],
398
+ tokenizer_config=ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="tokenizer/"),
399
+ tokenizer_2_config=ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="tokenizer_2/"),
400
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
401
+ )
402
+
403
+ image = pipe(
404
+ prompt="a photo of an astronaut riding a horse on mars",
405
+ negative_prompt="",
406
+ cfg_scale=5.0,
407
+ height=1024,
408
+ width=1024,
409
+ seed=42,
410
+ num_inference_steps=50,
411
+ )
412
+ image.save("image.jpg")
413
+ ```
414
+
415
+ </details>
416
+
417
+ <details>
418
+
419
+ <summary>Examples</summary>
420
+
421
+ Example code for Stable Diffusion XL is available at: [/examples/stable_diffusion_xl/](/examples/stable_diffusion_xl/)
422
+
423
+ |Model ID|Inference|Low VRAM Inference|Full Training|Full Training Validation|LoRA Training|LoRA Training Validation|
424
+ |-|-|-|-|-|-|-|
425
+ |[stabilityai/stable-diffusion-xl-base-1.0](https://www.modelscope.cn/models/stabilityai/stable-diffusion-xl-base-1.0)|[code](/examples/stable_diffusion_xl/model_inference/stable-diffusion-xl-base-1.0.py)|[code](/examples/stable_diffusion_xl/model_inference_low_vram/stable-diffusion-xl-base-1.0.py)|[code](/examples/stable_diffusion_xl/model_training/full/stable-diffusion-xl-base-1.0.sh)|[code](/examples/stable_diffusion_xl/model_training/validate_full/stable-diffusion-xl-base-1.0.py)|[code](/examples/stable_diffusion_xl/model_training/lora/stable-diffusion-xl-base-1.0.sh)|[code](/examples/stable_diffusion_xl/model_training/validate_lora/stable-diffusion-xl-base-1.0.py)|
426
+
427
+ </details>
428
+
300
429
  #### FLUX.2: [/docs/en/Model_Details/FLUX2.md](/docs/en/Model_Details/FLUX2.md)
301
430
 
302
431
  <details>
@@ -598,6 +727,143 @@ Example code for FLUX.1 is available at: [/examples/flux/](/examples/flux/)
598
727
 
599
728
  </details>
600
729
 
730
+ #### ERNIE-Image: [/docs/en/Model_Details/ERNIE-Image.md](/docs/en/Model_Details/ERNIE-Image.md)
731
+
732
+ <details>
733
+
734
+ <summary>Quick Start</summary>
735
+
736
+ Running the following code will quickly load the [PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.
737
+
738
+ ```python
739
+ from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig
740
+ import torch
741
+
742
+ vram_config = {
743
+ "offload_dtype": torch.bfloat16,
744
+ "offload_device": "cpu",
745
+ "onload_dtype": torch.bfloat16,
746
+ "onload_device": "cpu",
747
+ "preparing_dtype": torch.bfloat16,
748
+ "preparing_device": "cuda",
749
+ "computation_dtype": torch.bfloat16,
750
+ "computation_device": "cuda",
751
+ }
752
+ pipe = ErnieImagePipeline.from_pretrained(
753
+ torch_dtype=torch.bfloat16,
754
+ device='cuda',
755
+ model_configs=[
756
+ ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config),
757
+ ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
758
+ ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
759
+ ],
760
+ tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"),
761
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
762
+ )
763
+
764
+ image = pipe(
765
+ prompt="一只黑白相间的中华田园犬",
766
+ negative_prompt="",
767
+ height=1024,
768
+ width=1024,
769
+ seed=42,
770
+ num_inference_steps=50,
771
+ cfg_scale=4.0,
772
+ )
773
+ image.save("output.jpg")
774
+ ```
775
+
776
+ </details>
777
+
778
+ <details>
779
+
780
+ <summary>Examples</summary>
781
+
782
+ Example code for ERNIE-Image is available at: [/examples/ernie_image/](/examples/ernie_image/)
783
+
784
+ | Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
785
+ |-|-|-|-|-|-|-|
786
+ |[PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image)|[code](/examples/ernie_image/model_inference/ERNIE-Image.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/full/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_full/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/lora/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py)|
787
+ |[PaddlePaddle/ERNIE-Image-Turbo](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image-Turbo)|[code](/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py)|—|—|—|—|
788
+
789
+ </details>
790
+
791
+ #### JoyAI-Image: [/docs/en/Model_Details/JoyAI-Image.md](/docs/en/Model_Details/JoyAI-Image.md)
792
+
793
+ <details>
794
+
795
+ <summary>Quick Start</summary>
796
+
797
+ Running the following code will quickly load the [jd-opensource/JoyAI-Image-Edit](https://modelscope.cn/models/jd-opensource/JoyAI-Image-Edit) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 4GB VRAM.
798
+
799
+ ```python
800
+ from diffsynth.pipelines.joyai_image import JoyAIImagePipeline, ModelConfig
801
+ import torch
802
+ from PIL import Image
803
+ from modelscope import dataset_snapshot_download
804
+
805
+ # Download dataset
806
+ dataset_snapshot_download(
807
+ dataset_id="DiffSynth-Studio/diffsynth_example_dataset",
808
+ local_dir="data/diffsynth_example_dataset",
809
+ allow_file_pattern="joyai_image/JoyAI-Image-Edit/*"
810
+ )
811
+
812
+ vram_config = {
813
+ "offload_dtype": torch.bfloat16,
814
+ "offload_device": "cpu",
815
+ "onload_dtype": torch.bfloat16,
816
+ "onload_device": "cpu",
817
+ "preparing_dtype": torch.bfloat16,
818
+ "preparing_device": "cuda",
819
+ "computation_dtype": torch.bfloat16,
820
+ "computation_device": "cuda",
821
+ }
822
+
823
+ pipe = JoyAIImagePipeline.from_pretrained(
824
+ torch_dtype=torch.bfloat16,
825
+ device="cuda",
826
+ model_configs=[
827
+ ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="transformer/transformer.pth", **vram_config),
828
+ ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/model*.safetensors", **vram_config),
829
+ ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="vae/Wan2.1_VAE.pth", **vram_config),
830
+ ],
831
+ processor_config=ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/"),
832
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
833
+ )
834
+
835
+ # Use first sample from dataset
836
+ dataset_base_path = "data/diffsynth_example_dataset/joyai_image/JoyAI-Image-Edit"
837
+ prompt = "将裙子改为粉色"
838
+ edit_image = Image.open(f"{dataset_base_path}/edit/image1.jpg").convert("RGB")
839
+
840
+ output = pipe(
841
+ prompt=prompt,
842
+ edit_image=edit_image,
843
+ height=1024,
844
+ width=1024,
845
+ seed=0,
846
+ num_inference_steps=30,
847
+ cfg_scale=5.0,
848
+ )
849
+
850
+ output.save("output_joyai_edit_low_vram.png")
851
+ ```
852
+
853
+ </details>
854
+
855
+ <details>
856
+
857
+ <summary>Examples</summary>
858
+
859
+ Example code for JoyAI-Image is available at: [/examples/joyai_image/](/examples/joyai_image/)
860
+
861
+ | Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
862
+ |-|-|-|-|-|-|-|
863
+ |[jd-opensource/JoyAI-Image-Edit](https://modelscope.cn/models/jd-opensource/JoyAI-Image-Edit)|[code](/examples/joyai_image/model_inference/JoyAI-Image-Edit.py)|[code](/examples/joyai_image/model_inference_low_vram/JoyAI-Image-Edit.py)|[code](/examples/joyai_image/model_training/full/JoyAI-Image-Edit.sh)|[code](/examples/joyai_image/model_training/validate_full/JoyAI-Image-Edit.py)|[code](/examples/joyai_image/model_training/lora/JoyAI-Image-Edit.sh)|[code](/examples/joyai_image/model_training/validate_lora/JoyAI-Image-Edit.py)|
864
+
865
+ </details>
866
+
601
867
  ### Video Synthesis
602
868
 
603
869
  https://github.com/user-attachments/assets/1d66ae74-3b02-40a9-acc3-ea95fc039314
@@ -877,18 +1143,22 @@ Example code for Wan is available at: [/examples/wanvideo/](/examples/wanvideo/)
877
1143
 
878
1144
  </details>
879
1145
 
880
- #### ERNIE-Image: [/docs/en/Model_Details/ERNIE-Image.md](/docs/en/Model_Details/ERNIE-Image.md)
1146
+ ### Audio Synthesis
1147
+
1148
+ #### ACE-Step: [/docs/en/Model_Details/ACE-Step.md](/docs/en/Model_Details/ACE-Step.md)
881
1149
 
882
1150
  <details>
883
1151
 
884
1152
  <summary>Quick Start</summary>
885
1153
 
886
- Running the following code will quickly load the [PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.
1154
+ Running the following code will quickly load the [ACE-Step/Ace-Step1.5](https://www.modelscope.cn/models/ACE-Step/Ace-Step1.5) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.
887
1155
 
888
1156
  ```python
889
- from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig
1157
+ from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
1158
+ from diffsynth.utils.data.audio import save_audio
890
1159
  import torch
891
1160
 
1161
+
892
1162
  vram_config = {
893
1163
  "offload_dtype": torch.bfloat16,
894
1164
  "offload_device": "cpu",
@@ -899,28 +1169,34 @@ vram_config = {
899
1169
  "computation_dtype": torch.bfloat16,
900
1170
  "computation_device": "cuda",
901
1171
  }
902
- pipe = ErnieImagePipeline.from_pretrained(
1172
+
1173
+
1174
+ pipe = AceStepPipeline.from_pretrained(
903
1175
  torch_dtype=torch.bfloat16,
904
- device='cuda',
1176
+ device="cuda",
905
1177
  model_configs=[
906
- ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config),
907
- ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
908
- ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
1178
+ ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors", **vram_config),
1179
+ ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config),
1180
+ ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
909
1181
  ],
910
- tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"),
1182
+ text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
911
1183
  vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
912
1184
  )
913
1185
 
914
- image = pipe(
915
- prompt="一只黑白相间的中华田园犬",
916
- negative_prompt="",
917
- height=1024,
918
- width=1024,
1186
+ prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
1187
+ lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
1188
+ audio = pipe(
1189
+ prompt=prompt,
1190
+ lyrics=lyrics,
1191
+ duration=160,
1192
+ bpm=100,
1193
+ keyscale="B minor",
1194
+ timesignature="4",
1195
+ vocal_language="zh",
919
1196
  seed=42,
920
- num_inference_steps=50,
921
- cfg_scale=4.0,
922
1197
  )
923
- image.save("output.jpg")
1198
+
1199
+ save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo.wav")
924
1200
  ```
925
1201
 
926
1202
  </details>
@@ -929,12 +1205,21 @@ image.save("output.jpg")
929
1205
 
930
1206
  <summary>Examples</summary>
931
1207
 
932
- Example code for ERNIE-Image is available at: [/examples/ernie_image/](/examples/ernie_image/)
1208
+ Example code for ACE-Step is available at: [/examples/ace_step/](/examples/ace_step/)
933
1209
 
934
1210
  | Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
935
1211
  |-|-|-|-|-|-|-|
936
- |[PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image)|[code](/examples/ernie_image/model_inference/ERNIE-Image.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/full/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_full/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/lora/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py)|
937
- |[PaddlePaddle/ERNIE-Image-Turbo](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image-Turbo)|[code](/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py)|—|—|—|—|
1212
+ |[ACE-Step/Ace-Step1.5](https://www.modelscope.cn/models/ACE-Step/Ace-Step1.5)|[code](/examples/ace_step/model_inference/Ace-Step1.5.py)|[code](/examples/ace_step/model_inference_low_vram/Ace-Step1.5.py)|[code](/examples/ace_step/model_training/full/Ace-Step1.5.sh)|[code](/examples/ace_step/model_training/validate_full/Ace-Step1.5.py)|[code](/examples/ace_step/model_training/lora/Ace-Step1.5.sh)|[code](/examples/ace_step/model_training/validate_lora/Ace-Step1.5.py)|
1213
+ |[ACE-Step/acestep-v15-turbo-shift1](https://www.modelscope.cn/models/ACE-Step/acestep-v15-turbo-shift1)|[code](/examples/ace_step/model_inference/acestep-v15-turbo-shift1.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift1.py)|[code](/examples/ace_step/model_training/full/acestep-v15-turbo-shift1.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-turbo-shift1.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-turbo-shift1.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-shift1.py)|
1214
+ |[ACE-Step/acestep-v15-turbo-shift3](https://www.modelscope.cn/models/ACE-Step/acestep-v15-turbo-shift3)|[code](/examples/ace_step/model_inference/acestep-v15-turbo-shift3.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift3.py)|[code](/examples/ace_step/model_training/full/acestep-v15-turbo-shift3.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-turbo-shift3.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-turbo-shift3.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-shift3.py)|
1215
+ |[ACE-Step/acestep-v15-turbo-continuous](https://www.modelscope.cn/models/ACE-Step/acestep-v15-turbo-continuous)|[code](/examples/ace_step/model_inference/acestep-v15-turbo-continuous.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-continuous.py)|[code](/examples/ace_step/model_training/full/acestep-v15-turbo-continuous.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-turbo-continuous.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-turbo-continuous.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-continuous.py)|
1216
+ |[ACE-Step/acestep-v15-base](https://www.modelscope.cn/models/ACE-Step/acestep-v15-base)|[code](/examples/ace_step/model_inference/acestep-v15-base.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-base.py)|[code](/examples/ace_step/model_training/full/acestep-v15-base.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-base.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-base.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-base.py)|
1217
+ |[ACE-Step/acestep-v15-base: CoverTask](https://www.modelscope.cn/models/ACE-Step/acestep-v15-base)|[code](/examples/ace_step/model_inference/acestep-v15-base-CoverTask.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-base-CoverTask.py)|—|—|—|—|
1218
+ |[ACE-Step/acestep-v15-base: RepaintTask](https://www.modelscope.cn/models/ACE-Step/acestep-v15-base)|[code](/examples/ace_step/model_inference/acestep-v15-base-RepaintTask.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-base-RepaintTask.py)|—|—|—|—|
1219
+ |[ACE-Step/acestep-v15-sft](https://www.modelscope.cn/models/ACE-Step/acestep-v15-sft)|[code](/examples/ace_step/model_inference/acestep-v15-sft.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-sft.py)|[code](/examples/ace_step/model_training/full/acestep-v15-sft.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-sft.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-sft.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-sft.py)|
1220
+ |[ACE-Step/acestep-v15-xl-base](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-base)|[code](/examples/ace_step/model_inference/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-base.py)|
1221
+ |[ACE-Step/acestep-v15-xl-sft](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-sft)|[code](/examples/ace_step/model_inference/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-sft.py)|
1222
+ |[ACE-Step/acestep-v15-xl-turbo](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-turbo)|[code](/examples/ace_step/model_inference/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-turbo.py)|
938
1223
 
939
1224
  </details>
940
1225
 
@@ -42,6 +42,7 @@ qwen_image_series = [
42
42
  "model_hash": "5722b5c873720009de96422993b15682",
43
43
  "model_name": "dinov3_image_encoder",
44
44
  "model_class": "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder",
45
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.dino_v3.DINOv3StateDictConverter",
45
46
  },
46
47
  {
47
48
  # Example:
@@ -900,4 +901,146 @@ mova_series = [
900
901
  "model_class": "diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge",
901
902
  },
902
903
  ]
903
- MODEL_CONFIGS = qwen_image_series + wan_series + flux_series + flux2_series + ernie_image_series + z_image_series + ltx2_series + anima_series + mova_series
904
+ stable_diffusion_xl_series = [
905
+ {
906
+ # Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="unet/diffusion_pytorch_model.safetensors")
907
+ "model_hash": "142b114f67f5ab3a6d83fb5788f12ded",
908
+ "model_name": "stable_diffusion_xl_unet",
909
+ "model_class": "diffsynth.models.stable_diffusion_xl_unet.SDXLUNet2DConditionModel",
910
+ "extra_kwargs": {"attention_head_dim": [5, 10, 20], "transformer_layers_per_block": [1, 2, 10], "use_linear_projection": True, "addition_embed_type": "text_time", "addition_time_embed_dim": 256, "projection_class_embeddings_input_dim": 2816},
911
+ },
912
+ {
913
+ # Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder_2/model.safetensors")
914
+ "model_hash": "98cc34ccc5b54ae0e56bdea8688dcd5a",
915
+ "model_name": "stable_diffusion_xl_text_encoder",
916
+ "model_class": "diffsynth.models.stable_diffusion_xl_text_encoder.SDXLTextEncoder2",
917
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_xl_text_encoder.SDXLTextEncoder2StateDictConverter",
918
+ },
919
+ {
920
+ # Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder/model.safetensors")
921
+ "model_hash": "94eefa3dac9cec93cb1ebaf1747d7b78",
922
+ "model_name": "stable_diffusion_text_encoder",
923
+ "model_class": "diffsynth.models.stable_diffusion_text_encoder.SDTextEncoder",
924
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_text_encoder.SDTextEncoderStateDictConverter",
925
+ },
926
+ {
927
+ # Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
928
+ "model_hash": "13115dd45a6e1c39860f91ab073b8a78",
929
+ "model_name": "stable_diffusion_xl_vae",
930
+ "model_class": "diffsynth.models.stable_diffusion_vae.StableDiffusionVAE",
931
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_vae.SDVAEStateDictConverter",
932
+ "extra_kwargs": {"scaling_factor": 0.13025, "sample_size": 1024, "force_upcast": True},
933
+ },
934
+ ]
935
+
936
+ stable_diffusion_series = [
937
+ {
938
+ # Example: ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="text_encoder/model.safetensors")
939
+ "model_hash": "ffd1737ae9df7fd43f5fbed653bdad67",
940
+ "model_name": "stable_diffusion_text_encoder",
941
+ "model_class": "diffsynth.models.stable_diffusion_text_encoder.SDTextEncoder",
942
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_text_encoder.SDTextEncoderStateDictConverter",
943
+ },
944
+ {
945
+ # Example: ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
946
+ "model_hash": "f86d5683ed32433be8ca69969c67ba69",
947
+ "model_name": "stable_diffusion_vae",
948
+ "model_class": "diffsynth.models.stable_diffusion_vae.StableDiffusionVAE",
949
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_vae.SDVAEStateDictConverter",
950
+ },
951
+ {
952
+ # Example: ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="unet/diffusion_pytorch_model.safetensors")
953
+ "model_hash": "025a4b86a84829399d89f613e580757b",
954
+ "model_name": "stable_diffusion_unet",
955
+ "model_class": "diffsynth.models.stable_diffusion_unet.UNet2DConditionModel",
956
+ },
957
+ ]
958
+
959
+ joyai_image_series = [
960
+ {
961
+ # Example: ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="transformer/transformer.pth")
962
+ "model_hash": "56592ddfd7d0249d3aa527d24161a863",
963
+ "model_name": "joyai_image_dit",
964
+ "model_class": "diffsynth.models.joyai_image_dit.JoyAIImageDiT",
965
+ },
966
+ {
967
+ # Example: ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/model-*.safetensors")
968
+ "model_hash": "2d11bf14bba8b4e87477c8199a895403",
969
+ "model_name": "joyai_image_text_encoder",
970
+ "model_class": "diffsynth.models.joyai_image_text_encoder.JoyAIImageTextEncoder",
971
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.joyai_image_text_encoder.JoyAIImageTextEncoderStateDictConverter",
972
+ },
973
+ ]
974
+
975
+ ace_step_series = [
976
+ {
977
+ # Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors")
978
+ "model_hash": "ba29d8bddbb6ace65675f6a757a13c00",
979
+ "model_name": "ace_step_dit",
980
+ "model_class": "diffsynth.models.ace_step_dit.AceStepDiTModel",
981
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_dit.AceStepDiTModelStateDictConverter",
982
+ },
983
+ {
984
+ # Example: ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors")
985
+ "model_hash": "3a28a410c2246f125153ef792d8bc828",
986
+ "model_name": "ace_step_dit",
987
+ "model_class": "diffsynth.models.ace_step_dit.AceStepDiTModel",
988
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_dit.AceStepDiTModelStateDictConverter",
989
+ "extra_kwargs": {
990
+ "hidden_size": 2560,
991
+ "intermediate_size": 9728,
992
+ "num_hidden_layers": 32,
993
+ "num_attention_heads": 32,
994
+ "num_key_value_heads": 8,
995
+ "head_dim": 128,
996
+ "encoder_hidden_size": 2048,
997
+ "layer_types": ["sliding_attention", "full_attention"] * 16,
998
+ },
999
+ },
1000
+ {
1001
+ # Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors")
1002
+ "model_hash": "ba29d8bddbb6ace65675f6a757a13c00",
1003
+ "model_name": "ace_step_conditioner",
1004
+ "model_class": "diffsynth.models.ace_step_conditioner.AceStepConditionEncoder",
1005
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_conditioner.AceStepConditionEncoderStateDictConverter",
1006
+ },
1007
+ {
1008
+ # Example: ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors")
1009
+ "model_hash": "3a28a410c2246f125153ef792d8bc828",
1010
+ "model_name": "ace_step_conditioner",
1011
+ "model_class": "diffsynth.models.ace_step_conditioner.AceStepConditionEncoder",
1012
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_conditioner.AceStepConditionEncoderStateDictConverter",
1013
+ },
1014
+ {
1015
+ # Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors")
1016
+ "model_hash": "3509bea17b0e8cffc3dd4a15cc7899d0",
1017
+ "model_name": "ace_step_text_encoder",
1018
+ "model_class": "diffsynth.models.ace_step_text_encoder.AceStepTextEncoder",
1019
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_text_encoder.AceStepTextEncoderStateDictConverter",
1020
+ },
1021
+ {
1022
+ # Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
1023
+ "model_hash": "51420834e54474986a7f4be0e4d6f687",
1024
+ "model_name": "ace_step_vae",
1025
+ "model_class": "diffsynth.models.ace_step_vae.AceStepVAE",
1026
+ },
1027
+ {
1028
+ # Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors")
1029
+ "model_hash": "ba29d8bddbb6ace65675f6a757a13c00",
1030
+ "model_name": "ace_step_tokenizer",
1031
+ "model_class": "diffsynth.models.ace_step_tokenizer.AceStepTokenizer",
1032
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_tokenizer.AceStepTokenizerStateDictConverter",
1033
+ },
1034
+ {
1035
+ # Example: ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors")
1036
+ "model_hash": "3a28a410c2246f125153ef792d8bc828",
1037
+ "model_name": "ace_step_tokenizer",
1038
+ "model_class": "diffsynth.models.ace_step_tokenizer.AceStepTokenizer",
1039
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_tokenizer.AceStepTokenizerStateDictConverter",
1040
+ },
1041
+ ]
1042
+
1043
+ MODEL_CONFIGS = (
1044
+ stable_diffusion_xl_series + stable_diffusion_series + qwen_image_series + wan_series + flux_series + flux2_series + ernie_image_series
1045
+ + z_image_series + ltx2_series + anima_series + mova_series + joyai_image_series + ace_step_series
1046
+ )