diffsynth 2.0.12__tar.gz → 2.0.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223) hide show
  1. {diffsynth-2.0.12 → diffsynth-2.0.14}/PKG-INFO +1 -1
  2. {diffsynth-2.0.12 → diffsynth-2.0.14}/README.md +147 -3
  3. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/configs/model_configs.py +129 -0
  4. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/configs/vram_management_module_maps.py +13 -0
  5. diffsynth-2.0.14/diffsynth/core/attention/attention.py +191 -0
  6. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/data/operators.py +9 -5
  7. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/data/unified_dataset.py +1 -1
  8. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/loader/model.py +5 -1
  9. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/vram/layers.py +3 -3
  10. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/diffusion/flow_match.py +31 -2
  11. diffsynth-2.0.14/diffsynth/diffusion/logger.py +107 -0
  12. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/diffusion/parsers.py +14 -3
  13. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/diffusion/runner.py +17 -2
  14. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/diffusion/training_module.py +11 -0
  15. diffsynth-2.0.14/diffsynth/metrics/__init__.py +32 -0
  16. diffsynth-2.0.14/diffsynth/metrics/aesthetic.py +42 -0
  17. diffsynth-2.0.14/diffsynth/metrics/base.py +28 -0
  18. diffsynth-2.0.14/diffsynth/metrics/bioclip.py +45 -0
  19. diffsynth-2.0.14/diffsynth/metrics/clip.py +55 -0
  20. diffsynth-2.0.14/diffsynth/metrics/fid.py +37 -0
  21. diffsynth-2.0.14/diffsynth/metrics/hpsv2.py +41 -0
  22. diffsynth-2.0.14/diffsynth/metrics/hpsv3.py +63 -0
  23. diffsynth-2.0.14/diffsynth/metrics/image_reward.py +48 -0
  24. diffsynth-2.0.14/diffsynth/metrics/lpips.py +63 -0
  25. diffsynth-2.0.14/diffsynth/metrics/pickscore.py +59 -0
  26. diffsynth-2.0.14/diffsynth/metrics/qwen_image_bench.py +70 -0
  27. diffsynth-2.0.14/diffsynth/metrics/unified_reward_2.py +69 -0
  28. diffsynth-2.0.14/diffsynth/metrics/unified_reward_edit.py +97 -0
  29. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ace_step_conditioner.py +5 -26
  30. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ace_step_dit.py +33 -454
  31. diffsynth-2.0.14/diffsynth/models/aesthetic.py +90 -0
  32. diffsynth-2.0.14/diffsynth/models/bioclip.py +118 -0
  33. diffsynth-2.0.14/diffsynth/models/clip.py +153 -0
  34. diffsynth-2.0.14/diffsynth/models/demucs.py +483 -0
  35. diffsynth-2.0.14/diffsynth/models/fid.py +238 -0
  36. diffsynth-2.0.14/diffsynth/models/hpsv2.py +92 -0
  37. diffsynth-2.0.14/diffsynth/models/hpsv3.py +353 -0
  38. diffsynth-2.0.14/diffsynth/models/ideogram4_dit.py +447 -0
  39. diffsynth-2.0.14/diffsynth/models/ideogram4_text_encoder.py +353 -0
  40. diffsynth-2.0.14/diffsynth/models/ideogram4_vae.py +74 -0
  41. diffsynth-2.0.14/diffsynth/models/image_reward.py +206 -0
  42. diffsynth-2.0.14/diffsynth/models/lpips.py +351 -0
  43. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/model_loader.py +2 -1
  44. diffsynth-2.0.14/diffsynth/models/pickscore.py +84 -0
  45. diffsynth-2.0.14/diffsynth/models/qwen_image_bench.py +593 -0
  46. diffsynth-2.0.14/diffsynth/models/unified_reward_2.py +230 -0
  47. diffsynth-2.0.14/diffsynth/models/unified_reward_edit.py +377 -0
  48. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/ace_step.py +29 -15
  49. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/hidream_o1_image.py +5 -0
  50. diffsynth-2.0.14/diffsynth/pipelines/ideogram4.py +297 -0
  51. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/ltx2_audio_video.py +1 -2
  52. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/z_image.py +6 -1
  53. diffsynth-2.0.14/diffsynth/utils/demucs/__init__.py +21 -0
  54. diffsynth-2.0.14/diffsynth/utils/dequantizer/__init__.py +15 -0
  55. diffsynth-2.0.14/diffsynth/utils/state_dict_converters/image_metrics.py +135 -0
  56. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth.egg-info/PKG-INFO +1 -1
  57. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth.egg-info/SOURCES.txt +34 -0
  58. {diffsynth-2.0.12 → diffsynth-2.0.14}/pyproject.toml +1 -1
  59. diffsynth-2.0.12/diffsynth/core/attention/attention.py +0 -121
  60. diffsynth-2.0.12/diffsynth/diffusion/logger.py +0 -43
  61. {diffsynth-2.0.12 → diffsynth-2.0.14}/LICENSE +0 -0
  62. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/__init__.py +0 -0
  63. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/configs/__init__.py +0 -0
  64. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/__init__.py +0 -0
  65. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/attention/__init__.py +0 -0
  66. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/data/__init__.py +0 -0
  67. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/device/__init__.py +0 -0
  68. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/device/npu_compatible_device.py +0 -0
  69. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/gradient/__init__.py +0 -0
  70. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/gradient/gradient_checkpoint.py +0 -0
  71. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/loader/__init__.py +0 -0
  72. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/loader/config.py +0 -0
  73. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/loader/file.py +0 -0
  74. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
  75. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/offload_training/__init__.py +0 -0
  76. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/offload_training/manager.py +0 -0
  77. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/offload_training/memory_buffer.py +0 -0
  78. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/offload_training/offloader.py +0 -0
  79. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/vram/__init__.py +0 -0
  80. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/vram/disk_map.py +0 -0
  81. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/core/vram/initialization.py +0 -0
  82. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/diffusion/__init__.py +0 -0
  83. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/diffusion/base_pipeline.py +0 -0
  84. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/diffusion/ddim_scheduler.py +0 -0
  85. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/diffusion/loss.py +0 -0
  86. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/diffusion/template.py +0 -0
  87. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ace_step_residual_fsq.py +0 -0
  88. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ace_step_text_encoder.py +0 -0
  89. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ace_step_tokenizer.py +0 -0
  90. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ace_step_vae.py +0 -0
  91. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/anima_dit.py +0 -0
  92. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/dinov3_image_encoder.py +0 -0
  93. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ernie_image_dit.py +0 -0
  94. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ernie_image_text_encoder.py +0 -0
  95. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux2_dit.py +0 -0
  96. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux2_text_encoder.py +0 -0
  97. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux2_vae.py +0 -0
  98. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux_controlnet.py +0 -0
  99. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux_dit.py +0 -0
  100. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux_infiniteyou.py +0 -0
  101. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux_ipadapter.py +0 -0
  102. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux_lora_encoder.py +0 -0
  103. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux_lora_patcher.py +0 -0
  104. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux_text_encoder_clip.py +0 -0
  105. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux_text_encoder_t5.py +0 -0
  106. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux_vae.py +0 -0
  107. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/flux_value_control.py +0 -0
  108. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/general_modules.py +0 -0
  109. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/hidream_common.py +0 -0
  110. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/hidream_o1_image_dit.py +0 -0
  111. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/joyai_image_dit.py +0 -0
  112. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/joyai_image_text_encoder.py +0 -0
  113. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/longcat_video_dit.py +0 -0
  114. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ltx2_audio_vae.py +0 -0
  115. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ltx2_common.py +0 -0
  116. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ltx2_dit.py +0 -0
  117. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ltx2_text_encoder.py +0 -0
  118. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ltx2_upsampler.py +0 -0
  119. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/ltx2_video_vae.py +0 -0
  120. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/mova_audio_dit.py +0 -0
  121. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/mova_audio_vae.py +0 -0
  122. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/mova_dual_tower_bridge.py +0 -0
  123. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/nexus_gen.py +0 -0
  124. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/nexus_gen_ar_model.py +0 -0
  125. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/nexus_gen_projector.py +0 -0
  126. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/qwen_image_controlnet.py +0 -0
  127. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/qwen_image_dit.py +0 -0
  128. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/qwen_image_image2lora.py +0 -0
  129. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/qwen_image_text_encoder.py +0 -0
  130. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/qwen_image_vae.py +0 -0
  131. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/sd_text_encoder.py +0 -0
  132. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/siglip2_image_encoder.py +0 -0
  133. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_text_encoder.py +0 -0
  134. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_unet.py +0 -0
  135. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_vae.py +0 -0
  136. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_xl_text_encoder.py +0 -0
  137. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_xl_unet.py +0 -0
  138. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/step1x_connector.py +0 -0
  139. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/step1x_text_encoder.py +0 -0
  140. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/wan_video_animate_adapter.py +0 -0
  141. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/wan_video_camera_controller.py +0 -0
  142. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/wan_video_dit.py +0 -0
  143. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/wan_video_dit_s2v.py +0 -0
  144. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/wan_video_image_encoder.py +0 -0
  145. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/wan_video_mot.py +0 -0
  146. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/wan_video_motion_controller.py +0 -0
  147. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/wan_video_text_encoder.py +0 -0
  148. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/wan_video_vace.py +0 -0
  149. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/wan_video_vae.py +0 -0
  150. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/wantodance.py +0 -0
  151. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/wav2vec.py +0 -0
  152. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/z_image_controlnet.py +0 -0
  153. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/z_image_dit.py +0 -0
  154. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/z_image_image2lora.py +0 -0
  155. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/models/z_image_text_encoder.py +0 -0
  156. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/anima_image.py +0 -0
  157. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/ernie_image.py +0 -0
  158. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/flux2_image.py +0 -0
  159. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/flux_image.py +0 -0
  160. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/joyai_image.py +0 -0
  161. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/mova_audio_video.py +0 -0
  162. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/qwen_image.py +0 -0
  163. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/stable_diffusion.py +0 -0
  164. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/stable_diffusion_xl.py +0 -0
  165. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/pipelines/wan_video.py +0 -0
  166. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/controlnet/__init__.py +0 -0
  167. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/controlnet/annotator.py +0 -0
  168. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
  169. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/data/__init__.py +0 -0
  170. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/data/audio.py +0 -0
  171. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/data/audio_video.py +0 -0
  172. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/data/media_io_ltx2.py +0 -0
  173. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/lora/__init__.py +0 -0
  174. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/lora/flux.py +0 -0
  175. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/lora/general.py +0 -0
  176. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/lora/merge.py +0 -0
  177. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/lora/reset_rank.py +0 -0
  178. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/ses/__init__.py +0 -0
  179. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/ses/ses.py +0 -0
  180. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
  181. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ace_step_conditioner.py +0 -0
  182. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ace_step_dit.py +0 -0
  183. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ace_step_text_encoder.py +0 -0
  184. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ace_step_tokenizer.py +0 -0
  185. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
  186. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/dino_v3.py +0 -0
  187. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ernie_image_text_encoder.py +0 -0
  188. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
  189. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
  190. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
  191. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
  192. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
  193. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
  194. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
  195. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
  196. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/joyai_image_text_encoder.py +0 -0
  197. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +0 -0
  198. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
  199. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +0 -0
  200. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +0 -0
  201. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
  202. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
  203. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
  204. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/stable_diffusion_text_encoder.py +0 -0
  205. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/stable_diffusion_vae.py +0 -0
  206. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/stable_diffusion_xl_text_encoder.py +0 -0
  207. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
  208. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
  209. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
  210. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
  211. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
  212. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
  213. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
  214. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
  215. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/z_image_dit.py +0 -0
  216. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
  217. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/xfuser/__init__.py +0 -0
  218. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/utils/xfuser/xdit_context_parallel.py +0 -0
  219. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth/version.py +0 -0
  220. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth.egg-info/dependency_links.txt +0 -0
  221. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth.egg-info/requires.txt +0 -0
  222. {diffsynth-2.0.12 → diffsynth-2.0.14}/diffsynth.egg-info/top_level.txt +0 -0
  223. {diffsynth-2.0.12 → diffsynth-2.0.14}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffsynth
3
- Version: 2.0.12
3
+ Version: 2.0.14
4
4
  Summary: Enjoy the magic of Diffusion models!
5
5
  Author: ModelScope Team
6
6
  License: Apache-2.0
@@ -34,6 +34,17 @@ We believe that a well-developed open-source code framework can lower the thresh
34
34
 
35
35
  > This project currently has a small development team, with most of the work handled by [Artiprocher](https://github.com/Artiprocher) and [mi804](https://github.com/mi804). As a result, new features are developed relatively slowly, and our capacity to respond to and resolve issues is limited. We apologize for this and appreciate developers' understanding.
36
36
 
37
+ - **June 16, 2026**: We have added a new Template model for ACE-Step: [vocals2music](https://www.modelscope.cn/models/DiffSynth-Studio/acestep15xlsft-vocals2music). For more details, please refer to the [documentation](/docs/en/Model_Details/ACE-Step.md) and [example code](/examples/ace_step/).
38
+
39
+ - **June 15, 2026** We have open-sourced Image-to-LoRA V2, compressing the hours-long training process for image style LoRAs into a single model inference step, thereby exploring a new paradigm for LoRA model training. The [technical report](https://arxiv.org/abs/2606.13809) has been released. This release includes three models:
40
+ * [DiffSynth-Studio/ZImage-i2L-v2](https://modelscope.cn/models/DiffSynth-Studio/ZImage-i2L-v2): Adapted for the Z-Image model
41
+ * [DiffSynth-Studio/KleinBase4B-i2L-v2](https://modelscope.cn/models/DiffSynth-Studio/KleinBase4B-i2L-v2): Adapted for the FLUX.2-klein-base-4B model
42
+ * [DiffSynth-Studio/HidreamO1-i2L-v2](https://modelscope.cn/models/DiffSynth-Studio/HidreamO1-i2L-v2): Adapted for the Hidream-O1-Image model
43
+
44
+ - **June 5, 2026** Ideogram 4 open-sourced. Support includes text-to-image inference. For details, please refer to the [documentation](/docs/en/Model_Details/Ideogram-4.md) and [example code](/examples/ideogram4/).
45
+
46
+ - **May 21, 2026**: Added support for image quality metrics models, including FID, CLIP, Aesthetic, PickScore, ImageReward, HPSv2, and HPSv3. For details, refer to the [documentation](/docs/en/Model_Details/Image-Quality-Metrics.md) and [example code](/examples/image_quality_metric/).
47
+
37
48
  - **May 18, 2026** Added **CPU Offload Training** support. By moving model weights layer-by-layer between CPU and GPU, it significantly reduces GPU VRAM usage during training, enabling LoRA training of large models even on consumer-grade GPUs, compatible with all models. Simply add `--enable_model_cpu_offload` to your training command to enable (currently supports single-GPU training only). For details, see the [documentation](/docs/en/Training/Offload_Training.md).
38
49
 
39
50
  - **May 14, 2026** HiDream-O1-Image open-sourced, welcome a new member to the image model family! Support includes text-to-image generation, image editing, low VRAM inference, and training capabilities. For details, please refer to the [documentation](/docs/en/Model_Details/HiDream-O1-Image.md) and [example code](/examples/hidream_o1_image/).
@@ -53,6 +64,9 @@ We believe that a well-developed open-source code framework can lower the thresh
53
64
 
54
65
  - **April 14, 2026** JoyAI-Image open-sourced, welcome a new member to the image editing model family! Support includes instruction-guided image editing, low VRAM inference, and training capabilities. For details, please refer to the [documentation](/docs/en/Model_Details/JoyAI-Image.md) and [example code](/examples/joyai_image/).
55
66
 
67
+ <details>
68
+ <summary>More</summary>
69
+
56
70
  - **March 19, 2026**: Added support for [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) and [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/Wan.md) and [example code](/examples/mova/) are now available.
57
71
 
58
72
  - **March 12, 2026**: We have added support for the [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) audio-video generation model. Supported features include text-to-audio/video, image-to-audio/video, IC-LoRA control, audio-to-video, and audio-video inpainting. Complete inference and training functionality is supported. For details, please refer to the [documentation](/docs/en/Model_Details/LTX-2.md) and [code](/examples/ltx2/).
@@ -61,9 +75,6 @@ We believe that a well-developed open-source code framework can lower the thresh
61
75
 
62
76
  - **March 2, 2026** Added support for [Anima](https://modelscope.cn/models/circlestone-labs/Anima). For details, please refer to the [documentation](docs/en/Model_Details/Anima.md). This is an interesting anime-style image generation model. We look forward to its future updates.
63
77
 
64
- <details>
65
- <summary>More</summary>
66
-
67
78
  - **February 26, 2026** Added full and lora training support for the LTX-2 audio-video generation model. See the [documentation](/docs/en/Model_Details/LTX-2.md) for details.
68
79
 
69
80
  - **February 10, 2026** Added inference support for the LTX-2 audio-video generation model. See the [documentation](/docs/en/Model_Details/LTX-2.md) for details. Support for model training will be implemented in the future.
@@ -313,6 +324,7 @@ Example code for Z-Image is available at: [/examples/z_image/](/examples/z_image
313
324
  |[PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|
314
325
  |[PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|
315
326
  |[PAI/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|
327
+ |[DiffSynth-Studio/ZImage-i2L-v2](https://www.modelscope.cn/models/DiffSynth-Studio/ZImage-i2L-v2)|[code](/examples/z_image/model_inference/ZImage-i2L-v2.py)|[code](/examples/z_image/model_inference_low_vram/ZImage-i2L-v2.py)|[code](/examples/z_image/model_training/full/ZImage-i2L-v2.sh)|[code](/examples/z_image/model_training/validate_full/ZImage-i2L-v2.py)|-|-|
316
328
 
317
329
  </details>
318
330
 
@@ -503,6 +515,7 @@ Example code for FLUX.2 is available at: [/examples/flux2/](/examples/flux2/)
503
515
  |[DiffSynth-Studio/Template-KleinBase4B-SoftRGB](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-SoftRGB)|[code](/examples/flux2/model_inference/Template-KleinBase4B-SoftRGB.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-SoftRGB.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-SoftRGB.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-SoftRGB.py)|-|-|
504
516
  |[DiffSynth-Studio/Template-KleinBase4B-Upscaler](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Upscaler)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Upscaler.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Upscaler.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Upscaler.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Upscaler.py)|-|-|
505
517
  |[DiffSynth-Studio/Template-KleinBase4B-ContentRef](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-ContentRef)|[code](/examples/flux2/model_inference/Template-KleinBase4B-ContentRef.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-ContentRef.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-ContentRef.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-ContentRef.py)|-|-|
518
+ |[DiffSynth-Studio/KleinBase4B-i2L-v2](https://www.modelscope.cn/models/DiffSynth-Studio/KleinBase4B-i2L-v2)|[code](/examples/flux2/model_inference/KleinBase4B-i2L-v2.py)|[code](/examples/flux2/model_inference_low_vram/KleinBase4B-i2L-v2.py)|[code](/examples/flux2/model_training/full/KleinBase4B-i2L-v2.sh)|[code](/examples/flux2/model_training/validate_full/KleinBase4B-i2L-v2.py)|-|-|
506
519
 
507
520
  </details>
508
521
 
@@ -947,6 +960,85 @@ Example code for HiDream-O1-Image is available at: [/examples/hidream_o1_image/]
947
960
  |-|-|-|-|-|-|-|
948
961
  |[HiDream-ai/HiDream-O1-Image](https://modelscope.cn/HiDream-ai/HiDream-O1-Image)|[code](/examples/hidream_o1_image/model_inference/HiDream-O1-Image.py)|[code](/examples/hidream_o1_image/model_inference_low_vram/HiDream-O1-Image.py)|[code](/examples/hidream_o1_image/model_training/full/HiDream-O1-Image.sh)|[code](/examples/hidream_o1_image/model_training/validate_full/HiDream-O1-Image.py)|[code](/examples/hidream_o1_image/model_training/lora/HiDream-O1-Image.sh)|[code](/examples/hidream_o1_image/model_training/validate_lora/HiDream-O1-Image.py)|
949
962
  |[HiDream-ai/HiDream-O1-Image-Dev](https://modelscope.cn/HiDream-ai/HiDream-O1-Image-Dev)|[code](/examples/hidream_o1_image/model_inference/HiDream-O1-Image-Dev.py)|[code](/examples/hidream_o1_image/model_inference_low_vram/HiDream-O1-Image-Dev.py)|[code](/examples/hidream_o1_image/model_training/full/HiDream-O1-Image-Dev.sh)|[code](/examples/hidream_o1_image/model_training/validate_full/HiDream-O1-Image-Dev.py)|[code](/examples/hidream_o1_image/model_training/lora/HiDream-O1-Image-Dev.sh)|[code](/examples/hidream_o1_image/model_training/validate_lora/HiDream-O1-Image-Dev.py)|
963
+ |[DiffSynth-Studio/HidreamO1-i2L-v2](https://www.modelscope.cn/models/DiffSynth-Studio/HidreamO1-i2L-v2)|[code](/examples/hidream_o1_image/model_inference/HidreamO1-i2L-v2.py)|[code](/examples/hidream_o1_image/model_inference_low_vram/HidreamO1-i2L-v2.py)|[code](/examples/hidream_o1_image/model_training/full/HidreamO1-i2L-v2.sh)|[code](/examples/hidream_o1_image/model_training/validate_full/HidreamO1-i2L-v2.py)|-|-|
964
+
965
+ </details>
966
+
967
+ #### Ideogram 4: [/docs/en/Model_Details/Ideogram-4.md](/docs/en/Model_Details/Ideogram-4.md)
968
+
969
+ <details>
970
+
971
+ <summary>Quick Start</summary>
972
+
973
+ Running the following code will quickly load the [ideogram-ai/ideogram-4-fp8](https://www.modelscope.cn/models/ideogram-ai/ideogram-4-fp8) model and perform inference. The model can run with a minimum of 24GB VRAM.
974
+
975
+ ```python
976
+ from diffsynth.pipelines.ideogram4 import Ideogram4Pipeline
977
+ from diffsynth.core import ModelConfig
978
+ import torch
979
+
980
+
981
+ pipe = Ideogram4Pipeline.from_pretrained(
982
+ torch_dtype=torch.bfloat16,
983
+ device="cuda",
984
+ model_configs=[
985
+ ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="transformer/diffusion_pytorch_model.safetensors"),
986
+ # unconditional_transformer is optional. You can delete this line to reduce VRAM required.
987
+ ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="unconditional_transformer/diffusion_pytorch_model.safetensors"),
988
+ ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="text_encoder/model.safetensors"),
989
+ ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
990
+ ],
991
+ tokenizer_config=ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="tokenizer/"),
992
+ )
993
+ prompt = r"""
994
+ {
995
+ "high_level_description": "A medium-shot photograph of Formula 1 driver Max Verstappen wearing his Red Bull Racing racing suit and cap, smiling as he holds his racing helmet and talks to a man in a white shirt and black vest at a race track.",
996
+ "style_description": {
997
+ "aesthetics": "saturated primary colors, rule of thirds, joyful and triumphant",
998
+ "lighting": "overcast daylight, diffused, soft subtle shadows",
999
+ "photo": "shallow depth of field, sharp focus, eye-level, telephoto",
1000
+ "medium": "photograph"
1001
+ },
1002
+ "compositional_deconstruction": {
1003
+ "background": "The background is an out-of-focus racing paddock or track environment. Several blurred figures are visible, including one in an orange shirt. A purple and white structure with a red 'F1' logo stands on the left. The scene is outdoors with daylight, though the sky is not visible.",
1004
+ "elements": [
1005
+ {"type": "obj", "bbox": [55, 642, 1000, 937], "desc": "An older man standing in profile, facing left toward Max Verstappen. He has grey hair and fair skin. He is wearing a white long-sleeved button-down shirt with a navy blue quilted vest over it. He has a slight smile."},
1006
+ {"type": "obj", "bbox": [34, 137, 1000, 617], "desc": "Max Verstappen, a fair-skinned male Formula 1 driver, positioned in the center. He is facing forward with a joyful expression and a slight smile. He wears a navy blue Red Bull Racing team uniform with numerous sponsor logos and a matching baseball cap with the number '1'. He is holding a white and red racing helmet in his hands. He has a silver watch on his left wrist."},
1007
+ {"type": "obj", "bbox": [422, 212, 792, 452], "desc": "Max Verstappen's racing helmet, held in front of his chest. It features a white, red, and yellow design with the Red Bull logo and the 'Player 0.0' branding. The visor is clear and open."},
1008
+ {"type": "text", "bbox": [657, 0, 755, 142], "text": "F1", "desc": "Large, stylized red logo on a black and purple background in the lower left."},
1009
+ {"type": "text", "bbox": [768, 0, 818, 147], "text": "Formula 1\nWorld Championship™", "desc": "Small white sans-serif text below the F1 logo on the left side."},
1010
+ {"type": "text", "bbox": [78, 447, 117, 510], "text": "ORACLE\nRed Bull\nRacing", "desc": "Very small white and orange logo on the front of the navy blue cap."},
1011
+ {"type": "text", "bbox": [78, 417, 120, 440], "text": "1", "desc": "Bold red numeral '1' on the front left side of the navy blue cap."},
1012
+ {"type": "text", "bbox": [332, 442, 363, 483], "text": "Red Bull", "desc": "Small yellow and red text logo on the collar of the uniform."},
1013
+ {"type": "text", "bbox": [373, 490, 423, 532], "text": "RAUCH", "desc": "Small yellow and blue logo on the right chest of the uniform."},
1014
+ {"type": "text", "bbox": [422, 473, 500, 532], "text": "BYBIT\nHONDA", "desc": "Medium-sized white sans-serif text on the right chest of the uniform."},
1015
+ {"type": "text", "bbox": [410, 203, 442, 257], "text": "RAUCH", "desc": "Small yellow logo on the left upper arm of the uniform."},
1016
+ {"type": "text", "bbox": [530, 448, 627, 510], "text": "Red Bull", "desc": "Medium red text logo on the right side of the torso, part of the Red Bull graphic."},
1017
+ {"type": "text", "bbox": [680, 417, 768, 523], "text": "Red Bull", "desc": "Large red text logo across the lower torso of the uniform."},
1018
+ {"type": "text", "bbox": [797, 475, 815, 518], "text": "MAX", "desc": "Small white text next to a Dutch flag on the belt area of the uniform."},
1019
+ {"type": "text", "bbox": [558, 317, 715, 355], "text": "Player 0.0", "desc": "Black sans-serif text on a white band on the racing helmet."},
1020
+ {"type": "text", "bbox": [560, 800, 582, 835], "text": "IA.COM", "desc": "Small blue sans-serif text on the right sleeve of the white shirt."},
1021
+ {"type": "text", "bbox": [968, 8, 997, 332], "text": "© Anadolu Agency via Getty Images", "desc": "Small white watermark text in the bottom left corner."}
1022
+ ]
1023
+ }
1024
+ }
1025
+ """
1026
+ image = pipe(prompt=prompt, height=1024, width=1024, num_inference_steps=48, cfg_scale=7.0, seed=42)
1027
+ image.save("image_ideogram-4-fp8.jpg")
1028
+ ```
1029
+
1030
+ </details>
1031
+
1032
+ <details>
1033
+
1034
+ <summary>Examples</summary>
1035
+
1036
+ Example code for Ideogram 4 is available at: [/examples/ideogram4/](/examples/ideogram4/)
1037
+
1038
+ | Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
1039
+ |-|-|-|-|-|-|-|
1040
+ |[ideogram-ai/ideogram-4-fp8](https://www.modelscope.cn/models/ideogram-ai/ideogram-4-fp8)|[code](/examples/ideogram4/model_inference/ideogram-4-fp8.py)|-|-|-|-|-|
1041
+ |[DiffSynth-Studio/ideogram-4-bf16-repackage](https://www.modelscope.cn/models/DiffSynth-Studio/ideogram-4-bf16-repackage)|[code](/examples/ideogram4/model_inference/ideogram-4-bf16-repackage.py)|[code](/examples/ideogram4/model_inference_low_vram/ideogram-4-bf16-repackage.py)|[code](/examples/ideogram4/model_training/full/Ideogram-4-bf16-repackage.sh)|-|[code](/examples/ideogram4/model_training/lora/Ideogram-4-bf16-repackage.sh)|[code](/examples/ideogram4/model_training/validate_lora/Ideogram-4-bf16-repackage.py)|
950
1042
 
951
1043
  </details>
952
1044
 
@@ -1062,6 +1154,7 @@ Example code for LTX-2 is available at: [/examples/ltx2/](/examples/ltx2/)
1062
1154
 
1063
1155
  | Model ID | Extra Args | Inference | Low-VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
1064
1156
  |-|-|-|-|-|-|-|-|
1157
+ |[jd-opensource/JoyAI-Echo](https://modelscope.cn/models/jd-opensource/JoyAI-Echo)||[code](/examples/ltx2/model_inference/JoyAI-Echo-T2AV.py)|[code](/examples/ltx2/model_inference_low_vram/JoyAI-Echo-T2AV.py)|[code](/examples/ltx2/model_training/full/JoyAI-Echo-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/JoyAI-Echo-T2AV.py)|[code](/examples/ltx2/model_training/lora/JoyAI-Echo-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/JoyAI-Echo-T2AV.py)|
1065
1158
  |[Lightricks/LTX-2.3: OneStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2.3-I2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2.3-I2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2.3-I2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-I2AV.py)|
1066
1159
  |[Lightricks/LTX-2.3: TwoStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-TwoStage.py)|-|-|-|-|
1067
1160
  |[Lightricks/LTX-2.3: DistilledPipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-DistilledPipeline.py)|-|-|-|-|
@@ -1306,6 +1399,57 @@ Example code for ACE-Step is available at: [/examples/ace_step/](/examples/ace_s
1306
1399
  |[ACE-Step/acestep-v15-xl-base](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-base)|[code](/examples/ace_step/model_inference/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-base.py)|
1307
1400
  |[ACE-Step/acestep-v15-xl-sft](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-sft)|[code](/examples/ace_step/model_inference/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-sft.py)|
1308
1401
  |[ACE-Step/acestep-v15-xl-turbo](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-turbo)|[code](/examples/ace_step/model_inference/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-turbo.py)|
1402
+ |[DiffSynth-Studio/acestep15xlsft-lora-music](https://www.modelscope.cn/models/DiffSynth-Studio/acestep15xlsft-lora-music)|[code](/examples/ace_step/model_inference/acestep15xlsft-vocals2music.py)|[code](/examples/ace_step/model_inference_low_vram/acestep15xlsft-vocals2music.py)|[code](/examples/ace_step/model_training/full/acestep15xlsft-vocals2music.sh)|[code](/examples/ace_step/model_training/validate_full/acestep15xlsft-vocals2music.py)|-|-|
1403
+
1404
+ </details>
1405
+
1406
+ ### Image Quality Metrics Models
1407
+
1408
+ [/docs/en/Model_Details/Image-Quality-Metrics.md](/docs/en/Model_Details/Image-Quality-Metrics.md)
1409
+
1410
+ <details>
1411
+
1412
+ <summary>Quick Start</summary>
1413
+
1414
+ Run the following code to quickly load PickScore and evaluate an image against a text prompt. The default model will be downloaded from ModelScope to `./models`.
1415
+
1416
+ ```python
1417
+ from diffsynth.metrics import PickScoreMetric, ModelConfig
1418
+ from modelscope import dataset_snapshot_download
1419
+ from PIL import Image
1420
+
1421
+ dataset_snapshot_download(
1422
+ "DiffSynth-Studio/diffsynth_example_dataset",
1423
+ allow_file_pattern="flux/FLUX.1-dev/*",
1424
+ local_dir="./data/diffsynth_example_dataset",
1425
+ )
1426
+ image = Image.open("data/diffsynth_example_dataset/flux/FLUX.1-dev/1.jpg").convert("RGB")
1427
+ prompt = "a dog"
1428
+ metric = PickScoreMetric.from_pretrained(
1429
+ model_config=ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="PickScore/model.safetensors"),
1430
+ device="cuda"
1431
+ )
1432
+ score = metric.compute(prompt, image)[0]
1433
+ print(f"PickScore score: {score:.3f}")
1434
+ ```
1435
+
1436
+ </details>
1437
+
1438
+ <details>
1439
+
1440
+ <summary>Example Code</summary>
1441
+
1442
+ Example code for image quality metrics models can be found at: [/examples/image_quality_metric/](/examples/image_quality_metric/)
1443
+
1444
+ | Metric | GitHub Repository | Example Code |
1445
+ | - | - | - |
1446
+ | PickScore | [GitHub](https://github.com/yuvalkirstain/pickscore) | [code](/examples/image_quality_metric/pickscore.py) |
1447
+ | ImageReward | [GitHub](https://github.com/zai-org/ImageReward) | [code](/examples/image_quality_metric/image_reward.py) |
1448
+ | HPSv2 | [GitHub](https://github.com/tgxs002/HPSv2) | [code](/examples/image_quality_metric/hpsv2.py) |
1449
+ | HPSv3 | [GitHub](https://github.com/MizzenAI/HPSv3) | [code](/examples/image_quality_metric/hpsv3.py) |
1450
+ | CLIP Score | [GitHub](https://github.com/openai/CLIP) | [code](/examples/image_quality_metric/clipscore.py) |
1451
+ | Aesthetic | [GitHub](https://github.com/christophschuhmann/improved-aesthetic-predictor) | [code](/examples/image_quality_metric/aesthetic.py) |
1452
+ | FID | [GitHub](https://github.com/mseitzer/pytorch-fid) | [code](/examples/image_quality_metric/fid.py) |
1309
1453
 
1310
1454
  </details>
1311
1455
 
@@ -951,6 +951,7 @@ joyai_image_series = [
951
951
  },
952
952
  {
953
953
  # Example: ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/model-*.safetensors")
954
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="UnifiedReward-Edit-qwen3vl-8b/model-*.safetensors")
954
955
  "model_hash": "2d11bf14bba8b4e87477c8199a895403",
955
956
  "model_name": "joyai_image_text_encoder",
956
957
  "model_class": "diffsynth.models.joyai_image_text_encoder.JoyAIImageTextEncoder",
@@ -1024,6 +1025,103 @@ ace_step_series = [
1024
1025
  "model_class": "diffsynth.models.ace_step_tokenizer.AceStepTokenizer",
1025
1026
  "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_tokenizer.AceStepTokenizerStateDictConverter",
1026
1027
  },
1028
+ {
1029
+ # Example: TODO - document the ModelConfig (model_id / origin_file_pattern) that provides the Demucs weights.
1030
+ "model_hash": "ff74b1806e6a0b52e7bbd1d3df2d26d1",
1031
+ "model_name": "demucs",
1032
+ "model_class": "diffsynth.models.demucs.HTDemucs",
1033
+ },
1034
+ ]
1035
+
1036
+ image_metrics_series = [
1037
+ {
1038
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="PickScore/model.safetensors")
1039
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="CLIP-ViT-H-14-laion2B-s32B-b79K/model.safetensors")
1040
+ "model_hash": "b5e2c0bfcbf4085ccdb2feb8f0ba408a",
1041
+ "model_name": "image_metrics_clip_hf",
1042
+ "model_class": "diffsynth.models.clip.ImageMetricsCLIPModel",
1043
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsCLIPStateDictConverter",
1044
+ },
1045
+ {
1046
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="HPSv2/model.safetensors")
1047
+ "model_hash": "f79e72cec8ae5a540cff0304bfb21b00",
1048
+ "model_name": "image_metrics_hpsv2",
1049
+ "model_class": "diffsynth.models.clip.ImageMetricsCLIPModel",
1050
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsOpenCLIPStateDictConverter",
1051
+ },
1052
+ {
1053
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="HPSv3/model.safetensors")
1054
+ "model_hash": "5655d9cde15b759cfeefe7432d7a912c",
1055
+ "model_name": "image_metrics_hpsv3",
1056
+ "model_class": "diffsynth.models.hpsv3.HPSv3Qwen2VLRewardModel",
1057
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsHPSv3StateDictConverter",
1058
+ "extra_kwargs": {"vocab_size": 151658, "output_dim": 2, "reward_token": "special", "rm_head_type": "ranknet"},
1059
+ },
1060
+ {
1061
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="ImageReward/model.safetensors")
1062
+ "model_hash": "b3cc8e10b76ca98cde653daa5cf63139",
1063
+ "model_name": "image_metrics_image_reward",
1064
+ "model_class": "diffsynth.models.image_reward.ImageRewardModel",
1065
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsImageRewardStateDictConverter",
1066
+ },
1067
+ {
1068
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="Aesthetic/model.safetensors")
1069
+ "model_hash": "306981222ec94302794e07cf676c84cc",
1070
+ "model_name": "image_metrics_aesthetic",
1071
+ "model_class": "diffsynth.models.aesthetic.AestheticModel",
1072
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsAestheticStateDictConverter",
1073
+ },
1074
+ {
1075
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="FID/model.safetensors")
1076
+ "model_hash": "d4e9549be726259b444d1f62db4ce413",
1077
+ "model_name": "image_metrics_fid_inception",
1078
+ "model_class": "diffsynth.models.fid.FIDInceptionModel",
1079
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsFIDStateDictConverter",
1080
+ },
1081
+ {
1082
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="BioCLIPv2/open_clip_model.safetensors")
1083
+ "model_hash": "3a020a3e47afb7c5e21c52f2d0692c09",
1084
+ "model_name": "image_metrics_bioclip_v2",
1085
+ "model_class": "diffsynth.models.bioclip.BioCLIPv2Model",
1086
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsOpenCLIPStateDictConverter",
1087
+ },
1088
+ {
1089
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="LPIPS/alexnet.safetensors")
1090
+ "model_hash": "08a75c660c9b2e775c530a0955857f1f",
1091
+ "model_name": "image_metrics_lpips_alex",
1092
+ "model_class": "diffsynth.models.lpips.LPIPSModel",
1093
+ "extra_kwargs": {"net": "alex"},
1094
+ },
1095
+ {
1096
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="LPIPS/vgg.safetensors")
1097
+ "model_hash": "5740953aaa8aba2ecd9b9c23da813591",
1098
+ "model_name": "image_metrics_lpips_vgg",
1099
+ "model_class": "diffsynth.models.lpips.LPIPSModel",
1100
+ "extra_kwargs": {"net": "vgg"},
1101
+ },
1102
+ {
1103
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="LPIPS/squeezenet.safetensors")
1104
+ "model_hash": "ff994b70a30599287a332105396d5004",
1105
+ "model_name": "image_metrics_lpips_squeeze",
1106
+ "model_class": "diffsynth.models.lpips.LPIPSModel",
1107
+ "extra_kwargs": {"net": "squeeze"},
1108
+ },
1109
+ {
1110
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="UnifiedReward-2.0-qwen35-9b/model-*.safetensors")
1111
+ "model_hash": "f9786d06eca5c0f1ece89843b2c4cc66",
1112
+ "model_name": "image_metrics_unified_reward_2",
1113
+ "model_class": "diffsynth.models.unified_reward_2.UnifiedReward2Qwen35ForConditionalGeneration",
1114
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsUnifiedRewardStateDictConverter",
1115
+ "extra_kwargs": {"variant": "qwen35_9b"},
1116
+ },
1117
+ {
1118
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="Qwen-Image-Bench/model-*.safetensors")
1119
+ "model_hash": "ff4ad0463675e96738483611f6dd551b",
1120
+ "model_name": "image_metrics_qwen_image_bench",
1121
+ "model_class": "diffsynth.models.qwen_image_bench.QwenImageBenchQwen35ForConditionalGeneration",
1122
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsUnifiedRewardStateDictConverter",
1123
+ "extra_kwargs": {"variant": "qwen35"},
1124
+ },
1027
1125
  ]
1028
1126
 
1029
1127
  hidream_o1_image_series = [
@@ -1035,7 +1133,38 @@ hidream_o1_image_series = [
1035
1133
  },
1036
1134
  ]
1037
1135
 
1136
+ ideogram4_series = [
1137
+ {
1138
+ # Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="transformer/diffusion_pytorch_model.safetensors")
1139
+ "model_hash": "6f56a1d28667f2ff98e1c79af88a7516",
1140
+ "model_name": "ideogram4_dit",
1141
+ "model_class": "diffsynth.models.ideogram4_dit.Ideogram4DiT",
1142
+ "extra_kwargs": {"config": {"emb_dim": 4608, "num_layers": 34, "num_heads": 18, "intermediate_size": 12288, "adanln_dim": 512, "in_channels": 128, "llm_features_dim": 53248, "rope_theta": 5000000, "mrope_section": [24, 20, 20], "norm_eps": 1e-05}, "keep_original_dtype": True},
1143
+ },
1144
+ {
1145
+ # Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="text_encoder/model.safetensors")
1146
+ "model_hash": "6d72a86d1027baff87e2cf8fc523aab1",
1147
+ "model_name": "ideogram4_text_encoder",
1148
+ "model_class": "diffsynth.models.ideogram4_text_encoder.Ideogram4TextEncoder",
1149
+ "extra_kwargs": {"keep_original_dtype": True},
1150
+ },
1151
+ {
1152
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ideogram-4-bf16-repackage", origin_file_pattern="transformer/diffusion_pytorch_model.safetensors")
1153
+ "model_hash": "291b300b11c8c8e11978bd85a9c5f80c",
1154
+ "model_name": "ideogram4_dit",
1155
+ "model_class": "diffsynth.models.ideogram4_dit.Ideogram4DiT",
1156
+ "extra_kwargs": {"config": {"emb_dim": 4608, "num_layers": 34, "num_heads": 18, "intermediate_size": 12288, "adanln_dim": 512, "in_channels": 128, "llm_features_dim": 53248, "rope_theta": 5000000, "mrope_section": [24, 20, 20], "norm_eps": 1e-05}},
1157
+ },
1158
+ {
1159
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ideogram-4-bf16-repackage", origin_file_pattern="text_encoder/model.safetensors")
1160
+ "model_hash": "6a269892c0757aacd46bd41b8d5a7aef",
1161
+ "model_name": "ideogram4_text_encoder",
1162
+ "model_class": "diffsynth.models.ideogram4_text_encoder.Ideogram4TextEncoder",
1163
+ },
1164
+ ]
1165
+
1038
1166
  MODEL_CONFIGS = (
1039
1167
  stable_diffusion_xl_series + stable_diffusion_series + qwen_image_series + wan_series + flux_series + flux2_series + ernie_image_series
1040
1168
  + z_image_series + ltx2_series + anima_series + mova_series + joyai_image_series + ace_step_series + hidream_o1_image_series
1169
+ + image_metrics_series + ideogram4_series
1041
1170
  )
@@ -380,6 +380,19 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
380
380
  "diffsynth.models.hidream_o1_image_dit.Qwen3VLTextRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
381
381
  "diffsynth.models.hidream_o1_image_dit.Qwen3VLVisionModel": "diffsynth.core.vram.layers.AutoWrappedModule",
382
382
  },
383
+ "diffsynth.models.ideogram4_dit.Ideogram4DiT": {
384
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
385
+ "diffsynth.models.ideogram4_dit.Ideogram4RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
386
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
387
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
388
+ },
389
+ "diffsynth.models.ideogram4_text_encoder.Ideogram4TextEncoder": {
390
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
391
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
392
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
393
+ "transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLTextRotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
394
+ "transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLTextRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
395
+ },
383
396
  }
384
397
 
385
398
  def QwenImageTextEncoder_Module_Map_Updater():
@@ -0,0 +1,191 @@
1
+ import torch, os, inspect
2
+ from einops import rearrange, repeat
3
+
4
+
5
# Optional attention backends. Each probe only records whether the package can
# be imported; a missing package simply disables that backend.
try:
    import flash_attn_interface
    FLASH_ATTN_3_AVAILABLE = True
except ModuleNotFoundError:
    FLASH_ATTN_3_AVAILABLE = False

try:
    import flash_attn
    FLASH_ATTN_2_AVAILABLE = True
except ModuleNotFoundError:
    FLASH_ATTN_2_AVAILABLE = False

try:
    from sageattention import sageattn
    SAGE_ATTN_AVAILABLE = True
except ModuleNotFoundError:
    SAGE_ATTN_AVAILABLE = False

try:
    import xformers.ops as xops
    XFORMERS_AVAILABLE = True
except ModuleNotFoundError:
    XFORMERS_AVAILABLE = False

# Detect whether the installed torch accepts the `enable_gqa` keyword of
# `scaled_dot_product_attention` by looking at its signature.
try:
    TORCH_SUPPORT_GQA = "enable_gqa" in inspect.signature(torch.nn.functional.scaled_dot_product_attention).parameters
except Exception:
    # Signature introspection failed; fall back to the manual K/V repetition
    # path. `Exception` (rather than a bare `except`) keeps KeyboardInterrupt
    # and SystemExit propagating.
    TORCH_SUPPORT_GQA = False
36
+
37
+
38
def initialize_attention_priority():
    """Return the name of the attention backend this module should use.

    If the `DIFFSYNTH_ATTENTION_IMPLEMENTATION` environment variable is set,
    its lower-cased value is returned as-is (it is not validated here).
    Otherwise the first available backend is chosen in the order
    FlashAttention-3, FlashAttention-2, SageAttention, xFormers, with
    "torch" as the final fallback.
    """
    override = os.environ.get('DIFFSYNTH_ATTENTION_IMPLEMENTATION')
    if override is not None:
        return override.lower()
    if FLASH_ATTN_3_AVAILABLE:
        return "flash_attention_3"
    if FLASH_ATTN_2_AVAILABLE:
        return "flash_attention_2"
    if SAGE_ATTN_AVAILABLE:
        return "sage_attention"
    if XFORMERS_AVAILABLE:
        return "xformers"
    return "torch"
51
+
52
+
53
+ ATTENTION_IMPLEMENTATION = initialize_attention_priority()
54
+
55
+
56
def rearrange_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", required_in_pattern="b n s d", dims=None):
    """Bring q, k and v into the einops layout a backend expects.

    Each tensor is rearranged from its own pattern to `required_in_pattern`;
    a tensor whose pattern already equals the required one is returned
    untouched. `dims` holds extra axis sizes forwarded to `rearrange`.

    Returns:
        The tuple `(q, k, v)` in the required layout.
    """
    axis_sizes = dims if dims is not None else {}
    return tuple(
        tensor if pattern == required_in_pattern
        else rearrange(tensor, f"{pattern} -> {required_in_pattern}", **axis_sizes)
        for tensor, pattern in ((q, q_pattern), (k, k_pattern), (v, v_pattern))
    )
65
+
66
+
67
def rearrange_out(out: torch.Tensor, out_pattern="b n s d", required_out_pattern="b n s d", dims=None):
    """Convert a backend's output layout back to the caller's layout.

    `out` is rearranged from `required_out_pattern` (what the backend
    produced) to `out_pattern` (what the caller asked for). When both
    patterns are equal the tensor is returned unchanged. `dims` holds extra
    axis sizes forwarded to `rearrange`.
    """
    if out_pattern == required_out_pattern:
        return out
    axis_sizes = dims if dims is not None else {}
    return rearrange(out, f"{required_out_pattern} -> {out_pattern}", **axis_sizes)
72
+
73
+
74
def torch_sdpa(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, attn_mask=None, scale=None, is_causal=False):
    """Run attention through `torch.nn.functional.scaled_dot_product_attention`.

    Inputs are first rearranged to "b n s d". If q has a different number of
    heads (axis 1) than k or v, the call is treated as grouped-query
    attention: `enable_gqa=True` is used when torch supports it, otherwise
    k and v heads are repeated to match q. The result is rearranged to
    `out_pattern` before being returned.
    """
    layout = "b n s d"
    q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, layout, dims)
    num_q_heads = q.shape[1]
    extra_kwargs = {}
    if num_q_heads != k.shape[1] or num_q_heads != v.shape[1]:
        # Grouped Query Attention
        if TORCH_SUPPORT_GQA:
            extra_kwargs["enable_gqa"] = True
        else:
            # Older torch versions do not accept `enable_gqa`, so expand the
            # K/V heads explicitly instead.
            k = repeat(k, "b n s d -> b (n m) s d", m=num_q_heads // k.shape[1])
            v = repeat(v, "b n s d -> b (n m) s d", m=num_q_heads // v.shape[1])
    attn_out = torch.nn.functional.scaled_dot_product_attention(
        q, k, v, attn_mask, scale=scale, is_causal=is_causal, **extra_kwargs
    )
    return rearrange_out(attn_out, out_pattern, layout, dims)
90
+
91
+
92
def torch_sdpa_sliding_window(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, sliding_window: int, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None):
    """Sliding-window attention built on `torch_sdpa`, processed in chunks.

    After rearranging to "b n s d", the query sequence is split into chunks
    of `sliding_window` positions. Each chunk attends only to the keys/values
    within `sliding_window` positions before its start and after its end,
    and an additive mask removes every pair whose absolute position
    difference exceeds `sliding_window`. The result is rearranged to
    `out_pattern` before being returned.
    """
    layout = "b n s d"
    q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, layout, dims)

    _, _, seq_len, _ = q.shape
    window = sliding_window
    num_chunks = (seq_len + window - 1) // window

    result = torch.empty_like(q)
    dtype, device = q.dtype, q.device
    # Most negative finite value of the dtype, used as the additive mask bias.
    masked_bias = torch.finfo(dtype).min

    for chunk_idx in range(num_chunks):
        q_lo = chunk_idx * window
        q_hi = min(q_lo + window, seq_len)
        # Keys further than `window` from every query in this chunk can be skipped.
        kv_lo = max(0, q_lo - window)
        kv_hi = min(seq_len, q_hi + window)

        q_pos = torch.arange(q_lo, q_hi, device=device)
        kv_pos = torch.arange(kv_lo, kv_hi, device=device)
        in_window = (q_pos.unsqueeze(1) - kv_pos.unsqueeze(0)).abs() <= window

        bias = torch.zeros(q_hi - q_lo, kv_hi - kv_lo, dtype=dtype, device=device)
        bias.masked_fill_(~in_window, masked_bias)
        # Add leading batch and head axes of size 1 so the mask broadcasts.
        bias = bias.unsqueeze(0).unsqueeze(0)

        result[:, :, q_lo:q_hi, :] = torch_sdpa(
            q[:, :, q_lo:q_hi, :],
            k[:, :, kv_lo:kv_hi, :],
            v[:, :, kv_lo:kv_hi, :],
            attn_mask=bias,
            scale=scale,
        )

    return rearrange_out(result, out_pattern, layout, dims)
134
+
135
+
136
def flash_attention_3(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None, is_causal=False, window_size=None):
    """Run attention through FlashAttention-3 (`flash_attn_interface`).

    Inputs are rearranged to "b s n d" and the result is rearranged to
    `out_pattern`.

    Args:
        scale: Softmax scale forwarded as `softmax_scale`.
        is_causal: Forwarded as `causal`, matching `flash_attention_2`.
        window_size: Optional window radius; passed as the symmetric pair
            `(window_size, window_size)`, or `(-1, -1)` when None.
    """
    required_in_pattern, required_out_pattern = "b s n d", "b s n d"
    q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
    window_size = (window_size, window_size) if window_size is not None else (-1, -1)
    # `is_causal` must be forwarded: without it a causal request silently
    # runs as full bidirectional attention.
    out = flash_attn_interface.flash_attn_func(q, k, v, softmax_scale=scale, causal=is_causal, window_size=window_size)
    # The call may return a tuple; in that case the first element is taken
    # as the attention output.
    if isinstance(out, tuple):
        out = out[0]
    out = rearrange_out(out, out_pattern, required_out_pattern, dims)
    return out
145
+
146
+
147
def flash_attention_2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None, is_causal=False, window_size=None):
    """Run attention through FlashAttention-2 (`flash_attn.flash_attn_func`).

    Inputs are rearranged to "b s n d" and the result is rearranged to
    `out_pattern`. `scale` is forwarded as `softmax_scale`, `is_causal` as
    `causal`, and `window_size` as the symmetric pair
    `(window_size, window_size)`, or `(-1, -1)` when it is None.
    """
    layout = "b s n d"
    q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, layout, dims)
    if window_size is None:
        window = (-1, -1)
    else:
        window = (window_size, window_size)
    attn_out = flash_attn.flash_attn_func(q, k, v, softmax_scale=scale, causal=is_causal, window_size=window)
    return rearrange_out(attn_out, out_pattern, layout, dims)
154
+
155
+
156
def sage_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None):
    """Run attention through SageAttention (`sageattn`).

    Inputs are rearranged to "b n s d", `scale` is forwarded as `sm_scale`,
    and the result is rearranged to `out_pattern`.
    """
    layout = "b n s d"
    q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, layout, dims)
    attn_out = sageattn(q, k, v, sm_scale=scale)
    return rearrange_out(attn_out, out_pattern, layout, dims)
162
+
163
+
164
def xformers_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None):
    """Run attention through xFormers (`memory_efficient_attention`).

    Inputs are rearranged to "b s n d", `scale` is forwarded unchanged, and
    the result is rearranged to `out_pattern`.
    """
    layout = "b s n d"
    q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, layout, dims)
    attn_out = xops.memory_efficient_attention(q, k, v, scale=scale)
    return rearrange_out(attn_out, out_pattern, layout, dims)
170
+
171
+
172
def attention_forward(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, attn_mask=None, scale=None, is_causal=False, compatibility_mode=False, window_size=None):
    """Dispatch an attention call to the backend selected for this module.

    The torch implementation is used when `compatibility_mode` is set, when
    an `attn_mask` is given, or when the selected backend is "torch". It is
    also used for the SageAttention and xFormers backends whenever
    `window_size` or `is_causal` is requested, because those wrappers accept
    neither argument. On the torch path a `window_size` selects the chunked
    sliding-window implementation.

    Raises:
        AssertionError: if `window_size` is combined with `is_causal` or
            `attn_mask` on the torch path.
        NotImplementedError: if the selected backend name is not recognised.
    """
    layout = (q_pattern, k_pattern, v_pattern, out_pattern, dims)
    backend = ATTENTION_IMPLEMENTATION

    use_torch = compatibility_mode or (attn_mask is not None) or backend == "torch"
    if not use_torch and backend in ("sage_attention", "xformers"):
        # These wrappers take neither a window nor a causal flag, so such
        # requests are served by the torch path instead.
        use_torch = window_size is not None or is_causal

    if use_torch:
        if window_size is None:
            return torch_sdpa(q, k, v, *layout, attn_mask=attn_mask, scale=scale, is_causal=is_causal)
        # Sliding Window Attention is not compatible with `is_causal` and `attn_mask`.
        assert is_causal == False and attn_mask is None
        return torch_sdpa_sliding_window(q, k, v, window_size, *layout, scale=scale)

    if backend == "flash_attention_3":
        return flash_attention_3(q, k, v, *layout, scale=scale, is_causal=is_causal, window_size=window_size)
    if backend == "flash_attention_2":
        return flash_attention_2(q, k, v, *layout, scale=scale, is_causal=is_causal, window_size=window_size)
    if backend == "sage_attention":
        return sage_attention(q, k, v, *layout, scale=scale)
    if backend == "xformers":
        return xformers_attention(q, k, v, *layout, scale=scale)
    raise NotImplementedError("No available attention implementation.")
@@ -2,6 +2,7 @@ import math, warnings
2
2
  import torch, torchvision, imageio, os
3
3
  import imageio.v3 as iio
4
4
  from PIL import Image
5
+ from einops import repeat
5
6
 
6
7
 
7
8
  class DataProcessingPipeline:
@@ -283,24 +284,27 @@ class LoadAudioWithTorchaudio(DataProcessingOperator, FrameSamplerByRateMixin):
283
284
 
284
285
  class LoadPureAudioWithTorchaudio(DataProcessingOperator):
285
286
 
286
- def __init__(self, target_sample_rate=None, target_duration=None):
287
+ def __init__(self, target_sample_rate=None, max_audio_duration=None, padding=False, channels=2):
287
288
  self.target_sample_rate = target_sample_rate
288
- self.target_duration = target_duration
289
+ self.max_audio_duration = max_audio_duration
289
290
  self.resample = True if target_sample_rate is not None else False
291
+ self.padding = padding
290
292
  from diffsynth.utils.data.audio import read_audio
291
293
  self.audio_loader = read_audio
292
294
 
293
295
  def __call__(self, data: str):
294
296
  try:
295
297
  waveform, sample_rate = self.audio_loader(data, resample=self.resample, resample_rate=self.target_sample_rate)
296
- if self.target_duration is not None:
297
- target_samples = int(self.target_duration * sample_rate)
298
+ if self.max_audio_duration is not None:
299
+ target_samples = int(self.max_audio_duration * sample_rate)
298
300
  current_samples = waveform.shape[-1]
299
301
  if current_samples > target_samples:
300
302
  waveform = waveform[..., :target_samples]
301
- elif current_samples < target_samples:
303
+ elif current_samples < target_samples and self.padding:
302
304
  padding = target_samples - current_samples
303
305
  waveform = torch.nn.functional.pad(waveform, (0, padding))
306
+ if waveform.shape[0] == 1:
307
+ waveform = repeat(waveform, "C L -> (N C) L", N=2)
304
308
  return waveform, sample_rate
305
309
  except Exception as e:
306
310
  print(f"Cannot load audio in {data} due to {e}. The audio will be `None`.")