diffsynth 2.0.13__tar.gz → 2.0.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (222) hide show
  1. {diffsynth-2.0.13 → diffsynth-2.0.14}/PKG-INFO +1 -1
  2. {diffsynth-2.0.13 → diffsynth-2.0.14}/README.md +4 -0
  3. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/configs/model_configs.py +9 -12
  4. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/configs/vram_management_module_maps.py +13 -0
  5. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/attention/attention.py +9 -1
  6. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/data/operators.py +4 -1
  7. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/flow_match.py +3 -3
  8. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ideogram4_dit.py +17 -4
  9. diffsynth-2.0.14/diffsynth/models/ideogram4_vae.py +74 -0
  10. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/ideogram4.py +18 -17
  11. diffsynth-2.0.14/diffsynth/utils/demucs/__init__.py +21 -0
  12. diffsynth-2.0.14/diffsynth/utils/dequantizer/__init__.py +15 -0
  13. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth.egg-info/PKG-INFO +1 -1
  14. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth.egg-info/SOURCES.txt +2 -0
  15. {diffsynth-2.0.13 → diffsynth-2.0.14}/pyproject.toml +1 -1
  16. diffsynth-2.0.13/diffsynth/models/ideogram4_vae.py +0 -517
  17. {diffsynth-2.0.13 → diffsynth-2.0.14}/LICENSE +0 -0
  18. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/__init__.py +0 -0
  19. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/configs/__init__.py +0 -0
  20. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/__init__.py +0 -0
  21. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/attention/__init__.py +0 -0
  22. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/data/__init__.py +0 -0
  23. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/data/unified_dataset.py +0 -0
  24. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/device/__init__.py +0 -0
  25. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/device/npu_compatible_device.py +0 -0
  26. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/gradient/__init__.py +0 -0
  27. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/gradient/gradient_checkpoint.py +0 -0
  28. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/loader/__init__.py +0 -0
  29. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/loader/config.py +0 -0
  30. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/loader/file.py +0 -0
  31. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/loader/model.py +0 -0
  32. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
  33. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/offload_training/__init__.py +0 -0
  34. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/offload_training/manager.py +0 -0
  35. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/offload_training/memory_buffer.py +0 -0
  36. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/offload_training/offloader.py +0 -0
  37. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/vram/__init__.py +0 -0
  38. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/vram/disk_map.py +0 -0
  39. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/vram/initialization.py +0 -0
  40. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/vram/layers.py +0 -0
  41. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/__init__.py +0 -0
  42. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/base_pipeline.py +0 -0
  43. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/ddim_scheduler.py +0 -0
  44. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/logger.py +0 -0
  45. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/loss.py +0 -0
  46. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/parsers.py +0 -0
  47. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/runner.py +0 -0
  48. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/template.py +0 -0
  49. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/training_module.py +0 -0
  50. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/__init__.py +0 -0
  51. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/aesthetic.py +0 -0
  52. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/base.py +0 -0
  53. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/bioclip.py +0 -0
  54. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/clip.py +0 -0
  55. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/fid.py +0 -0
  56. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/hpsv2.py +0 -0
  57. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/hpsv3.py +0 -0
  58. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/image_reward.py +0 -0
  59. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/lpips.py +0 -0
  60. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/pickscore.py +0 -0
  61. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/qwen_image_bench.py +0 -0
  62. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/unified_reward_2.py +0 -0
  63. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/unified_reward_edit.py +0 -0
  64. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ace_step_conditioner.py +0 -0
  65. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ace_step_dit.py +0 -0
  66. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ace_step_residual_fsq.py +0 -0
  67. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ace_step_text_encoder.py +0 -0
  68. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ace_step_tokenizer.py +0 -0
  69. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ace_step_vae.py +0 -0
  70. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/aesthetic.py +0 -0
  71. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/anima_dit.py +0 -0
  72. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/bioclip.py +0 -0
  73. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/clip.py +0 -0
  74. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/demucs.py +0 -0
  75. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/dinov3_image_encoder.py +0 -0
  76. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ernie_image_dit.py +0 -0
  77. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ernie_image_text_encoder.py +0 -0
  78. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/fid.py +0 -0
  79. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux2_dit.py +0 -0
  80. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux2_text_encoder.py +0 -0
  81. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux2_vae.py +0 -0
  82. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_controlnet.py +0 -0
  83. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_dit.py +0 -0
  84. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_infiniteyou.py +0 -0
  85. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_ipadapter.py +0 -0
  86. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_lora_encoder.py +0 -0
  87. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_lora_patcher.py +0 -0
  88. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_text_encoder_clip.py +0 -0
  89. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_text_encoder_t5.py +0 -0
  90. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_vae.py +0 -0
  91. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_value_control.py +0 -0
  92. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/general_modules.py +0 -0
  93. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/hidream_common.py +0 -0
  94. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/hidream_o1_image_dit.py +0 -0
  95. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/hpsv2.py +0 -0
  96. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/hpsv3.py +0 -0
  97. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ideogram4_text_encoder.py +0 -0
  98. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/image_reward.py +0 -0
  99. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/joyai_image_dit.py +0 -0
  100. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/joyai_image_text_encoder.py +0 -0
  101. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/longcat_video_dit.py +0 -0
  102. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/lpips.py +0 -0
  103. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ltx2_audio_vae.py +0 -0
  104. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ltx2_common.py +0 -0
  105. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ltx2_dit.py +0 -0
  106. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ltx2_text_encoder.py +0 -0
  107. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ltx2_upsampler.py +0 -0
  108. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ltx2_video_vae.py +0 -0
  109. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/model_loader.py +0 -0
  110. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/mova_audio_dit.py +0 -0
  111. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/mova_audio_vae.py +0 -0
  112. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/mova_dual_tower_bridge.py +0 -0
  113. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/nexus_gen.py +0 -0
  114. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/nexus_gen_ar_model.py +0 -0
  115. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/nexus_gen_projector.py +0 -0
  116. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/pickscore.py +0 -0
  117. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/qwen_image_bench.py +0 -0
  118. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/qwen_image_controlnet.py +0 -0
  119. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/qwen_image_dit.py +0 -0
  120. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/qwen_image_image2lora.py +0 -0
  121. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/qwen_image_text_encoder.py +0 -0
  122. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/qwen_image_vae.py +0 -0
  123. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/sd_text_encoder.py +0 -0
  124. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/siglip2_image_encoder.py +0 -0
  125. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_text_encoder.py +0 -0
  126. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_unet.py +0 -0
  127. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_vae.py +0 -0
  128. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_xl_text_encoder.py +0 -0
  129. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_xl_unet.py +0 -0
  130. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/step1x_connector.py +0 -0
  131. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/step1x_text_encoder.py +0 -0
  132. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/unified_reward_2.py +0 -0
  133. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/unified_reward_edit.py +0 -0
  134. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_animate_adapter.py +0 -0
  135. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_camera_controller.py +0 -0
  136. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_dit.py +0 -0
  137. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_dit_s2v.py +0 -0
  138. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_image_encoder.py +0 -0
  139. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_mot.py +0 -0
  140. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_motion_controller.py +0 -0
  141. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_text_encoder.py +0 -0
  142. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_vace.py +0 -0
  143. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_vae.py +0 -0
  144. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wantodance.py +0 -0
  145. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wav2vec.py +0 -0
  146. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/z_image_controlnet.py +0 -0
  147. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/z_image_dit.py +0 -0
  148. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/z_image_image2lora.py +0 -0
  149. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/z_image_text_encoder.py +0 -0
  150. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/ace_step.py +0 -0
  151. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/anima_image.py +0 -0
  152. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/ernie_image.py +0 -0
  153. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/flux2_image.py +0 -0
  154. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/flux_image.py +0 -0
  155. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/hidream_o1_image.py +0 -0
  156. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/joyai_image.py +0 -0
  157. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/ltx2_audio_video.py +0 -0
  158. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/mova_audio_video.py +0 -0
  159. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/qwen_image.py +0 -0
  160. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/stable_diffusion.py +0 -0
  161. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/stable_diffusion_xl.py +0 -0
  162. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/wan_video.py +0 -0
  163. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/z_image.py +0 -0
  164. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/controlnet/__init__.py +0 -0
  165. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/controlnet/annotator.py +0 -0
  166. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
  167. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/data/__init__.py +0 -0
  168. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/data/audio.py +0 -0
  169. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/data/audio_video.py +0 -0
  170. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/data/media_io_ltx2.py +0 -0
  171. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/lora/__init__.py +0 -0
  172. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/lora/flux.py +0 -0
  173. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/lora/general.py +0 -0
  174. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/lora/merge.py +0 -0
  175. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/lora/reset_rank.py +0 -0
  176. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/ses/__init__.py +0 -0
  177. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/ses/ses.py +0 -0
  178. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
  179. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ace_step_conditioner.py +0 -0
  180. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ace_step_dit.py +0 -0
  181. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ace_step_text_encoder.py +0 -0
  182. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ace_step_tokenizer.py +0 -0
  183. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
  184. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/dino_v3.py +0 -0
  185. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ernie_image_text_encoder.py +0 -0
  186. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
  187. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
  188. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
  189. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
  190. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
  191. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
  192. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
  193. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
  194. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/image_metrics.py +0 -0
  195. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/joyai_image_text_encoder.py +0 -0
  196. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +0 -0
  197. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
  198. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +0 -0
  199. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +0 -0
  200. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
  201. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
  202. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
  203. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/stable_diffusion_text_encoder.py +0 -0
  204. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/stable_diffusion_vae.py +0 -0
  205. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/stable_diffusion_xl_text_encoder.py +0 -0
  206. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
  207. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
  208. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
  209. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
  210. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
  211. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
  212. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
  213. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
  214. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/z_image_dit.py +0 -0
  215. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
  216. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/xfuser/__init__.py +0 -0
  217. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/xfuser/xdit_context_parallel.py +0 -0
  218. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/version.py +0 -0
  219. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth.egg-info/dependency_links.txt +0 -0
  220. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth.egg-info/requires.txt +0 -0
  221. {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth.egg-info/top_level.txt +0 -0
  222. {diffsynth-2.0.13 → diffsynth-2.0.14}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffsynth
3
- Version: 2.0.13
3
+ Version: 2.0.14
4
4
  Summary: Enjoy the magic of Diffusion models!
5
5
  Author: ModelScope Team
6
6
  License: Apache-2.0
@@ -34,6 +34,8 @@ We believe that a well-developed open-source code framework can lower the thresh
34
34
 
35
35
  > Currently, this project has a small development team, with most of the work handled by [Artiprocher](https://github.com/Artiprocher) and [mi804](https://github.com/mi804). As a result, new features will be developed relatively slowly, and our capacity to respond to and resolve issues is limited. We apologize for this and ask for developers' understanding.
36
36
 
37
+ - **June 16, 2026**: We have added a new Template model for ACE-Step: [vocals2music](https://www.modelscope.cn/models/DiffSynth-Studio/acestep15xlsft-vocals2music). For more details, please refer to the [documentation](/docs/zh/Model_Details/ACE-Step.md) and [example code](/examples/ace_step/).
38
+
37
39
  - **June 15, 2026**: We have open-sourced Image-to-LoRA V2, compressing the hours-long training process for image style LoRAs into a single model inference step, thereby exploring a new paradigm for LoRA model training. The [technical report](https://arxiv.org/abs/2606.13809) has been released. This release includes three models:
38
40
  * [DiffSynth-Studio/ZImage-i2L-v2](https://modelscope.cn/models/DiffSynth-Studio/ZImage-i2L-v2): Adapted for the Z-Image model
39
41
  * [DiffSynth-Studio/KleinBase4B-i2L-v2](https://modelscope.cn/models/DiffSynth-Studio/KleinBase4B-i2L-v2): Adapted for the FLUX.2-klein-base-4B model
@@ -1036,6 +1038,7 @@ Example code for Ideogram 4 is available at: [/examples/ideogram4/](/examples/id
1036
1038
  | Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
1037
1039
  |-|-|-|-|-|-|-|
1038
1040
  |[ideogram-ai/ideogram-4-fp8](https://www.modelscope.cn/models/ideogram-ai/ideogram-4-fp8)|[code](/examples/ideogram4/model_inference/ideogram-4-fp8.py)|-|-|-|-|-|
1041
+ |[DiffSynth-Studio/ideogram-4-bf16-repackage](https://www.modelscope.cn/models/DiffSynth-Studio/ideogram-4-bf16-repackage)|[code](/examples/ideogram4/model_inference/ideogram-4-bf16-repackage.py)|[code](/examples/ideogram4/model_inference_low_vram/ideogram-4-bf16-repackage.py)|[code](/examples/ideogram4/model_training/full/Ideogram-4-bf16-repackage.sh)|-|[code](/examples/ideogram4/model_training/lora/Ideogram-4-bf16-repackage.sh)|[code](/examples/ideogram4/model_training/validate_lora/Ideogram-4-bf16-repackage.py)|
1039
1042
 
1040
1043
  </details>
1041
1044
 
@@ -1396,6 +1399,7 @@ Example code for ACE-Step is available at: [/examples/ace_step/](/examples/ace_s
1396
1399
  |[ACE-Step/acestep-v15-xl-base](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-base)|[code](/examples/ace_step/model_inference/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-base.py)|
1397
1400
  |[ACE-Step/acestep-v15-xl-sft](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-sft)|[code](/examples/ace_step/model_inference/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-sft.py)|
1398
1401
  |[ACE-Step/acestep-v15-xl-turbo](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-turbo)|[code](/examples/ace_step/model_inference/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-turbo.py)|
1402
+ |[DiffSynth-Studio/acestep15xlsft-vocals2music](https://www.modelscope.cn/models/DiffSynth-Studio/acestep15xlsft-vocals2music)|[code](/examples/ace_step/model_inference/acestep15xlsft-vocals2music.py)|[code](/examples/ace_step/model_inference_low_vram/acestep15xlsft-vocals2music.py)|[code](/examples/ace_step/model_training/full/acestep15xlsft-vocals2music.sh)|[code](/examples/ace_step/model_training/validate_full/acestep15xlsft-vocals2music.py)|-|-|
1399
1403
 
1400
1404
  </details>
1401
1405
 
@@ -1149,20 +1149,17 @@ ideogram4_series = [
1149
1149
  "extra_kwargs": {"keep_original_dtype": True},
1150
1150
  },
1151
1151
  {
1152
- # Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
1153
- "model_hash": "c54288e3ee12ca215898840682337b95",
1154
- "model_name": "ideogram4_vae_encoder",
1155
- "model_class": "diffsynth.models.ideogram4_vae.Ideogram4VAEEncoder",
1156
- "state_dict_converter": "diffsynth.models.ideogram4_vae.Ideogram4VAEEncoderStateDictConverter",
1157
- "extra_kwargs": {"keep_original_dtype": True},
1152
+ # Example: ModelConfig(model_id="DiffSynth-Studio/ideogram-4-bf16-repackage", origin_file_pattern="transformer/diffusion_pytorch_model.safetensors")
1153
+ "model_hash": "291b300b11c8c8e11978bd85a9c5f80c",
1154
+ "model_name": "ideogram4_dit",
1155
+ "model_class": "diffsynth.models.ideogram4_dit.Ideogram4DiT",
1156
+ "extra_kwargs": {"config": {"emb_dim": 4608, "num_layers": 34, "num_heads": 18, "intermediate_size": 12288, "adanln_dim": 512, "in_channels": 128, "llm_features_dim": 53248, "rope_theta": 5000000, "mrope_section": [24, 20, 20], "norm_eps": 1e-05}},
1158
1157
  },
1159
1158
  {
1160
- # Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
1161
- "model_hash": "c54288e3ee12ca215898840682337b95",
1162
- "model_name": "ideogram4_vae_decoder",
1163
- "model_class": "diffsynth.models.ideogram4_vae.Ideogram4VAEDecoder",
1164
- "state_dict_converter": "diffsynth.models.ideogram4_vae.Ideogram4VAEDecoderStateDictConverter",
1165
- "extra_kwargs": {"keep_original_dtype": True},
1159
+ # Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="text_encoder/model.safetensors")
1160
+ "model_hash": "6a269892c0757aacd46bd41b8d5a7aef",
1161
+ "model_name": "ideogram4_text_encoder",
1162
+ "model_class": "diffsynth.models.ideogram4_text_encoder.Ideogram4TextEncoder",
1166
1163
  },
1167
1164
  ]
1168
1165
 
@@ -380,6 +380,19 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
380
380
  "diffsynth.models.hidream_o1_image_dit.Qwen3VLTextRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
381
381
  "diffsynth.models.hidream_o1_image_dit.Qwen3VLVisionModel": "diffsynth.core.vram.layers.AutoWrappedModule",
382
382
  },
383
+ "diffsynth.models.ideogram4_dit.Ideogram4DiT": {
384
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
385
+ "diffsynth.models.ideogram4_dit.Ideogram4RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
386
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
387
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
388
+ },
389
+ "diffsynth.models.ideogram4_text_encoder.Ideogram4TextEncoder": {
390
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
391
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
392
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
393
+ "transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLTextRotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
394
+ "transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLTextRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
395
+ },
383
396
  }
384
397
 
385
398
  def QwenImageTextEncoder_Module_Map_Updater():
@@ -26,6 +26,14 @@ try:
26
26
  except ModuleNotFoundError:
27
27
  XFORMERS_AVAILABLE = False
28
28
 
29
+ try:
30
+ if "enable_gqa" in inspect.signature(torch.nn.functional.scaled_dot_product_attention).parameters:
31
+ TORCH_SUPPORT_GQA = True
32
+ else:
33
+ TORCH_SUPPORT_GQA = False
34
+ except:
35
+ TORCH_SUPPORT_GQA = False
36
+
29
37
 
30
38
  def initialize_attention_priority():
31
39
  if os.environ.get('DIFFSYNTH_ATTENTION_IMPLEMENTATION') is not None:
@@ -68,7 +76,7 @@ def torch_sdpa(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n
68
76
  q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
69
77
  if q.shape[1] != k.shape[1] or q.shape[1] != v.shape[1]:
70
78
  # Grouped Query Attention
71
- if "enable_gqa" in inspect.signature(torch.nn.functional.scaled_dot_product_attention).parameters:
79
+ if TORCH_SUPPORT_GQA:
72
80
  out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask, scale=scale, is_causal=is_causal, enable_gqa=True)
73
81
  else:
74
82
  # Older versions of torch do not support the `enable_gqa` argument.
@@ -2,6 +2,7 @@ import math, warnings
2
2
  import torch, torchvision, imageio, os
3
3
  import imageio.v3 as iio
4
4
  from PIL import Image
5
+ from einops import repeat
5
6
 
6
7
 
7
8
  class DataProcessingPipeline:
@@ -283,7 +284,7 @@ class LoadAudioWithTorchaudio(DataProcessingOperator, FrameSamplerByRateMixin):
283
284
 
284
285
  class LoadPureAudioWithTorchaudio(DataProcessingOperator):
285
286
 
286
- def __init__(self, target_sample_rate=None, max_audio_duration=None, padding=False):
287
+ def __init__(self, target_sample_rate=None, max_audio_duration=None, padding=False, channels=2):
287
288
  self.target_sample_rate = target_sample_rate
288
289
  self.max_audio_duration = max_audio_duration
289
290
  self.resample = True if target_sample_rate is not None else False
@@ -302,6 +303,8 @@ class LoadPureAudioWithTorchaudio(DataProcessingOperator):
302
303
  elif current_samples < target_samples and self.padding:
303
304
  padding = target_samples - current_samples
304
305
  waveform = torch.nn.functional.pad(waveform, (0, padding))
306
+ if waveform.shape[0] == 1:
307
+ waveform = repeat(waveform, "C L -> (N C) L", N=2)
305
308
  return waveform, sample_rate
306
309
  except Exception as e:
307
310
  print(f"Cannot load audio in {data} due to {e}. The audio will be `None`.")
@@ -214,7 +214,7 @@ class FlowMatchScheduler():
214
214
  logsnr_max = 18.0
215
215
  t_min = 1.0 / (1 + math.exp(0.5 * logsnr_max))
216
216
  t_max = 1.0 / (1 + math.exp(0.5 * logsnr_min))
217
- step_intervals = torch.linspace(0.0, 1.0, num_inference_steps + 1, dtype=torch.float64)
217
+ step_intervals = torch.linspace(0.0, denoising_strength, num_inference_steps + 1, dtype=torch.float64)
218
218
  sigmas = []
219
219
  for i in range(num_inference_steps + 1):
220
220
  z = torch.special.ndtri(step_intervals[i])
@@ -230,7 +230,7 @@ class FlowMatchScheduler():
230
230
  one_minus_t = one_minus_t * (sigma_start / one_minus_t[0])
231
231
  sigmas = sigmas.flip(dims=(0,))
232
232
  timesteps = sigmas[:-1]
233
- sigmas = 1 - sigmas
233
+ sigmas = (1 - sigmas)[:-1]
234
234
  return sigmas, timesteps
235
235
 
236
236
  @staticmethod
@@ -263,7 +263,7 @@ class FlowMatchScheduler():
263
263
 
264
264
  def set_training_weight(self):
265
265
  steps = 1000
266
- x = self.timesteps
266
+ x = self.sigmas * self.num_train_timesteps
267
267
  y = torch.exp(-2 * ((x - steps / 2) / steps) ** 2)
268
268
  y_shifted = y - y.min()
269
269
  bsmntw_weighing = y_shifted * (steps / y_shifted.sum())
@@ -5,6 +5,8 @@ import torch
5
5
  import torch.nn as nn
6
6
  import torch.nn.functional as F
7
7
 
8
+ from ..core.gradient import gradient_checkpoint_forward
9
+
8
10
  LLM_TOKEN_INDICATOR = 3
9
11
  OUTPUT_IMAGE_INDICATOR = 2
10
12
  IMAGE_POSITION_OFFSET = 65536
@@ -140,7 +142,7 @@ class Ideogram4MRoPE(nn.Module):
140
142
  pos = position_ids.permute(2, 0, 1).to(dtype=torch.float32)
141
143
  inv_freq = self.inv_freq.to(dtype=torch.float32)[None, None, :, None].expand(
142
144
  3, batch_size, -1, 1
143
- )
145
+ ).to(pos.device)
144
146
  freqs = inv_freq @ pos.unsqueeze(2)
145
147
  freqs = freqs.transpose(2, 3)
146
148
 
@@ -291,7 +293,7 @@ class Ideogram4EmbedScalar(nn.Module):
291
293
  scaled = 1e4 * (x - self.range_min) / (self.range_max - self.range_min)
292
294
  emb = _sinusoidal_embedding(scaled, self.dim)
293
295
  emb = emb.to(
294
- getattr(self.mlp_in, "compute_dtype", None) or self.mlp_in.weight.dtype
296
+ getattr(self.mlp_in, "compute_dtype", None) or getattr(self.mlp_in, "computation_dtype", None) or self.mlp_in.weight.dtype
295
297
  )
296
298
  emb = F.silu(self.mlp_in(emb))
297
299
  return self.mlp_out(emb)
@@ -375,6 +377,8 @@ class Ideogram4DiT(nn.Module):
375
377
  position_ids: torch.Tensor,
376
378
  segment_ids: torch.Tensor,
377
379
  indicator: torch.Tensor,
380
+ use_gradient_checkpointing: bool = False,
381
+ use_gradient_checkpointing_offload: bool = False,
378
382
  ) -> torch.Tensor:
379
383
  """Velocity prediction.
380
384
 
@@ -393,7 +397,7 @@ class Ideogram4DiT(nn.Module):
393
397
  assert in_channels == self.config.in_channels
394
398
 
395
399
  param_dtype = (
396
- getattr(self.input_proj, "compute_dtype", None) or self.input_proj.weight.dtype
400
+ getattr(self.input_proj, "compute_dtype", None) or getattr(self.input_proj, "computation_dtype", None) or self.input_proj.weight.dtype
397
401
  )
398
402
  x = x.to(param_dtype)
399
403
  t = t.to(param_dtype)
@@ -428,7 +432,16 @@ class Ideogram4DiT(nn.Module):
428
432
  sin = sin.to(h.dtype)
429
433
 
430
434
  for layer in self.layers:
431
- h = layer(h, segment_ids=segment_ids, cos=cos, sin=sin, adaln_input=adaln_input)
435
+ h = gradient_checkpoint_forward(
436
+ layer,
437
+ use_gradient_checkpointing=use_gradient_checkpointing,
438
+ use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
439
+ x=h,
440
+ segment_ids=segment_ids,
441
+ cos=cos,
442
+ sin=sin,
443
+ adaln_input=adaln_input,
444
+ )
432
445
 
433
446
  out = self.final_layer(h, c=adaln_input)
434
447
  return out.to(torch.float32)
@@ -0,0 +1,74 @@
1
+ import torch
2
+ from einops import rearrange
3
+
4
+
5
+ LATENT_SHIFT = (
6
+ 0.01984364, 0.10149707, 0.29689495, 0.27188619, -0.21445648, -0.15979549,
7
+ 0.05021099, -0.15083604, -0.15360136, -0.20131799, 0.01922352, 0.0622626,
8
+ 0.10140969, -0.06739428, 0.3758261, -0.233712, 0.35164491, -0.02590912,
9
+ -0.0271935, -0.10833897, -0.1476848, -0.01130957, -0.2298372, 0.23526423,
10
+ -0.10893522, 0.11957631, 0.04047799, 0.3134589, -0.17225064, -0.18646109,
11
+ -0.34691978, -0.03571246, 0.02583857, 0.10190072, 0.28402294, 0.26952152,
12
+ -0.21634675, -0.17938656, 0.04358909, -0.15007621, -0.1548502, -0.18971131,
13
+ 0.02710861, 0.05609494, 0.10697846, -0.06854968, 0.38167698, -0.24269937,
14
+ 0.35705471, -0.03063305, -0.02946109, -0.11244286, -0.14336038, -0.01362137,
15
+ -0.21863696, 0.23228983, -0.11739769, 0.11693044, 0.02563311, 0.31356594,
16
+ -0.17420591, -0.19006285, -0.34905377, -0.04025005, 0.01924137, 0.07652984,
17
+ 0.2995608, 0.2628057, -0.22011674, -0.12715361, 0.04879879, -0.14075719,
18
+ -0.15935895, -0.2123584, 0.01974813, 0.05523547, 0.10011992, -0.06428964,
19
+ 0.37781868, -0.21491644, 0.34254215, -0.03153528, -0.0310082, -0.10761415,
20
+ -0.14730405, -0.02475182, -0.2285588, 0.2515081, -0.10445128, 0.12446,
21
+ 0.07062869, 0.30880162, -0.18016875, -0.18869164, -0.34533499, -0.0129177,
22
+ 0.02578168, 0.07993659, 0.28642181, 0.26038408, -0.22459419, -0.14820155,
23
+ 0.04059549, -0.14043529, -0.16111187, -0.2020305, 0.02602069, 0.04852717,
24
+ 0.10432153, -0.06309942, 0.38402443, -0.22397003, 0.34814481, -0.03774432,
25
+ -0.03381438, -0.11245691, -0.14128767, -0.02853208, -0.21752016, 0.24872463,
26
+ -0.11399775, 0.1222687, 0.05620835, 0.309178, -0.18065738, -0.19401479,
27
+ -0.34495114, -0.01760592,
28
+ )
29
+
30
+ LATENT_SCALE = (
31
+ 1.63933691, 1.70204478, 1.73642566, 1.90004803, 1.6675316, 1.69059584,
32
+ 1.56853198, 1.62314944, 1.89106626, 1.58086668, 1.60822129, 1.60962993,
33
+ 1.63322129, 1.56074359, 1.73419528, 1.7919265, 1.64040632, 1.66802808,
34
+ 1.60390303, 1.75480492, 1.63187587, 1.64334594, 1.61722884, 1.60146046,
35
+ 1.63459219, 1.55291476, 1.68771497, 1.68415657, 1.78966054, 1.66631641,
36
+ 1.65626686, 1.65976433, 1.63487607, 1.69513249, 1.72933756, 1.91310663,
37
+ 1.67035057, 1.72286863, 1.56719251, 1.61934825, 1.88628859, 1.56911539,
38
+ 1.59455129, 1.60829869, 1.62470611, 1.56052853, 1.73677003, 1.77563606,
39
+ 1.63732541, 1.66370527, 1.59508952, 1.75153949, 1.63029275, 1.64517667,
40
+ 1.61659342, 1.59722044, 1.64103121, 1.5408531, 1.68610394, 1.67772755,
41
+ 1.78998563, 1.66621713, 1.65458955, 1.66041308, 1.64710857, 1.68163503,
42
+ 1.74000294, 1.92784786, 1.67411194, 1.67395548, 1.57406532, 1.62199356,
43
+ 1.87618195, 1.5584375, 1.57438785, 1.61711053, 1.63094305, 1.55644029,
44
+ 1.73124302, 1.80666627, 1.6463621, 1.65932006, 1.60816188, 1.75682671,
45
+ 1.64695873, 1.63121722, 1.61380832, 1.60478651, 1.63396035, 1.53505068,
46
+ 1.65534289, 1.67132281, 1.80317197, 1.6767314, 1.65700938, 1.68426259,
47
+ 1.65339716, 1.67540638, 1.73298504, 1.94067348, 1.67893609, 1.70635117,
48
+ 1.5730906, 1.61928553, 1.87148809, 1.56244866, 1.56697152, 1.61584394,
49
+ 1.62759496, 1.55480378, 1.73484107, 1.79055143, 1.64688773, 1.66121492,
50
+ 1.60135887, 1.75254572, 1.64798332, 1.62989921, 1.61381592, 1.60792883,
51
+ 1.63939668, 1.53075757, 1.65371318, 1.66801185, 1.80029087, 1.67591476,
52
+ 1.65655173, 1.68533454,
53
+ )
54
+
55
def get_latent_norm(device: torch.device) -> tuple[torch.Tensor, torch.Tensor]:
    """Materialize the latent normalization statistics as tensors on ``device``.

    Args:
        device: Device on which both tensors are created.

    Returns:
        A ``(shift, scale)`` pair of float32 tensors built from the
        module-level ``LATENT_SHIFT`` and ``LATENT_SCALE`` tuples, in that order.
    """
    # Both tables get the same treatment; build them in (shift, scale) order.
    shift, scale = (
        torch.tensor(values, dtype=torch.float32, device=device)
        for values in (LATENT_SHIFT, LATENT_SCALE)
    )
    return shift, scale
59
+
60
def decode(vae, latents, height, width, torch_dtype):
    """Turn normalized token latents back into an image via ``vae._decode``.

    Args:
        vae: Object exposing a ``_decode(latents)`` method.
        latents: Tensor laid out as ``B (H W) (P Q C)`` with ``P = Q = 2``,
            ``H = height // 16`` and ``W = width // 16``.
        height: Image height used to derive the token grid height.
        width: Image width used to derive the token grid width.
        torch_dtype: dtype the latents are cast to before decoding.

    Returns:
        Whatever ``vae._decode`` returns for the de-normalized,
        un-patchified latents.
    """
    shift, scale = get_latent_norm(latents.device)

    # Undo the per-feature normalization in float32.
    denormalized = latents.float() * scale + shift

    # Fold the 2x2 patch axes back into the spatial dimensions.
    grid_h = height // 16
    grid_w = width // 16
    spatial = rearrange(
        denormalized,
        "B (H W) (P Q C) -> B C (H P) (W Q)",
        P=2,
        Q=2,
        H=grid_h,
        W=grid_w,
    )

    # NOTE(review): the values pass through bfloat16 before the final cast, so
    # precision is reduced even when torch_dtype is wider - presumably
    # intentional to match reference numerics; confirm before removing.
    spatial = spatial.to(torch.bfloat16).to(torch_dtype)
    return vae._decode(spatial)
67
+
68
def encode(vae, image, height, width, torch_dtype):
    """Encode an image with ``vae._encode`` and return normalized token latents.

    Args:
        vae: Object exposing an ``_encode(image)`` method.
        image: Input passed unchanged to ``vae._encode``.
        height: Image height used to derive the token grid height.
        width: Image width used to derive the token grid width.
        torch_dtype: dtype of the returned latents.

    Returns:
        Tensor laid out as ``B (H W) (P Q C)`` with ``P = Q = 2``,
        ``H = height // 16`` and ``W = width // 16``, normalized with the
        shift/scale statistics from ``get_latent_norm``.
    """
    # Only the first 32 channels of the encoder output are kept.
    features = vae._encode(image)[:, :32]
    shift, scale = get_latent_norm(features.device)

    # Split each spatial axis into (grid, 2) and move the 2x2 patch into the
    # feature axis, producing one token per grid cell.
    tokens = rearrange(
        features,
        "B C (H P) (W Q) -> B (H W) (P Q C)",
        P=2,
        Q=2,
        H=height // 16,
        W=width // 16,
    )

    # NOTE(review): values are rounded to bfloat16 before normalization -
    # presumably to match reference numerics; confirm before removing.
    tokens = tokens.to(torch.bfloat16).float()

    normalized = (tokens - shift) / scale
    return normalized.to(torch_dtype)
@@ -10,7 +10,8 @@ from ..diffusion.base_pipeline import BasePipeline, PipelineUnit
10
10
  from ..core import ModelConfig
11
11
  from ..models.ideogram4_dit import Ideogram4DiT, LLM_TOKEN_INDICATOR, OUTPUT_IMAGE_INDICATOR, IMAGE_POSITION_OFFSET
12
12
  from ..models.ideogram4_text_encoder import Ideogram4TextEncoder
13
- from ..models.ideogram4_vae import Ideogram4VAEEncoder, Ideogram4VAEDecoder
13
+ from ..models.flux2_vae import Flux2VAE
14
+ from ..models.ideogram4_vae import encode, decode
14
15
  from transformers import AutoTokenizer
15
16
 
16
17
 
@@ -25,8 +26,7 @@ class Ideogram4Pipeline(BasePipeline):
25
26
  self.text_encoder: Ideogram4TextEncoder = None
26
27
  self.dit: Ideogram4DiT = None
27
28
  self.dit_uncond: Ideogram4DiT = None
28
- self.vae_encoder: Ideogram4VAEEncoder = None
29
- self.vae_decoder: Ideogram4VAEDecoder = None
29
+ self.vae: Flux2VAE = None
30
30
  self.tokenizer: AutoTokenizer = None
31
31
  self.in_iteration_models = ("dit", "dit_uncond")
32
32
  self.units = [
@@ -55,8 +55,7 @@ class Ideogram4Pipeline(BasePipeline):
55
55
  else:
56
56
  pipe.dit = transformers
57
57
  pipe.text_encoder = model_pool.fetch_model("ideogram4_text_encoder")
58
- pipe.vae_encoder = model_pool.fetch_model("ideogram4_vae_encoder")
59
- pipe.vae_decoder = model_pool.fetch_model("ideogram4_vae_decoder")
58
+ pipe.vae = model_pool.fetch_model("flux2_vae")
60
59
 
61
60
  if tokenizer_config is not None:
62
61
  tokenizer_config.download_if_necessary()
@@ -112,16 +111,15 @@ class Ideogram4Pipeline(BasePipeline):
112
111
  if cfg_scale != 1:
113
112
  models = {"dit": self.dit_uncond if self.dit_uncond is not None else self.dit}
114
113
  noise_pred_nega = self.model_fn(timestep=timestep, **models, **inputs_shared, **inputs_nega)
115
- # This is not a standard CFG implementation. We align it to the original version of Ideogram4.
116
- noise_pred = cfg_scale * noise_pred_posi + (1.0 - cfg_scale) * noise_pred_nega
114
+ noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
117
115
  else:
118
116
  noise_pred = noise_pred_posi
119
117
 
120
118
  inputs_shared["latents"] = self.step(self.scheduler, progress_id=progress_id, noise_pred=noise_pred, **inputs_shared)
121
119
 
122
120
  # Decode
123
- self.load_models_to_device(["vae_decoder"])
124
- image = self.vae_decoder.decode(inputs_shared["latents"], inputs_shared["grid_h"], inputs_shared["grid_w"], self.dit.patch_size, self.torch_dtype)
121
+ self.load_models_to_device(["vae"])
122
+ image = decode(self.vae, inputs_shared["latents"], height, width, self.torch_dtype)
125
123
  image = self.vae_output_to_image(image)
126
124
  self.load_models_to_device([])
127
125
  return image
@@ -168,7 +166,7 @@ class Ideogram4Unit_PromptEmbedder(PipelineUnit):
168
166
  f"prompt has {num_text_tokens} tokens, exceeds max_text_tokens={max_text_tokens}"
169
167
  )
170
168
 
171
- patch = pipe.dit.patch_size * pipe.vae_encoder.ae_scale_factor
169
+ patch = pipe.dit.patch_size * 8
172
170
  grid_h = height // patch
173
171
  grid_w = width // patch
174
172
  num_image_tokens = grid_h * grid_w
@@ -239,7 +237,7 @@ class Ideogram4Unit_NoiseInitializer(PipelineUnit):
239
237
  )
240
238
 
241
239
  def process(self, pipe: "Ideogram4Pipeline", height, width, seed, rand_device):
242
- patch = pipe.dit.patch_size * pipe.vae_encoder.ae_scale_factor
240
+ patch = pipe.dit.patch_size * 8
243
241
  grid_h = height // patch
244
242
  grid_w = width // patch
245
243
  num_image_tokens = grid_h * grid_w
@@ -251,18 +249,17 @@ class Ideogram4Unit_NoiseInitializer(PipelineUnit):
251
249
  class Ideogram4Unit_InputImageEmbedder(PipelineUnit):
252
250
  def __init__(self):
253
251
  super().__init__(
254
- input_params=("input_image", "noise", "height", "width", "grid_h", "grid_w"),
252
+ input_params=("input_image", "noise", "height", "width"),
255
253
  output_params=("latents", "input_latents"),
256
- onload_model_names=("vae_encoder",)
254
+ onload_model_names=("vae",)
257
255
  )
258
256
 
259
- def process(self, pipe: "Ideogram4Pipeline", input_image, noise, height, width, grid_h, grid_w):
257
+ def process(self, pipe: "Ideogram4Pipeline", input_image, noise, height, width):
260
258
  if input_image is None:
261
259
  return {"latents": noise, "input_latents": None}
262
- pipe.load_models_to_device(["vae_encoder"])
260
+ pipe.load_models_to_device(["vae"])
263
261
  image = pipe.preprocess_image(input_image)
264
- input_latents = pipe.vae_encoder.encode(image, grid_h, grid_w, pipe.dit.patch_size)
265
-
262
+ input_latents = encode(pipe.vae, image, height, width, torch.bfloat16)
266
263
  if pipe.scheduler.training:
267
264
  return {"latents": noise, "input_latents": input_latents}
268
265
  else:
@@ -279,6 +276,8 @@ def model_fn_ideogram4(
279
276
  segment_ids=None,
280
277
  indicator=None,
281
278
  max_text_tokens=0,
279
+ use_gradient_checkpointing=False,
280
+ use_gradient_checkpointing_offload=False,
282
281
  **kwargs,
283
282
  ):
284
283
  t_ideogram4 = timestep.to(torch.float32)
@@ -292,5 +291,7 @@ def model_fn_ideogram4(
292
291
  out = dit(
293
292
  llm_features=llm_features, x=z, t=t_ideogram4,
294
293
  position_ids=position_ids, segment_ids=segment_ids, indicator=indicator,
294
+ use_gradient_checkpointing=use_gradient_checkpointing,
295
+ use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
295
296
  )
296
297
  return -out[:, max_text_tokens:]
@@ -0,0 +1,21 @@
1
+ import torch, torchaudio
2
+ from diffsynth import load_model, ModelConfig
3
+ from diffsynth.models.demucs import HTDemucs
4
+
5
class AudioTrackSeparator(torch.nn.Module):
    """Thin wrapper that loads an ``HTDemucs`` model and extracts a track from audio.

    The separated track is treated as 44.1 kHz audio and is resampled to the
    requested rate on the way out.
    """

    def __init__(self, torch_dtype=torch.float32, device="cuda", model_config=None):
        """Download (if needed) and load the separation model.

        Args:
            torch_dtype: dtype the model is loaded with.
            device: Device the model is loaded onto.
            model_config: ``ModelConfig`` pointing at the checkpoint. When
                ``None``, the ``DiffSynth-Studio/Demucs-Repackage``
                ``model.safetensors`` checkpoint is used.
        """
        super().__init__()
        # Build the default per instance: a default-argument ModelConfig would
        # be created once at definition time and shared by every instance,
        # including the download_if_necessary() call made on it below.
        if model_config is None:
            model_config = ModelConfig(model_id="DiffSynth-Studio/Demucs-Repackage", origin_file_pattern="model.safetensors")
        model_config.download_if_necessary()
        self.model = load_model(HTDemucs, model_config.path, torch_dtype=torch_dtype, device=device)

    @torch.no_grad()
    def __call__(self, audio, target_sample_rate=48000, **kwargs):
        """Extract a track from ``audio`` and resample it.

        Args:
            audio: Either a path string (loaded with ``torchaudio.load``) or a
                ``(waveform, sample_rate)`` pair.
            target_sample_rate: Sample rate of the returned audio. No
                resampling happens when this equals 44100.
            **kwargs: Accepted and ignored.

        Returns:
            The tensor produced by ``self.model.extract_track``, resampled
            from 44100 Hz to ``target_sample_rate`` when they differ.
        """
        if isinstance(audio, str):
            audio, sample_rate = torchaudio.load(audio)
        else:
            audio, sample_rate = audio

        # Match the model's dtype/device using its first parameter.
        reference = next(iter(self.model.parameters()))
        audio = audio.to(dtype=reference.dtype, device=reference.device)

        vocals = self.model.extract_track(audio, sample_rate)
        if target_sample_rate != 44100:
            vocals = torchaudio.functional.resample(vocals, 44100, target_sample_rate)
        return vocals
@@ -0,0 +1,15 @@
1
+ from diffsynth import load_state_dict
2
+ import torch
3
+ from safetensors.torch import save_file
4
+ from tqdm import tqdm
5
+
6
+
7
def dequantize(source_path, target_path, device="cuda", torch_dtype=torch.bfloat16):
    """Fold ``*.weight_scale`` tensors into their weights and optionally save the result.

    Every key ending in ``.weight_scale`` is paired with the sibling
    ``.weight`` key. Both tensors are cast to ``torch_dtype`` and multiplied,
    the product is stored back under the ``.weight`` key, and the scale entry
    is removed. All other entries are left untouched.

    Args:
        source_path: Checkpoint location passed to ``load_state_dict``.
        target_path: Destination passed to ``safetensors`` ``save_file``, or
            ``None`` to skip writing.
        device: Device the state dict is loaded onto.
        torch_dtype: dtype of the dequantized weights.

    Returns:
        The dequantized state dict. It is returned even when ``target_path``
        is given, so that calling with ``target_path=None`` no longer
        discards the result.

    Raises:
        KeyError: If a ``.weight_scale`` entry has no matching ``.weight``.
    """
    scale_suffix = ".weight_scale"
    sd = load_state_dict(source_path, device=device)

    # Snapshot the scale keys first: the loop below mutates ``sd``.
    scale_keys = [k for k in sd if k.endswith(scale_suffix)]
    for scale_key in tqdm(scale_keys):
        weight_key = scale_key[: -len(scale_suffix)] + ".weight"
        weight = sd.pop(weight_key).to(torch_dtype)
        scale = sd.pop(scale_key).to(torch_dtype)
        if scale.ndim == 1:
            # One scale per leading weight index: align it with the weight's
            # first axis (identical to ``unsqueeze(1)`` for 2-D weights).
            scale = scale.reshape(-1, *([1] * (weight.ndim - 1)))
        # 0-dim (per-tensor) scales and already-shaped scales rely on normal
        # broadcasting; ``unsqueeze(1)`` would fail on a 0-dim tensor.
        sd[weight_key] = weight * scale

    if target_path is not None:
        save_file(sd, target_path)
    return sd
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffsynth
3
- Version: 2.0.13
3
+ Version: 2.0.14
4
4
  Summary: Enjoy the magic of Diffusion models!
5
5
  Author: ModelScope Team
6
6
  License: Apache-2.0
@@ -168,6 +168,8 @@ diffsynth/utils/data/__init__.py
168
168
  diffsynth/utils/data/audio.py
169
169
  diffsynth/utils/data/audio_video.py
170
170
  diffsynth/utils/data/media_io_ltx2.py
171
+ diffsynth/utils/demucs/__init__.py
172
+ diffsynth/utils/dequantizer/__init__.py
171
173
  diffsynth/utils/lora/__init__.py
172
174
  diffsynth/utils/lora/flux.py
173
175
  diffsynth/utils/lora/general.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "diffsynth"
7
- version = "2.0.13"
7
+ version = "2.0.14"
8
8
  description = "Enjoy the magic of Diffusion models!"
9
9
  authors = [{name = "ModelScope Team"}]
10
10
  license = {text = "Apache-2.0"}