diffsynth 2.0.5__tar.gz → 2.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. {diffsynth-2.0.5 → diffsynth-2.0.6}/PKG-INFO +4 -1
  2. {diffsynth-2.0.5 → diffsynth-2.0.6}/README.md +21 -2
  3. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/configs/__init__.py +1 -1
  4. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/configs/model_configs.py +136 -1
  5. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/configs/vram_management_module_maps.py +32 -0
  6. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/data/operators.py +52 -15
  7. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/data/unified_dataset.py +2 -0
  8. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/vram/layers.py +1 -1
  9. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/base_pipeline.py +6 -0
  10. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/training_module.py +42 -3
  11. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/ltx2_audio_vae.py +511 -47
  12. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/ltx2_common.py +17 -0
  13. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/ltx2_dit.py +347 -117
  14. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/ltx2_text_encoder.py +205 -22
  15. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/ltx2_video_vae.py +60 -55
  16. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/model_loader.py +3 -2
  17. diffsynth-2.0.6/diffsynth/models/mova_audio_dit.py +57 -0
  18. diffsynth-2.0.6/diffsynth/models/mova_audio_vae.py +796 -0
  19. diffsynth-2.0.6/diffsynth/models/mova_dual_tower_bridge.py +595 -0
  20. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_dit.py +13 -1
  21. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/flux2_image.py +8 -4
  22. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/ltx2_audio_video.py +334 -264
  23. diffsynth-2.0.6/diffsynth/pipelines/mova_audio_video.py +460 -0
  24. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/qwen_image.py +4 -2
  25. diffsynth-2.0.6/diffsynth/utils/data/audio.py +108 -0
  26. diffsynth-2.0.5/diffsynth/utils/data/media_io_ltx2.py → diffsynth-2.0.6/diffsynth/utils/data/audio_video.py +44 -59
  27. diffsynth-2.0.6/diffsynth/utils/data/media_io_ltx2.py +43 -0
  28. diffsynth-2.0.6/diffsynth/utils/ses/__init__.py +1 -0
  29. diffsynth-2.0.6/diffsynth/utils/ses/ses.py +117 -0
  30. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +1 -1
  31. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +5 -3
  32. diffsynth-2.0.6/diffsynth/utils/xfuser/__init__.py +1 -0
  33. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/xfuser/xdit_context_parallel.py +28 -1
  34. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth.egg-info/PKG-INFO +4 -1
  35. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth.egg-info/SOURCES.txt +8 -0
  36. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth.egg-info/requires.txt +4 -0
  37. {diffsynth-2.0.5 → diffsynth-2.0.6}/pyproject.toml +5 -1
  38. diffsynth-2.0.5/diffsynth/utils/xfuser/__init__.py +0 -1
  39. {diffsynth-2.0.5 → diffsynth-2.0.6}/LICENSE +0 -0
  40. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/__init__.py +0 -0
  41. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/__init__.py +0 -0
  42. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/attention/__init__.py +0 -0
  43. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/attention/attention.py +0 -0
  44. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/data/__init__.py +0 -0
  45. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/device/__init__.py +0 -0
  46. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/device/npu_compatible_device.py +0 -0
  47. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/gradient/__init__.py +0 -0
  48. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/gradient/gradient_checkpoint.py +0 -0
  49. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/loader/__init__.py +0 -0
  50. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/loader/config.py +0 -0
  51. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/loader/file.py +0 -0
  52. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/loader/model.py +0 -0
  53. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
  54. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/vram/__init__.py +0 -0
  55. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/vram/disk_map.py +0 -0
  56. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/vram/initialization.py +0 -0
  57. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/__init__.py +0 -0
  58. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/flow_match.py +0 -0
  59. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/logger.py +0 -0
  60. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/loss.py +0 -0
  61. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/parsers.py +0 -0
  62. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/runner.py +0 -0
  63. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/anima_dit.py +0 -0
  64. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/dinov3_image_encoder.py +0 -0
  65. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux2_dit.py +0 -0
  66. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux2_text_encoder.py +0 -0
  67. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux2_vae.py +0 -0
  68. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_controlnet.py +0 -0
  69. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_dit.py +0 -0
  70. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_infiniteyou.py +0 -0
  71. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_ipadapter.py +0 -0
  72. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_lora_encoder.py +0 -0
  73. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_lora_patcher.py +0 -0
  74. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_text_encoder_clip.py +0 -0
  75. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_text_encoder_t5.py +0 -0
  76. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_vae.py +0 -0
  77. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_value_control.py +0 -0
  78. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/general_modules.py +0 -0
  79. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/longcat_video_dit.py +0 -0
  80. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/ltx2_upsampler.py +0 -0
  81. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/nexus_gen.py +0 -0
  82. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/nexus_gen_ar_model.py +0 -0
  83. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/nexus_gen_projector.py +0 -0
  84. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/qwen_image_controlnet.py +0 -0
  85. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/qwen_image_dit.py +0 -0
  86. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/qwen_image_image2lora.py +0 -0
  87. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/qwen_image_text_encoder.py +0 -0
  88. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/qwen_image_vae.py +0 -0
  89. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/sd_text_encoder.py +0 -0
  90. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/siglip2_image_encoder.py +0 -0
  91. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/step1x_connector.py +0 -0
  92. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/step1x_text_encoder.py +0 -0
  93. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_animate_adapter.py +0 -0
  94. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_camera_controller.py +0 -0
  95. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_dit_s2v.py +0 -0
  96. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_image_encoder.py +0 -0
  97. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_mot.py +0 -0
  98. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_motion_controller.py +0 -0
  99. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_text_encoder.py +0 -0
  100. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_vace.py +0 -0
  101. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_vae.py +0 -0
  102. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wav2vec.py +0 -0
  103. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/z_image_controlnet.py +0 -0
  104. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/z_image_dit.py +0 -0
  105. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/z_image_image2lora.py +0 -0
  106. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/z_image_text_encoder.py +0 -0
  107. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/anima_image.py +0 -0
  108. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/flux_image.py +0 -0
  109. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/wan_video.py +0 -0
  110. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/z_image.py +0 -0
  111. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/controlnet/__init__.py +0 -0
  112. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/controlnet/annotator.py +0 -0
  113. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
  114. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/data/__init__.py +0 -0
  115. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/lora/__init__.py +0 -0
  116. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/lora/flux.py +0 -0
  117. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/lora/general.py +0 -0
  118. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/lora/merge.py +0 -0
  119. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/lora/reset_rank.py +0 -0
  120. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
  121. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
  122. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
  123. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
  124. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
  125. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
  126. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
  127. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
  128. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
  129. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
  130. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
  131. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +0 -0
  132. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
  133. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
  134. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
  135. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
  136. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
  137. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
  138. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
  139. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
  140. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
  141. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
  142. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
  143. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
  144. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/version.py +0 -0
  145. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth.egg-info/dependency_links.txt +0 -0
  146. {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth.egg-info/top_level.txt +0 -0
  147. {diffsynth-2.0.5 → diffsynth-2.0.6}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffsynth
3
- Version: 2.0.5
3
+ Version: 2.0.6
4
4
  Summary: Enjoy the magic of Diffusion models!
5
5
  Author: ModelScope Team
6
6
  License: Apache-2.0
@@ -32,4 +32,7 @@ Provides-Extra: npu
32
32
  Requires-Dist: torch==2.7.1+cpu; extra == "npu"
33
33
  Requires-Dist: torch-npu==2.7.1; extra == "npu"
34
34
  Requires-Dist: torchvision==0.22.1+cpu; extra == "npu"
35
+ Provides-Extra: audio
36
+ Requires-Dist: torchaudio; extra == "audio"
37
+ Requires-Dist: torchcodec; extra == "audio"
35
38
  Dynamic: license-file
@@ -32,6 +32,11 @@ We believe that a well-developed open-source code framework can lower the thresh
32
32
  > DiffSynth-Studio has undergone major version updates, and some old features are no longer maintained. If you need to use old features, please switch to the [last historical version](https://github.com/modelscope/DiffSynth-Studio/tree/afd101f3452c9ecae0c87b79adfa2e22d65ffdc3) before the major version update.
33
33
 
34
34
  > Currently, the development personnel of this project are limited, with most of the work handled by [Artiprocher](https://github.com/Artiprocher). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand.
35
+ - **January 19, 2026**: Added support for [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) and [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/Wan.md) and [example code](/examples/mova/) are now available.
36
+
37
+ - **March 12, 2026**: We have added support for the [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) audio-video generation model. The features include text-to-audio/video, image-to-audio/video, IC-LoRA control, audio-to-video, and audio-video inpainting. We support the complete inference and training functionality. For details, please refer to the [documentation](/docs/en/Model_Details/LTX-2.md) and [code](/examples/ltx2/).
38
+
39
+ - **March 3, 2026**: We released the [DiffSynth-Studio/Qwen-Image-Layered-Control-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control-V2) model, which is an updated version of Qwen-Image-Layered-Control. In addition to the originally supported text-guided functionality, it adds brush-controlled layer separation capabilities.
35
40
 
36
41
  - **March 2, 2026** Added support for [Anima](https://modelscope.cn/models/circlestone-labs/Anima). For details, please refer to the [documentation](docs/en/Model_Details/Anima.md). This is an interesting anime-style image generation model. We look forward to its future updates.
37
42
 
@@ -396,7 +401,7 @@ Example code for Anima is located at: [/examples/anima/](/examples/anima/)
396
401
 
397
402
  | Model ID | Inference | Low VRAM Inference | Full Training | Validation after Full Training | LoRA Training | Validation after LoRA Training |
398
403
  |-|-|-|-|-|-|-|
399
- |[circlestone-labs/Anima](https://www.modelscope.cn/models/circlestone-labs/Anima)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/anima/model_inference/anima-preview.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/anima/model_inference_low_vram/anima-preview.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/anima/model_training/full/anima-preview.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/anima/model_training/validate_full/anima-preview.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/anima/model_training/lora/anima-preview.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/anima/model_training/validate_lora/anima-preview.py)|
404
+ |[circlestone-labs/Anima](https://www.modelscope.cn/models/circlestone-labs/Anima)|[code](/examples/anima/model_inference/anima-preview.py)|[code](/examples/anima/model_inference_low_vram/anima-preview.py)|[code](/examples/anima/model_training/full/anima-preview.sh)|[code](/examples/anima/model_training/validate_full/anima-preview.py)|[code](/examples/anima/model_training/lora/anima-preview.sh)|[code](/examples/anima/model_training/validate_lora/anima-preview.py)|
400
405
 
401
406
  </details>
402
407
 
@@ -480,9 +485,11 @@ Example code for Qwen-Image is available at: [/examples/qwen_image/](/examples/q
480
485
  |[Qwen/Qwen-Image-Edit-2509](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2509)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2509.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2509.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Edit-2509.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2509.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2509.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2509.py)|
481
486
  |[Qwen/Qwen-Image-Edit-2511](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2511)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2511.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2511.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Edit-2511.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2511.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2511.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2511.py)|
482
487
  |[FireRedTeam/FireRed-Image-Edit-1.0](https://www.modelscope.cn/models/FireRedTeam/FireRed-Image-Edit-1.0)|[code](/examples/qwen_image/model_inference/FireRed-Image-Edit-1.0.py)|[code](/examples/qwen_image/model_inference_low_vram/FireRed-Image-Edit-1.0.py)|[code](/examples/qwen_image/model_training/full/FireRed-Image-Edit-1.0.sh)|[code](/examples/qwen_image/model_training/validate_full/FireRed-Image-Edit-1.0.py)|[code](/examples/qwen_image/model_training/lora/FireRed-Image-Edit-1.0.sh)|[code](/examples/qwen_image/model_training/validate_lora/FireRed-Image-Edit-1.0.py)|
488
+ |[FireRedTeam/FireRed-Image-Edit-1.1](https://www.modelscope.cn/models/FireRedTeam/FireRed-Image-Edit-1.1)|[code](/examples/qwen_image/model_inference/FireRed-Image-Edit-1.1.py)|[code](/examples/qwen_image/model_inference_low_vram/FireRed-Image-Edit-1.1.py)|[code](/examples/qwen_image/model_training/full/FireRed-Image-Edit-1.1.sh)|[code](/examples/qwen_image/model_training/validate_full/FireRed-Image-Edit-1.1.py)|[code](/examples/qwen_image/model_training/lora/FireRed-Image-Edit-1.1.sh)|[code](/examples/qwen_image/model_training/validate_lora/FireRed-Image-Edit-1.1.py)|
483
489
  |[lightx2v/Qwen-Image-Edit-2511-Lightning](https://modelscope.cn/models/lightx2v/Qwen-Image-Edit-2511-Lightning)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2511-Lightning.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2511-Lightning.py)|-|-|-|-|
484
490
  |[Qwen/Qwen-Image-Layered](https://www.modelscope.cn/models/Qwen/Qwen-Image-Layered)|[code](/examples/qwen_image/model_inference/Qwen-Image-Layered.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Layered.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Layered.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Layered.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered.py)|
485
491
  |[DiffSynth-Studio/Qwen-Image-Layered-Control](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control)|[code](/examples/qwen_image/model_inference/Qwen-Image-Layered-Control.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered-Control.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Layered-Control.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Layered-Control.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Layered-Control.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered-Control.py)|
492
+ |[DiffSynth-Studio/Qwen-Image-Layered-Control-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control-V2)|[code](/examples/qwen_image/model_inference/Qwen-Image-Layered-Control-V2.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered-Control-V2.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Layered-Control-V2.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered-Control-V2.py)|
486
493
  |[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](/examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
487
494
  |[DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2)|[code](/examples/qwen_image/model_inference/Qwen-Image-EliGen-V2.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-V2.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
488
495
  |[DiffSynth-Studio/Qwen-Image-EliGen-Poster](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-Poster)|[code](/examples/qwen_image/model_inference/Qwen-Image-EliGen-Poster.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-Poster.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen-Poster.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen-Poster.py)|
@@ -701,6 +708,16 @@ Example code for LTX-2 is available at: [/examples/ltx2/](/examples/ltx2/)
701
708
 
702
709
  | Model ID | Extra Args | Inference | Low-VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
703
710
  |-|-|-|-|-|-|-|-|
711
+ |[Lightricks/LTX-2.3: OneStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2.3-I2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2.3-I2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2.3-I2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-I2AV.py)|
712
+ |[Lightricks/LTX-2.3: TwoStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-TwoStage.py)|-|-|-|-|
713
+ |[Lightricks/LTX-2.3: DistilledPipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-DistilledPipeline.py)|-|-|-|-|
714
+ |[Lightricks/LTX-2.3: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2.3-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2.3-T2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV.py)|
715
+ |[Lightricks/LTX-2.3: TwoStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage.py)|-|-|-|-|
716
+ |[Lightricks/LTX-2.3: DistilledPipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-DistilledPipeline.py)|-|-|-|-|
717
+ |[Lightricks/LTX-2.3: A2V](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-A2V-TwoStage.py)|-|-|-|-|
718
+ |[Lightricks/LTX-2.3: Retake](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_video`,`retake_video_regions`,`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage-Retake.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage-Retake.py)|-|-|-|-|
719
+ |[Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)|
720
+ |[Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)|
704
721
  |[Lightricks/LTX-2: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2-T2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2-T2AV.py)|
705
722
  |[Lightricks/LTX-2-19b-IC-LoRA-Union-Control](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-IC-LoRA-Union-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2-T2AV-IC-LoRA-Union-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-IC-LoRA-Union-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2-T2AV-IC-LoRA.py)|
706
723
  |[Lightricks/LTX-2-19b-IC-LoRA-Detailer](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-IC-LoRA-Detailer)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2-T2AV-IC-LoRA-Detailer.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-IC-LoRA-Detailer.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2-T2AV-IC-LoRA.py)|
@@ -851,6 +868,8 @@ Example code for Wan is available at: [/examples/wanvideo/](/examples/wanvideo/)
851
868
  |[PAI/Wan2.2-Fun-A14B-InP](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-InP)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-InP.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py)|
852
869
  |[PAI/Wan2.2-Fun-A14B-Control](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control)|`control_video`, `reference_image`|[code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py)|
853
870
  |[PAI/Wan2.2-Fun-A14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control-Camera)|`control_camera_video`, `input_image`|[code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control-Camera.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py)|
871
+ | [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) | `input_image` | [code](/examples/mova/model_inference/MOVA-360p-I2AV.py) | [code](/examples/mova/model_training/full/MOVA-360P-I2AV.sh) | [code](/examples/mova/model_training/validate_full/MOVA-360p-I2AV.py) | [code](/examples/mova/model_training/lora/MOVA-360P-I2AV.sh) | [code](/examples/mova/model_training/validate_lora/MOVA-360p-I2AV.py) |
872
+ | [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) | `input_image` | [code](/examples/mova/model_inference/MOVA-720p-I2AV.py) | [code](/examples/mova/model_training/full/MOVA-720P-I2AV.sh) | [code](/examples/mova/model_training/validate_full/MOVA-720p-I2AV.py) | [code](/examples/mova/model_training/lora/MOVA-720P-I2AV.sh) | [code](/examples/mova/model_training/validate_lora/MOVA-720p-I2AV.py) |
854
873
 
855
874
  </details>
856
875
 
@@ -864,7 +883,7 @@ DiffSynth-Studio is not just an engineered model framework, but also an incubato
864
883
 
865
884
  - Paper: [Spectral Evolution Search: Efficient Inference-Time Scaling for Reward-Aligned Image Generation
866
885
  ](https://arxiv.org/abs/2602.03208)
867
- - Sample Code: coming soon
886
+ - Sample Code: [/docs/en/Research_Tutorial/inference_time_scaling.md](/docs/en/Research_Tutorial/inference_time_scaling.md)
868
887
 
869
888
  |FLUX.1-dev|FLUX.1-dev + SES|Qwen-Image|Qwen-Image + SES|
870
889
  |-|-|-|-|
@@ -1,2 +1,2 @@
1
1
  from .model_configs import MODEL_CONFIGS
2
- from .vram_management_module_maps import VRAM_MANAGEMENT_MODULE_MAPS
2
+ from .vram_management_module_maps import VRAM_MANAGEMENT_MODULE_MAPS, VERSION_CHECKER_MAPS
@@ -718,6 +718,119 @@ ltx2_series = [
718
718
  "model_name": "ltx2_latent_upsampler",
719
719
  "model_class": "diffsynth.models.ltx2_upsampler.LTX2LatentUpsampler",
720
720
  },
721
+ {
722
+ # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
723
+ "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
724
+ "model_name": "ltx2_dit",
725
+ "model_class": "diffsynth.models.ltx2_dit.LTXModel",
726
+ "extra_kwargs": {"apply_gated_attention": True, "cross_attention_adaln": True, "caption_channels": None},
727
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter",
728
+ },
729
+ {
730
+ # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
731
+ "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
732
+ "model_name": "ltx2_video_vae_encoder",
733
+ "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder",
734
+ "extra_kwargs": {"encoder_version": "ltx-2.3"},
735
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter",
736
+ },
737
+ {
738
+ # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
739
+ "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
740
+ "model_name": "ltx2_video_vae_decoder",
741
+ "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder",
742
+ "extra_kwargs": {"decoder_version": "ltx-2.3"},
743
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
744
+ },
745
+ {
746
+ # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
747
+ "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
748
+ "model_name": "ltx2_audio_vae_decoder",
749
+ "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder",
750
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter",
751
+ },
752
+ {
753
+ # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
754
+ "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
755
+ "model_name": "ltx2_audio_vocoder",
756
+ "model_class": "diffsynth.models.ltx2_audio_vae.LTX2VocoderWithBWE",
757
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter",
758
+ },
759
+ {
760
+ # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
761
+ "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
762
+ "model_name": "ltx2_audio_vae_encoder",
763
+ "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder",
764
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter",
765
+ },
766
+ {
767
+ # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
768
+ "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
769
+ "model_name": "ltx2_text_encoder_post_modules",
770
+ "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
771
+ "extra_kwargs": {"separated_audio_video": True, "embedding_dim_gemma": 3840, "num_layers_gemma": 49, "video_attention_heads": 32, "video_attention_head_dim": 128, "audio_attention_heads": 32, "audio_attention_head_dim": 64, "num_connector_layers": 8, "apply_gated_attention": True},
772
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
773
+ },
774
+ {
775
+ # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
776
+ "model_hash": "aed408774d694a2452f69936c32febb5",
777
+ "model_name": "ltx2_latent_upsampler",
778
+ "model_class": "diffsynth.models.ltx2_upsampler.LTX2LatentUpsampler",
779
+ "extra_kwargs": {"rational_resampler": False},
780
+ },
781
+ {
782
+ # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="transformer.safetensors")
783
+ "model_hash": "1c55afad76ed33c112a2978550b524d1",
784
+ "model_name": "ltx2_dit",
785
+ "model_class": "diffsynth.models.ltx2_dit.LTXModel",
786
+ "extra_kwargs": {"apply_gated_attention": True, "cross_attention_adaln": True, "caption_channels": None},
787
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter",
788
+ },
789
+ {
790
+ # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="video_vae_encoder.safetensors")
791
+ "model_hash": "eecdc07c2ec30863b8a2b8b2134036cf",
792
+ "model_name": "ltx2_video_vae_encoder",
793
+ "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder",
794
+ "extra_kwargs": {"encoder_version": "ltx-2.3"},
795
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter",
796
+ },
797
+ {
798
+ # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="video_vae_decoder.safetensors")
799
+ "model_hash": "deda2f542e17ee25bc8c38fd605316ea",
800
+ "model_name": "ltx2_video_vae_decoder",
801
+ "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder",
802
+ "extra_kwargs": {"decoder_version": "ltx-2.3"},
803
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
804
+ },
805
+ {
806
+ # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vocoder.safetensors")
807
+ "model_hash": "7d7823dde8f1ea0b50fb07ac329dd4cb",
808
+ "model_name": "ltx2_audio_vae_decoder",
809
+ "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder",
810
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter",
811
+ },
812
+ {
813
+ # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vae_encoder.safetensors")
814
+ "model_hash": "29338f3b95e7e312a3460a482e4f4554",
815
+ "model_name": "ltx2_audio_vae_encoder",
816
+ "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder",
817
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter",
818
+ },
819
+ {
820
+ # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vocoder.safetensors")
821
+ "model_hash": "cd436c99e69ec5c80f050f0944f02a15",
822
+ "model_name": "ltx2_audio_vocoder",
823
+ "model_class": "diffsynth.models.ltx2_audio_vae.LTX2VocoderWithBWE",
824
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter",
825
+ },
826
+ {
827
+ # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors")
828
+ "model_hash": "05da2aab1c4b061f72c426311c165a43",
829
+ "model_name": "ltx2_text_encoder_post_modules",
830
+ "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
831
+ "extra_kwargs": {"separated_audio_video": True, "embedding_dim_gemma": 3840, "num_layers_gemma": 49, "video_attention_heads": 32, "video_attention_head_dim": 128, "audio_attention_heads": 32, "audio_attention_head_dim": 64, "num_connector_layers": 8, "apply_gated_attention": True},
832
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
833
+ },
721
834
  ]
722
835
  anima_series = [
723
836
  {
@@ -735,4 +848,26 @@ anima_series = [
735
848
  "state_dict_converter": "diffsynth.utils.state_dict_converters.anima_dit.AnimaDiTStateDictConverter",
736
849
  }
737
850
  ]
738
- MODEL_CONFIGS = qwen_image_series + wan_series + flux_series + flux2_series + z_image_series + ltx2_series + anima_series
851
+
852
+ mova_series = [
853
+ # Example: ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="audio_dit/diffusion_pytorch_model.safetensors")
854
+ {
855
+ "model_hash": "8c57e12790e2c45a64817e0ce28cde2f",
856
+ "model_name": "mova_audio_dit",
857
+ "model_class": "diffsynth.models.mova_audio_dit.MovaAudioDit",
858
+ "extra_kwargs": {'has_image_input': False, 'patch_size': [1], 'in_dim': 128, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 128, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06}
859
+ },
860
+ # Example: ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="audio_vae/diffusion_pytorch_model.safetensors")
861
+ {
862
+ "model_hash": "418517fb2b4e919d2cac8f314fcf82ac",
863
+ "model_name": "mova_audio_vae",
864
+ "model_class": "diffsynth.models.mova_audio_vae.DacVAE",
865
+ },
866
+ # Example: ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="dual_tower_bridge/diffusion_pytorch_model.safetensors")
867
+ {
868
+ "model_hash": "d1139dbbc8b4ab53cf4b4243d57bbceb",
869
+ "model_name": "mova_dual_tower_bridge",
870
+ "model_class": "diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge",
871
+ },
872
+ ]
873
+ MODEL_CONFIGS = qwen_image_series + wan_series + flux_series + flux2_series + z_image_series + ltx2_series + anima_series + mova_series
@@ -249,4 +249,36 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
249
249
  "torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
250
250
  "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
251
251
  },
252
+ "diffsynth.models.mova_audio_dit.MovaAudioDit": {
253
+ "diffsynth.models.wan_video_dit.DiTBlock": "diffsynth.core.vram.layers.AutoWrappedNonRecurseModule",
254
+ "diffsynth.models.wan_video_dit.Head": "diffsynth.core.vram.layers.AutoWrappedModule",
255
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
256
+ "torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
257
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
258
+ "diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
259
+ },
260
+ "diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge": {
261
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
262
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
263
+ "diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
264
+ },
265
+ "diffsynth.models.mova_audio_vae.DacVAE": {
266
+ "diffsynth.models.mova_audio_vae.Snake1d": "diffsynth.core.vram.layers.AutoWrappedModule",
267
+ "torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
268
+ "torch.nn.ConvTranspose1d": "diffsynth.core.vram.layers.AutoWrappedModule",
269
+ },
252
270
  }
271
+
272
+ def QwenImageTextEncoder_Module_Map_Updater():
273
+ current = VRAM_MANAGEMENT_MODULE_MAPS["diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder"]
274
+ from packaging import version
275
+ import transformers
276
+ if version.parse(transformers.__version__) >= version.parse("5.2.0"):
277
+ # The Qwen2RMSNorm in transformers 5.2.0+ has been renamed to Qwen2_5_VLRMSNorm, so we need to update the module map accordingly
278
+ current.pop("transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2RMSNorm", None)
279
+ current["transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLRMSNorm"] = "diffsynth.core.vram.layers.AutoWrappedModule"
280
+ return current
281
+
282
+ VERSION_CHECKER_MAPS = {
283
+ "diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder": QwenImageTextEncoder_Module_Map_Updater,
284
+ }
@@ -1,6 +1,8 @@
1
+ import math
1
2
  import torch, torchvision, imageio, os
2
3
  import imageio.v3 as iio
3
4
  from PIL import Image
5
+ import torchaudio
4
6
 
5
7
 
6
8
  class DataProcessingPipeline:
@@ -105,27 +107,59 @@ class ToList(DataProcessingOperator):
105
107
  return [data]
106
108
 
107
109
 
108
- class LoadVideo(DataProcessingOperator):
109
- def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_processor=lambda x: x):
110
+ class FrameSamplerByRateMixin:
111
+ def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_rate=24, fix_frame_rate=False):
110
112
  self.num_frames = num_frames
111
113
  self.time_division_factor = time_division_factor
112
114
  self.time_division_remainder = time_division_remainder
113
- # frame_processor is build in the video loader for high efficiency.
114
- self.frame_processor = frame_processor
115
-
115
+ self.frame_rate = frame_rate
116
+ self.fix_frame_rate = fix_frame_rate
117
+
118
+ def get_reader(self, data: str):
119
+ return imageio.get_reader(data)
120
+
121
+ def get_available_num_frames(self, reader):
122
+ if not self.fix_frame_rate:
123
+ return reader.count_frames()
124
+ meta_data = reader.get_meta_data()
125
+ total_original_frames = int(reader.count_frames())
126
+ duration = meta_data["duration"] if "duration" in meta_data else total_original_frames / meta_data['fps']
127
+ total_available_frames = math.floor(duration * self.frame_rate)
128
+ return int(total_available_frames)
129
+
116
130
  def get_num_frames(self, reader):
117
131
  num_frames = self.num_frames
118
- if int(reader.count_frames()) < num_frames:
119
- num_frames = int(reader.count_frames())
132
+ total_frames = self.get_available_num_frames(reader)
133
+ if int(total_frames) < num_frames:
134
+ num_frames = total_frames
120
135
  while num_frames > 1 and num_frames % self.time_division_factor != self.time_division_remainder:
121
136
  num_frames -= 1
122
137
  return num_frames
123
-
138
+
139
+ def map_single_frame_id(self, new_sequence_id: int, raw_frame_rate: float, total_raw_frames: int) -> int:
140
+ if not self.fix_frame_rate:
141
+ return new_sequence_id
142
+ target_time_in_seconds = new_sequence_id / self.frame_rate
143
+ raw_frame_index_float = target_time_in_seconds * raw_frame_rate
144
+ frame_id = int(round(raw_frame_index_float))
145
+ frame_id = min(frame_id, total_raw_frames - 1)
146
+ return frame_id
147
+
148
+
149
+ class LoadVideo(DataProcessingOperator, FrameSamplerByRateMixin):
150
+ def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_processor=lambda x: x, frame_rate=24, fix_frame_rate=False):
151
+ FrameSamplerByRateMixin.__init__(self, num_frames, time_division_factor, time_division_remainder, frame_rate, fix_frame_rate)
152
+ # frame_processor is build in the video loader for high efficiency.
153
+ self.frame_processor = frame_processor
154
+
124
155
  def __call__(self, data: str):
125
- reader = imageio.get_reader(data)
156
+ reader = self.get_reader(data)
157
+ raw_frame_rate = reader.get_meta_data()['fps']
126
158
  num_frames = self.get_num_frames(reader)
159
+ total_raw_frames = reader.count_frames()
127
160
  frames = []
128
161
  for frame_id in range(num_frames):
162
+ frame_id = self.map_single_frame_id(frame_id, raw_frame_rate, total_raw_frames)
129
163
  frame = reader.get_data(frame_id)
130
164
  frame = Image.fromarray(frame)
131
165
  frame = self.frame_processor(frame)
@@ -149,7 +183,7 @@ class LoadGIF(DataProcessingOperator):
149
183
  self.time_division_remainder = time_division_remainder
150
184
  # frame_processor is build in the video loader for high efficiency.
151
185
  self.frame_processor = frame_processor
152
-
186
+
153
187
  def get_num_frames(self, path):
154
188
  num_frames = self.num_frames
155
189
  images = iio.imread(path, mode="RGB")
@@ -220,14 +254,17 @@ class LoadAudio(DataProcessingOperator):
220
254
  return input_audio
221
255
 
222
256
 
223
- class LoadAudioWithTorchaudio(DataProcessingOperator):
224
- def __init__(self, duration=5):
225
- self.duration = duration
257
+ class LoadAudioWithTorchaudio(DataProcessingOperator, FrameSamplerByRateMixin):
258
+
259
+ def __init__(self, num_frames=121, time_division_factor=8, time_division_remainder=1, frame_rate=24, fix_frame_rate=True):
260
+ FrameSamplerByRateMixin.__init__(self, num_frames, time_division_factor, time_division_remainder, frame_rate, fix_frame_rate)
226
261
 
227
262
  def __call__(self, data: str):
228
- import torchaudio
263
+ reader = self.get_reader(data)
264
+ num_frames = self.get_num_frames(reader)
265
+ duration = num_frames / self.frame_rate
229
266
  waveform, sample_rate = torchaudio.load(data)
230
- target_samples = int(self.duration * sample_rate)
267
+ target_samples = int(duration * sample_rate)
231
268
  current_samples = waveform.shape[-1]
232
269
  if current_samples > target_samples:
233
270
  waveform = waveform[..., :target_samples]
@@ -42,6 +42,7 @@ class UnifiedDataset(torch.utils.data.Dataset):
42
42
  max_pixels=1920*1080, height=None, width=None,
43
43
  height_division_factor=16, width_division_factor=16,
44
44
  num_frames=81, time_division_factor=4, time_division_remainder=1,
45
+ frame_rate=24, fix_frame_rate=False,
45
46
  ):
46
47
  return RouteByType(operator_map=[
47
48
  (str, ToAbsolutePath(base_path) >> RouteByExtensionName(operator_map=[
@@ -53,6 +54,7 @@ class UnifiedDataset(torch.utils.data.Dataset):
53
54
  (("mp4", "avi", "mov", "wmv", "mkv", "flv", "webm"), LoadVideo(
54
55
  num_frames, time_division_factor, time_division_remainder,
55
56
  frame_processor=ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor),
57
+ frame_rate=frame_rate, fix_frame_rate=fix_frame_rate,
56
58
  )),
57
59
  ])),
58
60
  ])
@@ -417,7 +417,7 @@ class AutoWrappedLinear(torch.nn.Linear, AutoTorchModule):
417
417
  def lora_forward(self, x, out):
418
418
  if self.lora_merger is None:
419
419
  for lora_A, lora_B in zip(self.lora_A_weights, self.lora_B_weights):
420
- out = out + x @ lora_A.T @ lora_B.T
420
+ out = out + x @ lora_A.T.to(device=x.device, dtype=x.dtype) @ lora_B.T.to(device=x.device, dtype=x.dtype)
421
421
  else:
422
422
  lora_output = []
423
423
  for lora_A, lora_B in zip(self.lora_A_weights, self.lora_B_weights):
@@ -147,6 +147,12 @@ class BasePipeline(torch.nn.Module):
147
147
  video = [self.vae_output_to_image(image, pattern="H W C", min_value=min_value, max_value=max_value) for image in vae_output]
148
148
  return video
149
149
 
150
+ def output_audio_format_check(self, audio_output):
151
+ # output standard foramt: [C, T], output dtype: float()
152
+ # remove batch dim
153
+ if audio_output.ndim == 3:
154
+ audio_output = audio_output.squeeze(0)
155
+ return audio_output.float()
150
156
 
151
157
  def load_models_to_device(self, model_names):
152
158
  if self.vram_management_enabled:
@@ -1,9 +1,32 @@
1
- import torch, json, os
1
+ import torch, json, os, inspect
2
2
  from ..core import ModelConfig, load_state_dict
3
3
  from ..utils.controlnet import ControlNetInput
4
+ from .base_pipeline import PipelineUnit
4
5
  from peft import LoraConfig, inject_adapter_in_model
5
6
 
6
7
 
8
+ class GeneralUnit_RemoveCache(PipelineUnit):
9
+ def __init__(self, required_params=tuple(), force_remove_params_shared=tuple(), force_remove_params_posi=tuple(), force_remove_params_nega=tuple()):
10
+ super().__init__(take_over=True)
11
+ self.required_params = required_params
12
+ self.force_remove_params_shared = force_remove_params_shared
13
+ self.force_remove_params_posi = force_remove_params_posi
14
+ self.force_remove_params_nega = force_remove_params_nega
15
+
16
+ def process_params(self, inputs, required_params, force_remove_params):
17
+ inputs_ = {}
18
+ for name, param in inputs.items():
19
+ if name in required_params and name not in force_remove_params:
20
+ inputs_[name] = param
21
+ return inputs_
22
+
23
+ def process(self, pipe, inputs_shared, inputs_posi, inputs_nega):
24
+ inputs_shared = self.process_params(inputs_shared, self.required_params, self.force_remove_params_shared)
25
+ inputs_posi = self.process_params(inputs_posi, self.required_params, self.force_remove_params_posi)
26
+ inputs_nega = self.process_params(inputs_nega, self.required_params, self.force_remove_params_nega)
27
+ return inputs_shared, inputs_posi, inputs_nega
28
+
29
+
7
30
  class DiffusionTrainingModule(torch.nn.Module):
8
31
  def __init__(self):
9
32
  super().__init__()
@@ -231,14 +254,30 @@ class DiffusionTrainingModule(torch.nn.Module):
231
254
  setattr(pipe, lora_base_model, model)
232
255
 
233
256
 
234
- def split_pipeline_units(self, task, pipe, trainable_models=None, lora_base_model=None):
257
+ def split_pipeline_units(
258
+ self, task, pipe,
259
+ trainable_models=None, lora_base_model=None,
260
+ # TODO: set `remove_unnecessary_params` to `True` by default
261
+ remove_unnecessary_params=False,
262
+ # TODO: move `loss_required_params` to `loss.py`
263
+ loss_required_params=("input_latents", "max_timestep_boundary", "min_timestep_boundary", "first_frame_latents", "video_latents", "audio_input_latents", "num_inference_steps"),
264
+ force_remove_params_shared=tuple(),
265
+ force_remove_params_posi=tuple(),
266
+ force_remove_params_nega=tuple(),
267
+ ):
235
268
  models_require_backward = []
236
269
  if trainable_models is not None:
237
270
  models_require_backward += trainable_models.split(",")
238
271
  if lora_base_model is not None:
239
272
  models_require_backward += [lora_base_model]
240
273
  if task.endswith(":data_process"):
241
- _, pipe.units = pipe.split_pipeline_units(models_require_backward)
274
+ other_units, pipe.units = pipe.split_pipeline_units(models_require_backward)
275
+ if remove_unnecessary_params:
276
+ required_params = list(loss_required_params) + [i for i in inspect.signature(self.pipe.model_fn).parameters]
277
+ for unit in other_units:
278
+ required_params.extend(unit.fetch_input_params())
279
+ required_params = sorted(list(set(required_params)))
280
+ pipe.units.append(GeneralUnit_RemoveCache(required_params, force_remove_params_shared, force_remove_params_posi, force_remove_params_nega))
242
281
  elif task.endswith(":train"):
243
282
  pipe.units, _ = pipe.split_pipeline_units(models_require_backward)
244
283
  return pipe