diffsynth 2.0.9__tar.gz → 2.0.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffsynth-2.0.9 → diffsynth-2.0.11}/PKG-INFO +1 -1
- {diffsynth-2.0.9 → diffsynth-2.0.11}/README.md +327 -22
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/configs/model_configs.py +144 -1
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/configs/vram_management_module_maps.py +93 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/data/operators.py +25 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/base_pipeline.py +38 -5
- diffsynth-2.0.11/diffsynth/diffusion/ddim_scheduler.py +107 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/flow_match.py +24 -1
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/loss.py +5 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/parsers.py +6 -0
- diffsynth-2.0.11/diffsynth/diffusion/template.py +203 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/training_module.py +67 -8
- diffsynth-2.0.11/diffsynth/models/ace_step_conditioner.py +695 -0
- diffsynth-2.0.11/diffsynth/models/ace_step_dit.py +901 -0
- diffsynth-2.0.11/diffsynth/models/ace_step_text_encoder.py +53 -0
- diffsynth-2.0.11/diffsynth/models/ace_step_tokenizer.py +722 -0
- diffsynth-2.0.11/diffsynth/models/ace_step_vae.py +281 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/dinov3_image_encoder.py +11 -7
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux2_dit.py +82 -137
- diffsynth-2.0.11/diffsynth/models/joyai_image_dit.py +636 -0
- diffsynth-2.0.11/diffsynth/models/joyai_image_text_encoder.py +82 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/siglip2_image_encoder.py +13 -7
- diffsynth-2.0.11/diffsynth/models/stable_diffusion_text_encoder.py +216 -0
- diffsynth-2.0.11/diffsynth/models/stable_diffusion_unet.py +912 -0
- diffsynth-2.0.11/diffsynth/models/stable_diffusion_vae.py +642 -0
- diffsynth-2.0.11/diffsynth/models/stable_diffusion_xl_text_encoder.py +69 -0
- diffsynth-2.0.11/diffsynth/models/stable_diffusion_xl_unet.py +922 -0
- diffsynth-2.0.11/diffsynth/pipelines/ace_step.py +582 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/anima_image.py +1 -1
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/flux2_image.py +51 -2
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/flux_image.py +1 -6
- diffsynth-2.0.11/diffsynth/pipelines/joyai_image.py +282 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/ltx2_audio_video.py +29 -29
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/mova_audio_video.py +18 -18
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/qwen_image.py +1 -1
- diffsynth-2.0.11/diffsynth/pipelines/stable_diffusion.py +230 -0
- diffsynth-2.0.11/diffsynth/pipelines/stable_diffusion_xl.py +331 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/wan_video.py +54 -54
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/z_image.py +2 -2
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/data/audio.py +1 -0
- diffsynth-2.0.11/diffsynth/utils/state_dict_converters/ace_step_conditioner.py +13 -0
- diffsynth-2.0.11/diffsynth/utils/state_dict_converters/ace_step_dit.py +10 -0
- diffsynth-2.0.11/diffsynth/utils/state_dict_converters/ace_step_text_encoder.py +15 -0
- diffsynth-2.0.11/diffsynth/utils/state_dict_converters/ace_step_tokenizer.py +8 -0
- diffsynth-2.0.11/diffsynth/utils/state_dict_converters/dino_v3.py +9 -0
- diffsynth-2.0.11/diffsynth/utils/state_dict_converters/joyai_image_text_encoder.py +20 -0
- diffsynth-2.0.11/diffsynth/utils/state_dict_converters/stable_diffusion_text_encoder.py +7 -0
- diffsynth-2.0.11/diffsynth/utils/state_dict_converters/stable_diffusion_vae.py +18 -0
- diffsynth-2.0.11/diffsynth/utils/state_dict_converters/stable_diffusion_xl_text_encoder.py +13 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth.egg-info/PKG-INFO +1 -1
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth.egg-info/SOURCES.txt +27 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/pyproject.toml +1 -1
- {diffsynth-2.0.9 → diffsynth-2.0.11}/LICENSE +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/configs/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/attention/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/attention/attention.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/data/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/data/unified_dataset.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/device/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/device/npu_compatible_device.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/gradient/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/gradient/gradient_checkpoint.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/loader/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/loader/config.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/loader/file.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/loader/model.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/vram/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/vram/disk_map.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/vram/initialization.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/core/vram/layers.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/logger.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/diffusion/runner.py +2 -2
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/anima_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ernie_image_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ernie_image_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux2_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_controlnet.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_ipadapter.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_lora_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_lora_patcher.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/flux_value_control.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/general_modules.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/longcat_video_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ltx2_audio_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ltx2_common.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ltx2_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ltx2_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ltx2_upsampler.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/ltx2_video_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/model_loader.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/mova_audio_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/mova_audio_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/mova_dual_tower_bridge.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/nexus_gen.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/nexus_gen_ar_model.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/qwen_image_controlnet.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/qwen_image_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/qwen_image_image2lora.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/qwen_image_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/sd_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/step1x_connector.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/step1x_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_camera_controller.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_dit_s2v.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_mot.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_motion_controller.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_vace.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wan_video_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wantodance.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/wav2vec.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/z_image_controlnet.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/z_image_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/z_image_image2lora.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/models/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/pipelines/ernie_image.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/controlnet/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/controlnet/annotator.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/data/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/data/audio_video.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/data/media_io_ltx2.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/lora/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/lora/flux.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/lora/general.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/lora/merge.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/lora/reset_rank.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/ses/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/ses/ses.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/ernie_image_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/z_image_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/xfuser/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/utils/xfuser/xdit_context_parallel.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth/version.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth.egg-info/dependency_links.txt +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth.egg-info/requires.txt +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/diffsynth.egg-info/top_level.txt +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.11}/setup.cfg +0 -0
|
@@ -34,6 +34,21 @@ We believe that a well-developed open-source code framework can lower the thresh
|
|
|
34
34
|
|
|
35
35
|
> Currently, the development personnel of this project are limited, with most of the work handled by [Artiprocher](https://github.com/Artiprocher) and [mi804](https://github.com/mi804). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand.
|
|
36
36
|
|
|
37
|
+
- **April 28, 2026** 🔥 We are excited to announce the release of **Diffusion Templates**, a plugin framework designed for Diffusion models that significantly lowers the barrier to training controllable generative models. Let's explore this cutting-edge technology together!
|
|
38
|
+
* Open-source code: [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio)
|
|
39
|
+
* Technical report: [arXiv](https://arxiv.org/abs/2604.24351)
|
|
40
|
+
* Project homepage: [GitHub](https://modelscope.github.io/diffusion-templates-web/)
|
|
41
|
+
* Documentation: [English Version](https://diffsynth-studio-doc.readthedocs.io/en/latest/Diffusion_Templates/Introducing_Diffusion_Templates.html) | [Chinese Version](https://diffsynth-studio-doc.readthedocs.io/zh-cn/latest/Diffusion_Templates/Introducing_Diffusion_Templates.html)
|
|
42
|
+
* Online demo: [ModelScope](https://modelscope.cn/studios/DiffSynth-Studio/Diffusion-Templates)
|
|
43
|
+
* Model collections: [ModelScope](https://modelscope.cn/collections/DiffSynth-Studio/KleinBase4B-Templates) | [ModelScope International](https://modelscope.ai/collections/DiffSynth-Studio/KleinBase4B-Templates) | [HuggingFace](https://huggingface.co/collections/DiffSynth-Studio/kleinbase4b-templates)
|
|
44
|
+
* Datasets: [ModelScope](https://modelscope.cn/collections/DiffSynth-Studio/ImagePulseV2) | [ModelScope International](https://modelscope.ai/collections/DiffSynth-Studio/ImagePulseV2) | [HuggingFace](https://huggingface.co/collections/DiffSynth-Studio/imagepulsev2)
|
|
45
|
+
|
|
46
|
+
- **April 27, 2026** We support ACE-Step-1.5! Support includes text-to-music generation, low VRAM inference, and LoRA training capabilities. For details, please refer to the [documentation](/docs/en/Model_Details/ACE-Step.md) and [example code](/examples/ace_step/).
|
|
47
|
+
|
|
48
|
+
- **April 27, 2026**: We have reinstated support for the Stable Diffusion v1.5 and SDXL models, providing academic research support exclusively for these two model types.
|
|
49
|
+
|
|
50
|
+
- **April 14, 2026** JoyAI-Image open-sourced, welcome a new member to the image editing model family! Support includes instruction-guided image editing, low VRAM inference, and training capabilities. For details, please refer to the [documentation](/docs/en/Model_Details/JoyAI-Image.md) and [example code](/examples/joyai_image/).
|
|
51
|
+
|
|
37
52
|
- **March 19, 2026**: Added support for [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) and [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/Wan.md) and [example code](/examples/mova/) are now available.
|
|
38
53
|
|
|
39
54
|
- **March 12, 2026**: We have added support for the [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) audio-video generation model. The features includes text-to-audio/video, image-to-audio/video, IC-LoRA control, audio-to-video, and audio-video inpainting. We have supported the complete inference and training functionalities. For details, please refer to the [documentation](/docs/en/Model_Details/LTX-2.md) and [code](/examples/ltx2/).
|
|
@@ -90,7 +105,7 @@ We believe that a well-developed open-source code framework can lower the thresh
|
|
|
90
105
|
|
|
91
106
|
- **August 20, 2025** We open-sourced the [DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix) model, improving the editing effect of Qwen-Image-Edit on low-resolution image inputs. Please refer to [our sample code](./examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py)
|
|
92
107
|
|
|
93
|
-
- **August 19, 2025**
|
|
108
|
+
- **August 19, 2025** Qwen-Image-Edit open-sourced, welcome a new member to the image editing model family!
|
|
94
109
|
|
|
95
110
|
- **August 18, 2025** We trained and open-sourced the Qwen-Image inpainting ControlNet model [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint). The model structure adopts a lightweight design. Please refer to [our sample code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Inpaint.py).
|
|
96
111
|
|
|
@@ -106,7 +121,7 @@ We believe that a well-developed open-source code framework can lower the thresh
|
|
|
106
121
|
|
|
107
122
|
- **August 5, 2025** We open-sourced the distilled acceleration model [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full) for Qwen-Image, achieving approximately 5x acceleration.
|
|
108
123
|
|
|
109
|
-
- **August 4, 2025**
|
|
124
|
+
- **August 4, 2025** Qwen-Image open-sourced, welcome a new member to the image generation model family!
|
|
110
125
|
|
|
111
126
|
- **August 1, 2025** [FLUX.1-Krea-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-Krea-dev) open-sourced, a text-to-image model focused on aesthetic photography. We provided comprehensive support in a timely manner, including low VRAM layer-by-layer offload, LoRA training, and full training. For more details, please refer to [./examples/flux/](./examples/flux/).
|
|
112
127
|
|
|
@@ -297,6 +312,129 @@ Example code for Z-Image is available at: [/examples/z_image/](/examples/z_image
|
|
|
297
312
|
|
|
298
313
|
</details>
|
|
299
314
|
|
|
315
|
+
#### Stable Diffusion: [/docs/en/Model_Details/Stable-Diffusion.md](/docs/en/Model_Details/Stable-Diffusion.md)
|
|
316
|
+
|
|
317
|
+
<details>
|
|
318
|
+
|
|
319
|
+
<summary>Quick Start</summary>
|
|
320
|
+
|
|
321
|
+
Running the following code will quickly load the [AI-ModelScope/stable-diffusion-v1-5](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5) model for inference. VRAM management is enabled, the framework automatically controls parameter loading based on available VRAM, requiring a minimum of 2GB VRAM.
|
|
322
|
+
|
|
323
|
+
```python
|
|
324
|
+
import torch
|
|
325
|
+
from diffsynth.core import ModelConfig
|
|
326
|
+
from diffsynth.pipelines.stable_diffusion import StableDiffusionPipeline
|
|
327
|
+
|
|
328
|
+
vram_config = {
|
|
329
|
+
"offload_dtype": torch.float32,
|
|
330
|
+
"offload_device": "cpu",
|
|
331
|
+
"onload_dtype": torch.float32,
|
|
332
|
+
"onload_device": "cpu",
|
|
333
|
+
"preparing_dtype": torch.float32,
|
|
334
|
+
"preparing_device": "cuda",
|
|
335
|
+
"computation_dtype": torch.float32,
|
|
336
|
+
"computation_device": "cuda",
|
|
337
|
+
}
|
|
338
|
+
pipe = StableDiffusionPipeline.from_pretrained(
|
|
339
|
+
torch_dtype=torch.float32,
|
|
340
|
+
model_configs=[
|
|
341
|
+
ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
|
|
342
|
+
ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="unet/diffusion_pytorch_model.safetensors", **vram_config),
|
|
343
|
+
ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
|
|
344
|
+
],
|
|
345
|
+
tokenizer_config=ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="tokenizer/"),
|
|
346
|
+
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
|
|
347
|
+
)
|
|
348
|
+
|
|
349
|
+
image = pipe(
|
|
350
|
+
prompt="a photo of an astronaut riding a horse on mars, high quality, detailed",
|
|
351
|
+
negative_prompt="blurry, low quality, deformed",
|
|
352
|
+
cfg_scale=7.5,
|
|
353
|
+
height=512,
|
|
354
|
+
width=512,
|
|
355
|
+
seed=42,
|
|
356
|
+
rand_device="cuda",
|
|
357
|
+
num_inference_steps=50,
|
|
358
|
+
)
|
|
359
|
+
image.save("image.jpg")
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
</details>
|
|
363
|
+
|
|
364
|
+
<details>
|
|
365
|
+
|
|
366
|
+
<summary>Examples</summary>
|
|
367
|
+
|
|
368
|
+
Example code for Stable Diffusion is available at: [/examples/stable_diffusion/](/examples/stable_diffusion/)
|
|
369
|
+
|
|
370
|
+
|Model ID|Inference|Low VRAM Inference|Full Training|Full Training Validation|LoRA Training|LoRA Training Validation|
|
|
371
|
+
|-|-|-|-|-|-|-|
|
|
372
|
+
|[AI-ModelScope/stable-diffusion-v1-5](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5)|[code](/examples/stable_diffusion/model_inference/stable-diffusion-v1-5.py)|[code](/examples/stable_diffusion/model_inference_low_vram/stable-diffusion-v1-5.py)|[code](/examples/stable_diffusion/model_training/full/stable-diffusion-v1-5.sh)|[code](/examples/stable_diffusion/model_training/validate_full/stable-diffusion-v1-5.py)|[code](/examples/stable_diffusion/model_training/lora/stable-diffusion-v1-5.sh)|[code](/examples/stable_diffusion/model_training/validate_lora/stable-diffusion-v1-5.py)|
|
|
373
|
+
|
|
374
|
+
</details>
|
|
375
|
+
|
|
376
|
+
#### Stable Diffusion XL: [/docs/en/Model_Details/Stable-Diffusion-XL.md](/docs/en/Model_Details/Stable-Diffusion-XL.md)
|
|
377
|
+
|
|
378
|
+
<details>
|
|
379
|
+
|
|
380
|
+
<summary>Quick Start</summary>
|
|
381
|
+
|
|
382
|
+
Running the following code will quickly load the [stabilityai/stable-diffusion-xl-base-1.0](https://www.modelscope.cn/models/stabilityai/stable-diffusion-xl-base-1.0) model for inference. VRAM management is enabled, the framework automatically controls parameter loading based on available VRAM, requiring a minimum of 6GB VRAM.
|
|
383
|
+
|
|
384
|
+
```python
|
|
385
|
+
import torch
|
|
386
|
+
from diffsynth.core import ModelConfig
|
|
387
|
+
from diffsynth.pipelines.stable_diffusion_xl import StableDiffusionXLPipeline
|
|
388
|
+
|
|
389
|
+
vram_config = {
|
|
390
|
+
"offload_dtype": torch.float32,
|
|
391
|
+
"offload_device": "cpu",
|
|
392
|
+
"onload_dtype": torch.float32,
|
|
393
|
+
"onload_device": "cpu",
|
|
394
|
+
"preparing_dtype": torch.float32,
|
|
395
|
+
"preparing_device": "cuda",
|
|
396
|
+
"computation_dtype": torch.float32,
|
|
397
|
+
"computation_device": "cuda",
|
|
398
|
+
}
|
|
399
|
+
pipe = StableDiffusionXLPipeline.from_pretrained(
|
|
400
|
+
torch_dtype=torch.float32,
|
|
401
|
+
model_configs=[
|
|
402
|
+
ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
|
|
403
|
+
ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder_2/model.safetensors", **vram_config),
|
|
404
|
+
ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="unet/diffusion_pytorch_model.safetensors", **vram_config),
|
|
405
|
+
ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
|
|
406
|
+
],
|
|
407
|
+
tokenizer_config=ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="tokenizer/"),
|
|
408
|
+
tokenizer_2_config=ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="tokenizer_2/"),
|
|
409
|
+
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
|
|
410
|
+
)
|
|
411
|
+
|
|
412
|
+
image = pipe(
|
|
413
|
+
prompt="a photo of an astronaut riding a horse on mars",
|
|
414
|
+
negative_prompt="",
|
|
415
|
+
cfg_scale=5.0,
|
|
416
|
+
height=1024,
|
|
417
|
+
width=1024,
|
|
418
|
+
seed=42,
|
|
419
|
+
num_inference_steps=50,
|
|
420
|
+
)
|
|
421
|
+
image.save("image.jpg")
|
|
422
|
+
```
|
|
423
|
+
|
|
424
|
+
</details>
|
|
425
|
+
|
|
426
|
+
<details>
|
|
427
|
+
|
|
428
|
+
<summary>Examples</summary>
|
|
429
|
+
|
|
430
|
+
Example code for Stable Diffusion XL is available at: [/examples/stable_diffusion_xl/](/examples/stable_diffusion_xl/)
|
|
431
|
+
|
|
432
|
+
|Model ID|Inference|Low VRAM Inference|Full Training|Full Training Validation|LoRA Training|LoRA Training Validation|
|
|
433
|
+
|-|-|-|-|-|-|-|
|
|
434
|
+
|[stabilityai/stable-diffusion-xl-base-1.0](https://www.modelscope.cn/models/stabilityai/stable-diffusion-xl-base-1.0)|[code](/examples/stable_diffusion_xl/model_inference/stable-diffusion-xl-base-1.0.py)|[code](/examples/stable_diffusion_xl/model_inference_low_vram/stable-diffusion-xl-base-1.0.py)|[code](/examples/stable_diffusion_xl/model_training/full/stable-diffusion-xl-base-1.0.sh)|[code](/examples/stable_diffusion_xl/model_training/validate_full/stable-diffusion-xl-base-1.0.py)|[code](/examples/stable_diffusion_xl/model_training/lora/stable-diffusion-xl-base-1.0.sh)|[code](/examples/stable_diffusion_xl/model_training/validate_lora/stable-diffusion-xl-base-1.0.py)|
|
|
435
|
+
|
|
436
|
+
</details>
|
|
437
|
+
|
|
300
438
|
#### FLUX.2: [/docs/en/Model_Details/FLUX2.md](/docs/en/Model_Details/FLUX2.md)
|
|
301
439
|
|
|
302
440
|
<details>
|
|
@@ -350,6 +488,17 @@ Example code for FLUX.2 is available at: [/examples/flux2/](/examples/flux2/)
|
|
|
350
488
|
|[black-forest-labs/FLUX.2-klein-9B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-9B)|[code](/examples/flux2/model_inference/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-9B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-9B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-9B.py)|
|
|
351
489
|
|[black-forest-labs/FLUX.2-klein-base-4B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-base-4B)|[code](/examples/flux2/model_inference/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-base-4B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-base-4B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-base-4B.py)|
|
|
352
490
|
|[black-forest-labs/FLUX.2-klein-base-9B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-base-9B)|[code](/examples/flux2/model_inference/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-base-9B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-base-9B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-base-9B.py)|
|
|
491
|
+
|[DiffSynth-Studio/Template-KleinBase4B-Aesthetic](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Aesthetic)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Aesthetic.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Aesthetic.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Aesthetic.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Aesthetic.py)|-|-|
|
|
492
|
+
|[DiffSynth-Studio/Template-KleinBase4B-Brightness](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Brightness)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Brightness.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Brightness.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Brightness.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Brightness.py)|-|-|
|
|
493
|
+
|[DiffSynth-Studio/Template-KleinBase4B-Age](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Age)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Age.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Age.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Age.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Age.py)|-|-|
|
|
494
|
+
|[DiffSynth-Studio/Template-KleinBase4B-ControlNet](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-ControlNet)|[code](/examples/flux2/model_inference/Template-KleinBase4B-ControlNet.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-ControlNet.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-ControlNet.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-ControlNet.py)|-|-|
|
|
495
|
+
|[DiffSynth-Studio/Template-KleinBase4B-Edit](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Edit)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Edit.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Edit.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Edit.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Edit.py)|-|-|
|
|
496
|
+
|[DiffSynth-Studio/Template-KleinBase4B-Inpaint](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Inpaint)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Inpaint.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Inpaint.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Inpaint.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Inpaint.py)|-|-|
|
|
497
|
+
|[DiffSynth-Studio/Template-KleinBase4B-PandaMeme](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-PandaMeme)|[code](/examples/flux2/model_inference/Template-KleinBase4B-PandaMeme.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-PandaMeme.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-PandaMeme.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-PandaMeme.py)|-|-|
|
|
498
|
+
|[DiffSynth-Studio/Template-KleinBase4B-Sharpness](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Sharpness)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Sharpness.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Sharpness.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Sharpness.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Sharpness.py)|-|-|
|
|
499
|
+
|[DiffSynth-Studio/Template-KleinBase4B-SoftRGB](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-SoftRGB)|[code](/examples/flux2/model_inference/Template-KleinBase4B-SoftRGB.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-SoftRGB.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-SoftRGB.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-SoftRGB.py)|-|-|
|
|
500
|
+
|[DiffSynth-Studio/Template-KleinBase4B-Upscaler](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Upscaler)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Upscaler.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Upscaler.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Upscaler.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Upscaler.py)|-|-|
|
|
501
|
+
|[DiffSynth-Studio/Template-KleinBase4B-ContentRef](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-ContentRef)|[code](/examples/flux2/model_inference/Template-KleinBase4B-ContentRef.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-ContentRef.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-ContentRef.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-ContentRef.py)|-|-|
|
|
353
502
|
|
|
354
503
|
</details>
|
|
355
504
|
|
|
@@ -598,6 +747,143 @@ Example code for FLUX.1 is available at: [/examples/flux/](/examples/flux/)
|
|
|
598
747
|
|
|
599
748
|
</details>
|
|
600
749
|
|
|
750
|
+
#### ERNIE-Image: [/docs/en/Model_Details/ERNIE-Image.md](/docs/en/Model_Details/ERNIE-Image.md)
|
|
751
|
+
|
|
752
|
+
<details>
|
|
753
|
+
|
|
754
|
+
<summary>Quick Start</summary>
|
|
755
|
+
|
|
756
|
+
Running the following code will quickly load the [PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.
|
|
757
|
+
|
|
758
|
+
```python
|
|
759
|
+
from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig
|
|
760
|
+
import torch
|
|
761
|
+
|
|
762
|
+
vram_config = {
|
|
763
|
+
"offload_dtype": torch.bfloat16,
|
|
764
|
+
"offload_device": "cpu",
|
|
765
|
+
"onload_dtype": torch.bfloat16,
|
|
766
|
+
"onload_device": "cpu",
|
|
767
|
+
"preparing_dtype": torch.bfloat16,
|
|
768
|
+
"preparing_device": "cuda",
|
|
769
|
+
"computation_dtype": torch.bfloat16,
|
|
770
|
+
"computation_device": "cuda",
|
|
771
|
+
}
|
|
772
|
+
pipe = ErnieImagePipeline.from_pretrained(
|
|
773
|
+
torch_dtype=torch.bfloat16,
|
|
774
|
+
device='cuda',
|
|
775
|
+
model_configs=[
|
|
776
|
+
ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config),
|
|
777
|
+
ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
|
|
778
|
+
ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
|
|
779
|
+
],
|
|
780
|
+
tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"),
|
|
781
|
+
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
|
|
782
|
+
)
|
|
783
|
+
|
|
784
|
+
image = pipe(
|
|
785
|
+
prompt="一只黑白相间的中华田园犬",
|
|
786
|
+
negative_prompt="",
|
|
787
|
+
height=1024,
|
|
788
|
+
width=1024,
|
|
789
|
+
seed=42,
|
|
790
|
+
num_inference_steps=50,
|
|
791
|
+
cfg_scale=4.0,
|
|
792
|
+
)
|
|
793
|
+
image.save("output.jpg")
|
|
794
|
+
```
|
|
795
|
+
|
|
796
|
+
</details>
|
|
797
|
+
|
|
798
|
+
<details>
|
|
799
|
+
|
|
800
|
+
<summary>Examples</summary>
|
|
801
|
+
|
|
802
|
+
Example code for ERNIE-Image is available at: [/examples/ernie_image/](/examples/ernie_image/)
|
|
803
|
+
|
|
804
|
+
| Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
805
|
+
|-|-|-|-|-|-|-|
|
|
806
|
+
|[PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image)|[code](/examples/ernie_image/model_inference/ERNIE-Image.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/full/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_full/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/lora/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py)|
|
|
807
|
+
|[PaddlePaddle/ERNIE-Image-Turbo](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image-Turbo)|[code](/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py)|—|—|—|—|
|
|
808
|
+
|
|
809
|
+
</details>
|
|
810
|
+
|
|
811
|
+
#### JoyAI-Image: [/docs/en/Model_Details/JoyAI-Image.md](/docs/en/Model_Details/JoyAI-Image.md)
|
|
812
|
+
|
|
813
|
+
<details>
|
|
814
|
+
|
|
815
|
+
<summary>Quick Start</summary>
|
|
816
|
+
|
|
817
|
+
Running the following code will quickly load the [jd-opensource/JoyAI-Image-Edit](https://modelscope.cn/models/jd-opensource/JoyAI-Image-Edit) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 4GB VRAM.
|
|
818
|
+
|
|
819
|
+
```python
|
|
820
|
+
from diffsynth.pipelines.joyai_image import JoyAIImagePipeline, ModelConfig
|
|
821
|
+
import torch
|
|
822
|
+
from PIL import Image
|
|
823
|
+
from modelscope import dataset_snapshot_download
|
|
824
|
+
|
|
825
|
+
# Download dataset
|
|
826
|
+
dataset_snapshot_download(
|
|
827
|
+
dataset_id="DiffSynth-Studio/diffsynth_example_dataset",
|
|
828
|
+
local_dir="data/diffsynth_example_dataset",
|
|
829
|
+
allow_file_pattern="joyai_image/JoyAI-Image-Edit/*"
|
|
830
|
+
)
|
|
831
|
+
|
|
832
|
+
vram_config = {
|
|
833
|
+
"offload_dtype": torch.bfloat16,
|
|
834
|
+
"offload_device": "cpu",
|
|
835
|
+
"onload_dtype": torch.bfloat16,
|
|
836
|
+
"onload_device": "cpu",
|
|
837
|
+
"preparing_dtype": torch.bfloat16,
|
|
838
|
+
"preparing_device": "cuda",
|
|
839
|
+
"computation_dtype": torch.bfloat16,
|
|
840
|
+
"computation_device": "cuda",
|
|
841
|
+
}
|
|
842
|
+
|
|
843
|
+
pipe = JoyAIImagePipeline.from_pretrained(
|
|
844
|
+
torch_dtype=torch.bfloat16,
|
|
845
|
+
device="cuda",
|
|
846
|
+
model_configs=[
|
|
847
|
+
ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="transformer/transformer.pth", **vram_config),
|
|
848
|
+
ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/model*.safetensors", **vram_config),
|
|
849
|
+
ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="vae/Wan2.1_VAE.pth", **vram_config),
|
|
850
|
+
],
|
|
851
|
+
processor_config=ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/"),
|
|
852
|
+
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
|
|
853
|
+
)
|
|
854
|
+
|
|
855
|
+
# Use first sample from dataset
|
|
856
|
+
dataset_base_path = "data/diffsynth_example_dataset/joyai_image/JoyAI-Image-Edit"
|
|
857
|
+
prompt = "将裙子改为粉色"
|
|
858
|
+
edit_image = Image.open(f"{dataset_base_path}/edit/image1.jpg").convert("RGB")
|
|
859
|
+
|
|
860
|
+
output = pipe(
|
|
861
|
+
prompt=prompt,
|
|
862
|
+
edit_image=edit_image,
|
|
863
|
+
height=1024,
|
|
864
|
+
width=1024,
|
|
865
|
+
seed=0,
|
|
866
|
+
num_inference_steps=30,
|
|
867
|
+
cfg_scale=5.0,
|
|
868
|
+
)
|
|
869
|
+
|
|
870
|
+
output.save("output_joyai_edit_low_vram.png")
|
|
871
|
+
```
|
|
872
|
+
|
|
873
|
+
</details>
|
|
874
|
+
|
|
875
|
+
<details>
|
|
876
|
+
|
|
877
|
+
<summary>Examples</summary>
|
|
878
|
+
|
|
879
|
+
Example code for JoyAI-Image is available at: [/examples/joyai_image/](/examples/joyai_image/)
|
|
880
|
+
|
|
881
|
+
| Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
882
|
+
|-|-|-|-|-|-|-|
|
|
883
|
+
|[jd-opensource/JoyAI-Image-Edit](https://modelscope.cn/models/jd-opensource/JoyAI-Image-Edit)|[code](/examples/joyai_image/model_inference/JoyAI-Image-Edit.py)|[code](/examples/joyai_image/model_inference_low_vram/JoyAI-Image-Edit.py)|[code](/examples/joyai_image/model_training/full/JoyAI-Image-Edit.sh)|[code](/examples/joyai_image/model_training/validate_full/JoyAI-Image-Edit.py)|[code](/examples/joyai_image/model_training/lora/JoyAI-Image-Edit.sh)|[code](/examples/joyai_image/model_training/validate_lora/JoyAI-Image-Edit.py)|
|
|
884
|
+
|
|
885
|
+
</details>
|
|
886
|
+
|
|
601
887
|
### Video Synthesis
|
|
602
888
|
|
|
603
889
|
https://github.com/user-attachments/assets/1d66ae74-3b02-40a9-acc3-ea95fc039314
|
|
@@ -877,18 +1163,22 @@ Example code for Wan is available at: [/examples/wanvideo/](/examples/wanvideo/)
|
|
|
877
1163
|
|
|
878
1164
|
</details>
|
|
879
1165
|
|
|
880
|
-
|
|
1166
|
+
### Audio Synthesis
|
|
1167
|
+
|
|
1168
|
+
#### ACE-Step: [/docs/en/Model_Details/ACE-Step.md](/docs/en/Model_Details/ACE-Step.md)
|
|
881
1169
|
|
|
882
1170
|
<details>
|
|
883
1171
|
|
|
884
1172
|
<summary>Quick Start</summary>
|
|
885
1173
|
|
|
886
|
-
Running the following code will quickly load the [
|
|
1174
|
+
Running the following code will quickly load the [ACE-Step/Ace-Step1.5](https://www.modelscope.cn/models/ACE-Step/Ace-Step1.5) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.
|
|
887
1175
|
|
|
888
1176
|
```python
|
|
889
|
-
from diffsynth.pipelines.
|
|
1177
|
+
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
|
|
1178
|
+
from diffsynth.utils.data.audio import save_audio
|
|
890
1179
|
import torch
|
|
891
1180
|
|
|
1181
|
+
|
|
892
1182
|
vram_config = {
|
|
893
1183
|
"offload_dtype": torch.bfloat16,
|
|
894
1184
|
"offload_device": "cpu",
|
|
@@ -899,28 +1189,34 @@ vram_config = {
|
|
|
899
1189
|
"computation_dtype": torch.bfloat16,
|
|
900
1190
|
"computation_device": "cuda",
|
|
901
1191
|
}
|
|
902
|
-
|
|
1192
|
+
|
|
1193
|
+
|
|
1194
|
+
pipe = AceStepPipeline.from_pretrained(
|
|
903
1195
|
torch_dtype=torch.bfloat16,
|
|
904
|
-
device=
|
|
1196
|
+
device="cuda",
|
|
905
1197
|
model_configs=[
|
|
906
|
-
ModelConfig(model_id="
|
|
907
|
-
ModelConfig(model_id="
|
|
908
|
-
ModelConfig(model_id="
|
|
1198
|
+
ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors", **vram_config),
|
|
1199
|
+
ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config),
|
|
1200
|
+
ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
|
|
909
1201
|
],
|
|
910
|
-
|
|
1202
|
+
text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
|
|
911
1203
|
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
|
|
912
1204
|
)
|
|
913
1205
|
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
1206
|
+
prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
|
|
1207
|
+
lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
|
|
1208
|
+
audio = pipe(
|
|
1209
|
+
prompt=prompt,
|
|
1210
|
+
lyrics=lyrics,
|
|
1211
|
+
duration=160,
|
|
1212
|
+
bpm=100,
|
|
1213
|
+
keyscale="B minor",
|
|
1214
|
+
timesignature="4",
|
|
1215
|
+
vocal_language="zh",
|
|
919
1216
|
seed=42,
|
|
920
|
-
num_inference_steps=50,
|
|
921
|
-
cfg_scale=4.0,
|
|
922
1217
|
)
|
|
923
|
-
|
|
1218
|
+
|
|
1219
|
+
save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo.wav")
|
|
924
1220
|
```
|
|
925
1221
|
|
|
926
1222
|
</details>
|
|
@@ -929,12 +1225,21 @@ image.save("output.jpg")
|
|
|
929
1225
|
|
|
930
1226
|
<summary>Examples</summary>
|
|
931
1227
|
|
|
932
|
-
Example code for
|
|
1228
|
+
Example code for ACE-Step is available at: [/examples/ace_step/](/examples/ace_step/)
|
|
933
1229
|
|
|
934
1230
|
| Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
935
1231
|
|-|-|-|-|-|-|-|
|
|
936
|
-
|[
|
|
937
|
-
|[
|
|
1232
|
+
|[ACE-Step/Ace-Step1.5](https://www.modelscope.cn/models/ACE-Step/Ace-Step1.5)|[code](/examples/ace_step/model_inference/Ace-Step1.5.py)|[code](/examples/ace_step/model_inference_low_vram/Ace-Step1.5.py)|[code](/examples/ace_step/model_training/full/Ace-Step1.5.sh)|[code](/examples/ace_step/model_training/validate_full/Ace-Step1.5.py)|[code](/examples/ace_step/model_training/lora/Ace-Step1.5.sh)|[code](/examples/ace_step/model_training/validate_lora/Ace-Step1.5.py)|
|
|
1233
|
+
|[ACE-Step/acestep-v15-turbo-shift1](https://www.modelscope.cn/models/ACE-Step/acestep-v15-turbo-shift1)|[code](/examples/ace_step/model_inference/acestep-v15-turbo-shift1.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift1.py)|[code](/examples/ace_step/model_training/full/acestep-v15-turbo-shift1.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-turbo-shift1.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-turbo-shift1.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-shift1.py)|
|
|
1234
|
+
|[ACE-Step/acestep-v15-turbo-shift3](https://www.modelscope.cn/models/ACE-Step/acestep-v15-turbo-shift3)|[code](/examples/ace_step/model_inference/acestep-v15-turbo-shift3.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift3.py)|[code](/examples/ace_step/model_training/full/acestep-v15-turbo-shift3.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-turbo-shift3.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-turbo-shift3.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-shift3.py)|
|
|
1235
|
+
|[ACE-Step/acestep-v15-turbo-continuous](https://www.modelscope.cn/models/ACE-Step/acestep-v15-turbo-continuous)|[code](/examples/ace_step/model_inference/acestep-v15-turbo-continuous.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-continuous.py)|[code](/examples/ace_step/model_training/full/acestep-v15-turbo-continuous.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-turbo-continuous.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-turbo-continuous.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-continuous.py)|
|
|
1236
|
+
|[ACE-Step/acestep-v15-base](https://www.modelscope.cn/models/ACE-Step/acestep-v15-base)|[code](/examples/ace_step/model_inference/acestep-v15-base.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-base.py)|[code](/examples/ace_step/model_training/full/acestep-v15-base.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-base.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-base.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-base.py)|
|
|
1237
|
+
|[ACE-Step/acestep-v15-base: CoverTask](https://www.modelscope.cn/models/ACE-Step/acestep-v15-base)|[code](/examples/ace_step/model_inference/acestep-v15-base-CoverTask.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-base-CoverTask.py)|—|—|—|—|
|
|
1238
|
+
|[ACE-Step/acestep-v15-base: RepaintTask](https://www.modelscope.cn/models/ACE-Step/acestep-v15-base)|[code](/examples/ace_step/model_inference/acestep-v15-base-RepaintTask.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-base-RepaintTask.py)|—|—|—|—|
|
|
1239
|
+
|[ACE-Step/acestep-v15-sft](https://www.modelscope.cn/models/ACE-Step/acestep-v15-sft)|[code](/examples/ace_step/model_inference/acestep-v15-sft.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-sft.py)|[code](/examples/ace_step/model_training/full/acestep-v15-sft.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-sft.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-sft.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-sft.py)|
|
|
1240
|
+
|[ACE-Step/acestep-v15-xl-base](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-base)|[code](/examples/ace_step/model_inference/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-base.py)|
|
|
1241
|
+
|[ACE-Step/acestep-v15-xl-sft](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-sft)|[code](/examples/ace_step/model_inference/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-sft.py)|
|
|
1242
|
+
|[ACE-Step/acestep-v15-xl-turbo](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-turbo)|[code](/examples/ace_step/model_inference/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-turbo.py)|
|
|
938
1243
|
|
|
939
1244
|
</details>
|
|
940
1245
|
|
|
@@ -42,6 +42,7 @@ qwen_image_series = [
|
|
|
42
42
|
"model_hash": "5722b5c873720009de96422993b15682",
|
|
43
43
|
"model_name": "dinov3_image_encoder",
|
|
44
44
|
"model_class": "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder",
|
|
45
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.dino_v3.DINOv3StateDictConverter",
|
|
45
46
|
},
|
|
46
47
|
{
|
|
47
48
|
# Example:
|
|
@@ -900,4 +901,146 @@ mova_series = [
|
|
|
900
901
|
"model_class": "diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge",
|
|
901
902
|
},
|
|
902
903
|
]
|
|
903
|
-
|
|
904
|
+
stable_diffusion_xl_series = [
|
|
905
|
+
{
|
|
906
|
+
# Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="unet/diffusion_pytorch_model.safetensors")
|
|
907
|
+
"model_hash": "142b114f67f5ab3a6d83fb5788f12ded",
|
|
908
|
+
"model_name": "stable_diffusion_xl_unet",
|
|
909
|
+
"model_class": "diffsynth.models.stable_diffusion_xl_unet.SDXLUNet2DConditionModel",
|
|
910
|
+
"extra_kwargs": {"attention_head_dim": [5, 10, 20], "transformer_layers_per_block": [1, 2, 10], "use_linear_projection": True, "addition_embed_type": "text_time", "addition_time_embed_dim": 256, "projection_class_embeddings_input_dim": 2816},
|
|
911
|
+
},
|
|
912
|
+
{
|
|
913
|
+
# Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder_2/model.safetensors")
|
|
914
|
+
"model_hash": "98cc34ccc5b54ae0e56bdea8688dcd5a",
|
|
915
|
+
"model_name": "stable_diffusion_xl_text_encoder",
|
|
916
|
+
"model_class": "diffsynth.models.stable_diffusion_xl_text_encoder.SDXLTextEncoder2",
|
|
917
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_xl_text_encoder.SDXLTextEncoder2StateDictConverter",
|
|
918
|
+
},
|
|
919
|
+
{
|
|
920
|
+
# Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder/model.safetensors")
|
|
921
|
+
"model_hash": "94eefa3dac9cec93cb1ebaf1747d7b78",
|
|
922
|
+
"model_name": "stable_diffusion_text_encoder",
|
|
923
|
+
"model_class": "diffsynth.models.stable_diffusion_text_encoder.SDTextEncoder",
|
|
924
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_text_encoder.SDTextEncoderStateDictConverter",
|
|
925
|
+
},
|
|
926
|
+
{
|
|
927
|
+
# Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
|
|
928
|
+
"model_hash": "13115dd45a6e1c39860f91ab073b8a78",
|
|
929
|
+
"model_name": "stable_diffusion_xl_vae",
|
|
930
|
+
"model_class": "diffsynth.models.stable_diffusion_vae.StableDiffusionVAE",
|
|
931
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_vae.SDVAEStateDictConverter",
|
|
932
|
+
"extra_kwargs": {"scaling_factor": 0.13025, "sample_size": 1024, "force_upcast": True},
|
|
933
|
+
},
|
|
934
|
+
]
|
|
935
|
+
|
|
936
|
+
stable_diffusion_series = [
|
|
937
|
+
{
|
|
938
|
+
# Example: ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="text_encoder/model.safetensors")
|
|
939
|
+
"model_hash": "ffd1737ae9df7fd43f5fbed653bdad67",
|
|
940
|
+
"model_name": "stable_diffusion_text_encoder",
|
|
941
|
+
"model_class": "diffsynth.models.stable_diffusion_text_encoder.SDTextEncoder",
|
|
942
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_text_encoder.SDTextEncoderStateDictConverter",
|
|
943
|
+
},
|
|
944
|
+
{
|
|
945
|
+
# Example: ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
|
|
946
|
+
"model_hash": "f86d5683ed32433be8ca69969c67ba69",
|
|
947
|
+
"model_name": "stable_diffusion_vae",
|
|
948
|
+
"model_class": "diffsynth.models.stable_diffusion_vae.StableDiffusionVAE",
|
|
949
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_vae.SDVAEStateDictConverter",
|
|
950
|
+
},
|
|
951
|
+
{
|
|
952
|
+
# Example: ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="unet/diffusion_pytorch_model.safetensors")
|
|
953
|
+
"model_hash": "025a4b86a84829399d89f613e580757b",
|
|
954
|
+
"model_name": "stable_diffusion_unet",
|
|
955
|
+
"model_class": "diffsynth.models.stable_diffusion_unet.UNet2DConditionModel",
|
|
956
|
+
},
|
|
957
|
+
]
|
|
958
|
+
|
|
959
|
+
joyai_image_series = [
|
|
960
|
+
{
|
|
961
|
+
# Example: ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="transformer/transformer.pth")
|
|
962
|
+
"model_hash": "56592ddfd7d0249d3aa527d24161a863",
|
|
963
|
+
"model_name": "joyai_image_dit",
|
|
964
|
+
"model_class": "diffsynth.models.joyai_image_dit.JoyAIImageDiT",
|
|
965
|
+
},
|
|
966
|
+
{
|
|
967
|
+
# Example: ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/model-*.safetensors")
|
|
968
|
+
"model_hash": "2d11bf14bba8b4e87477c8199a895403",
|
|
969
|
+
"model_name": "joyai_image_text_encoder",
|
|
970
|
+
"model_class": "diffsynth.models.joyai_image_text_encoder.JoyAIImageTextEncoder",
|
|
971
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.joyai_image_text_encoder.JoyAIImageTextEncoderStateDictConverter",
|
|
972
|
+
},
|
|
973
|
+
]
|
|
974
|
+
|
|
975
|
+
# ---------------------------------------------------------------------------
# ACE-Step 1.5 (music generation) model family.
#
# A single checkpoint file can hold several sub-modules, so the same
# "model_hash" appears under multiple model names:
#   * ba29d8bd... — turbo checkpoint, e.g.
#     ModelConfig(model_id="ACE-Step/Ace-Step1.5",
#                 origin_file_pattern="acestep-v15-turbo/model.safetensors")
#   * 3a28a410... — xl-base checkpoint, e.g.
#     ModelConfig(model_id="ACE-Step/acestep-v15-xl-base",
#                 origin_file_pattern="model-*.safetensors")
# ---------------------------------------------------------------------------

# Fully-qualified class / state-dict-converter paths shared by several
# entries below.
_ACE_STEP_DIT_CLS = "diffsynth.models.ace_step_dit.AceStepDiTModel"
_ACE_STEP_DIT_CVT = "diffsynth.utils.state_dict_converters.ace_step_dit.AceStepDiTModelStateDictConverter"
_ACE_STEP_COND_CLS = "diffsynth.models.ace_step_conditioner.AceStepConditionEncoder"
_ACE_STEP_COND_CVT = "diffsynth.utils.state_dict_converters.ace_step_conditioner.AceStepConditionEncoderStateDictConverter"
_ACE_STEP_TOK_CLS = "diffsynth.models.ace_step_tokenizer.AceStepTokenizer"
_ACE_STEP_TOK_CVT = "diffsynth.utils.state_dict_converters.ace_step_tokenizer.AceStepTokenizerStateDictConverter"

_ACE_STEP_TURBO_HASH = "ba29d8bddbb6ace65675f6a757a13c00"
_ACE_STEP_XL_BASE_HASH = "3a28a410c2246f125153ef792d8bc828"

ace_step_series = [
    {
        # DiT backbone, turbo checkpoint (default architecture hyper-parameters).
        "model_hash": _ACE_STEP_TURBO_HASH,
        "model_name": "ace_step_dit",
        "model_class": _ACE_STEP_DIT_CLS,
        "state_dict_converter": _ACE_STEP_DIT_CVT,
    },
    {
        # DiT backbone, xl-base checkpoint: a larger configuration, so the
        # architecture hyper-parameters are overridden via "extra_kwargs".
        "model_hash": _ACE_STEP_XL_BASE_HASH,
        "model_name": "ace_step_dit",
        "model_class": _ACE_STEP_DIT_CLS,
        "state_dict_converter": _ACE_STEP_DIT_CVT,
        "extra_kwargs": {
            "hidden_size": 2560,
            "intermediate_size": 9728,
            "num_hidden_layers": 32,
            "num_attention_heads": 32,
            "num_key_value_heads": 8,
            "head_dim": 128,
            "encoder_hidden_size": 2048,
            # Alternating sliding-window / full attention over the 32 layers.
            "layer_types": ["sliding_attention", "full_attention"] * 16,
        },
    },
    {
        # Condition encoder, turbo checkpoint.
        "model_hash": _ACE_STEP_TURBO_HASH,
        "model_name": "ace_step_conditioner",
        "model_class": _ACE_STEP_COND_CLS,
        "state_dict_converter": _ACE_STEP_COND_CVT,
    },
    {
        # Condition encoder, xl-base checkpoint.
        "model_hash": _ACE_STEP_XL_BASE_HASH,
        "model_name": "ace_step_conditioner",
        "model_class": _ACE_STEP_COND_CLS,
        "state_dict_converter": _ACE_STEP_COND_CVT,
    },
    {
        # Text encoder.
        # Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors")
        "model_hash": "3509bea17b0e8cffc3dd4a15cc7899d0",
        "model_name": "ace_step_text_encoder",
        "model_class": "diffsynth.models.ace_step_text_encoder.AceStepTextEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_text_encoder.AceStepTextEncoderStateDictConverter",
    },
    {
        # Audio VAE; unlike the other ACE-Step modules, no
        # state_dict_converter is registered for it.
        # Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
        "model_hash": "51420834e54474986a7f4be0e4d6f687",
        "model_name": "ace_step_vae",
        "model_class": "diffsynth.models.ace_step_vae.AceStepVAE",
    },
    {
        # Tokenizer, turbo checkpoint.
        "model_hash": _ACE_STEP_TURBO_HASH,
        "model_name": "ace_step_tokenizer",
        "model_class": _ACE_STEP_TOK_CLS,
        "state_dict_converter": _ACE_STEP_TOK_CVT,
    },
    {
        # Tokenizer, xl-base checkpoint.
        "model_hash": _ACE_STEP_XL_BASE_HASH,
        "model_name": "ace_step_tokenizer",
        "model_class": _ACE_STEP_TOK_CLS,
        "state_dict_converter": _ACE_STEP_TOK_CVT,
    },
]
|
|
1042
|
+
|
|
1043
|
+
# Flat registry of every known model configuration: the per-family series
# lists concatenated in a fixed order. Lookups by "model_hash" elsewhere
# search this combined list.
MODEL_CONFIGS = (
    stable_diffusion_xl_series
    + stable_diffusion_series
    + qwen_image_series
    + wan_series
    + flux_series
    + flux2_series
    + ernie_image_series
    + z_image_series
    + ltx2_series
    + anima_series
    + mova_series
    + joyai_image_series
    + ace_step_series
)
|