diffsynth 2.0.9__tar.gz → 2.0.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffsynth-2.0.9 → diffsynth-2.0.10}/PKG-INFO +1 -1
- {diffsynth-2.0.9 → diffsynth-2.0.10}/README.md +305 -20
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/configs/model_configs.py +144 -1
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/configs/vram_management_module_maps.py +93 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/data/operators.py +25 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/base_pipeline.py +1 -1
- diffsynth-2.0.10/diffsynth/diffusion/ddim_scheduler.py +107 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/flow_match.py +24 -1
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/training_module.py +15 -8
- diffsynth-2.0.10/diffsynth/models/ace_step_conditioner.py +695 -0
- diffsynth-2.0.10/diffsynth/models/ace_step_dit.py +901 -0
- diffsynth-2.0.10/diffsynth/models/ace_step_text_encoder.py +53 -0
- diffsynth-2.0.10/diffsynth/models/ace_step_tokenizer.py +722 -0
- diffsynth-2.0.10/diffsynth/models/ace_step_vae.py +281 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/dinov3_image_encoder.py +5 -5
- diffsynth-2.0.10/diffsynth/models/joyai_image_dit.py +636 -0
- diffsynth-2.0.10/diffsynth/models/joyai_image_text_encoder.py +82 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/siglip2_image_encoder.py +5 -5
- diffsynth-2.0.10/diffsynth/models/stable_diffusion_text_encoder.py +216 -0
- diffsynth-2.0.10/diffsynth/models/stable_diffusion_unet.py +912 -0
- diffsynth-2.0.10/diffsynth/models/stable_diffusion_vae.py +642 -0
- diffsynth-2.0.10/diffsynth/models/stable_diffusion_xl_text_encoder.py +69 -0
- diffsynth-2.0.10/diffsynth/models/stable_diffusion_xl_unet.py +922 -0
- diffsynth-2.0.10/diffsynth/pipelines/ace_step.py +582 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/anima_image.py +1 -1
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/flux2_image.py +2 -2
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/flux_image.py +1 -6
- diffsynth-2.0.10/diffsynth/pipelines/joyai_image.py +282 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/ltx2_audio_video.py +29 -29
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/mova_audio_video.py +18 -18
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/qwen_image.py +1 -1
- diffsynth-2.0.10/diffsynth/pipelines/stable_diffusion.py +230 -0
- diffsynth-2.0.10/diffsynth/pipelines/stable_diffusion_xl.py +331 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/wan_video.py +53 -53
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/z_image.py +2 -2
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/data/audio.py +1 -0
- diffsynth-2.0.10/diffsynth/utils/state_dict_converters/ace_step_conditioner.py +13 -0
- diffsynth-2.0.10/diffsynth/utils/state_dict_converters/ace_step_dit.py +10 -0
- diffsynth-2.0.10/diffsynth/utils/state_dict_converters/ace_step_text_encoder.py +15 -0
- diffsynth-2.0.10/diffsynth/utils/state_dict_converters/ace_step_tokenizer.py +8 -0
- diffsynth-2.0.10/diffsynth/utils/state_dict_converters/dino_v3.py +9 -0
- diffsynth-2.0.10/diffsynth/utils/state_dict_converters/joyai_image_text_encoder.py +20 -0
- diffsynth-2.0.10/diffsynth/utils/state_dict_converters/stable_diffusion_text_encoder.py +7 -0
- diffsynth-2.0.10/diffsynth/utils/state_dict_converters/stable_diffusion_vae.py +18 -0
- diffsynth-2.0.10/diffsynth/utils/state_dict_converters/stable_diffusion_xl_text_encoder.py +13 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth.egg-info/PKG-INFO +1 -1
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth.egg-info/SOURCES.txt +26 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/pyproject.toml +1 -1
- {diffsynth-2.0.9 → diffsynth-2.0.10}/LICENSE +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/configs/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/attention/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/attention/attention.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/data/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/data/unified_dataset.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/device/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/device/npu_compatible_device.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/gradient/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/gradient/gradient_checkpoint.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/loader/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/loader/config.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/loader/file.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/loader/model.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/vram/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/vram/disk_map.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/vram/initialization.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/core/vram/layers.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/logger.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/loss.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/parsers.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/diffusion/runner.py +2 -2
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/anima_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ernie_image_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ernie_image_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux2_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux2_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_controlnet.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_ipadapter.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_lora_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_lora_patcher.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/flux_value_control.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/general_modules.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/longcat_video_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ltx2_audio_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ltx2_common.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ltx2_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ltx2_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ltx2_upsampler.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/ltx2_video_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/model_loader.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/mova_audio_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/mova_audio_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/mova_dual_tower_bridge.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/nexus_gen.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/nexus_gen_ar_model.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/qwen_image_controlnet.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/qwen_image_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/qwen_image_image2lora.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/qwen_image_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/sd_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/step1x_connector.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/step1x_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_camera_controller.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_dit_s2v.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_mot.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_motion_controller.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_vace.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wan_video_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wantodance.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/wav2vec.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/z_image_controlnet.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/z_image_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/z_image_image2lora.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/models/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/pipelines/ernie_image.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/controlnet/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/controlnet/annotator.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/data/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/data/audio_video.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/data/media_io_ltx2.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/lora/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/lora/flux.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/lora/general.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/lora/merge.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/lora/reset_rank.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/ses/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/ses/ses.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/ernie_image_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/z_image_dit.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/xfuser/__init__.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/utils/xfuser/xdit_context_parallel.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth/version.py +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth.egg-info/dependency_links.txt +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth.egg-info/requires.txt +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/diffsynth.egg-info/top_level.txt +0 -0
- {diffsynth-2.0.9 → diffsynth-2.0.10}/setup.cfg +0 -0
|
@@ -34,6 +34,12 @@ We believe that a well-developed open-source code framework can lower the thresh
|
|
|
34
34
|
|
|
35
35
|
> Currently, the development personnel of this project are limited, with most of the work handled by [Artiprocher](https://github.com/Artiprocher) and [mi804](https://github.com/mi804). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand.
|
|
36
36
|
|
|
37
|
+
- **April 27, 2026**: We support ACE-Step-1.5! Support includes text-to-music generation, low VRAM inference, and LoRA training capabilities. For details, please refer to the [documentation](/docs/en/Model_Details/ACE-Step.md) and [example code](/examples/ace_step/).
|
|
38
|
+
|
|
39
|
+
- **April 27, 2026**: We have reinstated support for the Stable Diffusion v1.5 and SDXL models, providing academic research support exclusively for these two model types.
|
|
40
|
+
|
|
41
|
+
- **April 14, 2026**: JoyAI-Image open-sourced, welcome a new member to the image editing model family! Support includes instruction-guided image editing, low VRAM inference, and training capabilities. For details, please refer to the [documentation](/docs/en/Model_Details/JoyAI-Image.md) and [example code](/examples/joyai_image/).
|
|
42
|
+
|
|
37
43
|
- **March 19, 2026**: Added support for [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) and [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/Wan.md) and [example code](/examples/mova/) are now available.
|
|
38
44
|
|
|
39
45
|
- **March 12, 2026**: We have added support for the [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) audio-video generation model. The features include text-to-audio/video, image-to-audio/video, IC-LoRA control, audio-to-video, and audio-video inpainting. We have supported the complete inference and training functionalities. For details, please refer to the [documentation](/docs/en/Model_Details/LTX-2.md) and [code](/examples/ltx2/).
|
|
@@ -297,6 +303,129 @@ Example code for Z-Image is available at: [/examples/z_image/](/examples/z_image
|
|
|
297
303
|
|
|
298
304
|
</details>
|
|
299
305
|
|
|
306
|
+
#### Stable Diffusion: [/docs/en/Model_Details/Stable-Diffusion.md](/docs/en/Model_Details/Stable-Diffusion.md)
|
|
307
|
+
|
|
308
|
+
<details>
|
|
309
|
+
|
|
310
|
+
<summary>Quick Start</summary>
|
|
311
|
+
|
|
312
|
+
Running the following code will quickly load the [AI-ModelScope/stable-diffusion-v1-5](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5) model for inference. VRAM management is enabled, and the framework automatically controls parameter loading based on available VRAM, requiring a minimum of 2GB VRAM.
|
|
313
|
+
|
|
314
|
+
```python
|
|
315
|
+
import torch
|
|
316
|
+
from diffsynth.core import ModelConfig
|
|
317
|
+
from diffsynth.pipelines.stable_diffusion import StableDiffusionPipeline
|
|
318
|
+
|
|
319
|
+
vram_config = {
|
|
320
|
+
"offload_dtype": torch.float32,
|
|
321
|
+
"offload_device": "cpu",
|
|
322
|
+
"onload_dtype": torch.float32,
|
|
323
|
+
"onload_device": "cpu",
|
|
324
|
+
"preparing_dtype": torch.float32,
|
|
325
|
+
"preparing_device": "cuda",
|
|
326
|
+
"computation_dtype": torch.float32,
|
|
327
|
+
"computation_device": "cuda",
|
|
328
|
+
}
|
|
329
|
+
pipe = StableDiffusionPipeline.from_pretrained(
|
|
330
|
+
torch_dtype=torch.float32,
|
|
331
|
+
model_configs=[
|
|
332
|
+
ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
|
|
333
|
+
ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="unet/diffusion_pytorch_model.safetensors", **vram_config),
|
|
334
|
+
ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
|
|
335
|
+
],
|
|
336
|
+
tokenizer_config=ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="tokenizer/"),
|
|
337
|
+
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
|
|
338
|
+
)
|
|
339
|
+
|
|
340
|
+
image = pipe(
|
|
341
|
+
prompt="a photo of an astronaut riding a horse on mars, high quality, detailed",
|
|
342
|
+
negative_prompt="blurry, low quality, deformed",
|
|
343
|
+
cfg_scale=7.5,
|
|
344
|
+
height=512,
|
|
345
|
+
width=512,
|
|
346
|
+
seed=42,
|
|
347
|
+
rand_device="cuda",
|
|
348
|
+
num_inference_steps=50,
|
|
349
|
+
)
|
|
350
|
+
image.save("image.jpg")
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
</details>
|
|
354
|
+
|
|
355
|
+
<details>
|
|
356
|
+
|
|
357
|
+
<summary>Examples</summary>
|
|
358
|
+
|
|
359
|
+
Example code for Stable Diffusion is available at: [/examples/stable_diffusion/](/examples/stable_diffusion/)
|
|
360
|
+
|
|
361
|
+
|Model ID|Inference|Low VRAM Inference|Full Training|Full Training Validation|LoRA Training|LoRA Training Validation|
|
|
362
|
+
|-|-|-|-|-|-|-|
|
|
363
|
+
|[AI-ModelScope/stable-diffusion-v1-5](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5)|[code](/examples/stable_diffusion/model_inference/stable-diffusion-v1-5.py)|[code](/examples/stable_diffusion/model_inference_low_vram/stable-diffusion-v1-5.py)|[code](/examples/stable_diffusion/model_training/full/stable-diffusion-v1-5.sh)|[code](/examples/stable_diffusion/model_training/validate_full/stable-diffusion-v1-5.py)|[code](/examples/stable_diffusion/model_training/lora/stable-diffusion-v1-5.sh)|[code](/examples/stable_diffusion/model_training/validate_lora/stable-diffusion-v1-5.py)|
|
|
364
|
+
|
|
365
|
+
</details>
|
|
366
|
+
|
|
367
|
+
#### Stable Diffusion XL: [/docs/en/Model_Details/Stable-Diffusion-XL.md](/docs/en/Model_Details/Stable-Diffusion-XL.md)
|
|
368
|
+
|
|
369
|
+
<details>
|
|
370
|
+
|
|
371
|
+
<summary>Quick Start</summary>
|
|
372
|
+
|
|
373
|
+
Running the following code will quickly load the [stabilityai/stable-diffusion-xl-base-1.0](https://www.modelscope.cn/models/stabilityai/stable-diffusion-xl-base-1.0) model for inference. VRAM management is enabled, and the framework automatically controls parameter loading based on available VRAM, requiring a minimum of 6GB VRAM.
|
|
374
|
+
|
|
375
|
+
```python
|
|
376
|
+
import torch
|
|
377
|
+
from diffsynth.core import ModelConfig
|
|
378
|
+
from diffsynth.pipelines.stable_diffusion_xl import StableDiffusionXLPipeline
|
|
379
|
+
|
|
380
|
+
vram_config = {
|
|
381
|
+
"offload_dtype": torch.float32,
|
|
382
|
+
"offload_device": "cpu",
|
|
383
|
+
"onload_dtype": torch.float32,
|
|
384
|
+
"onload_device": "cpu",
|
|
385
|
+
"preparing_dtype": torch.float32,
|
|
386
|
+
"preparing_device": "cuda",
|
|
387
|
+
"computation_dtype": torch.float32,
|
|
388
|
+
"computation_device": "cuda",
|
|
389
|
+
}
|
|
390
|
+
pipe = StableDiffusionXLPipeline.from_pretrained(
|
|
391
|
+
torch_dtype=torch.float32,
|
|
392
|
+
model_configs=[
|
|
393
|
+
ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
|
|
394
|
+
ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder_2/model.safetensors", **vram_config),
|
|
395
|
+
ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="unet/diffusion_pytorch_model.safetensors", **vram_config),
|
|
396
|
+
ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
|
|
397
|
+
],
|
|
398
|
+
tokenizer_config=ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="tokenizer/"),
|
|
399
|
+
tokenizer_2_config=ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="tokenizer_2/"),
|
|
400
|
+
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
|
|
401
|
+
)
|
|
402
|
+
|
|
403
|
+
image = pipe(
|
|
404
|
+
prompt="a photo of an astronaut riding a horse on mars",
|
|
405
|
+
negative_prompt="",
|
|
406
|
+
cfg_scale=5.0,
|
|
407
|
+
height=1024,
|
|
408
|
+
width=1024,
|
|
409
|
+
seed=42,
|
|
410
|
+
num_inference_steps=50,
|
|
411
|
+
)
|
|
412
|
+
image.save("image.jpg")
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
</details>
|
|
416
|
+
|
|
417
|
+
<details>
|
|
418
|
+
|
|
419
|
+
<summary>Examples</summary>
|
|
420
|
+
|
|
421
|
+
Example code for Stable Diffusion XL is available at: [/examples/stable_diffusion_xl/](/examples/stable_diffusion_xl/)
|
|
422
|
+
|
|
423
|
+
|Model ID|Inference|Low VRAM Inference|Full Training|Full Training Validation|LoRA Training|LoRA Training Validation|
|
|
424
|
+
|-|-|-|-|-|-|-|
|
|
425
|
+
|[stabilityai/stable-diffusion-xl-base-1.0](https://www.modelscope.cn/models/stabilityai/stable-diffusion-xl-base-1.0)|[code](/examples/stable_diffusion_xl/model_inference/stable-diffusion-xl-base-1.0.py)|[code](/examples/stable_diffusion_xl/model_inference_low_vram/stable-diffusion-xl-base-1.0.py)|[code](/examples/stable_diffusion_xl/model_training/full/stable-diffusion-xl-base-1.0.sh)|[code](/examples/stable_diffusion_xl/model_training/validate_full/stable-diffusion-xl-base-1.0.py)|[code](/examples/stable_diffusion_xl/model_training/lora/stable-diffusion-xl-base-1.0.sh)|[code](/examples/stable_diffusion_xl/model_training/validate_lora/stable-diffusion-xl-base-1.0.py)|
|
|
426
|
+
|
|
427
|
+
</details>
|
|
428
|
+
|
|
300
429
|
#### FLUX.2: [/docs/en/Model_Details/FLUX2.md](/docs/en/Model_Details/FLUX2.md)
|
|
301
430
|
|
|
302
431
|
<details>
|
|
@@ -598,6 +727,143 @@ Example code for FLUX.1 is available at: [/examples/flux/](/examples/flux/)
|
|
|
598
727
|
|
|
599
728
|
</details>
|
|
600
729
|
|
|
730
|
+
#### ERNIE-Image: [/docs/en/Model_Details/ERNIE-Image.md](/docs/en/Model_Details/ERNIE-Image.md)
|
|
731
|
+
|
|
732
|
+
<details>
|
|
733
|
+
|
|
734
|
+
<summary>Quick Start</summary>
|
|
735
|
+
|
|
736
|
+
Running the following code will quickly load the [PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.
|
|
737
|
+
|
|
738
|
+
```python
|
|
739
|
+
from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig
|
|
740
|
+
import torch
|
|
741
|
+
|
|
742
|
+
vram_config = {
|
|
743
|
+
"offload_dtype": torch.bfloat16,
|
|
744
|
+
"offload_device": "cpu",
|
|
745
|
+
"onload_dtype": torch.bfloat16,
|
|
746
|
+
"onload_device": "cpu",
|
|
747
|
+
"preparing_dtype": torch.bfloat16,
|
|
748
|
+
"preparing_device": "cuda",
|
|
749
|
+
"computation_dtype": torch.bfloat16,
|
|
750
|
+
"computation_device": "cuda",
|
|
751
|
+
}
|
|
752
|
+
pipe = ErnieImagePipeline.from_pretrained(
|
|
753
|
+
torch_dtype=torch.bfloat16,
|
|
754
|
+
device='cuda',
|
|
755
|
+
model_configs=[
|
|
756
|
+
ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config),
|
|
757
|
+
ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
|
|
758
|
+
ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
|
|
759
|
+
],
|
|
760
|
+
tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"),
|
|
761
|
+
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
|
|
762
|
+
)
|
|
763
|
+
|
|
764
|
+
image = pipe(
|
|
765
|
+
prompt="一只黑白相间的中华田园犬",
|
|
766
|
+
negative_prompt="",
|
|
767
|
+
height=1024,
|
|
768
|
+
width=1024,
|
|
769
|
+
seed=42,
|
|
770
|
+
num_inference_steps=50,
|
|
771
|
+
cfg_scale=4.0,
|
|
772
|
+
)
|
|
773
|
+
image.save("output.jpg")
|
|
774
|
+
```
|
|
775
|
+
|
|
776
|
+
</details>
|
|
777
|
+
|
|
778
|
+
<details>
|
|
779
|
+
|
|
780
|
+
<summary>Examples</summary>
|
|
781
|
+
|
|
782
|
+
Example code for ERNIE-Image is available at: [/examples/ernie_image/](/examples/ernie_image/)
|
|
783
|
+
|
|
784
|
+
| Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
785
|
+
|-|-|-|-|-|-|-|
|
|
786
|
+
|[PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image)|[code](/examples/ernie_image/model_inference/ERNIE-Image.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/full/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_full/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/lora/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py)|
|
|
787
|
+
|[PaddlePaddle/ERNIE-Image-Turbo](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image-Turbo)|[code](/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py)|—|—|—|—|
|
|
788
|
+
|
|
789
|
+
</details>
|
|
790
|
+
|
|
791
|
+
#### JoyAI-Image: [/docs/en/Model_Details/JoyAI-Image.md](/docs/en/Model_Details/JoyAI-Image.md)
|
|
792
|
+
|
|
793
|
+
<details>
|
|
794
|
+
|
|
795
|
+
<summary>Quick Start</summary>
|
|
796
|
+
|
|
797
|
+
Running the following code will quickly load the [jd-opensource/JoyAI-Image-Edit](https://modelscope.cn/models/jd-opensource/JoyAI-Image-Edit) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 4GB VRAM.
|
|
798
|
+
|
|
799
|
+
```python
|
|
800
|
+
from diffsynth.pipelines.joyai_image import JoyAIImagePipeline, ModelConfig
|
|
801
|
+
import torch
|
|
802
|
+
from PIL import Image
|
|
803
|
+
from modelscope import dataset_snapshot_download
|
|
804
|
+
|
|
805
|
+
# Download dataset
|
|
806
|
+
dataset_snapshot_download(
|
|
807
|
+
dataset_id="DiffSynth-Studio/diffsynth_example_dataset",
|
|
808
|
+
local_dir="data/diffsynth_example_dataset",
|
|
809
|
+
allow_file_pattern="joyai_image/JoyAI-Image-Edit/*"
|
|
810
|
+
)
|
|
811
|
+
|
|
812
|
+
vram_config = {
|
|
813
|
+
"offload_dtype": torch.bfloat16,
|
|
814
|
+
"offload_device": "cpu",
|
|
815
|
+
"onload_dtype": torch.bfloat16,
|
|
816
|
+
"onload_device": "cpu",
|
|
817
|
+
"preparing_dtype": torch.bfloat16,
|
|
818
|
+
"preparing_device": "cuda",
|
|
819
|
+
"computation_dtype": torch.bfloat16,
|
|
820
|
+
"computation_device": "cuda",
|
|
821
|
+
}
|
|
822
|
+
|
|
823
|
+
pipe = JoyAIImagePipeline.from_pretrained(
|
|
824
|
+
torch_dtype=torch.bfloat16,
|
|
825
|
+
device="cuda",
|
|
826
|
+
model_configs=[
|
|
827
|
+
ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="transformer/transformer.pth", **vram_config),
|
|
828
|
+
ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/model*.safetensors", **vram_config),
|
|
829
|
+
ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="vae/Wan2.1_VAE.pth", **vram_config),
|
|
830
|
+
],
|
|
831
|
+
processor_config=ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/"),
|
|
832
|
+
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
|
|
833
|
+
)
|
|
834
|
+
|
|
835
|
+
# Use first sample from dataset
|
|
836
|
+
dataset_base_path = "data/diffsynth_example_dataset/joyai_image/JoyAI-Image-Edit"
|
|
837
|
+
prompt = "将裙子改为粉色"
|
|
838
|
+
edit_image = Image.open(f"{dataset_base_path}/edit/image1.jpg").convert("RGB")
|
|
839
|
+
|
|
840
|
+
output = pipe(
|
|
841
|
+
prompt=prompt,
|
|
842
|
+
edit_image=edit_image,
|
|
843
|
+
height=1024,
|
|
844
|
+
width=1024,
|
|
845
|
+
seed=0,
|
|
846
|
+
num_inference_steps=30,
|
|
847
|
+
cfg_scale=5.0,
|
|
848
|
+
)
|
|
849
|
+
|
|
850
|
+
output.save("output_joyai_edit_low_vram.png")
|
|
851
|
+
```
|
|
852
|
+
|
|
853
|
+
</details>
|
|
854
|
+
|
|
855
|
+
<details>
|
|
856
|
+
|
|
857
|
+
<summary>Examples</summary>
|
|
858
|
+
|
|
859
|
+
Example code for JoyAI-Image is available at: [/examples/joyai_image/](/examples/joyai_image/)
|
|
860
|
+
|
|
861
|
+
| Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
862
|
+
|-|-|-|-|-|-|-|
|
|
863
|
+
|[jd-opensource/JoyAI-Image-Edit](https://modelscope.cn/models/jd-opensource/JoyAI-Image-Edit)|[code](/examples/joyai_image/model_inference/JoyAI-Image-Edit.py)|[code](/examples/joyai_image/model_inference_low_vram/JoyAI-Image-Edit.py)|[code](/examples/joyai_image/model_training/full/JoyAI-Image-Edit.sh)|[code](/examples/joyai_image/model_training/validate_full/JoyAI-Image-Edit.py)|[code](/examples/joyai_image/model_training/lora/JoyAI-Image-Edit.sh)|[code](/examples/joyai_image/model_training/validate_lora/JoyAI-Image-Edit.py)|
|
|
864
|
+
|
|
865
|
+
</details>
|
|
866
|
+
|
|
601
867
|
### Video Synthesis
|
|
602
868
|
|
|
603
869
|
https://github.com/user-attachments/assets/1d66ae74-3b02-40a9-acc3-ea95fc039314
|
|
@@ -877,18 +1143,22 @@ Example code for Wan is available at: [/examples/wanvideo/](/examples/wanvideo/)
|
|
|
877
1143
|
|
|
878
1144
|
</details>
|
|
879
1145
|
|
|
880
|
-
|
|
1146
|
+
### Audio Synthesis
|
|
1147
|
+
|
|
1148
|
+
#### ACE-Step: [/docs/en/Model_Details/ACE-Step.md](/docs/en/Model_Details/ACE-Step.md)
|
|
881
1149
|
|
|
882
1150
|
<details>
|
|
883
1151
|
|
|
884
1152
|
<summary>Quick Start</summary>
|
|
885
1153
|
|
|
886
|
-
Running the following code will quickly load the [
|
|
1154
|
+
Running the following code will quickly load the [ACE-Step/Ace-Step1.5](https://www.modelscope.cn/models/ACE-Step/Ace-Step1.5) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.
|
|
887
1155
|
|
|
888
1156
|
```python
|
|
889
|
-
from diffsynth.pipelines.
|
|
1157
|
+
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
|
|
1158
|
+
from diffsynth.utils.data.audio import save_audio
|
|
890
1159
|
import torch
|
|
891
1160
|
|
|
1161
|
+
|
|
892
1162
|
vram_config = {
|
|
893
1163
|
"offload_dtype": torch.bfloat16,
|
|
894
1164
|
"offload_device": "cpu",
|
|
@@ -899,28 +1169,34 @@ vram_config = {
|
|
|
899
1169
|
"computation_dtype": torch.bfloat16,
|
|
900
1170
|
"computation_device": "cuda",
|
|
901
1171
|
}
|
|
902
|
-
|
|
1172
|
+
|
|
1173
|
+
|
|
1174
|
+
pipe = AceStepPipeline.from_pretrained(
|
|
903
1175
|
torch_dtype=torch.bfloat16,
|
|
904
|
-
device=
|
|
1176
|
+
device="cuda",
|
|
905
1177
|
model_configs=[
|
|
906
|
-
ModelConfig(model_id="
|
|
907
|
-
ModelConfig(model_id="
|
|
908
|
-
ModelConfig(model_id="
|
|
1178
|
+
ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors", **vram_config),
|
|
1179
|
+
ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config),
|
|
1180
|
+
ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
|
|
909
1181
|
],
|
|
910
|
-
|
|
1182
|
+
text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
|
|
911
1183
|
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
|
|
912
1184
|
)
|
|
913
1185
|
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
1186
|
+
prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
|
|
1187
|
+
lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
|
|
1188
|
+
audio = pipe(
|
|
1189
|
+
prompt=prompt,
|
|
1190
|
+
lyrics=lyrics,
|
|
1191
|
+
duration=160,
|
|
1192
|
+
bpm=100,
|
|
1193
|
+
keyscale="B minor",
|
|
1194
|
+
timesignature="4",
|
|
1195
|
+
vocal_language="zh",
|
|
919
1196
|
seed=42,
|
|
920
|
-
num_inference_steps=50,
|
|
921
|
-
cfg_scale=4.0,
|
|
922
1197
|
)
|
|
923
|
-
|
|
1198
|
+
|
|
1199
|
+
save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo.wav")
|
|
924
1200
|
```
|
|
925
1201
|
|
|
926
1202
|
</details>
|
|
@@ -929,12 +1205,21 @@ image.save("output.jpg")
|
|
|
929
1205
|
|
|
930
1206
|
<summary>Examples</summary>
|
|
931
1207
|
|
|
932
|
-
Example code for
|
|
1208
|
+
Example code for ACE-Step is available at: [/examples/ace_step/](/examples/ace_step/)
|
|
933
1209
|
|
|
934
1210
|
| Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
935
1211
|
|-|-|-|-|-|-|-|
|
|
936
|
-
|[
|
|
937
|
-
|[
|
|
1212
|
+
|[ACE-Step/Ace-Step1.5](https://www.modelscope.cn/models/ACE-Step/Ace-Step1.5)|[code](/examples/ace_step/model_inference/Ace-Step1.5.py)|[code](/examples/ace_step/model_inference_low_vram/Ace-Step1.5.py)|[code](/examples/ace_step/model_training/full/Ace-Step1.5.sh)|[code](/examples/ace_step/model_training/validate_full/Ace-Step1.5.py)|[code](/examples/ace_step/model_training/lora/Ace-Step1.5.sh)|[code](/examples/ace_step/model_training/validate_lora/Ace-Step1.5.py)|
|
|
1213
|
+
|[ACE-Step/acestep-v15-turbo-shift1](https://www.modelscope.cn/models/ACE-Step/acestep-v15-turbo-shift1)|[code](/examples/ace_step/model_inference/acestep-v15-turbo-shift1.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift1.py)|[code](/examples/ace_step/model_training/full/acestep-v15-turbo-shift1.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-turbo-shift1.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-turbo-shift1.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-shift1.py)|
|
|
1214
|
+
|[ACE-Step/acestep-v15-turbo-shift3](https://www.modelscope.cn/models/ACE-Step/acestep-v15-turbo-shift3)|[code](/examples/ace_step/model_inference/acestep-v15-turbo-shift3.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift3.py)|[code](/examples/ace_step/model_training/full/acestep-v15-turbo-shift3.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-turbo-shift3.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-turbo-shift3.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-shift3.py)|
|
|
1215
|
+
|[ACE-Step/acestep-v15-turbo-continuous](https://www.modelscope.cn/models/ACE-Step/acestep-v15-turbo-continuous)|[code](/examples/ace_step/model_inference/acestep-v15-turbo-continuous.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-continuous.py)|[code](/examples/ace_step/model_training/full/acestep-v15-turbo-continuous.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-turbo-continuous.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-turbo-continuous.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-continuous.py)|
|
|
1216
|
+
|[ACE-Step/acestep-v15-base](https://www.modelscope.cn/models/ACE-Step/acestep-v15-base)|[code](/examples/ace_step/model_inference/acestep-v15-base.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-base.py)|[code](/examples/ace_step/model_training/full/acestep-v15-base.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-base.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-base.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-base.py)|
|
|
1217
|
+
|[ACE-Step/acestep-v15-base: CoverTask](https://www.modelscope.cn/models/ACE-Step/acestep-v15-base)|[code](/examples/ace_step/model_inference/acestep-v15-base-CoverTask.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-base-CoverTask.py)|—|—|—|—|
|
|
1218
|
+
|[ACE-Step/acestep-v15-base: RepaintTask](https://www.modelscope.cn/models/ACE-Step/acestep-v15-base)|[code](/examples/ace_step/model_inference/acestep-v15-base-RepaintTask.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-base-RepaintTask.py)|—|—|—|—|
|
|
1219
|
+
|[ACE-Step/acestep-v15-sft](https://www.modelscope.cn/models/ACE-Step/acestep-v15-sft)|[code](/examples/ace_step/model_inference/acestep-v15-sft.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-sft.py)|[code](/examples/ace_step/model_training/full/acestep-v15-sft.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-sft.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-sft.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-sft.py)|
|
|
1220
|
+
|[ACE-Step/acestep-v15-xl-base](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-base)|[code](/examples/ace_step/model_inference/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-base.py)|
|
|
1221
|
+
|[ACE-Step/acestep-v15-xl-sft](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-sft)|[code](/examples/ace_step/model_inference/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-sft.py)|
|
|
1222
|
+
|[ACE-Step/acestep-v15-xl-turbo](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-turbo)|[code](/examples/ace_step/model_inference/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-turbo.py)|
|
|
938
1223
|
|
|
939
1224
|
</details>
|
|
940
1225
|
|
|
@@ -42,6 +42,7 @@ qwen_image_series = [
|
|
|
42
42
|
"model_hash": "5722b5c873720009de96422993b15682",
|
|
43
43
|
"model_name": "dinov3_image_encoder",
|
|
44
44
|
"model_class": "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder",
|
|
45
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.dino_v3.DINOv3StateDictConverter",
|
|
45
46
|
},
|
|
46
47
|
{
|
|
47
48
|
# Example:
|
|
@@ -900,4 +901,146 @@ mova_series = [
|
|
|
900
901
|
"model_class": "diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge",
|
|
901
902
|
},
|
|
902
903
|
]
|
|
903
|
-
|
|
904
|
+
stable_diffusion_xl_series = [
|
|
905
|
+
{
|
|
906
|
+
# Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="unet/diffusion_pytorch_model.safetensors")
|
|
907
|
+
"model_hash": "142b114f67f5ab3a6d83fb5788f12ded",
|
|
908
|
+
"model_name": "stable_diffusion_xl_unet",
|
|
909
|
+
"model_class": "diffsynth.models.stable_diffusion_xl_unet.SDXLUNet2DConditionModel",
|
|
910
|
+
"extra_kwargs": {"attention_head_dim": [5, 10, 20], "transformer_layers_per_block": [1, 2, 10], "use_linear_projection": True, "addition_embed_type": "text_time", "addition_time_embed_dim": 256, "projection_class_embeddings_input_dim": 2816},
|
|
911
|
+
},
|
|
912
|
+
{
|
|
913
|
+
# Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder_2/model.safetensors")
|
|
914
|
+
"model_hash": "98cc34ccc5b54ae0e56bdea8688dcd5a",
|
|
915
|
+
"model_name": "stable_diffusion_xl_text_encoder",
|
|
916
|
+
"model_class": "diffsynth.models.stable_diffusion_xl_text_encoder.SDXLTextEncoder2",
|
|
917
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_xl_text_encoder.SDXLTextEncoder2StateDictConverter",
|
|
918
|
+
},
|
|
919
|
+
{
|
|
920
|
+
# Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="text_encoder/model.safetensors")
|
|
921
|
+
"model_hash": "94eefa3dac9cec93cb1ebaf1747d7b78",
|
|
922
|
+
"model_name": "stable_diffusion_text_encoder",
|
|
923
|
+
"model_class": "diffsynth.models.stable_diffusion_text_encoder.SDTextEncoder",
|
|
924
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_text_encoder.SDTextEncoderStateDictConverter",
|
|
925
|
+
},
|
|
926
|
+
{
|
|
927
|
+
# Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
|
|
928
|
+
"model_hash": "13115dd45a6e1c39860f91ab073b8a78",
|
|
929
|
+
"model_name": "stable_diffusion_xl_vae",
|
|
930
|
+
"model_class": "diffsynth.models.stable_diffusion_vae.StableDiffusionVAE",
|
|
931
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_vae.SDVAEStateDictConverter",
|
|
932
|
+
"extra_kwargs": {"scaling_factor": 0.13025, "sample_size": 1024, "force_upcast": True},
|
|
933
|
+
},
|
|
934
|
+
]
|
|
935
|
+
|
|
936
|
+
stable_diffusion_series = [
|
|
937
|
+
{
|
|
938
|
+
# Example: ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="text_encoder/model.safetensors")
|
|
939
|
+
"model_hash": "ffd1737ae9df7fd43f5fbed653bdad67",
|
|
940
|
+
"model_name": "stable_diffusion_text_encoder",
|
|
941
|
+
"model_class": "diffsynth.models.stable_diffusion_text_encoder.SDTextEncoder",
|
|
942
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_text_encoder.SDTextEncoderStateDictConverter",
|
|
943
|
+
},
|
|
944
|
+
{
|
|
945
|
+
# Example: ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
|
|
946
|
+
"model_hash": "f86d5683ed32433be8ca69969c67ba69",
|
|
947
|
+
"model_name": "stable_diffusion_vae",
|
|
948
|
+
"model_class": "diffsynth.models.stable_diffusion_vae.StableDiffusionVAE",
|
|
949
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_vae.SDVAEStateDictConverter",
|
|
950
|
+
},
|
|
951
|
+
{
|
|
952
|
+
# Example: ModelConfig(model_id="AI-ModelScope/stable-diffusion-v1-5", origin_file_pattern="unet/diffusion_pytorch_model.safetensors")
|
|
953
|
+
"model_hash": "025a4b86a84829399d89f613e580757b",
|
|
954
|
+
"model_name": "stable_diffusion_unet",
|
|
955
|
+
"model_class": "diffsynth.models.stable_diffusion_unet.UNet2DConditionModel",
|
|
956
|
+
},
|
|
957
|
+
]
|
|
958
|
+
|
|
959
|
+
joyai_image_series = [
|
|
960
|
+
{
|
|
961
|
+
# Example: ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="transformer/transformer.pth")
|
|
962
|
+
"model_hash": "56592ddfd7d0249d3aa527d24161a863",
|
|
963
|
+
"model_name": "joyai_image_dit",
|
|
964
|
+
"model_class": "diffsynth.models.joyai_image_dit.JoyAIImageDiT",
|
|
965
|
+
},
|
|
966
|
+
{
|
|
967
|
+
# Example: ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/model-*.safetensors")
|
|
968
|
+
"model_hash": "2d11bf14bba8b4e87477c8199a895403",
|
|
969
|
+
"model_name": "joyai_image_text_encoder",
|
|
970
|
+
"model_class": "diffsynth.models.joyai_image_text_encoder.JoyAIImageTextEncoder",
|
|
971
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.joyai_image_text_encoder.JoyAIImageTextEncoderStateDictConverter",
|
|
972
|
+
},
|
|
973
|
+
]
|
|
974
|
+
|
|
975
|
+
ace_step_series = [
|
|
976
|
+
{
|
|
977
|
+
# Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors")
|
|
978
|
+
"model_hash": "ba29d8bddbb6ace65675f6a757a13c00",
|
|
979
|
+
"model_name": "ace_step_dit",
|
|
980
|
+
"model_class": "diffsynth.models.ace_step_dit.AceStepDiTModel",
|
|
981
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_dit.AceStepDiTModelStateDictConverter",
|
|
982
|
+
},
|
|
983
|
+
{
|
|
984
|
+
# Example: ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors")
|
|
985
|
+
"model_hash": "3a28a410c2246f125153ef792d8bc828",
|
|
986
|
+
"model_name": "ace_step_dit",
|
|
987
|
+
"model_class": "diffsynth.models.ace_step_dit.AceStepDiTModel",
|
|
988
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_dit.AceStepDiTModelStateDictConverter",
|
|
989
|
+
"extra_kwargs": {
|
|
990
|
+
"hidden_size": 2560,
|
|
991
|
+
"intermediate_size": 9728,
|
|
992
|
+
"num_hidden_layers": 32,
|
|
993
|
+
"num_attention_heads": 32,
|
|
994
|
+
"num_key_value_heads": 8,
|
|
995
|
+
"head_dim": 128,
|
|
996
|
+
"encoder_hidden_size": 2048,
|
|
997
|
+
"layer_types": ["sliding_attention", "full_attention"] * 16,
|
|
998
|
+
},
|
|
999
|
+
},
|
|
1000
|
+
{
|
|
1001
|
+
# Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors")
|
|
1002
|
+
"model_hash": "ba29d8bddbb6ace65675f6a757a13c00",
|
|
1003
|
+
"model_name": "ace_step_conditioner",
|
|
1004
|
+
"model_class": "diffsynth.models.ace_step_conditioner.AceStepConditionEncoder",
|
|
1005
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_conditioner.AceStepConditionEncoderStateDictConverter",
|
|
1006
|
+
},
|
|
1007
|
+
{
|
|
1008
|
+
# Example: ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors")
|
|
1009
|
+
"model_hash": "3a28a410c2246f125153ef792d8bc828",
|
|
1010
|
+
"model_name": "ace_step_conditioner",
|
|
1011
|
+
"model_class": "diffsynth.models.ace_step_conditioner.AceStepConditionEncoder",
|
|
1012
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_conditioner.AceStepConditionEncoderStateDictConverter",
|
|
1013
|
+
},
|
|
1014
|
+
{
|
|
1015
|
+
# Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors")
|
|
1016
|
+
"model_hash": "3509bea17b0e8cffc3dd4a15cc7899d0",
|
|
1017
|
+
"model_name": "ace_step_text_encoder",
|
|
1018
|
+
"model_class": "diffsynth.models.ace_step_text_encoder.AceStepTextEncoder",
|
|
1019
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_text_encoder.AceStepTextEncoderStateDictConverter",
|
|
1020
|
+
},
|
|
1021
|
+
{
|
|
1022
|
+
# Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
|
|
1023
|
+
"model_hash": "51420834e54474986a7f4be0e4d6f687",
|
|
1024
|
+
"model_name": "ace_step_vae",
|
|
1025
|
+
"model_class": "diffsynth.models.ace_step_vae.AceStepVAE",
|
|
1026
|
+
},
|
|
1027
|
+
{
|
|
1028
|
+
# Example: ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors")
|
|
1029
|
+
"model_hash": "ba29d8bddbb6ace65675f6a757a13c00",
|
|
1030
|
+
"model_name": "ace_step_tokenizer",
|
|
1031
|
+
"model_class": "diffsynth.models.ace_step_tokenizer.AceStepTokenizer",
|
|
1032
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_tokenizer.AceStepTokenizerStateDictConverter",
|
|
1033
|
+
},
|
|
1034
|
+
{
|
|
1035
|
+
# Example: ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors")
|
|
1036
|
+
"model_hash": "3a28a410c2246f125153ef792d8bc828",
|
|
1037
|
+
"model_name": "ace_step_tokenizer",
|
|
1038
|
+
"model_class": "diffsynth.models.ace_step_tokenizer.AceStepTokenizer",
|
|
1039
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_tokenizer.AceStepTokenizerStateDictConverter",
|
|
1040
|
+
},
|
|
1041
|
+
]
|
|
1042
|
+
|
|
1043
|
+
MODEL_CONFIGS = (
|
|
1044
|
+
stable_diffusion_xl_series + stable_diffusion_series + qwen_image_series + wan_series + flux_series + flux2_series + ernie_image_series
|
|
1045
|
+
+ z_image_series + ltx2_series + anima_series + mova_series + joyai_image_series + ace_step_series
|
|
1046
|
+
)
|