diffsynth 2.0.13__tar.gz → 2.0.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffsynth-2.0.13 → diffsynth-2.0.15}/PKG-INFO +1 -1
- {diffsynth-2.0.13 → diffsynth-2.0.15}/README.md +4 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/configs/model_configs.py +17 -12
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/configs/vram_management_module_maps.py +13 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/attention/attention.py +9 -1
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/data/operators.py +4 -1
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/diffusion/ddim_scheduler.py +6 -4
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/diffusion/flow_match.py +3 -3
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ideogram4_dit.py +17 -4
- diffsynth-2.0.15/diffsynth/models/ideogram4_vae.py +74 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/ideogram4.py +18 -17
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/stable_diffusion_xl.py +2 -0
- diffsynth-2.0.15/diffsynth/utils/demucs/__init__.py +21 -0
- diffsynth-2.0.15/diffsynth/utils/dequantizer/__init__.py +15 -0
- diffsynth-2.0.15/diffsynth/utils/lora/sdxl.py +780 -0
- diffsynth-2.0.15/diffsynth/utils/state_dict_converters/sdxl.py +1690 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth.egg-info/PKG-INFO +1 -1
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth.egg-info/SOURCES.txt +4 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/pyproject.toml +1 -1
- diffsynth-2.0.13/diffsynth/models/ideogram4_vae.py +0 -517
- {diffsynth-2.0.13 → diffsynth-2.0.15}/LICENSE +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/configs/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/attention/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/data/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/data/unified_dataset.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/device/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/device/npu_compatible_device.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/gradient/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/gradient/gradient_checkpoint.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/loader/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/loader/config.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/loader/file.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/loader/model.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/offload_training/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/offload_training/manager.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/offload_training/memory_buffer.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/offload_training/offloader.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/vram/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/vram/disk_map.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/vram/initialization.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/core/vram/layers.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/diffusion/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/diffusion/base_pipeline.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/diffusion/logger.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/diffusion/loss.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/diffusion/parsers.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/diffusion/runner.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/diffusion/template.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/diffusion/training_module.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/aesthetic.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/base.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/bioclip.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/clip.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/fid.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/hpsv2.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/hpsv3.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/image_reward.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/lpips.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/pickscore.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/qwen_image_bench.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/unified_reward_2.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/metrics/unified_reward_edit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ace_step_conditioner.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ace_step_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ace_step_residual_fsq.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ace_step_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ace_step_tokenizer.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ace_step_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/aesthetic.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/anima_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/bioclip.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/clip.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/demucs.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/dinov3_image_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ernie_image_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ernie_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/fid.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux2_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux2_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux_controlnet.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux_ipadapter.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux_lora_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux_lora_patcher.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/flux_value_control.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/general_modules.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/hidream_common.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/hidream_o1_image_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/hpsv2.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/hpsv3.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ideogram4_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/image_reward.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/joyai_image_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/joyai_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/longcat_video_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/lpips.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ltx2_audio_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ltx2_common.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ltx2_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ltx2_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ltx2_upsampler.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/ltx2_video_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/model_loader.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/mova_audio_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/mova_audio_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/mova_dual_tower_bridge.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/nexus_gen.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/nexus_gen_ar_model.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/pickscore.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/qwen_image_bench.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/qwen_image_controlnet.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/qwen_image_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/qwen_image_image2lora.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/qwen_image_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/sd_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/siglip2_image_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/stable_diffusion_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/stable_diffusion_unet.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/stable_diffusion_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/stable_diffusion_xl_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/stable_diffusion_xl_unet.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/step1x_connector.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/step1x_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/unified_reward_2.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/unified_reward_edit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/wan_video_camera_controller.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/wan_video_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/wan_video_dit_s2v.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/wan_video_mot.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/wan_video_motion_controller.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/wan_video_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/wan_video_vace.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/wan_video_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/wantodance.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/wav2vec.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/z_image_controlnet.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/z_image_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/z_image_image2lora.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/models/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/ace_step.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/anima_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/ernie_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/flux2_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/flux_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/hidream_o1_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/joyai_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/ltx2_audio_video.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/mova_audio_video.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/qwen_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/stable_diffusion.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/wan_video.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/pipelines/z_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/controlnet/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/controlnet/annotator.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/data/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/data/audio.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/data/audio_video.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/data/media_io_ltx2.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/lora/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/lora/flux.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/lora/general.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/lora/merge.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/lora/reset_rank.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/ses/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/ses/ses.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/ace_step_conditioner.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/ace_step_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/ace_step_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/ace_step_tokenizer.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/dino_v3.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/ernie_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/image_metrics.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/joyai_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/stable_diffusion_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/stable_diffusion_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/stable_diffusion_xl_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/z_image_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/xfuser/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/utils/xfuser/xdit_context_parallel.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth/version.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth.egg-info/dependency_links.txt +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth.egg-info/requires.txt +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/diffsynth.egg-info/top_level.txt +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.15}/setup.cfg +0 -0
|
@@ -34,6 +34,8 @@ We believe that a well-developed open-source code framework can lower the thresh
|
|
|
34
34
|
|
|
35
35
|
> Currently, this project has a small development team, with most of the work handled by [Artiprocher](https://github.com/Artiprocher) and [mi804](https://github.com/mi804). As a result, new features will be developed relatively slowly, and our capacity to respond to and resolve issues is limited. We apologize for this and ask for developers' understanding.
|
|
36
36
|
|
|
37
|
+
- **June 16, 2026**: We have added a new Template model for ACE-Step: [vocals2music](https://www.modelscope.cn/models/DiffSynth-Studio/acestep15xlsft-vocals2music). For more details, please refer to the [documentation](/docs/zh/Model_Details/ACE-Step.md) and [example code](/examples/ace_step/).
|
|
38
|
+
|
|
37
39
|
- **June 15, 2026**: We have open-sourced Image-to-LoRA V2, which compresses the hours-long training process for image style LoRAs into a single model inference step and explores a new paradigm for LoRA model training. The [technical report](https://arxiv.org/abs/2606.13809) has been released. This release includes three models:
|
|
38
40
|
* [DiffSynth-Studio/ZImage-i2L-v2](https://modelscope.cn/models/DiffSynth-Studio/ZImage-i2L-v2): Adapted for the Z-Image model
|
|
39
41
|
* [DiffSynth-Studio/KleinBase4B-i2L-v2](https://modelscope.cn/models/DiffSynth-Studio/KleinBase4B-i2L-v2): Adapted for the FLUX.2-klein-base-4B model
|
|
@@ -1036,6 +1038,7 @@ Example code for Ideogram 4 is available at: [/examples/ideogram4/](/examples/id
|
|
|
1036
1038
|
| Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
1037
1039
|
|-|-|-|-|-|-|-|
|
|
1038
1040
|
|[ideogram-ai/ideogram-4-fp8](https://www.modelscope.cn/models/ideogram-ai/ideogram-4-fp8)|[code](/examples/ideogram4/model_inference/ideogram-4-fp8.py)|-|-|-|-|-|
|
|
1041
|
+
|[DiffSynth-Studio/ideogram-4-bf16-repackage](https://www.modelscope.cn/models/DiffSynth-Studio/ideogram-4-bf16-repackage)|[code](/examples/ideogram4/model_inference/ideogram-4-bf16-repackage.py)|[code](/examples/ideogram4/model_inference_low_vram/ideogram-4-bf16-repackage.py)|[code](/examples/ideogram4/model_training/full/Ideogram-4-bf16-repackage.sh)|-|[code](/examples/ideogram4/model_training/lora/Ideogram-4-bf16-repackage.sh)|[code](/examples/ideogram4/model_training/validate_lora/Ideogram-4-bf16-repackage.py)|
|
|
1039
1042
|
|
|
1040
1043
|
</details>
|
|
1041
1044
|
|
|
@@ -1396,6 +1399,7 @@ Example code for ACE-Step is available at: [/examples/ace_step/](/examples/ace_s
|
|
|
1396
1399
|
|[ACE-Step/acestep-v15-xl-base](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-base)|[code](/examples/ace_step/model_inference/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-base.py)|
|
|
1397
1400
|
|[ACE-Step/acestep-v15-xl-sft](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-sft)|[code](/examples/ace_step/model_inference/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-sft.py)|
|
|
1398
1401
|
|[ACE-Step/acestep-v15-xl-turbo](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-turbo)|[code](/examples/ace_step/model_inference/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-turbo.py)|
|
|
1402
|
+
|[DiffSynth-Studio/acestep15xlsft-lora-music](https://www.modelscope.cn/models/DiffSynth-Studio/acestep15xlsft-lora-music)|[code](/examples/ace_step/model_inference/acestep15xlsft-vocals2music.py)|[code](/examples/ace_step/model_inference_low_vram/acestep15xlsft-vocals2music.py)|[code](/examples/ace_step/model_training/full/acestep15xlsft-vocals2music.sh)|[code](/examples/ace_step/model_training/validate_full/acestep15xlsft-vocals2music.py)|-|-|
|
|
1399
1403
|
|
|
1400
1404
|
</details>
|
|
1401
1405
|
|
|
@@ -917,6 +917,14 @@ stable_diffusion_xl_series = [
|
|
|
917
917
|
"state_dict_converter": "diffsynth.utils.state_dict_converters.stable_diffusion_vae.SDVAEStateDictConverter",
|
|
918
918
|
"extra_kwargs": {"scaling_factor": 0.13025, "sample_size": 1024, "force_upcast": True},
|
|
919
919
|
},
|
|
920
|
+
{
|
|
921
|
+
# Example: ModelConfig(model_id="stabilityai/stable-diffusion-xl-base-1.0", origin_file_pattern="sd_xl_base_1.0.safetensors")
|
|
922
|
+
"model_hash": "4cf64a799d04260df438c6f33c9a047e",
|
|
923
|
+
"model_name": "stable_diffusion_xl_unet",
|
|
924
|
+
"model_class": "diffsynth.models.stable_diffusion_xl_unet.SDXLUNet2DConditionModel",
|
|
925
|
+
"extra_kwargs": {"attention_head_dim": [5, 10, 20], "transformer_layers_per_block": [1, 2, 10], "use_linear_projection": True, "addition_embed_type": "text_time", "addition_time_embed_dim": 256, "projection_class_embeddings_input_dim": 2816},
|
|
926
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.sdxl.SDXLUNetStateDictConverter_Original2Diffusers",
|
|
927
|
+
}
|
|
920
928
|
]
|
|
921
929
|
|
|
922
930
|
stable_diffusion_series = [
|
|
@@ -1149,20 +1157,17 @@ ideogram4_series = [
|
|
|
1149
1157
|
"extra_kwargs": {"keep_original_dtype": True},
|
|
1150
1158
|
},
|
|
1151
1159
|
{
|
|
1152
|
-
# Example: ModelConfig(model_id="
|
|
1153
|
-
"model_hash": "
|
|
1154
|
-
"model_name": "
|
|
1155
|
-
"model_class": "diffsynth.models.
|
|
1156
|
-
"
|
|
1157
|
-
"extra_kwargs": {"keep_original_dtype": True},
|
|
1160
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ideogram-4-bf16-repackage", origin_file_pattern="transformer/diffusion_pytorch_model.safetensors")
|
|
1161
|
+
"model_hash": "291b300b11c8c8e11978bd85a9c5f80c",
|
|
1162
|
+
"model_name": "ideogram4_dit",
|
|
1163
|
+
"model_class": "diffsynth.models.ideogram4_dit.Ideogram4DiT",
|
|
1164
|
+
"extra_kwargs": {"config": {"emb_dim": 4608, "num_layers": 34, "num_heads": 18, "intermediate_size": 12288, "adanln_dim": 512, "in_channels": 128, "llm_features_dim": 53248, "rope_theta": 5000000, "mrope_section": [24, 20, 20], "norm_eps": 1e-05}},
|
|
1158
1165
|
},
|
|
1159
1166
|
{
|
|
1160
|
-
# Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="
|
|
1161
|
-
"model_hash": "
|
|
1162
|
-
"model_name": "
|
|
1163
|
-
"model_class": "diffsynth.models.
|
|
1164
|
-
"state_dict_converter": "diffsynth.models.ideogram4_vae.Ideogram4VAEDecoderStateDictConverter",
|
|
1165
|
-
"extra_kwargs": {"keep_original_dtype": True},
|
|
1167
|
+
# Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="text_encoder/model.safetensors")
|
|
1168
|
+
"model_hash": "6a269892c0757aacd46bd41b8d5a7aef",
|
|
1169
|
+
"model_name": "ideogram4_text_encoder",
|
|
1170
|
+
"model_class": "diffsynth.models.ideogram4_text_encoder.Ideogram4TextEncoder",
|
|
1166
1171
|
},
|
|
1167
1172
|
]
|
|
1168
1173
|
|
|
@@ -380,6 +380,19 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
|
|
|
380
380
|
"diffsynth.models.hidream_o1_image_dit.Qwen3VLTextRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
381
381
|
"diffsynth.models.hidream_o1_image_dit.Qwen3VLVisionModel": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
382
382
|
},
|
|
383
|
+
"diffsynth.models.ideogram4_dit.Ideogram4DiT": {
|
|
384
|
+
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
|
|
385
|
+
"diffsynth.models.ideogram4_dit.Ideogram4RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
386
|
+
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
387
|
+
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
388
|
+
},
|
|
389
|
+
"diffsynth.models.ideogram4_text_encoder.Ideogram4TextEncoder": {
|
|
390
|
+
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
|
|
391
|
+
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
392
|
+
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
393
|
+
"transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLTextRotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
394
|
+
"transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLTextRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
395
|
+
},
|
|
383
396
|
}
|
|
384
397
|
|
|
385
398
|
def QwenImageTextEncoder_Module_Map_Updater():
|
|
@@ -26,6 +26,14 @@ try:
|
|
|
26
26
|
except ModuleNotFoundError:
|
|
27
27
|
XFORMERS_AVAILABLE = False
|
|
28
28
|
|
|
29
|
+
try:
|
|
30
|
+
if "enable_gqa" in inspect.signature(torch.nn.functional.scaled_dot_product_attention).parameters:
|
|
31
|
+
TORCH_SUPPORT_GQA = True
|
|
32
|
+
else:
|
|
33
|
+
TORCH_SUPPORT_GQA = False
|
|
34
|
+
except:
|
|
35
|
+
TORCH_SUPPORT_GQA = False
|
|
36
|
+
|
|
29
37
|
|
|
30
38
|
def initialize_attention_priority():
|
|
31
39
|
if os.environ.get('DIFFSYNTH_ATTENTION_IMPLEMENTATION') is not None:
|
|
@@ -68,7 +76,7 @@ def torch_sdpa(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n
|
|
|
68
76
|
q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
|
|
69
77
|
if q.shape[1] != k.shape[1] or q.shape[1] != v.shape[1]:
|
|
70
78
|
# Grouped Query Attention
|
|
71
|
-
if
|
|
79
|
+
if TORCH_SUPPORT_GQA:
|
|
72
80
|
out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask, scale=scale, is_causal=is_causal, enable_gqa=True)
|
|
73
81
|
else:
|
|
74
82
|
# In low-version torch, `enable_gqa` is not supported.
|
|
@@ -2,6 +2,7 @@ import math, warnings
|
|
|
2
2
|
import torch, torchvision, imageio, os
|
|
3
3
|
import imageio.v3 as iio
|
|
4
4
|
from PIL import Image
|
|
5
|
+
from einops import repeat
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class DataProcessingPipeline:
|
|
@@ -283,7 +284,7 @@ class LoadAudioWithTorchaudio(DataProcessingOperator, FrameSamplerByRateMixin):
|
|
|
283
284
|
|
|
284
285
|
class LoadPureAudioWithTorchaudio(DataProcessingOperator):
|
|
285
286
|
|
|
286
|
-
def __init__(self, target_sample_rate=None, max_audio_duration=None, padding=False):
|
|
287
|
+
def __init__(self, target_sample_rate=None, max_audio_duration=None, padding=False, channels=2):
|
|
287
288
|
self.target_sample_rate = target_sample_rate
|
|
288
289
|
self.max_audio_duration = max_audio_duration
|
|
289
290
|
self.resample = True if target_sample_rate is not None else False
|
|
@@ -302,6 +303,8 @@ class LoadPureAudioWithTorchaudio(DataProcessingOperator):
|
|
|
302
303
|
elif current_samples < target_samples and self.padding:
|
|
303
304
|
padding = target_samples - current_samples
|
|
304
305
|
waveform = torch.nn.functional.pad(waveform, (0, padding))
|
|
306
|
+
if waveform.shape[0] == 1:
|
|
307
|
+
waveform = repeat(waveform, "C L -> (N C) L", N=2)
|
|
305
308
|
return waveform, sample_rate
|
|
306
309
|
except Exception as e:
|
|
307
310
|
print(f"Cannot load audio in {data} due to {e}. The audio will be `None`.")
|
|
@@ -87,8 +87,9 @@ class DDIMScheduler():
|
|
|
87
87
|
|
|
88
88
|
|
|
89
89
|
def add_noise(self, original_samples, noise, timestep):
|
|
90
|
-
|
|
91
|
-
|
|
90
|
+
timestep_id = max(min(int(timestep.flatten().tolist()[0]), len(self.alphas_cumprod)-1), 0)
|
|
91
|
+
sqrt_alpha_prod = math.sqrt(self.alphas_cumprod[timestep_id])
|
|
92
|
+
sqrt_one_minus_alpha_prod = math.sqrt(1 - self.alphas_cumprod[timestep_id])
|
|
92
93
|
noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
|
|
93
94
|
return noisy_samples
|
|
94
95
|
|
|
@@ -97,8 +98,9 @@ class DDIMScheduler():
|
|
|
97
98
|
if self.prediction_type == "epsilon":
|
|
98
99
|
return noise
|
|
99
100
|
else:
|
|
100
|
-
|
|
101
|
-
|
|
101
|
+
timestep_id = max(min(int(timestep.flatten().tolist()[0]), len(self.alphas_cumprod)-1), 0)
|
|
102
|
+
sqrt_alpha_prod = math.sqrt(self.alphas_cumprod[timestep_id])
|
|
103
|
+
sqrt_one_minus_alpha_prod = math.sqrt(1 - self.alphas_cumprod[timestep_id])
|
|
102
104
|
target = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
|
|
103
105
|
return target
|
|
104
106
|
|
|
@@ -214,7 +214,7 @@ class FlowMatchScheduler():
|
|
|
214
214
|
logsnr_max = 18.0
|
|
215
215
|
t_min = 1.0 / (1 + math.exp(0.5 * logsnr_max))
|
|
216
216
|
t_max = 1.0 / (1 + math.exp(0.5 * logsnr_min))
|
|
217
|
-
step_intervals = torch.linspace(0.0,
|
|
217
|
+
step_intervals = torch.linspace(0.0, denoising_strength, num_inference_steps + 1, dtype=torch.float64)
|
|
218
218
|
sigmas = []
|
|
219
219
|
for i in range(num_inference_steps + 1):
|
|
220
220
|
z = torch.special.ndtri(step_intervals[i])
|
|
@@ -230,7 +230,7 @@ class FlowMatchScheduler():
|
|
|
230
230
|
one_minus_t = one_minus_t * (sigma_start / one_minus_t[0])
|
|
231
231
|
sigmas = sigmas.flip(dims=(0,))
|
|
232
232
|
timesteps = sigmas[:-1]
|
|
233
|
-
sigmas = 1 - sigmas
|
|
233
|
+
sigmas = (1 - sigmas)[:-1]
|
|
234
234
|
return sigmas, timesteps
|
|
235
235
|
|
|
236
236
|
@staticmethod
|
|
@@ -263,7 +263,7 @@ class FlowMatchScheduler():
|
|
|
263
263
|
|
|
264
264
|
def set_training_weight(self):
|
|
265
265
|
steps = 1000
|
|
266
|
-
x = self.
|
|
266
|
+
x = self.sigmas * self.num_train_timesteps
|
|
267
267
|
y = torch.exp(-2 * ((x - steps / 2) / steps) ** 2)
|
|
268
268
|
y_shifted = y - y.min()
|
|
269
269
|
bsmntw_weighing = y_shifted * (steps / y_shifted.sum())
|
|
@@ -5,6 +5,8 @@ import torch
|
|
|
5
5
|
import torch.nn as nn
|
|
6
6
|
import torch.nn.functional as F
|
|
7
7
|
|
|
8
|
+
from ..core.gradient import gradient_checkpoint_forward
|
|
9
|
+
|
|
8
10
|
LLM_TOKEN_INDICATOR = 3
|
|
9
11
|
OUTPUT_IMAGE_INDICATOR = 2
|
|
10
12
|
IMAGE_POSITION_OFFSET = 65536
|
|
@@ -140,7 +142,7 @@ class Ideogram4MRoPE(nn.Module):
|
|
|
140
142
|
pos = position_ids.permute(2, 0, 1).to(dtype=torch.float32)
|
|
141
143
|
inv_freq = self.inv_freq.to(dtype=torch.float32)[None, None, :, None].expand(
|
|
142
144
|
3, batch_size, -1, 1
|
|
143
|
-
)
|
|
145
|
+
).to(pos.device)
|
|
144
146
|
freqs = inv_freq @ pos.unsqueeze(2)
|
|
145
147
|
freqs = freqs.transpose(2, 3)
|
|
146
148
|
|
|
@@ -291,7 +293,7 @@ class Ideogram4EmbedScalar(nn.Module):
|
|
|
291
293
|
scaled = 1e4 * (x - self.range_min) / (self.range_max - self.range_min)
|
|
292
294
|
emb = _sinusoidal_embedding(scaled, self.dim)
|
|
293
295
|
emb = emb.to(
|
|
294
|
-
getattr(self.mlp_in, "compute_dtype", None) or self.mlp_in.weight.dtype
|
|
296
|
+
getattr(self.mlp_in, "compute_dtype", None) or getattr(self.mlp_in, "computation_dtype", None) or self.mlp_in.weight.dtype
|
|
295
297
|
)
|
|
296
298
|
emb = F.silu(self.mlp_in(emb))
|
|
297
299
|
return self.mlp_out(emb)
|
|
@@ -375,6 +377,8 @@ class Ideogram4DiT(nn.Module):
|
|
|
375
377
|
position_ids: torch.Tensor,
|
|
376
378
|
segment_ids: torch.Tensor,
|
|
377
379
|
indicator: torch.Tensor,
|
|
380
|
+
use_gradient_checkpointing: bool = False,
|
|
381
|
+
use_gradient_checkpointing_offload: bool = False,
|
|
378
382
|
) -> torch.Tensor:
|
|
379
383
|
"""Velocity prediction.
|
|
380
384
|
|
|
@@ -393,7 +397,7 @@ class Ideogram4DiT(nn.Module):
|
|
|
393
397
|
assert in_channels == self.config.in_channels
|
|
394
398
|
|
|
395
399
|
param_dtype = (
|
|
396
|
-
getattr(self.input_proj, "compute_dtype", None) or self.input_proj.weight.dtype
|
|
400
|
+
getattr(self.input_proj, "compute_dtype", None) or getattr(self.input_proj, "computation_dtype", None) or self.input_proj.weight.dtype
|
|
397
401
|
)
|
|
398
402
|
x = x.to(param_dtype)
|
|
399
403
|
t = t.to(param_dtype)
|
|
@@ -428,7 +432,16 @@ class Ideogram4DiT(nn.Module):
|
|
|
428
432
|
sin = sin.to(h.dtype)
|
|
429
433
|
|
|
430
434
|
for layer in self.layers:
|
|
431
|
-
h =
|
|
435
|
+
h = gradient_checkpoint_forward(
|
|
436
|
+
layer,
|
|
437
|
+
use_gradient_checkpointing=use_gradient_checkpointing,
|
|
438
|
+
use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
|
|
439
|
+
x=h,
|
|
440
|
+
segment_ids=segment_ids,
|
|
441
|
+
cos=cos,
|
|
442
|
+
sin=sin,
|
|
443
|
+
adaln_input=adaln_input,
|
|
444
|
+
)
|
|
432
445
|
|
|
433
446
|
out = self.final_layer(h, c=adaln_input)
|
|
434
447
|
return out.to(torch.float32)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
from einops import rearrange
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# Per-feature shift for token latents (128 values). In this module it is
# added after scaling in `decode` and subtracted before dividing in `encode`.
# The length matches the "(P Q C)" token axis used there (2 * 2 * 32).
LATENT_SHIFT = (
    0.01984364, 0.10149707, 0.29689495, 0.27188619, -0.21445648, -0.15979549,
    0.05021099, -0.15083604, -0.15360136, -0.20131799, 0.01922352, 0.0622626,
    0.10140969, -0.06739428, 0.3758261, -0.233712, 0.35164491, -0.02590912,
    -0.0271935, -0.10833897, -0.1476848, -0.01130957, -0.2298372, 0.23526423,
    -0.10893522, 0.11957631, 0.04047799, 0.3134589, -0.17225064, -0.18646109,
    -0.34691978, -0.03571246, 0.02583857, 0.10190072, 0.28402294, 0.26952152,
    -0.21634675, -0.17938656, 0.04358909, -0.15007621, -0.1548502, -0.18971131,
    0.02710861, 0.05609494, 0.10697846, -0.06854968, 0.38167698, -0.24269937,
    0.35705471, -0.03063305, -0.02946109, -0.11244286, -0.14336038, -0.01362137,
    -0.21863696, 0.23228983, -0.11739769, 0.11693044, 0.02563311, 0.31356594,
    -0.17420591, -0.19006285, -0.34905377, -0.04025005, 0.01924137, 0.07652984,
    0.2995608, 0.2628057, -0.22011674, -0.12715361, 0.04879879, -0.14075719,
    -0.15935895, -0.2123584, 0.01974813, 0.05523547, 0.10011992, -0.06428964,
    0.37781868, -0.21491644, 0.34254215, -0.03153528, -0.0310082, -0.10761415,
    -0.14730405, -0.02475182, -0.2285588, 0.2515081, -0.10445128, 0.12446,
    0.07062869, 0.30880162, -0.18016875, -0.18869164, -0.34533499, -0.0129177,
    0.02578168, 0.07993659, 0.28642181, 0.26038408, -0.22459419, -0.14820155,
    0.04059549, -0.14043529, -0.16111187, -0.2020305, 0.02602069, 0.04852717,
    0.10432153, -0.06309942, 0.38402443, -0.22397003, 0.34814481, -0.03774432,
    -0.03381438, -0.11245691, -0.14128767, -0.02853208, -0.21752016, 0.24872463,
    -0.11399775, 0.1222687, 0.05620835, 0.309178, -0.18065738, -0.19401479,
    -0.34495114, -0.01760592,
)
|
|
29
|
+
|
|
30
|
+
# Per-feature scale for token latents (128 values). In this module `decode`
# multiplies normalized latents by it and `encode` divides by it, paired
# element-wise with LATENT_SHIFT.
LATENT_SCALE = (
    1.63933691, 1.70204478, 1.73642566, 1.90004803, 1.6675316, 1.69059584,
    1.56853198, 1.62314944, 1.89106626, 1.58086668, 1.60822129, 1.60962993,
    1.63322129, 1.56074359, 1.73419528, 1.7919265, 1.64040632, 1.66802808,
    1.60390303, 1.75480492, 1.63187587, 1.64334594, 1.61722884, 1.60146046,
    1.63459219, 1.55291476, 1.68771497, 1.68415657, 1.78966054, 1.66631641,
    1.65626686, 1.65976433, 1.63487607, 1.69513249, 1.72933756, 1.91310663,
    1.67035057, 1.72286863, 1.56719251, 1.61934825, 1.88628859, 1.56911539,
    1.59455129, 1.60829869, 1.62470611, 1.56052853, 1.73677003, 1.77563606,
    1.63732541, 1.66370527, 1.59508952, 1.75153949, 1.63029275, 1.64517667,
    1.61659342, 1.59722044, 1.64103121, 1.5408531, 1.68610394, 1.67772755,
    1.78998563, 1.66621713, 1.65458955, 1.66041308, 1.64710857, 1.68163503,
    1.74000294, 1.92784786, 1.67411194, 1.67395548, 1.57406532, 1.62199356,
    1.87618195, 1.5584375, 1.57438785, 1.61711053, 1.63094305, 1.55644029,
    1.73124302, 1.80666627, 1.6463621, 1.65932006, 1.60816188, 1.75682671,
    1.64695873, 1.63121722, 1.61380832, 1.60478651, 1.63396035, 1.53505068,
    1.65534289, 1.67132281, 1.80317197, 1.6767314, 1.65700938, 1.68426259,
    1.65339716, 1.67540638, 1.73298504, 1.94067348, 1.67893609, 1.70635117,
    1.5730906, 1.61928553, 1.87148809, 1.56244866, 1.56697152, 1.61584394,
    1.62759496, 1.55480378, 1.73484107, 1.79055143, 1.64688773, 1.66121492,
    1.60135887, 1.75254572, 1.64798332, 1.62989921, 1.61381592, 1.60792883,
    1.63939668, 1.53075757, 1.65371318, 1.66801185, 1.80029087, 1.67591476,
    1.65655173, 1.68533454,
)
|
|
54
|
+
|
|
55
|
+
def get_latent_norm(device: torch.device) -> tuple[torch.Tensor, torch.Tensor]:
    """Build the latent normalization tensors on ``device``.

    Args:
        device: Device on which both tensors are created.

    Returns:
        ``(shift, scale)`` as float32 tensors built from ``LATENT_SHIFT`` and
        ``LATENT_SCALE`` respectively. New tensors are created on every call.
    """
    shift, scale = (
        torch.tensor(values, dtype=torch.float32, device=device)
        for values in (LATENT_SHIFT, LATENT_SCALE)
    )
    return shift, scale
|
|
59
|
+
|
|
60
|
+
def decode(vae, latents, height, width, torch_dtype):
    """Turn normalized token latents back into a VAE-decoded image.

    Args:
        vae: Object exposing ``_decode(latents)``.
        latents: Token latents laid out as ``B (H W) (P Q C)`` with
            ``P = Q = 2``, ``H = height // 16`` and ``W = width // 16``.
        height: Target image height in pixels.
        width: Target image width in pixels.
        torch_dtype: Dtype of the tensor handed to ``vae._decode``.

    Returns:
        Whatever ``vae._decode`` returns for the unpatchified latents.
    """
    shift, scale = get_latent_norm(latents.device)
    # Undo the normalization in float32 before changing the layout.
    denormalized = latents.float() * scale + shift
    grid_rows = height // 16
    grid_cols = width // 16
    feature_map = rearrange(
        denormalized,
        "B (H W) (P Q C) -> B C (H P) (W Q)",
        P=2,
        Q=2,
        H=grid_rows,
        W=grid_cols,
    )
    # NOTE(review): values pass through bfloat16 before the final cast, so
    # they are rounded to bfloat16 even for a wider torch_dtype - confirm
    # this is intended.
    feature_map = feature_map.to(torch.bfloat16).to(torch_dtype)
    return vae._decode(feature_map)
|
|
67
|
+
|
|
68
|
+
def encode(vae, image, height, width, torch_dtype):
    """Encode an image into normalized token latents.

    Args:
        vae: Object exposing ``_encode(image)``; only the first 32 channels
            of its output are kept.
        image: Input passed unchanged to ``vae._encode``.
        height: Image height in pixels (token grid uses ``height // 16``).
        width: Image width in pixels (token grid uses ``width // 16``).
        torch_dtype: Dtype of the returned latents.

    Returns:
        Latents laid out as ``B (H W) (P Q C)`` with ``P = Q = 2``, shifted
        and scaled with the module-level normalization constants.
    """
    feature_map = vae._encode(image)[:, :32]
    shift, scale = get_latent_norm(feature_map.device)
    grid_rows = height // 16
    grid_cols = width // 16
    tokens = rearrange(
        feature_map,
        "B C (H P) (W Q) -> B (H W) (P Q C)",
        P=2,
        Q=2,
        H=grid_rows,
        W=grid_cols,
    )
    # NOTE(review): the tokens are rounded to bfloat16 before normalizing in
    # float32 - confirm this intermediate cast is intended.
    tokens = tokens.to(torch.bfloat16)
    normalized = (tokens.float() - shift) / scale
    return normalized.to(torch_dtype)
|
|
@@ -10,7 +10,8 @@ from ..diffusion.base_pipeline import BasePipeline, PipelineUnit
|
|
|
10
10
|
from ..core import ModelConfig
|
|
11
11
|
from ..models.ideogram4_dit import Ideogram4DiT, LLM_TOKEN_INDICATOR, OUTPUT_IMAGE_INDICATOR, IMAGE_POSITION_OFFSET
|
|
12
12
|
from ..models.ideogram4_text_encoder import Ideogram4TextEncoder
|
|
13
|
-
from ..models.
|
|
13
|
+
from ..models.flux2_vae import Flux2VAE
|
|
14
|
+
from ..models.ideogram4_vae import encode, decode
|
|
14
15
|
from transformers import AutoTokenizer
|
|
15
16
|
|
|
16
17
|
|
|
@@ -25,8 +26,7 @@ class Ideogram4Pipeline(BasePipeline):
|
|
|
25
26
|
self.text_encoder: Ideogram4TextEncoder = None
|
|
26
27
|
self.dit: Ideogram4DiT = None
|
|
27
28
|
self.dit_uncond: Ideogram4DiT = None
|
|
28
|
-
self.
|
|
29
|
-
self.vae_decoder: Ideogram4VAEDecoder = None
|
|
29
|
+
self.vae: Flux2VAE = None
|
|
30
30
|
self.tokenizer: AutoTokenizer = None
|
|
31
31
|
self.in_iteration_models = ("dit", "dit_uncond")
|
|
32
32
|
self.units = [
|
|
@@ -55,8 +55,7 @@ class Ideogram4Pipeline(BasePipeline):
|
|
|
55
55
|
else:
|
|
56
56
|
pipe.dit = transformers
|
|
57
57
|
pipe.text_encoder = model_pool.fetch_model("ideogram4_text_encoder")
|
|
58
|
-
pipe.
|
|
59
|
-
pipe.vae_decoder = model_pool.fetch_model("ideogram4_vae_decoder")
|
|
58
|
+
pipe.vae = model_pool.fetch_model("flux2_vae")
|
|
60
59
|
|
|
61
60
|
if tokenizer_config is not None:
|
|
62
61
|
tokenizer_config.download_if_necessary()
|
|
@@ -112,16 +111,15 @@ class Ideogram4Pipeline(BasePipeline):
|
|
|
112
111
|
if cfg_scale != 1:
|
|
113
112
|
models = {"dit": self.dit_uncond if self.dit_uncond is not None else self.dit}
|
|
114
113
|
noise_pred_nega = self.model_fn(timestep=timestep, **models, **inputs_shared, **inputs_nega)
|
|
115
|
-
|
|
116
|
-
noise_pred = cfg_scale * noise_pred_posi + (1.0 - cfg_scale) * noise_pred_nega
|
|
114
|
+
noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
|
|
117
115
|
else:
|
|
118
116
|
noise_pred = noise_pred_posi
|
|
119
117
|
|
|
120
118
|
inputs_shared["latents"] = self.step(self.scheduler, progress_id=progress_id, noise_pred=noise_pred, **inputs_shared)
|
|
121
119
|
|
|
122
120
|
# Decode
|
|
123
|
-
self.load_models_to_device(["
|
|
124
|
-
image = self.
|
|
121
|
+
self.load_models_to_device(["vae"])
|
|
122
|
+
image = decode(self.vae, inputs_shared["latents"], height, width, self.torch_dtype)
|
|
125
123
|
image = self.vae_output_to_image(image)
|
|
126
124
|
self.load_models_to_device([])
|
|
127
125
|
return image
|
|
@@ -168,7 +166,7 @@ class Ideogram4Unit_PromptEmbedder(PipelineUnit):
|
|
|
168
166
|
f"prompt has {num_text_tokens} tokens, exceeds max_text_tokens={max_text_tokens}"
|
|
169
167
|
)
|
|
170
168
|
|
|
171
|
-
patch = pipe.dit.patch_size *
|
|
169
|
+
patch = pipe.dit.patch_size * 8
|
|
172
170
|
grid_h = height // patch
|
|
173
171
|
grid_w = width // patch
|
|
174
172
|
num_image_tokens = grid_h * grid_w
|
|
@@ -239,7 +237,7 @@ class Ideogram4Unit_NoiseInitializer(PipelineUnit):
|
|
|
239
237
|
)
|
|
240
238
|
|
|
241
239
|
def process(self, pipe: "Ideogram4Pipeline", height, width, seed, rand_device):
|
|
242
|
-
patch = pipe.dit.patch_size *
|
|
240
|
+
patch = pipe.dit.patch_size * 8
|
|
243
241
|
grid_h = height // patch
|
|
244
242
|
grid_w = width // patch
|
|
245
243
|
num_image_tokens = grid_h * grid_w
|
|
@@ -251,18 +249,17 @@ class Ideogram4Unit_NoiseInitializer(PipelineUnit):
|
|
|
251
249
|
class Ideogram4Unit_InputImageEmbedder(PipelineUnit):
|
|
252
250
|
def __init__(self):
    """Declare the unit's inputs, outputs and the model it needs loaded."""
    unit_settings = dict(
        input_params=("input_image", "noise", "height", "width"),
        output_params=("latents", "input_latents"),
        # Only the VAE is named here as a model to load for this unit.
        onload_model_names=("vae",),
    )
    super().__init__(**unit_settings)
|
|
258
256
|
|
|
259
|
-
def process(self, pipe: "Ideogram4Pipeline", input_image, noise, height, width
|
|
257
|
+
def process(self, pipe: "Ideogram4Pipeline", input_image, noise, height, width):
|
|
260
258
|
if input_image is None:
|
|
261
259
|
return {"latents": noise, "input_latents": None}
|
|
262
|
-
pipe.load_models_to_device(["
|
|
260
|
+
pipe.load_models_to_device(["vae"])
|
|
263
261
|
image = pipe.preprocess_image(input_image)
|
|
264
|
-
input_latents = pipe.
|
|
265
|
-
|
|
262
|
+
input_latents = encode(pipe.vae, image, height, width, torch.bfloat16)
|
|
266
263
|
if pipe.scheduler.training:
|
|
267
264
|
return {"latents": noise, "input_latents": input_latents}
|
|
268
265
|
else:
|
|
@@ -279,6 +276,8 @@ def model_fn_ideogram4(
|
|
|
279
276
|
segment_ids=None,
|
|
280
277
|
indicator=None,
|
|
281
278
|
max_text_tokens=0,
|
|
279
|
+
use_gradient_checkpointing=False,
|
|
280
|
+
use_gradient_checkpointing_offload=False,
|
|
282
281
|
**kwargs,
|
|
283
282
|
):
|
|
284
283
|
t_ideogram4 = timestep.to(torch.float32)
|
|
@@ -292,5 +291,7 @@ def model_fn_ideogram4(
|
|
|
292
291
|
out = dit(
|
|
293
292
|
llm_features=llm_features, x=z, t=t_ideogram4,
|
|
294
293
|
position_ids=position_ids, segment_ids=segment_ids, indicator=indicator,
|
|
294
|
+
use_gradient_checkpointing=use_gradient_checkpointing,
|
|
295
|
+
use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
|
|
295
296
|
)
|
|
296
297
|
return -out[:, max_text_tokens:]
|
|
@@ -13,6 +13,7 @@ from ..models.stable_diffusion_text_encoder import SDTextEncoder
|
|
|
13
13
|
from ..models.stable_diffusion_xl_unet import SDXLUNet2DConditionModel
|
|
14
14
|
from ..models.stable_diffusion_xl_text_encoder import SDXLTextEncoder2
|
|
15
15
|
from ..models.stable_diffusion_vae import StableDiffusionVAE
|
|
16
|
+
from ..utils.lora.sdxl import SdxlLoRALoader
|
|
16
17
|
|
|
17
18
|
|
|
18
19
|
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
|
@@ -53,6 +54,7 @@ class StableDiffusionXLPipeline(BasePipeline):
|
|
|
53
54
|
]
|
|
54
55
|
self.model_fn = model_fn_stable_diffusion_xl
|
|
55
56
|
self.compilable_models = ["unet"]
|
|
57
|
+
self.lora_loader = SdxlLoRALoader
|
|
56
58
|
|
|
57
59
|
@staticmethod
|
|
58
60
|
def from_pretrained(
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import torch, torchaudio
|
|
2
|
+
from diffsynth import load_model, ModelConfig
|
|
3
|
+
from diffsynth.models.demucs import HTDemucs
|
|
4
|
+
|
|
5
|
+
class AudioTrackSeparator(torch.nn.Module):
    """Extract a track from audio with an ``HTDemucs`` model.

    The heavy lifting is delegated to ``HTDemucs.extract_track``; this class
    only loads the model, moves the input to the model's dtype/device and
    optionally resamples the result.
    """

    # Sample rate the resampling step assumes for the extracted track.
    # NOTE(review): presumably the rate produced by `extract_track` - confirm.
    MODEL_SAMPLE_RATE = 44100

    def __init__(self, torch_dtype=torch.float32, device="cuda", model_config=None):
        """Download (if needed) and load the separation model.

        Args:
            torch_dtype: Dtype passed to ``load_model``.
            device: Device passed to ``load_model``.
            model_config: ``ModelConfig`` locating the weights. ``None``
                selects the default ``DiffSynth-Studio/Demucs-Repackage``
                checkpoint.
        """
        super().__init__()
        if model_config is None:
            # Built per instance: a ModelConfig default in the signature would
            # be created once at definition time and shared by every instance.
            model_config = ModelConfig(model_id="DiffSynth-Studio/Demucs-Repackage", origin_file_pattern="model.safetensors")
        model_config.download_if_necessary()
        self.model = load_model(HTDemucs, model_config.path, torch_dtype=torch_dtype, device=device)

    @torch.no_grad()
    def __call__(self, audio, target_sample_rate=48000, **kwargs):
        """Separate a track from ``audio``.

        Args:
            audio: Either a path (``str`` or ``os.PathLike``) loaded with
                ``torchaudio.load``, or a ``(waveform, sample_rate)`` pair.
            target_sample_rate: Sample rate of the returned track. ``None``
                or ``MODEL_SAMPLE_RATE`` skips resampling.
            **kwargs: Accepted for call-site compatibility and ignored.

        Returns:
            The tensor returned by ``self.model.extract_track``, resampled to
            ``target_sample_rate`` when that differs from ``MODEL_SAMPLE_RATE``.
        """
        import os

        if isinstance(audio, (str, os.PathLike)):
            audio, sample_rate = torchaudio.load(audio)
        else:
            audio, sample_rate = audio
        # Look the reference parameter up once; it fixes both dtype and device.
        reference = next(iter(self.model.parameters()))
        audio = audio.to(dtype=reference.dtype, device=reference.device)
        vocals = self.model.extract_track(audio, sample_rate)
        if target_sample_rate is not None and target_sample_rate != self.MODEL_SAMPLE_RATE:
            vocals = torchaudio.functional.resample(vocals, self.MODEL_SAMPLE_RATE, target_sample_rate)
        return vocals
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from diffsynth import load_state_dict
|
|
2
|
+
import torch
|
|
3
|
+
from safetensors.torch import save_file
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def dequantize(source_path, target_path, device="cuda", torch_dtype=torch.bfloat16):
    """Fold ``*.weight_scale`` tensors into their ``*.weight`` tensors.

    For every key ending in ``.weight_scale`` the matching ``.weight`` entry
    is cast to ``torch_dtype``, multiplied by the scale and stored back; the
    scale entry is removed. Other entries are left untouched.

    Args:
        source_path: Checkpoint path passed to ``load_state_dict``.
        target_path: Destination for ``safetensors`` output, or ``None`` to
            skip writing.
        device: Device on which the state dict is loaded.
        torch_dtype: Dtype used for the dequantized weights.

    Returns:
        The dequantized state dict, so the result is usable even when
        ``target_path`` is ``None``.

    Raises:
        KeyError: If a scale has no matching ``.weight`` entry.
    """
    scale_suffix = ".weight_scale"
    sd = load_state_dict(source_path, device=device)
    # Snapshot the keys first: the dict is mutated inside the loop.
    scale_keys = [k for k in sd if k.endswith(scale_suffix)]
    for scale_key in tqdm(scale_keys):
        weight_key = scale_key[:-len(scale_suffix)] + ".weight"
        weight = sd.pop(weight_key).to(torch_dtype)
        scale = sd.pop(scale_key).to(torch_dtype)
        if scale.dim() <= 1:
            # Per-tensor (0-dim) or per-output-channel (1-D) scale: align it
            # with the leading weight axis. For a 2-D weight and 1-D scale
            # this is exactly `scale.unsqueeze(1)`.
            scale = scale.reshape(-1, *([1] * (weight.dim() - 1)))
        else:
            scale = scale.unsqueeze(1)
        sd[weight_key] = weight * scale
    if target_path is not None:
        save_file(sd, target_path)
    return sd
|