diffsynth 2.0.13__tar.gz → 2.0.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffsynth-2.0.13 → diffsynth-2.0.14}/PKG-INFO +1 -1
- {diffsynth-2.0.13 → diffsynth-2.0.14}/README.md +4 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/configs/model_configs.py +9 -12
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/configs/vram_management_module_maps.py +13 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/attention/attention.py +9 -1
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/data/operators.py +4 -1
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/flow_match.py +3 -3
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ideogram4_dit.py +17 -4
- diffsynth-2.0.14/diffsynth/models/ideogram4_vae.py +74 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/ideogram4.py +18 -17
- diffsynth-2.0.14/diffsynth/utils/demucs/__init__.py +21 -0
- diffsynth-2.0.14/diffsynth/utils/dequantizer/__init__.py +15 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth.egg-info/PKG-INFO +1 -1
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth.egg-info/SOURCES.txt +2 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/pyproject.toml +1 -1
- diffsynth-2.0.13/diffsynth/models/ideogram4_vae.py +0 -517
- {diffsynth-2.0.13 → diffsynth-2.0.14}/LICENSE +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/configs/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/attention/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/data/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/data/unified_dataset.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/device/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/device/npu_compatible_device.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/gradient/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/gradient/gradient_checkpoint.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/loader/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/loader/config.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/loader/file.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/loader/model.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/offload_training/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/offload_training/manager.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/offload_training/memory_buffer.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/offload_training/offloader.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/vram/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/vram/disk_map.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/vram/initialization.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/core/vram/layers.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/base_pipeline.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/ddim_scheduler.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/logger.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/loss.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/parsers.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/runner.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/template.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/diffusion/training_module.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/aesthetic.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/base.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/bioclip.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/clip.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/fid.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/hpsv2.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/hpsv3.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/image_reward.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/lpips.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/pickscore.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/qwen_image_bench.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/unified_reward_2.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/metrics/unified_reward_edit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ace_step_conditioner.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ace_step_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ace_step_residual_fsq.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ace_step_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ace_step_tokenizer.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ace_step_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/aesthetic.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/anima_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/bioclip.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/clip.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/demucs.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/dinov3_image_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ernie_image_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ernie_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/fid.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux2_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux2_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_controlnet.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_ipadapter.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_lora_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_lora_patcher.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/flux_value_control.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/general_modules.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/hidream_common.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/hidream_o1_image_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/hpsv2.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/hpsv3.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ideogram4_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/image_reward.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/joyai_image_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/joyai_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/longcat_video_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/lpips.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ltx2_audio_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ltx2_common.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ltx2_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ltx2_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ltx2_upsampler.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/ltx2_video_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/model_loader.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/mova_audio_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/mova_audio_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/mova_dual_tower_bridge.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/nexus_gen.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/nexus_gen_ar_model.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/pickscore.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/qwen_image_bench.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/qwen_image_controlnet.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/qwen_image_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/qwen_image_image2lora.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/qwen_image_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/sd_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/siglip2_image_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_unet.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_xl_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/stable_diffusion_xl_unet.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/step1x_connector.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/step1x_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/unified_reward_2.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/unified_reward_edit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_camera_controller.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_dit_s2v.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_mot.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_motion_controller.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_vace.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wan_video_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wantodance.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/wav2vec.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/z_image_controlnet.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/z_image_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/z_image_image2lora.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/models/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/ace_step.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/anima_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/ernie_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/flux2_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/flux_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/hidream_o1_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/joyai_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/ltx2_audio_video.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/mova_audio_video.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/qwen_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/stable_diffusion.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/stable_diffusion_xl.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/wan_video.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/pipelines/z_image.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/controlnet/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/controlnet/annotator.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/data/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/data/audio.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/data/audio_video.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/data/media_io_ltx2.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/lora/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/lora/flux.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/lora/general.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/lora/merge.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/lora/reset_rank.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/ses/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/ses/ses.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ace_step_conditioner.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ace_step_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ace_step_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ace_step_tokenizer.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/dino_v3.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ernie_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/image_metrics.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/joyai_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/stable_diffusion_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/stable_diffusion_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/stable_diffusion_xl_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/z_image_dit.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/xfuser/__init__.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/utils/xfuser/xdit_context_parallel.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth/version.py +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth.egg-info/dependency_links.txt +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth.egg-info/requires.txt +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/diffsynth.egg-info/top_level.txt +0 -0
- {diffsynth-2.0.13 → diffsynth-2.0.14}/setup.cfg +0 -0
|
@@ -34,6 +34,8 @@ We believe that a well-developed open-source code framework can lower the thresh
|
|
|
34
34
|
|
|
35
35
|
> Currently, this project has a limited number of developers, with most of the work handled by [Artiprocher](https://github.com/Artiprocher) and [mi804](https://github.com/mi804). As a result, new features are developed relatively slowly, and our capacity to respond to and resolve issues is limited. We apologize for this and ask for developers' understanding.
|
|
36
36
|
|
|
37
|
+
- **June 16, 2026**: We have added a new Template model for ACE-Step: [vocals2music](https://www.modelscope.cn/models/DiffSynth-Studio/acestep15xlsft-vocals2music). For more details, please refer to the [documentation](/docs/zh/Model_Details/ACE-Step.md) and [example code](/examples/ace_step/).
|
|
38
|
+
|
|
37
39
|
- **June 15, 2026**: We have open-sourced Image-to-LoRA V2, compressing the hours-long training process for image style LoRAs into a single model inference step, thereby exploring a new paradigm for LoRA model training. The [technical report](https://arxiv.org/abs/2606.13809) has been released. This release includes three models:
|
|
38
40
|
* [DiffSynth-Studio/ZImage-i2L-v2](https://modelscope.cn/models/DiffSynth-Studio/ZImage-i2L-v2): Adapted for the Z-Image model
|
|
39
41
|
* [DiffSynth-Studio/KleinBase4B-i2L-v2](https://modelscope.cn/models/DiffSynth-Studio/KleinBase4B-i2L-v2): Adapted for the FLUX.2-klein-base-4B model
|
|
@@ -1036,6 +1038,7 @@ Example code for Ideogram 4 is available at: [/examples/ideogram4/](/examples/id
|
|
|
1036
1038
|
| Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
1037
1039
|
|-|-|-|-|-|-|-|
|
|
1038
1040
|
|[ideogram-ai/ideogram-4-fp8](https://www.modelscope.cn/models/ideogram-ai/ideogram-4-fp8)|[code](/examples/ideogram4/model_inference/ideogram-4-fp8.py)|-|-|-|-|-|
|
|
1041
|
+
|[DiffSynth-Studio/ideogram-4-bf16-repackage](https://www.modelscope.cn/models/DiffSynth-Studio/ideogram-4-bf16-repackage)|[code](/examples/ideogram4/model_inference/ideogram-4-bf16-repackage.py)|[code](/examples/ideogram4/model_inference_low_vram/ideogram-4-bf16-repackage.py)|[code](/examples/ideogram4/model_training/full/Ideogram-4-bf16-repackage.sh)|-|[code](/examples/ideogram4/model_training/lora/Ideogram-4-bf16-repackage.sh)|[code](/examples/ideogram4/model_training/validate_lora/Ideogram-4-bf16-repackage.py)|
|
|
1039
1042
|
|
|
1040
1043
|
</details>
|
|
1041
1044
|
|
|
@@ -1396,6 +1399,7 @@ Example code for ACE-Step is available at: [/examples/ace_step/](/examples/ace_s
|
|
|
1396
1399
|
|[ACE-Step/acestep-v15-xl-base](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-base)|[code](/examples/ace_step/model_inference/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-base.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-base.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-base.py)|
|
|
1397
1400
|
|[ACE-Step/acestep-v15-xl-sft](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-sft)|[code](/examples/ace_step/model_inference/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-sft.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-sft.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-sft.py)|
|
|
1398
1401
|
|[ACE-Step/acestep-v15-xl-turbo](https://www.modelscope.cn/models/ACE-Step/acestep-v15-xl-turbo)|[code](/examples/ace_step/model_inference/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_inference_low_vram/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/full/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_full/acestep-v15-xl-turbo.py)|[code](/examples/ace_step/model_training/lora/acestep-v15-xl-turbo.sh)|[code](/examples/ace_step/model_training/validate_lora/acestep-v15-xl-turbo.py)|
|
|
1402
|
+
|[DiffSynth-Studio/acestep15xlsft-lora-music](https://www.modelscope.cn/models/DiffSynth-Studio/acestep15xlsft-lora-music)|[code](/examples/ace_step/model_inference/acestep15xlsft-vocals2music.py)|[code](/examples/ace_step/model_inference_low_vram/acestep15xlsft-vocals2music.py)|[code](/examples/ace_step/model_training/full/acestep15xlsft-vocals2music.sh)|[code](/examples/ace_step/model_training/validate_full/acestep15xlsft-vocals2music.py)|-|-|
|
|
1399
1403
|
|
|
1400
1404
|
</details>
|
|
1401
1405
|
|
|
@@ -1149,20 +1149,17 @@ ideogram4_series = [
|
|
|
1149
1149
|
"extra_kwargs": {"keep_original_dtype": True},
|
|
1150
1150
|
},
|
|
1151
1151
|
{
|
|
1152
|
-
# Example: ModelConfig(model_id="
|
|
1153
|
-
"model_hash": "
|
|
1154
|
-
"model_name": "
|
|
1155
|
-
"model_class": "diffsynth.models.
|
|
1156
|
-
"
|
|
1157
|
-
"extra_kwargs": {"keep_original_dtype": True},
|
|
1152
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ideogram-4-bf16-repackage", origin_file_pattern="transformer/diffusion_pytorch_model.safetensors")
|
|
1153
|
+
"model_hash": "291b300b11c8c8e11978bd85a9c5f80c",
|
|
1154
|
+
"model_name": "ideogram4_dit",
|
|
1155
|
+
"model_class": "diffsynth.models.ideogram4_dit.Ideogram4DiT",
|
|
1156
|
+
"extra_kwargs": {"config": {"emb_dim": 4608, "num_layers": 34, "num_heads": 18, "intermediate_size": 12288, "adanln_dim": 512, "in_channels": 128, "llm_features_dim": 53248, "rope_theta": 5000000, "mrope_section": [24, 20, 20], "norm_eps": 1e-05}},
|
|
1158
1157
|
},
|
|
1159
1158
|
{
|
|
1160
|
-
# Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="
|
|
1161
|
-
"model_hash": "
|
|
1162
|
-
"model_name": "
|
|
1163
|
-
"model_class": "diffsynth.models.
|
|
1164
|
-
"state_dict_converter": "diffsynth.models.ideogram4_vae.Ideogram4VAEDecoderStateDictConverter",
|
|
1165
|
-
"extra_kwargs": {"keep_original_dtype": True},
|
|
1159
|
+
# Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="text_encoder/model.safetensors")
|
|
1160
|
+
"model_hash": "6a269892c0757aacd46bd41b8d5a7aef",
|
|
1161
|
+
"model_name": "ideogram4_text_encoder",
|
|
1162
|
+
"model_class": "diffsynth.models.ideogram4_text_encoder.Ideogram4TextEncoder",
|
|
1166
1163
|
},
|
|
1167
1164
|
]
|
|
1168
1165
|
|
|
@@ -380,6 +380,19 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
|
|
|
380
380
|
"diffsynth.models.hidream_o1_image_dit.Qwen3VLTextRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
381
381
|
"diffsynth.models.hidream_o1_image_dit.Qwen3VLVisionModel": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
382
382
|
},
|
|
383
|
+
"diffsynth.models.ideogram4_dit.Ideogram4DiT": {
|
|
384
|
+
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
|
|
385
|
+
"diffsynth.models.ideogram4_dit.Ideogram4RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
386
|
+
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
387
|
+
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
388
|
+
},
|
|
389
|
+
"diffsynth.models.ideogram4_text_encoder.Ideogram4TextEncoder": {
|
|
390
|
+
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
|
|
391
|
+
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
392
|
+
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
393
|
+
"transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLTextRotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
394
|
+
"transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLTextRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
395
|
+
},
|
|
383
396
|
}
|
|
384
397
|
|
|
385
398
|
def QwenImageTextEncoder_Module_Map_Updater():
|
|
@@ -26,6 +26,14 @@ try:
|
|
|
26
26
|
except ModuleNotFoundError:
|
|
27
27
|
XFORMERS_AVAILABLE = False
|
|
28
28
|
|
|
29
|
+
try:
|
|
30
|
+
if "enable_gqa" in inspect.signature(torch.nn.functional.scaled_dot_product_attention).parameters:
|
|
31
|
+
TORCH_SUPPORT_GQA = True
|
|
32
|
+
else:
|
|
33
|
+
TORCH_SUPPORT_GQA = False
|
|
34
|
+
except:
|
|
35
|
+
TORCH_SUPPORT_GQA = False
|
|
36
|
+
|
|
29
37
|
|
|
30
38
|
def initialize_attention_priority():
|
|
31
39
|
if os.environ.get('DIFFSYNTH_ATTENTION_IMPLEMENTATION') is not None:
|
|
@@ -68,7 +76,7 @@ def torch_sdpa(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n
|
|
|
68
76
|
q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
|
|
69
77
|
if q.shape[1] != k.shape[1] or q.shape[1] != v.shape[1]:
|
|
70
78
|
# Grouped Query Attention
|
|
71
|
-
if
|
|
79
|
+
if TORCH_SUPPORT_GQA:
|
|
72
80
|
out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask, scale=scale, is_causal=is_causal, enable_gqa=True)
|
|
73
81
|
else:
|
|
74
82
|
# In low-version torch, `enable_gqa` is not supported.
|
|
@@ -2,6 +2,7 @@ import math, warnings
|
|
|
2
2
|
import torch, torchvision, imageio, os
|
|
3
3
|
import imageio.v3 as iio
|
|
4
4
|
from PIL import Image
|
|
5
|
+
from einops import repeat
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class DataProcessingPipeline:
|
|
@@ -283,7 +284,7 @@ class LoadAudioWithTorchaudio(DataProcessingOperator, FrameSamplerByRateMixin):
|
|
|
283
284
|
|
|
284
285
|
class LoadPureAudioWithTorchaudio(DataProcessingOperator):
|
|
285
286
|
|
|
286
|
-
def __init__(self, target_sample_rate=None, max_audio_duration=None, padding=False):
|
|
287
|
+
def __init__(self, target_sample_rate=None, max_audio_duration=None, padding=False, channels=2):
|
|
287
288
|
self.target_sample_rate = target_sample_rate
|
|
288
289
|
self.max_audio_duration = max_audio_duration
|
|
289
290
|
self.resample = True if target_sample_rate is not None else False
|
|
@@ -302,6 +303,8 @@ class LoadPureAudioWithTorchaudio(DataProcessingOperator):
|
|
|
302
303
|
elif current_samples < target_samples and self.padding:
|
|
303
304
|
padding = target_samples - current_samples
|
|
304
305
|
waveform = torch.nn.functional.pad(waveform, (0, padding))
|
|
306
|
+
if waveform.shape[0] == 1:
|
|
307
|
+
waveform = repeat(waveform, "C L -> (N C) L", N=2)
|
|
305
308
|
return waveform, sample_rate
|
|
306
309
|
except Exception as e:
|
|
307
310
|
print(f"Cannot load audio in {data} due to {e}. The audio will be `None`.")
|
|
@@ -214,7 +214,7 @@ class FlowMatchScheduler():
|
|
|
214
214
|
logsnr_max = 18.0
|
|
215
215
|
t_min = 1.0 / (1 + math.exp(0.5 * logsnr_max))
|
|
216
216
|
t_max = 1.0 / (1 + math.exp(0.5 * logsnr_min))
|
|
217
|
-
step_intervals = torch.linspace(0.0,
|
|
217
|
+
step_intervals = torch.linspace(0.0, denoising_strength, num_inference_steps + 1, dtype=torch.float64)
|
|
218
218
|
sigmas = []
|
|
219
219
|
for i in range(num_inference_steps + 1):
|
|
220
220
|
z = torch.special.ndtri(step_intervals[i])
|
|
@@ -230,7 +230,7 @@ class FlowMatchScheduler():
|
|
|
230
230
|
one_minus_t = one_minus_t * (sigma_start / one_minus_t[0])
|
|
231
231
|
sigmas = sigmas.flip(dims=(0,))
|
|
232
232
|
timesteps = sigmas[:-1]
|
|
233
|
-
sigmas = 1 - sigmas
|
|
233
|
+
sigmas = (1 - sigmas)[:-1]
|
|
234
234
|
return sigmas, timesteps
|
|
235
235
|
|
|
236
236
|
@staticmethod
|
|
@@ -263,7 +263,7 @@ class FlowMatchScheduler():
|
|
|
263
263
|
|
|
264
264
|
def set_training_weight(self):
|
|
265
265
|
steps = 1000
|
|
266
|
-
x = self.
|
|
266
|
+
x = self.sigmas * self.num_train_timesteps
|
|
267
267
|
y = torch.exp(-2 * ((x - steps / 2) / steps) ** 2)
|
|
268
268
|
y_shifted = y - y.min()
|
|
269
269
|
bsmntw_weighing = y_shifted * (steps / y_shifted.sum())
|
|
@@ -5,6 +5,8 @@ import torch
|
|
|
5
5
|
import torch.nn as nn
|
|
6
6
|
import torch.nn.functional as F
|
|
7
7
|
|
|
8
|
+
from ..core.gradient import gradient_checkpoint_forward
|
|
9
|
+
|
|
8
10
|
LLM_TOKEN_INDICATOR = 3
|
|
9
11
|
OUTPUT_IMAGE_INDICATOR = 2
|
|
10
12
|
IMAGE_POSITION_OFFSET = 65536
|
|
@@ -140,7 +142,7 @@ class Ideogram4MRoPE(nn.Module):
|
|
|
140
142
|
pos = position_ids.permute(2, 0, 1).to(dtype=torch.float32)
|
|
141
143
|
inv_freq = self.inv_freq.to(dtype=torch.float32)[None, None, :, None].expand(
|
|
142
144
|
3, batch_size, -1, 1
|
|
143
|
-
)
|
|
145
|
+
).to(pos.device)
|
|
144
146
|
freqs = inv_freq @ pos.unsqueeze(2)
|
|
145
147
|
freqs = freqs.transpose(2, 3)
|
|
146
148
|
|
|
@@ -291,7 +293,7 @@ class Ideogram4EmbedScalar(nn.Module):
|
|
|
291
293
|
scaled = 1e4 * (x - self.range_min) / (self.range_max - self.range_min)
|
|
292
294
|
emb = _sinusoidal_embedding(scaled, self.dim)
|
|
293
295
|
emb = emb.to(
|
|
294
|
-
getattr(self.mlp_in, "compute_dtype", None) or self.mlp_in.weight.dtype
|
|
296
|
+
getattr(self.mlp_in, "compute_dtype", None) or getattr(self.mlp_in, "computation_dtype", None) or self.mlp_in.weight.dtype
|
|
295
297
|
)
|
|
296
298
|
emb = F.silu(self.mlp_in(emb))
|
|
297
299
|
return self.mlp_out(emb)
|
|
@@ -375,6 +377,8 @@ class Ideogram4DiT(nn.Module):
|
|
|
375
377
|
position_ids: torch.Tensor,
|
|
376
378
|
segment_ids: torch.Tensor,
|
|
377
379
|
indicator: torch.Tensor,
|
|
380
|
+
use_gradient_checkpointing: bool = False,
|
|
381
|
+
use_gradient_checkpointing_offload: bool = False,
|
|
378
382
|
) -> torch.Tensor:
|
|
379
383
|
"""Velocity prediction.
|
|
380
384
|
|
|
@@ -393,7 +397,7 @@ class Ideogram4DiT(nn.Module):
|
|
|
393
397
|
assert in_channels == self.config.in_channels
|
|
394
398
|
|
|
395
399
|
param_dtype = (
|
|
396
|
-
getattr(self.input_proj, "compute_dtype", None) or self.input_proj.weight.dtype
|
|
400
|
+
getattr(self.input_proj, "compute_dtype", None) or getattr(self.input_proj, "computation_dtype", None) or self.input_proj.weight.dtype
|
|
397
401
|
)
|
|
398
402
|
x = x.to(param_dtype)
|
|
399
403
|
t = t.to(param_dtype)
|
|
@@ -428,7 +432,16 @@ class Ideogram4DiT(nn.Module):
|
|
|
428
432
|
sin = sin.to(h.dtype)
|
|
429
433
|
|
|
430
434
|
for layer in self.layers:
|
|
431
|
-
h =
|
|
435
|
+
h = gradient_checkpoint_forward(
|
|
436
|
+
layer,
|
|
437
|
+
use_gradient_checkpointing=use_gradient_checkpointing,
|
|
438
|
+
use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
|
|
439
|
+
x=h,
|
|
440
|
+
segment_ids=segment_ids,
|
|
441
|
+
cos=cos,
|
|
442
|
+
sin=sin,
|
|
443
|
+
adaln_input=adaln_input,
|
|
444
|
+
)
|
|
432
445
|
|
|
433
446
|
out = self.final_layer(h, c=adaln_input)
|
|
434
447
|
return out.to(torch.float32)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
from einops import rearrange
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# Additive latent normalisation constants: `decode` adds them to the packed
# latents and `encode` subtracts them. There are 128 values, matching the last
# axis of the packed layout `(P Q C)` used by `decode`/`encode` in this module
# (P = Q = 2 with the 32 channels kept by `encode`).
LATENT_SHIFT = (
    0.01984364, 0.10149707, 0.29689495, 0.27188619, -0.21445648, -0.15979549,
    0.05021099, -0.15083604, -0.15360136, -0.20131799, 0.01922352, 0.0622626,
    0.10140969, -0.06739428, 0.3758261, -0.233712, 0.35164491, -0.02590912,
    -0.0271935, -0.10833897, -0.1476848, -0.01130957, -0.2298372, 0.23526423,
    -0.10893522, 0.11957631, 0.04047799, 0.3134589, -0.17225064, -0.18646109,
    -0.34691978, -0.03571246, 0.02583857, 0.10190072, 0.28402294, 0.26952152,
    -0.21634675, -0.17938656, 0.04358909, -0.15007621, -0.1548502, -0.18971131,
    0.02710861, 0.05609494, 0.10697846, -0.06854968, 0.38167698, -0.24269937,
    0.35705471, -0.03063305, -0.02946109, -0.11244286, -0.14336038, -0.01362137,
    -0.21863696, 0.23228983, -0.11739769, 0.11693044, 0.02563311, 0.31356594,
    -0.17420591, -0.19006285, -0.34905377, -0.04025005, 0.01924137, 0.07652984,
    0.2995608, 0.2628057, -0.22011674, -0.12715361, 0.04879879, -0.14075719,
    -0.15935895, -0.2123584, 0.01974813, 0.05523547, 0.10011992, -0.06428964,
    0.37781868, -0.21491644, 0.34254215, -0.03153528, -0.0310082, -0.10761415,
    -0.14730405, -0.02475182, -0.2285588, 0.2515081, -0.10445128, 0.12446,
    0.07062869, 0.30880162, -0.18016875, -0.18869164, -0.34533499, -0.0129177,
    0.02578168, 0.07993659, 0.28642181, 0.26038408, -0.22459419, -0.14820155,
    0.04059549, -0.14043529, -0.16111187, -0.2020305, 0.02602069, 0.04852717,
    0.10432153, -0.06309942, 0.38402443, -0.22397003, 0.34814481, -0.03774432,
    -0.03381438, -0.11245691, -0.14128767, -0.02853208, -0.21752016, 0.24872463,
    -0.11399775, 0.1222687, 0.05620835, 0.309178, -0.18065738, -0.19401479,
    -0.34495114, -0.01760592,
)
|
|
29
|
+
|
|
30
|
+
# Multiplicative latent normalisation constants: `decode` multiplies the packed
# latents by them and `encode` divides by them. There are 128 values, matching
# the last axis of the packed layout `(P Q C)` used by `decode`/`encode` in
# this module. Every entry is strictly positive, so the division in `encode`
# is always defined.
LATENT_SCALE = (
    1.63933691, 1.70204478, 1.73642566, 1.90004803, 1.6675316, 1.69059584,
    1.56853198, 1.62314944, 1.89106626, 1.58086668, 1.60822129, 1.60962993,
    1.63322129, 1.56074359, 1.73419528, 1.7919265, 1.64040632, 1.66802808,
    1.60390303, 1.75480492, 1.63187587, 1.64334594, 1.61722884, 1.60146046,
    1.63459219, 1.55291476, 1.68771497, 1.68415657, 1.78966054, 1.66631641,
    1.65626686, 1.65976433, 1.63487607, 1.69513249, 1.72933756, 1.91310663,
    1.67035057, 1.72286863, 1.56719251, 1.61934825, 1.88628859, 1.56911539,
    1.59455129, 1.60829869, 1.62470611, 1.56052853, 1.73677003, 1.77563606,
    1.63732541, 1.66370527, 1.59508952, 1.75153949, 1.63029275, 1.64517667,
    1.61659342, 1.59722044, 1.64103121, 1.5408531, 1.68610394, 1.67772755,
    1.78998563, 1.66621713, 1.65458955, 1.66041308, 1.64710857, 1.68163503,
    1.74000294, 1.92784786, 1.67411194, 1.67395548, 1.57406532, 1.62199356,
    1.87618195, 1.5584375, 1.57438785, 1.61711053, 1.63094305, 1.55644029,
    1.73124302, 1.80666627, 1.6463621, 1.65932006, 1.60816188, 1.75682671,
    1.64695873, 1.63121722, 1.61380832, 1.60478651, 1.63396035, 1.53505068,
    1.65534289, 1.67132281, 1.80317197, 1.6767314, 1.65700938, 1.68426259,
    1.65339716, 1.67540638, 1.73298504, 1.94067348, 1.67893609, 1.70635117,
    1.5730906, 1.61928553, 1.87148809, 1.56244866, 1.56697152, 1.61584394,
    1.62759496, 1.55480378, 1.73484107, 1.79055143, 1.64688773, 1.66121492,
    1.60135887, 1.75254572, 1.64798332, 1.62989921, 1.61381592, 1.60792883,
    1.63939668, 1.53075757, 1.65371318, 1.66801185, 1.80029087, 1.67591476,
    1.65655173, 1.68533454,
)
|
|
54
|
+
|
|
55
|
+
def get_latent_norm(device: torch.device) -> tuple[torch.Tensor, torch.Tensor]:
    """Materialise the latent normalisation constants as tensors.

    Args:
        device: Device on which both tensors are created.

    Returns:
        A ``(shift, scale)`` pair of 1-D ``float32`` tensors built from the
        module-level ``LATENT_SHIFT`` and ``LATENT_SCALE`` tuples.
    """
    shift = torch.tensor(LATENT_SHIFT, dtype=torch.float32, device=device)
    scale = torch.tensor(LATENT_SCALE, dtype=torch.float32, device=device)
    return shift, scale
|
|
59
|
+
|
|
60
|
+
def decode(vae, latents, height, width, torch_dtype):
    """De-normalise packed latents, unpack them to a feature map and decode.

    Args:
        vae: Model exposing a ``_decode(latents)`` method.
        latents: Packed latents laid out as ``B (H W) (P Q C)`` with
            ``H = height // 16``, ``W = width // 16`` and ``P = Q = 2``.
        height: Target image height in pixels (used only to derive ``H``).
        width: Target image width in pixels (used only to derive ``W``).
        torch_dtype: Dtype the unpacked latents are cast to before decoding.

    Returns:
        Whatever ``vae._decode`` returns for the unpacked latents.
    """
    latent_shift, latent_scale = get_latent_norm(latents.device)
    # Undo the normalisation in float32; the constants broadcast over the last axis.
    latents = latents.float() * latent_scale + latent_shift
    # Fold each token's 2x2 patch back into the spatial axes.
    latents = rearrange(
        latents,
        "B (H W) (P Q C) -> B C (H P) (W Q)",
        P=2, Q=2, H=height // 16, W=width // 16,
    )
    # NOTE(review): values are rounded to bfloat16 before the final cast, so a
    # wider `torch_dtype` still sees bfloat16-precision inputs. Kept as in the
    # original -- confirm whether this is intentional.
    latents = latents.to(torch.bfloat16)
    latents = latents.to(torch_dtype)
    image = vae._decode(latents)
    return image
|
|
67
|
+
|
|
68
|
+
def encode(vae, image, height, width, torch_dtype):
    """Encode an image, pack the latents into tokens and normalise them.

    Args:
        vae: Model exposing an ``_encode(image)`` method.
        image: Input passed unchanged to ``vae._encode``.
        height: Image height in pixels (used only to derive ``H = height // 16``).
        width: Image width in pixels (used only to derive ``W = width // 16``).
        torch_dtype: Dtype of the returned latents.

    Returns:
        Latents laid out as ``B (H W) (P Q C)`` with ``P = Q = 2``, normalised
        with the module's shift/scale constants and cast to ``torch_dtype``.
    """
    # Keep only the first 32 channels of the encoder output.
    # NOTE(review): presumably the mean half of the posterior parameters -- confirm.
    latents = vae._encode(image)[:, :32]
    latent_shift, latent_scale = get_latent_norm(latents.device)
    # Pack every 2x2 spatial patch into the feature axis of a single token.
    latents = rearrange(
        latents,
        "B C (H P) (W Q) -> B (H W) (P Q C)",
        P=2, Q=2, H=height // 16, W=width // 16,
    )
    # NOTE(review): values are rounded to bfloat16 before normalisation, as in
    # the original -- confirm whether this is intentional.
    latents = latents.to(torch.bfloat16)
    latents = (latents.float() - latent_shift) / latent_scale
    latents = latents.to(torch_dtype)
    return latents
|
|
@@ -10,7 +10,8 @@ from ..diffusion.base_pipeline import BasePipeline, PipelineUnit
|
|
|
10
10
|
from ..core import ModelConfig
|
|
11
11
|
from ..models.ideogram4_dit import Ideogram4DiT, LLM_TOKEN_INDICATOR, OUTPUT_IMAGE_INDICATOR, IMAGE_POSITION_OFFSET
|
|
12
12
|
from ..models.ideogram4_text_encoder import Ideogram4TextEncoder
|
|
13
|
-
from ..models.
|
|
13
|
+
from ..models.flux2_vae import Flux2VAE
|
|
14
|
+
from ..models.ideogram4_vae import encode, decode
|
|
14
15
|
from transformers import AutoTokenizer
|
|
15
16
|
|
|
16
17
|
|
|
@@ -25,8 +26,7 @@ class Ideogram4Pipeline(BasePipeline):
|
|
|
25
26
|
self.text_encoder: Ideogram4TextEncoder = None
|
|
26
27
|
self.dit: Ideogram4DiT = None
|
|
27
28
|
self.dit_uncond: Ideogram4DiT = None
|
|
28
|
-
self.
|
|
29
|
-
self.vae_decoder: Ideogram4VAEDecoder = None
|
|
29
|
+
self.vae: Flux2VAE = None
|
|
30
30
|
self.tokenizer: AutoTokenizer = None
|
|
31
31
|
self.in_iteration_models = ("dit", "dit_uncond")
|
|
32
32
|
self.units = [
|
|
@@ -55,8 +55,7 @@ class Ideogram4Pipeline(BasePipeline):
|
|
|
55
55
|
else:
|
|
56
56
|
pipe.dit = transformers
|
|
57
57
|
pipe.text_encoder = model_pool.fetch_model("ideogram4_text_encoder")
|
|
58
|
-
pipe.
|
|
59
|
-
pipe.vae_decoder = model_pool.fetch_model("ideogram4_vae_decoder")
|
|
58
|
+
pipe.vae = model_pool.fetch_model("flux2_vae")
|
|
60
59
|
|
|
61
60
|
if tokenizer_config is not None:
|
|
62
61
|
tokenizer_config.download_if_necessary()
|
|
@@ -112,16 +111,15 @@ class Ideogram4Pipeline(BasePipeline):
|
|
|
112
111
|
if cfg_scale != 1:
|
|
113
112
|
models = {"dit": self.dit_uncond if self.dit_uncond is not None else self.dit}
|
|
114
113
|
noise_pred_nega = self.model_fn(timestep=timestep, **models, **inputs_shared, **inputs_nega)
|
|
115
|
-
|
|
116
|
-
noise_pred = cfg_scale * noise_pred_posi + (1.0 - cfg_scale) * noise_pred_nega
|
|
114
|
+
noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
|
|
117
115
|
else:
|
|
118
116
|
noise_pred = noise_pred_posi
|
|
119
117
|
|
|
120
118
|
inputs_shared["latents"] = self.step(self.scheduler, progress_id=progress_id, noise_pred=noise_pred, **inputs_shared)
|
|
121
119
|
|
|
122
120
|
# Decode
|
|
123
|
-
self.load_models_to_device(["
|
|
124
|
-
image = self.
|
|
121
|
+
self.load_models_to_device(["vae"])
|
|
122
|
+
image = decode(self.vae, inputs_shared["latents"], height, width, self.torch_dtype)
|
|
125
123
|
image = self.vae_output_to_image(image)
|
|
126
124
|
self.load_models_to_device([])
|
|
127
125
|
return image
|
|
@@ -168,7 +166,7 @@ class Ideogram4Unit_PromptEmbedder(PipelineUnit):
|
|
|
168
166
|
f"prompt has {num_text_tokens} tokens, exceeds max_text_tokens={max_text_tokens}"
|
|
169
167
|
)
|
|
170
168
|
|
|
171
|
-
patch = pipe.dit.patch_size *
|
|
169
|
+
patch = pipe.dit.patch_size * 8
|
|
172
170
|
grid_h = height // patch
|
|
173
171
|
grid_w = width // patch
|
|
174
172
|
num_image_tokens = grid_h * grid_w
|
|
@@ -239,7 +237,7 @@ class Ideogram4Unit_NoiseInitializer(PipelineUnit):
|
|
|
239
237
|
)
|
|
240
238
|
|
|
241
239
|
def process(self, pipe: "Ideogram4Pipeline", height, width, seed, rand_device):
|
|
242
|
-
patch = pipe.dit.patch_size *
|
|
240
|
+
patch = pipe.dit.patch_size * 8
|
|
243
241
|
grid_h = height // patch
|
|
244
242
|
grid_w = width // patch
|
|
245
243
|
num_image_tokens = grid_h * grid_w
|
|
@@ -251,18 +249,17 @@ class Ideogram4Unit_NoiseInitializer(PipelineUnit):
|
|
|
251
249
|
class Ideogram4Unit_InputImageEmbedder(PipelineUnit):
|
|
252
250
|
def __init__(self):
|
|
253
251
|
super().__init__(
|
|
254
|
-
input_params=("input_image", "noise", "height", "width"
|
|
252
|
+
input_params=("input_image", "noise", "height", "width"),
|
|
255
253
|
output_params=("latents", "input_latents"),
|
|
256
|
-
onload_model_names=("
|
|
254
|
+
onload_model_names=("vae",)
|
|
257
255
|
)
|
|
258
256
|
|
|
259
|
-
def process(self, pipe: "Ideogram4Pipeline", input_image, noise, height, width
|
|
257
|
+
def process(self, pipe: "Ideogram4Pipeline", input_image, noise, height, width):
|
|
260
258
|
if input_image is None:
|
|
261
259
|
return {"latents": noise, "input_latents": None}
|
|
262
|
-
pipe.load_models_to_device(["
|
|
260
|
+
pipe.load_models_to_device(["vae"])
|
|
263
261
|
image = pipe.preprocess_image(input_image)
|
|
264
|
-
input_latents = pipe.
|
|
265
|
-
|
|
262
|
+
input_latents = encode(pipe.vae, image, height, width, torch.bfloat16)
|
|
266
263
|
if pipe.scheduler.training:
|
|
267
264
|
return {"latents": noise, "input_latents": input_latents}
|
|
268
265
|
else:
|
|
@@ -279,6 +276,8 @@ def model_fn_ideogram4(
|
|
|
279
276
|
segment_ids=None,
|
|
280
277
|
indicator=None,
|
|
281
278
|
max_text_tokens=0,
|
|
279
|
+
use_gradient_checkpointing=False,
|
|
280
|
+
use_gradient_checkpointing_offload=False,
|
|
282
281
|
**kwargs,
|
|
283
282
|
):
|
|
284
283
|
t_ideogram4 = timestep.to(torch.float32)
|
|
@@ -292,5 +291,7 @@ def model_fn_ideogram4(
|
|
|
292
291
|
out = dit(
|
|
293
292
|
llm_features=llm_features, x=z, t=t_ideogram4,
|
|
294
293
|
position_ids=position_ids, segment_ids=segment_ids, indicator=indicator,
|
|
294
|
+
use_gradient_checkpointing=use_gradient_checkpointing,
|
|
295
|
+
use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
|
|
295
296
|
)
|
|
296
297
|
return -out[:, max_text_tokens:]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import torch, torchaudio
|
|
2
|
+
from diffsynth import load_model, ModelConfig
|
|
3
|
+
from diffsynth.models.demucs import HTDemucs
|
|
4
|
+
|
|
5
|
+
class AudioTrackSeparator(torch.nn.Module):
    """Wrap an ``HTDemucs`` model to extract a track from audio and resample it."""

    # Sample rate the extracted track is resampled from.
    # NOTE(review): assumes `HTDemucs.extract_track` always returns 44.1 kHz audio -- confirm.
    MODEL_SAMPLE_RATE = 44100

    def __init__(self, torch_dtype=torch.float32, device="cuda", model_config=None):
        """Download the checkpoint if needed and load the model.

        Args:
            torch_dtype: Dtype the model is loaded with.
            device: Device the model is loaded onto.
            model_config: ``ModelConfig`` locating the weights. When ``None``,
                the ``DiffSynth-Studio/Demucs-Repackage`` checkpoint is used.
        """
        super().__init__()
        # Build the default per instance: a `ModelConfig(...)` default argument
        # would be created once at definition time and shared (and mutated by
        # `download_if_necessary`) across every instance.
        if model_config is None:
            model_config = ModelConfig(
                model_id="DiffSynth-Studio/Demucs-Repackage",
                origin_file_pattern="model.safetensors",
            )
        model_config.download_if_necessary()
        self.model = load_model(HTDemucs, model_config.path, torch_dtype=torch_dtype, device=device)

    @torch.no_grad()
    def __call__(self, audio, target_sample_rate=48000, **kwargs):
        """Extract a track from ``audio`` and return it at ``target_sample_rate``.

        Args:
            audio: Either a path accepted by ``torchaudio.load`` or a
                ``(waveform, sample_rate)`` pair.
            target_sample_rate: Sample rate of the returned waveform.
            **kwargs: Accepted for call-site compatibility; unused.

        Returns:
            The tensor produced by ``self.model.extract_track``, resampled when
            ``target_sample_rate`` differs from ``MODEL_SAMPLE_RATE``.
        """
        if isinstance(audio, str):
            audio, sample_rate = torchaudio.load(audio)
        else:
            audio, sample_rate = audio
        # Match the model's dtype/device using its first parameter as reference.
        reference = next(iter(self.model.parameters()))
        audio = audio.to(dtype=reference.dtype, device=reference.device)
        vocals = self.model.extract_track(audio, sample_rate)
        if target_sample_rate != self.MODEL_SAMPLE_RATE:
            vocals = torchaudio.functional.resample(vocals, self.MODEL_SAMPLE_RATE, target_sample_rate)
        return vocals
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from diffsynth import load_state_dict
|
|
2
|
+
import torch
|
|
3
|
+
from safetensors.torch import save_file
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def dequantize(source_path, target_path, device="cuda", torch_dtype=torch.bfloat16):
    """Fold ``*.weight_scale`` tensors into their weights and optionally save.

    For every key ending in ``.weight_scale`` the matching ``.weight`` tensor
    is cast to ``torch_dtype``, multiplied by the scale, and stored back under
    the ``.weight`` key; the scale entry is removed from the state dict.

    Args:
        source_path: Checkpoint path passed to ``load_state_dict``.
        target_path: Destination for ``safetensors`` output, or ``None`` to
            skip writing and only use the returned state dict.
        device: Device the state dict is loaded onto.
        torch_dtype: Dtype used for the multiplication and the resulting weights.

    Returns:
        The dequantized state dict. Previously nothing was returned, which made
        ``target_path=None`` discard the result.

    Raises:
        KeyError: If a ``.weight_scale`` key has no matching ``.weight`` key.
    """
    suffix = ".weight_scale"
    sd = load_state_dict(source_path, device=device)
    # Snapshot the keys first: the loop mutates `sd` while iterating.
    scale_keys = [k for k in sd if k.endswith(suffix)]
    for k in tqdm(scale_keys):
        weight_key = k[: -len(suffix)] + ".weight"
        weight = sd.pop(weight_key).to(torch_dtype)
        scale = sd.pop(k).to(torch_dtype)
        # A scale with at least one dim gets a trailing axis at dim 1 so it
        # broadcasts across weight columns (presumably one scale per output
        # row -- confirm). A 0-dim (per-tensor) scale broadcasts as is;
        # `unsqueeze(1)` would raise on it.
        if scale.ndim > 0:
            scale = scale.unsqueeze(1)
        sd[weight_key] = weight * scale
    if target_path is not None:
        save_file(sd, target_path)
    return sd
|
|
@@ -168,6 +168,8 @@ diffsynth/utils/data/__init__.py
|
|
|
168
168
|
diffsynth/utils/data/audio.py
|
|
169
169
|
diffsynth/utils/data/audio_video.py
|
|
170
170
|
diffsynth/utils/data/media_io_ltx2.py
|
|
171
|
+
diffsynth/utils/demucs/__init__.py
|
|
172
|
+
diffsynth/utils/dequantizer/__init__.py
|
|
171
173
|
diffsynth/utils/lora/__init__.py
|
|
172
174
|
diffsynth/utils/lora/flux.py
|
|
173
175
|
diffsynth/utils/lora/general.py
|