diffsynth 2.0.11__tar.gz → 2.0.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffsynth-2.0.11 → diffsynth-2.0.13}/PKG-INFO +9 -1
- {diffsynth-2.0.11 → diffsynth-2.0.13}/README.md +211 -5
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/configs/model_configs.py +143 -16
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/configs/vram_management_module_maps.py +9 -1
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/__init__.py +1 -0
- diffsynth-2.0.13/diffsynth/core/attention/attention.py +183 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/data/operators.py +16 -11
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/data/unified_dataset.py +1 -1
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/loader/model.py +5 -1
- diffsynth-2.0.13/diffsynth/core/offload_training/__init__.py +1 -0
- diffsynth-2.0.13/diffsynth/core/offload_training/manager.py +177 -0
- diffsynth-2.0.13/diffsynth/core/offload_training/memory_buffer.py +136 -0
- diffsynth-2.0.13/diffsynth/core/offload_training/offloader.py +71 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/vram/layers.py +3 -3
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/diffusion/__init__.py +1 -1
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/diffusion/flow_match.py +92 -1
- diffsynth-2.0.13/diffsynth/diffusion/logger.py +107 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/diffusion/loss.py +1 -1
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/diffusion/parsers.py +21 -3
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/diffusion/runner.py +49 -8
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/diffusion/training_module.py +11 -0
- diffsynth-2.0.13/diffsynth/metrics/__init__.py +32 -0
- diffsynth-2.0.13/diffsynth/metrics/aesthetic.py +42 -0
- diffsynth-2.0.13/diffsynth/metrics/base.py +28 -0
- diffsynth-2.0.13/diffsynth/metrics/bioclip.py +45 -0
- diffsynth-2.0.13/diffsynth/metrics/clip.py +55 -0
- diffsynth-2.0.13/diffsynth/metrics/fid.py +37 -0
- diffsynth-2.0.13/diffsynth/metrics/hpsv2.py +41 -0
- diffsynth-2.0.13/diffsynth/metrics/hpsv3.py +63 -0
- diffsynth-2.0.13/diffsynth/metrics/image_reward.py +48 -0
- diffsynth-2.0.13/diffsynth/metrics/lpips.py +63 -0
- diffsynth-2.0.13/diffsynth/metrics/pickscore.py +59 -0
- diffsynth-2.0.13/diffsynth/metrics/qwen_image_bench.py +70 -0
- diffsynth-2.0.13/diffsynth/metrics/unified_reward_2.py +69 -0
- diffsynth-2.0.13/diffsynth/metrics/unified_reward_edit.py +97 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ace_step_conditioner.py +5 -26
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ace_step_dit.py +33 -454
- diffsynth-2.0.13/diffsynth/models/ace_step_residual_fsq.py +569 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ace_step_tokenizer.py +2 -4
- diffsynth-2.0.13/diffsynth/models/aesthetic.py +90 -0
- diffsynth-2.0.13/diffsynth/models/bioclip.py +118 -0
- diffsynth-2.0.13/diffsynth/models/clip.py +153 -0
- diffsynth-2.0.13/diffsynth/models/demucs.py +483 -0
- diffsynth-2.0.13/diffsynth/models/fid.py +238 -0
- diffsynth-2.0.13/diffsynth/models/hidream_common.py +373 -0
- diffsynth-2.0.13/diffsynth/models/hidream_o1_image_dit.py +1910 -0
- diffsynth-2.0.13/diffsynth/models/hpsv2.py +92 -0
- diffsynth-2.0.13/diffsynth/models/hpsv3.py +353 -0
- diffsynth-2.0.13/diffsynth/models/ideogram4_dit.py +434 -0
- diffsynth-2.0.13/diffsynth/models/ideogram4_text_encoder.py +353 -0
- diffsynth-2.0.13/diffsynth/models/ideogram4_vae.py +517 -0
- diffsynth-2.0.13/diffsynth/models/image_reward.py +206 -0
- diffsynth-2.0.13/diffsynth/models/lpips.py +351 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ltx2_audio_vae.py +4 -2
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/model_loader.py +2 -1
- diffsynth-2.0.13/diffsynth/models/pickscore.py +84 -0
- diffsynth-2.0.13/diffsynth/models/qwen_image_bench.py +593 -0
- diffsynth-2.0.13/diffsynth/models/unified_reward_2.py +230 -0
- diffsynth-2.0.13/diffsynth/models/unified_reward_edit.py +377 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/ace_step.py +44 -29
- diffsynth-2.0.13/diffsynth/pipelines/hidream_o1_image.py +425 -0
- diffsynth-2.0.13/diffsynth/pipelines/ideogram4.py +296 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/ltx2_audio_video.py +1 -2
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/qwen_image.py +1 -1
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/z_image.py +6 -1
- diffsynth-2.0.13/diffsynth/utils/state_dict_converters/image_metrics.py +135 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +5 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth.egg-info/PKG-INFO +9 -1
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth.egg-info/SOURCES.txt +40 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth.egg-info/requires.txt +9 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/pyproject.toml +11 -2
- diffsynth-2.0.11/diffsynth/core/attention/attention.py +0 -121
- diffsynth-2.0.11/diffsynth/diffusion/logger.py +0 -43
- {diffsynth-2.0.11 → diffsynth-2.0.13}/LICENSE +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/configs/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/attention/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/data/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/device/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/device/npu_compatible_device.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/gradient/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/gradient/gradient_checkpoint.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/loader/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/loader/config.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/loader/file.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/vram/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/vram/disk_map.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/core/vram/initialization.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/diffusion/base_pipeline.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/diffusion/ddim_scheduler.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/diffusion/template.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ace_step_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ace_step_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/anima_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/dinov3_image_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ernie_image_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ernie_image_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux2_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux2_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux_controlnet.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux_ipadapter.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux_lora_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux_lora_patcher.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/flux_value_control.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/general_modules.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/joyai_image_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/joyai_image_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/longcat_video_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ltx2_common.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ltx2_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ltx2_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ltx2_upsampler.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/ltx2_video_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/mova_audio_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/mova_audio_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/mova_dual_tower_bridge.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/nexus_gen.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/nexus_gen_ar_model.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/qwen_image_controlnet.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/qwen_image_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/qwen_image_image2lora.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/qwen_image_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/sd_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/siglip2_image_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/stable_diffusion_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/stable_diffusion_unet.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/stable_diffusion_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/stable_diffusion_xl_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/stable_diffusion_xl_unet.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/step1x_connector.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/step1x_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/wan_video_camera_controller.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/wan_video_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/wan_video_dit_s2v.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/wan_video_mot.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/wan_video_motion_controller.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/wan_video_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/wan_video_vace.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/wan_video_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/wantodance.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/wav2vec.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/z_image_controlnet.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/z_image_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/z_image_image2lora.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/models/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/anima_image.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/ernie_image.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/flux2_image.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/flux_image.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/joyai_image.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/mova_audio_video.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/stable_diffusion.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/stable_diffusion_xl.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/pipelines/wan_video.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/controlnet/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/controlnet/annotator.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/data/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/data/audio.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/data/audio_video.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/data/media_io_ltx2.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/lora/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/lora/flux.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/lora/general.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/lora/merge.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/lora/reset_rank.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/ses/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/ses/ses.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/ace_step_conditioner.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/ace_step_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/ace_step_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/ace_step_tokenizer.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/dino_v3.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/ernie_image_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/joyai_image_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/stable_diffusion_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/stable_diffusion_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/stable_diffusion_xl_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/z_image_dit.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/xfuser/__init__.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/utils/xfuser/xdit_context_parallel.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth/version.py +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth.egg-info/dependency_links.txt +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/diffsynth.egg-info/top_level.txt +0 -0
- {diffsynth-2.0.11 → diffsynth-2.0.13}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: diffsynth
|
|
3
|
-
Version: 2.0.11
|
|
3
|
+
Version: 2.0.13
|
|
4
4
|
Summary: Enjoy the magic of Diffusion models!
|
|
5
5
|
Author: ModelScope Team
|
|
6
6
|
License: Apache-2.0
|
|
@@ -33,6 +33,14 @@ Requires-Dist: torch==2.7.1+cpu; extra == "npu"
|
|
|
33
33
|
Requires-Dist: torch-npu==2.7.1; extra == "npu"
|
|
34
34
|
Requires-Dist: torchvision==0.22.1+cpu; extra == "npu"
|
|
35
35
|
Provides-Extra: audio
|
|
36
|
+
Requires-Dist: av; extra == "audio"
|
|
36
37
|
Requires-Dist: torchaudio; extra == "audio"
|
|
37
38
|
Requires-Dist: torchcodec; extra == "audio"
|
|
39
|
+
Requires-Dist: librosa; extra == "audio"
|
|
40
|
+
Provides-Extra: all
|
|
41
|
+
Requires-Dist: av; extra == "all"
|
|
42
|
+
Requires-Dist: torchaudio; extra == "all"
|
|
43
|
+
Requires-Dist: torchcodec; extra == "all"
|
|
44
|
+
Requires-Dist: librosa; extra == "all"
|
|
45
|
+
Requires-Dist: streamlit; extra == "all"
|
|
38
46
|
Dynamic: license-file
|
|
@@ -34,6 +34,19 @@ We believe that a well-developed open-source code framework can lower the thresh
|
|
|
34
34
|
|
|
35
35
|
> Currently, the development personnel of this project are limited, with most of the work handled by [Artiprocher](https://github.com/Artiprocher) and [mi804](https://github.com/mi804). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand.
|
|
36
36
|
|
|
37
|
+
- **June 15, 2026** We have open-sourced Image-to-LoRA V2, compressing the hours-long training process for image style LoRAs into a single model inference step, thereby exploring a new paradigm for LoRA model training. The [technical report](https://arxiv.org/abs/2606.13809) has been released. This release includes three models:
|
|
38
|
+
* [DiffSynth-Studio/ZImage-i2L-v2](https://modelscope.cn/models/DiffSynth-Studio/ZImage-i2L-v2): Adapted for the Z-Image model
|
|
39
|
+
* [DiffSynth-Studio/KleinBase4B-i2L-v2](https://modelscope.cn/models/DiffSynth-Studio/KleinBase4B-i2L-v2): Adapted for the FLUX.2-klein-base-4B model
|
|
40
|
+
* [DiffSynth-Studio/HidreamO1-i2L-v2](https://modelscope.cn/models/DiffSynth-Studio/HidreamO1-i2L-v2): Adapted for the Hidream-O1-Image model
|
|
41
|
+
|
|
42
|
+
- **June 5, 2026** Ideogram 4 open-sourced. Support includes text-to-image inference. For details, please refer to the [documentation](/docs/en/Model_Details/Ideogram-4.md) and [example code](/examples/ideogram4/).
|
|
43
|
+
|
|
44
|
+
- **May 21, 2026**: Added support for image quality metrics models, including FID, CLIP, Aesthetic, PickScore, ImageReward, HPSv2, and HPSv3. For details, refer to the [documentation](/docs/en/Model_Details/Image-Quality-Metrics.md) and [example code](/examples/image_quality_metric/).
|
|
45
|
+
|
|
46
|
+
- **May 18, 2026** Added **CPU Offload Training** support. By moving model weights layer-by-layer between CPU and GPU, it significantly reduces GPU VRAM usage during training, enabling LoRA training of large models even on consumer-grade GPUs, compatible with all models. Simply add `--enable_model_cpu_offload` to your training command to enable (currently supports single-GPU training only). For details, see the [documentation](/docs/en/Training/Offload_Training.md).
|
|
47
|
+
|
|
48
|
+
- **May 14, 2026** HiDream-O1-Image open-sourced, welcome a new member to the image model family! Support includes text-to-image generation, image editing, low VRAM inference, and training capabilities. For details, please refer to the [documentation](/docs/en/Model_Details/HiDream-O1-Image.md) and [example code](/examples/hidream_o1_image/).
|
|
49
|
+
|
|
37
50
|
- **April 28, 2026** 🔥 We are excited to announce the release of **Diffusion Templates**, a plugin framework designed for Diffusion models that significantly lowers the barrier to training controllable generative models. Let's explore this cutting-edge technology together!
|
|
38
51
|
* Open-source code: [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio)
|
|
39
52
|
* Technical report: [arXiv](https://arxiv.org/abs/2604.24351)
|
|
@@ -49,6 +62,9 @@ We believe that a well-developed open-source code framework can lower the thresh
|
|
|
49
62
|
|
|
50
63
|
- **April 14, 2026** JoyAI-Image open-sourced, welcome a new member to the image editing model family! Support includes instruction-guided image editing, low VRAM inference, and training capabilities. For details, please refer to the [documentation](/docs/en/Model_Details/JoyAI-Image.md) and [example code](/examples/joyai_image/).
|
|
51
64
|
|
|
65
|
+
<details>
|
|
66
|
+
<summary>More</summary>
|
|
67
|
+
|
|
52
68
|
- **March 19, 2026**: Added support for [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) and [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/Wan.md) and [example code](/examples/mova/) are now available.
|
|
53
69
|
|
|
54
70
|
- **March 12, 2026**: We have added support for the [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) audio-video generation model. The supported features include text-to-audio/video, image-to-audio/video, IC-LoRA control, audio-to-video, and audio-video inpainting. Both inference and training are fully supported. For details, please refer to the [documentation](/docs/en/Model_Details/LTX-2.md) and [code](/examples/ltx2/).
|
|
@@ -57,9 +73,6 @@ We believe that a well-developed open-source code framework can lower the thresh
|
|
|
57
73
|
|
|
58
74
|
- **March 2, 2026** Added support for [Anima](https://modelscope.cn/models/circlestone-labs/Anima). For details, please refer to the [documentation](/docs/en/Model_Details/Anima.md). This is an interesting anime-style image generation model. We look forward to its future updates.
|
|
59
75
|
|
|
60
|
-
<details>
|
|
61
|
-
<summary>More</summary>
|
|
62
|
-
|
|
63
76
|
- **February 26, 2026** Added full and lora training support for the LTX-2 audio-video generation model. See the [documentation](/docs/en/Model_Details/LTX-2.md) for details.
|
|
64
77
|
|
|
65
78
|
- **February 10, 2026** Added inference support for the LTX-2 audio-video generation model. See the [documentation](/docs/en/Model_Details/LTX-2.md) for details. Support for model training will be implemented in the future.
|
|
@@ -309,6 +322,7 @@ Example code for Z-Image is available at: [/examples/z_image/](/examples/z_image
|
|
|
309
322
|
|[PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|
|
|
310
323
|
|[PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|
|
|
311
324
|
|[PAI/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|
|
|
325
|
+
|[DiffSynth-Studio/ZImage-i2L-v2](https://www.modelscope.cn/models/DiffSynth-Studio/ZImage-i2L-v2)|[code](/examples/z_image/model_inference/ZImage-i2L-v2.py)|[code](/examples/z_image/model_inference_low_vram/ZImage-i2L-v2.py)|[code](/examples/z_image/model_training/full/ZImage-i2L-v2.sh)|[code](/examples/z_image/model_training/validate_full/ZImage-i2L-v2.py)|-|-|
|
|
312
326
|
|
|
313
327
|
</details>
|
|
314
328
|
|
|
@@ -499,6 +513,7 @@ Example code for FLUX.2 is available at: [/examples/flux2/](/examples/flux2/)
|
|
|
499
513
|
|[DiffSynth-Studio/Template-KleinBase4B-SoftRGB](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-SoftRGB)|[code](/examples/flux2/model_inference/Template-KleinBase4B-SoftRGB.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-SoftRGB.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-SoftRGB.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-SoftRGB.py)|-|-|
|
|
500
514
|
|[DiffSynth-Studio/Template-KleinBase4B-Upscaler](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-Upscaler)|[code](/examples/flux2/model_inference/Template-KleinBase4B-Upscaler.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-Upscaler.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-Upscaler.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-Upscaler.py)|-|-|
|
|
501
515
|
|[DiffSynth-Studio/Template-KleinBase4B-ContentRef](https://www.modelscope.cn/models/DiffSynth-Studio/Template-KleinBase4B-ContentRef)|[code](/examples/flux2/model_inference/Template-KleinBase4B-ContentRef.py)|[code](/examples/flux2/model_inference_low_vram/Template-KleinBase4B-ContentRef.py)|[code](/examples/flux2/model_training/full/Template-KleinBase4B-ContentRef.sh)|[code](/examples/flux2/model_training/validate_full/Template-KleinBase4B-ContentRef.py)|-|-|
|
|
516
|
+
|[DiffSynth-Studio/KleinBase4B-i2L-v2](https://www.modelscope.cn/models/DiffSynth-Studio/KleinBase4B-i2L-v2)|[code](/examples/flux2/model_inference/KleinBase4B-i2L-v2.py)|[code](/examples/flux2/model_inference_low_vram/KleinBase4B-i2L-v2.py)|[code](/examples/flux2/model_training/full/KleinBase4B-i2L-v2.sh)|[code](/examples/flux2/model_training/validate_full/KleinBase4B-i2L-v2.py)|-|-|
|
|
502
517
|
|
|
503
518
|
</details>
|
|
504
519
|
|
|
@@ -884,6 +899,146 @@ Example code for JoyAI-Image is available at: [/examples/joyai_image/](/examples
|
|
|
884
899
|
|
|
885
900
|
</details>
|
|
886
901
|
|
|
902
|
+
#### HiDream-O1-Image: [/docs/en/Model_Details/HiDream-O1-Image.md](/docs/en/Model_Details/HiDream-O1-Image.md)
|
|
903
|
+
|
|
904
|
+
<details>
|
|
905
|
+
|
|
906
|
+
<summary>Quick Start</summary>
|
|
907
|
+
|
|
908
|
+
Running the following code will quickly load the [HiDream-ai/HiDream-O1-Image](https://modelscope.cn/models/HiDream-ai/HiDream-O1-Image) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM.
|
|
909
|
+
|
|
910
|
+
```python
|
|
911
|
+
from diffsynth.pipelines.hidream_o1_image import HiDreamO1ImagePipeline
|
|
912
|
+
from diffsynth.core.loader.config import ModelConfig
|
|
913
|
+
import torch
|
|
914
|
+
|
|
915
|
+
|
|
916
|
+
vram_config = {
|
|
917
|
+
"offload_dtype": torch.bfloat16,
|
|
918
|
+
"offload_device": "cpu",
|
|
919
|
+
"onload_dtype": torch.bfloat16,
|
|
920
|
+
"onload_device": "cpu",
|
|
921
|
+
"preparing_dtype": torch.bfloat16,
|
|
922
|
+
"preparing_device": "cuda",
|
|
923
|
+
"computation_dtype": torch.bfloat16,
|
|
924
|
+
"computation_device": "cuda",
|
|
925
|
+
}
|
|
926
|
+
|
|
927
|
+
|
|
928
|
+
pipe = HiDreamO1ImagePipeline.from_pretrained(
|
|
929
|
+
torch_dtype=torch.bfloat16,
|
|
930
|
+
device="cuda",
|
|
931
|
+
model_configs=[
|
|
932
|
+
ModelConfig(model_id="HiDream-ai/HiDream-O1-Image", origin_file_pattern="model-*.safetensors", **vram_config),
|
|
933
|
+
],
|
|
934
|
+
processor_config=ModelConfig(model_id="HiDream-ai/HiDream-O1-Image", origin_file_pattern="./"),
|
|
935
|
+
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
|
|
936
|
+
)
|
|
937
|
+
image = pipe(
|
|
938
|
+
prompt="medium shot, eye-level, front view. A woman is seated in an ornate bedroom, illuminated by candlelight, with a calm and composed expression. The subject is a young woman with fair skin, light brown hair styled in an updo with loose tendrils framing her face, and blue eyes. She wears a cream-colored satin robe with delicate floral embroidery and lace trim along the neckline. Her ears are adorned with pearl drop earrings. She is seated on a bed with a dark, intricately carved wooden headboard. To her left, a wooden nightstand holds three lit white candles and a candelabra with multiple lit candles in the background. The bed is covered with patterned pillows and a dark, textured blanket. The walls are paneled with dark wood and feature a large, ornate tapestry with muted earth tones. The lighting creates soft highlights on her face and robe, with warm shadows cast across the room.",
|
|
939
|
+
negative_prompt=" ",
|
|
940
|
+
cfg_scale=4.0,
|
|
941
|
+
height=2048,
|
|
942
|
+
width=2048,
|
|
943
|
+
seed=42,
|
|
944
|
+
num_inference_steps=50,
|
|
945
|
+
)
|
|
946
|
+
image.save("image.jpg")
|
|
947
|
+
```
|
|
948
|
+
|
|
949
|
+
</details>
|
|
950
|
+
|
|
951
|
+
<details>
|
|
952
|
+
|
|
953
|
+
<summary>Examples</summary>
|
|
954
|
+
|
|
955
|
+
Example code for HiDream-O1-Image is available at: [/examples/hidream_o1_image/](/examples/hidream_o1_image/)
|
|
956
|
+
|
|
957
|
+
| Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
958
|
+
|-|-|-|-|-|-|-|
|
|
959
|
+
|[HiDream-ai/HiDream-O1-Image](https://modelscope.cn/models/HiDream-ai/HiDream-O1-Image)|[code](/examples/hidream_o1_image/model_inference/HiDream-O1-Image.py)|[code](/examples/hidream_o1_image/model_inference_low_vram/HiDream-O1-Image.py)|[code](/examples/hidream_o1_image/model_training/full/HiDream-O1-Image.sh)|[code](/examples/hidream_o1_image/model_training/validate_full/HiDream-O1-Image.py)|[code](/examples/hidream_o1_image/model_training/lora/HiDream-O1-Image.sh)|[code](/examples/hidream_o1_image/model_training/validate_lora/HiDream-O1-Image.py)|
|
|
960
|
+
|[HiDream-ai/HiDream-O1-Image-Dev](https://modelscope.cn/models/HiDream-ai/HiDream-O1-Image-Dev)|[code](/examples/hidream_o1_image/model_inference/HiDream-O1-Image-Dev.py)|[code](/examples/hidream_o1_image/model_inference_low_vram/HiDream-O1-Image-Dev.py)|[code](/examples/hidream_o1_image/model_training/full/HiDream-O1-Image-Dev.sh)|[code](/examples/hidream_o1_image/model_training/validate_full/HiDream-O1-Image-Dev.py)|[code](/examples/hidream_o1_image/model_training/lora/HiDream-O1-Image-Dev.sh)|[code](/examples/hidream_o1_image/model_training/validate_lora/HiDream-O1-Image-Dev.py)|
|
|
961
|
+
|[DiffSynth-Studio/HidreamO1-i2L-v2](https://www.modelscope.cn/models/DiffSynth-Studio/HidreamO1-i2L-v2)|[code](/examples/hidream_o1_image/model_inference/HidreamO1-i2L-v2.py)|[code](/examples/hidream_o1_image/model_inference_low_vram/HidreamO1-i2L-v2.py)|[code](/examples/hidream_o1_image/model_training/full/HidreamO1-i2L-v2.sh)|[code](/examples/hidream_o1_image/model_training/validate_full/HidreamO1-i2L-v2.py)|-|-|
|
|
962
|
+
|
|
963
|
+
</details>
|
|
964
|
+
|
|
965
|
+
#### Ideogram 4: [/docs/en/Model_Details/Ideogram-4.md](/docs/en/Model_Details/Ideogram-4.md)
|
|
966
|
+
|
|
967
|
+
<details>
|
|
968
|
+
|
|
969
|
+
<summary>Quick Start</summary>
|
|
970
|
+
|
|
971
|
+
Running the following code will quickly load the [ideogram-ai/ideogram-4-fp8](https://www.modelscope.cn/models/ideogram-ai/ideogram-4-fp8) model and perform inference. The model can run with a minimum of 24GB VRAM.
|
|
972
|
+
|
|
973
|
+
```python
|
|
974
|
+
from diffsynth.pipelines.ideogram4 import Ideogram4Pipeline
|
|
975
|
+
from diffsynth.core import ModelConfig
|
|
976
|
+
import torch
|
|
977
|
+
|
|
978
|
+
|
|
979
|
+
pipe = Ideogram4Pipeline.from_pretrained(
|
|
980
|
+
torch_dtype=torch.bfloat16,
|
|
981
|
+
device="cuda",
|
|
982
|
+
model_configs=[
|
|
983
|
+
ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="transformer/diffusion_pytorch_model.safetensors"),
|
|
984
|
+
# unconditional_transformer is optional. You can delete this line to reduce VRAM required.
|
|
985
|
+
ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="unconditional_transformer/diffusion_pytorch_model.safetensors"),
|
|
986
|
+
ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="text_encoder/model.safetensors"),
|
|
987
|
+
ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
|
|
988
|
+
],
|
|
989
|
+
tokenizer_config=ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="tokenizer/"),
|
|
990
|
+
)
|
|
991
|
+
prompt = r"""
|
|
992
|
+
{
|
|
993
|
+
"high_level_description": "A medium-shot photograph of Formula 1 driver Max Verstappen wearing his Red Bull Racing racing suit and cap, smiling as he holds his racing helmet and talks to a man in a white shirt and black vest at a race track.",
|
|
994
|
+
"style_description": {
|
|
995
|
+
"aesthetics": "saturated primary colors, rule of thirds, joyful and triumphant",
|
|
996
|
+
"lighting": "overcast daylight, diffused, soft subtle shadows",
|
|
997
|
+
"photo": "shallow depth of field, sharp focus, eye-level, telephoto",
|
|
998
|
+
"medium": "photograph"
|
|
999
|
+
},
|
|
1000
|
+
"compositional_deconstruction": {
|
|
1001
|
+
"background": "The background is an out-of-focus racing paddock or track environment. Several blurred figures are visible, including one in an orange shirt. A purple and white structure with a red 'F1' logo stands on the left. The scene is outdoors with daylight, though the sky is not visible.",
|
|
1002
|
+
"elements": [
|
|
1003
|
+
{"type": "obj", "bbox": [55, 642, 1000, 937], "desc": "An older man standing in profile, facing left toward Max Verstappen. He has grey hair and fair skin. He is wearing a white long-sleeved button-down shirt with a navy blue quilted vest over it. He has a slight smile."},
|
|
1004
|
+
{"type": "obj", "bbox": [34, 137, 1000, 617], "desc": "Max Verstappen, a fair-skinned male Formula 1 driver, positioned in the center. He is facing forward with a joyful expression and a slight smile. He wears a navy blue Red Bull Racing team uniform with numerous sponsor logos and a matching baseball cap with the number '1'. He is holding a white and red racing helmet in his hands. He has a silver watch on his left wrist."},
|
|
1005
|
+
{"type": "obj", "bbox": [422, 212, 792, 452], "desc": "Max Verstappen's racing helmet, held in front of his chest. It features a white, red, and yellow design with the Red Bull logo and the 'Player 0.0' branding. The visor is clear and open."},
|
|
1006
|
+
{"type": "text", "bbox": [657, 0, 755, 142], "text": "F1", "desc": "Large, stylized red logo on a black and purple background in the lower left."},
|
|
1007
|
+
{"type": "text", "bbox": [768, 0, 818, 147], "text": "Formula 1\nWorld Championship™", "desc": "Small white sans-serif text below the F1 logo on the left side."},
|
|
1008
|
+
{"type": "text", "bbox": [78, 447, 117, 510], "text": "ORACLE\nRed Bull\nRacing", "desc": "Very small white and orange logo on the front of the navy blue cap."},
|
|
1009
|
+
{"type": "text", "bbox": [78, 417, 120, 440], "text": "1", "desc": "Bold red numeral '1' on the front left side of the navy blue cap."},
|
|
1010
|
+
{"type": "text", "bbox": [332, 442, 363, 483], "text": "Red Bull", "desc": "Small yellow and red text logo on the collar of the uniform."},
|
|
1011
|
+
{"type": "text", "bbox": [373, 490, 423, 532], "text": "RAUCH", "desc": "Small yellow and blue logo on the right chest of the uniform."},
|
|
1012
|
+
{"type": "text", "bbox": [422, 473, 500, 532], "text": "BYBIT\nHONDA", "desc": "Medium-sized white sans-serif text on the right chest of the uniform."},
|
|
1013
|
+
{"type": "text", "bbox": [410, 203, 442, 257], "text": "RAUCH", "desc": "Small yellow logo on the left upper arm of the uniform."},
|
|
1014
|
+
{"type": "text", "bbox": [530, 448, 627, 510], "text": "Red Bull", "desc": "Medium red text logo on the right side of the torso, part of the Red Bull graphic."},
|
|
1015
|
+
{"type": "text", "bbox": [680, 417, 768, 523], "text": "Red Bull", "desc": "Large red text logo across the lower torso of the uniform."},
|
|
1016
|
+
{"type": "text", "bbox": [797, 475, 815, 518], "text": "MAX", "desc": "Small white text next to a Dutch flag on the belt area of the uniform."},
|
|
1017
|
+
{"type": "text", "bbox": [558, 317, 715, 355], "text": "Player 0.0", "desc": "Black sans-serif text on a white band on the racing helmet."},
|
|
1018
|
+
{"type": "text", "bbox": [560, 800, 582, 835], "text": "IA.COM", "desc": "Small blue sans-serif text on the right sleeve of the white shirt."},
|
|
1019
|
+
{"type": "text", "bbox": [968, 8, 997, 332], "text": "© Anadolu Agency via Getty Images", "desc": "Small white watermark text in the bottom left corner."}
|
|
1020
|
+
]
|
|
1021
|
+
}
|
|
1022
|
+
}
|
|
1023
|
+
"""
|
|
1024
|
+
image = pipe(prompt=prompt, height=1024, width=1024, num_inference_steps=48, cfg_scale=7.0, seed=42)
|
|
1025
|
+
image.save("image_ideogram-4-fp8.jpg")
|
|
1026
|
+
```
|
|
1027
|
+
|
|
1028
|
+
</details>
|
|
1029
|
+
|
|
1030
|
+
<details>
|
|
1031
|
+
|
|
1032
|
+
<summary>Examples</summary>
|
|
1033
|
+
|
|
1034
|
+
Example code for Ideogram 4 is available at: [/examples/ideogram4/](/examples/ideogram4/)
|
|
1035
|
+
|
|
1036
|
+
| Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
1037
|
+
|-|-|-|-|-|-|-|
|
|
1038
|
+
|[ideogram-ai/ideogram-4-fp8](https://www.modelscope.cn/models/ideogram-ai/ideogram-4-fp8)|[code](/examples/ideogram4/model_inference/ideogram-4-fp8.py)|-|-|-|-|-|
|
|
1039
|
+
|
|
1040
|
+
</details>
|
|
1041
|
+
|
|
887
1042
|
### Video Synthesis
|
|
888
1043
|
|
|
889
1044
|
https://github.com/user-attachments/assets/1d66ae74-3b02-40a9-acc3-ea95fc039314
|
|
@@ -996,6 +1151,7 @@ Example code for LTX-2 is available at: [/examples/ltx2/](/examples/ltx2/)
|
|
|
996
1151
|
|
|
997
1152
|
| Model ID | Extra Args | Inference | Low-VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
998
1153
|
|-|-|-|-|-|-|-|-|
|
|
1154
|
+
|[jd-opensource/JoyAI-Echo](https://modelscope.cn/models/jd-opensource/JoyAI-Echo)||[code](/examples/ltx2/model_inference/JoyAI-Echo-T2AV.py)|[code](/examples/ltx2/model_inference_low_vram/JoyAI-Echo-T2AV.py)|[code](/examples/ltx2/model_training/full/JoyAI-Echo-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/JoyAI-Echo-T2AV.py)|[code](/examples/ltx2/model_training/lora/JoyAI-Echo-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/JoyAI-Echo-T2AV.py)|
|
|
999
1155
|
|[Lightricks/LTX-2.3: OneStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2.3-I2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2.3-I2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2.3-I2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-I2AV.py)|
|
|
1000
1156
|
|[Lightricks/LTX-2.3: TwoStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-TwoStage.py)|-|-|-|-|
|
|
1001
1157
|
|[Lightricks/LTX-2.3: DistilledPipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-DistilledPipeline.py)|-|-|-|-|
|
|
@@ -1158,8 +1314,8 @@ Example code for Wan is available at: [/examples/wanvideo/](/examples/wanvideo/)
|
|
|
1158
1314
|
|[PAI/Wan2.2-Fun-A14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control-Camera)|`control_camera_video`, `input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py)|
|
|
1159
1315
|
|[openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference/MOVA-360p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference_low_vram/MOVA-360p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/full/MOVA-360P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_full/MOVA-360p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/lora/MOVA-360P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_lora/MOVA-360p-I2AV.py)|
|
|
1160
1316
|
|[openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference/MOVA-720p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference_low_vram/MOVA-720p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/full/MOVA-720P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_full/MOVA-720p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/lora/MOVA-720P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_lora/MOVA-720p-I2AV.py)|
|
|
1161
|
-
|[Wan-AI/
|
|
1162
|
-
|[Wan-AI/
|
|
1317
|
+
|[Wan-AI/Wan2.2-Dancer-14B (global model)](https://modelscope.cn/models/Wan-AI/Wan2.2-Dancer-14B)|`wantodance_music_path`, `wantodance_reference_image`, `wantodance_fps`, `wantodance_keyframes`, `wantodance_keyframes_mask`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Dancer-14B-global.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Dancer-14B-global.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Dancer-14B-global.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Dancer-14B-global.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Dancer-14B-global.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Dancer-14B-global.py)|
|
|
1318
|
+
|[Wan-AI/Wan2.2-Dancer-14B (local model)](https://modelscope.cn/models/Wan-AI/Wan2.2-Dancer-14B)|`wantodance_music_path`, `wantodance_reference_image`, `wantodance_fps`, `wantodance_keyframes`, `wantodance_keyframes_mask`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Dancer-14B-local.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Dancer-14B-local.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Dancer-14B-local.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Dancer-14B-local.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Dancer-14B-local.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Dancer-14B-local.py)|
|
|
1163
1319
|
|
|
1164
1320
|
</details>
|
|
1165
1321
|
|
|
@@ -1243,6 +1399,56 @@ Example code for ACE-Step is available at: [/examples/ace_step/](/examples/ace_s
|
|
|
1243
1399
|
|
|
1244
1400
|
</details>
|
|
1245
1401
|
|
|
1402
|
+
### Image Quality Metrics Models
|
|
1403
|
+
|
|
1404
|
+
[/docs/en/Model_Details/Image-Quality-Metrics.md](/docs/en/Model_Details/Image-Quality-Metrics.md)
|
|
1405
|
+
|
|
1406
|
+
<details>
|
|
1407
|
+
|
|
1408
|
+
<summary>Quick Start</summary>
|
|
1409
|
+
|
|
1410
|
+
Run the following code to quickly load PickScore and evaluate an image against a text prompt. The default model will be downloaded from ModelScope to `./models`.
|
|
1411
|
+
|
|
1412
|
+
```python
|
|
1413
|
+
from diffsynth.metrics import PickScoreMetric, ModelConfig
|
|
1414
|
+
from modelscope import dataset_snapshot_download
|
|
1415
|
+
from PIL import Image
|
|
1416
|
+
|
|
1417
|
+
dataset_snapshot_download(
|
|
1418
|
+
"DiffSynth-Studio/diffsynth_example_dataset",
|
|
1419
|
+
allow_file_pattern="flux/FLUX.1-dev/*",
|
|
1420
|
+
local_dir="./data/diffsynth_example_dataset",
|
|
1421
|
+
)
|
|
1422
|
+
image = Image.open("data/diffsynth_example_dataset/flux/FLUX.1-dev/1.jpg").convert("RGB")
|
|
1423
|
+
prompt = "a dog"
|
|
1424
|
+
metric = PickScoreMetric.from_pretrained(
|
|
1425
|
+
model_config=ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="PickScore/model.safetensors"),
|
|
1426
|
+
device="cuda"
|
|
1427
|
+
)
|
|
1428
|
+
score = metric.compute(prompt, image)[0]
|
|
1429
|
+
print(f"PickScore score: {score:.3f}")
|
|
1430
|
+
```
|
|
1431
|
+
|
|
1432
|
+
</details>
|
|
1433
|
+
|
|
1434
|
+
<details>
|
|
1435
|
+
|
|
1436
|
+
<summary>Example Code</summary>
|
|
1437
|
+
|
|
1438
|
+
Example code for image quality metrics models can be found at: [/examples/image_quality_metric/](/examples/image_quality_metric/)
|
|
1439
|
+
|
|
1440
|
+
| Metric | GitHub Repository | Example Code |
|
|
1441
|
+
| - | - | - |
|
|
1442
|
+
| PickScore | [GitHub](https://github.com/yuvalkirstain/pickscore) | [code](/examples/image_quality_metric/pickscore.py) |
|
|
1443
|
+
| ImageReward | [GitHub](https://github.com/zai-org/ImageReward) | [code](/examples/image_quality_metric/image_reward.py) |
|
|
1444
|
+
| HPSv2 | [GitHub](https://github.com/tgxs002/HPSv2) | [code](/examples/image_quality_metric/hpsv2.py) |
|
|
1445
|
+
| HPSv3 | [GitHub](https://github.com/MizzenAI/HPSv3) | [code](/examples/image_quality_metric/hpsv3.py) |
|
|
1446
|
+
| CLIP Score | [GitHub](https://github.com/openai/CLIP) | [code](/examples/image_quality_metric/clipscore.py) |
|
|
1447
|
+
| Aesthetic | [GitHub](https://github.com/christophschuhmann/improved-aesthetic-predictor) | [code](/examples/image_quality_metric/aesthetic.py) |
|
|
1448
|
+
| FID | [GitHub](https://github.com/mseitzer/pytorch-fid) | [code](/examples/image_quality_metric/fid.py) |
|
|
1449
|
+
|
|
1450
|
+
</details>
|
|
1451
|
+
|
|
1246
1452
|
## Innovative Achievements
|
|
1247
1453
|
|
|
1248
1454
|
DiffSynth-Studio is not just an engineered model framework, but also an incubator for innovative achievements.
|
|
@@ -309,7 +309,7 @@ wan_series = [
|
|
|
309
309
|
"state_dict_converter": "diffsynth.utils.state_dict_converters.wans2v_audio_encoder.WanS2VAudioEncoderStateDictConverter",
|
|
310
310
|
},
|
|
311
311
|
{
|
|
312
|
-
# Example: ModelConfig(model_id="Wan-AI/
|
|
312
|
+
# Example: ModelConfig(model_id="Wan-AI/Wan2.2-Dancer-14B", origin_file_pattern="global_model.safetensors")
|
|
313
313
|
"model_hash": "eb18873fc0ba77b541eb7b62dbcd2059",
|
|
314
314
|
"model_name": "wan_video_dit",
|
|
315
315
|
"model_class": "diffsynth.models.wan_video_dit.WanModel",
|
|
@@ -833,20 +833,6 @@ ltx2_series = [
|
|
|
833
833
|
"extra_kwargs": {"decoder_version": "ltx-2.3"},
|
|
834
834
|
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
|
|
835
835
|
},
|
|
836
|
-
{
|
|
837
|
-
# Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vocoder.safetensors")
|
|
838
|
-
"model_hash": "7d7823dde8f1ea0b50fb07ac329dd4cb",
|
|
839
|
-
"model_name": "ltx2_audio_vae_decoder",
|
|
840
|
-
"model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder",
|
|
841
|
-
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter",
|
|
842
|
-
},
|
|
843
|
-
{
|
|
844
|
-
# Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vae_encoder.safetensors")
|
|
845
|
-
"model_hash": "29338f3b95e7e312a3460a482e4f4554",
|
|
846
|
-
"model_name": "ltx2_audio_vae_encoder",
|
|
847
|
-
"model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder",
|
|
848
|
-
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter",
|
|
849
|
-
},
|
|
850
836
|
{
|
|
851
837
|
# Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vocoder.safetensors")
|
|
852
838
|
"model_hash": "cd436c99e69ec5c80f050f0944f02a15",
|
|
@@ -965,6 +951,7 @@ joyai_image_series = [
|
|
|
965
951
|
},
|
|
966
952
|
{
|
|
967
953
|
# Example: ModelConfig(model_id="jd-opensource/JoyAI-Image-Edit", origin_file_pattern="JoyAI-Image-Und/model-*.safetensors")
|
|
954
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="UnifiedReward-Edit-qwen3vl-8b/model-*.safetensors")
|
|
968
955
|
"model_hash": "2d11bf14bba8b4e87477c8199a895403",
|
|
969
956
|
"model_name": "joyai_image_text_encoder",
|
|
970
957
|
"model_class": "diffsynth.models.joyai_image_text_encoder.JoyAIImageTextEncoder",
|
|
@@ -1038,9 +1025,149 @@ ace_step_series = [
|
|
|
1038
1025
|
"model_class": "diffsynth.models.ace_step_tokenizer.AceStepTokenizer",
|
|
1039
1026
|
"state_dict_converter": "diffsynth.utils.state_dict_converters.ace_step_tokenizer.AceStepTokenizerStateDictConverter",
|
|
1040
1027
|
},
|
|
1028
|
+
{
|
|
1029
|
+
# Example: ???
|
|
1030
|
+
"model_hash": "ff74b1806e6a0b52e7bbd1d3df2d26d1",
|
|
1031
|
+
"model_name": "demucs",
|
|
1032
|
+
"model_class": "diffsynth.models.demucs.HTDemucs",
|
|
1033
|
+
},
|
|
1034
|
+
]
|
|
1035
|
+
|
|
1036
|
+
image_metrics_series = [
|
|
1037
|
+
{
|
|
1038
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="PickScore/model.safetensors")
|
|
1039
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="CLIP-ViT-H-14-laion2B-s32B-b79K/model.safetensors")
|
|
1040
|
+
"model_hash": "b5e2c0bfcbf4085ccdb2feb8f0ba408a",
|
|
1041
|
+
"model_name": "image_metrics_clip_hf",
|
|
1042
|
+
"model_class": "diffsynth.models.clip.ImageMetricsCLIPModel",
|
|
1043
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsCLIPStateDictConverter",
|
|
1044
|
+
},
|
|
1045
|
+
{
|
|
1046
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="HPSv2/model.safetensors")
|
|
1047
|
+
"model_hash": "f79e72cec8ae5a540cff0304bfb21b00",
|
|
1048
|
+
"model_name": "image_metrics_hpsv2",
|
|
1049
|
+
"model_class": "diffsynth.models.clip.ImageMetricsCLIPModel",
|
|
1050
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsOpenCLIPStateDictConverter",
|
|
1051
|
+
},
|
|
1052
|
+
{
|
|
1053
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="HPSv3/model.safetensors")
|
|
1054
|
+
"model_hash": "5655d9cde15b759cfeefe7432d7a912c",
|
|
1055
|
+
"model_name": "image_metrics_hpsv3",
|
|
1056
|
+
"model_class": "diffsynth.models.hpsv3.HPSv3Qwen2VLRewardModel",
|
|
1057
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsHPSv3StateDictConverter",
|
|
1058
|
+
"extra_kwargs": {"vocab_size": 151658, "output_dim": 2, "reward_token": "special", "rm_head_type": "ranknet"},
|
|
1059
|
+
},
|
|
1060
|
+
{
|
|
1061
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="ImageReward/model.safetensors")
|
|
1062
|
+
"model_hash": "b3cc8e10b76ca98cde653daa5cf63139",
|
|
1063
|
+
"model_name": "image_metrics_image_reward",
|
|
1064
|
+
"model_class": "diffsynth.models.image_reward.ImageRewardModel",
|
|
1065
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsImageRewardStateDictConverter",
|
|
1066
|
+
},
|
|
1067
|
+
{
|
|
1068
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="Aesthetic/model.safetensors")
|
|
1069
|
+
"model_hash": "306981222ec94302794e07cf676c84cc",
|
|
1070
|
+
"model_name": "image_metrics_aesthetic",
|
|
1071
|
+
"model_class": "diffsynth.models.aesthetic.AestheticModel",
|
|
1072
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsAestheticStateDictConverter",
|
|
1073
|
+
},
|
|
1074
|
+
{
|
|
1075
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="FID/model.safetensors")
|
|
1076
|
+
"model_hash": "d4e9549be726259b444d1f62db4ce413",
|
|
1077
|
+
"model_name": "image_metrics_fid_inception",
|
|
1078
|
+
"model_class": "diffsynth.models.fid.FIDInceptionModel",
|
|
1079
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsFIDStateDictConverter",
|
|
1080
|
+
},
|
|
1081
|
+
{
|
|
1082
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="BioCLIPv2/open_clip_model.safetensors")
|
|
1083
|
+
"model_hash": "3a020a3e47afb7c5e21c52f2d0692c09",
|
|
1084
|
+
"model_name": "image_metrics_bioclip_v2",
|
|
1085
|
+
"model_class": "diffsynth.models.bioclip.BioCLIPv2Model",
|
|
1086
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsOpenCLIPStateDictConverter",
|
|
1087
|
+
},
|
|
1088
|
+
{
|
|
1089
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="LPIPS/alexnet.safetensors")
|
|
1090
|
+
"model_hash": "08a75c660c9b2e775c530a0955857f1f",
|
|
1091
|
+
"model_name": "image_metrics_lpips_alex",
|
|
1092
|
+
"model_class": "diffsynth.models.lpips.LPIPSModel",
|
|
1093
|
+
"extra_kwargs": {"net": "alex"},
|
|
1094
|
+
},
|
|
1095
|
+
{
|
|
1096
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="LPIPS/vgg.safetensors")
|
|
1097
|
+
"model_hash": "5740953aaa8aba2ecd9b9c23da813591",
|
|
1098
|
+
"model_name": "image_metrics_lpips_vgg",
|
|
1099
|
+
"model_class": "diffsynth.models.lpips.LPIPSModel",
|
|
1100
|
+
"extra_kwargs": {"net": "vgg"},
|
|
1101
|
+
},
|
|
1102
|
+
{
|
|
1103
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="LPIPS/squeezenet.safetensors")
|
|
1104
|
+
"model_hash": "ff994b70a30599287a332105396d5004",
|
|
1105
|
+
"model_name": "image_metrics_lpips_squeeze",
|
|
1106
|
+
"model_class": "diffsynth.models.lpips.LPIPSModel",
|
|
1107
|
+
"extra_kwargs": {"net": "squeeze"},
|
|
1108
|
+
},
|
|
1109
|
+
{
|
|
1110
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="UnifiedReward-2.0-qwen35-9b/model-*.safetensors")
|
|
1111
|
+
"model_hash": "f9786d06eca5c0f1ece89843b2c4cc66",
|
|
1112
|
+
"model_name": "image_metrics_unified_reward_2",
|
|
1113
|
+
"model_class": "diffsynth.models.unified_reward_2.UnifiedReward2Qwen35ForConditionalGeneration",
|
|
1114
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsUnifiedRewardStateDictConverter",
|
|
1115
|
+
"extra_kwargs": {"variant": "qwen35_9b"},
|
|
1116
|
+
},
|
|
1117
|
+
{
|
|
1118
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/ImageMetrics", origin_file_pattern="Qwen-Image-Bench/model-*.safetensors")
|
|
1119
|
+
"model_hash": "ff4ad0463675e96738483611f6dd551b",
|
|
1120
|
+
"model_name": "image_metrics_qwen_image_bench",
|
|
1121
|
+
"model_class": "diffsynth.models.qwen_image_bench.QwenImageBenchQwen35ForConditionalGeneration",
|
|
1122
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.image_metrics.ImageMetricsUnifiedRewardStateDictConverter",
|
|
1123
|
+
"extra_kwargs": {"variant": "qwen35"},
|
|
1124
|
+
},
|
|
1125
|
+
]
|
|
1126
|
+
|
|
1127
|
+
hidream_o1_image_series = [
|
|
1128
|
+
{
|
|
1129
|
+
# Example: ModelConfig(model_id="HiDream-ai/HiDream-O1-Image", origin_file_pattern="model-*.safetensors")
|
|
1130
|
+
"model_hash": "58a7c1073d79556bfc61e05e6061b771",
|
|
1131
|
+
"model_name": "hidream_o1_image_dit",
|
|
1132
|
+
"model_class": "diffsynth.models.hidream_o1_image_dit.HiDreamO1ImageModel",
|
|
1133
|
+
},
|
|
1134
|
+
]
|
|
1135
|
+
|
|
1136
|
+
ideogram4_series = [
|
|
1137
|
+
{
|
|
1138
|
+
# Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="transformer/diffusion_pytorch_model.safetensors")
|
|
1139
|
+
"model_hash": "6f56a1d28667f2ff98e1c79af88a7516",
|
|
1140
|
+
"model_name": "ideogram4_dit",
|
|
1141
|
+
"model_class": "diffsynth.models.ideogram4_dit.Ideogram4DiT",
|
|
1142
|
+
"extra_kwargs": {"config": {"emb_dim": 4608, "num_layers": 34, "num_heads": 18, "intermediate_size": 12288, "adanln_dim": 512, "in_channels": 128, "llm_features_dim": 53248, "rope_theta": 5000000, "mrope_section": [24, 20, 20], "norm_eps": 1e-05}, "keep_original_dtype": True},
|
|
1143
|
+
},
|
|
1144
|
+
{
|
|
1145
|
+
# Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="text_encoder/model.safetensors")
|
|
1146
|
+
"model_hash": "6d72a86d1027baff87e2cf8fc523aab1",
|
|
1147
|
+
"model_name": "ideogram4_text_encoder",
|
|
1148
|
+
"model_class": "diffsynth.models.ideogram4_text_encoder.Ideogram4TextEncoder",
|
|
1149
|
+
"extra_kwargs": {"keep_original_dtype": True},
|
|
1150
|
+
},
|
|
1151
|
+
{
|
|
1152
|
+
# Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
|
|
1153
|
+
"model_hash": "c54288e3ee12ca215898840682337b95",
|
|
1154
|
+
"model_name": "ideogram4_vae_encoder",
|
|
1155
|
+
"model_class": "diffsynth.models.ideogram4_vae.Ideogram4VAEEncoder",
|
|
1156
|
+
"state_dict_converter": "diffsynth.models.ideogram4_vae.Ideogram4VAEEncoderStateDictConverter",
|
|
1157
|
+
"extra_kwargs": {"keep_original_dtype": True},
|
|
1158
|
+
},
|
|
1159
|
+
{
|
|
1160
|
+
# Example: ModelConfig(model_id="ideogram-ai/ideogram-4-fp8", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
|
|
1161
|
+
"model_hash": "c54288e3ee12ca215898840682337b95",
|
|
1162
|
+
"model_name": "ideogram4_vae_decoder",
|
|
1163
|
+
"model_class": "diffsynth.models.ideogram4_vae.Ideogram4VAEDecoder",
|
|
1164
|
+
"state_dict_converter": "diffsynth.models.ideogram4_vae.Ideogram4VAEDecoderStateDictConverter",
|
|
1165
|
+
"extra_kwargs": {"keep_original_dtype": True},
|
|
1166
|
+
},
|
|
1041
1167
|
]
|
|
1042
1168
|
|
|
1043
1169
|
MODEL_CONFIGS = (
|
|
1044
1170
|
stable_diffusion_xl_series + stable_diffusion_series + qwen_image_series + wan_series + flux_series + flux2_series + ernie_image_series
|
|
1045
|
-
+ z_image_series + ltx2_series + anima_series + mova_series + joyai_image_series + ace_step_series
|
|
1171
|
+
+ z_image_series + ltx2_series + anima_series + mova_series + joyai_image_series + ace_step_series + hidream_o1_image_series
|
|
1172
|
+
+ image_metrics_series + ideogram4_series
|
|
1046
1173
|
)
|
|
@@ -327,7 +327,7 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
|
|
|
327
327
|
"diffsynth.models.ace_step_tokenizer.AceStepTokenizer": {
|
|
328
328
|
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
|
|
329
329
|
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
330
|
-
"
|
|
330
|
+
"diffsynth.models.ace_step_residual_fsq.ResidualFSQ": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
331
331
|
"transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
332
332
|
"transformers.models.qwen3.modeling_qwen3.Qwen3MLP": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
333
333
|
"transformers.models.qwen3.modeling_qwen3.Qwen3RotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
@@ -372,6 +372,14 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
|
|
|
372
372
|
"diffsynth.models.stable_diffusion_text_encoder.CLIPAttention": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
373
373
|
"diffsynth.models.stable_diffusion_xl_text_encoder.CLIPTextModelWithProjection": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
374
374
|
},
|
|
375
|
+
"diffsynth.models.hidream_o1_image_dit.HiDreamO1ImageModel": {
|
|
376
|
+
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
|
|
377
|
+
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
378
|
+
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
379
|
+
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
380
|
+
"diffsynth.models.hidream_o1_image_dit.Qwen3VLTextRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
381
|
+
"diffsynth.models.hidream_o1_image_dit.Qwen3VLVisionModel": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
382
|
+
},
|
|
375
383
|
}
|
|
376
384
|
|
|
377
385
|
def QwenImageTextEncoder_Module_Map_Updater():
|