diffsynth 2.0.2__tar.gz → 2.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffsynth-2.0.2/diffsynth.egg-info → diffsynth-2.0.4}/PKG-INFO +1 -1
- {diffsynth-2.0.2 → diffsynth-2.0.4}/README.md +18 -6
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/configs/model_configs.py +22 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/data/unified_dataset.py +5 -1
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/device/__init__.py +1 -1
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/loader/config.py +2 -1
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/loader/file.py +15 -6
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/loader/model.py +5 -3
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/diffusion/base_pipeline.py +3 -1
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/diffusion/flow_match.py +7 -2
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/diffusion/logger.py +1 -1
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/diffusion/runner.py +1 -1
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/diffusion/training_module.py +55 -4
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/dinov3_image_encoder.py +3 -1
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux2_dit.py +29 -38
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/longcat_video_dit.py +6 -5
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/model_loader.py +4 -3
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/nexus_gen_ar_model.py +1 -1
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/siglip2_image_encoder.py +3 -1
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/step1x_text_encoder.py +10 -9
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/wan_video_dit.py +2 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/z_image_dit.py +3 -3
- diffsynth-2.0.4/diffsynth/models/z_image_text_encoder.py +74 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/pipelines/flux2_image.py +226 -5
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/pipelines/flux_image.py +6 -5
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/pipelines/qwen_image.py +3 -2
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/pipelines/wan_video.py +4 -3
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/pipelines/z_image.py +3 -2
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/controlnet/annotator.py +2 -1
- diffsynth-2.0.4/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +6 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/xfuser/xdit_context_parallel.py +1 -1
- {diffsynth-2.0.2 → diffsynth-2.0.4/diffsynth.egg-info}/PKG-INFO +1 -1
- diffsynth-2.0.4/diffsynth.egg-info/SOURCES.txt +119 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/pyproject.toml +1 -1
- diffsynth-2.0.2/data/style/move.py +0 -13
- diffsynth-2.0.2/data/style/test.py +0 -57
- diffsynth-2.0.2/diffsynth/models/z_image_text_encoder.py +0 -41
- diffsynth-2.0.2/diffsynth.egg-info/SOURCES.txt +0 -430
- diffsynth-2.0.2/examples/dev_tools/fix_path.py +0 -43
- diffsynth-2.0.2/examples/dev_tools/unit_test.py +0 -121
- diffsynth-2.0.2/examples/flux/model_inference/FLEX.2-preview.py +0 -50
- diffsynth-2.0.2/examples/flux/model_inference/FLUX.1-Kontext-dev.py +0 -54
- diffsynth-2.0.2/examples/flux/model_inference/FLUX.1-Krea-dev.py +0 -27
- diffsynth-2.0.2/examples/flux/model_inference/FLUX.1-dev-AttriCtrl.py +0 -19
- diffsynth-2.0.2/examples/flux/model_inference/FLUX.1-dev-Controlnet-Inpainting-Beta.py +0 -37
- diffsynth-2.0.2/examples/flux/model_inference/FLUX.1-dev-Controlnet-Union-alpha.py +0 -40
- diffsynth-2.0.2/examples/flux/model_inference/FLUX.1-dev-Controlnet-Upscaler.py +0 -33
- diffsynth-2.0.2/examples/flux/model_inference/FLUX.1-dev-EliGen.py +0 -133
- diffsynth-2.0.2/examples/flux/model_inference/FLUX.1-dev-IP-Adapter.py +0 -24
- diffsynth-2.0.2/examples/flux/model_inference/FLUX.1-dev-InfiniteYou.py +0 -61
- diffsynth-2.0.2/examples/flux/model_inference/FLUX.1-dev-LoRA-Encoder.py +0 -38
- diffsynth-2.0.2/examples/flux/model_inference/FLUX.1-dev-LoRA-Fusion.py +0 -38
- diffsynth-2.0.2/examples/flux/model_inference/FLUX.1-dev.py +0 -26
- diffsynth-2.0.2/examples/flux/model_inference/Nexus-Gen-Editing.py +0 -37
- diffsynth-2.0.2/examples/flux/model_inference/Nexus-Gen-Generation.py +0 -32
- diffsynth-2.0.2/examples/flux/model_inference/Step1X-Edit.py +0 -32
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLEX.2-preview.py +0 -61
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLUX.1-Kontext-dev.py +0 -65
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLUX.1-Krea-dev.py +0 -38
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLUX.1-dev-AttriCtrl.py +0 -30
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Inpainting-Beta.py +0 -48
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Union-alpha.py +0 -50
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Upscaler.py +0 -44
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLUX.1-dev-EliGen.py +0 -144
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLUX.1-dev-IP-Adapter.py +0 -35
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLUX.1-dev-InfiniteYou.py +0 -73
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLUX.1-dev-LoRA-Encoder.py +0 -49
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLUX.1-dev-LoRA-Fusion.py +0 -38
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/FLUX.1-dev.py +0 -37
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/Nexus-Gen-Editing.py +0 -48
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/Nexus-Gen-Generation.py +0 -43
- diffsynth-2.0.2/examples/flux/model_inference_low_vram/Step1X-Edit.py +0 -43
- diffsynth-2.0.2/examples/flux/model_training/train.py +0 -193
- diffsynth-2.0.2/examples/flux/model_training/validate_full/FLEX.2-preview.py +0 -20
- diffsynth-2.0.2/examples/flux/model_training/validate_full/FLUX.1-Kontext-dev.py +0 -26
- diffsynth-2.0.2/examples/flux/model_training/validate_full/FLUX.1-Krea-dev.py +0 -20
- diffsynth-2.0.2/examples/flux/model_training/validate_full/FLUX.1-dev-AttriCtrl.py +0 -21
- diffsynth-2.0.2/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Inpainting-Beta.py +0 -31
- diffsynth-2.0.2/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Union-alpha.py +0 -31
- diffsynth-2.0.2/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Upscaler.py +0 -30
- diffsynth-2.0.2/examples/flux/model_training/validate_full/FLUX.1-dev-IP-Adapter.py +0 -28
- diffsynth-2.0.2/examples/flux/model_training/validate_full/FLUX.1-dev-InfiniteYou.py +0 -33
- diffsynth-2.0.2/examples/flux/model_training/validate_full/FLUX.1-dev-LoRA-Encoder.py +0 -24
- diffsynth-2.0.2/examples/flux/model_training/validate_full/FLUX.1-dev.py +0 -20
- diffsynth-2.0.2/examples/flux/model_training/validate_full/Nexus-Gen.py +0 -28
- diffsynth-2.0.2/examples/flux/model_training/validate_full/Step1X-Edit.py +0 -25
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/FLEX.2-preview.py +0 -18
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/FLUX.1-Kontext-dev.py +0 -24
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/FLUX.1-Krea-dev.py +0 -18
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/FLUX.1-dev-AttriCtrl.py +0 -19
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Inpainting-Beta.py +0 -29
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Union-alpha.py +0 -29
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Upscaler.py +0 -28
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/FLUX.1-dev-EliGen.py +0 -33
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/FLUX.1-dev-IP-Adapter.py +0 -26
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/FLUX.1-dev-InfiniteYou.py +0 -28
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/FLUX.1-dev.py +0 -18
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/Nexus-Gen.py +0 -26
- diffsynth-2.0.2/examples/flux/model_training/validate_lora/Step1X-Edit.py +0 -23
- diffsynth-2.0.2/examples/flux2/model_inference/FLUX.2-dev.py +0 -27
- diffsynth-2.0.2/examples/flux2/model_inference_low_vram/FLUX.2-dev.py +0 -27
- diffsynth-2.0.2/examples/flux2/model_training/train.py +0 -143
- diffsynth-2.0.2/examples/flux2/model_training/validate_lora/FLUX.2-dev.py +0 -28
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-2512.py +0 -17
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py +0 -31
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py +0 -32
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Inpaint.py +0 -33
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-Distill-DMD2.py +0 -25
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py +0 -17
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py +0 -20
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-Edit-2509.py +0 -31
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-Edit-2511.py +0 -44
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py +0 -25
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-Edit.py +0 -25
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-EliGen-Poster.py +0 -114
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-EliGen-V2.py +0 -106
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-EliGen.py +0 -107
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-In-Context-Control-Union.py +0 -35
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-Layered.py +0 -36
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image-i2L.py +0 -110
- diffsynth-2.0.2/examples/qwen_image/model_inference/Qwen-Image.py +0 -17
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-2512.py +0 -28
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py +0 -42
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py +0 -43
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Inpaint.py +0 -44
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-DMD2.py +0 -36
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-Full.py +0 -28
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-LoRA.py +0 -31
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2509.py +0 -43
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2511.py +0 -54
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-Lowres-Fix.py +0 -37
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit.py +0 -37
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-Poster.py +0 -125
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-V2.py +0 -117
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen.py +0 -118
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-In-Context-Control-Union.py +0 -46
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered.py +0 -46
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image-i2L.py +0 -134
- diffsynth-2.0.2/examples/qwen_image/model_inference_low_vram/Qwen-Image.py +0 -28
- diffsynth-2.0.2/examples/qwen_image/model_training/scripts/Qwen-Image-Blockwise-ControlNet-Initialize.py +0 -13
- diffsynth-2.0.2/examples/qwen_image/model_training/scripts/Qwen-Image-Blockwise-ControlNet-Inpaint-Initialize.py +0 -12
- diffsynth-2.0.2/examples/qwen_image/model_training/special/fp8_training/validate.py +0 -18
- diffsynth-2.0.2/examples/qwen_image/model_training/special/simple/train.py +0 -76
- diffsynth-2.0.2/examples/qwen_image/model_training/special/split_training/validate.py +0 -18
- diffsynth-2.0.2/examples/qwen_image/model_training/train.py +0 -169
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_full/Qwen-Image-2512.py +0 -20
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Canny.py +0 -31
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Depth.py +0 -31
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Inpaint.py +0 -32
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py +0 -20
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2509.py +0 -26
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2511.py +0 -26
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit.py +0 -23
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_full/Qwen-Image-Layered.py +0 -28
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_full/Qwen-Image.py +0 -20
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-2512.py +0 -18
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Canny.py +0 -32
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Depth.py +0 -33
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Inpaint.py +0 -34
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py +0 -18
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-LoRA.py +0 -23
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2509.py +0 -24
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2511.py +0 -24
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit.py +0 -21
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen-Poster.py +0 -29
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py +0 -29
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-In-Context-Control-Union.py +0 -19
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered.py +0 -27
- diffsynth-2.0.2/examples/qwen_image/model_training/validate_lora/Qwen-Image.py +0 -18
- diffsynth-2.0.2/examples/wanvideo/acceleration/unified_sequence_parallel.py +0 -26
- diffsynth-2.0.2/examples/wanvideo/model_inference/LongCat-Video.py +0 -35
- diffsynth-2.0.2/examples/wanvideo/model_inference/Video-As-Prompt-Wan2.1-14B.py +0 -49
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-1.3b-speedcontrol-v1.py +0 -34
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-FLF2V-14B-720P.py +0 -36
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-Control.py +0 -34
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-InP.py +0 -36
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-Fun-14B-Control.py +0 -34
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-Fun-14B-InP.py +0 -36
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py +0 -44
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control.py +0 -36
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-InP.py +0 -36
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control-Camera.py +0 -44
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control.py +0 -36
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-InP.py +0 -36
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-I2V-14B-480P.py +0 -34
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-I2V-14B-720P.py +0 -35
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-T2V-1.3B.py +0 -34
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-T2V-14B.py +0 -24
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B-Preview.py +0 -52
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B.py +0 -53
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.1-VACE-14B.py +0 -54
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py +0 -62
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control-Camera.py +0 -43
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control.py +0 -35
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-InP.py +0 -35
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.2-I2V-A14B.py +0 -33
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.2-S2V-14B.py +0 -73
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.2-S2V-14B_multi_clips.py +0 -124
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.2-T2V-A14B.py +0 -24
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.2-TI2V-5B.py +0 -43
- diffsynth-2.0.2/examples/wanvideo/model_inference/Wan2.2-VACE-Fun-A14B.py +0 -68
- diffsynth-2.0.2/examples/wanvideo/model_inference/krea-realtime-video.py +0 -25
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/LongCat-Video.py +0 -46
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Video-As-Prompt-Wan2.1-14B.py +0 -62
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-1.3b-speedcontrol-v1.py +0 -45
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-FLF2V-14B-720P.py +0 -47
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-1.3B-Control.py +0 -45
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-1.3B-InP.py +0 -47
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-14B-Control.py +0 -45
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-14B-InP.py +0 -47
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py +0 -55
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-1.3B-Control.py +0 -47
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-1.3B-InP.py +0 -47
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-14B-Control-Camera.py +0 -55
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-14B-Control.py +0 -47
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-14B-InP.py +0 -47
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-I2V-14B-480P.py +0 -45
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-I2V-14B-720P.py +0 -46
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-T2V-1.3B.py +0 -45
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-T2V-14B.py +0 -35
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-VACE-1.3B-Preview.py +0 -63
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-VACE-1.3B.py +0 -64
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.1-VACE-14B.py +0 -65
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py +0 -74
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-Control-Camera.py +0 -55
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-Control.py +0 -46
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-InP.py +0 -46
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.2-I2V-A14B.py +0 -44
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.2-S2V-14B.py +0 -84
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.2-S2V-14B_multi_clips.py +0 -133
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.2-T2V-A14B.py +0 -35
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.2-TI2V-5B.py +0 -54
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/Wan2.2-VACE-Fun-A14B.py +0 -65
- diffsynth-2.0.2/examples/wanvideo/model_inference_low_vram/krea-realtime-video.py +0 -36
- diffsynth-2.0.2/examples/wanvideo/model_training/special/direct_distill/validate.py +0 -23
- diffsynth-2.0.2/examples/wanvideo/model_training/special/fp8_training/validate.py +0 -28
- diffsynth-2.0.2/examples/wanvideo/model_training/special/low_vram_training/validate.py +0 -28
- diffsynth-2.0.2/examples/wanvideo/model_training/special/split_training/validate.py +0 -28
- diffsynth-2.0.2/examples/wanvideo/model_training/train.py +0 -185
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/LongCat-Video.py +0 -25
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Video-As-Prompt-Wan2.1-14B.py +0 -43
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py +0 -28
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py +0 -33
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py +0 -32
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py +0 -31
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py +0 -32
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py +0 -31
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py +0 -32
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py +0 -33
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py +0 -31
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py +0 -32
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py +0 -33
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py +0 -31
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py +0 -30
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py +0 -31
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-1.3B.py +0 -25
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-14B.py +0 -25
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py +0 -30
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B.py +0 -30
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-14B.py +0 -30
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py +0 -33
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py +0 -34
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py +0 -35
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py +0 -32
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py +0 -33
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py +0 -53
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py +0 -28
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py +0 -30
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/Wan2.2-VACE-Fun-A14B.py +0 -43
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_full/krea-realtime-video.py +0 -28
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/LongCat-Video.py +0 -23
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Video-As-Prompt-Wan2.1-14B.py +0 -42
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py +0 -27
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py +0 -31
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py +0 -30
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py +0 -29
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-Control.py +0 -30
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-InP.py +0 -29
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py +0 -31
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py +0 -31
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py +0 -29
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py +0 -31
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py +0 -31
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py +0 -29
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-480P.py +0 -28
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-720P.py +0 -29
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-1.3B.py +0 -23
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-14B.py +0 -23
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py +0 -28
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B.py +0 -28
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-14B.py +0 -28
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py +0 -32
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py +0 -32
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py +0 -32
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py +0 -30
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py +0 -30
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py +0 -50
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py +0 -27
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py +0 -29
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/Wan2.2-VACE-Fun-A14B.py +0 -30
- diffsynth-2.0.2/examples/wanvideo/model_training/validate_lora/krea-realtime-video.py +0 -28
- diffsynth-2.0.2/examples/z_image/model_inference/Z-Image-Omni-Base-i2L.py +0 -62
- diffsynth-2.0.2/examples/z_image/model_inference/Z-Image-Omni-Base.py +0 -24
- diffsynth-2.0.2/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py +0 -27
- diffsynth-2.0.2/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py +0 -40
- diffsynth-2.0.2/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py +0 -46
- diffsynth-2.0.2/examples/z_image/model_inference/Z-Image-Turbo.py +0 -17
- diffsynth-2.0.2/examples/z_image/model_inference_low_vram/Z-Image-Omni-Base-i2L.py +0 -62
- diffsynth-2.0.2/examples/z_image/model_inference_low_vram/Z-Image-Omni-Base.py +0 -33
- diffsynth-2.0.2/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py +0 -37
- diffsynth-2.0.2/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py +0 -50
- diffsynth-2.0.2/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py +0 -56
- diffsynth-2.0.2/examples/z_image/model_inference_low_vram/Z-Image-Turbo.py +0 -27
- diffsynth-2.0.2/examples/z_image/model_training/special/differential_training/validate.py +0 -18
- diffsynth-2.0.2/examples/z_image/model_training/special/trajectory_imitation/validate.py +0 -18
- diffsynth-2.0.2/examples/z_image/model_training/train.py +0 -153
- diffsynth-2.0.2/examples/z_image/model_training/validate_full/Z-Image-Omni-Base.py +0 -33
- diffsynth-2.0.2/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py +0 -24
- diffsynth-2.0.2/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py +0 -24
- diffsynth-2.0.2/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py +0 -24
- diffsynth-2.0.2/examples/z_image/model_training/validate_full/Z-Image-Turbo.py +0 -20
- diffsynth-2.0.2/examples/z_image/model_training/validate_lora/Z-Image-Omni-Base.py +0 -31
- diffsynth-2.0.2/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py +0 -23
- diffsynth-2.0.2/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py +0 -23
- diffsynth-2.0.2/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py +0 -23
- diffsynth-2.0.2/examples/z_image/model_training/validate_lora/Z-Image-Turbo.py +0 -18
- {diffsynth-2.0.2 → diffsynth-2.0.4}/LICENSE +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/configs/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/configs/vram_management_module_maps.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/attention/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/attention/attention.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/data/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/data/operators.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/device/npu_compatible_device.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/gradient/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/gradient/gradient_checkpoint.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/loader/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/vram/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/vram/disk_map.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/vram/initialization.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/core/vram/layers.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/diffusion/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/diffusion/loss.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/diffusion/parsers.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux2_vae.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux_controlnet.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux_dit.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux_ipadapter.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux_lora_encoder.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux_lora_patcher.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux_vae.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/flux_value_control.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/general_modules.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/nexus_gen.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/qwen_image_controlnet.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/qwen_image_dit.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/qwen_image_image2lora.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/qwen_image_vae.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/sd_text_encoder.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/step1x_connector.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/wan_video_camera_controller.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/wan_video_dit_s2v.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/wan_video_mot.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/wan_video_motion_controller.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/wan_video_text_encoder.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/wan_video_vace.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/wan_video_vae.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/wav2vec.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/z_image_controlnet.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/models/z_image_image2lora.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/controlnet/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/data/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/lora/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/lora/flux.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/lora/general.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/lora/merge.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/lora/reset_rank.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth/utils/xfuser/__init__.py +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth.egg-info/dependency_links.txt +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth.egg-info/requires.txt +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/diffsynth.egg-info/top_level.txt +0 -0
- {diffsynth-2.0.2 → diffsynth-2.0.4}/setup.cfg +0 -0
|
@@ -33,7 +33,11 @@ We believe that a well-developed open-source code framework can lower the thresh
|
|
|
33
33
|
|
|
34
34
|
> Currently, the development personnel of this project are limited, with most of the work handled by [Artiprocher](https://github.com/Artiprocher). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand.
|
|
35
35
|
|
|
36
|
-
- **January
|
|
36
|
+
- **January 27, 2026**: [Z-Image](https://modelscope.cn/models/Tongyi-MAI/Z-Image) is released, and our [Z-Image-i2L](https://www.modelscope.cn/models/DiffSynth-Studio/Z-Image-i2L) model is released concurrently. You can use it in [ModelScope Studios](https://modelscope.cn/studios/DiffSynth-Studio/Z-Image-i2L). For details, see the [documentation](/docs/en/Model_Details/Z-Image.md).
|
|
37
|
+
|
|
38
|
+
- **January 19, 2026**: Added support for [FLUX.2-klein-4B](https://modelscope.cn/models/black-forest-labs/FLUX.2-klein-4B) and [FLUX.2-klein-9B](https://modelscope.cn/models/black-forest-labs/FLUX.2-klein-9B) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/FLUX2.md) and [example code](/examples/flux2/) are now available.
|
|
39
|
+
|
|
40
|
+
- **January 12, 2026**: We trained and open-sourced a text-guided image layer separation model ([Model Link](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control)). Given an input image and a textual description, the model isolates the image layer corresponding to the described content. For more details, please refer to our blog post ([Chinese version](https://modelscope.cn/learn/4938), [English version](https://huggingface.co/blog/kelseye/qwen-image-layered-control)).
|
|
37
41
|
|
|
38
42
|
- **December 24, 2025**: Based on Qwen-Image-Edit-2511, we trained an In-Context Editing LoRA model ([Model Link](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Edit-2511-ICEdit-LoRA)). This model takes three images as input (Image A, Image B, and Image C), and automatically analyzes the transformation from Image A to Image B, then applies the same transformation to Image C to generate Image D. For more details, please refer to our blog post ([Chinese version](https://mp.weixin.qq.com/s/41aEiN3lXKGCJs1-we4Q2g), [English version](https://huggingface.co/blog/kelseye/qwen-image-edit-2511-icedit-lora)).
|
|
39
43
|
|
|
@@ -267,9 +271,14 @@ image.save("image.jpg")
|
|
|
267
271
|
|
|
268
272
|
Example code for Z-Image is available at: [/examples/z_image/](/examples/z_image/)
|
|
269
273
|
|
|
270
|
-
|
|
|
274
|
+
|Model ID|Inference|Low VRAM Inference|Full Training|Validation After Full Training|LoRA Training|Validation After LoRA Training|
|
|
271
275
|
|-|-|-|-|-|-|-|
|
|
276
|
+
|[Tongyi-MAI/Z-Image](https://www.modelscope.cn/models/Tongyi-MAI/Z-Image)|[code](/examples/z_image/model_inference/Z-Image.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image.py)|[code](/examples/z_image/model_training/full/Z-Image.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image.py)|[code](/examples/z_image/model_training/lora/Z-Image.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image.py)|
|
|
277
|
+
|[DiffSynth-Studio/Z-Image-i2L](https://www.modelscope.cn/models/DiffSynth-Studio/Z-Image-i2L)|[code](/examples/z_image/model_inference/Z-Image-i2L.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-i2L.py)|-|-|-|-|
|
|
272
278
|
|[Tongyi-MAI/Z-Image-Turbo](https://www.modelscope.cn/models/Tongyi-MAI/Z-Image-Turbo)|[code](/examples/z_image/model_inference/Z-Image-Turbo.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo.py)|
|
|
279
|
+
|[PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|
|
|
280
|
+
|[PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|
|
|
281
|
+
|[PAI/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|
|
|
273
282
|
|
|
274
283
|
</details>
|
|
275
284
|
|
|
@@ -319,9 +328,13 @@ image.save("image.jpg")
|
|
|
319
328
|
|
|
320
329
|
Example code for FLUX.2 is available at: [/examples/flux2/](/examples/flux2/)
|
|
321
330
|
|
|
322
|
-
| Model ID | Inference | Low-VRAM Inference | LoRA Training | LoRA Training Validation |
|
|
323
|
-
|
|
324
|
-
|[black-forest-labs/FLUX.2-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-dev)|[code](/examples/flux2/model_inference/FLUX.2-dev.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-dev.py)
|
|
331
|
+
| Model ID | Inference | Low-VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
332
|
+
|-|-|-|-|-|-|-|
|
|
333
|
+
|[black-forest-labs/FLUX.2-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-dev)|[code](/examples/flux2/model_inference/FLUX.2-dev.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-dev.py)|-|-|[code](/examples/flux2/model_training/lora/FLUX.2-dev.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-dev.py)|
|
|
334
|
+
|[black-forest-labs/FLUX.2-klein-4B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-4B)|[code](/examples/flux2/model_inference/FLUX.2-klein-4B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-4B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-4B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-4B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-4B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-4B.py)|
|
|
335
|
+
|[black-forest-labs/FLUX.2-klein-9B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-9B)|[code](/examples/flux2/model_inference/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-9B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-9B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-9B.py)|
|
|
336
|
+
|[black-forest-labs/FLUX.2-klein-base-4B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-base-4B)|[code](/examples/flux2/model_inference/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-base-4B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-base-4B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-base-4B.py)|
|
|
337
|
+
|[black-forest-labs/FLUX.2-klein-base-9B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-base-9B)|[code](/examples/flux2/model_inference/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-base-9B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-base-9B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-base-9B.py)|
|
|
325
338
|
|
|
326
339
|
</details>
|
|
327
340
|
|
|
@@ -774,4 +787,3 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-47
|
|
|
774
787
|
https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-4481-b79f-0c3a7361a1ea
|
|
775
788
|
|
|
776
789
|
</details>
|
|
777
|
-
|
|
@@ -510,6 +510,28 @@ flux2_series = [
|
|
|
510
510
|
"model_name": "flux2_vae",
|
|
511
511
|
"model_class": "diffsynth.models.flux2_vae.Flux2VAE",
|
|
512
512
|
},
|
|
513
|
+
{
|
|
514
|
+
# Example: ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="transformer/*.safetensors")
|
|
515
|
+
"model_hash": "3bde7b817fec8143028b6825a63180df",
|
|
516
|
+
"model_name": "flux2_dit",
|
|
517
|
+
"model_class": "diffsynth.models.flux2_dit.Flux2DiT",
|
|
518
|
+
"extra_kwargs": {"guidance_embeds": False, "joint_attention_dim": 7680, "num_attention_heads": 24, "num_layers": 5, "num_single_layers": 20}
|
|
519
|
+
},
|
|
520
|
+
{
|
|
521
|
+
# Example: ModelConfig(model_id="black-forest-labs/FLUX.2-klein-9B", origin_file_pattern="text_encoder/*.safetensors")
|
|
522
|
+
"model_hash": "9195f3ea256fcd0ae6d929c203470754",
|
|
523
|
+
"model_name": "z_image_text_encoder",
|
|
524
|
+
"model_class": "diffsynth.models.z_image_text_encoder.ZImageTextEncoder",
|
|
525
|
+
"extra_kwargs": {"model_size": "8B"},
|
|
526
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_text_encoder.ZImageTextEncoderStateDictConverter",
|
|
527
|
+
},
|
|
528
|
+
{
|
|
529
|
+
# Example: ModelConfig(model_id="black-forest-labs/FLUX.2-klein-9B", origin_file_pattern="transformer/*.safetensors")
|
|
530
|
+
"model_hash": "39c6fc48f07bebecedbbaa971ff466c8",
|
|
531
|
+
"model_name": "flux2_dit",
|
|
532
|
+
"model_class": "diffsynth.models.flux2_dit.Flux2DiT",
|
|
533
|
+
"extra_kwargs": {"guidance_embeds": False, "joint_attention_dim": 12288, "num_attention_heads": 32, "num_layers": 8, "num_single_layers": 24}
|
|
534
|
+
},
|
|
513
535
|
]
|
|
514
536
|
|
|
515
537
|
z_image_series = [
|
|
@@ -10,6 +10,7 @@ class UnifiedDataset(torch.utils.data.Dataset):
|
|
|
10
10
|
data_file_keys=tuple(),
|
|
11
11
|
main_data_operator=lambda x: x,
|
|
12
12
|
special_operator_map=None,
|
|
13
|
+
max_data_items=None,
|
|
13
14
|
):
|
|
14
15
|
self.base_path = base_path
|
|
15
16
|
self.metadata_path = metadata_path
|
|
@@ -18,6 +19,7 @@ class UnifiedDataset(torch.utils.data.Dataset):
|
|
|
18
19
|
self.main_data_operator = main_data_operator
|
|
19
20
|
self.cached_data_operator = LoadTorchPickle()
|
|
20
21
|
self.special_operator_map = {} if special_operator_map is None else special_operator_map
|
|
22
|
+
self.max_data_items = max_data_items
|
|
21
23
|
self.data = []
|
|
22
24
|
self.cached_data = []
|
|
23
25
|
self.load_from_cache = metadata_path is None
|
|
@@ -97,7 +99,9 @@ class UnifiedDataset(torch.utils.data.Dataset):
|
|
|
97
99
|
return data
|
|
98
100
|
|
|
99
101
|
def __len__(self):
|
|
100
|
-
if self.
|
|
102
|
+
if self.max_data_items is not None:
|
|
103
|
+
return self.max_data_items
|
|
104
|
+
elif self.load_from_cache:
|
|
101
105
|
return len(self.cached_data) * self.repeat
|
|
102
106
|
else:
|
|
103
107
|
return len(self.data) * self.repeat
|
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
from .npu_compatible_device import parse_device_type, parse_nccl_backend, get_available_device_type, get_device_name
|
|
2
|
-
from .npu_compatible_device import IS_NPU_AVAILABLE
|
|
2
|
+
from .npu_compatible_device import IS_NPU_AVAILABLE, IS_CUDA_AVAILABLE
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import torch, glob, os
|
|
2
|
-
from typing import Optional, Union
|
|
2
|
+
from typing import Optional, Union, Dict
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from modelscope import snapshot_download
|
|
5
5
|
from huggingface_hub import snapshot_download as hf_snapshot_download
|
|
@@ -23,6 +23,7 @@ class ModelConfig:
|
|
|
23
23
|
computation_device: Optional[Union[str, torch.device]] = None
|
|
24
24
|
computation_dtype: Optional[torch.dtype] = None
|
|
25
25
|
clear_parameters: bool = False
|
|
26
|
+
state_dict: Dict[str, torch.Tensor] = None
|
|
26
27
|
|
|
27
28
|
def check_input(self):
|
|
28
29
|
if self.path is None and self.model_id is None:
|
|
@@ -2,16 +2,25 @@ from safetensors import safe_open
|
|
|
2
2
|
import torch, hashlib
|
|
3
3
|
|
|
4
4
|
|
|
5
|
-
def load_state_dict(file_path, torch_dtype=None, device="cpu"):
|
|
5
|
+
def load_state_dict(file_path, torch_dtype=None, device="cpu", pin_memory=False, verbose=0):
|
|
6
6
|
if isinstance(file_path, list):
|
|
7
7
|
state_dict = {}
|
|
8
8
|
for file_path_ in file_path:
|
|
9
|
-
state_dict.update(load_state_dict(file_path_, torch_dtype, device))
|
|
10
|
-
return state_dict
|
|
11
|
-
if file_path.endswith(".safetensors"):
|
|
12
|
-
return load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype, device=device)
|
|
9
|
+
state_dict.update(load_state_dict(file_path_, torch_dtype, device, pin_memory=pin_memory, verbose=verbose))
|
|
13
10
|
else:
|
|
14
|
-
|
|
11
|
+
if verbose >= 1:
|
|
12
|
+
print(f"Loading file [started]: {file_path}")
|
|
13
|
+
if file_path.endswith(".safetensors"):
|
|
14
|
+
state_dict = load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype, device=device)
|
|
15
|
+
else:
|
|
16
|
+
state_dict = load_state_dict_from_bin(file_path, torch_dtype=torch_dtype, device=device)
|
|
17
|
+
# If load state dict in CPU memory, `pin_memory=True` will make `model.to("cuda")` faster.
|
|
18
|
+
if pin_memory:
|
|
19
|
+
for i in state_dict:
|
|
20
|
+
state_dict[i] = state_dict[i].pin_memory()
|
|
21
|
+
if verbose >= 1:
|
|
22
|
+
print(f"Loading file [done]: {file_path}")
|
|
23
|
+
return state_dict
|
|
15
24
|
|
|
16
25
|
|
|
17
26
|
def load_state_dict_from_safetensors(file_path, torch_dtype=None, device="cpu"):
|
|
@@ -5,7 +5,7 @@ from .file import load_state_dict
|
|
|
5
5
|
import torch
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
def load_model(model_class, path, config=None, torch_dtype=torch.bfloat16, device="cpu", state_dict_converter=None, use_disk_map=False, module_map=None, vram_config=None, vram_limit=None):
|
|
8
|
+
def load_model(model_class, path, config=None, torch_dtype=torch.bfloat16, device="cpu", state_dict_converter=None, use_disk_map=False, module_map=None, vram_config=None, vram_limit=None, state_dict=None):
|
|
9
9
|
config = {} if config is None else config
|
|
10
10
|
# Why do we use `skip_model_initialization`?
|
|
11
11
|
# It skips the random initialization of model parameters,
|
|
@@ -20,7 +20,7 @@ def load_model(model_class, path, config=None, torch_dtype=torch.bfloat16, devic
|
|
|
20
20
|
dtypes = [vram_config["offload_dtype"], vram_config["onload_dtype"], vram_config["preparing_dtype"], vram_config["computation_dtype"]]
|
|
21
21
|
dtype = [d for d in dtypes if d != "disk"][0]
|
|
22
22
|
if vram_config["offload_device"] != "disk":
|
|
23
|
-
state_dict = DiskMap(path, device, torch_dtype=dtype)
|
|
23
|
+
if state_dict is None: state_dict = DiskMap(path, device, torch_dtype=dtype)
|
|
24
24
|
if state_dict_converter is not None:
|
|
25
25
|
state_dict = state_dict_converter(state_dict)
|
|
26
26
|
else:
|
|
@@ -35,7 +35,9 @@ def load_model(model_class, path, config=None, torch_dtype=torch.bfloat16, devic
|
|
|
35
35
|
# Sometimes a model file contains multiple models,
|
|
36
36
|
# and DiskMap can load only the parameters of a single model,
|
|
37
37
|
# avoiding the need to load all parameters in the file.
|
|
38
|
-
if
|
|
38
|
+
if state_dict is not None:
|
|
39
|
+
pass
|
|
40
|
+
elif use_disk_map:
|
|
39
41
|
state_dict = DiskMap(path, device, torch_dtype=torch_dtype)
|
|
40
42
|
else:
|
|
41
43
|
state_dict = load_state_dict(path, torch_dtype, device)
|
|
@@ -4,6 +4,7 @@ import numpy as np
|
|
|
4
4
|
from einops import repeat, reduce
|
|
5
5
|
from typing import Union
|
|
6
6
|
from ..core import AutoTorchModule, AutoWrappedLinear, load_state_dict, ModelConfig, parse_device_type
|
|
7
|
+
from ..core.device.npu_compatible_device import get_device_type
|
|
7
8
|
from ..utils.lora import GeneralLoRALoader
|
|
8
9
|
from ..models.model_loader import ModelPool
|
|
9
10
|
from ..utils.controlnet import ControlNetInput
|
|
@@ -61,7 +62,7 @@ class BasePipeline(torch.nn.Module):
|
|
|
61
62
|
|
|
62
63
|
def __init__(
|
|
63
64
|
self,
|
|
64
|
-
device=
|
|
65
|
+
device=get_device_type(), torch_dtype=torch.float16,
|
|
65
66
|
height_division_factor=64, width_division_factor=64,
|
|
66
67
|
time_division_factor=None, time_division_remainder=None,
|
|
67
68
|
):
|
|
@@ -295,6 +296,7 @@ class BasePipeline(torch.nn.Module):
|
|
|
295
296
|
vram_config=vram_config,
|
|
296
297
|
vram_limit=vram_limit,
|
|
297
298
|
clear_parameters=model_config.clear_parameters,
|
|
299
|
+
state_dict=model_config.state_dict,
|
|
298
300
|
)
|
|
299
301
|
return model_pool
|
|
300
302
|
|
|
@@ -89,13 +89,18 @@ class FlowMatchScheduler():
|
|
|
89
89
|
return float(mu)
|
|
90
90
|
|
|
91
91
|
@staticmethod
|
|
92
|
-
def set_timesteps_flux2(num_inference_steps=100, denoising_strength=1.0, dynamic_shift_len=
|
|
92
|
+
def set_timesteps_flux2(num_inference_steps=100, denoising_strength=1.0, dynamic_shift_len=None):
|
|
93
93
|
sigma_min = 1 / num_inference_steps
|
|
94
94
|
sigma_max = 1.0
|
|
95
95
|
num_train_timesteps = 1000
|
|
96
96
|
sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
|
|
97
97
|
sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps)
|
|
98
|
-
|
|
98
|
+
if dynamic_shift_len is None:
|
|
99
|
+
# If you ask me why I set mu=0.8,
|
|
100
|
+
# I can only say that it yields better training results.
|
|
101
|
+
mu = 0.8
|
|
102
|
+
else:
|
|
103
|
+
mu = FlowMatchScheduler.compute_empirical_mu(dynamic_shift_len, num_inference_steps)
|
|
99
104
|
sigmas = math.exp(mu) / (math.exp(mu) + (1 / sigmas - 1))
|
|
100
105
|
timesteps = sigmas * num_train_timesteps
|
|
101
106
|
return sigmas, timesteps
|
|
@@ -10,7 +10,7 @@ class ModelLogger:
|
|
|
10
10
|
self.num_steps = 0
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
def on_step_end(self, accelerator: Accelerator, model: torch.nn.Module, save_steps=None):
|
|
13
|
+
def on_step_end(self, accelerator: Accelerator, model: torch.nn.Module, save_steps=None, **kwargs):
|
|
14
14
|
self.num_steps += 1
|
|
15
15
|
if save_steps is not None and self.num_steps % save_steps == 0:
|
|
16
16
|
self.save_model(accelerator, model, f"step-{self.num_steps}.safetensors")
|
|
@@ -40,7 +40,7 @@ def launch_training_task(
|
|
|
40
40
|
loss = model(data)
|
|
41
41
|
accelerator.backward(loss)
|
|
42
42
|
optimizer.step()
|
|
43
|
-
model_logger.on_step_end(accelerator, model, save_steps)
|
|
43
|
+
model_logger.on_step_end(accelerator, model, save_steps, loss=loss)
|
|
44
44
|
scheduler.step()
|
|
45
45
|
if save_steps is None:
|
|
46
46
|
model_logger.on_epoch_end(accelerator, model, epoch_id)
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import torch, json
|
|
1
|
+
import torch, json, os
|
|
2
2
|
from ..core import ModelConfig, load_state_dict
|
|
3
3
|
from ..utils.controlnet import ControlNetInput
|
|
4
4
|
from peft import LoraConfig, inject_adapter_in_model
|
|
@@ -127,16 +127,67 @@ class DiffusionTrainingModule(torch.nn.Module):
|
|
|
127
127
|
if model_id_with_origin_paths is not None:
|
|
128
128
|
model_id_with_origin_paths = model_id_with_origin_paths.split(",")
|
|
129
129
|
for model_id_with_origin_path in model_id_with_origin_paths:
|
|
130
|
-
model_id, origin_file_pattern = model_id_with_origin_path.split(":")
|
|
131
130
|
vram_config = self.parse_vram_config(
|
|
132
131
|
fp8=model_id_with_origin_path in fp8_models,
|
|
133
132
|
offload=model_id_with_origin_path in offload_models,
|
|
134
133
|
device=device
|
|
135
134
|
)
|
|
136
|
-
|
|
135
|
+
config = self.parse_path_or_model_id(model_id_with_origin_path)
|
|
136
|
+
model_configs.append(ModelConfig(model_id=config.model_id, origin_file_pattern=config.origin_file_pattern, **vram_config))
|
|
137
137
|
return model_configs
|
|
138
138
|
|
|
139
|
+
|
|
140
|
+
def parse_path_or_model_id(self, model_id_with_origin_path, default_value=None):
|
|
141
|
+
if model_id_with_origin_path is None:
|
|
142
|
+
return default_value
|
|
143
|
+
elif os.path.exists(model_id_with_origin_path):
|
|
144
|
+
return ModelConfig(path=model_id_with_origin_path)
|
|
145
|
+
else:
|
|
146
|
+
if ":" not in model_id_with_origin_path:
|
|
147
|
+
raise ValueError(f"Failed to parse model config: {model_id_with_origin_path}. This is neither a valid path nor in the format of `model_id/origin_file_pattern`.")
|
|
148
|
+
split_id = model_id_with_origin_path.rfind(":")
|
|
149
|
+
model_id = model_id_with_origin_path[:split_id]
|
|
150
|
+
origin_file_pattern = model_id_with_origin_path[split_id + 1:]
|
|
151
|
+
return ModelConfig(model_id=model_id, origin_file_pattern=origin_file_pattern)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def auto_detect_lora_target_modules(
|
|
155
|
+
self,
|
|
156
|
+
model: torch.nn.Module,
|
|
157
|
+
search_for_linear=False,
|
|
158
|
+
linear_detector=lambda x: min(x.weight.shape) >= 512,
|
|
159
|
+
block_list_detector=lambda x: isinstance(x, torch.nn.ModuleList) and len(x) > 1,
|
|
160
|
+
name_prefix="",
|
|
161
|
+
):
|
|
162
|
+
lora_target_modules = []
|
|
163
|
+
if search_for_linear:
|
|
164
|
+
for name, module in model.named_modules():
|
|
165
|
+
module_name = name_prefix + ["", "."][name_prefix != ""] + name
|
|
166
|
+
if isinstance(module, torch.nn.Linear) and linear_detector(module):
|
|
167
|
+
lora_target_modules.append(module_name)
|
|
168
|
+
else:
|
|
169
|
+
for name, module in model.named_children():
|
|
170
|
+
module_name = name_prefix + ["", "."][name_prefix != ""] + name
|
|
171
|
+
lora_target_modules += self.auto_detect_lora_target_modules(
|
|
172
|
+
module,
|
|
173
|
+
search_for_linear=block_list_detector(module),
|
|
174
|
+
linear_detector=linear_detector,
|
|
175
|
+
block_list_detector=block_list_detector,
|
|
176
|
+
name_prefix=module_name,
|
|
177
|
+
)
|
|
178
|
+
return lora_target_modules
|
|
139
179
|
|
|
180
|
+
|
|
181
|
+
def parse_lora_target_modules(self, model, lora_target_modules):
|
|
182
|
+
if lora_target_modules == "":
|
|
183
|
+
print("No LoRA target modules specified. The framework will automatically search for them.")
|
|
184
|
+
lora_target_modules = self.auto_detect_lora_target_modules(model)
|
|
185
|
+
print(f"LoRA will be patched at {lora_target_modules}.")
|
|
186
|
+
else:
|
|
187
|
+
lora_target_modules = lora_target_modules.split(",")
|
|
188
|
+
return lora_target_modules
|
|
189
|
+
|
|
190
|
+
|
|
140
191
|
def switch_pipe_to_training_mode(
|
|
141
192
|
self,
|
|
142
193
|
pipe,
|
|
@@ -166,7 +217,7 @@ class DiffusionTrainingModule(torch.nn.Module):
|
|
|
166
217
|
return
|
|
167
218
|
model = self.add_lora_to_model(
|
|
168
219
|
getattr(pipe, lora_base_model),
|
|
169
|
-
target_modules=
|
|
220
|
+
target_modules=self.parse_lora_target_modules(getattr(pipe, lora_base_model), lora_target_modules),
|
|
170
221
|
lora_rank=lora_rank,
|
|
171
222
|
upcast_dtype=pipe.torch_dtype,
|
|
172
223
|
)
|
|
@@ -2,6 +2,8 @@ from transformers import DINOv3ViTModel, DINOv3ViTImageProcessorFast
|
|
|
2
2
|
from transformers.models.dinov3_vit.modeling_dinov3_vit import DINOv3ViTConfig
|
|
3
3
|
import torch
|
|
4
4
|
|
|
5
|
+
from ..core.device.npu_compatible_device import get_device_type
|
|
6
|
+
|
|
5
7
|
|
|
6
8
|
class DINOv3ImageEncoder(DINOv3ViTModel):
|
|
7
9
|
def __init__(self):
|
|
@@ -70,7 +72,7 @@ class DINOv3ImageEncoder(DINOv3ViTModel):
|
|
|
70
72
|
}
|
|
71
73
|
)
|
|
72
74
|
|
|
73
|
-
def forward(self, image, torch_dtype=torch.bfloat16, device=
|
|
75
|
+
def forward(self, image, torch_dtype=torch.bfloat16, device=get_device_type()):
|
|
74
76
|
inputs = self.processor(images=image, return_tensors="pt")
|
|
75
77
|
pixel_values = inputs["pixel_values"].to(dtype=torch_dtype, device=device)
|
|
76
78
|
bool_masked_pos = None
|
|
@@ -823,7 +823,13 @@ class Flux2PosEmbed(nn.Module):
|
|
|
823
823
|
|
|
824
824
|
|
|
825
825
|
class Flux2TimestepGuidanceEmbeddings(nn.Module):
|
|
826
|
-
def __init__(
|
|
826
|
+
def __init__(
|
|
827
|
+
self,
|
|
828
|
+
in_channels: int = 256,
|
|
829
|
+
embedding_dim: int = 6144,
|
|
830
|
+
bias: bool = False,
|
|
831
|
+
guidance_embeds: bool = True,
|
|
832
|
+
):
|
|
827
833
|
super().__init__()
|
|
828
834
|
|
|
829
835
|
self.time_proj = Timesteps(num_channels=in_channels, flip_sin_to_cos=True, downscale_freq_shift=0)
|
|
@@ -831,20 +837,24 @@ class Flux2TimestepGuidanceEmbeddings(nn.Module):
|
|
|
831
837
|
in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
|
|
832
838
|
)
|
|
833
839
|
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
840
|
+
if guidance_embeds:
|
|
841
|
+
self.guidance_embedder = TimestepEmbedding(
|
|
842
|
+
in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
|
|
843
|
+
)
|
|
844
|
+
else:
|
|
845
|
+
self.guidance_embedder = None
|
|
837
846
|
|
|
838
847
|
def forward(self, timestep: torch.Tensor, guidance: torch.Tensor) -> torch.Tensor:
|
|
839
848
|
timesteps_proj = self.time_proj(timestep)
|
|
840
849
|
timesteps_emb = self.timestep_embedder(timesteps_proj.to(timestep.dtype)) # (N, D)
|
|
841
850
|
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
851
|
+
if guidance is not None and self.guidance_embedder is not None:
|
|
852
|
+
guidance_proj = self.time_proj(guidance)
|
|
853
|
+
guidance_emb = self.guidance_embedder(guidance_proj.to(guidance.dtype)) # (N, D)
|
|
854
|
+
time_guidance_emb = timesteps_emb + guidance_emb
|
|
855
|
+
return time_guidance_emb
|
|
856
|
+
else:
|
|
857
|
+
return timesteps_emb
|
|
848
858
|
|
|
849
859
|
|
|
850
860
|
class Flux2Modulation(nn.Module):
|
|
@@ -882,6 +892,7 @@ class Flux2DiT(torch.nn.Module):
|
|
|
882
892
|
axes_dims_rope: Tuple[int, ...] = (32, 32, 32, 32),
|
|
883
893
|
rope_theta: int = 2000,
|
|
884
894
|
eps: float = 1e-6,
|
|
895
|
+
guidance_embeds: bool = True,
|
|
885
896
|
):
|
|
886
897
|
super().__init__()
|
|
887
898
|
self.out_channels = out_channels or in_channels
|
|
@@ -892,7 +903,10 @@ class Flux2DiT(torch.nn.Module):
|
|
|
892
903
|
|
|
893
904
|
# 2. Combined timestep + guidance embedding
|
|
894
905
|
self.time_guidance_embed = Flux2TimestepGuidanceEmbeddings(
|
|
895
|
-
in_channels=timestep_guidance_channels,
|
|
906
|
+
in_channels=timestep_guidance_channels,
|
|
907
|
+
embedding_dim=self.inner_dim,
|
|
908
|
+
bias=False,
|
|
909
|
+
guidance_embeds=guidance_embeds,
|
|
896
910
|
)
|
|
897
911
|
|
|
898
912
|
# 3. Modulation (double stream and single stream blocks share modulation parameters, resp.)
|
|
@@ -953,34 +967,9 @@ class Flux2DiT(torch.nn.Module):
|
|
|
953
967
|
txt_ids: torch.Tensor = None,
|
|
954
968
|
guidance: torch.Tensor = None,
|
|
955
969
|
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
|
956
|
-
return_dict: bool = True,
|
|
957
970
|
use_gradient_checkpointing=False,
|
|
958
971
|
use_gradient_checkpointing_offload=False,
|
|
959
|
-
)
|
|
960
|
-
"""
|
|
961
|
-
The [`FluxTransformer2DModel`] forward method.
|
|
962
|
-
|
|
963
|
-
Args:
|
|
964
|
-
hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
|
|
965
|
-
Input `hidden_states`.
|
|
966
|
-
encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
|
|
967
|
-
Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
|
|
968
|
-
timestep ( `torch.LongTensor`):
|
|
969
|
-
Used to indicate denoising step.
|
|
970
|
-
block_controlnet_hidden_states: (`list` of `torch.Tensor`):
|
|
971
|
-
A list of tensors that if specified are added to the residuals of transformer blocks.
|
|
972
|
-
joint_attention_kwargs (`dict`, *optional*):
|
|
973
|
-
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
|
|
974
|
-
`self.processor` in
|
|
975
|
-
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
|
976
|
-
return_dict (`bool`, *optional*, defaults to `True`):
|
|
977
|
-
Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
|
|
978
|
-
tuple.
|
|
979
|
-
|
|
980
|
-
Returns:
|
|
981
|
-
If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
|
|
982
|
-
`tuple` where the first element is the sample tensor.
|
|
983
|
-
"""
|
|
972
|
+
):
|
|
984
973
|
# 0. Handle input arguments
|
|
985
974
|
if joint_attention_kwargs is not None:
|
|
986
975
|
joint_attention_kwargs = joint_attention_kwargs.copy()
|
|
@@ -992,7 +981,9 @@ class Flux2DiT(torch.nn.Module):
|
|
|
992
981
|
|
|
993
982
|
# 1. Calculate timestep embedding and modulation parameters
|
|
994
983
|
timestep = timestep.to(hidden_states.dtype) * 1000
|
|
995
|
-
|
|
984
|
+
|
|
985
|
+
if guidance is not None:
|
|
986
|
+
guidance = guidance.to(hidden_states.dtype) * 1000
|
|
996
987
|
|
|
997
988
|
temb = self.time_guidance_embed(timestep, guidance)
|
|
998
989
|
|
|
@@ -9,6 +9,7 @@ import numpy as np
|
|
|
9
9
|
import torch.nn.functional as F
|
|
10
10
|
from einops import rearrange, repeat
|
|
11
11
|
from .wan_video_dit import flash_attention
|
|
12
|
+
from ..core.device.npu_compatible_device import get_device_type
|
|
12
13
|
from ..core.gradient import gradient_checkpoint_forward
|
|
13
14
|
|
|
14
15
|
|
|
@@ -373,7 +374,7 @@ class FinalLayer_FP32(nn.Module):
|
|
|
373
374
|
B, N, C = x.shape
|
|
374
375
|
T, _, _ = latent_shape
|
|
375
376
|
|
|
376
|
-
with amp.autocast(
|
|
377
|
+
with amp.autocast(get_device_type(), dtype=torch.float32):
|
|
377
378
|
shift, scale = self.adaLN_modulation(t).unsqueeze(2).chunk(2, dim=-1) # [B, T, 1, C]
|
|
378
379
|
x = modulate_fp32(self.norm_final, x.view(B, T, -1, C), shift, scale).view(B, N, C)
|
|
379
380
|
x = self.linear(x)
|
|
@@ -583,7 +584,7 @@ class LongCatSingleStreamBlock(nn.Module):
|
|
|
583
584
|
T, _, _ = latent_shape # S != T*H*W in case of CP split on H*W.
|
|
584
585
|
|
|
585
586
|
# compute modulation params in fp32
|
|
586
|
-
with amp.autocast(device_type=
|
|
587
|
+
with amp.autocast(device_type=get_device_type(), dtype=torch.float32):
|
|
587
588
|
shift_msa, scale_msa, gate_msa, \
|
|
588
589
|
shift_mlp, scale_mlp, gate_mlp = \
|
|
589
590
|
self.adaLN_modulation(t).unsqueeze(2).chunk(6, dim=-1) # [B, T, 1, C]
|
|
@@ -602,7 +603,7 @@ class LongCatSingleStreamBlock(nn.Module):
|
|
|
602
603
|
else:
|
|
603
604
|
x_s = attn_outputs
|
|
604
605
|
|
|
605
|
-
with amp.autocast(device_type=
|
|
606
|
+
with amp.autocast(device_type=get_device_type(), dtype=torch.float32):
|
|
606
607
|
x = x + (gate_msa * x_s.view(B, -1, N//T, C)).view(B, -1, C) # [B, N, C]
|
|
607
608
|
x = x.to(x_dtype)
|
|
608
609
|
|
|
@@ -615,7 +616,7 @@ class LongCatSingleStreamBlock(nn.Module):
|
|
|
615
616
|
# ffn with modulation
|
|
616
617
|
x_m = modulate_fp32(self.mod_norm_ffn, x.view(B, -1, N//T, C), shift_mlp, scale_mlp).view(B, -1, C)
|
|
617
618
|
x_s = self.ffn(x_m)
|
|
618
|
-
with amp.autocast(device_type=
|
|
619
|
+
with amp.autocast(device_type=get_device_type(), dtype=torch.float32):
|
|
619
620
|
x = x + (gate_mlp * x_s.view(B, -1, N//T, C)).view(B, -1, C) # [B, N, C]
|
|
620
621
|
x = x.to(x_dtype)
|
|
621
622
|
|
|
@@ -797,7 +798,7 @@ class LongCatVideoTransformer3DModel(torch.nn.Module):
|
|
|
797
798
|
|
|
798
799
|
hidden_states = self.x_embedder(hidden_states) # [B, N, C]
|
|
799
800
|
|
|
800
|
-
with amp.autocast(device_type=
|
|
801
|
+
with amp.autocast(device_type=get_device_type(), dtype=torch.float32):
|
|
801
802
|
t = self.t_embedder(timestep.float().flatten(), dtype=torch.float32).reshape(B, N_t, -1) # [B, T, C_t]
|
|
802
803
|
|
|
803
804
|
encoder_hidden_states = self.y_embedder(encoder_hidden_states) # [B, 1, N_token, C]
|
|
@@ -29,7 +29,7 @@ class ModelPool:
|
|
|
29
29
|
module_map = None
|
|
30
30
|
return module_map
|
|
31
31
|
|
|
32
|
-
def load_model_file(self, config, path, vram_config, vram_limit=None):
|
|
32
|
+
def load_model_file(self, config, path, vram_config, vram_limit=None, state_dict=None):
|
|
33
33
|
model_class = self.import_model_class(config["model_class"])
|
|
34
34
|
model_config = config.get("extra_kwargs", {})
|
|
35
35
|
if "state_dict_converter" in config:
|
|
@@ -43,6 +43,7 @@ class ModelPool:
|
|
|
43
43
|
state_dict_converter,
|
|
44
44
|
use_disk_map=True,
|
|
45
45
|
vram_config=vram_config, module_map=module_map, vram_limit=vram_limit,
|
|
46
|
+
state_dict=state_dict,
|
|
46
47
|
)
|
|
47
48
|
return model
|
|
48
49
|
|
|
@@ -59,7 +60,7 @@ class ModelPool:
|
|
|
59
60
|
}
|
|
60
61
|
return vram_config
|
|
61
62
|
|
|
62
|
-
def auto_load_model(self, path, vram_config=None, vram_limit=None, clear_parameters=False):
|
|
63
|
+
def auto_load_model(self, path, vram_config=None, vram_limit=None, clear_parameters=False, state_dict=None):
|
|
63
64
|
print(f"Loading models from: {json.dumps(path, indent=4)}")
|
|
64
65
|
if vram_config is None:
|
|
65
66
|
vram_config = self.default_vram_config()
|
|
@@ -67,7 +68,7 @@ class ModelPool:
|
|
|
67
68
|
loaded = False
|
|
68
69
|
for config in MODEL_CONFIGS:
|
|
69
70
|
if config["model_hash"] == model_hash:
|
|
70
|
-
model = self.load_model_file(config, path, vram_config, vram_limit=vram_limit)
|
|
71
|
+
model = self.load_model_file(config, path, vram_config, vram_limit=vram_limit, state_dict=state_dict)
|
|
71
72
|
if clear_parameters: self.clear_parameters(model)
|
|
72
73
|
self.model.append(model)
|
|
73
74
|
model_name = config["model_name"]
|
|
@@ -583,7 +583,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
|
|
|
583
583
|
is_compileable = model_kwargs["past_key_values"].is_compileable and self._supports_static_cache
|
|
584
584
|
is_compileable = is_compileable and not self.generation_config.disable_compile
|
|
585
585
|
if is_compileable and (
|
|
586
|
-
self.device.type
|
|
586
|
+
self.device.type in ["cuda", "npu"] or generation_config.compile_config._compile_all_devices
|
|
587
587
|
):
|
|
588
588
|
os.environ["TOKENIZERS_PARALLELISM"] = "0"
|
|
589
589
|
model_forward = self.get_compiled_call(generation_config.compile_config)
|
|
@@ -2,6 +2,8 @@ from transformers.models.siglip.modeling_siglip import SiglipVisionTransformer,
|
|
|
2
2
|
from transformers import SiglipImageProcessor, Siglip2VisionModel, Siglip2VisionConfig, Siglip2ImageProcessorFast
|
|
3
3
|
import torch
|
|
4
4
|
|
|
5
|
+
from diffsynth.core.device.npu_compatible_device import get_device_type
|
|
6
|
+
|
|
5
7
|
|
|
6
8
|
class Siglip2ImageEncoder(SiglipVisionTransformer):
|
|
7
9
|
def __init__(self):
|
|
@@ -47,7 +49,7 @@ class Siglip2ImageEncoder(SiglipVisionTransformer):
|
|
|
47
49
|
}
|
|
48
50
|
)
|
|
49
51
|
|
|
50
|
-
def forward(self, image, torch_dtype=torch.bfloat16, device=
|
|
52
|
+
def forward(self, image, torch_dtype=torch.bfloat16, device=get_device_type()):
|
|
51
53
|
pixel_values = self.processor(images=[image], return_tensors="pt")["pixel_values"]
|
|
52
54
|
pixel_values = pixel_values.to(device=device, dtype=torch_dtype)
|
|
53
55
|
output_attentions = False
|