diffsynth 2.0.5__tar.gz → 2.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffsynth-2.0.5 → diffsynth-2.0.6}/PKG-INFO +4 -1
- {diffsynth-2.0.5 → diffsynth-2.0.6}/README.md +21 -2
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/configs/__init__.py +1 -1
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/configs/model_configs.py +136 -1
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/configs/vram_management_module_maps.py +32 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/data/operators.py +52 -15
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/data/unified_dataset.py +2 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/vram/layers.py +1 -1
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/base_pipeline.py +6 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/training_module.py +42 -3
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/ltx2_audio_vae.py +511 -47
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/ltx2_common.py +17 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/ltx2_dit.py +347 -117
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/ltx2_text_encoder.py +205 -22
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/ltx2_video_vae.py +60 -55
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/model_loader.py +3 -2
- diffsynth-2.0.6/diffsynth/models/mova_audio_dit.py +57 -0
- diffsynth-2.0.6/diffsynth/models/mova_audio_vae.py +796 -0
- diffsynth-2.0.6/diffsynth/models/mova_dual_tower_bridge.py +595 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_dit.py +13 -1
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/flux2_image.py +8 -4
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/ltx2_audio_video.py +334 -264
- diffsynth-2.0.6/diffsynth/pipelines/mova_audio_video.py +460 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/qwen_image.py +4 -2
- diffsynth-2.0.6/diffsynth/utils/data/audio.py +108 -0
- diffsynth-2.0.5/diffsynth/utils/data/media_io_ltx2.py → diffsynth-2.0.6/diffsynth/utils/data/audio_video.py +44 -59
- diffsynth-2.0.6/diffsynth/utils/data/media_io_ltx2.py +43 -0
- diffsynth-2.0.6/diffsynth/utils/ses/__init__.py +1 -0
- diffsynth-2.0.6/diffsynth/utils/ses/ses.py +117 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py +1 -1
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/ltx2_video_vae.py +5 -3
- diffsynth-2.0.6/diffsynth/utils/xfuser/__init__.py +1 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/xfuser/xdit_context_parallel.py +28 -1
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth.egg-info/PKG-INFO +4 -1
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth.egg-info/SOURCES.txt +8 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth.egg-info/requires.txt +4 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/pyproject.toml +5 -1
- diffsynth-2.0.5/diffsynth/utils/xfuser/__init__.py +0 -1
- {diffsynth-2.0.5 → diffsynth-2.0.6}/LICENSE +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/attention/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/attention/attention.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/data/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/device/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/device/npu_compatible_device.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/gradient/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/gradient/gradient_checkpoint.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/loader/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/loader/config.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/loader/file.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/loader/model.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/npu_patch/npu_fused_operator.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/vram/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/vram/disk_map.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/core/vram/initialization.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/flow_match.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/logger.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/loss.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/parsers.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/diffusion/runner.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/anima_dit.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/dinov3_image_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux2_dit.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux2_vae.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_controlnet.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_dit.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_ipadapter.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_lora_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_lora_patcher.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_vae.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/flux_value_control.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/general_modules.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/longcat_video_dit.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/ltx2_upsampler.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/nexus_gen.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/nexus_gen_ar_model.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/qwen_image_controlnet.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/qwen_image_dit.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/qwen_image_image2lora.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/qwen_image_vae.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/sd_text_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/siglip2_image_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/step1x_connector.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/step1x_text_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_camera_controller.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_dit_s2v.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_mot.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_motion_controller.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_text_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_vace.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wan_video_vae.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/wav2vec.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/z_image_controlnet.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/z_image_dit.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/z_image_image2lora.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/models/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/anima_image.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/flux_image.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/wan_video.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/pipelines/z_image.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/controlnet/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/controlnet/annotator.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/controlnet/controlnet_input.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/data/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/lora/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/lora/flux.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/lora/general.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/lora/merge.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/lora/reset_rank.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/__init__.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/anima_dit.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux2_text_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_controlnet.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_dit.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_infiniteyou.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_ipadapter.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/flux_vae.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/ltx2_dit.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/nexus_gen.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/nexus_gen_projector.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/step1x_connector.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wan_video_dit.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wan_video_image_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wan_video_mot.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wan_video_vace.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wan_video_vae.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/utils/state_dict_converters/z_image_text_encoder.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth/version.py +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth.egg-info/dependency_links.txt +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/diffsynth.egg-info/top_level.txt +0 -0
- {diffsynth-2.0.5 → diffsynth-2.0.6}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: diffsynth
|
|
3
|
-
Version: 2.0.5
|
|
3
|
+
Version: 2.0.6
|
|
4
4
|
Summary: Enjoy the magic of Diffusion models!
|
|
5
5
|
Author: ModelScope Team
|
|
6
6
|
License: Apache-2.0
|
|
@@ -32,4 +32,7 @@ Provides-Extra: npu
|
|
|
32
32
|
Requires-Dist: torch==2.7.1+cpu; extra == "npu"
|
|
33
33
|
Requires-Dist: torch-npu==2.7.1; extra == "npu"
|
|
34
34
|
Requires-Dist: torchvision==0.22.1+cpu; extra == "npu"
|
|
35
|
+
Provides-Extra: audio
|
|
36
|
+
Requires-Dist: torchaudio; extra == "audio"
|
|
37
|
+
Requires-Dist: torchcodec; extra == "audio"
|
|
35
38
|
Dynamic: license-file
|
|
@@ -32,6 +32,11 @@ We believe that a well-developed open-source code framework can lower the thresh
|
|
|
32
32
|
> DiffSynth-Studio has undergone major version updates, and some old features are no longer maintained. If you need to use old features, please switch to the [last historical version](https://github.com/modelscope/DiffSynth-Studio/tree/afd101f3452c9ecae0c87b79adfa2e22d65ffdc3) before the major version update.
|
|
33
33
|
|
|
34
34
|
> Currently, the development personnel of this project are limited, with most of the work handled by [Artiprocher](https://github.com/Artiprocher). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand.
|
|
35
|
+
- **January 19, 2026**: Added support for [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) and [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/Wan.md) and [example code](/examples/mova/) are now available.
|
|
36
|
+
|
|
37
|
+
- **March 12, 2026**: We have added support for the [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) audio-video generation model. The features include text-to-audio/video, image-to-audio/video, IC-LoRA control, audio-to-video, and audio-video inpainting. We have supported the complete inference and training functionalities. For details, please refer to the [documentation](/docs/en/Model_Details/LTX-2.md) and [code](/examples/ltx2/).
|
|
38
|
+
|
|
39
|
+
- **March 3, 2026**: We released the [DiffSynth-Studio/Qwen-Image-Layered-Control-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control-V2) model, which is an updated version of Qwen-Image-Layered-Control. In addition to the originally supported text-guided functionality, it adds brush-controlled layer separation capabilities.
|
|
35
40
|
|
|
36
41
|
- **March 2, 2026** Added support for [Anima](https://modelscope.cn/models/circlestone-labs/Anima). For details, please refer to the [documentation](docs/en/Model_Details/Anima.md). This is an interesting anime-style image generation model. We look forward to its future updates.
|
|
37
42
|
|
|
@@ -396,7 +401,7 @@ Example code for Anima is located at: [/examples/anima/](/examples/anima/)
|
|
|
396
401
|
|
|
397
402
|
| Model ID | Inference | Low VRAM Inference | Full Training | Validation after Full Training | LoRA Training | Validation after LoRA Training |
|
|
398
403
|
|-|-|-|-|-|-|-|
|
|
399
|
-
|[circlestone-labs/Anima](https://www.modelscope.cn/models/circlestone-labs/Anima)|[code](
|
|
404
|
+
|[circlestone-labs/Anima](https://www.modelscope.cn/models/circlestone-labs/Anima)|[code](/examples/anima/model_inference/anima-preview.py)|[code](/examples/anima/model_inference_low_vram/anima-preview.py)|[code](/examples/anima/model_training/full/anima-preview.sh)|[code](/examples/anima/model_training/validate_full/anima-preview.py)|[code](/examples/anima/model_training/lora/anima-preview.sh)|[code](/examples/anima/model_training/validate_lora/anima-preview.py)|
|
|
400
405
|
|
|
401
406
|
</details>
|
|
402
407
|
|
|
@@ -480,9 +485,11 @@ Example code for Qwen-Image is available at: [/examples/qwen_image/](/examples/q
|
|
|
480
485
|
|[Qwen/Qwen-Image-Edit-2509](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2509)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2509.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2509.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Edit-2509.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2509.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2509.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2509.py)|
|
|
481
486
|
|[Qwen/Qwen-Image-Edit-2511](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2511)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2511.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2511.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Edit-2511.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2511.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2511.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2511.py)|
|
|
482
487
|
|[FireRedTeam/FireRed-Image-Edit-1.0](https://www.modelscope.cn/models/FireRedTeam/FireRed-Image-Edit-1.0)|[code](/examples/qwen_image/model_inference/FireRed-Image-Edit-1.0.py)|[code](/examples/qwen_image/model_inference_low_vram/FireRed-Image-Edit-1.0.py)|[code](/examples/qwen_image/model_training/full/FireRed-Image-Edit-1.0.sh)|[code](/examples/qwen_image/model_training/validate_full/FireRed-Image-Edit-1.0.py)|[code](/examples/qwen_image/model_training/lora/FireRed-Image-Edit-1.0.sh)|[code](/examples/qwen_image/model_training/validate_lora/FireRed-Image-Edit-1.0.py)|
|
|
488
|
+
|[FireRedTeam/FireRed-Image-Edit-1.1](https://www.modelscope.cn/models/FireRedTeam/FireRed-Image-Edit-1.1)|[code](/examples/qwen_image/model_inference/FireRed-Image-Edit-1.1.py)|[code](/examples/qwen_image/model_inference_low_vram/FireRed-Image-Edit-1.1.py)|[code](/examples/qwen_image/model_training/full/FireRed-Image-Edit-1.1.sh)|[code](/examples/qwen_image/model_training/validate_full/FireRed-Image-Edit-1.1.py)|[code](/examples/qwen_image/model_training/lora/FireRed-Image-Edit-1.1.sh)|[code](/examples/qwen_image/model_training/validate_lora/FireRed-Image-Edit-1.1.py)|
|
|
483
489
|
|[lightx2v/Qwen-Image-Edit-2511-Lightning](https://modelscope.cn/models/lightx2v/Qwen-Image-Edit-2511-Lightning)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2511-Lightning.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2511-Lightning.py)|-|-|-|-|
|
|
484
490
|
|[Qwen/Qwen-Image-Layered](https://www.modelscope.cn/models/Qwen/Qwen-Image-Layered)|[code](/examples/qwen_image/model_inference/Qwen-Image-Layered.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Layered.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Layered.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Layered.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered.py)|
|
|
485
491
|
|[DiffSynth-Studio/Qwen-Image-Layered-Control](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control)|[code](/examples/qwen_image/model_inference/Qwen-Image-Layered-Control.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered-Control.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Layered-Control.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Layered-Control.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Layered-Control.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered-Control.py)|
|
|
492
|
+
|[DiffSynth-Studio/Qwen-Image-Layered-Control-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control-V2)|[code](/examples/qwen_image/model_inference/Qwen-Image-Layered-Control-V2.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered-Control-V2.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Layered-Control-V2.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered-Control-V2.py)|
|
|
486
493
|
|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](/examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
|
|
487
494
|
|[DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2)|[code](/examples/qwen_image/model_inference/Qwen-Image-EliGen-V2.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-V2.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
|
|
488
495
|
|[DiffSynth-Studio/Qwen-Image-EliGen-Poster](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-Poster)|[code](/examples/qwen_image/model_inference/Qwen-Image-EliGen-Poster.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-Poster.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen-Poster.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen-Poster.py)|
|
|
@@ -701,6 +708,16 @@ Example code for LTX-2 is available at: [/examples/ltx2/](/examples/ltx2/)
|
|
|
701
708
|
|
|
702
709
|
| Model ID | Extra Args | Inference | Low-VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|
|
703
710
|
|-|-|-|-|-|-|-|-|
|
|
711
|
+
|[Lightricks/LTX-2.3: OneStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2.3-I2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2.3-I2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2.3-I2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-I2AV.py)|
|
|
712
|
+
|[Lightricks/LTX-2.3: TwoStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-TwoStage.py)|-|-|-|-|
|
|
713
|
+
|[Lightricks/LTX-2.3: DistilledPipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-DistilledPipeline.py)|-|-|-|-|
|
|
714
|
+
|[Lightricks/LTX-2.3: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2.3-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2.3-T2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV.py)|
|
|
715
|
+
|[Lightricks/LTX-2.3: TwoStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage.py)|-|-|-|-|
|
|
716
|
+
|[Lightricks/LTX-2.3: DistilledPipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-DistilledPipeline.py)|-|-|-|-|
|
|
717
|
+
|[Lightricks/LTX-2.3: A2V](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-A2V-TwoStage.py)|-|-|-|-|
|
|
718
|
+
|[Lightricks/LTX-2.3: Retake](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_video`,`retake_video_regions`,`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage-Retake.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage-Retake.py)|-|-|-|-|
|
|
719
|
+
|[Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)|
|
|
720
|
+
|[Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)|
|
|
704
721
|
|[Lightricks/LTX-2: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2-T2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2-T2AV.py)|
|
|
705
722
|
|[Lightricks/LTX-2-19b-IC-LoRA-Union-Control](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-IC-LoRA-Union-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2-T2AV-IC-LoRA-Union-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-IC-LoRA-Union-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2-T2AV-IC-LoRA.py)|
|
|
706
723
|
|[Lightricks/LTX-2-19b-IC-LoRA-Detailer](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-IC-LoRA-Detailer)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2-T2AV-IC-LoRA-Detailer.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-IC-LoRA-Detailer.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2-T2AV-IC-LoRA.py)|
|
|
@@ -851,6 +868,8 @@ Example code for Wan is available at: [/examples/wanvideo/](/examples/wanvideo/)
|
|
|
851
868
|
|[PAI/Wan2.2-Fun-A14B-InP](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-InP)|`input_image`, `end_image`|[code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-InP.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-InP.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py)|
|
|
852
869
|
|[PAI/Wan2.2-Fun-A14B-Control](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control)|`control_video`, `reference_image`|[code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py)|
|
|
853
870
|
|[PAI/Wan2.2-Fun-A14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control-Camera)|`control_camera_video`, `input_image`|[code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control-Camera.py)|[code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py)|[code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py)|
|
|
871
|
+
| [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) | `input_image` | [code](/examples/mova/model_inference/MOVA-360p-I2AV.py) | [code](/examples/mova/model_training/full/MOVA-360P-I2AV.sh) | [code](/examples/mova/model_training/validate_full/MOVA-360p-I2AV.py) | [code](/examples/mova/model_training/lora/MOVA-360P-I2AV.sh) | [code](/examples/mova/model_training/validate_lora/MOVA-360p-I2AV.py) |
|
|
872
|
+
| [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) | `input_image` | [code](/examples/mova/model_inference/MOVA-720p-I2AV.py) | [code](/examples/mova/model_training/full/MOVA-720P-I2AV.sh) | [code](/examples/mova/model_training/validate_full/MOVA-720p-I2AV.py) | [code](/examples/mova/model_training/lora/MOVA-720P-I2AV.sh) | [code](/examples/mova/model_training/validate_lora/MOVA-720p-I2AV.py) |
|
|
854
873
|
|
|
855
874
|
</details>
|
|
856
875
|
|
|
@@ -864,7 +883,7 @@ DiffSynth-Studio is not just an engineered model framework, but also an incubato
|
|
|
864
883
|
|
|
865
884
|
- Paper: [Spectral Evolution Search: Efficient Inference-Time Scaling for Reward-Aligned Image Generation
|
|
866
885
|
](https://arxiv.org/abs/2602.03208)
|
|
867
|
-
- Sample Code:
|
|
886
|
+
- Sample Code: [/docs/en/Research_Tutorial/inference_time_scaling.md](/docs/en/Research_Tutorial/inference_time_scaling.md)
|
|
868
887
|
|
|
869
888
|
|FLUX.1-dev|FLUX.1-dev + SES|Qwen-Image|Qwen-Image + SES|
|
|
870
889
|
|-|-|-|-|
|
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
from .model_configs import MODEL_CONFIGS
|
|
2
|
-
from .vram_management_module_maps import VRAM_MANAGEMENT_MODULE_MAPS
|
|
2
|
+
from .vram_management_module_maps import VRAM_MANAGEMENT_MODULE_MAPS, VERSION_CHECKER_MAPS
|
|
@@ -718,6 +718,119 @@ ltx2_series = [
|
|
|
718
718
|
"model_name": "ltx2_latent_upsampler",
|
|
719
719
|
"model_class": "diffsynth.models.ltx2_upsampler.LTX2LatentUpsampler",
|
|
720
720
|
},
|
|
721
|
+
{
|
|
722
|
+
# Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
|
|
723
|
+
"model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
|
|
724
|
+
"model_name": "ltx2_dit",
|
|
725
|
+
"model_class": "diffsynth.models.ltx2_dit.LTXModel",
|
|
726
|
+
"extra_kwargs": {"apply_gated_attention": True, "cross_attention_adaln": True, "caption_channels": None},
|
|
727
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter",
|
|
728
|
+
},
|
|
729
|
+
{
|
|
730
|
+
# Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
|
|
731
|
+
"model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
|
|
732
|
+
"model_name": "ltx2_video_vae_encoder",
|
|
733
|
+
"model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder",
|
|
734
|
+
"extra_kwargs": {"encoder_version": "ltx-2.3"},
|
|
735
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter",
|
|
736
|
+
},
|
|
737
|
+
{
|
|
738
|
+
# Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
|
|
739
|
+
"model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
|
|
740
|
+
"model_name": "ltx2_video_vae_decoder",
|
|
741
|
+
"model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder",
|
|
742
|
+
"extra_kwargs": {"decoder_version": "ltx-2.3"},
|
|
743
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
|
|
744
|
+
},
|
|
745
|
+
{
|
|
746
|
+
# Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
|
|
747
|
+
"model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
|
|
748
|
+
"model_name": "ltx2_audio_vae_decoder",
|
|
749
|
+
"model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder",
|
|
750
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter",
|
|
751
|
+
},
|
|
752
|
+
{
|
|
753
|
+
# Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
|
|
754
|
+
"model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
|
|
755
|
+
"model_name": "ltx2_audio_vocoder",
|
|
756
|
+
"model_class": "diffsynth.models.ltx2_audio_vae.LTX2VocoderWithBWE",
|
|
757
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter",
|
|
758
|
+
},
|
|
759
|
+
{
|
|
760
|
+
# Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
|
|
761
|
+
"model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
|
|
762
|
+
"model_name": "ltx2_audio_vae_encoder",
|
|
763
|
+
"model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder",
|
|
764
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter",
|
|
765
|
+
},
|
|
766
|
+
{
|
|
767
|
+
# Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
|
|
768
|
+
"model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
|
|
769
|
+
"model_name": "ltx2_text_encoder_post_modules",
|
|
770
|
+
"model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
|
|
771
|
+
"extra_kwargs": {"separated_audio_video": True, "embedding_dim_gemma": 3840, "num_layers_gemma": 49, "video_attention_heads": 32, "video_attention_head_dim": 128, "audio_attention_heads": 32, "audio_attention_head_dim": 64, "num_connector_layers": 8, "apply_gated_attention": True},
|
|
772
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
|
|
773
|
+
},
|
|
774
|
+
{
|
|
775
|
+
# Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
|
|
776
|
+
"model_hash": "aed408774d694a2452f69936c32febb5",
|
|
777
|
+
"model_name": "ltx2_latent_upsampler",
|
|
778
|
+
"model_class": "diffsynth.models.ltx2_upsampler.LTX2LatentUpsampler",
|
|
779
|
+
"extra_kwargs": {"rational_resampler": False},
|
|
780
|
+
},
|
|
781
|
+
{
|
|
782
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="transformer.safetensors")
|
|
783
|
+
"model_hash": "1c55afad76ed33c112a2978550b524d1",
|
|
784
|
+
"model_name": "ltx2_dit",
|
|
785
|
+
"model_class": "diffsynth.models.ltx2_dit.LTXModel",
|
|
786
|
+
"extra_kwargs": {"apply_gated_attention": True, "cross_attention_adaln": True, "caption_channels": None},
|
|
787
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter",
|
|
788
|
+
},
|
|
789
|
+
{
|
|
790
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="video_vae_encoder.safetensors")
|
|
791
|
+
"model_hash": "eecdc07c2ec30863b8a2b8b2134036cf",
|
|
792
|
+
"model_name": "ltx2_video_vae_encoder",
|
|
793
|
+
"model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder",
|
|
794
|
+
"extra_kwargs": {"encoder_version": "ltx-2.3"},
|
|
795
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter",
|
|
796
|
+
},
|
|
797
|
+
{
|
|
798
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="video_vae_decoder.safetensors")
|
|
799
|
+
"model_hash": "deda2f542e17ee25bc8c38fd605316ea",
|
|
800
|
+
"model_name": "ltx2_video_vae_decoder",
|
|
801
|
+
"model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder",
|
|
802
|
+
"extra_kwargs": {"decoder_version": "ltx-2.3"},
|
|
803
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
|
|
804
|
+
},
|
|
805
|
+
{
|
|
806
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vocoder.safetensors")
|
|
807
|
+
"model_hash": "7d7823dde8f1ea0b50fb07ac329dd4cb",
|
|
808
|
+
"model_name": "ltx2_audio_vae_decoder",
|
|
809
|
+
"model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder",
|
|
810
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter",
|
|
811
|
+
},
|
|
812
|
+
{
|
|
813
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vae_encoder.safetensors")
|
|
814
|
+
"model_hash": "29338f3b95e7e312a3460a482e4f4554",
|
|
815
|
+
"model_name": "ltx2_audio_vae_encoder",
|
|
816
|
+
"model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder",
|
|
817
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter",
|
|
818
|
+
},
|
|
819
|
+
{
|
|
820
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vocoder.safetensors")
|
|
821
|
+
"model_hash": "cd436c99e69ec5c80f050f0944f02a15",
|
|
822
|
+
"model_name": "ltx2_audio_vocoder",
|
|
823
|
+
"model_class": "diffsynth.models.ltx2_audio_vae.LTX2VocoderWithBWE",
|
|
824
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter",
|
|
825
|
+
},
|
|
826
|
+
{
|
|
827
|
+
# Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors")
|
|
828
|
+
"model_hash": "05da2aab1c4b061f72c426311c165a43",
|
|
829
|
+
"model_name": "ltx2_text_encoder_post_modules",
|
|
830
|
+
"model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
|
|
831
|
+
"extra_kwargs": {"separated_audio_video": True, "embedding_dim_gemma": 3840, "num_layers_gemma": 49, "video_attention_heads": 32, "video_attention_head_dim": 128, "audio_attention_heads": 32, "audio_attention_head_dim": 64, "num_connector_layers": 8, "apply_gated_attention": True},
|
|
832
|
+
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
|
|
833
|
+
},
|
|
721
834
|
]
|
|
722
835
|
anima_series = [
|
|
723
836
|
{
|
|
@@ -735,4 +848,26 @@ anima_series = [
|
|
|
735
848
|
"state_dict_converter": "diffsynth.utils.state_dict_converters.anima_dit.AnimaDiTStateDictConverter",
|
|
736
849
|
}
|
|
737
850
|
]
|
|
738
|
-
|
|
851
|
+
|
|
852
|
+
mova_series = [
|
|
853
|
+
# Example: ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="audio_dit/diffusion_pytorch_model.safetensors")
|
|
854
|
+
{
|
|
855
|
+
"model_hash": "8c57e12790e2c45a64817e0ce28cde2f",
|
|
856
|
+
"model_name": "mova_audio_dit",
|
|
857
|
+
"model_class": "diffsynth.models.mova_audio_dit.MovaAudioDit",
|
|
858
|
+
"extra_kwargs": {'has_image_input': False, 'patch_size': [1], 'in_dim': 128, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 128, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06}
|
|
859
|
+
},
|
|
860
|
+
# Example: ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="audio_vae/diffusion_pytorch_model.safetensors")
|
|
861
|
+
{
|
|
862
|
+
"model_hash": "418517fb2b4e919d2cac8f314fcf82ac",
|
|
863
|
+
"model_name": "mova_audio_vae",
|
|
864
|
+
"model_class": "diffsynth.models.mova_audio_vae.DacVAE",
|
|
865
|
+
},
|
|
866
|
+
# Example: ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="dual_tower_bridge/diffusion_pytorch_model.safetensors")
|
|
867
|
+
{
|
|
868
|
+
"model_hash": "d1139dbbc8b4ab53cf4b4243d57bbceb",
|
|
869
|
+
"model_name": "mova_dual_tower_bridge",
|
|
870
|
+
"model_class": "diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge",
|
|
871
|
+
},
|
|
872
|
+
]
|
|
873
|
+
MODEL_CONFIGS = qwen_image_series + wan_series + flux_series + flux2_series + z_image_series + ltx2_series + anima_series + mova_series
|
|
@@ -249,4 +249,36 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
|
|
|
249
249
|
"torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
250
250
|
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
251
251
|
},
|
|
252
|
+
"diffsynth.models.mova_audio_dit.MovaAudioDit": {
|
|
253
|
+
"diffsynth.models.wan_video_dit.DiTBlock": "diffsynth.core.vram.layers.AutoWrappedNonRecurseModule",
|
|
254
|
+
"diffsynth.models.wan_video_dit.Head": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
255
|
+
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
|
|
256
|
+
"torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
257
|
+
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
258
|
+
"diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
259
|
+
},
|
|
260
|
+
"diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge": {
|
|
261
|
+
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
|
|
262
|
+
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
263
|
+
"diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
264
|
+
},
|
|
265
|
+
"diffsynth.models.mova_audio_vae.DacVAE": {
|
|
266
|
+
"diffsynth.models.mova_audio_vae.Snake1d": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
267
|
+
"torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
268
|
+
"torch.nn.ConvTranspose1d": "diffsynth.core.vram.layers.AutoWrappedModule",
|
|
269
|
+
},
|
|
252
270
|
}
|
|
271
|
+
|
|
272
|
+
def QwenImageTextEncoder_Module_Map_Updater():
|
|
273
|
+
current = VRAM_MANAGEMENT_MODULE_MAPS["diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder"]
|
|
274
|
+
from packaging import version
|
|
275
|
+
import transformers
|
|
276
|
+
if version.parse(transformers.__version__) >= version.parse("5.2.0"):
|
|
277
|
+
# The Qwen2RMSNorm in transformers 5.2.0+ has been renamed to Qwen2_5_VLRMSNorm, so we need to update the module map accordingly
|
|
278
|
+
current.pop("transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2RMSNorm", None)
|
|
279
|
+
current["transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLRMSNorm"] = "diffsynth.core.vram.layers.AutoWrappedModule"
|
|
280
|
+
return current
|
|
281
|
+
|
|
282
|
+
VERSION_CHECKER_MAPS = {
|
|
283
|
+
"diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder": QwenImageTextEncoder_Module_Map_Updater,
|
|
284
|
+
}
|
|
@@ -1,6 +1,8 @@
|
|
|
1
|
+
import math
|
|
1
2
|
import torch, torchvision, imageio, os
|
|
2
3
|
import imageio.v3 as iio
|
|
3
4
|
from PIL import Image
|
|
5
|
+
import torchaudio
|
|
4
6
|
|
|
5
7
|
|
|
6
8
|
class DataProcessingPipeline:
|
|
@@ -105,27 +107,59 @@ class ToList(DataProcessingOperator):
|
|
|
105
107
|
return [data]
|
|
106
108
|
|
|
107
109
|
|
|
108
|
-
class
|
|
109
|
-
def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1,
|
|
110
|
+
class FrameSamplerByRateMixin:
|
|
111
|
+
def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_rate=24, fix_frame_rate=False):
|
|
110
112
|
self.num_frames = num_frames
|
|
111
113
|
self.time_division_factor = time_division_factor
|
|
112
114
|
self.time_division_remainder = time_division_remainder
|
|
113
|
-
|
|
114
|
-
self.
|
|
115
|
-
|
|
115
|
+
self.frame_rate = frame_rate
|
|
116
|
+
self.fix_frame_rate = fix_frame_rate
|
|
117
|
+
|
|
118
|
+
def get_reader(self, data: str):
|
|
119
|
+
return imageio.get_reader(data)
|
|
120
|
+
|
|
121
|
+
def get_available_num_frames(self, reader):
|
|
122
|
+
if not self.fix_frame_rate:
|
|
123
|
+
return reader.count_frames()
|
|
124
|
+
meta_data = reader.get_meta_data()
|
|
125
|
+
total_original_frames = int(reader.count_frames())
|
|
126
|
+
duration = meta_data["duration"] if "duration" in meta_data else total_original_frames / meta_data['fps']
|
|
127
|
+
total_available_frames = math.floor(duration * self.frame_rate)
|
|
128
|
+
return int(total_available_frames)
|
|
129
|
+
|
|
116
130
|
def get_num_frames(self, reader):
|
|
117
131
|
num_frames = self.num_frames
|
|
118
|
-
|
|
119
|
-
|
|
132
|
+
total_frames = self.get_available_num_frames(reader)
|
|
133
|
+
if int(total_frames) < num_frames:
|
|
134
|
+
num_frames = total_frames
|
|
120
135
|
while num_frames > 1 and num_frames % self.time_division_factor != self.time_division_remainder:
|
|
121
136
|
num_frames -= 1
|
|
122
137
|
return num_frames
|
|
123
|
-
|
|
138
|
+
|
|
139
|
+
def map_single_frame_id(self, new_sequence_id: int, raw_frame_rate: float, total_raw_frames: int) -> int:
|
|
140
|
+
if not self.fix_frame_rate:
|
|
141
|
+
return new_sequence_id
|
|
142
|
+
target_time_in_seconds = new_sequence_id / self.frame_rate
|
|
143
|
+
raw_frame_index_float = target_time_in_seconds * raw_frame_rate
|
|
144
|
+
frame_id = int(round(raw_frame_index_float))
|
|
145
|
+
frame_id = min(frame_id, total_raw_frames - 1)
|
|
146
|
+
return frame_id
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class LoadVideo(DataProcessingOperator, FrameSamplerByRateMixin):
|
|
150
|
+
def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_processor=lambda x: x, frame_rate=24, fix_frame_rate=False):
|
|
151
|
+
FrameSamplerByRateMixin.__init__(self, num_frames, time_division_factor, time_division_remainder, frame_rate, fix_frame_rate)
|
|
152
|
+
# frame_processor is build in the video loader for high efficiency.
|
|
153
|
+
self.frame_processor = frame_processor
|
|
154
|
+
|
|
124
155
|
def __call__(self, data: str):
|
|
125
|
-
reader =
|
|
156
|
+
reader = self.get_reader(data)
|
|
157
|
+
raw_frame_rate = reader.get_meta_data()['fps']
|
|
126
158
|
num_frames = self.get_num_frames(reader)
|
|
159
|
+
total_raw_frames = reader.count_frames()
|
|
127
160
|
frames = []
|
|
128
161
|
for frame_id in range(num_frames):
|
|
162
|
+
frame_id = self.map_single_frame_id(frame_id, raw_frame_rate, total_raw_frames)
|
|
129
163
|
frame = reader.get_data(frame_id)
|
|
130
164
|
frame = Image.fromarray(frame)
|
|
131
165
|
frame = self.frame_processor(frame)
|
|
@@ -149,7 +183,7 @@ class LoadGIF(DataProcessingOperator):
|
|
|
149
183
|
self.time_division_remainder = time_division_remainder
|
|
150
184
|
# frame_processor is build in the video loader for high efficiency.
|
|
151
185
|
self.frame_processor = frame_processor
|
|
152
|
-
|
|
186
|
+
|
|
153
187
|
def get_num_frames(self, path):
|
|
154
188
|
num_frames = self.num_frames
|
|
155
189
|
images = iio.imread(path, mode="RGB")
|
|
@@ -220,14 +254,17 @@ class LoadAudio(DataProcessingOperator):
|
|
|
220
254
|
return input_audio
|
|
221
255
|
|
|
222
256
|
|
|
223
|
-
class LoadAudioWithTorchaudio(DataProcessingOperator):
|
|
224
|
-
|
|
225
|
-
|
|
257
|
+
class LoadAudioWithTorchaudio(DataProcessingOperator, FrameSamplerByRateMixin):
|
|
258
|
+
|
|
259
|
+
def __init__(self, num_frames=121, time_division_factor=8, time_division_remainder=1, frame_rate=24, fix_frame_rate=True):
|
|
260
|
+
FrameSamplerByRateMixin.__init__(self, num_frames, time_division_factor, time_division_remainder, frame_rate, fix_frame_rate)
|
|
226
261
|
|
|
227
262
|
def __call__(self, data: str):
|
|
228
|
-
|
|
263
|
+
reader = self.get_reader(data)
|
|
264
|
+
num_frames = self.get_num_frames(reader)
|
|
265
|
+
duration = num_frames / self.frame_rate
|
|
229
266
|
waveform, sample_rate = torchaudio.load(data)
|
|
230
|
-
target_samples = int(
|
|
267
|
+
target_samples = int(duration * sample_rate)
|
|
231
268
|
current_samples = waveform.shape[-1]
|
|
232
269
|
if current_samples > target_samples:
|
|
233
270
|
waveform = waveform[..., :target_samples]
|
|
@@ -42,6 +42,7 @@ class UnifiedDataset(torch.utils.data.Dataset):
|
|
|
42
42
|
max_pixels=1920*1080, height=None, width=None,
|
|
43
43
|
height_division_factor=16, width_division_factor=16,
|
|
44
44
|
num_frames=81, time_division_factor=4, time_division_remainder=1,
|
|
45
|
+
frame_rate=24, fix_frame_rate=False,
|
|
45
46
|
):
|
|
46
47
|
return RouteByType(operator_map=[
|
|
47
48
|
(str, ToAbsolutePath(base_path) >> RouteByExtensionName(operator_map=[
|
|
@@ -53,6 +54,7 @@ class UnifiedDataset(torch.utils.data.Dataset):
|
|
|
53
54
|
(("mp4", "avi", "mov", "wmv", "mkv", "flv", "webm"), LoadVideo(
|
|
54
55
|
num_frames, time_division_factor, time_division_remainder,
|
|
55
56
|
frame_processor=ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor),
|
|
57
|
+
frame_rate=frame_rate, fix_frame_rate=fix_frame_rate,
|
|
56
58
|
)),
|
|
57
59
|
])),
|
|
58
60
|
])
|
|
@@ -417,7 +417,7 @@ class AutoWrappedLinear(torch.nn.Linear, AutoTorchModule):
|
|
|
417
417
|
def lora_forward(self, x, out):
|
|
418
418
|
if self.lora_merger is None:
|
|
419
419
|
for lora_A, lora_B in zip(self.lora_A_weights, self.lora_B_weights):
|
|
420
|
-
out = out + x @ lora_A.T @ lora_B.T
|
|
420
|
+
out = out + x @ lora_A.T.to(device=x.device, dtype=x.dtype) @ lora_B.T.to(device=x.device, dtype=x.dtype)
|
|
421
421
|
else:
|
|
422
422
|
lora_output = []
|
|
423
423
|
for lora_A, lora_B in zip(self.lora_A_weights, self.lora_B_weights):
|
|
@@ -147,6 +147,12 @@ class BasePipeline(torch.nn.Module):
|
|
|
147
147
|
video = [self.vae_output_to_image(image, pattern="H W C", min_value=min_value, max_value=max_value) for image in vae_output]
|
|
148
148
|
return video
|
|
149
149
|
|
|
150
|
+
def output_audio_format_check(self, audio_output):
|
|
151
|
+
# output standard foramt: [C, T], output dtype: float()
|
|
152
|
+
# remove batch dim
|
|
153
|
+
if audio_output.ndim == 3:
|
|
154
|
+
audio_output = audio_output.squeeze(0)
|
|
155
|
+
return audio_output.float()
|
|
150
156
|
|
|
151
157
|
def load_models_to_device(self, model_names):
|
|
152
158
|
if self.vram_management_enabled:
|
|
@@ -1,9 +1,32 @@
|
|
|
1
|
-
import torch, json, os
|
|
1
|
+
import torch, json, os, inspect
|
|
2
2
|
from ..core import ModelConfig, load_state_dict
|
|
3
3
|
from ..utils.controlnet import ControlNetInput
|
|
4
|
+
from .base_pipeline import PipelineUnit
|
|
4
5
|
from peft import LoraConfig, inject_adapter_in_model
|
|
5
6
|
|
|
6
7
|
|
|
8
|
+
class GeneralUnit_RemoveCache(PipelineUnit):
|
|
9
|
+
def __init__(self, required_params=tuple(), force_remove_params_shared=tuple(), force_remove_params_posi=tuple(), force_remove_params_nega=tuple()):
|
|
10
|
+
super().__init__(take_over=True)
|
|
11
|
+
self.required_params = required_params
|
|
12
|
+
self.force_remove_params_shared = force_remove_params_shared
|
|
13
|
+
self.force_remove_params_posi = force_remove_params_posi
|
|
14
|
+
self.force_remove_params_nega = force_remove_params_nega
|
|
15
|
+
|
|
16
|
+
def process_params(self, inputs, required_params, force_remove_params):
|
|
17
|
+
inputs_ = {}
|
|
18
|
+
for name, param in inputs.items():
|
|
19
|
+
if name in required_params and name not in force_remove_params:
|
|
20
|
+
inputs_[name] = param
|
|
21
|
+
return inputs_
|
|
22
|
+
|
|
23
|
+
def process(self, pipe, inputs_shared, inputs_posi, inputs_nega):
|
|
24
|
+
inputs_shared = self.process_params(inputs_shared, self.required_params, self.force_remove_params_shared)
|
|
25
|
+
inputs_posi = self.process_params(inputs_posi, self.required_params, self.force_remove_params_posi)
|
|
26
|
+
inputs_nega = self.process_params(inputs_nega, self.required_params, self.force_remove_params_nega)
|
|
27
|
+
return inputs_shared, inputs_posi, inputs_nega
|
|
28
|
+
|
|
29
|
+
|
|
7
30
|
class DiffusionTrainingModule(torch.nn.Module):
|
|
8
31
|
def __init__(self):
|
|
9
32
|
super().__init__()
|
|
@@ -231,14 +254,30 @@ class DiffusionTrainingModule(torch.nn.Module):
|
|
|
231
254
|
setattr(pipe, lora_base_model, model)
|
|
232
255
|
|
|
233
256
|
|
|
234
|
-
def split_pipeline_units(
|
|
257
|
+
def split_pipeline_units(
|
|
258
|
+
self, task, pipe,
|
|
259
|
+
trainable_models=None, lora_base_model=None,
|
|
260
|
+
# TODO: set `remove_unnecessary_params` to `True` by default
|
|
261
|
+
remove_unnecessary_params=False,
|
|
262
|
+
# TODO: move `loss_required_params` to `loss.py`
|
|
263
|
+
loss_required_params=("input_latents", "max_timestep_boundary", "min_timestep_boundary", "first_frame_latents", "video_latents", "audio_input_latents", "num_inference_steps"),
|
|
264
|
+
force_remove_params_shared=tuple(),
|
|
265
|
+
force_remove_params_posi=tuple(),
|
|
266
|
+
force_remove_params_nega=tuple(),
|
|
267
|
+
):
|
|
235
268
|
models_require_backward = []
|
|
236
269
|
if trainable_models is not None:
|
|
237
270
|
models_require_backward += trainable_models.split(",")
|
|
238
271
|
if lora_base_model is not None:
|
|
239
272
|
models_require_backward += [lora_base_model]
|
|
240
273
|
if task.endswith(":data_process"):
|
|
241
|
-
|
|
274
|
+
other_units, pipe.units = pipe.split_pipeline_units(models_require_backward)
|
|
275
|
+
if remove_unnecessary_params:
|
|
276
|
+
required_params = list(loss_required_params) + [i for i in inspect.signature(self.pipe.model_fn).parameters]
|
|
277
|
+
for unit in other_units:
|
|
278
|
+
required_params.extend(unit.fetch_input_params())
|
|
279
|
+
required_params = sorted(list(set(required_params)))
|
|
280
|
+
pipe.units.append(GeneralUnit_RemoveCache(required_params, force_remove_params_shared, force_remove_params_posi, force_remove_params_nega))
|
|
242
281
|
elif task.endswith(":train"):
|
|
243
282
|
pipe.units, _ = pipe.split_pipeline_units(models_require_backward)
|
|
244
283
|
return pipe
|