diffsynth 1.1.2__tar.gz → 1.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffsynth-1.1.2 → diffsynth-1.1.7}/PKG-INFO +1 -1
- {diffsynth-1.1.2 → diffsynth-1.1.7}/README.md +16 -5
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/configs/model_config.py +52 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/controlnets/processors.py +7 -6
- diffsynth-1.1.7/diffsynth/distributed/xdit_context_parallel.py +129 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/factory.py +1 -1
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/flux_controlnet.py +2 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/flux_dit.py +13 -10
- diffsynth-1.1.7/diffsynth/models/flux_infiniteyou.py +128 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/hunyuan_video_dit.py +81 -46
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/hunyuan_video_text_encoder.py +23 -10
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/lora.py +67 -49
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/model_manager.py +4 -0
- diffsynth-1.1.7/diffsynth/models/wan_video_dit.py +554 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/wan_video_image_encoder.py +8 -10
- diffsynth-1.1.7/diffsynth/models/wan_video_motion_controller.py +44 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/wan_video_vae.py +3 -4
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/flux_image.py +78 -2
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/hunyuan_video.py +147 -17
- diffsynth-1.1.7/diffsynth/pipelines/wan_video.py +493 -0
- diffsynth-1.1.7/diffsynth/prompters/hunyuan_video_prompter.py +275 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/wan_prompter.py +2 -1
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/schedulers/flow_match.py +1 -1
- diffsynth-1.1.7/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/preprocessor_config.json +45 -0
- diffsynth-1.1.7/diffsynth/trainers/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/trainers/text_to_image.py +1 -1
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth.egg-info/PKG-INFO +1 -1
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth.egg-info/SOURCES.txt +5 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/setup.py +1 -1
- diffsynth-1.1.2/diffsynth/models/wan_video_dit.py +0 -799
- diffsynth-1.1.2/diffsynth/pipelines/wan_video.py +0 -276
- diffsynth-1.1.2/diffsynth/prompters/hunyuan_video_prompter.py +0 -143
- {diffsynth-1.1.2 → diffsynth-1.1.7}/LICENSE +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/configs/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/controlnets/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/controlnets/controlnet_unit.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/data/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/data/simple_text_image.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/data/video.py +0 -0
- {diffsynth-1.1.2/diffsynth/extensions → diffsynth-1.1.7/diffsynth/distributed}/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ESRGAN/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/api.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/cupy_kernels.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/data.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/patch_match.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/runners/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/runners/accurate.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/runners/balanced.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/runners/fast.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/runners/interpolation.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/BLIP/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/BLIP/blip.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/BLIP/blip_pretrain.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/BLIP/med.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/BLIP/vit.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/aesthetic.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/clip.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/config.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/hps.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/imagereward.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/mps.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/coca_model.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/constants.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/generation_utils.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/hf_configs.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/hf_model.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/loss.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/model.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/modified_resnet.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/openai.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/pretrained.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/push_to_hf_hub.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/timm_model.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/tokenizer.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/transform.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/transformer.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/utils.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/version.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/pickscore.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/trainer/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/trainer/models/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/trainer/models/base_model.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/trainer/models/clip_model.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/trainer/models/cross_modeling.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/RIFE/__init__.py +0 -0
- {diffsynth-1.1.2/diffsynth/processors → diffsynth-1.1.7/diffsynth/extensions}/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/attention.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/cog_dit.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/cog_vae.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/downloader.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/flux_ipadapter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/flux_text_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/flux_vae.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/hunyuan_dit.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/hunyuan_dit_text_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/hunyuan_video_vae_decoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/hunyuan_video_vae_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/kolors_text_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/omnigen.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd3_dit.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd3_text_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd3_vae_decoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd3_vae_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_controlnet.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_ipadapter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_motion.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_text_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_unet.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_vae_decoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_vae_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_controlnet.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_ipadapter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_motion.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_text_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_unet.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_vae_decoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_vae_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/stepvideo_dit.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/stepvideo_text_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/stepvideo_vae.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/svd_image_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/svd_unet.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/svd_vae_decoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/svd_vae_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/tiler.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/utils.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/wan_video_text_encoder.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/base.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/cog_video.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/dancer.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/hunyuan_image.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/omnigen_image.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/pipeline_runner.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/sd3_image.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/sd_image.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/sd_video.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/sdxl_image.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/sdxl_video.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/step_video.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/svd_video.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/processors/FastBlend.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/processors/PILEditor.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/processors/RIFE.py +0 -0
- {diffsynth-1.1.2/diffsynth/tokenizer_configs → diffsynth-1.1.7/diffsynth/processors}/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/processors/base.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/processors/sequencial_processor.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/base_prompter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/cog_prompter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/flux_prompter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/hunyuan_dit_prompter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/kolors_prompter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/omnigen_prompter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/omost.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/prompt_refiners.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/sd3_prompter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/sd_prompter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/sdxl_prompter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/stepvideo_prompter.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/schedulers/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/schedulers/continuous_ode.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/schedulers/ddim.py +0 -0
- {diffsynth-1.1.2/diffsynth/trainers → diffsynth-1.1.7/diffsynth/tokenizer_configs}/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/cog/tokenizer/added_tokens.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/cog/tokenizer/special_tokens_map.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/cog/tokenizer/spiece.model +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/cog/tokenizer/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_1/merges.txt +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_1/special_tokens_map.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_1/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_1/vocab.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_2/special_tokens_map.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_2/spiece.model +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_2/tokenizer.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_2/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/special_tokens_map.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/vocab.txt +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/vocab_org.txt +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/special_tokens_map.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/spiece.model +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/merges.txt +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/special_tokens_map.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/vocab.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/special_tokens_map.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/kolors/tokenizer/tokenizer.model +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/kolors/tokenizer/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/kolors/tokenizer/vocab.txt +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/merges.txt +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/special_tokens_map.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/vocab.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/merges.txt +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/special_tokens_map.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/vocab.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/merges.txt +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/special_tokens_map.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/vocab.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/special_tokens_map.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/spiece.model +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/tokenizer.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/merges.txt +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/special_tokens_map.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/tokenizer_config.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/vocab.json +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/vram_management/__init__.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/vram_management/layers.py +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth.egg-info/dependency_links.txt +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth.egg-info/requires.txt +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth.egg-info/top_level.txt +0 -0
- {diffsynth-1.1.2 → diffsynth-1.1.7}/setup.cfg +0 -0
**{diffsynth-1.1.2 → diffsynth-1.1.7}/README.md**

```diff
@@ -13,13 +13,19 @@ Document: https://diffsynth-studio.readthedocs.io/zh-cn/latest/index.html
 
 ## Introduction
 
-
+Welcome to the magic world of Diffusion models!
 
-
+DiffSynth consists of two open-source projects:
+* [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): Focused on aggressive technological exploration. Targeted at academia. Provides more cutting-edge technical support and novel inference capabilities.
+* [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine): Focused on stable model deployment. Geared towards industry. Offers better engineering support, higher computational performance, and more stable functionality.
+
+DiffSynth-Studio is an open-source project aimed at exploring innovations in AIGC technology. We have integrated numerous open-source Diffusion models, including FLUX and Wan, among others. Through this open-source project, we hope to connect models within the open-source community and explore new technologies based on diffusion models.
+
+Until now, DiffSynth-Studio has supported the following models:
 
 * [Wan-Video](https://github.com/Wan-Video/Wan2.1)
 * [StepVideo](https://github.com/stepfun-ai/Step-Video-T2V)
-* [HunyuanVideo](https://github.com/Tencent/HunyuanVideo)
+* [HunyuanVideo](https://github.com/Tencent/HunyuanVideo), [HunyuanVideo-I2V]()
 * [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
 * [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
 * [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
@@ -36,6 +42,11 @@ Until now, DiffSynth Studio has supported the following models:
 * [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
 
 ## News
+- **March 31, 2025** We support InfiniteYou, an identity preserving method for FLUX. Please refer to [./examples/InfiniteYou/](./examples/InfiniteYou/) for more details.
+
+- **March 25, 2025** 🔥🔥🔥 Our new open-source project, [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine), is now open-sourced! Focused on stable model deployment. Geared towards industry. Offers better engineering support, higher computational performance, and more stable functionality.
+
+- **March 13, 2025** We support HunyuanVideo-I2V, the image-to-video generation version of HunyuanVideo open-sourced by Tencent. Please refer to [./examples/HunyuanVideo/](./examples/HunyuanVideo/) for more details.
 
 - **February 25, 2025** We support Wan-Video, a collection of SOTA video synthesis models open-sourced by Alibaba. See [./examples/wanvideo/](./examples/wanvideo/).
 
@@ -43,7 +54,7 @@ Until now, DiffSynth Studio has supported the following models:
 
 - **December 31, 2024** We propose EliGen, a novel framework for precise entity-level controlled text-to-image generation, complemented by an inpainting fusion pipeline to extend its capabilities to image inpainting tasks. EliGen seamlessly integrates with existing community models, such as IP-Adapter and In-Context LoRA, enhancing its versatility. For more details, see [./examples/EntityControl](./examples/EntityControl/).
   - Paper: [EliGen: Entity-Level Controlled Image Generation with Regional Attention](https://arxiv.org/abs/2501.01097)
-  - Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen)
+  - Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen), [HuggingFace](https://huggingface.co/modelscope/EliGen)
   - Online Demo: [ModelScope EliGen Studio](https://www.modelscope.cn/studios/DiffSynth-Studio/EliGen)
   - Training Dataset: [EliGen Train Set](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)
 
@@ -72,7 +83,7 @@ Until now, DiffSynth Studio has supported the following models:
   - Enable CFG and highres-fix to improve visual quality. See [here](/examples/image_synthesis/README.md)
   - LoRA, ControlNet, and additional models will be available soon.
 
-- **June 21, 2024.**
+- **June 21, 2024.** We propose ExVideo, a post-tuning technique aimed at enhancing the capability of video generation models. We have extended Stable Video Diffusion to achieve the generation of long videos up to 128 frames.
  - [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/)
  - Source code is released in this repo. See [`examples/ExVideo`](./examples/ExVideo/).
  - Models are released on [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1) and [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1).
```
**{diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/configs/model_config.py**

```diff
@@ -37,6 +37,7 @@ from ..models.flux_text_encoder import FluxTextEncoder2
 from ..models.flux_vae import FluxVAEEncoder, FluxVAEDecoder
 from ..models.flux_controlnet import FluxControlNet
 from ..models.flux_ipadapter import FluxIpAdapter
+from ..models.flux_infiniteyou import InfiniteYouImageProjector
 
 from ..models.cog_vae import CogVAEEncoder, CogVAEDecoder
 from ..models.cog_dit import CogDiT
@@ -58,6 +59,7 @@ from ..models.wan_video_dit import WanModel
 from ..models.wan_video_text_encoder import WanTextEncoder
 from ..models.wan_video_image_encoder import WanImageEncoder
 from ..models.wan_video_vae import WanVideoVAE
+from ..models.wan_video_motion_controller import WanMotionControllerModel
 
 
 model_loader_configs = [
@@ -95,6 +97,7 @@ model_loader_configs = [
     (None, "57b02550baab820169365b3ee3afa2c9", ["flux_dit"], [FluxDiT], "civitai"),
     (None, "3394f306c4cbf04334b712bf5aaed95f", ["flux_dit"], [FluxDiT], "civitai"),
     (None, "023f054d918a84ccf503481fd1e3379e", ["flux_dit"], [FluxDiT], "civitai"),
+    (None, "605c56eab23e9e2af863ad8f0813a25d", ["flux_dit"], [FluxDiT], "diffusers"),
     (None, "280189ee084bca10f70907bf6ce1649d", ["cog_vae_encoder", "cog_vae_decoder"], [CogVAEEncoder, CogVAEDecoder], "diffusers"),
     (None, "9b9313d104ac4df27991352fec013fd4", ["rife"], [IFNet], "civitai"),
     (None, "6b7116078c4170bfbeaedc8fe71f6649", ["esrgan"], [RRDBNet], "civitai"),
@@ -103,6 +106,8 @@ model_loader_configs = [
     (None, "b001c89139b5f053c715fe772362dd2a", ["flux_controlnet"], [FluxControlNet], "diffusers"),
     (None, "52357cb26250681367488a8954c271e8", ["flux_controlnet"], [FluxControlNet], "diffusers"),
     (None, "0cfd1740758423a2a854d67c136d1e8c", ["flux_controlnet"], [FluxControlNet], "diffusers"),
+    (None, "7f9583eb8ba86642abb9a21a4b2c9e16", ["flux_controlnet"], [FluxControlNet], "diffusers"),
+    (None, "c07c0f04f5ff55e86b4e937c7a40d481", ["infiniteyou_image_projector"], [InfiniteYouImageProjector], "diffusers"),
     (None, "4daaa66cc656a8fe369908693dad0a35", ["flux_ipadapter"], [FluxIpAdapter], "diffusers"),
     (None, "51aed3d27d482fceb5e0739b03060e8f", ["sd3_dit", "sd3_vae_encoder", "sd3_vae_decoder"], [SD3DiT, SD3VAEEncoder, SD3VAEDecoder], "civitai"),
     (None, "98cc34ccc5b54ae0e56bdea8688dcd5a", ["sd3_text_encoder_2"], [SD3TextEncoder2], "civitai"),
@@ -116,10 +121,16 @@ model_loader_configs = [
     (None, "9269f8db9040a9d860eaca435be61814", ["wan_video_dit"], [WanModel], "civitai"),
     (None, "aafcfd9672c3a2456dc46e1cb6e52c70", ["wan_video_dit"], [WanModel], "civitai"),
     (None, "6bfcfb3b342cb286ce886889d519a77e", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "6d6ccde6845b95ad9114ab993d917893", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "6bfcfb3b342cb286ce886889d519a77e", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "349723183fc063b2bfc10bb2835cf677", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "efa44cddf936c70abd0ea28b6cbe946c", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "cb104773c6c2cb6df4f9529ad5c60d0b", ["wan_video_dit"], [WanModel], "diffusers"),
     (None, "9c8818c2cbea55eca56c7b447df170da", ["wan_video_text_encoder"], [WanTextEncoder], "civitai"),
     (None, "5941c53e207d62f20f9025686193c40b", ["wan_video_image_encoder"], [WanImageEncoder], "civitai"),
     (None, "1378ea763357eea97acdef78e65d6d96", ["wan_video_vae"], [WanVideoVAE], "civitai"),
     (None, "ccc42284ea13e1ad04693284c7a09be6", ["wan_video_vae"], [WanVideoVAE], "civitai"),
+    (None, "dbd5ec76bbf977983f972c151d545389", ["wan_video_motion_controller"], [WanMotionControllerModel], "civitai"),
 ]
 huggingface_model_loader_configs = [
     # These configs are provided for detecting model type automatically.
@@ -133,6 +144,7 @@ huggingface_model_loader_configs = [
     ("CogVideoXTransformer3DModel", "diffsynth.models.cog_dit", "cog_dit", "CogDiT"),
     ("SiglipModel", "transformers.models.siglip.modeling_siglip", "siglip_vision_model", "SiglipVisionModel"),
     ("LlamaForCausalLM", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoLLMEncoder"),
+    ("LlavaForConditionalGeneration", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoMLLMEncoder"),
     ("Step1Model", "diffsynth.models.stepvideo_text_encoder", "stepvideo_text_encoder_2", "STEP1TextEncoder"),
 ]
 patch_model_loader_configs = [
```
**{diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/configs/model_config.py** (continued: preset download configurations)

```diff
@@ -595,6 +607,25 @@ preset_models_on_modelscope = {
             "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
         ],
     },
+    "InfiniteYou":{
+        "file_list":[
+            ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors", "models/InfiniteYou/InfuseNetModel"),
+            ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors", "models/InfiniteYou/InfuseNetModel"),
+            ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/image_proj_model.bin", "models/InfiniteYou"),
+            ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/1k3d68.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
+            ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/2d106det.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
+            ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/genderage.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
+            ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/glintr100.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
+            ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/scrfd_10g_bnkps.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
+        ],
+        "load_path":[
+            [
+                "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors",
+                "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors"
+            ],
+            "models/InfiniteYou/image_proj_model.bin",
+        ],
+    },
     # ESRGAN
     "ESRGAN_x4": [
         ("AI-ModelScope/Real-ESRGAN", "RealESRGAN_x4.pth", "models/ESRGAN"),
@@ -675,6 +706,25 @@ preset_models_on_modelscope = {
             "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt"
         ],
     },
+    "HunyuanVideoI2V":{
+        "file_list": [
+            ("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideoI2V/text_encoder"),
+            ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00001-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
+            ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00002-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
+            ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00003-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
+            ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00004-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
+            ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "config.json", "models/HunyuanVideoI2V/text_encoder_2"),
+            ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model.safetensors.index.json", "models/HunyuanVideoI2V/text_encoder_2"),
+            ("AI-ModelScope/HunyuanVideo-I2V", "hunyuan-video-i2v-720p/vae/pytorch_model.pt", "models/HunyuanVideoI2V/vae"),
+            ("AI-ModelScope/HunyuanVideo-I2V", "hunyuan-video-i2v-720p/transformers/mp_rank_00_model_states.pt", "models/HunyuanVideoI2V/transformers")
+        ],
+        "load_path": [
+            "models/HunyuanVideoI2V/text_encoder/model.safetensors",
+            "models/HunyuanVideoI2V/text_encoder_2",
+            "models/HunyuanVideoI2V/vae/pytorch_model.pt",
+            "models/HunyuanVideoI2V/transformers/mp_rank_00_model_states.pt"
+        ],
+    },
     "HunyuanVideo-fp8":{
         "file_list": [
             ("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideo/text_encoder"),
@@ -735,6 +785,7 @@ Preset_model_id: TypeAlias = Literal[
     "Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
     "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
     "InstantX/FLUX.1-dev-IP-Adapter",
+    "InfiniteYou",
     "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
     "QwenPrompt",
     "OmostPrompt",
@@ -751,4 +802,5 @@ Preset_model_id: TypeAlias = Literal[
     "StableDiffusion3.5-medium",
     "HunyuanVideo",
     "HunyuanVideo-fp8",
+    "HunyuanVideoI2V",
 ]
```
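Each `file_list` entry is a `(modelscope_repo, file_in_repo, local_dir)` triple, and `load_path` lists what the pipeline hands to the model loader; a nested list, as in `InfiniteYou`, groups sharded files that are loaded together as one model. A sketch of fetching the two new presets, assuming diffsynth's existing `download_models` helper accepts the preset ids registered in the `Literal` type above:

```python
from diffsynth import download_models

# Preset ids added in this release (see the Preset_model_id Literal above).
download_models(["InfiniteYou", "HunyuanVideoI2V"])
```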
**{diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/controlnets/processors.py**

```diff
@@ -1,10 +1,4 @@
 from typing_extensions import Literal, TypeAlias
-import warnings
-with warnings.catch_warnings():
-    warnings.simplefilter("ignore")
-    from controlnet_aux.processor import (
-        CannyDetector, MidasDetector, HEDdetector, LineartDetector, LineartAnimeDetector, OpenposeDetector, NormalBaeDetector
-    )
 
 
 Processor_id: TypeAlias = Literal[
@@ -15,18 +9,25 @@ class Annotator:
     def __init__(self, processor_id: Processor_id, model_path="models/Annotators", detect_resolution=None, device='cuda', skip_processor=False):
         if not skip_processor:
             if processor_id == "canny":
+                from controlnet_aux.processor import CannyDetector
                 self.processor = CannyDetector()
             elif processor_id == "depth":
+                from controlnet_aux.processor import MidasDetector
                 self.processor = MidasDetector.from_pretrained(model_path).to(device)
             elif processor_id == "softedge":
+                from controlnet_aux.processor import HEDdetector
                 self.processor = HEDdetector.from_pretrained(model_path).to(device)
             elif processor_id == "lineart":
+                from controlnet_aux.processor import LineartDetector
                 self.processor = LineartDetector.from_pretrained(model_path).to(device)
             elif processor_id == "lineart_anime":
+                from controlnet_aux.processor import LineartAnimeDetector
                 self.processor = LineartAnimeDetector.from_pretrained(model_path).to(device)
             elif processor_id == "openpose":
+                from controlnet_aux.processor import OpenposeDetector
                 self.processor = OpenposeDetector.from_pretrained(model_path).to(device)
             elif processor_id == "normal":
+                from controlnet_aux.processor import NormalBaeDetector
                 self.processor = NormalBaeDetector.from_pretrained(model_path).to(device)
             elif processor_id == "tile" or processor_id == "none" or processor_id == "inpaint":
                 self.processor = None
```
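This refactor replaces the eager module-level `controlnet_aux` imports with per-branch imports inside `Annotator.__init__`, so `import diffsynth` no longer pulls in `controlnet_aux` (and its warning noise) unless an annotator is actually constructed. A usage sketch, assuming `controlnet_aux` is installed and that `Annotator` exposes a `__call__` that applies the wrapped processor:

```python
from PIL import Image
from diffsynth.controlnets.processors import Annotator

# Only the CannyDetector branch's import executes here; the other
# detectors (Midas, HED, ...) are never imported.
annotator = Annotator("canny")
edge_map = annotator(Image.open("input.png"))  # assumed __call__ signature
```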
**diffsynth-1.1.7/diffsynth/distributed/xdit_context_parallel.py** (new file)

```diff
@@ -0,0 +1,129 @@
+import torch
+from typing import Optional
+from einops import rearrange
+from xfuser.core.distributed import (get_sequence_parallel_rank,
+                                     get_sequence_parallel_world_size,
+                                     get_sp_group)
+from xfuser.core.long_ctx_attention import xFuserLongContextAttention
+
+def sinusoidal_embedding_1d(dim, position):
+    sinusoid = torch.outer(position.type(torch.float64), torch.pow(
+        10000, -torch.arange(dim//2, dtype=torch.float64, device=position.device).div(dim//2)))
+    x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
+    return x.to(position.dtype)
+
+def pad_freqs(original_tensor, target_len):
+    seq_len, s1, s2 = original_tensor.shape
+    pad_size = target_len - seq_len
+    padding_tensor = torch.ones(
+        pad_size,
+        s1,
+        s2,
+        dtype=original_tensor.dtype,
+        device=original_tensor.device)
+    padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
+    return padded_tensor
+
+def rope_apply(x, freqs, num_heads):
+    x = rearrange(x, "b s (n d) -> b s n d", n=num_heads)
+    s_per_rank = x.shape[1]
+
+    x_out = torch.view_as_complex(x.to(torch.float64).reshape(
+        x.shape[0], x.shape[1], x.shape[2], -1, 2))
+
+    sp_size = get_sequence_parallel_world_size()
+    sp_rank = get_sequence_parallel_rank()
+    freqs = pad_freqs(freqs, s_per_rank * sp_size)
+    freqs_rank = freqs[(sp_rank * s_per_rank):((sp_rank + 1) * s_per_rank), :, :]
+
+    x_out = torch.view_as_real(x_out * freqs_rank).flatten(2)
+    return x_out.to(x.dtype)
+
+def usp_dit_forward(self,
+                    x: torch.Tensor,
+                    timestep: torch.Tensor,
+                    context: torch.Tensor,
+                    clip_feature: Optional[torch.Tensor] = None,
+                    y: Optional[torch.Tensor] = None,
+                    use_gradient_checkpointing: bool = False,
+                    use_gradient_checkpointing_offload: bool = False,
+                    **kwargs,
+                    ):
+    t = self.time_embedding(
+        sinusoidal_embedding_1d(self.freq_dim, timestep))
+    t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
+    context = self.text_embedding(context)
+
+    if self.has_image_input:
+        x = torch.cat([x, y], dim=1)  # (b, c_x + c_y, f, h, w)
+        clip_embdding = self.img_emb(clip_feature)
+        context = torch.cat([clip_embdding, context], dim=1)
+
+    x, (f, h, w) = self.patchify(x)
+
+    freqs = torch.cat([
+        self.freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
+        self.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
+        self.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
+    ], dim=-1).reshape(f * h * w, 1, -1).to(x.device)
+
+    def create_custom_forward(module):
+        def custom_forward(*inputs):
+            return module(*inputs)
+        return custom_forward
+
+    # Context Parallel
+    x = torch.chunk(
+        x, get_sequence_parallel_world_size(),
+        dim=1)[get_sequence_parallel_rank()]
+
+    for block in self.blocks:
+        if self.training and use_gradient_checkpointing:
+            if use_gradient_checkpointing_offload:
+                with torch.autograd.graph.save_on_cpu():
+                    x = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(block),
+                        x, context, t_mod, freqs,
+                        use_reentrant=False,
+                    )
+            else:
+                x = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    x, context, t_mod, freqs,
+                    use_reentrant=False,
+                )
+        else:
+            x = block(x, context, t_mod, freqs)
+
+    x = self.head(x, t)
+
+    # Context Parallel
+    x = get_sp_group().all_gather(x, dim=1)
+
+    # unpatchify
+    x = self.unpatchify(x, (f, h, w))
+    return x
+
+
+def usp_attn_forward(self, x, freqs):
+    q = self.norm_q(self.q(x))
+    k = self.norm_k(self.k(x))
+    v = self.v(x)
+
+    q = rope_apply(q, freqs, self.num_heads)
+    k = rope_apply(k, freqs, self.num_heads)
+    q = rearrange(q, "b s (n d) -> b s n d", n=self.num_heads)
+    k = rearrange(k, "b s (n d) -> b s n d", n=self.num_heads)
+    v = rearrange(v, "b s (n d) -> b s n d", n=self.num_heads)
+
+    x = xFuserLongContextAttention()(
+        None,
+        query=q,
+        key=k,
+        value=v,
+    )
+    x = x.flatten(2)
+
+    del q, k, v
+    torch.cuda.empty_cache()
+    return self.o(x)
```
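This new module adds xDiT-style sequence parallelism for the Wan video DiT: tokens are chunked across ranks after `patchify`, `rope_apply` pads the rotary-frequency table to a multiple of the world size and slices out each rank's segment, attention runs through xfuser's `xFuserLongContextAttention`, and the shards are all-gathered before `unpatchify`. A sketch of binding these functions onto a loaded model as replacement forwards, assuming the Wan DiT exposes `blocks[i].self_attn` as in `wan_video_dit.py` and that torch.distributed plus xfuser's parallel environment are already initialized (the actual wiring lives in the new `wan_video.py` pipeline, not reproduced here):

```python
import types
from diffsynth.distributed.xdit_context_parallel import usp_dit_forward, usp_attn_forward

def enable_sequence_parallel(dit):
    # Swap in the sequence-parallel forward for the whole DiT ...
    dit.forward = types.MethodType(usp_dit_forward, dit)
    # ... and for every block's self-attention (assumed attribute name).
    for block in dit.blocks:
        block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn)
    return dit
```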
**{diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/flux_controlnet.py**

```diff
@@ -318,6 +318,8 @@ class FluxControlNetStateDictConverter:
             extra_kwargs = {"num_joint_blocks": 6, "num_single_blocks": 0, "additional_input_dim": 4}
         elif hash_value == "0cfd1740758423a2a854d67c136d1e8c":
             extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 1}
+        elif hash_value == "7f9583eb8ba86642abb9a21a4b2c9e16":
+            extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 10}
         else:
             extra_kwargs = {}
         return state_dict_, extra_kwargs
```
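The new hash registers a FLUX ControlNet variant built from 4 joint (double-stream) blocks and 10 single-stream blocks, matching the InfiniteYou InfuseNet checkpoint added to `model_loader_configs` above. Assuming `FluxControlNet.__init__` accepts the same keyword arguments the converter emits as `extra_kwargs`, the variant could be constructed directly:

```python
from diffsynth.models.flux_controlnet import FluxControlNet

# Architecture registered under hash 7f9583eb8ba86642abb9a21a4b2c9e16 above.
controlnet = FluxControlNet(num_joint_blocks=4, num_single_blocks=10)
```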
**{diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/flux_dit.py** (removed lines were truncated in the extracted diff; reconstructed from the surviving `.proj_in_besides_attn.` references)

```diff
@@ -628,19 +628,22 @@ class FluxDiTStateDictConverter:
         else:
             pass
         for name in list(state_dict_.keys()):
-            if ".proj_in_besides_attn." in name:
-                name_ = name.replace(".proj_in_besides_attn.", ".to_qkv_mlp.")
+            if "single_blocks." in name and ".a_to_q." in name:
+                mlp = state_dict_.get(name.replace(".a_to_q.", ".proj_in_besides_attn."), None)
+                if mlp is None:
+                    mlp = torch.zeros(4 * state_dict_[name].shape[0],
+                                      *state_dict_[name].shape[1:],
+                                      dtype=state_dict_[name].dtype)
+                else:
+                    state_dict_.pop(name.replace(".a_to_q.", ".proj_in_besides_attn."))
                 param = torch.concat([
-                    state_dict_[name.replace(".proj_in_besides_attn.", f".a_to_q.")],
-                    state_dict_[name.replace(".proj_in_besides_attn.", f".a_to_k.")],
-                    state_dict_[name.replace(".proj_in_besides_attn.", f".a_to_v.")],
-                    state_dict_[name],
+                    state_dict_.pop(name),
+                    state_dict_.pop(name.replace(".a_to_q.", ".a_to_k.")),
+                    state_dict_.pop(name.replace(".a_to_q.", ".a_to_v.")),
+                    mlp,
                 ], dim=0)
+                name_ = name.replace(".a_to_q.", ".to_qkv_mlp.")
                 state_dict_[name_] = param
-                state_dict_.pop(name.replace(".proj_in_besides_attn.", f".a_to_q."))
-                state_dict_.pop(name.replace(".proj_in_besides_attn.", f".a_to_k."))
-                state_dict_.pop(name.replace(".proj_in_besides_attn.", f".a_to_v."))
-                state_dict_.pop(name)
         for name in list(state_dict_.keys()):
             for component in ["a", "b"]:
                 if f".{component}_to_q." in name:
```
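The rewritten converter now iterates over the `a_to_q` keys instead of `proj_in_besides_attn`, fusing each single-stream block's separate q/k/v projections and its parallel MLP input projection into one `to_qkv_mlp` weight, and zero-filling the MLP slice (whose inner width is 4x the attention width) when a checkpoint lacks it. A small self-contained sketch of the same fusion on dummy tensors:

```python
import torch

dim = 8
state_dict = {
    "single_blocks.0.a_to_q.weight": torch.randn(dim, dim),
    "single_blocks.0.a_to_k.weight": torch.randn(dim, dim),
    "single_blocks.0.a_to_v.weight": torch.randn(dim, dim),
    # No "single_blocks.0.proj_in_besides_attn.weight": the MLP slice is zero-filled.
}

name = "single_blocks.0.a_to_q.weight"
mlp = state_dict.get(name.replace(".a_to_q.", ".proj_in_besides_attn."),
                     torch.zeros(4 * state_dict[name].shape[0], *state_dict[name].shape[1:]))
fused = torch.concat([
    state_dict.pop(name),
    state_dict.pop(name.replace(".a_to_q.", ".a_to_k.")),
    state_dict.pop(name.replace(".a_to_q.", ".a_to_v.")),
    mlp,
], dim=0)
state_dict[name.replace(".a_to_q.", ".to_qkv_mlp.")] = fused
print(fused.shape)  # torch.Size([56, 8]): (3 + 4) * dim rows
```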
**diffsynth-1.1.7/diffsynth/models/flux_infiniteyou.py** (new file)

```diff
@@ -0,0 +1,128 @@
+import math
+import torch
+import torch.nn as nn
+
+
+# FFN
+def FeedForward(dim, mult=4):
+    inner_dim = int(dim * mult)
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        nn.GELU(),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+
+
+def reshape_tensor(x, heads):
+    bs, length, width = x.shape
+    #(bs, length, width) --> (bs, length, n_heads, dim_per_head)
+    x = x.view(bs, length, heads, -1)
+    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+    x = x.transpose(1, 2)
+    # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
+    x = x.reshape(bs, heads, length, -1)
+    return x
+
+
+class PerceiverAttention(nn.Module):
+
+    def __init__(self, *, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.dim_head = dim_head
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, n1, D)
+            latent (torch.Tensor): latent features
+                shape (b, n2, D)
+        """
+        x = self.norm1(x)
+        latents = self.norm2(latents)
+
+        b, l, _ = latents.shape
+
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)
+        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+
+        q = reshape_tensor(q, self.heads)
+        k = reshape_tensor(k, self.heads)
+        v = reshape_tensor(v, self.heads)
+
+        # attention
+        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        out = weight @ v
+
+        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+
+        return self.to_out(out)
+
+
+class InfiniteYouImageProjector(nn.Module):
+
+    def __init__(
+        self,
+        dim=1280,
+        depth=4,
+        dim_head=64,
+        heads=20,
+        num_queries=8,
+        embedding_dim=512,
+        output_dim=4096,
+        ff_mult=4,
+    ):
+        super().__init__()
+        self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
+        self.proj_in = nn.Linear(embedding_dim, dim)
+
+        self.proj_out = nn.Linear(dim, output_dim)
+        self.norm_out = nn.LayerNorm(output_dim)
+
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList([
+                    PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                    FeedForward(dim=dim, mult=ff_mult),
+                ]))
+
+    def forward(self, x):
+
+        latents = self.latents.repeat(x.size(0), 1, 1)
+
+        x = self.proj_in(x)
+
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+
+    @staticmethod
+    def state_dict_converter():
+        return FluxInfiniteYouImageProjectorStateDictConverter()
+
+
+class FluxInfiniteYouImageProjectorStateDictConverter:
+
+    def __init__(self):
+        pass
+
+    def from_diffusers(self, state_dict):
+        return state_dict['image_proj']
```