diffsynth 1.1.1__tar.gz → 1.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffsynth-1.1.1 → diffsynth-1.1.2}/PKG-INFO +1 -1
- {diffsynth-1.1.1 → diffsynth-1.1.2}/README.md +19 -5
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/configs/model_config.py +19 -1
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/data/video.py +2 -2
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/BLIP/__init__.py +1 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/BLIP/blip.py +77 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/BLIP/blip_pretrain.py +44 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/BLIP/med.py +947 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/BLIP/vit.py +301 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/__init__.py +148 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/aesthetic.py +148 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/clip.py +97 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/config.py +23 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/hps.py +118 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/imagereward.py +212 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/mps.py +129 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/__init__.py +14 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/coca_model.py +458 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/constants.py +2 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/factory.py +433 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/hf_configs.py +45 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/hf_model.py +176 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/loss.py +270 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/model.py +461 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/modified_resnet.py +181 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/openai.py +144 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/pretrained.py +376 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/push_to_hf_hub.py +243 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/timm_model.py +127 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/tokenizer.py +211 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/transform.py +216 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/transformer.py +727 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/utils.py +60 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/version.py +1 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/pickscore.py +112 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/trainer/__init__.py +1 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/trainer/models/__init__.py +3 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/trainer/models/base_model.py +7 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/trainer/models/clip_model.py +146 -0
- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/trainer/models/cross_modeling.py +292 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/kolors_text_encoder.py +1 -2
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/lora.py +2 -1
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/model_manager.py +16 -7
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sd3_text_encoder.py +2 -1
- diffsynth-1.1.2/diffsynth/models/stepvideo_dit.py +940 -0
- diffsynth-1.1.2/diffsynth/models/stepvideo_text_encoder.py +553 -0
- diffsynth-1.1.2/diffsynth/models/stepvideo_vae.py +1132 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/tiler.py +1 -1
- diffsynth-1.1.2/diffsynth/models/wan_video_dit.py +799 -0
- diffsynth-1.1.2/diffsynth/models/wan_video_image_encoder.py +904 -0
- diffsynth-1.1.2/diffsynth/models/wan_video_text_encoder.py +269 -0
- diffsynth-1.1.2/diffsynth/models/wan_video_vae.py +808 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/__init__.py +2 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/base.py +12 -2
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/flux_image.py +104 -2
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/omnigen_image.py +1 -1
- diffsynth-1.1.2/diffsynth/pipelines/step_video.py +209 -0
- diffsynth-1.1.2/diffsynth/pipelines/wan_video.py +276 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/__init__.py +2 -0
- diffsynth-1.1.2/diffsynth/prompters/stepvideo_prompter.py +56 -0
- diffsynth-1.1.2/diffsynth/prompters/wan_prompter.py +108 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/schedulers/flow_match.py +8 -3
- diffsynth-1.1.2/diffsynth/trainers/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/trainers/text_to_image.py +27 -2
- diffsynth-1.1.2/diffsynth/vram_management/__init__.py +1 -0
- diffsynth-1.1.2/diffsynth/vram_management/layers.py +95 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth.egg-info/PKG-INFO +1 -1
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth.egg-info/SOURCES.txt +51 -1
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth.egg-info/requires.txt +1 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/setup.py +1 -1
- {diffsynth-1.1.1 → diffsynth-1.1.2}/LICENSE +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/configs/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/controlnets/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/controlnets/controlnet_unit.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/controlnets/processors.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/data/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/data/simple_text_image.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/extensions/ESRGAN/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/extensions/FastBlend/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/extensions/FastBlend/api.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/extensions/FastBlend/cupy_kernels.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/extensions/FastBlend/data.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/extensions/FastBlend/patch_match.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/extensions/FastBlend/runners/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/extensions/FastBlend/runners/accurate.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/extensions/FastBlend/runners/balanced.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/extensions/FastBlend/runners/fast.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/extensions/FastBlend/runners/interpolation.py +0 -0
- /diffsynth-1.1.1/diffsynth/extensions/__init__.py → /diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/generation_utils.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/extensions/RIFE/__init__.py +0 -0
- {diffsynth-1.1.1/diffsynth/processors → diffsynth-1.1.2/diffsynth/extensions}/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/attention.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/cog_dit.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/cog_vae.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/downloader.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/flux_controlnet.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/flux_dit.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/flux_ipadapter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/flux_text_encoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/flux_vae.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/hunyuan_dit.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/hunyuan_dit_text_encoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/hunyuan_video_dit.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/hunyuan_video_text_encoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/hunyuan_video_vae_decoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/hunyuan_video_vae_encoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/omnigen.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sd3_dit.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sd3_vae_decoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sd3_vae_encoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sd_controlnet.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sd_ipadapter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sd_motion.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sd_text_encoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sd_unet.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sd_vae_decoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sd_vae_encoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sdxl_controlnet.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sdxl_ipadapter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sdxl_motion.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sdxl_text_encoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sdxl_unet.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sdxl_vae_decoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/sdxl_vae_encoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/svd_image_encoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/svd_unet.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/svd_vae_decoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/svd_vae_encoder.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/models/utils.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/cog_video.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/dancer.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/hunyuan_image.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/hunyuan_video.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/pipeline_runner.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/sd3_image.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/sd_image.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/sd_video.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/sdxl_image.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/sdxl_video.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/pipelines/svd_video.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/processors/FastBlend.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/processors/PILEditor.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/processors/RIFE.py +0 -0
- {diffsynth-1.1.1/diffsynth/tokenizer_configs → diffsynth-1.1.2/diffsynth/processors}/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/processors/base.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/processors/sequencial_processor.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/base_prompter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/cog_prompter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/flux_prompter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/hunyuan_dit_prompter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/hunyuan_video_prompter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/kolors_prompter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/omnigen_prompter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/omost.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/prompt_refiners.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/sd3_prompter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/sd_prompter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/prompters/sdxl_prompter.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/schedulers/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/schedulers/continuous_ode.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/schedulers/ddim.py +0 -0
- {diffsynth-1.1.1/diffsynth/trainers → diffsynth-1.1.2/diffsynth/tokenizer_configs}/__init__.py +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/cog/tokenizer/added_tokens.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/cog/tokenizer/special_tokens_map.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/cog/tokenizer/spiece.model +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/cog/tokenizer/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/flux/tokenizer_1/merges.txt +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/flux/tokenizer_1/special_tokens_map.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/flux/tokenizer_1/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/flux/tokenizer_1/vocab.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/flux/tokenizer_2/special_tokens_map.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/flux/tokenizer_2/spiece.model +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/flux/tokenizer_2/tokenizer.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/flux/tokenizer_2/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/special_tokens_map.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/vocab.txt +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/vocab_org.txt +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/special_tokens_map.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/spiece.model +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/merges.txt +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/special_tokens_map.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/vocab.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/special_tokens_map.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/kolors/tokenizer/tokenizer.model +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/kolors/tokenizer/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/kolors/tokenizer/vocab.txt +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/merges.txt +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/special_tokens_map.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/vocab.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/merges.txt +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/special_tokens_map.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/vocab.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/merges.txt +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/special_tokens_map.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/vocab.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/special_tokens_map.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/spiece.model +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/tokenizer.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/merges.txt +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/special_tokens_map.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/tokenizer_config.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/vocab.json +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth.egg-info/dependency_links.txt +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/diffsynth.egg-info/top_level.txt +0 -0
- {diffsynth-1.1.1 → diffsynth-1.1.2}/setup.cfg +0 -0
|
@@ -17,6 +17,8 @@ DiffSynth Studio is a Diffusion engine. We have restructured architectures inclu
|
|
|
17
17
|
|
|
18
18
|
Until now, DiffSynth Studio has supported the following models:
|
|
19
19
|
|
|
20
|
+
* [Wan-Video](https://github.com/Wan-Video/Wan2.1)
|
|
21
|
+
* [StepVideo](https://github.com/stepfun-ai/Step-Video-T2V)
|
|
20
22
|
* [HunyuanVideo](https://github.com/Tencent/HunyuanVideo)
|
|
21
23
|
* [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
|
|
22
24
|
* [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
|
|
@@ -34,11 +36,16 @@ Until now, DiffSynth Studio has supported the following models:
|
|
|
34
36
|
* [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
|
|
35
37
|
|
|
36
38
|
## News
|
|
39
|
+
|
|
40
|
+
- **February 25, 2025** We support Wan-Video, a collection of SOTA video synthesis models open-sourced by Alibaba. See [./examples/wanvideo/](./examples/wanvideo/).
|
|
41
|
+
|
|
42
|
+
- **February 17, 2025** We support [StepVideo](https://modelscope.cn/models/stepfun-ai/stepvideo-t2v/summary)! State-of-the-art video synthesis model! See [./examples/stepvideo](./examples/stepvideo/).
|
|
43
|
+
|
|
37
44
|
- **December 31, 2024** We propose EliGen, a novel framework for precise entity-level controlled text-to-image generation, complemented by an inpainting fusion pipeline to extend its capabilities to image inpainting tasks. EliGen seamlessly integrates with existing community models, such as IP-Adapter and In-Context LoRA, enhancing its versatility. For more details, see [./examples/EntityControl](./examples/EntityControl/).
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
45
|
+
- Paper: [EliGen: Entity-Level Controlled Image Generation with Regional Attention](https://arxiv.org/abs/2501.01097)
|
|
46
|
+
- Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen)
|
|
47
|
+
- Online Demo: [ModelScope EliGen Studio](https://www.modelscope.cn/studios/DiffSynth-Studio/EliGen)
|
|
48
|
+
- Training Dataset: [EliGen Train Set](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)
|
|
42
49
|
|
|
43
50
|
- **December 19, 2024** We implement advanced VRAM management for HunyuanVideo, making it possible to generate videos at a resolution of 129x720x1280 using 24GB of VRAM, or at 129x512x384 resolution with just 6GB of VRAM. Please refer to [./examples/HunyuanVideo/](./examples/HunyuanVideo/) for more details.
|
|
44
51
|
|
|
@@ -114,12 +121,19 @@ cd DiffSynth-Studio
|
|
|
114
121
|
pip install -e .
|
|
115
122
|
```
|
|
116
123
|
|
|
117
|
-
Or install from pypi:
|
|
124
|
+
Or install from PyPI (note that PyPI releases lag behind the repository, so use the source installation if you want the latest features):
|
|
118
125
|
|
|
119
126
|
```
|
|
120
127
|
pip install diffsynth
|
|
121
128
|
```
|
|
122
129
|
|
|
130
|
+
If you encounter issues during installation, they may be caused by the packages we depend on. Please refer to the documentation of the package that caused the problem.
|
|
131
|
+
|
|
132
|
+
* [torch](https://pytorch.org/get-started/locally/)
|
|
133
|
+
* [sentencepiece](https://github.com/google/sentencepiece)
|
|
134
|
+
* [cmake](https://cmake.org)
|
|
135
|
+
* [cupy](https://docs.cupy.dev/en/stable/install.html)
|
|
136
|
+
|
|
123
137
|
## Usage (in Python code)
|
|
124
138
|
|
|
125
139
|
The Python examples are in [`examples`](./examples/). We provide an overview here.
|
|
@@ -51,6 +51,14 @@ from ..extensions.ESRGAN import RRDBNet
|
|
|
51
51
|
|
|
52
52
|
from ..models.hunyuan_video_dit import HunyuanVideoDiT
|
|
53
53
|
|
|
54
|
+
from ..models.stepvideo_vae import StepVideoVAE
|
|
55
|
+
from ..models.stepvideo_dit import StepVideoModel
|
|
56
|
+
|
|
57
|
+
from ..models.wan_video_dit import WanModel
|
|
58
|
+
from ..models.wan_video_text_encoder import WanTextEncoder
|
|
59
|
+
from ..models.wan_video_image_encoder import WanImageEncoder
|
|
60
|
+
from ..models.wan_video_vae import WanVideoVAE
|
|
61
|
+
|
|
54
62
|
|
|
55
63
|
model_loader_configs = [
|
|
56
64
|
# These configs are provided for detecting model type automatically.
|
|
@@ -103,6 +111,15 @@ model_loader_configs = [
|
|
|
103
111
|
(None, "aeb82dce778a03dcb4d726cb03f3c43f", ["hunyuan_video_vae_decoder", "hunyuan_video_vae_encoder"], [HunyuanVideoVAEDecoder, HunyuanVideoVAEEncoder], "diffusers"),
|
|
104
112
|
(None, "b9588f02e78f5ccafc9d7c0294e46308", ["hunyuan_video_dit"], [HunyuanVideoDiT], "civitai"),
|
|
105
113
|
(None, "84ef4bd4757f60e906b54aa6a7815dc6", ["hunyuan_video_dit"], [HunyuanVideoDiT], "civitai"),
|
|
114
|
+
(None, "68beaf8429b7c11aa8ca05b1bd0058bd", ["stepvideo_vae"], [StepVideoVAE], "civitai"),
|
|
115
|
+
(None, "5c0216a2132b082c10cb7a0e0377e681", ["stepvideo_dit"], [StepVideoModel], "civitai"),
|
|
116
|
+
(None, "9269f8db9040a9d860eaca435be61814", ["wan_video_dit"], [WanModel], "civitai"),
|
|
117
|
+
(None, "aafcfd9672c3a2456dc46e1cb6e52c70", ["wan_video_dit"], [WanModel], "civitai"),
|
|
118
|
+
(None, "6bfcfb3b342cb286ce886889d519a77e", ["wan_video_dit"], [WanModel], "civitai"),
|
|
119
|
+
(None, "9c8818c2cbea55eca56c7b447df170da", ["wan_video_text_encoder"], [WanTextEncoder], "civitai"),
|
|
120
|
+
(None, "5941c53e207d62f20f9025686193c40b", ["wan_video_image_encoder"], [WanImageEncoder], "civitai"),
|
|
121
|
+
(None, "1378ea763357eea97acdef78e65d6d96", ["wan_video_vae"], [WanVideoVAE], "civitai"),
|
|
122
|
+
(None, "ccc42284ea13e1ad04693284c7a09be6", ["wan_video_vae"], [WanVideoVAE], "civitai"),
|
|
106
123
|
]
|
|
107
124
|
huggingface_model_loader_configs = [
|
|
108
125
|
# These configs are provided for detecting model type automatically.
|
|
@@ -115,7 +132,8 @@ huggingface_model_loader_configs = [
|
|
|
115
132
|
("T5EncoderModel", "diffsynth.models.flux_text_encoder", "flux_text_encoder_2", "FluxTextEncoder2"),
|
|
116
133
|
("CogVideoXTransformer3DModel", "diffsynth.models.cog_dit", "cog_dit", "CogDiT"),
|
|
117
134
|
("SiglipModel", "transformers.models.siglip.modeling_siglip", "siglip_vision_model", "SiglipVisionModel"),
|
|
118
|
-
("LlamaForCausalLM", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoLLMEncoder")
|
|
135
|
+
("LlamaForCausalLM", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoLLMEncoder"),
|
|
136
|
+
("Step1Model", "diffsynth.models.stepvideo_text_encoder", "stepvideo_text_encoder_2", "STEP1TextEncoder"),
|
|
119
137
|
]
|
|
120
138
|
patch_model_loader_configs = [
|
|
121
139
|
# These configs are provided for detecting model type automatically.
|
|
@@ -135,8 +135,8 @@ class VideoData:
|
|
|
135
135
|
frame.save(os.path.join(folder, f"{i}.png"))
|
|
136
136
|
|
|
137
137
|
|
|
138
|
-
def save_video(frames, save_path, fps, quality=9):
|
|
139
|
-
writer = imageio.get_writer(save_path, fps=fps, quality=quality)
|
|
138
|
+
def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
|
|
139
|
+
writer = imageio.get_writer(save_path, fps=fps, quality=quality, ffmpeg_params=ffmpeg_params)
|
|
140
140
|
for frame in tqdm(frames, desc="Saving video"):
|
|
141
141
|
frame = np.array(frame)
|
|
142
142
|
writer.append_data(frame)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .blip_pretrain import *
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
'''
|
|
2
|
+
* Adapted from BLIP (https://github.com/salesforce/BLIP)
|
|
3
|
+
'''
|
|
4
|
+
|
|
5
|
+
import warnings
|
|
6
|
+
warnings.filterwarnings("ignore")
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
import os
|
|
10
|
+
from urllib.parse import urlparse
|
|
11
|
+
from timm.models.hub import download_cached_file
|
|
12
|
+
from transformers import BertTokenizer
|
|
13
|
+
from .vit import VisionTransformer, interpolate_pos_embed
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def default_bert():
    """Return the default path of the local ``bert-base-uncased`` directory.

    The result is ``<root>/models/QualityMetric/bert-base-uncased`` where
    ``<root>`` is the directory four levels above the one containing this
    module. The path is only computed; nothing checks that it exists.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    root = os.path.abspath(os.path.join(here, '../../../../'))
    return os.path.join(root, 'models', 'QualityMetric', "bert-base-uncased")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def init_tokenizer(bert_model_path):
    """Build the BERT tokenizer with the extra ``[DEC]`` and ``[ENC]`` tokens.

    Args:
        bert_model_path: value forwarded to ``BertTokenizer.from_pretrained``.

    Returns:
        The tokenizer, with ``[DEC]`` registered as its BOS token, ``[ENC]``
        registered as an additional special token, and the id of ``[ENC]``
        stored on the instance as ``enc_token_id``.
    """
    tok = BertTokenizer.from_pretrained(bert_model_path)
    # Register the tokens one call at a time, in this order, exactly as the
    # reference implementation does.
    for extra_tokens in (
        {'bos_token': '[DEC]'},
        {'additional_special_tokens': ['[ENC]']},
    ):
        tok.add_special_tokens(extra_tokens)
    tok.enc_token_id = tok.additional_special_tokens_ids[0]
    return tok
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def create_vit(vit, image_size, use_grad_checkpointing=False, ckpt_layer=0, drop_path_rate=0):
    """Create the vision transformer used as image encoder.

    Args:
        vit: ``'base'`` (width 768, depth 12, 12 heads) or ``'large'``
            (width 1024, depth 24, 16 heads). Both use a patch size of 16.
        image_size: forwarded to ``VisionTransformer`` as ``img_size``.
        use_grad_checkpointing: forwarded to ``VisionTransformer``.
        ckpt_layer: forwarded to ``VisionTransformer``.
        drop_path_rate: stochastic-depth rate. For ``'base'`` it is passed
            through unchanged. For ``'large'`` a falsy value (the default
            ``0``) selects ``0.1``; any other value is passed through.

    Returns:
        Tuple ``(visual_encoder, vision_width)``.

    Raises:
        AssertionError: if ``vit`` is neither ``'base'`` nor ``'large'``.
    """
    assert vit in ['base', 'large'], "vit parameter must be base or large"
    if vit == 'base':
        vision_width = 768
        visual_encoder = VisionTransformer(
            img_size=image_size, patch_size=16, embed_dim=vision_width, depth=12,
            num_heads=12, use_grad_checkpointing=use_grad_checkpointing, ckpt_layer=ckpt_layer,
            drop_path_rate=drop_path_rate,
        )
    elif vit == 'large':
        vision_width = 1024
        visual_encoder = VisionTransformer(
            img_size=image_size, patch_size=16, embed_dim=vision_width, depth=24,
            num_heads=16, use_grad_checkpointing=use_grad_checkpointing, ckpt_layer=ckpt_layer,
            # The previous ``0.1 or drop_path_rate`` always evaluated to 0.1
            # and discarded the argument; keep 0.1 only as the fallback.
            drop_path_rate=drop_path_rate or 0.1,
        )
    return visual_encoder, vision_width
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def is_url(url_or_filename):
    """Return True when ``url_or_filename`` parses with an http or https scheme."""
    scheme = urlparse(url_or_filename).scheme
    return scheme == "http" or scheme == "https"
|
|
52
|
+
|
|
53
|
+
def load_checkpoint(model,url_or_filename):
    """Load checkpoint weights into ``model`` from a URL or a local file.

    Args:
        model: module providing ``visual_encoder`` (and ``visual_encoder_m``
            when its state dict has a ``visual_encoder_m.pos_embed`` entry),
            ``state_dict()`` and ``load_state_dict()``.
        url_or_filename: an http(s) URL, which is downloaded through
            ``download_cached_file``, or the path of an existing file. The
            loaded object must contain the weights under the ``'model'`` key.

    Returns:
        Tuple ``(model, msg)`` where ``msg`` is the value returned by
        ``model.load_state_dict(state_dict, strict=False)``.

    Raises:
        RuntimeError: if ``url_or_filename`` is neither an http(s) URL nor an
            existing file.
    """
    # NOTE(review): torch.load unpickles the file, and for URLs the download
    # is not hash-checked (check_hash=False). Only use trusted checkpoints.
    if is_url(url_or_filename):
        cached_file = download_cached_file(url_or_filename, check_hash=False, progress=True)
        checkpoint = torch.load(cached_file, map_location='cpu')
    elif os.path.isfile(url_or_filename):
        checkpoint = torch.load(url_or_filename, map_location='cpu')
    else:
        raise RuntimeError('checkpoint url or path is invalid')

    state_dict = checkpoint['model']

    # Adapt the checkpoint's positional embeddings to the model's encoder(s).
    state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'], model.visual_encoder)

    # Build the model's state dict once instead of on every loop iteration.
    model_state = model.state_dict()
    if 'visual_encoder_m.pos_embed' in model_state:
        state_dict['visual_encoder_m.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder_m.pos_embed'],
                                                                         model.visual_encoder_m)

    # Drop checkpoint entries whose shape does not match the model; the
    # non-strict load below then leaves those model parameters untouched.
    for key, target in model_state.items():
        if key in state_dict and state_dict[key].shape != target.shape:
            print(key, ": ", state_dict[key].shape, ', ', target.shape)
            del state_dict[key]

    msg = model.load_state_dict(state_dict, strict=False)
    print('load checkpoint from %s'%url_or_filename)
    return model, msg
|
|
77
|
+
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
'''
|
|
2
|
+
* Adapted from BLIP (https://github.com/salesforce/BLIP)
|
|
3
|
+
'''
|
|
4
|
+
|
|
5
|
+
import transformers
|
|
6
|
+
transformers.logging.set_verbosity_error()
|
|
7
|
+
|
|
8
|
+
from torch import nn
|
|
9
|
+
import os
|
|
10
|
+
from .med import BertConfig, BertModel
|
|
11
|
+
from .blip import create_vit, init_tokenizer
|
|
12
|
+
|
|
13
|
+
class BLIP_Pretrain(nn.Module):
    """Container for the BLIP vision encoder, text encoder and projections.

    The constructor builds, in this order: ``visual_encoder``, ``tokenizer``,
    ``text_encoder``, ``vision_proj`` and ``text_proj``.
    """

    def __init__(self,
                 med_config = "med_config.json",
                 image_size = 224,
                 vit = 'base',
                 vit_grad_ckpt = False,
                 vit_ckpt_layer = 0,
                 embed_dim = 256,
                 queue_size = 57600,
                 momentum = 0.995,
                 bert_model_path = ""
                 ):
        """
        Args:
            med_config (str): path for the mixture of encoder-decoder model's configuration file
            image_size (int): input image size
            vit (str): model size of vision transformer
            vit_grad_ckpt: forwarded to ``create_vit`` as its gradient-checkpointing flag
            vit_ckpt_layer: forwarded to ``create_vit`` as ``ckpt_layer``
            embed_dim (int): output size of both projection layers
            queue_size: accepted but not used by this constructor
            momentum: accepted but not used by this constructor
            bert_model_path (str): passed to ``init_tokenizer``
        """
        super().__init__()

        # Vision tower; the drop-path argument is always passed as 0 here.
        encoder, width = create_vit(vit, image_size, vit_grad_ckpt, vit_ckpt_layer, 0)
        self.visual_encoder = encoder

        self.tokenizer = init_tokenizer(bert_model_path)

        # Text tower, configured with the vision width as ``encoder_width``.
        bert_cfg = BertConfig.from_json_file(med_config)
        bert_cfg.encoder_width = width
        self.text_encoder = BertModel(config=bert_cfg, add_pooling_layer=False)

        hidden = self.text_encoder.config.hidden_size

        # Linear heads mapping each tower's features to ``embed_dim``.
        self.vision_proj = nn.Linear(width, embed_dim)
        self.text_proj = nn.Linear(hidden, embed_dim)
|
|
44
|
+
|