diffusers 0.30.2__py3-none-any.whl → 0.31.0__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- diffusers/__init__.py +38 -2
- diffusers/configuration_utils.py +12 -0
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +257 -54
- diffusers/loaders/__init__.py +2 -0
- diffusers/loaders/ip_adapter.py +5 -1
- diffusers/loaders/lora_base.py +14 -7
- diffusers/loaders/lora_conversion_utils.py +332 -0
- diffusers/loaders/lora_pipeline.py +707 -41
- diffusers/loaders/peft.py +1 -0
- diffusers/loaders/single_file_utils.py +81 -4
- diffusers/loaders/textual_inversion.py +2 -0
- diffusers/loaders/unet.py +39 -8
- diffusers/models/__init__.py +4 -0
- diffusers/models/adapter.py +53 -53
- diffusers/models/attention.py +86 -10
- diffusers/models/attention_processor.py +169 -133
- diffusers/models/autoencoders/autoencoder_kl.py +71 -11
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +287 -85
- diffusers/models/controlnet_flux.py +536 -0
- diffusers/models/controlnet_sd3.py +7 -3
- diffusers/models/controlnet_sparsectrl.py +0 -1
- diffusers/models/embeddings.py +238 -61
- diffusers/models/embeddings_flax.py +23 -9
- diffusers/models/model_loading_utils.py +182 -14
- diffusers/models/modeling_utils.py +283 -46
- diffusers/models/normalization.py +79 -0
- diffusers/models/transformers/__init__.py +1 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +1 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +58 -36
- diffusers/models/transformers/pixart_transformer_2d.py +9 -1
- diffusers/models/transformers/transformer_cogview3plus.py +386 -0
- diffusers/models/transformers/transformer_flux.py +161 -44
- diffusers/models/transformers/transformer_sd3.py +7 -1
- diffusers/models/unets/unet_2d_condition.py +8 -8
- diffusers/models/unets/unet_motion_model.py +41 -63
- diffusers/models/upsampling.py +6 -6
- diffusers/pipelines/__init__.py +40 -7
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +44 -20
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -66
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -1
- diffusers/pipelines/auto_pipeline.py +39 -8
- diffusers/pipelines/cogvideo/__init__.py +6 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +32 -34
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +794 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +837 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +825 -0
- diffusers/pipelines/cogvideo/pipeline_output.py +20 -0
- diffusers/pipelines/cogview3/__init__.py +47 -0
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
- diffusers/pipelines/cogview3/pipeline_output.py +21 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -1
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +36 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -1
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +17 -3
- diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +3 -1
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
- diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
- diffusers/pipelines/flux/__init__.py +10 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -20
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +984 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +988 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1182 -0
- diffusers/pipelines/flux/pipeline_flux_img2img.py +850 -0
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +1015 -0
- diffusers/pipelines/free_noise_utils.py +365 -5
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +15 -3
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
- diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
- diffusers/pipelines/kolors/tokenizer.py +4 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
- diffusers/pipelines/latte/pipeline_latte.py +2 -2
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
- diffusers/pipelines/lumina/pipeline_lumina.py +2 -2
- diffusers/pipelines/pag/__init__.py +6 -0
- diffusers/pipelines/pag/pag_utils.py +8 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1544 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1685 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +17 -5
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +12 -3
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1091 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
- diffusers/pipelines/pia/pipeline_pia.py +2 -0
- diffusers/pipelines/pipeline_loading_utils.py +225 -27
- diffusers/pipelines/pipeline_utils.py +123 -180
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +28 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +12 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +20 -4
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +3 -3
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -14
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -14
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
- diffusers/quantizers/__init__.py +16 -0
- diffusers/quantizers/auto.py +126 -0
- diffusers/quantizers/base.py +233 -0
- diffusers/quantizers/bitsandbytes/__init__.py +2 -0
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +558 -0
- diffusers/quantizers/bitsandbytes/utils.py +306 -0
- diffusers/quantizers/quantization_config.py +391 -0
- diffusers/schedulers/scheduling_ddim.py +4 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
- diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
- diffusers/schedulers/scheduling_ddpm.py +4 -1
- diffusers/schedulers/scheduling_ddpm_parallel.py +4 -1
- diffusers/schedulers/scheduling_deis_multistep.py +78 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +82 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +80 -1
- diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +82 -1
- diffusers/schedulers/scheduling_edm_euler.py +8 -6
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
- diffusers/schedulers/scheduling_euler_discrete.py +92 -7
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
- diffusers/schedulers/scheduling_heun_discrete.py +114 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
- diffusers/schedulers/scheduling_lms_discrete.py +76 -1
- diffusers/schedulers/scheduling_sasolver.py +78 -1
- diffusers/schedulers/scheduling_unclip.py +4 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +78 -1
- diffusers/training_utils.py +48 -18
- diffusers/utils/__init__.py +2 -1
- diffusers/utils/dummy_pt_objects.py +60 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +195 -0
- diffusers/utils/hub_utils.py +16 -4
- diffusers/utils/import_utils.py +31 -8
- diffusers/utils/loading_utils.py +28 -4
- diffusers/utils/peft_utils.py +3 -3
- diffusers/utils/testing_utils.py +59 -0
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/METADATA +7 -6
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/RECORD +173 -147
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/WHEEL +1 -1
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/LICENSE +0 -0
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/top_level.txt +0 -0
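The largest addition in the file list above is the new `diffusers/quantizers` package (`auto.py`, `base.py`, the `bitsandbytes` backend, and `quantization_config.py`). As a minimal sketch of how those pieces are used together — assuming the optional `bitsandbytes` dependency is installed, and using an illustrative Flux checkpoint — model-level quantization is driven by a `BitsAndBytesConfig` passed to `from_pretrained`:

```python
# Minimal sketch of the new 0.31.0 quantization entry point.
# Requires bitsandbytes; the model id is illustrative.
import torch
from diffusers import BitsAndBytesConfig, FluxTransformer2DModel

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=nf4_config,
)
```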
diffusers/pipelines/aura_flow/pipeline_aura_flow.py

@@ -53,7 +53,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
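For context, the `"""` → `r"""` change (repeated in several pipelines in this release) marks the docstring as a raw string so backslash sequences inside it stay literal instead of being interpreted as escapes. A standalone illustration, not taken from the diff:

```python
def plain():
    """Line one\nLine two"""  # "\n" becomes a real newline in __doc__

def raw():
    r"""Line one\nLine two"""  # backslash-n stays two literal characters

assert "\n" in plain.__doc__
assert "\\n" in raw.__doc__
```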
diffusers/pipelines/auto_pipeline.py

@@ -20,6 +20,7 @@ from huggingface_hub.utils import validate_hf_hub_args
 from ..configuration_utils import ConfigMixin
 from ..utils import is_sentencepiece_available
 from .aura_flow import AuraFlowPipeline
+from .cogview3 import CogView3PlusPipeline
 from .controlnet import (
     StableDiffusionControlNetImg2ImgPipeline,
     StableDiffusionControlNetInpaintPipeline,
@@ -29,7 +30,14 @@ from .controlnet import (
     StableDiffusionXLControlNetPipeline,
 )
 from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
-from .flux import FluxPipeline
+from .flux import (
+    FluxControlNetImg2ImgPipeline,
+    FluxControlNetInpaintPipeline,
+    FluxControlNetPipeline,
+    FluxImg2ImgPipeline,
+    FluxInpaintPipeline,
+    FluxPipeline,
+)
 from .hunyuandit import HunyuanDiTPipeline
 from .kandinsky import (
     KandinskyCombinedPipeline,
@@ -49,12 +57,16 @@ from .kandinsky2_2 import (
 )
 from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
 from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
+from .lumina import LuminaText2ImgPipeline
 from .pag import (
     HunyuanDiTPAGPipeline,
     PixArtSigmaPAGPipeline,
     StableDiffusion3PAGPipeline,
+    StableDiffusionControlNetPAGInpaintPipeline,
     StableDiffusionControlNetPAGPipeline,
+    StableDiffusionPAGImg2ImgPipeline,
     StableDiffusionPAGPipeline,
+    StableDiffusionXLControlNetPAGImg2ImgPipeline,
     StableDiffusionXLControlNetPAGPipeline,
     StableDiffusionXLPAGImg2ImgPipeline,
     StableDiffusionXLPAGInpaintPipeline,
@@ -106,6 +118,9 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("pixart-sigma-pag", PixArtSigmaPAGPipeline),
         ("auraflow", AuraFlowPipeline),
         ("flux", FluxPipeline),
+        ("flux-controlnet", FluxControlNetPipeline),
+        ("lumina", LuminaText2ImgPipeline),
+        ("cogview3", CogView3PlusPipeline),
     ]
 )
 
@@ -119,9 +134,13 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("kandinsky22", KandinskyV22Img2ImgCombinedPipeline),
         ("kandinsky3", Kandinsky3Img2ImgPipeline),
         ("stable-diffusion-controlnet", StableDiffusionControlNetImg2ImgPipeline),
+        ("stable-diffusion-pag", StableDiffusionPAGImg2ImgPipeline),
         ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetImg2ImgPipeline),
         ("stable-diffusion-xl-pag", StableDiffusionXLPAGImg2ImgPipeline),
+        ("stable-diffusion-xl-controlnet-pag", StableDiffusionXLControlNetPAGImg2ImgPipeline),
         ("lcm", LatentConsistencyModelImg2ImgPipeline),
+        ("flux", FluxImg2ImgPipeline),
+        ("flux-controlnet", FluxControlNetImg2ImgPipeline),
     ]
 )
 
@@ -134,8 +153,11 @@ AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
         ("kandinsky", KandinskyInpaintCombinedPipeline),
         ("kandinsky22", KandinskyV22InpaintCombinedPipeline),
         ("stable-diffusion-controlnet", StableDiffusionControlNetInpaintPipeline),
+        ("stable-diffusion-controlnet-pag", StableDiffusionControlNetPAGInpaintPipeline),
         ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetInpaintPipeline),
         ("stable-diffusion-xl-pag", StableDiffusionXLPAGInpaintPipeline),
+        ("flux", FluxInpaintPipeline),
+        ("flux-controlnet", FluxControlNetInpaintPipeline),
     ]
 )
 
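With `flux` and `flux-controlnet` entries registered in all three task mappings above, the `AutoPipelineFor*` classes can now resolve Flux checkpoints directly. A minimal sketch; the checkpoint id is illustrative, and any Flux repo with a standard `model_index.json` resolves the same way:

```python
import torch
from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image

# Resolves to FluxPipeline through the new "flux" text2image entry.
pipe = AutoPipelineForText2Image.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
)

# Reuses the loaded components and resolves FluxImg2ImgPipeline
# through the new "flux" image2image entry.
img2img = AutoPipelineForImage2Image.from_pipe(pipe)
```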
@@ -161,12 +183,12 @@ _AUTO_INPAINT_DECODER_PIPELINES_MAPPING = OrderedDict(
 )
 
 if is_sentencepiece_available():
-    from .kolors import KolorsPipeline
+    from .kolors import KolorsImg2ImgPipeline, KolorsPipeline
     from .pag import KolorsPAGPipeline
 
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors-pag"] = KolorsPAGPipeline
-    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
+    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsImg2ImgPipeline
 
 SUPPORTED_TASKS_MAPPINGS = [
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
@@ -656,12 +678,17 @@ class AutoPipelineForImage2Image(ConfigMixin):
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
         orig_class_name = config["_class_name"]
 
+        # the `orig_class_name` can be:
+        # `- *Pipeline` (for regular text-to-image checkpoint)
+        # `- *Img2ImgPipeline` (for refiner checkpoint)
+        to_replace = "Img2ImgPipeline" if "Img2Img" in config["_class_name"] else "Pipeline"
+
         if "controlnet" in kwargs:
-            orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline")
+            orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
         if "enable_pag" in kwargs:
             enable_pag = kwargs.pop("enable_pag")
             if enable_pag:
-                orig_class_name = orig_class_name.replace("Pipeline", "PAGPipeline")
+                orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
 
         image_2_image_cls = _get_task_class(AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, orig_class_name)
@@ -948,13 +975,17 @@ class AutoPipelineForInpainting(ConfigMixin):
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
         orig_class_name = config["_class_name"]
 
+        # The `orig_class_name`` can be:
+        # `- *InpaintPipeline` (for inpaint-specific checkpoint)
+        # - or *Pipeline (for regular text-to-image checkpoint)
+        to_replace = "InpaintPipeline" if "Inpaint" in config["_class_name"] else "Pipeline"
+
         if "controlnet" in kwargs:
-            orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline")
+            orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
         if "enable_pag" in kwargs:
             enable_pag = kwargs.pop("enable_pag")
             if enable_pag:
-                orig_class_name = orig_class_name.replace("Pipeline", "PAGPipeline")
-
+                orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
         inpainting_cls = _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, orig_class_name)
 
         kwargs = {**load_config_kwargs, **kwargs}
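The `to_replace` refactor above changes how both auto classes derive the target class name: instead of always substituting on `"Pipeline"`, they substitute on the task-specific suffix when the checkpoint's `_class_name` already carries one. A standalone illustration of the rule in plain Python (not diffusers API):

```python
def rewrite(orig_class_name: str, controlnet: bool, enable_pag: bool) -> str:
    # Mirrors the new logic in AutoPipelineForInpainting.from_pretrained.
    to_replace = "InpaintPipeline" if "Inpaint" in orig_class_name else "Pipeline"
    if controlnet:
        orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
    if enable_pag:
        orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
    return orig_class_name

# An inpaint-specific checkpoint keeps its Inpaint suffix intact, landing on the
# StableDiffusionControlNetPAGInpaintPipeline entry added to the mapping above:
assert (
    rewrite("StableDiffusionInpaintPipeline", controlnet=True, enable_pag=True)
    == "StableDiffusionControlNetPAGInpaintPipeline"
)
```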
diffusers/pipelines/cogvideo/__init__.py

@@ -23,6 +23,9 @@ except OptionalDependencyNotAvailable:
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_cogvideox"] = ["CogVideoXPipeline"]
+    _import_structure["pipeline_cogvideox_fun_control"] = ["CogVideoXFunControlPipeline"]
+    _import_structure["pipeline_cogvideox_image2video"] = ["CogVideoXImageToVideoPipeline"]
+    _import_structure["pipeline_cogvideox_video2video"] = ["CogVideoXVideoToVideoPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
@@ -33,6 +36,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         from ...utils.dummy_torch_and_transformers_objects import *
     else:
         from .pipeline_cogvideox import CogVideoXPipeline
+        from .pipeline_cogvideox_fun_control import CogVideoXFunControlPipeline
+        from .pipeline_cogvideox_image2video import CogVideoXImageToVideoPipeline
+        from .pipeline_cogvideox_video2video import CogVideoXVideoToVideoPipeline
 
 else:
     import sys
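These three pipelines are also re-exported at the package root (the `diffusers/__init__.py +38 -2` entry in the file list), so the lazy-import plumbing here is what makes the following work:

```python
# New in 0.31.0: image-to-video, video-to-video, and Fun-style
# control variants of CogVideoX, importable from the top level.
from diffusers import (
    CogVideoXFunControlPipeline,
    CogVideoXImageToVideoPipeline,
    CogVideoXVideoToVideoPipeline,
)
```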
diffusers/pipelines/cogvideo/pipeline_cogvideox.py

@@ -15,20 +15,21 @@
 
 import inspect
 import math
-from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 from transformers import T5EncoderModel, T5Tokenizer
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
 from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from ...utils import BaseOutput, logging, replace_example_docstring
+from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
+from .pipeline_output import CogVideoXPipelineOutput
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -85,7 +86,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -136,22 +137,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-@dataclass
-class CogVideoXPipelineOutput(BaseOutput):
-    r"""
-    Output class for CogVideo pipelines.
-
-    Args:
-        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
-            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
-            `(batch_size, num_frames, channels, height, width)`.
-    """
-
-    frames: torch.Tensor
-
-
-class CogVideoXPipeline(DiffusionPipeline):
+class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation using CogVideoX.
 
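Note that `CogVideoXPipelineOutput` is not removed, only relocated to the new `diffusers/pipelines/cogvideo/pipeline_output.py` (+20 in the file list) so all four CogVideoX pipelines can share it. Code that imported it from the pipeline module needs the new path:

```python
# New canonical location of the shared output dataclass:
from diffusers.pipelines.cogvideo.pipeline_output import CogVideoXPipelineOutput
```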
@@ -202,6 +188,9 @@ class CogVideoXPipeline(DiffusionPipeline):
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
@@ -331,6 +320,12 @@ class CogVideoXPipeline(DiffusionPipeline):
     def prepare_latents(
         self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         shape = (
             batch_size,
             (num_frames - 1) // self.vae_scale_factor_temporal + 1,
@@ -338,11 +333,6 @@ class CogVideoXPipeline(DiffusionPipeline):
             height // self.vae_scale_factor_spatial,
             width // self.vae_scale_factor_spatial,
         )
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
 
         if latents is None:
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
@@ -355,7 +345,7 @@ class CogVideoXPipeline(DiffusionPipeline):
 
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents
 
         frames = self.vae.decode(latents).sample
         return frames
@@ -463,7 +453,6 @@ class CogVideoXPipeline(DiffusionPipeline):
             crops_coords=grid_crops_coords,
             grid_size=(grid_height, grid_width),
             temporal_size=num_frames,
-            use_real=True,
         )
 
         freqs_cos = freqs_cos.to(device=device)
@@ -478,6 +467,10 @@ class CogVideoXPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -503,6 +496,7 @@ class CogVideoXPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -520,14 +514,14 @@ class CogVideoXPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_frames (`int`, defaults to `48`):
                 Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                 contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
-                num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that
+                num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
                 needs to be satisfied is that of divisibility mentioned above.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -564,6 +558,10 @@ class CogVideoXPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -593,8 +591,6 @@ class CogVideoXPipeline(DiffusionPipeline):
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1
 
         # 1. Check inputs. Raise error if not correct
@@ -608,6 +604,7 @@ class CogVideoXPipeline(DiffusionPipeline):
             negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False
 
         # 2. Default call parameters
@@ -689,6 +686,7 @@ class CogVideoXPipeline(DiffusionPipeline):
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()