diffusers 0.30.3__py3-none-any.whl → 0.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +34 -2
- diffusers/configuration_utils.py +12 -0
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +257 -54
- diffusers/loaders/__init__.py +2 -0
- diffusers/loaders/ip_adapter.py +5 -1
- diffusers/loaders/lora_base.py +14 -7
- diffusers/loaders/lora_conversion_utils.py +332 -0
- diffusers/loaders/lora_pipeline.py +707 -41
- diffusers/loaders/peft.py +1 -0
- diffusers/loaders/single_file_utils.py +81 -4
- diffusers/loaders/textual_inversion.py +2 -0
- diffusers/loaders/unet.py +39 -8
- diffusers/models/__init__.py +4 -0
- diffusers/models/adapter.py +53 -53
- diffusers/models/attention.py +86 -10
- diffusers/models/attention_processor.py +169 -133
- diffusers/models/autoencoders/autoencoder_kl.py +71 -11
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +187 -88
- diffusers/models/controlnet_flux.py +536 -0
- diffusers/models/controlnet_sd3.py +7 -3
- diffusers/models/controlnet_sparsectrl.py +0 -1
- diffusers/models/embeddings.py +170 -61
- diffusers/models/embeddings_flax.py +23 -9
- diffusers/models/model_loading_utils.py +182 -14
- diffusers/models/modeling_utils.py +283 -46
- diffusers/models/normalization.py +79 -0
- diffusers/models/transformers/__init__.py +1 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +1 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +23 -2
- diffusers/models/transformers/pixart_transformer_2d.py +9 -1
- diffusers/models/transformers/transformer_cogview3plus.py +386 -0
- diffusers/models/transformers/transformer_flux.py +161 -44
- diffusers/models/transformers/transformer_sd3.py +7 -1
- diffusers/models/unets/unet_2d_condition.py +8 -8
- diffusers/models/unets/unet_motion_model.py +41 -63
- diffusers/models/upsampling.py +6 -6
- diffusers/pipelines/__init__.py +35 -6
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +44 -20
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -66
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -1
- diffusers/pipelines/auto_pipeline.py +39 -8
- diffusers/pipelines/cogvideo/__init__.py +2 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +30 -17
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +794 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +41 -31
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +42 -29
- diffusers/pipelines/cogview3/__init__.py +47 -0
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
- diffusers/pipelines/cogview3/pipeline_output.py +21 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -1
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +36 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -1
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +17 -3
- diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +3 -1
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
- diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
- diffusers/pipelines/flux/__init__.py +10 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -20
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +984 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +988 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1182 -0
- diffusers/pipelines/flux/pipeline_flux_img2img.py +850 -0
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +1015 -0
- diffusers/pipelines/free_noise_utils.py +365 -5
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +15 -3
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
- diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
- diffusers/pipelines/kolors/tokenizer.py +4 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
- diffusers/pipelines/latte/pipeline_latte.py +2 -2
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
- diffusers/pipelines/lumina/pipeline_lumina.py +2 -2
- diffusers/pipelines/pag/__init__.py +6 -0
- diffusers/pipelines/pag/pag_utils.py +8 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1544 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1685 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +17 -5
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +12 -3
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1091 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
- diffusers/pipelines/pia/pipeline_pia.py +2 -0
- diffusers/pipelines/pipeline_loading_utils.py +225 -27
- diffusers/pipelines/pipeline_utils.py +123 -180
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +28 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +12 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +20 -4
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +3 -3
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -14
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -14
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
- diffusers/quantizers/__init__.py +16 -0
- diffusers/quantizers/auto.py +126 -0
- diffusers/quantizers/base.py +233 -0
- diffusers/quantizers/bitsandbytes/__init__.py +2 -0
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +558 -0
- diffusers/quantizers/bitsandbytes/utils.py +306 -0
- diffusers/quantizers/quantization_config.py +391 -0
- diffusers/schedulers/scheduling_ddim.py +4 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
- diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
- diffusers/schedulers/scheduling_ddpm.py +4 -1
- diffusers/schedulers/scheduling_ddpm_parallel.py +4 -1
- diffusers/schedulers/scheduling_deis_multistep.py +78 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +82 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +80 -1
- diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +82 -1
- diffusers/schedulers/scheduling_edm_euler.py +8 -6
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
- diffusers/schedulers/scheduling_euler_discrete.py +92 -7
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
- diffusers/schedulers/scheduling_heun_discrete.py +114 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
- diffusers/schedulers/scheduling_lms_discrete.py +76 -1
- diffusers/schedulers/scheduling_sasolver.py +78 -1
- diffusers/schedulers/scheduling_unclip.py +4 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +78 -1
- diffusers/training_utils.py +48 -18
- diffusers/utils/__init__.py +2 -1
- diffusers/utils/dummy_pt_objects.py +60 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +165 -0
- diffusers/utils/hub_utils.py +16 -4
- diffusers/utils/import_utils.py +31 -8
- diffusers/utils/loading_utils.py +28 -4
- diffusers/utils/peft_utils.py +3 -3
- diffusers/utils/testing_utils.py +59 -0
- {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/METADATA +7 -6
- {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/RECORD +172 -149
- {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/LICENSE +0 -0
- {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/WHEEL +0 -0
- {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/top_level.txt +0 -0
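The most significant structural addition in the file list above is the new `diffusers/quantizers` package (`auto.py`, `base.py`, the bitsandbytes backend, and `quantization_config.py`). A minimal sketch of the intended usage, assuming the new `BitsAndBytesConfig` mirrors the `transformers` API of the same name; the checkpoint id is illustrative:

```python
import torch
from diffusers import BitsAndBytesConfig, FluxTransformer2DModel

# 4-bit NF4 quantization via the new diffusers.quantizers bitsandbytes backend.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# quantization_config is the new from_pretrained argument wired up through
# modeling_utils.py / model_loading_utils.py in the list above.
transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",  # illustrative checkpoint
    subfolder="transformer",
    quantization_config=quant_config,
)
```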
@@ -53,7 +53,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
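The `"""` → `r"""` switch recurs across many pipelines in this release. Raw docstrings avoid invalid-escape warnings (a `SyntaxWarning` on recent Python) when a docstring contains backslashes. A self-contained illustration:

```python
# A normal string would treat "\s" as an (invalid) escape sequence and warn
# on Python 3.12+; the r prefix keeps the backslash literal.
def documented():
    r"""Mentions \sigma and regex patterns like \d+ safely."""

print(documented.__doc__)
```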
@@ -20,6 +20,7 @@ from huggingface_hub.utils import validate_hf_hub_args
 from ..configuration_utils import ConfigMixin
 from ..utils import is_sentencepiece_available
 from .aura_flow import AuraFlowPipeline
+from .cogview3 import CogView3PlusPipeline
 from .controlnet import (
     StableDiffusionControlNetImg2ImgPipeline,
     StableDiffusionControlNetInpaintPipeline,
@@ -29,7 +30,14 @@ from .controlnet import (
     StableDiffusionXLControlNetPipeline,
 )
 from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
-from .flux import FluxPipeline
+from .flux import (
+    FluxControlNetImg2ImgPipeline,
+    FluxControlNetInpaintPipeline,
+    FluxControlNetPipeline,
+    FluxImg2ImgPipeline,
+    FluxInpaintPipeline,
+    FluxPipeline,
+)
 from .hunyuandit import HunyuanDiTPipeline
 from .kandinsky import (
     KandinskyCombinedPipeline,
@@ -49,12 +57,16 @@ from .kandinsky2_2 import (
 )
 from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
 from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
+from .lumina import LuminaText2ImgPipeline
 from .pag import (
     HunyuanDiTPAGPipeline,
     PixArtSigmaPAGPipeline,
     StableDiffusion3PAGPipeline,
+    StableDiffusionControlNetPAGInpaintPipeline,
     StableDiffusionControlNetPAGPipeline,
+    StableDiffusionPAGImg2ImgPipeline,
     StableDiffusionPAGPipeline,
+    StableDiffusionXLControlNetPAGImg2ImgPipeline,
     StableDiffusionXLControlNetPAGPipeline,
     StableDiffusionXLPAGImg2ImgPipeline,
     StableDiffusionXLPAGInpaintPipeline,
@@ -106,6 +118,9 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("pixart-sigma-pag", PixArtSigmaPAGPipeline),
         ("auraflow", AuraFlowPipeline),
         ("flux", FluxPipeline),
+        ("flux-controlnet", FluxControlNetPipeline),
+        ("lumina", LuminaText2ImgPipeline),
+        ("cogview3", CogView3PlusPipeline),
     ]
 )
 
@@ -119,9 +134,13 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("kandinsky22", KandinskyV22Img2ImgCombinedPipeline),
         ("kandinsky3", Kandinsky3Img2ImgPipeline),
         ("stable-diffusion-controlnet", StableDiffusionControlNetImg2ImgPipeline),
+        ("stable-diffusion-pag", StableDiffusionPAGImg2ImgPipeline),
         ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetImg2ImgPipeline),
         ("stable-diffusion-xl-pag", StableDiffusionXLPAGImg2ImgPipeline),
+        ("stable-diffusion-xl-controlnet-pag", StableDiffusionXLControlNetPAGImg2ImgPipeline),
         ("lcm", LatentConsistencyModelImg2ImgPipeline),
+        ("flux", FluxImg2ImgPipeline),
+        ("flux-controlnet", FluxControlNetImg2ImgPipeline),
     ]
 )
 
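With the `flux` and `flux-controlnet` rows added above, a Flux checkpoint can now be resolved by the image-to-image auto-pipeline. A hedged sketch (checkpoint id illustrative):

```python
import torch
from diffusers import AutoPipelineForImage2Image

# The checkpoint's _class_name ("FluxPipeline") is matched against the
# text-to-image table, then swapped for the img2img class registered above.
pipe = AutoPipelineForImage2Image.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
)
print(type(pipe).__name__)  # FluxImg2ImgPipeline
```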
@@ -134,8 +153,11 @@ AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
         ("kandinsky", KandinskyInpaintCombinedPipeline),
         ("kandinsky22", KandinskyV22InpaintCombinedPipeline),
         ("stable-diffusion-controlnet", StableDiffusionControlNetInpaintPipeline),
+        ("stable-diffusion-controlnet-pag", StableDiffusionControlNetPAGInpaintPipeline),
         ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetInpaintPipeline),
         ("stable-diffusion-xl-pag", StableDiffusionXLPAGInpaintPipeline),
+        ("flux", FluxInpaintPipeline),
+        ("flux-controlnet", FluxControlNetInpaintPipeline),
     ]
 )
 
@@ -161,12 +183,12 @@ _AUTO_INPAINT_DECODER_PIPELINES_MAPPING = OrderedDict(
 )
 
 if is_sentencepiece_available():
-    from .kolors import KolorsPipeline
+    from .kolors import KolorsImg2ImgPipeline, KolorsPipeline
     from .pag import KolorsPAGPipeline
 
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors-pag"] = KolorsPAGPipeline
-    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] =
+    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsImg2ImgPipeline
 
 SUPPORTED_TASKS_MAPPINGS = [
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
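The completed `kolors` registration means Kolors checkpoints also resolve through `AutoPipelineForImage2Image`, provided `sentencepiece` is installed (the whole block is gated on `is_sentencepiece_available()`). A sketch with an illustrative checkpoint id:

```python
from diffusers import AutoPipelineForImage2Image

# Requires sentencepiece; otherwise the kolors mapping is never registered.
pipe = AutoPipelineForImage2Image.from_pretrained("Kwai-Kolors/Kolors-diffusers")
print(type(pipe).__name__)  # KolorsImg2ImgPipeline
```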
@@ -656,12 +678,17 @@ class AutoPipelineForImage2Image(ConfigMixin):
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
         orig_class_name = config["_class_name"]
 
+        # the `orig_class_name` can be:
+        # `- *Pipeline` (for regular text-to-image checkpoint)
+        # `- *Img2ImgPipeline` (for refiner checkpoint)
+        to_replace = "Img2ImgPipeline" if "Img2Img" in config["_class_name"] else "Pipeline"
+
         if "controlnet" in kwargs:
-            orig_class_name =
+            orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
         if "enable_pag" in kwargs:
             enable_pag = kwargs.pop("enable_pag")
             if enable_pag:
-                orig_class_name = orig_class_name.replace(
+                orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
 
         image_2_image_cls = _get_task_class(AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, orig_class_name)
 
@@ -948,13 +975,17 @@ class AutoPipelineForInpainting(ConfigMixin):
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
         orig_class_name = config["_class_name"]
 
+        # The `orig_class_name`` can be:
+        # `- *InpaintPipeline` (for inpaint-specific checkpoint)
+        # - or *Pipeline (for regular text-to-image checkpoint)
+        to_replace = "InpaintPipeline" if "Inpaint" in config["_class_name"] else "Pipeline"
+
         if "controlnet" in kwargs:
-            orig_class_name =
+            orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
         if "enable_pag" in kwargs:
             enable_pag = kwargs.pop("enable_pag")
             if enable_pag:
-                orig_class_name =
-
+                orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
         inpainting_cls = _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, orig_class_name)
 
         kwargs = {**load_config_kwargs, **kwargs}
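The new suffix-aware `to_replace` logic in both classes lets the `controlnet` and `enable_pag` kwargs compose correctly for task-specific checkpoints. A standalone trace of the string surgery (plain Python, no diffusers needed):

```python
# e.g. an SD inpaint checkpoint loaded with both controlnet=... and enable_pag=True
orig_class_name = "StableDiffusionInpaintPipeline"
to_replace = "InpaintPipeline" if "Inpaint" in orig_class_name else "Pipeline"

orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)

# Matches the class registered under "stable-diffusion-controlnet-pag" above.
print(orig_class_name)  # StableDiffusionControlNetPAGInpaintPipeline
```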
@@ -23,6 +23,7 @@ except OptionalDependencyNotAvailable:
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_cogvideox"] = ["CogVideoXPipeline"]
+    _import_structure["pipeline_cogvideox_fun_control"] = ["CogVideoXFunControlPipeline"]
     _import_structure["pipeline_cogvideox_image2video"] = ["CogVideoXImageToVideoPipeline"]
     _import_structure["pipeline_cogvideox_video2video"] = ["CogVideoXVideoToVideoPipeline"]
 
@@ -35,6 +36,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         from ...utils.dummy_torch_and_transformers_objects import *
     else:
         from .pipeline_cogvideox import CogVideoXPipeline
+        from .pipeline_cogvideox_fun_control import CogVideoXFunControlPipeline
         from .pipeline_cogvideox_image2video import CogVideoXImageToVideoPipeline
         from .pipeline_cogvideox_video2video import CogVideoXVideoToVideoPipeline
 
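Both halves of the registration are needed: `_import_structure` feeds diffusers' lazy-import machinery, and the `TYPE_CHECKING`/`DIFFUSERS_SLOW_IMPORT` branch gives static analyzers a real import. The net effect is that the new pipeline is importable from the package root:

```python
# Resolved lazily through _import_structure; the underlying module is only
# loaded when the name is first accessed.
from diffusers import CogVideoXFunControlPipeline
```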
@@ -15,12 +15,13 @@
 
 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 from transformers import T5EncoderModel, T5Tokenizer
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
@@ -85,7 +86,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -136,7 +137,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class CogVideoXPipeline(DiffusionPipeline):
+class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation using CogVideoX.
 
|
|
187
188
|
self.vae_scale_factor_temporal = (
|
188
189
|
self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
|
189
190
|
)
|
191
|
+
self.vae_scaling_factor_image = (
|
192
|
+
self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
|
193
|
+
)
|
190
194
|
|
191
195
|
self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
|
192
196
|
|
@@ -316,6 +320,12 @@ class CogVideoXPipeline(DiffusionPipeline):
|
|
316
320
|
def prepare_latents(
|
317
321
|
self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
|
318
322
|
):
|
323
|
+
if isinstance(generator, list) and len(generator) != batch_size:
|
324
|
+
raise ValueError(
|
325
|
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
326
|
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
327
|
+
)
|
328
|
+
|
319
329
|
shape = (
|
320
330
|
batch_size,
|
321
331
|
(num_frames - 1) // self.vae_scale_factor_temporal + 1,
|
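The batch-size check was moved ahead of the shape computation so a mismatched generator list fails fast. For reference, the pattern it guards (latent shape is illustrative for 49 frames at 480×720):

```python
import torch
from diffusers.utils.torch_utils import randn_tensor

shape = (2, 13, 16, 60, 90)  # (batch, latent frames, channels, h/8, w/8), illustrative
# One generator per batch element gives per-sample reproducibility; a length
# mismatch with the batch size is exactly what the relocated check rejects.
generator = [torch.Generator("cpu").manual_seed(i) for i in range(2)]
latents = randn_tensor(shape, generator=generator, device=torch.device("cpu"), dtype=torch.float32)
```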
@@ -323,11 +333,6 @@ class CogVideoXPipeline(DiffusionPipeline):
             height // self.vae_scale_factor_spatial,
             width // self.vae_scale_factor_spatial,
         )
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
 
         if latents is None:
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
@@ -340,7 +345,7 @@ class CogVideoXPipeline(DiffusionPipeline):
 
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents
 
         frames = self.vae.decode(latents).sample
         return frames
@@ -448,7 +453,6 @@ class CogVideoXPipeline(DiffusionPipeline):
             crops_coords=grid_crops_coords,
             grid_size=(grid_height, grid_width),
             temporal_size=num_frames,
-            use_real=True,
         )
 
         freqs_cos = freqs_cos.to(device=device)
@@ -463,6 +467,10 @@ class CogVideoXPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -488,6 +496,7 @@ class CogVideoXPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -505,14 +514,14 @@ class CogVideoXPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.
-                The height in pixels of the generated image. This is set to
-            width (`int`, *optional*, defaults to self.
-                The width in pixels of the generated image. This is set to
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_width * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_frames (`int`, defaults to `48`):
                 Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                 contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
-                num_seconds is 6 and fps is
+                num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
                 needs to be satisfied is that of divisibility mentioned above.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
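The documented 480×720 defaults follow from the transformer's latent grid times the VAE's spatial downsampling. A quick check of the arithmetic, assuming the config values of the public CogVideoX checkpoints:

```python
sample_height, sample_width = 60, 90   # assumed transformer.config latent grid
vae_scale_factor_spatial = 8           # 2 ** (len(vae.config.block_out_channels) - 1)

print(sample_height * vae_scale_factor_spatial)  # 480
print(sample_width * vae_scale_factor_spatial)   # 720
```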
@@ -549,6 +558,10 @@ class CogVideoXPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -578,8 +591,6 @@ class CogVideoXPipeline(DiffusionPipeline):
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1
 
         # 1. Check inputs. Raise error if not correct
@@ -593,6 +604,7 @@ class CogVideoXPipeline(DiffusionPipeline):
             negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False
 
         # 2. Default call parameters
@@ -674,6 +686,7 @@ class CogVideoXPipeline(DiffusionPipeline):
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()
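End to end, `attention_kwargs` now flows from `__call__` through `self._attention_kwargs` into the transformer's attention processors; together with the new LoRA mixin, its main use is passing a LoRA `scale`. A hedged usage sketch, continuing the LoRA example above:

```python
# Assumes `pipe` from the earlier sketch, with a LoRA adapter loaded.
video = pipe(
    prompt="a panda playing an acoustic guitar in a bamboo forest",
    num_inference_steps=50,
    attention_kwargs={"scale": 0.8},  # assumption: forwarded to scale LoRA layers
).frames[0]
```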