diffusers 0.30.2__py3-none-any.whl → 0.31.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. diffusers/__init__.py +38 -2
  2. diffusers/configuration_utils.py +12 -0
  3. diffusers/dependency_versions_table.py +1 -1
  4. diffusers/image_processor.py +257 -54
  5. diffusers/loaders/__init__.py +2 -0
  6. diffusers/loaders/ip_adapter.py +5 -1
  7. diffusers/loaders/lora_base.py +14 -7
  8. diffusers/loaders/lora_conversion_utils.py +332 -0
  9. diffusers/loaders/lora_pipeline.py +707 -41
  10. diffusers/loaders/peft.py +1 -0
  11. diffusers/loaders/single_file_utils.py +81 -4
  12. diffusers/loaders/textual_inversion.py +2 -0
  13. diffusers/loaders/unet.py +39 -8
  14. diffusers/models/__init__.py +4 -0
  15. diffusers/models/adapter.py +53 -53
  16. diffusers/models/attention.py +86 -10
  17. diffusers/models/attention_processor.py +169 -133
  18. diffusers/models/autoencoders/autoencoder_kl.py +71 -11
  19. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +287 -85
  20. diffusers/models/controlnet_flux.py +536 -0
  21. diffusers/models/controlnet_sd3.py +7 -3
  22. diffusers/models/controlnet_sparsectrl.py +0 -1
  23. diffusers/models/embeddings.py +238 -61
  24. diffusers/models/embeddings_flax.py +23 -9
  25. diffusers/models/model_loading_utils.py +182 -14
  26. diffusers/models/modeling_utils.py +283 -46
  27. diffusers/models/normalization.py +79 -0
  28. diffusers/models/transformers/__init__.py +1 -0
  29. diffusers/models/transformers/auraflow_transformer_2d.py +1 -0
  30. diffusers/models/transformers/cogvideox_transformer_3d.py +58 -36
  31. diffusers/models/transformers/pixart_transformer_2d.py +9 -1
  32. diffusers/models/transformers/transformer_cogview3plus.py +386 -0
  33. diffusers/models/transformers/transformer_flux.py +161 -44
  34. diffusers/models/transformers/transformer_sd3.py +7 -1
  35. diffusers/models/unets/unet_2d_condition.py +8 -8
  36. diffusers/models/unets/unet_motion_model.py +41 -63
  37. diffusers/models/upsampling.py +6 -6
  38. diffusers/pipelines/__init__.py +40 -7
  39. diffusers/pipelines/animatediff/__init__.py +2 -0
  40. diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
  41. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +44 -20
  42. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
  43. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +2 -0
  44. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -66
  45. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
  46. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -1
  47. diffusers/pipelines/auto_pipeline.py +39 -8
  48. diffusers/pipelines/cogvideo/__init__.py +6 -0
  49. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +32 -34
  50. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +794 -0
  51. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +837 -0
  52. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +825 -0
  53. diffusers/pipelines/cogvideo/pipeline_output.py +20 -0
  54. diffusers/pipelines/cogview3/__init__.py +47 -0
  55. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
  56. diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  57. diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -1
  58. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -0
  59. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +8 -0
  60. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +36 -13
  61. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -1
  62. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -1
  63. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +17 -3
  64. diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
  65. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +3 -1
  66. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
  67. diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
  68. diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
  69. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
  70. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
  71. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
  72. diffusers/pipelines/flux/__init__.py +10 -0
  73. diffusers/pipelines/flux/pipeline_flux.py +53 -20
  74. diffusers/pipelines/flux/pipeline_flux_controlnet.py +984 -0
  75. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +988 -0
  76. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1182 -0
  77. diffusers/pipelines/flux/pipeline_flux_img2img.py +850 -0
  78. diffusers/pipelines/flux/pipeline_flux_inpaint.py +1015 -0
  79. diffusers/pipelines/free_noise_utils.py +365 -5
  80. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +15 -3
  81. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
  82. diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
  83. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
  84. diffusers/pipelines/kolors/tokenizer.py +4 -0
  85. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
  86. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
  87. diffusers/pipelines/latte/pipeline_latte.py +2 -2
  88. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
  89. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
  90. diffusers/pipelines/lumina/pipeline_lumina.py +2 -2
  91. diffusers/pipelines/pag/__init__.py +6 -0
  92. diffusers/pipelines/pag/pag_utils.py +8 -2
  93. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -1
  94. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1544 -0
  95. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +2 -2
  96. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1685 -0
  97. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +17 -5
  98. diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
  99. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +1 -1
  100. diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
  101. diffusers/pipelines/pag/pipeline_pag_sd_3.py +12 -3
  102. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
  103. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1091 -0
  104. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
  105. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
  106. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
  107. diffusers/pipelines/pia/pipeline_pia.py +2 -0
  108. diffusers/pipelines/pipeline_loading_utils.py +225 -27
  109. diffusers/pipelines/pipeline_utils.py +123 -180
  110. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
  111. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
  112. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
  113. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
  114. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +28 -6
  115. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
  116. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
  117. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
  118. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +12 -3
  119. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +20 -4
  120. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +3 -3
  121. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  122. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
  123. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
  124. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -4
  125. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -14
  126. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -14
  127. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
  128. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
  129. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
  130. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
  131. diffusers/quantizers/__init__.py +16 -0
  132. diffusers/quantizers/auto.py +126 -0
  133. diffusers/quantizers/base.py +233 -0
  134. diffusers/quantizers/bitsandbytes/__init__.py +2 -0
  135. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +558 -0
  136. diffusers/quantizers/bitsandbytes/utils.py +306 -0
  137. diffusers/quantizers/quantization_config.py +391 -0
  138. diffusers/schedulers/scheduling_ddim.py +4 -1
  139. diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
  140. diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
  141. diffusers/schedulers/scheduling_ddpm.py +4 -1
  142. diffusers/schedulers/scheduling_ddpm_parallel.py +4 -1
  143. diffusers/schedulers/scheduling_deis_multistep.py +78 -1
  144. diffusers/schedulers/scheduling_dpmsolver_multistep.py +82 -1
  145. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +80 -1
  146. diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
  147. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +82 -1
  148. diffusers/schedulers/scheduling_edm_euler.py +8 -6
  149. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
  150. diffusers/schedulers/scheduling_euler_discrete.py +92 -7
  151. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
  152. diffusers/schedulers/scheduling_heun_discrete.py +114 -8
  153. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
  154. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
  155. diffusers/schedulers/scheduling_lms_discrete.py +76 -1
  156. diffusers/schedulers/scheduling_sasolver.py +78 -1
  157. diffusers/schedulers/scheduling_unclip.py +4 -1
  158. diffusers/schedulers/scheduling_unipc_multistep.py +78 -1
  159. diffusers/training_utils.py +48 -18
  160. diffusers/utils/__init__.py +2 -1
  161. diffusers/utils/dummy_pt_objects.py +60 -0
  162. diffusers/utils/dummy_torch_and_transformers_objects.py +195 -0
  163. diffusers/utils/hub_utils.py +16 -4
  164. diffusers/utils/import_utils.py +31 -8
  165. diffusers/utils/loading_utils.py +28 -4
  166. diffusers/utils/peft_utils.py +3 -3
  167. diffusers/utils/testing_utils.py +59 -0
  168. {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/METADATA +7 -6
  169. {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/RECORD +173 -147
  170. {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/WHEEL +1 -1
  171. {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/LICENSE +0 -0
  172. {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/entry_points.txt +0 -0
  173. {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/aura_flow/pipeline_aura_flow.py
@@ -53,7 +53,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
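The change above is only the r-prefix on the docstring. For context (not part of the diff): a plain docstring containing backslash escapes such as \d triggers an "invalid escape sequence" warning on recent Python versions, while a raw docstring does not. A hypothetical minimal repro, not code from diffusers:

    def plain():
        """matches \d"""  # DeprecationWarning/SyntaxWarning: invalid escape sequence '\d'

    def raw():
        r"""matches \d"""  # raw docstring: no warning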
diffusers/pipelines/auto_pipeline.py
@@ -20,6 +20,7 @@ from huggingface_hub.utils import validate_hf_hub_args
 from ..configuration_utils import ConfigMixin
 from ..utils import is_sentencepiece_available
 from .aura_flow import AuraFlowPipeline
+from .cogview3 import CogView3PlusPipeline
 from .controlnet import (
     StableDiffusionControlNetImg2ImgPipeline,
     StableDiffusionControlNetInpaintPipeline,
@@ -29,7 +30,14 @@ from .controlnet import (
     StableDiffusionXLControlNetPipeline,
 )
 from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
-from .flux import FluxPipeline
+from .flux import (
+    FluxControlNetImg2ImgPipeline,
+    FluxControlNetInpaintPipeline,
+    FluxControlNetPipeline,
+    FluxImg2ImgPipeline,
+    FluxInpaintPipeline,
+    FluxPipeline,
+)
 from .hunyuandit import HunyuanDiTPipeline
 from .kandinsky import (
     KandinskyCombinedPipeline,
@@ -49,12 +57,16 @@ from .kandinsky2_2 import (
 )
 from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
 from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
+from .lumina import LuminaText2ImgPipeline
 from .pag import (
     HunyuanDiTPAGPipeline,
     PixArtSigmaPAGPipeline,
     StableDiffusion3PAGPipeline,
+    StableDiffusionControlNetPAGInpaintPipeline,
     StableDiffusionControlNetPAGPipeline,
+    StableDiffusionPAGImg2ImgPipeline,
     StableDiffusionPAGPipeline,
+    StableDiffusionXLControlNetPAGImg2ImgPipeline,
     StableDiffusionXLControlNetPAGPipeline,
     StableDiffusionXLPAGImg2ImgPipeline,
     StableDiffusionXLPAGInpaintPipeline,
@@ -106,6 +118,9 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("pixart-sigma-pag", PixArtSigmaPAGPipeline),
         ("auraflow", AuraFlowPipeline),
         ("flux", FluxPipeline),
+        ("flux-controlnet", FluxControlNetPipeline),
+        ("lumina", LuminaText2ImgPipeline),
+        ("cogview3", CogView3PlusPipeline),
     ]
 )
 
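These three entries extend the dispatch table used by AutoPipelineForText2Image. A usage sketch; the checkpoint id is an assumption, not something this diff pins down:

    import torch
    from diffusers import AutoPipelineForText2Image

    # a config with "_class_name" == "CogView3PlusPipeline" resolves via the new "cogview3" entry
    pipe = AutoPipelineForText2Image.from_pretrained(
        "THUDM/CogView3-Plus-3B",  # assumed repo id
        torch_dtype=torch.bfloat16,
    )
    image = pipe("a lighthouse at dusk").images[0]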
@@ -119,9 +134,13 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("kandinsky22", KandinskyV22Img2ImgCombinedPipeline),
         ("kandinsky3", Kandinsky3Img2ImgPipeline),
         ("stable-diffusion-controlnet", StableDiffusionControlNetImg2ImgPipeline),
+        ("stable-diffusion-pag", StableDiffusionPAGImg2ImgPipeline),
         ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetImg2ImgPipeline),
         ("stable-diffusion-xl-pag", StableDiffusionXLPAGImg2ImgPipeline),
+        ("stable-diffusion-xl-controlnet-pag", StableDiffusionXLControlNetPAGImg2ImgPipeline),
         ("lcm", LatentConsistencyModelImg2ImgPipeline),
+        ("flux", FluxImg2ImgPipeline),
+        ("flux-controlnet", FluxControlNetImg2ImgPipeline),
     ]
 )
 
@@ -134,8 +153,11 @@ AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
         ("kandinsky", KandinskyInpaintCombinedPipeline),
         ("kandinsky22", KandinskyV22InpaintCombinedPipeline),
         ("stable-diffusion-controlnet", StableDiffusionControlNetInpaintPipeline),
+        ("stable-diffusion-controlnet-pag", StableDiffusionControlNetPAGInpaintPipeline),
         ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetInpaintPipeline),
         ("stable-diffusion-xl-pag", StableDiffusionXLPAGInpaintPipeline),
+        ("flux", FluxInpaintPipeline),
+        ("flux-controlnet", FluxControlNetInpaintPipeline),
     ]
 )
 
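With the "flux" rows in the image-to-image and inpainting tables, a plain Flux text-to-image checkpoint can now be loaded through the task-specific auto classes. A sketch, assuming the public FLUX.1-dev repo:

    import torch
    from diffusers import AutoPipelineForInpainting

    pipe = AutoPipelineForInpainting.from_pretrained(
        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
    )
    print(type(pipe).__name__)  # FluxInpaintPipeline, via the new "flux" entry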
@@ -161,12 +183,12 @@ _AUTO_INPAINT_DECODER_PIPELINES_MAPPING = OrderedDict(
 )
 
 if is_sentencepiece_available():
-    from .kolors import KolorsPipeline
+    from .kolors import KolorsImg2ImgPipeline, KolorsPipeline
     from .pag import KolorsPAGPipeline
 
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors-pag"] = KolorsPAGPipeline
-    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
+    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsImg2ImgPipeline
 
 SUPPORTED_TASKS_MAPPINGS = [
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
@@ -656,12 +678,17 @@ class AutoPipelineForImage2Image(ConfigMixin):
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
         orig_class_name = config["_class_name"]
 
+        # the `orig_class_name` can be:
+        # `- *Pipeline` (for regular text-to-image checkpoint)
+        # `- *Img2ImgPipeline` (for refiner checkpoint)
+        to_replace = "Img2ImgPipeline" if "Img2Img" in config["_class_name"] else "Pipeline"
+
         if "controlnet" in kwargs:
-            orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline")
+            orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
         if "enable_pag" in kwargs:
             enable_pag = kwargs.pop("enable_pag")
             if enable_pag:
-                orig_class_name = orig_class_name.replace("Pipeline", "PAGPipeline")
+                orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
 
         image_2_image_cls = _get_task_class(AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, orig_class_name)
 
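The effect of the new to_replace logic, restated as a standalone sketch (a hypothetical helper, not diffusers API): a refiner-style *Img2ImgPipeline class name keeps its suffix when the ControlNet or PAG variant is derived, whereas the old blind "Pipeline" replacement produced a class name that does not exist in the mapping.

    def derive_class_name(orig_class_name: str, controlnet: bool = False, enable_pag: bool = False) -> str:
        # mirrors the replacement rule added to AutoPipelineForImage2Image.from_pretrained
        to_replace = "Img2ImgPipeline" if "Img2Img" in orig_class_name else "Pipeline"
        if controlnet:
            orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
        if enable_pag:
            orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
        return orig_class_name

    # the old rule would have yielded "StableDiffusionXLImg2ImgControlNetPipeline"
    assert (
        derive_class_name("StableDiffusionXLImg2ImgPipeline", controlnet=True)
        == "StableDiffusionXLControlNetImg2ImgPipeline"
    )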
@@ -948,13 +975,17 @@ class AutoPipelineForInpainting(ConfigMixin):
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
         orig_class_name = config["_class_name"]
 
+        # The `orig_class_name`` can be:
+        # `- *InpaintPipeline` (for inpaint-specific checkpoint)
+        # - or *Pipeline (for regular text-to-image checkpoint)
+        to_replace = "InpaintPipeline" if "Inpaint" in config["_class_name"] else "Pipeline"
+
         if "controlnet" in kwargs:
-            orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline")
+            orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
         if "enable_pag" in kwargs:
             enable_pag = kwargs.pop("enable_pag")
             if enable_pag:
-                orig_class_name = config["_class_name"].replace("Pipeline", "PAGPipeline")
-
+                orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
         inpainting_cls = _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, orig_class_name)
 
         kwargs = {**load_config_kwargs, **kwargs}
diffusers/pipelines/cogvideo/__init__.py
@@ -23,6 +23,9 @@ except OptionalDependencyNotAvailable:
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_cogvideox"] = ["CogVideoXPipeline"]
+    _import_structure["pipeline_cogvideox_fun_control"] = ["CogVideoXFunControlPipeline"]
+    _import_structure["pipeline_cogvideox_image2video"] = ["CogVideoXImageToVideoPipeline"]
+    _import_structure["pipeline_cogvideox_video2video"] = ["CogVideoXVideoToVideoPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
@@ -33,6 +36,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         from ...utils.dummy_torch_and_transformers_objects import *
     else:
         from .pipeline_cogvideox import CogVideoXPipeline
+        from .pipeline_cogvideox_fun_control import CogVideoXFunControlPipeline
+        from .pipeline_cogvideox_image2video import CogVideoXImageToVideoPipeline
+        from .pipeline_cogvideox_video2video import CogVideoXVideoToVideoPipeline
 
 else:
     import sys
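The three new pipelines are also re-exported from the top-level diffusers namespace (see the dummy-object additions in the file list). An image-to-video sketch; the checkpoint id is an assumption:

    import torch
    from diffusers import CogVideoXImageToVideoPipeline
    from diffusers.utils import export_to_video, load_image

    pipe = CogVideoXImageToVideoPipeline.from_pretrained(
        "THUDM/CogVideoX-5b-I2V",  # assumed repo id
        torch_dtype=torch.bfloat16,
    )
    image = load_image("input.png")
    frames = pipe(image=image, prompt="slow pan across the scene").frames[0]
    export_to_video(frames, "output.mp4", fps=8)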
diffusers/pipelines/cogvideo/pipeline_cogvideox.py
@@ -15,20 +15,21 @@
 
 import inspect
 import math
-from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 from transformers import T5EncoderModel, T5Tokenizer
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
 from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from ...utils import BaseOutput, logging, replace_example_docstring
+from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
+from .pipeline_output import CogVideoXPipelineOutput
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -85,7 +86,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -136,22 +137,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-@dataclass
-class CogVideoXPipelineOutput(BaseOutput):
-    r"""
-    Output class for CogVideo pipelines.
-
-    Args:
-        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
-            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
-            `(batch_size, num_frames, channels, height, width)`.
-    """
-
-    frames: torch.Tensor
-
-
-class CogVideoXPipeline(DiffusionPipeline):
+class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation using CogVideoX.
 
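Adding CogVideoXLoraLoaderMixin to the base classes gives the pipeline the standard LoRA entry points (load_lora_weights and friends, backed by the lora_pipeline.py additions in the file list). A sketch; the LoRA repo id is a placeholder:

    import torch
    from diffusers import CogVideoXPipeline

    pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16)
    pipe.load_lora_weights("your-username/cogvideox-lora", adapter_name="style")  # placeholder id
    frames = pipe("a koala playing in the rain").frames[0]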
@@ -202,6 +188,9 @@ class CogVideoXPipeline(DiffusionPipeline):
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
@@ -331,6 +320,12 @@ class CogVideoXPipeline(DiffusionPipeline):
     def prepare_latents(
         self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         shape = (
             batch_size,
             (num_frames - 1) // self.vae_scale_factor_temporal + 1,
@@ -338,11 +333,6 @@ class CogVideoXPipeline(DiffusionPipeline):
             height // self.vae_scale_factor_spatial,
             width // self.vae_scale_factor_spatial,
         )
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
 
         if latents is None:
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
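The relocated check enforces, before the latent shape is even built, the contract that randn_tensor itself assumes: a list of generators must match the batch dimension. A sketch with an illustrative latent shape:

    import torch
    from diffusers.utils.torch_utils import randn_tensor

    shape = (2, 13, 16, 60, 90)  # (batch, latent frames, channels, height/8, width/8), illustrative
    generators = [torch.Generator("cpu").manual_seed(i) for i in range(shape[0])]
    # one generator per sample; a length mismatch now fails fast in prepare_latents
    latents = randn_tensor(shape, generator=generators, device=torch.device("cpu"), dtype=torch.float32)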
@@ -355,7 +345,7 @@ class CogVideoXPipeline(DiffusionPipeline):
 
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents
 
         frames = self.vae.decode(latents).sample
         return frames
@@ -463,7 +453,6 @@ class CogVideoXPipeline(DiffusionPipeline):
             crops_coords=grid_crops_coords,
             grid_size=(grid_height, grid_width),
             temporal_size=num_frames,
-            use_real=True,
         )
 
         freqs_cos = freqs_cos.to(device=device)
@@ -478,6 +467,10 @@ class CogVideoXPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -503,6 +496,7 @@ class CogVideoXPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -520,14 +514,14 @@ class CogVideoXPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_frames (`int`, defaults to `48`):
                 Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                 contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
-                num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that
+                num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
                 needs to be satisfied is that of divisibility mentioned above.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
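The corrected docstring reflects defaults that now come from the transformer config rather than a UNet. Worked out in a sketch (sample_height=60, sample_width=90, and a spatial VAE factor of 8 are assumptions consistent with the 480/720 defaults stated above):

    sample_height, sample_width = 60, 90  # transformer config values (assumed)
    vae_scale_factor_spatial = 8
    height = sample_height * vae_scale_factor_spatial  # 480
    width = sample_width * vae_scale_factor_spatial    # 720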
@@ -564,6 +558,10 @@ class CogVideoXPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -593,8 +591,6 @@ class CogVideoXPipeline(DiffusionPipeline):
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1
 
         # 1. Check inputs. Raise error if not correct
@@ -608,6 +604,7 @@ class CogVideoXPipeline(DiffusionPipeline):
             negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False
 
         # 2. Default call parameters
@@ -689,6 +686,7 @@ class CogVideoXPipeline(DiffusionPipeline):
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()
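End to end, the new argument travels from __call__ into self._attention_kwargs and on to the transformer forward pass. With PEFT-backed LoRA layers a {"scale": ...} entry is the typical payload; that usage is an assumption about the surrounding code, not something this diff shows. A sketch:

    import torch
    from diffusers import CogVideoXPipeline

    pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16)
    pipe.load_lora_weights("your-username/cogvideox-lora")  # placeholder id
    frames = pipe(
        "a koala playing in the rain",
        attention_kwargs={"scale": 0.8},  # assumed: uniformly scales the LoRA layers
        num_inference_steps=50,
    ).frames[0]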