diffusers 0.30.3__py3-none-any.whl → 0.31.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. diffusers/__init__.py +34 -2
  2. diffusers/configuration_utils.py +12 -0
  3. diffusers/dependency_versions_table.py +1 -1
  4. diffusers/image_processor.py +257 -54
  5. diffusers/loaders/__init__.py +2 -0
  6. diffusers/loaders/ip_adapter.py +5 -1
  7. diffusers/loaders/lora_base.py +14 -7
  8. diffusers/loaders/lora_conversion_utils.py +332 -0
  9. diffusers/loaders/lora_pipeline.py +707 -41
  10. diffusers/loaders/peft.py +1 -0
  11. diffusers/loaders/single_file_utils.py +81 -4
  12. diffusers/loaders/textual_inversion.py +2 -0
  13. diffusers/loaders/unet.py +39 -8
  14. diffusers/models/__init__.py +4 -0
  15. diffusers/models/adapter.py +53 -53
  16. diffusers/models/attention.py +86 -10
  17. diffusers/models/attention_processor.py +169 -133
  18. diffusers/models/autoencoders/autoencoder_kl.py +71 -11
  19. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +187 -88
  20. diffusers/models/controlnet_flux.py +536 -0
  21. diffusers/models/controlnet_sd3.py +7 -3
  22. diffusers/models/controlnet_sparsectrl.py +0 -1
  23. diffusers/models/embeddings.py +170 -61
  24. diffusers/models/embeddings_flax.py +23 -9
  25. diffusers/models/model_loading_utils.py +182 -14
  26. diffusers/models/modeling_utils.py +283 -46
  27. diffusers/models/normalization.py +79 -0
  28. diffusers/models/transformers/__init__.py +1 -0
  29. diffusers/models/transformers/auraflow_transformer_2d.py +1 -0
  30. diffusers/models/transformers/cogvideox_transformer_3d.py +23 -2
  31. diffusers/models/transformers/pixart_transformer_2d.py +9 -1
  32. diffusers/models/transformers/transformer_cogview3plus.py +386 -0
  33. diffusers/models/transformers/transformer_flux.py +161 -44
  34. diffusers/models/transformers/transformer_sd3.py +7 -1
  35. diffusers/models/unets/unet_2d_condition.py +8 -8
  36. diffusers/models/unets/unet_motion_model.py +41 -63
  37. diffusers/models/upsampling.py +6 -6
  38. diffusers/pipelines/__init__.py +35 -6
  39. diffusers/pipelines/animatediff/__init__.py +2 -0
  40. diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
  41. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +44 -20
  42. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
  43. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +2 -0
  44. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -66
  45. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
  46. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -1
  47. diffusers/pipelines/auto_pipeline.py +39 -8
  48. diffusers/pipelines/cogvideo/__init__.py +2 -0
  49. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +30 -17
  50. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +794 -0
  51. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +41 -31
  52. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +42 -29
  53. diffusers/pipelines/cogview3/__init__.py +47 -0
  54. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
  55. diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  56. diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -1
  57. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -0
  58. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +8 -0
  59. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +36 -13
  60. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -1
  61. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -1
  62. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +17 -3
  63. diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
  64. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +3 -1
  65. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
  66. diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
  67. diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
  68. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
  69. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
  70. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
  71. diffusers/pipelines/flux/__init__.py +10 -0
  72. diffusers/pipelines/flux/pipeline_flux.py +53 -20
  73. diffusers/pipelines/flux/pipeline_flux_controlnet.py +984 -0
  74. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +988 -0
  75. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1182 -0
  76. diffusers/pipelines/flux/pipeline_flux_img2img.py +850 -0
  77. diffusers/pipelines/flux/pipeline_flux_inpaint.py +1015 -0
  78. diffusers/pipelines/free_noise_utils.py +365 -5
  79. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +15 -3
  80. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
  81. diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
  82. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
  83. diffusers/pipelines/kolors/tokenizer.py +4 -0
  84. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
  85. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
  86. diffusers/pipelines/latte/pipeline_latte.py +2 -2
  87. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
  88. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
  89. diffusers/pipelines/lumina/pipeline_lumina.py +2 -2
  90. diffusers/pipelines/pag/__init__.py +6 -0
  91. diffusers/pipelines/pag/pag_utils.py +8 -2
  92. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -1
  93. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1544 -0
  94. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +2 -2
  95. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1685 -0
  96. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +17 -5
  97. diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
  98. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +1 -1
  99. diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
  100. diffusers/pipelines/pag/pipeline_pag_sd_3.py +12 -3
  101. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
  102. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1091 -0
  103. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
  104. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
  105. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
  106. diffusers/pipelines/pia/pipeline_pia.py +2 -0
  107. diffusers/pipelines/pipeline_loading_utils.py +225 -27
  108. diffusers/pipelines/pipeline_utils.py +123 -180
  109. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
  110. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
  111. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
  112. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
  113. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +28 -6
  114. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
  115. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
  116. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
  117. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +12 -3
  118. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +20 -4
  119. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +3 -3
  120. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  121. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
  122. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
  123. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -4
  124. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -14
  125. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -14
  126. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
  127. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
  128. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
  129. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
  130. diffusers/quantizers/__init__.py +16 -0
  131. diffusers/quantizers/auto.py +126 -0
  132. diffusers/quantizers/base.py +233 -0
  133. diffusers/quantizers/bitsandbytes/__init__.py +2 -0
  134. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +558 -0
  135. diffusers/quantizers/bitsandbytes/utils.py +306 -0
  136. diffusers/quantizers/quantization_config.py +391 -0
  137. diffusers/schedulers/scheduling_ddim.py +4 -1
  138. diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
  139. diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
  140. diffusers/schedulers/scheduling_ddpm.py +4 -1
  141. diffusers/schedulers/scheduling_ddpm_parallel.py +4 -1
  142. diffusers/schedulers/scheduling_deis_multistep.py +78 -1
  143. diffusers/schedulers/scheduling_dpmsolver_multistep.py +82 -1
  144. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +80 -1
  145. diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
  146. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +82 -1
  147. diffusers/schedulers/scheduling_edm_euler.py +8 -6
  148. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
  149. diffusers/schedulers/scheduling_euler_discrete.py +92 -7
  150. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
  151. diffusers/schedulers/scheduling_heun_discrete.py +114 -8
  152. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
  153. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
  154. diffusers/schedulers/scheduling_lms_discrete.py +76 -1
  155. diffusers/schedulers/scheduling_sasolver.py +78 -1
  156. diffusers/schedulers/scheduling_unclip.py +4 -1
  157. diffusers/schedulers/scheduling_unipc_multistep.py +78 -1
  158. diffusers/training_utils.py +48 -18
  159. diffusers/utils/__init__.py +2 -1
  160. diffusers/utils/dummy_pt_objects.py +60 -0
  161. diffusers/utils/dummy_torch_and_transformers_objects.py +165 -0
  162. diffusers/utils/hub_utils.py +16 -4
  163. diffusers/utils/import_utils.py +31 -8
  164. diffusers/utils/loading_utils.py +28 -4
  165. diffusers/utils/peft_utils.py +3 -3
  166. diffusers/utils/testing_utils.py +59 -0
  167. {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/METADATA +7 -6
  168. {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/RECORD +172 -149
  169. {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/LICENSE +0 -0
  170. {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/WHEEL +0 -0
  171. {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/entry_points.txt +0 -0
  172. {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/top_level.txt +0 -0
@@ -53,7 +53,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -20,6 +20,7 @@ from huggingface_hub.utils import validate_hf_hub_args
 from ..configuration_utils import ConfigMixin
 from ..utils import is_sentencepiece_available
 from .aura_flow import AuraFlowPipeline
+from .cogview3 import CogView3PlusPipeline
 from .controlnet import (
     StableDiffusionControlNetImg2ImgPipeline,
     StableDiffusionControlNetInpaintPipeline,
@@ -29,7 +30,14 @@ from .controlnet import (
     StableDiffusionXLControlNetPipeline,
 )
 from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
-from .flux import FluxPipeline
+from .flux import (
+    FluxControlNetImg2ImgPipeline,
+    FluxControlNetInpaintPipeline,
+    FluxControlNetPipeline,
+    FluxImg2ImgPipeline,
+    FluxInpaintPipeline,
+    FluxPipeline,
+)
 from .hunyuandit import HunyuanDiTPipeline
 from .kandinsky import (
     KandinskyCombinedPipeline,
@@ -49,12 +57,16 @@ from .kandinsky2_2 import (
 )
 from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
 from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
+from .lumina import LuminaText2ImgPipeline
 from .pag import (
     HunyuanDiTPAGPipeline,
     PixArtSigmaPAGPipeline,
     StableDiffusion3PAGPipeline,
+    StableDiffusionControlNetPAGInpaintPipeline,
     StableDiffusionControlNetPAGPipeline,
+    StableDiffusionPAGImg2ImgPipeline,
     StableDiffusionPAGPipeline,
+    StableDiffusionXLControlNetPAGImg2ImgPipeline,
     StableDiffusionXLControlNetPAGPipeline,
     StableDiffusionXLPAGImg2ImgPipeline,
     StableDiffusionXLPAGInpaintPipeline,
@@ -106,6 +118,9 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("pixart-sigma-pag", PixArtSigmaPAGPipeline),
         ("auraflow", AuraFlowPipeline),
         ("flux", FluxPipeline),
+        ("flux-controlnet", FluxControlNetPipeline),
+        ("lumina", LuminaText2ImgPipeline),
+        ("cogview3", CogView3PlusPipeline),
     ]
 )
 
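With these new entries, `AutoPipelineForText2Image` can resolve Flux, Lumina, and CogView3 checkpoints from their config `_class_name`. A minimal usage sketch, assuming the illustrative model id below (any checkpoint whose `_class_name` is `FluxPipeline` resolves the same way):

```python
# Sketch: AutoPipelineForText2Image maps the checkpoint's _class_name
# ("FluxPipeline") through AUTO_TEXT2IMAGE_PIPELINES_MAPPING to FluxPipeline.
import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "black-forest-labs/FLUX.1-dev",  # illustrative model id
    torch_dtype=torch.bfloat16,
).to("cuda")
image = pipe("an astronaut riding a horse on the moon").images[0]
```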
@@ -119,9 +134,13 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("kandinsky22", KandinskyV22Img2ImgCombinedPipeline),
         ("kandinsky3", Kandinsky3Img2ImgPipeline),
         ("stable-diffusion-controlnet", StableDiffusionControlNetImg2ImgPipeline),
+        ("stable-diffusion-pag", StableDiffusionPAGImg2ImgPipeline),
         ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetImg2ImgPipeline),
         ("stable-diffusion-xl-pag", StableDiffusionXLPAGImg2ImgPipeline),
+        ("stable-diffusion-xl-controlnet-pag", StableDiffusionXLControlNetPAGImg2ImgPipeline),
         ("lcm", LatentConsistencyModelImg2ImgPipeline),
+        ("flux", FluxImg2ImgPipeline),
+        ("flux-controlnet", FluxControlNetImg2ImgPipeline),
     ]
 )
 
@@ -134,8 +153,11 @@ AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
         ("kandinsky", KandinskyInpaintCombinedPipeline),
         ("kandinsky22", KandinskyV22InpaintCombinedPipeline),
         ("stable-diffusion-controlnet", StableDiffusionControlNetInpaintPipeline),
+        ("stable-diffusion-controlnet-pag", StableDiffusionControlNetPAGInpaintPipeline),
         ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetInpaintPipeline),
         ("stable-diffusion-xl-pag", StableDiffusionXLPAGInpaintPipeline),
+        ("flux", FluxInpaintPipeline),
+        ("flux-controlnet", FluxControlNetInpaintPipeline),
     ]
 )
 
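The new `flux` inpaint entry means `AutoPipelineForInpainting` now maps Flux checkpoints to `FluxInpaintPipeline`. A hedged sketch; the model id and image URLs are placeholders:

```python
# Sketch: a Flux checkpoint now resolves to FluxInpaintPipeline.
import torch
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image

pipe = AutoPipelineForInpainting.from_pretrained(
    "black-forest-labs/FLUX.1-schnell",  # illustrative model id
    torch_dtype=torch.bfloat16,
).to("cuda")

init_image = load_image("https://example.com/input.png")  # placeholder URL
mask_image = load_image("https://example.com/mask.png")   # placeholder URL
result = pipe(prompt="a small lake", image=init_image, mask_image=mask_image).images[0]
```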
@@ -161,12 +183,12 @@ _AUTO_INPAINT_DECODER_PIPELINES_MAPPING = OrderedDict(
 )
 
 if is_sentencepiece_available():
-    from .kolors import KolorsPipeline
+    from .kolors import KolorsImg2ImgPipeline, KolorsPipeline
     from .pag import KolorsPAGPipeline
 
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors-pag"] = KolorsPAGPipeline
-    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
+    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsImg2ImgPipeline
 
 SUPPORTED_TASKS_MAPPINGS = [
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
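The net effect of the Kolors fix: the image-to-image registry no longer points at the text-to-image class. A quick check, assuming sentencepiece is installed:

```python
# Sketch: verify the corrected registry entry.
from diffusers.pipelines.auto_pipeline import AUTO_IMAGE2IMAGE_PIPELINES_MAPPING
from diffusers.utils import is_sentencepiece_available

if is_sentencepiece_available():
    cls = AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"]
    print(cls.__name__)  # KolorsImg2ImgPipeline (was KolorsPipeline in 0.30.x)
```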
@@ -656,12 +678,17 @@ class AutoPipelineForImage2Image(ConfigMixin):
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
         orig_class_name = config["_class_name"]
 
+        # the `orig_class_name` can be:
+        # - *Pipeline (for a regular text-to-image checkpoint)
+        # - *Img2ImgPipeline (for a refiner checkpoint)
+        to_replace = "Img2ImgPipeline" if "Img2Img" in config["_class_name"] else "Pipeline"
+
         if "controlnet" in kwargs:
-            orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline")
+            orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
         if "enable_pag" in kwargs:
             enable_pag = kwargs.pop("enable_pag")
             if enable_pag:
-                orig_class_name = orig_class_name.replace("Pipeline", "PAGPipeline")
+                orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
 
         image_2_image_cls = _get_task_class(AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, orig_class_name)
 
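A standalone sketch of the name-rewriting rule introduced here: refiner checkpoints store an *Img2ImgPipeline class name, so the substitution target must be picked before the ControlNet/PAG markers are spliced in. The helper name is ours, not the library's:

```python
def derive_img2img_class_name(orig_class_name: str, controlnet: bool = False, enable_pag: bool = False) -> str:
    # Mirrors the logic above: choose the suffix to replace first, then splice.
    to_replace = "Img2ImgPipeline" if "Img2Img" in orig_class_name else "Pipeline"
    if controlnet:
        orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
    if enable_pag:
        orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
    return orig_class_name

# A refiner checkpoint keeps its Img2Img suffix in the right position:
assert derive_img2img_class_name("StableDiffusionXLImg2ImgPipeline", controlnet=True) == (
    "StableDiffusionXLControlNetImg2ImgPipeline"
)
assert derive_img2img_class_name("StableDiffusionPipeline", enable_pag=True) == "StableDiffusionPAGPipeline"
```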
@@ -948,13 +975,17 @@ class AutoPipelineForInpainting(ConfigMixin):
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
         orig_class_name = config["_class_name"]
 
+        # The `orig_class_name` can be:
+        # - *InpaintPipeline (for an inpaint-specific checkpoint)
+        # - *Pipeline (for a regular text-to-image checkpoint)
+        to_replace = "InpaintPipeline" if "Inpaint" in config["_class_name"] else "Pipeline"
+
         if "controlnet" in kwargs:
-            orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline")
+            orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
         if "enable_pag" in kwargs:
             enable_pag = kwargs.pop("enable_pag")
             if enable_pag:
-                orig_class_name = config["_class_name"].replace("Pipeline", "PAGPipeline")
-
+                orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
         inpainting_cls = _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, orig_class_name)
 
         kwargs = {**load_config_kwargs, **kwargs}
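With the same fix on the inpainting side, `controlnet` and `enable_pag` now compose into the new `StableDiffusionControlNetPAGInpaintPipeline` mapping. A sketch; the model ids are illustrative:

```python
import torch
from diffusers import AutoPipelineForInpainting, ControlNetModel

controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16  # illustrative
)
pipe = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # config _class_name == "StableDiffusionPipeline"
    controlnet=controlnet,
    enable_pag=True,
    torch_dtype=torch.float16,
)
print(type(pipe).__name__)  # StableDiffusionControlNetPAGInpaintPipeline
```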
@@ -23,6 +23,7 @@ except OptionalDependencyNotAvailable:
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_cogvideox"] = ["CogVideoXPipeline"]
+    _import_structure["pipeline_cogvideox_fun_control"] = ["CogVideoXFunControlPipeline"]
     _import_structure["pipeline_cogvideox_image2video"] = ["CogVideoXImageToVideoPipeline"]
     _import_structure["pipeline_cogvideox_video2video"] = ["CogVideoXVideoToVideoPipeline"]
 
@@ -35,6 +36,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     from ...utils.dummy_torch_and_transformers_objects import *
 else:
     from .pipeline_cogvideox import CogVideoXPipeline
+    from .pipeline_cogvideox_fun_control import CogVideoXFunControlPipeline
     from .pipeline_cogvideox_image2video import CogVideoXImageToVideoPipeline
     from .pipeline_cogvideox_video2video import CogVideoXVideoToVideoPipeline
 
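Both registration points matter because the subpackage is served through diffusers' lazy-module machinery: the `_import_structure` entry covers normal (deferred) imports, while the `else` branch covers the `TYPE_CHECKING`/slow-import path. A small sketch of the expected behavior; either import path should return the same class object:

```python
# Sketch: diffusers defers the submodule import until first attribute access,
# and Python's module cache makes both paths yield the identical class.
from diffusers import CogVideoXFunControlPipeline
from diffusers.pipelines.cogvideo import CogVideoXFunControlPipeline as FromSubpackage

assert CogVideoXFunControlPipeline is FromSubpackage
```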
@@ -15,12 +15,13 @@
 
 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 from transformers import T5EncoderModel, T5Tokenizer
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
@@ -85,7 +86,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -136,7 +137,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class CogVideoXPipeline(DiffusionPipeline):
+class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation using CogVideoX.
 
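Adding `CogVideoXLoraLoaderMixin` to the base classes gives the pipeline the standard LoRA entry points (`load_lora_weights`, `set_adapters`, and friends). A sketch; the LoRA repo id and adapter name are hypothetical:

```python
import torch
from diffusers import CogVideoXPipeline

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
pipe.load_lora_weights("my-user/cogvideox-lora", adapter_name="my-style")  # hypothetical repo
pipe.set_adapters(["my-style"], [0.8])  # weight the adapter at 0.8
```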
@@ -187,6 +188,9 @@ class CogVideoXPipeline(DiffusionPipeline):
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
@@ -316,6 +320,12 @@
     def prepare_latents(
         self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         shape = (
             batch_size,
             (num_frames - 1) // self.vae_scale_factor_temporal + 1,
@@ -323,11 +333,6 @@
             height // self.vae_scale_factor_spatial,
             width // self.vae_scale_factor_spatial,
         )
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
 
         if latents is None:
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
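The moved guard just fails fast: a per-sample generator list is now validated before any shape math runs. A minimal reproduction of the check (the helper name is ours, not the library's):

```python
import torch

def check_generators(generator, batch_size: int) -> None:
    # Mirrors the relocated guard in prepare_latents.
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

gens = [torch.Generator("cpu").manual_seed(i) for i in range(2)]
check_generators(gens, batch_size=2)    # passes
# check_generators(gens, batch_size=3)  # would raise ValueError immediately
```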
@@ -340,7 +345,7 @@
 
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents
 
         frames = self.vae.decode(latents).sample
         return frames
@@ -448,7 +453,6 @@
             crops_coords=grid_crops_coords,
             grid_size=(grid_height, grid_width),
             temporal_size=num_frames,
-            use_real=True,
         )
 
         freqs_cos = freqs_cos.to(device=device)
@@ -463,6 +467,10 @@
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -488,6 +496,7 @@
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -505,14 +514,14 @@
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_width * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_frames (`int`, defaults to `48`):
                 Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                 contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
-                num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that
+                num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
                 needs to be satisfied is that of divisibility mentioned above.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -549,6 +558,10 @@
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that, if specified, is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -578,8 +591,6 @@
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1
 
         # 1. Check inputs. Raise error if not correct
  # 1. Check inputs. Raise error if not correct
@@ -593,6 +604,7 @@ class CogVideoXPipeline(DiffusionPipeline):
593
604
  negative_prompt_embeds,
594
605
  )
595
606
  self._guidance_scale = guidance_scale
607
+ self._attention_kwargs = attention_kwargs
596
608
  self._interrupt = False
597
609
 
598
610
  # 2. Default call parameters
@@ -674,6 +686,7 @@
                 encoder_hidden_states=prompt_embeds,
                 timestep=timestep,
                 image_rotary_emb=image_rotary_emb,
+                attention_kwargs=attention_kwargs,
                 return_dict=False,
             )[0]
             noise_pred = noise_pred.float()
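End to end, the new `attention_kwargs` flows from `__call__` through `self._attention_kwargs` into the transformer forward. A usage sketch; with the PEFT backend active and a LoRA loaded, `{"scale": ...}` is the conventional key for modulating LoRA strength per call (the model id is real; the effect of other keys depends on the installed attention processors):

```python
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to("cuda")
video = pipe(
    prompt="a panda playing guitar by a river at sunset",
    num_inference_steps=50,
    attention_kwargs={"scale": 0.7},  # forwarded to the transformer's attention processors
).frames[0]
export_to_video(video, "panda.mp4", fps=8)
```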