diffusers 0.30.2__py3-none-any.whl → 0.31.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. diffusers/__init__.py +38 -2
  2. diffusers/configuration_utils.py +12 -0
  3. diffusers/dependency_versions_table.py +1 -1
  4. diffusers/image_processor.py +257 -54
  5. diffusers/loaders/__init__.py +2 -0
  6. diffusers/loaders/ip_adapter.py +5 -1
  7. diffusers/loaders/lora_base.py +14 -7
  8. diffusers/loaders/lora_conversion_utils.py +332 -0
  9. diffusers/loaders/lora_pipeline.py +707 -41
  10. diffusers/loaders/peft.py +1 -0
  11. diffusers/loaders/single_file_utils.py +81 -4
  12. diffusers/loaders/textual_inversion.py +2 -0
  13. diffusers/loaders/unet.py +39 -8
  14. diffusers/models/__init__.py +4 -0
  15. diffusers/models/adapter.py +53 -53
  16. diffusers/models/attention.py +86 -10
  17. diffusers/models/attention_processor.py +169 -133
  18. diffusers/models/autoencoders/autoencoder_kl.py +71 -11
  19. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +287 -85
  20. diffusers/models/controlnet_flux.py +536 -0
  21. diffusers/models/controlnet_sd3.py +7 -3
  22. diffusers/models/controlnet_sparsectrl.py +0 -1
  23. diffusers/models/embeddings.py +238 -61
  24. diffusers/models/embeddings_flax.py +23 -9
  25. diffusers/models/model_loading_utils.py +182 -14
  26. diffusers/models/modeling_utils.py +283 -46
  27. diffusers/models/normalization.py +79 -0
  28. diffusers/models/transformers/__init__.py +1 -0
  29. diffusers/models/transformers/auraflow_transformer_2d.py +1 -0
  30. diffusers/models/transformers/cogvideox_transformer_3d.py +58 -36
  31. diffusers/models/transformers/pixart_transformer_2d.py +9 -1
  32. diffusers/models/transformers/transformer_cogview3plus.py +386 -0
  33. diffusers/models/transformers/transformer_flux.py +161 -44
  34. diffusers/models/transformers/transformer_sd3.py +7 -1
  35. diffusers/models/unets/unet_2d_condition.py +8 -8
  36. diffusers/models/unets/unet_motion_model.py +41 -63
  37. diffusers/models/upsampling.py +6 -6
  38. diffusers/pipelines/__init__.py +40 -7
  39. diffusers/pipelines/animatediff/__init__.py +2 -0
  40. diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
  41. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +44 -20
  42. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
  43. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +2 -0
  44. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -66
  45. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
  46. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -1
  47. diffusers/pipelines/auto_pipeline.py +39 -8
  48. diffusers/pipelines/cogvideo/__init__.py +6 -0
  49. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +32 -34
  50. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +794 -0
  51. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +837 -0
  52. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +825 -0
  53. diffusers/pipelines/cogvideo/pipeline_output.py +20 -0
  54. diffusers/pipelines/cogview3/__init__.py +47 -0
  55. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
  56. diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  57. diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -1
  58. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -0
  59. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +8 -0
  60. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +36 -13
  61. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -1
  62. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -1
  63. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +17 -3
  64. diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
  65. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +3 -1
  66. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
  67. diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
  68. diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
  69. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
  70. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
  71. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
  72. diffusers/pipelines/flux/__init__.py +10 -0
  73. diffusers/pipelines/flux/pipeline_flux.py +53 -20
  74. diffusers/pipelines/flux/pipeline_flux_controlnet.py +984 -0
  75. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +988 -0
  76. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1182 -0
  77. diffusers/pipelines/flux/pipeline_flux_img2img.py +850 -0
  78. diffusers/pipelines/flux/pipeline_flux_inpaint.py +1015 -0
  79. diffusers/pipelines/free_noise_utils.py +365 -5
  80. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +15 -3
  81. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
  82. diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
  83. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
  84. diffusers/pipelines/kolors/tokenizer.py +4 -0
  85. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
  86. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
  87. diffusers/pipelines/latte/pipeline_latte.py +2 -2
  88. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
  89. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
  90. diffusers/pipelines/lumina/pipeline_lumina.py +2 -2
  91. diffusers/pipelines/pag/__init__.py +6 -0
  92. diffusers/pipelines/pag/pag_utils.py +8 -2
  93. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -1
  94. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1544 -0
  95. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +2 -2
  96. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1685 -0
  97. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +17 -5
  98. diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
  99. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +1 -1
  100. diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
  101. diffusers/pipelines/pag/pipeline_pag_sd_3.py +12 -3
  102. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
  103. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1091 -0
  104. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
  105. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
  106. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
  107. diffusers/pipelines/pia/pipeline_pia.py +2 -0
  108. diffusers/pipelines/pipeline_loading_utils.py +225 -27
  109. diffusers/pipelines/pipeline_utils.py +123 -180
  110. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
  111. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
  112. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
  113. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
  114. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +28 -6
  115. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
  116. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
  117. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
  118. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +12 -3
  119. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +20 -4
  120. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +3 -3
  121. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  122. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
  123. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
  124. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -4
  125. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -14
  126. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -14
  127. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
  128. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
  129. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
  130. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
  131. diffusers/quantizers/__init__.py +16 -0
  132. diffusers/quantizers/auto.py +126 -0
  133. diffusers/quantizers/base.py +233 -0
  134. diffusers/quantizers/bitsandbytes/__init__.py +2 -0
  135. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +558 -0
  136. diffusers/quantizers/bitsandbytes/utils.py +306 -0
  137. diffusers/quantizers/quantization_config.py +391 -0
  138. diffusers/schedulers/scheduling_ddim.py +4 -1
  139. diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
  140. diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
  141. diffusers/schedulers/scheduling_ddpm.py +4 -1
  142. diffusers/schedulers/scheduling_ddpm_parallel.py +4 -1
  143. diffusers/schedulers/scheduling_deis_multistep.py +78 -1
  144. diffusers/schedulers/scheduling_dpmsolver_multistep.py +82 -1
  145. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +80 -1
  146. diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
  147. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +82 -1
  148. diffusers/schedulers/scheduling_edm_euler.py +8 -6
  149. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
  150. diffusers/schedulers/scheduling_euler_discrete.py +92 -7
  151. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
  152. diffusers/schedulers/scheduling_heun_discrete.py +114 -8
  153. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
  154. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
  155. diffusers/schedulers/scheduling_lms_discrete.py +76 -1
  156. diffusers/schedulers/scheduling_sasolver.py +78 -1
  157. diffusers/schedulers/scheduling_unclip.py +4 -1
  158. diffusers/schedulers/scheduling_unipc_multistep.py +78 -1
  159. diffusers/training_utils.py +48 -18
  160. diffusers/utils/__init__.py +2 -1
  161. diffusers/utils/dummy_pt_objects.py +60 -0
  162. diffusers/utils/dummy_torch_and_transformers_objects.py +195 -0
  163. diffusers/utils/hub_utils.py +16 -4
  164. diffusers/utils/import_utils.py +31 -8
  165. diffusers/utils/loading_utils.py +28 -4
  166. diffusers/utils/peft_utils.py +3 -3
  167. diffusers/utils/testing_utils.py +59 -0
  168. {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/METADATA +7 -6
  169. {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/RECORD +173 -147
  170. {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/WHEEL +1 -1
  171. {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/LICENSE +0 -0
  172. {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/entry_points.txt +0 -0
  173. {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/top_level.txt +0 -0
diffusers/image_processor.py

@@ -38,16 +38,44 @@ PipelineImageInput = Union[
 PipelineDepthInput = PipelineImageInput
 
 
-def is_valid_image(image):
+def is_valid_image(image) -> bool:
+    r"""
+    Checks if the input is a valid image.
+
+    A valid image can be:
+    - A `PIL.Image.Image`.
+    - A 2D or 3D `np.ndarray` or `torch.Tensor` (grayscale or color image).
+
+    Args:
+        image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
+            The image to validate. It can be a PIL image, a NumPy array, or a torch tensor.
+
+    Returns:
+        `bool`:
+            `True` if the input is a valid image, `False` otherwise.
+    """
     return isinstance(image, PIL.Image.Image) or isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim in (2, 3)
 
 
 def is_valid_image_imagelist(images):
-    # check if the image input is one of the supported formats for image and image list:
-    # it can be either one of below 3
-    # (1) a 4d pytorch tensor or numpy array,
-    # (2) a valid image: PIL.Image.Image, 2-d np.ndarray or torch.Tensor (grayscale image), 3-d np.ndarray or torch.Tensor
-    # (3) a list of valid image
+    r"""
+    Checks if the input is a valid image or list of images.
+
+    The input can be one of the following formats:
+    - A 4D tensor or numpy array (batch of images).
+    - A valid single image: `PIL.Image.Image`, 2D `np.ndarray` or `torch.Tensor` (grayscale image), 3D `np.ndarray` or
+      `torch.Tensor`.
+    - A list of valid images.
+
+    Args:
+        images (`Union[np.ndarray, torch.Tensor, PIL.Image.Image, List]`):
+            The image(s) to check. Can be a batch of images (4D tensor/array), a single image, or a list of valid
+            images.
+
+    Returns:
+        `bool`:
+            `True` if the input is valid, `False` otherwise.
+    """
     if isinstance(images, (np.ndarray, torch.Tensor)) and images.ndim == 4:
         return True
     elif is_valid_image(images):
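The two validators compose: `is_valid_image` accepts a single PIL image or a 2D/3D array/tensor, and `is_valid_image_imagelist` additionally accepts a 4D batch or a list. A minimal sketch of the behavior documented above:

```python
import numpy as np
import PIL.Image
import torch

from diffusers.image_processor import is_valid_image, is_valid_image_imagelist

# Single images: PIL, 2D (grayscale), or 3D inputs are valid.
print(is_valid_image(PIL.Image.new("RGB", (64, 64))))  # True
print(is_valid_image(np.zeros((64, 64))))              # True (2D grayscale)
print(is_valid_image(torch.zeros(3, 64, 64)))          # True (3D)

# Image lists: a 4D batch or a list of valid single images.
print(is_valid_image_imagelist(torch.zeros(2, 3, 64, 64)))           # True (4D batch)
print(is_valid_image_imagelist([PIL.Image.new("RGB", (8, 8))] * 2))  # True (list)
```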
@@ -103,8 +131,16 @@ class VaeImageProcessor(ConfigMixin):
 
     @staticmethod
     def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
-        """
+        r"""
         Convert a numpy image or a batch of images to a PIL image.
+
+        Args:
+            images (`np.ndarray`):
+                The image array to convert to PIL format.
+
+        Returns:
+            `List[PIL.Image.Image]`:
+                A list of PIL images.
         """
         if images.ndim == 3:
             images = images[None, ...]
@@ -119,8 +155,16 @@ class VaeImageProcessor(ConfigMixin):
 
     @staticmethod
     def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
-        """
+        r"""
         Convert a PIL image or a list of PIL images to NumPy arrays.
+
+        Args:
+            images (`PIL.Image.Image` or `List[PIL.Image.Image]`):
+                The PIL image or list of images to convert to NumPy format.
+
+        Returns:
+            `np.ndarray`:
+                A NumPy array representation of the images.
         """
         if not isinstance(images, list):
             images = [images]
@@ -131,8 +175,16 @@ class VaeImageProcessor(ConfigMixin):
 
     @staticmethod
     def numpy_to_pt(images: np.ndarray) -> torch.Tensor:
-        """
+        r"""
         Convert a NumPy image to a PyTorch tensor.
+
+        Args:
+            images (`np.ndarray`):
+                The NumPy image array to convert to PyTorch format.
+
+        Returns:
+            `torch.Tensor`:
+                A PyTorch tensor representation of the images.
         """
         if images.ndim == 3:
             images = images[..., None]
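The converters documented above chain into a simple round trip; a quick sketch:

```python
import PIL.Image

from diffusers.image_processor import VaeImageProcessor

pil = PIL.Image.new("RGB", (64, 64), color=(255, 0, 0))

arr = VaeImageProcessor.pil_to_numpy(pil)   # (1, 64, 64, 3) float32 in [0, 1]
pt = VaeImageProcessor.numpy_to_pt(arr)     # (1, 3, 64, 64) torch.Tensor
back = VaeImageProcessor.numpy_to_pil(arr)  # [PIL.Image.Image]
```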
@@ -142,30 +194,62 @@
 
     @staticmethod
     def pt_to_numpy(images: torch.Tensor) -> np.ndarray:
-        """
+        r"""
         Convert a PyTorch tensor to a NumPy image.
+
+        Args:
+            images (`torch.Tensor`):
+                The PyTorch tensor to convert to NumPy format.
+
+        Returns:
+            `np.ndarray`:
+                A NumPy array representation of the images.
         """
         images = images.cpu().permute(0, 2, 3, 1).float().numpy()
         return images
 
     @staticmethod
     def normalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
-        """
+        r"""
         Normalize an image array to [-1,1].
+
+        Args:
+            images (`np.ndarray` or `torch.Tensor`):
+                The image array to normalize.
+
+        Returns:
+            `np.ndarray` or `torch.Tensor`:
+                The normalized image array.
         """
         return 2.0 * images - 1.0
 
     @staticmethod
     def denormalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
-        """
+        r"""
         Denormalize an image array to [0,1].
+
+        Args:
+            images (`np.ndarray` or `torch.Tensor`):
+                The image array to denormalize.
+
+        Returns:
+            `np.ndarray` or `torch.Tensor`:
+                The denormalized image array.
         """
         return (images / 2 + 0.5).clamp(0, 1)
 
     @staticmethod
     def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image:
-        """
+        r"""
         Converts a PIL image to RGB format.
+
+        Args:
+            image (`PIL.Image.Image`):
+                The PIL image to convert to RGB.
+
+        Returns:
+            `PIL.Image.Image`:
+                The RGB-converted PIL image.
         """
         image = image.convert("RGB")
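`normalize` maps pixel values from [0, 1] into the [-1, 1] range diffusion VAEs expect, and `denormalize` maps model output back, clamping into [0, 1]. A quick check:

```python
import torch

from diffusers.image_processor import VaeImageProcessor

x = torch.tensor([0.0, 0.5, 1.0])
y = VaeImageProcessor.normalize(x)    # tensor([-1.,  0.,  1.])
z = VaeImageProcessor.denormalize(y)  # tensor([0.0000, 0.5000, 1.0000])

# denormalize also clamps out-of-range model outputs:
print(VaeImageProcessor.denormalize(torch.tensor([-1.5, 1.5])))  # tensor([0., 1.])
```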
@@ -173,8 +257,16 @@ class VaeImageProcessor(ConfigMixin):
 
     @staticmethod
     def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image:
-        """
-        Converts a PIL image to grayscale format.
+        r"""
+        Converts a given PIL image to grayscale.
+
+        Args:
+            image (`PIL.Image.Image`):
+                The input image to convert.
+
+        Returns:
+            `PIL.Image.Image`:
+                The image converted to grayscale.
         """
         image = image.convert("L")
@@ -182,8 +274,16 @@ class VaeImageProcessor(ConfigMixin):
 
     @staticmethod
     def blur(image: PIL.Image.Image, blur_factor: int = 4) -> PIL.Image.Image:
-        """
+        r"""
         Applies Gaussian blur to an image.
+
+        Args:
+            image (`PIL.Image.Image`):
+                The PIL image to blur.
+
+        Returns:
+            `PIL.Image.Image`:
+                The blurred PIL image.
         """
         image = image.filter(ImageFilter.GaussianBlur(blur_factor))
@@ -191,7 +291,7 @@
 
     @staticmethod
     def get_crop_region(mask_image: PIL.Image.Image, width: int, height: int, pad=0):
-        """
+        r"""
         Finds a rectangular region that contains all masked areas in an image, and expands the region to match the aspect
         ratio of the original image; for example, if the user drew a mask in a 128x32 region, and the dimensions for
         processing are 512x512, the region will be expanded to 128x128.
@@ -285,14 +385,21 @@
         width: int,
         height: int,
     ) -> PIL.Image.Image:
-        """
+        r"""
         Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
         the image within the dimensions, filling empty with data from image.
 
         Args:
-            image: The image to resize.
-            width: The width to resize the image to.
-            height: The height to resize the image to.
+            image (`PIL.Image.Image`):
+                The image to resize and fill.
+            width (`int`):
+                The width to resize the image to.
+            height (`int`):
+                The height to resize the image to.
+
+        Returns:
+            `PIL.Image.Image`:
+                The resized and filled image.
         """
 
         ratio = width / height
@@ -330,14 +437,21 @@
         width: int,
         height: int,
     ) -> PIL.Image.Image:
-        """
+        r"""
         Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
         the image within the dimensions, cropping the excess.
 
         Args:
-            image: The image to resize.
-            width: The width to resize the image to.
-            height: The height to resize the image to.
+            image (`PIL.Image.Image`):
+                The image to resize and crop.
+            width (`int`):
+                The width to resize the image to.
+            height (`int`):
+                The height to resize the image to.
+
+        Returns:
+            `PIL.Image.Image`:
+                The resized and cropped image.
         """
         ratio = width / height
         src_ratio = image.width / image.height
@@ -429,19 +543,23 @@
         height: Optional[int] = None,
         width: Optional[int] = None,
     ) -> Tuple[int, int]:
-        """
-        This function return the height and width that are downscaled to the next integer multiple of
-        `vae_scale_factor`.
+        r"""
+        Returns the height and width of the image, downscaled to the next integer multiple of `vae_scale_factor`.
 
         Args:
-            image(`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`):
-                The image input, can be a PIL image, numpy array or pytorch tensor. if it is a numpy array, should have
-                shape `[batch, height, width]` or `[batch, height, width, channel]` if it is a pytorch tensor, should
-                have shape `[batch, channel, height, width]`.
-            height (`int`, *optional*, defaults to `None`):
-                The height in preprocessed image. If `None`, will use the height of `image` input.
-            width (`int`, *optional*`, defaults to `None`):
-                The width in preprocessed. If `None`, will use the width of the `image` input.
+            image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
+                The image input, which can be a PIL image, NumPy array, or PyTorch tensor. If it is a NumPy array, it
+                should have shape `[batch, height, width]` or `[batch, height, width, channels]`. If it is a PyTorch
+                tensor, it should have shape `[batch, channels, height, width]`.
+            height (`Optional[int]`, *optional*, defaults to `None`):
+                The height of the preprocessed image. If `None`, the height of the `image` input will be used.
+            width (`Optional[int]`, *optional*, defaults to `None`):
+                The width of the preprocessed image. If `None`, the width of the `image` input will be used.
+
+        Returns:
+            `Tuple[int, int]`:
+                A tuple containing the height and width, both resized to the nearest integer multiple of
+                `vae_scale_factor`.
         """
 
         if height is None:
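A minimal sketch of the rounding this documents (with the default `vae_scale_factor=8`, both dimensions are floored to the nearest multiple of 8):

```python
import torch

from diffusers.image_processor import VaeImageProcessor

processor = VaeImageProcessor()  # vae_scale_factor defaults to 8

image = torch.rand(1, 3, 517, 773)  # [batch, channels, height, width]
height, width = processor.get_default_height_width(image)
print(height, width)  # 512 768
```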
@@ -478,13 +596,13 @@ class VaeImageProcessor(ConfigMixin):
         Preprocess the image input.
 
         Args:
-            image (`pipeline_image_input`):
+            image (`PipelineImageInput`):
                 The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of
                 supported formats.
-            height (`int`, *optional*, defaults to `None`):
+            height (`int`, *optional*):
                 The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
                 height.
-            width (`int`, *optional*`, defaults to `None`):
+            width (`int`, *optional*):
                 The width in preprocessed. If `None`, will use `get_default_height_width()` to get the default width.
             resize_mode (`str`, *optional*, defaults to `default`):
                 The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
@@ -496,6 +614,10 @@ class VaeImageProcessor(ConfigMixin):
                 supported for PIL image input.
             crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
                 The crop coordinates for each image in the batch. If `None`, will not crop the image.
+
+        Returns:
+            `torch.Tensor`:
+                The preprocessed image.
         """
         supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
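Putting the documented arguments together, a minimal preprocessing sketch:

```python
import PIL.Image

from diffusers.image_processor import VaeImageProcessor

processor = VaeImageProcessor()
image = PIL.Image.new("RGB", (600, 400))

# "fill" letterboxes with image data instead of cropping the excess.
tensor = processor.preprocess(image, height=512, width=512, resize_mode="fill")
print(tensor.shape)  # torch.Size([1, 3, 512, 512]), values normalized to [-1, 1]
```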
@@ -569,7 +691,7 @@
 
         channel = image.shape[1]
         # don't need any preprocess if the image is latents
-        if channel == self.vae_latent_channels:
+        if channel == self.config.vae_latent_channels:
             return image
 
         height, width = self.get_default_height_width(image, height, width)
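For context on this fix: `VaeImageProcessor` is a `ConfigMixin`, so `__init__` arguments captured by `@register_to_config` are stored on `self.config` rather than as plain instance attributes, and reading them through the config is the supported access path. A minimal sketch (assuming the SD-family default of 4 latent channels):

```python
from diffusers.image_processor import VaeImageProcessor

processor = VaeImageProcessor(vae_latent_channels=4)

# The value lives on the frozen config object, which is what the fixed line reads.
print(processor.config.vae_latent_channels)  # 4
```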
@@ -655,8 +777,22 @@
         image: PIL.Image.Image,
         crop_coords: Optional[Tuple[int, int, int, int]] = None,
     ) -> PIL.Image.Image:
-        """
-        overlay the inpaint output to the original image
+        r"""
+        Applies an overlay of the mask and the inpainted image on the original image.
+
+        Args:
+            mask (`PIL.Image.Image`):
+                The mask image that highlights regions to overlay.
+            init_image (`PIL.Image.Image`):
+                The original image to which the overlay is applied.
+            image (`PIL.Image.Image`):
+                The image to overlay onto the original.
+            crop_coords (`Tuple[int, int, int, int]`, *optional*):
+                Coordinates to crop the image. If provided, the image will be cropped accordingly.
+
+        Returns:
+            `PIL.Image.Image`:
+                The final image with the overlay applied.
         """
 
         width, height = image.width, image.height
@@ -713,8 +849,16 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
 
     @staticmethod
     def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
-        """
-        Convert a NumPy image or a batch of images to a PIL image.
+        r"""
+        Convert a NumPy image or a batch of images to a list of PIL images.
+
+        Args:
+            images (`np.ndarray`):
+                The input NumPy array of images, which can be a single image or a batch.
+
+        Returns:
+            `List[PIL.Image.Image]`:
+                A list of PIL images converted from the input NumPy array.
         """
         if images.ndim == 3:
             images = images[None, ...]
@@ -729,8 +873,16 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
 
     @staticmethod
     def depth_pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
-        """
+        r"""
         Convert a PIL image or a list of PIL images to NumPy arrays.
+
+        Args:
+            images (`Union[List[PIL.Image.Image], PIL.Image.Image]`):
+                The input image or list of images to be converted.
+
+        Returns:
+            `np.ndarray`:
+                A NumPy array of the converted images.
         """
         if not isinstance(images, list):
             images = [images]
@@ -741,18 +893,30 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
 
     @staticmethod
     def rgblike_to_depthmap(image: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
-        """
-        Args:
-            image: RGB-like depth image
+        r"""
+        Convert an RGB-like depth image to a depth map.
 
-        Returns: depth map
+        Args:
+            image (`Union[np.ndarray, torch.Tensor]`):
+                The RGB-like depth image to convert.
 
+        Returns:
+            `Union[np.ndarray, torch.Tensor]`:
+                The corresponding depth map.
         """
         return image[:, :, 1] * 2**8 + image[:, :, 2]
 
     def numpy_to_depth(self, images: np.ndarray) -> List[PIL.Image.Image]:
-        """
-        Convert a NumPy depth image or a batch of images to a PIL image.
+        r"""
+        Convert a NumPy depth image or a batch of images to a list of PIL images.
+
+        Args:
+            images (`np.ndarray`):
+                The input NumPy array of depth images, which can be a single image or a batch.
+
+        Returns:
+            `List[PIL.Image.Image]`:
+                A list of PIL images converted from the input NumPy depth images.
         """
         if images.ndim == 3:
             images = images[None, ...]
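The RGB-like encoding packs a 16-bit depth value into the green (high byte) and blue (low byte) channels, so the formula is `depth = G * 2**8 + B`. A worked sketch:

```python
import numpy as np

from diffusers.image_processor import VaeImageProcessorLDM3D

# One pixel with G=1, B=44 encodes depth 1 * 256 + 44 = 300.
rgb_like = np.array([[[0, 1, 44]]], dtype=np.uint16)  # shape (1, 1, 3)
print(VaeImageProcessorLDM3D.rgblike_to_depthmap(rgb_like))  # [[300]]
```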
@@ -833,8 +997,24 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
         width: Optional[int] = None,
         target_res: Optional[int] = None,
     ) -> torch.Tensor:
-        """
-        Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors.
+        r"""
+        Preprocess the image input. Accepted formats are PIL images, NumPy arrays, or PyTorch tensors.
+
+        Args:
+            rgb (`Union[torch.Tensor, PIL.Image.Image, np.ndarray]`):
+                The RGB input image, which can be a single image or a batch.
+            depth (`Union[torch.Tensor, PIL.Image.Image, np.ndarray]`):
+                The depth input image, which can be a single image or a batch.
+            height (`Optional[int]`, *optional*, defaults to `None`):
+                The desired height of the processed image. If `None`, defaults to the height of the input image.
+            width (`Optional[int]`, *optional*, defaults to `None`):
+                The desired width of the processed image. If `None`, defaults to the width of the input image.
+            target_res (`Optional[int]`, *optional*, defaults to `None`):
+                Target resolution for resizing the images. If specified, overrides height and width.
+
+        Returns:
+            `Tuple[torch.Tensor, torch.Tensor]`:
+                A tuple containing the processed RGB and depth images as PyTorch tensors.
         """
         supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
@@ -1072,7 +1252,17 @@ class PixArtImageProcessor(VaeImageProcessor):
 
     @staticmethod
     def classify_height_width_bin(height: int, width: int, ratios: dict) -> Tuple[int, int]:
-        """Returns binned height and width."""
+        r"""
+        Returns the binned height and width based on the aspect ratio.
+
+        Args:
+            height (`int`): The height of the image.
+            width (`int`): The width of the image.
+            ratios (`dict`): A dictionary where keys are aspect ratios and values are tuples of (height, width).
+
+        Returns:
+            `Tuple[int, int]`: The closest binned height and width.
+        """
         ar = float(height / width)
         closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
         default_hw = ratios[closest_ratio]
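The binning picks the key whose numeric value is closest to `height / width`; a small sketch with a hypothetical three-bin table (real tables such as PixArt's 1024 bins have many more entries):

```python
from diffusers.image_processor import PixArtImageProcessor

# Hypothetical bin table: keys are aspect ratios, values are (height, width).
ratios = {"0.5": (512, 1024), "1.0": (1024, 1024), "2.0": (1024, 512)}

# 900x1000 has aspect ratio 0.9, so the "1.0" bin is closest.
print(PixArtImageProcessor.classify_height_width_bin(900, 1000, ratios))  # (1024, 1024)
```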
@@ -1080,6 +1270,19 @@
 
     @staticmethod
     def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int) -> torch.Tensor:
+        r"""
+        Resizes and crops a tensor of images to the specified dimensions.
+
+        Args:
+            samples (`torch.Tensor`):
+                A tensor of shape (N, C, H, W) where N is the batch size, C is the number of channels, H is the height,
+                and W is the width.
+            new_width (`int`): The desired width of the output images.
+            new_height (`int`): The desired height of the output images.
+
+        Returns:
+            `torch.Tensor`: A tensor containing the resized and cropped images.
+        """
         orig_height, orig_width = samples.shape[2], samples.shape[3]
 
         # Check if resizing is needed
diffusers/loaders/__init__.py

@@ -67,6 +67,7 @@ if is_torch_available():
         "StableDiffusionXLLoraLoaderMixin",
         "LoraLoaderMixin",
         "FluxLoraLoaderMixin",
+        "CogVideoXLoraLoaderMixin",
     ]
     _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"]
     _import_structure["ip_adapter"] = ["IPAdapterMixin"]
@@ -84,6 +85,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     from .ip_adapter import IPAdapterMixin
     from .lora_pipeline import (
         AmusedLoraLoaderMixin,
+        CogVideoXLoraLoaderMixin,
        FluxLoraLoaderMixin,
        LoraLoaderMixin,
        SD3LoraLoaderMixin,
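The newly exported `CogVideoXLoraLoaderMixin` gives the CogVideoX pipelines the standard LoRA entry points; a minimal sketch (the LoRA repo id is a placeholder):

```python
import torch

from diffusers import CogVideoXPipeline

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)

# Hypothetical LoRA checkpoint; substitute a real repo id or local path.
pipe.load_lora_weights("your-org/your-cogvideox-lora", adapter_name="example-lora")
pipe.set_adapters(["example-lora"], [0.8])
```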
diffusers/loaders/ip_adapter.py

@@ -224,7 +224,11 @@ class IPAdapterMixin:
 
         # create feature extractor if it has not been registered to the pipeline yet
         if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
-            clip_image_size = self.image_encoder.config.image_size
+            # FaceID IP adapters don't need the image encoder so it's not present, in this case we default to 224
+            default_clip_size = 224
+            clip_image_size = (
+                self.image_encoder.config.image_size if self.image_encoder is not None else default_clip_size
+            )
             feature_extractor = CLIPImageProcessor(size=clip_image_size, crop_size=clip_image_size)
             self.register_modules(feature_extractor=feature_extractor)
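This guards `load_ip_adapter` for FaceID-style IP adapters, which ship without an image encoder: `self.image_encoder` stays `None` and the feature extractor now falls back to the 224px CLIP default instead of failing. A sketch following the community FaceID release (repo and weight name per that release; adjust as needed):

```python
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# No image encoder is loaded here, so the feature extractor defaults to 224.
pipe.load_ip_adapter(
    "h94/IP-Adapter-FaceID",
    subfolder=None,
    weight_name="ip-adapter-faceid_sd15.bin",
    image_encoder_folder=None,
)
```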
diffusers/loaders/lora_base.py

@@ -532,13 +532,19 @@ class LoraBaseMixin:
             )
 
         list_adapters = self.get_list_adapters()  # eg {"unet": ["adapter1", "adapter2"], "text_encoder": ["adapter2"]}
-        all_adapters = {
-            adapter for adapters in list_adapters.values() for adapter in adapters
-        }  # eg ["adapter1", "adapter2"]
+        # eg ["adapter1", "adapter2"]
+        all_adapters = {adapter for adapters in list_adapters.values() for adapter in adapters}
+        missing_adapters = set(adapter_names) - all_adapters
+        if len(missing_adapters) > 0:
+            raise ValueError(
+                f"Adapter name(s) {missing_adapters} not in the list of present adapters: {all_adapters}."
+            )
+
+        # eg {"adapter1": ["unet"], "adapter2": ["unet", "text_encoder"]}
         invert_list_adapters = {
             adapter: [part for part, adapters in list_adapters.items() if adapter in adapters]
             for adapter in all_adapters
-        }  # eg {"adapter1": ["unet"], "adapter2": ["unet", "text_encoder"]}
+        }
 
         # Decompose weights into weights for denoiser and text encoders.
         _component_adapter_weights = {}
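With this change `set_adapters` fails fast on unknown adapter names instead of silently ignoring them; a minimal sketch (model and LoRA ids are placeholders):

```python
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.load_lora_weights("your-org/your-sd15-lora", adapter_name="style")  # hypothetical LoRA

pipe.set_adapters(["style"], [1.0])  # ok: "style" is loaded

try:
    pipe.set_adapters(["styel"], [1.0])  # typo in the adapter name
except ValueError as err:
    print(err)  # Adapter name(s) {'styel'} not in the list of present adapters: {'style'}.
```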
@@ -699,9 +705,10 @@ class LoraBaseMixin:
                             module.lora_B[adapter_name].to(device)
                             # this is a param, not a module, so device placement is not in-place -> re-assign
                             if hasattr(module, "lora_magnitude_vector") and module.lora_magnitude_vector is not None:
-                                module.lora_magnitude_vector[adapter_name] = module.lora_magnitude_vector[
-                                    adapter_name
-                                ].to(device)
+                                if adapter_name in module.lora_magnitude_vector:
+                                    module.lora_magnitude_vector[adapter_name] = module.lora_magnitude_vector[
+                                        adapter_name
+                                    ].to(device)
 
     @staticmethod
     def pack_weights(layers, prefix):
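`set_lora_device` moves a named adapter's A/B matrices (and, for DoRA adapters, the magnitude vector, now guarded by the added membership check) between devices; a minimal sketch with placeholder ids:

```python
import torch

from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.load_lora_weights("your-org/your-sd15-lora", adapter_name="style")  # hypothetical LoRA

# Offload just this adapter's weights to CPU, then bring them back for inference.
pipe.set_lora_device(adapter_names=["style"], device="cpu")
pipe.set_lora_device(adapter_names=["style"], device=torch.device("cuda"))
```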