diffusers 0.30.2__py3-none-any.whl → 0.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +38 -2
- diffusers/configuration_utils.py +12 -0
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +257 -54
- diffusers/loaders/__init__.py +2 -0
- diffusers/loaders/ip_adapter.py +5 -1
- diffusers/loaders/lora_base.py +14 -7
- diffusers/loaders/lora_conversion_utils.py +332 -0
- diffusers/loaders/lora_pipeline.py +707 -41
- diffusers/loaders/peft.py +1 -0
- diffusers/loaders/single_file_utils.py +81 -4
- diffusers/loaders/textual_inversion.py +2 -0
- diffusers/loaders/unet.py +39 -8
- diffusers/models/__init__.py +4 -0
- diffusers/models/adapter.py +53 -53
- diffusers/models/attention.py +86 -10
- diffusers/models/attention_processor.py +169 -133
- diffusers/models/autoencoders/autoencoder_kl.py +71 -11
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +287 -85
- diffusers/models/controlnet_flux.py +536 -0
- diffusers/models/controlnet_sd3.py +7 -3
- diffusers/models/controlnet_sparsectrl.py +0 -1
- diffusers/models/embeddings.py +238 -61
- diffusers/models/embeddings_flax.py +23 -9
- diffusers/models/model_loading_utils.py +182 -14
- diffusers/models/modeling_utils.py +283 -46
- diffusers/models/normalization.py +79 -0
- diffusers/models/transformers/__init__.py +1 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +1 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +58 -36
- diffusers/models/transformers/pixart_transformer_2d.py +9 -1
- diffusers/models/transformers/transformer_cogview3plus.py +386 -0
- diffusers/models/transformers/transformer_flux.py +161 -44
- diffusers/models/transformers/transformer_sd3.py +7 -1
- diffusers/models/unets/unet_2d_condition.py +8 -8
- diffusers/models/unets/unet_motion_model.py +41 -63
- diffusers/models/upsampling.py +6 -6
- diffusers/pipelines/__init__.py +40 -7
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +44 -20
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -66
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -1
- diffusers/pipelines/auto_pipeline.py +39 -8
- diffusers/pipelines/cogvideo/__init__.py +6 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +32 -34
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +794 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +837 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +825 -0
- diffusers/pipelines/cogvideo/pipeline_output.py +20 -0
- diffusers/pipelines/cogview3/__init__.py +47 -0
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
- diffusers/pipelines/cogview3/pipeline_output.py +21 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -1
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +36 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -1
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +17 -3
- diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +3 -1
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
- diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
- diffusers/pipelines/flux/__init__.py +10 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -20
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +984 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +988 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1182 -0
- diffusers/pipelines/flux/pipeline_flux_img2img.py +850 -0
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +1015 -0
- diffusers/pipelines/free_noise_utils.py +365 -5
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +15 -3
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
- diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
- diffusers/pipelines/kolors/tokenizer.py +4 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
- diffusers/pipelines/latte/pipeline_latte.py +2 -2
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
- diffusers/pipelines/lumina/pipeline_lumina.py +2 -2
- diffusers/pipelines/pag/__init__.py +6 -0
- diffusers/pipelines/pag/pag_utils.py +8 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1544 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1685 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +17 -5
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +12 -3
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1091 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
- diffusers/pipelines/pia/pipeline_pia.py +2 -0
- diffusers/pipelines/pipeline_loading_utils.py +225 -27
- diffusers/pipelines/pipeline_utils.py +123 -180
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +28 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +12 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +20 -4
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +3 -3
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -14
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -14
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
- diffusers/quantizers/__init__.py +16 -0
- diffusers/quantizers/auto.py +126 -0
- diffusers/quantizers/base.py +233 -0
- diffusers/quantizers/bitsandbytes/__init__.py +2 -0
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +558 -0
- diffusers/quantizers/bitsandbytes/utils.py +306 -0
- diffusers/quantizers/quantization_config.py +391 -0
- diffusers/schedulers/scheduling_ddim.py +4 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
- diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
- diffusers/schedulers/scheduling_ddpm.py +4 -1
- diffusers/schedulers/scheduling_ddpm_parallel.py +4 -1
- diffusers/schedulers/scheduling_deis_multistep.py +78 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +82 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +80 -1
- diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +82 -1
- diffusers/schedulers/scheduling_edm_euler.py +8 -6
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
- diffusers/schedulers/scheduling_euler_discrete.py +92 -7
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
- diffusers/schedulers/scheduling_heun_discrete.py +114 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
- diffusers/schedulers/scheduling_lms_discrete.py +76 -1
- diffusers/schedulers/scheduling_sasolver.py +78 -1
- diffusers/schedulers/scheduling_unclip.py +4 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +78 -1
- diffusers/training_utils.py +48 -18
- diffusers/utils/__init__.py +2 -1
- diffusers/utils/dummy_pt_objects.py +60 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +195 -0
- diffusers/utils/hub_utils.py +16 -4
- diffusers/utils/import_utils.py +31 -8
- diffusers/utils/loading_utils.py +28 -4
- diffusers/utils/peft_utils.py +3 -3
- diffusers/utils/testing_utils.py +59 -0
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/METADATA +7 -6
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/RECORD +173 -147
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/WHEEL +1 -1
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/LICENSE +0 -0
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/top_level.txt +0 -0
diffusers/image_processor.py
CHANGED
@@ -38,16 +38,44 @@ PipelineImageInput = Union[
 PipelineDepthInput = PipelineImageInput
 
 
-def is_valid_image(image):
+def is_valid_image(image) -> bool:
+    r"""
+    Checks if the input is a valid image.
+
+    A valid image can be:
+    - A `PIL.Image.Image`.
+    - A 2D or 3D `np.ndarray` or `torch.Tensor` (grayscale or color image).
+
+    Args:
+        image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
+            The image to validate. It can be a PIL image, a NumPy array, or a torch tensor.
+
+    Returns:
+        `bool`:
+            `True` if the input is a valid image, `False` otherwise.
+    """
     return isinstance(image, PIL.Image.Image) or isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim in (2, 3)
 
 
 def is_valid_image_imagelist(images):
-
-
-
-
-
+    r"""
+    Checks if the input is a valid image or list of images.
+
+    The input can be one of the following formats:
+    - A 4D tensor or numpy array (batch of images).
+    - A valid single image: `PIL.Image.Image`, 2D `np.ndarray` or `torch.Tensor` (grayscale image), 3D `np.ndarray` or
+      `torch.Tensor`.
+    - A list of valid images.
+
+    Args:
+        images (`Union[np.ndarray, torch.Tensor, PIL.Image.Image, List]`):
+            The image(s) to check. Can be a batch of images (4D tensor/array), a single image, or a list of valid
+            images.
+
+    Returns:
+        `bool`:
+            `True` if the input is valid, `False` otherwise.
+    """
     if isinstance(images, (np.ndarray, torch.Tensor)) and images.ndim == 4:
         return True
     elif is_valid_image(images):
@@ -103,8 +131,16 @@ class VaeImageProcessor(ConfigMixin):
 
     @staticmethod
     def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
-        """
+        r"""
         Convert a numpy image or a batch of images to a PIL image.
+
+        Args:
+            images (`np.ndarray`):
+                The image array to convert to PIL format.
+
+        Returns:
+            `List[PIL.Image.Image]`:
+                A list of PIL images.
         """
         if images.ndim == 3:
             images = images[None, ...]
@@ -119,8 +155,16 @@ class VaeImageProcessor(ConfigMixin):
 
     @staticmethod
     def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
-        """
+        r"""
         Convert a PIL image or a list of PIL images to NumPy arrays.
+
+        Args:
+            images (`PIL.Image.Image` or `List[PIL.Image.Image]`):
+                The PIL image or list of images to convert to NumPy format.
+
+        Returns:
+            `np.ndarray`:
+                A NumPy array representation of the images.
         """
         if not isinstance(images, list):
             images = [images]
@@ -131,8 +175,16 @@ class VaeImageProcessor(ConfigMixin):
 
     @staticmethod
     def numpy_to_pt(images: np.ndarray) -> torch.Tensor:
-        """
+        r"""
         Convert a NumPy image to a PyTorch tensor.
+
+        Args:
+            images (`np.ndarray`):
+                The NumPy image array to convert to PyTorch format.
+
+        Returns:
+            `torch.Tensor`:
+                A PyTorch tensor representation of the images.
         """
         if images.ndim == 3:
             images = images[..., None]
@@ -142,30 +194,62 @@ class VaeImageProcessor(ConfigMixin):
 
     @staticmethod
     def pt_to_numpy(images: torch.Tensor) -> np.ndarray:
-        """
+        r"""
         Convert a PyTorch tensor to a NumPy image.
+
+        Args:
+            images (`torch.Tensor`):
+                The PyTorch tensor to convert to NumPy format.
+
+        Returns:
+            `np.ndarray`:
+                A NumPy array representation of the images.
         """
         images = images.cpu().permute(0, 2, 3, 1).float().numpy()
         return images
 
     @staticmethod
     def normalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
-        """
+        r"""
         Normalize an image array to [-1,1].
+
+        Args:
+            images (`np.ndarray` or `torch.Tensor`):
+                The image array to normalize.
+
+        Returns:
+            `np.ndarray` or `torch.Tensor`:
+                The normalized image array.
         """
         return 2.0 * images - 1.0
 
     @staticmethod
     def denormalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
-        """
+        r"""
         Denormalize an image array to [0,1].
+
+        Args:
+            images (`np.ndarray` or `torch.Tensor`):
+                The image array to denormalize.
+
+        Returns:
+            `np.ndarray` or `torch.Tensor`:
+                The denormalized image array.
         """
         return (images / 2 + 0.5).clamp(0, 1)
 
     @staticmethod
     def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image:
-        """
+        r"""
         Converts a PIL image to RGB format.
+
+        Args:
+            image (`PIL.Image.Image`):
+                The PIL image to convert to RGB.
+
+        Returns:
+            `PIL.Image.Image`:
+                The RGB-converted PIL image.
         """
         image = image.convert("RGB")
 
@@ -173,8 +257,16 @@ class VaeImageProcessor(ConfigMixin):
 
     @staticmethod
     def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image:
-        """
-        Converts a PIL image to grayscale
+        r"""
+        Converts a given PIL image to grayscale.
+
+        Args:
+            image (`PIL.Image.Image`):
+                The input image to convert.
+
+        Returns:
+            `PIL.Image.Image`:
+                The image converted to grayscale.
         """
         image = image.convert("L")
 
@@ -182,8 +274,16 @@ class VaeImageProcessor(ConfigMixin):
 
     @staticmethod
    def blur(image: PIL.Image.Image, blur_factor: int = 4) -> PIL.Image.Image:
-        """
+        r"""
         Applies Gaussian blur to an image.
+
+        Args:
+            image (`PIL.Image.Image`):
+                The PIL image to convert to grayscale.
+
+        Returns:
+            `PIL.Image.Image`:
+                The grayscale-converted PIL image.
         """
         image = image.filter(ImageFilter.GaussianBlur(blur_factor))
 
@@ -191,7 +291,7 @@ class VaeImageProcessor(ConfigMixin):
 
     @staticmethod
     def get_crop_region(mask_image: PIL.Image.Image, width: int, height: int, pad=0):
-        """
+        r"""
         Finds a rectangular region that contains all masked ares in an image, and expands region to match the aspect
         ratio of the original image; for example, if user drew mask in a 128x32 region, and the dimensions for
         processing are 512x512, the region will be expanded to 128x128.
@@ -285,14 +385,21 @@ class VaeImageProcessor(ConfigMixin):
         width: int,
         height: int,
     ) -> PIL.Image.Image:
-        """
+        r"""
         Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
         the image within the dimensions, filling empty with data from image.
 
         Args:
-            image
-
-
+            image (`PIL.Image.Image`):
+                The image to resize and fill.
+            width (`int`):
+                The width to resize the image to.
+            height (`int`):
+                The height to resize the image to.
+
+        Returns:
+            `PIL.Image.Image`:
+                The resized and filled image.
         """
 
         ratio = width / height
@@ -330,14 +437,21 @@ class VaeImageProcessor(ConfigMixin):
         width: int,
         height: int,
     ) -> PIL.Image.Image:
-        """
+        r"""
         Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
         the image within the dimensions, cropping the excess.
 
         Args:
-            image
-
-
+            image (`PIL.Image.Image`):
+                The image to resize and crop.
+            width (`int`):
+                The width to resize the image to.
+            height (`int`):
+                The height to resize the image to.
+
+        Returns:
+            `PIL.Image.Image`:
+                The resized and cropped image.
         """
         ratio = width / height
         src_ratio = image.width / image.height
@@ -429,19 +543,23 @@ class VaeImageProcessor(ConfigMixin):
         height: Optional[int] = None,
         width: Optional[int] = None,
     ) -> Tuple[int, int]:
-        """
-
-        `vae_scale_factor`.
+        r"""
+        Returns the height and width of the image, downscaled to the next integer multiple of `vae_scale_factor`.
 
         Args:
-            image(`PIL.Image.Image
-                The image input, can be a PIL image,
-                shape `[batch, height, width]` or `[batch, height, width,
-                have shape `[batch,
-            height (`int`, *optional*, defaults to `None`):
-                The height
-            width (`int`, *optional
-                The width
+            image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
+                The image input, which can be a PIL image, NumPy array, or PyTorch tensor. If it is a NumPy array, it
+                should have shape `[batch, height, width]` or `[batch, height, width, channels]`. If it is a PyTorch
+                tensor, it should have shape `[batch, channels, height, width]`.
+            height (`Optional[int]`, *optional*, defaults to `None`):
+                The height of the preprocessed image. If `None`, the height of the `image` input will be used.
+            width (`Optional[int]`, *optional*, defaults to `None`):
+                The width of the preprocessed image. If `None`, the width of the `image` input will be used.
+
+        Returns:
+            `Tuple[int, int]`:
+                A tuple containing the height and width, both resized to the nearest integer multiple of
+                `vae_scale_factor`.
         """
 
         if height is None:
@@ -478,13 +596,13 @@ class VaeImageProcessor(ConfigMixin):
         Preprocess the image input.
 
         Args:
-            image (`
+            image (`PipelineImageInput`):
                 The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of
                 supported formats.
-            height (`int`, *optional
+            height (`int`, *optional*):
                 The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
                 height.
-            width (`int`, *optional
+            width (`int`, *optional*):
                 The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width.
             resize_mode (`str`, *optional*, defaults to `default`):
                 The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
@@ -496,6 +614,10 @@ class VaeImageProcessor(ConfigMixin):
                 supported for PIL image input.
             crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
                 The crop coordinates for each image in the batch. If `None`, will not crop the image.
+
+        Returns:
+            `torch.Tensor`:
+                The preprocessed image.
         """
         supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
 
@@ -569,7 +691,7 @@ class VaeImageProcessor(ConfigMixin):
 
             channel = image.shape[1]
             # don't need any preprocess if the image is latents
-            if channel == self.vae_latent_channels:
+            if channel == self.config.vae_latent_channels:
                 return image
 
         height, width = self.get_default_height_width(image, height, width)
@@ -655,8 +777,22 @@ class VaeImageProcessor(ConfigMixin):
         image: PIL.Image.Image,
         crop_coords: Optional[Tuple[int, int, int, int]] = None,
     ) -> PIL.Image.Image:
-        """
-        overlay the
+        r"""
+        Applies an overlay of the mask and the inpainted image on the original image.
+
+        Args:
+            mask (`PIL.Image.Image`):
+                The mask image that highlights regions to overlay.
+            init_image (`PIL.Image.Image`):
+                The original image to which the overlay is applied.
+            image (`PIL.Image.Image`):
+                The image to overlay onto the original.
+            crop_coords (`Tuple[int, int, int, int]`, *optional*):
+                Coordinates to crop the image. If provided, the image will be cropped accordingly.
+
+        Returns:
+            `PIL.Image.Image`:
+                The final image with the overlay applied.
         """
 
         width, height = image.width, image.height
@@ -713,8 +849,16 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
 
     @staticmethod
     def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
-        """
-        Convert a NumPy image or a batch of images to a PIL
+        r"""
+        Convert a NumPy image or a batch of images to a list of PIL images.
+
+        Args:
+            images (`np.ndarray`):
+                The input NumPy array of images, which can be a single image or a batch.
+
+        Returns:
+            `List[PIL.Image.Image]`:
+                A list of PIL images converted from the input NumPy array.
         """
         if images.ndim == 3:
             images = images[None, ...]
@@ -729,8 +873,16 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
 
     @staticmethod
     def depth_pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
-        """
+        r"""
         Convert a PIL image or a list of PIL images to NumPy arrays.
+
+        Args:
+            images (`Union[List[PIL.Image.Image], PIL.Image.Image]`):
+                The input image or list of images to be converted.
+
+        Returns:
+            `np.ndarray`:
+                A NumPy array of the converted images.
         """
         if not isinstance(images, list):
             images = [images]
@@ -741,18 +893,30 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
 
     @staticmethod
     def rgblike_to_depthmap(image: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
-        """
-
-        image: RGB-like depth image
+        r"""
+        Convert an RGB-like depth image to a depth map.
 
-
+        Args:
+            image (`Union[np.ndarray, torch.Tensor]`):
+                The RGB-like depth image to convert.
 
+        Returns:
+            `Union[np.ndarray, torch.Tensor]`:
+                The corresponding depth map.
         """
         return image[:, :, 1] * 2**8 + image[:, :, 2]
 
     def numpy_to_depth(self, images: np.ndarray) -> List[PIL.Image.Image]:
-        """
-        Convert a NumPy depth image or a batch of images to a PIL
+        r"""
+        Convert a NumPy depth image or a batch of images to a list of PIL images.
+
+        Args:
+            images (`np.ndarray`):
+                The input NumPy array of depth images, which can be a single image or a batch.
+
+        Returns:
+            `List[PIL.Image.Image]`:
+                A list of PIL images converted from the input NumPy depth images.
         """
         if images.ndim == 3:
             images = images[None, ...]
@@ -833,8 +997,24 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
         width: Optional[int] = None,
         target_res: Optional[int] = None,
     ) -> torch.Tensor:
-        """
-        Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors.
+        r"""
+        Preprocess the image input. Accepted formats are PIL images, NumPy arrays, or PyTorch tensors.
+
+        Args:
+            rgb (`Union[torch.Tensor, PIL.Image.Image, np.ndarray]`):
+                The RGB input image, which can be a single image or a batch.
+            depth (`Union[torch.Tensor, PIL.Image.Image, np.ndarray]`):
+                The depth input image, which can be a single image or a batch.
+            height (`Optional[int]`, *optional*, defaults to `None`):
+                The desired height of the processed image. If `None`, defaults to the height of the input image.
+            width (`Optional[int]`, *optional*, defaults to `None`):
+                The desired width of the processed image. If `None`, defaults to the width of the input image.
+            target_res (`Optional[int]`, *optional*, defaults to `None`):
+                Target resolution for resizing the images. If specified, overrides height and width.
+
+        Returns:
+            `Tuple[torch.Tensor, torch.Tensor]`:
+                A tuple containing the processed RGB and depth images as PyTorch tensors.
         """
         supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
 
@@ -1072,7 +1252,17 @@ class PixArtImageProcessor(VaeImageProcessor):
 
     @staticmethod
     def classify_height_width_bin(height: int, width: int, ratios: dict) -> Tuple[int, int]:
-        """
+        r"""
+        Returns the binned height and width based on the aspect ratio.
+
+        Args:
+            height (`int`): The height of the image.
+            width (`int`): The width of the image.
+            ratios (`dict`): A dictionary where keys are aspect ratios and values are tuples of (height, width).
+
+        Returns:
+            `Tuple[int, int]`: The closest binned height and width.
+        """
         ar = float(height / width)
         closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
         default_hw = ratios[closest_ratio]
@@ -1080,6 +1270,19 @@ class PixArtImageProcessor(VaeImageProcessor):
 
     @staticmethod
     def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int) -> torch.Tensor:
+        r"""
+        Resizes and crops a tensor of images to the specified dimensions.
+
+        Args:
+            samples (`torch.Tensor`):
+                A tensor of shape (N, C, H, W) where N is the batch size, C is the number of channels, H is the height,
+                and W is the width.
+            new_width (`int`): The desired width of the output images.
+            new_height (`int`): The desired height of the output images.
+
+        Returns:
+            `torch.Tensor`: A tensor containing the resized and cropped images.
+        """
        orig_height, orig_width = samples.shape[2], samples.shape[3]
 
        # Check if resizing is needed
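The bulk of this file's change is docstring coverage for the existing `VaeImageProcessor` conversion helpers (plus the `vae_latent_channels` lookup now going through `self.config`). A minimal sketch of how those static helpers round-trip formats, assuming only that `diffusers`, `numpy`, and `torch` are installed; the array shapes are illustrative:

import numpy as np
import torch

from diffusers.image_processor import VaeImageProcessor

# A fake batch of two 64x64 RGB images in [0, 1], channels-last as the NumPy helpers expect.
images_np = np.random.rand(2, 64, 64, 3).astype(np.float32)

pil_images = VaeImageProcessor.numpy_to_pil(images_np)    # list of PIL.Image.Image
images_pt = VaeImageProcessor.numpy_to_pt(images_np)      # torch.Tensor, shape (2, 3, 64, 64)
normalized = VaeImageProcessor.normalize(images_pt)       # values rescaled to [-1, 1]
denormalized = VaeImageProcessor.denormalize(normalized)  # back to [0, 1], clamped
round_trip = VaeImageProcessor.pt_to_numpy(denormalized)  # back to channels-last NumPy

print(type(pil_images[0]), images_pt.shape, round_trip.shape)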
diffusers/loaders/__init__.py
CHANGED
@@ -67,6 +67,7 @@ if is_torch_available():
             "StableDiffusionXLLoraLoaderMixin",
             "LoraLoaderMixin",
             "FluxLoraLoaderMixin",
+            "CogVideoXLoraLoaderMixin",
         ]
         _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"]
         _import_structure["ip_adapter"] = ["IPAdapterMixin"]
@@ -84,6 +85,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         from .ip_adapter import IPAdapterMixin
         from .lora_pipeline import (
             AmusedLoraLoaderMixin,
+            CogVideoXLoraLoaderMixin,
             FluxLoraLoaderMixin,
             LoraLoaderMixin,
             SD3LoraLoaderMixin,
diffusers/loaders/ip_adapter.py
CHANGED
@@ -224,7 +224,11 @@ class IPAdapterMixin:
 
         # create feature extractor if it has not been registered to the pipeline yet
         if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
-
+            # FaceID IP adapters don't need the image encoder so it's not present, in this case we default to 224
+            default_clip_size = 224
+            clip_image_size = (
+                self.image_encoder.config.image_size if self.image_encoder is not None else default_clip_size
+            )
             feature_extractor = CLIPImageProcessor(size=clip_image_size, crop_size=clip_image_size)
             self.register_modules(feature_extractor=feature_extractor)
 
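The fix guards against `self.image_encoder` being `None` (the FaceID IP-Adapter case) before reading its `image_size`, falling back to 224. A standalone sketch of the same fallback pattern, using hypothetical stand-in classes rather than a real pipeline:

from dataclasses import dataclass


@dataclass
class _EncoderConfig:  # stand-in for image_encoder.config
    image_size: int = 336


class _Encoder:  # stand-in for a CLIP vision encoder
    config = _EncoderConfig()


def resolve_clip_image_size(image_encoder) -> int:
    # FaceID IP adapters ship without an image encoder; fall back to the CLIP default of 224.
    default_clip_size = 224
    return image_encoder.config.image_size if image_encoder is not None else default_clip_size


print(resolve_clip_image_size(_Encoder()))  # 336, read from the encoder config
print(resolve_clip_image_size(None))        # 224, the FaceID fallback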
diffusers/loaders/lora_base.py
CHANGED
@@ -532,13 +532,19 @@ class LoraBaseMixin:
         )
 
         list_adapters = self.get_list_adapters() # eg {"unet": ["adapter1", "adapter2"], "text_encoder": ["adapter2"]}
-
-
-
+        # eg ["adapter1", "adapter2"]
+        all_adapters = {adapter for adapters in list_adapters.values() for adapter in adapters}
+        missing_adapters = set(adapter_names) - all_adapters
+        if len(missing_adapters) > 0:
+            raise ValueError(
+                f"Adapter name(s) {missing_adapters} not in the list of present adapters: {all_adapters}."
+            )
+
+        # eg {"adapter1": ["unet"], "adapter2": ["unet", "text_encoder"]}
         invert_list_adapters = {
             adapter: [part for part, adapters in list_adapters.items() if adapter in adapters]
             for adapter in all_adapters
-        }
+        }
 
         # Decompose weights into weights for denoiser and text encoders.
         _component_adapter_weights = {}
@@ -699,9 +705,10 @@ class LoraBaseMixin:
                             module.lora_B[adapter_name].to(device)
                             # this is a param, not a module, so device placement is not in-place -> re-assign
                             if hasattr(module, "lora_magnitude_vector") and module.lora_magnitude_vector is not None:
-
-                                    adapter_name
-
+                                if adapter_name in module.lora_magnitude_vector:
+                                    module.lora_magnitude_vector[adapter_name] = module.lora_magnitude_vector[
+                                        adapter_name
+                                    ].to(device)
 
     @staticmethod
     def pack_weights(layers, prefix):
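The first hunk adds an explicit validation step to what appears to be the `set_adapters` path: requested adapter names are checked against every adapter attached to any component before the weights are decomposed. A small, self-contained sketch of the same set arithmetic; the adapter and component names are illustrative:

from typing import Dict, List


def validate_adapter_names(adapter_names: List[str], list_adapters: Dict[str, List[str]]) -> None:
    # e.g. list_adapters = {"unet": ["adapter1", "adapter2"], "text_encoder": ["adapter2"]}
    all_adapters = {adapter for adapters in list_adapters.values() for adapter in adapters}
    missing_adapters = set(adapter_names) - all_adapters
    if missing_adapters:
        raise ValueError(
            f"Adapter name(s) {missing_adapters} not in the list of present adapters: {all_adapters}."
        )


validate_adapter_names(["adapter1"], {"unet": ["adapter1", "adapter2"], "text_encoder": ["adapter2"]})  # passes
try:
    validate_adapter_names(["adapter3"], {"unet": ["adapter1", "adapter2"]})
except ValueError as err:
    print(err)  # adapter3 is not attached anywhere, so the new check raises

The second hunk appears to add a matching membership guard around the `lora_magnitude_vector` re-assignment, so device moves only touch adapters that are actually present on the module.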