diffusers 0.28.2__py3-none-any.whl → 0.29.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +15 -1
- diffusers/commands/env.py +1 -5
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +2 -1
- diffusers/loaders/__init__.py +2 -2
- diffusers/loaders/lora.py +406 -140
- diffusers/loaders/lora_conversion_utils.py +7 -1
- diffusers/loaders/single_file.py +13 -1
- diffusers/loaders/single_file_model.py +15 -8
- diffusers/loaders/single_file_utils.py +267 -17
- diffusers/loaders/unet.py +307 -272
- diffusers/models/__init__.py +7 -3
- diffusers/models/attention.py +125 -1
- diffusers/models/attention_processor.py +169 -1
- diffusers/models/autoencoders/__init__.py +1 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
- diffusers/models/autoencoders/autoencoder_kl.py +17 -6
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -2
- diffusers/models/autoencoders/consistency_decoder_vae.py +9 -9
- diffusers/models/autoencoders/vq_model.py +182 -0
- diffusers/models/controlnet_sd3.py +418 -0
- diffusers/models/controlnet_xs.py +6 -6
- diffusers/models/embeddings.py +112 -84
- diffusers/models/model_loading_utils.py +55 -0
- diffusers/models/modeling_utils.py +138 -20
- diffusers/models/normalization.py +11 -6
- diffusers/models/transformers/__init__.py +1 -0
- diffusers/models/transformers/dual_transformer_2d.py +5 -4
- diffusers/models/transformers/hunyuan_transformer_2d.py +149 -2
- diffusers/models/transformers/prior_transformer.py +5 -5
- diffusers/models/transformers/transformer_2d.py +2 -2
- diffusers/models/transformers/transformer_sd3.py +353 -0
- diffusers/models/transformers/transformer_temporal.py +12 -10
- diffusers/models/unets/unet_1d.py +3 -3
- diffusers/models/unets/unet_2d.py +3 -3
- diffusers/models/unets/unet_2d_condition.py +4 -15
- diffusers/models/unets/unet_3d_condition.py +5 -17
- diffusers/models/unets/unet_i2vgen_xl.py +4 -4
- diffusers/models/unets/unet_motion_model.py +4 -4
- diffusers/models/unets/unet_spatio_temporal_condition.py +3 -3
- diffusers/models/vq_model.py +8 -165
- diffusers/pipelines/__init__.py +11 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +4 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +4 -3
- diffusers/pipelines/auto_pipeline.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +4 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +4 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +4 -3
- diffusers/pipelines/controlnet_sd3/__init__.py +53 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +1062 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +4 -3
- diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +4 -3
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +24 -5
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +4 -3
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +4 -3
- diffusers/pipelines/marigold/marigold_image_processing.py +35 -20
- diffusers/pipelines/pia/pipeline_pia.py +4 -3
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +17 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -6
- diffusers/pipelines/stable_diffusion_3/__init__.py +52 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +904 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +941 -0
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +4 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +10 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +4 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +4 -3
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +4 -3
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +4 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +4 -3
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +4 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -3
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +4 -3
- diffusers/schedulers/__init__.py +2 -0
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -3
- diffusers/schedulers/scheduling_edm_euler.py +2 -4
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +287 -0
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/training_utils.py +4 -4
- diffusers/utils/__init__.py +3 -0
- diffusers/utils/constants.py +2 -0
- diffusers/utils/dummy_pt_objects.py +60 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +45 -0
- diffusers/utils/dynamic_modules_utils.py +15 -13
- diffusers/utils/hub_utils.py +106 -0
- diffusers/utils/import_utils.py +0 -1
- diffusers/utils/logging.py +3 -1
- diffusers/utils/state_dict_utils.py +2 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/METADATA +3 -3
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/RECORD +112 -112
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/WHEEL +1 -1
- diffusers/models/dual_transformer_2d.py +0 -20
- diffusers/models/prior_transformer.py +0 -12
- diffusers/models/t5_film_transformer.py +0 -70
- diffusers/models/transformer_2d.py +0 -25
- diffusers/models/transformer_temporal.py +0 -34
- diffusers/models/unet_1d.py +0 -26
- diffusers/models/unet_1d_blocks.py +0 -203
- diffusers/models/unet_2d.py +0 -27
- diffusers/models/unet_2d_blocks.py +0 -375
- diffusers/models/unet_2d_condition.py +0 -25
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/LICENSE +0 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/top_level.txt +0 -0
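
The headline of this release is Stable Diffusion 3 support: a new SD3Transformer2DModel (transformer_sd3.py) and SD3ControlNetModel (controlnet_sd3.py), the StableDiffusion3Pipeline / StableDiffusion3Img2ImgPipeline / StableDiffusion3ControlNetPipeline pipelines, and the FlowMatchEulerDiscreteScheduler. For orientation only, a minimal text-to-image call against the new pipeline looks like the sketch below; the checkpoint id is an assumption, not something this diff pins down.

    # Minimal SD3 text-to-image sketch against diffusers 0.29.x.
    import torch
    from diffusers import StableDiffusion3Pipeline  # exported at the top level in 0.29.x

    pipe = StableDiffusion3Pipeline.from_pretrained(
        "stabilityai/stable-diffusion-3-medium-diffusers",  # assumed checkpoint id
        torch_dtype=torch.float16,
    )
    pipe.to("cuda")

    # SD3 pipelines are wired to the new FlowMatchEulerDiscreteScheduler,
    # so no scheduler swap is needed for basic generation.
    image = pipe("a photo of a corgi wearing a spacesuit", num_inference_steps=28).images[0]
    image.save("corgi.png")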
diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py
CHANGED
@@ -390,9 +390,10 @@ class StableDiffusionControlNetXSPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
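
The hunk above is the first instance of a change that repeats across most of the pipeline files below: unscale_lora_layers is now only called when the pipeline actually has a text encoder, so pipelines constructed with text_encoder=None no longer fail at the end of prompt encoding. A minimal sketch of the failure mode, with hypothetical stand-in names:

    # Hypothetical stand-ins to illustrate the guard; not diffusers code.
    class EncoderlessPipeline:
        text_encoder = None  # e.g. a pipeline loaded with text_encoder=None

    def unscale_lora_layers(model, scale):
        model.modules()  # raises AttributeError when model is None

    pipe = EncoderlessPipeline()

    # 0.28.x shape of the code: unconditional call -> AttributeError
    # unscale_lora_layers(pipe.text_encoder, 1.0)

    # 0.29.x shape of the code: guarded, encoder-less pipelines pass through
    if pipe.text_encoder is not None:
        unscale_lora_layers(pipe.text_encoder, 1.0)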
diffusers/pipelines/deepfloyd_if/watermark.py
CHANGED
@@ -17,7 +17,7 @@ class IFWatermarker(ModelMixin, ConfigMixin):
         self.watermark_image_as_pil = None

     def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None):
-        #
+        # Copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287

         h = images[0].height
         w = images[0].width
diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py
CHANGED
@@ -456,9 +456,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py
CHANGED
@@ -426,9 +426,10 @@ class StableDiffusionInpaintPipelineLegacy(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py
CHANGED
@@ -364,9 +364,10 @@ class StableDiffusionModelEditingPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py
CHANGED
@@ -355,9 +355,10 @@ class StableDiffusionParadigmsPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py
CHANGED
@@ -578,9 +578,10 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py
CHANGED
@@ -52,7 +52,9 @@ EXAMPLE_DOC_STRING = """
         >>> import torch
         >>> from diffusers import HunyuanDiTPipeline

-        >>> pipe = HunyuanDiTPipeline.from_pretrained("Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16)
+        >>> pipe = HunyuanDiTPipeline.from_pretrained(
+        ...     "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
+        ... )
         >>> pipe.to("cuda")

         >>> # You may also use English prompt as HunyuanDiT supports both English and Chinese

@@ -226,16 +228,22 @@ class HunyuanDiTPipeline(DiffusionPipeline):
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )

-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
-        self.default_sample_size = self.transformer.config.sample_size
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
+        )

     def encode_prompt(
         self,
         prompt: str,
-        device: torch.device,
-        dtype: torch.dtype,
+        device: torch.device = None,
+        dtype: torch.dtype = None,
         num_images_per_prompt: int = 1,
         do_classifier_free_guidance: bool = True,
         negative_prompt: Optional[str] = None,

@@ -279,6 +287,17 @@ class HunyuanDiTPipeline(DiffusionPipeline):
             text_encoder_index (`int`, *optional*):
                 Index of the text encoder to use. `0` for clip and `1` for T5.
         """
+        if dtype is None:
+            if self.text_encoder_2 is not None:
+                dtype = self.text_encoder_2.dtype
+            elif self.transformer is not None:
+                dtype = self.transformer.dtype
+            else:
+                dtype = None
+
+        if device is None:
+            device = self._execution_device
+
         tokenizers = [self.tokenizer, self.tokenizer_2]
         text_encoders = [self.text_encoder, self.text_encoder_2]
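
Both HunyuanDiT constructor attributes now tolerate missing components, and encode_prompt derives device and dtype itself when the caller omits them. With a standard SD-style VAE whose block_out_channels has four entries, the computed scale factor equals the hard-coded fallback, as the arithmetic below checks (the config values are the usual SD VAE defaults, assumed here for illustration):

    # Worked check of the vae_scale_factor fallback.
    block_out_channels = [128, 256, 512, 512]  # assumed standard SD VAE config
    vae_scale_factor = 2 ** (len(block_out_channels) - 1)
    assert vae_scale_factor == 8  # matches the `else 8` branch when vae is None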
diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
CHANGED
@@ -405,9 +405,10 @@ class LatentConsistencyModelImg2ImgPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
CHANGED
@@ -389,9 +389,10 @@ class LatentConsistencyModelPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/marigold/marigold_image_processing.py
CHANGED
@@ -245,9 +245,9 @@ class MarigoldImageProcessor(ConfigMixin):
     ) -> Union[np.ndarray, torch.Tensor]:
         """
         Converts a monochrome image into an RGB image by applying the specified colormap. This function mimics the
-        behavior of matplotlib.colormaps, but allows the user to use the most discriminative color
-        without having to install or import matplotlib. For all other cases, the function will attempt to use
-        native implementation.
+        behavior of matplotlib.colormaps, but allows the user to use the most discriminative color maps ("Spectral",
+        "binary") without having to install or import matplotlib. For all other cases, the function will attempt to use
+        the native implementation.

         Args:
             image: 2D tensor of values between 0 and 1, either as np.ndarray or torch.Tensor.

@@ -255,7 +255,7 @@ class MarigoldImageProcessor(ConfigMixin):
             bytes: Whether to return the output as uint8 or floating point image.
             _force_method:
                 Can be used to specify whether to use the native implementation (`"matplotlib"`), the efficient custom
-                implementation of the
+                implementation of the select color maps (`"custom"`), or rely on autodetection (`None`, default).

         Returns:
             An RGB-colorized tensor corresponding to the input image.

@@ -265,6 +265,26 @@ class MarigoldImageProcessor(ConfigMixin):
         if _force_method not in (None, "matplotlib", "custom"):
             raise ValueError("_force_method must be either `None`, `'matplotlib'` or `'custom'`.")

+        supported_cmaps = {
+            "binary": [
+                (1.0, 1.0, 1.0),
+                (0.0, 0.0, 0.0),
+            ],
+            "Spectral": [  # Taken from matplotlib/_cm.py
+                (0.61960784313725492, 0.003921568627450980, 0.25882352941176473),  # 0.0 -> [0]
+                (0.83529411764705885, 0.24313725490196078, 0.30980392156862746),
+                (0.95686274509803926, 0.42745098039215684, 0.2627450980392157),
+                (0.99215686274509807, 0.68235294117647061, 0.38039215686274508),
+                (0.99607843137254903, 0.8784313725490196, 0.54509803921568623),
+                (1.0, 1.0, 0.74901960784313726),
+                (0.90196078431372551, 0.96078431372549022, 0.59607843137254901),
+                (0.6705882352941176, 0.8666666666666667, 0.64313725490196083),
+                (0.4, 0.76078431372549016, 0.6470588235294118),
+                (0.19607843137254902, 0.53333333333333333, 0.74117647058823533),
+                (0.36862745098039218, 0.30980392156862746, 0.63529411764705879),  # 1.0 -> [K-1]
+            ],
+        }
+
         def method_matplotlib(image, cmap, bytes=False):
             if is_matplotlib_available():
                 import matplotlib

@@ -298,24 +318,19 @@ class MarigoldImageProcessor(ConfigMixin):
             else:
                 image = image.float()

-            if cmap != "Spectral":
-                raise ValueError("Only 'Spectral' color map is available without installing matplotlib.")
+            is_cmap_reversed = cmap.endswith("_r")
+            if is_cmap_reversed:
+                cmap = cmap[:-2]

-            _Spectral_data = (  # Taken from matplotlib/_cm.py
-                (0.61960784313725492, 0.003921568627450980, 0.25882352941176473),  # 0.0 -> [0]
-                (0.83529411764705885, 0.24313725490196078, 0.30980392156862746),
-                (0.95686274509803926, 0.42745098039215684, 0.2627450980392157),
-                (0.99215686274509807, 0.68235294117647061, 0.38039215686274508),
-                (0.99607843137254903, 0.8784313725490196, 0.54509803921568623),
-                (1.0, 1.0, 0.74901960784313726),
-                (0.90196078431372551, 0.96078431372549022, 0.59607843137254901),
-                (0.6705882352941176, 0.8666666666666667, 0.64313725490196083),
-                (0.4, 0.76078431372549016, 0.6470588235294118),
-                (0.19607843137254902, 0.53333333333333333, 0.74117647058823533),
-                (0.36862745098039218, 0.30980392156862746, 0.63529411764705879),  # 1.0 -> [K-1]
-            )
+            if cmap not in supported_cmaps:
+                raise ValueError(
+                    f"Only {list(supported_cmaps.keys())} color maps are available without installing matplotlib."
+                )

-            cmap = torch.tensor(_Spectral_data, dtype=torch.float, device=image.device)  # [K,3]
+            cmap = supported_cmaps[cmap]
+            if is_cmap_reversed:
+                cmap = cmap[::-1]
+            cmap = torch.tensor(cmap, dtype=torch.float, device=image.device)  # [K,3]
             K = cmap.shape[0]

             pos = image.clamp(min=0, max=1) * (K - 1)
diffusers/pipelines/pia/pipeline_pia.py
CHANGED
@@ -375,9 +375,10 @@ class PIAPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
CHANGED
@@ -394,7 +394,7 @@ class PixArtAlphaPipeline(DiffusionPipeline):

         # get unconditional embeddings for classifier free guidance
         if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens = [negative_prompt] * batch_size
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
             uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
             max_length = prompt_embeds.shape[1]
             uncond_input = self.tokenizer(
diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
CHANGED
@@ -320,7 +320,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):

         # get unconditional embeddings for classifier free guidance
         if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens = [negative_prompt] * batch_size
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
             uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
             max_length = prompt_embeds.shape[1]
             uncond_input = self.tokenizer(
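
The identical one-line fix in both PixArt pipelines repairs batched negative prompts; previously a list was wrapped again into a list of lists before tokenization:

    # Before 0.29.x: a per-image negative prompt list got double-wrapped.
    negative_prompt = ["blurry", "low quality"]  # one per image in the batch
    batch_size = 2

    old = [negative_prompt] * batch_size  # [[...], [...]] -- nested, wrong
    new = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
    assert new == ["blurry", "low quality"]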
diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
CHANGED
@@ -376,6 +376,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):

         # 2. Define call parameters
         batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device

         if editing_prompt:
             enable_edit_guidance = True

@@ -405,7 +406,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
             text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-            text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+            text_embeddings = self.text_encoder(text_input_ids.to(device))[0]

             # duplicate text embeddings for each generation per prompt, using mps friendly method
             bs_embed, seq_len, _ = text_embeddings.shape

@@ -433,9 +434,9 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
             edit_concepts_input_ids = edit_concepts_input_ids[:, : self.tokenizer.model_max_length]
-            edit_concepts = self.text_encoder(edit_concepts_input_ids.to(self.device))[0]
+            edit_concepts = self.text_encoder(edit_concepts_input_ids.to(device))[0]
         else:
-            edit_concepts = editing_prompt_embeddings.to(self.device).repeat(batch_size, 1, 1)
+            edit_concepts = editing_prompt_embeddings.to(device).repeat(batch_size, 1, 1)

         # duplicate text embeddings for each generation per prompt, using mps friendly method
         bs_embed_edit, seq_len_edit, _ = edit_concepts.shape

@@ -476,7 +477,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                 truncation=True,
                 return_tensors="pt",
             )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]

             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]

@@ -493,7 +494,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
         # get the initial random noise unless the user supplied it

         # 4. Prepare timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps = self.scheduler.timesteps

         # 5. Prepare latent variables

@@ -504,7 +505,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
             height,
             width,
             text_embeddings.dtype,
-            self.device,
+            device,
             generator,
             latents,
         )

@@ -562,12 +563,12 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                 if enable_edit_guidance:
                     concept_weights = torch.zeros(
                         (len(noise_pred_edit_concepts), noise_guidance.shape[0]),
-                        device=self.device,
+                        device=device,
                         dtype=noise_guidance.dtype,
                     )
                     noise_guidance_edit = torch.zeros(
                         (len(noise_pred_edit_concepts), *noise_guidance.shape),
-                        device=self.device,
+                        device=device,
                         dtype=noise_guidance.dtype,
                     )
                     # noise_guidance_edit = torch.zeros_like(noise_guidance)

@@ -644,21 +645,19 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):

                         # noise_guidance_edit = noise_guidance_edit + noise_guidance_edit_tmp

-                    warmup_inds = torch.tensor(warmup_inds).to(self.device)
+                    warmup_inds = torch.tensor(warmup_inds).to(device)
                     if len(noise_pred_edit_concepts) > warmup_inds.shape[0] > 0:
                         concept_weights = concept_weights.to("cpu")  # Offload to cpu
                         noise_guidance_edit = noise_guidance_edit.to("cpu")

-                        concept_weights_tmp = torch.index_select(concept_weights.to(self.device), 0, warmup_inds)
+                        concept_weights_tmp = torch.index_select(concept_weights.to(device), 0, warmup_inds)
                         concept_weights_tmp = torch.where(
                             concept_weights_tmp < 0, torch.zeros_like(concept_weights_tmp), concept_weights_tmp
                         )
                         concept_weights_tmp = concept_weights_tmp / concept_weights_tmp.sum(dim=0)
                         # concept_weights_tmp = torch.nan_to_num(concept_weights_tmp)

-                        noise_guidance_edit_tmp = torch.index_select(
-                            noise_guidance_edit.to(self.device), 0, warmup_inds
-                        )
+                        noise_guidance_edit_tmp = torch.index_select(noise_guidance_edit.to(device), 0, warmup_inds)
                         noise_guidance_edit_tmp = torch.einsum(
                             "cb,cbijk->bijk", concept_weights_tmp, noise_guidance_edit_tmp
                         )

@@ -669,8 +668,8 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):

                         del noise_guidance_edit_tmp
                         del concept_weights_tmp
-                        concept_weights = concept_weights.to(self.device)
-                        noise_guidance_edit = noise_guidance_edit.to(self.device)
+                        concept_weights = concept_weights.to(device)
+                        noise_guidance_edit = noise_guidance_edit.to(device)

                     concept_weights = torch.where(
                         concept_weights < 0, torch.zeros_like(concept_weights), concept_weights

@@ -679,6 +678,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                     concept_weights = torch.nan_to_num(concept_weights)

                     noise_guidance_edit = torch.einsum("cb,cbijk->bijk", concept_weights, noise_guidance_edit)
+                    noise_guidance_edit = noise_guidance_edit.to(edit_momentum.device)

                     noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * edit_momentum

@@ -689,7 +689,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                     self.sem_guidance[i] = noise_guidance_edit.detach().cpu()

                 if sem_guidance is not None:
-                    edit_guidance = sem_guidance[i].to(self.device)
+                    edit_guidance = sem_guidance[i].to(device)
                     noise_guidance = noise_guidance + edit_guidance

                 noise_pred = noise_pred_uncond + noise_guidance

@@ -705,7 +705,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
         # 8. Post-processing
         if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
-            image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype)
+            image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
        else:
            image = latents
            has_nsfw_concept = None
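
All of the semantic-guidance hunks make the same substitution: self.device becomes a device variable captured once from self._execution_device, which resolves to the device the forward pass will actually run on even when hooks keep modules offloaded to CPU. That makes the pipeline usable with model offloading, roughly as sketched here (the checkpoint id is an assumption):

    import torch
    from diffusers import SemanticStableDiffusionPipeline

    pipe = SemanticStableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # assumed base checkpoint
        torch_dtype=torch.float16,
    )
    pipe.enable_model_cpu_offload()  # modules stay on CPU until needed

    # Under offload, naive self.device would report "cpu"; _execution_device
    # follows the offload hooks, so intermediate tensors land on the GPU.
    out = pipe("a castle on a hill", editing_prompt=["oil painting"], num_inference_steps=20)
    out.images[0].save("castle.png")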
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
CHANGED
@@ -474,9 +474,10 @@ class StableDiffusionPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
CHANGED
@@ -357,9 +357,10 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds

@@ -545,7 +546,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader

         if depth_map is None:
             pixel_values = self.feature_extractor(images=image, return_tensors="pt").pixel_values
-            pixel_values = pixel_values.to(device=device)
+            pixel_values = pixel_values.to(device=device, dtype=dtype)
             # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16.
             # So we use `torch.autocast` here for half precision inference.
             if torch.backends.mps.is_available():
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
CHANGED
@@ -517,9 +517,10 @@ class StableDiffusionImg2ImgPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
CHANGED
@@ -589,9 +589,10 @@ class StableDiffusionInpaintPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
CHANGED
@@ -377,9 +377,10 @@ class StableDiffusionUpscalePipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
CHANGED
@@ -458,9 +458,10 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
CHANGED
@@ -51,8 +51,8 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import StableUnCLIPImg2ImgPipeline

         >>> pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
-        ...     "
-        ... )
+        ...     "stabilityai/stable-diffusion-2-1-unclip-small", torch_dtype=torch.float16
+        ... )
         >>> pipe = pipe.to("cuda")

         >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"

@@ -63,7 +63,7 @@ EXAMPLE_DOC_STRING = """

         >>> prompt = "A fantasy landscape, trending on artstation"

-        >>> images = pipe(
+        >>> images = pipe(init_image, prompt).images
         >>> images[0].save("fantasy_landscape.png")
         ```
         """

@@ -422,9 +422,10 @@ class StableUnCLIPImg2ImgPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/stable_diffusion_3/__init__.py
ADDED
@@ -0,0 +1,52 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["StableDiffusion3PipelineOutput"]}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_3"] = ["StableDiffusion3Pipeline"]
+    _import_structure["pipeline_stable_diffusion_3_img2img"] = ["StableDiffusion3Img2ImgPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_stable_diffusion_3 import StableDiffusion3Pipeline
+        from .pipeline_stable_diffusion_3_img2img import StableDiffusion3Img2ImgPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
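
The new subpackage follows the _LazyModule convention used across diffusers, so the SD3 pipelines resolve from either import path and the submodule is only materialized on first attribute access:

    # Top-level exports, added to diffusers/__init__.py in this release:
    from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline

    # Equivalent, through the subpackage registered above:
    from diffusers.pipelines.stable_diffusion_3 import (
        StableDiffusion3Img2ImgPipeline,
        StableDiffusion3Pipeline,
    )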