diffusers 0.28.2__py3-none-any.whl → 0.29.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. diffusers/__init__.py +15 -1
  2. diffusers/commands/env.py +1 -5
  3. diffusers/dependency_versions_table.py +1 -1
  4. diffusers/image_processor.py +2 -1
  5. diffusers/loaders/__init__.py +2 -2
  6. diffusers/loaders/lora.py +406 -140
  7. diffusers/loaders/lora_conversion_utils.py +7 -1
  8. diffusers/loaders/single_file.py +13 -1
  9. diffusers/loaders/single_file_model.py +15 -8
  10. diffusers/loaders/single_file_utils.py +267 -17
  11. diffusers/loaders/unet.py +307 -272
  12. diffusers/models/__init__.py +7 -3
  13. diffusers/models/attention.py +125 -1
  14. diffusers/models/attention_processor.py +169 -1
  15. diffusers/models/autoencoders/__init__.py +1 -0
  16. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  17. diffusers/models/autoencoders/autoencoder_kl.py +17 -6
  18. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -2
  19. diffusers/models/autoencoders/consistency_decoder_vae.py +9 -9
  20. diffusers/models/autoencoders/vq_model.py +182 -0
  21. diffusers/models/controlnet_sd3.py +418 -0
  22. diffusers/models/controlnet_xs.py +6 -6
  23. diffusers/models/embeddings.py +112 -84
  24. diffusers/models/model_loading_utils.py +55 -0
  25. diffusers/models/modeling_utils.py +138 -20
  26. diffusers/models/normalization.py +11 -6
  27. diffusers/models/transformers/__init__.py +1 -0
  28. diffusers/models/transformers/dual_transformer_2d.py +5 -4
  29. diffusers/models/transformers/hunyuan_transformer_2d.py +149 -2
  30. diffusers/models/transformers/prior_transformer.py +5 -5
  31. diffusers/models/transformers/transformer_2d.py +2 -2
  32. diffusers/models/transformers/transformer_sd3.py +353 -0
  33. diffusers/models/transformers/transformer_temporal.py +12 -10
  34. diffusers/models/unets/unet_1d.py +3 -3
  35. diffusers/models/unets/unet_2d.py +3 -3
  36. diffusers/models/unets/unet_2d_condition.py +4 -15
  37. diffusers/models/unets/unet_3d_condition.py +5 -17
  38. diffusers/models/unets/unet_i2vgen_xl.py +4 -4
  39. diffusers/models/unets/unet_motion_model.py +4 -4
  40. diffusers/models/unets/unet_spatio_temporal_condition.py +3 -3
  41. diffusers/models/vq_model.py +8 -165
  42. diffusers/pipelines/__init__.py +11 -0
  43. diffusers/pipelines/animatediff/pipeline_animatediff.py +4 -3
  44. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +4 -3
  45. diffusers/pipelines/auto_pipeline.py +8 -0
  46. diffusers/pipelines/controlnet/pipeline_controlnet.py +4 -3
  47. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +4 -3
  48. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +4 -3
  49. diffusers/pipelines/controlnet_sd3/__init__.py +53 -0
  50. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +1062 -0
  51. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +4 -3
  52. diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
  53. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +4 -3
  54. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +4 -3
  55. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +4 -3
  56. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +4 -3
  57. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +4 -3
  58. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +24 -5
  59. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +4 -3
  60. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +4 -3
  61. diffusers/pipelines/marigold/marigold_image_processing.py +35 -20
  62. diffusers/pipelines/pia/pipeline_pia.py +4 -3
  63. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
  64. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
  65. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +17 -17
  66. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -3
  67. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  68. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -3
  69. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -3
  70. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +4 -3
  71. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +4 -3
  72. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -6
  73. diffusers/pipelines/stable_diffusion_3/__init__.py +52 -0
  74. diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
  75. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +904 -0
  76. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +941 -0
  77. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +4 -3
  78. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +10 -11
  79. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +4 -3
  80. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +4 -3
  81. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +4 -3
  82. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +4 -3
  83. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +4 -3
  84. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +4 -3
  85. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +4 -3
  86. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +4 -3
  87. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +4 -3
  88. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -3
  89. diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
  90. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +4 -3
  91. diffusers/schedulers/__init__.py +2 -0
  92. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
  93. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -3
  94. diffusers/schedulers/scheduling_edm_euler.py +2 -4
  95. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +287 -0
  96. diffusers/schedulers/scheduling_lms_discrete.py +2 -2
  97. diffusers/training_utils.py +4 -4
  98. diffusers/utils/__init__.py +3 -0
  99. diffusers/utils/constants.py +2 -0
  100. diffusers/utils/dummy_pt_objects.py +60 -0
  101. diffusers/utils/dummy_torch_and_transformers_objects.py +45 -0
  102. diffusers/utils/dynamic_modules_utils.py +15 -13
  103. diffusers/utils/hub_utils.py +106 -0
  104. diffusers/utils/import_utils.py +0 -1
  105. diffusers/utils/logging.py +3 -1
  106. diffusers/utils/state_dict_utils.py +2 -0
  107. {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/METADATA +3 -3
  108. {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/RECORD +112 -112
  109. {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/WHEEL +1 -1
  110. diffusers/models/dual_transformer_2d.py +0 -20
  111. diffusers/models/prior_transformer.py +0 -12
  112. diffusers/models/t5_film_transformer.py +0 -70
  113. diffusers/models/transformer_2d.py +0 -25
  114. diffusers/models/transformer_temporal.py +0 -34
  115. diffusers/models/unet_1d.py +0 -26
  116. diffusers/models/unet_1d_blocks.py +0 -203
  117. diffusers/models/unet_2d.py +0 -27
  118. diffusers/models/unet_2d_blocks.py +0 -375
  119. diffusers/models/unet_2d_condition.py +0 -25
  120. {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/LICENSE +0 -0
  121. {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/entry_points.txt +0 -0
  122. {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/top_level.txt +0 -0
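The headline change in this release is Stable Diffusion 3 support: a new SD3 transformer (transformer_sd3.py), a flow-matching Euler scheduler (scheduling_flow_match_euler_discrete.py), the stable_diffusion_3 pipeline package, and an SD3 ControlNet. A minimal usage sketch of the new pipeline; the checkpoint id is an assumption for illustration, substitute any SD3 checkpoint in diffusers format:

    import torch
    from diffusers import StableDiffusion3Pipeline

    # Checkpoint id is an assumption; any SD3 checkpoint in diffusers format
    # should work here.
    pipe = StableDiffusion3Pipeline.from_pretrained(
        "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
    )
    pipe.to("cuda")

    image = pipe(
        "a photo of an astronaut riding a horse on mars",
        num_inference_steps=28,
        guidance_scale=7.0,
    ).images[0]
    image.save("sd3.png")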
@@ -390,9 +390,10 @@ class StableDiffusionControlNetXSPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
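This hunk is the most common change in the release: the `unscale_lora_layers` call is wrapped in a `self.text_encoder is not None` guard, and the same fix is repeated across roughly twenty pipelines below. It matters when a pipeline is assembled without a text encoder and driven purely by precomputed embeddings. A hedged sketch of that scenario (model id and embedding shapes are illustrative assumptions):

    import torch
    from diffusers import StableDiffusionPipeline

    # The guard ensures encode_prompt never hands a missing text encoder to
    # unscale_lora_layers in this configuration.
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", text_encoder=None, tokenizer=None
    )
    prompt_embeds = torch.randn(1, 77, 768)           # stand-in for cached CLIP embeddings
    negative_prompt_embeds = torch.zeros(1, 77, 768)
    image = pipe(
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
    ).images[0]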
@@ -17,7 +17,7 @@ class IFWatermarker(ModelMixin, ConfigMixin):
         self.watermark_image_as_pil = None

     def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None):
-        # copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287
+        # Copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287

         h = images[0].height
         w = images[0].width
@@ -456,9 +456,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -426,9 +426,10 @@ class StableDiffusionInpaintPipelineLegacy(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -364,9 +364,10 @@ class StableDiffusionModelEditingPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -355,9 +355,10 @@ class StableDiffusionParadigmsPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -578,9 +578,10 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -52,7 +52,9 @@ EXAMPLE_DOC_STRING = """
         >>> import torch
         >>> from diffusers import HunyuanDiTPipeline

-        >>> pipe = HunyuanDiTPipeline.from_pretrained("Tencent-Hunyuan/HunyuanDiT", torch_dtype=torch.float16)
+        >>> pipe = HunyuanDiTPipeline.from_pretrained(
+        ...     "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
+        ... )
         >>> pipe.to("cuda")

         >>> # You may also use English prompt as HunyuanDiT supports both English and Chinese
@@ -226,16 +228,22 @@ class HunyuanDiTPipeline(DiffusionPipeline):
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )

-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
-        self.default_sample_size = self.transformer.config.sample_size
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
+        )

     def encode_prompt(
         self,
         prompt: str,
-        device: torch.device,
-        dtype: torch.dtype,
+        device: torch.device = None,
+        dtype: torch.dtype = None,
         num_images_per_prompt: int = 1,
         do_classifier_free_guidance: bool = True,
         negative_prompt: Optional[str] = None,
@@ -279,6 +287,17 @@ class HunyuanDiTPipeline(DiffusionPipeline):
             text_encoder_index (`int`, *optional*):
                 Index of the text encoder to use. `0` for clip and `1` for T5.
         """
+        if dtype is None:
+            if self.text_encoder_2 is not None:
+                dtype = self.text_encoder_2.dtype
+            elif self.transformer is not None:
+                dtype = self.transformer.dtype
+            else:
+                dtype = None
+
+        if device is None:
+            device = self._execution_device
+
         tokenizers = [self.tokenizer, self.tokenizer_2]
         text_encoders = [self.text_encoder, self.text_encoder_2]
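With device and dtype now optional, HunyuanDiT's encode_prompt can be called with just a prompt; defaults are resolved from text_encoder_2, the transformer, or _execution_device. A brief sketch (the return tuple is abbreviated here; see the method's docstring for the exact layout):

    import torch
    from diffusers import HunyuanDiTPipeline

    pipe = HunyuanDiTPipeline.from_pretrained(
        "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
    ).to("cuda")
    # device/dtype no longer need to be passed explicitly.
    outputs = pipe.encode_prompt("一个宇航员在骑马")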
@@ -405,9 +405,10 @@ class LatentConsistencyModelImg2ImgPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -389,9 +389,10 @@ class LatentConsistencyModelPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -245,9 +245,9 @@ class MarigoldImageProcessor(ConfigMixin):
     ) -> Union[np.ndarray, torch.Tensor]:
         """
         Converts a monochrome image into an RGB image by applying the specified colormap. This function mimics the
-        behavior of matplotlib.colormaps, but allows the user to use the most discriminative color map "Spectral"
-        without having to install or import matplotlib. For all other cases, the function will attempt to use the
-        native implementation.
+        behavior of matplotlib.colormaps, but allows the user to use the most discriminative color maps ("Spectral",
+        "binary") without having to install or import matplotlib. For all other cases, the function will attempt to use
+        the native implementation.

         Args:
             image: 2D tensor of values between 0 and 1, either as np.ndarray or torch.Tensor.
@@ -255,7 +255,7 @@ class MarigoldImageProcessor(ConfigMixin):
             bytes: Whether to return the output as uint8 or floating point image.
             _force_method:
                 Can be used to specify whether to use the native implementation (`"matplotlib"`), the efficient custom
-                implementation of the "Spectral" color map (`"custom"`), or rely on autodetection (`None`, default).
+                implementation of the select color maps (`"custom"`), or rely on autodetection (`None`, default).

         Returns:
             An RGB-colorized tensor corresponding to the input image.
@@ -265,6 +265,26 @@ class MarigoldImageProcessor(ConfigMixin):
         if _force_method not in (None, "matplotlib", "custom"):
             raise ValueError("_force_method must be either `None`, `'matplotlib'` or `'custom'`.")

+        supported_cmaps = {
+            "binary": [
+                (1.0, 1.0, 1.0),
+                (0.0, 0.0, 0.0),
+            ],
+            "Spectral": [  # Taken from matplotlib/_cm.py
+                (0.61960784313725492, 0.003921568627450980, 0.25882352941176473),  # 0.0 -> [0]
+                (0.83529411764705885, 0.24313725490196078, 0.30980392156862746),
+                (0.95686274509803926, 0.42745098039215684, 0.2627450980392157),
+                (0.99215686274509807, 0.68235294117647061, 0.38039215686274508),
+                (0.99607843137254903, 0.8784313725490196, 0.54509803921568623),
+                (1.0, 1.0, 0.74901960784313726),
+                (0.90196078431372551, 0.96078431372549022, 0.59607843137254901),
+                (0.6705882352941176, 0.8666666666666667, 0.64313725490196083),
+                (0.4, 0.76078431372549016, 0.6470588235294118),
+                (0.19607843137254902, 0.53333333333333333, 0.74117647058823533),
+                (0.36862745098039218, 0.30980392156862746, 0.63529411764705879),  # 1.0 -> [K-1]
+            ],
+        }
+
         def method_matplotlib(image, cmap, bytes=False):
             if is_matplotlib_available():
                 import matplotlib
@@ -298,24 +318,19 @@ class MarigoldImageProcessor(ConfigMixin):
             else:
                 image = image.float()

-            if cmap != "Spectral":
-                raise ValueError("Only 'Spectral' color map is available without installing matplotlib.")
+            is_cmap_reversed = cmap.endswith("_r")
+            if is_cmap_reversed:
+                cmap = cmap[:-2]

-            _Spectral_data = (  # Taken from matplotlib/_cm.py
-                (0.61960784313725492, 0.003921568627450980, 0.25882352941176473),  # 0.0 -> [0]
-                (0.83529411764705885, 0.24313725490196078, 0.30980392156862746),
-                (0.95686274509803926, 0.42745098039215684, 0.2627450980392157),
-                (0.99215686274509807, 0.68235294117647061, 0.38039215686274508),
-                (0.99607843137254903, 0.8784313725490196, 0.54509803921568623),
-                (1.0, 1.0, 0.74901960784313726),
-                (0.90196078431372551, 0.96078431372549022, 0.59607843137254901),
-                (0.6705882352941176, 0.8666666666666667, 0.64313725490196083),
-                (0.4, 0.76078431372549016, 0.6470588235294118),
-                (0.19607843137254902, 0.53333333333333333, 0.74117647058823533),
-                (0.36862745098039218, 0.30980392156862746, 0.63529411764705879),  # 1.0 -> [K-1]
-            )
+            if cmap not in supported_cmaps:
+                raise ValueError(
+                    f"Only {list(supported_cmaps.keys())} color maps are available without installing matplotlib."
+                )

-            cmap = torch.tensor(_Spectral_data, dtype=torch.float, device=image.device)  # [K,3]
+            cmap = supported_cmaps[cmap]
+            if is_cmap_reversed:
+                cmap = cmap[::-1]
+            cmap = torch.tensor(cmap, dtype=torch.float, device=image.device)  # [K,3]
             K = cmap.shape[0]

             pos = image.clamp(min=0, max=1) * (K - 1)
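Beyond "Spectral", the pure-torch colormap path now understands "binary" and reversed "_r" variants, so Marigold visualizations no longer require matplotlib for these maps. A small sketch, assuming the static colormap helper keeps this signature:

    import torch
    from diffusers.pipelines.marigold.marigold_image_processing import MarigoldImageProcessor

    depth = torch.rand(480, 640)  # dummy monochrome image with values in [0, 1]
    # Force the custom path; "Spectral_r" is stripped to "Spectral" and reversed.
    vis = MarigoldImageProcessor.colormap(depth, cmap="Spectral_r", bytes=True, _force_method="custom")
    print(vis.shape)  # expected: (480, 640, 3)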
@@ -375,9 +375,10 @@ class PIAPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -394,7 +394,7 @@ class PixArtAlphaPipeline(DiffusionPipeline):

         # get unconditional embeddings for classifier free guidance
         if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens = [negative_prompt] * batch_size
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
             uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
             max_length = prompt_embeds.shape[1]
             uncond_input = self.tokenizer(
@@ -320,7 +320,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):

         # get unconditional embeddings for classifier free guidance
         if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens = [negative_prompt] * batch_size
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
             uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
             max_length = prompt_embeds.shape[1]
             uncond_input = self.tokenizer(
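Both PixArt pipelines get the same one-line fix: a negative_prompt that is already a list is forwarded per-image instead of being wrapped into a nested list. A sketch of the now-working call (checkpoint id is an assumption for illustration):

    import torch
    from diffusers import PixArtSigmaPipeline

    pipe = PixArtSigmaPipeline.from_pretrained(
        "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", torch_dtype=torch.float16
    ).to("cuda")
    images = pipe(
        prompt=["a cat", "a dog"],
        negative_prompt=["blurry", "low quality"],  # one negative prompt per image
    ).images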
@@ -376,6 +376,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):

         # 2. Define call parameters
         batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device

         if editing_prompt:
             enable_edit_guidance = True
@@ -405,7 +406,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
             text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]

         # duplicate text embeddings for each generation per prompt, using mps friendly method
         bs_embed, seq_len, _ = text_embeddings.shape
@@ -433,9 +434,9 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                         f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                     )
                     edit_concepts_input_ids = edit_concepts_input_ids[:, : self.tokenizer.model_max_length]
-                edit_concepts = self.text_encoder(edit_concepts_input_ids.to(self.device))[0]
+                edit_concepts = self.text_encoder(edit_concepts_input_ids.to(device))[0]
             else:
-                edit_concepts = editing_prompt_embeddings.to(self.device).repeat(batch_size, 1, 1)
+                edit_concepts = editing_prompt_embeddings.to(device).repeat(batch_size, 1, 1)

             # duplicate text embeddings for each generation per prompt, using mps friendly method
             bs_embed_edit, seq_len_edit, _ = edit_concepts.shape
@@ -476,7 +477,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                 truncation=True,
                 return_tensors="pt",
             )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]

             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
@@ -493,7 +494,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
         # get the initial random noise unless the user supplied it

         # 4. Prepare timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps = self.scheduler.timesteps

         # 5. Prepare latent variables
@@ -504,7 +505,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
             height,
             width,
             text_embeddings.dtype,
-            self.device,
+            device,
             generator,
             latents,
         )
@@ -562,12 +563,12 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                 if enable_edit_guidance:
                     concept_weights = torch.zeros(
                         (len(noise_pred_edit_concepts), noise_guidance.shape[0]),
-                        device=self.device,
+                        device=device,
                         dtype=noise_guidance.dtype,
                     )
                     noise_guidance_edit = torch.zeros(
                         (len(noise_pred_edit_concepts), *noise_guidance.shape),
-                        device=self.device,
+                        device=device,
                         dtype=noise_guidance.dtype,
                     )
                     # noise_guidance_edit = torch.zeros_like(noise_guidance)
@@ -644,21 +645,19 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):

                         # noise_guidance_edit = noise_guidance_edit + noise_guidance_edit_tmp

-                    warmup_inds = torch.tensor(warmup_inds).to(self.device)
+                    warmup_inds = torch.tensor(warmup_inds).to(device)
                     if len(noise_pred_edit_concepts) > warmup_inds.shape[0] > 0:
                         concept_weights = concept_weights.to("cpu")  # Offload to cpu
                         noise_guidance_edit = noise_guidance_edit.to("cpu")

-                        concept_weights_tmp = torch.index_select(concept_weights.to(self.device), 0, warmup_inds)
+                        concept_weights_tmp = torch.index_select(concept_weights.to(device), 0, warmup_inds)
                         concept_weights_tmp = torch.where(
                             concept_weights_tmp < 0, torch.zeros_like(concept_weights_tmp), concept_weights_tmp
                         )
                         concept_weights_tmp = concept_weights_tmp / concept_weights_tmp.sum(dim=0)
                         # concept_weights_tmp = torch.nan_to_num(concept_weights_tmp)

-                        noise_guidance_edit_tmp = torch.index_select(
-                            noise_guidance_edit.to(self.device), 0, warmup_inds
-                        )
+                        noise_guidance_edit_tmp = torch.index_select(noise_guidance_edit.to(device), 0, warmup_inds)
                         noise_guidance_edit_tmp = torch.einsum(
                             "cb,cbijk->bijk", concept_weights_tmp, noise_guidance_edit_tmp
                         )
@@ -669,8 +668,8 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):

                         del noise_guidance_edit_tmp
                         del concept_weights_tmp
-                        concept_weights = concept_weights.to(self.device)
-                        noise_guidance_edit = noise_guidance_edit.to(self.device)
+                        concept_weights = concept_weights.to(device)
+                        noise_guidance_edit = noise_guidance_edit.to(device)

                     concept_weights = torch.where(
                         concept_weights < 0, torch.zeros_like(concept_weights), concept_weights
@@ -679,6 +678,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                     concept_weights = torch.nan_to_num(concept_weights)

                     noise_guidance_edit = torch.einsum("cb,cbijk->bijk", concept_weights, noise_guidance_edit)
+                    noise_guidance_edit = noise_guidance_edit.to(edit_momentum.device)

                     noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * edit_momentum

@@ -689,7 +689,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                         self.sem_guidance[i] = noise_guidance_edit.detach().cpu()

                 if sem_guidance is not None:
-                    edit_guidance = sem_guidance[i].to(self.device)
+                    edit_guidance = sem_guidance[i].to(device)
                     noise_guidance = noise_guidance + edit_guidance

                 noise_pred = noise_pred_uncond + noise_guidance
@@ -705,7 +705,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
         # 8. Post-processing
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
-            image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype)
+            image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
         else:
             image = latents
             has_nsfw_concept = None
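All of the SemanticStableDiffusionPipeline hunks make one change: `self.device` is replaced by a `device` local taken from `self._execution_device`, which reports the compute device even when weights have been offloaded to the CPU. A sketch of the configuration this unblocks, assuming the pipeline supports the standard offload hooks:

    from diffusers import SemanticStableDiffusionPipeline

    pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
    pipe.enable_model_cpu_offload()  # self.device would now point at "cpu"
    out = pipe("a castle next to a river", editing_prompt=["oil painting"])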
@@ -474,9 +474,10 @@ class StableDiffusionPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -357,9 +357,10 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -545,7 +546,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader

         if depth_map is None:
             pixel_values = self.feature_extractor(images=image, return_tensors="pt").pixel_values
-            pixel_values = pixel_values.to(device=device)
+            pixel_values = pixel_values.to(device=device, dtype=dtype)
             # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16.
             # So we use `torch.autocast` here for half precision inference.
             if torch.backends.mps.is_available():
@@ -517,9 +517,10 @@ class StableDiffusionImg2ImgPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -589,9 +589,10 @@ class StableDiffusionInpaintPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -377,9 +377,10 @@ class StableDiffusionUpscalePipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -458,9 +458,10 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -51,8 +51,8 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import StableUnCLIPImg2ImgPipeline

         >>> pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
-        ...     "fusing/stable-unclip-2-1-l-img2img", torch_dtype=torch.float16
-        ... )  # TODO update model path
+        ...     "stabilityai/stable-diffusion-2-1-unclip-small", torch_dtype=torch.float16
+        ... )
         >>> pipe = pipe.to("cuda")

         >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
@@ -63,7 +63,7 @@ EXAMPLE_DOC_STRING = """

         >>> prompt = "A fantasy landscape, trending on artstation"

-        >>> images = pipe(prompt, init_image).images
+        >>> images = pipe(init_image, prompt).images
         >>> images[0].save("fantasy_landscape.png")
         ```
         """
@@ -422,9 +422,10 @@ class StableUnCLIPImg2ImgPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
@@ -0,0 +1,52 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["StableDiffusion3PipelineOutput"]}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_3"] = ["StableDiffusion3Pipeline"]
+    _import_structure["pipeline_stable_diffusion_3_img2img"] = ["StableDiffusion3Img2ImgPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_stable_diffusion_3 import StableDiffusion3Pipeline
+        from .pipeline_stable_diffusion_3_img2img import StableDiffusion3Img2ImgPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
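The new package __init__ follows the lazy-module convention used throughout diffusers: at import time only `_import_structure` is registered, real submodules load on first attribute access, and dummy objects that raise a descriptive error stand in when torch or transformers is missing. The intended import path for users is the top-level namespace:

    # Resolved lazily on first access; raises a descriptive error rather than
    # an ImportError if torch/transformers are absent.
    from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline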