diffusers 0.28.2__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. diffusers/__init__.py +9 -1
  2. diffusers/commands/env.py +1 -5
  3. diffusers/dependency_versions_table.py +1 -1
  4. diffusers/image_processor.py +2 -1
  5. diffusers/loaders/__init__.py +2 -2
  6. diffusers/loaders/lora.py +406 -140
  7. diffusers/loaders/lora_conversion_utils.py +7 -1
  8. diffusers/loaders/single_file.py +1 -1
  9. diffusers/loaders/single_file_model.py +5 -0
  10. diffusers/loaders/single_file_utils.py +242 -2
  11. diffusers/loaders/unet.py +307 -272
  12. diffusers/models/__init__.py +5 -3
  13. diffusers/models/attention.py +125 -1
  14. diffusers/models/attention_processor.py +169 -1
  15. diffusers/models/autoencoders/__init__.py +1 -0
  16. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  17. diffusers/models/autoencoders/autoencoder_kl.py +17 -6
  18. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -2
  19. diffusers/models/autoencoders/consistency_decoder_vae.py +9 -9
  20. diffusers/models/autoencoders/vq_model.py +182 -0
  21. diffusers/models/controlnet_xs.py +6 -6
  22. diffusers/models/embeddings.py +112 -84
  23. diffusers/models/model_loading_utils.py +55 -0
  24. diffusers/models/modeling_utils.py +128 -17
  25. diffusers/models/normalization.py +11 -6
  26. diffusers/models/transformers/__init__.py +1 -0
  27. diffusers/models/transformers/dual_transformer_2d.py +5 -4
  28. diffusers/models/transformers/hunyuan_transformer_2d.py +149 -2
  29. diffusers/models/transformers/prior_transformer.py +5 -5
  30. diffusers/models/transformers/transformer_2d.py +2 -2
  31. diffusers/models/transformers/transformer_sd3.py +344 -0
  32. diffusers/models/transformers/transformer_temporal.py +12 -10
  33. diffusers/models/unets/unet_1d.py +3 -3
  34. diffusers/models/unets/unet_2d.py +3 -3
  35. diffusers/models/unets/unet_2d_condition.py +4 -15
  36. diffusers/models/unets/unet_3d_condition.py +5 -17
  37. diffusers/models/unets/unet_i2vgen_xl.py +4 -4
  38. diffusers/models/unets/unet_motion_model.py +4 -4
  39. diffusers/models/unets/unet_spatio_temporal_condition.py +3 -3
  40. diffusers/models/vq_model.py +8 -165
  41. diffusers/pipelines/__init__.py +2 -0
  42. diffusers/pipelines/animatediff/pipeline_animatediff.py +4 -3
  43. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +4 -3
  44. diffusers/pipelines/controlnet/pipeline_controlnet.py +4 -3
  45. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +4 -3
  46. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +4 -3
  47. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +4 -3
  48. diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
  49. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +4 -3
  50. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +4 -3
  51. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +4 -3
  52. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +4 -3
  53. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +4 -3
  54. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +24 -5
  55. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +4 -3
  56. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +4 -3
  57. diffusers/pipelines/marigold/marigold_image_processing.py +35 -20
  58. diffusers/pipelines/pia/pipeline_pia.py +4 -3
  59. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
  60. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
  61. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +17 -17
  62. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -3
  63. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  64. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -3
  65. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -3
  66. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +4 -3
  67. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +4 -3
  68. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -6
  69. diffusers/pipelines/stable_diffusion_3/__init__.py +52 -0
  70. diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
  71. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +886 -0
  72. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +923 -0
  73. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +4 -3
  74. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +10 -11
  75. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +4 -3
  76. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +4 -3
  77. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +4 -3
  78. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +4 -3
  79. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +4 -3
  80. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +4 -3
  81. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +4 -3
  82. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +4 -3
  83. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +4 -3
  84. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -3
  85. diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
  86. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +4 -3
  87. diffusers/schedulers/__init__.py +2 -0
  88. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
  89. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -3
  90. diffusers/schedulers/scheduling_edm_euler.py +2 -4
  91. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +287 -0
  92. diffusers/schedulers/scheduling_lms_discrete.py +2 -2
  93. diffusers/training_utils.py +4 -4
  94. diffusers/utils/__init__.py +3 -0
  95. diffusers/utils/constants.py +2 -0
  96. diffusers/utils/dummy_pt_objects.py +30 -0
  97. diffusers/utils/dummy_torch_and_transformers_objects.py +30 -0
  98. diffusers/utils/dynamic_modules_utils.py +15 -13
  99. diffusers/utils/hub_utils.py +106 -0
  100. diffusers/utils/import_utils.py +0 -1
  101. diffusers/utils/logging.py +3 -1
  102. diffusers/utils/state_dict_utils.py +2 -0
  103. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/METADATA +45 -45
  104. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/RECORD +108 -111
  105. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/WHEEL +1 -1
  106. diffusers/models/dual_transformer_2d.py +0 -20
  107. diffusers/models/prior_transformer.py +0 -12
  108. diffusers/models/t5_film_transformer.py +0 -70
  109. diffusers/models/transformer_2d.py +0 -25
  110. diffusers/models/transformer_temporal.py +0 -34
  111. diffusers/models/unet_1d.py +0 -26
  112. diffusers/models/unet_1d_blocks.py +0 -203
  113. diffusers/models/unet_2d.py +0 -27
  114. diffusers/models/unet_2d_blocks.py +0 -375
  115. diffusers/models/unet_2d_condition.py +0 -25
  116. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/LICENSE +0 -0
  117. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/entry_points.txt +0 -0
  118. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/top_level.txt +0 -0
@@ -11,172 +11,15 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- from dataclasses import dataclass
- from typing import Optional, Tuple, Union
+ from ..utils import deprecate
+ from .autoencoders.vq_model import VQEncoderOutput, VQModel

- import torch
- import torch.nn as nn

- from ..configuration_utils import ConfigMixin, register_to_config
- from ..utils import BaseOutput
- from ..utils.accelerate_utils import apply_forward_hook
- from .autoencoders.vae import Decoder, DecoderOutput, Encoder, VectorQuantizer
- from .modeling_utils import ModelMixin
+ class VQEncoderOutput(VQEncoderOutput):
+     deprecation_message = "Importing `VQEncoderOutput` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQEncoderOutput`, instead."
+     deprecate("VQEncoderOutput", "0.31", deprecation_message)


- @dataclass
- class VQEncoderOutput(BaseOutput):
-     """
-     Output of VQModel encoding method.
-
-     Args:
-         latents (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
-             The encoded output sample from the last layer of the model.
-     """
-
-     latents: torch.Tensor
-
-
- class VQModel(ModelMixin, ConfigMixin):
-     r"""
-     A VQ-VAE model for decoding latent representations.
-
-     This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
-     for all models (such as downloading or saving).
-
-     Parameters:
-         in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
-         out_channels (int, *optional*, defaults to 3): Number of channels in the output.
-         down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
-             Tuple of downsample block types.
-         up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
-             Tuple of upsample block types.
-         block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
-             Tuple of block output channels.
-         layers_per_block (`int`, *optional*, defaults to `1`): Number of layers per block.
-         act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
-         latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
-         sample_size (`int`, *optional*, defaults to `32`): Sample input size.
-         num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
-         norm_num_groups (`int`, *optional*, defaults to `32`): Number of groups for normalization layers.
-         vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE.
-         scaling_factor (`float`, *optional*, defaults to `0.18215`):
-             The component-wise standard deviation of the trained latent space computed using the first batch of the
-             training set. This is used to scale the latent space to have unit variance when training the diffusion
-             model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
-             diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
-             / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
-             Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
-         norm_type (`str`, *optional*, defaults to `"group"`):
-             Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
-     """
-
-     @register_to_config
-     def __init__(
-         self,
-         in_channels: int = 3,
-         out_channels: int = 3,
-         down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
-         up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
-         block_out_channels: Tuple[int, ...] = (64,),
-         layers_per_block: int = 1,
-         act_fn: str = "silu",
-         latent_channels: int = 3,
-         sample_size: int = 32,
-         num_vq_embeddings: int = 256,
-         norm_num_groups: int = 32,
-         vq_embed_dim: Optional[int] = None,
-         scaling_factor: float = 0.18215,
-         norm_type: str = "group",  # group, spatial
-         mid_block_add_attention=True,
-         lookup_from_codebook=False,
-         force_upcast=False,
-     ):
-         super().__init__()
-
-         # pass init params to Encoder
-         self.encoder = Encoder(
-             in_channels=in_channels,
-             out_channels=latent_channels,
-             down_block_types=down_block_types,
-             block_out_channels=block_out_channels,
-             layers_per_block=layers_per_block,
-             act_fn=act_fn,
-             norm_num_groups=norm_num_groups,
-             double_z=False,
-             mid_block_add_attention=mid_block_add_attention,
-         )
-
-         vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels
-
-         self.quant_conv = nn.Conv2d(latent_channels, vq_embed_dim, 1)
-         self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False)
-         self.post_quant_conv = nn.Conv2d(vq_embed_dim, latent_channels, 1)
-
-         # pass init params to Decoder
-         self.decoder = Decoder(
-             in_channels=latent_channels,
-             out_channels=out_channels,
-             up_block_types=up_block_types,
-             block_out_channels=block_out_channels,
-             layers_per_block=layers_per_block,
-             act_fn=act_fn,
-             norm_num_groups=norm_num_groups,
-             norm_type=norm_type,
-             mid_block_add_attention=mid_block_add_attention,
-         )
-
-     @apply_forward_hook
-     def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput:
-         h = self.encoder(x)
-         h = self.quant_conv(h)
-
-         if not return_dict:
-             return (h,)
-
-         return VQEncoderOutput(latents=h)
-
-     @apply_forward_hook
-     def decode(
-         self, h: torch.Tensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None
-     ) -> Union[DecoderOutput, torch.Tensor]:
-         # also go through quantization layer
-         if not force_not_quantize:
-             quant, commit_loss, _ = self.quantize(h)
-         elif self.config.lookup_from_codebook:
-             quant = self.quantize.get_codebook_entry(h, shape)
-             commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
-         else:
-             quant = h
-             commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
-         quant2 = self.post_quant_conv(quant)
-         dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None)
-
-         if not return_dict:
-             return dec, commit_loss
-
-         return DecoderOutput(sample=dec, commit_loss=commit_loss)
-
-     def forward(
-         self, sample: torch.Tensor, return_dict: bool = True
-     ) -> Union[DecoderOutput, Tuple[torch.Tensor, ...]]:
-         r"""
-         The [`VQModel`] forward method.
-
-         Args:
-             sample (`torch.Tensor`): Input sample.
-             return_dict (`bool`, *optional*, defaults to `True`):
-                 Whether or not to return a [`models.vq_model.VQEncoderOutput`] instead of a plain tuple.
-
-         Returns:
-             [`~models.vq_model.VQEncoderOutput`] or `tuple`:
-                 If return_dict is True, a [`~models.vq_model.VQEncoderOutput`] is returned, otherwise a plain `tuple`
-                 is returned.
-         """
-
-         h = self.encode(sample).latents
-         dec = self.decode(h)
-
-         if not return_dict:
-             return dec.sample, dec.commit_loss
-         return dec
+ class VQModel(VQModel):
+     deprecation_message = "Importing `VQModel` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQModel`, instead."
+     deprecate("VQModel", "0.31", deprecation_message)
@@ -220,6 +220,7 @@ else:
              "StableDiffusionLDM3DPipeline",
          ]
      )
+     _import_structure["stable_diffusion_3"] = ["StableDiffusion3Pipeline", "StableDiffusion3Img2ImgPipeline"]
      _import_structure["stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"]
      _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"]
      _import_structure["stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"]
@@ -485,6 +486,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
              StableUnCLIPImg2ImgPipeline,
              StableUnCLIPPipeline,
          )
+         from .stable_diffusion_3 import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline
          from .stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline
          from .stable_diffusion_diffedit import StableDiffusionDiffEditPipeline
          from .stable_diffusion_gligen import StableDiffusionGLIGENPipeline, StableDiffusionGLIGENTextImagePipeline
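These two registrations (the lazy `_import_structure` entry plus the `TYPE_CHECKING` import) are what expose the new Stable Diffusion 3 pipelines at the package root. A hedged usage sketch; the checkpoint ID is an assumption and is not part of this diff:

    import torch
    from diffusers import StableDiffusion3Pipeline

    # Assumed checkpoint ID; substitute any SD3 checkpoint in the diffusers layout.
    pipe = StableDiffusion3Pipeline.from_pretrained(
        "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
    )
    pipe.to("cuda")
    image = pipe("a photo of an astronaut riding a horse on the moon").images[0]
    image.save("sd3.png")
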
@@ -316,9 +316,10 @@ class AnimateDiffPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

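The same three-line guard is copied into a dozen-plus pipelines below (ControlNet, CycleDiffusion, LCM, PIA, and others): LoRA unscaling of the text encoder is now skipped entirely when the pipeline holds no text encoder. A hedged sketch of the situation it covers, using StableDiffusionPipeline, which receives the identical change in this release; the checkpoint ID and loading with `text_encoder=None` are illustrative assumptions, not shown in this diff:

    import torch
    from diffusers import StableDiffusionPipeline

    # Hypothetical: assemble the pipeline without its text encoder because the caller
    # always supplies precomputed prompt embeddings.
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", text_encoder=None, tokenizer=None, safety_checker=None
    )
    prompt_embeds = torch.randn(1, 77, 768)           # stand-ins for real CLIP embeddings
    negative_prompt_embeds = torch.randn(1, 77, 768)
    # With the guard above, encode_prompt() no longer touches self.text_encoder when it is None.
    image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds).images[0]
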
@@ -420,9 +420,10 @@ class AnimateDiffVideoToVideoPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -463,9 +463,10 @@ class StableDiffusionControlNetPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -441,9 +441,10 @@ class StableDiffusionControlNetImg2ImgPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -566,9 +566,10 @@ class StableDiffusionControlNetInpaintPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -390,9 +390,10 @@ class StableDiffusionControlNetXSPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -17,7 +17,7 @@ class IFWatermarker(ModelMixin, ConfigMixin):
          self.watermark_image_as_pil = None

      def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None):
-         # copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287
+         # Copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287

          h = images[0].height
          w = images[0].width
@@ -456,9 +456,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -426,9 +426,10 @@ class StableDiffusionInpaintPipelineLegacy(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -364,9 +364,10 @@ class StableDiffusionModelEditingPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -355,9 +355,10 @@ class StableDiffusionParadigmsPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -578,9 +578,10 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -52,7 +52,9 @@ EXAMPLE_DOC_STRING = """
          >>> import torch
          >>> from diffusers import HunyuanDiTPipeline

-         >>> pipe = HunyuanDiTPipeline.from_pretrained("Tencent-Hunyuan/HunyuanDiT", torch_dtype=torch.float16)
+         >>> pipe = HunyuanDiTPipeline.from_pretrained(
+         ...     "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
+         ... )
          >>> pipe.to("cuda")

          >>> # You may also use English prompt as HunyuanDiT supports both English and Chinese
@@ -226,16 +228,22 @@ class HunyuanDiTPipeline(DiffusionPipeline):
                  " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
              )

-         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+         self.vae_scale_factor = (
+             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+         )
          self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
          self.register_to_config(requires_safety_checker=requires_safety_checker)
-         self.default_sample_size = self.transformer.config.sample_size
+         self.default_sample_size = (
+             self.transformer.config.sample_size
+             if hasattr(self, "transformer") and self.transformer is not None
+             else 128
+         )

      def encode_prompt(
          self,
          prompt: str,
-         device: torch.device,
-         dtype: torch.dtype,
+         device: torch.device = None,
+         dtype: torch.dtype = None,
          num_images_per_prompt: int = 1,
          do_classifier_free_guidance: bool = True,
          negative_prompt: Optional[str] = None,
@@ -279,6 +287,17 @@ class HunyuanDiTPipeline(DiffusionPipeline):
              text_encoder_index (`int`, *optional*):
                  Index of the text encoder to use. `0` for clip and `1` for T5.
          """
+         if dtype is None:
+             if self.text_encoder_2 is not None:
+                 dtype = self.text_encoder_2.dtype
+             elif self.transformer is not None:
+                 dtype = self.transformer.dtype
+             else:
+                 dtype = None
+
+         if device is None:
+             device = self._execution_device
+
          tokenizers = [self.tokenizer, self.tokenizer_2]
          text_encoders = [self.text_encoder, self.text_encoder_2]

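With `device` and `dtype` now optional, `encode_prompt` resolves them itself: the execution device, and the dtype of `text_encoder_2` or, failing that, the transformer. A short sketch; the checkpoint ID is reused from the docstring fix above, the rest is assumed usage:

    import torch
    from diffusers import HunyuanDiTPipeline

    pipe = HunyuanDiTPipeline.from_pretrained(
        "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
    )
    pipe.to("cuda")
    # device/dtype no longer have to be passed explicitly; the fallbacks above kick in.
    embeddings = pipe.encode_prompt("一个宇航员在骑马")
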
@@ -405,9 +405,10 @@ class LatentConsistencyModelImg2ImgPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -389,9 +389,10 @@ class LatentConsistencyModelPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -245,9 +245,9 @@ class MarigoldImageProcessor(ConfigMixin):
      ) -> Union[np.ndarray, torch.Tensor]:
          """
          Converts a monochrome image into an RGB image by applying the specified colormap. This function mimics the
-         behavior of matplotlib.colormaps, but allows the user to use the most discriminative color map "Spectral"
-         without having to install or import matplotlib. For all other cases, the function will attempt to use the
-         native implementation.
+         behavior of matplotlib.colormaps, but allows the user to use the most discriminative color maps ("Spectral",
+         "binary") without having to install or import matplotlib. For all other cases, the function will attempt to use
+         the native implementation.

          Args:
              image: 2D tensor of values between 0 and 1, either as np.ndarray or torch.Tensor.
@@ -255,7 +255,7 @@ class MarigoldImageProcessor(ConfigMixin):
              bytes: Whether to return the output as uint8 or floating point image.
              _force_method:
                  Can be used to specify whether to use the native implementation (`"matplotlib"`), the efficient custom
-                 implementation of the "Spectral" color map (`"custom"`), or rely on autodetection (`None`, default).
+                 implementation of the select color maps (`"custom"`), or rely on autodetection (`None`, default).

          Returns:
              An RGB-colorized tensor corresponding to the input image.
@@ -265,6 +265,26 @@
          if _force_method not in (None, "matplotlib", "custom"):
              raise ValueError("_force_method must be either `None`, `'matplotlib'` or `'custom'`.")

+         supported_cmaps = {
+             "binary": [
+                 (1.0, 1.0, 1.0),
+                 (0.0, 0.0, 0.0),
+             ],
+             "Spectral": [  # Taken from matplotlib/_cm.py
+                 (0.61960784313725492, 0.003921568627450980, 0.25882352941176473),  # 0.0 -> [0]
+                 (0.83529411764705885, 0.24313725490196078, 0.30980392156862746),
+                 (0.95686274509803926, 0.42745098039215684, 0.2627450980392157),
+                 (0.99215686274509807, 0.68235294117647061, 0.38039215686274508),
+                 (0.99607843137254903, 0.8784313725490196, 0.54509803921568623),
+                 (1.0, 1.0, 0.74901960784313726),
+                 (0.90196078431372551, 0.96078431372549022, 0.59607843137254901),
+                 (0.6705882352941176, 0.8666666666666667, 0.64313725490196083),
+                 (0.4, 0.76078431372549016, 0.6470588235294118),
+                 (0.19607843137254902, 0.53333333333333333, 0.74117647058823533),
+                 (0.36862745098039218, 0.30980392156862746, 0.63529411764705879),  # 1.0 -> [K-1]
+             ],
+         }
+
          def method_matplotlib(image, cmap, bytes=False):
              if is_matplotlib_available():
                  import matplotlib
@@ -298,24 +318,19 @@ class MarigoldImageProcessor(ConfigMixin):
              else:
                  image = image.float()

-             if cmap != "Spectral":
-                 raise ValueError("Only 'Spectral' color map is available without installing matplotlib.")
+             is_cmap_reversed = cmap.endswith("_r")
+             if is_cmap_reversed:
+                 cmap = cmap[:-2]

-             _Spectral_data = (  # Taken from matplotlib/_cm.py
-                 (0.61960784313725492, 0.003921568627450980, 0.25882352941176473),  # 0.0 -> [0]
-                 (0.83529411764705885, 0.24313725490196078, 0.30980392156862746),
-                 (0.95686274509803926, 0.42745098039215684, 0.2627450980392157),
-                 (0.99215686274509807, 0.68235294117647061, 0.38039215686274508),
-                 (0.99607843137254903, 0.8784313725490196, 0.54509803921568623),
-                 (1.0, 1.0, 0.74901960784313726),
-                 (0.90196078431372551, 0.96078431372549022, 0.59607843137254901),
-                 (0.6705882352941176, 0.8666666666666667, 0.64313725490196083),
-                 (0.4, 0.76078431372549016, 0.6470588235294118),
-                 (0.19607843137254902, 0.53333333333333333, 0.74117647058823533),
-                 (0.36862745098039218, 0.30980392156862746, 0.63529411764705879),  # 1.0 -> [K-1]
-             )
+             if cmap not in supported_cmaps:
+                 raise ValueError(
+                     f"Only {list(supported_cmaps.keys())} color maps are available without installing matplotlib."
+                 )

-             cmap = torch.tensor(_Spectral_data, dtype=torch.float, device=image.device)  # [K,3]
+             cmap = supported_cmaps[cmap]
+             if is_cmap_reversed:
+                 cmap = cmap[::-1]
+             cmap = torch.tensor(cmap, dtype=torch.float, device=image.device)  # [K,3]
              K = cmap.shape[0]

              pos = image.clamp(min=0, max=1) * (K - 1)
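Taken together, the Marigold changes let the matplotlib-free path colorize with either built-in map, including reversed `*_r` variants. A hedged sketch; it assumes `colormap` is callable as shown and that `MarigoldImageProcessor` lives in the module named in the file table above:

    import torch
    from diffusers.pipelines.marigold.marigold_image_processing import MarigoldImageProcessor

    depth = torch.rand(480, 640)  # any 2D map with values in [0, 1]
    # Force the custom (no-matplotlib) path; "binary" and reversed maps such as "Spectral_r" now work.
    vis_spectral = MarigoldImageProcessor.colormap(depth, cmap="Spectral_r", bytes=True, _force_method="custom")
    vis_binary = MarigoldImageProcessor.colormap(depth, cmap="binary", bytes=True, _force_method="custom")
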
@@ -375,9 +375,10 @@ class PIAPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -394,7 +394,7 @@ class PixArtAlphaPipeline(DiffusionPipeline):

          # get unconditional embeddings for classifier free guidance
          if do_classifier_free_guidance and negative_prompt_embeds is None:
-             uncond_tokens = [negative_prompt] * batch_size
+             uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
              uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
              max_length = prompt_embeds.shape[1]
              uncond_input = self.tokenizer(
@@ -320,7 +320,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):

          # get unconditional embeddings for classifier free guidance
          if do_classifier_free_guidance and negative_prompt_embeds is None:
-             uncond_tokens = [negative_prompt] * batch_size
+             uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
              uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
              max_length = prompt_embeds.shape[1]
              uncond_input = self.tokenizer(
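Both PixArt encoding paths now accept a per-prompt list of negative prompts instead of only a single string; previously a list was re-wrapped into a nested list before preprocessing. A hedged sketch; the checkpoint ID is an assumption, not part of this diff:

    import torch
    from diffusers import PixArtSigmaPipeline

    pipe = PixArtSigmaPipeline.from_pretrained(
        "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", torch_dtype=torch.float16
    ).to("cuda")

    prompts = ["a red fox in fresh snow", "a lighthouse at dusk"]
    negatives = ["blurry, low quality", "oversaturated"]
    # One negative prompt per prompt is now passed through unchanged by the fixed branch above.
    images = pipe(prompt=prompts, negative_prompt=negatives, num_inference_steps=20).images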