diffusers 0.28.2__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. diffusers/__init__.py +9 -1
  2. diffusers/commands/env.py +1 -5
  3. diffusers/dependency_versions_table.py +1 -1
  4. diffusers/image_processor.py +2 -1
  5. diffusers/loaders/__init__.py +2 -2
  6. diffusers/loaders/lora.py +406 -140
  7. diffusers/loaders/lora_conversion_utils.py +7 -1
  8. diffusers/loaders/single_file.py +1 -1
  9. diffusers/loaders/single_file_model.py +5 -0
  10. diffusers/loaders/single_file_utils.py +242 -2
  11. diffusers/loaders/unet.py +307 -272
  12. diffusers/models/__init__.py +5 -3
  13. diffusers/models/attention.py +125 -1
  14. diffusers/models/attention_processor.py +169 -1
  15. diffusers/models/autoencoders/__init__.py +1 -0
  16. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  17. diffusers/models/autoencoders/autoencoder_kl.py +17 -6
  18. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -2
  19. diffusers/models/autoencoders/consistency_decoder_vae.py +9 -9
  20. diffusers/models/autoencoders/vq_model.py +182 -0
  21. diffusers/models/controlnet_xs.py +6 -6
  22. diffusers/models/embeddings.py +112 -84
  23. diffusers/models/model_loading_utils.py +55 -0
  24. diffusers/models/modeling_utils.py +128 -17
  25. diffusers/models/normalization.py +11 -6
  26. diffusers/models/transformers/__init__.py +1 -0
  27. diffusers/models/transformers/dual_transformer_2d.py +5 -4
  28. diffusers/models/transformers/hunyuan_transformer_2d.py +149 -2
  29. diffusers/models/transformers/prior_transformer.py +5 -5
  30. diffusers/models/transformers/transformer_2d.py +2 -2
  31. diffusers/models/transformers/transformer_sd3.py +344 -0
  32. diffusers/models/transformers/transformer_temporal.py +12 -10
  33. diffusers/models/unets/unet_1d.py +3 -3
  34. diffusers/models/unets/unet_2d.py +3 -3
  35. diffusers/models/unets/unet_2d_condition.py +4 -15
  36. diffusers/models/unets/unet_3d_condition.py +5 -17
  37. diffusers/models/unets/unet_i2vgen_xl.py +4 -4
  38. diffusers/models/unets/unet_motion_model.py +4 -4
  39. diffusers/models/unets/unet_spatio_temporal_condition.py +3 -3
  40. diffusers/models/vq_model.py +8 -165
  41. diffusers/pipelines/__init__.py +2 -0
  42. diffusers/pipelines/animatediff/pipeline_animatediff.py +4 -3
  43. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +4 -3
  44. diffusers/pipelines/controlnet/pipeline_controlnet.py +4 -3
  45. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +4 -3
  46. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +4 -3
  47. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +4 -3
  48. diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
  49. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +4 -3
  50. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +4 -3
  51. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +4 -3
  52. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +4 -3
  53. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +4 -3
  54. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +24 -5
  55. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +4 -3
  56. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +4 -3
  57. diffusers/pipelines/marigold/marigold_image_processing.py +35 -20
  58. diffusers/pipelines/pia/pipeline_pia.py +4 -3
  59. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
  60. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
  61. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +17 -17
  62. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -3
  63. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  64. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -3
  65. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -3
  66. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +4 -3
  67. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +4 -3
  68. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -6
  69. diffusers/pipelines/stable_diffusion_3/__init__.py +52 -0
  70. diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
  71. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +886 -0
  72. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +923 -0
  73. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +4 -3
  74. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +10 -11
  75. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +4 -3
  76. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +4 -3
  77. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +4 -3
  78. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +4 -3
  79. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +4 -3
  80. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +4 -3
  81. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +4 -3
  82. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +4 -3
  83. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +4 -3
  84. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -3
  85. diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
  86. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +4 -3
  87. diffusers/schedulers/__init__.py +2 -0
  88. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
  89. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -3
  90. diffusers/schedulers/scheduling_edm_euler.py +2 -4
  91. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +287 -0
  92. diffusers/schedulers/scheduling_lms_discrete.py +2 -2
  93. diffusers/training_utils.py +4 -4
  94. diffusers/utils/__init__.py +3 -0
  95. diffusers/utils/constants.py +2 -0
  96. diffusers/utils/dummy_pt_objects.py +30 -0
  97. diffusers/utils/dummy_torch_and_transformers_objects.py +30 -0
  98. diffusers/utils/dynamic_modules_utils.py +15 -13
  99. diffusers/utils/hub_utils.py +106 -0
  100. diffusers/utils/import_utils.py +0 -1
  101. diffusers/utils/logging.py +3 -1
  102. diffusers/utils/state_dict_utils.py +2 -0
  103. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/METADATA +45 -45
  104. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/RECORD +108 -111
  105. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/WHEEL +1 -1
  106. diffusers/models/dual_transformer_2d.py +0 -20
  107. diffusers/models/prior_transformer.py +0 -12
  108. diffusers/models/t5_film_transformer.py +0 -70
  109. diffusers/models/transformer_2d.py +0 -25
  110. diffusers/models/transformer_temporal.py +0 -34
  111. diffusers/models/unet_1d.py +0 -26
  112. diffusers/models/unet_1d_blocks.py +0 -203
  113. diffusers/models/unet_2d.py +0 -27
  114. diffusers/models/unet_2d_blocks.py +0 -375
  115. diffusers/models/unet_2d_condition.py +0 -25
  116. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/LICENSE +0 -0
  117. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/entry_points.txt +0 -0
  118. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/top_level.txt +0 -0
@@ -11,172 +11,15 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- from dataclasses import dataclass
- from typing import Optional, Tuple, Union
+ from ..utils import deprecate
+ from .autoencoders.vq_model import VQEncoderOutput, VQModel

- import torch
- import torch.nn as nn

- from ..configuration_utils import ConfigMixin, register_to_config
- from ..utils import BaseOutput
- from ..utils.accelerate_utils import apply_forward_hook
- from .autoencoders.vae import Decoder, DecoderOutput, Encoder, VectorQuantizer
- from .modeling_utils import ModelMixin
+ class VQEncoderOutput(VQEncoderOutput):
+     deprecation_message = "Importing `VQEncoderOutput` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQEncoderOutput`, instead."
+     deprecate("VQEncoderOutput", "0.31", deprecation_message)


- @dataclass
- class VQEncoderOutput(BaseOutput):
-     """
-     Output of VQModel encoding method.
-
-     Args:
-         latents (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
-             The encoded output sample from the last layer of the model.
-     """
-
-     latents: torch.Tensor
-
-
- class VQModel(ModelMixin, ConfigMixin):
-     r"""
-     A VQ-VAE model for decoding latent representations.
-
-     This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
-     for all models (such as downloading or saving).
-
-     Parameters:
-         in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
-         out_channels (int, *optional*, defaults to 3): Number of channels in the output.
-         down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
-             Tuple of downsample block types.
-         up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
-             Tuple of upsample block types.
-         block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
-             Tuple of block output channels.
-         layers_per_block (`int`, *optional*, defaults to `1`): Number of layers per block.
-         act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
-         latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
-         sample_size (`int`, *optional*, defaults to `32`): Sample input size.
-         num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
-         norm_num_groups (`int`, *optional*, defaults to `32`): Number of groups for normalization layers.
-         vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE.
-         scaling_factor (`float`, *optional*, defaults to `0.18215`):
-             The component-wise standard deviation of the trained latent space computed using the first batch of the
-             training set. This is used to scale the latent space to have unit variance when training the diffusion
-             model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
-             diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
-             / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
-             Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
-         norm_type (`str`, *optional*, defaults to `"group"`):
-             Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
-     """
-
-     @register_to_config
-     def __init__(
-         self,
-         in_channels: int = 3,
-         out_channels: int = 3,
-         down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
-         up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
-         block_out_channels: Tuple[int, ...] = (64,),
-         layers_per_block: int = 1,
-         act_fn: str = "silu",
-         latent_channels: int = 3,
-         sample_size: int = 32,
-         num_vq_embeddings: int = 256,
-         norm_num_groups: int = 32,
-         vq_embed_dim: Optional[int] = None,
-         scaling_factor: float = 0.18215,
-         norm_type: str = "group",  # group, spatial
-         mid_block_add_attention=True,
-         lookup_from_codebook=False,
-         force_upcast=False,
-     ):
-         super().__init__()
-
-         # pass init params to Encoder
-         self.encoder = Encoder(
-             in_channels=in_channels,
-             out_channels=latent_channels,
-             down_block_types=down_block_types,
-             block_out_channels=block_out_channels,
-             layers_per_block=layers_per_block,
-             act_fn=act_fn,
-             norm_num_groups=norm_num_groups,
-             double_z=False,
-             mid_block_add_attention=mid_block_add_attention,
-         )
-
-         vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels
-
-         self.quant_conv = nn.Conv2d(latent_channels, vq_embed_dim, 1)
-         self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False)
-         self.post_quant_conv = nn.Conv2d(vq_embed_dim, latent_channels, 1)
-
-         # pass init params to Decoder
-         self.decoder = Decoder(
-             in_channels=latent_channels,
-             out_channels=out_channels,
-             up_block_types=up_block_types,
-             block_out_channels=block_out_channels,
-             layers_per_block=layers_per_block,
-             act_fn=act_fn,
-             norm_num_groups=norm_num_groups,
-             norm_type=norm_type,
-             mid_block_add_attention=mid_block_add_attention,
-         )
-
-     @apply_forward_hook
-     def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput:
-         h = self.encoder(x)
-         h = self.quant_conv(h)
-
-         if not return_dict:
-             return (h,)
-
-         return VQEncoderOutput(latents=h)
-
-     @apply_forward_hook
-     def decode(
-         self, h: torch.Tensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None
-     ) -> Union[DecoderOutput, torch.Tensor]:
-         # also go through quantization layer
-         if not force_not_quantize:
-             quant, commit_loss, _ = self.quantize(h)
-         elif self.config.lookup_from_codebook:
-             quant = self.quantize.get_codebook_entry(h, shape)
-             commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
-         else:
-             quant = h
-             commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
-         quant2 = self.post_quant_conv(quant)
-         dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None)
-
-         if not return_dict:
-             return dec, commit_loss
-
-         return DecoderOutput(sample=dec, commit_loss=commit_loss)
-
-     def forward(
-         self, sample: torch.Tensor, return_dict: bool = True
-     ) -> Union[DecoderOutput, Tuple[torch.Tensor, ...]]:
-         r"""
-         The [`VQModel`] forward method.
-
-         Args:
-             sample (`torch.Tensor`): Input sample.
-             return_dict (`bool`, *optional*, defaults to `True`):
-                 Whether or not to return a [`models.vq_model.VQEncoderOutput`] instead of a plain tuple.
-
-         Returns:
-             [`~models.vq_model.VQEncoderOutput`] or `tuple`:
-                 If return_dict is True, a [`~models.vq_model.VQEncoderOutput`] is returned, otherwise a plain `tuple`
-                 is returned.
-         """
-
-         h = self.encode(sample).latents
-         dec = self.decode(h)
-
-         if not return_dict:
-             return dec.sample, dec.commit_loss
-         return dec
+ class VQModel(VQModel):
+     deprecation_message = "Importing `VQModel` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQModel`, instead."
+     deprecate("VQModel", "0.31", deprecation_message)
@@ -220,6 +220,7 @@ else:
              "StableDiffusionLDM3DPipeline",
          ]
      )
+     _import_structure["stable_diffusion_3"] = ["StableDiffusion3Pipeline", "StableDiffusion3Img2ImgPipeline"]
      _import_structure["stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"]
      _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"]
      _import_structure["stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"]
@@ -485,6 +486,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
              StableUnCLIPImg2ImgPipeline,
              StableUnCLIPPipeline,
          )
+         from .stable_diffusion_3 import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline
          from .stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline
          from .stable_diffusion_diffedit import StableDiffusionDiffEditPipeline
          from .stable_diffusion_gligen import StableDiffusionGLIGENPipeline, StableDiffusionGLIGENTextImagePipeline
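These two registrations (the lazy `_import_structure` entry plus the `TYPE_CHECKING` import) are what expose the new Stable Diffusion 3 pipelines at the package root. A hedged usage sketch; the checkpoint ID is an assumption and is not part of this diff:

    import torch
    from diffusers import StableDiffusion3Pipeline

    # Assumed checkpoint ID; substitute any SD3 checkpoint in the diffusers layout.
    pipe = StableDiffusion3Pipeline.from_pretrained(
        "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
    )
    pipe.to("cuda")
    image = pipe("a photo of an astronaut riding a horse on the moon").images[0]
    image.save("sd3.png")
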
@@ -316,9 +316,10 @@ class AnimateDiffPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

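The same three-line guard is copied into a dozen-plus pipelines below (ControlNet, CycleDiffusion, LCM, PIA, and others): LoRA unscaling of the text encoder is now skipped entirely when the pipeline holds no text encoder. A hedged sketch of the situation it covers, using StableDiffusionPipeline, which receives the identical change in this release; the checkpoint ID and loading with `text_encoder=None` are illustrative assumptions, not shown in this diff:

    import torch
    from diffusers import StableDiffusionPipeline

    # Hypothetical: assemble the pipeline without its text encoder because the caller
    # always supplies precomputed prompt embeddings.
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", text_encoder=None, tokenizer=None, safety_checker=None
    )
    prompt_embeds = torch.randn(1, 77, 768)           # stand-ins for real CLIP embeddings
    negative_prompt_embeds = torch.randn(1, 77, 768)
    # With the guard above, encode_prompt() no longer touches self.text_encoder when it is None.
    image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds).images[0]
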
@@ -420,9 +420,10 @@ class AnimateDiffVideoToVideoPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -463,9 +463,10 @@ class StableDiffusionControlNetPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -441,9 +441,10 @@ class StableDiffusionControlNetImg2ImgPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -566,9 +566,10 @@ class StableDiffusionControlNetInpaintPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -390,9 +390,10 @@ class StableDiffusionControlNetXSPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -17,7 +17,7 @@ class IFWatermarker(ModelMixin, ConfigMixin):
          self.watermark_image_as_pil = None

      def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None):
-         # copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287
+         # Copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287

          h = images[0].height
          w = images[0].width
@@ -456,9 +456,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -426,9 +426,10 @@ class StableDiffusionInpaintPipelineLegacy(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -364,9 +364,10 @@ class StableDiffusionModelEditingPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -355,9 +355,10 @@ class StableDiffusionParadigmsPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -578,9 +578,10 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -52,7 +52,9 @@ EXAMPLE_DOC_STRING = """
          >>> import torch
          >>> from diffusers import HunyuanDiTPipeline

-         >>> pipe = HunyuanDiTPipeline.from_pretrained("Tencent-Hunyuan/HunyuanDiT", torch_dtype=torch.float16)
+         >>> pipe = HunyuanDiTPipeline.from_pretrained(
+         ...     "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
+         ... )
          >>> pipe.to("cuda")

          >>> # You may also use English prompt as HunyuanDiT supports both English and Chinese
@@ -226,16 +228,22 @@ class HunyuanDiTPipeline(DiffusionPipeline):
                  " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
              )

-         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+         self.vae_scale_factor = (
+             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+         )
          self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
          self.register_to_config(requires_safety_checker=requires_safety_checker)
-         self.default_sample_size = self.transformer.config.sample_size
+         self.default_sample_size = (
+             self.transformer.config.sample_size
+             if hasattr(self, "transformer") and self.transformer is not None
+             else 128
+         )

      def encode_prompt(
          self,
          prompt: str,
-         device: torch.device,
-         dtype: torch.dtype,
+         device: torch.device = None,
+         dtype: torch.dtype = None,
          num_images_per_prompt: int = 1,
          do_classifier_free_guidance: bool = True,
          negative_prompt: Optional[str] = None,
@@ -279,6 +287,17 @@ class HunyuanDiTPipeline(DiffusionPipeline):
              text_encoder_index (`int`, *optional*):
                  Index of the text encoder to use. `0` for clip and `1` for T5.
          """
+         if dtype is None:
+             if self.text_encoder_2 is not None:
+                 dtype = self.text_encoder_2.dtype
+             elif self.transformer is not None:
+                 dtype = self.transformer.dtype
+             else:
+                 dtype = None
+
+         if device is None:
+             device = self._execution_device
+
          tokenizers = [self.tokenizer, self.tokenizer_2]
          text_encoders = [self.text_encoder, self.text_encoder_2]

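With `device` and `dtype` now optional, `encode_prompt` resolves them itself: the execution device, and the dtype of `text_encoder_2` or, failing that, the transformer. A short sketch; the checkpoint ID is reused from the docstring fix above, the rest is assumed usage:

    import torch
    from diffusers import HunyuanDiTPipeline

    pipe = HunyuanDiTPipeline.from_pretrained(
        "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
    )
    pipe.to("cuda")
    # device/dtype no longer have to be passed explicitly; the fallbacks above kick in.
    embeddings = pipe.encode_prompt("一个宇航员在骑马")
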
@@ -405,9 +405,10 @@ class LatentConsistencyModelImg2ImgPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -389,9 +389,10 @@ class LatentConsistencyModelPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -245,9 +245,9 @@ class MarigoldImageProcessor(ConfigMixin):
      ) -> Union[np.ndarray, torch.Tensor]:
          """
          Converts a monochrome image into an RGB image by applying the specified colormap. This function mimics the
-         behavior of matplotlib.colormaps, but allows the user to use the most discriminative color map "Spectral"
-         without having to install or import matplotlib. For all other cases, the function will attempt to use the
-         native implementation.
+         behavior of matplotlib.colormaps, but allows the user to use the most discriminative color maps ("Spectral",
+         "binary") without having to install or import matplotlib. For all other cases, the function will attempt to use
+         the native implementation.

          Args:
              image: 2D tensor of values between 0 and 1, either as np.ndarray or torch.Tensor.
@@ -255,7 +255,7 @@ class MarigoldImageProcessor(ConfigMixin):
              bytes: Whether to return the output as uint8 or floating point image.
              _force_method:
                  Can be used to specify whether to use the native implementation (`"matplotlib"`), the efficient custom
-                 implementation of the "Spectral" color map (`"custom"`), or rely on autodetection (`None`, default).
+                 implementation of the select color maps (`"custom"`), or rely on autodetection (`None`, default).

          Returns:
              An RGB-colorized tensor corresponding to the input image.
@@ -265,6 +265,26 @@
          if _force_method not in (None, "matplotlib", "custom"):
              raise ValueError("_force_method must be either `None`, `'matplotlib'` or `'custom'`.")

+         supported_cmaps = {
+             "binary": [
+                 (1.0, 1.0, 1.0),
+                 (0.0, 0.0, 0.0),
+             ],
+             "Spectral": [  # Taken from matplotlib/_cm.py
+                 (0.61960784313725492, 0.003921568627450980, 0.25882352941176473),  # 0.0 -> [0]
+                 (0.83529411764705885, 0.24313725490196078, 0.30980392156862746),
+                 (0.95686274509803926, 0.42745098039215684, 0.2627450980392157),
+                 (0.99215686274509807, 0.68235294117647061, 0.38039215686274508),
+                 (0.99607843137254903, 0.8784313725490196, 0.54509803921568623),
+                 (1.0, 1.0, 0.74901960784313726),
+                 (0.90196078431372551, 0.96078431372549022, 0.59607843137254901),
+                 (0.6705882352941176, 0.8666666666666667, 0.64313725490196083),
+                 (0.4, 0.76078431372549016, 0.6470588235294118),
+                 (0.19607843137254902, 0.53333333333333333, 0.74117647058823533),
+                 (0.36862745098039218, 0.30980392156862746, 0.63529411764705879),  # 1.0 -> [K-1]
+             ],
+         }
+
          def method_matplotlib(image, cmap, bytes=False):
              if is_matplotlib_available():
                  import matplotlib
@@ -298,24 +318,19 @@ class MarigoldImageProcessor(ConfigMixin):
              else:
                  image = image.float()

-             if cmap != "Spectral":
-                 raise ValueError("Only 'Spectral' color map is available without installing matplotlib.")
+             is_cmap_reversed = cmap.endswith("_r")
+             if is_cmap_reversed:
+                 cmap = cmap[:-2]

-             _Spectral_data = (  # Taken from matplotlib/_cm.py
-                 (0.61960784313725492, 0.003921568627450980, 0.25882352941176473),  # 0.0 -> [0]
-                 (0.83529411764705885, 0.24313725490196078, 0.30980392156862746),
-                 (0.95686274509803926, 0.42745098039215684, 0.2627450980392157),
-                 (0.99215686274509807, 0.68235294117647061, 0.38039215686274508),
-                 (0.99607843137254903, 0.8784313725490196, 0.54509803921568623),
-                 (1.0, 1.0, 0.74901960784313726),
-                 (0.90196078431372551, 0.96078431372549022, 0.59607843137254901),
-                 (0.6705882352941176, 0.8666666666666667, 0.64313725490196083),
-                 (0.4, 0.76078431372549016, 0.6470588235294118),
-                 (0.19607843137254902, 0.53333333333333333, 0.74117647058823533),
-                 (0.36862745098039218, 0.30980392156862746, 0.63529411764705879),  # 1.0 -> [K-1]
-             )
+             if cmap not in supported_cmaps:
+                 raise ValueError(
+                     f"Only {list(supported_cmaps.keys())} color maps are available without installing matplotlib."
+                 )

-             cmap = torch.tensor(_Spectral_data, dtype=torch.float, device=image.device)  # [K,3]
+             cmap = supported_cmaps[cmap]
+             if is_cmap_reversed:
+                 cmap = cmap[::-1]
+             cmap = torch.tensor(cmap, dtype=torch.float, device=image.device)  # [K,3]
              K = cmap.shape[0]

              pos = image.clamp(min=0, max=1) * (K - 1)
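Taken together, the Marigold changes let the matplotlib-free path colorize with either built-in map, including reversed `*_r` variants. A hedged sketch; it assumes `colormap` is callable as shown and that `MarigoldImageProcessor` lives in the module named in the file table above:

    import torch
    from diffusers.pipelines.marigold.marigold_image_processing import MarigoldImageProcessor

    depth = torch.rand(480, 640)  # any 2D map with values in [0, 1]
    # Force the custom (no-matplotlib) path; "binary" and reversed maps such as "Spectral_r" now work.
    vis_spectral = MarigoldImageProcessor.colormap(depth, cmap="Spectral_r", bytes=True, _force_method="custom")
    vis_binary = MarigoldImageProcessor.colormap(depth, cmap="binary", bytes=True, _force_method="custom")
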
@@ -375,9 +375,10 @@ class PIAPipeline(
              negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-         if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-             # Retrieve the original scale by scaling back the LoRA layers
-             unscale_lora_layers(self.text_encoder, lora_scale)
+         if self.text_encoder is not None:
+             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)

          return prompt_embeds, negative_prompt_embeds

@@ -394,7 +394,7 @@ class PixArtAlphaPipeline(DiffusionPipeline):

          # get unconditional embeddings for classifier free guidance
          if do_classifier_free_guidance and negative_prompt_embeds is None:
-             uncond_tokens = [negative_prompt] * batch_size
+             uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
              uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
              max_length = prompt_embeds.shape[1]
              uncond_input = self.tokenizer(
@@ -320,7 +320,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):

          # get unconditional embeddings for classifier free guidance
          if do_classifier_free_guidance and negative_prompt_embeds is None:
-             uncond_tokens = [negative_prompt] * batch_size
+             uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
              uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
              max_length = prompt_embeds.shape[1]
              uncond_input = self.tokenizer(
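Both PixArt encoding paths now accept a per-prompt list of negative prompts instead of only a single string; previously a list was re-wrapped into a nested list before preprocessing. A hedged sketch; the checkpoint ID is an assumption, not part of this diff:

    import torch
    from diffusers import PixArtSigmaPipeline

    pipe = PixArtSigmaPipeline.from_pretrained(
        "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", torch_dtype=torch.float16
    ).to("cuda")

    prompts = ["a red fox in fresh snow", "a lighthouse at dusk"]
    negatives = ["blurry, low quality", "oversaturated"]
    # One negative prompt per prompt is now passed through unchanged by the fixed branch above.
    images = pipe(prompt=prompts, negative_prompt=negatives, num_inference_steps=20).images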