diffusers 0.26.3__py3-none-any.whl → 0.27.0__py3-none-any.whl

Files changed (299)
  1. diffusers/__init__.py +20 -1
  2. diffusers/commands/__init__.py +1 -1
  3. diffusers/commands/diffusers_cli.py +1 -1
  4. diffusers/commands/env.py +1 -1
  5. diffusers/commands/fp16_safetensors.py +1 -1
  6. diffusers/configuration_utils.py +7 -3
  7. diffusers/dependency_versions_check.py +1 -1
  8. diffusers/dependency_versions_table.py +2 -2
  9. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  10. diffusers/image_processor.py +110 -4
  11. diffusers/loaders/autoencoder.py +7 -8
  12. diffusers/loaders/controlnet.py +17 -8
  13. diffusers/loaders/ip_adapter.py +86 -23
  14. diffusers/loaders/lora.py +105 -310
  15. diffusers/loaders/lora_conversion_utils.py +1 -1
  16. diffusers/loaders/peft.py +1 -1
  17. diffusers/loaders/single_file.py +51 -12
  18. diffusers/loaders/single_file_utils.py +274 -49
  19. diffusers/loaders/textual_inversion.py +23 -4
  20. diffusers/loaders/unet.py +195 -41
  21. diffusers/loaders/utils.py +1 -1
  22. diffusers/models/__init__.py +3 -1
  23. diffusers/models/activations.py +9 -9
  24. diffusers/models/attention.py +26 -36
  25. diffusers/models/attention_flax.py +1 -1
  26. diffusers/models/attention_processor.py +171 -114
  27. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  28. diffusers/models/autoencoders/autoencoder_kl.py +3 -1
  29. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
  30. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  31. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  32. diffusers/models/autoencoders/vae.py +1 -1
  33. diffusers/models/controlnet.py +1 -1
  34. diffusers/models/controlnet_flax.py +1 -1
  35. diffusers/models/downsampling.py +8 -12
  36. diffusers/models/dual_transformer_2d.py +1 -1
  37. diffusers/models/embeddings.py +3 -4
  38. diffusers/models/embeddings_flax.py +1 -1
  39. diffusers/models/lora.py +33 -10
  40. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  41. diffusers/models/modeling_flax_utils.py +1 -1
  42. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  43. diffusers/models/modeling_utils.py +4 -6
  44. diffusers/models/normalization.py +1 -1
  45. diffusers/models/resnet.py +31 -58
  46. diffusers/models/resnet_flax.py +1 -1
  47. diffusers/models/t5_film_transformer.py +1 -1
  48. diffusers/models/transformer_2d.py +1 -1
  49. diffusers/models/transformer_temporal.py +1 -1
  50. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  51. diffusers/models/transformers/t5_film_transformer.py +1 -1
  52. diffusers/models/transformers/transformer_2d.py +29 -31
  53. diffusers/models/transformers/transformer_temporal.py +1 -1
  54. diffusers/models/unet_1d.py +1 -1
  55. diffusers/models/unet_1d_blocks.py +1 -1
  56. diffusers/models/unet_2d.py +1 -1
  57. diffusers/models/unet_2d_blocks.py +1 -1
  58. diffusers/models/unet_2d_condition.py +1 -1
  59. diffusers/models/unets/__init__.py +1 -0
  60. diffusers/models/unets/unet_1d.py +1 -1
  61. diffusers/models/unets/unet_1d_blocks.py +1 -1
  62. diffusers/models/unets/unet_2d.py +4 -4
  63. diffusers/models/unets/unet_2d_blocks.py +238 -98
  64. diffusers/models/unets/unet_2d_blocks_flax.py +1 -1
  65. diffusers/models/unets/unet_2d_condition.py +420 -323
  66. diffusers/models/unets/unet_2d_condition_flax.py +21 -12
  67. diffusers/models/unets/unet_3d_blocks.py +50 -40
  68. diffusers/models/unets/unet_3d_condition.py +47 -8
  69. diffusers/models/unets/unet_i2vgen_xl.py +75 -30
  70. diffusers/models/unets/unet_kandinsky3.py +1 -1
  71. diffusers/models/unets/unet_motion_model.py +48 -8
  72. diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
  73. diffusers/models/unets/unet_stable_cascade.py +610 -0
  74. diffusers/models/unets/uvit_2d.py +1 -1
  75. diffusers/models/upsampling.py +10 -16
  76. diffusers/models/vae_flax.py +1 -1
  77. diffusers/models/vq_model.py +1 -1
  78. diffusers/optimization.py +1 -1
  79. diffusers/pipelines/__init__.py +26 -0
  80. diffusers/pipelines/amused/pipeline_amused.py +1 -1
  81. diffusers/pipelines/amused/pipeline_amused_img2img.py +1 -1
  82. diffusers/pipelines/amused/pipeline_amused_inpaint.py +1 -1
  83. diffusers/pipelines/animatediff/pipeline_animatediff.py +162 -417
  84. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +165 -137
  85. diffusers/pipelines/animatediff/pipeline_output.py +7 -6
  86. diffusers/pipelines/audioldm/pipeline_audioldm.py +3 -19
  87. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  88. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +3 -3
  89. diffusers/pipelines/auto_pipeline.py +7 -16
  90. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  91. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  92. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  93. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  94. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  95. diffusers/pipelines/controlnet/pipeline_controlnet.py +90 -90
  96. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  97. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +98 -90
  98. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +92 -90
  99. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +145 -70
  100. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +126 -89
  101. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +108 -96
  102. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  103. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +1 -1
  104. diffusers/pipelines/ddim/pipeline_ddim.py +1 -1
  105. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  106. diffusers/pipelines/deepfloyd_if/pipeline_if.py +4 -4
  107. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +4 -4
  108. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +5 -5
  109. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +4 -4
  110. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +5 -5
  111. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +5 -5
  112. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +10 -120
  113. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +10 -91
  114. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  115. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +1 -1
  116. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  117. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +1 -1
  118. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  119. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  120. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  121. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  122. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  123. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  124. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +5 -4
  125. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +5 -4
  126. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +7 -22
  127. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -39
  128. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +5 -5
  129. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  130. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -22
  131. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  132. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  133. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -2
  134. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  135. diffusers/pipelines/dit/pipeline_dit.py +1 -1
  136. diffusers/pipelines/free_init_utils.py +184 -0
  137. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +22 -104
  138. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +1 -1
  139. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  140. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +1 -1
  141. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +2 -2
  142. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -1
  143. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +1 -1
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -1
  145. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +1 -1
  146. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +1 -1
  147. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +1 -1
  148. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +2 -2
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +104 -93
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +112 -74
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/ledits_pp/__init__.py +55 -0
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +1505 -0
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +1797 -0
  155. diffusers/pipelines/ledits_pp/pipeline_output.py +43 -0
  156. diffusers/pipelines/musicldm/pipeline_musicldm.py +3 -19
  157. diffusers/pipelines/onnx_utils.py +1 -1
  158. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  159. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +3 -3
  160. diffusers/pipelines/pia/pipeline_pia.py +168 -327
  161. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  162. diffusers/pipelines/pipeline_loading_utils.py +508 -0
  163. diffusers/pipelines/pipeline_utils.py +188 -534
  164. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +56 -10
  165. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +3 -3
  166. diffusers/pipelines/shap_e/camera.py +1 -1
  167. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  168. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  169. diffusers/pipelines/shap_e/renderer.py +1 -1
  170. diffusers/pipelines/stable_cascade/__init__.py +50 -0
  171. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +482 -0
  172. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +311 -0
  173. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +638 -0
  174. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  175. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +4 -1
  176. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  177. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +2 -2
  178. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  179. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +1 -1
  180. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +1 -1
  181. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +1 -1
  182. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +1 -1
  183. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +90 -146
  184. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  185. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +4 -32
  186. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +92 -119
  187. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +92 -119
  188. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +13 -59
  189. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +3 -31
  190. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -33
  191. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +5 -21
  192. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -21
  193. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  194. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  195. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  196. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +5 -21
  197. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +9 -38
  198. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -34
  199. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +6 -35
  200. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +7 -6
  201. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +4 -124
  202. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +282 -80
  203. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +94 -46
  204. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +3 -3
  205. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  206. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +6 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  208. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +96 -148
  209. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +98 -154
  210. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +98 -153
  211. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +25 -87
  212. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +89 -80
  213. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +5 -49
  214. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +80 -88
  215. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +8 -6
  216. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +15 -86
  217. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +20 -93
  218. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +5 -5
  219. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +3 -19
  220. diffusers/pipelines/unclip/pipeline_unclip.py +1 -1
  221. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +1 -1
  222. diffusers/pipelines/unclip/text_proj.py +1 -1
  223. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +35 -35
  224. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  225. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +4 -21
  226. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +2 -2
  227. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -5
  228. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  229. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +1 -1
  230. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +2 -2
  231. diffusers/schedulers/__init__.py +7 -1
  232. diffusers/schedulers/deprecated/scheduling_karras_ve.py +1 -1
  233. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  234. diffusers/schedulers/scheduling_consistency_models.py +42 -19
  235. diffusers/schedulers/scheduling_ddim.py +2 -4
  236. diffusers/schedulers/scheduling_ddim_flax.py +13 -5
  237. diffusers/schedulers/scheduling_ddim_inverse.py +2 -4
  238. diffusers/schedulers/scheduling_ddim_parallel.py +2 -4
  239. diffusers/schedulers/scheduling_ddpm.py +2 -4
  240. diffusers/schedulers/scheduling_ddpm_flax.py +1 -1
  241. diffusers/schedulers/scheduling_ddpm_parallel.py +2 -4
  242. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +1 -1
  243. diffusers/schedulers/scheduling_deis_multistep.py +46 -19
  244. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -21
  245. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +1 -1
  246. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +9 -7
  247. diffusers/schedulers/scheduling_dpmsolver_sde.py +35 -35
  248. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +49 -18
  249. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +683 -0
  250. diffusers/schedulers/scheduling_edm_euler.py +381 -0
  251. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +43 -15
  252. diffusers/schedulers/scheduling_euler_discrete.py +42 -17
  253. diffusers/schedulers/scheduling_euler_discrete_flax.py +1 -1
  254. diffusers/schedulers/scheduling_heun_discrete.py +35 -35
  255. diffusers/schedulers/scheduling_ipndm.py +37 -11
  256. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +44 -44
  257. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +44 -44
  258. diffusers/schedulers/scheduling_karras_ve_flax.py +1 -1
  259. diffusers/schedulers/scheduling_lcm.py +38 -14
  260. diffusers/schedulers/scheduling_lms_discrete.py +43 -15
  261. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  262. diffusers/schedulers/scheduling_pndm.py +2 -4
  263. diffusers/schedulers/scheduling_pndm_flax.py +2 -4
  264. diffusers/schedulers/scheduling_repaint.py +1 -1
  265. diffusers/schedulers/scheduling_sasolver.py +41 -9
  266. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  267. diffusers/schedulers/scheduling_sde_ve_flax.py +1 -1
  268. diffusers/schedulers/scheduling_tcd.py +686 -0
  269. diffusers/schedulers/scheduling_unclip.py +1 -1
  270. diffusers/schedulers/scheduling_unipc_multistep.py +46 -19
  271. diffusers/schedulers/scheduling_utils.py +2 -1
  272. diffusers/schedulers/scheduling_utils_flax.py +1 -1
  273. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  274. diffusers/training_utils.py +9 -2
  275. diffusers/utils/__init__.py +2 -1
  276. diffusers/utils/accelerate_utils.py +1 -1
  277. diffusers/utils/constants.py +1 -1
  278. diffusers/utils/doc_utils.py +1 -1
  279. diffusers/utils/dummy_pt_objects.py +60 -0
  280. diffusers/utils/dummy_torch_and_transformers_objects.py +75 -0
  281. diffusers/utils/dynamic_modules_utils.py +1 -1
  282. diffusers/utils/export_utils.py +3 -3
  283. diffusers/utils/hub_utils.py +60 -16
  284. diffusers/utils/import_utils.py +15 -1
  285. diffusers/utils/loading_utils.py +2 -0
  286. diffusers/utils/logging.py +1 -1
  287. diffusers/utils/model_card_template.md +24 -0
  288. diffusers/utils/outputs.py +14 -7
  289. diffusers/utils/peft_utils.py +1 -1
  290. diffusers/utils/state_dict_utils.py +1 -1
  291. diffusers/utils/testing_utils.py +2 -0
  292. diffusers/utils/torch_utils.py +1 -1
  293. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/METADATA +46 -46
  294. diffusers-0.27.0.dist-info/RECORD +399 -0
  295. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/WHEEL +1 -1
  296. diffusers-0.26.3.dist-info/RECORD +0 -384
  297. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/LICENSE +0 -0
  298. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/entry_points.txt +0 -0
  299. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/top_level.txt +0 -0
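
The headline additions in this release are visible in the manifest above: the Stable Cascade model and pipelines (unet_stable_cascade.py plus the new stable_cascade pipeline package), the LEDITS++ editing pipelines, the EDM DPM-Solver, EDM Euler and TCD schedulers, and the FreeInit utilities (free_init_utils.py, reproduced at the end of this diff). As rough orientation, the new Stable Cascade pipelines follow a two-stage prior/decoder pattern; the sketch below is an assumption based on the 0.27.0 release notes, and the checkpoint ids, the image_embeddings hand-off, and the call parameters are not shown anywhere in this manifest.

from diffusers import StableCascadePriorPipeline, StableCascadeDecoderPipeline

# Assumed checkpoint ids and two-stage call pattern; not taken from this diff.
prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior").to("cuda")
decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade").to("cuda")

prompt = "an astronaut riding a horse, cinematic lighting"
prior_out = prior(prompt=prompt, num_inference_steps=20, guidance_scale=4.0)
image = decoder(
    image_embeddings=prior_out.image_embeddings,  # prior output feeds the decoder stage
    prompt=prompt,
    num_inference_steps=10,
    guidance_scale=0.0,
).images[0]
image.save("stable_cascade.png")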
@@ -1,4 +1,4 @@
- # Copyright 2023 Pix2Pix Zero Authors and The HuggingFace Team. All rights reserved.
+ # Copyright 2024 Pix2Pix Zero Authors and The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ from ....utils import (
  unscale_lora_layers,
  )
  from ....utils.torch_utils import randn_tensor
- from ...pipeline_utils import DiffusionPipeline
+ from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin
  from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
  from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker

@@ -280,7 +280,7 @@ class Pix2PixZeroAttnProcessor:
  return hidden_states


- class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline):
+ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin):
  r"""
  Pipeline for pixel-level image editing using Pix2Pix Zero. Based on Stable Diffusion.

@@ -463,7 +463,7 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline):
  batch_size = prompt_embeds.shape[0]

  if prompt_embeds is None:
- # textual inversion: procecss multi-vector tokens if necessary
+ # textual inversion: process multi-vector tokens if necessary
  if isinstance(self, TextualInversionLoaderMixin):
  prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

@@ -545,7 +545,7 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline):
  else:
  uncond_tokens = negative_prompt

- # textual inversion: procecss multi-vector tokens if necessary
+ # textual inversion: process multi-vector tokens if necessary
  if isinstance(self, TextualInversionLoaderMixin):
  uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

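The recurring change in the hunks above is that StableDiffusionPix2PixZeroPipeline now also inherits from StableDiffusionMixin, imported from pipeline_utils. The same refactor appears across most pipelines in this release and goes a long way toward explaining the large negative line counts in the manifest: shared helpers are defined once on the mixin instead of being copied into every pipeline. A minimal sketch of what that looks like from the user side, assuming the mixin carries helpers such as enable_vae_slicing() and enable_freeu(); the checkpoint id is illustrative.

from diffusers import StableDiffusionPipeline

# Illustrative checkpoint; the helpers are assumed to come from StableDiffusionMixin.
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("cuda")
pipe.enable_vae_slicing()                          # decode latents one image at a time
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)  # FreeU backbone/skip re-weighting
image = pipe("a photo of a red panda in the snow").images[0]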
@@ -1,4 +1,4 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -268,7 +268,6 @@ class GLIGENTextBoundingboxProjection(nn.Module):
  return objs


- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel with UNet2DConditionModel->UNetFlatConditionModel, nn.Conv2d->LinearMultiDim, Block2D->BlockFlat
  class UNetFlatConditionModel(ModelMixin, ConfigMixin):
  r"""
  A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
@@ -1334,7 +1333,7 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
  **additional_residuals,
  )
  else:
- sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
  if is_adapter and len(down_intrablock_additional_residuals) > 0:
  sample += down_intrablock_additional_residuals.pop(0)

@@ -1590,7 +1589,7 @@ class DownBlockFlat(nn.Module):
  self.gradient_checkpointing = False

  def forward(
- self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+ self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None
  ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
  output_states = ()

@@ -1612,13 +1611,13 @@ class DownBlockFlat(nn.Module):
  create_custom_forward(resnet), hidden_states, temb
  )
  else:
- hidden_states = resnet(hidden_states, temb, scale=scale)
+ hidden_states = resnet(hidden_states, temb)

  output_states = output_states + (hidden_states,)

  if self.downsamplers is not None:
  for downsampler in self.downsamplers:
- hidden_states = downsampler(hidden_states, scale=scale)
+ hidden_states = downsampler(hidden_states)

  output_states = output_states + (hidden_states,)

@@ -1729,8 +1728,6 @@ class CrossAttnDownBlockFlat(nn.Module):
  ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
  output_states = ()

- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
-
  blocks = list(zip(self.resnets, self.attentions))

  for i, (resnet, attn) in enumerate(blocks):
@@ -1761,7 +1758,7 @@ class CrossAttnDownBlockFlat(nn.Module):
  return_dict=False,
  )[0]
  else:
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+ hidden_states = resnet(hidden_states, temb)
  hidden_states = attn(
  hidden_states,
  encoder_hidden_states=encoder_hidden_states,
@@ -1779,7 +1776,7 @@ class CrossAttnDownBlockFlat(nn.Module):

  if self.downsamplers is not None:
  for downsampler in self.downsamplers:
- hidden_states = downsampler(hidden_states, scale=lora_scale)
+ hidden_states = downsampler(hidden_states)

  output_states = output_states + (hidden_states,)

@@ -1843,8 +1840,13 @@ class UpBlockFlat(nn.Module):
  res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
  temb: Optional[torch.FloatTensor] = None,
  upsample_size: Optional[int] = None,
- scale: float = 1.0,
+ *args,
+ **kwargs,
  ) -> torch.FloatTensor:
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+ deprecate("scale", "1.0.0", deprecation_message)
+
  is_freeu_enabled = (
  getattr(self, "s1", None)
  and getattr(self, "s2", None)
@@ -1888,11 +1890,11 @@ class UpBlockFlat(nn.Module):
  create_custom_forward(resnet), hidden_states, temb
  )
  else:
- hidden_states = resnet(hidden_states, temb, scale=scale)
+ hidden_states = resnet(hidden_states, temb)

  if self.upsamplers is not None:
  for upsampler in self.upsamplers:
- hidden_states = upsampler(hidden_states, upsample_size, scale=scale)
+ hidden_states = upsampler(hidden_states, upsample_size)

  return hidden_states

@@ -2000,7 +2002,10 @@ class CrossAttnUpBlockFlat(nn.Module):
  attention_mask: Optional[torch.FloatTensor] = None,
  encoder_attention_mask: Optional[torch.FloatTensor] = None,
  ) -> torch.FloatTensor:
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+ if cross_attention_kwargs is not None:
+ if cross_attention_kwargs.get("scale", None) is not None:
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+
  is_freeu_enabled = (
  getattr(self, "s1", None)
  and getattr(self, "s2", None)
@@ -2054,7 +2059,7 @@ class CrossAttnUpBlockFlat(nn.Module):
  return_dict=False,
  )[0]
  else:
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+ hidden_states = resnet(hidden_states, temb)
  hidden_states = attn(
  hidden_states,
  encoder_hidden_states=encoder_hidden_states,
@@ -2066,7 +2071,7 @@ class CrossAttnUpBlockFlat(nn.Module):

  if self.upsamplers is not None:
  for upsampler in self.upsamplers:
- hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale)
+ hidden_states = upsampler(hidden_states, upsample_size)

  return hidden_states

@@ -2159,7 +2164,7 @@ class UNetMidBlockFlat(nn.Module):
  attentions = []

  if attention_head_dim is None:
- logger.warn(
+ logger.warning(
  f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
  )
  attention_head_dim = in_channels
@@ -2331,8 +2336,11 @@ class UNetMidBlockFlatCrossAttn(nn.Module):
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  encoder_attention_mask: Optional[torch.FloatTensor] = None,
  ) -> torch.FloatTensor:
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
- hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+ if cross_attention_kwargs is not None:
+ if cross_attention_kwargs.get("scale", None) is not None:
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+
+ hidden_states = self.resnets[0](hidden_states, temb)
  for attn, resnet in zip(self.attentions, self.resnets[1:]):
  if self.training and self.gradient_checkpointing:

@@ -2369,7 +2377,7 @@ class UNetMidBlockFlatCrossAttn(nn.Module):
  encoder_attention_mask=encoder_attention_mask,
  return_dict=False,
  )[0]
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+ hidden_states = resnet(hidden_states, temb)

  return hidden_states

@@ -2470,7 +2478,8 @@ class UNetMidBlockFlatSimpleCrossAttn(nn.Module):
  encoder_attention_mask: Optional[torch.FloatTensor] = None,
  ) -> torch.FloatTensor:
  cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
- lora_scale = cross_attention_kwargs.get("scale", 1.0)
+ if cross_attention_kwargs.get("scale", None) is not None:
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")

  if attention_mask is None:
  # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
@@ -2483,7 +2492,7 @@ class UNetMidBlockFlatSimpleCrossAttn(nn.Module):
  # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
  mask = attention_mask

- hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+ hidden_states = self.resnets[0](hidden_states, temb)
  for attn, resnet in zip(self.attentions, self.resnets[1:]):
  # attn
  hidden_states = attn(
@@ -2494,6 +2503,6 @@ class UNetMidBlockFlatSimpleCrossAttn(nn.Module):
  )

  # resnet
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+ hidden_states = resnet(hidden_states, temb)

  return hidden_states
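The pattern in the modeling_text_unet.py hunks above is the removal of the per-forward scale argument: resnets, downsamplers and upsamplers are now called without it, UpBlockFlat.forward only accepts it through *args/**kwargs in order to emit a deprecation warning, and passing "scale" inside cross_attention_kwargs at the block level is ignored with a warning. With the peft-backed LoRA handling, the scale is applied once at the pipeline call instead of being threaded through every block. A hedged usage sketch follows; the LoRA repository id is hypothetical, and only the cross_attention_kwargs={"scale": ...} entry point is the documented pattern.

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.load_lora_weights("some-user/some-sd15-lora")  # hypothetical LoRA repository

# LoRA strength is set once per call via cross_attention_kwargs; the UNet blocks
# no longer accept a per-forward `scale` argument.
image = pipe(
    "a watercolor painting of a fox",
    cross_attention_kwargs={"scale": 0.7},
).images[0]
image.save("fox_lora.png")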
@@ -1,4 +1,4 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -1,4 +1,4 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -1,4 +1,4 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -246,7 +246,6 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
  extra_step_kwargs["generator"] = generator
  return extra_step_kwargs

- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
  def check_inputs(
  self,
  prompt,
@@ -1,4 +1,4 @@
- # Copyright 2023 Microsoft and The HuggingFace Team. All rights reserved.
+ # Copyright 2024 Microsoft and The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -4,7 +4,7 @@
  # Copyright (c) 2021 OpenAI
  # MIT License
  #
- # Copyright 2023 The HuggingFace Team. All rights reserved.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -0,0 +1,184 @@
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ from typing import Tuple, Union
+
+ import torch
+ import torch.fft as fft
+
+ from ..utils.torch_utils import randn_tensor
+
+
+ class FreeInitMixin:
+     r"""Mixin class for FreeInit."""
+
+     def enable_free_init(
+         self,
+         num_iters: int = 3,
+         use_fast_sampling: bool = False,
+         method: str = "butterworth",
+         order: int = 4,
+         spatial_stop_frequency: float = 0.25,
+         temporal_stop_frequency: float = 0.25,
+     ):
+         """Enables the FreeInit mechanism as in https://arxiv.org/abs/2312.07537.
+
+         This implementation has been adapted from the [official repository](https://github.com/TianxingWu/FreeInit).
+
+         Args:
+             num_iters (`int`, *optional*, defaults to `3`):
+                 Number of FreeInit noise re-initialization iterations.
+             use_fast_sampling (`bool`, *optional*, defaults to `False`):
+                 Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables
+                 the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
+             method (`str`, *optional*, defaults to `butterworth`):
+                 Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the
+                 FreeInit low pass filter.
+             order (`int`, *optional*, defaults to `4`):
+                 Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour
+                 whereas lower values lead to `gaussian` method behaviour.
+             spatial_stop_frequency (`float`, *optional*, defaults to `0.25`):
+                 Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in
+                 the original implementation.
+             temporal_stop_frequency (`float`, *optional*, defaults to `0.25`):
+                 Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in
+                 the original implementation.
+         """
+         self._free_init_num_iters = num_iters
+         self._free_init_use_fast_sampling = use_fast_sampling
+         self._free_init_method = method
+         self._free_init_order = order
+         self._free_init_spatial_stop_frequency = spatial_stop_frequency
+         self._free_init_temporal_stop_frequency = temporal_stop_frequency
+
+     def disable_free_init(self):
+         """Disables the FreeInit mechanism if enabled."""
+         self._free_init_num_iters = None
+
+     @property
+     def free_init_enabled(self):
+         return hasattr(self, "_free_init_num_iters") and self._free_init_num_iters is not None
+
+     def _get_free_init_freq_filter(
+         self,
+         shape: Tuple[int, ...],
+         device: Union[str, torch.dtype],
+         filter_type: str,
+         order: float,
+         spatial_stop_frequency: float,
+         temporal_stop_frequency: float,
+     ) -> torch.Tensor:
+         r"""Returns the FreeInit filter based on filter type and other input conditions."""
+
+         time, height, width = shape[-3], shape[-2], shape[-1]
+         mask = torch.zeros(shape)
+
+         if spatial_stop_frequency == 0 or temporal_stop_frequency == 0:
+             return mask
+
+         if filter_type == "butterworth":
+
+             def retrieve_mask(x):
+                 return 1 / (1 + (x / spatial_stop_frequency**2) ** order)
+         elif filter_type == "gaussian":
+
+             def retrieve_mask(x):
+                 return math.exp(-1 / (2 * spatial_stop_frequency**2) * x)
+         elif filter_type == "ideal":
+
+             def retrieve_mask(x):
+                 return 1 if x <= spatial_stop_frequency * 2 else 0
+         else:
+             raise NotImplementedError("`filter_type` must be one of gaussian, butterworth or ideal")
+
+         for t in range(time):
+             for h in range(height):
+                 for w in range(width):
+                     d_square = (
+                         ((spatial_stop_frequency / temporal_stop_frequency) * (2 * t / time - 1)) ** 2
+                         + (2 * h / height - 1) ** 2
+                         + (2 * w / width - 1) ** 2
+                     )
+                     mask[..., t, h, w] = retrieve_mask(d_square)
+
+         return mask.to(device)
+
+     def _apply_freq_filter(self, x: torch.Tensor, noise: torch.Tensor, low_pass_filter: torch.Tensor) -> torch.Tensor:
+         r"""Noise reinitialization."""
+         # FFT
+         x_freq = fft.fftn(x, dim=(-3, -2, -1))
+         x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1))
+         noise_freq = fft.fftn(noise, dim=(-3, -2, -1))
+         noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1))
+
+         # frequency mix
+         high_pass_filter = 1 - low_pass_filter
+         x_freq_low = x_freq * low_pass_filter
+         noise_freq_high = noise_freq * high_pass_filter
+         x_freq_mixed = x_freq_low + noise_freq_high  # mix in freq domain
+
+         # IFFT
+         x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1))
+         x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real
+
+         return x_mixed
+
+     def _apply_free_init(
+         self,
+         latents: torch.Tensor,
+         free_init_iteration: int,
+         num_inference_steps: int,
+         device: torch.device,
+         dtype: torch.dtype,
+         generator: torch.Generator,
+     ):
+         if free_init_iteration == 0:
+             self._free_init_initial_noise = latents.detach().clone()
+             return latents, self.scheduler.timesteps
+
+         latent_shape = latents.shape
+
+         free_init_filter_shape = (1, *latent_shape[1:])
+         free_init_freq_filter = self._get_free_init_freq_filter(
+             shape=free_init_filter_shape,
+             device=device,
+             filter_type=self._free_init_method,
+             order=self._free_init_order,
+             spatial_stop_frequency=self._free_init_spatial_stop_frequency,
+             temporal_stop_frequency=self._free_init_temporal_stop_frequency,
+         )
+
+         current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1
+         diffuse_timesteps = torch.full((latent_shape[0],), current_diffuse_timestep).long()
+
+         z_t = self.scheduler.add_noise(
+             original_samples=latents, noise=self._free_init_initial_noise, timesteps=diffuse_timesteps.to(device)
+         ).to(dtype=torch.float32)
+
+         z_rand = randn_tensor(
+             shape=latent_shape,
+             generator=generator,
+             device=device,
+             dtype=torch.float32,
+         )
+         latents = self._apply_freq_filter(z_t, z_rand, low_pass_filter=free_init_freq_filter)
+         latents = latents.to(dtype)
+
+         # Coarse-to-Fine Sampling for faster inference (can lead to lower quality)
+         if self._free_init_use_fast_sampling:
+             num_inference_steps = int(num_inference_steps / self._free_init_num_iters * (free_init_iteration + 1))
+             self.scheduler.set_timesteps(num_inference_steps, device=device)
+
+         return latents, self.scheduler.timesteps
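
The new free_init_utils.py adds FreeInitMixin, which the video pipelines touched in this release (for example pipeline_animatediff.py and pipeline_pia.py in the manifest) inherit to expose enable_free_init() and disable_free_init(). The mixin re-noises the low-frequency component of the latents over several sampling passes, trading extra compute for better temporal consistency. A usage sketch under stated assumptions: the motion-adapter and base-model checkpoint ids are illustrative, and only the enable_free_init() arguments shown above come from this diff.

import torch
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_gif

# Illustrative checkpoints; any SD 1.5-style base model paired with a motion adapter is assumed to work.
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = DDIMScheduler.from_config(
    pipe.scheduler.config, beta_schedule="linear", clip_sample=False, timestep_spacing="linspace"
)

# Re-initialize low-frequency noise over 3 passes (roughly 3x the sampling cost).
pipe.enable_free_init(num_iters=3, use_fast_sampling=False, method="butterworth")
output = pipe("a panda surfing on a wave, best quality", num_inference_steps=25)
pipe.disable_free_init()

export_to_gif(output.frames[0], "freeinit_panda.gif")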