diffusers 0.32.2__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (389)
  1. diffusers/__init__.py +186 -3
  2. diffusers/configuration_utils.py +40 -12
  3. diffusers/dependency_versions_table.py +9 -2
  4. diffusers/hooks/__init__.py +9 -0
  5. diffusers/hooks/faster_cache.py +653 -0
  6. diffusers/hooks/group_offloading.py +793 -0
  7. diffusers/hooks/hooks.py +236 -0
  8. diffusers/hooks/layerwise_casting.py +245 -0
  9. diffusers/hooks/pyramid_attention_broadcast.py +311 -0
  10. diffusers/loaders/__init__.py +6 -0
  11. diffusers/loaders/ip_adapter.py +38 -30
  12. diffusers/loaders/lora_base.py +121 -86
  13. diffusers/loaders/lora_conversion_utils.py +504 -44
  14. diffusers/loaders/lora_pipeline.py +1769 -181
  15. diffusers/loaders/peft.py +167 -57
  16. diffusers/loaders/single_file.py +17 -2
  17. diffusers/loaders/single_file_model.py +53 -5
  18. diffusers/loaders/single_file_utils.py +646 -72
  19. diffusers/loaders/textual_inversion.py +9 -9
  20. diffusers/loaders/transformer_flux.py +8 -9
  21. diffusers/loaders/transformer_sd3.py +120 -39
  22. diffusers/loaders/unet.py +20 -7
  23. diffusers/models/__init__.py +22 -0
  24. diffusers/models/activations.py +9 -9
  25. diffusers/models/attention.py +0 -1
  26. diffusers/models/attention_processor.py +163 -25
  27. diffusers/models/auto_model.py +169 -0
  28. diffusers/models/autoencoders/__init__.py +2 -0
  29. diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
  30. diffusers/models/autoencoders/autoencoder_dc.py +106 -4
  31. diffusers/models/autoencoders/autoencoder_kl.py +0 -4
  32. diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
  33. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
  34. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
  35. diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
  36. diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
  37. diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
  38. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
  39. diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
  40. diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
  41. diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
  42. diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
  43. diffusers/models/autoencoders/vae.py +31 -141
  44. diffusers/models/autoencoders/vq_model.py +3 -0
  45. diffusers/models/cache_utils.py +108 -0
  46. diffusers/models/controlnets/__init__.py +1 -0
  47. diffusers/models/controlnets/controlnet.py +3 -8
  48. diffusers/models/controlnets/controlnet_flux.py +14 -42
  49. diffusers/models/controlnets/controlnet_sd3.py +58 -34
  50. diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
  51. diffusers/models/controlnets/controlnet_union.py +27 -18
  52. diffusers/models/controlnets/controlnet_xs.py +7 -46
  53. diffusers/models/controlnets/multicontrolnet_union.py +196 -0
  54. diffusers/models/embeddings.py +18 -7
  55. diffusers/models/model_loading_utils.py +122 -80
  56. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  57. diffusers/models/modeling_flax_utils.py +1 -1
  58. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  59. diffusers/models/modeling_utils.py +617 -272
  60. diffusers/models/normalization.py +67 -14
  61. diffusers/models/resnet.py +1 -1
  62. diffusers/models/transformers/__init__.py +6 -0
  63. diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
  64. diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
  65. diffusers/models/transformers/consisid_transformer_3d.py +789 -0
  66. diffusers/models/transformers/dit_transformer_2d.py +5 -19
  67. diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
  68. diffusers/models/transformers/latte_transformer_3d.py +20 -15
  69. diffusers/models/transformers/lumina_nextdit2d.py +3 -1
  70. diffusers/models/transformers/pixart_transformer_2d.py +4 -19
  71. diffusers/models/transformers/prior_transformer.py +5 -1
  72. diffusers/models/transformers/sana_transformer.py +144 -40
  73. diffusers/models/transformers/stable_audio_transformer.py +5 -20
  74. diffusers/models/transformers/transformer_2d.py +7 -22
  75. diffusers/models/transformers/transformer_allegro.py +9 -17
  76. diffusers/models/transformers/transformer_cogview3plus.py +6 -17
  77. diffusers/models/transformers/transformer_cogview4.py +462 -0
  78. diffusers/models/transformers/transformer_easyanimate.py +527 -0
  79. diffusers/models/transformers/transformer_flux.py +68 -110
  80. diffusers/models/transformers/transformer_hunyuan_video.py +404 -46
  81. diffusers/models/transformers/transformer_ltx.py +53 -35
  82. diffusers/models/transformers/transformer_lumina2.py +548 -0
  83. diffusers/models/transformers/transformer_mochi.py +6 -17
  84. diffusers/models/transformers/transformer_omnigen.py +469 -0
  85. diffusers/models/transformers/transformer_sd3.py +56 -86
  86. diffusers/models/transformers/transformer_temporal.py +5 -11
  87. diffusers/models/transformers/transformer_wan.py +469 -0
  88. diffusers/models/unets/unet_1d.py +3 -1
  89. diffusers/models/unets/unet_2d.py +21 -20
  90. diffusers/models/unets/unet_2d_blocks.py +19 -243
  91. diffusers/models/unets/unet_2d_condition.py +4 -6
  92. diffusers/models/unets/unet_3d_blocks.py +14 -127
  93. diffusers/models/unets/unet_3d_condition.py +8 -12
  94. diffusers/models/unets/unet_i2vgen_xl.py +5 -13
  95. diffusers/models/unets/unet_kandinsky3.py +0 -4
  96. diffusers/models/unets/unet_motion_model.py +20 -114
  97. diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
  98. diffusers/models/unets/unet_stable_cascade.py +8 -35
  99. diffusers/models/unets/uvit_2d.py +1 -4
  100. diffusers/optimization.py +2 -2
  101. diffusers/pipelines/__init__.py +57 -8
  102. diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
  103. diffusers/pipelines/amused/pipeline_amused.py +15 -2
  104. diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
  105. diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
  106. diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
  107. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
  108. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
  109. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
  110. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
  111. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
  112. diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
  113. diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
  114. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
  115. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
  116. diffusers/pipelines/auto_pipeline.py +35 -14
  117. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  118. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
  119. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
  120. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
  121. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
  122. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
  123. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
  124. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
  125. diffusers/pipelines/cogview4/__init__.py +49 -0
  126. diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
  127. diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
  128. diffusers/pipelines/cogview4/pipeline_output.py +21 -0
  129. diffusers/pipelines/consisid/__init__.py +49 -0
  130. diffusers/pipelines/consisid/consisid_utils.py +357 -0
  131. diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
  132. diffusers/pipelines/consisid/pipeline_output.py +20 -0
  133. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
  134. diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
  135. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
  136. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
  137. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
  138. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
  139. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
  140. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
  141. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
  142. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
  143. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
  144. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
  145. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
  146. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
  147. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
  148. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
  149. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
  150. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
  151. diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
  152. diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
  153. diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
  154. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
  155. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
  156. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
  157. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
  158. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
  159. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
  160. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
  161. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  162. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
  163. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
  164. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
  165. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
  166. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
  167. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
  168. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
  169. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
  170. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  171. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  172. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
  173. diffusers/pipelines/dit/pipeline_dit.py +15 -2
  174. diffusers/pipelines/easyanimate/__init__.py +52 -0
  175. diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
  176. diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
  177. diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
  178. diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
  179. diffusers/pipelines/flux/pipeline_flux.py +53 -21
  180. diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
  181. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
  182. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
  183. diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
  184. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
  185. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
  186. diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
  187. diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
  188. diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
  189. diffusers/pipelines/free_noise_utils.py +3 -3
  190. diffusers/pipelines/hunyuan_video/__init__.py +4 -0
  191. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
  192. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
  193. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
  194. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
  195. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
  196. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
  197. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  198. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
  199. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
  200. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
  201. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
  202. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
  203. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
  204. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
  205. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
  206. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
  207. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
  208. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
  209. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
  210. diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
  211. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
  212. diffusers/pipelines/kolors/text_encoder.py +7 -34
  213. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
  214. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
  215. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
  216. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
  217. diffusers/pipelines/latte/pipeline_latte.py +36 -7
  218. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
  219. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
  220. diffusers/pipelines/ltx/__init__.py +2 -0
  221. diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
  222. diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
  223. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
  224. diffusers/pipelines/lumina/__init__.py +2 -2
  225. diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
  226. diffusers/pipelines/lumina2/__init__.py +48 -0
  227. diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
  228. diffusers/pipelines/marigold/__init__.py +2 -0
  229. diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
  230. diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
  231. diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
  232. diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
  233. diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
  234. diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
  235. diffusers/pipelines/omnigen/__init__.py +50 -0
  236. diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
  237. diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
  238. diffusers/pipelines/onnx_utils.py +5 -3
  239. diffusers/pipelines/pag/pag_utils.py +1 -1
  240. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
  241. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
  242. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
  243. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
  244. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
  245. diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
  246. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
  247. diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
  248. diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
  249. diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
  250. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
  251. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
  252. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
  253. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
  254. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
  255. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
  256. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
  257. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
  258. diffusers/pipelines/pia/pipeline_pia.py +13 -1
  259. diffusers/pipelines/pipeline_flax_utils.py +7 -7
  260. diffusers/pipelines/pipeline_loading_utils.py +193 -83
  261. diffusers/pipelines/pipeline_utils.py +221 -106
  262. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
  263. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
  264. diffusers/pipelines/sana/__init__.py +2 -0
  265. diffusers/pipelines/sana/pipeline_sana.py +183 -58
  266. diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
  267. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
  268. diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
  269. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
  270. diffusers/pipelines/shap_e/renderer.py +6 -6
  271. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
  272. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
  273. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
  274. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
  275. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
  276. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
  277. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
  278. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
  279. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  280. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
  281. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
  282. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
  283. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
  284. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
  285. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
  286. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
  287. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
  288. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
  289. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
  290. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
  291. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
  292. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
  293. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
  294. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
  295. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
  296. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
  297. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
  298. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
  299. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
  300. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
  301. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
  302. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
  303. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
  304. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
  305. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
  306. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  307. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
  308. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
  309. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
  310. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
  311. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
  312. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
  313. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
  314. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
  315. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
  316. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
  317. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
  318. diffusers/pipelines/transformers_loading_utils.py +121 -0
  319. diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
  320. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
  321. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
  322. diffusers/pipelines/wan/__init__.py +51 -0
  323. diffusers/pipelines/wan/pipeline_output.py +20 -0
  324. diffusers/pipelines/wan/pipeline_wan.py +593 -0
  325. diffusers/pipelines/wan/pipeline_wan_i2v.py +722 -0
  326. diffusers/pipelines/wan/pipeline_wan_video2video.py +725 -0
  327. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
  328. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
  329. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
  330. diffusers/quantizers/auto.py +5 -1
  331. diffusers/quantizers/base.py +5 -9
  332. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
  333. diffusers/quantizers/bitsandbytes/utils.py +30 -20
  334. diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
  335. diffusers/quantizers/gguf/utils.py +4 -2
  336. diffusers/quantizers/quantization_config.py +59 -4
  337. diffusers/quantizers/quanto/__init__.py +1 -0
  338. diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
  339. diffusers/quantizers/quanto/utils.py +60 -0
  340. diffusers/quantizers/torchao/__init__.py +1 -1
  341. diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
  342. diffusers/schedulers/__init__.py +2 -1
  343. diffusers/schedulers/scheduling_consistency_models.py +1 -2
  344. diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
  345. diffusers/schedulers/scheduling_ddpm.py +2 -3
  346. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
  347. diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
  348. diffusers/schedulers/scheduling_edm_euler.py +45 -10
  349. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
  350. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
  351. diffusers/schedulers/scheduling_heun_discrete.py +1 -1
  352. diffusers/schedulers/scheduling_lcm.py +1 -2
  353. diffusers/schedulers/scheduling_lms_discrete.py +1 -1
  354. diffusers/schedulers/scheduling_repaint.py +5 -1
  355. diffusers/schedulers/scheduling_scm.py +265 -0
  356. diffusers/schedulers/scheduling_tcd.py +1 -2
  357. diffusers/schedulers/scheduling_utils.py +2 -1
  358. diffusers/training_utils.py +14 -7
  359. diffusers/utils/__init__.py +9 -1
  360. diffusers/utils/constants.py +13 -1
  361. diffusers/utils/deprecation_utils.py +1 -1
  362. diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
  363. diffusers/utils/dummy_gguf_objects.py +17 -0
  364. diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
  365. diffusers/utils/dummy_pt_objects.py +233 -0
  366. diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
  367. diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
  368. diffusers/utils/dummy_torchao_objects.py +17 -0
  369. diffusers/utils/dynamic_modules_utils.py +1 -1
  370. diffusers/utils/export_utils.py +28 -3
  371. diffusers/utils/hub_utils.py +52 -102
  372. diffusers/utils/import_utils.py +121 -221
  373. diffusers/utils/loading_utils.py +2 -1
  374. diffusers/utils/logging.py +1 -2
  375. diffusers/utils/peft_utils.py +6 -14
  376. diffusers/utils/remote_utils.py +425 -0
  377. diffusers/utils/source_code_parsing_utils.py +52 -0
  378. diffusers/utils/state_dict_utils.py +15 -1
  379. diffusers/utils/testing_utils.py +243 -13
  380. diffusers/utils/torch_utils.py +10 -0
  381. diffusers/utils/typing_utils.py +91 -0
  382. diffusers/video_processor.py +1 -1
  383. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/METADATA +76 -44
  384. diffusers-0.33.0.dist-info/RECORD +608 -0
  385. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/WHEEL +1 -1
  386. diffusers-0.32.2.dist-info/RECORD +0 -550
  387. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/LICENSE +0 -0
  388. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/entry_points.txt +0 -0
  389. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/top_level.txt +0 -0
@@ -317,6 +317,7 @@ class AutoencoderOobleck(ModelMixin, ConfigMixin):
317
317
  """
318
318
 
319
319
  _supports_gradient_checkpointing = False
320
+ _supports_group_offloading = False
320
321
 
321
322
  @register_to_config
322
323
  def __init__(
@@ -154,10 +154,6 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
154
154
  self.register_to_config(block_out_channels=decoder_block_out_channels)
155
155
  self.register_to_config(force_upcast=False)
156
156
 
157
- def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
158
- if isinstance(module, (EncoderTiny, DecoderTiny)):
159
- module.gradient_checkpointing = value
160
-
161
157
  def scale_latents(self, x: torch.Tensor) -> torch.Tensor:
162
158
  """raw latents -> [0, 1]"""
163
159
  return x.div(2 * self.latent_magnitude).add(self.latent_shift).clamp(0, 1)
@@ -60,7 +60,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
60
60
 
61
61
  >>> vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
62
62
  >>> pipe = StableDiffusionPipeline.from_pretrained(
63
- ... "runwayml/stable-diffusion-v1-5", vae=vae, torch_dtype=torch.float16
63
+ ... "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, torch_dtype=torch.float16
64
64
  ... ).to("cuda")
65
65
 
66
66
  >>> image = pipe("horse", generator=torch.manual_seed(0)).images[0]
@@ -68,6 +68,8 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
68
68
  ```
69
69
  """
70
70
 
71
+ _supports_group_offloading = False
72
+
71
73
  @register_to_config
72
74
  def __init__(
73
75
  self,
@@ -18,7 +18,7 @@ import numpy as np
18
18
  import torch
19
19
  import torch.nn as nn
20
20
 
21
- from ...utils import BaseOutput, is_torch_version
21
+ from ...utils import BaseOutput
22
22
  from ...utils.torch_utils import randn_tensor
23
23
  from ..activations import get_activation
24
24
  from ..attention_processor import SpatialNorm
@@ -156,28 +156,11 @@ class Encoder(nn.Module):
156
156
  sample = self.conv_in(sample)
157
157
 
158
158
  if torch.is_grad_enabled() and self.gradient_checkpointing:
159
-
160
- def create_custom_forward(module):
161
- def custom_forward(*inputs):
162
- return module(*inputs)
163
-
164
- return custom_forward
165
-
166
159
  # down
167
- if is_torch_version(">=", "1.11.0"):
168
- for down_block in self.down_blocks:
169
- sample = torch.utils.checkpoint.checkpoint(
170
- create_custom_forward(down_block), sample, use_reentrant=False
171
- )
172
- # middle
173
- sample = torch.utils.checkpoint.checkpoint(
174
- create_custom_forward(self.mid_block), sample, use_reentrant=False
175
- )
176
- else:
177
- for down_block in self.down_blocks:
178
- sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample)
179
- # middle
180
- sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample)
160
+ for down_block in self.down_blocks:
161
+ sample = self._gradient_checkpointing_func(down_block, sample)
162
+ # middle
163
+ sample = self._gradient_checkpointing_func(self.mid_block, sample)
181
164
 
182
165
  else:
183
166
  # down
@@ -305,41 +288,13 @@ class Decoder(nn.Module):
305
288
 
306
289
  upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
307
290
  if torch.is_grad_enabled() and self.gradient_checkpointing:
291
+ # middle
292
+ sample = self._gradient_checkpointing_func(self.mid_block, sample, latent_embeds)
293
+ sample = sample.to(upscale_dtype)
308
294
 
309
- def create_custom_forward(module):
310
- def custom_forward(*inputs):
311
- return module(*inputs)
312
-
313
- return custom_forward
314
-
315
- if is_torch_version(">=", "1.11.0"):
316
- # middle
317
- sample = torch.utils.checkpoint.checkpoint(
318
- create_custom_forward(self.mid_block),
319
- sample,
320
- latent_embeds,
321
- use_reentrant=False,
322
- )
323
- sample = sample.to(upscale_dtype)
324
-
325
- # up
326
- for up_block in self.up_blocks:
327
- sample = torch.utils.checkpoint.checkpoint(
328
- create_custom_forward(up_block),
329
- sample,
330
- latent_embeds,
331
- use_reentrant=False,
332
- )
333
- else:
334
- # middle
335
- sample = torch.utils.checkpoint.checkpoint(
336
- create_custom_forward(self.mid_block), sample, latent_embeds
337
- )
338
- sample = sample.to(upscale_dtype)
339
-
340
- # up
341
- for up_block in self.up_blocks:
342
- sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
295
+ # up
296
+ for up_block in self.up_blocks:
297
+ sample = self._gradient_checkpointing_func(up_block, sample, latent_embeds)
343
298
  else:
344
299
  # middle
345
300
  sample = self.mid_block(sample, latent_embeds)
@@ -558,72 +513,28 @@ class MaskConditionDecoder(nn.Module):
558
513
 
559
514
  upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
560
515
  if torch.is_grad_enabled() and self.gradient_checkpointing:
516
+ # middle
517
+ sample = self._gradient_checkpointing_func(self.mid_block, sample, latent_embeds)
518
+ sample = sample.to(upscale_dtype)
561
519
 
562
- def create_custom_forward(module):
563
- def custom_forward(*inputs):
564
- return module(*inputs)
565
-
566
- return custom_forward
567
-
568
- if is_torch_version(">=", "1.11.0"):
569
- # middle
570
- sample = torch.utils.checkpoint.checkpoint(
571
- create_custom_forward(self.mid_block),
572
- sample,
573
- latent_embeds,
574
- use_reentrant=False,
575
- )
576
- sample = sample.to(upscale_dtype)
577
-
578
- # condition encoder
579
- if image is not None and mask is not None:
580
- masked_image = (1 - mask) * image
581
- im_x = torch.utils.checkpoint.checkpoint(
582
- create_custom_forward(self.condition_encoder),
583
- masked_image,
584
- mask,
585
- use_reentrant=False,
586
- )
587
-
588
- # up
589
- for up_block in self.up_blocks:
590
- if image is not None and mask is not None:
591
- sample_ = im_x[str(tuple(sample.shape))]
592
- mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest")
593
- sample = sample * mask_ + sample_ * (1 - mask_)
594
- sample = torch.utils.checkpoint.checkpoint(
595
- create_custom_forward(up_block),
596
- sample,
597
- latent_embeds,
598
- use_reentrant=False,
599
- )
600
- if image is not None and mask is not None:
601
- sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask)
602
- else:
603
- # middle
604
- sample = torch.utils.checkpoint.checkpoint(
605
- create_custom_forward(self.mid_block), sample, latent_embeds
520
+ # condition encoder
521
+ if image is not None and mask is not None:
522
+ masked_image = (1 - mask) * image
523
+ im_x = self._gradient_checkpointing_func(
524
+ self.condition_encoder,
525
+ masked_image,
526
+ mask,
606
527
  )
607
- sample = sample.to(upscale_dtype)
608
528
 
609
- # condition encoder
610
- if image is not None and mask is not None:
611
- masked_image = (1 - mask) * image
612
- im_x = torch.utils.checkpoint.checkpoint(
613
- create_custom_forward(self.condition_encoder),
614
- masked_image,
615
- mask,
616
- )
617
-
618
- # up
619
- for up_block in self.up_blocks:
620
- if image is not None and mask is not None:
621
- sample_ = im_x[str(tuple(sample.shape))]
622
- mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest")
623
- sample = sample * mask_ + sample_ * (1 - mask_)
624
- sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
529
+ # up
530
+ for up_block in self.up_blocks:
625
531
  if image is not None and mask is not None:
626
- sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask)
532
+ sample_ = im_x[str(tuple(sample.shape))]
533
+ mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest")
534
+ sample = sample * mask_ + sample_ * (1 - mask_)
535
+ sample = self._gradient_checkpointing_func(up_block, sample, latent_embeds)
536
+ if image is not None and mask is not None:
537
+ sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask)
627
538
  else:
628
539
  # middle
629
540
  sample = self.mid_block(sample, latent_embeds)
@@ -890,17 +801,7 @@ class EncoderTiny(nn.Module):
890
801
  def forward(self, x: torch.Tensor) -> torch.Tensor:
891
802
  r"""The forward method of the `EncoderTiny` class."""
892
803
  if torch.is_grad_enabled() and self.gradient_checkpointing:
893
-
894
- def create_custom_forward(module):
895
- def custom_forward(*inputs):
896
- return module(*inputs)
897
-
898
- return custom_forward
899
-
900
- if is_torch_version(">=", "1.11.0"):
901
- x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False)
902
- else:
903
- x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x)
804
+ x = self._gradient_checkpointing_func(self.layers, x)
904
805
 
905
806
  else:
906
807
  # scale image from [-1, 1] to [0, 1] to match TAESD convention
@@ -976,18 +877,7 @@ class DecoderTiny(nn.Module):
976
877
  x = torch.tanh(x / 3) * 3
977
878
 
978
879
  if torch.is_grad_enabled() and self.gradient_checkpointing:
979
-
980
- def create_custom_forward(module):
981
- def custom_forward(*inputs):
982
- return module(*inputs)
983
-
984
- return custom_forward
985
-
986
- if is_torch_version(">=", "1.11.0"):
987
- x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False)
988
- else:
989
- x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x)
990
-
880
+ x = self._gradient_checkpointing_func(self.layers, x)
991
881
  else:
992
882
  x = self.layers(x)
993
883
 
@@ -71,6 +71,9 @@ class VQModel(ModelMixin, ConfigMixin):
71
71
  Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
72
72
  """
73
73
 
74
+ _skip_layerwise_casting_patterns = ["quantize"]
75
+ _supports_group_offloading = False
76
+
74
77
  @register_to_config
75
78
  def __init__(
76
79
  self,
@@ -0,0 +1,108 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from ..utils.logging import get_logger
16
+
17
+
18
+ logger = get_logger(__name__) # pylint: disable=invalid-name
19
+
20
+
21
+ class CacheMixin:
22
+ r"""
23
+ A class for enable/disabling caching techniques on diffusion models.
24
+
25
+ Supported caching techniques:
26
+ - [Pyramid Attention Broadcast](https://huggingface.co/papers/2408.12588)
27
+ - [FasterCache](https://huggingface.co/papers/2410.19355)
28
+ """
29
+
30
+ _cache_config = None
31
+
32
+ @property
33
+ def is_cache_enabled(self) -> bool:
34
+ return self._cache_config is not None
35
+
36
+ def enable_cache(self, config) -> None:
37
+ r"""
38
+ Enable caching techniques on the model.
39
+
40
+ Args:
41
+ config (`Union[PyramidAttentionBroadcastConfig]`):
42
+ The configuration for applying the caching technique. Currently supported caching techniques are:
43
+ - [`~hooks.PyramidAttentionBroadcastConfig`]
44
+
45
+ Example:
46
+
47
+ ```python
48
+ >>> import torch
49
+ >>> from diffusers import CogVideoXPipeline, PyramidAttentionBroadcastConfig
50
+
51
+ >>> pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
52
+ >>> pipe.to("cuda")
53
+
54
+ >>> config = PyramidAttentionBroadcastConfig(
55
+ ... spatial_attention_block_skip_range=2,
56
+ ... spatial_attention_timestep_skip_range=(100, 800),
57
+ ... current_timestep_callback=lambda: pipe.current_timestep,
58
+ ... )
59
+ >>> pipe.transformer.enable_cache(config)
60
+ ```
61
+ """
62
+
63
+ from ..hooks import (
64
+ FasterCacheConfig,
65
+ PyramidAttentionBroadcastConfig,
66
+ apply_faster_cache,
67
+ apply_pyramid_attention_broadcast,
68
+ )
69
+
70
+ if self.is_cache_enabled:
71
+ raise ValueError(
72
+ f"Caching has already been enabled with {type(self._cache_config)}. To apply a new caching technique, please disable the existing one first."
73
+ )
74
+
75
+ if isinstance(config, PyramidAttentionBroadcastConfig):
76
+ apply_pyramid_attention_broadcast(self, config)
77
+ elif isinstance(config, FasterCacheConfig):
78
+ apply_faster_cache(self, config)
79
+ else:
80
+ raise ValueError(f"Cache config {type(config)} is not supported.")
81
+
82
+ self._cache_config = config
83
+
84
+ def disable_cache(self) -> None:
85
+ from ..hooks import FasterCacheConfig, HookRegistry, PyramidAttentionBroadcastConfig
86
+ from ..hooks.faster_cache import _FASTER_CACHE_BLOCK_HOOK, _FASTER_CACHE_DENOISER_HOOK
87
+ from ..hooks.pyramid_attention_broadcast import _PYRAMID_ATTENTION_BROADCAST_HOOK
88
+
89
+ if self._cache_config is None:
90
+ logger.warning("Caching techniques have not been enabled, so there's nothing to disable.")
91
+ return
92
+
93
+ if isinstance(self._cache_config, PyramidAttentionBroadcastConfig):
94
+ registry = HookRegistry.check_if_exists_or_initialize(self)
95
+ registry.remove_hook(_PYRAMID_ATTENTION_BROADCAST_HOOK, recurse=True)
96
+ elif isinstance(self._cache_config, FasterCacheConfig):
97
+ registry = HookRegistry.check_if_exists_or_initialize(self)
98
+ registry.remove_hook(_FASTER_CACHE_DENOISER_HOOK, recurse=True)
99
+ registry.remove_hook(_FASTER_CACHE_BLOCK_HOOK, recurse=True)
100
+ else:
101
+ raise ValueError(f"Cache config {type(self._cache_config)} is not supported.")
102
+
103
+ self._cache_config = None
104
+
105
+ def _reset_stateful_cache(self, recurse: bool = True) -> None:
106
+ from ..hooks import HookRegistry
107
+
108
+ HookRegistry.check_if_exists_or_initialize(self).reset_stateful_hooks(recurse=recurse)
@@ -18,6 +18,7 @@ if is_torch_available():
18
18
  from .controlnet_union import ControlNetUnionModel
19
19
  from .controlnet_xs import ControlNetXSAdapter, ControlNetXSOutput, UNetControlNetXSModel
20
20
  from .multicontrolnet import MultiControlNetModel
21
+ from .multicontrolnet_union import MultiControlNetUnionModel
21
22
 
22
23
  if is_flax_available():
23
24
  from .controlnet_flax import FlaxControlNetModel
@@ -31,8 +31,6 @@ from ..attention_processor import (
31
31
  from ..embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
32
32
  from ..modeling_utils import ModelMixin
33
33
  from ..unets.unet_2d_blocks import (
34
- CrossAttnDownBlock2D,
35
- DownBlock2D,
36
34
  UNetMidBlock2D,
37
35
  UNetMidBlock2DCrossAttn,
38
36
  get_down_block,
@@ -659,10 +657,6 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
659
657
  for module in self.children():
660
658
  fn_recursive_set_attention_slice(module, reversed_slice_size)
661
659
 
662
- def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
663
- if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):
664
- module.gradient_checkpointing = value
665
-
666
660
  def forward(
667
661
  self,
668
662
  sample: torch.Tensor,
@@ -740,10 +734,11 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
740
734
  # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
741
735
  # This would be a good case for the `match` statement (Python 3.10+)
742
736
  is_mps = sample.device.type == "mps"
737
+ is_npu = sample.device.type == "npu"
743
738
  if isinstance(timestep, float):
744
- dtype = torch.float32 if is_mps else torch.float64
739
+ dtype = torch.float32 if (is_mps or is_npu) else torch.float64
745
740
  else:
746
- dtype = torch.int32 if is_mps else torch.int64
741
+ dtype = torch.int32 if (is_mps or is_npu) else torch.int64
747
742
  timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
748
743
  elif len(timesteps.shape) == 0:
749
744
  timesteps = timesteps[None].to(sample.device)
@@ -22,7 +22,7 @@ from ...configuration_utils import ConfigMixin, register_to_config
22
22
  from ...loaders import PeftAdapterMixin
23
23
  from ...models.attention_processor import AttentionProcessor
24
24
  from ...models.modeling_utils import ModelMixin
25
- from ...utils import USE_PEFT_BACKEND, BaseOutput, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
25
+ from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
26
26
  from ..controlnets.controlnet import ControlNetConditioningEmbedding, zero_module
27
27
  from ..embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
28
28
  from ..modeling_outputs import Transformer2DModelOutput
@@ -178,10 +178,6 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
178
178
  for name, module in self.named_children():
179
179
  fn_recursive_attn_processor(name, module, processor)
180
180
 
181
- def _set_gradient_checkpointing(self, module, value=False):
182
- if hasattr(module, "gradient_checkpointing"):
183
- module.gradient_checkpointing = value
184
-
185
181
  @classmethod
186
182
  def from_transformer(
187
183
  cls,
@@ -302,15 +298,6 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
302
298
  )
303
299
  encoder_hidden_states = self.context_embedder(encoder_hidden_states)
304
300
 
305
- if self.union:
306
- # union mode
307
- if controlnet_mode is None:
308
- raise ValueError("`controlnet_mode` cannot be `None` when applying ControlNet-Union")
309
- # union mode emb
310
- controlnet_mode_emb = self.controlnet_mode_embedder(controlnet_mode)
311
- encoder_hidden_states = torch.cat([controlnet_mode_emb, encoder_hidden_states], dim=1)
312
- txt_ids = torch.cat([txt_ids[:1], txt_ids], dim=0)
313
-
314
301
  if txt_ids.ndim == 3:
315
302
  logger.warning(
316
303
  "Passing `txt_ids` 3d torch.Tensor is deprecated."
@@ -324,30 +311,27 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
324
311
  )
325
312
  img_ids = img_ids[0]
326
313
 
314
+ if self.union:
315
+ # union mode
316
+ if controlnet_mode is None:
317
+ raise ValueError("`controlnet_mode` cannot be `None` when applying ControlNet-Union")
318
+ # union mode emb
319
+ controlnet_mode_emb = self.controlnet_mode_embedder(controlnet_mode)
320
+ encoder_hidden_states = torch.cat([controlnet_mode_emb, encoder_hidden_states], dim=1)
321
+ txt_ids = torch.cat([txt_ids[:1], txt_ids], dim=0)
322
+
327
323
  ids = torch.cat((txt_ids, img_ids), dim=0)
328
324
  image_rotary_emb = self.pos_embed(ids)
329
325
 
330
326
  block_samples = ()
331
327
  for index_block, block in enumerate(self.transformer_blocks):
332
328
  if torch.is_grad_enabled() and self.gradient_checkpointing:
333
-
334
- def create_custom_forward(module, return_dict=None):
335
- def custom_forward(*inputs):
336
- if return_dict is not None:
337
- return module(*inputs, return_dict=return_dict)
338
- else:
339
- return module(*inputs)
340
-
341
- return custom_forward
342
-
343
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
344
- encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
345
- create_custom_forward(block),
329
+ encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
330
+ block,
346
331
  hidden_states,
347
332
  encoder_hidden_states,
348
333
  temb,
349
334
  image_rotary_emb,
350
- **ckpt_kwargs,
351
335
  )
352
336
 
353
337
  else:
@@ -364,23 +348,11 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
364
348
  single_block_samples = ()
365
349
  for index_block, block in enumerate(self.single_transformer_blocks):
366
350
  if torch.is_grad_enabled() and self.gradient_checkpointing:
367
-
368
- def create_custom_forward(module, return_dict=None):
369
- def custom_forward(*inputs):
370
- if return_dict is not None:
371
- return module(*inputs, return_dict=return_dict)
372
- else:
373
- return module(*inputs)
374
-
375
- return custom_forward
376
-
377
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
378
- hidden_states = torch.utils.checkpoint.checkpoint(
379
- create_custom_forward(block),
351
+ hidden_states = self._gradient_checkpointing_func(
352
+ block,
380
353
  hidden_states,
381
354
  temb,
382
355
  image_rotary_emb,
383
- **ckpt_kwargs,
384
356
  )
385
357
 
386
358
  else: