diffusers 0.27.2__py3-none-any.whl → 0.28.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +26 -1
- diffusers/callbacks.py +156 -0
- diffusers/commands/env.py +110 -6
- diffusers/configuration_utils.py +33 -11
- diffusers/dependency_versions_table.py +2 -1
- diffusers/image_processor.py +158 -45
- diffusers/loaders/__init__.py +2 -5
- diffusers/loaders/autoencoder.py +4 -4
- diffusers/loaders/controlnet.py +4 -4
- diffusers/loaders/ip_adapter.py +80 -22
- diffusers/loaders/lora.py +134 -20
- diffusers/loaders/lora_conversion_utils.py +46 -43
- diffusers/loaders/peft.py +4 -3
- diffusers/loaders/single_file.py +401 -170
- diffusers/loaders/single_file_model.py +290 -0
- diffusers/loaders/single_file_utils.py +616 -672
- diffusers/loaders/textual_inversion.py +41 -20
- diffusers/loaders/unet.py +168 -115
- diffusers/loaders/unet_loader_utils.py +163 -0
- diffusers/models/__init__.py +8 -0
- diffusers/models/activations.py +23 -3
- diffusers/models/attention.py +10 -11
- diffusers/models/attention_processor.py +475 -148
- diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
- diffusers/models/autoencoders/autoencoder_kl.py +18 -19
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
- diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
- diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
- diffusers/models/autoencoders/vae.py +23 -24
- diffusers/models/controlnet.py +12 -9
- diffusers/models/controlnet_flax.py +4 -4
- diffusers/models/controlnet_xs.py +1915 -0
- diffusers/models/downsampling.py +17 -18
- diffusers/models/embeddings.py +363 -32
- diffusers/models/model_loading_utils.py +177 -0
- diffusers/models/modeling_flax_pytorch_utils.py +2 -1
- diffusers/models/modeling_flax_utils.py +4 -4
- diffusers/models/modeling_outputs.py +14 -0
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +175 -99
- diffusers/models/normalization.py +2 -1
- diffusers/models/resnet.py +18 -23
- diffusers/models/transformer_temporal.py +3 -3
- diffusers/models/transformers/__init__.py +3 -0
- diffusers/models/transformers/dit_transformer_2d.py +240 -0
- diffusers/models/transformers/dual_transformer_2d.py +4 -4
- diffusers/models/transformers/hunyuan_transformer_2d.py +427 -0
- diffusers/models/transformers/pixart_transformer_2d.py +336 -0
- diffusers/models/transformers/prior_transformer.py +7 -7
- diffusers/models/transformers/t5_film_transformer.py +17 -19
- diffusers/models/transformers/transformer_2d.py +292 -184
- diffusers/models/transformers/transformer_temporal.py +10 -10
- diffusers/models/unets/unet_1d.py +5 -5
- diffusers/models/unets/unet_1d_blocks.py +29 -29
- diffusers/models/unets/unet_2d.py +6 -6
- diffusers/models/unets/unet_2d_blocks.py +137 -128
- diffusers/models/unets/unet_2d_condition.py +19 -15
- diffusers/models/unets/unet_2d_condition_flax.py +6 -5
- diffusers/models/unets/unet_3d_blocks.py +79 -77
- diffusers/models/unets/unet_3d_condition.py +13 -9
- diffusers/models/unets/unet_i2vgen_xl.py +14 -13
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +114 -14
- diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
- diffusers/models/unets/unet_stable_cascade.py +16 -13
- diffusers/models/upsampling.py +17 -20
- diffusers/models/vq_model.py +16 -15
- diffusers/pipelines/__init__.py +27 -3
- diffusers/pipelines/amused/pipeline_amused.py +12 -12
- diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
- diffusers/pipelines/animatediff/pipeline_output.py +3 -2
- diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
- diffusers/pipelines/auto_pipeline.py +21 -17
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
- diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
- diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
- diffusers/pipelines/controlnet_xs/__init__.py +68 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
- diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -18
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
- diffusers/pipelines/dit/pipeline_dit.py +7 -4
- diffusers/pipelines/free_init_utils.py +39 -38
- diffusers/pipelines/hunyuandit/__init__.py +48 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +881 -0
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
- diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
- diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
- diffusers/pipelines/marigold/__init__.py +50 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
- diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
- diffusers/pipelines/pia/pipeline_pia.py +39 -125
- diffusers/pipelines/pipeline_flax_utils.py +4 -4
- diffusers/pipelines/pipeline_loading_utils.py +269 -23
- diffusers/pipelines/pipeline_utils.py +266 -37
- diffusers/pipelines/pixart_alpha/__init__.py +8 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +69 -79
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
- diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
- diffusers/pipelines/shap_e/renderer.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +18 -18
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
- diffusers/pipelines/stable_diffusion/__init__.py +0 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
- diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -39
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
- diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
- diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
- diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
- diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
- diffusers/schedulers/__init__.py +2 -2
- diffusers/schedulers/deprecated/__init__.py +1 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
- diffusers/schedulers/scheduling_amused.py +5 -5
- diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
- diffusers/schedulers/scheduling_consistency_models.py +20 -26
- diffusers/schedulers/scheduling_ddim.py +22 -24
- diffusers/schedulers/scheduling_ddim_flax.py +2 -1
- diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
- diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
- diffusers/schedulers/scheduling_ddpm.py +20 -22
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
- diffusers/schedulers/scheduling_deis_multistep.py +42 -42
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +103 -77
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
- diffusers/schedulers/scheduling_dpmsolver_sde.py +23 -23
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +86 -65
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +75 -54
- diffusers/schedulers/scheduling_edm_euler.py +50 -31
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +23 -29
- diffusers/schedulers/scheduling_euler_discrete.py +160 -68
- diffusers/schedulers/scheduling_heun_discrete.py +57 -39
- diffusers/schedulers/scheduling_ipndm.py +8 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +19 -19
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +19 -19
- diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
- diffusers/schedulers/scheduling_lcm.py +21 -23
- diffusers/schedulers/scheduling_lms_discrete.py +24 -26
- diffusers/schedulers/scheduling_pndm.py +20 -20
- diffusers/schedulers/scheduling_repaint.py +20 -20
- diffusers/schedulers/scheduling_sasolver.py +55 -54
- diffusers/schedulers/scheduling_sde_ve.py +19 -19
- diffusers/schedulers/scheduling_tcd.py +39 -30
- diffusers/schedulers/scheduling_unclip.py +15 -15
- diffusers/schedulers/scheduling_unipc_multistep.py +111 -41
- diffusers/schedulers/scheduling_utils.py +14 -5
- diffusers/schedulers/scheduling_utils_flax.py +3 -3
- diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
- diffusers/training_utils.py +56 -1
- diffusers/utils/__init__.py +7 -0
- diffusers/utils/doc_utils.py +1 -0
- diffusers/utils/dummy_pt_objects.py +75 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +105 -0
- diffusers/utils/dynamic_modules_utils.py +24 -11
- diffusers/utils/hub_utils.py +3 -2
- diffusers/utils/import_utils.py +91 -0
- diffusers/utils/loading_utils.py +2 -2
- diffusers/utils/logging.py +1 -1
- diffusers/utils/peft_utils.py +32 -5
- diffusers/utils/state_dict_utils.py +11 -2
- diffusers/utils/testing_utils.py +71 -6
- diffusers/utils/torch_utils.py +1 -0
- diffusers/video_processor.py +113 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/METADATA +7 -7
- diffusers-0.28.1.dist-info/RECORD +419 -0
- diffusers-0.27.2.dist-info/RECORD +0 -399
- {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/LICENSE +0 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/WHEEL +0 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/top_level.txt +0 -0
diffusers/models/autoencoders/autoencoder_asym_kl.py

@@ -112,9 +112,7 @@ class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin):
         self.register_to_config(force_upcast=False)
 
     @apply_forward_hook
-    def encode(
-        self, x: torch.FloatTensor, return_dict: bool = True
-    ) -> Union[AutoencoderKLOutput, Tuple[torch.FloatTensor]]:
+    def encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[AutoencoderKLOutput, Tuple[torch.Tensor]]:
         h = self.encoder(x)
         moments = self.quant_conv(h)
         posterior = DiagonalGaussianDistribution(moments)
@@ -126,11 +124,11 @@ class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin):
 
     def _decode(
         self,
-        z: torch.FloatTensor,
-        image: Optional[torch.FloatTensor] = None,
-        mask: Optional[torch.FloatTensor] = None,
+        z: torch.Tensor,
+        image: Optional[torch.Tensor] = None,
+        mask: Optional[torch.Tensor] = None,
         return_dict: bool = True,
-    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+    ) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
         z = self.post_quant_conv(z)
         dec = self.decoder(z, image, mask)
 
@@ -142,12 +140,12 @@ class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin):
     @apply_forward_hook
     def decode(
         self,
-        z: torch.FloatTensor,
+        z: torch.Tensor,
         generator: Optional[torch.Generator] = None,
-        image: Optional[torch.FloatTensor] = None,
-        mask: Optional[torch.FloatTensor] = None,
+        image: Optional[torch.Tensor] = None,
+        mask: Optional[torch.Tensor] = None,
         return_dict: bool = True,
-    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+    ) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
         decoded = self._decode(z, image, mask).sample
 
         if not return_dict:
@@ -157,16 +155,16 @@ class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin):
 
     def forward(
         self,
-        sample: torch.FloatTensor,
-        mask: Optional[torch.FloatTensor] = None,
+        sample: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
         sample_posterior: bool = False,
         return_dict: bool = True,
         generator: Optional[torch.Generator] = None,
-    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+    ) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
         r"""
         Args:
-            sample (`torch.FloatTensor`): Input sample.
-            mask (`torch.FloatTensor`, *optional*, defaults to `None`): Optional inpainting mask.
+            sample (`torch.Tensor`): Input sample.
+            mask (`torch.Tensor`, *optional*, defaults to `None`): Optional inpainting mask.
             sample_posterior (`bool`, *optional*, defaults to `False`):
                 Whether to sample from the posterior.
             return_dict (`bool`, *optional*, defaults to `True`):
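
The hunks above, like most of the signature changes in the autoencoder files that follow, relax public type hints from `torch.FloatTensor` to `torch.Tensor`; call sites do not change. A minimal sketch of the unchanged encode/decode round trip, assuming the `cross-attention/asymmetric-autoencoder-kl-x-1-5` checkpoint id and the tensor shapes below (neither is part of this diff):

```python
# Sketch only: checkpoint id and tensor shapes are assumptions for illustration.
import torch
from diffusers import AsymmetricAutoencoderKL

vae = AsymmetricAutoencoderKL.from_pretrained("cross-attention/asymmetric-autoencoder-kl-x-1-5")
image = torch.randn(1, 3, 256, 256)   # any torch.Tensor, as the new hints document
mask = torch.ones(1, 1, 256, 256)     # optional inpainting mask
latents = vae.encode(image).latent_dist.sample()
reconstruction = vae.decode(latents, image=image, mask=mask).sample
```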
diffusers/models/autoencoders/autoencoder_kl.py

@@ -17,7 +17,7 @@ import torch
 import torch.nn as nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
-from ...loaders import FromOriginalVAEMixin
+from ...loaders.single_file_model import FromOriginalModelMixin
 from ...utils.accelerate_utils import apply_forward_hook
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -32,7 +32,7 @@ from ..modeling_utils import ModelMixin
 from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
 
 
-class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
+class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
     r"""
     A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
 
@@ -65,6 +65,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
     """
 
     _supports_gradient_checkpointing = True
+    _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"]
 
     @register_to_config
     def __init__(
@@ -236,13 +237,13 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
 
     @apply_forward_hook
     def encode(
-        self, x: torch.FloatTensor, return_dict: bool = True
+        self, x: torch.Tensor, return_dict: bool = True
     ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
         """
         Encode a batch of images into latents.
 
         Args:
-            x (`torch.FloatTensor`): Input batch of images.
+            x (`torch.Tensor`): Input batch of images.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
 
@@ -267,7 +268,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
 
         return AutoencoderKLOutput(latent_dist=posterior)
 
-    def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+    def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
         if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
             return self.tiled_decode(z, return_dict=return_dict)
 
@@ -280,14 +281,12 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
         return DecoderOutput(sample=dec)
 
     @apply_forward_hook
-    def decode(
-        self, z: torch.FloatTensor, return_dict: bool = True, generator=None
-    ) -> Union[DecoderOutput, torch.FloatTensor]:
+    def decode(self, z: torch.Tensor, return_dict: bool = True, generator=None) -> Union[DecoderOutput, torch.Tensor]:
         """
         Decode a batch of images.
 
         Args:
-            z (`torch.FloatTensor`): Input batch of latent vectors.
+            z (`torch.Tensor`): Input batch of latent vectors.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
 
@@ -301,7 +300,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
             decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
             decoded = torch.cat(decoded_slices)
         else:
-            decoded = self._decode(z).sample
+            decoded = self._decode(z, return_dict=False)[0]
 
         if not return_dict:
             return (decoded,)
@@ -320,7 +319,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
             b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
         return b
 
-    def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
+    def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> AutoencoderKLOutput:
         r"""Encode a batch of images using a tiled encoder.
 
         When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
@@ -330,7 +329,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
         output, but they should be much less noticeable.
 
         Args:
-            x (`torch.FloatTensor`): Input batch of images.
+            x (`torch.Tensor`): Input batch of images.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
 
@@ -374,12 +373,12 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
 
         return AutoencoderKLOutput(latent_dist=posterior)
 
-    def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+    def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
         r"""
         Decode a batch of images using a tiled decoder.
 
         Args:
-            z (`torch.FloatTensor`): Input batch of latent vectors.
+            z (`torch.Tensor`): Input batch of latent vectors.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
 
@@ -424,14 +423,14 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
 
     def forward(
         self,
-        sample: torch.FloatTensor,
+        sample: torch.Tensor,
         sample_posterior: bool = False,
         return_dict: bool = True,
         generator: Optional[torch.Generator] = None,
-    ) -> Union[DecoderOutput, torch.FloatTensor]:
+    ) -> Union[DecoderOutput, torch.Tensor]:
         r"""
         Args:
-            sample (`torch.FloatTensor`): Input sample.
+            sample (`torch.Tensor`): Input sample.
             sample_posterior (`bool`, *optional*, defaults to `False`):
                 Whether to sample from the posterior.
             return_dict (`bool`, *optional*, defaults to `True`):
@@ -453,8 +452,8 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
     def fuse_qkv_projections(self):
         """
-        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
-        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
 
         <Tip warning={true}>
 
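
With this release, `AutoencoderKL` takes its single-file loader from the new `diffusers/loaders/single_file_model.py` module (`FromOriginalModelMixin`, replacing `FromOriginalVAEMixin`), and the new `_no_split_modules` attribute tells the model-splitting machinery which blocks must stay on a single device. A minimal sketch of single-file loading; the checkpoint path below is a placeholder, not something from this diff:

```python
# Sketch only: the .safetensors path is a placeholder.
import torch
from diffusers import AutoencoderKL

# from_single_file() is provided by FromOriginalModelMixin (loaders/single_file_model.py)
vae = AutoencoderKL.from_single_file("path/to/vae.safetensors", torch_dtype=torch.float16)
vae.to("cuda")
```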
diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py

@@ -86,10 +86,10 @@ class TemporalDecoder(nn.Module):
 
     def forward(
         self,
-        sample: torch.FloatTensor,
-        image_only_indicator: torch.FloatTensor,
+        sample: torch.Tensor,
+        image_only_indicator: torch.Tensor,
         num_frames: int = 1,
-    ) -> torch.FloatTensor:
+    ) -> torch.Tensor:
         r"""The forward method of the `Decoder` class."""
 
         sample = self.conv_in(sample)
@@ -315,13 +315,13 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin):
 
     @apply_forward_hook
     def encode(
-        self, x: torch.FloatTensor, return_dict: bool = True
+        self, x: torch.Tensor, return_dict: bool = True
     ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
         """
         Encode a batch of images into latents.
 
         Args:
-            x (`torch.FloatTensor`): Input batch of images.
+            x (`torch.Tensor`): Input batch of images.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
 
@@ -341,15 +341,15 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin):
     @apply_forward_hook
     def decode(
         self,
-        z: torch.FloatTensor,
+        z: torch.Tensor,
         num_frames: int,
         return_dict: bool = True,
-    ) -> Union[DecoderOutput, torch.FloatTensor]:
+    ) -> Union[DecoderOutput, torch.Tensor]:
         """
         Decode a batch of images.
 
         Args:
-            z (`torch.FloatTensor`): Input batch of latent vectors.
+            z (`torch.Tensor`): Input batch of latent vectors.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
 
@@ -370,15 +370,15 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin):
 
     def forward(
         self,
-        sample: torch.FloatTensor,
+        sample: torch.Tensor,
         sample_posterior: bool = False,
         return_dict: bool = True,
         generator: Optional[torch.Generator] = None,
         num_frames: int = 1,
-    ) -> Union[DecoderOutput, torch.FloatTensor]:
+    ) -> Union[DecoderOutput, torch.Tensor]:
         r"""
         Args:
-            sample (`torch.FloatTensor`): Input sample.
+            sample (`torch.Tensor`): Input sample.
             sample_posterior (`bool`, *optional*, defaults to `False`):
                 Whether to sample from the posterior.
             return_dict (`bool`, *optional*, defaults to `True`):
diffusers/models/autoencoders/autoencoder_tiny.py

@@ -102,6 +102,7 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
         encoder_block_out_channels: Tuple[int, ...] = (64, 64, 64, 64),
         decoder_block_out_channels: Tuple[int, ...] = (64, 64, 64, 64),
         act_fn: str = "relu",
+        upsample_fn: str = "nearest",
         latent_channels: int = 4,
         upsampling_scaling_factor: int = 2,
         num_encoder_blocks: Tuple[int, ...] = (1, 3, 3, 3),
@@ -133,6 +134,7 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
             block_out_channels=decoder_block_out_channels,
             upsampling_scaling_factor=upsampling_scaling_factor,
             act_fn=act_fn,
+            upsample_fn=upsample_fn,
         )
 
         self.latent_magnitude = latent_magnitude
@@ -155,11 +157,11 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
         if isinstance(module, (EncoderTiny, DecoderTiny)):
             module.gradient_checkpointing = value
 
-    def scale_latents(self, x: torch.FloatTensor) -> torch.FloatTensor:
+    def scale_latents(self, x: torch.Tensor) -> torch.Tensor:
         """raw latents -> [0, 1]"""
         return x.div(2 * self.latent_magnitude).add(self.latent_shift).clamp(0, 1)
 
-    def unscale_latents(self, x: torch.FloatTensor) -> torch.FloatTensor:
+    def unscale_latents(self, x: torch.Tensor) -> torch.Tensor:
         """[0, 1] -> raw latents"""
         return x.sub(self.latent_shift).mul(2 * self.latent_magnitude)
 
@@ -192,7 +194,7 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
         """
         self.enable_tiling(False)
 
-    def _tiled_encode(self, x: torch.FloatTensor) -> torch.FloatTensor:
+    def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
         r"""Encode a batch of images using a tiled encoder.
 
         When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
@@ -200,10 +202,10 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
         tiles overlap and are blended together to form a smooth output.
 
         Args:
-            x (`torch.FloatTensor`): Input batch of images.
+            x (`torch.Tensor`): Input batch of images.
 
         Returns:
-            `torch.FloatTensor`: Encoded batch of images.
+            `torch.Tensor`: Encoded batch of images.
         """
         # scale of encoder output relative to input
         sf = self.spatial_scale_factor
@@ -240,7 +242,7 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
             tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out)
         return out
 
-    def _tiled_decode(self, x: torch.FloatTensor) -> torch.FloatTensor:
+    def _tiled_decode(self, x: torch.Tensor) -> torch.Tensor:
         r"""Encode a batch of images using a tiled encoder.
 
         When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
@@ -248,10 +250,10 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
         tiles overlap and are blended together to form a smooth output.
 
         Args:
-            x (`torch.FloatTensor`): Input batch of images.
+            x (`torch.Tensor`): Input batch of images.
 
         Returns:
-            `torch.FloatTensor`: Encoded batch of images.
+            `torch.Tensor`: Encoded batch of images.
         """
         # scale of decoder output relative to input
         sf = self.spatial_scale_factor
@@ -288,9 +290,7 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
         return out
 
     @apply_forward_hook
-    def encode(
-        self, x: torch.FloatTensor, return_dict: bool = True
-    ) -> Union[AutoencoderTinyOutput, Tuple[torch.FloatTensor]]:
+    def encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[AutoencoderTinyOutput, Tuple[torch.Tensor]]:
         if self.use_slicing and x.shape[0] > 1:
             output = [
                 self._tiled_encode(x_slice) if self.use_tiling else self.encoder(x_slice) for x_slice in x.split(1)
@@ -306,8 +306,8 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
 
     @apply_forward_hook
     def decode(
-        self, x: torch.FloatTensor, generator: Optional[torch.Generator] = None, return_dict: bool = True
-    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+        self, x: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True
+    ) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
         if self.use_slicing and x.shape[0] > 1:
             output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.split(1)]
             output = torch.cat(output)
@@ -321,12 +321,12 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
 
     def forward(
         self,
-        sample: torch.FloatTensor,
+        sample: torch.Tensor,
         return_dict: bool = True,
-    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+    ) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
         r"""
         Args:
-            sample (`torch.FloatTensor`): Input sample.
+            sample (`torch.Tensor`): Input sample.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
         """
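
`AutoencoderTiny` gains an `upsample_fn` config option (default `"nearest"`) that is forwarded to `DecoderTiny` and ultimately to `nn.Upsample` (see the `vae.py` hunks below). A minimal sketch of constructing the model directly with the new argument; the weights here are random, so this only illustrates the API surface:

```python
# Sketch only: random weights, default block sizes; upsample_fn is the new config argument.
import torch
from diffusers import AutoencoderTiny

vae = AutoencoderTiny(upsample_fn="nearest")
latents = vae.encode(torch.randn(1, 3, 256, 256)).latents
image = vae.decode(latents).sample
```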
diffusers/models/autoencoders/consistency_decoder_vae.py

@@ -63,7 +63,8 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
        ...     "runwayml/stable-diffusion-v1-5", vae=vae, torch_dtype=torch.float16
        ... ).to("cuda")
 
-        >>> pipe("horse", generator=torch.manual_seed(0)).images
+        >>> image = pipe("horse", generator=torch.manual_seed(0)).images[0]
+        >>> image
         ```
     """
 
@@ -72,6 +73,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
         self,
         scaling_factor: float = 0.18215,
         latent_channels: int = 4,
+        sample_size: int = 32,
         encoder_act_fn: str = "silu",
         encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
         encoder_double_z: bool = True,
@@ -153,6 +155,16 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
         self.use_slicing = False
         self.use_tiling = False
 
+        # only relevant if vae tiling is enabled
+        self.tile_sample_min_size = self.config.sample_size
+        sample_size = (
+            self.config.sample_size[0]
+            if isinstance(self.config.sample_size, (list, tuple))
+            else self.config.sample_size
+        )
+        self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
+        self.tile_overlap_factor = 0.25
+
     # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_tiling
     def enable_tiling(self, use_tiling: bool = True):
         r"""
@@ -264,15 +276,15 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
 
     @apply_forward_hook
     def encode(
-        self, x: torch.FloatTensor, return_dict: bool = True
+        self, x: torch.Tensor, return_dict: bool = True
     ) -> Union[ConsistencyDecoderVAEOutput, Tuple[DiagonalGaussianDistribution]]:
         """
         Encode a batch of images into latents.
 
         Args:
-            x (`torch.FloatTensor`): Input batch of images.
+            x (`torch.Tensor`): Input batch of images.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether to return a [`~models.
+                Whether to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain
                 tuple.
 
         Returns:
@@ -300,11 +312,24 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
     @apply_forward_hook
     def decode(
         self,
-        z: torch.FloatTensor,
+        z: torch.Tensor,
         generator: Optional[torch.Generator] = None,
         return_dict: bool = True,
         num_inference_steps: int = 2,
-    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+    ) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
+        """
+        Decodes the input latent vector `z` using the consistency decoder VAE model.
+
+        Args:
+            z (torch.Tensor): The input latent vector.
+            generator (Optional[torch.Generator]): The random number generator. Default is None.
+            return_dict (bool): Whether to return the output as a dictionary. Default is True.
+            num_inference_steps (int): The number of inference steps. Default is 2.
+
+        Returns:
+            Union[DecoderOutput, Tuple[torch.Tensor]]: The decoded output.
+
+        """
         z = (z * self.config.scaling_factor - self.means) / self.stds
 
         scale_factor = 2 ** (len(self.config.block_out_channels) - 1)
@@ -345,7 +370,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
             b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
         return b
 
-    def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> Union[ConsistencyDecoderVAEOutput, Tuple]:
+    def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[ConsistencyDecoderVAEOutput, Tuple]:
         r"""Encode a batch of images using a tiled encoder.
 
         When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
@@ -355,7 +380,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
         output, but they should be much less noticeable.
 
         Args:
-            x (`torch.FloatTensor`): Input batch of images.
+            x (`torch.Tensor`): Input batch of images.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a
                 plain tuple.
@@ -402,14 +427,14 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
 
     def forward(
         self,
-        sample: torch.FloatTensor,
+        sample: torch.Tensor,
         sample_posterior: bool = False,
         return_dict: bool = True,
         generator: Optional[torch.Generator] = None,
-    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
+    ) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
         r"""
         Args:
-            sample (`torch.FloatTensor`): Input sample.
+            sample (`torch.Tensor`): Input sample.
             sample_posterior (`bool`, *optional*, defaults to `False`):
                 Whether to sample from the posterior.
             return_dict (`bool`, *optional*, defaults to `True`):
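
`ConsistencyDecoderVAE` now registers `sample_size` in its config and initialises the `tile_sample_min_size`, `tile_latent_min_size`, and `tile_overlap_factor` attributes that the tiling helpers (copied from `AutoencoderKL`) rely on, and `decode()` gains a docstring. A minimal sketch, assuming the `openai/consistency-decoder` checkpoint id used in the class docstring:

```python
# Sketch only: follows the usage example shown in the class docstring hunk above.
import torch
from diffusers import ConsistencyDecoderVAE, StableDiffusionPipeline

vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
vae.enable_tiling()  # works now that the tile_* attributes are set in __init__

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", vae=vae, torch_dtype=torch.float16
).to("cuda")
image = pipe("horse", generator=torch.manual_seed(0)).images[0]
```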
diffusers/models/autoencoders/vae.py

@@ -36,11 +36,12 @@ class DecoderOutput(BaseOutput):
     Output of decoding method.
 
     Args:
-        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
             The decoded output sample from the last layer of the model.
     """
 
-    sample: torch.FloatTensor
+    sample: torch.Tensor
+    commit_loss: Optional[torch.FloatTensor] = None
 
 
 class Encoder(nn.Module):
@@ -90,7 +91,6 @@ class Encoder(nn.Module):
             padding=1,
         )
 
-        self.mid_block = None
         self.down_blocks = nn.ModuleList([])
 
         # down
@@ -137,7 +137,7 @@ class Encoder(nn.Module):
 
         self.gradient_checkpointing = False
 
-    def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, sample: torch.Tensor) -> torch.Tensor:
         r"""The forward method of the `Encoder` class."""
 
         sample = self.conv_in(sample)
@@ -228,7 +228,6 @@ class Decoder(nn.Module):
             padding=1,
         )
 
-        self.mid_block = None
         self.up_blocks = nn.ModuleList([])
 
         temb_channels = in_channels if norm_type == "spatial" else None
@@ -284,9 +283,9 @@ class Decoder(nn.Module):
 
     def forward(
         self,
-        sample: torch.FloatTensor,
-        latent_embeds: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
+        sample: torch.Tensor,
+        latent_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         r"""The forward method of the `Decoder` class."""
 
         sample = self.conv_in(sample)
@@ -369,7 +368,7 @@ class UpSample(nn.Module):
         self.out_channels = out_channels
         self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1)
 
-    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         r"""The forward method of the `UpSample` class."""
         x = torch.relu(x)
         x = self.deconv(x)
@@ -418,7 +417,7 @@ class MaskConditionEncoder(nn.Module):
 
         self.layers = nn.Sequential(*layers)
 
-    def forward(self, x: torch.FloatTensor, mask=None) -> torch.FloatTensor:
+    def forward(self, x: torch.Tensor, mask=None) -> torch.Tensor:
         r"""The forward method of the `MaskConditionEncoder` class."""
         out = {}
         for l in range(len(self.layers)):
@@ -474,7 +473,6 @@ class MaskConditionDecoder(nn.Module):
             padding=1,
         )
 
-        self.mid_block = None
         self.up_blocks = nn.ModuleList([])
 
         temb_channels = in_channels if norm_type == "spatial" else None
@@ -536,11 +534,11 @@ class MaskConditionDecoder(nn.Module):
 
     def forward(
         self,
-        z: torch.FloatTensor,
-        image: Optional[torch.FloatTensor] = None,
-        mask: Optional[torch.FloatTensor] = None,
-        latent_embeds: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
+        z: torch.Tensor,
+        image: Optional[torch.Tensor] = None,
+        mask: Optional[torch.Tensor] = None,
+        latent_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         r"""The forward method of the `MaskConditionDecoder` class."""
         sample = z
         sample = self.conv_in(sample)
@@ -714,7 +712,7 @@ class VectorQuantizer(nn.Module):
         back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
         return back.reshape(ishape)
 
-    def forward(self, z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor, Tuple]:
+    def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, Tuple]:
         # reshape z -> (batch, height, width, channel) and flatten
         z = z.permute(0, 2, 3, 1).contiguous()
         z_flattened = z.view(-1, self.vq_embed_dim)
@@ -733,7 +731,7 @@ class VectorQuantizer(nn.Module):
         loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2)
 
         # preserve gradients
-        z_q: torch.FloatTensor = z + (z_q - z).detach()
+        z_q: torch.Tensor = z + (z_q - z).detach()
 
         # reshape back to match original input shape
         z_q = z_q.permute(0, 3, 1, 2).contiguous()
@@ -748,7 +746,7 @@ class VectorQuantizer(nn.Module):
 
         return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
 
-    def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.FloatTensor:
+    def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.Tensor:
         # shape specifying (batch, height, width, channel)
         if self.remap is not None:
             indices = indices.reshape(shape[0], -1)  # add batch axis
@@ -756,7 +754,7 @@ class VectorQuantizer(nn.Module):
             indices = indices.reshape(-1)  # flatten again
 
         # get quantized latent vectors
-        z_q: torch.FloatTensor = self.embedding(indices)
+        z_q: torch.Tensor = self.embedding(indices)
 
         if shape is not None:
             z_q = z_q.view(shape)
@@ -779,7 +777,7 @@ class DiagonalGaussianDistribution(object):
                 self.mean, device=self.parameters.device, dtype=self.parameters.dtype
             )
 
-    def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
+    def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor:
         # make sure sample is on the same device as the parameters and has same dtype
         sample = randn_tensor(
             self.mean.shape,
@@ -876,7 +874,7 @@ class EncoderTiny(nn.Module):
         self.layers = nn.Sequential(*layers)
         self.gradient_checkpointing = False
 
-    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         r"""The forward method of the `EncoderTiny` class."""
         if self.training and self.gradient_checkpointing:
 
@@ -926,6 +924,7 @@ class DecoderTiny(nn.Module):
         block_out_channels: Tuple[int, ...],
         upsampling_scaling_factor: int,
         act_fn: str,
+        upsample_fn: str,
     ):
         super().__init__()
 
@@ -942,7 +941,7 @@ class DecoderTiny(nn.Module):
             layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn))
 
             if not is_final_block:
-                layers.append(nn.Upsample(scale_factor=upsampling_scaling_factor))
+                layers.append(nn.Upsample(scale_factor=upsampling_scaling_factor, mode=upsample_fn))
 
             conv_out_channel = num_channels if not is_final_block else out_channels
             layers.append(
@@ -958,7 +957,7 @@ class DecoderTiny(nn.Module):
         self.layers = nn.Sequential(*layers)
         self.gradient_checkpointing = False
 
-    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         r"""The forward method of the `DecoderTiny` class."""
         # Clamp.
         x = torch.tanh(x / 3) * 3