diffusers 0.32.2__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +186 -3
- diffusers/configuration_utils.py +40 -12
- diffusers/dependency_versions_table.py +9 -2
- diffusers/hooks/__init__.py +9 -0
- diffusers/hooks/faster_cache.py +653 -0
- diffusers/hooks/group_offloading.py +793 -0
- diffusers/hooks/hooks.py +236 -0
- diffusers/hooks/layerwise_casting.py +245 -0
- diffusers/hooks/pyramid_attention_broadcast.py +311 -0
- diffusers/loaders/__init__.py +6 -0
- diffusers/loaders/ip_adapter.py +38 -30
- diffusers/loaders/lora_base.py +121 -86
- diffusers/loaders/lora_conversion_utils.py +504 -44
- diffusers/loaders/lora_pipeline.py +1769 -181
- diffusers/loaders/peft.py +167 -57
- diffusers/loaders/single_file.py +17 -2
- diffusers/loaders/single_file_model.py +53 -5
- diffusers/loaders/single_file_utils.py +646 -72
- diffusers/loaders/textual_inversion.py +9 -9
- diffusers/loaders/transformer_flux.py +8 -9
- diffusers/loaders/transformer_sd3.py +120 -39
- diffusers/loaders/unet.py +20 -7
- diffusers/models/__init__.py +22 -0
- diffusers/models/activations.py +9 -9
- diffusers/models/attention.py +0 -1
- diffusers/models/attention_processor.py +163 -25
- diffusers/models/auto_model.py +169 -0
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
- diffusers/models/autoencoders/autoencoder_dc.py +106 -4
- diffusers/models/autoencoders/autoencoder_kl.py +0 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
- diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
- diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
- diffusers/models/autoencoders/vae.py +31 -141
- diffusers/models/autoencoders/vq_model.py +3 -0
- diffusers/models/cache_utils.py +108 -0
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -8
- diffusers/models/controlnets/controlnet_flux.py +14 -42
- diffusers/models/controlnets/controlnet_sd3.py +58 -34
- diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
- diffusers/models/controlnets/controlnet_union.py +27 -18
- diffusers/models/controlnets/controlnet_xs.py +7 -46
- diffusers/models/controlnets/multicontrolnet_union.py +196 -0
- diffusers/models/embeddings.py +18 -7
- diffusers/models/model_loading_utils.py +122 -80
- diffusers/models/modeling_flax_pytorch_utils.py +1 -1
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +617 -272
- diffusers/models/normalization.py +67 -14
- diffusers/models/resnet.py +1 -1
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
- diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
- diffusers/models/transformers/consisid_transformer_3d.py +789 -0
- diffusers/models/transformers/dit_transformer_2d.py +5 -19
- diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
- diffusers/models/transformers/latte_transformer_3d.py +20 -15
- diffusers/models/transformers/lumina_nextdit2d.py +3 -1
- diffusers/models/transformers/pixart_transformer_2d.py +4 -19
- diffusers/models/transformers/prior_transformer.py +5 -1
- diffusers/models/transformers/sana_transformer.py +144 -40
- diffusers/models/transformers/stable_audio_transformer.py +5 -20
- diffusers/models/transformers/transformer_2d.py +7 -22
- diffusers/models/transformers/transformer_allegro.py +9 -17
- diffusers/models/transformers/transformer_cogview3plus.py +6 -17
- diffusers/models/transformers/transformer_cogview4.py +462 -0
- diffusers/models/transformers/transformer_easyanimate.py +527 -0
- diffusers/models/transformers/transformer_flux.py +68 -110
- diffusers/models/transformers/transformer_hunyuan_video.py +404 -46
- diffusers/models/transformers/transformer_ltx.py +53 -35
- diffusers/models/transformers/transformer_lumina2.py +548 -0
- diffusers/models/transformers/transformer_mochi.py +6 -17
- diffusers/models/transformers/transformer_omnigen.py +469 -0
- diffusers/models/transformers/transformer_sd3.py +56 -86
- diffusers/models/transformers/transformer_temporal.py +5 -11
- diffusers/models/transformers/transformer_wan.py +469 -0
- diffusers/models/unets/unet_1d.py +3 -1
- diffusers/models/unets/unet_2d.py +21 -20
- diffusers/models/unets/unet_2d_blocks.py +19 -243
- diffusers/models/unets/unet_2d_condition.py +4 -6
- diffusers/models/unets/unet_3d_blocks.py +14 -127
- diffusers/models/unets/unet_3d_condition.py +8 -12
- diffusers/models/unets/unet_i2vgen_xl.py +5 -13
- diffusers/models/unets/unet_kandinsky3.py +0 -4
- diffusers/models/unets/unet_motion_model.py +20 -114
- diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
- diffusers/models/unets/unet_stable_cascade.py +8 -35
- diffusers/models/unets/uvit_2d.py +1 -4
- diffusers/optimization.py +2 -2
- diffusers/pipelines/__init__.py +57 -8
- diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
- diffusers/pipelines/amused/pipeline_amused.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
- diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
- diffusers/pipelines/auto_pipeline.py +35 -14
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
- diffusers/pipelines/cogview4/__init__.py +49 -0
- diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
- diffusers/pipelines/cogview4/pipeline_output.py +21 -0
- diffusers/pipelines/consisid/__init__.py +49 -0
- diffusers/pipelines/consisid/consisid_utils.py +357 -0
- diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
- diffusers/pipelines/consisid/pipeline_output.py +20 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
- diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
- diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +15 -2
- diffusers/pipelines/easyanimate/__init__.py +52 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
- diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -21
- diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
- diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
- diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hunyuan_video/__init__.py +4 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
- diffusers/pipelines/kolors/text_encoder.py +7 -34
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
- diffusers/pipelines/latte/pipeline_latte.py +36 -7
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
- diffusers/pipelines/ltx/__init__.py +2 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
- diffusers/pipelines/lumina/__init__.py +2 -2
- diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
- diffusers/pipelines/lumina2/__init__.py +48 -0
- diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
- diffusers/pipelines/marigold/__init__.py +2 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
- diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
- diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
- diffusers/pipelines/omnigen/__init__.py +50 -0
- diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
- diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
- diffusers/pipelines/onnx_utils.py +5 -3
- diffusers/pipelines/pag/pag_utils.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
- diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
- diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
- diffusers/pipelines/pia/pipeline_pia.py +13 -1
- diffusers/pipelines/pipeline_flax_utils.py +7 -7
- diffusers/pipelines/pipeline_loading_utils.py +193 -83
- diffusers/pipelines/pipeline_utils.py +221 -106
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
- diffusers/pipelines/sana/__init__.py +2 -0
- diffusers/pipelines/sana/pipeline_sana.py +183 -58
- diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
- diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
- diffusers/pipelines/shap_e/renderer.py +6 -6
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
- diffusers/pipelines/transformers_loading_utils.py +121 -0
- diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
- diffusers/pipelines/wan/__init__.py +51 -0
- diffusers/pipelines/wan/pipeline_output.py +20 -0
- diffusers/pipelines/wan/pipeline_wan.py +593 -0
- diffusers/pipelines/wan/pipeline_wan_i2v.py +722 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +725 -0
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
- diffusers/quantizers/auto.py +5 -1
- diffusers/quantizers/base.py +5 -9
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
- diffusers/quantizers/bitsandbytes/utils.py +30 -20
- diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
- diffusers/quantizers/gguf/utils.py +4 -2
- diffusers/quantizers/quantization_config.py +59 -4
- diffusers/quantizers/quanto/__init__.py +1 -0
- diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
- diffusers/quantizers/quanto/utils.py +60 -0
- diffusers/quantizers/torchao/__init__.py +1 -1
- diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
- diffusers/schedulers/__init__.py +2 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -2
- diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
- diffusers/schedulers/scheduling_ddpm.py +2 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
- diffusers/schedulers/scheduling_edm_euler.py +45 -10
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
- diffusers/schedulers/scheduling_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_lcm.py +1 -2
- diffusers/schedulers/scheduling_lms_discrete.py +1 -1
- diffusers/schedulers/scheduling_repaint.py +5 -1
- diffusers/schedulers/scheduling_scm.py +265 -0
- diffusers/schedulers/scheduling_tcd.py +1 -2
- diffusers/schedulers/scheduling_utils.py +2 -1
- diffusers/training_utils.py +14 -7
- diffusers/utils/__init__.py +9 -1
- diffusers/utils/constants.py +13 -1
- diffusers/utils/deprecation_utils.py +1 -1
- diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
- diffusers/utils/dummy_gguf_objects.py +17 -0
- diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
- diffusers/utils/dummy_pt_objects.py +233 -0
- diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dummy_torchao_objects.py +17 -0
- diffusers/utils/dynamic_modules_utils.py +1 -1
- diffusers/utils/export_utils.py +28 -3
- diffusers/utils/hub_utils.py +52 -102
- diffusers/utils/import_utils.py +121 -221
- diffusers/utils/loading_utils.py +2 -1
- diffusers/utils/logging.py +1 -2
- diffusers/utils/peft_utils.py +6 -14
- diffusers/utils/remote_utils.py +425 -0
- diffusers/utils/source_code_parsing_utils.py +52 -0
- diffusers/utils/state_dict_utils.py +15 -1
- diffusers/utils/testing_utils.py +243 -13
- diffusers/utils/torch_utils.py +10 -0
- diffusers/utils/typing_utils.py +91 -0
- diffusers/video_processor.py +1 -1
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/METADATA +76 -44
- diffusers-0.33.0.dist-info/RECORD +608 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/WHEEL +1 -1
- diffusers-0.32.2.dist-info/RECORD +0 -550
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/LICENSE +0 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/top_level.txt +0 -0
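The additions above are dominated by the new `diffusers/hooks/` package (group offloading, layerwise casting, pyramid attention broadcast, FasterCache), a batch of new pipelines (Wan, Lumina2, CogView4, ConsisID, OmniGen, EasyAnimate, SANA Sprint), `AutoModel`, and a quanto quantization backend. A minimal, hedged sketch of two of those additions, assuming the 0.33.0 APIs documented for `auto_model.py` and `hooks/layerwise_casting.py` (the checkpoint id and dtype choices are illustrative, not taken from this diff):

```python
import torch
from diffusers import AutoModel

# AutoModel (new in diffusers/models/auto_model.py) resolves the concrete
# model class from the checkpoint's config; the model id is illustrative.
transformer = AutoModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev", subfolder="transformer", torch_dtype=torch.bfloat16
)

# Layerwise casting (diffusers/hooks/layerwise_casting.py) stores weights in
# fp8 and upcasts each layer to the compute dtype on the fly.
transformer.enable_layerwise_casting(
    storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16
)
```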
--- a/diffusers/pipelines/kolors/text_encoder.py
+++ b/diffusers/pipelines/kolors/text_encoder.py
@@ -104,13 +104,6 @@ class RMSNorm(torch.nn.Module):
         return (self.weight * hidden_states).to(input_dtype)
 
 
-def _config_to_kwargs(args):
-    common_kwargs = {
-        "dtype": args.torch_dtype,
-    }
-    return common_kwargs
-
-
 class CoreAttention(torch.nn.Module):
     def __init__(self, config: ChatGLMConfig, layer_number):
         super(CoreAttention, self).__init__()
@@ -314,7 +307,6 @@ class SelfAttention(torch.nn.Module):
             self.qkv_hidden_size,
             bias=config.add_bias_linear or config.add_qkv_bias,
             device=device,
-            **_config_to_kwargs(config),
         )
 
         self.core_attention = CoreAttention(config, self.layer_number)
@@ -325,7 +317,6 @@ class SelfAttention(torch.nn.Module):
             config.hidden_size,
             bias=config.add_bias_linear,
             device=device,
-            **_config_to_kwargs(config),
         )
 
     def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
@@ -449,7 +440,6 @@ class MLP(torch.nn.Module):
             config.ffn_hidden_size * 2,
             bias=self.add_bias,
             device=device,
-            **_config_to_kwargs(config),
         )
 
         def swiglu(x):
@@ -459,9 +449,7 @@ class MLP(torch.nn.Module):
         self.activation_func = swiglu
 
         # Project back to h.
-        self.dense_4h_to_h = nn.Linear(
-            config.ffn_hidden_size, config.hidden_size, bias=self.add_bias, device=device, **_config_to_kwargs(config)
-        )
+        self.dense_4h_to_h = nn.Linear(config.ffn_hidden_size, config.hidden_size, bias=self.add_bias, device=device)
 
     def forward(self, hidden_states):
         # [s, b, 4hp]
@@ -488,18 +476,14 @@ class GLMBlock(torch.nn.Module):
 
         LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
         # Layernorm on the input data.
-        self.input_layernorm = LayerNormFunc(
-            config.hidden_size, eps=config.layernorm_epsilon, device=device, dtype=config.torch_dtype
-        )
+        self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device)
 
         # Self attention.
         self.self_attention = SelfAttention(config, layer_number, device=device)
         self.hidden_dropout = config.hidden_dropout
 
         # Layernorm on the attention output
-        self.post_attention_layernorm = LayerNormFunc(
-            config.hidden_size, eps=config.layernorm_epsilon, device=device, dtype=config.torch_dtype
-        )
+        self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device)
 
         # MLP
         self.mlp = MLP(config, device=device)
@@ -569,9 +553,7 @@ class GLMTransformer(torch.nn.Module):
         if self.post_layer_norm:
             LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
             # Final layer norm before output.
-            self.final_layernorm = LayerNormFunc(
-                config.hidden_size, eps=config.layernorm_epsilon, device=device, dtype=config.torch_dtype
-            )
+            self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device)
 
         self.gradient_checkpointing = False
 
@@ -605,7 +587,7 @@ class GLMTransformer(torch.nn.Module):
 
             layer = self._get_layer(index)
             if torch.is_grad_enabled() and self.gradient_checkpointing:
-                layer_ret = torch.utils.checkpoint.checkpoint(
+                layer_ret = self._gradient_checkpointing_func(
                     layer, hidden_states, attention_mask, rotary_pos_emb, kv_caches[index], use_cache
                 )
             else:
@@ -666,10 +648,6 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
         position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
         return position_ids
 
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, GLMTransformer):
-            module.gradient_checkpointing = value
-
 
 def default_init(cls, *args, **kwargs):
     return cls(*args, **kwargs)
@@ -683,9 +661,7 @@ class Embedding(torch.nn.Module):
 
         self.hidden_size = config.hidden_size
         # Word embeddings (parallel).
-        self.word_embeddings = nn.Embedding(
-            config.padded_vocab_size, self.hidden_size, dtype=config.torch_dtype, device=device
-        )
+        self.word_embeddings = nn.Embedding(config.padded_vocab_size, self.hidden_size, device=device)
         self.fp32_residual_connection = config.fp32_residual_connection
 
     def forward(self, input_ids):
@@ -788,16 +764,13 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
         )
 
-        self.rotary_pos_emb = RotaryEmbedding(
-            rotary_dim // 2, original_impl=config.original_rope, device=device, dtype=config.torch_dtype
-        )
+        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device)
         self.encoder = init_method(GLMTransformer, config, **init_kwargs)
         self.output_layer = init_method(
             nn.Linear,
             config.hidden_size,
             config.padded_vocab_size,
             bias=False,
-            dtype=config.torch_dtype,
             **init_kwargs,
         )
         self.pre_seq_len = config.pre_seq_len
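A pattern note on the hunks above: every change in `kolors/text_encoder.py` removes a construction-time dtype (`**_config_to_kwargs(config)` or an explicit `dtype=config.torch_dtype`). Submodules are now built in the default dtype and the assembled model is cast once afterwards (for example by `from_pretrained(..., torch_dtype=...)`), which keeps dtype handling in one place. A minimal sketch of the resulting pattern; `MiniGLMBlock` is a made-up stand-in, not a class from this file:

```python
import torch
from torch import nn


class MiniGLMBlock(nn.Module):
    # Made-up stand-in for the GLMBlock pattern above.
    def __init__(self, hidden_size: int, device=None):
        super().__init__()
        # No per-layer dtype arguments anymore: weights start in the default dtype.
        self.input_layernorm = nn.LayerNorm(hidden_size, eps=1e-5, device=device)
        self.dense = nn.Linear(hidden_size, hidden_size, device=device)


block = MiniGLMBlock(64)
# One cast of the assembled model replaces all the dtype=config.torch_dtype kwargs.
block.to(torch.float16)
assert block.dense.weight.dtype == torch.float16
```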
--- a/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
+++ b/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
@@ -30,6 +30,7 @@ from ...schedulers import LCMScheduler
 from ...utils import (
     USE_PEFT_BACKEND,
     deprecate,
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -40,6 +41,13 @@ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
 
 
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
@@ -226,7 +234,7 @@ class LatentConsistencyModelImg2ImgPipeline(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
@@ -952,6 +960,9 @@ class LatentConsistencyModelImg2ImgPipeline(
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)
 
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         denoised = denoised.to(prompt_embeds.dtype)
         if not output_type == "latent":
             image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
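The two additions in this file recur across most pipeline files in this release: a module-level guard for `torch_xla`, plus an `xm.mark_step()` at the end of each denoising iteration so XLA materializes one graph per step instead of lazily tracing the entire loop. Reduced to a standalone sketch, with a try/except standing in for diffusers' `is_torch_xla_available()` and a placeholder for the real denoising work:

```python
try:
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
except ImportError:
    XLA_AVAILABLE = False


def fake_denoise_step(latents: float, t: int) -> float:
    return latents * 0.99  # placeholder for noise prediction + scheduler.step


def run_denoising_loop(latents: float, timesteps: range) -> float:
    for t in timesteps:
        latents = fake_denoise_step(latents, t)
        if XLA_AVAILABLE:
            # Flush the lazily traced XLA graph once per step so TPU execution
            # proceeds step by step instead of compiling one enormous graph.
            xm.mark_step()
    return latents


print(run_denoising_loop(1.0, range(50)))
```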
--- a/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
+++ b/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
@@ -29,6 +29,7 @@ from ...schedulers import LCMScheduler
 from ...utils import (
     USE_PEFT_BACKEND,
     deprecate,
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -39,8 +40,16 @@ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
 
 
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
+
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
@@ -209,7 +218,7 @@ class LatentConsistencyModelPipeline(
             feature_extractor=feature_extractor,
             image_encoder=image_encoder,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
@@ -881,6 +890,9 @@ class LatentConsistencyModelPipeline(
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)
 
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         denoised = denoised.to(prompt_embeds.dtype)
         if not output_type == "latent":
             image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
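The other recurring change, the `if getattr(self, "vae", None) else 8` guard, lets pipelines be constructed with `vae=None`. The fallback of 8 is exactly what the usual 4-entry `block_out_channels` VAE config produces, as this small re-derivation with stand-in classes shows:

```python
class FakeVaeConfig:
    # Typical Stable-Diffusion-style VAE config (illustrative values).
    block_out_channels = [128, 256, 512, 512]


class FakeVae:
    config = FakeVaeConfig()


def vae_scale_factor(vae) -> int:
    # Mirrors the guarded expression from the hunks: pipelines may now be
    # constructed with vae=None, so fall back to the common factor of 8.
    return 2 ** (len(vae.config.block_out_channels) - 1) if vae is not None else 8


assert vae_scale_factor(FakeVae()) == 8  # 2 ** (4 - 1)
assert vae_scale_factor(None) == 8       # the new fallback path
```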
--- a/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
+++ b/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -25,10 +25,19 @@ from transformers.utils import logging
 
 from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import is_torch_xla_available
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
 class LDMTextToImagePipeline(DiffusionPipeline):
     r"""
     Pipeline for text-to-image generation using latent diffusion.
@@ -202,6 +211,9 @@ class LDMTextToImagePipeline(DiffusionPipeline):
             # compute the previous noisy sample x_t -> x_t-1
             latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample
 
+            if XLA_AVAILABLE:
+                xm.mark_step()
+
         # scale and decode the image latents with vae
         latents = 1 / self.vqvae.config.scaling_factor * latents
         image = self.vqvae.decode(latents).sample
@@ -532,10 +544,6 @@ class LDMBertPreTrainedModel(PreTrainedModel):
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
 
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (LDMBertEncoder,)):
-            module.gradient_checkpointing = value
-
     @property
     def dummy_inputs(self):
         pad_token = self.config.pad_token_id
@@ -676,15 +684,8 @@ class LDMBertEncoder(LDMBertPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             if torch.is_grad_enabled() and self.gradient_checkpointing:
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        return module(*inputs, output_attentions)
-
-                    return custom_forward
-
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(encoder_layer),
+                layer_outputs = self._gradient_checkpointing_func(
+                    encoder_layer,
                     hidden_states,
                     attention_mask,
                     (head_mask[idx] if head_mask is not None else None),
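Both LDMBert hunks belong to 0.33.0's gradient-checkpointing refactor: the per-class `_set_gradient_checkpointing` override and the call-site `create_custom_forward` closures are replaced by a single `self._gradient_checkpointing_func` installed by the model base class. A sketch of the old and new call styles, assuming the installed function reduces to `torch.utils.checkpoint.checkpoint`:

```python
import functools

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

layer = nn.Linear(8, 8)
x = torch.randn(2, 8, requires_grad=True)


# Old style (0.32.x): wrap the module in a throwaway closure at every call site.
def create_custom_forward(module):
    def custom_forward(*inputs):
        return module(*inputs)

    return custom_forward


out_old = checkpoint(create_custom_forward(layer), x, use_reentrant=False)

# New style (0.33.0): one preconfigured function shared by all call sites.
_gradient_checkpointing_func = functools.partial(checkpoint, use_reentrant=False)
out_new = _gradient_checkpointing_func(layer, x)

assert torch.allclose(out_old, out_new)
```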
--- a/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
+++ b/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
@@ -15,11 +15,19 @@ from ...schedulers import (
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from ...utils import PIL_INTERPOLATION
+from ...utils import PIL_INTERPOLATION, is_torch_xla_available
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
 def preprocess(image):
     w, h = image.size
     w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
@@ -174,6 +182,9 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
             # compute the previous noisy sample x_t -> x_t-1
             latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample
 
+            if XLA_AVAILABLE:
+                xm.mark_step()
+
         # decode the image latents with the VQVAE
         image = self.vqvae.decode(latents).sample
         image = torch.clamp(image, -1.0, 1.0)
--- a/diffusers/pipelines/latte/pipeline_latte.py
+++ b/diffusers/pipelines/latte/pipeline_latte.py
@@ -30,8 +30,10 @@ from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     BACKENDS_MAPPING,
     BaseOutput,
+    deprecate,
     is_bs4_available,
     is_ftfy_available,
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
 )
@@ -39,8 +41,16 @@ from ...utils.torch_utils import is_compiled_module, randn_tensor
 from ...video_processor import VideoProcessor
 
 
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
+
 if is_bs4_available():
     from bs4 import BeautifulSoup
 
@@ -180,7 +190,7 @@ class LattePipeline(DiffusionPipeline):
             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
         )
 
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor)
 
     # Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py
@@ -592,6 +602,10 @@ class LattePipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def current_timestep(self):
+        return self._current_timestep
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -623,7 +637,7 @@ class LattePipeline(DiffusionPipeline):
         clean_caption: bool = True,
         mask_feature: bool = True,
         enable_temporal_attentions: bool = True,
-        decode_chunk_size:
+        decode_chunk_size: int = 14,
     ) -> Union[LattePipelineOutput, Tuple]:
         """
         Function invoked when calling the pipeline for generation.
@@ -719,6 +733,7 @@ class LattePipeline(DiffusionPipeline):
             negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._current_timestep = None
         self._interrupt = False
 
         # 2. Default height and width to transformer
@@ -780,6 +795,7 @@ class LattePipeline(DiffusionPipeline):
                 if self.interrupt:
                     continue
 
+                self._current_timestep = t
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
@@ -788,10 +804,11 @@ class LattePipeline(DiffusionPipeline):
                     # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
                     # This would be a good case for the `match` statement (Python 3.10+)
                     is_mps = latent_model_input.device.type == "mps"
+                    is_npu = latent_model_input.device.type == "npu"
                     if isinstance(current_timestep, float):
-                        dtype = torch.float32 if is_mps else torch.float64
+                        dtype = torch.float32 if (is_mps or is_npu) else torch.float64
                     else:
-                        dtype = torch.int32 if is_mps else torch.int64
+                        dtype = torch.int32 if (is_mps or is_npu) else torch.int64
                     current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
                 elif len(current_timestep.shape) == 0:
                     current_timestep = current_timestep[None].to(latent_model_input.device)
@@ -800,7 +817,7 @@ class LattePipeline(DiffusionPipeline):
 
                 # predict noise model_output
                 noise_pred = self.transformer(
-                    latent_model_input,
+                    hidden_states=latent_model_input,
                     encoder_hidden_states=prompt_embeds,
                     timestep=current_timestep,
                     enable_temporal_attentions=enable_temporal_attentions,
@@ -836,8 +853,20 @@ class LattePipeline(DiffusionPipeline):
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
 
-        if not output_type == "latents":
-            video = self.decode_latents(latents, video_length, decode_chunk_size=14)
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        self._current_timestep = None
+
+        if output_type == "latents":
+            deprecation_message = (
+                "Passing `output_type='latents'` is deprecated. Please pass `output_type='latent'` instead."
+            )
+            deprecate("output_type_latents", "1.0.0", deprecation_message, standard_warn=False)
+            output_type = "latent"
+
+        if not output_type == "latent":
+            video = self.decode_latents(latents, video_length, decode_chunk_size=decode_chunk_size)
             video = self.video_processor.postprocess_video(video=video, output_type=output_type)
         else:
             video = latents
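The last Latte hunk keeps accepting the old `output_type='latents'` spelling but warns and rewrites it to `'latent'`. The same normalization as a standalone function, with `warnings.warn` standing in for diffusers' internal `deprecate()` helper:

```python
import warnings


def normalize_output_type(output_type: str) -> str:
    if output_type == "latents":
        warnings.warn(
            "Passing `output_type='latents'` is deprecated. Please pass `output_type='latent'` instead.",
            FutureWarning,
        )
        return "latent"
    return output_type


assert normalize_output_type("latents") == "latent"
assert normalize_output_type("pil") == "pil"
```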
--- a/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py
+++ b/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py
@@ -19,6 +19,7 @@ from ...schedulers import DDIMScheduler, DPMSolverMultistepScheduler
 from ...utils import (
     USE_PEFT_BACKEND,
     deprecate,
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -29,26 +30,32 @@ from ..pipeline_utils import DiffusionPipeline
 from .pipeline_output import LEditsPPDiffusionPipelineOutput, LEditsPPInversionPipelineOutput
 
 
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
+
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
-        >>> import PIL
-        >>> import requests
         >>> import torch
-        >>> from io import BytesIO
 
         >>> from diffusers import LEditsPPPipelineStableDiffusion
         >>> from diffusers.utils import load_image
 
         >>> pipe = LEditsPPPipelineStableDiffusion.from_pretrained(
-        ...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+        ...     "runwayml/stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16
         ... )
+        >>> pipe.enable_vae_tiling()
        >>> pipe = pipe.to("cuda")
 
         >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png"
-        >>> image = load_image(img_url).
+        >>> image = load_image(img_url).resize((512, 512))
 
         >>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.1)
 
@@ -152,7 +159,7 @@ class LeditsGaussianSmoothing:
 
         # The gaussian kernel is the product of the gaussian function of each dimension.
         kernel = 1
-        meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size])
+        meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size], indexing="ij")
         for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
             mean = (size - 1) / 2
             kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
@@ -318,7 +325,7 @@ class LEditsPPPipelineStableDiffusion(
                 "The scheduler has been changed to DPMSolverMultistepScheduler."
             )
 
-        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+        if scheduler is not None and getattr(scheduler.config, "steps_offset", 1) != 1:
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
                 f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
@@ -332,7 +339,7 @@ class LEditsPPPipelineStableDiffusion(
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+        if scheduler is not None and getattr(scheduler.config, "clip_sample", False) is True:
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
                 " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -361,10 +368,14 @@ class LEditsPPPipelineStableDiffusion(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
-        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
-            version.parse(unet.config._diffusers_version).base_version
-        ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        is_unet_version_less_0_9_0 = (
+            unet is not None
+            and hasattr(unet.config, "_diffusers_version")
+            and version.parse(version.parse(unet.config._diffusers_version).base_version) < version.parse("0.9.0.dev0")
+        )
+        is_unet_sample_size_less_64 = (
+            unet is not None and hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        )
         if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
             deprecation_message = (
                 "The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -391,7 +402,7 @@ class LEditsPPPipelineStableDiffusion(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
@@ -706,6 +717,35 @@ class LEditsPPPipelineStableDiffusion(
     def cross_attention_kwargs(self):
         return self._cross_attention_kwargs
 
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
@@ -1182,6 +1222,9 @@ class LEditsPPPipelineStableDiffusion(
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
 
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         # 8. Post-processing
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
@@ -1271,6 +1314,8 @@ class LEditsPPPipelineStableDiffusion(
             [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
             and respective VAE reconstruction(s).
         """
+        if height is not None and height % 32 != 0 or width is not None and width % 32 != 0:
+            raise ValueError("height and width must be a factor of 32.")
         # Reset attn processor, we do not want to store attn maps during inversion
         self.unet.set_attn_processor(AttnProcessor())
 
@@ -1349,6 +1394,9 @@ class LEditsPPPipelineStableDiffusion(
 
                 progress_bar.update()
 
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         self.init_latents = xts[-1].expand(self.batch_size, -1, -1, -1)
         zs = zs.flip(0)
         self.zs = zs
@@ -1360,6 +1408,12 @@ class LEditsPPPipelineStableDiffusion(
         image = self.image_processor.preprocess(
             image=image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
         )
+        height, width = image.shape[-2:]
+        if height % 32 != 0 or width % 32 != 0:
+            raise ValueError(
+                "Image height and width must be a factor of 32. "
+                "Consider down-sampling the input using the `height` and `width` parameters"
+            )
         resized = self.image_processor.postprocess(image=image, output_type="pil")
 
         if max(image.shape[-2:]) > self.vae.config["sample_size"] * 1.5:
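Finally, the two inversion hunks make `invert()` reject images whose preprocessed height or width is not a multiple of 32. A sketch of the check, plus an illustrative rounding helper that is not part of the diff:

```python
def validate_inversion_size(height: int, width: int) -> None:
    # Mirrors the added check: LEDITS++ inversion now requires multiples of 32.
    if height % 32 != 0 or width % 32 != 0:
        raise ValueError(
            "Image height and width must be a factor of 32. "
            "Consider down-sampling the input using the `height` and `width` parameters"
        )


def round_down_to_multiple_of_32(x: int) -> int:
    # Illustrative helper for choosing a valid target size.
    return x - x % 32


validate_inversion_size(512, 512)  # passes silently
assert round_down_to_multiple_of_32(517) == 512
```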
|