diffusers 0.31.0__py3-none-any.whl → 0.32.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +66 -5
- diffusers/callbacks.py +56 -3
- diffusers/configuration_utils.py +1 -1
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +25 -17
- diffusers/loaders/__init__.py +22 -3
- diffusers/loaders/ip_adapter.py +538 -15
- diffusers/loaders/lora_base.py +124 -118
- diffusers/loaders/lora_conversion_utils.py +318 -3
- diffusers/loaders/lora_pipeline.py +1688 -368
- diffusers/loaders/peft.py +379 -0
- diffusers/loaders/single_file_model.py +71 -4
- diffusers/loaders/single_file_utils.py +519 -9
- diffusers/loaders/textual_inversion.py +3 -3
- diffusers/loaders/transformer_flux.py +181 -0
- diffusers/loaders/transformer_sd3.py +89 -0
- diffusers/loaders/unet.py +17 -4
- diffusers/models/__init__.py +47 -14
- diffusers/models/activations.py +22 -9
- diffusers/models/attention.py +13 -4
- diffusers/models/attention_flax.py +1 -1
- diffusers/models/attention_processor.py +2059 -281
- diffusers/models/autoencoders/__init__.py +5 -0
- diffusers/models/autoencoders/autoencoder_dc.py +620 -0
- diffusers/models/autoencoders/autoencoder_kl.py +2 -1
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +36 -27
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
- diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
- diffusers/models/autoencoders/vae.py +18 -5
- diffusers/models/controlnet.py +47 -802
- diffusers/models/controlnet_flux.py +29 -495
- diffusers/models/controlnet_sd3.py +25 -379
- diffusers/models/controlnet_sparsectrl.py +46 -718
- diffusers/models/controlnets/__init__.py +23 -0
- diffusers/models/controlnets/controlnet.py +872 -0
- diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
- diffusers/models/controlnets/controlnet_flux.py +536 -0
- diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
- diffusers/models/controlnets/controlnet_sd3.py +489 -0
- diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
- diffusers/models/controlnets/controlnet_union.py +832 -0
- diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
- diffusers/models/controlnets/multicontrolnet.py +183 -0
- diffusers/models/embeddings.py +838 -43
- diffusers/models/model_loading_utils.py +88 -6
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +72 -26
- diffusers/models/normalization.py +78 -13
- diffusers/models/transformers/__init__.py +5 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +2 -2
- diffusers/models/transformers/cogvideox_transformer_3d.py +46 -11
- diffusers/models/transformers/dit_transformer_2d.py +1 -1
- diffusers/models/transformers/latte_transformer_3d.py +4 -4
- diffusers/models/transformers/pixart_transformer_2d.py +1 -1
- diffusers/models/transformers/sana_transformer.py +488 -0
- diffusers/models/transformers/stable_audio_transformer.py +1 -1
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +422 -0
- diffusers/models/transformers/transformer_cogview3plus.py +1 -1
- diffusers/models/transformers/transformer_flux.py +30 -9
- diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
- diffusers/models/transformers/transformer_ltx.py +469 -0
- diffusers/models/transformers/transformer_mochi.py +499 -0
- diffusers/models/transformers/transformer_sd3.py +105 -17
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +8 -1
- diffusers/models/unets/unet_2d_blocks.py +88 -21
- diffusers/models/unets/unet_2d_condition.py +1 -1
- diffusers/models/unets/unet_3d_blocks.py +9 -7
- diffusers/models/unets/unet_motion_model.py +5 -5
- diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
- diffusers/models/unets/unet_stable_cascade.py +2 -2
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +8 -0
- diffusers/pipelines/__init__.py +34 -0
- diffusers/pipelines/allegro/__init__.py +48 -0
- diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
- diffusers/pipelines/allegro/pipeline_output.py +23 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +8 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1 -1
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +0 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +8 -8
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -8
- diffusers/pipelines/auto_pipeline.py +53 -6
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +50 -22
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +51 -20
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +69 -21
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +47 -21
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +1 -1
- diffusers/pipelines/controlnet/__init__.py +86 -80
- diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
- diffusers/pipelines/controlnet/pipeline_controlnet.py +11 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +3 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +5 -1
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +53 -19
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -8
- diffusers/pipelines/flux/__init__.py +13 -1
- diffusers/pipelines/flux/modeling_flux.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +204 -29
- diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +49 -27
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +40 -30
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +78 -56
- diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
- diffusers/pipelines/flux/pipeline_flux_img2img.py +33 -27
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +36 -29
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
- diffusers/pipelines/flux/pipeline_output.py +16 -0
- diffusers/pipelines/hunyuan_video/__init__.py +48 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
- diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +5 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
- diffusers/pipelines/kolors/text_encoder.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/ltx/__init__.py +50 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
- diffusers/pipelines/ltx/pipeline_output.py +20 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +1 -8
- diffusers/pipelines/mochi/__init__.py +48 -0
- diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
- diffusers/pipelines/mochi/pipeline_output.py +20 -0
- diffusers/pipelines/pag/__init__.py +7 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +5 -1
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +6 -13
- diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +3 -0
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
- diffusers/pipelines/pipeline_flax_utils.py +1 -1
- diffusers/pipelines/pipeline_loading_utils.py +25 -4
- diffusers/pipelines/pipeline_utils.py +35 -6
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +6 -13
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +6 -13
- diffusers/pipelines/sana/__init__.py +47 -0
- diffusers/pipelines/sana/pipeline_output.py +21 -0
- diffusers/pipelines/sana/pipeline_sana.py +884 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +216 -20
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +62 -9
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +57 -8
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +0 -8
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +0 -8
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +0 -8
- diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/quantizers/auto.py +14 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -1
- diffusers/quantizers/gguf/__init__.py +1 -0
- diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
- diffusers/quantizers/gguf/utils.py +456 -0
- diffusers/quantizers/quantization_config.py +280 -2
- diffusers/quantizers/torchao/__init__.py +15 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +292 -0
- diffusers/schedulers/scheduling_ddpm.py +2 -6
- diffusers/schedulers/scheduling_ddpm_parallel.py +2 -6
- diffusers/schedulers/scheduling_deis_multistep.py +28 -9
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +35 -9
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +35 -8
- diffusers/schedulers/scheduling_dpmsolver_sde.py +4 -4
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +48 -10
- diffusers/schedulers/scheduling_euler_discrete.py +4 -4
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
- diffusers/schedulers/scheduling_heun_discrete.py +4 -4
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +4 -4
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +4 -4
- diffusers/schedulers/scheduling_lcm.py +2 -6
- diffusers/schedulers/scheduling_lms_discrete.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +1 -1
- diffusers/schedulers/scheduling_sasolver.py +28 -9
- diffusers/schedulers/scheduling_tcd.py +2 -6
- diffusers/schedulers/scheduling_unipc_multistep.py +53 -8
- diffusers/training_utils.py +16 -2
- diffusers/utils/__init__.py +5 -0
- diffusers/utils/constants.py +1 -0
- diffusers/utils/dummy_pt_objects.py +180 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dynamic_modules_utils.py +3 -3
- diffusers/utils/hub_utils.py +31 -39
- diffusers/utils/import_utils.py +67 -0
- diffusers/utils/peft_utils.py +3 -0
- diffusers/utils/testing_utils.py +56 -1
- diffusers/utils/torch_utils.py +3 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/METADATA +6 -6
- {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/RECORD +214 -162
- {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/WHEEL +1 -1
- {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/LICENSE +0 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/top_level.txt +0 -0
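The largest structural change in this release is the move of the ControlNet model classes into a dedicated `diffusers/models/controlnets/` package (see the renames above). The old module files are kept rather than deleted, and their shrunken line counts (e.g. `controlnet_flux.py` at +29 -495) suggest they remain as thin deprecation shims. A hedged sketch of what this means for imports, assuming the shims re-export the classes:

```python
# New canonical location introduced in 0.32:
from diffusers.models.controlnets.controlnet_flux import (
    FluxControlNetModel,
    FluxMultiControlNetModel,
)

# Old 0.31 location — appears to stay importable as a deprecation shim, given
# diffusers/models/controlnet_flux.py shrinks to a re-export instead of being removed:
from diffusers.models.controlnet_flux import FluxControlNetModel
```

The hunks below, from the three Flux ControlNet pipelines, show this import migration together with the latent-packing and scheduler-argument changes.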
diffusers/pipelines/flux/pipeline_flux_controlnet.py

@@ -27,7 +27,7 @@ from transformers import (
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
 from ...models.autoencoders import AutoencoderKL
-from ...models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
+from ...models.controlnets.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
 from ...models.transformers import FluxTransformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import (
@@ -97,6 +97,20 @@ def calculate_shift(
     return mu


+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
     scheduler,
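A minimal sketch (not from the diff) of what the copied `retrieve_latents` helper does: `AutoencoderKL.encode` returns an output carrying a `latent_dist`, so the default `"sample"` mode draws from the posterior with an optional seeded generator, while `"argmax"` takes the distribution's mode:

```python
import torch
from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution

# Mean and log-variance packed along the channel dim, as AutoencoderKL produces them
params = torch.randn(1, 32, 16, 16)
dist = DiagonalGaussianDistribution(params)

gen = torch.Generator().manual_seed(0)
sampled = dist.sample(gen)   # what retrieve_latents does for sample_mode="sample"
mode = dist.mode()           # what it does for sample_mode="argmax"
assert sampled.shape == mode.shape == (1, 16, 16, 16)
```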
@@ -216,13 +230,15 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             controlnet=controlnet,
         )
         self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
-        self.default_sample_size = 64
+        self.default_sample_size = 128

     def _get_t5_prompt_embeds(
         self,
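The two scale-factor definitions land on the same divisibility requirement; a worked check, assuming the stock Flux VAE config with four `block_out_channels` entries:

```python
n_blocks = 4  # assumed: len(block_out_channels) for the standard Flux AutoencoderKL

old_scale = 2 ** n_blocks          # 16 in 0.31: the 2x2 packing was baked into the factor
new_scale = 2 ** (n_blocks - 1)    # 8 in 0.32: the VAE's actual spatial compression

# 0.32 multiplies by the 2x2 patch size explicitly wherever packing matters, so the
# image processor still requires image dimensions divisible by 16:
assert new_scale * 2 == old_scale
```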
@@ -410,8 +426,10 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
+            )

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -450,9 +468,9 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
     @staticmethod
     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
     def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
-        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
-        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
-        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+        latent_image_ids = torch.zeros(height, width, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]

         latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
@@ -476,13 +494,15 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape

-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))

-        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)

-        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+        latents = latents.reshape(batch_size, channels // (2 * 2), height, width)

         return latents
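A round-trip sketch (not part of the diff) showing that the rewritten `_unpack_latents` math inverts the packing for a 1024x1024 generation, assuming `vae_scale_factor == 8` and Flux's 16 latent channels:

```python
import torch

height = width = 1024
vae_scale_factor = 8

# New 0.32 rounding: snap to latent dims divisible by 2 (here 128x128)
lat_h = 2 * (height // (vae_scale_factor * 2))
lat_w = 2 * (width // (vae_scale_factor * 2))

latents = torch.randn(1, 16, lat_h, lat_w)

# pack: split into 2x2 patches -> (1, tokens, 64), mirroring _pack_latents
packed = (
    latents.view(1, 16, lat_h // 2, 2, lat_w // 2, 2)
    .permute(0, 2, 4, 1, 3, 5)
    .reshape(1, (lat_h // 2) * (lat_w // 2), 64)
)

# unpack: the exact view/permute/reshape sequence from the hunk above
unpacked = (
    packed.view(1, lat_h // 2, lat_w // 2, 64 // 4, 2, 2)
    .permute(0, 3, 1, 4, 2, 5)
    .reshape(1, 64 // 4, lat_h, lat_w)
)
assert torch.equal(latents, unpacked)
```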
@@ -498,13 +518,15 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         generator,
         latents=None,
     ):
-        height = 2 * (int(height) // self.vae_scale_factor)
-        width = 2 * (int(width) // self.vae_scale_factor)
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))

         shape = (batch_size, num_channels_latents, height, width)

         if latents is not None:
-            latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+            latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
             return latents.to(device=device, dtype=dtype), latent_image_ids

         if isinstance(generator, list) and len(generator) != batch_size:
@@ -516,7 +538,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)

-        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)

         return latents, latent_image_ids
@@ -580,7 +602,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 28,
-        timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         guidance_scale: float = 7.0,
         control_guidance_start: Union[float, List[float]] = 0.0,
         control_guidance_end: Union[float, List[float]] = 1.0,
@@ -616,10 +638,10 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Timesteps must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -728,6 +750,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         device = self._execution_device
         dtype = self.transformer.dtype

+        # 3. Prepare text embeddings
         lora_scale = (
             self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
         )
@@ -764,7 +787,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             controlnet_blocks_repeat = False if self.controlnet.input_hint_block is None else True
             if self.controlnet.input_hint_block is None:
                 # vae encode
-                control_image = self.vae.encode(control_image).latent_dist.sample()
+                control_image = retrieve_latents(self.vae.encode(control_image), generator=generator)
                 control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor

                 # pack
@@ -802,7 +825,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)

                 if self.controlnet.nets[0].input_hint_block is None:
                     # vae encode
-                    control_image_ = self.vae.encode(control_image_).latent_dist.sample()
+                    control_image_ = retrieve_latents(self.vae.encode(control_image_), generator=generator)
                     control_image_ = (control_image_ - self.vae.config.shift_factor) * self.vae.config.scaling_factor

                     # pack
@@ -849,7 +872,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         )

         # 5. Prepare timesteps
-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
         image_seq_len = latents.shape[1]
         mu = calculate_shift(
             image_seq_len,
@@ -862,8 +885,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             self.scheduler,
             num_inference_steps,
             device,
-            timesteps,
-            sigmas,
+            sigmas=sigmas,
             mu=mu,
         )
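A usage sketch for the new `sigmas` argument, which replaces `timesteps` in these pipelines' `__call__` signatures (the model IDs and URL below are assumptions, not taken from this diff):

```python
import torch
from diffusers import FluxControlNetModel, FluxControlNetPipeline
from diffusers.utils import load_image

controlnet = FluxControlNetModel.from_pretrained(
    "InstantX/FLUX.1-dev-Controlnet-Canny", torch_dtype=torch.bfloat16
)
pipe = FluxControlNetPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16
).to("cuda")

control_image = load_image("https://example.com/canny.png")  # placeholder URL

# Same linear schedule the pipeline builds by default, but now caller-controlled:
num_steps = 28
sigmas = [1.0 - i / num_steps for i in range(num_steps)]

image = pipe(
    prompt="a futuristic city at dusk",
    control_image=control_image,
    sigmas=sigmas,
    num_inference_steps=num_steps,
).images[0]
```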
diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py

@@ -13,7 +13,7 @@ from transformers import (
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
 from ...models.autoencoders import AutoencoderKL
-from ...models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
+from ...models.controlnets.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
 from ...models.transformers import FluxTransformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import (
@@ -228,13 +228,15 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             controlnet=controlnet,
         )
         self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
-        self.default_sample_size = 64
+        self.default_sample_size = 128

     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
     def _get_t5_prompt_embeds(
@@ -453,8 +455,10 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if height % self.vae_scale_factor * 2 != 0 or width % self.vae_scale_factor * 2 != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
+            )

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
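Note that, unlike the text-to-image and inpaint hunks, the img2img check above omits parentheses around `self.vae_scale_factor * 2`. Under Python's left-to-right evaluation of `%` and `*`, that version tests divisibility by 8 rather than 16:

```python
# % and * share precedence and associate left-to-right, so the unparenthesized
# check groups as (height % vae_scale_factor) * 2:
height, vae_scale_factor = 1000, 8

unparenthesized = height % vae_scale_factor * 2   # (1000 % 8) * 2 == 0
parenthesized = height % (vae_scale_factor * 2)   # 1000 % 16 == 8

assert unparenthesized == 0 and parenthesized != 0  # 1000 passes here, warns in the other pipelines
```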
@@ -493,9 +497,9 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
     @staticmethod
     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
     def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
-        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
-        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
-        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+        latent_image_ids = torch.zeros(height, width, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]

         latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
@@ -519,13 +523,15 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape

-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))

-        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)

-        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+        latents = latents.reshape(batch_size, channels // (2 * 2), height, width)

         return latents
@@ -549,11 +555,12 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             f" size of {batch_size}. Make sure the batch size matches the length of the generators."
         )

-        height = 2 * (int(height) // self.vae_scale_factor)
-        width = 2 * (int(width) // self.vae_scale_factor)
-
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         shape = (batch_size, num_channels_latents, height, width)
-        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)

         if latents is not None:
             return latents.to(device=device, dtype=dtype), latent_image_ids
@@ -639,7 +646,7 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         width: Optional[int] = None,
         strength: float = 0.6,
         num_inference_steps: int = 28,
-        timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         guidance_scale: float = 7.0,
         control_guidance_start: Union[float, List[float]] = 0.0,
         control_guidance_end: Union[float, List[float]] = 1.0,
@@ -678,8 +685,10 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             num_inference_steps (`int`, *optional*, defaults to 28):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
             control_mode (`int` or `List[int]`, *optional*):
@@ -794,7 +803,7 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             )
             height, width = control_image.shape[-2:]

-            control_image = self.vae.encode(control_image).latent_dist.sample()
+            control_image = retrieve_latents(self.vae.encode(control_image), generator=generator)
             control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor

             height_control_image, width_control_image = control_image.shape[2:]
@@ -825,7 +834,7 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
                 )
                 height, width = control_image_.shape[-2:]

-                control_image_ = self.vae.encode(control_image_).latent_dist.sample()
+                control_image_ = retrieve_latents(self.vae.encode(control_image_), generator=generator)
                 control_image_ = (control_image_ - self.vae.config.shift_factor) * self.vae.config.scaling_factor

                 height_control_image, width_control_image = control_image_.shape[2:]
@@ -851,8 +860,8 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             control_mode = torch.tensor(control_mode_).to(device, dtype=torch.long)
             control_mode = control_mode.reshape([-1, 1])

-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
-        image_seq_len = (int(height) // self.vae_scale_factor) * (int(width) // self.vae_scale_factor)
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
         mu = calculate_shift(
             image_seq_len,
             self.scheduler.config.base_image_seq_len,
@@ -864,14 +873,12 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             self.scheduler,
             num_inference_steps,
             device,
-            timesteps,
-            sigmas,
+            sigmas=sigmas,
             mu=mu,
         )
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)

         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
-
         latents, latent_image_ids = self.prepare_latents(
             init_image,
             latent_timestep,
@@ -903,9 +910,12 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)

                 timestep = t.expand(latents.shape[0]).to(latents.dtype)

-                guidance = (
-                    torch.tensor([guidance_scale], device=device) if self.controlnet.config.guidance_embeds else None
-                )
+                if isinstance(self.controlnet, FluxMultiControlNetModel):
+                    use_guidance = self.controlnet.nets[0].config.guidance_embeds
+                else:
+                    use_guidance = self.controlnet.config.guidance_embeds
+
+                guidance = torch.tensor([guidance_scale], device=device) if use_guidance else None
                 guidance = guidance.expand(latents.shape[0]) if guidance is not None else None

                 if isinstance(controlnet_keep[i], list):
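The `isinstance` branch is needed because `FluxMultiControlNetModel` is a thin wrapper holding its models in `.nets`, so per-model config such as `guidance_embeds` must be read off a wrapped net. A sketch of the lookup the denoising loop now performs:

```python
from diffusers.models.controlnets.controlnet_flux import (
    FluxControlNetModel,
    FluxMultiControlNetModel,
)

def controlnet_uses_guidance(controlnet) -> bool:
    # Mirrors the hunk above: the wrapper has no guidance_embeds config of its own.
    if isinstance(controlnet, FluxMultiControlNetModel):
        return controlnet.nets[0].config.guidance_embeds
    return controlnet.config.guidance_embeds
```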
diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py

@@ -14,7 +14,7 @@ from transformers import (
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
 from ...models.autoencoders import AutoencoderKL
-from ...models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
+from ...models.controlnets.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
 from ...models.transformers import FluxTransformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import (
@@ -231,11 +231,13 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         )

         self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.mask_processor = VaeImageProcessor(
-            vae_scale_factor=self.vae_scale_factor,
+            vae_scale_factor=self.vae_scale_factor * 2,
             vae_latent_channels=self.vae.config.latent_channels,
             do_normalize=False,
             do_binarize=True,
@@ -244,7 +246,7 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
-        self.default_sample_size = 64
+        self.default_sample_size = 128

     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
     def _get_t5_prompt_embeds(
@@ -467,8 +469,10 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
+            )

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
|
|
520
524
|
@staticmethod
|
521
525
|
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
|
522
526
|
def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
|
523
|
-
latent_image_ids = torch.zeros(height
|
524
|
-
latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height
|
525
|
-
latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width
|
527
|
+
latent_image_ids = torch.zeros(height, width, 3)
|
528
|
+
latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
|
529
|
+
latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
|
526
530
|
|
527
531
|
latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
|
528
532
|
|
@@ -546,13 +550,15 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
     def _unpack_latents(latents, height, width, vae_scale_factor):
        batch_size, num_patches, channels = latents.shape

-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))

-        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)

-        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+        latents = latents.reshape(batch_size, channels // (2 * 2), height, width)

         return latents
@@ -576,11 +582,12 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             f" size of {batch_size}. Make sure the batch size matches the length of the generators."
         )

-        height = 2 * (int(height) // self.vae_scale_factor)
-        width = 2 * (int(width) // self.vae_scale_factor)
-
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         shape = (batch_size, num_channels_latents, height, width)
-        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)

         image = image.to(device=device, dtype=dtype)
         image_latents = self._encode_vae_image(image=image, generator=generator)
@@ -622,8 +629,10 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         device,
         generator,
     ):
-        height = 2 * (int(height) // self.vae_scale_factor)
-        width = 2 * (int(width) // self.vae_scale_factor)
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         # resize the mask to latents shape as we concatenate the mask to the latents
         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
         # and half precision
@@ -661,7 +670,6 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)

         # aligning device to prevent device errors when concating it with the latent model input
         masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
-
         masked_image_latents = self._pack_latents(
             masked_image_latents,
             batch_size,
@@ -744,7 +752,7 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         width: Optional[int] = None,
         strength: float = 0.6,
         padding_mask_crop: Optional[int] = None,
-        timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         num_inference_steps: int = 28,
         guidance_scale: float = 7.0,
         control_guidance_start: Union[float, List[float]] = 0.0,
@@ -791,8 +799,10 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             num_inference_steps (`int`, *optional*, defaults to 28):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
             control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
@@ -930,19 +940,22 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             )
             height, width = control_image.shape[-2:]

-            # vae encode
-            control_image = self.vae.encode(control_image).latent_dist.sample()
-            control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
-
-            # pack
-            height_control_image, width_control_image = control_image.shape[2:]
-            control_image = self._pack_latents(
-                control_image,
-                batch_size * num_images_per_prompt,
-                num_channels_latents,
-                height_control_image,
-                width_control_image,
-            )
+            # xlab controlnet has a input_hint_block and instantx controlnet does not
+            controlnet_blocks_repeat = False if self.controlnet.input_hint_block is None else True
+            if self.controlnet.input_hint_block is None:
+                # vae encode
+                control_image = retrieve_latents(self.vae.encode(control_image), generator=generator)
+                control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+                # pack
+                height_control_image, width_control_image = control_image.shape[2:]
+                control_image = self._pack_latents(
+                    control_image,
+                    batch_size * num_images_per_prompt,
+                    num_channels_latents,
+                    height_control_image,
+                    width_control_image,
+                )

             # set control mode
             if control_mode is not None:
@@ -952,7 +965,9 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
         elif isinstance(self.controlnet, FluxMultiControlNetModel):
             control_images = []

-            for i, control_image_ in enumerate(control_image):
+            # xlab controlnet has a input_hint_block and instantx controlnet does not
+            controlnet_blocks_repeat = False if self.controlnet.nets[0].input_hint_block is None else True
+            for i, control_image_ in enumerate(control_image):
                 control_image_ = self.prepare_image(
                     image=control_image_,
                     width=width,
@@ -964,19 +979,20 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
                 )
                 height, width = control_image_.shape[-2:]

-                # vae encode
-                control_image_ = self.vae.encode(control_image_).latent_dist.sample()
-                control_image_ = (control_image_ - self.vae.config.shift_factor) * self.vae.config.scaling_factor
-
-                # pack
-                height_control_image, width_control_image = control_image_.shape[2:]
-                control_image_ = self._pack_latents(
-                    control_image_,
-                    batch_size * num_images_per_prompt,
-                    num_channels_latents,
-                    height_control_image,
-                    width_control_image,
-                )
+                if self.controlnet.nets[0].input_hint_block is None:
+                    # vae encode
+                    control_image_ = retrieve_latents(self.vae.encode(control_image_), generator=generator)
+                    control_image_ = (control_image_ - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+                    # pack
+                    height_control_image, width_control_image = control_image_.shape[2:]
+                    control_image_ = self._pack_latents(
+                        control_image_,
+                        batch_size * num_images_per_prompt,
+                        num_channels_latents,
+                        height_control_image,
+                        width_control_image,
+                    )

                 control_images.append(control_image_)
@@ -995,8 +1011,10 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)

         # 6. Prepare timesteps

-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
-        image_seq_len = (int(global_height) // self.vae_scale_factor) * (int(global_width) // self.vae_scale_factor)
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        image_seq_len = (int(global_height) // self.vae_scale_factor // 2) * (
+            int(global_width) // self.vae_scale_factor // 2
+        )
         mu = calculate_shift(
             image_seq_len,
             self.scheduler.config.base_image_seq_len,
@@ -1008,8 +1026,7 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
             self.scheduler,
             num_inference_steps,
             device,
-            timesteps,
-            sigmas,
+            sigmas=sigmas,
             mu=mu,
         )
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
@@ -1078,7 +1095,11 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin)
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)

                 # predict the noise residual
-                if self.controlnet.config.guidance_embeds:
+                if isinstance(self.controlnet, FluxMultiControlNetModel):
+                    use_guidance = self.controlnet.nets[0].config.guidance_embeds
+                else:
+                    use_guidance = self.controlnet.config.guidance_embeds
+                if use_guidance:
                     guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
                     guidance = guidance.expand(latents.shape[0])
                 else:
|
|
1125
1146
|
img_ids=latent_image_ids,
|
1126
1147
|
joint_attention_kwargs=self.joint_attention_kwargs,
|
1127
1148
|
return_dict=False,
|
1149
|
+
controlnet_blocks_repeat=controlnet_blocks_repeat,
|
1128
1150
|
)[0]
|
1129
1151
|
|
1130
1152
|
# compute the previous noisy sample x_t -> x_t-1
|
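Putting the inpaint changes together, a hedged end-to-end sketch (the model IDs and URLs are placeholders, not taken from this diff):

```python
import torch
from diffusers import FluxControlNetInpaintPipeline, FluxControlNetModel
from diffusers.utils import load_image

controlnet = FluxControlNetModel.from_pretrained(
    "InstantX/FLUX.1-dev-Controlnet-Canny", torch_dtype=torch.bfloat16
)
pipe = FluxControlNetInpaintPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16
).to("cuda")

image = load_image("https://example.com/source.png")
mask = load_image("https://example.com/mask.png")
control = load_image("https://example.com/canny.png")

# InstantX-style controlnets have no input_hint_block, so the control image is
# VAE-encoded (via the new retrieve_latents) and packed, and
# controlnet_blocks_repeat is passed through to the transformer as False.
result = pipe(
    prompt="a red brick wall",
    image=image,
    mask_image=mask,
    control_image=control,
    strength=0.85,
    num_inference_steps=28,
    guidance_scale=7.0,
).images[0]
```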