diffusers 0.34.0__py3-none-any.whl → 0.35.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +98 -1
- diffusers/callbacks.py +35 -0
- diffusers/commands/custom_blocks.py +134 -0
- diffusers/commands/diffusers_cli.py +2 -0
- diffusers/commands/fp16_safetensors.py +1 -1
- diffusers/configuration_utils.py +11 -2
- diffusers/dependency_versions_table.py +3 -3
- diffusers/guiders/__init__.py +41 -0
- diffusers/guiders/adaptive_projected_guidance.py +188 -0
- diffusers/guiders/auto_guidance.py +190 -0
- diffusers/guiders/classifier_free_guidance.py +141 -0
- diffusers/guiders/classifier_free_zero_star_guidance.py +152 -0
- diffusers/guiders/frequency_decoupled_guidance.py +327 -0
- diffusers/guiders/guider_utils.py +309 -0
- diffusers/guiders/perturbed_attention_guidance.py +271 -0
- diffusers/guiders/skip_layer_guidance.py +262 -0
- diffusers/guiders/smoothed_energy_guidance.py +251 -0
- diffusers/guiders/tangential_classifier_free_guidance.py +143 -0
- diffusers/hooks/__init__.py +17 -0
- diffusers/hooks/_common.py +56 -0
- diffusers/hooks/_helpers.py +293 -0
- diffusers/hooks/faster_cache.py +7 -6
- diffusers/hooks/first_block_cache.py +259 -0
- diffusers/hooks/group_offloading.py +292 -286
- diffusers/hooks/hooks.py +56 -1
- diffusers/hooks/layer_skip.py +263 -0
- diffusers/hooks/layerwise_casting.py +2 -7
- diffusers/hooks/pyramid_attention_broadcast.py +14 -11
- diffusers/hooks/smoothed_energy_guidance_utils.py +167 -0
- diffusers/hooks/utils.py +43 -0
- diffusers/loaders/__init__.py +6 -0
- diffusers/loaders/ip_adapter.py +255 -4
- diffusers/loaders/lora_base.py +63 -30
- diffusers/loaders/lora_conversion_utils.py +434 -53
- diffusers/loaders/lora_pipeline.py +834 -37
- diffusers/loaders/peft.py +28 -5
- diffusers/loaders/single_file_model.py +44 -11
- diffusers/loaders/single_file_utils.py +170 -2
- diffusers/loaders/transformer_flux.py +9 -10
- diffusers/loaders/transformer_sd3.py +6 -1
- diffusers/loaders/unet.py +22 -5
- diffusers/loaders/unet_loader_utils.py +5 -2
- diffusers/models/__init__.py +8 -0
- diffusers/models/attention.py +484 -3
- diffusers/models/attention_dispatch.py +1218 -0
- diffusers/models/attention_processor.py +105 -663
- diffusers/models/auto_model.py +2 -2
- diffusers/models/autoencoders/__init__.py +1 -0
- diffusers/models/autoencoders/autoencoder_dc.py +14 -1
- diffusers/models/autoencoders/autoencoder_kl.py +1 -1
- diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -1
- diffusers/models/autoencoders/autoencoder_kl_qwenimage.py +1070 -0
- diffusers/models/autoencoders/autoencoder_kl_wan.py +370 -40
- diffusers/models/cache_utils.py +31 -9
- diffusers/models/controlnets/controlnet_flux.py +5 -5
- diffusers/models/controlnets/controlnet_union.py +4 -4
- diffusers/models/embeddings.py +26 -34
- diffusers/models/model_loading_utils.py +233 -1
- diffusers/models/modeling_flax_utils.py +1 -2
- diffusers/models/modeling_utils.py +159 -94
- diffusers/models/transformers/__init__.py +2 -0
- diffusers/models/transformers/transformer_chroma.py +16 -117
- diffusers/models/transformers/transformer_cogview4.py +36 -2
- diffusers/models/transformers/transformer_cosmos.py +11 -4
- diffusers/models/transformers/transformer_flux.py +372 -132
- diffusers/models/transformers/transformer_hunyuan_video.py +6 -0
- diffusers/models/transformers/transformer_ltx.py +104 -23
- diffusers/models/transformers/transformer_qwenimage.py +645 -0
- diffusers/models/transformers/transformer_skyreels_v2.py +607 -0
- diffusers/models/transformers/transformer_wan.py +298 -85
- diffusers/models/transformers/transformer_wan_vace.py +15 -21
- diffusers/models/unets/unet_2d_condition.py +2 -1
- diffusers/modular_pipelines/__init__.py +83 -0
- diffusers/modular_pipelines/components_manager.py +1068 -0
- diffusers/modular_pipelines/flux/__init__.py +66 -0
- diffusers/modular_pipelines/flux/before_denoise.py +689 -0
- diffusers/modular_pipelines/flux/decoders.py +109 -0
- diffusers/modular_pipelines/flux/denoise.py +227 -0
- diffusers/modular_pipelines/flux/encoders.py +412 -0
- diffusers/modular_pipelines/flux/modular_blocks.py +181 -0
- diffusers/modular_pipelines/flux/modular_pipeline.py +59 -0
- diffusers/modular_pipelines/modular_pipeline.py +2446 -0
- diffusers/modular_pipelines/modular_pipeline_utils.py +672 -0
- diffusers/modular_pipelines/node_utils.py +665 -0
- diffusers/modular_pipelines/stable_diffusion_xl/__init__.py +77 -0
- diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py +1874 -0
- diffusers/modular_pipelines/stable_diffusion_xl/decoders.py +208 -0
- diffusers/modular_pipelines/stable_diffusion_xl/denoise.py +771 -0
- diffusers/modular_pipelines/stable_diffusion_xl/encoders.py +887 -0
- diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py +380 -0
- diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py +365 -0
- diffusers/modular_pipelines/wan/__init__.py +66 -0
- diffusers/modular_pipelines/wan/before_denoise.py +365 -0
- diffusers/modular_pipelines/wan/decoders.py +105 -0
- diffusers/modular_pipelines/wan/denoise.py +261 -0
- diffusers/modular_pipelines/wan/encoders.py +242 -0
- diffusers/modular_pipelines/wan/modular_blocks.py +144 -0
- diffusers/modular_pipelines/wan/modular_pipeline.py +90 -0
- diffusers/pipelines/__init__.py +31 -0
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +2 -3
- diffusers/pipelines/auto_pipeline.py +17 -13
- diffusers/pipelines/chroma/pipeline_chroma.py +5 -5
- diffusers/pipelines/chroma/pipeline_chroma_img2img.py +5 -5
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +9 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +9 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +10 -9
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +9 -8
- diffusers/pipelines/cogview4/pipeline_cogview4.py +16 -15
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +3 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +212 -93
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +7 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +194 -92
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +3 -1
- diffusers/pipelines/flux/__init__.py +4 -0
- diffusers/pipelines/flux/pipeline_flux.py +34 -26
- diffusers/pipelines/flux/pipeline_flux_control.py +8 -8
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +1 -1
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1 -1
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +1 -1
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +1 -1
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1 -1
- diffusers/pipelines/flux/pipeline_flux_fill.py +1 -1
- diffusers/pipelines/flux/pipeline_flux_img2img.py +1 -1
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +1 -1
- diffusers/pipelines/flux/pipeline_flux_kontext.py +1134 -0
- diffusers/pipelines/flux/pipeline_flux_kontext_inpaint.py +1460 -0
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +1 -1
- diffusers/pipelines/flux/pipeline_output.py +6 -4
- diffusers/pipelines/hidream_image/pipeline_hidream_image.py +5 -5
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +25 -24
- diffusers/pipelines/ltx/pipeline_ltx.py +13 -12
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +10 -9
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +13 -12
- diffusers/pipelines/mochi/pipeline_mochi.py +9 -8
- diffusers/pipelines/pipeline_flax_utils.py +2 -2
- diffusers/pipelines/pipeline_loading_utils.py +24 -2
- diffusers/pipelines/pipeline_utils.py +22 -15
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +3 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +20 -0
- diffusers/pipelines/qwenimage/__init__.py +55 -0
- diffusers/pipelines/qwenimage/pipeline_output.py +21 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage.py +726 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +849 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py +829 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py +1015 -0
- diffusers/pipelines/sana/pipeline_sana_sprint.py +5 -5
- diffusers/pipelines/skyreels_v2/__init__.py +59 -0
- diffusers/pipelines/skyreels_v2/pipeline_output.py +20 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py +610 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing.py +978 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_i2v.py +1059 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_v2v.py +1063 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py +745 -0
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -1
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +2 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +6 -5
- diffusers/pipelines/wan/pipeline_wan.py +78 -20
- diffusers/pipelines/wan/pipeline_wan_i2v.py +112 -32
- diffusers/pipelines/wan/pipeline_wan_vace.py +1 -2
- diffusers/quantizers/__init__.py +1 -177
- diffusers/quantizers/base.py +11 -0
- diffusers/quantizers/gguf/utils.py +92 -3
- diffusers/quantizers/pipe_quant_config.py +202 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +26 -0
- diffusers/schedulers/scheduling_deis_multistep.py +8 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +6 -0
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +6 -0
- diffusers/schedulers/scheduling_scm.py +0 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +10 -1
- diffusers/schedulers/scheduling_utils.py +2 -2
- diffusers/schedulers/scheduling_utils_flax.py +1 -1
- diffusers/training_utils.py +78 -0
- diffusers/utils/__init__.py +10 -0
- diffusers/utils/constants.py +4 -0
- diffusers/utils/dummy_pt_objects.py +312 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +255 -0
- diffusers/utils/dynamic_modules_utils.py +84 -25
- diffusers/utils/hub_utils.py +33 -17
- diffusers/utils/import_utils.py +70 -0
- diffusers/utils/peft_utils.py +11 -8
- diffusers/utils/testing_utils.py +136 -10
- diffusers/utils/torch_utils.py +18 -0
- {diffusers-0.34.0.dist-info → diffusers-0.35.1.dist-info}/METADATA +6 -6
- {diffusers-0.34.0.dist-info → diffusers-0.35.1.dist-info}/RECORD +191 -127
- {diffusers-0.34.0.dist-info → diffusers-0.35.1.dist-info}/LICENSE +0 -0
- {diffusers-0.34.0.dist-info → diffusers-0.35.1.dist-info}/WHEEL +0 -0
- {diffusers-0.34.0.dist-info → diffusers-0.35.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.34.0.dist-info → diffusers-0.35.1.dist-info}/top_level.txt +0 -0
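Among the larger additions in 0.35.x are the `guiders` and `modular_pipelines` packages, the Flux Kontext pipelines, and the QwenImage and SkyReels-V2 pipeline families listed above. As a quick orientation, a minimal, hedged sketch of loading one of the new pipelines follows; `QwenImagePipeline` is the class name expected to match the new `pipelines/qwenimage` module, while the checkpoint id and dtype below are illustrative assumptions, not taken from this diff.

```python
# Hedged sketch: loading one of the pipeline families added in this release.
# The Hub checkpoint id and dtype are assumptions, not part of this diff.
import torch
from diffusers import QwenImagePipeline

pipe = QwenImagePipeline.from_pretrained(
    "Qwen/Qwen-Image",           # assumed checkpoint id
    torch_dtype=torch.bfloat16,  # assumed dtype
)
pipe.to("cuda")

image = pipe(prompt="a watercolor painting of a lighthouse at dusk").images[0]
```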
diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py

@@ -19,7 +19,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
 import PIL.Image
 import torch
-import torch.nn.functional as F
 from transformers import (
     CLIPImageProcessor,
     CLIPTextModel,
@@ -38,7 +37,13 @@ from ...loaders import (
     StableDiffusionXLLoraLoaderMixin,
     TextualInversionLoaderMixin,
 )
-from ...models import
+from ...models import (
+    AutoencoderKL,
+    ControlNetUnionModel,
+    ImageProjection,
+    MultiControlNetUnionModel,
+    UNet2DConditionModel,
+)
 from ...models.attention_processor import (
     AttnProcessor2_0,
     XFormersAttnProcessor,
@@ -262,7 +267,9 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
         tokenizer: CLIPTokenizer,
         tokenizer_2: CLIPTokenizer,
         unet: UNet2DConditionModel,
-        controlnet:
+        controlnet: Union[
+            ControlNetUnionModel, List[ControlNetUnionModel], Tuple[ControlNetUnionModel], MultiControlNetUnionModel
+        ],
         scheduler: KarrasDiffusionSchedulers,
         requires_aesthetics_score: bool = False,
         force_zeros_for_empty_prompt: bool = True,
@@ -272,8 +279,8 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
     ):
         super().__init__()

+        if isinstance(controlnet, (list, tuple)):
+            controlnet = MultiControlNetUnionModel(controlnet)

         self.register_modules(
             vae=vae,
@@ -649,6 +656,7 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
         controlnet_conditioning_scale=1.0,
         control_guidance_start=0.0,
         control_guidance_end=1.0,
+        control_mode=None,
         callback_on_step_end_tensor_inputs=None,
     ):
         if strength < 0 or strength > 1:
@@ -722,28 +730,44 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                 "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
             )

+        # `prompt` needs more sophisticated handling when there are multiple
+        # conditionings.
+        if isinstance(self.controlnet, MultiControlNetUnionModel):
+            if isinstance(prompt, list):
+                logger.warning(
+                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+                    " prompts. The conditionings will be fixed across the prompts."
+                )
+
         # Check `image`
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        if isinstance(controlnet, ControlNetUnionModel):
+            for image_ in image:
+                self.check_image(image_, prompt, prompt_embeds)
+        elif isinstance(controlnet, MultiControlNetUnionModel):
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+            elif not all(isinstance(i, list) for i in image):
+                raise ValueError("For multiple controlnets: elements of `image` must be list of conditionings.")
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+
+            for images_ in image:
+                for image_ in images_:
+                    self.check_image(image_, prompt, prompt_embeds)

         if not isinstance(control_guidance_start, (tuple, list)):
             control_guidance_start = [control_guidance_start]

+        if isinstance(controlnet, MultiControlNetUnionModel):
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+                )
+
         if not isinstance(control_guidance_end, (tuple, list)):
             control_guidance_end = [control_guidance_end]

@@ -762,6 +786,15 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
             if end > 1.0:
                 raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")

+        # Check `control_mode`
+        if isinstance(controlnet, ControlNetUnionModel):
+            if max(control_mode) >= controlnet.config.num_control_type:
+                raise ValueError(f"control_mode: must be lower than {controlnet.config.num_control_type}.")
+        elif isinstance(controlnet, MultiControlNetUnionModel):
+            for _control_mode, _controlnet in zip(control_mode, self.controlnet.nets):
+                if max(_control_mode) >= _controlnet.config.num_control_type:
+                    raise ValueError(f"control_mode: must be lower than {_controlnet.config.num_control_type}.")
+
         if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
             raise ValueError(
                 "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
@@ -1049,7 +1082,7 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
         prompt: Union[str, List[str]] = None,
         prompt_2: Optional[Union[str, List[str]]] = None,
         image: PipelineImageInput = None,
-        control_image: PipelineImageInput = None,
+        control_image: Union[PipelineImageInput, List[PipelineImageInput]] = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
         strength: float = 0.8,
@@ -1074,7 +1107,7 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
         guess_mode: bool = False,
         control_guidance_start: Union[float, List[float]] = 0.0,
         control_guidance_end: Union[float, List[float]] = 1.0,
-        control_mode: Optional[Union[int, List[int]]] = None,
+        control_mode: Optional[Union[int, List[int], List[List[int]]]] = None,
         original_size: Tuple[int, int] = None,
         crops_coords_top_left: Tuple[int, int] = (0, 0),
         target_size: Tuple[int, int] = None,
@@ -1104,13 +1137,13 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                 `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
                 The initial image will be used as the starting point for the image generation process. Can also accept
                 image latents as `image`, if passing latents directly, it will not be encoded again.
-            control_image (`PipelineImageInput`):
-                The ControlNet input condition
+            control_image (`PipelineImageInput` or `List[PipelineImageInput]`, *optional*):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet.
             height (`int`, *optional*, defaults to the size of control_image):
                 The height in pixels of the generated image. Anything below 512 pixels won't work well for
                 [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
@@ -1184,16 +1217,21 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                 `self.processor` in
                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
-                The outputs of the
-                to the residual in the original unet
-                corresponding scale as a list.
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
             guess_mode (`bool`, *optional*, defaults to `False`):
                 In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
                 you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
             control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
-                The percentage of total steps at which the
+                The percentage of total steps at which the ControlNet starts applying.
             control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
-                The percentage of total steps at which the
+                The percentage of total steps at which the ControlNet stops applying.
+            control_mode (`int` or `List[int]` or `List[List[int]], *optional*):
+                The control condition types for the ControlNet. See the ControlNet's model card forinformation on the
+                available control modes. If multiple ControlNets are specified in `init`, control_mode should be a list
+                where each ControlNet should have its corresponding control mode list. Should reflect the order of
+                conditions in control_image
             original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                 If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
                 `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
@@ -1273,12 +1311,6 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(

         controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

-        # align format for control guidance
-        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
-            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
-        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
-            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
-
         if not isinstance(control_image, list):
             control_image = [control_image]
         else:
@@ -1287,37 +1319,56 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
         if not isinstance(control_mode, list):
             control_mode = [control_mode]

+        if isinstance(controlnet, MultiControlNetUnionModel):
+            control_image = [[item] for item in control_image]
+            control_mode = [[item] for item in control_mode]

+        # align format for control guidance
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetUnionModel) else len(control_mode)
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        if isinstance(controlnet_conditioning_scale, float):
+            mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetUnionModel) else len(control_mode)
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * mult

         # 1. Check inputs
-            callback_on_step_end_tensor_inputs,
-        )
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            control_image,
+            strength,
+            num_inference_steps,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            controlnet_conditioning_scale,
+            control_guidance_start,
+            control_guidance_end,
+            control_mode,
+            callback_on_step_end_tensor_inputs,
+        )

+        if isinstance(controlnet, ControlNetUnionModel):
+            control_type = torch.zeros(controlnet.config.num_control_type).scatter_(0, torch.tensor(control_mode), 1)
+        elif isinstance(controlnet, MultiControlNetUnionModel):
+            control_type = [
+                torch.zeros(controlnet_.config.num_control_type).scatter_(0, torch.tensor(control_mode_), 1)
+                for control_mode_, controlnet_ in zip(control_mode, self.controlnet.nets)
+            ]

         self._guidance_scale = guidance_scale
         self._clip_skip = clip_skip
@@ -1334,7 +1385,11 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(

         device = self._execution_device

-        global_pool_conditions =
+        global_pool_conditions = (
+            controlnet.config.global_pool_conditions
+            if isinstance(controlnet, ControlNetUnionModel)
+            else controlnet.nets[0].config.global_pool_conditions
+        )
         guess_mode = guess_mode or global_pool_conditions

         # 3.1. Encode input prompt
@@ -1372,22 +1427,55 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
             self.do_classifier_free_guidance,
         )

-        # 4. Prepare image
+        # 4.1 Prepare image
         image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)

+        # 4.2 Prepare control images
+        if isinstance(controlnet, ControlNetUnionModel):
+            control_images = []
+
+            for image_ in control_image:
+                image_ = self.prepare_control_image(
+                    image=image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+
+                control_images.append(image_)
+
+            control_image = control_images
+            height, width = control_image[0].shape[-2:]
+
+        elif isinstance(controlnet, MultiControlNetUnionModel):
+            control_images = []
+
+            for control_image_ in control_image:
+                images = []
+
+                for image_ in control_image_:
+                    image_ = self.prepare_control_image(
+                        image=image_,
+                        width=width,
+                        height=height,
+                        batch_size=batch_size * num_images_per_prompt,
+                        num_images_per_prompt=num_images_per_prompt,
+                        device=device,
+                        dtype=controlnet.dtype,
+                        do_classifier_free_guidance=self.do_classifier_free_guidance,
+                        guess_mode=guess_mode,
+                    )
+
+                    images.append(image_)
+                control_images.append(images)
+
+            control_image = control_images
+            height, width = control_image[0][0].shape[-2:]

         # 5. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -1414,10 +1502,11 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
         # 7.1 Create tensor stating which controlnets to keep
         controlnet_keep = []
         for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps)

         # 7.2 Prepare added time ids & embeddings
         original_size = original_size or (height, width)
@@ -1460,12 +1549,25 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
         prompt_embeds = prompt_embeds.to(device)
         add_text_embeds = add_text_embeds.to(device)
         add_time_ids = add_time_ids.to(device)
-            .repeat(batch_size * num_images_per_prompt * 2, 1)
+
+        control_type_repeat_factor = (
+            batch_size * num_images_per_prompt * (2 if self.do_classifier_free_guidance else 1)
         )

+        if isinstance(controlnet, ControlNetUnionModel):
+            control_type = (
+                control_type.reshape(1, -1)
+                .to(self._execution_device, dtype=prompt_embeds.dtype)
+                .repeat(control_type_repeat_factor, 1)
+            )
+        elif isinstance(controlnet, MultiControlNetUnionModel):
+            control_type = [
+                _control_type.reshape(1, -1)
+                .to(self._execution_device, dtype=prompt_embeds.dtype)
+                .repeat(control_type_repeat_factor, 1)
+                for _control_type in control_type
+            ]

         # 8. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
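The constructor and `check_inputs` changes above extend the SDXL ControlNet-Union img2img pipeline from a single `ControlNetUnionModel` to an optional list of them, which `__init__` now wraps into a `MultiControlNetUnionModel`. A minimal sketch of the new construction path follows, with placeholder checkpoint ids; the call-time conventions for `control_image` and `control_mode` are the ones described in the updated docstring above.

```python
# Sketch of the new multi-ControlNet-Union construction path added in this diff.
# Checkpoint ids are placeholders; substitute the models you actually use.
import torch
from diffusers import ControlNetUnionModel, StableDiffusionXLControlNetUnionImg2ImgPipeline

controlnets = [
    ControlNetUnionModel.from_pretrained("xinsir/controlnet-union-sdxl-1.0", torch_dtype=torch.float16),
    ControlNetUnionModel.from_pretrained("xinsir/controlnet-union-sdxl-1.0", torch_dtype=torch.float16),
]

pipe = StableDiffusionXLControlNetUnionImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnets,  # a list/tuple is wrapped into MultiControlNetUnionModel by __init__
    torch_dtype=torch.float16,
)

# With multiple ControlNets, `control_image` takes one conditioning entry per ControlNet and
# `control_mode` one control-type index per ControlNet, in the same order (see check_inputs above).
```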
diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py

@@ -717,7 +717,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Sta
         from diffusers import CycleDiffusionPipeline, DDIMScheduler

         # load the pipeline
-        # make sure you're logged in with `
+        # make sure you're logged in with `hf auth login`
         model_id_or_path = "CompVis/stable-diffusion-v1-4"
         scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler")
         pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")
diffusers/pipelines/dit/pipeline_dit.py

@@ -46,7 +46,9 @@ class DiTPipeline(DiffusionPipeline):

     Parameters:
         transformer ([`DiTTransformer2DModel`]):
-            A class conditioned `DiTTransformer2DModel` to denoise the encoded image latents.
+            A class conditioned `DiTTransformer2DModel` to denoise the encoded image latents. Initially published as
+            [`Transformer2DModel`](https://huggingface.co/facebook/DiT-XL-2-256/blob/main/transformer/config.json#L2)
+            in the config, but the mismatch can be ignored.
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
         scheduler ([`DDIMScheduler`]):
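The expanded docstring notes that the reference checkpoint's config registers the transformer as `Transformer2DModel` while the pipeline types it as `DiTTransformer2DModel`, and that the resulting mismatch warning can be ignored. A short, hedged loading sketch; the repository id comes from the docstring link, while the label and step count are illustrative.

```python
# Hedged sketch: loading DiTPipeline. from_pretrained may warn that the checkpoint's
# transformer was saved as `Transformer2DModel`; per the docstring above, that mismatch
# is expected and can be ignored.
import torch
from diffusers import DiTPipeline

pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
pipe.to("cuda")

# DiT is class-conditional on ImageNet labels; get_label_ids maps label names to class ids.
class_ids = pipe.get_label_ids(["golden retriever"])
image = pipe(class_labels=class_ids, num_inference_steps=25).images[0]
```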
diffusers/pipelines/flux/__init__.py

@@ -33,6 +33,8 @@ else:
     _import_structure["pipeline_flux_fill"] = ["FluxFillPipeline"]
     _import_structure["pipeline_flux_img2img"] = ["FluxImg2ImgPipeline"]
     _import_structure["pipeline_flux_inpaint"] = ["FluxInpaintPipeline"]
+    _import_structure["pipeline_flux_kontext"] = ["FluxKontextPipeline"]
+    _import_structure["pipeline_flux_kontext_inpaint"] = ["FluxKontextInpaintPipeline"]
     _import_structure["pipeline_flux_prior_redux"] = ["FluxPriorReduxPipeline"]
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
@@ -52,6 +54,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         from .pipeline_flux_fill import FluxFillPipeline
         from .pipeline_flux_img2img import FluxImg2ImgPipeline
         from .pipeline_flux_inpaint import FluxInpaintPipeline
+        from .pipeline_flux_kontext import FluxKontextPipeline
+        from .pipeline_flux_kontext_inpaint import FluxKontextInpaintPipeline
         from .pipeline_flux_prior_redux import FluxPriorReduxPipeline
     else:
         import sys
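With the two entries above, the Kontext pipelines participate in the package's lazy-import machinery like the other Flux pipelines, so they can be imported from `diffusers.pipelines.flux` (or from the library root, which the larger `diffusers/__init__.py` change in this release suggests). A hedged usage sketch follows; the checkpoint id, the image URL, and the editing-call signature are assumptions, not taken from this diff.

```python
# Hedged sketch: using the newly registered FluxKontextPipeline for image editing.
# Checkpoint id, image URL, and call signature are assumptions.
import torch
from diffusers.pipelines.flux import FluxKontextPipeline
from diffusers.utils import load_image

pipe = FluxKontextPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Kontext-dev",  # assumed checkpoint id
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")

init_image = load_image("https://example.com/cat.png")  # placeholder URL
edited = pipe(image=init_image, prompt="make the cat wear a tiny wizard hat").images[0]
```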
diffusers/pipelines/flux/pipeline_flux.py

@@ -310,7 +310,7 @@ class FluxPipeline(
     def encode_prompt(
         self,
         prompt: Union[str, List[str]],
-        prompt_2: Union[str, List[str]],
+        prompt_2: Optional[Union[str, List[str]]] = None,
         device: Optional[torch.device] = None,
         num_images_per_prompt: int = 1,
         prompt_embeds: Optional[torch.FloatTensor] = None,
@@ -674,7 +674,8 @@ class FluxPipeline(
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
             true_cfg_scale (`float`, *optional*, defaults to 1.0):
+                True classifier-free guidance (guidance scale) is enabled when `true_cfg_scale` > 1 and
+                `negative_prompt` is provided.
             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image. This is set to 1024 by default for the best results.
             width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
@@ -687,11 +688,11 @@ class FluxPipeline(
                 their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                 will be used.
             guidance_scale (`float`, *optional*, defaults to 3.5):
+                Embedded guiddance scale is enabled by setting `guidance_scale` > 1. Higher `guidance_scale` encourages
+                a model to generate images more aligned with `prompt` at the expense of lower image quality.
+
+                Guidance-distilled models approximates true classifer-free guidance for `guidance_scale` > 1. Refer to
+                the [paper](https://huggingface.co/papers/2210.03142) to learn more.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -840,6 +841,8 @@ class FluxPipeline(

         # 5. Prepare timesteps
         sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        if hasattr(self.scheduler.config, "use_flow_sigmas") and self.scheduler.config.use_flow_sigmas:
+            sigmas = None
         image_seq_len = latents.shape[1]
         mu = calculate_shift(
             image_seq_len,
@@ -898,6 +901,8 @@ class FluxPipeline(
         )

         # 6. Denoising loop
+        # We set the index here to remove DtoH sync, helpful especially during compilation.
+        # Check out more details here: https://github.com/huggingface/diffusers/pull/11696
         self.scheduler.set_begin_index(0)
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
@@ -910,32 +915,35 @@ class FluxPipeline(
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)

-                    timestep=timestep / 1000,
-                    guidance=guidance,
-                    pooled_projections=pooled_prompt_embeds,
-                    encoder_hidden_states=prompt_embeds,
-                    txt_ids=text_ids,
-                    img_ids=latent_image_ids,
-                    joint_attention_kwargs=self.joint_attention_kwargs,
-                    return_dict=False,
-                )[0]
-                if do_true_cfg:
-                    if negative_image_embeds is not None:
-                        self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
-                    neg_noise_pred = self.transformer(
+                with self.transformer.cache_context("cond"):
+                    noise_pred = self.transformer(
                         hidden_states=latents,
                         timestep=timestep / 1000,
                         guidance=guidance,
-                        pooled_projections=
-                        encoder_hidden_states=
-                        txt_ids=
+                        pooled_projections=pooled_prompt_embeds,
+                        encoder_hidden_states=prompt_embeds,
+                        txt_ids=text_ids,
                         img_ids=latent_image_ids,
                         joint_attention_kwargs=self.joint_attention_kwargs,
                         return_dict=False,
                     )[0]
+
+                if do_true_cfg:
+                    if negative_image_embeds is not None:
+                        self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
+
+                    with self.transformer.cache_context("uncond"):
+                        neg_noise_pred = self.transformer(
+                            hidden_states=latents,
+                            timestep=timestep / 1000,
+                            guidance=guidance,
+                            pooled_projections=negative_pooled_prompt_embeds,
+                            encoder_hidden_states=negative_prompt_embeds,
+                            txt_ids=negative_text_ids,
+                            img_ids=latent_image_ids,
+                            joint_attention_kwargs=self.joint_attention_kwargs,
+                            return_dict=False,
+                        )[0]
                     noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)

                 # compute the previous noisy sample x_t -> x_t-1
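The docstring hunks above spell out FluxPipeline's two guidance knobs: `guidance_scale` drives the embedded (distilled) guidance, while `true_cfg_scale` > 1 together with a `negative_prompt` triggers the second, `"uncond"` transformer pass shown in the denoising-loop hunk. A short sketch of both knobs; the checkpoint id is a placeholder.

```python
# Sketch: the two guidance knobs documented above. The checkpoint id is a placeholder.
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = pipe(
    prompt="a macro photo of a dew-covered spider web",
    negative_prompt="blurry, low quality",  # needed for true CFG to take effect
    guidance_scale=3.5,                     # embedded guidance of the distilled model
    true_cfg_scale=4.0,                     # > 1 enables the extra unconditional pass
    num_inference_steps=28,
).images[0]
```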
diffusers/pipelines/flux/pipeline_flux_control.py

@@ -163,9 +163,9 @@ class FluxControlPipeline(
     TextualInversionLoaderMixin,
 ):
     r"""
-    The Flux pipeline for controllable text-to-image generation.
+    The Flux pipeline for controllable text-to-image generation with image conditions.

-    Reference: https://
+    Reference: https://bfl.ai/flux-1-tools

     Args:
         transformer ([`FluxTransformer2DModel`]):
@@ -324,7 +324,7 @@ class FluxControlPipeline(
     def encode_prompt(
         self,
         prompt: Union[str, List[str]],
-        prompt_2: Union[str, List[str]],
+        prompt_2: Optional[Union[str, List[str]]] = None,
         device: Optional[torch.device] = None,
         num_images_per_prompt: int = 1,
         prompt_embeds: Optional[torch.FloatTensor] = None,
@@ -661,11 +661,11 @@ class FluxControlPipeline(
                 their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                 will be used.
             guidance_scale (`float`, *optional*, defaults to 3.5):
+                Embedded guidance scale is enabled by setting `guidance_scale` > 1. Higher `guidance_scale` encourages
+                a model to generate images more aligned with prompt at the expense of lower image quality.
+
+                Guidance-distilled models approximates true classifier-free guidance for `guidance_scale` > 1. Refer to
+                the [paper](https://huggingface.co/papers/2210.03142) to learn more.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
diffusers/pipelines/flux/pipeline_flux_control_img2img.py

@@ -335,7 +335,7 @@ class FluxControlImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSin
     def encode_prompt(
         self,
         prompt: Union[str, List[str]],
-        prompt_2: Union[str, List[str]],
+        prompt_2: Optional[Union[str, List[str]]] = None,
         device: Optional[torch.device] = None,
         num_images_per_prompt: int = 1,
         prompt_embeds: Optional[torch.FloatTensor] = None,
diffusers/pipelines/flux/pipeline_flux_control_inpaint.py

@@ -374,7 +374,7 @@ class FluxControlInpaintPipeline(
     def encode_prompt(
         self,
         prompt: Union[str, List[str]],
-        prompt_2: Union[str, List[str]],
+        prompt_2: Optional[Union[str, List[str]]] = None,
         device: Optional[torch.device] = None,
         num_images_per_prompt: int = 1,
         prompt_embeds: Optional[torch.FloatTensor] = None,
diffusers/pipelines/flux/pipeline_flux_controlnet.py

@@ -341,7 +341,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
     def encode_prompt(
         self,
         prompt: Union[str, List[str]],
-        prompt_2: Union[str, List[str]],
+        prompt_2: Optional[Union[str, List[str]]] = None,
         device: Optional[torch.device] = None,
         num_images_per_prompt: int = 1,
         prompt_embeds: Optional[torch.FloatTensor] = None,
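The last four hunks make `prompt_2` an optional keyword in `encode_prompt` across the Flux control pipelines, matching the change to `FluxPipeline` above; internally the method falls back to `prompt` when `prompt_2` is not given. A rough sketch against `FluxControlPipeline` (the checkpoint id is a placeholder, and the three-tuple return mirrors the existing Flux `encode_prompt` behavior):

```python
# Sketch: after this change, encode_prompt no longer requires prompt_2.
# The checkpoint id is a placeholder.
import torch
from diffusers import FluxControlPipeline

pipe = FluxControlPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Canny-dev",  # placeholder checkpoint id
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")

prompt_embeds, pooled_prompt_embeds, text_ids = pipe.encode_prompt(
    prompt="a robot made of driftwood",  # prompt_2 defaults to None and falls back to prompt
    num_images_per_prompt=1,
)
```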
|