diffusers 0.31.0__py3-none-any.whl → 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +66 -5
- diffusers/callbacks.py +56 -3
- diffusers/configuration_utils.py +1 -1
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +25 -17
- diffusers/loaders/__init__.py +22 -3
- diffusers/loaders/ip_adapter.py +538 -15
- diffusers/loaders/lora_base.py +124 -118
- diffusers/loaders/lora_conversion_utils.py +318 -3
- diffusers/loaders/lora_pipeline.py +1688 -368
- diffusers/loaders/peft.py +379 -0
- diffusers/loaders/single_file_model.py +71 -4
- diffusers/loaders/single_file_utils.py +519 -9
- diffusers/loaders/textual_inversion.py +3 -3
- diffusers/loaders/transformer_flux.py +181 -0
- diffusers/loaders/transformer_sd3.py +89 -0
- diffusers/loaders/unet.py +17 -4
- diffusers/models/__init__.py +47 -14
- diffusers/models/activations.py +22 -9
- diffusers/models/attention.py +13 -4
- diffusers/models/attention_flax.py +1 -1
- diffusers/models/attention_processor.py +2059 -281
- diffusers/models/autoencoders/__init__.py +5 -0
- diffusers/models/autoencoders/autoencoder_dc.py +620 -0
- diffusers/models/autoencoders/autoencoder_kl.py +2 -1
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +36 -27
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
- diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
- diffusers/models/autoencoders/vae.py +18 -5
- diffusers/models/controlnet.py +47 -802
- diffusers/models/controlnet_flux.py +29 -495
- diffusers/models/controlnet_sd3.py +25 -379
- diffusers/models/controlnet_sparsectrl.py +46 -718
- diffusers/models/controlnets/__init__.py +23 -0
- diffusers/models/controlnets/controlnet.py +872 -0
- diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
- diffusers/models/controlnets/controlnet_flux.py +536 -0
- diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
- diffusers/models/controlnets/controlnet_sd3.py +489 -0
- diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
- diffusers/models/controlnets/controlnet_union.py +832 -0
- diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
- diffusers/models/controlnets/multicontrolnet.py +183 -0
- diffusers/models/embeddings.py +838 -43
- diffusers/models/model_loading_utils.py +88 -6
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +74 -28
- diffusers/models/normalization.py +78 -13
- diffusers/models/transformers/__init__.py +5 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +2 -2
- diffusers/models/transformers/cogvideox_transformer_3d.py +46 -11
- diffusers/models/transformers/dit_transformer_2d.py +1 -1
- diffusers/models/transformers/latte_transformer_3d.py +4 -4
- diffusers/models/transformers/pixart_transformer_2d.py +1 -1
- diffusers/models/transformers/sana_transformer.py +488 -0
- diffusers/models/transformers/stable_audio_transformer.py +1 -1
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +422 -0
- diffusers/models/transformers/transformer_cogview3plus.py +1 -1
- diffusers/models/transformers/transformer_flux.py +30 -9
- diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
- diffusers/models/transformers/transformer_ltx.py +469 -0
- diffusers/models/transformers/transformer_mochi.py +499 -0
- diffusers/models/transformers/transformer_sd3.py +105 -17
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +8 -1
- diffusers/models/unets/unet_2d_blocks.py +88 -21
- diffusers/models/unets/unet_2d_condition.py +1 -1
- diffusers/models/unets/unet_3d_blocks.py +9 -7
- diffusers/models/unets/unet_motion_model.py +5 -5
- diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
- diffusers/models/unets/unet_stable_cascade.py +2 -2
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +8 -0
- diffusers/pipelines/__init__.py +34 -0
- diffusers/pipelines/allegro/__init__.py +48 -0
- diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
- diffusers/pipelines/allegro/pipeline_output.py +23 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +8 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1 -1
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +0 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +8 -8
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -8
- diffusers/pipelines/auto_pipeline.py +53 -6
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +50 -22
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +51 -20
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +69 -21
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +47 -21
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +1 -1
- diffusers/pipelines/controlnet/__init__.py +86 -80
- diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
- diffusers/pipelines/controlnet/pipeline_controlnet.py +11 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +3 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +5 -1
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +53 -19
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -8
- diffusers/pipelines/flux/__init__.py +13 -1
- diffusers/pipelines/flux/modeling_flux.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +204 -29
- diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +49 -27
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +40 -30
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +78 -56
- diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
- diffusers/pipelines/flux/pipeline_flux_img2img.py +33 -27
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +36 -29
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
- diffusers/pipelines/flux/pipeline_output.py +16 -0
- diffusers/pipelines/hunyuan_video/__init__.py +48 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
- diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +5 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
- diffusers/pipelines/kolors/text_encoder.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/ltx/__init__.py +50 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
- diffusers/pipelines/ltx/pipeline_output.py +20 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +1 -8
- diffusers/pipelines/mochi/__init__.py +48 -0
- diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
- diffusers/pipelines/mochi/pipeline_output.py +20 -0
- diffusers/pipelines/pag/__init__.py +7 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +5 -1
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +6 -13
- diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +3 -0
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
- diffusers/pipelines/pipeline_flax_utils.py +1 -1
- diffusers/pipelines/pipeline_loading_utils.py +25 -4
- diffusers/pipelines/pipeline_utils.py +35 -6
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +6 -13
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +6 -13
- diffusers/pipelines/sana/__init__.py +47 -0
- diffusers/pipelines/sana/pipeline_output.py +21 -0
- diffusers/pipelines/sana/pipeline_sana.py +884 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +216 -20
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +62 -9
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +57 -8
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +0 -8
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +0 -8
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +0 -8
- diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/quantizers/auto.py +14 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -1
- diffusers/quantizers/gguf/__init__.py +1 -0
- diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
- diffusers/quantizers/gguf/utils.py +456 -0
- diffusers/quantizers/quantization_config.py +280 -2
- diffusers/quantizers/torchao/__init__.py +15 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
- diffusers/schedulers/scheduling_ddpm.py +2 -6
- diffusers/schedulers/scheduling_ddpm_parallel.py +2 -6
- diffusers/schedulers/scheduling_deis_multistep.py +28 -9
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +35 -9
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +35 -8
- diffusers/schedulers/scheduling_dpmsolver_sde.py +4 -4
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +48 -10
- diffusers/schedulers/scheduling_euler_discrete.py +4 -4
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
- diffusers/schedulers/scheduling_heun_discrete.py +4 -4
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +4 -4
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +4 -4
- diffusers/schedulers/scheduling_lcm.py +2 -6
- diffusers/schedulers/scheduling_lms_discrete.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +1 -1
- diffusers/schedulers/scheduling_sasolver.py +28 -9
- diffusers/schedulers/scheduling_tcd.py +2 -6
- diffusers/schedulers/scheduling_unipc_multistep.py +53 -8
- diffusers/training_utils.py +16 -2
- diffusers/utils/__init__.py +5 -0
- diffusers/utils/constants.py +1 -0
- diffusers/utils/dummy_pt_objects.py +180 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dynamic_modules_utils.py +3 -3
- diffusers/utils/hub_utils.py +31 -39
- diffusers/utils/import_utils.py +67 -0
- diffusers/utils/peft_utils.py +3 -0
- diffusers/utils/testing_utils.py +56 -1
- diffusers/utils/torch_utils.py +3 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/METADATA +69 -69
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/RECORD +214 -162
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
diffusers/loaders/transformer_flux.py
ADDED
@@ -0,0 +1,181 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import nullcontext

from ..models.embeddings import (
    ImageProjection,
    MultiIPAdapterImageProjection,
)
from ..models.modeling_utils import load_model_dict_into_meta
from ..utils import (
    is_accelerate_available,
    is_torch_version,
    logging,
)


if is_accelerate_available():
    pass

logger = logging.get_logger(__name__)


class FluxTransformer2DLoadersMixin:
    """
    Load layers into a [`FluxTransformer2DModel`].
    """

    def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_usage=False):
        if low_cpu_mem_usage:
            if is_accelerate_available():
                from accelerate import init_empty_weights

            else:
                low_cpu_mem_usage = False
                logger.warning(
                    "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
                    " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
                    " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
                    " install accelerate\n```\n."
                )

        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
            raise NotImplementedError(
                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
                " `low_cpu_mem_usage=False`."
            )

        updated_state_dict = {}
        image_projection = None
        init_context = init_empty_weights if low_cpu_mem_usage else nullcontext

        if "proj.weight" in state_dict:
            # IP-Adapter
            num_image_text_embeds = 4
            if state_dict["proj.weight"].shape[0] == 65536:
                num_image_text_embeds = 16
            clip_embeddings_dim = state_dict["proj.weight"].shape[-1]
            cross_attention_dim = state_dict["proj.weight"].shape[0] // num_image_text_embeds

            with init_context():
                image_projection = ImageProjection(
                    cross_attention_dim=cross_attention_dim,
                    image_embed_dim=clip_embeddings_dim,
                    num_image_text_embeds=num_image_text_embeds,
                )

            for key, value in state_dict.items():
                diffusers_name = key.replace("proj", "image_embeds")
                updated_state_dict[diffusers_name] = value

        if not low_cpu_mem_usage:
            image_projection.load_state_dict(updated_state_dict, strict=True)
        else:
            load_model_dict_into_meta(image_projection, updated_state_dict, device=self.device, dtype=self.dtype)

        return image_projection

    def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=False):
        from ..models.attention_processor import (
            FluxIPAdapterJointAttnProcessor2_0,
        )

        if low_cpu_mem_usage:
            if is_accelerate_available():
                from accelerate import init_empty_weights

            else:
                low_cpu_mem_usage = False
                logger.warning(
                    "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
                    " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
                    " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
                    " install accelerate\n```\n."
                )

        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
            raise NotImplementedError(
                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
                " `low_cpu_mem_usage=False`."
            )

        # set ip-adapter cross-attention processors & load state_dict
        attn_procs = {}
        key_id = 0
        init_context = init_empty_weights if low_cpu_mem_usage else nullcontext
        for name in self.attn_processors.keys():
            if name.startswith("single_transformer_blocks"):
                attn_processor_class = self.attn_processors[name].__class__
                attn_procs[name] = attn_processor_class()
            else:
                cross_attention_dim = self.config.joint_attention_dim
                hidden_size = self.inner_dim
                attn_processor_class = FluxIPAdapterJointAttnProcessor2_0
                num_image_text_embeds = []
                for state_dict in state_dicts:
                    if "proj.weight" in state_dict["image_proj"]:
                        num_image_text_embed = 4
                        if state_dict["image_proj"]["proj.weight"].shape[0] == 65536:
                            num_image_text_embed = 16
                        # IP-Adapter
                        num_image_text_embeds += [num_image_text_embed]

                with init_context():
                    attn_procs[name] = attn_processor_class(
                        hidden_size=hidden_size,
                        cross_attention_dim=cross_attention_dim,
                        scale=1.0,
                        num_tokens=num_image_text_embeds,
                        dtype=self.dtype,
                        device=self.device,
                    )

                value_dict = {}
                for i, state_dict in enumerate(state_dicts):
                    value_dict.update({f"to_k_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_k_ip.weight"]})
                    value_dict.update({f"to_v_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_v_ip.weight"]})
                    value_dict.update({f"to_k_ip.{i}.bias": state_dict["ip_adapter"][f"{key_id}.to_k_ip.bias"]})
                    value_dict.update({f"to_v_ip.{i}.bias": state_dict["ip_adapter"][f"{key_id}.to_v_ip.bias"]})

                if not low_cpu_mem_usage:
                    attn_procs[name].load_state_dict(value_dict)
                else:
                    device = self.device
                    dtype = self.dtype
                    load_model_dict_into_meta(attn_procs[name], value_dict, device=device, dtype=dtype)

                key_id += 1

        return attn_procs

    def _load_ip_adapter_weights(self, state_dicts, low_cpu_mem_usage=False):
        if not isinstance(state_dicts, list):
            state_dicts = [state_dicts]

        self.encoder_hid_proj = None

        attn_procs = self._convert_ip_adapter_attn_to_diffusers(state_dicts, low_cpu_mem_usage=low_cpu_mem_usage)
        self.set_attn_processor(attn_procs)

        image_projection_layers = []
        for state_dict in state_dicts:
            image_projection_layer = self._convert_ip_adapter_image_proj_to_diffusers(
                state_dict["image_proj"], low_cpu_mem_usage=low_cpu_mem_usage
            )
            image_projection_layers.append(image_projection_layer)

        self.encoder_hid_proj = MultiIPAdapterImageProjection(image_projection_layers)
        self.config.encoder_hid_dim_type = "ip_image_proj"

        self.to(dtype=self.dtype, device=self.device)
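A minimal usage sketch for the mixin above. The checkpoint filename is a placeholder, and the only assumptions beyond the code shown here are that the loaded file exposes the `image_proj`/`ip_adapter` sub-dicts the helpers index into and that FLUX.1-dev serves as an example base model; this is not an official loading recipe.

```python
import torch
from diffusers import FluxTransformer2DModel

transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev", subfolder="transformer", torch_dtype=torch.bfloat16
)

# Placeholder checkpoint laid out as {"image_proj": {...}, "ip_adapter": {...}},
# the structure _load_ip_adapter_weights() and its helpers expect.
state_dict = torch.load("flux_ip_adapter.bin", map_location="cpu")
transformer._load_ip_adapter_weights(state_dict, low_cpu_mem_usage=True)
```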
diffusers/loaders/transformer_sd3.py
ADDED
@@ -0,0 +1,89 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict

from ..models.attention_processor import SD3IPAdapterJointAttnProcessor2_0
from ..models.embeddings import IPAdapterTimeImageProjection
from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta


class SD3Transformer2DLoadersMixin:
    """Load IP-Adapters and LoRA layers into a `[SD3Transformer2DModel]`."""

    def _load_ip_adapter_weights(self, state_dict: Dict, low_cpu_mem_usage: bool = _LOW_CPU_MEM_USAGE_DEFAULT) -> None:
        """Sets IP-Adapter attention processors, image projection, and loads state_dict.

        Args:
            state_dict (`Dict`):
                State dict with keys "ip_adapter", which contains parameters for attention processors, and
                "image_proj", which contains parameters for image projection net.
            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
                argument to `True` will raise an error.
        """
        # IP-Adapter cross attention parameters
        hidden_size = self.config.attention_head_dim * self.config.num_attention_heads
        ip_hidden_states_dim = self.config.attention_head_dim * self.config.num_attention_heads
        timesteps_emb_dim = state_dict["ip_adapter"]["0.norm_ip.linear.weight"].shape[1]

        # Dict where key is transformer layer index, value is attention processor's state dict
        # ip_adapter state dict keys example: "0.norm_ip.linear.weight"
        layer_state_dict = {idx: {} for idx in range(len(self.attn_processors))}
        for key, weights in state_dict["ip_adapter"].items():
            idx, name = key.split(".", maxsplit=1)
            layer_state_dict[int(idx)][name] = weights

        # Create IP-Adapter attention processor
        attn_procs = {}
        for idx, name in enumerate(self.attn_processors.keys()):
            attn_procs[name] = SD3IPAdapterJointAttnProcessor2_0(
                hidden_size=hidden_size,
                ip_hidden_states_dim=ip_hidden_states_dim,
                head_dim=self.config.attention_head_dim,
                timesteps_emb_dim=timesteps_emb_dim,
            ).to(self.device, dtype=self.dtype)

            if not low_cpu_mem_usage:
                attn_procs[name].load_state_dict(layer_state_dict[idx], strict=True)
            else:
                load_model_dict_into_meta(
                    attn_procs[name], layer_state_dict[idx], device=self.device, dtype=self.dtype
                )

        self.set_attn_processor(attn_procs)

        # Image projetion parameters
        embed_dim = state_dict["image_proj"]["proj_in.weight"].shape[1]
        output_dim = state_dict["image_proj"]["proj_out.weight"].shape[0]
        hidden_dim = state_dict["image_proj"]["proj_in.weight"].shape[0]
        heads = state_dict["image_proj"]["layers.0.attn.to_q.weight"].shape[0] // 64
        num_queries = state_dict["image_proj"]["latents"].shape[1]
        timestep_in_dim = state_dict["image_proj"]["time_embedding.linear_1.weight"].shape[1]

        # Image projection
        self.image_proj = IPAdapterTimeImageProjection(
            embed_dim=embed_dim,
            output_dim=output_dim,
            hidden_dim=hidden_dim,
            heads=heads,
            num_queries=num_queries,
            timestep_in_dim=timestep_in_dim,
        ).to(device=self.device, dtype=self.dtype)

        if not low_cpu_mem_usage:
            self.image_proj.load_state_dict(state_dict["image_proj"], strict=True)
        else:
            load_model_dict_into_meta(self.image_proj, state_dict["image_proj"], device=self.device, dtype=self.dtype)
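The regrouping loop above splits each `ip_adapter` key into a transformer-layer index and a parameter name. A self-contained illustration of that step, using synthetic tensors rather than a real checkpoint:

```python
import torch

# Synthetic stand-in for state_dict["ip_adapter"]; shapes are arbitrary.
ip_adapter = {
    "0.norm_ip.linear.weight": torch.zeros(1280, 32),
    "0.to_k_ip.weight": torch.zeros(1536, 2432),
    "1.norm_ip.linear.weight": torch.zeros(1280, 32),
}

layer_state_dict = {idx: {} for idx in range(2)}
for key, weights in ip_adapter.items():
    idx, name = key.split(".", maxsplit=1)
    layer_state_dict[int(idx)][name] = weights

print(sorted(layer_state_dict[0]))  # ['norm_ip.linear.weight', 'to_k_ip.weight']
```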
diffusers/loaders/unet.py
CHANGED
@@ -36,6 +36,7 @@ from ..utils import (
     USE_PEFT_BACKEND,
     _get_model_file,
     convert_unet_state_dict_to_peft,
+    deprecate,
     get_adapter_name,
     get_peft_kwargs,
     is_accelerate_available,
@@ -209,6 +210,10 @@ class UNet2DConditionLoadersMixin:
         is_model_cpu_offload = False
         is_sequential_cpu_offload = False

+        if is_lora:
+            deprecation_message = "Using the `load_attn_procs()` method has been deprecated and will be removed in a future version. Please use `load_lora_adapter()`."
+            deprecate("load_attn_procs", "0.40.0", deprecation_message)
+
         if is_custom_diffusion:
             attn_processors = self._process_custom_diffusion(state_dict=state_dict)
         elif is_lora:
@@ -487,6 +492,9 @@ class UNet2DConditionLoadersMixin:
             )
             state_dict = {k: v for k, v in state_dict.items() if isinstance(v, torch.Tensor)}
         else:
+            deprecation_message = "Using the `save_attn_procs()` method has been deprecated and will be removed in a future version. Please use `save_lora_adapter()`."
+            deprecate("save_attn_procs", "0.40.0", deprecation_message)
+
             if not USE_PEFT_BACKEND:
                 raise ValueError("PEFT backend is required for saving LoRAs using the `save_attn_procs()` method.")

@@ -765,6 +773,7 @@ class UNet2DConditionLoadersMixin:
         from ..models.attention_processor import (
             IPAdapterAttnProcessor,
             IPAdapterAttnProcessor2_0,
+            IPAdapterXFormersAttnProcessor,
         )

         if low_cpu_mem_usage:
@@ -804,11 +813,15 @@ class UNet2DConditionLoadersMixin:
             if cross_attention_dim is None or "motion_modules" in name:
                 attn_processor_class = self.attn_processors[name].__class__
                 attn_procs[name] = attn_processor_class()
-
             else:
-
-
-
+                if "XFormers" in str(self.attn_processors[name].__class__):
+                    attn_processor_class = IPAdapterXFormersAttnProcessor
+                else:
+                    attn_processor_class = (
+                        IPAdapterAttnProcessor2_0
+                        if hasattr(F, "scaled_dot_product_attention")
+                        else IPAdapterAttnProcessor
+                    )
                 num_image_text_embeds = []
                 for state_dict in state_dicts:
                     if "proj.weight" in state_dict["image_proj"]:
diffusers/models/__init__.py
CHANGED
@@ -27,19 +27,29 @@ _import_structure = {}
 if is_torch_available():
     _import_structure["adapter"] = ["MultiAdapter", "T2IAdapter"]
     _import_structure["autoencoders.autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"]
+    _import_structure["autoencoders.autoencoder_dc"] = ["AutoencoderDC"]
     _import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"]
+    _import_structure["autoencoders.autoencoder_kl_allegro"] = ["AutoencoderKLAllegro"]
     _import_structure["autoencoders.autoencoder_kl_cogvideox"] = ["AutoencoderKLCogVideoX"]
+    _import_structure["autoencoders.autoencoder_kl_hunyuan_video"] = ["AutoencoderKLHunyuanVideo"]
+    _import_structure["autoencoders.autoencoder_kl_ltx"] = ["AutoencoderKLLTXVideo"]
+    _import_structure["autoencoders.autoencoder_kl_mochi"] = ["AutoencoderKLMochi"]
     _import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"]
     _import_structure["autoencoders.autoencoder_oobleck"] = ["AutoencoderOobleck"]
     _import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"]
     _import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"]
     _import_structure["autoencoders.vq_model"] = ["VQModel"]
-    _import_structure["controlnet"] = ["ControlNetModel"]
-    _import_structure["controlnet_flux"] = ["FluxControlNetModel", "FluxMultiControlNetModel"]
-    _import_structure["controlnet_hunyuan"] = [
-
-
-
+    _import_structure["controlnets.controlnet"] = ["ControlNetModel"]
+    _import_structure["controlnets.controlnet_flux"] = ["FluxControlNetModel", "FluxMultiControlNetModel"]
+    _import_structure["controlnets.controlnet_hunyuan"] = [
+        "HunyuanDiT2DControlNetModel",
+        "HunyuanDiT2DMultiControlNetModel",
+    ]
+    _import_structure["controlnets.controlnet_sd3"] = ["SD3ControlNetModel", "SD3MultiControlNetModel"]
+    _import_structure["controlnets.controlnet_sparsectrl"] = ["SparseControlNetModel"]
+    _import_structure["controlnets.controlnet_union"] = ["ControlNetUnionModel"]
+    _import_structure["controlnets.controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"]
+    _import_structure["controlnets.multicontrolnet"] = ["MultiControlNetModel"]
     _import_structure["embeddings"] = ["ImageProjection"]
     _import_structure["modeling_utils"] = ["ModelMixin"]
     _import_structure["transformers.auraflow_transformer_2d"] = ["AuraFlowTransformer2DModel"]
@@ -51,11 +61,16 @@ if is_torch_available():
     _import_structure["transformers.lumina_nextdit2d"] = ["LuminaNextDiT2DModel"]
     _import_structure["transformers.pixart_transformer_2d"] = ["PixArtTransformer2DModel"]
     _import_structure["transformers.prior_transformer"] = ["PriorTransformer"]
+    _import_structure["transformers.sana_transformer"] = ["SanaTransformer2DModel"]
     _import_structure["transformers.stable_audio_transformer"] = ["StableAudioDiTModel"]
     _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"]
     _import_structure["transformers.transformer_2d"] = ["Transformer2DModel"]
+    _import_structure["transformers.transformer_allegro"] = ["AllegroTransformer3DModel"]
     _import_structure["transformers.transformer_cogview3plus"] = ["CogView3PlusTransformer2DModel"]
     _import_structure["transformers.transformer_flux"] = ["FluxTransformer2DModel"]
+    _import_structure["transformers.transformer_hunyuan_video"] = ["HunyuanVideoTransformer3DModel"]
+    _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"]
+    _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]
     _import_structure["transformers.transformer_sd3"] = ["SD3Transformer2DModel"]
     _import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
     _import_structure["unets.unet_1d"] = ["UNet1DModel"]
@@ -70,7 +85,7 @@ if is_torch_available():
     _import_structure["unets.uvit_2d"] = ["UVit2DModel"]

 if is_flax_available():
-    _import_structure["controlnet_flax"] = ["FlaxControlNetModel"]
+    _import_structure["controlnets.controlnet_flax"] = ["FlaxControlNetModel"]
     _import_structure["unets.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"]
     _import_structure["vae_flax"] = ["FlaxAutoencoderKL"]

@@ -80,23 +95,37 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     from .adapter import MultiAdapter, T2IAdapter
     from .autoencoders import (
         AsymmetricAutoencoderKL,
+        AutoencoderDC,
         AutoencoderKL,
+        AutoencoderKLAllegro,
         AutoencoderKLCogVideoX,
+        AutoencoderKLHunyuanVideo,
+        AutoencoderKLLTXVideo,
+        AutoencoderKLMochi,
         AutoencoderKLTemporalDecoder,
         AutoencoderOobleck,
         AutoencoderTiny,
         ConsistencyDecoderVAE,
         VQModel,
     )
-    from .
-
-
-
-
-
+    from .controlnets import (
+        ControlNetModel,
+        ControlNetUnionModel,
+        ControlNetXSAdapter,
+        FluxControlNetModel,
+        FluxMultiControlNetModel,
+        HunyuanDiT2DControlNetModel,
+        HunyuanDiT2DMultiControlNetModel,
+        MultiControlNetModel,
+        SD3ControlNetModel,
+        SD3MultiControlNetModel,
+        SparseControlNetModel,
+        UNetControlNetXSModel,
+    )
     from .embeddings import ImageProjection
     from .modeling_utils import ModelMixin
     from .transformers import (
+        AllegroTransformer3DModel,
         AuraFlowTransformer2DModel,
         CogVideoXTransformer3DModel,
         CogView3PlusTransformer2DModel,
@@ -104,10 +133,14 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         DualTransformer2DModel,
         FluxTransformer2DModel,
         HunyuanDiT2DModel,
+        HunyuanVideoTransformer3DModel,
         LatteTransformer3DModel,
+        LTXVideoTransformer3DModel,
         LuminaNextDiT2DModel,
+        MochiTransformer3DModel,
         PixArtTransformer2DModel,
         PriorTransformer,
+        SanaTransformer2DModel,
         SD3Transformer2DModel,
         StableAudioDiTModel,
         T5FilmDecoder,
@@ -129,7 +162,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     )

     if is_flax_available():
-        from .
+        from .controlnets import FlaxControlNetModel
         from .unets import FlaxUNet2DConditionModel
         from .vae_flax import FlaxAutoencoderKL

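The practical effect of the lazy-import changes above is that the concrete ControlNet modules now live under `diffusers.models.controlnets`, while the top-level exports stay put. A small check of that expectation (assuming both paths resolve exactly as this `__init__.py` suggests):

```python
from diffusers import ControlNetModel
from diffusers.models.controlnets.controlnet import ControlNetModel as ControlNetModelNewPath

# The top-level name and the new module path should refer to the same class object.
assert ControlNetModel is ControlNetModelNewPath
```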
diffusers/models/activations.py
CHANGED
@@ -18,7 +18,7 @@ import torch.nn.functional as F
 from torch import nn

 from ..utils import deprecate
-from ..utils.import_utils import is_torch_npu_available
+from ..utils.import_utils import is_torch_npu_available, is_torch_version


 if is_torch_npu_available():
@@ -79,10 +79,10 @@ class GELU(nn.Module):
         self.approximate = approximate

     def gelu(self, gate: torch.Tensor) -> torch.Tensor:
-        if gate.device.type
-
-
-        return F.gelu(gate
+        if gate.device.type == "mps" and is_torch_version("<", "2.0.0"):
+            # fp16 gelu not supported on mps before torch 2.0
+            return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype)
+        return F.gelu(gate, approximate=self.approximate)

     def forward(self, hidden_states):
         hidden_states = self.proj(hidden_states)
@@ -105,10 +105,10 @@ class GEGLU(nn.Module):
         self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)

     def gelu(self, gate: torch.Tensor) -> torch.Tensor:
-        if gate.device.type
-
-
-        return F.gelu(gate
+        if gate.device.type == "mps" and is_torch_version("<", "2.0.0"):
+            # fp16 gelu not supported on mps before torch 2.0
+            return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
+        return F.gelu(gate)

     def forward(self, hidden_states, *args, **kwargs):
         if len(args) > 0 or kwargs.get("scale", None) is not None:
@@ -136,6 +136,7 @@ class SwiGLU(nn.Module):

     def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
         super().__init__()
+
         self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
         self.activation = nn.SiLU()

@@ -163,3 +164,15 @@ class ApproximateGELU(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
         return x * torch.sigmoid(1.702 * x)
+
+
+class LinearActivation(nn.Module):
+    def __init__(self, dim_in: int, dim_out: int, bias: bool = True, activation: str = "silu"):
+        super().__init__()
+
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+        self.activation = get_activation(activation)
+
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        return self.activation(hidden_states)
diffusers/models/attention.py
CHANGED
@@ -19,7 +19,7 @@ from torch import nn

 from ..utils import deprecate, logging
 from ..utils.torch_utils import maybe_allow_in_graph
-from .activations import GEGLU, GELU, ApproximateGELU, FP32SiLU, SwiGLU
+from .activations import GEGLU, GELU, ApproximateGELU, FP32SiLU, LinearActivation, SwiGLU
 from .attention_processor import Attention, JointAttnProcessor2_0
 from .embeddings import SinusoidalPositionalEmbedding
 from .normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm, SD35AdaLayerNormZeroX
@@ -188,8 +188,13 @@ class JointTransformerBlock(nn.Module):
         self._chunk_dim = dim

     def forward(
-        self,
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor,
+        temb: torch.FloatTensor,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
     ):
+        joint_attention_kwargs = joint_attention_kwargs or {}
         if self.use_dual_attention:
             norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1(
                 hidden_states, emb=temb
@@ -206,7 +211,9 @@ class JointTransformerBlock(nn.Module):

         # Attention.
         attn_output, context_attn_output = self.attn(
-            hidden_states=norm_hidden_states,
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_encoder_hidden_states,
+            **joint_attention_kwargs,
         )

         # Process attention outputs for the `hidden_states`.
@@ -214,7 +221,7 @@ class JointTransformerBlock(nn.Module):
         hidden_states = hidden_states + attn_output

         if self.use_dual_attention:
-            attn_output2 = self.attn2(hidden_states=norm_hidden_states2)
+            attn_output2 = self.attn2(hidden_states=norm_hidden_states2, **joint_attention_kwargs)
             attn_output2 = gate_msa2.unsqueeze(1) * attn_output2
             hidden_states = hidden_states + attn_output2

@@ -1222,6 +1229,8 @@ class FeedForward(nn.Module):
             act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
         elif activation_fn == "swiglu":
             act_fn = SwiGLU(dim, inner_dim, bias=bias)
+        elif activation_fn == "linear-silu":
+            act_fn = LinearActivation(dim, inner_dim, bias=bias, activation="silu")

         self.net = nn.ModuleList([])
         # project in
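The `"linear-silu"` branch added to `FeedForward` above wires `LinearActivation` into existing transformer blocks; a minimal sketch with arbitrary dimensions:

```python
import torch
from diffusers.models.attention import FeedForward

ff = FeedForward(dim=64, activation_fn="linear-silu")  # inner projection uses LinearActivation
out = ff(torch.randn(1, 16, 64))
print(out.shape)  # torch.Size([1, 16, 64])
```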
diffusers/models/attention_flax.py
CHANGED
@@ -216,8 +216,8 @@ class FlaxAttention(nn.Module):
             hidden_states = jax_memory_efficient_attention(
                 query_states, key_states, value_states, query_chunk_size=query_chunk_size, key_chunk_size=4096 * 4
             )
-
             hidden_states = hidden_states.transpose(1, 0, 2)
+            hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
         else:
             # compute attentions
             if self.split_head_dim:
|