PyPI - diffusers - Versions diffs - 0.29.0__py3-none-any.whl → 0.29.2__py3-none-any.whl - Mend

diffusers 0.29.0__py3-none-any.whl → 0.29.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

diffusers/__init__.py +7 -1
diffusers/loaders/lora.py +3 -4
diffusers/loaders/lora_conversion_utils.py +145 -110
diffusers/loaders/single_file.py +12 -0
diffusers/loaders/single_file_model.py +10 -8
diffusers/loaders/single_file_utils.py +33 -23
diffusers/models/__init__.py +2 -0
diffusers/models/controlnet_sd3.py +418 -0
diffusers/models/modeling_utils.py +10 -3
diffusers/models/transformers/transformer_2d.py +4 -2
diffusers/models/transformers/transformer_sd3.py +17 -8
diffusers/pipelines/__init__.py +9 -0
diffusers/pipelines/auto_pipeline.py +8 -0
diffusers/pipelines/controlnet_sd3/__init__.py +53 -0
diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +1062 -0
diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +23 -5
diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +23 -5
diffusers/utils/dummy_pt_objects.py +30 -0
diffusers/utils/dummy_torch_and_transformers_objects.py +15 -0
{diffusers-0.29.0.dist-info → diffusers-0.29.2.dist-info}/METADATA +44 -44
{diffusers-0.29.0.dist-info → diffusers-0.29.2.dist-info}/RECORD +25 -22
{diffusers-0.29.0.dist-info → diffusers-0.29.2.dist-info}/WHEEL +1 -1
{diffusers-0.29.0.dist-info → diffusers-0.29.2.dist-info}/LICENSE +0 -0
{diffusers-0.29.0.dist-info → diffusers-0.29.2.dist-info}/entry_points.txt +0 -0
{diffusers-0.29.0.dist-info → diffusers-0.29.2.dist-info}/top_level.txt +0 -0

diffusers/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.29.0"
+__version__ = "0.29.2"
 from typing import TYPE_CHECKING
@@ -91,6 +91,8 @@ else:
             "MultiAdapter",
             "PixArtTransformer2DModel",
             "PriorTransformer",
+            "SD3ControlNetModel",
+            "SD3MultiControlNetModel",
             "SD3Transformer2DModel",
             "StableCascadeUNet",
             "T2IAdapter",
@@ -278,6 +280,7 @@ else:
             "StableCascadeCombinedPipeline",
             "StableCascadeDecoderPipeline",
             "StableCascadePriorPipeline",
+            "StableDiffusion3ControlNetPipeline",
             "StableDiffusion3Img2ImgPipeline",
             "StableDiffusion3Pipeline",
             "StableDiffusionAdapterPipeline",
@@ -501,6 +504,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             MultiAdapter,
             PixArtTransformer2DModel,
             PriorTransformer,
+            SD3ControlNetModel,
+            SD3MultiControlNetModel,
             SD3Transformer2DModel,
             T2IAdapter,
             T5FilmDecoder,
@@ -666,6 +671,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             StableCascadeCombinedPipeline,
             StableCascadeDecoderPipeline,
             StableCascadePriorPipeline,
+            StableDiffusion3ControlNetPipeline,
             StableDiffusion3Img2ImgPipeline,
             StableDiffusion3Pipeline,
             StableDiffusionAdapterPipeline,

diffusers/loaders/lora.py CHANGED Viewed

@@ -42,7 +42,7 @@ from ..utils import (
     set_adapter_layers,
     set_weights_and_activate_adapters,
 )
-from .lora_conversion_utils import _convert_kohya_lora_to_diffusers, _maybe_map_sgm_blocks_to_diffusers
+from .lora_conversion_utils import _convert_non_diffusers_lora_to_diffusers, _maybe_map_sgm_blocks_to_diffusers
 if is_transformers_available():
@@ -287,7 +287,7 @@ class LoraLoaderMixin:
             if unet_config is not None:
                 # use unet config to remap block numbers
                 state_dict = _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config)
-            state_dict, network_alphas = _convert_kohya_lora_to_diffusers(state_dict)
+            state_dict, network_alphas = _convert_non_diffusers_lora_to_diffusers(state_dict)
         return state_dict, network_alphas
@@ -395,8 +395,7 @@ class LoraLoaderMixin:
         # their prefixes.
         keys = list(state_dict.keys())
         only_text_encoder = all(key.startswith(cls.text_encoder_name) for key in keys)
-        if any(key.startswith(cls.unet_name) for key in keys) and not only_text_encoder:
+        if not only_text_encoder:
             # Load the layers corresponding to UNet.
             logger.info(f"Loading {cls.unet_name}.")
             unet.load_attn_procs(

diffusers/loaders/lora_conversion_utils.py CHANGED Viewed

@@ -123,134 +123,76 @@ def _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config, delimiter="_", b
     return new_state_dict
-def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_name="text_encoder"):
+def _convert_non_diffusers_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_name="text_encoder"):
+    """
+    Converts a non-Diffusers LoRA state dict to a Diffusers compatible state dict.
+    Args:
+        state_dict (`dict`): The state dict to convert.
+        unet_name (`str`, optional): The name of the U-Net module in the Diffusers model. Defaults to "unet".
+        text_encoder_name (`str`, optional): The name of the text encoder module in the Diffusers model. Defaults to
+            "text_encoder".
+    Returns:
+        `tuple`: A tuple containing the converted state dict and a dictionary of alphas.
+    """
     unet_state_dict = {}
     te_state_dict = {}
     te2_state_dict = {}
     network_alphas = {}
-    is_unet_dora_lora = any("dora_scale" in k and "lora_unet_" in k for k in state_dict)
-    is_te_dora_lora = any("dora_scale" in k and ("lora_te_" in k or "lora_te1_" in k) for k in state_dict)
-    is_te2_dora_lora = any("dora_scale" in k and "lora_te2_" in k for k in state_dict)
-    if is_unet_dora_lora or is_te_dora_lora or is_te2_dora_lora:
+    # Check for DoRA-enabled LoRAs.
+    dora_present_in_unet = any("dora_scale" in k and "lora_unet_" in k for k in state_dict)
+    dora_present_in_te = any("dora_scale" in k and ("lora_te_" in k or "lora_te1_" in k) for k in state_dict)
+    dora_present_in_te2 = any("dora_scale" in k and "lora_te2_" in k for k in state_dict)
+    if dora_present_in_unet or dora_present_in_te or dora_present_in_te2:
         if is_peft_version("<", "0.9.0"):
             raise ValueError(
                 "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
             )
-    # every down weight has a corresponding up weight and potentially an alpha weight
-    lora_keys = [k for k in state_dict.keys() if k.endswith("lora_down.weight")]
-    for key in lora_keys:
+    # Iterate over all LoRA weights.
+    all_lora_keys = list(state_dict.keys())
+    for key in all_lora_keys:
+        if not key.endswith("lora_down.weight"):
+            continue
+        # Extract LoRA name.
         lora_name = key.split(".")[0]
+        # Find corresponding up weight and alpha.
         lora_name_up = lora_name + ".lora_up.weight"
         lora_name_alpha = lora_name + ".alpha"
+        # Handle U-Net LoRAs.
         if lora_name.startswith("lora_unet_"):
-            diffusers_name = key.replace("lora_unet_", "").replace("_", ".")
-            if "input.blocks" in diffusers_name:
-                diffusers_name = diffusers_name.replace("input.blocks", "down_blocks")
-            else:
-                diffusers_name = diffusers_name.replace("down.blocks", "down_blocks")
+            diffusers_name = _convert_unet_lora_key(key)
-            if "middle.block" in diffusers_name:
-                diffusers_name = diffusers_name.replace("middle.block", "mid_block")
-            else:
-                diffusers_name = diffusers_name.replace("mid.block", "mid_block")
-            if "output.blocks" in diffusers_name:
-                diffusers_name = diffusers_name.replace("output.blocks", "up_blocks")
-            else:
-                diffusers_name = diffusers_name.replace("up.blocks", "up_blocks")
-            diffusers_name = diffusers_name.replace("transformer.blocks", "transformer_blocks")
-            diffusers_name = diffusers_name.replace("to.q.lora", "to_q_lora")
-            diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora")
-            diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora")
-            diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora")
-            diffusers_name = diffusers_name.replace("proj.in", "proj_in")
-            diffusers_name = diffusers_name.replace("proj.out", "proj_out")
-            diffusers_name = diffusers_name.replace("emb.layers", "time_emb_proj")
-            # SDXL specificity.
-            if "emb" in diffusers_name and "time.emb.proj" not in diffusers_name:
-                pattern = r"\.\d+(?=\D*$)"
-                diffusers_name = re.sub(pattern, "", diffusers_name, count=1)
-            if ".in." in diffusers_name:
-                diffusers_name = diffusers_name.replace("in.layers.2", "conv1")
-            if ".out." in diffusers_name:
-                diffusers_name = diffusers_name.replace("out.layers.3", "conv2")
-            if "downsamplers" in diffusers_name or "upsamplers" in diffusers_name:
-                diffusers_name = diffusers_name.replace("op", "conv")
-            if "skip" in diffusers_name:
-                diffusers_name = diffusers_name.replace("skip.connection", "conv_shortcut")
-            # LyCORIS specificity.
-            if "time.emb.proj" in diffusers_name:
-                diffusers_name = diffusers_name.replace("time.emb.proj", "time_emb_proj")
-            if "conv.shortcut" in diffusers_name:
-                diffusers_name = diffusers_name.replace("conv.shortcut", "conv_shortcut")
-            # General coverage.
-            if "transformer_blocks" in diffusers_name:
-                if "attn1" in diffusers_name or "attn2" in diffusers_name:
-                    diffusers_name = diffusers_name.replace("attn1", "attn1.processor")
-                    diffusers_name = diffusers_name.replace("attn2", "attn2.processor")
-                    unet_state_dict[diffusers_name] = state_dict.pop(key)
-                    unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-                elif "ff" in diffusers_name:
-                    unet_state_dict[diffusers_name] = state_dict.pop(key)
-                    unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-            elif any(key in diffusers_name for key in ("proj_in", "proj_out")):
-                unet_state_dict[diffusers_name] = state_dict.pop(key)
-                unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-            else:
-                unet_state_dict[diffusers_name] = state_dict.pop(key)
-                unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+            # Store down and up weights.
+            unet_state_dict[diffusers_name] = state_dict.pop(key)
+            unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-            if is_unet_dora_lora:
+            # Store DoRA scale if present.
+            if dora_present_in_unet:
                 dora_scale_key_to_replace = "_lora.down." if "_lora.down." in diffusers_name else ".lora.down."
                 unet_state_dict[
                     diffusers_name.replace(dora_scale_key_to_replace, ".lora_magnitude_vector.")
                 ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale"))
+        # Handle text encoder LoRAs.
         elif lora_name.startswith(("lora_te_", "lora_te1_", "lora_te2_")):
+            diffusers_name = _convert_text_encoder_lora_key(key, lora_name)
+            # Store down and up weights for te or te2.
             if lora_name.startswith(("lora_te_", "lora_te1_")):
-                key_to_replace = "lora_te_" if lora_name.startswith("lora_te_") else "lora_te1_"
+                te_state_dict[diffusers_name] = state_dict.pop(key)
+                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
             else:
-                key_to_replace = "lora_te2_"
-            diffusers_name = key.replace(key_to_replace, "").replace("_", ".")
-            diffusers_name = diffusers_name.replace("text.model", "text_model")
-            diffusers_name = diffusers_name.replace("self.attn", "self_attn")
-            diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
-            diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
-            diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
-            diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
-            diffusers_name = diffusers_name.replace("text.projection", "text_projection")
-            if "self_attn" in diffusers_name:
-                if lora_name.startswith(("lora_te_", "lora_te1_")):
-                    te_state_dict[diffusers_name] = state_dict.pop(key)
-                    te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-                else:
-                    te2_state_dict[diffusers_name] = state_dict.pop(key)
-                    te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-            elif "mlp" in diffusers_name:
-                # Be aware that this is the new diffusers convention and the rest of the code might
-                # not utilize it yet.
-                diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
-                if lora_name.startswith(("lora_te_", "lora_te1_")):
-                    te_state_dict[diffusers_name] = state_dict.pop(key)
-                    te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-                else:
-                    te2_state_dict[diffusers_name] = state_dict.pop(key)
-                    te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-            # OneTrainer specificity
-            elif "text_projection" in diffusers_name and lora_name.startswith("lora_te2_"):
                 te2_state_dict[diffusers_name] = state_dict.pop(key)
                 te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-            if (is_te_dora_lora or is_te2_dora_lora) and lora_name.startswith(("lora_te_", "lora_te1_", "lora_te2_")):
+            # Store DoRA scale if present.
+            if dora_present_in_te or dora_present_in_te2:
                 dora_scale_key_to_replace_te = (
                     "_lora.down." if "_lora.down." in diffusers_name else ".lora_linear_layer."
                 )
@@ -263,22 +205,18 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
                         diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.")
                     ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale"))
-        # Rename the alphas so that they can be mapped appropriately.
+        # Store alpha if present.
         if lora_name_alpha in state_dict:
             alpha = state_dict.pop(lora_name_alpha).item()
-            if lora_name_alpha.startswith("lora_unet_"):
-                prefix = "unet."
-            elif lora_name_alpha.startswith(("lora_te_", "lora_te1_")):
-                prefix = "text_encoder."
-            else:
-                prefix = "text_encoder_2."
-            new_name = prefix + diffusers_name.split(".lora.")[0] + ".alpha"
-            network_alphas.update({new_name: alpha})
+            network_alphas.update(_get_alpha_name(lora_name_alpha, diffusers_name, alpha))
+    # Check if any keys remain.
     if len(state_dict) > 0:
         raise ValueError(f"The following keys have not been correctly renamed: \n\n {', '.join(state_dict.keys())}")
-    logger.info("Kohya-style checkpoint detected.")
+    logger.info("Non-diffusers checkpoint detected.")
+    # Construct final state dict.
     unet_state_dict = {f"{unet_name}.{module_name}": params for module_name, params in unet_state_dict.items()}
     te_state_dict = {f"{text_encoder_name}.{module_name}": params for module_name, params in te_state_dict.items()}
     te2_state_dict = (
@@ -291,3 +229,100 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
     new_state_dict = {**unet_state_dict, **te_state_dict}
     return new_state_dict, network_alphas
+def _convert_unet_lora_key(key):
+    """
+    Converts a U-Net LoRA key to a Diffusers compatible key.
+    """
+    diffusers_name = key.replace("lora_unet_", "").replace("_", ".")
+    # Replace common U-Net naming patterns.
+    diffusers_name = diffusers_name.replace("input.blocks", "down_blocks")
+    diffusers_name = diffusers_name.replace("down.blocks", "down_blocks")
+    diffusers_name = diffusers_name.replace("middle.block", "mid_block")
+    diffusers_name = diffusers_name.replace("mid.block", "mid_block")
+    diffusers_name = diffusers_name.replace("output.blocks", "up_blocks")
+    diffusers_name = diffusers_name.replace("up.blocks", "up_blocks")
+    diffusers_name = diffusers_name.replace("transformer.blocks", "transformer_blocks")
+    diffusers_name = diffusers_name.replace("to.q.lora", "to_q_lora")
+    diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora")
+    diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora")
+    diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora")
+    diffusers_name = diffusers_name.replace("proj.in", "proj_in")
+    diffusers_name = diffusers_name.replace("proj.out", "proj_out")
+    diffusers_name = diffusers_name.replace("emb.layers", "time_emb_proj")
+    # SDXL specific conversions.
+    if "emb" in diffusers_name and "time.emb.proj" not in diffusers_name:
+        pattern = r"\.\d+(?=\D*$)"
+        diffusers_name = re.sub(pattern, "", diffusers_name, count=1)
+    if ".in." in diffusers_name:
+        diffusers_name = diffusers_name.replace("in.layers.2", "conv1")
+    if ".out." in diffusers_name:
+        diffusers_name = diffusers_name.replace("out.layers.3", "conv2")
+    if "downsamplers" in diffusers_name or "upsamplers" in diffusers_name:
+        diffusers_name = diffusers_name.replace("op", "conv")
+    if "skip" in diffusers_name:
+        diffusers_name = diffusers_name.replace("skip.connection", "conv_shortcut")
+    # LyCORIS specific conversions.
+    if "time.emb.proj" in diffusers_name:
+        diffusers_name = diffusers_name.replace("time.emb.proj", "time_emb_proj")
+    if "conv.shortcut" in diffusers_name:
+        diffusers_name = diffusers_name.replace("conv.shortcut", "conv_shortcut")
+    # General conversions.
+    if "transformer_blocks" in diffusers_name:
+        if "attn1" in diffusers_name or "attn2" in diffusers_name:
+            diffusers_name = diffusers_name.replace("attn1", "attn1.processor")
+            diffusers_name = diffusers_name.replace("attn2", "attn2.processor")
+        elif "ff" in diffusers_name:
+            pass
+    elif any(key in diffusers_name for key in ("proj_in", "proj_out")):
+        pass
+    else:
+        pass
+    return diffusers_name
+def _convert_text_encoder_lora_key(key, lora_name):
+    """
+    Converts a text encoder LoRA key to a Diffusers compatible key.
+    """
+    if lora_name.startswith(("lora_te_", "lora_te1_")):
+        key_to_replace = "lora_te_" if lora_name.startswith("lora_te_") else "lora_te1_"
+    else:
+        key_to_replace = "lora_te2_"
+    diffusers_name = key.replace(key_to_replace, "").replace("_", ".")
+    diffusers_name = diffusers_name.replace("text.model", "text_model")
+    diffusers_name = diffusers_name.replace("self.attn", "self_attn")
+    diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
+    diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
+    diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
+    diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
+    diffusers_name = diffusers_name.replace("text.projection", "text_projection")
+    if "self_attn" in diffusers_name or "text_projection" in diffusers_name:
+        pass
+    elif "mlp" in diffusers_name:
+        # Be aware that this is the new diffusers convention and the rest of the code might
+        # not utilize it yet.
+        diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
+    return diffusers_name
+def _get_alpha_name(lora_name_alpha, diffusers_name, alpha):
+    """
+    Gets the correct alpha name for the Diffusers model.
+    """
+    if lora_name_alpha.startswith("lora_unet_"):
+        prefix = "unet."
+    elif lora_name_alpha.startswith(("lora_te_", "lora_te1_")):
+        prefix = "text_encoder."
+    else:
+        prefix = "text_encoder_2."
+    new_name = prefix + diffusers_name.split(".lora.")[0] + ".alpha"
+    return {new_name: alpha}

diffusers/loaders/single_file.py CHANGED Viewed

@@ -28,9 +28,11 @@ from .single_file_utils import (
     _legacy_load_safety_checker,
     _legacy_load_scheduler,
     create_diffusers_clip_model_from_ldm,
+    create_diffusers_t5_model_from_checkpoint,
     fetch_diffusers_config,
     fetch_original_config,
     is_clip_model_in_single_file,
+    is_t5_in_single_file,
     load_single_file_checkpoint,
 )
@@ -118,6 +120,16 @@ def load_single_file_sub_model(
             is_legacy_loading=is_legacy_loading,
         )
+    elif is_transformers_model and is_t5_in_single_file(checkpoint):
+        loaded_sub_model = create_diffusers_t5_model_from_checkpoint(
+            class_obj,
+            checkpoint=checkpoint,
+            config=cached_model_config_path,
+            subfolder=name,
+            torch_dtype=torch_dtype,
+            local_files_only=local_files_only,
+        )
     elif is_tokenizer and is_legacy_loading:
         loaded_sub_model = _legacy_load_clip_tokenizer(
             class_obj, checkpoint=checkpoint, config=cached_model_config_path, local_files_only=local_files_only

diffusers/loaders/single_file_model.py CHANGED Viewed

@@ -276,16 +276,18 @@ class FromOriginalModelMixin:
         if is_accelerate_available():
             unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
-            if model._keys_to_ignore_on_load_unexpected is not None:
-                for pat in model._keys_to_ignore_on_load_unexpected:
-                    unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
-            if len(unexpected_keys) > 0:
-                logger.warning(
-                    f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
-                )
         else:
-            model.load_state_dict(diffusers_format_checkpoint)
+            _, unexpected_keys = model.load_state_dict(diffusers_format_checkpoint, strict=False)
+        if model._keys_to_ignore_on_load_unexpected is not None:
+            for pat in model._keys_to_ignore_on_load_unexpected:
+                unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
+        if len(unexpected_keys) > 0:
+            logger.warning(
+                f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
+            )
         if torch_dtype is not None:
             model.to(torch_dtype)

diffusers/loaders/single_file_utils.py CHANGED Viewed

@@ -252,7 +252,6 @@ LDM_CONTROLNET_KEY = "control_model."
 LDM_CLIP_PREFIX_TO_REMOVE = [
     "cond_stage_model.transformer.",
     "conditioner.embedders.0.transformer.",
-    "text_encoders.clip_l.transformer.",
 ]
 OPEN_CLIP_PREFIX = "conditioner.embedders.0.model."
 LDM_OPEN_CLIP_TEXT_PROJECTION_DIM = 1024
@@ -399,11 +398,14 @@ def is_open_clip_sdxl_model(checkpoint):
 def is_open_clip_sd3_model(checkpoint):
-    is_open_clip_sdxl_refiner_model(checkpoint)
+    if CHECKPOINT_KEY_NAMES["open_clip_sd3"] in checkpoint:
+        return True
+    return False
 def is_open_clip_sdxl_refiner_model(checkpoint):
-    if CHECKPOINT_KEY_NAMES["open_clip_sd3"] in checkpoint:
+    if CHECKPOINT_KEY_NAMES["open_clip_sdxl_refiner"] in checkpoint:
         return True
     return False
@@ -1233,11 +1235,14 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
     return new_checkpoint
-def convert_ldm_clip_checkpoint(checkpoint):
+def convert_ldm_clip_checkpoint(checkpoint, remove_prefix=None):
     keys = list(checkpoint.keys())
     text_model_dict = {}
-    remove_prefixes = LDM_CLIP_PREFIX_TO_REMOVE
+    remove_prefixes = []
+    remove_prefixes.extend(LDM_CLIP_PREFIX_TO_REMOVE)
+    if remove_prefix:
+        remove_prefixes.append(remove_prefix)
     for key in keys:
         for prefix in remove_prefixes:
@@ -1263,8 +1268,6 @@ def convert_open_clip_checkpoint(
     else:
         text_proj_dim = LDM_OPEN_CLIP_TEXT_PROJECTION_DIM
-    text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
     keys = list(checkpoint.keys())
     keys_to_ignore = SD_2_TEXT_ENCODER_KEYS_TO_IGNORE
@@ -1313,9 +1316,6 @@ def convert_open_clip_checkpoint(
         else:
             text_model_dict[diffusers_key] = checkpoint.get(key)
-    if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)):
-        text_model_dict.pop("text_model.embeddings.position_ids", None)
     return text_model_dict
@@ -1376,6 +1376,13 @@ def create_diffusers_clip_model_from_ldm(
     ):
         diffusers_format_checkpoint = convert_ldm_clip_checkpoint(checkpoint)
+    elif (
+        is_clip_sd3_model(checkpoint)
+        and checkpoint[CHECKPOINT_KEY_NAMES["clip_sd3"]].shape[-1] == position_embedding_dim
+    ):
+        diffusers_format_checkpoint = convert_ldm_clip_checkpoint(checkpoint, "text_encoders.clip_l.transformer.")
+        diffusers_format_checkpoint["text_projection.weight"] = torch.eye(position_embedding_dim)
     elif is_open_clip_model(checkpoint):
         prefix = "cond_stage_model.model."
         diffusers_format_checkpoint = convert_open_clip_checkpoint(model, checkpoint, prefix=prefix)
@@ -1391,26 +1398,28 @@ def create_diffusers_clip_model_from_ldm(
         prefix = "conditioner.embedders.0.model."
         diffusers_format_checkpoint = convert_open_clip_checkpoint(model, checkpoint, prefix=prefix)
-    elif is_open_clip_sd3_model(checkpoint):
-        prefix = "text_encoders.clip_g.transformer."
-        diffusers_format_checkpoint = convert_open_clip_checkpoint(model, checkpoint, prefix=prefix)
+    elif (
+        is_open_clip_sd3_model(checkpoint)
+        and checkpoint[CHECKPOINT_KEY_NAMES["open_clip_sd3"]].shape[-1] == position_embedding_dim
+    ):
+        diffusers_format_checkpoint = convert_ldm_clip_checkpoint(checkpoint, "text_encoders.clip_g.transformer.")
     else:
         raise ValueError("The provided checkpoint does not seem to contain a valid CLIP model.")
     if is_accelerate_available():
         unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
-        if model._keys_to_ignore_on_load_unexpected is not None:
-            for pat in model._keys_to_ignore_on_load_unexpected:
-                unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
+    else:
+        _, unexpected_keys = model.load_state_dict(diffusers_format_checkpoint, strict=False)
-        if len(unexpected_keys) > 0:
-            logger.warning(
-                f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
-            )
+    if model._keys_to_ignore_on_load_unexpected is not None:
+        for pat in model._keys_to_ignore_on_load_unexpected:
+            unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
-    else:
-        model.load_state_dict(diffusers_format_checkpoint)
+    if len(unexpected_keys) > 0:
+        logger.warning(
+            f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
+        )
     if torch_dtype is not None:
         model.to(torch_dtype)
@@ -1755,7 +1764,7 @@ def convert_sd3_t5_checkpoint_to_diffusers(checkpoint):
     keys = list(checkpoint.keys())
     text_model_dict = {}
-    remove_prefixes = ["text_encoders.t5xxl.transformer.encoder."]
+    remove_prefixes = ["text_encoders.t5xxl.transformer."]
     for key in keys:
         for prefix in remove_prefixes:
@@ -1799,3 +1808,4 @@ def create_diffusers_t5_model_from_checkpoint(
     else:
         model.load_state_dict(diffusers_format_checkpoint)
+    return model

diffusers/models/__init__.py CHANGED Viewed

@@ -33,6 +33,7 @@ if is_torch_available():
     _import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"]
     _import_structure["autoencoders.vq_model"] = ["VQModel"]
     _import_structure["controlnet"] = ["ControlNetModel"]
+    _import_structure["controlnet_sd3"] = ["SD3ControlNetModel", "SD3MultiControlNetModel"]
     _import_structure["controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"]
     _import_structure["embeddings"] = ["ImageProjection"]
     _import_structure["modeling_utils"] = ["ModelMixin"]
@@ -74,6 +75,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             VQModel,
         )
         from .controlnet import ControlNetModel
+        from .controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel
         from .controlnet_xs import ControlNetXSAdapter, UNetControlNetXSModel
         from .embeddings import ImageProjection
         from .modeling_utils import ModelMixin

diffusers 0.29.0__py3-none-any.whl → 0.29.2__py3-none-any.whl

diffusers 0.29.0__py3-none-any.whl → 0.29.2__py3-none-any.whl