diffusers 0.33.1__py3-none-any.whl → 0.35.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- diffusers/__init__.py +145 -1
- diffusers/callbacks.py +35 -0
- diffusers/commands/__init__.py +1 -1
- diffusers/commands/custom_blocks.py +134 -0
- diffusers/commands/diffusers_cli.py +3 -1
- diffusers/commands/env.py +1 -1
- diffusers/commands/fp16_safetensors.py +2 -2
- diffusers/configuration_utils.py +11 -2
- diffusers/dependency_versions_check.py +1 -1
- diffusers/dependency_versions_table.py +3 -3
- diffusers/experimental/rl/value_guided_sampling.py +1 -1
- diffusers/guiders/__init__.py +41 -0
- diffusers/guiders/adaptive_projected_guidance.py +188 -0
- diffusers/guiders/auto_guidance.py +190 -0
- diffusers/guiders/classifier_free_guidance.py +141 -0
- diffusers/guiders/classifier_free_zero_star_guidance.py +152 -0
- diffusers/guiders/frequency_decoupled_guidance.py +327 -0
- diffusers/guiders/guider_utils.py +309 -0
- diffusers/guiders/perturbed_attention_guidance.py +271 -0
- diffusers/guiders/skip_layer_guidance.py +262 -0
- diffusers/guiders/smoothed_energy_guidance.py +251 -0
- diffusers/guiders/tangential_classifier_free_guidance.py +143 -0
- diffusers/hooks/__init__.py +17 -0
- diffusers/hooks/_common.py +56 -0
- diffusers/hooks/_helpers.py +293 -0
- diffusers/hooks/faster_cache.py +9 -8
- diffusers/hooks/first_block_cache.py +259 -0
- diffusers/hooks/group_offloading.py +332 -227
- diffusers/hooks/hooks.py +58 -3
- diffusers/hooks/layer_skip.py +263 -0
- diffusers/hooks/layerwise_casting.py +5 -10
- diffusers/hooks/pyramid_attention_broadcast.py +15 -12
- diffusers/hooks/smoothed_energy_guidance_utils.py +167 -0
- diffusers/hooks/utils.py +43 -0
- diffusers/image_processor.py +7 -2
- diffusers/loaders/__init__.py +10 -0
- diffusers/loaders/ip_adapter.py +260 -18
- diffusers/loaders/lora_base.py +261 -127
- diffusers/loaders/lora_conversion_utils.py +657 -35
- diffusers/loaders/lora_pipeline.py +2778 -1246
- diffusers/loaders/peft.py +78 -112
- diffusers/loaders/single_file.py +2 -2
- diffusers/loaders/single_file_model.py +64 -15
- diffusers/loaders/single_file_utils.py +395 -7
- diffusers/loaders/textual_inversion.py +3 -2
- diffusers/loaders/transformer_flux.py +10 -11
- diffusers/loaders/transformer_sd3.py +8 -3
- diffusers/loaders/unet.py +24 -21
- diffusers/loaders/unet_loader_utils.py +6 -3
- diffusers/loaders/utils.py +1 -1
- diffusers/models/__init__.py +23 -1
- diffusers/models/activations.py +5 -5
- diffusers/models/adapter.py +2 -3
- diffusers/models/attention.py +488 -7
- diffusers/models/attention_dispatch.py +1218 -0
- diffusers/models/attention_flax.py +10 -10
- diffusers/models/attention_processor.py +113 -667
- diffusers/models/auto_model.py +49 -12
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_dc.py +17 -4
- diffusers/models/autoencoders/autoencoder_kl.py +5 -5
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
- diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1110 -0
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_qwenimage.py +1070 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_wan.py +626 -62
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
- diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vae.py +13 -2
- diffusers/models/autoencoders/vq_model.py +2 -2
- diffusers/models/cache_utils.py +32 -10
- diffusers/models/controlnet.py +1 -1
- diffusers/models/controlnet_flux.py +1 -1
- diffusers/models/controlnet_sd3.py +1 -1
- diffusers/models/controlnet_sparsectrl.py +1 -1
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -3
- diffusers/models/controlnets/controlnet_flax.py +1 -1
- diffusers/models/controlnets/controlnet_flux.py +21 -20
- diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
- diffusers/models/controlnets/controlnet_sana.py +290 -0
- diffusers/models/controlnets/controlnet_sd3.py +1 -1
- diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
- diffusers/models/controlnets/controlnet_union.py +5 -5
- diffusers/models/controlnets/controlnet_xs.py +7 -7
- diffusers/models/controlnets/multicontrolnet.py +4 -5
- diffusers/models/controlnets/multicontrolnet_union.py +5 -6
- diffusers/models/downsampling.py +2 -2
- diffusers/models/embeddings.py +36 -46
- diffusers/models/embeddings_flax.py +2 -2
- diffusers/models/lora.py +3 -3
- diffusers/models/model_loading_utils.py +233 -1
- diffusers/models/modeling_flax_utils.py +1 -2
- diffusers/models/modeling_utils.py +203 -108
- diffusers/models/normalization.py +4 -4
- diffusers/models/resnet.py +2 -2
- diffusers/models/resnet_flax.py +1 -1
- diffusers/models/transformers/__init__.py +7 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
- diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
- diffusers/models/transformers/consisid_transformer_3d.py +1 -1
- diffusers/models/transformers/dit_transformer_2d.py +2 -2
- diffusers/models/transformers/dual_transformer_2d.py +1 -1
- diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
- diffusers/models/transformers/latte_transformer_3d.py +4 -5
- diffusers/models/transformers/lumina_nextdit2d.py +2 -2
- diffusers/models/transformers/pixart_transformer_2d.py +3 -3
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/sana_transformer.py +8 -3
- diffusers/models/transformers/stable_audio_transformer.py +5 -9
- diffusers/models/transformers/t5_film_transformer.py +3 -3
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +1 -1
- diffusers/models/transformers/transformer_chroma.py +641 -0
- diffusers/models/transformers/transformer_cogview3plus.py +5 -10
- diffusers/models/transformers/transformer_cogview4.py +353 -27
- diffusers/models/transformers/transformer_cosmos.py +586 -0
- diffusers/models/transformers/transformer_flux.py +376 -138
- diffusers/models/transformers/transformer_hidream_image.py +942 -0
- diffusers/models/transformers/transformer_hunyuan_video.py +12 -8
- diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
- diffusers/models/transformers/transformer_ltx.py +105 -24
- diffusers/models/transformers/transformer_lumina2.py +1 -1
- diffusers/models/transformers/transformer_mochi.py +1 -1
- diffusers/models/transformers/transformer_omnigen.py +2 -2
- diffusers/models/transformers/transformer_qwenimage.py +645 -0
- diffusers/models/transformers/transformer_sd3.py +7 -7
- diffusers/models/transformers/transformer_skyreels_v2.py +607 -0
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/transformers/transformer_wan.py +316 -87
- diffusers/models/transformers/transformer_wan_vace.py +387 -0
- diffusers/models/unets/unet_1d.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +1 -1
- diffusers/models/unets/unet_2d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
- diffusers/models/unets/unet_2d_condition.py +4 -3
- diffusers/models/unets/unet_2d_condition_flax.py +2 -2
- diffusers/models/unets/unet_3d_blocks.py +1 -1
- diffusers/models/unets/unet_3d_condition.py +3 -3
- diffusers/models/unets/unet_i2vgen_xl.py +3 -3
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +2 -2
- diffusers/models/unets/unet_stable_cascade.py +1 -1
- diffusers/models/upsampling.py +2 -2
- diffusers/models/vae_flax.py +2 -2
- diffusers/models/vq_model.py +1 -1
- diffusers/modular_pipelines/__init__.py +83 -0
- diffusers/modular_pipelines/components_manager.py +1068 -0
- diffusers/modular_pipelines/flux/__init__.py +66 -0
- diffusers/modular_pipelines/flux/before_denoise.py +689 -0
- diffusers/modular_pipelines/flux/decoders.py +109 -0
- diffusers/modular_pipelines/flux/denoise.py +227 -0
- diffusers/modular_pipelines/flux/encoders.py +412 -0
- diffusers/modular_pipelines/flux/modular_blocks.py +181 -0
- diffusers/modular_pipelines/flux/modular_pipeline.py +59 -0
- diffusers/modular_pipelines/modular_pipeline.py +2446 -0
- diffusers/modular_pipelines/modular_pipeline_utils.py +672 -0
- diffusers/modular_pipelines/node_utils.py +665 -0
- diffusers/modular_pipelines/stable_diffusion_xl/__init__.py +77 -0
- diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py +1874 -0
- diffusers/modular_pipelines/stable_diffusion_xl/decoders.py +208 -0
- diffusers/modular_pipelines/stable_diffusion_xl/denoise.py +771 -0
- diffusers/modular_pipelines/stable_diffusion_xl/encoders.py +887 -0
- diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py +380 -0
- diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py +365 -0
- diffusers/modular_pipelines/wan/__init__.py +66 -0
- diffusers/modular_pipelines/wan/before_denoise.py +365 -0
- diffusers/modular_pipelines/wan/decoders.py +105 -0
- diffusers/modular_pipelines/wan/denoise.py +261 -0
- diffusers/modular_pipelines/wan/encoders.py +242 -0
- diffusers/modular_pipelines/wan/modular_blocks.py +144 -0
- diffusers/modular_pipelines/wan/modular_pipeline.py +90 -0
- diffusers/pipelines/__init__.py +68 -6
- diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
- diffusers/pipelines/amused/pipeline_amused.py +7 -6
- diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
- diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
- diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +22 -13
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
- diffusers/pipelines/auto_pipeline.py +23 -20
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
- diffusers/pipelines/chroma/__init__.py +49 -0
- diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
- diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
- diffusers/pipelines/chroma/pipeline_output.py +21 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +17 -16
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +17 -16
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +18 -17
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +17 -16
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
- diffusers/pipelines/cogview4/pipeline_cogview4.py +23 -22
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
- diffusers/pipelines/consisid/consisid_utils.py +2 -2
- diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +11 -10
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +226 -107
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +12 -8
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +207 -105
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
- diffusers/pipelines/cosmos/__init__.py +54 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
- diffusers/pipelines/cosmos/pipeline_output.py +40 -0
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
- diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
- diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
- diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
- diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
- diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
- diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
- diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +8 -8
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
- diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +4 -2
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
- diffusers/pipelines/flux/__init__.py +4 -0
- diffusers/pipelines/flux/modeling_flux.py +1 -1
- diffusers/pipelines/flux/pipeline_flux.py +37 -36
- diffusers/pipelines/flux/pipeline_flux_control.py +9 -9
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +7 -7
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +7 -7
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +7 -7
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +31 -23
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +3 -2
- diffusers/pipelines/flux/pipeline_flux_fill.py +7 -7
- diffusers/pipelines/flux/pipeline_flux_img2img.py +40 -7
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +12 -7
- diffusers/pipelines/flux/pipeline_flux_kontext.py +1134 -0
- diffusers/pipelines/flux/pipeline_flux_kontext_inpaint.py +1460 -0
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +2 -2
- diffusers/pipelines/flux/pipeline_output.py +6 -4
- diffusers/pipelines/free_init_utils.py +2 -2
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hidream_image/__init__.py +47 -0
- diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
- diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
- diffusers/pipelines/hunyuan_video/__init__.py +2 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +26 -25
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
- diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
- diffusers/pipelines/kolors/text_encoder.py +3 -3
- diffusers/pipelines/kolors/tokenizer.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
- diffusers/pipelines/latte/pipeline_latte.py +12 -12
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
- diffusers/pipelines/ltx/__init__.py +4 -0
- diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +64 -18
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +117 -38
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +63 -18
- diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
- diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
- diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
- diffusers/pipelines/mochi/pipeline_mochi.py +15 -14
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
- diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
- diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
- diffusers/pipelines/onnx_utils.py +15 -2
- diffusers/pipelines/pag/pag_utils.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
- diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
- diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
- diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
- diffusers/pipelines/pia/pipeline_pia.py +8 -6
- diffusers/pipelines/pipeline_flax_utils.py +5 -6
- diffusers/pipelines/pipeline_loading_utils.py +113 -15
- diffusers/pipelines/pipeline_utils.py +127 -48
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +14 -12
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +31 -11
- diffusers/pipelines/qwenimage/__init__.py +55 -0
- diffusers/pipelines/qwenimage/pipeline_output.py +21 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage.py +726 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +882 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py +829 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py +1015 -0
- diffusers/pipelines/sana/__init__.py +4 -0
- diffusers/pipelines/sana/pipeline_sana.py +23 -21
- diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
- diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
- diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
- diffusers/pipelines/shap_e/camera.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
- diffusers/pipelines/shap_e/renderer.py +3 -3
- diffusers/pipelines/skyreels_v2/__init__.py +59 -0
- diffusers/pipelines/skyreels_v2/pipeline_output.py +20 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py +610 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing.py +978 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_i2v.py +1059 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_v2v.py +1063 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py +745 -0
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
- diffusers/pipelines/stable_diffusion/__init__.py +0 -7
- diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +12 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +11 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +10 -9
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
- diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +13 -12
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
- diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
- diffusers/pipelines/unclip/text_proj.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
- diffusers/pipelines/visualcloze/__init__.py +52 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
- diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
- diffusers/pipelines/wan/__init__.py +2 -0
- diffusers/pipelines/wan/pipeline_wan.py +91 -30
- diffusers/pipelines/wan/pipeline_wan_i2v.py +145 -45
- diffusers/pipelines/wan/pipeline_wan_vace.py +975 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +14 -16
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
- diffusers/quantizers/__init__.py +3 -1
- diffusers/quantizers/base.py +17 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
- diffusers/quantizers/bitsandbytes/utils.py +10 -7
- diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
- diffusers/quantizers/gguf/utils.py +108 -16
- diffusers/quantizers/pipe_quant_config.py +202 -0
- diffusers/quantizers/quantization_config.py +18 -16
- diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +31 -1
- diffusers/schedulers/__init__.py +3 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
- diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -1
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
- diffusers/schedulers/scheduling_ddim.py +8 -8
- diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_ddim_flax.py +6 -6
- diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
- diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
- diffusers/schedulers/scheduling_ddpm.py +9 -9
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
- diffusers/schedulers/scheduling_deis_multistep.py +16 -9
- diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +18 -12
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +19 -13
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
- diffusers/schedulers/scheduling_edm_euler.py +20 -11
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
- diffusers/schedulers/scheduling_heun_discrete.py +2 -2
- diffusers/schedulers/scheduling_ipndm.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
- diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
- diffusers/schedulers/scheduling_lcm.py +3 -3
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
- diffusers/schedulers/scheduling_pndm.py +4 -4
- diffusers/schedulers/scheduling_pndm_flax.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +9 -9
- diffusers/schedulers/scheduling_sasolver.py +15 -15
- diffusers/schedulers/scheduling_scm.py +1 -2
- diffusers/schedulers/scheduling_sde_ve.py +1 -1
- diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
- diffusers/schedulers/scheduling_tcd.py +3 -3
- diffusers/schedulers/scheduling_unclip.py +5 -5
- diffusers/schedulers/scheduling_unipc_multistep.py +21 -12
- diffusers/schedulers/scheduling_utils.py +3 -3
- diffusers/schedulers/scheduling_utils_flax.py +2 -2
- diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
- diffusers/training_utils.py +91 -5
- diffusers/utils/__init__.py +15 -0
- diffusers/utils/accelerate_utils.py +1 -1
- diffusers/utils/constants.py +4 -0
- diffusers/utils/doc_utils.py +1 -1
- diffusers/utils/dummy_pt_objects.py +432 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +480 -0
- diffusers/utils/dynamic_modules_utils.py +85 -8
- diffusers/utils/export_utils.py +1 -1
- diffusers/utils/hub_utils.py +33 -17
- diffusers/utils/import_utils.py +151 -18
- diffusers/utils/logging.py +1 -1
- diffusers/utils/outputs.py +2 -1
- diffusers/utils/peft_utils.py +96 -10
- diffusers/utils/state_dict_utils.py +20 -3
- diffusers/utils/testing_utils.py +195 -17
- diffusers/utils/torch_utils.py +43 -5
- diffusers/video_processor.py +2 -2
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/METADATA +72 -57
- diffusers-0.35.0.dist-info/RECORD +703 -0
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/WHEEL +1 -1
- diffusers-0.33.1.dist-info/RECORD +0 -608
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/LICENSE +0 -0
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/top_level.txt +0 -0
diffusers/loaders/lora_conversion_utils.py

@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,6 +33,24 @@ def _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config, delimiter="_", b
     # 1. get all state_dict_keys
     all_keys = list(state_dict.keys())
     sgm_patterns = ["input_blocks", "middle_block", "output_blocks"]
+    not_sgm_patterns = ["down_blocks", "mid_block", "up_blocks"]
+
+    # check if state_dict contains both patterns
+    contains_sgm_patterns = False
+    contains_not_sgm_patterns = False
+    for key in all_keys:
+        if any(p in key for p in sgm_patterns):
+            contains_sgm_patterns = True
+        elif any(p in key for p in not_sgm_patterns):
+            contains_not_sgm_patterns = True
+
+    # if state_dict contains both patterns, remove sgm
+    # we can then return state_dict immediately
+    if contains_sgm_patterns and contains_not_sgm_patterns:
+        for key in all_keys:
+            if any(p in key for p in sgm_patterns):
+                state_dict.pop(key)
+        return state_dict

     # 2. check if needs remapping, if not return original dict
     is_in_sgm_format = False
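Note: the hunk above makes `_maybe_map_sgm_blocks_to_diffusers` tolerate checkpoints that mix SGM-style keys (`input_blocks`/`middle_block`/`output_blocks`) with already-converted diffusers-style keys (`down_blocks`/`mid_block`/`up_blocks`): when both appear, the SGM duplicates are dropped. A minimal sketch of that de-duplication, using a hypothetical two-key state dict:

    # Hypothetical toy state dict; keys mirror the two naming schemes handled above.
    state_dict = {"input_blocks.0.weight": 1, "down_blocks.0.weight": 1}
    sgm = ["input_blocks", "middle_block", "output_blocks"]
    non_sgm = ["down_blocks", "mid_block", "up_blocks"]
    has_sgm = any(any(p in k for p in sgm) for k in state_dict)
    has_non_sgm = any(any(p in k for p in non_sgm) for k in state_dict)
    if has_sgm and has_non_sgm:
        # keep only the diffusers-style entries
        state_dict = {k: v for k, v in state_dict.items() if not any(p in k for p in sgm)}
    # state_dict is now {"down_blocks.0.weight": 1}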
@@ -126,7 +144,7 @@ def _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config, delimiter="_", b
             )
             new_state_dict[new_key] = state_dict.pop(key)

-    if
+    if state_dict:
        raise ValueError("At this point all state dict entries have to be converted.")

    return new_state_dict
@@ -415,7 +433,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
         ait_up_keys = [k + ".lora_B.weight" for k in ait_keys]
         if not is_sparse:
             # down_weight is copied to each split
-            ait_sd.update(
+            ait_sd.update(dict.fromkeys(ait_down_keys, down_weight))

             # up_weight is split to each split
             ait_sd.update({k: v for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))})  # noqa: C416
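The `dict.fromkeys` rewrite in this hunk is behavior-preserving: every down key maps to the same `down_weight` object, exactly as the comprehension it replaces did. A quick equivalence check with placeholder values (not from the diff):

    keys = ["a.lora_A.weight", "b.lora_A.weight"]
    down_weight = object()  # stands in for the shared tensor
    assert dict.fromkeys(keys, down_weight) == {k: down_weight for k in keys}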
@@ -709,8 +727,25 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
        elif k.startswith("lora_te1_"):
            has_te_keys = True
            continue
+        elif k.startswith("lora_transformer_context_embedder"):
+            diffusers_key = "context_embedder"
+        elif k.startswith("lora_transformer_norm_out_linear"):
+            diffusers_key = "norm_out.linear"
+        elif k.startswith("lora_transformer_proj_out"):
+            diffusers_key = "proj_out"
+        elif k.startswith("lora_transformer_x_embedder"):
+            diffusers_key = "x_embedder"
+        elif k.startswith("lora_transformer_time_text_embed_guidance_embedder_linear_"):
+            i = int(k.split("lora_transformer_time_text_embed_guidance_embedder_linear_")[-1])
+            diffusers_key = f"time_text_embed.guidance_embedder.linear_{i}"
+        elif k.startswith("lora_transformer_time_text_embed_text_embedder_linear_"):
+            i = int(k.split("lora_transformer_time_text_embed_text_embedder_linear_")[-1])
+            diffusers_key = f"time_text_embed.text_embedder.linear_{i}"
+        elif k.startswith("lora_transformer_time_text_embed_timestep_embedder_linear_"):
+            i = int(k.split("lora_transformer_time_text_embed_timestep_embedder_linear_")[-1])
+            diffusers_key = f"time_text_embed.timestep_embedder.linear_{i}"
        else:
-            raise NotImplementedError
+            raise NotImplementedError(f"Handling for key ({k}) is not implemented.")

        if "attn_" in k:
            if "_to_out_0" in k:
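Each new branch above folds a flattened `lora_transformer_*` key back into a dotted diffusers module path, preserving the numeric suffix of the embedder layers. For example, applying the same split to a hypothetical key:

    k = "lora_transformer_time_text_embed_guidance_embedder_linear_1"
    i = int(k.split("lora_transformer_time_text_embed_guidance_embedder_linear_")[-1])
    diffusers_key = f"time_text_embed.guidance_embedder.linear_{i}"
    assert diffusers_key == "time_text_embed.guidance_embedder.linear_1"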
@@ -782,7 +817,11 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
    # has both `peft` and non-peft state dict.
    has_peft_state_dict = any(k.startswith("transformer.") for k in state_dict)
    if has_peft_state_dict:
-        state_dict = {
+        state_dict = {
+            k.replace("lora_down.weight", "lora_A.weight").replace("lora_up.weight", "lora_B.weight"): v
+            for k, v in state_dict.items()
+            if k.startswith("transformer.")
+        }
        return state_dict

    # Another weird one.
@@ -801,7 +840,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
    if zero_status_pe:
        logger.info(
            "The `position_embedding` LoRA params are all zeros which make them ineffective. "
-            "So, we will purge them out of the
+            "So, we will purge them out of the current state dict to make loading possible."
        )

    else:
@@ -817,7 +856,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
    if zero_status_t5:
        logger.info(
            "The `t5xxl` LoRA params are all zeros which make them ineffective. "
-            "So, we will purge them out of the
+            "So, we will purge them out of the current state dict to make loading possible."
        )
    else:
        logger.info(
@@ -832,7 +871,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
    if zero_status_diff_b:
        logger.info(
            "The `diff_b` LoRA params are all zeros which make them ineffective. "
-            "So, we will purge them out of the
+            "So, we will purge them out of the current state dict to make loading possible."
        )
    else:
        logger.info(
@@ -848,7 +887,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
    if zero_status_diff:
        logger.info(
            "The `diff` LoRA params are all zeros which make them ineffective. "
-            "So, we will purge them out of the
+            "So, we will purge them out of the current state dict to make loading possible."
        )
    else:
        logger.info(
@@ -905,7 +944,7 @@ def _convert_xlabs_flux_lora_to_diffusers(old_state_dict):
        ait_up_keys = [k + ".lora_B.weight" for k in ait_keys]

        # down_weight is copied to each split
-        ait_sd.update(
+        ait_sd.update(dict.fromkeys(ait_down_keys, down_weight))

        # up_weight is split to each split
        ait_sd.update({k: v for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))})  # noqa: C416
@@ -1219,7 +1258,7 @@ def _convert_bfl_flux_control_lora_to_diffusers(original_state_dict):
            f"double_blocks.{i}.txt_attn.norm.key_norm.scale"
        )

-    # single
+    # single transformer blocks
    for i in range(num_single_layers):
        block_prefix = f"single_transformer_blocks.{i}."

@@ -1311,6 +1350,228 @@ def _convert_bfl_flux_control_lora_to_diffusers(original_state_dict):
    return converted_state_dict


+def _convert_fal_kontext_lora_to_diffusers(original_state_dict):
+    converted_state_dict = {}
+    original_state_dict_keys = list(original_state_dict.keys())
+    num_layers = 19
+    num_single_layers = 38
+    inner_dim = 3072
+    mlp_ratio = 4.0
+
+    # double transformer blocks
+    for i in range(num_layers):
+        block_prefix = f"transformer_blocks.{i}."
+        original_block_prefix = "base_model.model."
+
+        for lora_key in ["lora_A", "lora_B"]:
+            # norms
+            converted_state_dict[f"{block_prefix}norm1.linear.{lora_key}.weight"] = original_state_dict.pop(
+                f"{original_block_prefix}double_blocks.{i}.img_mod.lin.{lora_key}.weight"
+            )
+            if f"double_blocks.{i}.img_mod.lin.{lora_key}.bias" in original_state_dict_keys:
+                converted_state_dict[f"{block_prefix}norm1.linear.{lora_key}.bias"] = original_state_dict.pop(
+                    f"{original_block_prefix}double_blocks.{i}.img_mod.lin.{lora_key}.bias"
+                )
+
+            converted_state_dict[f"{block_prefix}norm1_context.linear.{lora_key}.weight"] = original_state_dict.pop(
+                f"{original_block_prefix}double_blocks.{i}.txt_mod.lin.{lora_key}.weight"
+            )
+
+            # Q, K, V
+            if lora_key == "lora_A":
+                sample_lora_weight = original_state_dict.pop(
+                    f"{original_block_prefix}double_blocks.{i}.img_attn.qkv.{lora_key}.weight"
+                )
+                converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.weight"] = torch.cat([sample_lora_weight])
+                converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.weight"] = torch.cat([sample_lora_weight])
+                converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.weight"] = torch.cat([sample_lora_weight])
+
+                context_lora_weight = original_state_dict.pop(
+                    f"{original_block_prefix}double_blocks.{i}.txt_attn.qkv.{lora_key}.weight"
+                )
+                converted_state_dict[f"{block_prefix}attn.add_q_proj.{lora_key}.weight"] = torch.cat(
+                    [context_lora_weight]
+                )
+                converted_state_dict[f"{block_prefix}attn.add_k_proj.{lora_key}.weight"] = torch.cat(
+                    [context_lora_weight]
+                )
+                converted_state_dict[f"{block_prefix}attn.add_v_proj.{lora_key}.weight"] = torch.cat(
+                    [context_lora_weight]
+                )
+            else:
+                sample_q, sample_k, sample_v = torch.chunk(
+                    original_state_dict.pop(
+                        f"{original_block_prefix}double_blocks.{i}.img_attn.qkv.{lora_key}.weight"
+                    ),
+                    3,
+                    dim=0,
+                )
+                converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.weight"] = torch.cat([sample_q])
+                converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.weight"] = torch.cat([sample_k])
+                converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.weight"] = torch.cat([sample_v])
+
+                context_q, context_k, context_v = torch.chunk(
+                    original_state_dict.pop(
+                        f"{original_block_prefix}double_blocks.{i}.txt_attn.qkv.{lora_key}.weight"
+                    ),
+                    3,
+                    dim=0,
+                )
+                converted_state_dict[f"{block_prefix}attn.add_q_proj.{lora_key}.weight"] = torch.cat([context_q])
+                converted_state_dict[f"{block_prefix}attn.add_k_proj.{lora_key}.weight"] = torch.cat([context_k])
+                converted_state_dict[f"{block_prefix}attn.add_v_proj.{lora_key}.weight"] = torch.cat([context_v])
+
+            if f"double_blocks.{i}.img_attn.qkv.{lora_key}.bias" in original_state_dict_keys:
+                sample_q_bias, sample_k_bias, sample_v_bias = torch.chunk(
+                    original_state_dict.pop(f"{original_block_prefix}double_blocks.{i}.img_attn.qkv.{lora_key}.bias"),
+                    3,
+                    dim=0,
+                )
+                converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.bias"] = torch.cat([sample_q_bias])
+                converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.bias"] = torch.cat([sample_k_bias])
+                converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.bias"] = torch.cat([sample_v_bias])
+
+            if f"double_blocks.{i}.txt_attn.qkv.{lora_key}.bias" in original_state_dict_keys:
+                context_q_bias, context_k_bias, context_v_bias = torch.chunk(
+                    original_state_dict.pop(f"{original_block_prefix}double_blocks.{i}.txt_attn.qkv.{lora_key}.bias"),
+                    3,
+                    dim=0,
+                )
+                converted_state_dict[f"{block_prefix}attn.add_q_proj.{lora_key}.bias"] = torch.cat([context_q_bias])
+                converted_state_dict[f"{block_prefix}attn.add_k_proj.{lora_key}.bias"] = torch.cat([context_k_bias])
+                converted_state_dict[f"{block_prefix}attn.add_v_proj.{lora_key}.bias"] = torch.cat([context_v_bias])
+
+            # ff img_mlp
+            converted_state_dict[f"{block_prefix}ff.net.0.proj.{lora_key}.weight"] = original_state_dict.pop(
+                f"{original_block_prefix}double_blocks.{i}.img_mlp.0.{lora_key}.weight"
+            )
+            if f"{original_block_prefix}double_blocks.{i}.img_mlp.0.{lora_key}.bias" in original_state_dict_keys:
+                converted_state_dict[f"{block_prefix}ff.net.0.proj.{lora_key}.bias"] = original_state_dict.pop(
+                    f"{original_block_prefix}double_blocks.{i}.img_mlp.0.{lora_key}.bias"
+                )
+
+            converted_state_dict[f"{block_prefix}ff.net.2.{lora_key}.weight"] = original_state_dict.pop(
+                f"{original_block_prefix}double_blocks.{i}.img_mlp.2.{lora_key}.weight"
+            )
+            if f"{original_block_prefix}double_blocks.{i}.img_mlp.2.{lora_key}.bias" in original_state_dict_keys:
+                converted_state_dict[f"{block_prefix}ff.net.2.{lora_key}.bias"] = original_state_dict.pop(
+                    f"{original_block_prefix}double_blocks.{i}.img_mlp.2.{lora_key}.bias"
+                )
+
+            converted_state_dict[f"{block_prefix}ff_context.net.0.proj.{lora_key}.weight"] = original_state_dict.pop(
+                f"{original_block_prefix}double_blocks.{i}.txt_mlp.0.{lora_key}.weight"
+            )
+            if f"{original_block_prefix}double_blocks.{i}.txt_mlp.0.{lora_key}.bias" in original_state_dict_keys:
+                converted_state_dict[f"{block_prefix}ff_context.net.0.proj.{lora_key}.bias"] = original_state_dict.pop(
+                    f"{original_block_prefix}double_blocks.{i}.txt_mlp.0.{lora_key}.bias"
+                )
+
+            converted_state_dict[f"{block_prefix}ff_context.net.2.{lora_key}.weight"] = original_state_dict.pop(
+                f"{original_block_prefix}double_blocks.{i}.txt_mlp.2.{lora_key}.weight"
+            )
+            if f"{original_block_prefix}double_blocks.{i}.txt_mlp.2.{lora_key}.bias" in original_state_dict_keys:
+                converted_state_dict[f"{block_prefix}ff_context.net.2.{lora_key}.bias"] = original_state_dict.pop(
+                    f"{original_block_prefix}double_blocks.{i}.txt_mlp.2.{lora_key}.bias"
+                )
+
+            # output projections.
+            converted_state_dict[f"{block_prefix}attn.to_out.0.{lora_key}.weight"] = original_state_dict.pop(
+                f"{original_block_prefix}double_blocks.{i}.img_attn.proj.{lora_key}.weight"
+            )
+            if f"{original_block_prefix}double_blocks.{i}.img_attn.proj.{lora_key}.bias" in original_state_dict_keys:
+                converted_state_dict[f"{block_prefix}attn.to_out.0.{lora_key}.bias"] = original_state_dict.pop(
+                    f"{original_block_prefix}double_blocks.{i}.img_attn.proj.{lora_key}.bias"
+                )
+            converted_state_dict[f"{block_prefix}attn.to_add_out.{lora_key}.weight"] = original_state_dict.pop(
+                f"{original_block_prefix}double_blocks.{i}.txt_attn.proj.{lora_key}.weight"
+            )
+            if f"{original_block_prefix}double_blocks.{i}.txt_attn.proj.{lora_key}.bias" in original_state_dict_keys:
+                converted_state_dict[f"{block_prefix}attn.to_add_out.{lora_key}.bias"] = original_state_dict.pop(
+                    f"{original_block_prefix}double_blocks.{i}.txt_attn.proj.{lora_key}.bias"
+                )
+
+    # single transformer blocks
+    for i in range(num_single_layers):
+        block_prefix = f"single_transformer_blocks.{i}."
+
+        for lora_key in ["lora_A", "lora_B"]:
+            # norm.linear  <- single_blocks.0.modulation.lin
+            converted_state_dict[f"{block_prefix}norm.linear.{lora_key}.weight"] = original_state_dict.pop(
+                f"{original_block_prefix}single_blocks.{i}.modulation.lin.{lora_key}.weight"
+            )
+            if f"{original_block_prefix}single_blocks.{i}.modulation.lin.{lora_key}.bias" in original_state_dict_keys:
+                converted_state_dict[f"{block_prefix}norm.linear.{lora_key}.bias"] = original_state_dict.pop(
+                    f"{original_block_prefix}single_blocks.{i}.modulation.lin.{lora_key}.bias"
+                )
+
+            # Q, K, V, mlp
+            mlp_hidden_dim = int(inner_dim * mlp_ratio)
+            split_size = (inner_dim, inner_dim, inner_dim, mlp_hidden_dim)
+
+            if lora_key == "lora_A":
+                lora_weight = original_state_dict.pop(
+                    f"{original_block_prefix}single_blocks.{i}.linear1.{lora_key}.weight"
+                )
+                converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.weight"] = torch.cat([lora_weight])
+                converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.weight"] = torch.cat([lora_weight])
+                converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.weight"] = torch.cat([lora_weight])
+                converted_state_dict[f"{block_prefix}proj_mlp.{lora_key}.weight"] = torch.cat([lora_weight])
+
+                if f"{original_block_prefix}single_blocks.{i}.linear1.{lora_key}.bias" in original_state_dict_keys:
+                    lora_bias = original_state_dict.pop(f"single_blocks.{i}.linear1.{lora_key}.bias")
+                    converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.bias"] = torch.cat([lora_bias])
+                    converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.bias"] = torch.cat([lora_bias])
+                    converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.bias"] = torch.cat([lora_bias])
+                    converted_state_dict[f"{block_prefix}proj_mlp.{lora_key}.bias"] = torch.cat([lora_bias])
+            else:
+                q, k, v, mlp = torch.split(
+                    original_state_dict.pop(f"{original_block_prefix}single_blocks.{i}.linear1.{lora_key}.weight"),
+                    split_size,
+                    dim=0,
+                )
+                converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.weight"] = torch.cat([q])
+                converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.weight"] = torch.cat([k])
+                converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.weight"] = torch.cat([v])
+                converted_state_dict[f"{block_prefix}proj_mlp.{lora_key}.weight"] = torch.cat([mlp])
+
+                if f"{original_block_prefix}single_blocks.{i}.linear1.{lora_key}.bias" in original_state_dict_keys:
+                    q_bias, k_bias, v_bias, mlp_bias = torch.split(
+                        original_state_dict.pop(f"{original_block_prefix}single_blocks.{i}.linear1.{lora_key}.bias"),
+                        split_size,
+                        dim=0,
+                    )
+                    converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.bias"] = torch.cat([q_bias])
+                    converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.bias"] = torch.cat([k_bias])
+                    converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.bias"] = torch.cat([v_bias])
+                    converted_state_dict[f"{block_prefix}proj_mlp.{lora_key}.bias"] = torch.cat([mlp_bias])
+
+            # output projections.
+            converted_state_dict[f"{block_prefix}proj_out.{lora_key}.weight"] = original_state_dict.pop(
+                f"{original_block_prefix}single_blocks.{i}.linear2.{lora_key}.weight"
+            )
+            if f"{original_block_prefix}single_blocks.{i}.linear2.{lora_key}.bias" in original_state_dict_keys:
+                converted_state_dict[f"{block_prefix}proj_out.{lora_key}.bias"] = original_state_dict.pop(
+                    f"{original_block_prefix}single_blocks.{i}.linear2.{lora_key}.bias"
+                )
+
+    for lora_key in ["lora_A", "lora_B"]:
+        converted_state_dict[f"proj_out.{lora_key}.weight"] = original_state_dict.pop(
+            f"{original_block_prefix}final_layer.linear.{lora_key}.weight"
+        )
+        if f"{original_block_prefix}final_layer.linear.{lora_key}.bias" in original_state_dict_keys:
+            converted_state_dict[f"proj_out.{lora_key}.bias"] = original_state_dict.pop(
+                f"{original_block_prefix}final_layer.linear.{lora_key}.bias"
+            )
+
+    if len(original_state_dict) > 0:
+        raise ValueError(f"`original_state_dict` should be empty at this point but has {original_state_dict.keys()=}.")
+
+    for key in list(converted_state_dict.keys()):
+        converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)
+
+    return converted_state_dict
+
+
 def _convert_hunyuan_video_lora_to_diffusers(original_state_dict):
    converted_state_dict = {k: original_state_dict.pop(k) for k in list(original_state_dict.keys())}

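The new `_convert_fal_kontext_lora_to_diffusers` walks the 19 double and 38 single blocks, pops each `base_model.model.*` entry, and re-emits it under the diffusers `transformer.*` namespace; fused QKV `lora_B` weights are split per projection with `torch.chunk`/`torch.split`, while `lora_A` weights are shared across the projections. A toy illustration of the `lora_B` split (shapes invented for the example; the converter above assumes `inner_dim = 3072`):

    import torch

    rank, inner_dim = 4, 6  # toy sizes for illustration only
    fused_lora_B = torch.randn(3 * inner_dim, rank)  # fused img_attn.qkv lora_B
    q, k, v = torch.chunk(fused_lora_B, 3, dim=0)    # one (inner_dim, rank) slice per projection
    assert q.shape == (inner_dim, rank)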
@@ -1561,45 +1822,286 @@ def _convert_non_diffusers_wan_lora_to_diffusers(state_dict):
|
|
1561
1822
|
converted_state_dict = {}
|
1562
1823
|
original_state_dict = {k[len("diffusion_model.") :]: v for k, v in state_dict.items()}
|
1563
1824
|
|
1564
|
-
|
1825
|
+
block_numbers = {int(k.split(".")[1]) for k in original_state_dict if k.startswith("blocks.")}
|
1826
|
+
min_block = min(block_numbers)
|
1827
|
+
max_block = max(block_numbers)
|
1828
|
+
|
1565
1829
|
is_i2v_lora = any("k_img" in k for k in original_state_dict) and any("v_img" in k for k in original_state_dict)
|
1830
|
+
lora_down_key = "lora_A" if any("lora_A" in k for k in original_state_dict) else "lora_down"
|
1831
|
+
lora_up_key = "lora_B" if any("lora_B" in k for k in original_state_dict) else "lora_up"
|
1832
|
+
has_time_projection_weight = any(
|
1833
|
+
k.startswith("time_projection") and k.endswith(".weight") for k in original_state_dict
|
1834
|
+
)
|
1566
1835
|
|
1567
|
-
|
1836
|
+
def get_alpha_scales(down_weight, alpha_key):
|
1837
|
+
rank = down_weight.shape[0]
|
1838
|
+
alpha = original_state_dict.pop(alpha_key).item()
|
1839
|
+
scale = alpha / rank # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
|
1840
|
+
scale_down = scale
|
1841
|
+
scale_up = 1.0
|
1842
|
+
while scale_down * 2 < scale_up:
|
1843
|
+
scale_down *= 2
|
1844
|
+
scale_up /= 2
|
1845
|
+
return scale_down, scale_up
|
1846
|
+
|
+    for key in list(original_state_dict.keys()):
+        if key.endswith((".diff", ".diff_b")) and "norm" in key:
+            # NOTE: we don't support this because norm layer diff keys are just zeroed values. We can support it
+            # in future if needed and they are not zeroed.
+            original_state_dict.pop(key)
+            logger.debug(f"Removing {key} key from the state dict as it is a norm diff key. This is unsupported.")
+
+        if "time_projection" in key and not has_time_projection_weight:
+            # AccVideo lora has diff bias keys but not the weight keys. This causes a weird problem where
+            # our lora config adds the time proj lora layers, but we don't have the weights for them.
+            # CausVid lora has the weight keys and the bias keys.
+            original_state_dict.pop(key)
+
+    # For the `diff_b` keys, we treat them as lora_bias.
+    # https://huggingface.co/docs/peft/main/en/package_reference/lora#peft.LoraConfig.lora_bias
+
+    for i in range(min_block, max_block + 1):
         # Self-attention
         for o, c in zip(["q", "k", "v", "o"], ["to_q", "to_k", "to_v", "to_out.0"]):
-            converted_state_dict[f"blocks.{i}.attn1.{c}.lora_A.weight"] = original_state_dict.pop(
-                f"blocks.{i}.self_attn.{o}.lora_A.weight"
-            )
-            converted_state_dict[f"blocks.{i}.attn1.{c}.lora_B.weight"] = original_state_dict.pop(
-                f"blocks.{i}.self_attn.{o}.lora_B.weight"
-            )
+            alpha_key = f"blocks.{i}.self_attn.{o}.alpha"
+            has_alpha = alpha_key in original_state_dict
+            original_key_A = f"blocks.{i}.self_attn.{o}.{lora_down_key}.weight"
+            converted_key_A = f"blocks.{i}.attn1.{c}.lora_A.weight"
+
+            original_key_B = f"blocks.{i}.self_attn.{o}.{lora_up_key}.weight"
+            converted_key_B = f"blocks.{i}.attn1.{c}.lora_B.weight"
+
+            if has_alpha:
+                down_weight = original_state_dict.pop(original_key_A)
+                up_weight = original_state_dict.pop(original_key_B)
+                scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
+                converted_state_dict[converted_key_A] = down_weight * scale_down
+                converted_state_dict[converted_key_B] = up_weight * scale_up
+
+            else:
+                if original_key_A in original_state_dict:
+                    converted_state_dict[converted_key_A] = original_state_dict.pop(original_key_A)
+                if original_key_B in original_state_dict:
+                    converted_state_dict[converted_key_B] = original_state_dict.pop(original_key_B)
+
+            original_key = f"blocks.{i}.self_attn.{o}.diff_b"
+            converted_key = f"blocks.{i}.attn1.{c}.lora_B.bias"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
 
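Note: for one self-attention projection the loop above performs the following renames, with `diff_b` bias offsets landing on PEFT's `lora_B.bias` slot (block index and tensor labels shown for illustration):

```python
# Before/after for block 0, o="q", c="to_q" (tensors replaced by labels).
source = {
    "blocks.0.self_attn.q.lora_down.weight": "A",  # -> lora_A
    "blocks.0.self_attn.q.lora_up.weight": "B",    # -> lora_B
    "blocks.0.self_attn.q.diff_b": "bias",         # -> lora_B.bias
}
expected = {
    "blocks.0.attn1.to_q.lora_A.weight": "A",
    "blocks.0.attn1.to_q.lora_B.weight": "B",
    "blocks.0.attn1.to_q.lora_B.bias": "bias",
}
```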
         # Cross-attention
         for o, c in zip(["q", "k", "v", "o"], ["to_q", "to_k", "to_v", "to_out.0"]):
-            converted_state_dict[f"blocks.{i}.attn2.{c}.lora_A.weight"] = original_state_dict.pop(
-                f"blocks.{i}.cross_attn.{o}.lora_A.weight"
-            )
-            converted_state_dict[f"blocks.{i}.attn2.{c}.lora_B.weight"] = original_state_dict.pop(
-                f"blocks.{i}.cross_attn.{o}.lora_B.weight"
-            )
+            alpha_key = f"blocks.{i}.cross_attn.{o}.alpha"
+            has_alpha = alpha_key in original_state_dict
+            original_key_A = f"blocks.{i}.cross_attn.{o}.{lora_down_key}.weight"
+            converted_key_A = f"blocks.{i}.attn2.{c}.lora_A.weight"
+
+            original_key_B = f"blocks.{i}.cross_attn.{o}.{lora_up_key}.weight"
+            converted_key_B = f"blocks.{i}.attn2.{c}.lora_B.weight"
+
+            if original_key_A in original_state_dict:
+                down_weight = original_state_dict.pop(original_key_A)
+                converted_state_dict[converted_key_A] = down_weight
+            if original_key_B in original_state_dict:
+                up_weight = original_state_dict.pop(original_key_B)
+                converted_state_dict[converted_key_B] = up_weight
+            if has_alpha:
+                scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
+                converted_state_dict[converted_key_A] *= scale_down
+                converted_state_dict[converted_key_B] *= scale_up
+
+            original_key = f"blocks.{i}.cross_attn.{o}.diff_b"
+            converted_key = f"blocks.{i}.attn2.{c}.lora_B.bias"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
 
         if is_i2v_lora:
             for o, c in zip(["k_img", "v_img"], ["add_k_proj", "add_v_proj"]):
-                converted_state_dict[f"blocks.{i}.attn2.{c}.lora_A.weight"] = original_state_dict.pop(
-                    f"blocks.{i}.cross_attn.{o}.lora_A.weight"
-                )
-                converted_state_dict[f"blocks.{i}.attn2.{c}.lora_B.weight"] = original_state_dict.pop(
-                    f"blocks.{i}.cross_attn.{o}.lora_B.weight"
-                )
+                alpha_key = f"blocks.{i}.cross_attn.{o}.alpha"
+                has_alpha = alpha_key in original_state_dict
+                original_key_A = f"blocks.{i}.cross_attn.{o}.{lora_down_key}.weight"
+                converted_key_A = f"blocks.{i}.attn2.{c}.lora_A.weight"
+
+                original_key_B = f"blocks.{i}.cross_attn.{o}.{lora_up_key}.weight"
+                converted_key_B = f"blocks.{i}.attn2.{c}.lora_B.weight"
+
+                if original_key_A in original_state_dict:
+                    down_weight = original_state_dict.pop(original_key_A)
+                    converted_state_dict[converted_key_A] = down_weight
+                if original_key_B in original_state_dict:
+                    up_weight = original_state_dict.pop(original_key_B)
+                    converted_state_dict[converted_key_B] = up_weight
+                if has_alpha:
+                    scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
+                    converted_state_dict[converted_key_A] *= scale_down
+                    converted_state_dict[converted_key_B] *= scale_up
+
+                original_key = f"blocks.{i}.cross_attn.{o}.diff_b"
+                converted_key = f"blocks.{i}.attn2.{c}.lora_B.bias"
+                if original_key in original_state_dict:
+                    converted_state_dict[converted_key] = original_state_dict.pop(original_key)
 
         # FFN
         for o, c in zip(["ffn.0", "ffn.2"], ["net.0.proj", "net.2"]):
-            converted_state_dict[f"blocks.{i}.ffn.{c}.lora_A.weight"] = original_state_dict.pop(
-                f"blocks.{i}.{o}.lora_A.weight"
+            alpha_key = f"blocks.{i}.{o}.alpha"
+            has_alpha = alpha_key in original_state_dict
+            original_key_A = f"blocks.{i}.{o}.{lora_down_key}.weight"
+            converted_key_A = f"blocks.{i}.ffn.{c}.lora_A.weight"
+
+            original_key_B = f"blocks.{i}.{o}.{lora_up_key}.weight"
+            converted_key_B = f"blocks.{i}.ffn.{c}.lora_B.weight"
+
+            if original_key_A in original_state_dict:
+                down_weight = original_state_dict.pop(original_key_A)
+                converted_state_dict[converted_key_A] = down_weight
+            if original_key_B in original_state_dict:
+                up_weight = original_state_dict.pop(original_key_B)
+                converted_state_dict[converted_key_B] = up_weight
+            if has_alpha:
+                scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
+                converted_state_dict[converted_key_A] *= scale_down
+                converted_state_dict[converted_key_B] *= scale_up
+
+            original_key = f"blocks.{i}.{o}.diff_b"
+            converted_key = f"blocks.{i}.ffn.{c}.lora_B.bias"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
+
+    # Remaining.
+    if original_state_dict:
+        if any("time_projection" in k for k in original_state_dict):
+            original_key = f"time_projection.1.{lora_down_key}.weight"
+            converted_key = "condition_embedder.time_proj.lora_A.weight"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
+
+            original_key = f"time_projection.1.{lora_up_key}.weight"
+            converted_key = "condition_embedder.time_proj.lora_B.weight"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
+
+            if "time_projection.1.diff_b" in original_state_dict:
+                converted_state_dict["condition_embedder.time_proj.lora_B.bias"] = original_state_dict.pop(
+                    "time_projection.1.diff_b"
+                )
+
+        if any("head.head" in k for k in state_dict):
+            converted_state_dict["proj_out.lora_A.weight"] = original_state_dict.pop(
+                f"head.head.{lora_down_key}.weight"
             )
-            converted_state_dict[f"blocks.{i}.ffn.{c}.lora_B.weight"] = original_state_dict.pop(
-                f"blocks.{i}.{o}.lora_B.weight"
+            converted_state_dict["proj_out.lora_B.weight"] = original_state_dict.pop(f"head.head.{lora_up_key}.weight")
+            if "head.head.diff_b" in original_state_dict:
+                converted_state_dict["proj_out.lora_B.bias"] = original_state_dict.pop("head.head.diff_b")
+
+        for text_time in ["text_embedding", "time_embedding"]:
+            if any(text_time in k for k in original_state_dict):
+                for b_n in [0, 2]:
+                    diffusers_b_n = 1 if b_n == 0 else 2
+                    diffusers_name = (
+                        "condition_embedder.text_embedder"
+                        if text_time == "text_embedding"
+                        else "condition_embedder.time_embedder"
+                    )
+                    if any(f"{text_time}.{b_n}" in k for k in original_state_dict):
+                        converted_state_dict[f"{diffusers_name}.linear_{diffusers_b_n}.lora_A.weight"] = (
+                            original_state_dict.pop(f"{text_time}.{b_n}.{lora_down_key}.weight")
+                        )
+                        converted_state_dict[f"{diffusers_name}.linear_{diffusers_b_n}.lora_B.weight"] = (
+                            original_state_dict.pop(f"{text_time}.{b_n}.{lora_up_key}.weight")
+                        )
+                        if f"{text_time}.{b_n}.diff_b" in original_state_dict:
+                            converted_state_dict[f"{diffusers_name}.linear_{diffusers_b_n}.lora_B.bias"] = (
+                                original_state_dict.pop(f"{text_time}.{b_n}.diff_b")
+                            )
+
+        for img_ours, img_theirs in [
+            ("ff.net.0.proj", "img_emb.proj.1"),
+            ("ff.net.2", "img_emb.proj.3"),
+        ]:
+            original_key = f"{img_theirs}.{lora_down_key}.weight"
+            converted_key = f"condition_embedder.image_embedder.{img_ours}.lora_A.weight"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
+
+            original_key = f"{img_theirs}.{lora_up_key}.weight"
+            converted_key = f"condition_embedder.image_embedder.{img_ours}.lora_B.weight"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
+            bias_key_theirs = original_key.removesuffix(f".{lora_up_key}.weight") + ".diff_b"
+            if bias_key_theirs in original_state_dict:
+                bias_key = converted_key.removesuffix(".weight") + ".bias"
+                converted_state_dict[bias_key] = original_state_dict.pop(bias_key_theirs)
+
+    if len(original_state_dict) > 0:
+        diff = all(".diff" in k for k in original_state_dict)
+        if diff:
+            diff_keys = {k for k in original_state_dict if k.endswith(".diff")}
+            if not all("lora" not in k for k in diff_keys):
+                raise ValueError
+            logger.info(
+                "The remaining `state_dict` contains `diff` keys which we do not handle yet. If you see performance issues, please file an issue: "
+                "https://github.com/huggingface/diffusers//issues/new"
             )
+        else:
+            raise ValueError(f"`state_dict` should be empty at this point but has {original_state_dict.keys()=}")
+
+    for key in list(converted_state_dict.keys()):
+        converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)
+
+    return converted_state_dict
+
+
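Note: these converters are dispatched automatically by the LoRA loader; loading a non-diffusers Wan checkpoint looks roughly like the sketch below (repository ids are illustrative, not from the diff):

```python
# Rough usage sketch; assumes a Wan pipeline checkpoint and an illustrative LoRA repo.
import torch
from diffusers import WanPipeline

pipe = WanPipeline.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers", torch_dtype=torch.bfloat16)
# A checkpoint whose keys start with "diffusion_model." is detected and routed
# through _convert_non_diffusers_wan_lora_to_diffusers before being applied.
pipe.load_lora_weights("some-user/wan-causvid-lora", weight_name="lora.safetensors")
```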
+def _convert_musubi_wan_lora_to_diffusers(state_dict):
+    # https://github.com/kohya-ss/musubi-tuner
+    converted_state_dict = {}
+    original_state_dict = {k[len("lora_unet_") :]: v for k, v in state_dict.items()}
+
+    num_blocks = len({k.split("blocks_")[1].split("_")[0] for k in original_state_dict})
+    is_i2v_lora = any("k_img" in k for k in original_state_dict) and any("v_img" in k for k in original_state_dict)
+
+    def get_alpha_scales(down_weight, key):
+        rank = down_weight.shape[0]
+        alpha = original_state_dict.pop(key + ".alpha").item()
+        scale = alpha / rank  # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
+        scale_down = scale
+        scale_up = 1.0
+        while scale_down * 2 < scale_up:
+            scale_down *= 2
+            scale_up /= 2
+        return scale_down, scale_up
+
+    for i in range(num_blocks):
+        # Self-attention
+        for o, c in zip(["q", "k", "v", "o"], ["to_q", "to_k", "to_v", "to_out.0"]):
+            down_weight = original_state_dict.pop(f"blocks_{i}_self_attn_{o}.lora_down.weight")
+            up_weight = original_state_dict.pop(f"blocks_{i}_self_attn_{o}.lora_up.weight")
+            scale_down, scale_up = get_alpha_scales(down_weight, f"blocks_{i}_self_attn_{o}")
+            converted_state_dict[f"blocks.{i}.attn1.{c}.lora_A.weight"] = down_weight * scale_down
+            converted_state_dict[f"blocks.{i}.attn1.{c}.lora_B.weight"] = up_weight * scale_up
+
+        # Cross-attention
+        for o, c in zip(["q", "k", "v", "o"], ["to_q", "to_k", "to_v", "to_out.0"]):
+            down_weight = original_state_dict.pop(f"blocks_{i}_cross_attn_{o}.lora_down.weight")
+            up_weight = original_state_dict.pop(f"blocks_{i}_cross_attn_{o}.lora_up.weight")
+            scale_down, scale_up = get_alpha_scales(down_weight, f"blocks_{i}_cross_attn_{o}")
+            converted_state_dict[f"blocks.{i}.attn2.{c}.lora_A.weight"] = down_weight * scale_down
+            converted_state_dict[f"blocks.{i}.attn2.{c}.lora_B.weight"] = up_weight * scale_up
+
+        if is_i2v_lora:
+            for o, c in zip(["k_img", "v_img"], ["add_k_proj", "add_v_proj"]):
+                down_weight = original_state_dict.pop(f"blocks_{i}_cross_attn_{o}.lora_down.weight")
+                up_weight = original_state_dict.pop(f"blocks_{i}_cross_attn_{o}.lora_up.weight")
+                scale_down, scale_up = get_alpha_scales(down_weight, f"blocks_{i}_cross_attn_{o}")
+                converted_state_dict[f"blocks.{i}.attn2.{c}.lora_A.weight"] = down_weight * scale_down
+                converted_state_dict[f"blocks.{i}.attn2.{c}.lora_B.weight"] = up_weight * scale_up
+
+        # FFN
+        for o, c in zip(["ffn_0", "ffn_2"], ["net.0.proj", "net.2"]):
+            down_weight = original_state_dict.pop(f"blocks_{i}_{o}.lora_down.weight")
+            up_weight = original_state_dict.pop(f"blocks_{i}_{o}.lora_up.weight")
+            scale_down, scale_up = get_alpha_scales(down_weight, f"blocks_{i}_{o}")
+            converted_state_dict[f"blocks.{i}.ffn.{c}.lora_A.weight"] = down_weight * scale_down
+            converted_state_dict[f"blocks.{i}.ffn.{c}.lora_B.weight"] = up_weight * scale_up
 
     if len(original_state_dict) > 0:
         raise ValueError(f"`state_dict` should be empty at this point but has {original_state_dict.keys()=}")
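Note: musubi-tuner flattens module paths with underscores and always ships `alpha` keys, so this converter both re-dots the path and folds the alpha into the weights. For one projection (block 0, self-attention `q`; keys shown for illustration):

```python
# musubi-tuner -> diffusers, block 0, self-attention "q":
#   lora_unet_blocks_0_self_attn_q.lora_down.weight -> blocks.0.attn1.to_q.lora_A.weight  (* scale_down)
#   lora_unet_blocks_0_self_attn_q.lora_up.weight   -> blocks.0.attn1.to_q.lora_B.weight  (* scale_up)
#   lora_unet_blocks_0_self_attn_q.alpha            -> consumed by get_alpha_scales
```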
@@ -1608,3 +2110,123 @@ def _convert_non_diffusers_wan_lora_to_diffusers(state_dict):
         converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)
 
     return converted_state_dict
+
+
+def _convert_non_diffusers_hidream_lora_to_diffusers(state_dict, non_diffusers_prefix="diffusion_model"):
+    if not all(k.startswith(non_diffusers_prefix) for k in state_dict):
+        raise ValueError("Invalid LoRA state dict for HiDream.")
+    converted_state_dict = {k.removeprefix(f"{non_diffusers_prefix}."): v for k, v in state_dict.items()}
+    converted_state_dict = {f"transformer.{k}": v for k, v in converted_state_dict.items()}
+    return converted_state_dict
+
+
+def _convert_non_diffusers_ltxv_lora_to_diffusers(state_dict, non_diffusers_prefix="diffusion_model"):
+    if not all(k.startswith(f"{non_diffusers_prefix}.") for k in state_dict):
+        raise ValueError("Invalid LoRA state dict for LTX-Video.")
+    converted_state_dict = {k.removeprefix(f"{non_diffusers_prefix}."): v for k, v in state_dict.items()}
+    converted_state_dict = {f"transformer.{k}": v for k, v in converted_state_dict.items()}
+    return converted_state_dict
+
+
|
+
def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict):
|
2132
|
+
has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict)
|
2133
|
+
if has_lora_unet:
|
2134
|
+
state_dict = {k.removeprefix("lora_unet_"): v for k, v in state_dict.items()}
|
2135
|
+
|
2136
|
+
def convert_key(key: str) -> str:
|
2137
|
+
prefix = "transformer_blocks"
|
2138
|
+
if "." in key:
|
2139
|
+
base, suffix = key.rsplit(".", 1)
|
2140
|
+
else:
|
2141
|
+
base, suffix = key, ""
|
2142
|
+
|
2143
|
+
start = f"{prefix}_"
|
2144
|
+
rest = base[len(start) :]
|
2145
|
+
|
2146
|
+
if "." in rest:
|
2147
|
+
head, tail = rest.split(".", 1)
|
2148
|
+
tail = "." + tail
|
2149
|
+
else:
|
2150
|
+
head, tail = rest, ""
|
2151
|
+
|
2152
|
+
# Protected n-grams that must keep their internal underscores
|
2153
|
+
protected = {
|
2154
|
+
# pairs
|
2155
|
+
("to", "q"),
|
2156
|
+
("to", "k"),
|
2157
|
+
("to", "v"),
|
2158
|
+
("to", "out"),
|
2159
|
+
("add", "q"),
|
2160
|
+
("add", "k"),
|
2161
|
+
("add", "v"),
|
2162
|
+
("txt", "mlp"),
|
2163
|
+
("img", "mlp"),
|
2164
|
+
("txt", "mod"),
|
2165
|
+
("img", "mod"),
|
2166
|
+
# triplets
|
2167
|
+
("add", "q", "proj"),
|
2168
|
+
("add", "k", "proj"),
|
2169
|
+
("add", "v", "proj"),
|
2170
|
+
("to", "add", "out"),
|
2171
|
+
}
|
2172
|
+
|
2173
|
+
prot_by_len = {}
|
2174
|
+
for ng in protected:
|
2175
|
+
prot_by_len.setdefault(len(ng), set()).add(ng)
|
2176
|
+
|
2177
|
+
parts = head.split("_")
|
2178
|
+
merged = []
|
2179
|
+
i = 0
|
2180
|
+
lengths_desc = sorted(prot_by_len.keys(), reverse=True)
|
2181
|
+
|
2182
|
+
while i < len(parts):
|
2183
|
+
matched = False
|
2184
|
+
for L in lengths_desc:
|
2185
|
+
if i + L <= len(parts) and tuple(parts[i : i + L]) in prot_by_len[L]:
|
2186
|
+
merged.append("_".join(parts[i : i + L]))
|
2187
|
+
i += L
|
2188
|
+
matched = True
|
2189
|
+
break
|
2190
|
+
if not matched:
|
2191
|
+
merged.append(parts[i])
|
2192
|
+
i += 1
|
2193
|
+
|
2194
|
+
head_converted = ".".join(merged)
|
2195
|
+
converted_base = f"{prefix}.{head_converted}{tail}"
|
2196
|
+
return converted_base + (("." + suffix) if suffix else "")
|
2197
|
+
|
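Note: `convert_key` re-dots the flattened module path while keeping multi-word leaf names intact via the protected n-grams. A worked example traced by hand through the merging loop above:

```python
# key  = "transformer_blocks_0_attn_to_q.lora_down.weight"
#   base = "transformer_blocks_0_attn_to_q.lora_down", suffix = "weight"
#   head = "0_attn_to_q", tail = ".lora_down"
#   parts ["0", "attn", "to", "q"] -> merged ["0", "attn", "to_q"]  (("to", "q") is protected)
# result: "transformer_blocks.0.attn.to_q.lora_down.weight"
```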
+    state_dict = {convert_key(k): v for k, v in state_dict.items()}
+
+    converted_state_dict = {}
+    all_keys = list(state_dict.keys())
+    down_key = ".lora_down.weight"
+    up_key = ".lora_up.weight"
+
+    def get_alpha_scales(down_weight, alpha_key):
+        rank = down_weight.shape[0]
+        alpha = state_dict.pop(alpha_key).item()
+        scale = alpha / rank  # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
+        scale_down = scale
+        scale_up = 1.0
+        while scale_down * 2 < scale_up:
+            scale_down *= 2
+            scale_up /= 2
+        return scale_down, scale_up
+
+    for k in all_keys:
+        if k.endswith(down_key):
+            diffusers_down_key = k.replace(down_key, ".lora_A.weight")
+            diffusers_up_key = k.replace(down_key, up_key).replace(up_key, ".lora_B.weight")
+            alpha_key = k.replace(down_key, ".alpha")
+
+            down_weight = state_dict.pop(k)
+            up_weight = state_dict.pop(k.replace(down_key, up_key))
+            scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
+            converted_state_dict[diffusers_down_key] = down_weight * scale_down
+            converted_state_dict[diffusers_up_key] = up_weight * scale_up
+
+    if len(state_dict) > 0:
+        raise ValueError(f"`state_dict` should be empty at this point but has {state_dict.keys()=}")
+
+    converted_state_dict = {f"transformer.{k}": v for k, v in converted_state_dict.items()}
+    return converted_state_dict
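Note: putting the pieces together for a single Qwen key (illustrative; assumes a `lora_unet_`-prefixed musubi-style checkpoint):

```python
# "lora_unet_transformer_blocks_0_attn_to_q.lora_down.weight"
#   strip prefix -> "transformer_blocks_0_attn_to_q.lora_down.weight"
#   convert_key  -> "transformer_blocks.0.attn.to_q.lora_down.weight"
#   pairing loop -> "transformer_blocks.0.attn.to_q.lora_A.weight"  (scaled by scale_down)
#   final prefix -> "transformer.transformer_blocks.0.attn.to_q.lora_A.weight"
```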