InvokeAI 6.9.0rc3__py3-none-any.whl → 6.10.0rc1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
- invokeai/app/api/dependencies.py +2 -0
- invokeai/app/api/routers/model_manager.py +91 -2
- invokeai/app/api/routers/workflows.py +9 -0
- invokeai/app/invocations/fields.py +19 -0
- invokeai/app/invocations/image_to_latents.py +23 -5
- invokeai/app/invocations/latents_to_image.py +2 -25
- invokeai/app/invocations/metadata.py +9 -1
- invokeai/app/invocations/model.py +8 -0
- invokeai/app/invocations/primitives.py +12 -0
- invokeai/app/invocations/prompt_template.py +57 -0
- invokeai/app/invocations/z_image_control.py +112 -0
- invokeai/app/invocations/z_image_denoise.py +610 -0
- invokeai/app/invocations/z_image_image_to_latents.py +102 -0
- invokeai/app/invocations/z_image_latents_to_image.py +103 -0
- invokeai/app/invocations/z_image_lora_loader.py +153 -0
- invokeai/app/invocations/z_image_model_loader.py +135 -0
- invokeai/app/invocations/z_image_text_encoder.py +197 -0
- invokeai/app/services/model_install/model_install_common.py +14 -1
- invokeai/app/services/model_install/model_install_default.py +119 -19
- invokeai/app/services/model_records/model_records_base.py +12 -0
- invokeai/app/services/model_records/model_records_sql.py +17 -0
- invokeai/app/services/shared/graph.py +132 -77
- invokeai/app/services/workflow_records/workflow_records_base.py +8 -0
- invokeai/app/services/workflow_records/workflow_records_sqlite.py +42 -0
- invokeai/app/util/step_callback.py +3 -0
- invokeai/backend/model_manager/configs/controlnet.py +47 -1
- invokeai/backend/model_manager/configs/factory.py +26 -1
- invokeai/backend/model_manager/configs/lora.py +43 -1
- invokeai/backend/model_manager/configs/main.py +113 -0
- invokeai/backend/model_manager/configs/qwen3_encoder.py +156 -0
- invokeai/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/custom_diffusers_rms_norm.py +40 -0
- invokeai/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/custom_layer_norm.py +25 -0
- invokeai/backend/model_manager/load/model_cache/torch_module_autocast/torch_module_autocast.py +11 -2
- invokeai/backend/model_manager/load/model_loaders/lora.py +11 -0
- invokeai/backend/model_manager/load/model_loaders/z_image.py +935 -0
- invokeai/backend/model_manager/load/model_util.py +6 -1
- invokeai/backend/model_manager/metadata/metadata_base.py +12 -5
- invokeai/backend/model_manager/model_on_disk.py +3 -0
- invokeai/backend/model_manager/starter_models.py +70 -0
- invokeai/backend/model_manager/taxonomy.py +5 -0
- invokeai/backend/model_manager/util/select_hf_files.py +23 -8
- invokeai/backend/patches/layer_patcher.py +34 -16
- invokeai/backend/patches/layers/lora_layer_base.py +2 -1
- invokeai/backend/patches/lora_conversions/flux_aitoolkit_lora_conversion_utils.py +17 -2
- invokeai/backend/patches/lora_conversions/flux_xlabs_lora_conversion_utils.py +92 -0
- invokeai/backend/patches/lora_conversions/formats.py +5 -0
- invokeai/backend/patches/lora_conversions/z_image_lora_constants.py +8 -0
- invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py +155 -0
- invokeai/backend/quantization/gguf/ggml_tensor.py +27 -4
- invokeai/backend/quantization/gguf/loaders.py +47 -12
- invokeai/backend/stable_diffusion/diffusion/conditioning_data.py +13 -0
- invokeai/backend/util/devices.py +25 -0
- invokeai/backend/util/hotfixes.py +2 -2
- invokeai/backend/z_image/__init__.py +16 -0
- invokeai/backend/z_image/extensions/__init__.py +1 -0
- invokeai/backend/z_image/extensions/regional_prompting_extension.py +207 -0
- invokeai/backend/z_image/text_conditioning.py +74 -0
- invokeai/backend/z_image/z_image_control_adapter.py +238 -0
- invokeai/backend/z_image/z_image_control_transformer.py +643 -0
- invokeai/backend/z_image/z_image_controlnet_extension.py +531 -0
- invokeai/backend/z_image/z_image_patchify_utils.py +135 -0
- invokeai/backend/z_image/z_image_transformer_patch.py +234 -0
- invokeai/frontend/web/dist/assets/App-CYhlZO3Q.js +161 -0
- invokeai/frontend/web/dist/assets/{browser-ponyfill-CN1j0ARZ.js → browser-ponyfill-DHZxq1nk.js} +1 -1
- invokeai/frontend/web/dist/assets/index-dgSJAY--.js +530 -0
- invokeai/frontend/web/dist/index.html +1 -1
- invokeai/frontend/web/dist/locales/de.json +24 -6
- invokeai/frontend/web/dist/locales/en.json +70 -1
- invokeai/frontend/web/dist/locales/es.json +0 -5
- invokeai/frontend/web/dist/locales/fr.json +0 -6
- invokeai/frontend/web/dist/locales/it.json +17 -64
- invokeai/frontend/web/dist/locales/ja.json +379 -44
- invokeai/frontend/web/dist/locales/ru.json +0 -6
- invokeai/frontend/web/dist/locales/vi.json +7 -54
- invokeai/frontend/web/dist/locales/zh-CN.json +0 -6
- invokeai/version/invokeai_version.py +1 -1
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/METADATA +3 -3
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/RECORD +84 -60
- invokeai/frontend/web/dist/assets/App-Cn9UyjoV.js +0 -161
- invokeai/frontend/web/dist/assets/index-BDrf9CL-.js +0 -530
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/WHEEL +0 -0
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/entry_points.txt +0 -0
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/licenses/LICENSE +0 -0
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/top_level.txt +0 -0

invokeai/backend/model_manager/load/model_util.py

@@ -10,7 +10,7 @@ import onnxruntime as ort
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers.scheduling_utils import SchedulerMixin
-from transformers import CLIPTokenizer, T5Tokenizer, T5TokenizerFast
+from transformers import CLIPTokenizer, PreTrainedTokenizerBase, T5Tokenizer, T5TokenizerFast
 
 from invokeai.backend.image_util.depth_anything.depth_anything_pipeline import DepthAnythingPipeline
 from invokeai.backend.image_util.grounding_dino.grounding_dino_pipeline import GroundingDinoPipeline
@@ -73,6 +73,10 @@ def calc_model_size_by_data(logger: logging.Logger, model: AnyModel) -> int:
         # relative to the text encoder that it's used with, so shouldn't matter too much, but we should fix this at some
         # point.
         return len(model)
+    elif isinstance(model, PreTrainedTokenizerBase):
+        # Catch-all for other tokenizer types (e.g., Qwen2Tokenizer, Qwen3Tokenizer).
+        # Tokenizers are small relative to models, so returning 0 is acceptable.
+        return 0
     else:
         # TODO(ryand): Promote this from a log to an exception once we are confident that we are handling all of the
         # supported model types.
@@ -156,6 +160,7 @@ def calc_model_size_by_fs(model_path: Path, subfolder: Optional[str] = None, var
         (".msgpack",),  # flax
         (".ckpt",),  # tf
         (".h5",),  # tf2
+        (".gguf",),  # gguf quantized
     ]
 
     for file_format in formats:
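
The `.gguf` addition means quantized single-file checkpoints are now counted when a model's size is estimated from the filesystem. A minimal standalone sketch of that suffix-priority idea (illustrative only, with hypothetical names; not the InvokeAI implementation):

    from pathlib import Path

    # Hypothetical format groups, tried in priority order; ".gguf" is now one of them.
    WEIGHT_FORMAT_GROUPS = [(".safetensors",), (".bin",), (".msgpack",), (".ckpt",), (".h5",), (".gguf",)]

    def estimate_model_size_on_disk(model_path: Path) -> int:
        """Total the bytes of the first weight format found under model_path."""
        for suffixes in WEIGHT_FORMAT_GROUPS:
            matching = [f for f in model_path.rglob("*") if f.is_file() and f.suffix in suffixes]
            if matching:
                return sum(f.stat().st_size for f in matching)
        return 0
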
invokeai/backend/model_manager/metadata/metadata_base.py

@@ -95,13 +95,15 @@ class HuggingFaceMetadata(ModelMetadataWithFiles):
         self,
         variant: Optional[ModelRepoVariant] = None,
         subfolder: Optional[Path] = None,
+        subfolders: Optional[List[Path]] = None,
         session: Optional[Session] = None,
     ) -> List[RemoteModelFile]:
         """
-        Return list of downloadable files, filtering by variant and subfolder, if any.
+        Return list of downloadable files, filtering by variant and subfolder(s), if any.
 
         :param variant: Return model files needed to reconstruct the indicated variant
-        :param subfolder: Return model files from the designated subfolder only
+        :param subfolder: Return model files from the designated subfolder only (deprecated, use subfolders)
+        :param subfolders: Return model files from the designated subfolders
         :param session: A request.Session object used for internet-free testing
 
         Note that there is special variant-filtering behavior here:
@@ -111,10 +113,15 @@ class HuggingFaceMetadata(ModelMetadataWithFiles):
         session = session or Session()
         configure_http_backend(backend_factory=lambda: session)  # used in testing
 
-        paths = filter_files([x.path for x in self.files], variant, subfolder)  # all files in the model
-        prefix = f"{subfolder}/" if subfolder else ""
+        paths = filter_files([x.path for x in self.files], variant, subfolder, subfolders)  # all files in the model
+
+        # Determine prefix for model_index.json check - only applies for single subfolder
+        prefix = ""
+        if subfolder and not subfolders:
+            prefix = f"{subfolder}/"
+
         # the next step reads model_index.json to determine which subdirectories belong
-        # to the model
+        # to the model (only for single subfolder case)
         if Path(f"{prefix}model_index.json") in paths:
             url = hf_hub_url(self.id, filename="model_index.json", subfolder=str(subfolder) if subfolder else None)
             resp = session.get(url)
invokeai/backend/model_manager/model_on_disk.py

@@ -84,6 +84,9 @@ class ModelOnDisk:
 
         path = self.resolve_weight_file(path)
 
+        if path in self._state_dict_cache:
+            return self._state_dict_cache[path]
+
         with SilenceWarnings():
             if path.suffix.endswith((".ckpt", ".pt", ".pth", ".bin")):
                 scan_result = scan_file_path(path)
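
The two added lines short-circuit repeated state-dict loads by caching on the resolved weight path. A minimal sketch of the same caching pattern, using a hypothetical loader class rather than ModelOnDisk itself:

    from pathlib import Path
    from typing import Any, Dict

    class CachedStateDictLoader:
        """Illustrative only: cache loaded state dicts keyed by resolved file path."""

        def __init__(self) -> None:
            self._state_dict_cache: Dict[Path, Dict[str, Any]] = {}

        def load_state_dict(self, path: Path) -> Dict[str, Any]:
            path = path.resolve()
            if path in self._state_dict_cache:
                return self._state_dict_cache[path]  # cache hit: skip re-reading the file
            state_dict = self._read_from_disk(path)
            self._state_dict_cache[path] = state_dict
            return state_dict

        def _read_from_disk(self, path: Path) -> Dict[str, Any]:
            # Stand-in for the real safetensors/torch loading logic.
            return {}
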
invokeai/backend/model_manager/starter_models.py

@@ -690,6 +690,69 @@ flux_fill = StarterModel(
 )
 # endregion
 
+# region Z-Image
+z_image_qwen3_encoder = StarterModel(
+    name="Z-Image Qwen3 Text Encoder",
+    base=BaseModelType.Any,
+    source="Tongyi-MAI/Z-Image-Turbo::text_encoder+tokenizer",
+    description="Qwen3 4B text encoder with tokenizer for Z-Image (full precision). ~8GB",
+    type=ModelType.Qwen3Encoder,
+)
+
+z_image_qwen3_encoder_quantized = StarterModel(
+    name="Z-Image Qwen3 Text Encoder (quantized)",
+    base=BaseModelType.Any,
+    source="https://huggingface.co/worstplayer/Z-Image_Qwen_3_4b_text_encoder_GGUF/resolve/main/Qwen_3_4b-Q6_K.gguf",
+    description="Qwen3 4B text encoder for Z-Image quantized to GGUF Q6_K format. ~3.3GB",
+    type=ModelType.Qwen3Encoder,
+    format=ModelFormat.GGUFQuantized,
+)
+
+z_image_turbo = StarterModel(
+    name="Z-Image Turbo",
+    base=BaseModelType.ZImage,
+    source="Tongyi-MAI/Z-Image-Turbo",
+    description="Z-Image Turbo - fast 6B parameter text-to-image model with 8 inference steps. Supports bilingual prompts (English & Chinese). ~13GB",
+    type=ModelType.Main,
+)
+
+z_image_turbo_quantized = StarterModel(
+    name="Z-Image Turbo (quantized)",
+    base=BaseModelType.ZImage,
+    source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q4_K.gguf",
+    description="Z-Image Turbo quantized to GGUF Q4_K format. Requires separate Qwen3 text encoder. ~4GB",
+    type=ModelType.Main,
+    format=ModelFormat.GGUFQuantized,
+    dependencies=[z_image_qwen3_encoder_quantized],
+)
+
+z_image_turbo_q8 = StarterModel(
+    name="Z-Image Turbo (Q8)",
+    base=BaseModelType.ZImage,
+    source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q8_0.gguf",
+    description="Z-Image Turbo quantized to GGUF Q8_0 format. Higher quality, larger size. Requires separate Qwen3 text encoder. ~6.6GB",
+    type=ModelType.Main,
+    format=ModelFormat.GGUFQuantized,
+    dependencies=[z_image_qwen3_encoder_quantized],
+)
+
+z_image_controlnet_union = StarterModel(
+    name="Z-Image ControlNet Union",
+    base=BaseModelType.ZImage,
+    source="https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.1/resolve/main/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.safetensors",
+    description="Unified ControlNet for Z-Image Turbo supporting Canny, HED, Depth, Pose, MLSD, and Inpainting modes.",
+    type=ModelType.ControlNet,
+)
+
+z_image_controlnet_tile = StarterModel(
+    name="Z-Image ControlNet Tile",
+    base=BaseModelType.ZImage,
+    source="https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.1/resolve/main/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.safetensors",
+    description="Dedicated Tile ControlNet for Z-Image Turbo. Useful for upscaling and adding detail. ~6.7GB",
+    type=ModelType.ControlNet,
+)
+# endregion
+
 # List of starter models, displayed on the frontend.
 # The order/sort of this list is not changed by the frontend - set it how you want it here.
 STARTER_MODELS: list[StarterModel] = [
@@ -766,6 +829,13 @@ STARTER_MODELS: list[StarterModel] = [
     cogview4,
     flux_krea,
     flux_krea_quantized,
+    z_image_turbo,
+    z_image_turbo_quantized,
+    z_image_turbo_q8,
+    z_image_qwen3_encoder,
+    z_image_qwen3_encoder_quantized,
+    z_image_controlnet_union,
+    z_image_controlnet_tile,
 ]
 
 sd1_bundle: list[StarterModel] = [
invokeai/backend/model_manager/taxonomy.py

@@ -48,6 +48,8 @@ class BaseModelType(str, Enum):
     """Indicates the model is associated with FLUX.1 model architecture, including FLUX Dev, Schnell and Fill."""
     CogView4 = "cogview4"
     """Indicates the model is associated with CogView 4 model architecture."""
+    ZImage = "z-image"
+    """Indicates the model is associated with Z-Image model architecture, including Z-Image-Turbo."""
     Unknown = "unknown"
     """Indicates the model's base architecture is unknown."""
 
@@ -67,6 +69,7 @@ class ModelType(str, Enum):
     CLIPEmbed = "clip_embed"
     T2IAdapter = "t2i_adapter"
     T5Encoder = "t5_encoder"
+    Qwen3Encoder = "qwen3_encoder"
     SpandrelImageToImage = "spandrel_image_to_image"
     SigLIP = "siglip"
     FluxRedux = "flux_redux"
@@ -126,6 +129,7 @@ class ModelFormat(str, Enum):
     EmbeddingFolder = "embedding_folder"
     InvokeAI = "invokeai"
     T5Encoder = "t5_encoder"
+    Qwen3Encoder = "qwen3_encoder"
     BnbQuantizedLlmInt8b = "bnb_quantized_int8b"
     BnbQuantizednf4b = "bnb_quantized_nf4b"
     GGUFQuantized = "gguf_quantized"
@@ -167,6 +171,7 @@ class FluxLoRAFormat(str, Enum):
     OneTrainer = "flux.onetrainer"
     Control = "flux.control"
     AIToolkit = "flux.aitoolkit"
+    XLabs = "flux.xlabs"
 
 
 AnyVariant: TypeAlias = Union[ModelVariantType, ClipVariantType, FluxVariantType]
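
The new enum members are plain string values, so downstream code can reference them directly. A small usage sketch (module path taken from the file list above):

    from invokeai.backend.model_manager.taxonomy import BaseModelType, FluxLoRAFormat, ModelFormat, ModelType

    print(BaseModelType.ZImage.value)       # "z-image"
    print(ModelType.Qwen3Encoder.value)     # "qwen3_encoder"
    print(ModelFormat.GGUFQuantized.value)  # "gguf_quantized"
    print(FluxLoRAFormat.XLabs.value)       # "flux.xlabs"
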
invokeai/backend/model_manager/util/select_hf_files.py

@@ -24,12 +24,14 @@ def filter_files(
     files: List[Path],
     variant: Optional[ModelRepoVariant] = None,
     subfolder: Optional[Path] = None,
+    subfolders: Optional[List[Path]] = None,
 ) -> List[Path]:
     """
     Take a list of files in a HuggingFace repo root and return paths to files needed to load the model.
 
     :param files: List of files relative to the repo root.
-    :param subfolder: Filter by the indicated subfolder.
+    :param subfolder: Filter by the indicated subfolder (deprecated, use subfolders instead).
+    :param subfolders: Filter by multiple subfolders. Files from any of these subfolders will be included.
     :param variant: Filter by files belonging to a particular variant, such as fp16.
 
     The file list can be obtained from the `files` field of HuggingFaceMetadata,
@@ -37,11 +39,24 @@ def filter_files(
     """
     variant = variant or ModelRepoVariant.Default
     paths: List[Path] = []
-    root = files[0].parts[0]
+
+    if not files:
+        return []
+
+    root = files[0].parts[0] if files[0].parts else Path(".")
+
+    # Build list of subfolders to filter by
+    filter_subfolders: List[Path] = []
+    if subfolders:
+        filter_subfolders = subfolders
+    elif subfolder:
+        filter_subfolders = [subfolder]
 
     # if the subfolder is a single file, then bypass the selection and just return it
-    if subfolder and subfolder.suffix in [".safetensors", ".bin", ".onnx", ".xml", ".pth", ".pt", ".ckpt", ".msgpack"]:
-        return [root / subfolder]
+    if len(filter_subfolders) == 1:
+        sf = filter_subfolders[0]
+        if sf.suffix in [".safetensors", ".bin", ".onnx", ".xml", ".pth", ".pt", ".ckpt", ".msgpack"]:
+            return [root / sf]
 
     # Start by filtering on model file extensions, discarding images, docs, etc
     for file in files:
@@ -66,10 +81,10 @@ def filter_files(
         elif re.search(r"model.*\.(safetensors|bin|onnx|xml|pth|pt|ckpt|msgpack)$", file.name):
             paths.append(file)
 
-    # limit search to subfolder if requested
-    if subfolder:
-        subfolder = root / subfolder
-        paths = [x for x in paths if Path(subfolder) in x.parents]
+    # limit search to subfolder(s) if requested
+    if filter_subfolders:
+        absolute_subfolders = [root / sf for sf in filter_subfolders]
+        paths = [x for x in paths if any(Path(sf) in x.parents for sf in absolute_subfolders)]
 
     # _filter_by_variant uniquifies the paths and returns a set
     return sorted(_filter_by_variant(paths, variant))
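
With the new `subfolders` parameter, a caller can pull several subfolders of a Hugging Face repo in one pass (for example `text_encoder` plus `tokenizer`, which is what the `Tongyi-MAI/Z-Image-Turbo::text_encoder+tokenizer` starter source above relies on). A hedged usage sketch; the signature is as shown in this diff, while the repo file listing is invented:

    from pathlib import Path

    from invokeai.backend.model_manager.util.select_hf_files import filter_files

    # Invented repo listing; only files under the requested subfolders should survive the filter.
    repo_files = [
        Path("Z-Image-Turbo/model_index.json"),
        Path("Z-Image-Turbo/text_encoder/model.safetensors"),
        Path("Z-Image-Turbo/tokenizer/tokenizer.json"),
        Path("Z-Image-Turbo/vae/diffusion_pytorch_model.safetensors"),
    ]

    selected = filter_files(repo_files, subfolders=[Path("text_encoder"), Path("tokenizer")])
    print(selected)
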
invokeai/backend/patches/layer_patcher.py

@@ -86,7 +86,8 @@ class LayerPatcher:
         # submodules. If the layer keys do not contain a dot, then they are flattened, meaning that all '.' have been
         # replaced with '_'. Non-flattened keys are preferred, because they allow submodules to be accessed directly
         # without searching, but some legacy code still uses flattened keys.
-        layer_keys_are_flattened = "." not in next(iter(patch.layers.keys()))
+        first_key = next(iter(patch.layers.keys()))
+        layer_keys_are_flattened = "." not in first_key
 
         prefix_len = len(prefix)
 
@@ -174,28 +175,45 @@ class LayerPatcher:
 
         # TODO(ryand): Using torch.autocast(...) over explicit casting may offer a speed benefit on CUDA
         # devices here. Experimentally, it was found to be very slow on CPU. More investigation needed.
-        for param_name, param_weight in patch.get_parameters(
-            dict(module_to_patch.named_parameters(recurse=False)), weight=patch_weight
-        ).items():
+        params_dict = patch.get_parameters(dict(module_to_patch.named_parameters(recurse=False)), weight=patch_weight)
+        if not params_dict:
+            logger = InvokeAILogger.get_logger(LayerPatcher.__name__)
+            logger.warning(f"LoRA patch returned no parameters for module: {module_to_patch_key}")
+            return
+
+        for param_name, param_weight in params_dict.items():
             param_key = module_to_patch_key + "." + param_name
             module_param = module_to_patch.get_parameter(param_name)
 
             # Save original weight
             original_weights.save(param_key, module_param)
 
-            #
-            #
+            # Handle layers that change the shape of the original layer.
+            # FLUX control LoRAs intentionally expand certain layers - we pad the original weight with zeros.
+            # For other LoRAs (e.g., Z-Image with architecture mismatch), skip incompatible layers with a warning.
             if module_param.nelement() != param_weight.nelement():
-                assert isinstance(patch, FluxControlLoRALayer)
-                expanded_weight = pad_with_zeros(module_param, param_weight.shape)
-                setattr(
-                    module_to_patch,
-                    param_name,
-                    torch.nn.Parameter(expanded_weight, requires_grad=module_param.requires_grad),
-                )
-                module_param = expanded_weight
-
-            module_param.data += param_weight.to(device=device, dtype=dtype)
+                if isinstance(patch, FluxControlLoRALayer):
+                    # FLUX Control LoRAs intentionally expand layers - pad with zeros
+                    expanded_weight = pad_with_zeros(module_param, param_weight.shape)
+                    setattr(
+                        module_to_patch,
+                        param_name,
+                        torch.nn.Parameter(expanded_weight, requires_grad=module_param.requires_grad),
+                    )
+                    module_param = expanded_weight
+                else:
+                    # For other LoRAs, shape mismatch indicates architecture incompatibility - skip the layer
+                    logger = InvokeAILogger.get_logger(LayerPatcher.__name__)
+                    logger.warning(
+                        f"Skipping LoRA layer '{module_to_patch_key}.{param_name}' due to shape mismatch: "
+                        f"model has {module_param.nelement()} elements, LoRA expects {param_weight.nelement()}. "
+                        "This LoRA may be incompatible with this model architecture."
+                    )
+                    continue
+
+            # Convert param_weight to the correct device and dtype, then apply to model weights
+            param_weight_converted = param_weight.to(device=device, dtype=dtype)
+            module_param.data.copy_(module_param.data + param_weight_converted)
 
         patch.to(device=TorchDevice.CPU_DEVICE)
 
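
The reworked loop keeps the zero-padding path for FLUX control LoRAs but now skips, with a warning, any other patch layer whose parameter count does not match the target module. A simplified standalone sketch of that decision in plain torch (hypothetical helper, assumes 2-D weights; not the LayerPatcher code):

    import torch

    def apply_delta_or_skip(param: torch.nn.Parameter, delta: torch.Tensor, allow_expand: bool) -> bool:
        """Add a LoRA delta to a parameter; skip on shape mismatch unless expansion is allowed."""
        if param.nelement() != delta.nelement():
            if not allow_expand:
                print(f"skipping layer: model has {param.nelement()} elements, patch expects {delta.nelement()}")
                return False
            # Expansion path (e.g. FLUX control LoRAs): pad the original 2-D weight with zeros.
            expanded = torch.zeros_like(delta)
            expanded[: param.shape[0], : param.shape[1]] = param.data
            param.data = expanded
        param.data += delta.to(device=param.data.device, dtype=param.data.dtype)
        return True
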
invokeai/backend/patches/layers/lora_layer_base.py

@@ -60,7 +60,8 @@ class LoRALayerBase(BaseLayerPatch):
 
     def get_parameters(self, orig_parameters: dict[str, torch.Tensor], weight: float) -> dict[str, torch.Tensor]:
         scale = self.scale()
-        params = {"weight": self.get_weight(orig_parameters["weight"]) * (weight * scale)}
+        lora_weight = self.get_weight(orig_parameters["weight"])
+        params = {"weight": lora_weight * (weight * scale)}
         bias = self.get_bias(orig_parameters.get("bias", None))
         if bias is not None:
             params["bias"] = bias * (weight * scale)
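
For reference, the scaling applied in `get_parameters` follows the usual LoRA recipe: the low-rank delta is multiplied by a scale derived from alpha and rank, and then by the user-facing patch weight. A small illustrative calculation (arbitrary sizes; not InvokeAI code):

    import torch

    rank, n_in, n_out = 4, 64, 64
    down = torch.randn(rank, n_in)   # "lora_down" / lora_A
    up = torch.randn(n_out, rank)    # "lora_up" / lora_B
    alpha, patch_weight = 4.0, 0.75

    scale = alpha / rank
    delta = (up @ down) * (patch_weight * scale)
    print(delta.shape)  # torch.Size([64, 64])
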
invokeai/backend/patches/lora_conversions/flux_aitoolkit_lora_conversion_utils.py

@@ -12,18 +12,33 @@ from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
 from invokeai.backend.util import InvokeAILogger
 
 
+def _has_flux_layer_structure(state_dict: dict[str | int, Any]) -> bool:
+    """Check if state dict has Flux-specific layer patterns (double_blocks/single_blocks)."""
+    return any(
+        k.startswith("diffusion_model.double_blocks.") or k.startswith("diffusion_model.single_blocks.")
+        for k in state_dict.keys()
+        if isinstance(k, str)
+    )
+
+
 def is_state_dict_likely_in_flux_aitoolkit_format(
     state_dict: dict[str | int, Any],
     metadata: dict[str, Any] | None = None,
 ) -> bool:
+    # Always check for Flux-specific layer structure first
+    # This prevents misidentifying Z-Image LoRAs (which use diffusion_model.layers.X) as Flux
+    if not _has_flux_layer_structure(state_dict):
+        return False
+
     if metadata:
         try:
             software = json.loads(metadata.get("software", "{}"))
         except json.JSONDecodeError:
             return False
         return software.get("name") == "ai-toolkit"
-
-
+
+    # No metadata - if it has Flux layer structure, assume it's AI Toolkit format
+    return True
 
 
 @dataclass
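
The effect of the new gate is easiest to see with key names: AI Toolkit FLUX LoRAs use `diffusion_model.double_blocks.*` / `diffusion_model.single_blocks.*`, while Z-Image LoRAs use `diffusion_model.layers.*`, so the latter no longer match. A small illustration with invented keys, mirroring the `_has_flux_layer_structure` check above:

    def has_flux_layer_structure(keys) -> bool:
        # Same idea as the helper added in this diff: Flux state dicts use double_blocks/single_blocks.
        return any(
            k.startswith(("diffusion_model.double_blocks.", "diffusion_model.single_blocks."))
            for k in keys
        )

    flux_keys = ["diffusion_model.double_blocks.0.img_attn.qkv.lora_A.weight"]
    z_image_keys = ["diffusion_model.layers.0.attention.qkv.lora_A.weight"]

    assert has_flux_layer_structure(flux_keys)
    assert not has_flux_layer_structure(z_image_keys)
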
invokeai/backend/patches/lora_conversions/flux_xlabs_lora_conversion_utils.py (new file)

@@ -0,0 +1,92 @@
+import re
+from typing import Any, Dict
+
+import torch
+
+from invokeai.backend.patches.layers.base_layer_patch import BaseLayerPatch
+from invokeai.backend.patches.layers.utils import any_lora_layer_from_state_dict
+from invokeai.backend.patches.lora_conversions.flux_lora_constants import FLUX_LORA_TRANSFORMER_PREFIX
+from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
+
+# A regex pattern that matches all of the transformer keys in the xlabs FLUX LoRA format.
+# Example keys:
+#   double_blocks.0.processor.qkv_lora1.down.weight
+#   double_blocks.0.processor.qkv_lora1.up.weight
+#   double_blocks.0.processor.proj_lora1.down.weight
+#   double_blocks.0.processor.proj_lora1.up.weight
+#   double_blocks.0.processor.qkv_lora2.down.weight
+#   double_blocks.0.processor.proj_lora2.up.weight
+FLUX_XLABS_KEY_REGEX = r"double_blocks\.(\d+)\.processor\.(qkv|proj)_lora([12])\.(down|up)\.weight"
+
+
+def is_state_dict_likely_in_flux_xlabs_format(state_dict: dict[str | int, Any]) -> bool:
+    """Checks if the provided state dict is likely in the xlabs FLUX LoRA format.
+
+    The xlabs format is characterized by keys matching the pattern:
+    double_blocks.{block_idx}.processor.{qkv|proj}_lora{1|2}.{down|up}.weight
+
+    Where:
+    - lora1 corresponds to the image attention stream (img_attn)
+    - lora2 corresponds to the text attention stream (txt_attn)
+    """
+    if not state_dict:
+        return False
+
+    # Check that all keys match the xlabs pattern
+    for key in state_dict.keys():
+        if not isinstance(key, str):
+            continue
+        if not re.match(FLUX_XLABS_KEY_REGEX, key):
+            return False
+
+    # Ensure we have at least some valid keys
+    return any(isinstance(k, str) and re.match(FLUX_XLABS_KEY_REGEX, k) for k in state_dict.keys())
+
+
+def lora_model_from_flux_xlabs_state_dict(state_dict: Dict[str, torch.Tensor]) -> ModelPatchRaw:
+    """Converts an xlabs FLUX LoRA state dict to the InvokeAI ModelPatchRaw format.
+
+    The xlabs format uses:
+    - lora1 for image attention stream (img_attn)
+    - lora2 for text attention stream (txt_attn)
+    - qkv for query/key/value projection
+    - proj for output projection
+
+    Key mapping:
+    - double_blocks.X.processor.qkv_lora1 -> double_blocks.X.img_attn.qkv
+    - double_blocks.X.processor.proj_lora1 -> double_blocks.X.img_attn.proj
+    - double_blocks.X.processor.qkv_lora2 -> double_blocks.X.txt_attn.qkv
+    - double_blocks.X.processor.proj_lora2 -> double_blocks.X.txt_attn.proj
+    """
+    # Group keys by layer (without the .down.weight/.up.weight suffix)
+    grouped_state_dict: dict[str, dict[str, torch.Tensor]] = {}
+
+    for key, value in state_dict.items():
+        match = re.match(FLUX_XLABS_KEY_REGEX, key)
+        if not match:
+            raise ValueError(f"Key '{key}' does not match the expected pattern for xlabs FLUX LoRA weights.")
+
+        block_idx = match.group(1)
+        component = match.group(2)  # qkv or proj
+        lora_stream = match.group(3)  # 1 or 2
+        direction = match.group(4)  # down or up
+
+        # Map lora1 -> img_attn, lora2 -> txt_attn
+        attn_type = "img_attn" if lora_stream == "1" else "txt_attn"
+
+        # Create the InvokeAI-style layer key
+        layer_key = f"double_blocks.{block_idx}.{attn_type}.{component}"
+
+        if layer_key not in grouped_state_dict:
+            grouped_state_dict[layer_key] = {}
+
+        # Map down/up to lora_down/lora_up
+        param_name = f"lora_{direction}.weight"
+        grouped_state_dict[layer_key][param_name] = value
+
+    # Create LoRA layers
+    layers: dict[str, BaseLayerPatch] = {}
+    for layer_key, layer_state_dict in grouped_state_dict.items():
+        layers[FLUX_LORA_TRANSFORMER_PREFIX + layer_key] = any_lora_layer_from_state_dict(layer_state_dict)
+
+    return ModelPatchRaw(layers=layers)
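
A hedged usage sketch of the two new helpers, with a synthetic two-tensor state dict (shapes are placeholders; the import path comes from the file list above, and the conversion is assumed to accept a plain down/up pair as shown in the code):

    import torch

    from invokeai.backend.patches.lora_conversions.flux_xlabs_lora_conversion_utils import (
        is_state_dict_likely_in_flux_xlabs_format,
        lora_model_from_flux_xlabs_state_dict,
    )

    # Synthetic xlabs-style LoRA pair for block 0's image-attention qkv projection.
    state_dict = {
        "double_blocks.0.processor.qkv_lora1.down.weight": torch.zeros(4, 64),
        "double_blocks.0.processor.qkv_lora1.up.weight": torch.zeros(192, 4),
    }

    assert is_state_dict_likely_in_flux_xlabs_format(state_dict)
    patch = lora_model_from_flux_xlabs_state_dict(state_dict)
    print(list(patch.layers.keys()))  # one transformer-prefixed key for double_blocks.0.img_attn.qkv
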
invokeai/backend/patches/lora_conversions/formats.py

@@ -14,6 +14,9 @@ from invokeai.backend.patches.lora_conversions.flux_kohya_lora_conversion_utils
 from invokeai.backend.patches.lora_conversions.flux_onetrainer_lora_conversion_utils import (
     is_state_dict_likely_in_flux_onetrainer_format,
 )
+from invokeai.backend.patches.lora_conversions.flux_xlabs_lora_conversion_utils import (
+    is_state_dict_likely_in_flux_xlabs_format,
+)
 
 
 def flux_format_from_state_dict(
@@ -30,5 +33,7 @@ def flux_format_from_state_dict(
         return FluxLoRAFormat.Control
     elif is_state_dict_likely_in_flux_aitoolkit_format(state_dict, metadata):
         return FluxLoRAFormat.AIToolkit
+    elif is_state_dict_likely_in_flux_xlabs_format(state_dict):
+        return FluxLoRAFormat.XLabs
     else:
         return None
invokeai/backend/patches/lora_conversions/z_image_lora_constants.py (new file)

@@ -0,0 +1,8 @@
+# Z-Image LoRA prefix constants
+# These prefixes are used for key mapping when applying LoRA patches to Z-Image models
+
+# Prefix for Z-Image transformer (S3-DiT architecture) LoRA layers
+Z_IMAGE_LORA_TRANSFORMER_PREFIX = "lora_transformer-"
+
+# Prefix for Qwen3 text encoder LoRA layers
+Z_IMAGE_LORA_QWEN3_PREFIX = "lora_qwen3-"
invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py (new file)

@@ -0,0 +1,155 @@
+"""Z-Image LoRA conversion utilities.
+
+Z-Image uses S3-DiT transformer architecture with Qwen3 text encoder.
+LoRAs for Z-Image typically follow the diffusers PEFT format.
+"""
+
+from typing import Dict
+
+import torch
+
+from invokeai.backend.patches.layers.base_layer_patch import BaseLayerPatch
+from invokeai.backend.patches.layers.utils import any_lora_layer_from_state_dict
+from invokeai.backend.patches.lora_conversions.z_image_lora_constants import (
+    Z_IMAGE_LORA_QWEN3_PREFIX,
+    Z_IMAGE_LORA_TRANSFORMER_PREFIX,
+)
+from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
+
+
+def is_state_dict_likely_z_image_lora(state_dict: dict[str | int, torch.Tensor]) -> bool:
+    """Checks if the provided state dict is likely a Z-Image LoRA.
+
+    Z-Image LoRAs can have keys for transformer and/or Qwen3 text encoder.
+    They may use various prefixes depending on the training framework.
+    """
+    str_keys = [k for k in state_dict.keys() if isinstance(k, str)]
+
+    # Check for Z-Image transformer keys (S3-DiT architecture)
+    # Various training frameworks use different prefixes
+    has_transformer_keys = any(
+        k.startswith(
+            (
+                "transformer.",
+                "base_model.model.transformer.",
+                "diffusion_model.",
+            )
+        )
+        for k in str_keys
+    )
+
+    # Check for Qwen3 text encoder keys
+    has_qwen3_keys = any(k.startswith(("text_encoder.", "base_model.model.text_encoder.")) for k in str_keys)
+
+    return has_transformer_keys or has_qwen3_keys
+
+
+def lora_model_from_z_image_state_dict(
+    state_dict: Dict[str, torch.Tensor], alpha: float | None = None
+) -> ModelPatchRaw:
+    """Convert a Z-Image LoRA state dict to a ModelPatchRaw.
+
+    Z-Image LoRAs can contain layers for:
+    - Transformer (S3-DiT architecture)
+    - Qwen3 text encoder
+
+    Z-Image LoRAs may use various key prefixes depending on how they were trained:
+    - "transformer." or "base_model.model.transformer." for diffusers PEFT format
+    - "diffusion_model." for some training frameworks
+    - "text_encoder." or "base_model.model.text_encoder." for Qwen3 encoder
+
+    Args:
+        state_dict: The LoRA state dict
+        alpha: The alpha value for LoRA scaling. If None, uses rank as alpha.
+
+    Returns:
+        A ModelPatchRaw containing the LoRA layers
+    """
+    layers: dict[str, BaseLayerPatch] = {}
+
+    # Group keys by layer
+    grouped_state_dict = _group_by_layer(state_dict)
+
+    for layer_key, layer_dict in grouped_state_dict.items():
+        # Convert PEFT format keys to internal format
+        values = _get_lora_layer_values(layer_dict, alpha)
+
+        # Determine the appropriate prefix based on the layer type and clean up the key
+        clean_key = layer_key
+
+        # Handle various transformer prefixes
+        transformer_prefixes = [
+            "base_model.model.transformer.diffusion_model.",
+            "base_model.model.transformer.",
+            "transformer.diffusion_model.",
+            "transformer.",
+            "diffusion_model.",
+        ]
+
+        # Handle text encoder prefixes
+        text_encoder_prefixes = [
+            "base_model.model.text_encoder.",
+            "text_encoder.",
+        ]
+
+        is_text_encoder = False
+
+        # Check and strip text encoder prefixes first
+        for prefix in text_encoder_prefixes:
+            if layer_key.startswith(prefix):
+                clean_key = layer_key[len(prefix) :]
+                is_text_encoder = True
+                break
+
+        # If not text encoder, check transformer prefixes
+        if not is_text_encoder:
+            for prefix in transformer_prefixes:
+                if layer_key.startswith(prefix):
+                    clean_key = layer_key[len(prefix) :]
+                    break
+
+        # Apply the appropriate internal prefix
+        if is_text_encoder:
+            final_key = f"{Z_IMAGE_LORA_QWEN3_PREFIX}{clean_key}"
+        else:
+            final_key = f"{Z_IMAGE_LORA_TRANSFORMER_PREFIX}{clean_key}"
+
+        layer = any_lora_layer_from_state_dict(values)
+        layers[final_key] = layer
+
+    return ModelPatchRaw(layers=layers)
+
+
+def _get_lora_layer_values(layer_dict: dict[str, torch.Tensor], alpha: float | None) -> dict[str, torch.Tensor]:
+    """Convert layer dict keys from PEFT format to internal format."""
+    if "lora_A.weight" in layer_dict:
+        # PEFT format: lora_A.weight, lora_B.weight
+        values = {
+            "lora_down.weight": layer_dict["lora_A.weight"],
+            "lora_up.weight": layer_dict["lora_B.weight"],
+        }
+        if alpha is not None:
+            values["alpha"] = torch.tensor(alpha)
+        return values
+    elif "lora_down.weight" in layer_dict:
+        # Already in internal format
+        return layer_dict
+    else:
+        # Unknown format, return as-is
+        return layer_dict
+
+
+def _group_by_layer(state_dict: Dict[str, torch.Tensor]) -> dict[str, dict[str, torch.Tensor]]:
+    """Groups the keys in the state dict by layer."""
+    layer_dict: dict[str, dict[str, torch.Tensor]] = {}
+    for key in state_dict:
+        if not isinstance(key, str):
+            continue
+        # Split the 'lora_A.weight' or 'lora_B.weight' suffix from the layer name.
+        parts = key.rsplit(".", maxsplit=2)
+        layer_name = parts[0]
+        key_name = ".".join(parts[1:])
+        if layer_name not in layer_dict:
+            layer_dict[layer_name] = {}
+        layer_dict[layer_name][key_name] = state_dict[key]
+    return layer_dict
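
And a matching hedged sketch for the Z-Image helpers: a synthetic diffusers/PEFT-style state dict with one rank-4 layer (the layer name is invented; real checkpoints use whatever module paths the training framework emitted):

    import torch

    from invokeai.backend.patches.lora_conversions.z_image_lora_conversion_utils import (
        is_state_dict_likely_z_image_lora,
        lora_model_from_z_image_state_dict,
    )

    state_dict = {
        "transformer.layers.0.attention.to_q.lora_A.weight": torch.zeros(4, 64),
        "transformer.layers.0.attention.to_q.lora_B.weight": torch.zeros(64, 4),
    }

    assert is_state_dict_likely_z_image_lora(state_dict)
    patch = lora_model_from_z_image_state_dict(state_dict, alpha=4.0)
    # The "transformer." prefix is stripped and replaced with Z_IMAGE_LORA_TRANSFORMER_PREFIX.
    print(list(patch.layers.keys()))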