InvokeAI 6.10.0__py3-none-any.whl → 6.10.0rc1__py3-none-any.whl

This diff shows the content changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (40)
  1. invokeai/app/invocations/flux_denoise.py +1 -15
  2. invokeai/app/invocations/metadata_linked.py +0 -47
  3. invokeai/app/invocations/z_image_denoise.py +84 -244
  4. invokeai/app/services/config/config_default.py +1 -3
  5. invokeai/app/services/model_manager/model_manager_default.py +0 -7
  6. invokeai/backend/flux/denoise.py +11 -196
  7. invokeai/backend/model_manager/configs/lora.py +0 -36
  8. invokeai/backend/model_manager/load/model_cache/model_cache.py +2 -104
  9. invokeai/backend/model_manager/load/model_loaders/cogview4.py +1 -2
  10. invokeai/backend/model_manager/load/model_loaders/flux.py +6 -13
  11. invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py +2 -4
  12. invokeai/backend/model_manager/load/model_loaders/onnx.py +0 -1
  13. invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py +1 -2
  14. invokeai/backend/model_manager/load/model_loaders/z_image.py +3 -37
  15. invokeai/backend/model_manager/starter_models.py +4 -13
  16. invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py +5 -39
  17. invokeai/backend/quantization/gguf/ggml_tensor.py +4 -15
  18. invokeai/backend/z_image/extensions/regional_prompting_extension.py +12 -10
  19. invokeai/frontend/web/dist/assets/App-CYhlZO3Q.js +161 -0
  20. invokeai/frontend/web/dist/assets/{browser-ponyfill-4xPFTMT3.js → browser-ponyfill-DHZxq1nk.js} +1 -1
  21. invokeai/frontend/web/dist/assets/{index-vCDSQboA.js → index-dgSJAY--.js} +51 -51
  22. invokeai/frontend/web/dist/index.html +1 -1
  23. invokeai/frontend/web/dist/locales/en.json +5 -11
  24. invokeai/version/invokeai_version.py +1 -1
  25. {invokeai-6.10.0.dist-info → invokeai-6.10.0rc1.dist-info}/METADATA +2 -2
  26. {invokeai-6.10.0.dist-info → invokeai-6.10.0rc1.dist-info}/RECORD +32 -39
  27. invokeai/app/invocations/pbr_maps.py +0 -59
  28. invokeai/backend/flux/schedulers.py +0 -62
  29. invokeai/backend/image_util/pbr_maps/architecture/block.py +0 -367
  30. invokeai/backend/image_util/pbr_maps/architecture/pbr_rrdb_net.py +0 -70
  31. invokeai/backend/image_util/pbr_maps/pbr_maps.py +0 -141
  32. invokeai/backend/image_util/pbr_maps/utils/image_ops.py +0 -93
  33. invokeai/frontend/web/dist/assets/App-BBELGD-n.js +0 -161
  34. invokeai/frontend/web/dist/locales/en-GB.json +0 -1
  35. {invokeai-6.10.0.dist-info → invokeai-6.10.0rc1.dist-info}/WHEEL +0 -0
  36. {invokeai-6.10.0.dist-info → invokeai-6.10.0rc1.dist-info}/entry_points.txt +0 -0
  37. {invokeai-6.10.0.dist-info → invokeai-6.10.0rc1.dist-info}/licenses/LICENSE +0 -0
  38. {invokeai-6.10.0.dist-info → invokeai-6.10.0rc1.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
  39. {invokeai-6.10.0.dist-info → invokeai-6.10.0rc1.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
  40. {invokeai-6.10.0.dist-info → invokeai-6.10.0rc1.dist-info}/top_level.txt +0 -0
@@ -720,20 +720,20 @@ z_image_turbo_quantized = StarterModel(
     name="Z-Image Turbo (quantized)",
     base=BaseModelType.ZImage,
     source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q4_K.gguf",
-    description="Z-Image Turbo quantized to GGUF Q4_K format. Requires standalone Qwen3 text encoder and Flux VAE. ~4GB",
+    description="Z-Image Turbo quantized to GGUF Q4_K format. Requires separate Qwen3 text encoder. ~4GB",
     type=ModelType.Main,
     format=ModelFormat.GGUFQuantized,
-    dependencies=[z_image_qwen3_encoder_quantized, flux_vae],
+    dependencies=[z_image_qwen3_encoder_quantized],
 )
 
 z_image_turbo_q8 = StarterModel(
     name="Z-Image Turbo (Q8)",
     base=BaseModelType.ZImage,
     source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q8_0.gguf",
-    description="Z-Image Turbo quantized to GGUF Q8_0 format. Higher quality, larger size. Requires standalone Qwen3 text encoder and Flux VAE. ~6.6GB",
+    description="Z-Image Turbo quantized to GGUF Q8_0 format. Higher quality, larger size. Requires separate Qwen3 text encoder. ~6.6GB",
     type=ModelType.Main,
     format=ModelFormat.GGUFQuantized,
-    dependencies=[z_image_qwen3_encoder_quantized, flux_vae],
+    dependencies=[z_image_qwen3_encoder_quantized],
 )
 
 z_image_controlnet_union = StarterModel(
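For reference, the resulting Q4_K entry (reconstructed from the context and added lines of this hunk) is:

    z_image_turbo_quantized = StarterModel(
        name="Z-Image Turbo (quantized)",
        base=BaseModelType.ZImage,
        source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q4_K.gguf",
        description="Z-Image Turbo quantized to GGUF Q4_K format. Requires separate Qwen3 text encoder. ~4GB",
        type=ModelType.Main,
        format=ModelFormat.GGUFQuantized,
        dependencies=[z_image_qwen3_encoder_quantized],
    )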
@@ -890,19 +890,10 @@ flux_bundle: list[StarterModel] = [
     flux_krea_quantized,
 ]
 
-zimage_bundle: list[StarterModel] = [
-    z_image_turbo_quantized,
-    z_image_qwen3_encoder_quantized,
-    z_image_controlnet_union,
-    z_image_controlnet_tile,
-    flux_vae,
-]
-
 STARTER_BUNDLES: dict[str, StarterModelBundle] = {
     BaseModelType.StableDiffusion1: StarterModelBundle(name="Stable Diffusion 1.5", models=sd1_bundle),
     BaseModelType.StableDiffusionXL: StarterModelBundle(name="SDXL", models=sdxl_bundle),
     BaseModelType.Flux: StarterModelBundle(name="FLUX.1 dev", models=flux_bundle),
-    BaseModelType.ZImage: StarterModelBundle(name="Z-Image Turbo", models=zimage_bundle),
 }
 
 assert len(STARTER_MODELS) == len({m.source for m in STARTER_MODELS}), "Duplicate starter models"
@@ -140,50 +140,16 @@ def _get_lora_layer_values(layer_dict: dict[str, torch.Tensor], alpha: float | N
 
 
 def _group_by_layer(state_dict: Dict[str, torch.Tensor]) -> dict[str, dict[str, torch.Tensor]]:
-    """Groups the keys in the state dict by layer.
-
-    Z-Image LoRAs have keys like:
-    - diffusion_model.layers.17.attention.to_k.alpha
-    - diffusion_model.layers.17.attention.to_k.dora_scale
-    - diffusion_model.layers.17.attention.to_k.lora_down.weight
-    - diffusion_model.layers.17.attention.to_k.lora_up.weight
-
-    We need to group these by the full layer path (e.g., diffusion_model.layers.17.attention.to_k)
-    and extract the suffix (alpha, dora_scale, lora_down.weight, lora_up.weight).
-    """
+    """Groups the keys in the state dict by layer."""
     layer_dict: dict[str, dict[str, torch.Tensor]] = {}
-
-    # Known suffixes that indicate the end of a layer name
-    known_suffixes = [
-        ".lora_A.weight",
-        ".lora_B.weight",
-        ".lora_down.weight",
-        ".lora_up.weight",
-        ".dora_scale",
-        ".alpha",
-    ]
-
     for key in state_dict:
         if not isinstance(key, str):
             continue
-
-        # Try to find a known suffix
-        layer_name = None
-        key_name = None
-        for suffix in known_suffixes:
-            if key.endswith(suffix):
-                layer_name = key[: -len(suffix)]
-                key_name = suffix[1:]  # Remove leading dot
-                break
-
-        if layer_name is None:
-            # Fallback to original logic for unknown formats
-            parts = key.rsplit(".", maxsplit=2)
-            layer_name = parts[0]
-            key_name = ".".join(parts[1:])
-
+        # Split the 'lora_A.weight' or 'lora_B.weight' suffix from the layer name.
+        parts = key.rsplit(".", maxsplit=2)
+        layer_name = parts[0]
+        key_name = ".".join(parts[1:])
         if layer_name not in layer_dict:
             layer_dict[layer_name] = {}
         layer_dict[layer_name][key_name] = state_dict[key]
-
     return layer_dict
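A standalone sketch of the rc1 grouping, using key names taken from the docstring removed above (the trailing comments show the actual output):

    # How rsplit(".", maxsplit=2) divides a LoRA key into a layer path
    # and a two-part key name.
    keys = [
        "diffusion_model.layers.17.attention.to_k.lora_down.weight",
        "diffusion_model.layers.17.attention.to_k.alpha",
    ]
    for key in keys:
        parts = key.rsplit(".", maxsplit=2)
        layer_name = parts[0]
        key_name = ".".join(parts[1:])
        print(f"{layer_name} -> {key_name}")
    # diffusion_model.layers.17.attention.to_k -> lora_down.weight
    # diffusion_model.layers.17.attention -> to_k.alpha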
@@ -17,32 +17,21 @@ def dequantize_and_run(func, args, kwargs):
     Also casts other floating point tensors to match the compute_dtype of GGMLTensors
     to avoid dtype mismatches in matrix operations.
     """
-    # Find the compute_dtype and target_device from any GGMLTensor in the args
+    # Find the compute_dtype from any GGMLTensor in the args
     compute_dtype = None
-    target_device = None
     for a in args:
         if hasattr(a, "compute_dtype"):
             compute_dtype = a.compute_dtype
-        if isinstance(a, torch.Tensor) and target_device is None:
-            target_device = a.device
-        if compute_dtype is not None and target_device is not None:
             break
-    if compute_dtype is None or target_device is None:
+    if compute_dtype is None:
         for v in kwargs.values():
-            if hasattr(v, "compute_dtype") and compute_dtype is None:
+            if hasattr(v, "compute_dtype"):
                 compute_dtype = v.compute_dtype
-            if isinstance(v, torch.Tensor) and target_device is None:
-                target_device = v.device
-            if compute_dtype is not None and target_device is not None:
                 break
 
     def process_tensor(t):
         if hasattr(t, "get_dequantized_tensor"):
-            result = t.get_dequantized_tensor()
-            # Ensure the dequantized tensor is on the target device
-            if target_device is not None and result.device != target_device:
-                result = result.to(target_device)
-            return result
+            return t.get_dequantized_tensor()
         elif isinstance(t, torch.Tensor) and compute_dtype is not None and t.is_floating_point():
             # Cast other floating point tensors to match the GGUF compute_dtype
             return t.to(compute_dtype)
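A minimal, self-contained sketch of this flow; DummyGGML is a hypothetical stand-in for GGMLTensor's compute_dtype/get_dequantized_tensor interface, not part of the package:

    import torch

    class DummyGGML:
        # Hypothetical stand-in: exposes the two members the hunk relies on.
        def __init__(self, data: torch.Tensor, compute_dtype: torch.dtype):
            self._data = data
            self.compute_dtype = compute_dtype

        def get_dequantized_tensor(self) -> torch.Tensor:
            return self._data.to(self.compute_dtype)

    def process_tensor(t, compute_dtype):
        # Mirrors the branch above: dequantize GGML-like tensors, cast other floats.
        if hasattr(t, "get_dequantized_tensor"):
            return t.get_dequantized_tensor()
        if isinstance(t, torch.Tensor) and compute_dtype is not None and t.is_floating_point():
            return t.to(compute_dtype)
        return t

    w = DummyGGML(torch.randn(4, 4), compute_dtype=torch.bfloat16)
    x = torch.randn(4, 4)  # float32 input; cast to bfloat16 to avoid a dtype mismatch
    y = process_tensor(x, w.compute_dtype) @ process_tensor(w, w.compute_dtype)
    assert y.dtype == torch.bfloat16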
@@ -66,16 +66,12 @@ class ZImageRegionalPromptingExtension:
     ) -> torch.Tensor | None:
         """Prepare a regional attention mask for Z-Image.
 
-        This uses an 'unrestricted' image self-attention approach (similar to FLUX):
-        - Image tokens can attend to ALL other image tokens (unrestricted self-attention)
+        The mask controls which tokens can attend to each other:
+        - Image tokens within a region attend only to each other
         - Image tokens attend only to their corresponding regional text
         - Text tokens attend only to their corresponding regional image
         - Text tokens attend to themselves
 
-        The unrestricted image self-attention allows the model to maintain global
-        coherence across regions, preventing the generation of separate/disconnected
-        images for each region.
-
         Z-Image sequence order: [img_tokens, txt_tokens]
 
         Args:
@@ -133,6 +129,12 @@ class ZImageRegionalPromptingExtension:
             # 3. txt attends to corresponding regional img
             # Reshape mask to (1, img_seq_len) for broadcasting
             regional_attention_mask[txt_start:txt_end, :img_seq_len] = mask_flat.view(1, img_seq_len)
+
+            # 4. img self-attention within region
+            # mask @ mask.T creates pairwise attention within the masked region
+            regional_attention_mask[:img_seq_len, :img_seq_len] += mask_flat.view(img_seq_len, 1) @ mask_flat.view(
+                1, img_seq_len
+            )
         else:
             # Global prompt: allow attention to/from background regions only
             if background_region_mask is not None:
@@ -150,10 +152,10 @@ class ZImageRegionalPromptingExtension:
             regional_attention_mask[:img_seq_len, txt_start:txt_end] = 1.0
             regional_attention_mask[txt_start:txt_end, :img_seq_len] = 1.0
 
-        # 4. Allow unrestricted image self-attention
-        # This is the key difference from the restricted approach - all image tokens
-        # can attend to each other, which helps maintain global coherence across regions
-        regional_attention_mask[:img_seq_len, :img_seq_len] = 1.0
+        # Allow background regions to attend to themselves
+        if background_region_mask is not None:
+            bg_mask = background_region_mask.view(img_seq_len, 1)
+            regional_attention_mask[:img_seq_len, :img_seq_len] += bg_mask @ bg_mask.T
 
         # Convert to boolean mask
         regional_attention_mask = regional_attention_mask > 0.5
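To see why the two += outer products above produce block-wise attention, here is a toy sketch with a 4-token image sequence and a region covering the first two tokens (the sizes are illustrative only; the output comment shows the actual result):

    import torch

    img_seq_len = 4
    mask_flat = torch.tensor([1.0, 1.0, 0.0, 0.0])  # region covers img tokens 0 and 1

    # Outer product of the region mask with itself: 1 exactly where both the
    # query token and the key token fall inside the region.
    region_block = mask_flat.view(img_seq_len, 1) @ mask_flat.view(1, img_seq_len)
    # tensor([[1., 1., 0., 0.],
    #         [1., 1., 0., 0.],
    #         [0., 0., 0., 0.],
    #         [0., 0., 0., 0.]])

    attn = torch.zeros(img_seq_len, img_seq_len)
    attn += region_block    # accumulate per-region blocks
    bool_mask = attn > 0.5  # final boolean mask, as in the hunk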