InvokeAI: invokeai-6.9.0rc3-py3-none-any.whl → invokeai-6.10.0rc1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. invokeai/app/api/dependencies.py +2 -0
  2. invokeai/app/api/routers/model_manager.py +91 -2
  3. invokeai/app/api/routers/workflows.py +9 -0
  4. invokeai/app/invocations/fields.py +19 -0
  5. invokeai/app/invocations/image_to_latents.py +23 -5
  6. invokeai/app/invocations/latents_to_image.py +2 -25
  7. invokeai/app/invocations/metadata.py +9 -1
  8. invokeai/app/invocations/model.py +8 -0
  9. invokeai/app/invocations/primitives.py +12 -0
  10. invokeai/app/invocations/prompt_template.py +57 -0
  11. invokeai/app/invocations/z_image_control.py +112 -0
  12. invokeai/app/invocations/z_image_denoise.py +610 -0
  13. invokeai/app/invocations/z_image_image_to_latents.py +102 -0
  14. invokeai/app/invocations/z_image_latents_to_image.py +103 -0
  15. invokeai/app/invocations/z_image_lora_loader.py +153 -0
  16. invokeai/app/invocations/z_image_model_loader.py +135 -0
  17. invokeai/app/invocations/z_image_text_encoder.py +197 -0
  18. invokeai/app/services/model_install/model_install_common.py +14 -1
  19. invokeai/app/services/model_install/model_install_default.py +119 -19
  20. invokeai/app/services/model_records/model_records_base.py +12 -0
  21. invokeai/app/services/model_records/model_records_sql.py +17 -0
  22. invokeai/app/services/shared/graph.py +132 -77
  23. invokeai/app/services/workflow_records/workflow_records_base.py +8 -0
  24. invokeai/app/services/workflow_records/workflow_records_sqlite.py +42 -0
  25. invokeai/app/util/step_callback.py +3 -0
  26. invokeai/backend/model_manager/configs/controlnet.py +47 -1
  27. invokeai/backend/model_manager/configs/factory.py +26 -1
  28. invokeai/backend/model_manager/configs/lora.py +43 -1
  29. invokeai/backend/model_manager/configs/main.py +113 -0
  30. invokeai/backend/model_manager/configs/qwen3_encoder.py +156 -0
  31. invokeai/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/custom_diffusers_rms_norm.py +40 -0
  32. invokeai/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/custom_layer_norm.py +25 -0
  33. invokeai/backend/model_manager/load/model_cache/torch_module_autocast/torch_module_autocast.py +11 -2
  34. invokeai/backend/model_manager/load/model_loaders/lora.py +11 -0
  35. invokeai/backend/model_manager/load/model_loaders/z_image.py +935 -0
  36. invokeai/backend/model_manager/load/model_util.py +6 -1
  37. invokeai/backend/model_manager/metadata/metadata_base.py +12 -5
  38. invokeai/backend/model_manager/model_on_disk.py +3 -0
  39. invokeai/backend/model_manager/starter_models.py +70 -0
  40. invokeai/backend/model_manager/taxonomy.py +5 -0
  41. invokeai/backend/model_manager/util/select_hf_files.py +23 -8
  42. invokeai/backend/patches/layer_patcher.py +34 -16
  43. invokeai/backend/patches/layers/lora_layer_base.py +2 -1
  44. invokeai/backend/patches/lora_conversions/flux_aitoolkit_lora_conversion_utils.py +17 -2
  45. invokeai/backend/patches/lora_conversions/flux_xlabs_lora_conversion_utils.py +92 -0
  46. invokeai/backend/patches/lora_conversions/formats.py +5 -0
  47. invokeai/backend/patches/lora_conversions/z_image_lora_constants.py +8 -0
  48. invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py +155 -0
  49. invokeai/backend/quantization/gguf/ggml_tensor.py +27 -4
  50. invokeai/backend/quantization/gguf/loaders.py +47 -12
  51. invokeai/backend/stable_diffusion/diffusion/conditioning_data.py +13 -0
  52. invokeai/backend/util/devices.py +25 -0
  53. invokeai/backend/util/hotfixes.py +2 -2
  54. invokeai/backend/z_image/__init__.py +16 -0
  55. invokeai/backend/z_image/extensions/__init__.py +1 -0
  56. invokeai/backend/z_image/extensions/regional_prompting_extension.py +207 -0
  57. invokeai/backend/z_image/text_conditioning.py +74 -0
  58. invokeai/backend/z_image/z_image_control_adapter.py +238 -0
  59. invokeai/backend/z_image/z_image_control_transformer.py +643 -0
  60. invokeai/backend/z_image/z_image_controlnet_extension.py +531 -0
  61. invokeai/backend/z_image/z_image_patchify_utils.py +135 -0
  62. invokeai/backend/z_image/z_image_transformer_patch.py +234 -0
  63. invokeai/frontend/web/dist/assets/App-CYhlZO3Q.js +161 -0
  64. invokeai/frontend/web/dist/assets/{browser-ponyfill-CN1j0ARZ.js → browser-ponyfill-DHZxq1nk.js} +1 -1
  65. invokeai/frontend/web/dist/assets/index-dgSJAY--.js +530 -0
  66. invokeai/frontend/web/dist/index.html +1 -1
  67. invokeai/frontend/web/dist/locales/de.json +24 -6
  68. invokeai/frontend/web/dist/locales/en.json +70 -1
  69. invokeai/frontend/web/dist/locales/es.json +0 -5
  70. invokeai/frontend/web/dist/locales/fr.json +0 -6
  71. invokeai/frontend/web/dist/locales/it.json +17 -64
  72. invokeai/frontend/web/dist/locales/ja.json +379 -44
  73. invokeai/frontend/web/dist/locales/ru.json +0 -6
  74. invokeai/frontend/web/dist/locales/vi.json +7 -54
  75. invokeai/frontend/web/dist/locales/zh-CN.json +0 -6
  76. invokeai/version/invokeai_version.py +1 -1
  77. {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/METADATA +3 -3
  78. {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/RECORD +84 -60
  79. invokeai/frontend/web/dist/assets/App-Cn9UyjoV.js +0 -161
  80. invokeai/frontend/web/dist/assets/index-BDrf9CL-.js +0 -530
  81. {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/WHEEL +0 -0
  82. {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/entry_points.txt +0 -0
  83. {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/licenses/LICENSE +0 -0
  84. {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
  85. {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
  86. {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/top_level.txt +0 -0
invokeai/backend/model_manager/load/model_util.py
@@ -10,7 +10,7 @@ import onnxruntime as ort
  import torch
  from diffusers.pipelines.pipeline_utils import DiffusionPipeline
  from diffusers.schedulers.scheduling_utils import SchedulerMixin
- from transformers import CLIPTokenizer, T5Tokenizer, T5TokenizerFast
+ from transformers import CLIPTokenizer, PreTrainedTokenizerBase, T5Tokenizer, T5TokenizerFast

  from invokeai.backend.image_util.depth_anything.depth_anything_pipeline import DepthAnythingPipeline
  from invokeai.backend.image_util.grounding_dino.grounding_dino_pipeline import GroundingDinoPipeline
@@ -73,6 +73,10 @@ def calc_model_size_by_data(logger: logging.Logger, model: AnyModel) -> int:
          # relative to the text encoder that it's used with, so shouldn't matter too much, but we should fix this at some
          # point.
          return len(model)
+     elif isinstance(model, PreTrainedTokenizerBase):
+         # Catch-all for other tokenizer types (e.g., Qwen2Tokenizer, Qwen3Tokenizer).
+         # Tokenizers are small relative to models, so returning 0 is acceptable.
+         return 0
      else:
          # TODO(ryand): Promote this from a log to an exception once we are confident that we are handling all of the
          # supported model types.
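
The new branch relies on PreTrainedTokenizerBase being the common ancestor of every concrete Hugging Face tokenizer class. A standalone sketch of that relationship (not the InvokeAI function itself):

    from transformers import CLIPTokenizer, PreTrainedTokenizerBase, T5TokenizerFast

    # The class-specific isinstance checks earlier in calc_model_size_by_data still
    # match first; the base-class check only has to catch tokenizers they miss,
    # such as the Qwen2/Qwen3 tokenizers used by the new Z-Image encoder.
    assert issubclass(CLIPTokenizer, PreTrainedTokenizerBase)
    assert issubclass(T5TokenizerFast, PreTrainedTokenizerBase)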
@@ -156,6 +160,7 @@ def calc_model_size_by_fs(model_path: Path, subfolder: Optional[str] = None, var
          (".msgpack",), # flax
          (".ckpt",), # tf
          (".h5",), # tf2
+         (".gguf",), # gguf quantized
      ]

      for file_format in formats:
invokeai/backend/model_manager/metadata/metadata_base.py
@@ -95,13 +95,15 @@ class HuggingFaceMetadata(ModelMetadataWithFiles):
          self,
          variant: Optional[ModelRepoVariant] = None,
          subfolder: Optional[Path] = None,
+         subfolders: Optional[List[Path]] = None,
          session: Optional[Session] = None,
      ) -> List[RemoteModelFile]:
          """
-         Return list of downloadable files, filtering by variant and subfolder, if any.
+         Return list of downloadable files, filtering by variant and subfolder(s), if any.

          :param variant: Return model files needed to reconstruct the indicated variant
-         :param subfolder: Return model files from the designated subfolder only
+         :param subfolder: Return model files from the designated subfolder only (deprecated, use subfolders)
+         :param subfolders: Return model files from the designated subfolders
          :param session: A request.Session object used for internet-free testing

          Note that there is special variant-filtering behavior here:
@@ -111,10 +113,15 @@ class HuggingFaceMetadata(ModelMetadataWithFiles):
          session = session or Session()
          configure_http_backend(backend_factory=lambda: session) # used in testing

-         paths = filter_files([x.path for x in self.files], variant, subfolder) # all files in the model
-         prefix = f"{subfolder}/" if subfolder else ""
+         paths = filter_files([x.path for x in self.files], variant, subfolder, subfolders) # all files in the model
+
+         # Determine prefix for model_index.json check - only applies for single subfolder
+         prefix = ""
+         if subfolder and not subfolders:
+             prefix = f"{subfolder}/"
+
          # the next step reads model_index.json to determine which subdirectories belong
-         # to the model
+         # to the model (only for single subfolder case)
          if Path(f"{prefix}model_index.json") in paths:
              url = hf_hub_url(self.id, filename="model_index.json", subfolder=str(subfolder) if subfolder else None)
              resp = session.get(url)
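
A hedged usage sketch of the new subfolders parameter. The enclosing method's name is not visible in this hunk, so download_urls is an assumption here, as are the repo layout and variable names:

    from pathlib import Path

    # hf_metadata: a HuggingFaceMetadata instance for a repo such as
    # "Tongyi-MAI/Z-Image-Turbo" (hypothetical call site).
    files = hf_metadata.download_urls(subfolders=[Path("text_encoder"), Path("tokenizer")])
    # Only files under text_encoder/ and tokenizer/ are returned. Per the hunk
    # above, the model_index.json prefix check is skipped for multi-subfolder calls.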
invokeai/backend/model_manager/model_on_disk.py
@@ -84,6 +84,9 @@ class ModelOnDisk:

          path = self.resolve_weight_file(path)

+         if path in self._state_dict_cache:
+             return self._state_dict_cache[path]
+
          with SilenceWarnings():
              if path.suffix.endswith((".ckpt", ".pt", ".pth", ".bin")):
                  scan_result = scan_file_path(path)
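
The early return added above is plain memoization keyed on the resolved weight path. A standalone sketch of the pattern (class and parameter names hypothetical):

    from pathlib import Path
    from typing import Any, Callable, Dict

    class WeightCache:
        def __init__(self) -> None:
            self._state_dict_cache: Dict[Path, Any] = {}

        def load(self, path: Path, read_fn: Callable[[Path], Any]) -> Any:
            # Pay the disk read (and malware scan) cost only once per path.
            if path in self._state_dict_cache:
                return self._state_dict_cache[path]
            state_dict = read_fn(path)
            self._state_dict_cache[path] = state_dict
            return state_dict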
invokeai/backend/model_manager/starter_models.py
@@ -690,6 +690,69 @@ flux_fill = StarterModel(
  )
  # endregion

+ # region Z-Image
+ z_image_qwen3_encoder = StarterModel(
+     name="Z-Image Qwen3 Text Encoder",
+     base=BaseModelType.Any,
+     source="Tongyi-MAI/Z-Image-Turbo::text_encoder+tokenizer",
+     description="Qwen3 4B text encoder with tokenizer for Z-Image (full precision). ~8GB",
+     type=ModelType.Qwen3Encoder,
+ )
+
+ z_image_qwen3_encoder_quantized = StarterModel(
+     name="Z-Image Qwen3 Text Encoder (quantized)",
+     base=BaseModelType.Any,
+     source="https://huggingface.co/worstplayer/Z-Image_Qwen_3_4b_text_encoder_GGUF/resolve/main/Qwen_3_4b-Q6_K.gguf",
+     description="Qwen3 4B text encoder for Z-Image quantized to GGUF Q6_K format. ~3.3GB",
+     type=ModelType.Qwen3Encoder,
+     format=ModelFormat.GGUFQuantized,
+ )
+
+ z_image_turbo = StarterModel(
+     name="Z-Image Turbo",
+     base=BaseModelType.ZImage,
+     source="Tongyi-MAI/Z-Image-Turbo",
+     description="Z-Image Turbo - fast 6B parameter text-to-image model with 8 inference steps. Supports bilingual prompts (English & Chinese). ~13GB",
+     type=ModelType.Main,
+ )
+
+ z_image_turbo_quantized = StarterModel(
+     name="Z-Image Turbo (quantized)",
+     base=BaseModelType.ZImage,
+     source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q4_K.gguf",
+     description="Z-Image Turbo quantized to GGUF Q4_K format. Requires separate Qwen3 text encoder. ~4GB",
+     type=ModelType.Main,
+     format=ModelFormat.GGUFQuantized,
+     dependencies=[z_image_qwen3_encoder_quantized],
+ )
+
+ z_image_turbo_q8 = StarterModel(
+     name="Z-Image Turbo (Q8)",
+     base=BaseModelType.ZImage,
+     source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q8_0.gguf",
+     description="Z-Image Turbo quantized to GGUF Q8_0 format. Higher quality, larger size. Requires separate Qwen3 text encoder. ~6.6GB",
+     type=ModelType.Main,
+     format=ModelFormat.GGUFQuantized,
+     dependencies=[z_image_qwen3_encoder_quantized],
+ )
+
+ z_image_controlnet_union = StarterModel(
+     name="Z-Image ControlNet Union",
+     base=BaseModelType.ZImage,
+     source="https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.1/resolve/main/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.safetensors",
+     description="Unified ControlNet for Z-Image Turbo supporting Canny, HED, Depth, Pose, MLSD, and Inpainting modes.",
+     type=ModelType.ControlNet,
+ )
+
+ z_image_controlnet_tile = StarterModel(
+     name="Z-Image ControlNet Tile",
+     base=BaseModelType.ZImage,
+     source="https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.1/resolve/main/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.safetensors",
+     description="Dedicated Tile ControlNet for Z-Image Turbo. Useful for upscaling and adding detail. ~6.7GB",
+     type=ModelType.ControlNet,
+ )
+ # endregion
+
  # List of starter models, displayed on the frontend.
  # The order/sort of this list is not changed by the frontend - set it how you want it here.
  STARTER_MODELS: list[StarterModel] = [
@@ -766,6 +829,13 @@ STARTER_MODELS: list[StarterModel] = [
      cogview4,
      flux_krea,
      flux_krea_quantized,
+     z_image_turbo,
+     z_image_turbo_quantized,
+     z_image_turbo_q8,
+     z_image_qwen3_encoder,
+     z_image_qwen3_encoder_quantized,
+     z_image_controlnet_union,
+     z_image_controlnet_tile,
  ]

  sd1_bundle: list[StarterModel] = [
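
The "Tongyi-MAI/Z-Image-Turbo::text_encoder+tokenizer" source above uses a repo_id::subfolder+subfolder spec. The real parser lives in model_install_common.py (also changed in this release but not shown in this section), so the following split is only an illustrative assumption:

    def parse_source(source: str) -> tuple[str, list[str]]:
        # "repo::a+b" -> ("repo", ["a", "b"]); a bare repo id yields no subfolders.
        repo_id, _, subfolder_spec = source.partition("::")
        subfolders = subfolder_spec.split("+") if subfolder_spec else []
        return repo_id, subfolders

    assert parse_source("Tongyi-MAI/Z-Image-Turbo::text_encoder+tokenizer") == (
        "Tongyi-MAI/Z-Image-Turbo",
        ["text_encoder", "tokenizer"],
    )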
invokeai/backend/model_manager/taxonomy.py
@@ -48,6 +48,8 @@ class BaseModelType(str, Enum):
      """Indicates the model is associated with FLUX.1 model architecture, including FLUX Dev, Schnell and Fill."""
      CogView4 = "cogview4"
      """Indicates the model is associated with CogView 4 model architecture."""
+     ZImage = "z-image"
+     """Indicates the model is associated with Z-Image model architecture, including Z-Image-Turbo."""
      Unknown = "unknown"
      """Indicates the model's base architecture is unknown."""

@@ -67,6 +69,7 @@ class ModelType(str, Enum):
      CLIPEmbed = "clip_embed"
      T2IAdapter = "t2i_adapter"
      T5Encoder = "t5_encoder"
+     Qwen3Encoder = "qwen3_encoder"
      SpandrelImageToImage = "spandrel_image_to_image"
      SigLIP = "siglip"
      FluxRedux = "flux_redux"
@@ -126,6 +129,7 @@ class ModelFormat(str, Enum):
      EmbeddingFolder = "embedding_folder"
      InvokeAI = "invokeai"
      T5Encoder = "t5_encoder"
+     Qwen3Encoder = "qwen3_encoder"
      BnbQuantizedLlmInt8b = "bnb_quantized_int8b"
      BnbQuantizednf4b = "bnb_quantized_nf4b"
      GGUFQuantized = "gguf_quantized"
@@ -167,6 +171,7 @@ class FluxLoRAFormat(str, Enum):
      OneTrainer = "flux.onetrainer"
      Control = "flux.control"
      AIToolkit = "flux.aitoolkit"
+     XLabs = "flux.xlabs"


  AnyVariant: TypeAlias = Union[ModelVariantType, ClipVariantType, FluxVariantType]
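
Since these enums subclass str, the new members round-trip through their serialized values, which is how model configs reference them:

    from invokeai.backend.model_manager.taxonomy import BaseModelType, ModelFormat, ModelType

    assert BaseModelType("z-image") is BaseModelType.ZImage
    assert ModelType("qwen3_encoder") is ModelType.Qwen3Encoder
    assert ModelFormat("qwen3_encoder") is ModelFormat.Qwen3Encoder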
invokeai/backend/model_manager/util/select_hf_files.py
@@ -24,12 +24,14 @@ def filter_files(
      files: List[Path],
      variant: Optional[ModelRepoVariant] = None,
      subfolder: Optional[Path] = None,
+     subfolders: Optional[List[Path]] = None,
  ) -> List[Path]:
      """
      Take a list of files in a HuggingFace repo root and return paths to files needed to load the model.

      :param files: List of files relative to the repo root.
-     :param subfolder: Filter by the indicated subfolder.
+     :param subfolder: Filter by the indicated subfolder (deprecated, use subfolders instead).
+     :param subfolders: Filter by multiple subfolders. Files from any of these subfolders will be included.
      :param variant: Filter by files belonging to a particular variant, such as fp16.

      The file list can be obtained from the `files` field of HuggingFaceMetadata,
@@ -37,11 +39,24 @@ def filter_files(
      """
      variant = variant or ModelRepoVariant.Default
      paths: List[Path] = []
-     root = files[0].parts[0]
+
+     if not files:
+         return []
+
+     root = files[0].parts[0] if files[0].parts else Path(".")
+
+     # Build list of subfolders to filter by
+     filter_subfolders: List[Path] = []
+     if subfolders:
+         filter_subfolders = subfolders
+     elif subfolder:
+         filter_subfolders = [subfolder]

      # if the subfolder is a single file, then bypass the selection and just return it
-     if subfolder and subfolder.suffix in [".safetensors", ".bin", ".onnx", ".xml", ".pth", ".pt", ".ckpt", ".msgpack"]:
-         return [root / subfolder]
+     if len(filter_subfolders) == 1:
+         sf = filter_subfolders[0]
+         if sf.suffix in [".safetensors", ".bin", ".onnx", ".xml", ".pth", ".pt", ".ckpt", ".msgpack"]:
+             return [root / sf]

      # Start by filtering on model file extensions, discarding images, docs, etc
      for file in files:
@@ -66,10 +81,10 @@ def filter_files(
          elif re.search(r"model.*\.(safetensors|bin|onnx|xml|pth|pt|ckpt|msgpack)$", file.name):
              paths.append(file)

-     # limit search to subfolder if requested
-     if subfolder:
-         subfolder = root / subfolder
-         paths = [x for x in paths if Path(subfolder) in x.parents]
+     # limit search to subfolder(s) if requested
+     if filter_subfolders:
+         absolute_subfolders = [root / sf for sf in filter_subfolders]
+         paths = [x for x in paths if any(Path(sf) in x.parents for sf in absolute_subfolders)]

      # _filter_by_variant uniquifies the paths and returns a set
      return sorted(_filter_by_variant(paths, variant))
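
A toy run of the multi-subfolder restriction above (file paths invented; the extension allow-list step that precedes it in filter_files is omitted):

    from pathlib import Path

    files = [
        Path("repo/text_encoder/model.safetensors"),
        Path("repo/tokenizer/tokenizer.json"),
        Path("repo/vae/diffusion_pytorch_model.safetensors"),
    ]
    root = files[0].parts[0]
    filter_subfolders = [Path("text_encoder"), Path("tokenizer")]
    absolute_subfolders = [root / sf for sf in filter_subfolders]
    kept = [x for x in files if any(Path(sf) in x.parents for sf in absolute_subfolders)]
    # kept -> only the text_encoder/ and tokenizer/ files; vae/ is dropped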
invokeai/backend/patches/layer_patcher.py
@@ -86,7 +86,8 @@ class LayerPatcher:
          # submodules. If the layer keys do not contain a dot, then they are flattened, meaning that all '.' have been
          # replaced with '_'. Non-flattened keys are preferred, because they allow submodules to be accessed directly
          # without searching, but some legacy code still uses flattened keys.
-         layer_keys_are_flattened = "." not in next(iter(patch.layers.keys()))
+         first_key = next(iter(patch.layers.keys()))
+         layer_keys_are_flattened = "." not in first_key

          prefix_len = len(prefix)

@@ -174,28 +175,45 @@ class LayerPatcher:

          # TODO(ryand): Using torch.autocast(...) over explicit casting may offer a speed benefit on CUDA
          # devices here. Experimentally, it was found to be very slow on CPU. More investigation needed.
-         for param_name, param_weight in patch.get_parameters(
-             dict(module_to_patch.named_parameters(recurse=False)), weight=patch_weight
-         ).items():
+         params_dict = patch.get_parameters(dict(module_to_patch.named_parameters(recurse=False)), weight=patch_weight)
+         if not params_dict:
+             logger = InvokeAILogger.get_logger(LayerPatcher.__name__)
+             logger.warning(f"LoRA patch returned no parameters for module: {module_to_patch_key}")
+             return
+
+         for param_name, param_weight in params_dict.items():
              param_key = module_to_patch_key + "." + param_name
              module_param = module_to_patch.get_parameter(param_name)

              # Save original weight
              original_weights.save(param_key, module_param)

-             # HACK(ryand): This condition is only necessary to handle layers in FLUX control LoRAs that change the
-             # shape of the original layer.
+             # Handle layers that change the shape of the original layer.
+             # FLUX control LoRAs intentionally expand certain layers - we pad the original weight with zeros.
+             # For other LoRAs (e.g., Z-Image with architecture mismatch), skip incompatible layers with a warning.
              if module_param.nelement() != param_weight.nelement():
-                 assert isinstance(patch, FluxControlLoRALayer)
-                 expanded_weight = pad_with_zeros(module_param, param_weight.shape)
-                 setattr(
-                     module_to_patch,
-                     param_name,
-                     torch.nn.Parameter(expanded_weight, requires_grad=module_param.requires_grad),
-                 )
-                 module_param = expanded_weight
-
-             module_param += param_weight.to(dtype=dtype)
+                 if isinstance(patch, FluxControlLoRALayer):
+                     # FLUX Control LoRAs intentionally expand layers - pad with zeros
+                     expanded_weight = pad_with_zeros(module_param, param_weight.shape)
+                     setattr(
+                         module_to_patch,
+                         param_name,
+                         torch.nn.Parameter(expanded_weight, requires_grad=module_param.requires_grad),
+                     )
+                     module_param = expanded_weight
+                 else:
+                     # For other LoRAs, shape mismatch indicates architecture incompatibility - skip the layer
+                     logger = InvokeAILogger.get_logger(LayerPatcher.__name__)
+                     logger.warning(
+                         f"Skipping LoRA layer '{module_to_patch_key}.{param_name}' due to shape mismatch: "
+                         f"model has {module_param.nelement()} elements, LoRA expects {param_weight.nelement()}. "
+                         "This LoRA may be incompatible with this model architecture."
+                     )
+                     continue
+
+             # Convert param_weight to the correct device and dtype, then apply to model weights
+             param_weight_converted = param_weight.to(device=device, dtype=dtype)
+             module_param.data.copy_(module_param.data + param_weight_converted)

          patch.to(device=TorchDevice.CPU_DEVICE)
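
The final two added lines replace the old `module_param += param_weight.to(dtype=dtype)` with an explicit cast-then-copy: the LoRA delta is moved to the target device and dtype first, and the add is written through `.data` so the existing Parameter object is kept. A minimal standalone sketch of that pattern:

    import torch

    param = torch.nn.Parameter(torch.zeros(4, 4))
    delta = torch.ones(4, 4, dtype=torch.float16)  # LoRA delta in lower precision

    # Cast to the parameter's device/dtype, then add in place without
    # replacing the Parameter object itself.
    converted = delta.to(device=param.device, dtype=param.dtype)
    param.data.copy_(param.data + converted)

    assert torch.equal(param.data, torch.ones(4, 4))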
invokeai/backend/patches/layers/lora_layer_base.py
@@ -60,7 +60,8 @@ class LoRALayerBase(BaseLayerPatch):

      def get_parameters(self, orig_parameters: dict[str, torch.Tensor], weight: float) -> dict[str, torch.Tensor]:
          scale = self.scale()
-         params = {"weight": self.get_weight(orig_parameters["weight"]) * (weight * scale)}
+         lora_weight = self.get_weight(orig_parameters["weight"])
+         params = {"weight": lora_weight * (weight * scale)}
          bias = self.get_bias(orig_parameters.get("bias", None))
          if bias is not None:
              params["bias"] = bias * (weight * scale)
invokeai/backend/patches/lora_conversions/flux_aitoolkit_lora_conversion_utils.py
@@ -12,18 +12,33 @@ from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
  from invokeai.backend.util import InvokeAILogger


+ def _has_flux_layer_structure(state_dict: dict[str | int, Any]) -> bool:
+     """Check if state dict has Flux-specific layer patterns (double_blocks/single_blocks)."""
+     return any(
+         k.startswith("diffusion_model.double_blocks.") or k.startswith("diffusion_model.single_blocks.")
+         for k in state_dict.keys()
+         if isinstance(k, str)
+     )
+
+
  def is_state_dict_likely_in_flux_aitoolkit_format(
      state_dict: dict[str | int, Any],
      metadata: dict[str, Any] | None = None,
  ) -> bool:
+     # Always check for Flux-specific layer structure first
+     # This prevents misidentifying Z-Image LoRAs (which use diffusion_model.layers.X) as Flux
+     if not _has_flux_layer_structure(state_dict):
+         return False
+
      if metadata:
          try:
              software = json.loads(metadata.get("software", "{}"))
          except json.JSONDecodeError:
              return False
          return software.get("name") == "ai-toolkit"
-     # metadata got lost somewhere
-     return any("diffusion_model" == k.split(".", 1)[0] for k in state_dict.keys() if isinstance(k, str))
+
+     # No metadata - if it has Flux layer structure, assume it's AI Toolkit format
+     return True


  @dataclass
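
Toy key sets showing what the structural gate distinguishes (keys shortened for illustration; the Z-Image shape follows the diffusion_model.layers.X pattern named in the comment above):

    flux_sd = {"diffusion_model.double_blocks.0.img_attn.qkv.lora_down.weight": None}
    z_image_sd = {"diffusion_model.layers.0.attention.qkv.lora_A.weight": None}

    # _has_flux_layer_structure(flux_sd)    -> True  (double_blocks prefix)
    # _has_flux_layer_structure(z_image_sd) -> False (falls through to Z-Image detection)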
invokeai/backend/patches/lora_conversions/flux_xlabs_lora_conversion_utils.py
@@ -0,0 +1,92 @@
+ import re
+ from typing import Any, Dict
+
+ import torch
+
+ from invokeai.backend.patches.layers.base_layer_patch import BaseLayerPatch
+ from invokeai.backend.patches.layers.utils import any_lora_layer_from_state_dict
+ from invokeai.backend.patches.lora_conversions.flux_lora_constants import FLUX_LORA_TRANSFORMER_PREFIX
+ from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
+
+ # A regex pattern that matches all of the transformer keys in the xlabs FLUX LoRA format.
+ # Example keys:
+ # double_blocks.0.processor.qkv_lora1.down.weight
+ # double_blocks.0.processor.qkv_lora1.up.weight
+ # double_blocks.0.processor.proj_lora1.down.weight
+ # double_blocks.0.processor.proj_lora1.up.weight
+ # double_blocks.0.processor.qkv_lora2.down.weight
+ # double_blocks.0.processor.proj_lora2.up.weight
+ FLUX_XLABS_KEY_REGEX = r"double_blocks\.(\d+)\.processor\.(qkv|proj)_lora([12])\.(down|up)\.weight"
+
+
+ def is_state_dict_likely_in_flux_xlabs_format(state_dict: dict[str | int, Any]) -> bool:
+     """Checks if the provided state dict is likely in the xlabs FLUX LoRA format.
+
+     The xlabs format is characterized by keys matching the pattern:
+     double_blocks.{block_idx}.processor.{qkv|proj}_lora{1|2}.{down|up}.weight
+
+     Where:
+     - lora1 corresponds to the image attention stream (img_attn)
+     - lora2 corresponds to the text attention stream (txt_attn)
+     """
+     if not state_dict:
+         return False
+
+     # Check that all keys match the xlabs pattern
+     for key in state_dict.keys():
+         if not isinstance(key, str):
+             continue
+         if not re.match(FLUX_XLABS_KEY_REGEX, key):
+             return False
+
+     # Ensure we have at least some valid keys
+     return any(isinstance(k, str) and re.match(FLUX_XLABS_KEY_REGEX, k) for k in state_dict.keys())
+
+
+ def lora_model_from_flux_xlabs_state_dict(state_dict: Dict[str, torch.Tensor]) -> ModelPatchRaw:
+     """Converts an xlabs FLUX LoRA state dict to the InvokeAI ModelPatchRaw format.
+
+     The xlabs format uses:
+     - lora1 for image attention stream (img_attn)
+     - lora2 for text attention stream (txt_attn)
+     - qkv for query/key/value projection
+     - proj for output projection
+
+     Key mapping:
+     - double_blocks.X.processor.qkv_lora1 -> double_blocks.X.img_attn.qkv
+     - double_blocks.X.processor.proj_lora1 -> double_blocks.X.img_attn.proj
+     - double_blocks.X.processor.qkv_lora2 -> double_blocks.X.txt_attn.qkv
+     - double_blocks.X.processor.proj_lora2 -> double_blocks.X.txt_attn.proj
+     """
+     # Group keys by layer (without the .down.weight/.up.weight suffix)
+     grouped_state_dict: dict[str, dict[str, torch.Tensor]] = {}
+
+     for key, value in state_dict.items():
+         match = re.match(FLUX_XLABS_KEY_REGEX, key)
+         if not match:
+             raise ValueError(f"Key '{key}' does not match the expected pattern for xlabs FLUX LoRA weights.")
+
+         block_idx = match.group(1)
+         component = match.group(2) # qkv or proj
+         lora_stream = match.group(3) # 1 or 2
+         direction = match.group(4) # down or up
+
+         # Map lora1 -> img_attn, lora2 -> txt_attn
+         attn_type = "img_attn" if lora_stream == "1" else "txt_attn"
+
+         # Create the InvokeAI-style layer key
+         layer_key = f"double_blocks.{block_idx}.{attn_type}.{component}"
+
+         if layer_key not in grouped_state_dict:
+             grouped_state_dict[layer_key] = {}
+
+         # Map down/up to lora_down/lora_up
+         param_name = f"lora_{direction}.weight"
+         grouped_state_dict[layer_key][param_name] = value
+
+     # Create LoRA layers
+     layers: dict[str, BaseLayerPatch] = {}
+     for layer_key, layer_state_dict in grouped_state_dict.items():
+         layers[FLUX_LORA_TRANSFORMER_PREFIX + layer_key] = any_lora_layer_from_state_dict(layer_state_dict)
+
+     return ModelPatchRaw(layers=layers)
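
A hedged round-trip sketch of the converter (tensor shapes are illustrative rank-4 placeholders, not the real FLUX dimensions):

    import torch

    sd = {
        "double_blocks.0.processor.qkv_lora1.down.weight": torch.zeros(4, 3072),
        "double_blocks.0.processor.qkv_lora1.up.weight": torch.zeros(9216, 4),
    }
    assert is_state_dict_likely_in_flux_xlabs_format(sd)
    patch = lora_model_from_flux_xlabs_state_dict(sd)
    # -> one layer keyed FLUX_LORA_TRANSFORMER_PREFIX + "double_blocks.0.img_attn.qkv",
    #    with down/up remapped to lora_down.weight / lora_up.weight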
invokeai/backend/patches/lora_conversions/formats.py
@@ -14,6 +14,9 @@ from invokeai.backend.patches.lora_conversions.flux_kohya_lora_conversion_utils
  from invokeai.backend.patches.lora_conversions.flux_onetrainer_lora_conversion_utils import (
      is_state_dict_likely_in_flux_onetrainer_format,
  )
+ from invokeai.backend.patches.lora_conversions.flux_xlabs_lora_conversion_utils import (
+     is_state_dict_likely_in_flux_xlabs_format,
+ )


  def flux_format_from_state_dict(
@@ -30,5 +33,7 @@ def flux_format_from_state_dict(
          return FluxLoRAFormat.Control
      elif is_state_dict_likely_in_flux_aitoolkit_format(state_dict, metadata):
          return FluxLoRAFormat.AIToolkit
+     elif is_state_dict_likely_in_flux_xlabs_format(state_dict):
+         return FluxLoRAFormat.XLabs
      else:
          return None
invokeai/backend/patches/lora_conversions/z_image_lora_constants.py
@@ -0,0 +1,8 @@
+ # Z-Image LoRA prefix constants
+ # These prefixes are used for key mapping when applying LoRA patches to Z-Image models
+
+ # Prefix for Z-Image transformer (S3-DiT architecture) LoRA layers
+ Z_IMAGE_LORA_TRANSFORMER_PREFIX = "lora_transformer-"
+
+ # Prefix for Qwen3 text encoder LoRA layers
+ Z_IMAGE_LORA_QWEN3_PREFIX = "lora_qwen3-"
invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py
@@ -0,0 +1,155 @@
+ """Z-Image LoRA conversion utilities.
+
+ Z-Image uses S3-DiT transformer architecture with Qwen3 text encoder.
+ LoRAs for Z-Image typically follow the diffusers PEFT format.
+ """
+
+ from typing import Dict
+
+ import torch
+
+ from invokeai.backend.patches.layers.base_layer_patch import BaseLayerPatch
+ from invokeai.backend.patches.layers.utils import any_lora_layer_from_state_dict
+ from invokeai.backend.patches.lora_conversions.z_image_lora_constants import (
+     Z_IMAGE_LORA_QWEN3_PREFIX,
+     Z_IMAGE_LORA_TRANSFORMER_PREFIX,
+ )
+ from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
+
+
+ def is_state_dict_likely_z_image_lora(state_dict: dict[str | int, torch.Tensor]) -> bool:
+     """Checks if the provided state dict is likely a Z-Image LoRA.
+
+     Z-Image LoRAs can have keys for transformer and/or Qwen3 text encoder.
+     They may use various prefixes depending on the training framework.
+     """
+     str_keys = [k for k in state_dict.keys() if isinstance(k, str)]
+
+     # Check for Z-Image transformer keys (S3-DiT architecture)
+     # Various training frameworks use different prefixes
+     has_transformer_keys = any(
+         k.startswith(
+             (
+                 "transformer.",
+                 "base_model.model.transformer.",
+                 "diffusion_model.",
+             )
+         )
+         for k in str_keys
+     )
+
+     # Check for Qwen3 text encoder keys
+     has_qwen3_keys = any(k.startswith(("text_encoder.", "base_model.model.text_encoder.")) for k in str_keys)
+
+     return has_transformer_keys or has_qwen3_keys
+
+
+ def lora_model_from_z_image_state_dict(
+     state_dict: Dict[str, torch.Tensor], alpha: float | None = None
+ ) -> ModelPatchRaw:
+     """Convert a Z-Image LoRA state dict to a ModelPatchRaw.
+
+     Z-Image LoRAs can contain layers for:
+     - Transformer (S3-DiT architecture)
+     - Qwen3 text encoder
+
+     Z-Image LoRAs may use various key prefixes depending on how they were trained:
+     - "transformer." or "base_model.model.transformer." for diffusers PEFT format
+     - "diffusion_model." for some training frameworks
+     - "text_encoder." or "base_model.model.text_encoder." for Qwen3 encoder
+
+     Args:
+         state_dict: The LoRA state dict
+         alpha: The alpha value for LoRA scaling. If None, uses rank as alpha.
+
+     Returns:
+         A ModelPatchRaw containing the LoRA layers
+     """
+     layers: dict[str, BaseLayerPatch] = {}
+
+     # Group keys by layer
+     grouped_state_dict = _group_by_layer(state_dict)
+
+     for layer_key, layer_dict in grouped_state_dict.items():
+         # Convert PEFT format keys to internal format
+         values = _get_lora_layer_values(layer_dict, alpha)
+
+         # Determine the appropriate prefix based on the layer type and clean up the key
+         clean_key = layer_key
+
+         # Handle various transformer prefixes
+         transformer_prefixes = [
+             "base_model.model.transformer.diffusion_model.",
+             "base_model.model.transformer.",
+             "transformer.diffusion_model.",
+             "transformer.",
+             "diffusion_model.",
+         ]
+
+         # Handle text encoder prefixes
+         text_encoder_prefixes = [
+             "base_model.model.text_encoder.",
+             "text_encoder.",
+         ]
+
+         is_text_encoder = False
+
+         # Check and strip text encoder prefixes first
+         for prefix in text_encoder_prefixes:
+             if layer_key.startswith(prefix):
+                 clean_key = layer_key[len(prefix) :]
+                 is_text_encoder = True
+                 break
+
+         # If not text encoder, check transformer prefixes
+         if not is_text_encoder:
+             for prefix in transformer_prefixes:
+                 if layer_key.startswith(prefix):
+                     clean_key = layer_key[len(prefix) :]
+                     break
+
+         # Apply the appropriate internal prefix
+         if is_text_encoder:
+             final_key = f"{Z_IMAGE_LORA_QWEN3_PREFIX}{clean_key}"
+         else:
+             final_key = f"{Z_IMAGE_LORA_TRANSFORMER_PREFIX}{clean_key}"
+
+         layer = any_lora_layer_from_state_dict(values)
+         layers[final_key] = layer
+
+     return ModelPatchRaw(layers=layers)
+
+
+ def _get_lora_layer_values(layer_dict: dict[str, torch.Tensor], alpha: float | None) -> dict[str, torch.Tensor]:
+     """Convert layer dict keys from PEFT format to internal format."""
+     if "lora_A.weight" in layer_dict:
+         # PEFT format: lora_A.weight, lora_B.weight
+         values = {
+             "lora_down.weight": layer_dict["lora_A.weight"],
+             "lora_up.weight": layer_dict["lora_B.weight"],
+         }
+         if alpha is not None:
+             values["alpha"] = torch.tensor(alpha)
+         return values
+     elif "lora_down.weight" in layer_dict:
+         # Already in internal format
+         return layer_dict
+     else:
+         # Unknown format, return as-is
+         return layer_dict
+
+
+ def _group_by_layer(state_dict: Dict[str, torch.Tensor]) -> dict[str, dict[str, torch.Tensor]]:
+     """Groups the keys in the state dict by layer."""
+     layer_dict: dict[str, dict[str, torch.Tensor]] = {}
+     for key in state_dict:
+         if not isinstance(key, str):
+             continue
+         # Split the 'lora_A.weight' or 'lora_B.weight' suffix from the layer name.
+         parts = key.rsplit(".", maxsplit=2)
+         layer_name = parts[0]
+         key_name = ".".join(parts[1:])
+         if layer_name not in layer_dict:
+             layer_dict[layer_name] = {}
+         layer_dict[layer_name][key_name] = state_dict[key]
+     return layer_dict
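
A hedged usage sketch of the Z-Image converter (module paths invented; real S3-DiT layer names may differ):

    import torch

    sd = {
        "transformer.layers.0.attention.to_q.lora_A.weight": torch.zeros(8, 2048),
        "transformer.layers.0.attention.to_q.lora_B.weight": torch.zeros(2048, 8),
    }
    assert is_state_dict_likely_z_image_lora(sd)
    patch = lora_model_from_z_image_state_dict(sd, alpha=8.0)
    # -> one layer keyed "lora_transformer-layers.0.attention.to_q": the "transformer."
    #    prefix is stripped, PEFT lora_A/lora_B become lora_down/lora_up, and the
    #    explicit alpha is attached.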