InvokeAI 6.9.0rc3__py3-none-any.whl → 6.10.0rc1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
- invokeai/app/api/dependencies.py +2 -0
- invokeai/app/api/routers/model_manager.py +91 -2
- invokeai/app/api/routers/workflows.py +9 -0
- invokeai/app/invocations/fields.py +19 -0
- invokeai/app/invocations/image_to_latents.py +23 -5
- invokeai/app/invocations/latents_to_image.py +2 -25
- invokeai/app/invocations/metadata.py +9 -1
- invokeai/app/invocations/model.py +8 -0
- invokeai/app/invocations/primitives.py +12 -0
- invokeai/app/invocations/prompt_template.py +57 -0
- invokeai/app/invocations/z_image_control.py +112 -0
- invokeai/app/invocations/z_image_denoise.py +610 -0
- invokeai/app/invocations/z_image_image_to_latents.py +102 -0
- invokeai/app/invocations/z_image_latents_to_image.py +103 -0
- invokeai/app/invocations/z_image_lora_loader.py +153 -0
- invokeai/app/invocations/z_image_model_loader.py +135 -0
- invokeai/app/invocations/z_image_text_encoder.py +197 -0
- invokeai/app/services/model_install/model_install_common.py +14 -1
- invokeai/app/services/model_install/model_install_default.py +119 -19
- invokeai/app/services/model_records/model_records_base.py +12 -0
- invokeai/app/services/model_records/model_records_sql.py +17 -0
- invokeai/app/services/shared/graph.py +132 -77
- invokeai/app/services/workflow_records/workflow_records_base.py +8 -0
- invokeai/app/services/workflow_records/workflow_records_sqlite.py +42 -0
- invokeai/app/util/step_callback.py +3 -0
- invokeai/backend/model_manager/configs/controlnet.py +47 -1
- invokeai/backend/model_manager/configs/factory.py +26 -1
- invokeai/backend/model_manager/configs/lora.py +43 -1
- invokeai/backend/model_manager/configs/main.py +113 -0
- invokeai/backend/model_manager/configs/qwen3_encoder.py +156 -0
- invokeai/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/custom_diffusers_rms_norm.py +40 -0
- invokeai/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/custom_layer_norm.py +25 -0
- invokeai/backend/model_manager/load/model_cache/torch_module_autocast/torch_module_autocast.py +11 -2
- invokeai/backend/model_manager/load/model_loaders/lora.py +11 -0
- invokeai/backend/model_manager/load/model_loaders/z_image.py +935 -0
- invokeai/backend/model_manager/load/model_util.py +6 -1
- invokeai/backend/model_manager/metadata/metadata_base.py +12 -5
- invokeai/backend/model_manager/model_on_disk.py +3 -0
- invokeai/backend/model_manager/starter_models.py +70 -0
- invokeai/backend/model_manager/taxonomy.py +5 -0
- invokeai/backend/model_manager/util/select_hf_files.py +23 -8
- invokeai/backend/patches/layer_patcher.py +34 -16
- invokeai/backend/patches/layers/lora_layer_base.py +2 -1
- invokeai/backend/patches/lora_conversions/flux_aitoolkit_lora_conversion_utils.py +17 -2
- invokeai/backend/patches/lora_conversions/flux_xlabs_lora_conversion_utils.py +92 -0
- invokeai/backend/patches/lora_conversions/formats.py +5 -0
- invokeai/backend/patches/lora_conversions/z_image_lora_constants.py +8 -0
- invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py +155 -0
- invokeai/backend/quantization/gguf/ggml_tensor.py +27 -4
- invokeai/backend/quantization/gguf/loaders.py +47 -12
- invokeai/backend/stable_diffusion/diffusion/conditioning_data.py +13 -0
- invokeai/backend/util/devices.py +25 -0
- invokeai/backend/util/hotfixes.py +2 -2
- invokeai/backend/z_image/__init__.py +16 -0
- invokeai/backend/z_image/extensions/__init__.py +1 -0
- invokeai/backend/z_image/extensions/regional_prompting_extension.py +207 -0
- invokeai/backend/z_image/text_conditioning.py +74 -0
- invokeai/backend/z_image/z_image_control_adapter.py +238 -0
- invokeai/backend/z_image/z_image_control_transformer.py +643 -0
- invokeai/backend/z_image/z_image_controlnet_extension.py +531 -0
- invokeai/backend/z_image/z_image_patchify_utils.py +135 -0
- invokeai/backend/z_image/z_image_transformer_patch.py +234 -0
- invokeai/frontend/web/dist/assets/App-CYhlZO3Q.js +161 -0
- invokeai/frontend/web/dist/assets/{browser-ponyfill-CN1j0ARZ.js → browser-ponyfill-DHZxq1nk.js} +1 -1
- invokeai/frontend/web/dist/assets/index-dgSJAY--.js +530 -0
- invokeai/frontend/web/dist/index.html +1 -1
- invokeai/frontend/web/dist/locales/de.json +24 -6
- invokeai/frontend/web/dist/locales/en.json +70 -1
- invokeai/frontend/web/dist/locales/es.json +0 -5
- invokeai/frontend/web/dist/locales/fr.json +0 -6
- invokeai/frontend/web/dist/locales/it.json +17 -64
- invokeai/frontend/web/dist/locales/ja.json +379 -44
- invokeai/frontend/web/dist/locales/ru.json +0 -6
- invokeai/frontend/web/dist/locales/vi.json +7 -54
- invokeai/frontend/web/dist/locales/zh-CN.json +0 -6
- invokeai/version/invokeai_version.py +1 -1
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/METADATA +3 -3
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/RECORD +84 -60
- invokeai/frontend/web/dist/assets/App-Cn9UyjoV.js +0 -161
- invokeai/frontend/web/dist/assets/index-BDrf9CL-.js +0 -530
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/WHEEL +0 -0
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/entry_points.txt +0 -0
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/licenses/LICENSE +0 -0
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
- {invokeai-6.9.0rc3.dist-info → invokeai-6.10.0rc1.dist-info}/top_level.txt +0 -0

invokeai/backend/model_manager/load/model_util.py

@@ -10,7 +10,7 @@ import onnxruntime as ort
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers.scheduling_utils import SchedulerMixin
-from transformers import CLIPTokenizer, T5Tokenizer, T5TokenizerFast
+from transformers import CLIPTokenizer, PreTrainedTokenizerBase, T5Tokenizer, T5TokenizerFast
 
 from invokeai.backend.image_util.depth_anything.depth_anything_pipeline import DepthAnythingPipeline
 from invokeai.backend.image_util.grounding_dino.grounding_dino_pipeline import GroundingDinoPipeline
@@ -73,6 +73,10 @@ def calc_model_size_by_data(logger: logging.Logger, model: AnyModel) -> int:
         # relative to the text encoder that it's used with, so shouldn't matter too much, but we should fix this at some
         # point.
         return len(model)
+    elif isinstance(model, PreTrainedTokenizerBase):
+        # Catch-all for other tokenizer types (e.g., Qwen2Tokenizer, Qwen3Tokenizer).
+        # Tokenizers are small relative to models, so returning 0 is acceptable.
+        return 0
     else:
         # TODO(ryand): Promote this from a log to an exception once we are confident that we are handling all of the
         # supported model types.
@@ -156,6 +160,7 @@ def calc_model_size_by_fs(model_path: Path, subfolder: Optional[str] = None, var
         (".msgpack",),  # flax
         (".ckpt",),  # tf
         (".h5",),  # tf2
+        (".gguf",),  # gguf quantized
     ]
 
     for file_format in formats:
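
The `.gguf` addition means quantized single-file checkpoints are now counted when a model's size is estimated from the filesystem. A minimal standalone sketch of that suffix-priority idea (illustrative only, with hypothetical names; not the InvokeAI implementation):

    from pathlib import Path

    # Hypothetical format groups, tried in priority order; ".gguf" is now one of them.
    WEIGHT_FORMAT_GROUPS = [(".safetensors",), (".bin",), (".msgpack",), (".ckpt",), (".h5",), (".gguf",)]

    def estimate_model_size_on_disk(model_path: Path) -> int:
        """Total the bytes of the first weight format found under model_path."""
        for suffixes in WEIGHT_FORMAT_GROUPS:
            matching = [f for f in model_path.rglob("*") if f.is_file() and f.suffix in suffixes]
            if matching:
                return sum(f.stat().st_size for f in matching)
        return 0
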
invokeai/backend/model_manager/metadata/metadata_base.py

@@ -95,13 +95,15 @@ class HuggingFaceMetadata(ModelMetadataWithFiles):
         self,
         variant: Optional[ModelRepoVariant] = None,
         subfolder: Optional[Path] = None,
+        subfolders: Optional[List[Path]] = None,
         session: Optional[Session] = None,
     ) -> List[RemoteModelFile]:
         """
-        Return list of downloadable files, filtering by variant and subfolder, if any.
+        Return list of downloadable files, filtering by variant and subfolder(s), if any.
 
         :param variant: Return model files needed to reconstruct the indicated variant
-        :param subfolder: Return model files from the designated subfolder only
+        :param subfolder: Return model files from the designated subfolder only (deprecated, use subfolders)
+        :param subfolders: Return model files from the designated subfolders
         :param session: A request.Session object used for internet-free testing
 
         Note that there is special variant-filtering behavior here:
@@ -111,10 +113,15 @@ class HuggingFaceMetadata(ModelMetadataWithFiles):
         session = session or Session()
         configure_http_backend(backend_factory=lambda: session)  # used in testing
 
-        paths = filter_files([x.path for x in self.files], variant, subfolder)  # all files in the model
-        prefix = f"{subfolder}/" if subfolder else ""
+        paths = filter_files([x.path for x in self.files], variant, subfolder, subfolders)  # all files in the model
+
+        # Determine prefix for model_index.json check - only applies for single subfolder
+        prefix = ""
+        if subfolder and not subfolders:
+            prefix = f"{subfolder}/"
+
         # the next step reads model_index.json to determine which subdirectories belong
-        # to the model
+        # to the model (only for single subfolder case)
         if Path(f"{prefix}model_index.json") in paths:
             url = hf_hub_url(self.id, filename="model_index.json", subfolder=str(subfolder) if subfolder else None)
             resp = session.get(url)
invokeai/backend/model_manager/model_on_disk.py

@@ -84,6 +84,9 @@ class ModelOnDisk:
 
         path = self.resolve_weight_file(path)
 
+        if path in self._state_dict_cache:
+            return self._state_dict_cache[path]
+
         with SilenceWarnings():
             if path.suffix.endswith((".ckpt", ".pt", ".pth", ".bin")):
                 scan_result = scan_file_path(path)
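
The two added lines short-circuit repeated state-dict loads by caching on the resolved weight path. A minimal sketch of the same caching pattern, using a hypothetical loader class rather than ModelOnDisk itself:

    from pathlib import Path
    from typing import Any, Dict

    class CachedStateDictLoader:
        """Illustrative only: cache loaded state dicts keyed by resolved file path."""

        def __init__(self) -> None:
            self._state_dict_cache: Dict[Path, Dict[str, Any]] = {}

        def load_state_dict(self, path: Path) -> Dict[str, Any]:
            path = path.resolve()
            if path in self._state_dict_cache:
                return self._state_dict_cache[path]  # cache hit: skip re-reading the file
            state_dict = self._read_from_disk(path)
            self._state_dict_cache[path] = state_dict
            return state_dict

        def _read_from_disk(self, path: Path) -> Dict[str, Any]:
            # Stand-in for the real safetensors/torch loading logic.
            return {}
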
invokeai/backend/model_manager/starter_models.py

@@ -690,6 +690,69 @@ flux_fill = StarterModel(
 )
 # endregion
 
+# region Z-Image
+z_image_qwen3_encoder = StarterModel(
+    name="Z-Image Qwen3 Text Encoder",
+    base=BaseModelType.Any,
+    source="Tongyi-MAI/Z-Image-Turbo::text_encoder+tokenizer",
+    description="Qwen3 4B text encoder with tokenizer for Z-Image (full precision). ~8GB",
+    type=ModelType.Qwen3Encoder,
+)
+
+z_image_qwen3_encoder_quantized = StarterModel(
+    name="Z-Image Qwen3 Text Encoder (quantized)",
+    base=BaseModelType.Any,
+    source="https://huggingface.co/worstplayer/Z-Image_Qwen_3_4b_text_encoder_GGUF/resolve/main/Qwen_3_4b-Q6_K.gguf",
+    description="Qwen3 4B text encoder for Z-Image quantized to GGUF Q6_K format. ~3.3GB",
+    type=ModelType.Qwen3Encoder,
+    format=ModelFormat.GGUFQuantized,
+)
+
+z_image_turbo = StarterModel(
+    name="Z-Image Turbo",
+    base=BaseModelType.ZImage,
+    source="Tongyi-MAI/Z-Image-Turbo",
+    description="Z-Image Turbo - fast 6B parameter text-to-image model with 8 inference steps. Supports bilingual prompts (English & Chinese). ~13GB",
+    type=ModelType.Main,
+)
+
+z_image_turbo_quantized = StarterModel(
+    name="Z-Image Turbo (quantized)",
+    base=BaseModelType.ZImage,
+    source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q4_K.gguf",
+    description="Z-Image Turbo quantized to GGUF Q4_K format. Requires separate Qwen3 text encoder. ~4GB",
+    type=ModelType.Main,
+    format=ModelFormat.GGUFQuantized,
+    dependencies=[z_image_qwen3_encoder_quantized],
+)
+
+z_image_turbo_q8 = StarterModel(
+    name="Z-Image Turbo (Q8)",
+    base=BaseModelType.ZImage,
+    source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q8_0.gguf",
+    description="Z-Image Turbo quantized to GGUF Q8_0 format. Higher quality, larger size. Requires separate Qwen3 text encoder. ~6.6GB",
+    type=ModelType.Main,
+    format=ModelFormat.GGUFQuantized,
+    dependencies=[z_image_qwen3_encoder_quantized],
+)
+
+z_image_controlnet_union = StarterModel(
+    name="Z-Image ControlNet Union",
+    base=BaseModelType.ZImage,
+    source="https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.1/resolve/main/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.safetensors",
+    description="Unified ControlNet for Z-Image Turbo supporting Canny, HED, Depth, Pose, MLSD, and Inpainting modes.",
+    type=ModelType.ControlNet,
+)
+
+z_image_controlnet_tile = StarterModel(
+    name="Z-Image ControlNet Tile",
+    base=BaseModelType.ZImage,
+    source="https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.1/resolve/main/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.safetensors",
+    description="Dedicated Tile ControlNet for Z-Image Turbo. Useful for upscaling and adding detail. ~6.7GB",
+    type=ModelType.ControlNet,
+)
+# endregion
+
 # List of starter models, displayed on the frontend.
 # The order/sort of this list is not changed by the frontend - set it how you want it here.
 STARTER_MODELS: list[StarterModel] = [
@@ -766,6 +829,13 @@ STARTER_MODELS: list[StarterModel] = [
     cogview4,
     flux_krea,
     flux_krea_quantized,
+    z_image_turbo,
+    z_image_turbo_quantized,
+    z_image_turbo_q8,
+    z_image_qwen3_encoder,
+    z_image_qwen3_encoder_quantized,
+    z_image_controlnet_union,
+    z_image_controlnet_tile,
 ]
 
 sd1_bundle: list[StarterModel] = [
invokeai/backend/model_manager/taxonomy.py

@@ -48,6 +48,8 @@ class BaseModelType(str, Enum):
     """Indicates the model is associated with FLUX.1 model architecture, including FLUX Dev, Schnell and Fill."""
     CogView4 = "cogview4"
     """Indicates the model is associated with CogView 4 model architecture."""
+    ZImage = "z-image"
+    """Indicates the model is associated with Z-Image model architecture, including Z-Image-Turbo."""
     Unknown = "unknown"
     """Indicates the model's base architecture is unknown."""
 
@@ -67,6 +69,7 @@ class ModelType(str, Enum):
     CLIPEmbed = "clip_embed"
     T2IAdapter = "t2i_adapter"
     T5Encoder = "t5_encoder"
+    Qwen3Encoder = "qwen3_encoder"
     SpandrelImageToImage = "spandrel_image_to_image"
     SigLIP = "siglip"
     FluxRedux = "flux_redux"
@@ -126,6 +129,7 @@ class ModelFormat(str, Enum):
     EmbeddingFolder = "embedding_folder"
     InvokeAI = "invokeai"
     T5Encoder = "t5_encoder"
+    Qwen3Encoder = "qwen3_encoder"
     BnbQuantizedLlmInt8b = "bnb_quantized_int8b"
     BnbQuantizednf4b = "bnb_quantized_nf4b"
     GGUFQuantized = "gguf_quantized"
@@ -167,6 +171,7 @@ class FluxLoRAFormat(str, Enum):
     OneTrainer = "flux.onetrainer"
     Control = "flux.control"
     AIToolkit = "flux.aitoolkit"
+    XLabs = "flux.xlabs"
 
 
 AnyVariant: TypeAlias = Union[ModelVariantType, ClipVariantType, FluxVariantType]
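
The new enum members are plain string values, so downstream code can reference them directly. A small usage sketch (module path taken from the file list above):

    from invokeai.backend.model_manager.taxonomy import BaseModelType, FluxLoRAFormat, ModelFormat, ModelType

    print(BaseModelType.ZImage.value)       # "z-image"
    print(ModelType.Qwen3Encoder.value)     # "qwen3_encoder"
    print(ModelFormat.GGUFQuantized.value)  # "gguf_quantized"
    print(FluxLoRAFormat.XLabs.value)       # "flux.xlabs"
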
invokeai/backend/model_manager/util/select_hf_files.py

@@ -24,12 +24,14 @@ def filter_files(
     files: List[Path],
     variant: Optional[ModelRepoVariant] = None,
     subfolder: Optional[Path] = None,
+    subfolders: Optional[List[Path]] = None,
 ) -> List[Path]:
     """
     Take a list of files in a HuggingFace repo root and return paths to files needed to load the model.
 
     :param files: List of files relative to the repo root.
-    :param subfolder: Filter by the indicated subfolder.
+    :param subfolder: Filter by the indicated subfolder (deprecated, use subfolders instead).
+    :param subfolders: Filter by multiple subfolders. Files from any of these subfolders will be included.
     :param variant: Filter by files belonging to a particular variant, such as fp16.
 
     The file list can be obtained from the `files` field of HuggingFaceMetadata,
@@ -37,11 +39,24 @@ def filter_files(
     """
     variant = variant or ModelRepoVariant.Default
     paths: List[Path] = []
-    root = files[0].parts[0]
+
+    if not files:
+        return []
+
+    root = files[0].parts[0] if files[0].parts else Path(".")
+
+    # Build list of subfolders to filter by
+    filter_subfolders: List[Path] = []
+    if subfolders:
+        filter_subfolders = subfolders
+    elif subfolder:
+        filter_subfolders = [subfolder]
 
     # if the subfolder is a single file, then bypass the selection and just return it
-    if subfolder and subfolder.suffix in [".safetensors", ".bin", ".onnx", ".xml", ".pth", ".pt", ".ckpt", ".msgpack"]:
-        return [root / subfolder]
+    if len(filter_subfolders) == 1:
+        sf = filter_subfolders[0]
+        if sf.suffix in [".safetensors", ".bin", ".onnx", ".xml", ".pth", ".pt", ".ckpt", ".msgpack"]:
+            return [root / sf]
 
     # Start by filtering on model file extensions, discarding images, docs, etc
     for file in files:
@@ -66,10 +81,10 @@ def filter_files(
         elif re.search(r"model.*\.(safetensors|bin|onnx|xml|pth|pt|ckpt|msgpack)$", file.name):
             paths.append(file)
 
-    # limit search to subfolder if requested
-    if subfolder:
-        subfolder = root / subfolder
-        paths = [x for x in paths if Path(subfolder) in x.parents]
+    # limit search to subfolder(s) if requested
+    if filter_subfolders:
+        absolute_subfolders = [root / sf for sf in filter_subfolders]
+        paths = [x for x in paths if any(Path(sf) in x.parents for sf in absolute_subfolders)]
 
     # _filter_by_variant uniquifies the paths and returns a set
     return sorted(_filter_by_variant(paths, variant))
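
With the new `subfolders` parameter, a caller can pull several subfolders of a Hugging Face repo in one pass (for example `text_encoder` plus `tokenizer`, which is what the `Tongyi-MAI/Z-Image-Turbo::text_encoder+tokenizer` starter source above relies on). A hedged usage sketch; the signature is as shown in this diff, while the repo file listing is invented:

    from pathlib import Path

    from invokeai.backend.model_manager.util.select_hf_files import filter_files

    # Invented repo listing; only files under the requested subfolders should survive the filter.
    repo_files = [
        Path("Z-Image-Turbo/model_index.json"),
        Path("Z-Image-Turbo/text_encoder/model.safetensors"),
        Path("Z-Image-Turbo/tokenizer/tokenizer.json"),
        Path("Z-Image-Turbo/vae/diffusion_pytorch_model.safetensors"),
    ]

    selected = filter_files(repo_files, subfolders=[Path("text_encoder"), Path("tokenizer")])
    print(selected)
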
invokeai/backend/patches/layer_patcher.py

@@ -86,7 +86,8 @@ class LayerPatcher:
         # submodules. If the layer keys do not contain a dot, then they are flattened, meaning that all '.' have been
         # replaced with '_'. Non-flattened keys are preferred, because they allow submodules to be accessed directly
         # without searching, but some legacy code still uses flattened keys.
-        layer_keys_are_flattened = "." not in next(iter(patch.layers.keys()))
+        first_key = next(iter(patch.layers.keys()))
+        layer_keys_are_flattened = "." not in first_key
 
         prefix_len = len(prefix)
 
@@ -174,28 +175,45 @@ class LayerPatcher:
 
         # TODO(ryand): Using torch.autocast(...) over explicit casting may offer a speed benefit on CUDA
         # devices here. Experimentally, it was found to be very slow on CPU. More investigation needed.
-        for param_name, param_weight in patch.get_parameters(
-            dict(module_to_patch.named_parameters(recurse=False)), weight=patch_weight
-        ).items():
+        params_dict = patch.get_parameters(dict(module_to_patch.named_parameters(recurse=False)), weight=patch_weight)
+        if not params_dict:
+            logger = InvokeAILogger.get_logger(LayerPatcher.__name__)
+            logger.warning(f"LoRA patch returned no parameters for module: {module_to_patch_key}")
+            return
+
+        for param_name, param_weight in params_dict.items():
             param_key = module_to_patch_key + "." + param_name
             module_param = module_to_patch.get_parameter(param_name)
 
             # Save original weight
             original_weights.save(param_key, module_param)
 
-            #
-            #
+            # Handle layers that change the shape of the original layer.
+            # FLUX control LoRAs intentionally expand certain layers - we pad the original weight with zeros.
+            # For other LoRAs (e.g., Z-Image with architecture mismatch), skip incompatible layers with a warning.
             if module_param.nelement() != param_weight.nelement():
-                assert isinstance(patch, FluxControlLoRALayer)
-                expanded_weight = pad_with_zeros(module_param, param_weight.shape)
-                setattr(
-                    module_to_patch,
-                    param_name,
-                    torch.nn.Parameter(expanded_weight, requires_grad=module_param.requires_grad),
-                )
-                module_param = expanded_weight
-
-            module_param.data += param_weight.to(device=device, dtype=dtype)
+                if isinstance(patch, FluxControlLoRALayer):
+                    # FLUX Control LoRAs intentionally expand layers - pad with zeros
+                    expanded_weight = pad_with_zeros(module_param, param_weight.shape)
+                    setattr(
+                        module_to_patch,
+                        param_name,
+                        torch.nn.Parameter(expanded_weight, requires_grad=module_param.requires_grad),
+                    )
+                    module_param = expanded_weight
+                else:
+                    # For other LoRAs, shape mismatch indicates architecture incompatibility - skip the layer
+                    logger = InvokeAILogger.get_logger(LayerPatcher.__name__)
+                    logger.warning(
+                        f"Skipping LoRA layer '{module_to_patch_key}.{param_name}' due to shape mismatch: "
+                        f"model has {module_param.nelement()} elements, LoRA expects {param_weight.nelement()}. "
+                        "This LoRA may be incompatible with this model architecture."
+                    )
+                    continue
+
+            # Convert param_weight to the correct device and dtype, then apply to model weights
+            param_weight_converted = param_weight.to(device=device, dtype=dtype)
+            module_param.data.copy_(module_param.data + param_weight_converted)
 
         patch.to(device=TorchDevice.CPU_DEVICE)
 
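
The reworked loop keeps the zero-padding path for FLUX control LoRAs but now skips, with a warning, any other patch layer whose parameter count does not match the target module. A simplified standalone sketch of that decision in plain torch (hypothetical helper, assumes 2-D weights; not the LayerPatcher code):

    import torch

    def apply_delta_or_skip(param: torch.nn.Parameter, delta: torch.Tensor, allow_expand: bool) -> bool:
        """Add a LoRA delta to a parameter; skip on shape mismatch unless expansion is allowed."""
        if param.nelement() != delta.nelement():
            if not allow_expand:
                print(f"skipping layer: model has {param.nelement()} elements, patch expects {delta.nelement()}")
                return False
            # Expansion path (e.g. FLUX control LoRAs): pad the original 2-D weight with zeros.
            expanded = torch.zeros_like(delta)
            expanded[: param.shape[0], : param.shape[1]] = param.data
            param.data = expanded
        param.data += delta.to(device=param.data.device, dtype=param.data.dtype)
        return True
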
invokeai/backend/patches/layers/lora_layer_base.py

@@ -60,7 +60,8 @@ class LoRALayerBase(BaseLayerPatch):
 
     def get_parameters(self, orig_parameters: dict[str, torch.Tensor], weight: float) -> dict[str, torch.Tensor]:
         scale = self.scale()
-        params = {"weight": self.get_weight(orig_parameters["weight"]) * (weight * scale)}
+        lora_weight = self.get_weight(orig_parameters["weight"])
+        params = {"weight": lora_weight * (weight * scale)}
         bias = self.get_bias(orig_parameters.get("bias", None))
         if bias is not None:
             params["bias"] = bias * (weight * scale)
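
For reference, the scaling applied in `get_parameters` follows the usual LoRA recipe: the low-rank delta is multiplied by a scale derived from alpha and rank, and then by the user-facing patch weight. A small illustrative calculation (arbitrary sizes; not InvokeAI code):

    import torch

    rank, n_in, n_out = 4, 64, 64
    down = torch.randn(rank, n_in)   # "lora_down" / lora_A
    up = torch.randn(n_out, rank)    # "lora_up" / lora_B
    alpha, patch_weight = 4.0, 0.75

    scale = alpha / rank
    delta = (up @ down) * (patch_weight * scale)
    print(delta.shape)  # torch.Size([64, 64])
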
invokeai/backend/patches/lora_conversions/flux_aitoolkit_lora_conversion_utils.py

@@ -12,18 +12,33 @@ from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
 from invokeai.backend.util import InvokeAILogger
 
 
+def _has_flux_layer_structure(state_dict: dict[str | int, Any]) -> bool:
+    """Check if state dict has Flux-specific layer patterns (double_blocks/single_blocks)."""
+    return any(
+        k.startswith("diffusion_model.double_blocks.") or k.startswith("diffusion_model.single_blocks.")
+        for k in state_dict.keys()
+        if isinstance(k, str)
+    )
+
+
 def is_state_dict_likely_in_flux_aitoolkit_format(
     state_dict: dict[str | int, Any],
     metadata: dict[str, Any] | None = None,
 ) -> bool:
+    # Always check for Flux-specific layer structure first
+    # This prevents misidentifying Z-Image LoRAs (which use diffusion_model.layers.X) as Flux
+    if not _has_flux_layer_structure(state_dict):
+        return False
+
     if metadata:
         try:
             software = json.loads(metadata.get("software", "{}"))
         except json.JSONDecodeError:
             return False
         return software.get("name") == "ai-toolkit"
-
-
+
+    # No metadata - if it has Flux layer structure, assume it's AI Toolkit format
+    return True
 
 
 @dataclass
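
The effect of the new gate is easiest to see with key names: AI Toolkit FLUX LoRAs use `diffusion_model.double_blocks.*` / `diffusion_model.single_blocks.*`, while Z-Image LoRAs use `diffusion_model.layers.*`, so the latter no longer match. A small illustration with invented keys, mirroring the `_has_flux_layer_structure` check above:

    def has_flux_layer_structure(keys) -> bool:
        # Same idea as the helper added in this diff: Flux state dicts use double_blocks/single_blocks.
        return any(
            k.startswith(("diffusion_model.double_blocks.", "diffusion_model.single_blocks."))
            for k in keys
        )

    flux_keys = ["diffusion_model.double_blocks.0.img_attn.qkv.lora_A.weight"]
    z_image_keys = ["diffusion_model.layers.0.attention.qkv.lora_A.weight"]

    assert has_flux_layer_structure(flux_keys)
    assert not has_flux_layer_structure(z_image_keys)
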
invokeai/backend/patches/lora_conversions/flux_xlabs_lora_conversion_utils.py (new file)

@@ -0,0 +1,92 @@
+import re
+from typing import Any, Dict
+
+import torch
+
+from invokeai.backend.patches.layers.base_layer_patch import BaseLayerPatch
+from invokeai.backend.patches.layers.utils import any_lora_layer_from_state_dict
+from invokeai.backend.patches.lora_conversions.flux_lora_constants import FLUX_LORA_TRANSFORMER_PREFIX
+from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
+
+# A regex pattern that matches all of the transformer keys in the xlabs FLUX LoRA format.
+# Example keys:
+#   double_blocks.0.processor.qkv_lora1.down.weight
+#   double_blocks.0.processor.qkv_lora1.up.weight
+#   double_blocks.0.processor.proj_lora1.down.weight
+#   double_blocks.0.processor.proj_lora1.up.weight
+#   double_blocks.0.processor.qkv_lora2.down.weight
+#   double_blocks.0.processor.proj_lora2.up.weight
+FLUX_XLABS_KEY_REGEX = r"double_blocks\.(\d+)\.processor\.(qkv|proj)_lora([12])\.(down|up)\.weight"
+
+
+def is_state_dict_likely_in_flux_xlabs_format(state_dict: dict[str | int, Any]) -> bool:
+    """Checks if the provided state dict is likely in the xlabs FLUX LoRA format.
+
+    The xlabs format is characterized by keys matching the pattern:
+    double_blocks.{block_idx}.processor.{qkv|proj}_lora{1|2}.{down|up}.weight
+
+    Where:
+    - lora1 corresponds to the image attention stream (img_attn)
+    - lora2 corresponds to the text attention stream (txt_attn)
+    """
+    if not state_dict:
+        return False
+
+    # Check that all keys match the xlabs pattern
+    for key in state_dict.keys():
+        if not isinstance(key, str):
+            continue
+        if not re.match(FLUX_XLABS_KEY_REGEX, key):
+            return False
+
+    # Ensure we have at least some valid keys
+    return any(isinstance(k, str) and re.match(FLUX_XLABS_KEY_REGEX, k) for k in state_dict.keys())
+
+
+def lora_model_from_flux_xlabs_state_dict(state_dict: Dict[str, torch.Tensor]) -> ModelPatchRaw:
+    """Converts an xlabs FLUX LoRA state dict to the InvokeAI ModelPatchRaw format.
+
+    The xlabs format uses:
+    - lora1 for image attention stream (img_attn)
+    - lora2 for text attention stream (txt_attn)
+    - qkv for query/key/value projection
+    - proj for output projection
+
+    Key mapping:
+    - double_blocks.X.processor.qkv_lora1 -> double_blocks.X.img_attn.qkv
+    - double_blocks.X.processor.proj_lora1 -> double_blocks.X.img_attn.proj
+    - double_blocks.X.processor.qkv_lora2 -> double_blocks.X.txt_attn.qkv
+    - double_blocks.X.processor.proj_lora2 -> double_blocks.X.txt_attn.proj
+    """
+    # Group keys by layer (without the .down.weight/.up.weight suffix)
+    grouped_state_dict: dict[str, dict[str, torch.Tensor]] = {}
+
+    for key, value in state_dict.items():
+        match = re.match(FLUX_XLABS_KEY_REGEX, key)
+        if not match:
+            raise ValueError(f"Key '{key}' does not match the expected pattern for xlabs FLUX LoRA weights.")
+
+        block_idx = match.group(1)
+        component = match.group(2)  # qkv or proj
+        lora_stream = match.group(3)  # 1 or 2
+        direction = match.group(4)  # down or up
+
+        # Map lora1 -> img_attn, lora2 -> txt_attn
+        attn_type = "img_attn" if lora_stream == "1" else "txt_attn"
+
+        # Create the InvokeAI-style layer key
+        layer_key = f"double_blocks.{block_idx}.{attn_type}.{component}"
+
+        if layer_key not in grouped_state_dict:
+            grouped_state_dict[layer_key] = {}
+
+        # Map down/up to lora_down/lora_up
+        param_name = f"lora_{direction}.weight"
+        grouped_state_dict[layer_key][param_name] = value
+
+    # Create LoRA layers
+    layers: dict[str, BaseLayerPatch] = {}
+    for layer_key, layer_state_dict in grouped_state_dict.items():
+        layers[FLUX_LORA_TRANSFORMER_PREFIX + layer_key] = any_lora_layer_from_state_dict(layer_state_dict)
+
+    return ModelPatchRaw(layers=layers)
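
A hedged usage sketch of the two new helpers, with a synthetic two-tensor state dict (shapes are placeholders; the import path comes from the file list above, and the conversion is assumed to accept a plain down/up pair as shown in the code):

    import torch

    from invokeai.backend.patches.lora_conversions.flux_xlabs_lora_conversion_utils import (
        is_state_dict_likely_in_flux_xlabs_format,
        lora_model_from_flux_xlabs_state_dict,
    )

    # Synthetic xlabs-style LoRA pair for block 0's image-attention qkv projection.
    state_dict = {
        "double_blocks.0.processor.qkv_lora1.down.weight": torch.zeros(4, 64),
        "double_blocks.0.processor.qkv_lora1.up.weight": torch.zeros(192, 4),
    }

    assert is_state_dict_likely_in_flux_xlabs_format(state_dict)
    patch = lora_model_from_flux_xlabs_state_dict(state_dict)
    print(list(patch.layers.keys()))  # one transformer-prefixed key for double_blocks.0.img_attn.qkv
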
invokeai/backend/patches/lora_conversions/formats.py

@@ -14,6 +14,9 @@ from invokeai.backend.patches.lora_conversions.flux_kohya_lora_conversion_utils
 from invokeai.backend.patches.lora_conversions.flux_onetrainer_lora_conversion_utils import (
     is_state_dict_likely_in_flux_onetrainer_format,
 )
+from invokeai.backend.patches.lora_conversions.flux_xlabs_lora_conversion_utils import (
+    is_state_dict_likely_in_flux_xlabs_format,
+)
 
 
 def flux_format_from_state_dict(
@@ -30,5 +33,7 @@ def flux_format_from_state_dict(
         return FluxLoRAFormat.Control
     elif is_state_dict_likely_in_flux_aitoolkit_format(state_dict, metadata):
         return FluxLoRAFormat.AIToolkit
+    elif is_state_dict_likely_in_flux_xlabs_format(state_dict):
+        return FluxLoRAFormat.XLabs
     else:
         return None
invokeai/backend/patches/lora_conversions/z_image_lora_constants.py (new file)

@@ -0,0 +1,8 @@
+# Z-Image LoRA prefix constants
+# These prefixes are used for key mapping when applying LoRA patches to Z-Image models
+
+# Prefix for Z-Image transformer (S3-DiT architecture) LoRA layers
+Z_IMAGE_LORA_TRANSFORMER_PREFIX = "lora_transformer-"
+
+# Prefix for Qwen3 text encoder LoRA layers
+Z_IMAGE_LORA_QWEN3_PREFIX = "lora_qwen3-"
invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py (new file)

@@ -0,0 +1,155 @@
+"""Z-Image LoRA conversion utilities.
+
+Z-Image uses S3-DiT transformer architecture with Qwen3 text encoder.
+LoRAs for Z-Image typically follow the diffusers PEFT format.
+"""
+
+from typing import Dict
+
+import torch
+
+from invokeai.backend.patches.layers.base_layer_patch import BaseLayerPatch
+from invokeai.backend.patches.layers.utils import any_lora_layer_from_state_dict
+from invokeai.backend.patches.lora_conversions.z_image_lora_constants import (
+    Z_IMAGE_LORA_QWEN3_PREFIX,
+    Z_IMAGE_LORA_TRANSFORMER_PREFIX,
+)
+from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
+
+
+def is_state_dict_likely_z_image_lora(state_dict: dict[str | int, torch.Tensor]) -> bool:
+    """Checks if the provided state dict is likely a Z-Image LoRA.
+
+    Z-Image LoRAs can have keys for transformer and/or Qwen3 text encoder.
+    They may use various prefixes depending on the training framework.
+    """
+    str_keys = [k for k in state_dict.keys() if isinstance(k, str)]
+
+    # Check for Z-Image transformer keys (S3-DiT architecture)
+    # Various training frameworks use different prefixes
+    has_transformer_keys = any(
+        k.startswith(
+            (
+                "transformer.",
+                "base_model.model.transformer.",
+                "diffusion_model.",
+            )
+        )
+        for k in str_keys
+    )
+
+    # Check for Qwen3 text encoder keys
+    has_qwen3_keys = any(k.startswith(("text_encoder.", "base_model.model.text_encoder.")) for k in str_keys)
+
+    return has_transformer_keys or has_qwen3_keys
+
+
+def lora_model_from_z_image_state_dict(
+    state_dict: Dict[str, torch.Tensor], alpha: float | None = None
+) -> ModelPatchRaw:
+    """Convert a Z-Image LoRA state dict to a ModelPatchRaw.
+
+    Z-Image LoRAs can contain layers for:
+    - Transformer (S3-DiT architecture)
+    - Qwen3 text encoder
+
+    Z-Image LoRAs may use various key prefixes depending on how they were trained:
+    - "transformer." or "base_model.model.transformer." for diffusers PEFT format
+    - "diffusion_model." for some training frameworks
+    - "text_encoder." or "base_model.model.text_encoder." for Qwen3 encoder
+
+    Args:
+        state_dict: The LoRA state dict
+        alpha: The alpha value for LoRA scaling. If None, uses rank as alpha.
+
+    Returns:
+        A ModelPatchRaw containing the LoRA layers
+    """
+    layers: dict[str, BaseLayerPatch] = {}
+
+    # Group keys by layer
+    grouped_state_dict = _group_by_layer(state_dict)
+
+    for layer_key, layer_dict in grouped_state_dict.items():
+        # Convert PEFT format keys to internal format
+        values = _get_lora_layer_values(layer_dict, alpha)
+
+        # Determine the appropriate prefix based on the layer type and clean up the key
+        clean_key = layer_key
+
+        # Handle various transformer prefixes
+        transformer_prefixes = [
+            "base_model.model.transformer.diffusion_model.",
+            "base_model.model.transformer.",
+            "transformer.diffusion_model.",
+            "transformer.",
+            "diffusion_model.",
+        ]
+
+        # Handle text encoder prefixes
+        text_encoder_prefixes = [
+            "base_model.model.text_encoder.",
+            "text_encoder.",
+        ]
+
+        is_text_encoder = False
+
+        # Check and strip text encoder prefixes first
+        for prefix in text_encoder_prefixes:
+            if layer_key.startswith(prefix):
+                clean_key = layer_key[len(prefix) :]
+                is_text_encoder = True
+                break
+
+        # If not text encoder, check transformer prefixes
+        if not is_text_encoder:
+            for prefix in transformer_prefixes:
+                if layer_key.startswith(prefix):
+                    clean_key = layer_key[len(prefix) :]
+                    break
+
+        # Apply the appropriate internal prefix
+        if is_text_encoder:
+            final_key = f"{Z_IMAGE_LORA_QWEN3_PREFIX}{clean_key}"
+        else:
+            final_key = f"{Z_IMAGE_LORA_TRANSFORMER_PREFIX}{clean_key}"
+
+        layer = any_lora_layer_from_state_dict(values)
+        layers[final_key] = layer
+
+    return ModelPatchRaw(layers=layers)
+
+
+def _get_lora_layer_values(layer_dict: dict[str, torch.Tensor], alpha: float | None) -> dict[str, torch.Tensor]:
+    """Convert layer dict keys from PEFT format to internal format."""
+    if "lora_A.weight" in layer_dict:
+        # PEFT format: lora_A.weight, lora_B.weight
+        values = {
+            "lora_down.weight": layer_dict["lora_A.weight"],
+            "lora_up.weight": layer_dict["lora_B.weight"],
+        }
+        if alpha is not None:
+            values["alpha"] = torch.tensor(alpha)
+        return values
+    elif "lora_down.weight" in layer_dict:
+        # Already in internal format
+        return layer_dict
+    else:
+        # Unknown format, return as-is
+        return layer_dict
+
+
+def _group_by_layer(state_dict: Dict[str, torch.Tensor]) -> dict[str, dict[str, torch.Tensor]]:
+    """Groups the keys in the state dict by layer."""
+    layer_dict: dict[str, dict[str, torch.Tensor]] = {}
+    for key in state_dict:
+        if not isinstance(key, str):
+            continue
+        # Split the 'lora_A.weight' or 'lora_B.weight' suffix from the layer name.
+        parts = key.rsplit(".", maxsplit=2)
+        layer_name = parts[0]
+        key_name = ".".join(parts[1:])
+        if layer_name not in layer_dict:
+            layer_dict[layer_name] = {}
+        layer_dict[layer_name][key_name] = state_dict[key]
+    return layer_dict
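
And a matching hedged sketch for the Z-Image helpers: a synthetic diffusers/PEFT-style state dict with one rank-4 layer (the layer name is invented; real checkpoints use whatever module paths the training framework emitted):

    import torch

    from invokeai.backend.patches.lora_conversions.z_image_lora_conversion_utils import (
        is_state_dict_likely_z_image_lora,
        lora_model_from_z_image_state_dict,
    )

    state_dict = {
        "transformer.layers.0.attention.to_q.lora_A.weight": torch.zeros(4, 64),
        "transformer.layers.0.attention.to_q.lora_B.weight": torch.zeros(64, 4),
    }

    assert is_state_dict_likely_z_image_lora(state_dict)
    patch = lora_model_from_z_image_state_dict(state_dict, alpha=4.0)
    # The "transformer." prefix is stripped and replaced with Z_IMAGE_LORA_TRANSFORMER_PREFIX.
    print(list(patch.layers.keys()))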