InvokeAI 6.10.0rc1__py3-none-any.whl → 6.10.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. invokeai/app/invocations/flux_denoise.py +15 -1
  2. invokeai/app/invocations/pbr_maps.py +59 -0
  3. invokeai/app/invocations/z_image_denoise.py +237 -82
  4. invokeai/backend/flux/denoise.py +196 -11
  5. invokeai/backend/flux/schedulers.py +62 -0
  6. invokeai/backend/image_util/pbr_maps/architecture/block.py +367 -0
  7. invokeai/backend/image_util/pbr_maps/architecture/pbr_rrdb_net.py +70 -0
  8. invokeai/backend/image_util/pbr_maps/pbr_maps.py +141 -0
  9. invokeai/backend/image_util/pbr_maps/utils/image_ops.py +93 -0
  10. invokeai/backend/model_manager/configs/lora.py +36 -0
  11. invokeai/backend/model_manager/load/load_default.py +1 -0
  12. invokeai/backend/model_manager/load/model_loaders/cogview4.py +2 -1
  13. invokeai/backend/model_manager/load/model_loaders/flux.py +13 -6
  14. invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py +4 -2
  15. invokeai/backend/model_manager/load/model_loaders/onnx.py +1 -0
  16. invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py +3 -1
  17. invokeai/backend/model_manager/load/model_loaders/z_image.py +37 -3
  18. invokeai/backend/model_manager/starter_models.py +13 -4
  19. invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py +39 -5
  20. invokeai/backend/quantization/gguf/ggml_tensor.py +15 -4
  21. invokeai/backend/z_image/extensions/regional_prompting_extension.py +10 -12
  22. invokeai/frontend/web/dist/assets/App-DllqPQ3j.js +161 -0
  23. invokeai/frontend/web/dist/assets/{browser-ponyfill-DHZxq1nk.js → browser-ponyfill-BP0RxJ4G.js} +1 -1
  24. invokeai/frontend/web/dist/assets/{index-dgSJAY--.js → index-B44qKjrs.js} +51 -51
  25. invokeai/frontend/web/dist/index.html +1 -1
  26. invokeai/frontend/web/dist/locales/en-GB.json +1 -0
  27. invokeai/frontend/web/dist/locales/en.json +11 -5
  28. invokeai/version/invokeai_version.py +1 -1
  29. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/METADATA +2 -2
  30. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/RECORD +36 -29
  31. invokeai/frontend/web/dist/assets/App-CYhlZO3Q.js +0 -161
  32. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/WHEEL +0 -0
  33. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/entry_points.txt +0 -0
  34. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/licenses/LICENSE +0 -0
  35. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
  36. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
  37. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/top_level.txt +0 -0
invokeai/backend/image_util/pbr_maps/architecture/pbr_rrdb_net.py
@@ -0,0 +1,70 @@
+ # Original: https://github.com/joeyballentine/Material-Map-Generator
+ # Adopted and optimized for Invoke AI
+
+ import math
+ from typing import Literal, Optional
+
+ import torch
+ import torch.nn as nn
+
+ import invokeai.backend.image_util.pbr_maps.architecture.block as B
+
+ UPSCALE_MODE = Literal["upconv", "pixelshuffle"]
+
+
+ class PBR_RRDB_Net(nn.Module):
+     def __init__(
+         self,
+         in_nc: int,
+         out_nc: int,
+         nf: int,
+         nb: int,
+         gc: int = 32,
+         upscale: int = 4,
+         norm_type: Optional[B.NORMALIZATION_LAYER_TYPE] = None,
+         act_type: B.ACTIVATION_LAYER_TYPE = "leakyrelu",
+         mode: B.BLOCK_MODE = "CNA",
+         res_scale: int = 1,
+         upsample_mode: UPSCALE_MODE = "upconv",
+     ):
+         super(PBR_RRDB_Net, self).__init__()
+         n_upscale = int(math.log(upscale, 2))
+         if upscale == 3:
+             n_upscale = 1
+
+         fea_conv = B.conv_block(in_nc, nf, kernel_size=3, norm_type=None, act_type=None)
+         rb_blocks = [
+             B.RRDB(
+                 nf,
+                 kernel_size=3,
+                 gc=32,
+                 stride=1,
+                 bias=True,
+                 pad_type="zero",
+                 norm_type=norm_type,
+                 act_type=act_type,
+                 mode="CNA",
+             )
+             for _ in range(nb)
+         ]
+         LR_conv = B.conv_block(nf, nf, kernel_size=3, norm_type=norm_type, act_type=None, mode=mode)
+
+         if upsample_mode == "upconv":
+             upsample_block = B.upconv_block
+         elif upsample_mode == "pixelshuffle":
+             upsample_block = B.pixelshuffle_block
+
+         if upscale == 3:
+             upsampler = upsample_block(nf, nf, 3, act_type=act_type)
+         else:
+             upsampler = [upsample_block(nf, nf, act_type=act_type) for _ in range(n_upscale)]
+
+         HR_conv0 = B.conv_block(nf, nf, kernel_size=3, norm_type=None, act_type=act_type)
+         HR_conv1 = B.conv_block(nf, out_nc, kernel_size=3, norm_type=None, act_type=None)
+
+         self.model = B.sequential(
+             fea_conv, B.ShortcutBlock(B.sequential(*rb_blocks, LR_conv)), *upsampler, HR_conv0, HR_conv1
+         )
+
+     def forward(self, x: torch.Tensor):
+         return self.model(x)
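For orientation, here is a minimal smoke-test sketch of the network added above (not part of the diff). It assumes the `architecture.block` module referenced as `B` provides the helpers used in `__init__`; the constructor arguments mirror the ones `PBRMapsGenerator.load_model` passes in the next file.

# Hedged sketch: instantiate the net as the loader below does and run a dummy tensor
# through it. With upscale=1 no upsampler blocks are added, so the output keeps the
# input resolution.
import torch

from invokeai.backend.image_util.pbr_maps.architecture.pbr_rrdb_net import PBR_RRDB_Net

model = PBR_RRDB_Net(3, 3, 32, 12, gc=32, upscale=1, norm_type=None,
                     act_type="leakyrelu", mode="CNA", res_scale=1, upsample_mode="upconv")
model.eval()
with torch.no_grad():
    out = model(torch.rand(1, 3, 64, 64))  # (batch, channels, H, W)
print(out.shape)  # expected: torch.Size([1, 3, 64, 64])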
invokeai/backend/image_util/pbr_maps/pbr_maps.py
@@ -0,0 +1,141 @@
+ # Original: https://github.com/joeyballentine/Material-Map-Generator
+ # Adopted and optimized for Invoke AI
+
+ import pathlib
+ from typing import Any, Literal
+
+ import cv2
+ import numpy as np
+ import numpy.typing as npt
+ import torch
+ from PIL import Image
+ from safetensors.torch import load_file
+
+ from invokeai.backend.image_util.pbr_maps.architecture.pbr_rrdb_net import PBR_RRDB_Net
+ from invokeai.backend.image_util.pbr_maps.utils.image_ops import crop_seamless, esrgan_launcher_split_merge
+
+ NORMAL_MAP_MODEL = (
+     "https://huggingface.co/InvokeAI/pbr-material-maps/resolve/main/normal_map_generator.safetensors?download=true"
+ )
+ OTHER_MAP_MODEL = (
+     "https://huggingface.co/InvokeAI/pbr-material-maps/resolve/main/franken_map_generator.safetensors?download=true"
+ )
+
+
+ class PBRMapsGenerator:
+     def __init__(self, normal_map_model: PBR_RRDB_Net, other_map_model: PBR_RRDB_Net, device: torch.device) -> None:
+         self.normal_map_model = normal_map_model
+         self.other_map_model = other_map_model
+         self.device = device
+
+     @staticmethod
+     def load_model(model_path: pathlib.Path, device: torch.device) -> PBR_RRDB_Net:
+         state_dict = load_file(model_path.as_posix(), device=device.type)
+
+         model = PBR_RRDB_Net(
+             3,
+             3,
+             32,
+             12,
+             gc=32,
+             upscale=1,
+             norm_type=None,
+             act_type="leakyrelu",
+             mode="CNA",
+             res_scale=1,
+             upsample_mode="upconv",
+         )
+
+         model.load_state_dict(state_dict, strict=False)
+
+         del state_dict
+         if torch.cuda.is_available() and device.type == "cuda":
+             torch.cuda.empty_cache()
+
+         model.eval()
+
+         for _, v in model.named_parameters():
+             v.requires_grad = False
+
+         return model.to(device)
+
+     def process(self, img: npt.NDArray[Any], model: PBR_RRDB_Net):
+         img = img.astype(np.float32) / np.iinfo(img.dtype).max
+         img = img[..., ::-1].copy()
+         tensor_img = torch.tensor(img).permute(2, 0, 1).unsqueeze(0).to(self.device)
+
+         with torch.no_grad():
+             output = model(tensor_img).data.squeeze(0).float().cpu().clamp_(0, 1).numpy()
+         output = output[[2, 1, 0], :, :]
+         output = np.transpose(output, (1, 2, 0))
+         output = (output * 255.0).round()
+         return output
+
+     def _cv2_to_pil(self, image: npt.NDArray[Any]):
+         return Image.fromarray(cv2.cvtColor(image.astype(np.uint8), cv2.COLOR_RGB2BGR))
+
+     def generate_maps(
+         self,
+         image: Image.Image,
+         tile_size: int = 512,
+         border_mode: Literal["none", "seamless", "mirror", "replicate"] = "none",
+     ):
+         """
+         Generate PBR texture maps (normal, roughness, and displacement) from an input image.
+         The image can optionally be padded before inference to control how borders are treated,
+         which can help create seamless or edge-consistent textures.
+
+         Args:
+             image: Source image used to generate the PBR maps.
+             tile_size: Maximum tile size used for tiled inference. If the image is larger than
+                 this size in either dimension, it will be split into tiles for processing and
+                 then merged.
+
+             border_mode: Strategy for padding the image before inference:
+                 - "none": No padding is applied; the image is processed as-is.
+                 - "seamless": Pads the image using wrap-around tiling
+                   (`cv2.BORDER_WRAP`) to help produce seamless textures.
+                 - "mirror": Pads the image by mirroring border pixels
+                   (`cv2.BORDER_REFLECT_101`) to reduce edge artifacts.
+                 - "replicate": Pads the image by replicating the edge pixels outward
+                   (`cv2.BORDER_REPLICATE`).
+
+         Returns:
+             A tuple of three PIL Images:
+                 - normal_map: RGB normal map generated from the input.
+                 - roughness: Single-channel roughness map extracted from the second model output.
+                 - displacement: Single-channel displacement (height) map extracted from the
+                   second model output.
+         """
+
+         models = [self.normal_map_model, self.other_map_model]
+         np_image = np.array(image).astype(np.uint8)
+
+         match border_mode:
+             case "seamless":
+                 np_image = cv2.copyMakeBorder(np_image, 16, 16, 16, 16, cv2.BORDER_WRAP)
+             case "mirror":
+                 np_image = cv2.copyMakeBorder(np_image, 16, 16, 16, 16, cv2.BORDER_REFLECT_101)
+             case "replicate":
+                 np_image = cv2.copyMakeBorder(np_image, 16, 16, 16, 16, cv2.BORDER_REPLICATE)
+             case "none":
+                 pass
+
+         img_height, img_width = np_image.shape[:2]
+
+         # Checking whether to perform tiled inference
+         do_split = img_height > tile_size or img_width > tile_size
+
+         if do_split:
+             rlts = esrgan_launcher_split_merge(np_image, self.process, models, scale_factor=1, tile_size=tile_size)
+         else:
+             rlts = [self.process(np_image, model) for model in models]
+
+         if border_mode != "none":
+             rlts = [crop_seamless(rlt) for rlt in rlts]
+
+         normal_map = self._cv2_to_pil(rlts[0])
+         roughness = self._cv2_to_pil(rlts[1][:, :, 1])
+         displacement = self._cv2_to_pil(rlts[1][:, :, 0])
+
+         return normal_map, roughness, displacement
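A possible end-to-end usage of the generator above (illustrative only, not part of the diff). It assumes the two safetensors checkpoints referenced by NORMAL_MAP_MODEL and OTHER_MAP_MODEL have already been downloaded; the local file names and the input image are placeholders.

# Hedged usage sketch; paths and the input image are placeholders.
import pathlib

import torch
from PIL import Image

from invokeai.backend.image_util.pbr_maps.pbr_maps import PBRMapsGenerator

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normal_model = PBRMapsGenerator.load_model(pathlib.Path("normal_map_generator.safetensors"), device)
other_model = PBRMapsGenerator.load_model(pathlib.Path("franken_map_generator.safetensors"), device)

generator = PBRMapsGenerator(normal_model, other_model, device)
normal_map, roughness, displacement = generator.generate_maps(
    Image.open("albedo.png").convert("RGB"),
    tile_size=512,
    border_mode="seamless",  # wrap-pad by 16 px so the resulting maps stay tileable
)
normal_map.save("normal.png")
roughness.save("roughness.png")
displacement.save("displacement.png")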
invokeai/backend/image_util/pbr_maps/utils/image_ops.py
@@ -0,0 +1,93 @@
+ # Original: https://github.com/joeyballentine/Material-Map-Generator
+ # Adopted and optimized for Invoke AI
+
+ import math
+ from typing import Any, Callable, List
+
+ import numpy as np
+ import numpy.typing as npt
+
+ from invokeai.backend.image_util.pbr_maps.architecture.pbr_rrdb_net import PBR_RRDB_Net
+
+
+ def crop_seamless(img: npt.NDArray[Any]):
+     img_height, img_width = img.shape[:2]
+     y, x = 16, 16
+     h, w = img_height - 32, img_width - 32
+     img = img[y : y + h, x : x + w]
+     return img
+
+
+ # from https://github.com/ata4/esrgan-launcher/blob/master/upscale.py
+ def esrgan_launcher_split_merge(
+     input_image: npt.NDArray[Any],
+     upscale_function: Callable[[npt.NDArray[Any], PBR_RRDB_Net], npt.NDArray[Any]],
+     models: List[PBR_RRDB_Net],
+     scale_factor: int = 4,
+     tile_size: int = 512,
+     tile_padding: float = 0.125,
+ ):
+     width, height, depth = input_image.shape
+     output_width = width * scale_factor
+     output_height = height * scale_factor
+     output_shape = (output_width, output_height, depth)
+
+     # start with black image
+     output_images = [np.zeros(output_shape, np.uint8) for _ in range(len(models))]
+
+     tile_padding = math.ceil(tile_size * tile_padding)
+     tile_size = math.ceil(tile_size / scale_factor)
+
+     tiles_x = math.ceil(width / tile_size)
+     tiles_y = math.ceil(height / tile_size)
+
+     for y in range(tiles_y):
+         for x in range(tiles_x):
+             # extract tile from input image
+             ofs_x = x * tile_size
+             ofs_y = y * tile_size
+
+             # input tile area on total image
+             input_start_x = ofs_x
+             input_end_x = min(ofs_x + tile_size, width)
+
+             input_start_y = ofs_y
+             input_end_y = min(ofs_y + tile_size, height)
+
+             # input tile area on total image with padding
+             input_start_x_pad = max(input_start_x - tile_padding, 0)
+             input_end_x_pad = min(input_end_x + tile_padding, width)
+
+             input_start_y_pad = max(input_start_y - tile_padding, 0)
+             input_end_y_pad = min(input_end_y + tile_padding, height)
+
+             # input tile dimensions
+             input_tile_width = input_end_x - input_start_x
+             input_tile_height = input_end_y - input_start_y
+
+             input_tile = input_image[input_start_x_pad:input_end_x_pad, input_start_y_pad:input_end_y_pad]
+
+             for idx, model in enumerate(models):
+                 # upscale tile
+                 output_tile = upscale_function(input_tile, model)
+
+                 # output tile area on total image
+                 output_start_x = input_start_x * scale_factor
+                 output_end_x = input_end_x * scale_factor
+
+                 output_start_y = input_start_y * scale_factor
+                 output_end_y = input_end_y * scale_factor
+
+                 # output tile area without padding
+                 output_start_x_tile = (input_start_x - input_start_x_pad) * scale_factor
+                 output_end_x_tile = output_start_x_tile + input_tile_width * scale_factor
+
+                 output_start_y_tile = (input_start_y - input_start_y_pad) * scale_factor
+                 output_end_y_tile = output_start_y_tile + input_tile_height * scale_factor
+
+                 # put tile into output image
+                 output_images[idx][output_start_x:output_end_x, output_start_y:output_end_y] = output_tile[
+                     output_start_x_tile:output_end_x_tile, output_start_y_tile:output_end_y_tile
+                 ]
+
+     return output_images
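To make the tile bookkeeping concrete, the sketch below (illustrative only, not part of the diff) walks through the coordinate math for a 1024x1024 input with the defaults used by generate_maps: scale_factor=1, tile_size=512, tile_padding=0.125, i.e. 64 px of overlapping context per side.

# Illustrative tile-coordinate walk-through matching esrgan_launcher_split_merge above.
import math

width = height = 1024
tile_size = 512
pad = math.ceil(tile_size * 0.125)  # 64 px of overlap fed to the model, cropped away on merge

for y in range(math.ceil(height / tile_size)):
    for x in range(math.ceil(width / tile_size)):
        x0, x1 = x * tile_size, min((x + 1) * tile_size, width)
        x0_pad, x1_pad = max(x0 - pad, 0), min(x1 + pad, width)
        print(f"tile ({x},{y}): core x {x0}:{x1}, padded x {x0_pad}:{x1_pad}")
# -> 4 tiles; interior edges receive 64 px of padded context, image borders receive none.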
invokeai/backend/model_manager/configs/lora.py
@@ -227,6 +227,42 @@ class LoRA_LyCORIS_ZImage_Config(LoRA_LyCORIS_Config_Base, Config_Base):

    base: Literal[BaseModelType.ZImage] = Field(default=BaseModelType.ZImage)

+     @classmethod
+     def _validate_looks_like_lora(cls, mod: ModelOnDisk) -> None:
+         """Z-Image LoRAs have different key patterns than SD/SDXL LoRAs.
+
+         Z-Image LoRAs use keys like:
+         - diffusion_model.layers.X.attention.to_k.lora_down.weight (DoRA format)
+         - diffusion_model.layers.X.attention.to_k.lora_A.weight (PEFT format)
+         - diffusion_model.layers.X.attention.to_k.dora_scale (DoRA scale)
+         """
+         state_dict = mod.load_state_dict()
+
+         # Check for Z-Image specific LoRA patterns
+         has_z_image_lora_keys = state_dict_has_any_keys_starting_with(
+             state_dict,
+             {
+                 "diffusion_model.layers.",  # Z-Image S3-DiT layer pattern
+             },
+         )
+
+         # Also check for LoRA weight suffixes (various formats)
+         has_lora_suffix = state_dict_has_any_keys_ending_with(
+             state_dict,
+             {
+                 "lora_A.weight",
+                 "lora_B.weight",
+                 "lora_down.weight",
+                 "lora_up.weight",
+                 "dora_scale",
+             },
+         )
+
+         if has_z_image_lora_keys and has_lora_suffix:
+             return
+
+         raise NotAMatchError("model does not match Z-Image LoRA heuristics")
+
    @classmethod
    def _get_base_or_raise(cls, mod: ModelOnDisk) -> BaseModelType:
        """Z-Image LoRAs are identified by their diffusion_model.layers structure.
invokeai/backend/model_manager/load/load_default.py
@@ -75,6 +75,7 @@ class ModelLoader(ModelLoaderBase):

        config.path = str(self._get_model_path(config))
        self._ram_cache.make_room(self.get_size_fs(config, Path(config.path), submodel_type))
+         self._logger.info(f"Loading model '{stats_name}' into RAM cache..., config={config}")
        loaded_model = self._load_model(config, submodel_type)

        self._ram_cache.put(
invokeai/backend/model_manager/load/model_loaders/cogview4.py
@@ -45,12 +45,13 @@ class CogView4DiffusersModel(GenericDiffusersLoader):
                model_path,
                torch_dtype=dtype,
                variant=variant,
+                 local_files_only=True,
            )
        except OSError as e:
            if variant and "no file named" in str(
                e
            ):  # try without the variant, just in case user's preferences changed
-                 result = load_class.from_pretrained(model_path, torch_dtype=dtype)
+                 result = load_class.from_pretrained(model_path, torch_dtype=dtype, local_files_only=True)
            else:
                raise e

invokeai/backend/model_manager/load/model_loaders/flux.py
@@ -122,9 +122,9 @@ class CLIPDiffusersLoader(ModelLoader):

        match submodel_type:
            case SubModelType.Tokenizer:
-                 return CLIPTokenizer.from_pretrained(Path(config.path) / "tokenizer")
+                 return CLIPTokenizer.from_pretrained(Path(config.path) / "tokenizer", local_files_only=True)
            case SubModelType.TextEncoder:
-                 return CLIPTextModel.from_pretrained(Path(config.path) / "text_encoder")
+                 return CLIPTextModel.from_pretrained(Path(config.path) / "text_encoder", local_files_only=True)

        raise ValueError(
            f"Only Tokenizer and TextEncoder submodels are currently supported. Received: {submodel_type.value if submodel_type else 'None'}"
@@ -148,10 +148,12 @@ class BnbQuantizedLlmInt8bCheckpointModel(ModelLoader):
        )
        match submodel_type:
            case SubModelType.Tokenizer2 | SubModelType.Tokenizer3:
-                 return T5TokenizerFast.from_pretrained(Path(config.path) / "tokenizer_2", max_length=512)
+                 return T5TokenizerFast.from_pretrained(
+                     Path(config.path) / "tokenizer_2", max_length=512, local_files_only=True
+                 )
            case SubModelType.TextEncoder2 | SubModelType.TextEncoder3:
                te2_model_path = Path(config.path) / "text_encoder_2"
-                 model_config = AutoConfig.from_pretrained(te2_model_path)
+                 model_config = AutoConfig.from_pretrained(te2_model_path, local_files_only=True)
                with accelerate.init_empty_weights():
                    model = AutoModelForTextEncoding.from_config(model_config)
                model = quantize_model_llm_int8(model, modules_to_not_convert=set())
@@ -192,10 +194,15 @@ class T5EncoderCheckpointModel(ModelLoader):

        match submodel_type:
            case SubModelType.Tokenizer2 | SubModelType.Tokenizer3:
-                 return T5TokenizerFast.from_pretrained(Path(config.path) / "tokenizer_2", max_length=512)
+                 return T5TokenizerFast.from_pretrained(
+                     Path(config.path) / "tokenizer_2", max_length=512, local_files_only=True
+                 )
            case SubModelType.TextEncoder2 | SubModelType.TextEncoder3:
                return T5EncoderModel.from_pretrained(
-                     Path(config.path) / "text_encoder_2", torch_dtype="auto", low_cpu_mem_usage=True
+                     Path(config.path) / "text_encoder_2",
+                     torch_dtype="auto",
+                     low_cpu_mem_usage=True,
+                     local_files_only=True,
                )

        raise ValueError(
invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py
@@ -37,12 +37,14 @@ class GenericDiffusersLoader(ModelLoader):
        repo_variant = config.repo_variant if isinstance(config, Diffusers_Config_Base) else None
        variant = repo_variant.value if repo_variant else None
        try:
-             result: AnyModel = model_class.from_pretrained(model_path, torch_dtype=self._torch_dtype, variant=variant)
+             result: AnyModel = model_class.from_pretrained(
+                 model_path, torch_dtype=self._torch_dtype, variant=variant, local_files_only=True
+             )
        except OSError as e:
            if variant and "no file named" in str(
                e
            ):  # try without the variant, just in case user's preferences changed
-                 result = model_class.from_pretrained(model_path, torch_dtype=self._torch_dtype)
+                 result = model_class.from_pretrained(model_path, torch_dtype=self._torch_dtype, local_files_only=True)
            else:
                raise e
        return result
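Several loaders in this release adopt the same pattern: pass local_files_only=True so diffusers/transformers never reach out to the Hub for models that are already on disk, and retry without the variant if the variant weights are missing. A generic sketch of that pattern (not part of the diff; AutoencoderKL is just a stand-in for whatever class the loader resolves as load_class/model_class):

# Hedged sketch of the local_files_only + variant-fallback pattern used by the loaders above.
import torch
from diffusers import AutoencoderKL  # stand-in model class for illustration

def load_local(model_path: str, dtype: torch.dtype, variant: str | None):
    try:
        return AutoencoderKL.from_pretrained(
            model_path, torch_dtype=dtype, variant=variant, local_files_only=True
        )
    except OSError as e:
        if variant and "no file named" in str(e):
            # The requested variant (e.g. fp16) is not on disk; retry with the default weights.
            return AutoencoderKL.from_pretrained(model_path, torch_dtype=dtype, local_files_only=True)
        raise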
invokeai/backend/model_manager/load/model_loaders/onnx.py
@@ -38,5 +38,6 @@ class OnnyxDiffusersModel(GenericDiffusersLoader):
            model_path,
            torch_dtype=self._torch_dtype,
            variant=variant,
+             local_files_only=True,
        )
        return result
invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py
@@ -80,12 +80,13 @@ class StableDiffusionDiffusersModel(GenericDiffusersLoader):
                model_path,
                torch_dtype=self._torch_dtype,
                variant=variant,
+                 local_files_only=True,
            )
        except OSError as e:
            if variant and "no file named" in str(
                e
            ):  # try without the variant, just in case user's preferences changed
-                 result = load_class.from_pretrained(model_path, torch_dtype=self._torch_dtype)
+                 result = load_class.from_pretrained(model_path, torch_dtype=self._torch_dtype, local_files_only=True)
            else:
                raise e

@@ -139,6 +140,7 @@ class StableDiffusionDiffusersModel(GenericDiffusersLoader):
        # Some weights of the model checkpoint were not used when initializing CLIPTextModelWithProjection:
        # ['text_model.embeddings.position_ids']

+         self._logger.info(f"Loading model from single file at {config.path} using {load_class.__name__}")
        with SilenceWarnings():
            pipeline = load_class.from_single_file(config.path, torch_dtype=self._torch_dtype)

invokeai/backend/model_manager/load/model_loaders/z_image.py
@@ -384,15 +384,19 @@ class Qwen3EncoderLoader(ModelLoader):

        match submodel_type:
            case SubModelType.Tokenizer:
-                 return AutoTokenizer.from_pretrained(tokenizer_path)
+                 # Use local_files_only=True to prevent network requests for validation
+                 # The tokenizer files should already exist locally in the model directory
+                 return AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
            case SubModelType.TextEncoder:
                # Determine safe dtype based on target device capabilities
                target_device = TorchDevice.choose_torch_device()
                model_dtype = TorchDevice.choose_bfloat16_safe_dtype(target_device)
+                 # Use local_files_only=True to prevent network requests for validation
                return Qwen3ForCausalLM.from_pretrained(
                    text_encoder_path,
                    torch_dtype=model_dtype,
                    low_cpu_mem_usage=True,
+                     local_files_only=True,
                )

        raise ValueError(
@@ -526,12 +530,27 @@ class Qwen3EncoderCheckpointLoader(ModelLoader):
                return self._load_from_singlefile(config)
            case SubModelType.Tokenizer:
                # For single-file Qwen3, load tokenizer from HuggingFace
-                 return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE)
+                 # Try local cache first to support offline usage after initial download
+                 return self._load_tokenizer_with_offline_fallback()

        raise ValueError(
            f"Only TextEncoder and Tokenizer submodels are supported. Received: {submodel_type.value if submodel_type else 'None'}"
        )

+     def _load_tokenizer_with_offline_fallback(self) -> AnyModel:
+         """Load tokenizer with local_files_only fallback for offline support.
+
+         First tries to load from local cache (offline), falling back to network download
+         if the tokenizer hasn't been cached yet. This ensures offline operation after
+         the initial download.
+         """
+         try:
+             # Try loading from local cache first (supports offline usage)
+             return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE, local_files_only=True)
+         except OSError:
+             # Not in cache yet, download from HuggingFace
+             return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE)
+
    def _load_from_singlefile(
        self,
        config: AnyModelConfig,
@@ -686,12 +705,27 @@ class Qwen3EncoderGGUFLoader(ModelLoader):
                return self._load_from_gguf(config)
            case SubModelType.Tokenizer:
                # For GGUF Qwen3, load tokenizer from HuggingFace
-                 return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE)
+                 # Try local cache first to support offline usage after initial download
+                 return self._load_tokenizer_with_offline_fallback()

        raise ValueError(
            f"Only TextEncoder and Tokenizer submodels are supported. Received: {submodel_type.value if submodel_type else 'None'}"
        )

+     def _load_tokenizer_with_offline_fallback(self) -> AnyModel:
+         """Load tokenizer with local_files_only fallback for offline support.
+
+         First tries to load from local cache (offline), falling back to network download
+         if the tokenizer hasn't been cached yet. This ensures offline operation after
+         the initial download.
+         """
+         try:
+             # Try loading from local cache first (supports offline usage)
+             return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE, local_files_only=True)
+         except OSError:
+             # Not in cache yet, download from HuggingFace
+             return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE)
+
    def _load_from_gguf(
        self,
        config: AnyModelConfig,
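Both Qwen3 tokenizer loaders above now share the same offline-first helper. A generic standalone sketch of that cache-then-network pattern, using transformers' AutoTokenizer as in the diff:

# Generic sketch of the cache-first tokenizer loading introduced above.
from transformers import AutoTokenizer

def load_tokenizer_offline_first(source: str):
    try:
        # Served from the local Hugging Face cache; no network traffic once cached.
        return AutoTokenizer.from_pretrained(source, local_files_only=True)
    except OSError:
        # First run only: fall back to downloading from the Hub.
        return AutoTokenizer.from_pretrained(source)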
invokeai/backend/model_manager/starter_models.py
@@ -720,20 +720,20 @@ z_image_turbo_quantized = StarterModel(
    name="Z-Image Turbo (quantized)",
    base=BaseModelType.ZImage,
    source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q4_K.gguf",
-     description="Z-Image Turbo quantized to GGUF Q4_K format. Requires separate Qwen3 text encoder. ~4GB",
+     description="Z-Image Turbo quantized to GGUF Q4_K format. Requires standalone Qwen3 text encoder and Flux VAE. ~4GB",
    type=ModelType.Main,
    format=ModelFormat.GGUFQuantized,
-     dependencies=[z_image_qwen3_encoder_quantized],
+     dependencies=[z_image_qwen3_encoder_quantized, flux_vae],
)

z_image_turbo_q8 = StarterModel(
    name="Z-Image Turbo (Q8)",
    base=BaseModelType.ZImage,
    source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q8_0.gguf",
-     description="Z-Image Turbo quantized to GGUF Q8_0 format. Higher quality, larger size. Requires separate Qwen3 text encoder. ~6.6GB",
+     description="Z-Image Turbo quantized to GGUF Q8_0 format. Higher quality, larger size. Requires standalone Qwen3 text encoder and Flux VAE. ~6.6GB",
    type=ModelType.Main,
    format=ModelFormat.GGUFQuantized,
-     dependencies=[z_image_qwen3_encoder_quantized],
+     dependencies=[z_image_qwen3_encoder_quantized, flux_vae],
)

z_image_controlnet_union = StarterModel(
@@ -890,10 +890,19 @@ flux_bundle: list[StarterModel] = [
    flux_krea_quantized,
]

+ zimage_bundle: list[StarterModel] = [
+     z_image_turbo_quantized,
+     z_image_qwen3_encoder_quantized,
+     z_image_controlnet_union,
+     z_image_controlnet_tile,
+     flux_vae,
+ ]
+
STARTER_BUNDLES: dict[str, StarterModelBundle] = {
    BaseModelType.StableDiffusion1: StarterModelBundle(name="Stable Diffusion 1.5", models=sd1_bundle),
    BaseModelType.StableDiffusionXL: StarterModelBundle(name="SDXL", models=sdxl_bundle),
    BaseModelType.Flux: StarterModelBundle(name="FLUX.1 dev", models=flux_bundle),
+     BaseModelType.ZImage: StarterModelBundle(name="Z-Image Turbo", models=zimage_bundle),
}

assert len(STARTER_MODELS) == len({m.source for m in STARTER_MODELS}), "Duplicate starter models"
invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py
@@ -140,16 +140,50 @@ def _get_lora_layer_values(layer_dict: dict[str, torch.Tensor], alpha: float | N


def _group_by_layer(state_dict: Dict[str, torch.Tensor]) -> dict[str, dict[str, torch.Tensor]]:
-     """Groups the keys in the state dict by layer."""
+     """Groups the keys in the state dict by layer.
+
+     Z-Image LoRAs have keys like:
+     - diffusion_model.layers.17.attention.to_k.alpha
+     - diffusion_model.layers.17.attention.to_k.dora_scale
+     - diffusion_model.layers.17.attention.to_k.lora_down.weight
+     - diffusion_model.layers.17.attention.to_k.lora_up.weight
+
+     We need to group these by the full layer path (e.g., diffusion_model.layers.17.attention.to_k)
+     and extract the suffix (alpha, dora_scale, lora_down.weight, lora_up.weight).
+     """
    layer_dict: dict[str, dict[str, torch.Tensor]] = {}
+
+     # Known suffixes that indicate the end of a layer name
+     known_suffixes = [
+         ".lora_A.weight",
+         ".lora_B.weight",
+         ".lora_down.weight",
+         ".lora_up.weight",
+         ".dora_scale",
+         ".alpha",
+     ]
+
    for key in state_dict:
        if not isinstance(key, str):
            continue
-         # Split the 'lora_A.weight' or 'lora_B.weight' suffix from the layer name.
-         parts = key.rsplit(".", maxsplit=2)
-         layer_name = parts[0]
-         key_name = ".".join(parts[1:])
+
+         # Try to find a known suffix
+         layer_name = None
+         key_name = None
+         for suffix in known_suffixes:
+             if key.endswith(suffix):
+                 layer_name = key[: -len(suffix)]
+                 key_name = suffix[1:]  # Remove leading dot
+                 break
+
+         if layer_name is None:
+             # Fallback to original logic for unknown formats
+             parts = key.rsplit(".", maxsplit=2)
+             layer_name = parts[0]
+             key_name = ".".join(parts[1:])
+
        if layer_name not in layer_dict:
            layer_dict[layer_name] = {}
        layer_dict[layer_name][key_name] = state_dict[key]
+
    return layer_dict
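Why the suffix-aware split matters: with the old rsplit logic, single-component suffixes such as .alpha or .dora_scale pulled one path segment too many into the key name. A small demonstration (illustrative only, not part of the diff), using a key from the docstring above:

# Demonstration of the grouping fix above.
key = "diffusion_model.layers.17.attention.to_k.alpha"

# Old behavior: split on the last two dots, which mis-groups ".alpha".
parts = key.rsplit(".", maxsplit=2)
print(parts[0], "->", ".".join(parts[1:]))
# diffusion_model.layers.17.attention -> to_k.alpha   (wrong layer name)

# New behavior: match a known suffix first, then strip it.
for suffix in (".lora_down.weight", ".lora_up.weight", ".dora_scale", ".alpha"):
    if key.endswith(suffix):
        print(key[: -len(suffix)], "->", suffix[1:])
        # diffusion_model.layers.17.attention.to_k -> alpha   (correct)
        break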