invokeai-6.10.0rc1-py3-none-any.whl → invokeai-6.10.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invokeai/app/invocations/flux_denoise.py +15 -1
- invokeai/app/invocations/pbr_maps.py +59 -0
- invokeai/app/invocations/z_image_denoise.py +237 -82
- invokeai/backend/flux/denoise.py +196 -11
- invokeai/backend/flux/schedulers.py +62 -0
- invokeai/backend/image_util/pbr_maps/architecture/block.py +367 -0
- invokeai/backend/image_util/pbr_maps/architecture/pbr_rrdb_net.py +70 -0
- invokeai/backend/image_util/pbr_maps/pbr_maps.py +141 -0
- invokeai/backend/image_util/pbr_maps/utils/image_ops.py +93 -0
- invokeai/backend/model_manager/configs/lora.py +36 -0
- invokeai/backend/model_manager/load/load_default.py +1 -0
- invokeai/backend/model_manager/load/model_loaders/cogview4.py +2 -1
- invokeai/backend/model_manager/load/model_loaders/flux.py +13 -6
- invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py +4 -2
- invokeai/backend/model_manager/load/model_loaders/onnx.py +1 -0
- invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py +3 -1
- invokeai/backend/model_manager/load/model_loaders/z_image.py +37 -3
- invokeai/backend/model_manager/starter_models.py +13 -4
- invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py +39 -5
- invokeai/backend/quantization/gguf/ggml_tensor.py +15 -4
- invokeai/backend/z_image/extensions/regional_prompting_extension.py +10 -12
- invokeai/frontend/web/dist/assets/App-DllqPQ3j.js +161 -0
- invokeai/frontend/web/dist/assets/{browser-ponyfill-DHZxq1nk.js → browser-ponyfill-BP0RxJ4G.js} +1 -1
- invokeai/frontend/web/dist/assets/{index-dgSJAY--.js → index-B44qKjrs.js} +51 -51
- invokeai/frontend/web/dist/index.html +1 -1
- invokeai/frontend/web/dist/locales/en-GB.json +1 -0
- invokeai/frontend/web/dist/locales/en.json +11 -5
- invokeai/version/invokeai_version.py +1 -1
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/METADATA +2 -2
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/RECORD +36 -29
- invokeai/frontend/web/dist/assets/App-CYhlZO3Q.js +0 -161
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/WHEEL +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/entry_points.txt +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/licenses/LICENSE +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/top_level.txt +0 -0

invokeai/backend/image_util/pbr_maps/architecture/pbr_rrdb_net.py
@@ -0,0 +1,70 @@
+# Original: https://github.com/joeyballentine/Material-Map-Generator
+# Adopted and optimized for Invoke AI
+
+import math
+from typing import Literal, Optional
+
+import torch
+import torch.nn as nn
+
+import invokeai.backend.image_util.pbr_maps.architecture.block as B
+
+UPSCALE_MODE = Literal["upconv", "pixelshuffle"]
+
+
+class PBR_RRDB_Net(nn.Module):
+    def __init__(
+        self,
+        in_nc: int,
+        out_nc: int,
+        nf: int,
+        nb: int,
+        gc: int = 32,
+        upscale: int = 4,
+        norm_type: Optional[B.NORMALIZATION_LAYER_TYPE] = None,
+        act_type: B.ACTIVATION_LAYER_TYPE = "leakyrelu",
+        mode: B.BLOCK_MODE = "CNA",
+        res_scale: int = 1,
+        upsample_mode: UPSCALE_MODE = "upconv",
+    ):
+        super(PBR_RRDB_Net, self).__init__()
+        n_upscale = int(math.log(upscale, 2))
+        if upscale == 3:
+            n_upscale = 1
+
+        fea_conv = B.conv_block(in_nc, nf, kernel_size=3, norm_type=None, act_type=None)
+        rb_blocks = [
+            B.RRDB(
+                nf,
+                kernel_size=3,
+                gc=32,
+                stride=1,
+                bias=True,
+                pad_type="zero",
+                norm_type=norm_type,
+                act_type=act_type,
+                mode="CNA",
+            )
+            for _ in range(nb)
+        ]
+        LR_conv = B.conv_block(nf, nf, kernel_size=3, norm_type=norm_type, act_type=None, mode=mode)
+
+        if upsample_mode == "upconv":
+            upsample_block = B.upconv_block
+        elif upsample_mode == "pixelshuffle":
+            upsample_block = B.pixelshuffle_block
+
+        if upscale == 3:
+            upsampler = upsample_block(nf, nf, 3, act_type=act_type)
+        else:
+            upsampler = [upsample_block(nf, nf, act_type=act_type) for _ in range(n_upscale)]
+
+        HR_conv0 = B.conv_block(nf, nf, kernel_size=3, norm_type=None, act_type=act_type)
+        HR_conv1 = B.conv_block(nf, out_nc, kernel_size=3, norm_type=None, act_type=None)
+
+        self.model = B.sequential(
+            fea_conv, B.ShortcutBlock(B.sequential(*rb_blocks, LR_conv)), *upsampler, HR_conv0, HR_conv1
+        )
+
+    def forward(self, x: torch.Tensor):
+        return self.model(x)
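
The hunk above adds the RRDB-based generator network. Below is a minimal sketch of exercising it directly, assuming the invokeai package from this wheel is installed; the hyperparameters mirror the ones PBRMapsGenerator.load_model passes further down in this diff.

# Sketch only: build the network as the PBR loader does and run a dummy tensor through it.
import torch

from invokeai.backend.image_util.pbr_maps.architecture.pbr_rrdb_net import PBR_RRDB_Net

net = PBR_RRDB_Net(3, 3, 32, 12, gc=32, upscale=1, norm_type=None, act_type="leakyrelu", mode="CNA")
net.eval()

with torch.no_grad():
    out = net(torch.rand(1, 3, 64, 64))  # NCHW float input in [0, 1]
print(out.shape)  # torch.Size([1, 3, 64, 64])

With upscale=1 the computed n_upscale is 0, so the upsampler list is empty and the output keeps the input resolution, which is what the map generator relies on.
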

invokeai/backend/image_util/pbr_maps/pbr_maps.py
@@ -0,0 +1,141 @@
+# Original: https://github.com/joeyballentine/Material-Map-Generator
+# Adopted and optimized for Invoke AI
+
+import pathlib
+from typing import Any, Literal
+
+import cv2
+import numpy as np
+import numpy.typing as npt
+import torch
+from PIL import Image
+from safetensors.torch import load_file
+
+from invokeai.backend.image_util.pbr_maps.architecture.pbr_rrdb_net import PBR_RRDB_Net
+from invokeai.backend.image_util.pbr_maps.utils.image_ops import crop_seamless, esrgan_launcher_split_merge
+
+NORMAL_MAP_MODEL = (
+    "https://huggingface.co/InvokeAI/pbr-material-maps/resolve/main/normal_map_generator.safetensors?download=true"
+)
+OTHER_MAP_MODEL = (
+    "https://huggingface.co/InvokeAI/pbr-material-maps/resolve/main/franken_map_generator.safetensors?download=true"
+)
+
+
+class PBRMapsGenerator:
+    def __init__(self, normal_map_model: PBR_RRDB_Net, other_map_model: PBR_RRDB_Net, device: torch.device) -> None:
+        self.normal_map_model = normal_map_model
+        self.other_map_model = other_map_model
+        self.device = device
+
+    @staticmethod
+    def load_model(model_path: pathlib.Path, device: torch.device) -> PBR_RRDB_Net:
+        state_dict = load_file(model_path.as_posix(), device=device.type)
+
+        model = PBR_RRDB_Net(
+            3,
+            3,
+            32,
+            12,
+            gc=32,
+            upscale=1,
+            norm_type=None,
+            act_type="leakyrelu",
+            mode="CNA",
+            res_scale=1,
+            upsample_mode="upconv",
+        )
+
+        model.load_state_dict(state_dict, strict=False)
+
+        del state_dict
+        if torch.cuda.is_available() and device.type == "cuda":
+            torch.cuda.empty_cache()
+
+        model.eval()
+
+        for _, v in model.named_parameters():
+            v.requires_grad = False
+
+        return model.to(device)
+
+    def process(self, img: npt.NDArray[Any], model: PBR_RRDB_Net):
+        img = img.astype(np.float32) / np.iinfo(img.dtype).max
+        img = img[..., ::-1].copy()
+        tensor_img = torch.tensor(img).permute(2, 0, 1).unsqueeze(0).to(self.device)
+
+        with torch.no_grad():
+            output = model(tensor_img).data.squeeze(0).float().cpu().clamp_(0, 1).numpy()
+        output = output[[2, 1, 0], :, :]
+        output = np.transpose(output, (1, 2, 0))
+        output = (output * 255.0).round()
+        return output
+
+    def _cv2_to_pil(self, image: npt.NDArray[Any]):
+        return Image.fromarray(cv2.cvtColor(image.astype(np.uint8), cv2.COLOR_RGB2BGR))
+
+    def generate_maps(
+        self,
+        image: Image.Image,
+        tile_size: int = 512,
+        border_mode: Literal["none", "seamless", "mirror", "replicate"] = "none",
+    ):
+        """
+        Generate PBR texture maps (normal, roughness, and displacement) from an input image.
+        The image can optionally be padded before inference to control how borders are treated,
+        which can help create seamless or edge-consistent textures.
+
+        Args:
+            image: Source image used to generate the PBR maps.
+            tile_size: Maximum tile size used for tiled inference. If the image is larger than
+                this size in either dimension, it will be split into tiles for processing and
+                then merged.
+
+            border_mode: Strategy for padding the image before inference:
+                - "none": No padding is applied; the image is processed as-is.
+                - "seamless": Pads the image using wrap-around tiling
+                  (`cv2.BORDER_WRAP`) to help produce seamless textures.
+                - "mirror": Pads the image by mirroring border pixels
+                  (`cv2.BORDER_REFLECT_101`) to reduce edge artifacts.
+                - "replicate": Pads the image by replicating the edge pixels outward
+                  (`cv2.BORDER_REPLICATE`).
+
+        Returns:
+            A tuple of three PIL Images:
+            - normal_map: RGB normal map generated from the input.
+            - roughness: Single-channel roughness map extracted from the second model output.
+            - displacement: Single-channel displacement (height) map extracted from the
+              second model output.
+        """
+
+        models = [self.normal_map_model, self.other_map_model]
+        np_image = np.array(image).astype(np.uint8)
+
+        match border_mode:
+            case "seamless":
+                np_image = cv2.copyMakeBorder(np_image, 16, 16, 16, 16, cv2.BORDER_WRAP)
+            case "mirror":
+                np_image = cv2.copyMakeBorder(np_image, 16, 16, 16, 16, cv2.BORDER_REFLECT_101)
+            case "replicate":
+                np_image = cv2.copyMakeBorder(np_image, 16, 16, 16, 16, cv2.BORDER_REPLICATE)
+            case "none":
+                pass
+
+        img_height, img_width = np_image.shape[:2]
+
+        # Checking whether to perform tiled inference
+        do_split = img_height > tile_size or img_width > tile_size
+
+        if do_split:
+            rlts = esrgan_launcher_split_merge(np_image, self.process, models, scale_factor=1, tile_size=tile_size)
+        else:
+            rlts = [self.process(np_image, model) for model in models]
+
+        if border_mode != "none":
+            rlts = [crop_seamless(rlt) for rlt in rlts]
+
+        normal_map = self._cv2_to_pil(rlts[0])
+        roughness = self._cv2_to_pil(rlts[1][:, :, 1])
+        displacement = self._cv2_to_pil(rlts[1][:, :, 0])
+
+        return normal_map, roughness, displacement
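
A hedged usage sketch for the PBRMapsGenerator API added above. The local .safetensors paths and the image file names are illustrative assumptions; in the app the weights are fetched from the NORMAL_MAP_MODEL and OTHER_MAP_MODEL URLs by InvokeAI's download machinery.

import pathlib

import torch
from PIL import Image

from invokeai.backend.image_util.pbr_maps.pbr_maps import PBRMapsGenerator

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hypothetical local copies of the two checkpoints referenced above.
normal_model = PBRMapsGenerator.load_model(pathlib.Path("normal_map_generator.safetensors"), device)
other_model = PBRMapsGenerator.load_model(pathlib.Path("franken_map_generator.safetensors"), device)

generator = PBRMapsGenerator(normal_model, other_model, device)
normal_map, roughness, displacement = generator.generate_maps(
    Image.open("brick_albedo.png").convert("RGB"),  # hypothetical input texture
    tile_size=512,
    border_mode="seamless",  # pads with cv2.BORDER_WRAP, then crop_seamless removes the 16 px border
)
normal_map.save("brick_normal.png")
roughness.save("brick_roughness.png")
displacement.save("brick_displacement.png")
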

invokeai/backend/image_util/pbr_maps/utils/image_ops.py
@@ -0,0 +1,93 @@
+# Original: https://github.com/joeyballentine/Material-Map-Generator
+# Adopted and optimized for Invoke AI
+
+import math
+from typing import Any, Callable, List
+
+import numpy as np
+import numpy.typing as npt
+
+from invokeai.backend.image_util.pbr_maps.architecture.pbr_rrdb_net import PBR_RRDB_Net
+
+
+def crop_seamless(img: npt.NDArray[Any]):
+    img_height, img_width = img.shape[:2]
+    y, x = 16, 16
+    h, w = img_height - 32, img_width - 32
+    img = img[y : y + h, x : x + w]
+    return img
+
+
+# from https://github.com/ata4/esrgan-launcher/blob/master/upscale.py
+def esrgan_launcher_split_merge(
+    input_image: npt.NDArray[Any],
+    upscale_function: Callable[[npt.NDArray[Any], PBR_RRDB_Net], npt.NDArray[Any]],
+    models: List[PBR_RRDB_Net],
+    scale_factor: int = 4,
+    tile_size: int = 512,
+    tile_padding: float = 0.125,
+):
+    width, height, depth = input_image.shape
+    output_width = width * scale_factor
+    output_height = height * scale_factor
+    output_shape = (output_width, output_height, depth)
+
+    # start with black image
+    output_images = [np.zeros(output_shape, np.uint8) for _ in range(len(models))]
+
+    tile_padding = math.ceil(tile_size * tile_padding)
+    tile_size = math.ceil(tile_size / scale_factor)
+
+    tiles_x = math.ceil(width / tile_size)
+    tiles_y = math.ceil(height / tile_size)
+
+    for y in range(tiles_y):
+        for x in range(tiles_x):
+            # extract tile from input image
+            ofs_x = x * tile_size
+            ofs_y = y * tile_size
+
+            # input tile area on total image
+            input_start_x = ofs_x
+            input_end_x = min(ofs_x + tile_size, width)
+
+            input_start_y = ofs_y
+            input_end_y = min(ofs_y + tile_size, height)
+
+            # input tile area on total image with padding
+            input_start_x_pad = max(input_start_x - tile_padding, 0)
+            input_end_x_pad = min(input_end_x + tile_padding, width)
+
+            input_start_y_pad = max(input_start_y - tile_padding, 0)
+            input_end_y_pad = min(input_end_y + tile_padding, height)
+
+            # input tile dimensions
+            input_tile_width = input_end_x - input_start_x
+            input_tile_height = input_end_y - input_start_y
+
+            input_tile = input_image[input_start_x_pad:input_end_x_pad, input_start_y_pad:input_end_y_pad]
+
+            for idx, model in enumerate(models):
+                # upscale tile
+                output_tile = upscale_function(input_tile, model)
+
+                # output tile area on total image
+                output_start_x = input_start_x * scale_factor
+                output_end_x = input_end_x * scale_factor
+
+                output_start_y = input_start_y * scale_factor
+                output_end_y = input_end_y * scale_factor
+
+                # output tile area without padding
+                output_start_x_tile = (input_start_x - input_start_x_pad) * scale_factor
+                output_end_x_tile = output_start_x_tile + input_tile_width * scale_factor
+
+                output_start_y_tile = (input_start_y - input_start_y_pad) * scale_factor
+                output_end_y_tile = output_start_y_tile + input_tile_height * scale_factor
+
+                # put tile into output image
+                output_images[idx][output_start_x:output_end_x, output_start_y:output_end_y] = output_tile[
+                    output_start_x_tile:output_end_x_tile, output_start_y_tile:output_end_y_tile
+                ]
+
+    return output_images
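
The tile arithmetic in esrgan_launcher_split_merge is easiest to see with concrete numbers. Here is a small worked example, assuming a hypothetical 1280x768 input and the scale_factor=1, tile_size=512, tile_padding=0.125 values that generate_maps passes.

import math

width, height = 1280, 768
scale_factor, tile_size, tile_padding = 1, 512, 0.125

pad = math.ceil(tile_size * tile_padding)   # 64 px of extra context around each tile
tile = math.ceil(tile_size / scale_factor)  # 512, unchanged because scale_factor=1
tiles_x = math.ceil(width / tile)           # 3 tile columns
tiles_y = math.ceil(height / tile)          # 2 tile rows
print(pad, tile, tiles_x, tiles_y)          # 64 512 3 2

Each tile is run through the model with the 64-pixel padded context, and only the unpadded interior is copied back into the output canvas, which keeps tile seams out of the merged maps.
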

invokeai/backend/model_manager/configs/lora.py
@@ -227,6 +227,42 @@ class LoRA_LyCORIS_ZImage_Config(LoRA_LyCORIS_Config_Base, Config_Base):
 
     base: Literal[BaseModelType.ZImage] = Field(default=BaseModelType.ZImage)
 
+    @classmethod
+    def _validate_looks_like_lora(cls, mod: ModelOnDisk) -> None:
+        """Z-Image LoRAs have different key patterns than SD/SDXL LoRAs.
+
+        Z-Image LoRAs use keys like:
+        - diffusion_model.layers.X.attention.to_k.lora_down.weight (DoRA format)
+        - diffusion_model.layers.X.attention.to_k.lora_A.weight (PEFT format)
+        - diffusion_model.layers.X.attention.to_k.dora_scale (DoRA scale)
+        """
+        state_dict = mod.load_state_dict()
+
+        # Check for Z-Image specific LoRA patterns
+        has_z_image_lora_keys = state_dict_has_any_keys_starting_with(
+            state_dict,
+            {
+                "diffusion_model.layers.",  # Z-Image S3-DiT layer pattern
+            },
+        )
+
+        # Also check for LoRA weight suffixes (various formats)
+        has_lora_suffix = state_dict_has_any_keys_ending_with(
+            state_dict,
+            {
+                "lora_A.weight",
+                "lora_B.weight",
+                "lora_down.weight",
+                "lora_up.weight",
+                "dora_scale",
+            },
+        )
+
+        if has_z_image_lora_keys and has_lora_suffix:
+            return
+
+        raise NotAMatchError("model does not match Z-Image LoRA heuristics")
+
     @classmethod
     def _get_base_or_raise(cls, mod: ModelOnDisk) -> BaseModelType:
         """Z-Image LoRAs are identified by their diffusion_model.layers structure.
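
The new _validate_looks_like_lora heuristic accepts a file only when both checks pass. A self-contained sketch with a toy state dict, using plain any()/str methods in place of InvokeAI's state_dict_has_any_keys_starting_with / _ending_with helpers:

import torch

state_dict = {
    "diffusion_model.layers.17.attention.to_k.lora_down.weight": torch.zeros(8, 64),
    "diffusion_model.layers.17.attention.to_k.lora_up.weight": torch.zeros(64, 8),
    "diffusion_model.layers.17.attention.to_k.alpha": torch.tensor(8.0),
}

# Both conditions must hold: Z-Image S3-DiT layer prefix AND a recognized LoRA/DoRA suffix.
has_z_image_lora_keys = any(k.startswith("diffusion_model.layers.") for k in state_dict)
has_lora_suffix = any(
    k.endswith(("lora_A.weight", "lora_B.weight", "lora_down.weight", "lora_up.weight", "dora_scale"))
    for k in state_dict
)
print(has_z_image_lora_keys and has_lora_suffix)  # True -> treated as a Z-Image LoRA
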

invokeai/backend/model_manager/load/load_default.py
@@ -75,6 +75,7 @@ class ModelLoader(ModelLoaderBase):
 
         config.path = str(self._get_model_path(config))
         self._ram_cache.make_room(self.get_size_fs(config, Path(config.path), submodel_type))
+        self._logger.info(f"Loading model '{stats_name}' into RAM cache..., config={config}")
         loaded_model = self._load_model(config, submodel_type)
 
         self._ram_cache.put(

invokeai/backend/model_manager/load/model_loaders/cogview4.py
@@ -45,12 +45,13 @@ class CogView4DiffusersModel(GenericDiffusersLoader):
                 model_path,
                 torch_dtype=dtype,
                 variant=variant,
+                local_files_only=True,
             )
         except OSError as e:
             if variant and "no file named" in str(
                 e
             ):  # try without the variant, just in case user's preferences changed
-                result = load_class.from_pretrained(model_path, torch_dtype=dtype)
+                result = load_class.from_pretrained(model_path, torch_dtype=dtype, local_files_only=True)
             else:
                 raise e
 

invokeai/backend/model_manager/load/model_loaders/flux.py
@@ -122,9 +122,9 @@ class CLIPDiffusersLoader(ModelLoader):
 
        match submodel_type:
            case SubModelType.Tokenizer:
-                return CLIPTokenizer.from_pretrained(Path(config.path) / "tokenizer")
+                return CLIPTokenizer.from_pretrained(Path(config.path) / "tokenizer", local_files_only=True)
            case SubModelType.TextEncoder:
-                return CLIPTextModel.from_pretrained(Path(config.path) / "text_encoder")
+                return CLIPTextModel.from_pretrained(Path(config.path) / "text_encoder", local_files_only=True)
 
        raise ValueError(
            f"Only Tokenizer and TextEncoder submodels are currently supported. Received: {submodel_type.value if submodel_type else 'None'}"

@@ -148,10 +148,12 @@ class BnbQuantizedLlmInt8bCheckpointModel(ModelLoader):
         )
         match submodel_type:
             case SubModelType.Tokenizer2 | SubModelType.Tokenizer3:
-                return T5TokenizerFast.from_pretrained(
+                return T5TokenizerFast.from_pretrained(
+                    Path(config.path) / "tokenizer_2", max_length=512, local_files_only=True
+                )
             case SubModelType.TextEncoder2 | SubModelType.TextEncoder3:
                 te2_model_path = Path(config.path) / "text_encoder_2"
-                model_config = AutoConfig.from_pretrained(te2_model_path)
+                model_config = AutoConfig.from_pretrained(te2_model_path, local_files_only=True)
                 with accelerate.init_empty_weights():
                     model = AutoModelForTextEncoding.from_config(model_config)
                 model = quantize_model_llm_int8(model, modules_to_not_convert=set())

@@ -192,10 +194,15 @@ class T5EncoderCheckpointModel(ModelLoader):
 
         match submodel_type:
             case SubModelType.Tokenizer2 | SubModelType.Tokenizer3:
-                return T5TokenizerFast.from_pretrained(
+                return T5TokenizerFast.from_pretrained(
+                    Path(config.path) / "tokenizer_2", max_length=512, local_files_only=True
+                )
             case SubModelType.TextEncoder2 | SubModelType.TextEncoder3:
                 return T5EncoderModel.from_pretrained(
-                    Path(config.path) / "text_encoder_2",
+                    Path(config.path) / "text_encoder_2",
+                    torch_dtype="auto",
+                    low_cpu_mem_usage=True,
+                    local_files_only=True,
                 )
 
         raise ValueError(

invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py
@@ -37,12 +37,14 @@ class GenericDiffusersLoader(ModelLoader):
         repo_variant = config.repo_variant if isinstance(config, Diffusers_Config_Base) else None
         variant = repo_variant.value if repo_variant else None
         try:
-            result: AnyModel = model_class.from_pretrained(
+            result: AnyModel = model_class.from_pretrained(
+                model_path, torch_dtype=self._torch_dtype, variant=variant, local_files_only=True
+            )
         except OSError as e:
             if variant and "no file named" in str(
                 e
             ):  # try without the variant, just in case user's preferences changed
-                result = model_class.from_pretrained(model_path, torch_dtype=self._torch_dtype)
+                result = model_class.from_pretrained(model_path, torch_dtype=self._torch_dtype, local_files_only=True)
             else:
                 raise e
         return result
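
This hunk, like the cogview4 and CLIP/T5 loader hunks above and the stable_diffusion ones below, threads local_files_only=True through the variant-fallback load path. A generic sketch of the pattern, with a hypothetical local diffusers folder and AutoencoderKL standing in for whatever class the loader actually resolves:

from pathlib import Path

import torch
from diffusers import AutoencoderKL

model_path = Path("/path/to/models/sdxl-vae")  # hypothetical local diffusers folder
variant = "fp16"

try:
    # Prefer the recorded repo variant; local_files_only keeps this fully offline.
    vae = AutoencoderKL.from_pretrained(
        model_path, torch_dtype=torch.float16, variant=variant, local_files_only=True
    )
except OSError as e:
    if variant and "no file named" in str(e):
        # Variant files are not on disk; retry without a variant, still offline.
        vae = AutoencoderKL.from_pretrained(model_path, torch_dtype=torch.float16, local_files_only=True)
    else:
        raise
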

invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py
@@ -80,12 +80,13 @@ class StableDiffusionDiffusersModel(GenericDiffusersLoader):
                 model_path,
                 torch_dtype=self._torch_dtype,
                 variant=variant,
+                local_files_only=True,
             )
         except OSError as e:
             if variant and "no file named" in str(
                 e
             ):  # try without the variant, just in case user's preferences changed
-                result = load_class.from_pretrained(model_path, torch_dtype=self._torch_dtype)
+                result = load_class.from_pretrained(model_path, torch_dtype=self._torch_dtype, local_files_only=True)
             else:
                 raise e
 

@@ -139,6 +140,7 @@ class StableDiffusionDiffusersModel(GenericDiffusersLoader):
         # Some weights of the model checkpoint were not used when initializing CLIPTextModelWithProjection:
         # ['text_model.embeddings.position_ids']
 
+        self._logger.info(f"Loading model from single file at {config.path} using {load_class.__name__}")
         with SilenceWarnings():
             pipeline = load_class.from_single_file(config.path, torch_dtype=self._torch_dtype)
 

invokeai/backend/model_manager/load/model_loaders/z_image.py
@@ -384,15 +384,19 @@ class Qwen3EncoderLoader(ModelLoader):
 
         match submodel_type:
             case SubModelType.Tokenizer:
-
+                # Use local_files_only=True to prevent network requests for validation
+                # The tokenizer files should already exist locally in the model directory
+                return AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
             case SubModelType.TextEncoder:
                 # Determine safe dtype based on target device capabilities
                 target_device = TorchDevice.choose_torch_device()
                 model_dtype = TorchDevice.choose_bfloat16_safe_dtype(target_device)
+                # Use local_files_only=True to prevent network requests for validation
                 return Qwen3ForCausalLM.from_pretrained(
                     text_encoder_path,
                     torch_dtype=model_dtype,
                     low_cpu_mem_usage=True,
+                    local_files_only=True,
                 )
 
         raise ValueError(

@@ -526,12 +530,27 @@ class Qwen3EncoderCheckpointLoader(ModelLoader):
                 return self._load_from_singlefile(config)
             case SubModelType.Tokenizer:
                 # For single-file Qwen3, load tokenizer from HuggingFace
-
+                # Try local cache first to support offline usage after initial download
+                return self._load_tokenizer_with_offline_fallback()
 
         raise ValueError(
             f"Only TextEncoder and Tokenizer submodels are supported. Received: {submodel_type.value if submodel_type else 'None'}"
         )
 
+    def _load_tokenizer_with_offline_fallback(self) -> AnyModel:
+        """Load tokenizer with local_files_only fallback for offline support.
+
+        First tries to load from local cache (offline), falling back to network download
+        if the tokenizer hasn't been cached yet. This ensures offline operation after
+        the initial download.
+        """
+        try:
+            # Try loading from local cache first (supports offline usage)
+            return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE, local_files_only=True)
+        except OSError:
+            # Not in cache yet, download from HuggingFace
+            return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE)
+
     def _load_from_singlefile(
         self,
         config: AnyModelConfig,
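
The new _load_tokenizer_with_offline_fallback helper tries the local HuggingFace cache first and only downloads when nothing has been cached yet. A standalone sketch of the same pattern; "Qwen/Qwen3-4B" is a placeholder repo id, not the loader's actual DEFAULT_TOKENIZER_SOURCE constant:

from transformers import AutoTokenizer

DEFAULT_TOKENIZER_SOURCE = "Qwen/Qwen3-4B"  # placeholder repo id, not the real constant

def load_tokenizer_with_offline_fallback(source: str = DEFAULT_TOKENIZER_SOURCE):
    try:
        # Hit only the local HF cache; raises OSError when the files were never downloaded.
        return AutoTokenizer.from_pretrained(source, local_files_only=True)
    except OSError:
        # First run on this machine: allow one network download, after which the
        # cache-only path above succeeds and the app can run offline.
        return AutoTokenizer.from_pretrained(source)
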

@@ -686,12 +705,27 @@ class Qwen3EncoderGGUFLoader(ModelLoader):
                 return self._load_from_gguf(config)
             case SubModelType.Tokenizer:
                 # For GGUF Qwen3, load tokenizer from HuggingFace
-
+                # Try local cache first to support offline usage after initial download
+                return self._load_tokenizer_with_offline_fallback()
 
         raise ValueError(
             f"Only TextEncoder and Tokenizer submodels are supported. Received: {submodel_type.value if submodel_type else 'None'}"
         )
 
+    def _load_tokenizer_with_offline_fallback(self) -> AnyModel:
+        """Load tokenizer with local_files_only fallback for offline support.
+
+        First tries to load from local cache (offline), falling back to network download
+        if the tokenizer hasn't been cached yet. This ensures offline operation after
+        the initial download.
+        """
+        try:
+            # Try loading from local cache first (supports offline usage)
+            return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE, local_files_only=True)
+        except OSError:
+            # Not in cache yet, download from HuggingFace
+            return AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_SOURCE)
+
     def _load_from_gguf(
         self,
         config: AnyModelConfig,

invokeai/backend/model_manager/starter_models.py
@@ -720,20 +720,20 @@ z_image_turbo_quantized = StarterModel(
     name="Z-Image Turbo (quantized)",
     base=BaseModelType.ZImage,
     source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q4_K.gguf",
-    description="Z-Image Turbo quantized to GGUF Q4_K format. Requires
+    description="Z-Image Turbo quantized to GGUF Q4_K format. Requires standalone Qwen3 text encoder and Flux VAE. ~4GB",
     type=ModelType.Main,
     format=ModelFormat.GGUFQuantized,
-    dependencies=[z_image_qwen3_encoder_quantized],
+    dependencies=[z_image_qwen3_encoder_quantized, flux_vae],
 )
 
 z_image_turbo_q8 = StarterModel(
     name="Z-Image Turbo (Q8)",
     base=BaseModelType.ZImage,
     source="https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q8_0.gguf",
-    description="Z-Image Turbo quantized to GGUF Q8_0 format. Higher quality, larger size. Requires
+    description="Z-Image Turbo quantized to GGUF Q8_0 format. Higher quality, larger size. Requires standalone Qwen3 text encoder and Flux VAE. ~6.6GB",
     type=ModelType.Main,
     format=ModelFormat.GGUFQuantized,
-    dependencies=[z_image_qwen3_encoder_quantized],
+    dependencies=[z_image_qwen3_encoder_quantized, flux_vae],
 )
 
 z_image_controlnet_union = StarterModel(

@@ -890,10 +890,19 @@ flux_bundle: list[StarterModel] = [
     flux_krea_quantized,
 ]
 
+zimage_bundle: list[StarterModel] = [
+    z_image_turbo_quantized,
+    z_image_qwen3_encoder_quantized,
+    z_image_controlnet_union,
+    z_image_controlnet_tile,
+    flux_vae,
+]
+
 STARTER_BUNDLES: dict[str, StarterModelBundle] = {
     BaseModelType.StableDiffusion1: StarterModelBundle(name="Stable Diffusion 1.5", models=sd1_bundle),
     BaseModelType.StableDiffusionXL: StarterModelBundle(name="SDXL", models=sdxl_bundle),
     BaseModelType.Flux: StarterModelBundle(name="FLUX.1 dev", models=flux_bundle),
+    BaseModelType.ZImage: StarterModelBundle(name="Z-Image Turbo", models=zimage_bundle),
 }
 
 assert len(STARTER_MODELS) == len({m.source for m in STARTER_MODELS}), "Duplicate starter models"

invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py
@@ -140,16 +140,50 @@ def _get_lora_layer_values(layer_dict: dict[str, torch.Tensor], alpha: float | N
 
 
 def _group_by_layer(state_dict: Dict[str, torch.Tensor]) -> dict[str, dict[str, torch.Tensor]]:
-    """Groups the keys in the state dict by layer.
+    """Groups the keys in the state dict by layer.
+
+    Z-Image LoRAs have keys like:
+    - diffusion_model.layers.17.attention.to_k.alpha
+    - diffusion_model.layers.17.attention.to_k.dora_scale
+    - diffusion_model.layers.17.attention.to_k.lora_down.weight
+    - diffusion_model.layers.17.attention.to_k.lora_up.weight
+
+    We need to group these by the full layer path (e.g., diffusion_model.layers.17.attention.to_k)
+    and extract the suffix (alpha, dora_scale, lora_down.weight, lora_up.weight).
+    """
     layer_dict: dict[str, dict[str, torch.Tensor]] = {}
+
+    # Known suffixes that indicate the end of a layer name
+    known_suffixes = [
+        ".lora_A.weight",
+        ".lora_B.weight",
+        ".lora_down.weight",
+        ".lora_up.weight",
+        ".dora_scale",
+        ".alpha",
+    ]
+
     for key in state_dict:
        if not isinstance(key, str):
            continue
-
-
-        layer_name =
-        key_name =
+
+        # Try to find a known suffix
+        layer_name = None
+        key_name = None
+        for suffix in known_suffixes:
+            if key.endswith(suffix):
+                layer_name = key[: -len(suffix)]
+                key_name = suffix[1:]  # Remove leading dot
+                break
+
+        if layer_name is None:
+            # Fallback to original logic for unknown formats
+            parts = key.rsplit(".", maxsplit=2)
+            layer_name = parts[0]
+            key_name = ".".join(parts[1:])
+
        if layer_name not in layer_dict:
            layer_dict[layer_name] = {}
        layer_dict[layer_name][key_name] = state_dict[key]
+
    return layer_dict
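
A toy run of the suffix-based grouping that _group_by_layer now performs, showing how one Z-Image layer's keys collapse into a single entry (every key here matches a known suffix, so the rsplit fallback is not exercised):

import torch

keys = [
    "diffusion_model.layers.17.attention.to_k.lora_down.weight",
    "diffusion_model.layers.17.attention.to_k.lora_up.weight",
    "diffusion_model.layers.17.attention.to_k.alpha",
]
state_dict = {k: torch.zeros(1) for k in keys}

known_suffixes = [".lora_A.weight", ".lora_B.weight", ".lora_down.weight", ".lora_up.weight", ".dora_scale", ".alpha"]

layer_dict: dict[str, dict[str, torch.Tensor]] = {}
for key in state_dict:
    for suffix in known_suffixes:
        if key.endswith(suffix):
            # Strip the suffix to get the layer path; the suffix (minus its dot) is the entry name.
            layer_name, key_name = key[: -len(suffix)], suffix[1:]
            break
    layer_dict.setdefault(layer_name, {})[key_name] = state_dict[key]

print(list(layer_dict))
# ['diffusion_model.layers.17.attention.to_k']
print(sorted(layer_dict["diffusion_model.layers.17.attention.to_k"]))
# ['alpha', 'lora_down.weight', 'lora_up.weight']
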