diffsynth-engine 0.6.1.dev34__py3-none-any.whl → 0.6.1.dev35__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registries, and is provided for informational purposes only.
- diffsynth_engine/__init__.py +4 -0
- diffsynth_engine/conf/models/z_image/qwen3_config.json +30 -0
- diffsynth_engine/conf/tokenizers/z_image/tokenizer/merges.txt +151388 -0
- diffsynth_engine/conf/tokenizers/z_image/tokenizer/tokenizer.json +757480 -0
- diffsynth_engine/conf/tokenizers/z_image/tokenizer/tokenizer_config.json +239 -0
- diffsynth_engine/conf/tokenizers/z_image/tokenizer/vocab.json +1 -0
- diffsynth_engine/configs/__init__.py +4 -0
- diffsynth_engine/configs/pipeline.py +44 -1
- diffsynth_engine/models/z_image/__init__.py +11 -0
- diffsynth_engine/models/z_image/qwen3.py +124 -0
- diffsynth_engine/models/z_image/z_image_dit.py +602 -0
- diffsynth_engine/pipelines/__init__.py +2 -0
- diffsynth_engine/pipelines/z_image.py +377 -0
- diffsynth_engine/utils/constants.py +3 -0
- diffsynth_engine/utils/process_group.py +1 -1
- {diffsynth_engine-0.6.1.dev34.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/METADATA +1 -1
- {diffsynth_engine-0.6.1.dev34.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/RECORD +20 -11
- {diffsynth_engine-0.6.1.dev34.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/WHEEL +0 -0
- {diffsynth_engine-0.6.1.dev34.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/licenses/LICENSE +0 -0
- {diffsynth_engine-0.6.1.dev34.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/top_level.txt +0 -0
diffsynth_engine/pipelines/z_image.py ADDED
```diff
@@ -0,0 +1,377 @@
+import torch
+import torch.distributed as dist
+import math
+import json
+from typing import Callable, List, Dict, Tuple, Optional, Union
+from tqdm import tqdm
+
+from diffsynth_engine.configs import (
+    ZImagePipelineConfig,
+    ZImageStateDicts,
+)
+from diffsynth_engine.models.basic.lora import LoRAContext
+
+from diffsynth_engine.models.z_image import (
+    ZImageDiT,
+    Qwen3Model,
+    Qwen3Config,
+)
+from diffsynth_engine.tokenizers.qwen2 import Qwen2TokenizerFast
+from diffsynth_engine.utils.constants import (
+    Z_IMAGE_TEXT_ENCODER_CONFIG_FILE,
+    Z_IMAGE_TOKENIZER_CONF_PATH,
+)
+from diffsynth_engine.models.flux import FluxVAEDecoder
+from diffsynth_engine.pipelines import BasePipeline, LoRAStateDictConverter
+from diffsynth_engine.pipelines.utils import calculate_shift
+from diffsynth_engine.algorithm.noise_scheduler import RecifitedFlowScheduler
+from diffsynth_engine.algorithm.sampler import FlowMatchEulerSampler
+from diffsynth_engine.utils.parallel import ParallelWrapper
+from diffsynth_engine.utils import logging
+from diffsynth_engine.utils.fp8_linear import enable_fp8_linear
+from diffsynth_engine.utils.download import fetch_model
+
+logger = logging.get_logger(__name__)
+
+
+class ZImageLoRAConverter(LoRAStateDictConverter):
+    def _from_diffusers(self, lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, torch.Tensor]]:
+        dit_dict = {}
+        for key, param in lora_state_dict.items():
+            if "lora_A.weight" in key:
+                lora_b_key = key.replace("lora_A.weight", "lora_B.weight")
+                target_key = key.replace(".lora_A.weight", "").replace("transformer.", "")
+
+                if "attn.to_out.0" in target_key:
+                    target_key = target_key.replace("attn.to_out.0", "attn.to_out")
+
+                dit_dict[target_key] = {
+                    "down": param,
+                    "up": lora_state_dict[lora_b_key],
+                    "alpha": lora_state_dict.get(key.replace("lora_A.weight", "alpha"), None),
+                }
+        return {"dit": dit_dict}
+
+    def convert(self, lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, torch.Tensor]]:
+        return self._from_diffusers(lora_state_dict)
+
+
+class ZImagePipeline(BasePipeline):
+    lora_converter = ZImageLoRAConverter()
+
+    def __init__(
+        self,
+        config: ZImagePipelineConfig,
+        tokenizer: Qwen2TokenizerFast,
+        text_encoder: Qwen3Model,
+        dit: ZImageDiT,
+        vae_decoder: FluxVAEDecoder,
+    ):
+        super().__init__(
+            vae_tiled=config.vae_tiled,
+            vae_tile_size=config.vae_tile_size,
+            vae_tile_stride=config.vae_tile_stride,
+            device=config.device,
+            dtype=config.model_dtype,
+        )
+        self.config = config
+
+        # Scheduler
+        self.noise_scheduler = RecifitedFlowScheduler(shift=3.0, use_dynamic_shifting=True)
+        self.sampler = FlowMatchEulerSampler()
+        self.tokenizer = tokenizer
+        # Models
+        self.text_encoder = text_encoder
+        self.dit = dit
+        self.vae_decoder = vae_decoder
+
+        self.model_names = ["text_encoder", "dit", "vae_decoder"]
+
+    @classmethod
+    def from_pretrained(cls, model_path_or_config: str | ZImagePipelineConfig) -> "ZImagePipeline":
+        if isinstance(model_path_or_config, str):
+            config = ZImagePipelineConfig(model_path=model_path_or_config)
+        else:
+            config = model_path_or_config
+
+        logger.info(f"Loading state dict from {config.model_path} ...")
+
+        model_state_dict = cls.load_model_checkpoint(
+            config.model_path, device="cpu", dtype=config.model_dtype, convert_dtype=False
+        )
+
+        if config.vae_path is None:
+            config.vae_path = fetch_model(config.model_path, path="vae/diffusion_pytorch_model.safetensors")
+        logger.info(f"Loading VAE from {config.vae_path} ...")
+        vae_state_dict = cls.load_model_checkpoint(config.vae_path, device="cpu", dtype=config.vae_dtype)
+
+        if config.encoder_path is None:
+            config.encoder_path = fetch_model(config.model_path, path="text_encoder")
+        logger.info(f"Loading Text Encoder from {config.encoder_path} ...")
+        text_encoder_state_dict = cls.load_model_checkpoint(
+            config.encoder_path, device="cpu", dtype=config.encoder_dtype
+        )
+
+        state_dicts = ZImageStateDicts(
+            model=model_state_dict,
+            vae=vae_state_dict,
+            encoder=text_encoder_state_dict,
+        )
+        return cls.from_state_dict(state_dicts, config)
+
+    @classmethod
+    def from_state_dict(cls, state_dicts: ZImageStateDicts, config: ZImagePipelineConfig) -> "ZImagePipeline":
+        if config.parallelism > 1:
+            pipe = ParallelWrapper(
+                cfg_degree=config.cfg_degree,
+                sp_ulysses_degree=config.sp_ulysses_degree,
+                sp_ring_degree=config.sp_ring_degree,
+                tp_degree=config.tp_degree,
+                use_fsdp=config.use_fsdp,
+            )
+            pipe.load_module(cls._from_state_dict, state_dicts=state_dicts, config=config)
+        else:
+            pipe = cls._from_state_dict(state_dicts, config)
+        return pipe
+
+    @classmethod
+    def _from_state_dict(cls, state_dicts: ZImageStateDicts, config: ZImagePipelineConfig) -> "ZImagePipeline":
+        init_device = "cpu" if config.offload_mode is not None else config.device
+        with open(Z_IMAGE_TEXT_ENCODER_CONFIG_FILE, "r", encoding="utf-8") as f:
+            qwen3_config = Qwen3Config(**json.load(f))
+        text_encoder = Qwen3Model.from_state_dict(
+            state_dicts.encoder, config=qwen3_config, device=init_device, dtype=config.encoder_dtype
+        )
+        tokenizer = Qwen2TokenizerFast.from_pretrained(Z_IMAGE_TOKENIZER_CONF_PATH)
+        vae_decoder = FluxVAEDecoder.from_state_dict(state_dicts.vae, device=init_device, dtype=config.vae_dtype)
+
+        with LoRAContext():
+            dit = ZImageDiT.from_state_dict(
+                state_dicts.model,
+                device=("cpu" if config.use_fsdp else init_device),
+                dtype=config.model_dtype,
+            )
+            if config.use_fp8_linear:
+                enable_fp8_linear(dit)
+
+        pipe = cls(
+            config=config,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            dit=dit,
+            vae_decoder=vae_decoder,
+        )
+        pipe.eval()
+
+        if config.offload_mode is not None:
+            pipe.enable_cpu_offload(config.offload_mode, config.offload_to_disk)
+
+        if config.model_dtype == torch.float8_e4m3fn:
+            pipe.dtype = torch.bfloat16
+            pipe.enable_fp8_autocast(
+                model_names=["dit"], compute_dtype=pipe.dtype, use_fp8_linear=config.use_fp8_linear
+            )
+
+        if config.use_torch_compile:
+            pipe.compile()
+
+        return pipe
+
+    def update_weights(self, state_dicts: ZImageStateDicts) -> None:
+        self.update_component(self.dit, state_dicts.model, self.config.device, self.config.model_dtype)
+        self.update_component(
+            self.text_encoder, state_dicts.text_encoder, self.config.device, self.config.encoder_dtype
+        )
+        self.update_component(self.vae_decoder, state_dicts.vae, self.config.device, self.config.vae_dtype)
+
+    def compile(self):
+        if hasattr(self.dit, "compile_repeated_blocks"):
+            self.dit.compile_repeated_blocks()
+
+    def load_loras(self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False):
+        assert self.config.tp_degree is None or self.config.tp_degree == 1, (
+            "load LoRA is not allowed when tensor parallel is enabled; "
+            "set tp_degree=None or tp_degree=1 during pipeline initialization"
+        )
+        assert not (self.config.use_fsdp and fused), (
+            "load fused LoRA is not allowed when fully sharded data parallel is enabled; "
+            "either load LoRA with fused=False or set use_fsdp=False during pipeline initialization"
+        )
+        super().load_loras(lora_list, fused, save_original_weight)
+
+    def unload_loras(self):
+        if hasattr(self.dit, "unload_loras"):
+            self.dit.unload_loras()
+        self.noise_scheduler.restore_config()
+
+    def apply_scheduler_config(self, scheduler_config: Dict):
+        self.noise_scheduler.update_config(scheduler_config)
+
+    def prepare_latents(
+        self,
+        latents: torch.Tensor,
+        num_inference_steps: int,
+        mu: float,
+    ):
+        sigmas, timesteps = self.noise_scheduler.schedule(num_inference_steps, mu=mu, sigma_min=0, sigma_max=1.0)
+
+        sigmas = sigmas.to(device=self.device, dtype=self.dtype)
+        timesteps = timesteps.to(device=self.device, dtype=self.dtype)
+        latents = latents.to(device=self.device, dtype=self.dtype)
+
+        return latents, sigmas, timesteps
+
+    def encode_prompt(
+        self,
+        prompt: str,
+        max_sequence_length: int = 512,
+    ):
+        if prompt is None:
+            return None
+        template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+        txt = [template.format(prompt)]
+        text_inputs = self.tokenizer(
+            txt,
+            max_length=max_sequence_length,
+            padding_strategy="max_length",
+        )
+
+        input_ids = text_inputs["input_ids"].to(self.device)
+        attention_mask = text_inputs["attention_mask"].to(self.device).bool()
+        # Encoder forward
+        outputs = self.text_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+        )
+
+        prompt_embeds = outputs["hidden_states"][-2]
+        embeddings_list = []
+        for i in range(len(prompt_embeds)):
+            embeddings_list.append(prompt_embeds[i][attention_mask[i]])
+        return embeddings_list
+
+    def predict_noise_with_cfg(
+        self,
+        latents: torch.Tensor,
+        timestep: torch.Tensor,
+        prompt_emb: List[torch.Tensor],
+        negative_prompt_emb: List[torch.Tensor],
+        cfg_scale: float = 5.0,
+        cfg_truncation: float = 1.0,
+        cfg_normalization: float = 0.0,  # 0.0 means disabled
+        batch_cfg: bool = False,
+    ):
+        t = timestep.expand(latents.shape[0])
+        t = (1000 - t) / 1000
+        progress = t[0].item()
+
+        current_cfg_scale = cfg_scale
+        if cfg_truncation <= 1.0 and progress > cfg_truncation:
+            current_cfg_scale = 0.0
+
+        do_cfg = current_cfg_scale > 0 and negative_prompt_emb is not None
+
+        if not do_cfg:
+            comb_pred = self.predict_noise(latents, t, prompt_emb)[0]
+        else:
+            if not batch_cfg:
+                positive_noise_pred = self.predict_noise(latents, t, prompt_emb)
+                negative_noise_pred = self.predict_noise(latents, t, negative_prompt_emb)
+            else:
+                latents_input = torch.cat([latents, latents], dim=0)
+                t = torch.cat([t, t], dim=0)
+                prompt_input = prompt_emb + negative_prompt_emb
+
+                noise_pred = self.predict_noise(latents_input, t, prompt_input)
+
+                positive_noise_pred, negative_noise_pred = noise_pred[0], noise_pred[1]
+
+            comb_pred = positive_noise_pred + current_cfg_scale * (positive_noise_pred - negative_noise_pred)
+
+            if cfg_normalization is not None and cfg_normalization > 0:
+                cond_norm = torch.linalg.vector_norm(positive_noise_pred)
+                new_norm = torch.linalg.vector_norm(comb_pred)
+                max_allowed_norm = cond_norm * cfg_normalization
+                new_norm = torch.where(new_norm < 1e-6, torch.ones_like(new_norm), new_norm)
+                scale_factor = max_allowed_norm / new_norm
+                scale_factor = torch.clamp(scale_factor, max=1.0)
+                comb_pred = comb_pred * scale_factor
+
+        comb_pred = -comb_pred.squeeze(1).unsqueeze(0)
+        return comb_pred
+
+    def predict_noise(
+        self,
+        latents: torch.Tensor,
+        timestep: torch.Tensor,
+        prompt_emb: List[torch.Tensor],
+    ):
+        self.load_models_to_device(["dit"])
+
+        latents_list = list(latents.unsqueeze(2).unbind(dim=0))
+
+        noise_pred = self.dit(
+            image=latents_list,
+            timestep=timestep,
+            cap_feats=prompt_emb,
+        )
+        return noise_pred
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 1024,
+        width: int = 1024,
+        num_inference_steps: int = 50,
+        cfg_scale: float = 5.0,
+        cfg_normalization: bool = False,
+        cfg_truncation: float = 1.0,
+        seed: Optional[int] = None,
+        progress_callback: Optional[Callable] = None,
+    ):
+        self.validate_image_size(height, width, multiple_of=16)
+
+        self.load_models_to_device(["text_encoder"])
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(prompt), self.encode_prompt(negative_prompt)
+        self.model_lifecycle_finish(["text_encoder"])
+
+        noise = self.generate_noise((1, 16, height // 8, width // 8), seed=seed, device="cpu", dtype=self.dtype).to(
+            device=self.device
+        )
+        image_seq_len = math.ceil(height // 16) * math.ceil(width // 16)
+
+        mu = calculate_shift(image_seq_len, base_seq_len=256, max_seq_len=4096, base_shift=0.5, max_shift=1.15)
+
+        latents, sigmas, timesteps = self.prepare_latents(noise, num_inference_steps, mu)
+
+        self.sampler.initialize(sigmas=sigmas)
+
+        self.load_models_to_device(["dit"])
+        hide_progress = dist.is_initialized() and dist.get_rank() != 0
+
+        for i, timestep in enumerate(tqdm(timesteps, disable=hide_progress)):
+            timestep = timestep.unsqueeze(0).to(dtype=self.dtype)
+            noise_pred = self.predict_noise_with_cfg(
+                latents=latents,
+                timestep=timestep,
+                prompt_emb=prompt_embeds,
+                negative_prompt_emb=negative_prompt_embeds,
+                batch_cfg=self.config.batch_cfg,
+                cfg_truncation=cfg_truncation,
+                cfg_normalization=cfg_normalization,
+            )
+            latents = self.sampler.step(latents, noise_pred, i)
+            if progress_callback is not None:
+                progress_callback(i, len(timesteps), "DENOISING")
+
+        self.model_lifecycle_finish(["dit"])
+
+        self.load_models_to_device(["vae_decoder"])
+        vae_output = self.decode_image(latents)
+        image = self.vae_output_to_image(vae_output)
+        # Offload all models
+        self.load_models_to_device([])
+        return image
```
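Taken together, the new files wire a Z-Image text-to-image pipeline into the existing engine. For orientation, here is a minimal usage sketch based only on the signatures visible in this diff (`from_pretrained` accepting a model path or a `ZImagePipelineConfig`, and `__call__` taking prompt, size, step, and CFG arguments). The model path and output filename are placeholders, and saving assumes `vae_output_to_image` returns a PIL image as in the package's other pipelines:

```python
# Hypothetical usage of the ZImagePipeline added in 0.6.1.dev35.
# Signatures follow the diff above; "<path-or-model-id-of-z-image>" is a placeholder.
from diffsynth_engine.pipelines.z_image import ZImagePipeline

pipe = ZImagePipeline.from_pretrained("<path-or-model-id-of-z-image>")
image = pipe(
    prompt="a watercolor lighthouse at dawn",
    negative_prompt="blurry, low quality",
    height=1024,  # validate_image_size requires multiples of 16
    width=1024,
    num_inference_steps=50,
    cfg_scale=5.0,
    seed=42,
)
image.save("z_image_output.png")  # assumes a PIL.Image return value
```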
diffsynth_engine/utils/constants.py CHANGED
```diff
@@ -14,6 +14,7 @@ SDXL_TOKENIZER_2_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "sdxl", "toke
 WAN_TOKENIZER_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "wan", "umt5-xxl")
 QWEN_IMAGE_TOKENIZER_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "qwen_image", "tokenizer")
 QWEN_IMAGE_PROCESSOR_CONFIG_FILE = os.path.join(CONF_PATH, "tokenizers", "qwen_image", "qwen2_vl_image_processor.json")
+Z_IMAGE_TOKENIZER_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "z_image", "tokenizer")
 
 # models
 VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "components", "vae.json")
@@ -46,6 +47,8 @@ QWEN_IMAGE_VISION_CONFIG_FILE = os.path.join(CONF_PATH, "models", "qwen_image",
 QWEN_IMAGE_VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "qwen_image", "qwen_image_vae.json")
 QWEN_IMAGE_VAE_KEYMAP_FILE = os.path.join(CONF_PATH, "models", "qwen_image", "qwen_image_vae_keymap.json")
 
+Z_IMAGE_TEXT_ENCODER_CONFIG_FILE = os.path.join(CONF_PATH, "models", "z_image", "qwen3_config.json")
+
 # data size
 KB = 1024
 MB = 1024 * KB
```
diffsynth_engine/utils/process_group.py CHANGED
```diff
@@ -20,7 +20,7 @@ class Singleton:
 
 class ProcessGroupSingleton(Singleton):
     def __init__(self):
-        if not hasattr(self,
+        if not hasattr(self, "initialized"):
             self.CFG_GROUP: Optional[dist.ProcessGroup] = None
             self.SP_GROUP: Optional[dist.ProcessGroup] = None
             self.SP_ULYSSUES_GROUP: Optional[dist.ProcessGroup] = None
```
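The one-line change above completes a `hasattr` guard, the usual way to keep a `Singleton` subclass's `__init__` from re-initializing state each time the shared instance is re-requested. A standalone sketch of the same pattern, with illustrative names not taken from the package:

```python
class Singleton:
    _instance = None

    def __new__(cls, *args, **kwargs):
        # Always hand back the one shared instance.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance


class Registry(Singleton):
    def __init__(self):
        # __init__ still runs on every Registry() call, so guard first-time setup.
        if not hasattr(self, "initialized"):
            self.groups = {}
            self.initialized = True


Registry().groups["cfg"] = "demo"
assert Registry() is Registry()
assert Registry().groups["cfg"] == "demo"  # state survives re-instantiation
```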
{diffsynth_engine-0.6.1.dev34.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/RECORD RENAMED
```diff
@@ -1,4 +1,4 @@
-diffsynth_engine/__init__.py,sha256=
+diffsynth_engine/__init__.py,sha256=hN0jYaikjhpqHB4Mg-e53h-7ck1DsiY4FBti8K9lN2k,2390
 diffsynth_engine/algorithm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/algorithm/noise_scheduler/__init__.py,sha256=YvcwE2tCNua-OAX9GEPm0EXsINNWH4XvJMNZb-uaZMM,745
 diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py,sha256=3ve4bYxGyfuERynvoNYdFYSk0agdBgXKCeIOS6O6wgI,819
@@ -52,6 +52,7 @@ diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json,sha256=hfGytOIRkdYFgOR9
 diffsynth_engine/conf/models/wan/vae/wan2.1_vae.json,sha256=eVLTSRqbXm3JD8QDkLbM6vFfCdynlS-8QxqCfi4BzrI,815
 diffsynth_engine/conf/models/wan/vae/wan2.2_vae.json,sha256=pdnYEEZ_GcZHM_iH1y5ASdf_qZUGCOuDEaFmjdg9RKY,1860
 diffsynth_engine/conf/models/wan/vae/wan_vae_keymap.json,sha256=u9MJ3yRL45kdqRVoBnYbHkmuUmOseUFtwte-_9ZvdHc,25224
+diffsynth_engine/conf/models/z_image/qwen3_config.json,sha256=i6AG90_s-q6zkocqYPSkgOfsmGAVPS4bdp7IH5oUf4o,726
 diffsynth_engine/conf/tokenizers/flux/tokenizer_1/merges.txt,sha256=n9aR98gDkhDg_O0VhlRmxlgg0JtjmIsBdL_iXeKZBRo,524619
 diffsynth_engine/conf/tokenizers/flux/tokenizer_1/special_tokens_map.json,sha256=LNs7gzGmDJL8HlWhPp_WH9IpPFpRJ1_czNYreABSUw4,588
 diffsynth_engine/conf/tokenizers/flux/tokenizer_1/tokenizer_config.json,sha256=a9zunMzioWyitMDF7QC0LFDqIl9EcqjEweljopAsKIE,705
@@ -79,9 +80,13 @@ diffsynth_engine/conf/tokenizers/wan/umt5-xxl/special_tokens_map.json,sha256=e4q
 diffsynth_engine/conf/tokenizers/wan/umt5-xxl/spiece.model,sha256=45CaZ7eAZQs1z1Kax4KtK2sm5tH4SdP7tqhykF9FJFg,4548313
 diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json,sha256=bhl7TT29cdoUtOslX0-pHJwfIGiyCi3iRylnyj0iYCs,16837417
 diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json,sha256=7Zo6iw-qcacKMoR-BDX-A25uES1N9O23u0ipIeNE3AU,61728
-diffsynth_engine/
+diffsynth_engine/conf/tokenizers/z_image/tokenizer/merges.txt,sha256=iDHk8aBERxNA98CoPXvXEwaluGfpX9hw900MUwipBNU,1671853
+diffsynth_engine/conf/tokenizers/z_image/tokenizer/tokenizer.json,sha256=rrEzB6cazY_oGGHZStVKtonfdzMYgJ7tPL55S0SS2uQ,11422654
+diffsynth_engine/conf/tokenizers/z_image/tokenizer/tokenizer_config.json,sha256=1dCfB7SMMIbFCLMNHJEUvRGJFFt06YKiZTUMkjrNgQE,9732
+diffsynth_engine/conf/tokenizers/z_image/tokenizer/vocab.json,sha256=yhDX6fs-0YV13R4neiV5wW0QjjLydDloSvoOELFECRA,2776833
+diffsynth_engine/configs/__init__.py,sha256=biluGSEw78PPwO7XFlms16iuWXDiM0Eg_qsOMMTY0NQ,1409
 diffsynth_engine/configs/controlnet.py,sha256=f3vclyP3lcAjxDGD9C1vevhqqQ7W2LL_c6Wye0uxk3Q,1180
-diffsynth_engine/configs/pipeline.py,sha256=
+diffsynth_engine/configs/pipeline.py,sha256=0WmKz_mykmJkRCGwv9DjuN8s27LppkD_Ier4VtovZSg,15307
 diffsynth_engine/kernels/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/models/__init__.py,sha256=8Ze7cSE8InetgXWTNb0neVA2Q44K7WlE-h7O-02m2sY,119
 diffsynth_engine/models/base.py,sha256=svao__9WH8VNcyXz5o5dzywYXDcGV0YV9IfkLzDKews,2558
@@ -142,7 +147,10 @@ diffsynth_engine/models/wan/wan_image_encoder.py,sha256=Vdd39lv_QvOsmPxihZWZZbpP
 diffsynth_engine/models/wan/wan_s2v_dit.py,sha256=j63ulcWLY4XGITOKUMGX292LtSEtP-n8BTvqb98YExU,23615
 diffsynth_engine/models/wan/wan_text_encoder.py,sha256=ePeOifbTI_o650mckzugyWPuHn5vhM-uFMcDVCijxPM,11394
 diffsynth_engine/models/wan/wan_vae.py,sha256=dC7MoUFeXRL7SIY0LG1OOUiZW-pp9IbXCghutMxpXr4,38889
-diffsynth_engine/
+diffsynth_engine/models/z_image/__init__.py,sha256=d1ztBNgM8GR2_uGwlxOE1Jf5URTq1g-WnmJH7nrMoaY,160
+diffsynth_engine/models/z_image/qwen3.py,sha256=PmT6m46Fc7KZXNzG7ig23Mzj6QfHnMmrpX_MM0UuuYg,4580
+diffsynth_engine/models/z_image/z_image_dit.py,sha256=kGtYzmfzk_FDe7KWfXpJagN7k7ROXl5J01IhRRs-Bsk,23806
+diffsynth_engine/pipelines/__init__.py,sha256=xQUtz2cVmcEInazvT1dqv2HdPiJKmywWTIPfbK5dZXI,662
 diffsynth_engine/pipelines/base.py,sha256=ShRiX5MY6bUkRKfuGrA1aalAqeHyeZxhzT87Mwc30b4,17231
 diffsynth_engine/pipelines/flux_image.py,sha256=L0ggxpthLD8a5-zdPHu9z668uWBei9YzPb4PFVypDNU,50707
 diffsynth_engine/pipelines/hunyuan3d_shape.py,sha256=TNV0Wr09Dj2bzzlpua9WioCClOj3YiLfE6utI9aWL8A,8164
@@ -152,6 +160,7 @@ diffsynth_engine/pipelines/sdxl_image.py,sha256=v7ZACGPb6EcBunL6e5E9jynSQjE7GQx8
 diffsynth_engine/pipelines/utils.py,sha256=HZbJHErNJS1DhlwJKvZ9dY7Kh8Zdlsw3zE2e88TYGRY,2277
 diffsynth_engine/pipelines/wan_s2v.py,sha256=QHlCLMqlmnp55iYm2mzg4qCq4jceRAP3Zt5Mubz3mAM,29384
 diffsynth_engine/pipelines/wan_video.py,sha256=9xjSvQ4mlVEDdaL6QuUURj4iyxhJ2xABBphQjkfzK8s,31323
+diffsynth_engine/pipelines/z_image.py,sha256=gSBhKV7TBL9xvCUrABdZA0kNqQzPuawmEv8OcI6KTcs,14756
 diffsynth_engine/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/processor/canny_processor.py,sha256=hV30NlblTkEFUAmF_O-LJrNlGVM2SFrqq6okfF8VpOo,602
 diffsynth_engine/processor/depth_processor.py,sha256=dQvs3JsnyMbz4dyI9QoR8oO-mMFBFAgNvgqeCoaU5jk,1532
@@ -170,7 +179,7 @@ diffsynth_engine/tools/flux_reference_tool.py,sha256=6v0NRZPsDEHFlPruO-ZJTB4rYWx
 diffsynth_engine/tools/flux_replace_tool.py,sha256=AOyEGxHsaNwpTS2VChAieIfECgMxlKsRw0lWPm1k9C0,4627
 diffsynth_engine/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/utils/cache.py,sha256=Ivef22pCuhEq-4H00gSvkLS8ceVZoGis7OSitYL6gH4,2101
-diffsynth_engine/utils/constants.py,sha256=
+diffsynth_engine/utils/constants.py,sha256=x0-bsPRplW-KkRpLVajuC9Yv6f3QbdHgSr3XZ-eBCsQ,3745
 diffsynth_engine/utils/download.py,sha256=w9QQjllPfTUEY371UTREU7o_vvdMY-Q2DymDel3ZEZY,6792
 diffsynth_engine/utils/env.py,sha256=k749eYt_qKGq38GocDiXfkhp8nZrowFefNVTZ8R755I,363
 diffsynth_engine/utils/flag.py,sha256=KSzjnzRe7sleNCJm8IpbJQbmBY4KNV2kDrijxi27Jek,2928
@@ -184,14 +193,14 @@ diffsynth_engine/utils/offload.py,sha256=94og79TIkxldwYUgZT3L4OVu1WBlE7gfVPvO2MR
 diffsynth_engine/utils/onnx.py,sha256=jeWUudJHnESjuiEAHyUZYUZz7dCj34O9aGjHCe8yjWo,1149
 diffsynth_engine/utils/parallel.py,sha256=OBGsAK-3ncArRyMU1lea7tbYgxSdCucQvXheL3Ssl5M,17653
 diffsynth_engine/utils/platform.py,sha256=nbpG-XHJFRmYY6u_e7IBQ9Q6GyItrIkKf3VKuBPTUpY,627
-diffsynth_engine/utils/process_group.py,sha256=
+diffsynth_engine/utils/process_group.py,sha256=I9uiqoVq-Hlu694GnrvbVi7nfVJBsgCCDo3p2kjU3yo,3783
 diffsynth_engine/utils/prompt.py,sha256=YItMchoVzsG6y-LB4vzzDUWrkhKRVlt1HfVhxZjSxMQ,280
 diffsynth_engine/utils/video.py,sha256=8FCaeqIdUsWMgWI_6SO9SPynsToGcLCQAVYFTc4CDhg,2200
 diffsynth_engine/utils/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/utils/memory/linear_regression.py,sha256=oW_EQEw13oPoyUrxiL8A7Ksa5AuJ2ynI2qhCbfAuZbg,3930
 diffsynth_engine/utils/memory/memory_predcit_model.py,sha256=EXprSl_zlVjgfMWNXP-iw83Ot3hyMcgYaRPv-dvyL84,3943
-diffsynth_engine-0.6.1.
-diffsynth_engine-0.6.1.
-diffsynth_engine-0.6.1.
-diffsynth_engine-0.6.1.
-diffsynth_engine-0.6.1.
+diffsynth_engine-0.6.1.dev35.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
+diffsynth_engine-0.6.1.dev35.dist-info/METADATA,sha256=mwDiBscVZRY6rz7Mbmv4qxhlFNSFoACIu4xl0YA9lVE,1164
+diffsynth_engine-0.6.1.dev35.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+diffsynth_engine-0.6.1.dev35.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
+diffsynth_engine-0.6.1.dev35.dist-info/RECORD,,
```
{diffsynth_engine-0.6.1.dev34.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/WHEEL RENAMED
File without changes

{diffsynth_engine-0.6.1.dev34.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/licenses/LICENSE RENAMED
File without changes

{diffsynth_engine-0.6.1.dev34.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/top_level.txt RENAMED
File without changes