diffsynth-engine 0.6.1.dev33__py3-none-any.whl → 0.6.1.dev35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffsynth_engine/__init__.py +4 -0
- diffsynth_engine/conf/models/z_image/qwen3_config.json +30 -0
- diffsynth_engine/conf/tokenizers/z_image/tokenizer/merges.txt +151388 -0
- diffsynth_engine/conf/tokenizers/z_image/tokenizer/tokenizer.json +757480 -0
- diffsynth_engine/conf/tokenizers/z_image/tokenizer/tokenizer_config.json +239 -0
- diffsynth_engine/conf/tokenizers/z_image/tokenizer/vocab.json +1 -0
- diffsynth_engine/configs/__init__.py +4 -0
- diffsynth_engine/configs/pipeline.py +44 -1
- diffsynth_engine/models/basic/attention.py +2 -2
- diffsynth_engine/models/qwen_image/qwen_image_dit.py +5 -5
- diffsynth_engine/models/qwen_image/qwen_image_vae.py +0 -1
- diffsynth_engine/models/z_image/__init__.py +11 -0
- diffsynth_engine/models/z_image/qwen3.py +124 -0
- diffsynth_engine/models/z_image/z_image_dit.py +602 -0
- diffsynth_engine/pipelines/__init__.py +2 -0
- diffsynth_engine/pipelines/qwen_image.py +4 -3
- diffsynth_engine/pipelines/z_image.py +377 -0
- diffsynth_engine/utils/constants.py +3 -0
- diffsynth_engine/utils/process_group.py +1 -1
- {diffsynth_engine-0.6.1.dev33.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/METADATA +1 -1
- {diffsynth_engine-0.6.1.dev33.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/RECORD +24 -15
- {diffsynth_engine-0.6.1.dev33.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/WHEEL +0 -0
- {diffsynth_engine-0.6.1.dev33.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/licenses/LICENSE +0 -0
- {diffsynth_engine-0.6.1.dev33.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
import torch.distributed as dist
|
|
3
|
+
import math
|
|
4
|
+
import json
|
|
5
|
+
from typing import Callable, List, Dict, Tuple, Optional, Union
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
from diffsynth_engine.configs import (
|
|
9
|
+
ZImagePipelineConfig,
|
|
10
|
+
ZImageStateDicts,
|
|
11
|
+
)
|
|
12
|
+
from diffsynth_engine.models.basic.lora import LoRAContext
|
|
13
|
+
|
|
14
|
+
from diffsynth_engine.models.z_image import (
|
|
15
|
+
ZImageDiT,
|
|
16
|
+
Qwen3Model,
|
|
17
|
+
Qwen3Config,
|
|
18
|
+
)
|
|
19
|
+
from diffsynth_engine.tokenizers.qwen2 import Qwen2TokenizerFast
|
|
20
|
+
from diffsynth_engine.utils.constants import (
|
|
21
|
+
Z_IMAGE_TEXT_ENCODER_CONFIG_FILE,
|
|
22
|
+
Z_IMAGE_TOKENIZER_CONF_PATH,
|
|
23
|
+
)
|
|
24
|
+
from diffsynth_engine.models.flux import FluxVAEDecoder
|
|
25
|
+
from diffsynth_engine.pipelines import BasePipeline, LoRAStateDictConverter
|
|
26
|
+
from diffsynth_engine.pipelines.utils import calculate_shift
|
|
27
|
+
from diffsynth_engine.algorithm.noise_scheduler import RecifitedFlowScheduler
|
|
28
|
+
from diffsynth_engine.algorithm.sampler import FlowMatchEulerSampler
|
|
29
|
+
from diffsynth_engine.utils.parallel import ParallelWrapper
|
|
30
|
+
from diffsynth_engine.utils import logging
|
|
31
|
+
from diffsynth_engine.utils.fp8_linear import enable_fp8_linear
|
|
32
|
+
from diffsynth_engine.utils.download import fetch_model
|
|
33
|
+
|
|
34
|
+
logger = logging.get_logger(__name__)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ZImageLoRAConverter(LoRAStateDictConverter):
    """Translate diffusers-style Z-Image LoRA checkpoints into the per-module layout used for the DiT."""

    def _from_diffusers(self, lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, torch.Tensor]]:
        """Group LoRA tensors by target module.

        Only keys containing ``lora_A.weight`` start an entry; the matching
        ``lora_B.weight`` tensor is looked up directly (a missing one raises
        ``KeyError``) and the ``alpha`` tensor is optional.

        Returns:
            ``{"dit": {module_name: {"down", "up", "alpha"}}}``.
        """
        down_suffix = "lora_A.weight"
        converted = {}
        for name, down_weight in lora_state_dict.items():
            if down_suffix not in name:
                continue
            # Strip the LoRA suffix and the "transformer." prefix, then map the
            # diffusers output-projection name onto the engine's name.
            module_name = (
                name.replace(".lora_A.weight", "")
                .replace("transformer.", "")
                .replace("attn.to_out.0", "attn.to_out")
            )
            converted[module_name] = {
                "down": down_weight,
                "up": lora_state_dict[name.replace(down_suffix, "lora_B.weight")],
                "alpha": lora_state_dict.get(name.replace(down_suffix, "alpha"), None),
            }
        return {"dit": converted}

    def convert(self, lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, torch.Tensor]]:
        """Convert ``lora_state_dict``; only the diffusers key format is handled."""
        return self._from_diffusers(lora_state_dict)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class ZImagePipeline(BasePipeline):
|
|
60
|
+
lora_converter = ZImageLoRAConverter()
|
|
61
|
+
|
|
62
|
+
def __init__(
    self,
    config: ZImagePipelineConfig,
    tokenizer: Qwen2TokenizerFast,
    text_encoder: Qwen3Model,
    dit: ZImageDiT,
    vae_decoder: FluxVAEDecoder,
):
    """Wire already-built components into a pipeline instance.

    Args:
        config: Pipeline configuration; its VAE tiling, device and model dtype
            settings are forwarded to the base pipeline.
        tokenizer: Tokenizer used by ``encode_prompt``.
        text_encoder: Text encoder producing prompt embeddings.
        dit: Denoising transformer.
        vae_decoder: Decoder turning latents into images.
    """
    base_kwargs = dict(
        vae_tiled=config.vae_tiled,
        vae_tile_size=config.vae_tile_size,
        vae_tile_stride=config.vae_tile_stride,
        device=config.device,
        dtype=config.model_dtype,
    )
    super().__init__(**base_kwargs)
    self.config = config

    # Sampling machinery
    self.noise_scheduler = RecifitedFlowScheduler(shift=3.0, use_dynamic_shifting=True)
    self.sampler = FlowMatchEulerSampler()
    self.tokenizer = tokenizer

    # Model components, listed by attribute name in ``model_names``
    self.text_encoder = text_encoder
    self.dit = dit
    self.vae_decoder = vae_decoder
    self.model_names = ["text_encoder", "dit", "vae_decoder"]
|
|
89
|
+
|
|
90
|
+
@classmethod
def from_pretrained(cls, model_path_or_config: str | ZImagePipelineConfig) -> "ZImagePipeline":
    """Load DiT, VAE and text-encoder checkpoints and build a pipeline.

    Args:
        model_path_or_config: Either a model path (wrapped in a default
            ``ZImagePipelineConfig``) or a ready-made config. When the config
            has no ``vae_path`` / ``encoder_path`` they are resolved with
            ``fetch_model`` relative to ``model_path`` and written back onto
            the config.

    Returns:
        The pipeline produced by ``from_state_dict``.
    """
    config = (
        ZImagePipelineConfig(model_path=model_path_or_config)
        if isinstance(model_path_or_config, str)
        else model_path_or_config
    )

    logger.info(f"Loading state dict from {config.model_path} ...")
    dit_weights = cls.load_model_checkpoint(
        config.model_path, device="cpu", dtype=config.model_dtype, convert_dtype=False
    )

    if config.vae_path is None:
        config.vae_path = fetch_model(config.model_path, path="vae/diffusion_pytorch_model.safetensors")
    logger.info(f"Loading VAE from {config.vae_path} ...")
    vae_weights = cls.load_model_checkpoint(config.vae_path, device="cpu", dtype=config.vae_dtype)

    if config.encoder_path is None:
        config.encoder_path = fetch_model(config.model_path, path="text_encoder")
    logger.info(f"Loading Text Encoder from {config.encoder_path} ...")
    encoder_weights = cls.load_model_checkpoint(config.encoder_path, device="cpu", dtype=config.encoder_dtype)

    return cls.from_state_dict(
        ZImageStateDicts(model=dit_weights, vae=vae_weights, encoder=encoder_weights),
        config,
    )
|
|
121
|
+
|
|
122
|
+
@classmethod
def from_state_dict(cls, state_dicts: ZImageStateDicts, config: ZImagePipelineConfig) -> "ZImagePipeline":
    """Build a pipeline from in-memory state dicts.

    With ``config.parallelism > 1`` the pipeline is constructed through a
    ``ParallelWrapper`` and the wrapper is returned; otherwise the pipeline is
    built directly.
    """
    if config.parallelism <= 1:
        return cls._from_state_dict(state_dicts, config)

    wrapper = ParallelWrapper(
        cfg_degree=config.cfg_degree,
        sp_ulysses_degree=config.sp_ulysses_degree,
        sp_ring_degree=config.sp_ring_degree,
        tp_degree=config.tp_degree,
        use_fsdp=config.use_fsdp,
    )
    wrapper.load_module(cls._from_state_dict, state_dicts=state_dicts, config=config)
    return wrapper
|
|
136
|
+
|
|
137
|
+
@classmethod
def _from_state_dict(cls, state_dicts: ZImageStateDicts, config: ZImagePipelineConfig) -> "ZImagePipeline":
    """Instantiate every component from its state dict and assemble the pipeline.

    Components start on CPU when an offload mode is configured; the DiT also
    starts on CPU when FSDP is enabled. After construction the pipeline is put
    in eval mode, then CPU offload, FP8 autocast and compilation are applied
    according to ``config``.
    """
    offloading = config.offload_mode is not None
    init_device = "cpu" if offloading else config.device

    with open(Z_IMAGE_TEXT_ENCODER_CONFIG_FILE, "r", encoding="utf-8") as config_file:
        encoder_config = Qwen3Config(**json.load(config_file))
    text_encoder = Qwen3Model.from_state_dict(
        state_dicts.encoder, config=encoder_config, device=init_device, dtype=config.encoder_dtype
    )
    tokenizer = Qwen2TokenizerFast.from_pretrained(Z_IMAGE_TOKENIZER_CONF_PATH)
    vae_decoder = FluxVAEDecoder.from_state_dict(state_dicts.vae, device=init_device, dtype=config.vae_dtype)

    dit_device = "cpu" if config.use_fsdp else init_device
    # NOTE(review): nesting was reconstructed from a flattened listing; the FP8
    # conversion is assumed to run inside the LoRA context - confirm upstream.
    with LoRAContext():
        dit = ZImageDiT.from_state_dict(state_dicts.model, device=dit_device, dtype=config.model_dtype)
        if config.use_fp8_linear:
            enable_fp8_linear(dit)

    pipe = cls(
        config=config,
        tokenizer=tokenizer,
        text_encoder=text_encoder,
        dit=dit,
        vae_decoder=vae_decoder,
    )
    pipe.eval()

    if offloading:
        pipe.enable_cpu_offload(config.offload_mode, config.offload_to_disk)

    if config.model_dtype == torch.float8_e4m3fn:
        # Weights are stored as FP8; bfloat16 is used as the compute dtype.
        pipe.dtype = torch.bfloat16
        pipe.enable_fp8_autocast(
            model_names=["dit"], compute_dtype=pipe.dtype, use_fp8_linear=config.use_fp8_linear
        )

    if config.use_torch_compile:
        pipe.compile()

    return pipe
|
|
179
|
+
|
|
180
|
+
def update_weights(self, state_dicts: ZImageStateDicts) -> None:
    """Replace the weights of the DiT, text encoder and VAE decoder in place.

    Args:
        state_dicts: New weights. The text-encoder weights are read from the
            ``encoder`` field, the same field the pipeline is constructed and
            loaded with elsewhere in this module.
    """
    device = self.config.device
    self.update_component(self.dit, state_dicts.model, device, self.config.model_dtype)
    # Was ``state_dicts.text_encoder``, a field the state-dict container is
    # never given; every other use in this module is ``encoder``.
    self.update_component(self.text_encoder, state_dicts.encoder, device, self.config.encoder_dtype)
    self.update_component(self.vae_decoder, state_dicts.vae, device, self.config.vae_dtype)
|
|
186
|
+
|
|
187
|
+
def compile(self):
    """Compile the DiT's repeated blocks when the DiT offers that hook; otherwise do nothing."""
    compile_blocks = getattr(self.dit, "compile_repeated_blocks", None)
    if compile_blocks is None and not hasattr(self.dit, "compile_repeated_blocks"):
        return
    compile_blocks()
|
|
190
|
+
|
|
191
|
+
def load_loras(self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False):
    """Validate the parallel configuration, then delegate LoRA loading to the base pipeline.

    Raises:
        AssertionError: If tensor parallelism is enabled (``tp_degree`` is
            neither ``None`` nor 1), or if a fused LoRA is requested while
            FSDP is enabled.
    """
    tp_degree = self.config.tp_degree
    tensor_parallel_off = tp_degree is None or tp_degree == 1
    assert tensor_parallel_off, (
        "load LoRA is not allowed when tensor parallel is enabled; "
        "set tp_degree=None or tp_degree=1 during pipeline initialization"
    )
    assert not self.config.use_fsdp or not fused, (
        "load fused LoRA is not allowed when fully sharded data parallel is enabled; "
        "either load LoRA with fused=False or set use_fsdp=False during pipeline initialization"
    )
    super().load_loras(lora_list, fused, save_original_weight)
|
|
201
|
+
|
|
202
|
+
def unload_loras(self):
    """Remove loaded LoRAs from the DiT (when it supports that) and restore the scheduler config."""
    # NOTE(review): nesting was reconstructed from a flattened listing; the
    # scheduler restore is assumed to be unconditional - confirm upstream.
    dit = self.dit
    if hasattr(dit, "unload_loras"):
        dit.unload_loras()
    self.noise_scheduler.restore_config()
|
|
206
|
+
|
|
207
|
+
def apply_scheduler_config(self, scheduler_config: Dict):
    """Forward ``scheduler_config`` to the noise scheduler's ``update_config``."""
    scheduler = self.noise_scheduler
    scheduler.update_config(scheduler_config)
|
|
209
|
+
|
|
210
|
+
def prepare_latents(
    self,
    latents: torch.Tensor,
    num_inference_steps: int,
    mu: float,
):
    """Build the sampling schedule and move everything to the pipeline device and dtype.

    The scheduler is asked for ``num_inference_steps`` steps with the given
    ``mu``, ``sigma_min=0`` and ``sigma_max=1.0``.

    Returns:
        ``(latents, sigmas, timesteps)``, each converted to ``self.device`` /
        ``self.dtype``.
    """
    placement = {"device": self.device, "dtype": self.dtype}
    sigmas, timesteps = self.noise_scheduler.schedule(num_inference_steps, mu=mu, sigma_min=0, sigma_max=1.0)
    return latents.to(**placement), sigmas.to(**placement), timesteps.to(**placement)
|
|
223
|
+
|
|
224
|
+
def encode_prompt(
    self,
    prompt: str,
    max_sequence_length: int = 512,
):
    """Encode a prompt into per-sample embeddings with padding positions removed.

    The prompt is wrapped in a user/assistant chat template, tokenized with
    ``padding_strategy="max_length"``, and run through the text encoder. The
    second-to-last hidden state is used, and for each sample only the
    positions where the attention mask is true are kept.

    Returns:
        ``None`` when ``prompt`` is ``None``; otherwise a list with one
        embedding tensor per sample.
    """
    if prompt is None:
        return None

    chat_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
    tokenized = self.tokenizer(
        [chat_template.format(prompt)],
        max_length=max_sequence_length,
        padding_strategy="max_length",
    )
    input_ids = tokenized["input_ids"].to(self.device)
    attention_mask = tokenized["attention_mask"].to(self.device).bool()

    # Encoder forward
    encoded = self.text_encoder(
        input_ids=input_ids,
        attention_mask=attention_mask,
        output_hidden_states=True,
    )
    penultimate = encoded["hidden_states"][-2]
    return [sample[keep] for sample, keep in zip(penultimate, attention_mask)]
|
|
253
|
+
|
|
254
|
+
def predict_noise_with_cfg(
    self,
    latents: torch.Tensor,
    timestep: torch.Tensor,
    prompt_emb: List[torch.Tensor],
    negative_prompt_emb: List[torch.Tensor],
    cfg_scale: float = 5.0,
    cfg_truncation: float = 1.0,
    cfg_normalization: float = 0.0,  # 0.0 means disabled
    batch_cfg: bool = False,
):
    """Predict noise for one step, optionally with classifier-free guidance.

    Args:
        latents: Current latents; the first dimension is the batch.
        timestep: Scheduler timestep; it is mapped to ``(1000 - t) / 1000``
            before being handed to the DiT.
        prompt_emb: Positive prompt embeddings, one tensor per sample.
        negative_prompt_emb: Negative prompt embeddings, or ``None`` to
            disable guidance.
        cfg_scale: Guidance strength; guidance is skipped when it is not
            positive.
        cfg_truncation: When ``<= 1.0``, guidance is switched off once the
            mapped timestep exceeds this value.
        cfg_normalization: When positive, the guided prediction's norm is
            clamped to this multiple of the positive prediction's norm.
        batch_cfg: Run the positive and negative branches in a single DiT
            call instead of two sequential calls.

    Returns:
        The negated prediction with dim 1 squeezed and a leading dim added.
    """
    t = timestep.expand(latents.shape[0])
    t = (1000 - t) / 1000
    progress = t[0].item()

    current_cfg_scale = cfg_scale
    if cfg_truncation <= 1.0 and progress > cfg_truncation:
        current_cfg_scale = 0.0

    do_cfg = current_cfg_scale > 0 and negative_prompt_emb is not None

    if not do_cfg:
        comb_pred = self.predict_noise(latents, t, prompt_emb)[0]
    else:
        if not batch_cfg:
            # Take the first sample's prediction, exactly as the no-CFG and
            # batched paths do; the original combined the raw DiT output here.
            # assumes the DiT returns one prediction per sample - TODO confirm
            positive_noise_pred = self.predict_noise(latents, t, prompt_emb)[0]
            negative_noise_pred = self.predict_noise(latents, t, negative_prompt_emb)[0]
        else:
            latents_input = torch.cat([latents, latents], dim=0)
            t = torch.cat([t, t], dim=0)
            # List concatenation: positive samples first, then negative ones.
            prompt_input = prompt_emb + negative_prompt_emb

            noise_pred = self.predict_noise(latents_input, t, prompt_input)

            positive_noise_pred, negative_noise_pred = noise_pred[0], noise_pred[1]

        comb_pred = positive_noise_pred + current_cfg_scale * (positive_noise_pred - negative_noise_pred)

        # Norm clamping only makes sense for a guided prediction, so it lives
        # in the CFG branch where ``positive_noise_pred`` is defined.
        if cfg_normalization is not None and cfg_normalization > 0:
            cond_norm = torch.linalg.vector_norm(positive_noise_pred)
            new_norm = torch.linalg.vector_norm(comb_pred)
            max_allowed_norm = cond_norm * cfg_normalization
            # Avoid dividing by a vanishing norm.
            new_norm = torch.where(new_norm < 1e-6, torch.ones_like(new_norm), new_norm)
            scale_factor = torch.clamp(max_allowed_norm / new_norm, max=1.0)
            comb_pred = comb_pred * scale_factor

    comb_pred = -comb_pred.squeeze(1).unsqueeze(0)
    return comb_pred
|
|
303
|
+
|
|
304
|
+
def predict_noise(
    self,
    latents: torch.Tensor,
    timestep: torch.Tensor,
    prompt_emb: List[torch.Tensor],
):
    """Run the DiT once and return its raw output.

    The latents get an extra dimension inserted at index 2 and are then split
    along the batch dimension, so the DiT receives a list with one tensor per
    sample.
    """
    self.load_models_to_device(["dit"])

    per_sample_latents = list(latents.unsqueeze(2).unbind(dim=0))
    return self.dit(
        image=per_sample_latents,
        timestep=timestep,
        cap_feats=prompt_emb,
    )
|
|
320
|
+
|
|
321
|
+
@torch.no_grad()
def __call__(
    self,
    prompt: Union[str, List[str]],
    negative_prompt: Optional[Union[str, List[str]]] = None,
    height: int = 1024,
    width: int = 1024,
    num_inference_steps: int = 50,
    cfg_scale: float = 5.0,
    cfg_normalization: bool = False,
    cfg_truncation: float = 1.0,
    seed: Optional[int] = None,
    progress_callback: Optional[Callable] = None,
):
    """Generate an image from a text prompt.

    Args:
        prompt: Text prompt, forwarded unchanged to ``encode_prompt``.
            NOTE(review): ``encode_prompt`` formats its argument into a single
            chat template, so a list is presumably not supported - confirm.
        negative_prompt: Optional negative prompt; ``None`` disables guidance.
        height: Image height; validated to be a multiple of 16.
        width: Image width; validated to be a multiple of 16.
        num_inference_steps: Number of denoising steps.
        cfg_scale: Classifier-free guidance strength.
        cfg_normalization: Forwarded to ``predict_noise_with_cfg``; ``False``
            leaves norm clamping disabled.
        cfg_truncation: Forwarded to ``predict_noise_with_cfg``.
        seed: Seed for the initial noise.
        progress_callback: Called after each step as
            ``progress_callback(step_index, total_steps, "DENOISING")``.

    Returns:
        The result of ``vae_output_to_image`` on the decoded latents.
    """
    self.validate_image_size(height, width, multiple_of=16)

    self.load_models_to_device(["text_encoder"])
    prompt_embeds, negative_prompt_embeds = self.encode_prompt(prompt), self.encode_prompt(negative_prompt)
    self.model_lifecycle_finish(["text_encoder"])

    # Noise is drawn on CPU and then moved to the pipeline device.
    noise = self.generate_noise((1, 16, height // 8, width // 8), seed=seed, device="cpu", dtype=self.dtype).to(
        device=self.device
    )
    # Floor division already yields integers, so no rounding step is needed.
    image_seq_len = (height // 16) * (width // 16)

    mu = calculate_shift(image_seq_len, base_seq_len=256, max_seq_len=4096, base_shift=0.5, max_shift=1.15)

    latents, sigmas, timesteps = self.prepare_latents(noise, num_inference_steps, mu)

    self.sampler.initialize(sigmas=sigmas)

    self.load_models_to_device(["dit"])
    # Only rank 0 shows the progress bar in distributed runs.
    hide_progress = dist.is_initialized() and dist.get_rank() != 0

    for i, timestep in enumerate(tqdm(timesteps, disable=hide_progress)):
        timestep = timestep.unsqueeze(0).to(dtype=self.dtype)
        noise_pred = self.predict_noise_with_cfg(
            latents=latents,
            timestep=timestep,
            prompt_emb=prompt_embeds,
            negative_prompt_emb=negative_prompt_embeds,
            # Previously omitted, so guidance always ran at the callee's
            # default of 5.0 regardless of what the caller asked for.
            cfg_scale=cfg_scale,
            batch_cfg=self.config.batch_cfg,
            cfg_truncation=cfg_truncation,
            cfg_normalization=cfg_normalization,
        )
        latents = self.sampler.step(latents, noise_pred, i)
        if progress_callback is not None:
            progress_callback(i, len(timesteps), "DENOISING")

    self.model_lifecycle_finish(["dit"])

    self.load_models_to_device(["vae_decoder"])
    vae_output = self.decode_image(latents)
    image = self.vae_output_to_image(vae_output)
    # Offload all models
    self.load_models_to_device([])
    return image
|
|
@@ -14,6 +14,7 @@ SDXL_TOKENIZER_2_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "sdxl", "toke
|
|
|
14
14
|
WAN_TOKENIZER_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "wan", "umt5-xxl")
|
|
15
15
|
QWEN_IMAGE_TOKENIZER_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "qwen_image", "tokenizer")
|
|
16
16
|
QWEN_IMAGE_PROCESSOR_CONFIG_FILE = os.path.join(CONF_PATH, "tokenizers", "qwen_image", "qwen2_vl_image_processor.json")
|
|
17
|
+
Z_IMAGE_TOKENIZER_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "z_image", "tokenizer")
|
|
17
18
|
|
|
18
19
|
# models
|
|
19
20
|
VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "components", "vae.json")
|
|
@@ -46,6 +47,8 @@ QWEN_IMAGE_VISION_CONFIG_FILE = os.path.join(CONF_PATH, "models", "qwen_image",
|
|
|
46
47
|
QWEN_IMAGE_VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "qwen_image", "qwen_image_vae.json")
|
|
47
48
|
QWEN_IMAGE_VAE_KEYMAP_FILE = os.path.join(CONF_PATH, "models", "qwen_image", "qwen_image_vae_keymap.json")
|
|
48
49
|
|
|
50
|
+
Z_IMAGE_TEXT_ENCODER_CONFIG_FILE = os.path.join(CONF_PATH, "models", "z_image", "qwen3_config.json")
|
|
51
|
+
|
|
49
52
|
# data size
|
|
50
53
|
KB = 1024
|
|
51
54
|
MB = 1024 * KB
|
|
@@ -20,7 +20,7 @@ class Singleton:
|
|
|
20
20
|
|
|
21
21
|
class ProcessGroupSingleton(Singleton):
|
|
22
22
|
def __init__(self):
|
|
23
|
-
if not hasattr(self,
|
|
23
|
+
if not hasattr(self, "initialized"):
|
|
24
24
|
self.CFG_GROUP: Optional[dist.ProcessGroup] = None
|
|
25
25
|
self.SP_GROUP: Optional[dist.ProcessGroup] = None
|
|
26
26
|
self.SP_ULYSSUES_GROUP: Optional[dist.ProcessGroup] = None
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
diffsynth_engine/__init__.py,sha256=
|
|
1
|
+
diffsynth_engine/__init__.py,sha256=hN0jYaikjhpqHB4Mg-e53h-7ck1DsiY4FBti8K9lN2k,2390
|
|
2
2
|
diffsynth_engine/algorithm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
diffsynth_engine/algorithm/noise_scheduler/__init__.py,sha256=YvcwE2tCNua-OAX9GEPm0EXsINNWH4XvJMNZb-uaZMM,745
|
|
4
4
|
diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py,sha256=3ve4bYxGyfuERynvoNYdFYSk0agdBgXKCeIOS6O6wgI,819
|
|
@@ -52,6 +52,7 @@ diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json,sha256=hfGytOIRkdYFgOR9
|
|
|
52
52
|
diffsynth_engine/conf/models/wan/vae/wan2.1_vae.json,sha256=eVLTSRqbXm3JD8QDkLbM6vFfCdynlS-8QxqCfi4BzrI,815
|
|
53
53
|
diffsynth_engine/conf/models/wan/vae/wan2.2_vae.json,sha256=pdnYEEZ_GcZHM_iH1y5ASdf_qZUGCOuDEaFmjdg9RKY,1860
|
|
54
54
|
diffsynth_engine/conf/models/wan/vae/wan_vae_keymap.json,sha256=u9MJ3yRL45kdqRVoBnYbHkmuUmOseUFtwte-_9ZvdHc,25224
|
|
55
|
+
diffsynth_engine/conf/models/z_image/qwen3_config.json,sha256=i6AG90_s-q6zkocqYPSkgOfsmGAVPS4bdp7IH5oUf4o,726
|
|
55
56
|
diffsynth_engine/conf/tokenizers/flux/tokenizer_1/merges.txt,sha256=n9aR98gDkhDg_O0VhlRmxlgg0JtjmIsBdL_iXeKZBRo,524619
|
|
56
57
|
diffsynth_engine/conf/tokenizers/flux/tokenizer_1/special_tokens_map.json,sha256=LNs7gzGmDJL8HlWhPp_WH9IpPFpRJ1_czNYreABSUw4,588
|
|
57
58
|
diffsynth_engine/conf/tokenizers/flux/tokenizer_1/tokenizer_config.json,sha256=a9zunMzioWyitMDF7QC0LFDqIl9EcqjEweljopAsKIE,705
|
|
@@ -79,14 +80,18 @@ diffsynth_engine/conf/tokenizers/wan/umt5-xxl/special_tokens_map.json,sha256=e4q
|
|
|
79
80
|
diffsynth_engine/conf/tokenizers/wan/umt5-xxl/spiece.model,sha256=45CaZ7eAZQs1z1Kax4KtK2sm5tH4SdP7tqhykF9FJFg,4548313
|
|
80
81
|
diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json,sha256=bhl7TT29cdoUtOslX0-pHJwfIGiyCi3iRylnyj0iYCs,16837417
|
|
81
82
|
diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json,sha256=7Zo6iw-qcacKMoR-BDX-A25uES1N9O23u0ipIeNE3AU,61728
|
|
82
|
-
diffsynth_engine/
|
|
83
|
+
diffsynth_engine/conf/tokenizers/z_image/tokenizer/merges.txt,sha256=iDHk8aBERxNA98CoPXvXEwaluGfpX9hw900MUwipBNU,1671853
|
|
84
|
+
diffsynth_engine/conf/tokenizers/z_image/tokenizer/tokenizer.json,sha256=rrEzB6cazY_oGGHZStVKtonfdzMYgJ7tPL55S0SS2uQ,11422654
|
|
85
|
+
diffsynth_engine/conf/tokenizers/z_image/tokenizer/tokenizer_config.json,sha256=1dCfB7SMMIbFCLMNHJEUvRGJFFt06YKiZTUMkjrNgQE,9732
|
|
86
|
+
diffsynth_engine/conf/tokenizers/z_image/tokenizer/vocab.json,sha256=yhDX6fs-0YV13R4neiV5wW0QjjLydDloSvoOELFECRA,2776833
|
|
87
|
+
diffsynth_engine/configs/__init__.py,sha256=biluGSEw78PPwO7XFlms16iuWXDiM0Eg_qsOMMTY0NQ,1409
|
|
83
88
|
diffsynth_engine/configs/controlnet.py,sha256=f3vclyP3lcAjxDGD9C1vevhqqQ7W2LL_c6Wye0uxk3Q,1180
|
|
84
|
-
diffsynth_engine/configs/pipeline.py,sha256=
|
|
89
|
+
diffsynth_engine/configs/pipeline.py,sha256=0WmKz_mykmJkRCGwv9DjuN8s27LppkD_Ier4VtovZSg,15307
|
|
85
90
|
diffsynth_engine/kernels/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
86
91
|
diffsynth_engine/models/__init__.py,sha256=8Ze7cSE8InetgXWTNb0neVA2Q44K7WlE-h7O-02m2sY,119
|
|
87
92
|
diffsynth_engine/models/base.py,sha256=svao__9WH8VNcyXz5o5dzywYXDcGV0YV9IfkLzDKews,2558
|
|
88
93
|
diffsynth_engine/models/basic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
89
|
-
diffsynth_engine/models/basic/attention.py,sha256=
|
|
94
|
+
diffsynth_engine/models/basic/attention.py,sha256=YrIxkYoekC3I7-sMTw60CL4GIKMLOTrn-eCk-iHT7E4,15701
|
|
90
95
|
diffsynth_engine/models/basic/lora.py,sha256=Y6cBgrBsuDAP9FZz_fgK8vBi_EMg23saFIUSAsPIG-M,10670
|
|
91
96
|
diffsynth_engine/models/basic/lora_nunchaku.py,sha256=7qhzGCzUIfDrwtWG0nspwdyZ7YUkaM4vMqzxZby2Zds,7510
|
|
92
97
|
diffsynth_engine/models/basic/relative_position_emb.py,sha256=rCXOweZMcayVnNUVvBcYXMdhHS257B_PC8PZSWxvhNQ,2540
|
|
@@ -111,10 +116,10 @@ diffsynth_engine/models/hunyuan3d/surface_extractor.py,sha256=b15mb1N4PYwAvDk1Gu
|
|
|
111
116
|
diffsynth_engine/models/hunyuan3d/volume_decoder.py,sha256=sgflj1a8sIerqGSalBAVQOlyiIihkLOLXYysNbulCoQ,2355
|
|
112
117
|
diffsynth_engine/models/qwen_image/__init__.py,sha256=_6f0LWaoLdDvD2CsjK2OzEIQryt9efge8DFS4_GUnHQ,582
|
|
113
118
|
diffsynth_engine/models/qwen_image/qwen2_5_vl.py,sha256=Eu-r-c42t_q74Qpwz21ToCGHpvSi7VND4B1EI0e-ePA,57748
|
|
114
|
-
diffsynth_engine/models/qwen_image/qwen_image_dit.py,sha256=
|
|
119
|
+
diffsynth_engine/models/qwen_image/qwen_image_dit.py,sha256=mMU4zeZi8-uJe9voznNIxZCTCqJPbPXkMxHwgcqJ6z8,24640
|
|
115
120
|
diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py,sha256=LIv9X_BohKk5rcEzyl3ATLwd8MSoFX43wjkArQ68nq8,4828
|
|
116
121
|
diffsynth_engine/models/qwen_image/qwen_image_dit_nunchaku.py,sha256=1y1BkPRrX4_RioKjM09D9f9PK9neug1nSGJka0D9bvM,13516
|
|
117
|
-
diffsynth_engine/models/qwen_image/qwen_image_vae.py,sha256=
|
|
122
|
+
diffsynth_engine/models/qwen_image/qwen_image_vae.py,sha256=FpauZV9IVvpvBeS9volu7kzH2mmCISS86AbHt0Jk2bQ,38442
|
|
118
123
|
diffsynth_engine/models/sd/__init__.py,sha256=hjoKRnwoXOLD0wude-w7I6wK5ak7ACMbnbkPuBB2oU0,380
|
|
119
124
|
diffsynth_engine/models/sd/sd_controlnet.py,sha256=kMGfIdriXhC7reT6iO2Z0rPICXEkXpytjeBQcR_sjT8,50577
|
|
120
125
|
diffsynth_engine/models/sd/sd_text_encoder.py,sha256=BUOsBtSb7WH4Z37JhtYxOtpXMDJcQXZWzx_7JNbsJwM,5369
|
|
@@ -142,16 +147,20 @@ diffsynth_engine/models/wan/wan_image_encoder.py,sha256=Vdd39lv_QvOsmPxihZWZZbpP
|
|
|
142
147
|
diffsynth_engine/models/wan/wan_s2v_dit.py,sha256=j63ulcWLY4XGITOKUMGX292LtSEtP-n8BTvqb98YExU,23615
|
|
143
148
|
diffsynth_engine/models/wan/wan_text_encoder.py,sha256=ePeOifbTI_o650mckzugyWPuHn5vhM-uFMcDVCijxPM,11394
|
|
144
149
|
diffsynth_engine/models/wan/wan_vae.py,sha256=dC7MoUFeXRL7SIY0LG1OOUiZW-pp9IbXCghutMxpXr4,38889
|
|
145
|
-
diffsynth_engine/
|
|
150
|
+
diffsynth_engine/models/z_image/__init__.py,sha256=d1ztBNgM8GR2_uGwlxOE1Jf5URTq1g-WnmJH7nrMoaY,160
|
|
151
|
+
diffsynth_engine/models/z_image/qwen3.py,sha256=PmT6m46Fc7KZXNzG7ig23Mzj6QfHnMmrpX_MM0UuuYg,4580
|
|
152
|
+
diffsynth_engine/models/z_image/z_image_dit.py,sha256=kGtYzmfzk_FDe7KWfXpJagN7k7ROXl5J01IhRRs-Bsk,23806
|
|
153
|
+
diffsynth_engine/pipelines/__init__.py,sha256=xQUtz2cVmcEInazvT1dqv2HdPiJKmywWTIPfbK5dZXI,662
|
|
146
154
|
diffsynth_engine/pipelines/base.py,sha256=ShRiX5MY6bUkRKfuGrA1aalAqeHyeZxhzT87Mwc30b4,17231
|
|
147
155
|
diffsynth_engine/pipelines/flux_image.py,sha256=L0ggxpthLD8a5-zdPHu9z668uWBei9YzPb4PFVypDNU,50707
|
|
148
156
|
diffsynth_engine/pipelines/hunyuan3d_shape.py,sha256=TNV0Wr09Dj2bzzlpua9WioCClOj3YiLfE6utI9aWL8A,8164
|
|
149
|
-
diffsynth_engine/pipelines/qwen_image.py,sha256=
|
|
157
|
+
diffsynth_engine/pipelines/qwen_image.py,sha256=9n0fZCYw5E1iloXqd7vOU0XfHVPxQp_pm-v4D3Oloos,35751
|
|
150
158
|
diffsynth_engine/pipelines/sd_image.py,sha256=nr-Nhsnomq8CsUqhTM3i2l2zG01YjwXdfRXgr_bC3F0,17891
|
|
151
159
|
diffsynth_engine/pipelines/sdxl_image.py,sha256=v7ZACGPb6EcBunL6e5E9jynSQjE7GQx8etEV-ZLP91g,21704
|
|
152
160
|
diffsynth_engine/pipelines/utils.py,sha256=HZbJHErNJS1DhlwJKvZ9dY7Kh8Zdlsw3zE2e88TYGRY,2277
|
|
153
161
|
diffsynth_engine/pipelines/wan_s2v.py,sha256=QHlCLMqlmnp55iYm2mzg4qCq4jceRAP3Zt5Mubz3mAM,29384
|
|
154
162
|
diffsynth_engine/pipelines/wan_video.py,sha256=9xjSvQ4mlVEDdaL6QuUURj4iyxhJ2xABBphQjkfzK8s,31323
|
|
163
|
+
diffsynth_engine/pipelines/z_image.py,sha256=gSBhKV7TBL9xvCUrABdZA0kNqQzPuawmEv8OcI6KTcs,14756
|
|
155
164
|
diffsynth_engine/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
156
165
|
diffsynth_engine/processor/canny_processor.py,sha256=hV30NlblTkEFUAmF_O-LJrNlGVM2SFrqq6okfF8VpOo,602
|
|
157
166
|
diffsynth_engine/processor/depth_processor.py,sha256=dQvs3JsnyMbz4dyI9QoR8oO-mMFBFAgNvgqeCoaU5jk,1532
|
|
@@ -170,7 +179,7 @@ diffsynth_engine/tools/flux_reference_tool.py,sha256=6v0NRZPsDEHFlPruO-ZJTB4rYWx
|
|
|
170
179
|
diffsynth_engine/tools/flux_replace_tool.py,sha256=AOyEGxHsaNwpTS2VChAieIfECgMxlKsRw0lWPm1k9C0,4627
|
|
171
180
|
diffsynth_engine/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
172
181
|
diffsynth_engine/utils/cache.py,sha256=Ivef22pCuhEq-4H00gSvkLS8ceVZoGis7OSitYL6gH4,2101
|
|
173
|
-
diffsynth_engine/utils/constants.py,sha256=
|
|
182
|
+
diffsynth_engine/utils/constants.py,sha256=x0-bsPRplW-KkRpLVajuC9Yv6f3QbdHgSr3XZ-eBCsQ,3745
|
|
174
183
|
diffsynth_engine/utils/download.py,sha256=w9QQjllPfTUEY371UTREU7o_vvdMY-Q2DymDel3ZEZY,6792
|
|
175
184
|
diffsynth_engine/utils/env.py,sha256=k749eYt_qKGq38GocDiXfkhp8nZrowFefNVTZ8R755I,363
|
|
176
185
|
diffsynth_engine/utils/flag.py,sha256=KSzjnzRe7sleNCJm8IpbJQbmBY4KNV2kDrijxi27Jek,2928
|
|
@@ -184,14 +193,14 @@ diffsynth_engine/utils/offload.py,sha256=94og79TIkxldwYUgZT3L4OVu1WBlE7gfVPvO2MR
|
|
|
184
193
|
diffsynth_engine/utils/onnx.py,sha256=jeWUudJHnESjuiEAHyUZYUZz7dCj34O9aGjHCe8yjWo,1149
|
|
185
194
|
diffsynth_engine/utils/parallel.py,sha256=OBGsAK-3ncArRyMU1lea7tbYgxSdCucQvXheL3Ssl5M,17653
|
|
186
195
|
diffsynth_engine/utils/platform.py,sha256=nbpG-XHJFRmYY6u_e7IBQ9Q6GyItrIkKf3VKuBPTUpY,627
|
|
187
|
-
diffsynth_engine/utils/process_group.py,sha256=
|
|
196
|
+
diffsynth_engine/utils/process_group.py,sha256=I9uiqoVq-Hlu694GnrvbVi7nfVJBsgCCDo3p2kjU3yo,3783
|
|
188
197
|
diffsynth_engine/utils/prompt.py,sha256=YItMchoVzsG6y-LB4vzzDUWrkhKRVlt1HfVhxZjSxMQ,280
|
|
189
198
|
diffsynth_engine/utils/video.py,sha256=8FCaeqIdUsWMgWI_6SO9SPynsToGcLCQAVYFTc4CDhg,2200
|
|
190
199
|
diffsynth_engine/utils/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
191
200
|
diffsynth_engine/utils/memory/linear_regression.py,sha256=oW_EQEw13oPoyUrxiL8A7Ksa5AuJ2ynI2qhCbfAuZbg,3930
|
|
192
201
|
diffsynth_engine/utils/memory/memory_predcit_model.py,sha256=EXprSl_zlVjgfMWNXP-iw83Ot3hyMcgYaRPv-dvyL84,3943
|
|
193
|
-
diffsynth_engine-0.6.1.
|
|
194
|
-
diffsynth_engine-0.6.1.
|
|
195
|
-
diffsynth_engine-0.6.1.
|
|
196
|
-
diffsynth_engine-0.6.1.
|
|
197
|
-
diffsynth_engine-0.6.1.
|
|
202
|
+
diffsynth_engine-0.6.1.dev35.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
|
|
203
|
+
diffsynth_engine-0.6.1.dev35.dist-info/METADATA,sha256=mwDiBscVZRY6rz7Mbmv4qxhlFNSFoACIu4xl0YA9lVE,1164
|
|
204
|
+
diffsynth_engine-0.6.1.dev35.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
205
|
+
diffsynth_engine-0.6.1.dev35.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
|
|
206
|
+
diffsynth_engine-0.6.1.dev35.dist-info/RECORD,,
|
|
File without changes
|
{diffsynth_engine-0.6.1.dev33.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{diffsynth_engine-0.6.1.dev33.dist-info → diffsynth_engine-0.6.1.dev35.dist-info}/top_level.txt
RENAMED
|
File without changes
|