InvokeAI 6.10.0rc1__py3-none-any.whl → 6.10.0rc2__py3-none-any.whl
- invokeai/app/invocations/flux_denoise.py +15 -1
- invokeai/app/invocations/pbr_maps.py +59 -0
- invokeai/app/invocations/z_image_denoise.py +237 -82
- invokeai/backend/flux/denoise.py +196 -11
- invokeai/backend/flux/schedulers.py +62 -0
- invokeai/backend/image_util/pbr_maps/architecture/block.py +367 -0
- invokeai/backend/image_util/pbr_maps/architecture/pbr_rrdb_net.py +70 -0
- invokeai/backend/image_util/pbr_maps/pbr_maps.py +141 -0
- invokeai/backend/image_util/pbr_maps/utils/image_ops.py +93 -0
- invokeai/backend/model_manager/configs/lora.py +36 -0
- invokeai/backend/model_manager/load/load_default.py +1 -0
- invokeai/backend/model_manager/load/model_loaders/cogview4.py +2 -1
- invokeai/backend/model_manager/load/model_loaders/flux.py +13 -6
- invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py +4 -2
- invokeai/backend/model_manager/load/model_loaders/onnx.py +1 -0
- invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py +3 -1
- invokeai/backend/model_manager/load/model_loaders/z_image.py +37 -3
- invokeai/backend/model_manager/starter_models.py +13 -4
- invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py +39 -5
- invokeai/backend/quantization/gguf/ggml_tensor.py +15 -4
- invokeai/backend/z_image/extensions/regional_prompting_extension.py +10 -12
- invokeai/frontend/web/dist/assets/App-DllqPQ3j.js +161 -0
- invokeai/frontend/web/dist/assets/{browser-ponyfill-DHZxq1nk.js → browser-ponyfill-BP0RxJ4G.js} +1 -1
- invokeai/frontend/web/dist/assets/{index-dgSJAY--.js → index-B44qKjrs.js} +51 -51
- invokeai/frontend/web/dist/index.html +1 -1
- invokeai/frontend/web/dist/locales/en-GB.json +1 -0
- invokeai/frontend/web/dist/locales/en.json +11 -5
- invokeai/version/invokeai_version.py +1 -1
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/METADATA +2 -2
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/RECORD +36 -29
- invokeai/frontend/web/dist/assets/App-CYhlZO3Q.js +0 -161
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/WHEEL +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/entry_points.txt +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/licenses/LICENSE +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/top_level.txt +0 -0
```diff
--- a/invokeai/app/invocations/flux_denoise.py
+++ b/invokeai/app/invocations/flux_denoise.py
@@ -47,6 +47,7 @@ from invokeai.backend.flux.sampling_utils import (
     pack,
     unpack,
 )
+from invokeai.backend.flux.schedulers import FLUX_SCHEDULER_LABELS, FLUX_SCHEDULER_MAP, FLUX_SCHEDULER_NAME_VALUES
 from invokeai.backend.flux.text_conditioning import FluxReduxConditioning, FluxTextConditioning
 from invokeai.backend.model_manager.taxonomy import BaseModelType, FluxVariantType, ModelFormat, ModelType
 from invokeai.backend.patches.layer_patcher import LayerPatcher
```
```diff
@@ -63,7 +64,7 @@ from invokeai.backend.util.devices import TorchDevice
     title="FLUX Denoise",
     tags=["image", "flux"],
     category="image",
-    version="4.
+    version="4.2.0",
 )
 class FluxDenoiseInvocation(BaseInvocation):
     """Run denoising process with a FLUX transformer model."""
```
```diff
@@ -132,6 +133,12 @@ class FluxDenoiseInvocation(BaseInvocation):
     num_steps: int = InputField(
         default=4, description="Number of diffusion steps. Recommended values are schnell: 4, dev: 50."
     )
+    scheduler: FLUX_SCHEDULER_NAME_VALUES = InputField(
+        default="euler",
+        description="Scheduler (sampler) for the denoising process. 'euler' is fast and standard. "
+        "'heun' is 2nd-order (better quality, 2x slower). 'lcm' is optimized for few steps.",
+        ui_choice_labels=FLUX_SCHEDULER_LABELS,
+    )
     guidance: float = InputField(
         default=4.0,
         description="The guidance strength. Higher values adhere more strictly to the prompt, and will produce less diverse images. FLUX dev only, ignored for schnell.",
```
```diff
@@ -242,6 +249,12 @@
             shift=not is_schnell,
         )
 
+        # Create scheduler if not using default euler
+        scheduler = None
+        if self.scheduler in FLUX_SCHEDULER_MAP:
+            scheduler_class = FLUX_SCHEDULER_MAP[self.scheduler]
+            scheduler = scheduler_class(num_train_timesteps=1000)
+
         # Clip the timesteps schedule based on denoising_start and denoising_end.
         timesteps = clip_timestep_schedule_fractional(timesteps, self.denoising_start, self.denoising_end)
 
```
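The new `invokeai/backend/flux/schedulers.py` module (+62 lines) is not shown in this diff, but the three names imported above imply its shape: a `Literal` of scheduler names, a dict of UI labels, and a map from name to a diffusers flow-matching scheduler class. A minimal sketch under those assumptions follows; it is not the actual file, and the concrete class registered for "lcm" is not visible here:

```python
# Hypothetical sketch of invokeai/backend/flux/schedulers.py -- shape inferred from usage above.
from typing import Literal

from diffusers import FlowMatchHeunDiscreteScheduler

FLUX_SCHEDULER_NAME_VALUES = Literal["euler", "heun", "lcm"]

FLUX_SCHEDULER_LABELS: dict[str, str] = {
    "euler": "Euler (default)",
    "heun": "Heun (2nd order, 2x slower)",
    "lcm": "LCM (few steps)",
}

# Only non-default names need a diffusers class: FluxDenoiseInvocation checks
# `self.scheduler in FLUX_SCHEDULER_MAP`, so "euler" falls through to the
# built-in sampling loop with scheduler=None.
FLUX_SCHEDULER_MAP: dict[str, type] = {
    "heun": FlowMatchHeunDiscreteScheduler,
    # "lcm": <flow-matching LCM scheduler class; omitted -- not visible in this diff>
}
```

Keeping "euler" out of the map is what lets the default path stay on the hand-rolled sampling loop; the remaining flux_denoise.py hunk below simply threads the optional scheduler through to the backend `denoise()` call.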
```diff
@@ -426,6 +439,7 @@
             img_cond=img_cond,
             img_cond_seq=img_cond_seq,
             img_cond_seq_ids=img_cond_seq_ids,
+            scheduler=scheduler,
         )
 
         x = unpack(x.float(), self.height, self.width)
```
```diff
--- /dev/null
+++ b/invokeai/app/invocations/pbr_maps.py
@@ -0,0 +1,59 @@
+import pathlib
+from typing import Literal
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
+from invokeai.app.invocations.fields import ImageField, InputField, OutputField, WithBoard, WithMetadata
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.image_util.pbr_maps.architecture.pbr_rrdb_net import PBR_RRDB_Net
+from invokeai.backend.image_util.pbr_maps.pbr_maps import NORMAL_MAP_MODEL, OTHER_MAP_MODEL, PBRMapsGenerator
+from invokeai.backend.util.devices import TorchDevice
+
+
+@invocation_output("pbr_maps-output")
+class PBRMapsOutput(BaseInvocationOutput):
+    normal_map: ImageField = OutputField(default=None, description="The generated normal map")
+    roughness_map: ImageField = OutputField(default=None, description="The generated roughness map")
+    displacement_map: ImageField = OutputField(default=None, description="The generated displacement map")
+
+
+@invocation("pbr_maps", title="PBR Maps", tags=["image", "material"], category="image", version="1.0.0")
+class PBRMapsInvocation(BaseInvocation, WithMetadata, WithBoard):
+    """Generate Normal, Displacement and Roughness Map from a given image"""
+
+    image: ImageField = InputField(description="Input image")
+    tile_size: int = InputField(default=512, description="Tile size")
+    border_mode: Literal["none", "seamless", "mirror", "replicate"] = InputField(
+        default="none", description="Border mode to apply to eliminate any artifacts or seams"
+    )
+
+    def invoke(self, context: InvocationContext) -> PBRMapsOutput:
+        image_pil = context.images.get_pil(self.image.image_name, mode="RGB")
+
+        def loader(model_path: pathlib.Path):
+            return PBRMapsGenerator.load_model(model_path, TorchDevice.choose_torch_device())
+
+        torch_device = TorchDevice.choose_torch_device()
+
+        with (
+            context.models.load_remote_model(NORMAL_MAP_MODEL, loader) as normal_map_model,
+            context.models.load_remote_model(OTHER_MAP_MODEL, loader) as other_map_model,
+        ):
+            assert isinstance(normal_map_model, PBR_RRDB_Net)
+            assert isinstance(other_map_model, PBR_RRDB_Net)
+            pbr_pipeline = PBRMapsGenerator(normal_map_model, other_map_model, torch_device)
+            normal_map, roughness_map, displacement_map = pbr_pipeline.generate_maps(
+                image_pil, self.tile_size, self.border_mode
+            )
+
+            normal_map = context.images.save(normal_map)
+            normal_map_field = ImageField(image_name=normal_map.image_name)
+
+            roughness_map = context.images.save(roughness_map)
+            roughness_map_field = ImageField(image_name=roughness_map.image_name)
+
+            displacement_map = context.images.save(displacement_map)
+            displacement_map_field = ImageField(image_name=displacement_map.image_name)
+
+        return PBRMapsOutput(
+            normal_map=normal_map_field, roughness_map=roughness_map_field, displacement_map=displacement_map_field
+        )
```
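The `border_mode` options suggest that the tiled model pass pads the input before slicing, so tile seams don't leave artifacts in the generated maps. The actual implementation lives in the new `image_ops.py` (+93 lines), which is not shown in this diff; the helper below is only a hypothetical illustration of the idea using plain PyTorch padding modes:

```python
# Hypothetical illustration of border_mode padding -- not the actual image_ops.py code.
import torch
import torch.nn.functional as F


def pad_for_tiling(image: torch.Tensor, pad: int, border_mode: str) -> torch.Tensor:
    """Pad a [B, C, H, W] image so tile borders have context.

    'seamless' wraps the image around (for tileable textures), 'mirror' reflects
    it at the edges, and 'replicate' repeats the edge pixels.
    """
    if border_mode == "none":
        return image
    mode = {"seamless": "circular", "mirror": "reflect", "replicate": "replicate"}[border_mode]
    return F.pad(image, (pad, pad, pad, pad), mode=mode)


x = torch.rand(1, 3, 512, 512)
print(pad_for_tiling(x, 32, "mirror").shape)  # torch.Size([1, 3, 576, 576])
```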
```diff
--- a/invokeai/app/invocations/z_image_denoise.py
+++ b/invokeai/app/invocations/z_image_denoise.py
@@ -1,3 +1,4 @@
+import inspect
 import math
 from contextlib import ExitStack
 from typing import Callable, Iterator, Optional, Tuple
```
```diff
@@ -5,6 +6,7 @@ from typing import Callable, Iterator, Optional, Tuple
 import einops
 import torch
 import torchvision.transforms as tv_transforms
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
 from PIL import Image
 from torchvision.transforms.functional import resize as tv_resize
 from tqdm import tqdm
```
```diff
@@ -24,6 +26,7 @@ from invokeai.app.invocations.primitives import LatentsOutput
 from invokeai.app.invocations.z_image_control import ZImageControlField
 from invokeai.app.invocations.z_image_image_to_latents import ZImageImageToLatentsInvocation
 from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.flux.schedulers import ZIMAGE_SCHEDULER_LABELS, ZIMAGE_SCHEDULER_MAP, ZIMAGE_SCHEDULER_NAME_VALUES
 from invokeai.backend.model_manager.taxonomy import BaseModelType, ModelFormat
 from invokeai.backend.patches.layer_patcher import LayerPatcher
 from invokeai.backend.patches.lora_conversions.z_image_lora_constants import Z_IMAGE_LORA_TRANSFORMER_PREFIX
```
```diff
@@ -47,7 +50,7 @@ from invokeai.backend.z_image.z_image_transformer_patch import patch_transformer
     title="Denoise - Z-Image",
     tags=["image", "z-image"],
     category="image",
-    version="1.
+    version="1.3.0",
     classification=Classification.Prototype,
 )
 class ZImageDenoiseInvocation(BaseInvocation):
```
```diff
@@ -100,6 +103,13 @@ class ZImageDenoiseInvocation(BaseInvocation):
         description=FieldDescriptions.vae + " Required for control conditioning.",
         input=Input.Connection,
     )
+    # Scheduler selection for the denoising process
+    scheduler: ZIMAGE_SCHEDULER_NAME_VALUES = InputField(
+        default="euler",
+        description="Scheduler (sampler) for the denoising process. Euler is the default and recommended for "
+        "Z-Image-Turbo. Heun is 2nd-order (better quality, 2x slower). LCM is optimized for few steps.",
+        ui_choice_labels=ZIMAGE_SCHEDULER_LABELS,
+    )
 
     @torch.no_grad()
     def invoke(self, context: InvocationContext) -> LatentsOutput:
```
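The "2x slower" in the field description follows from how Heun samples: each user-facing step runs a first-order (predictor) and a second-order (corrector) model evaluation. Assuming the Z-Image scheduler map registers diffusers' `FlowMatchHeunDiscreteScheduler` for "heun" (the map itself is not shown in this diff), the interleaved internal timestep list can be inspected directly:

```python
# Sketch: Heun's interleaved timesteps double the model evaluations per user step.
from diffusers import FlowMatchHeunDiscreteScheduler

sched = FlowMatchHeunDiscreteScheduler(num_train_timesteps=1000)
sched.set_timesteps(num_inference_steps=8)

# Every timestep except the first is repeated (predictor + corrector stages),
# so 8 user-facing steps become roughly 2 * 8 - 1 = 15 internal steps.
print(len(sched.timesteps))  # 15

# The property the denoising loop below probes via hasattr():
print(sched.state_in_first_order)  # True until step() completes the predictor stage
```

This internal-vs-user-facing step mismatch is exactly what the progress accounting in the new denoising loop has to compensate for.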
```diff
@@ -361,15 +371,32 @@
         )
 
         step_callback = self._build_step_callback(context)
-
-
-
-
-
-
-
-
-
+
+        # Initialize the diffusers scheduler if not using built-in Euler
+        scheduler: SchedulerMixin | None = None
+        use_scheduler = self.scheduler != "euler"
+
+        if use_scheduler:
+            scheduler_class = ZIMAGE_SCHEDULER_MAP[self.scheduler]
+            scheduler = scheduler_class(
+                num_train_timesteps=1000,
+                shift=1.0,
+            )
+            # Set timesteps - LCM should use num_inference_steps (it has its own sigma schedule),
+            # while other schedulers can use custom sigmas if supported
+            is_lcm = self.scheduler == "lcm"
+            set_timesteps_sig = inspect.signature(scheduler.set_timesteps)
+            if not is_lcm and "sigmas" in set_timesteps_sig.parameters:
+                # Convert sigmas list to tensor for scheduler
+                scheduler.set_timesteps(sigmas=sigmas, device=device)
+            else:
+                # LCM or scheduler doesn't support custom sigmas - use num_inference_steps
+                scheduler.set_timesteps(num_inference_steps=total_steps, device=device)
+
+            # For Heun scheduler, the number of actual steps may differ
+            num_scheduler_steps = len(scheduler.timesteps)
+        else:
+            num_scheduler_steps = total_steps
 
         with ExitStack() as exit_stack:
             # Get transformer config to determine if it's quantized
```
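The `inspect.signature` check in the hunk above is a feature-detection pattern: rather than hard-coding which diffusers schedulers accept a custom `sigmas` schedule, the code probes `set_timesteps` at runtime. A standalone sketch of the same pattern, using `FlowMatchEulerDiscreteScheduler` purely as an example class (sigmas passed as a numpy array, which set_timesteps accepts):

```python
import inspect

import numpy as np
from diffusers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=1.0)

# Probe set_timesteps for a `sigmas` parameter instead of hard-coding per-class behavior.
if "sigmas" in inspect.signature(scheduler.set_timesteps).parameters:
    # Custom sigma schedule, decreasing toward 0 (the terminal sigma is appended internally).
    scheduler.set_timesteps(sigmas=np.array([1.0, 0.75, 0.5, 0.25], dtype=np.float32))
else:
    scheduler.set_timesteps(num_inference_steps=4)

print(scheduler.timesteps)  # tensor([1000., 750., 500., 250.])
```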
```diff
@@ -503,91 +530,219 @@
                 )
             )
 
-            # Denoising loop
-            for
-
-
-
-            #
-            #
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Denoising loop - supports both built-in Euler and diffusers schedulers
+            # Track user-facing step for progress (accounts for Heun's double steps)
+            user_step = 0
+
+            if use_scheduler and scheduler is not None:
+                # Use diffusers scheduler for stepping
+                # Use tqdm with total_steps (user-facing steps) not num_scheduler_steps (internal steps)
+                # This ensures progress bar shows 1/8, 2/8, etc. even when scheduler uses more internal steps
+                pbar = tqdm(total=total_steps, desc="Denoising")
+                for step_index in range(num_scheduler_steps):
+                    sched_timestep = scheduler.timesteps[step_index]
+                    # Convert scheduler timestep (0-1000) to normalized sigma (0-1)
+                    sigma_curr = sched_timestep.item() / scheduler.config.num_train_timesteps
+
+                    # For Heun scheduler, track if we're in first or second order step
+                    is_heun = hasattr(scheduler, "state_in_first_order")
+                    in_first_order = scheduler.state_in_first_order if is_heun else True
+
+                    # Timestep tensor for Z-Image model
+                    # The model expects t=0 at start (noise) and t=1 at end (clean)
+                    model_t = 1.0 - sigma_curr
+                    timestep = torch.tensor([model_t], device=device, dtype=inference_dtype).expand(latents.shape[0])
+
+                    # Run transformer for positive prediction
+                    latent_model_input = latents.to(transformer.dtype)
+                    latent_model_input = latent_model_input.unsqueeze(2)  # Add frame dimension
+                    latent_model_input_list = list(latent_model_input.unbind(dim=0))
+
+                    # Determine if control should be applied at this step
+                    apply_control = control_extension is not None and control_extension.should_apply(
+                        user_step, total_steps
                     )
-                else:
-                    model_output = transformer(
-                        x=latent_model_input_list,
-                        t=timestep,
-                        cap_feats=[pos_prompt_embeds],
-                    )
-                    model_out_list = model_output[0]  # Extract list of tensors from tuple
 
-
-
-
+                    # Run forward pass
+                    if apply_control:
+                        model_out_list, _ = z_image_forward_with_control(
+                            transformer=transformer,
+                            x=latent_model_input_list,
+                            t=timestep,
+                            cap_feats=[pos_prompt_embeds],
+                            control_extension=control_extension,
+                        )
+                    else:
+                        model_output = transformer(
+                            x=latent_model_input_list,
+                            t=timestep,
+                            cap_feats=[pos_prompt_embeds],
+                        )
+                        model_out_list = model_output[0]
+
+                    noise_pred_cond = torch.stack([t.float() for t in model_out_list], dim=0)
+                    noise_pred_cond = noise_pred_cond.squeeze(2)
+                    noise_pred_cond = -noise_pred_cond  # Z-Image uses v-prediction with negation
+
+                    # Apply CFG if enabled
+                    if do_classifier_free_guidance and neg_prompt_embeds is not None:
+                        if apply_control:
+                            model_out_list_uncond, _ = z_image_forward_with_control(
+                                transformer=transformer,
+                                x=latent_model_input_list,
+                                t=timestep,
+                                cap_feats=[neg_prompt_embeds],
+                                control_extension=control_extension,
+                            )
+                        else:
+                            model_output_uncond = transformer(
+                                x=latent_model_input_list,
+                                t=timestep,
+                                cap_feats=[neg_prompt_embeds],
+                            )
+                            model_out_list_uncond = model_output_uncond[0]
+
+                        noise_pred_uncond = torch.stack([t.float() for t in model_out_list_uncond], dim=0)
+                        noise_pred_uncond = noise_pred_uncond.squeeze(2)
+                        noise_pred_uncond = -noise_pred_uncond
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
+                    else:
+                        noise_pred = noise_pred_cond
+
+                    # Use scheduler.step() for the update
+                    step_output = scheduler.step(model_output=noise_pred, timestep=sched_timestep, sample=latents)
+                    latents = step_output.prev_sample
+
+                    # Get sigma_prev for inpainting (next sigma value)
+                    if step_index + 1 < len(scheduler.sigmas):
+                        sigma_prev = scheduler.sigmas[step_index + 1].item()
+                    else:
+                        sigma_prev = 0.0
+
+                    if inpaint_extension is not None:
+                        latents = inpaint_extension.merge_intermediate_latents_with_init_latents(latents, sigma_prev)
+
+                    # For Heun, only increment user step after second-order step completes
+                    if is_heun:
+                        if not in_first_order:
+                            user_step += 1
+                            # Only call step_callback if we haven't exceeded total_steps
+                            if user_step <= total_steps:
+                                pbar.update(1)
+                                step_callback(
+                                    PipelineIntermediateState(
+                                        step=user_step,
+                                        order=2,
+                                        total_steps=total_steps,
+                                        timestep=int(sigma_curr * 1000),
+                                        latents=latents,
+                                    ),
+                                )
+                    else:
+                        # For LCM and other first-order schedulers
+                        user_step += 1
+                        # Only call step_callback if we haven't exceeded total_steps
+                        # (LCM scheduler may have more internal steps than user-facing steps)
+                        if user_step <= total_steps:
+                            pbar.update(1)
+                            step_callback(
+                                PipelineIntermediateState(
+                                    step=user_step,
+                                    order=1,
+                                    total_steps=total_steps,
+                                    timestep=int(sigma_curr * 1000),
+                                    latents=latents,
+                                ),
+                            )
+                pbar.close()
+            else:
+                # Original Euler implementation (default, optimized for Z-Image)
+                for step_idx in tqdm(range(total_steps)):
+                    sigma_curr = sigmas[step_idx]
+                    sigma_prev = sigmas[step_idx + 1]
+
+                    # Timestep tensor for Z-Image model
+                    # The model expects t=0 at start (noise) and t=1 at end (clean)
+                    # Sigma goes from 1 (noise) to 0 (clean), so model_t = 1 - sigma
+                    model_t = 1.0 - sigma_curr
+                    timestep = torch.tensor([model_t], device=device, dtype=inference_dtype).expand(latents.shape[0])
+
+                    # Run transformer for positive prediction
+                    # Z-Image transformer expects: x as list of [C, 1, H, W] tensors, t, cap_feats as list
+                    # Prepare latent input: [B, C, H, W] -> [B, C, 1, H, W] -> list of [C, 1, H, W]
+                    latent_model_input = latents.to(transformer.dtype)
+                    latent_model_input = latent_model_input.unsqueeze(2)  # Add frame dimension
+                    latent_model_input_list = list(latent_model_input.unbind(dim=0))
+
+                    # Determine if control should be applied at this step
+                    apply_control = control_extension is not None and control_extension.should_apply(
+                        step_idx, total_steps
+                    )
 
-
-            if do_classifier_free_guidance and neg_prompt_embeds is not None:
+                    # Run forward pass - use custom forward with control if extension is active
                     if apply_control:
-
+                        model_out_list, _ = z_image_forward_with_control(
                             transformer=transformer,
                             x=latent_model_input_list,
                             t=timestep,
-                            cap_feats=[
+                            cap_feats=[pos_prompt_embeds],
                             control_extension=control_extension,
                         )
                     else:
-
+                        model_output = transformer(
                             x=latent_model_input_list,
                             t=timestep,
-                            cap_feats=[
+                            cap_feats=[pos_prompt_embeds],
                         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        model_out_list = model_output[0]  # Extract list of tensors from tuple
+
+                    noise_pred_cond = torch.stack([t.float() for t in model_out_list], dim=0)
+                    noise_pred_cond = noise_pred_cond.squeeze(2)  # Remove frame dimension
+                    noise_pred_cond = -noise_pred_cond  # Z-Image uses v-prediction with negation
+
+                    # Apply CFG if enabled
+                    if do_classifier_free_guidance and neg_prompt_embeds is not None:
+                        if apply_control:
+                            model_out_list_uncond, _ = z_image_forward_with_control(
+                                transformer=transformer,
+                                x=latent_model_input_list,
+                                t=timestep,
+                                cap_feats=[neg_prompt_embeds],
+                                control_extension=control_extension,
+                            )
+                        else:
+                            model_output_uncond = transformer(
+                                x=latent_model_input_list,
+                                t=timestep,
+                                cap_feats=[neg_prompt_embeds],
+                            )
+                            model_out_list_uncond = model_output_uncond[0]  # Extract list of tensors from tuple
+
+                        noise_pred_uncond = torch.stack([t.float() for t in model_out_list_uncond], dim=0)
+                        noise_pred_uncond = noise_pred_uncond.squeeze(2)
+                        noise_pred_uncond = -noise_pred_uncond
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
+                    else:
+                        noise_pred = noise_pred_cond
+
+                    # Euler step
+                    latents_dtype = latents.dtype
+                    latents = latents.to(dtype=torch.float32)
+                    latents = latents + (sigma_prev - sigma_curr) * noise_pred
+                    latents = latents.to(dtype=latents_dtype)
+
+                    if inpaint_extension is not None:
+                        latents = inpaint_extension.merge_intermediate_latents_with_init_latents(latents, sigma_prev)
+
+                    step_callback(
+                        PipelineIntermediateState(
+                            step=step_idx + 1,
+                            order=1,
+                            total_steps=total_steps,
+                            timestep=int(sigma_curr * 1000),
+                            latents=latents,
+                        ),
+                    )
 
         return latents
 
```
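Both branches of that loop share the same core arithmetic: classifier-free guidance blends the conditional and unconditional velocity predictions, and the default branch then integrates the rectified-flow ODE with a plain Euler step, `x <- x + (sigma_prev - sigma_curr) * v`. A self-contained sketch with toy tensors (the names here are illustrative, not InvokeAI API):

```python
import torch


def cfg(v_cond: torch.Tensor, v_uncond: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    # Same formula as the loop above; guidance_scale=1.0 reduces to v_cond.
    return v_uncond + guidance_scale * (v_cond - v_uncond)


def euler_step(x: torch.Tensor, v: torch.Tensor, sigma_curr: float, sigma_prev: float) -> torch.Tensor:
    # Sigma decreases from 1 (pure noise) to 0 (clean), so the increment is
    # negative-signed and each step walks the latents toward the clean image.
    return x + (sigma_prev - sigma_curr) * v


# Toy 4-step schedule, matching the loop's use of sigmas[i] and sigmas[i + 1].
sigmas = [1.0, 0.75, 0.5, 0.25, 0.0]
x = torch.randn(1, 16, 32, 32)
for i in range(len(sigmas) - 1):
    v = -torch.randn_like(x)  # stand-in for the negated model prediction
    x = euler_step(x, cfg(v, v, guidance_scale=4.0), sigmas[i], sigmas[i + 1])
print(x.shape)  # torch.Size([1, 16, 32, 32])
```

The diffusers-scheduler branch delegates this update to `scheduler.step()` instead, which is what allows Heun's predictor/corrector pair and LCM's custom sigma schedule to slot in without changing the surrounding loop.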