InvokeAI 6.10.0rc1__py3-none-any.whl → 6.10.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. invokeai/app/invocations/flux_denoise.py +15 -1
  2. invokeai/app/invocations/pbr_maps.py +59 -0
  3. invokeai/app/invocations/z_image_denoise.py +237 -82
  4. invokeai/backend/flux/denoise.py +196 -11
  5. invokeai/backend/flux/schedulers.py +62 -0
  6. invokeai/backend/image_util/pbr_maps/architecture/block.py +367 -0
  7. invokeai/backend/image_util/pbr_maps/architecture/pbr_rrdb_net.py +70 -0
  8. invokeai/backend/image_util/pbr_maps/pbr_maps.py +141 -0
  9. invokeai/backend/image_util/pbr_maps/utils/image_ops.py +93 -0
  10. invokeai/backend/model_manager/configs/lora.py +36 -0
  11. invokeai/backend/model_manager/load/load_default.py +1 -0
  12. invokeai/backend/model_manager/load/model_loaders/cogview4.py +2 -1
  13. invokeai/backend/model_manager/load/model_loaders/flux.py +13 -6
  14. invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py +4 -2
  15. invokeai/backend/model_manager/load/model_loaders/onnx.py +1 -0
  16. invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py +3 -1
  17. invokeai/backend/model_manager/load/model_loaders/z_image.py +37 -3
  18. invokeai/backend/model_manager/starter_models.py +13 -4
  19. invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py +39 -5
  20. invokeai/backend/quantization/gguf/ggml_tensor.py +15 -4
  21. invokeai/backend/z_image/extensions/regional_prompting_extension.py +10 -12
  22. invokeai/frontend/web/dist/assets/App-DllqPQ3j.js +161 -0
  23. invokeai/frontend/web/dist/assets/{browser-ponyfill-DHZxq1nk.js → browser-ponyfill-BP0RxJ4G.js} +1 -1
  24. invokeai/frontend/web/dist/assets/{index-dgSJAY--.js → index-B44qKjrs.js} +51 -51
  25. invokeai/frontend/web/dist/index.html +1 -1
  26. invokeai/frontend/web/dist/locales/en-GB.json +1 -0
  27. invokeai/frontend/web/dist/locales/en.json +11 -5
  28. invokeai/version/invokeai_version.py +1 -1
  29. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/METADATA +2 -2
  30. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/RECORD +36 -29
  31. invokeai/frontend/web/dist/assets/App-CYhlZO3Q.js +0 -161
  32. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/WHEEL +0 -0
  33. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/entry_points.txt +0 -0
  34. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/licenses/LICENSE +0 -0
  35. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
  36. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
  37. {invokeai-6.10.0rc1.dist-info → invokeai-6.10.0rc2.dist-info}/top_level.txt +0 -0
invokeai/app/invocations/flux_denoise.py
@@ -47,6 +47,7 @@ from invokeai.backend.flux.sampling_utils import (
     pack,
     unpack,
 )
+from invokeai.backend.flux.schedulers import FLUX_SCHEDULER_LABELS, FLUX_SCHEDULER_MAP, FLUX_SCHEDULER_NAME_VALUES
 from invokeai.backend.flux.text_conditioning import FluxReduxConditioning, FluxTextConditioning
 from invokeai.backend.model_manager.taxonomy import BaseModelType, FluxVariantType, ModelFormat, ModelType
 from invokeai.backend.patches.layer_patcher import LayerPatcher
@@ -63,7 +64,7 @@ from invokeai.backend.util.devices import TorchDevice
     title="FLUX Denoise",
     tags=["image", "flux"],
     category="image",
-    version="4.1.0",
+    version="4.2.0",
 )
 class FluxDenoiseInvocation(BaseInvocation):
     """Run denoising process with a FLUX transformer model."""
@@ -132,6 +133,12 @@ class FluxDenoiseInvocation(BaseInvocation):
     num_steps: int = InputField(
         default=4, description="Number of diffusion steps. Recommended values are schnell: 4, dev: 50."
     )
+    scheduler: FLUX_SCHEDULER_NAME_VALUES = InputField(
+        default="euler",
+        description="Scheduler (sampler) for the denoising process. 'euler' is fast and standard. "
+        "'heun' is 2nd-order (better quality, 2x slower). 'lcm' is optimized for few steps.",
+        ui_choice_labels=FLUX_SCHEDULER_LABELS,
+    )
     guidance: float = InputField(
         default=4.0,
         description="The guidance strength. Higher values adhere more strictly to the prompt, and will produce less diverse images. FLUX dev only, ignored for schnell.",
@@ -242,6 +249,12 @@ class FluxDenoiseInvocation(BaseInvocation):
             shift=not is_schnell,
         )

+        # Create scheduler if not using default euler
+        scheduler = None
+        if self.scheduler in FLUX_SCHEDULER_MAP:
+            scheduler_class = FLUX_SCHEDULER_MAP[self.scheduler]
+            scheduler = scheduler_class(num_train_timesteps=1000)
+
         # Clip the timesteps schedule based on denoising_start and denoising_end.
         timesteps = clip_timestep_schedule_fractional(timesteps, self.denoising_start, self.denoising_end)

@@ -426,6 +439,7 @@ class FluxDenoiseInvocation(BaseInvocation):
             img_cond=img_cond,
             img_cond_seq=img_cond_seq,
             img_cond_seq_ids=img_cond_seq_ids,
+            scheduler=scheduler,
         )

         x = unpack(x.float(), self.height, self.width)
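The hunks above reference a new module, invokeai/backend/flux/schedulers.py (+62 lines), whose body is not included in this diff. A minimal sketch of what it plausibly exports, inferred from the names imported above and the 'euler'/'heun'/'lcm' choices in the field description (the concrete diffusers classes are assumptions, not confirmed by this diff):

    # Hypothetical reconstruction of invokeai/backend/flux/schedulers.py.
    # Only the exported names are confirmed by the diff; the class choices
    # below are assumptions.
    from typing import Literal

    from diffusers import FlowMatchHeunDiscreteScheduler  # assumed backing class for "heun"

    # "euler" stays on the built-in sampling loop, so it needs no map entry.
    FLUX_SCHEDULER_NAME_VALUES = Literal["euler", "heun", "lcm"]

    FLUX_SCHEDULER_LABELS: dict[str, str] = {
        "euler": "Euler (default)",
        "heun": "Heun (2nd order)",
        "lcm": "LCM",
    }

    # flux_denoise.py instantiates these with num_train_timesteps=1000.
    FLUX_SCHEDULER_MAP: dict[str, type] = {
        "heun": FlowMatchHeunDiscreteScheduler,
        # "lcm": ...  # exact LCM scheduler class not visible in this diff
    }

Note that flux_denoise.py only builds a scheduler object when the selected name is present in FLUX_SCHEDULER_MAP, so the default "euler" falls through to the existing code path with scheduler=None.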
invokeai/app/invocations/pbr_maps.py (new file)
@@ -0,0 +1,59 @@
+import pathlib
+from typing import Literal
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
+from invokeai.app.invocations.fields import ImageField, InputField, OutputField, WithBoard, WithMetadata
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.image_util.pbr_maps.architecture.pbr_rrdb_net import PBR_RRDB_Net
+from invokeai.backend.image_util.pbr_maps.pbr_maps import NORMAL_MAP_MODEL, OTHER_MAP_MODEL, PBRMapsGenerator
+from invokeai.backend.util.devices import TorchDevice
+
+
+@invocation_output("pbr_maps-output")
+class PBRMapsOutput(BaseInvocationOutput):
+    normal_map: ImageField = OutputField(default=None, description="The generated normal map")
+    roughness_map: ImageField = OutputField(default=None, description="The generated roughness map")
+    displacement_map: ImageField = OutputField(default=None, description="The generated displacement map")
+
+
+@invocation("pbr_maps", title="PBR Maps", tags=["image", "material"], category="image", version="1.0.0")
+class PBRMapsInvocation(BaseInvocation, WithMetadata, WithBoard):
+    """Generate Normal, Displacement and Roughness Map from a given image"""
+
+    image: ImageField = InputField(description="Input image")
+    tile_size: int = InputField(default=512, description="Tile size")
+    border_mode: Literal["none", "seamless", "mirror", "replicate"] = InputField(
+        default="none", description="Border mode to apply to eliminate any artifacts or seams"
+    )
+
+    def invoke(self, context: InvocationContext) -> PBRMapsOutput:
+        image_pil = context.images.get_pil(self.image.image_name, mode="RGB")
+
+        def loader(model_path: pathlib.Path):
+            return PBRMapsGenerator.load_model(model_path, TorchDevice.choose_torch_device())
+
+        torch_device = TorchDevice.choose_torch_device()
+
+        with (
+            context.models.load_remote_model(NORMAL_MAP_MODEL, loader) as normal_map_model,
+            context.models.load_remote_model(OTHER_MAP_MODEL, loader) as other_map_model,
+        ):
+            assert isinstance(normal_map_model, PBR_RRDB_Net)
+            assert isinstance(other_map_model, PBR_RRDB_Net)
+            pbr_pipeline = PBRMapsGenerator(normal_map_model, other_map_model, torch_device)
+            normal_map, roughness_map, displacement_map = pbr_pipeline.generate_maps(
+                image_pil, self.tile_size, self.border_mode
+            )
+
+        normal_map = context.images.save(normal_map)
+        normal_map_field = ImageField(image_name=normal_map.image_name)
+
+        roughness_map = context.images.save(roughness_map)
+        roughness_map_field = ImageField(image_name=roughness_map.image_name)
+
+        displacement_map = context.images.save(displacement_map)
+        displacement_map_field = ImageField(image_name=displacement_map.image_name)
+
+        return PBRMapsOutput(
+            normal_map=normal_map_field, roughness_map=roughness_map_field, displacement_map=displacement_map_field
+        )
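The border_mode field suggests the input is padded before tiled inference and cropped afterwards, so that tile edges see plausible context instead of producing seams. A sketch of how those four modes could map onto torch padding (a hypothetical helper for illustration; the real implementation lives in invokeai/backend/image_util/pbr_maps/ and is not shown in this diff):

    # Hypothetical helper: maps the node's border_mode choices onto
    # torch.nn.functional.pad modes. Illustrative only.
    import torch
    import torch.nn.functional as F

    _PAD_MODES = {
        "seamless": "circular",    # wrap around, for tileable textures
        "mirror": "reflect",       # mirror edge pixels
        "replicate": "replicate",  # repeat edge pixels
    }

    def pad_for_border_mode(x: torch.Tensor, border_mode: str, pad: int = 32) -> torch.Tensor:
        """Pad a [B, C, H, W] tensor so tile edges see plausible context."""
        if border_mode == "none":
            return x
        return F.pad(x, (pad, pad, pad, pad), mode=_PAD_MODES[border_mode])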
invokeai/app/invocations/z_image_denoise.py
@@ -1,3 +1,4 @@
+import inspect
 import math
 from contextlib import ExitStack
 from typing import Callable, Iterator, Optional, Tuple
@@ -5,6 +6,7 @@ from typing import Callable, Iterator, Optional, Tuple
 import einops
 import torch
 import torchvision.transforms as tv_transforms
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
 from PIL import Image
 from torchvision.transforms.functional import resize as tv_resize
 from tqdm import tqdm
@@ -24,6 +26,7 @@ from invokeai.app.invocations.primitives import LatentsOutput
 from invokeai.app.invocations.z_image_control import ZImageControlField
 from invokeai.app.invocations.z_image_image_to_latents import ZImageImageToLatentsInvocation
 from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.flux.schedulers import ZIMAGE_SCHEDULER_LABELS, ZIMAGE_SCHEDULER_MAP, ZIMAGE_SCHEDULER_NAME_VALUES
 from invokeai.backend.model_manager.taxonomy import BaseModelType, ModelFormat
 from invokeai.backend.patches.layer_patcher import LayerPatcher
 from invokeai.backend.patches.lora_conversions.z_image_lora_constants import Z_IMAGE_LORA_TRANSFORMER_PREFIX
@@ -47,7 +50,7 @@ from invokeai.backend.z_image.z_image_transformer_patch import patch_transformer
     title="Denoise - Z-Image",
     tags=["image", "z-image"],
     category="image",
-    version="1.2.0",
+    version="1.3.0",
     classification=Classification.Prototype,
 )
 class ZImageDenoiseInvocation(BaseInvocation):
@@ -100,6 +103,13 @@ class ZImageDenoiseInvocation(BaseInvocation):
         description=FieldDescriptions.vae + " Required for control conditioning.",
         input=Input.Connection,
     )
+    # Scheduler selection for the denoising process
+    scheduler: ZIMAGE_SCHEDULER_NAME_VALUES = InputField(
+        default="euler",
+        description="Scheduler (sampler) for the denoising process. Euler is the default and recommended for "
+        "Z-Image-Turbo. Heun is 2nd-order (better quality, 2x slower). LCM is optimized for few steps.",
+        ui_choice_labels=ZIMAGE_SCHEDULER_LABELS,
+    )

     @torch.no_grad()
     def invoke(self, context: InvocationContext) -> LatentsOutput:
@@ -361,15 +371,32 @@ class ZImageDenoiseInvocation(BaseInvocation):
             )

         step_callback = self._build_step_callback(context)
-        step_callback(
-            PipelineIntermediateState(
-                step=0,
-                order=1,
-                total_steps=total_steps,
-                timestep=int(sigmas[0] * 1000),
-                latents=latents,
-            ),
-        )
+
+        # Initialize the diffusers scheduler if not using built-in Euler
+        scheduler: SchedulerMixin | None = None
+        use_scheduler = self.scheduler != "euler"
+
+        if use_scheduler:
+            scheduler_class = ZIMAGE_SCHEDULER_MAP[self.scheduler]
+            scheduler = scheduler_class(
+                num_train_timesteps=1000,
+                shift=1.0,
+            )
+            # Set timesteps - LCM should use num_inference_steps (it has its own sigma schedule),
+            # while other schedulers can use custom sigmas if supported
+            is_lcm = self.scheduler == "lcm"
+            set_timesteps_sig = inspect.signature(scheduler.set_timesteps)
+            if not is_lcm and "sigmas" in set_timesteps_sig.parameters:
+                # Convert sigmas list to tensor for scheduler
+                scheduler.set_timesteps(sigmas=sigmas, device=device)
+            else:
+                # LCM or scheduler doesn't support custom sigmas - use num_inference_steps
+                scheduler.set_timesteps(num_inference_steps=total_steps, device=device)
+
+            # For Heun scheduler, the number of actual steps may differ
+            num_scheduler_steps = len(scheduler.timesteps)
+        else:
+            num_scheduler_steps = total_steps

         with ExitStack() as exit_stack:
             # Get transformer config to determine if it's quantized
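The inspect.signature probe in the hunk above is a small capability-check pattern: instead of hard-coding which diffusers schedulers accept a custom sigmas schedule, the code asks each scheduler's set_timesteps method at runtime. The same pattern in isolation, with toy stand-ins for the scheduler methods:

    # Signature probing: pass an optional kwarg only if the callee declares it.
    import inspect

    def set_timesteps_a(num_inference_steps=None, sigmas=None, device=None):
        return ("custom sigmas", sigmas)

    def set_timesteps_b(num_inference_steps=None, device=None):
        return ("fixed schedule", num_inference_steps)

    for fn in (set_timesteps_a, set_timesteps_b):
        if "sigmas" in inspect.signature(fn).parameters:
            print(fn(sigmas=[1.0, 0.5, 0.0]))
        else:
            print(fn(num_inference_steps=8))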
@@ -503,91 +530,219 @@
                 )
             )

-            # Denoising loop
-            for step_idx in tqdm(range(total_steps)):
-                sigma_curr = sigmas[step_idx]
-                sigma_prev = sigmas[step_idx + 1]
-
-                # Timestep tensor for Z-Image model
-                # The model expects t=0 at start (noise) and t=1 at end (clean)
-                # Sigma goes from 1 (noise) to 0 (clean), so model_t = 1 - sigma
-                model_t = 1.0 - sigma_curr
-                timestep = torch.tensor([model_t], device=device, dtype=inference_dtype).expand(latents.shape[0])
-
-                # Run transformer for positive prediction
-                # Z-Image transformer expects: x as list of [C, 1, H, W] tensors, t, cap_feats as list
-                # Prepare latent input: [B, C, H, W] -> [B, C, 1, H, W] -> list of [C, 1, H, W]
-                latent_model_input = latents.to(transformer.dtype)
-                latent_model_input = latent_model_input.unsqueeze(2)  # Add frame dimension
-                latent_model_input_list = list(latent_model_input.unbind(dim=0))
-
-                # Determine if control should be applied at this step
-                apply_control = control_extension is not None and control_extension.should_apply(step_idx, total_steps)
-
-                # Run forward pass - use custom forward with control if extension is active
-                if apply_control:
-                    model_out_list, _ = z_image_forward_with_control(
-                        transformer=transformer,
-                        x=latent_model_input_list,
-                        t=timestep,
-                        cap_feats=[pos_prompt_embeds],
-                        control_extension=control_extension,
+            # Denoising loop - supports both built-in Euler and diffusers schedulers
+            # Track user-facing step for progress (accounts for Heun's double steps)
+            user_step = 0
+
+            if use_scheduler and scheduler is not None:
+                # Use diffusers scheduler for stepping
+                # Use tqdm with total_steps (user-facing steps) not num_scheduler_steps (internal steps)
+                # This ensures progress bar shows 1/8, 2/8, etc. even when scheduler uses more internal steps
+                pbar = tqdm(total=total_steps, desc="Denoising")
+                for step_index in range(num_scheduler_steps):
+                    sched_timestep = scheduler.timesteps[step_index]
+                    # Convert scheduler timestep (0-1000) to normalized sigma (0-1)
+                    sigma_curr = sched_timestep.item() / scheduler.config.num_train_timesteps
+
+                    # For Heun scheduler, track if we're in first or second order step
+                    is_heun = hasattr(scheduler, "state_in_first_order")
+                    in_first_order = scheduler.state_in_first_order if is_heun else True
+
+                    # Timestep tensor for Z-Image model
+                    # The model expects t=0 at start (noise) and t=1 at end (clean)
+                    model_t = 1.0 - sigma_curr
+                    timestep = torch.tensor([model_t], device=device, dtype=inference_dtype).expand(latents.shape[0])
+
+                    # Run transformer for positive prediction
+                    latent_model_input = latents.to(transformer.dtype)
+                    latent_model_input = latent_model_input.unsqueeze(2)  # Add frame dimension
+                    latent_model_input_list = list(latent_model_input.unbind(dim=0))
+
+                    # Determine if control should be applied at this step
+                    apply_control = control_extension is not None and control_extension.should_apply(
+                        user_step, total_steps
                     )
-                else:
-                    model_output = transformer(
-                        x=latent_model_input_list,
-                        t=timestep,
-                        cap_feats=[pos_prompt_embeds],
-                    )
-                    model_out_list = model_output[0]  # Extract list of tensors from tuple

-                noise_pred_cond = torch.stack([t.float() for t in model_out_list], dim=0)
-                noise_pred_cond = noise_pred_cond.squeeze(2)  # Remove frame dimension
-                noise_pred_cond = -noise_pred_cond  # Z-Image uses v-prediction with negation
+                    # Run forward pass
+                    if apply_control:
+                        model_out_list, _ = z_image_forward_with_control(
+                            transformer=transformer,
+                            x=latent_model_input_list,
+                            t=timestep,
+                            cap_feats=[pos_prompt_embeds],
+                            control_extension=control_extension,
+                        )
+                    else:
+                        model_output = transformer(
+                            x=latent_model_input_list,
+                            t=timestep,
+                            cap_feats=[pos_prompt_embeds],
+                        )
+                        model_out_list = model_output[0]
+
+                    noise_pred_cond = torch.stack([t.float() for t in model_out_list], dim=0)
+                    noise_pred_cond = noise_pred_cond.squeeze(2)
+                    noise_pred_cond = -noise_pred_cond  # Z-Image uses v-prediction with negation
+
+                    # Apply CFG if enabled
+                    if do_classifier_free_guidance and neg_prompt_embeds is not None:
+                        if apply_control:
+                            model_out_list_uncond, _ = z_image_forward_with_control(
+                                transformer=transformer,
+                                x=latent_model_input_list,
+                                t=timestep,
+                                cap_feats=[neg_prompt_embeds],
+                                control_extension=control_extension,
+                            )
+                        else:
+                            model_output_uncond = transformer(
+                                x=latent_model_input_list,
+                                t=timestep,
+                                cap_feats=[neg_prompt_embeds],
+                            )
+                            model_out_list_uncond = model_output_uncond[0]
+
+                        noise_pred_uncond = torch.stack([t.float() for t in model_out_list_uncond], dim=0)
+                        noise_pred_uncond = noise_pred_uncond.squeeze(2)
+                        noise_pred_uncond = -noise_pred_uncond
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
+                    else:
+                        noise_pred = noise_pred_cond
+
+                    # Use scheduler.step() for the update
+                    step_output = scheduler.step(model_output=noise_pred, timestep=sched_timestep, sample=latents)
+                    latents = step_output.prev_sample
+
+                    # Get sigma_prev for inpainting (next sigma value)
+                    if step_index + 1 < len(scheduler.sigmas):
+                        sigma_prev = scheduler.sigmas[step_index + 1].item()
+                    else:
+                        sigma_prev = 0.0
+
+                    if inpaint_extension is not None:
+                        latents = inpaint_extension.merge_intermediate_latents_with_init_latents(latents, sigma_prev)
+
+                    # For Heun, only increment user step after second-order step completes
+                    if is_heun:
+                        if not in_first_order:
+                            user_step += 1
+                            # Only call step_callback if we haven't exceeded total_steps
+                            if user_step <= total_steps:
+                                pbar.update(1)
+                                step_callback(
+                                    PipelineIntermediateState(
+                                        step=user_step,
+                                        order=2,
+                                        total_steps=total_steps,
+                                        timestep=int(sigma_curr * 1000),
+                                        latents=latents,
+                                    ),
+                                )
+                    else:
+                        # For LCM and other first-order schedulers
+                        user_step += 1
+                        # Only call step_callback if we haven't exceeded total_steps
+                        # (LCM scheduler may have more internal steps than user-facing steps)
+                        if user_step <= total_steps:
+                            pbar.update(1)
+                            step_callback(
+                                PipelineIntermediateState(
+                                    step=user_step,
+                                    order=1,
+                                    total_steps=total_steps,
+                                    timestep=int(sigma_curr * 1000),
+                                    latents=latents,
+                                ),
+                            )
+                pbar.close()
+            else:
+                # Original Euler implementation (default, optimized for Z-Image)
+                for step_idx in tqdm(range(total_steps)):
+                    sigma_curr = sigmas[step_idx]
+                    sigma_prev = sigmas[step_idx + 1]
+
+                    # Timestep tensor for Z-Image model
+                    # The model expects t=0 at start (noise) and t=1 at end (clean)
+                    # Sigma goes from 1 (noise) to 0 (clean), so model_t = 1 - sigma
+                    model_t = 1.0 - sigma_curr
+                    timestep = torch.tensor([model_t], device=device, dtype=inference_dtype).expand(latents.shape[0])
+
+                    # Run transformer for positive prediction
+                    # Z-Image transformer expects: x as list of [C, 1, H, W] tensors, t, cap_feats as list
+                    # Prepare latent input: [B, C, H, W] -> [B, C, 1, H, W] -> list of [C, 1, H, W]
+                    latent_model_input = latents.to(transformer.dtype)
+                    latent_model_input = latent_model_input.unsqueeze(2)  # Add frame dimension
+                    latent_model_input_list = list(latent_model_input.unbind(dim=0))
+
+                    # Determine if control should be applied at this step
+                    apply_control = control_extension is not None and control_extension.should_apply(
+                        step_idx, total_steps
+                    )

-                # Apply CFG if enabled
-                if do_classifier_free_guidance and neg_prompt_embeds is not None:
+                    # Run forward pass - use custom forward with control if extension is active
                     if apply_control:
-                        model_out_list_uncond, _ = z_image_forward_with_control(
+                        model_out_list, _ = z_image_forward_with_control(
                             transformer=transformer,
                             x=latent_model_input_list,
                             t=timestep,
-                            cap_feats=[neg_prompt_embeds],
+                            cap_feats=[pos_prompt_embeds],
                             control_extension=control_extension,
                         )
                     else:
-                        model_output_uncond = transformer(
+                        model_output = transformer(
                             x=latent_model_input_list,
                             t=timestep,
-                            cap_feats=[neg_prompt_embeds],
+                            cap_feats=[pos_prompt_embeds],
                         )
-                        model_out_list_uncond = model_output_uncond[0]  # Extract list of tensors from tuple
-
-                    noise_pred_uncond = torch.stack([t.float() for t in model_out_list_uncond], dim=0)
-                    noise_pred_uncond = noise_pred_uncond.squeeze(2)
-                    noise_pred_uncond = -noise_pred_uncond
-                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
-                else:
-                    noise_pred = noise_pred_cond
-
-                # Euler step
-                latents_dtype = latents.dtype
-                latents = latents.to(dtype=torch.float32)
-                latents = latents + (sigma_prev - sigma_curr) * noise_pred
-                latents = latents.to(dtype=latents_dtype)
-
-                if inpaint_extension is not None:
-                    latents = inpaint_extension.merge_intermediate_latents_with_init_latents(latents, sigma_prev)
-
-                step_callback(
-                    PipelineIntermediateState(
-                        step=step_idx + 1,
-                        order=1,
-                        total_steps=total_steps,
-                        timestep=int(sigma_curr * 1000),
-                        latents=latents,
-                    ),
-                )
+                        model_out_list = model_output[0]  # Extract list of tensors from tuple
+
+                    noise_pred_cond = torch.stack([t.float() for t in model_out_list], dim=0)
+                    noise_pred_cond = noise_pred_cond.squeeze(2)  # Remove frame dimension
+                    noise_pred_cond = -noise_pred_cond  # Z-Image uses v-prediction with negation
+
+                    # Apply CFG if enabled
+                    if do_classifier_free_guidance and neg_prompt_embeds is not None:
+                        if apply_control:
+                            model_out_list_uncond, _ = z_image_forward_with_control(
+                                transformer=transformer,
+                                x=latent_model_input_list,
+                                t=timestep,
+                                cap_feats=[neg_prompt_embeds],
+                                control_extension=control_extension,
+                            )
+                        else:
+                            model_output_uncond = transformer(
+                                x=latent_model_input_list,
+                                t=timestep,
+                                cap_feats=[neg_prompt_embeds],
+                            )
+                            model_out_list_uncond = model_output_uncond[0]  # Extract list of tensors from tuple
+
+                        noise_pred_uncond = torch.stack([t.float() for t in model_out_list_uncond], dim=0)
+                        noise_pred_uncond = noise_pred_uncond.squeeze(2)
+                        noise_pred_uncond = -noise_pred_uncond
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
+                    else:
+                        noise_pred = noise_pred_cond
+
+                    # Euler step
+                    latents_dtype = latents.dtype
+                    latents = latents.to(dtype=torch.float32)
+                    latents = latents + (sigma_prev - sigma_curr) * noise_pred
+                    latents = latents.to(dtype=latents_dtype)
+
+                    if inpaint_extension is not None:
+                        latents = inpaint_extension.merge_intermediate_latents_with_init_latents(latents, sigma_prev)
+
+                    step_callback(
+                        PipelineIntermediateState(
+                            step=step_idx + 1,
+                            order=1,
+                            total_steps=total_steps,
+                            timestep=int(sigma_curr * 1000),
+                            latents=latents,
+                        ),
+                    )

         return latents
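Both branches of the new loop implement the same flow-matching update; the retained Euler path is simply its explicit form, latents <- latents + (sigma_prev - sigma_curr) * noise_pred, with sigma falling from 1 (noise) to 0 (clean) and model_t = 1 - sigma. A self-contained toy sketch of that update rule, with a stand-in velocity function in place of the Z-Image transformer:

    # Toy reproduction of the built-in Euler flow-matching step above.
    # toy_velocity stands in for the (negated) Z-Image transformer output.
    import torch

    def toy_velocity(x: torch.Tensor, model_t: float) -> torch.Tensor:
        return -x  # stand-in for noise_pred

    sigmas = torch.linspace(1.0, 0.0, 9)  # e.g. total_steps=8 -> 9 boundary sigmas
    latents = torch.randn(1, 16, 32, 32)  # toy latents at full noise

    for step_idx in range(len(sigmas) - 1):
        sigma_curr = sigmas[step_idx].item()
        sigma_prev = sigmas[step_idx + 1].item()
        model_t = 1.0 - sigma_curr  # model time runs opposite to sigma
        noise_pred = toy_velocity(latents, model_t)
        latents = latents + (sigma_prev - sigma_curr) * noise_pred  # Euler step

The diffusers-scheduler branch delegates this same update to scheduler.step(), which lets second-order methods like Heun substitute a more accurate (but twice as expensive) integration without changing the surrounding loop.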