InvokeAI 6.10.0rc1__py3-none-any.whl → 6.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in the public registry.
Files changed (83)
  1. invokeai/app/api/routers/model_manager.py +43 -1
  2. invokeai/app/invocations/fields.py +1 -1
  3. invokeai/app/invocations/flux2_denoise.py +499 -0
  4. invokeai/app/invocations/flux2_klein_model_loader.py +222 -0
  5. invokeai/app/invocations/flux2_klein_text_encoder.py +222 -0
  6. invokeai/app/invocations/flux2_vae_decode.py +106 -0
  7. invokeai/app/invocations/flux2_vae_encode.py +88 -0
  8. invokeai/app/invocations/flux_denoise.py +77 -3
  9. invokeai/app/invocations/flux_lora_loader.py +1 -1
  10. invokeai/app/invocations/flux_model_loader.py +2 -5
  11. invokeai/app/invocations/ideal_size.py +6 -1
  12. invokeai/app/invocations/metadata.py +4 -0
  13. invokeai/app/invocations/metadata_linked.py +47 -0
  14. invokeai/app/invocations/model.py +1 -0
  15. invokeai/app/invocations/pbr_maps.py +59 -0
  16. invokeai/app/invocations/z_image_denoise.py +244 -84
  17. invokeai/app/invocations/z_image_image_to_latents.py +9 -1
  18. invokeai/app/invocations/z_image_latents_to_image.py +9 -1
  19. invokeai/app/invocations/z_image_seed_variance_enhancer.py +110 -0
  20. invokeai/app/services/config/config_default.py +3 -1
  21. invokeai/app/services/invocation_stats/invocation_stats_common.py +6 -6
  22. invokeai/app/services/invocation_stats/invocation_stats_default.py +9 -4
  23. invokeai/app/services/model_manager/model_manager_default.py +7 -0
  24. invokeai/app/services/model_records/model_records_base.py +4 -2
  25. invokeai/app/services/shared/invocation_context.py +15 -0
  26. invokeai/app/services/shared/sqlite/sqlite_util.py +2 -0
  27. invokeai/app/services/shared/sqlite_migrator/migrations/migration_25.py +61 -0
  28. invokeai/app/util/step_callback.py +58 -2
  29. invokeai/backend/flux/denoise.py +338 -118
  30. invokeai/backend/flux/dype/__init__.py +31 -0
  31. invokeai/backend/flux/dype/base.py +260 -0
  32. invokeai/backend/flux/dype/embed.py +116 -0
  33. invokeai/backend/flux/dype/presets.py +148 -0
  34. invokeai/backend/flux/dype/rope.py +110 -0
  35. invokeai/backend/flux/extensions/dype_extension.py +91 -0
  36. invokeai/backend/flux/schedulers.py +62 -0
  37. invokeai/backend/flux/util.py +35 -1
  38. invokeai/backend/flux2/__init__.py +4 -0
  39. invokeai/backend/flux2/denoise.py +280 -0
  40. invokeai/backend/flux2/ref_image_extension.py +294 -0
  41. invokeai/backend/flux2/sampling_utils.py +209 -0
  42. invokeai/backend/image_util/pbr_maps/architecture/block.py +367 -0
  43. invokeai/backend/image_util/pbr_maps/architecture/pbr_rrdb_net.py +70 -0
  44. invokeai/backend/image_util/pbr_maps/pbr_maps.py +141 -0
  45. invokeai/backend/image_util/pbr_maps/utils/image_ops.py +93 -0
  46. invokeai/backend/model_manager/configs/factory.py +19 -1
  47. invokeai/backend/model_manager/configs/lora.py +36 -0
  48. invokeai/backend/model_manager/configs/main.py +395 -3
  49. invokeai/backend/model_manager/configs/qwen3_encoder.py +116 -7
  50. invokeai/backend/model_manager/configs/vae.py +104 -2
  51. invokeai/backend/model_manager/load/model_cache/model_cache.py +107 -2
  52. invokeai/backend/model_manager/load/model_loaders/cogview4.py +2 -1
  53. invokeai/backend/model_manager/load/model_loaders/flux.py +1020 -8
  54. invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py +4 -2
  55. invokeai/backend/model_manager/load/model_loaders/onnx.py +1 -0
  56. invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py +2 -1
  57. invokeai/backend/model_manager/load/model_loaders/z_image.py +158 -31
  58. invokeai/backend/model_manager/starter_models.py +141 -4
  59. invokeai/backend/model_manager/taxonomy.py +31 -4
  60. invokeai/backend/model_manager/util/select_hf_files.py +3 -2
  61. invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py +39 -5
  62. invokeai/backend/quantization/gguf/ggml_tensor.py +15 -4
  63. invokeai/backend/util/vae_working_memory.py +0 -2
  64. invokeai/backend/z_image/extensions/regional_prompting_extension.py +10 -12
  65. invokeai/frontend/web/dist/assets/App-D13dX7be.js +161 -0
  66. invokeai/frontend/web/dist/assets/{browser-ponyfill-DHZxq1nk.js → browser-ponyfill-u_ZjhQTI.js} +1 -1
  67. invokeai/frontend/web/dist/assets/index-BB0nHmDe.js +530 -0
  68. invokeai/frontend/web/dist/index.html +1 -1
  69. invokeai/frontend/web/dist/locales/en-GB.json +1 -0
  70. invokeai/frontend/web/dist/locales/en.json +85 -6
  71. invokeai/frontend/web/dist/locales/it.json +135 -15
  72. invokeai/frontend/web/dist/locales/ru.json +11 -11
  73. invokeai/version/invokeai_version.py +1 -1
  74. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/METADATA +8 -2
  75. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/RECORD +81 -57
  76. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/WHEEL +1 -1
  77. invokeai/frontend/web/dist/assets/App-CYhlZO3Q.js +0 -161
  78. invokeai/frontend/web/dist/assets/index-dgSJAY--.js +0 -530
  79. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/entry_points.txt +0 -0
  80. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/licenses/LICENSE +0 -0
  81. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
  82. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
  83. {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/top_level.txt +0 -0
invokeai/app/invocations/z_image_denoise.py

@@ -1,3 +1,4 @@
+ import inspect
  import math
  from contextlib import ExitStack
  from typing import Callable, Iterator, Optional, Tuple
@@ -5,6 +6,7 @@ from typing import Callable, Iterator, Optional, Tuple
  import einops
  import torch
  import torchvision.transforms as tv_transforms
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
  from PIL import Image
  from torchvision.transforms.functional import resize as tv_resize
  from tqdm import tqdm
@@ -24,6 +26,7 @@ from invokeai.app.invocations.primitives import LatentsOutput
  from invokeai.app.invocations.z_image_control import ZImageControlField
  from invokeai.app.invocations.z_image_image_to_latents import ZImageImageToLatentsInvocation
  from invokeai.app.services.shared.invocation_context import InvocationContext
+ from invokeai.backend.flux.schedulers import ZIMAGE_SCHEDULER_LABELS, ZIMAGE_SCHEDULER_MAP, ZIMAGE_SCHEDULER_NAME_VALUES
  from invokeai.backend.model_manager.taxonomy import BaseModelType, ModelFormat
  from invokeai.backend.patches.layer_patcher import LayerPatcher
  from invokeai.backend.patches.lora_conversions.z_image_lora_constants import Z_IMAGE_LORA_TRANSFORMER_PREFIX
@@ -47,7 +50,7 @@ from invokeai.backend.z_image.z_image_transformer_patch import patch_transformer
  title="Denoise - Z-Image",
  tags=["image", "z-image"],
  category="image",
- version="1.2.0",
+ version="1.4.0",
  classification=Classification.Prototype,
  )
  class ZImageDenoiseInvocation(BaseInvocation):
@@ -66,6 +69,7 @@ class ZImageDenoiseInvocation(BaseInvocation):
  )
  denoising_start: float = InputField(default=0.0, ge=0, le=1, description=FieldDescriptions.denoising_start)
  denoising_end: float = InputField(default=1.0, ge=0, le=1, description=FieldDescriptions.denoising_end)
+ add_noise: bool = InputField(default=True, description="Add noise based on denoising start.")
  transformer: TransformerField = InputField(
  description=FieldDescriptions.z_image_model, input=Input.Connection, title="Transformer"
  )
@@ -100,6 +104,13 @@
  description=FieldDescriptions.vae + " Required for control conditioning.",
  input=Input.Connection,
  )
+ # Scheduler selection for the denoising process
+ scheduler: ZIMAGE_SCHEDULER_NAME_VALUES = InputField(
+ default="euler",
+ description="Scheduler (sampler) for the denoising process. Euler is the default and recommended for "
+ "Z-Image-Turbo. Heun is 2nd-order (better quality, 2x slower). LCM is optimized for few steps.",
+ ui_choice_labels=ZIMAGE_SCHEDULER_LABELS,
+ )

  @torch.no_grad()
  def invoke(self, context: InvocationContext) -> LatentsOutput:
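
Note: the ZIMAGE_SCHEDULER_* names above come from the new invokeai/backend/flux/schedulers.py module (file 36 in the list, +62 lines), whose body is not included in this excerpt. A hypothetical sketch of the shape those exports likely take, based only on how they are used here; the concrete classes and label strings are assumptions:

    # Sketch only; the real schedulers.py may differ.
    from typing import Literal, Type

    from diffusers import FlowMatchHeunDiscreteScheduler

    # Option names accepted by the node's `scheduler` field.
    ZIMAGE_SCHEDULER_NAME_VALUES = Literal["euler", "heun", "lcm"]

    # Human-readable labels shown in the UI dropdown (wording assumed).
    ZIMAGE_SCHEDULER_LABELS: dict[str, str] = {
        "euler": "Euler (default)",
        "heun": "Heun (2nd order, 2x slower)",
        "lcm": "LCM (few steps)",
    }

    # Name -> diffusers scheduler class. "euler" is handled by the node's
    # built-in loop; the class backing "lcm" is not visible in this diff.
    ZIMAGE_SCHEDULER_MAP: dict[str, Type] = {
        "heun": FlowMatchHeunDiscreteScheduler,
    }

The denoise node constructs the selected class with num_train_timesteps=1000 and shift=1.0, as shown in a later hunk of this file.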
@@ -337,8 +348,12 @@

  # Prepare input latent image
  if init_latents is not None:
- s_0 = sigmas[0]
- latents = s_0 * noise + (1.0 - s_0) * init_latents
+ if self.add_noise:
+ # Noise the init_latents by the appropriate amount for the first timestep.
+ s_0 = sigmas[0]
+ latents = s_0 * noise + (1.0 - s_0) * init_latents
+ else:
+ latents = init_latents
  else:
  if self.denoising_start > 1e-5:
  raise ValueError("denoising_start should be 0 when initial latents are not provided.")
@@ -361,15 +376,32 @@
  )

  step_callback = self._build_step_callback(context)
- step_callback(
- PipelineIntermediateState(
- step=0,
- order=1,
- total_steps=total_steps,
- timestep=int(sigmas[0] * 1000),
- latents=latents,
- ),
- )
+
+ # Initialize the diffusers scheduler if not using built-in Euler
+ scheduler: SchedulerMixin | None = None
+ use_scheduler = self.scheduler != "euler"
+
+ if use_scheduler:
+ scheduler_class = ZIMAGE_SCHEDULER_MAP[self.scheduler]
+ scheduler = scheduler_class(
+ num_train_timesteps=1000,
+ shift=1.0,
+ )
+ # Set timesteps - LCM should use num_inference_steps (it has its own sigma schedule),
+ # while other schedulers can use custom sigmas if supported
+ is_lcm = self.scheduler == "lcm"
+ set_timesteps_sig = inspect.signature(scheduler.set_timesteps)
+ if not is_lcm and "sigmas" in set_timesteps_sig.parameters:
+ # Convert sigmas list to tensor for scheduler
+ scheduler.set_timesteps(sigmas=sigmas, device=device)
+ else:
+ # LCM or scheduler doesn't support custom sigmas - use num_inference_steps
+ scheduler.set_timesteps(num_inference_steps=total_steps, device=device)
+
+ # For Heun scheduler, the number of actual steps may differ
+ num_scheduler_steps = len(scheduler.timesteps)
+ else:
+ num_scheduler_steps = total_steps

  with ExitStack() as exit_stack:
  # Get transformer config to determine if it's quantized
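
The set_timesteps call above is wrapped in an inspect.signature check because not every diffusers scheduler accepts an explicit sigmas argument. Isolated from the node, the capability probe is just this pattern (generic Python, shown here for clarity):

    import inspect

    def apply_schedule(scheduler, sigmas, total_steps, device):
        # Prefer an explicit sigma schedule when the scheduler supports one.
        if "sigmas" in inspect.signature(scheduler.set_timesteps).parameters:
            scheduler.set_timesteps(sigmas=sigmas, device=device)
        else:
            # Otherwise let the scheduler derive its own schedule.
            scheduler.set_timesteps(num_inference_steps=total_steps, device=device)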
@@ -503,91 +535,219 @@
  )
  )

- # Denoising loop
- for step_idx in tqdm(range(total_steps)):
- sigma_curr = sigmas[step_idx]
- sigma_prev = sigmas[step_idx + 1]
-
- # Timestep tensor for Z-Image model
- # The model expects t=0 at start (noise) and t=1 at end (clean)
- # Sigma goes from 1 (noise) to 0 (clean), so model_t = 1 - sigma
- model_t = 1.0 - sigma_curr
- timestep = torch.tensor([model_t], device=device, dtype=inference_dtype).expand(latents.shape[0])
-
- # Run transformer for positive prediction
- # Z-Image transformer expects: x as list of [C, 1, H, W] tensors, t, cap_feats as list
- # Prepare latent input: [B, C, H, W] -> [B, C, 1, H, W] -> list of [C, 1, H, W]
- latent_model_input = latents.to(transformer.dtype)
- latent_model_input = latent_model_input.unsqueeze(2) # Add frame dimension
- latent_model_input_list = list(latent_model_input.unbind(dim=0))
-
- # Determine if control should be applied at this step
- apply_control = control_extension is not None and control_extension.should_apply(step_idx, total_steps)
-
- # Run forward pass - use custom forward with control if extension is active
- if apply_control:
- model_out_list, _ = z_image_forward_with_control(
- transformer=transformer,
- x=latent_model_input_list,
- t=timestep,
- cap_feats=[pos_prompt_embeds],
- control_extension=control_extension,
- )
- else:
- model_output = transformer(
- x=latent_model_input_list,
- t=timestep,
- cap_feats=[pos_prompt_embeds],
+ # Denoising loop - supports both built-in Euler and diffusers schedulers
+ # Track user-facing step for progress (accounts for Heun's double steps)
+ user_step = 0
+
+ if use_scheduler and scheduler is not None:
+ # Use diffusers scheduler for stepping
+ # Use tqdm with total_steps (user-facing steps) not num_scheduler_steps (internal steps)
+ # This ensures progress bar shows 1/8, 2/8, etc. even when scheduler uses more internal steps
+ pbar = tqdm(total=total_steps, desc="Denoising")
+ for step_index in range(num_scheduler_steps):
+ sched_timestep = scheduler.timesteps[step_index]
+ # Convert scheduler timestep (0-1000) to normalized sigma (0-1)
+ sigma_curr = sched_timestep.item() / scheduler.config.num_train_timesteps
+
+ # For Heun scheduler, track if we're in first or second order step
+ is_heun = hasattr(scheduler, "state_in_first_order")
+ in_first_order = scheduler.state_in_first_order if is_heun else True
+
+ # Timestep tensor for Z-Image model
+ # The model expects t=0 at start (noise) and t=1 at end (clean)
+ model_t = 1.0 - sigma_curr
+ timestep = torch.tensor([model_t], device=device, dtype=inference_dtype).expand(latents.shape[0])
+
+ # Run transformer for positive prediction
+ latent_model_input = latents.to(transformer.dtype)
+ latent_model_input = latent_model_input.unsqueeze(2) # Add frame dimension
+ latent_model_input_list = list(latent_model_input.unbind(dim=0))
+
+ # Determine if control should be applied at this step
+ apply_control = control_extension is not None and control_extension.should_apply(
+ user_step, total_steps
  )
- model_out_list = model_output[0] # Extract list of tensors from tuple

- noise_pred_cond = torch.stack([t.float() for t in model_out_list], dim=0)
- noise_pred_cond = noise_pred_cond.squeeze(2) # Remove frame dimension
- noise_pred_cond = -noise_pred_cond # Z-Image uses v-prediction with negation
+ # Run forward pass
+ if apply_control:
+ model_out_list, _ = z_image_forward_with_control(
+ transformer=transformer,
+ x=latent_model_input_list,
+ t=timestep,
+ cap_feats=[pos_prompt_embeds],
+ control_extension=control_extension,
+ )
+ else:
+ model_output = transformer(
+ x=latent_model_input_list,
+ t=timestep,
+ cap_feats=[pos_prompt_embeds],
+ )
+ model_out_list = model_output[0]
+
+ noise_pred_cond = torch.stack([t.float() for t in model_out_list], dim=0)
+ noise_pred_cond = noise_pred_cond.squeeze(2)
+ noise_pred_cond = -noise_pred_cond # Z-Image uses v-prediction with negation
+
+ # Apply CFG if enabled
+ if do_classifier_free_guidance and neg_prompt_embeds is not None:
+ if apply_control:
+ model_out_list_uncond, _ = z_image_forward_with_control(
+ transformer=transformer,
+ x=latent_model_input_list,
+ t=timestep,
+ cap_feats=[neg_prompt_embeds],
+ control_extension=control_extension,
+ )
+ else:
+ model_output_uncond = transformer(
+ x=latent_model_input_list,
+ t=timestep,
+ cap_feats=[neg_prompt_embeds],
+ )
+ model_out_list_uncond = model_output_uncond[0]
+
+ noise_pred_uncond = torch.stack([t.float() for t in model_out_list_uncond], dim=0)
+ noise_pred_uncond = noise_pred_uncond.squeeze(2)
+ noise_pred_uncond = -noise_pred_uncond
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
+ else:
+ noise_pred = noise_pred_cond
+
+ # Use scheduler.step() for the update
+ step_output = scheduler.step(model_output=noise_pred, timestep=sched_timestep, sample=latents)
+ latents = step_output.prev_sample

- # Apply CFG if enabled
- if do_classifier_free_guidance and neg_prompt_embeds is not None:
+ # Get sigma_prev for inpainting (next sigma value)
+ if step_index + 1 < len(scheduler.sigmas):
+ sigma_prev = scheduler.sigmas[step_index + 1].item()
+ else:
+ sigma_prev = 0.0
+
+ if inpaint_extension is not None:
+ latents = inpaint_extension.merge_intermediate_latents_with_init_latents(latents, sigma_prev)
+
+ # For Heun, only increment user step after second-order step completes
+ if is_heun:
+ if not in_first_order:
+ user_step += 1
+ # Only call step_callback if we haven't exceeded total_steps
+ if user_step <= total_steps:
+ pbar.update(1)
+ step_callback(
+ PipelineIntermediateState(
+ step=user_step,
+ order=2,
+ total_steps=total_steps,
+ timestep=int(sigma_curr * 1000),
+ latents=latents,
+ ),
+ )
+ else:
+ # For LCM and other first-order schedulers
+ user_step += 1
+ # Only call step_callback if we haven't exceeded total_steps
+ # (LCM scheduler may have more internal steps than user-facing steps)
+ if user_step <= total_steps:
+ pbar.update(1)
+ step_callback(
+ PipelineIntermediateState(
+ step=user_step,
+ order=1,
+ total_steps=total_steps,
+ timestep=int(sigma_curr * 1000),
+ latents=latents,
+ ),
+ )
+ pbar.close()
+ else:
+ # Original Euler implementation (default, optimized for Z-Image)
+ for step_idx in tqdm(range(total_steps)):
+ sigma_curr = sigmas[step_idx]
+ sigma_prev = sigmas[step_idx + 1]
+
+ # Timestep tensor for Z-Image model
+ # The model expects t=0 at start (noise) and t=1 at end (clean)
+ # Sigma goes from 1 (noise) to 0 (clean), so model_t = 1 - sigma
+ model_t = 1.0 - sigma_curr
+ timestep = torch.tensor([model_t], device=device, dtype=inference_dtype).expand(latents.shape[0])
+
+ # Run transformer for positive prediction
+ # Z-Image transformer expects: x as list of [C, 1, H, W] tensors, t, cap_feats as list
+ # Prepare latent input: [B, C, H, W] -> [B, C, 1, H, W] -> list of [C, 1, H, W]
+ latent_model_input = latents.to(transformer.dtype)
+ latent_model_input = latent_model_input.unsqueeze(2) # Add frame dimension
+ latent_model_input_list = list(latent_model_input.unbind(dim=0))
+
+ # Determine if control should be applied at this step
+ apply_control = control_extension is not None and control_extension.should_apply(
+ step_idx, total_steps
+ )
+
+ # Run forward pass - use custom forward with control if extension is active
  if apply_control:
- model_out_list_uncond, _ = z_image_forward_with_control(
+ model_out_list, _ = z_image_forward_with_control(
  transformer=transformer,
  x=latent_model_input_list,
  t=timestep,
- cap_feats=[neg_prompt_embeds],
+ cap_feats=[pos_prompt_embeds],
  control_extension=control_extension,
  )
  else:
- model_output_uncond = transformer(
+ model_output = transformer(
  x=latent_model_input_list,
  t=timestep,
- cap_feats=[neg_prompt_embeds],
+ cap_feats=[pos_prompt_embeds],
  )
- model_out_list_uncond = model_output_uncond[0] # Extract list of tensors from tuple
-
- noise_pred_uncond = torch.stack([t.float() for t in model_out_list_uncond], dim=0)
- noise_pred_uncond = noise_pred_uncond.squeeze(2)
- noise_pred_uncond = -noise_pred_uncond
- noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
- else:
- noise_pred = noise_pred_cond
-
- # Euler step
- latents_dtype = latents.dtype
- latents = latents.to(dtype=torch.float32)
- latents = latents + (sigma_prev - sigma_curr) * noise_pred
- latents = latents.to(dtype=latents_dtype)
-
- if inpaint_extension is not None:
- latents = inpaint_extension.merge_intermediate_latents_with_init_latents(latents, sigma_prev)
-
- step_callback(
- PipelineIntermediateState(
- step=step_idx + 1,
- order=1,
- total_steps=total_steps,
- timestep=int(sigma_curr * 1000),
- latents=latents,
- ),
- )
+ model_out_list = model_output[0] # Extract list of tensors from tuple
+
+ noise_pred_cond = torch.stack([t.float() for t in model_out_list], dim=0)
+ noise_pred_cond = noise_pred_cond.squeeze(2) # Remove frame dimension
+ noise_pred_cond = -noise_pred_cond # Z-Image uses v-prediction with negation
+
+ # Apply CFG if enabled
+ if do_classifier_free_guidance and neg_prompt_embeds is not None:
+ if apply_control:
+ model_out_list_uncond, _ = z_image_forward_with_control(
+ transformer=transformer,
+ x=latent_model_input_list,
+ t=timestep,
+ cap_feats=[neg_prompt_embeds],
+ control_extension=control_extension,
+ )
+ else:
+ model_output_uncond = transformer(
+ x=latent_model_input_list,
+ t=timestep,
+ cap_feats=[neg_prompt_embeds],
+ )
+ model_out_list_uncond = model_output_uncond[0] # Extract list of tensors from tuple
+
+ noise_pred_uncond = torch.stack([t.float() for t in model_out_list_uncond], dim=0)
+ noise_pred_uncond = noise_pred_uncond.squeeze(2)
+ noise_pred_uncond = -noise_pred_uncond
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
+ else:
+ noise_pred = noise_pred_cond
+
+ # Euler step
+ latents_dtype = latents.dtype
+ latents = latents.to(dtype=torch.float32)
+ latents = latents + (sigma_prev - sigma_curr) * noise_pred
+ latents = latents.to(dtype=latents_dtype)
+
+ if inpaint_extension is not None:
+ latents = inpaint_extension.merge_intermediate_latents_with_init_latents(latents, sigma_prev)
+
+ step_callback(
+ PipelineIntermediateState(
+ step=step_idx + 1,
+ order=1,
+ total_steps=total_steps,
+ timestep=int(sigma_curr * 1000),
+ latents=latents,
+ ),
+ )

  return latents

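The default branch keeps the original first-order flow-matching update: sigma runs from 1 (pure noise) down to 0 (clean latents), and each step moves the sample along the predicted velocity by the sigma decrement. Restated on its own (same arithmetic as the Euler step above, accumulated in fp32 as the node does):

    def euler_flow_match_step(latents, v_pred, sigma_curr: float, sigma_prev: float):
        # sigma_prev < sigma_curr, so (sigma_prev - sigma_curr) * v_pred walks
        # the latents toward the clean image.
        orig_dtype = latents.dtype
        latents = latents.float() + (sigma_prev - sigma_curr) * v_pred
        return latents.to(orig_dtype)

When a diffusers scheduler is selected instead, the same velocity prediction is handed to scheduler.step(), and the 0-1000 scheduler timestep is divided by num_train_timesteps to recover the 0-1 sigma that the Z-Image transformer expects (it is conditioned on t = 1 - sigma).
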
invokeai/app/invocations/z_image_image_to_latents.py

@@ -20,6 +20,7 @@ from invokeai.backend.flux.modules.autoencoder import AutoEncoder as FluxAutoEnc
  from invokeai.backend.model_manager.load.load_base import LoadedModel
  from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor
  from invokeai.backend.util.devices import TorchDevice
+ from invokeai.backend.util.vae_working_memory import estimate_vae_working_memory_flux

  # Z-Image can use either the Diffusers AutoencoderKL or the FLUX AutoEncoder
  ZImageVAE = Union[AutoencoderKL, FluxAutoEncoder]
@@ -47,7 +48,14 @@ class ZImageImageToLatentsInvocation(BaseInvocation, WithMetadata, WithBoard):
  "Ensure you are using a compatible VAE model."
  )

- with vae_info.model_on_device() as (_, vae):
+ # Estimate working memory needed for VAE encode
+ estimated_working_memory = estimate_vae_working_memory_flux(
+ operation="encode",
+ image_tensor=image_tensor,
+ vae=vae_info.model,
+ )
+
+ with vae_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, vae):
  if not isinstance(vae, (AutoencoderKL, FluxAutoEncoder)):
  raise TypeError(
  f"Expected AutoencoderKL or FluxAutoEncoder, got {type(vae).__name__}. "
invokeai/app/invocations/z_image_latents_to_image.py

@@ -21,6 +21,7 @@ from invokeai.app.services.shared.invocation_context import InvocationContext
  from invokeai.backend.flux.modules.autoencoder import AutoEncoder as FluxAutoEncoder
  from invokeai.backend.stable_diffusion.extensions.seamless import SeamlessExt
  from invokeai.backend.util.devices import TorchDevice
+ from invokeai.backend.util.vae_working_memory import estimate_vae_working_memory_flux

  # Z-Image can use either the Diffusers AutoencoderKL or the FLUX AutoEncoder
  ZImageVAE = Union[AutoencoderKL, FluxAutoEncoder]
@@ -53,12 +54,19 @@ class ZImageLatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard):

  is_flux_vae = isinstance(vae_info.model, FluxAutoEncoder)

+ # Estimate working memory needed for VAE decode
+ estimated_working_memory = estimate_vae_working_memory_flux(
+ operation="decode",
+ image_tensor=latents,
+ vae=vae_info.model,
+ )
+
  # FLUX VAE doesn't support seamless, so only apply for AutoencoderKL
  seamless_context = (
  nullcontext() if is_flux_vae else SeamlessExt.static_patch_model(vae_info.model, self.vae.seamless_axes)
  )

- with seamless_context, vae_info.model_on_device() as (_, vae):
+ with seamless_context, vae_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, vae):
  context.util.signal_progress("Running VAE")
  if not isinstance(vae, (AutoencoderKL, FluxAutoEncoder)):
  raise TypeError(
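
Both Z-Image VAE nodes now pass an explicit working-memory estimate into model_on_device, letting the model cache reserve headroom for encode/decode activations up front instead of hitting an OOM mid-operation. The actual estimate_vae_working_memory_flux helper lives in invokeai/backend/util/vae_working_memory.py and its formula is not shown here; as a rough, purely illustrative stand-in, such an estimate scales with the output tensor size plus an activation overhead factor:

    import torch

    def rough_decode_working_memory_bytes(latents: torch.Tensor, scale_factor: int = 8,
                                          out_channels: int = 3, overhead: float = 2.0) -> int:
        """Illustrative only; not the real estimate_vae_working_memory_flux."""
        b, _, h, w = latents.shape
        out_elems = b * out_channels * (h * scale_factor) * (w * scale_factor)
        return int(out_elems * 4 * overhead)  # fp32 output plus activation headroom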
invokeai/app/invocations/z_image_seed_variance_enhancer.py (new file)

@@ -0,0 +1,110 @@
+ import torch
+
+ from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
+ from invokeai.app.invocations.fields import (
+ FieldDescriptions,
+ Input,
+ InputField,
+ ZImageConditioningField,
+ )
+ from invokeai.app.invocations.primitives import ZImageConditioningOutput
+ from invokeai.app.services.shared.invocation_context import InvocationContext
+ from invokeai.backend.stable_diffusion.diffusion.conditioning_data import (
+ ConditioningFieldData,
+ ZImageConditioningInfo,
+ )
+
+
+ @invocation(
+ "z_image_seed_variance_enhancer",
+ title="Seed Variance Enhancer - Z-Image",
+ tags=["conditioning", "z-image", "variance", "seed"],
+ category="conditioning",
+ version="1.0.0",
+ classification=Classification.Prototype,
+ )
+ class ZImageSeedVarianceEnhancerInvocation(BaseInvocation):
+ """Adds seed-based noise to Z-Image conditioning to increase variance between seeds.
+
+ Z-Image-Turbo can produce relatively similar images with different seeds,
+ making it harder to explore variations of a prompt. This node implements
+ reproducible, seed-based noise injection into text embeddings to increase
+ visual variation while maintaining reproducibility.
+
+ The noise strength is auto-calibrated relative to the embedding's standard
+ deviation, ensuring consistent results across different prompts.
+ """
+
+ conditioning: ZImageConditioningField = InputField(
+ description=FieldDescriptions.cond,
+ input=Input.Connection,
+ title="Conditioning",
+ )
+ seed: int = InputField(
+ default=0,
+ ge=0,
+ description="Seed for reproducible noise generation. Different seeds produce different noise patterns.",
+ )
+ strength: float = InputField(
+ default=0.1,
+ ge=0.0,
+ le=2.0,
+ description="Noise strength as multiplier of embedding std. 0=off, 0.1=subtle, 0.5=strong.",
+ )
+ randomize_percent: float = InputField(
+ default=50.0,
+ ge=1.0,
+ le=100.0,
+ description="Percentage of embedding values to add noise to (1-100). Lower values create more selective noise patterns.",
+ )
+
+ @torch.no_grad()
+ def invoke(self, context: InvocationContext) -> ZImageConditioningOutput:
+ # Load conditioning data
+ cond_data = context.conditioning.load(self.conditioning.conditioning_name)
+ assert len(cond_data.conditionings) == 1, "Expected exactly one conditioning tensor"
+ z_image_conditioning = cond_data.conditionings[0]
+ assert isinstance(z_image_conditioning, ZImageConditioningInfo), "Expected ZImageConditioningInfo"
+
+ # Early return if strength is zero (no modification needed)
+ if self.strength == 0:
+ return ZImageConditioningOutput(conditioning=self.conditioning)
+
+ # Clone embeddings to avoid modifying the original
+ prompt_embeds = z_image_conditioning.prompt_embeds.clone()
+
+ # Calculate actual noise strength based on embedding statistics
+ # This auto-calibration ensures consistent results across different prompts
+ embed_std = torch.std(prompt_embeds).item()
+ actual_strength = self.strength * embed_std
+
+ # Generate deterministic noise using the seed
+ generator = torch.Generator(device=prompt_embeds.device)
+ generator.manual_seed(self.seed)
+ noise = torch.rand(
+ prompt_embeds.shape, generator=generator, device=prompt_embeds.device, dtype=prompt_embeds.dtype
+ )
+ noise = noise * 2 - 1 # Scale to [-1, 1)
+ noise = noise * actual_strength
+
+ # Create selective mask for noise application
+ generator.manual_seed(self.seed + 1)
+ noise_mask = torch.bernoulli(
+ torch.ones_like(prompt_embeds) * (self.randomize_percent / 100.0),
+ generator=generator,
+ ).bool()
+
+ # Apply noise only to masked positions
+ prompt_embeds = prompt_embeds + (noise * noise_mask)
+
+ # Save modified conditioning
+ new_conditioning = ZImageConditioningInfo(prompt_embeds=prompt_embeds)
+ conditioning_data = ConditioningFieldData(conditionings=[new_conditioning])
+ conditioning_name = context.conditioning.save(conditioning_data)
+
+ return ZImageConditioningOutput(
+ conditioning=ZImageConditioningField(
+ conditioning_name=conditioning_name,
+ mask=self.conditioning.mask,
+ )
+ )
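
Stripped of the invocation plumbing, the enhancer's math is just a few tensor operations; the following standalone restatement mirrors the code above:

    import torch

    def perturb_embeddings(prompt_embeds: torch.Tensor, seed: int,
                           strength: float = 0.1, randomize_percent: float = 50.0) -> torch.Tensor:
        # Calibrate the noise amplitude against the embedding's own std so the
        # effect is comparable across prompts.
        actual_strength = strength * prompt_embeds.std().item()

        gen = torch.Generator(device=prompt_embeds.device)

        # Deterministic uniform noise in [-1, 1), scaled by the calibrated strength.
        gen.manual_seed(seed)
        noise = torch.rand(prompt_embeds.shape, generator=gen,
                           device=prompt_embeds.device, dtype=prompt_embeds.dtype)
        noise = (noise * 2 - 1) * actual_strength

        # Bernoulli mask chooses which fraction of values receive noise.
        gen.manual_seed(seed + 1)
        mask = torch.bernoulli(torch.ones_like(prompt_embeds) * (randomize_percent / 100.0),
                               generator=gen).bool()

        return prompt_embeds + noise * mask

At the default strength of 0.1 the perturbation stays small relative to the embedding distribution, which is how the node raises seed-to-seed variety without drifting away from the prompt.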
invokeai/app/services/config/config_default.py

@@ -85,6 +85,7 @@ class InvokeAIAppConfig(BaseSettings):
  max_cache_ram_gb: The maximum amount of CPU RAM to use for model caching in GB. If unset, the limit will be configured based on the available RAM. In most cases, it is recommended to leave this unset.
  max_cache_vram_gb: The amount of VRAM to use for model caching in GB. If unset, the limit will be configured based on the available VRAM and the device_working_mem_gb. In most cases, it is recommended to leave this unset.
  log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.
+ model_cache_keep_alive_min: How long to keep models in cache after last use, in minutes. A value of 0 (the default) means models are kept in cache indefinitely. If no model generations occur within the timeout period, the model cache is cleared using the same logic as the 'Clear Model Cache' button.
  device_working_mem_gb: The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.
  enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.
  keep_ram_copy_of_weights: Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high.
@@ -165,9 +166,10 @@
  max_cache_ram_gb: Optional[float] = Field(default=None, gt=0, description="The maximum amount of CPU RAM to use for model caching in GB. If unset, the limit will be configured based on the available RAM. In most cases, it is recommended to leave this unset.")
  max_cache_vram_gb: Optional[float] = Field(default=None, ge=0, description="The amount of VRAM to use for model caching in GB. If unset, the limit will be configured based on the available VRAM and the device_working_mem_gb. In most cases, it is recommended to leave this unset.")
  log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.")
+ model_cache_keep_alive_min: float = Field(default=0, ge=0, description="How long to keep models in cache after last use, in minutes. A value of 0 (the default) means models are kept in cache indefinitely. If no model generations occur within the timeout period, the model cache is cleared using the same logic as the 'Clear Model Cache' button.")
  device_working_mem_gb: float = Field(default=3, description="The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.")
  enable_partial_loading: bool = Field(default=False, description="Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.")
- keep_ram_copy_of_weights: bool = Field(default=True, description="Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high.")
+ keep_ram_copy_of_weights: bool = Field(default=True, description="Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high.")
  # Deprecated CACHE configs
  ram: Optional[float] = Field(default=None, gt=0, description="DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_ram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.")
  vram: Optional[float] = Field(default=None, ge=0, description="DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_vram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.")
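
The new setting is opt-in: leaving model_cache_keep_alive_min at its default of 0 keeps the existing behaviour of holding models in the cache indefinitely, while setting it to, say, 30 in invokeai.yaml frees the cache after 30 minutes without a generation, using the same logic as the web UI's 'Clear Model Cache' button.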
invokeai/app/services/invocation_stats/invocation_stats_common.py

@@ -14,7 +14,7 @@ class NodeExecutionStatsSummary:
  node_type: str
  num_calls: int
  time_used_seconds: float
- peak_vram_gb: float
+ delta_vram_gb: float


  @dataclass
@@ -58,10 +58,10 @@ class InvocationStatsSummary:
  def __str__(self) -> str:
  _str = ""
  _str = f"Graph stats: {self.graph_stats.graph_execution_state_id}\n"
- _str += f"{'Node':>30} {'Calls':>7} {'Seconds':>9} {'VRAM Used':>10}\n"
+ _str += f"{'Node':>30} {'Calls':>7} {'Seconds':>9} {'VRAM Change':+>10}\n"

  for summary in self.node_stats:
- _str += f"{summary.node_type:>30} {summary.num_calls:>7} {summary.time_used_seconds:>8.3f}s {summary.peak_vram_gb:>9.3f}G\n"
+ _str += f"{summary.node_type:>30} {summary.num_calls:>7} {summary.time_used_seconds:>8.3f}s {summary.delta_vram_gb:+10.3f}G\n"

  _str += f"TOTAL GRAPH EXECUTION TIME: {self.graph_stats.execution_time_seconds:7.3f}s\n"

@@ -100,7 +100,7 @@ class NodeExecutionStats:
  start_ram_gb: float # GB
  end_ram_gb: float # GB

- peak_vram_gb: float # GB
+ delta_vram_gb: float # GB

  def total_time(self) -> float:
  return self.end_time - self.start_time
@@ -174,9 +174,9 @@ class GraphExecutionStats:
  for node_type, node_type_stats_list in node_stats_by_type.items():
  num_calls = len(node_type_stats_list)
  time_used = sum([n.total_time() for n in node_type_stats_list])
- peak_vram = max([n.peak_vram_gb for n in node_type_stats_list])
+ delta_vram = max([n.delta_vram_gb for n in node_type_stats_list])
  summary = NodeExecutionStatsSummary(
- node_type=node_type, num_calls=num_calls, time_used_seconds=time_used, peak_vram_gb=peak_vram
+ node_type=node_type, num_calls=num_calls, time_used_seconds=time_used, delta_vram_gb=delta_vram
  )
  summaries.append(summary)
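
The rename from peak_vram_gb to delta_vram_gb (and the 'VRAM Used' to 'VRAM Change' column header) means the stats table now reports how much allocated VRAM each node added rather than its peak usage. The companion change in invocation_stats_default.py is not included in this excerpt; a generic way to capture such a delta with PyTorch (an assumption about the approach, not the actual implementation) is:

    import torch

    def vram_delta_gb(fn) -> float:
        """Run fn() and report the change in allocated CUDA memory, in GB."""
        if not torch.cuda.is_available():
            fn()
            return 0.0
        before = torch.cuda.memory_allocated()
        fn()
        torch.cuda.synchronize()
        return (torch.cuda.memory_allocated() - before) / 1e9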