InvokeAI 6.10.0rc1__py3-none-any.whl → 6.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invokeai/app/api/routers/model_manager.py +43 -1
- invokeai/app/invocations/fields.py +1 -1
- invokeai/app/invocations/flux2_denoise.py +499 -0
- invokeai/app/invocations/flux2_klein_model_loader.py +222 -0
- invokeai/app/invocations/flux2_klein_text_encoder.py +222 -0
- invokeai/app/invocations/flux2_vae_decode.py +106 -0
- invokeai/app/invocations/flux2_vae_encode.py +88 -0
- invokeai/app/invocations/flux_denoise.py +77 -3
- invokeai/app/invocations/flux_lora_loader.py +1 -1
- invokeai/app/invocations/flux_model_loader.py +2 -5
- invokeai/app/invocations/ideal_size.py +6 -1
- invokeai/app/invocations/metadata.py +4 -0
- invokeai/app/invocations/metadata_linked.py +47 -0
- invokeai/app/invocations/model.py +1 -0
- invokeai/app/invocations/pbr_maps.py +59 -0
- invokeai/app/invocations/z_image_denoise.py +244 -84
- invokeai/app/invocations/z_image_image_to_latents.py +9 -1
- invokeai/app/invocations/z_image_latents_to_image.py +9 -1
- invokeai/app/invocations/z_image_seed_variance_enhancer.py +110 -0
- invokeai/app/services/config/config_default.py +3 -1
- invokeai/app/services/invocation_stats/invocation_stats_common.py +6 -6
- invokeai/app/services/invocation_stats/invocation_stats_default.py +9 -4
- invokeai/app/services/model_manager/model_manager_default.py +7 -0
- invokeai/app/services/model_records/model_records_base.py +4 -2
- invokeai/app/services/shared/invocation_context.py +15 -0
- invokeai/app/services/shared/sqlite/sqlite_util.py +2 -0
- invokeai/app/services/shared/sqlite_migrator/migrations/migration_25.py +61 -0
- invokeai/app/util/step_callback.py +58 -2
- invokeai/backend/flux/denoise.py +338 -118
- invokeai/backend/flux/dype/__init__.py +31 -0
- invokeai/backend/flux/dype/base.py +260 -0
- invokeai/backend/flux/dype/embed.py +116 -0
- invokeai/backend/flux/dype/presets.py +148 -0
- invokeai/backend/flux/dype/rope.py +110 -0
- invokeai/backend/flux/extensions/dype_extension.py +91 -0
- invokeai/backend/flux/schedulers.py +62 -0
- invokeai/backend/flux/util.py +35 -1
- invokeai/backend/flux2/__init__.py +4 -0
- invokeai/backend/flux2/denoise.py +280 -0
- invokeai/backend/flux2/ref_image_extension.py +294 -0
- invokeai/backend/flux2/sampling_utils.py +209 -0
- invokeai/backend/image_util/pbr_maps/architecture/block.py +367 -0
- invokeai/backend/image_util/pbr_maps/architecture/pbr_rrdb_net.py +70 -0
- invokeai/backend/image_util/pbr_maps/pbr_maps.py +141 -0
- invokeai/backend/image_util/pbr_maps/utils/image_ops.py +93 -0
- invokeai/backend/model_manager/configs/factory.py +19 -1
- invokeai/backend/model_manager/configs/lora.py +36 -0
- invokeai/backend/model_manager/configs/main.py +395 -3
- invokeai/backend/model_manager/configs/qwen3_encoder.py +116 -7
- invokeai/backend/model_manager/configs/vae.py +104 -2
- invokeai/backend/model_manager/load/model_cache/model_cache.py +107 -2
- invokeai/backend/model_manager/load/model_loaders/cogview4.py +2 -1
- invokeai/backend/model_manager/load/model_loaders/flux.py +1020 -8
- invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py +4 -2
- invokeai/backend/model_manager/load/model_loaders/onnx.py +1 -0
- invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py +2 -1
- invokeai/backend/model_manager/load/model_loaders/z_image.py +158 -31
- invokeai/backend/model_manager/starter_models.py +141 -4
- invokeai/backend/model_manager/taxonomy.py +31 -4
- invokeai/backend/model_manager/util/select_hf_files.py +3 -2
- invokeai/backend/patches/lora_conversions/z_image_lora_conversion_utils.py +39 -5
- invokeai/backend/quantization/gguf/ggml_tensor.py +15 -4
- invokeai/backend/util/vae_working_memory.py +0 -2
- invokeai/backend/z_image/extensions/regional_prompting_extension.py +10 -12
- invokeai/frontend/web/dist/assets/App-D13dX7be.js +161 -0
- invokeai/frontend/web/dist/assets/{browser-ponyfill-DHZxq1nk.js → browser-ponyfill-u_ZjhQTI.js} +1 -1
- invokeai/frontend/web/dist/assets/index-BB0nHmDe.js +530 -0
- invokeai/frontend/web/dist/index.html +1 -1
- invokeai/frontend/web/dist/locales/en-GB.json +1 -0
- invokeai/frontend/web/dist/locales/en.json +85 -6
- invokeai/frontend/web/dist/locales/it.json +135 -15
- invokeai/frontend/web/dist/locales/ru.json +11 -11
- invokeai/version/invokeai_version.py +1 -1
- {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/METADATA +8 -2
- {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/RECORD +81 -57
- {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/WHEEL +1 -1
- invokeai/frontend/web/dist/assets/App-CYhlZO3Q.js +0 -161
- invokeai/frontend/web/dist/assets/index-dgSJAY--.js +0 -530
- {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/entry_points.txt +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/licenses/LICENSE +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
- {invokeai-6.10.0rc1.dist-info → invokeai-6.11.0.dist-info}/top_level.txt +0 -0
invokeai/app/invocations/z_image_denoise.py

@@ -1,3 +1,4 @@
+import inspect
 import math
 from contextlib import ExitStack
 from typing import Callable, Iterator, Optional, Tuple
@@ -5,6 +6,7 @@ from typing import Callable, Iterator, Optional, Tuple
 import einops
 import torch
 import torchvision.transforms as tv_transforms
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
 from PIL import Image
 from torchvision.transforms.functional import resize as tv_resize
 from tqdm import tqdm
@@ -24,6 +26,7 @@ from invokeai.app.invocations.primitives import LatentsOutput
 from invokeai.app.invocations.z_image_control import ZImageControlField
 from invokeai.app.invocations.z_image_image_to_latents import ZImageImageToLatentsInvocation
 from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.flux.schedulers import ZIMAGE_SCHEDULER_LABELS, ZIMAGE_SCHEDULER_MAP, ZIMAGE_SCHEDULER_NAME_VALUES
 from invokeai.backend.model_manager.taxonomy import BaseModelType, ModelFormat
 from invokeai.backend.patches.layer_patcher import LayerPatcher
 from invokeai.backend.patches.lora_conversions.z_image_lora_constants import Z_IMAGE_LORA_TRANSFORMER_PREFIX
@@ -47,7 +50,7 @@ from invokeai.backend.z_image.z_image_transformer_patch import patch_transformer
     title="Denoise - Z-Image",
     tags=["image", "z-image"],
     category="image",
-    version="1.
+    version="1.4.0",
     classification=Classification.Prototype,
 )
 class ZImageDenoiseInvocation(BaseInvocation):
@@ -66,6 +69,7 @@ class ZImageDenoiseInvocation(BaseInvocation):
     )
     denoising_start: float = InputField(default=0.0, ge=0, le=1, description=FieldDescriptions.denoising_start)
     denoising_end: float = InputField(default=1.0, ge=0, le=1, description=FieldDescriptions.denoising_end)
+    add_noise: bool = InputField(default=True, description="Add noise based on denoising start.")
     transformer: TransformerField = InputField(
         description=FieldDescriptions.z_image_model, input=Input.Connection, title="Transformer"
     )
@@ -100,6 +104,13 @@ class ZImageDenoiseInvocation(BaseInvocation):
         description=FieldDescriptions.vae + " Required for control conditioning.",
         input=Input.Connection,
     )
+    # Scheduler selection for the denoising process
+    scheduler: ZIMAGE_SCHEDULER_NAME_VALUES = InputField(
+        default="euler",
+        description="Scheduler (sampler) for the denoising process. Euler is the default and recommended for "
+        "Z-Image-Turbo. Heun is 2nd-order (better quality, 2x slower). LCM is optimized for few steps.",
+        ui_choice_labels=ZIMAGE_SCHEDULER_LABELS,
+    )
 
     @torch.no_grad()
     def invoke(self, context: InvocationContext) -> LatentsOutput:
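For context, the `ZIMAGE_SCHEDULER_*` symbols come from the new `invokeai/backend/flux/schedulers.py` module added in this release (+62 lines), which is not shown in full here. Below is a minimal sketch of the kind of registry it could expose; the exact names, labels, and scheduler classes are assumptions, not the module's actual contents.

# Hypothetical sketch only; the real invokeai/backend/flux/schedulers.py may differ.
from typing import Literal

from diffusers import FlowMatchEulerDiscreteScheduler, FlowMatchHeunDiscreteScheduler

ZIMAGE_SCHEDULER_NAME_VALUES = Literal["euler", "heun", "lcm"]

# "euler" is handled by the node's built-in loop, so it may not need a diffusers class at all.
ZIMAGE_SCHEDULER_MAP = {
    "euler": FlowMatchEulerDiscreteScheduler,
    "heun": FlowMatchHeunDiscreteScheduler,
    # "lcm" would map to an LCM-style scheduler class (omitted here).
}

ZIMAGE_SCHEDULER_LABELS = {
    "euler": "Euler (default)",
    "heun": "Heun (2nd order)",
    "lcm": "LCM",
}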
@@ -337,8 +348,12 @@
 
         # Prepare input latent image
         if init_latents is not None:
-
-
+            if self.add_noise:
+                # Noise the init_latents by the appropriate amount for the first timestep.
+                s_0 = sigmas[0]
+                latents = s_0 * noise + (1.0 - s_0) * init_latents
+            else:
+                latents = init_latents
         else:
             if self.denoising_start > 1e-5:
                 raise ValueError("denoising_start should be 0 when initial latents are not provided.")
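The new add_noise branch blends the encoded init latents toward pure noise using the first sigma, the usual flow-matching interpolation latents = s_0 * noise + (1 - s_0) * init_latents. A standalone illustration is below; the tensor shapes and the sigma value are made up for the example.

import torch

# Illustrative shapes only; the real latent shape depends on the model and image size.
noise = torch.randn(1, 16, 64, 64)
init_latents = torch.randn(1, 16, 64, 64)

# With denoising_start > 0, sigmas[0] < 1.0, so part of the init image is preserved.
s_0 = 0.7  # roughly what a denoising_start around 0.3 would leave
latents = s_0 * noise + (1.0 - s_0) * init_latents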
@@ -361,15 +376,32 @@
         )
 
         step_callback = self._build_step_callback(context)
-
-
-
-
-
-
-
-
-
+
+        # Initialize the diffusers scheduler if not using built-in Euler
+        scheduler: SchedulerMixin | None = None
+        use_scheduler = self.scheduler != "euler"
+
+        if use_scheduler:
+            scheduler_class = ZIMAGE_SCHEDULER_MAP[self.scheduler]
+            scheduler = scheduler_class(
+                num_train_timesteps=1000,
+                shift=1.0,
+            )
+            # Set timesteps - LCM should use num_inference_steps (it has its own sigma schedule),
+            # while other schedulers can use custom sigmas if supported
+            is_lcm = self.scheduler == "lcm"
+            set_timesteps_sig = inspect.signature(scheduler.set_timesteps)
+            if not is_lcm and "sigmas" in set_timesteps_sig.parameters:
+                # Convert sigmas list to tensor for scheduler
+                scheduler.set_timesteps(sigmas=sigmas, device=device)
+            else:
+                # LCM or scheduler doesn't support custom sigmas - use num_inference_steps
+                scheduler.set_timesteps(num_inference_steps=total_steps, device=device)
+
+            # For Heun scheduler, the number of actual steps may differ
+            num_scheduler_steps = len(scheduler.timesteps)
+        else:
+            num_scheduler_steps = total_steps
 
         with ExitStack() as exit_stack:
             # Get transformer config to determine if it's quantized
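The set_timesteps handling above uses signature inspection so that schedulers which accept a custom sigmas argument get the precomputed schedule, while LCM (and anything else without that parameter) falls back to num_inference_steps. A minimal sketch of that pattern, with a hypothetical helper name of our own:

import inspect

def configure_scheduler_timesteps(scheduler, sigmas, total_steps, device):
    """Pass custom sigmas only when the scheduler's set_timesteps() supports them."""
    params = inspect.signature(scheduler.set_timesteps).parameters
    if "sigmas" in params:
        scheduler.set_timesteps(sigmas=sigmas, device=device)
    else:
        scheduler.set_timesteps(num_inference_steps=total_steps, device=device)
    # May exceed total_steps for 2nd-order schedulers like Heun.
    return len(scheduler.timesteps)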
@@ -503,91 +535,219 @@
                 )
             )
 
-            # Denoising loop
-            for
-
-
-
-                #
-                #
-                #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                model_output = transformer(
-                    x=latent_model_input_list,
-                    t=timestep,
-                    cap_feats=[pos_prompt_embeds],
+            # Denoising loop - supports both built-in Euler and diffusers schedulers
+            # Track user-facing step for progress (accounts for Heun's double steps)
+            user_step = 0
+
+            if use_scheduler and scheduler is not None:
+                # Use diffusers scheduler for stepping
+                # Use tqdm with total_steps (user-facing steps) not num_scheduler_steps (internal steps)
+                # This ensures progress bar shows 1/8, 2/8, etc. even when scheduler uses more internal steps
+                pbar = tqdm(total=total_steps, desc="Denoising")
+                for step_index in range(num_scheduler_steps):
+                    sched_timestep = scheduler.timesteps[step_index]
+                    # Convert scheduler timestep (0-1000) to normalized sigma (0-1)
+                    sigma_curr = sched_timestep.item() / scheduler.config.num_train_timesteps
+
+                    # For Heun scheduler, track if we're in first or second order step
+                    is_heun = hasattr(scheduler, "state_in_first_order")
+                    in_first_order = scheduler.state_in_first_order if is_heun else True
+
+                    # Timestep tensor for Z-Image model
+                    # The model expects t=0 at start (noise) and t=1 at end (clean)
+                    model_t = 1.0 - sigma_curr
+                    timestep = torch.tensor([model_t], device=device, dtype=inference_dtype).expand(latents.shape[0])
+
+                    # Run transformer for positive prediction
+                    latent_model_input = latents.to(transformer.dtype)
+                    latent_model_input = latent_model_input.unsqueeze(2)  # Add frame dimension
+                    latent_model_input_list = list(latent_model_input.unbind(dim=0))
+
+                    # Determine if control should be applied at this step
+                    apply_control = control_extension is not None and control_extension.should_apply(
+                        user_step, total_steps
                     )
-            model_out_list = model_output[0]  # Extract list of tensors from tuple
 
-
-
-
+                    # Run forward pass
+                    if apply_control:
+                        model_out_list, _ = z_image_forward_with_control(
+                            transformer=transformer,
+                            x=latent_model_input_list,
+                            t=timestep,
+                            cap_feats=[pos_prompt_embeds],
+                            control_extension=control_extension,
+                        )
+                    else:
+                        model_output = transformer(
+                            x=latent_model_input_list,
+                            t=timestep,
+                            cap_feats=[pos_prompt_embeds],
+                        )
+                        model_out_list = model_output[0]
+
+                    noise_pred_cond = torch.stack([t.float() for t in model_out_list], dim=0)
+                    noise_pred_cond = noise_pred_cond.squeeze(2)
+                    noise_pred_cond = -noise_pred_cond  # Z-Image uses v-prediction with negation
+
+                    # Apply CFG if enabled
+                    if do_classifier_free_guidance and neg_prompt_embeds is not None:
+                        if apply_control:
+                            model_out_list_uncond, _ = z_image_forward_with_control(
+                                transformer=transformer,
+                                x=latent_model_input_list,
+                                t=timestep,
+                                cap_feats=[neg_prompt_embeds],
+                                control_extension=control_extension,
+                            )
+                        else:
+                            model_output_uncond = transformer(
+                                x=latent_model_input_list,
+                                t=timestep,
+                                cap_feats=[neg_prompt_embeds],
+                            )
+                            model_out_list_uncond = model_output_uncond[0]
+
+                        noise_pred_uncond = torch.stack([t.float() for t in model_out_list_uncond], dim=0)
+                        noise_pred_uncond = noise_pred_uncond.squeeze(2)
+                        noise_pred_uncond = -noise_pred_uncond
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
+                    else:
+                        noise_pred = noise_pred_cond
+
+                    # Use scheduler.step() for the update
+                    step_output = scheduler.step(model_output=noise_pred, timestep=sched_timestep, sample=latents)
+                    latents = step_output.prev_sample
 
-
-
+                    # Get sigma_prev for inpainting (next sigma value)
+                    if step_index + 1 < len(scheduler.sigmas):
+                        sigma_prev = scheduler.sigmas[step_index + 1].item()
+                    else:
+                        sigma_prev = 0.0
+
+                    if inpaint_extension is not None:
+                        latents = inpaint_extension.merge_intermediate_latents_with_init_latents(latents, sigma_prev)
+
+                    # For Heun, only increment user step after second-order step completes
+                    if is_heun:
+                        if not in_first_order:
+                            user_step += 1
+                            # Only call step_callback if we haven't exceeded total_steps
+                            if user_step <= total_steps:
+                                pbar.update(1)
+                                step_callback(
+                                    PipelineIntermediateState(
+                                        step=user_step,
+                                        order=2,
+                                        total_steps=total_steps,
+                                        timestep=int(sigma_curr * 1000),
+                                        latents=latents,
+                                    ),
+                                )
+                    else:
+                        # For LCM and other first-order schedulers
+                        user_step += 1
+                        # Only call step_callback if we haven't exceeded total_steps
+                        # (LCM scheduler may have more internal steps than user-facing steps)
+                        if user_step <= total_steps:
+                            pbar.update(1)
+                            step_callback(
+                                PipelineIntermediateState(
+                                    step=user_step,
+                                    order=1,
+                                    total_steps=total_steps,
+                                    timestep=int(sigma_curr * 1000),
+                                    latents=latents,
+                                ),
+                            )
+                pbar.close()
+            else:
+                # Original Euler implementation (default, optimized for Z-Image)
+                for step_idx in tqdm(range(total_steps)):
+                    sigma_curr = sigmas[step_idx]
+                    sigma_prev = sigmas[step_idx + 1]
+
+                    # Timestep tensor for Z-Image model
+                    # The model expects t=0 at start (noise) and t=1 at end (clean)
+                    # Sigma goes from 1 (noise) to 0 (clean), so model_t = 1 - sigma
+                    model_t = 1.0 - sigma_curr
+                    timestep = torch.tensor([model_t], device=device, dtype=inference_dtype).expand(latents.shape[0])
+
+                    # Run transformer for positive prediction
+                    # Z-Image transformer expects: x as list of [C, 1, H, W] tensors, t, cap_feats as list
+                    # Prepare latent input: [B, C, H, W] -> [B, C, 1, H, W] -> list of [C, 1, H, W]
+                    latent_model_input = latents.to(transformer.dtype)
+                    latent_model_input = latent_model_input.unsqueeze(2)  # Add frame dimension
+                    latent_model_input_list = list(latent_model_input.unbind(dim=0))
+
+                    # Determine if control should be applied at this step
+                    apply_control = control_extension is not None and control_extension.should_apply(
+                        step_idx, total_steps
+                    )
+
+                    # Run forward pass - use custom forward with control if extension is active
                     if apply_control:
-
+                        model_out_list, _ = z_image_forward_with_control(
                             transformer=transformer,
                             x=latent_model_input_list,
                             t=timestep,
-                            cap_feats=[
+                            cap_feats=[pos_prompt_embeds],
                             control_extension=control_extension,
                         )
                     else:
-
+                        model_output = transformer(
                             x=latent_model_input_list,
                             t=timestep,
-                            cap_feats=[
+                            cap_feats=[pos_prompt_embeds],
                         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        model_out_list = model_output[0]  # Extract list of tensors from tuple
+
+                    noise_pred_cond = torch.stack([t.float() for t in model_out_list], dim=0)
+                    noise_pred_cond = noise_pred_cond.squeeze(2)  # Remove frame dimension
+                    noise_pred_cond = -noise_pred_cond  # Z-Image uses v-prediction with negation
+
+                    # Apply CFG if enabled
+                    if do_classifier_free_guidance and neg_prompt_embeds is not None:
+                        if apply_control:
+                            model_out_list_uncond, _ = z_image_forward_with_control(
+                                transformer=transformer,
+                                x=latent_model_input_list,
+                                t=timestep,
+                                cap_feats=[neg_prompt_embeds],
+                                control_extension=control_extension,
+                            )
+                        else:
+                            model_output_uncond = transformer(
+                                x=latent_model_input_list,
+                                t=timestep,
+                                cap_feats=[neg_prompt_embeds],
+                            )
+                            model_out_list_uncond = model_output_uncond[0]  # Extract list of tensors from tuple
+
+                        noise_pred_uncond = torch.stack([t.float() for t in model_out_list_uncond], dim=0)
+                        noise_pred_uncond = noise_pred_uncond.squeeze(2)
+                        noise_pred_uncond = -noise_pred_uncond
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
+                    else:
+                        noise_pred = noise_pred_cond
+
+                    # Euler step
+                    latents_dtype = latents.dtype
+                    latents = latents.to(dtype=torch.float32)
+                    latents = latents + (sigma_prev - sigma_curr) * noise_pred
+                    latents = latents.to(dtype=latents_dtype)
+
+                    if inpaint_extension is not None:
+                        latents = inpaint_extension.merge_intermediate_latents_with_init_latents(latents, sigma_prev)
+
+                    step_callback(
+                        PipelineIntermediateState(
+                            step=step_idx + 1,
+                            order=1,
+                            total_steps=total_steps,
+                            timestep=int(sigma_curr * 1000),
+                            latents=latents,
+                        ),
+                    )
 
         return latents
 
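Both branches of the loop share the same two update rules: classifier-free guidance to combine the conditional and unconditional predictions, and (in the built-in path) a first-order Euler step in sigma space. Extracted here as a sketch for reference; the function names are ours, not the module's.

import torch

def cfg_combine(noise_pred_cond: torch.Tensor, noise_pred_uncond: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    # Standard classifier-free guidance: push the prediction away from the unconditional direction.
    return noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

def euler_step(latents: torch.Tensor, noise_pred: torch.Tensor, sigma_curr: float, sigma_prev: float) -> torch.Tensor:
    # Sigma decreases toward 0, so the step moves the sample along the (negated) velocity prediction.
    return latents + (sigma_prev - sigma_curr) * noise_pred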
invokeai/app/invocations/z_image_image_to_latents.py

@@ -20,6 +20,7 @@ from invokeai.backend.flux.modules.autoencoder import AutoEncoder as FluxAutoEnc
 from invokeai.backend.model_manager.load.load_base import LoadedModel
 from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor
 from invokeai.backend.util.devices import TorchDevice
+from invokeai.backend.util.vae_working_memory import estimate_vae_working_memory_flux
 
 # Z-Image can use either the Diffusers AutoencoderKL or the FLUX AutoEncoder
 ZImageVAE = Union[AutoencoderKL, FluxAutoEncoder]
@@ -47,7 +48,14 @@ class ZImageImageToLatentsInvocation(BaseInvocation, WithMetadata, WithBoard):
                 "Ensure you are using a compatible VAE model."
             )
 
-
+        # Estimate working memory needed for VAE encode
+        estimated_working_memory = estimate_vae_working_memory_flux(
+            operation="encode",
+            image_tensor=image_tensor,
+            vae=vae_info.model,
+        )
+
+        with vae_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, vae):
             if not isinstance(vae, (AutoencoderKL, FluxAutoEncoder)):
                 raise TypeError(
                     f"Expected AutoencoderKL or FluxAutoEncoder, got {type(vae).__name__}. "
invokeai/app/invocations/z_image_latents_to_image.py

@@ -21,6 +21,7 @@ from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.backend.flux.modules.autoencoder import AutoEncoder as FluxAutoEncoder
 from invokeai.backend.stable_diffusion.extensions.seamless import SeamlessExt
 from invokeai.backend.util.devices import TorchDevice
+from invokeai.backend.util.vae_working_memory import estimate_vae_working_memory_flux
 
 # Z-Image can use either the Diffusers AutoencoderKL or the FLUX AutoEncoder
 ZImageVAE = Union[AutoencoderKL, FluxAutoEncoder]
@@ -53,12 +54,19 @@ class ZImageLatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
 
         is_flux_vae = isinstance(vae_info.model, FluxAutoEncoder)
 
+        # Estimate working memory needed for VAE decode
+        estimated_working_memory = estimate_vae_working_memory_flux(
+            operation="decode",
+            image_tensor=latents,
+            vae=vae_info.model,
+        )
+
         # FLUX VAE doesn't support seamless, so only apply for AutoencoderKL
         seamless_context = (
             nullcontext() if is_flux_vae else SeamlessExt.static_patch_model(vae_info.model, self.vae.seamless_axes)
         )
 
-        with seamless_context, vae_info.model_on_device() as (_, vae):
+        with seamless_context, vae_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, vae):
             context.util.signal_progress("Running VAE")
             if not isinstance(vae, (AutoencoderKL, FluxAutoEncoder)):
                 raise TypeError(
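Both Z-Image VAE nodes now reserve working memory before loading the VAE, so partial loading leaves headroom for the encode/decode activations. The real estimator is estimate_vae_working_memory_flux in invokeai/backend/util/vae_working_memory.py, which is not shown here; purely as an intuition for what such an estimate scales with (not the actual implementation), a back-of-envelope version might look like this:

def rough_vae_decode_working_memory_bytes(latents, element_size: int = 4, scale_factor: int = 8, fudge: float = 2.0) -> int:
    # Rough heuristic: decoded RGB output plus intermediate activations, assumed float32,
    # scaled by a fudge factor. Illustrative only.
    b, c, h, w = latents.shape
    output_pixels = b * 3 * (h * scale_factor) * (w * scale_factor)
    return int(output_pixels * element_size * fudge)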
invokeai/app/invocations/z_image_seed_variance_enhancer.py

@@ -0,0 +1,110 @@
+import torch
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
+from invokeai.app.invocations.fields import (
+    FieldDescriptions,
+    Input,
+    InputField,
+    ZImageConditioningField,
+)
+from invokeai.app.invocations.primitives import ZImageConditioningOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.stable_diffusion.diffusion.conditioning_data import (
+    ConditioningFieldData,
+    ZImageConditioningInfo,
+)
+
+
+@invocation(
+    "z_image_seed_variance_enhancer",
+    title="Seed Variance Enhancer - Z-Image",
+    tags=["conditioning", "z-image", "variance", "seed"],
+    category="conditioning",
+    version="1.0.0",
+    classification=Classification.Prototype,
+)
+class ZImageSeedVarianceEnhancerInvocation(BaseInvocation):
+    """Adds seed-based noise to Z-Image conditioning to increase variance between seeds.
+
+    Z-Image-Turbo can produce relatively similar images with different seeds,
+    making it harder to explore variations of a prompt. This node implements
+    reproducible, seed-based noise injection into text embeddings to increase
+    visual variation while maintaining reproducibility.
+
+    The noise strength is auto-calibrated relative to the embedding's standard
+    deviation, ensuring consistent results across different prompts.
+    """
+
+    conditioning: ZImageConditioningField = InputField(
+        description=FieldDescriptions.cond,
+        input=Input.Connection,
+        title="Conditioning",
+    )
+    seed: int = InputField(
+        default=0,
+        ge=0,
+        description="Seed for reproducible noise generation. Different seeds produce different noise patterns.",
+    )
+    strength: float = InputField(
+        default=0.1,
+        ge=0.0,
+        le=2.0,
+        description="Noise strength as multiplier of embedding std. 0=off, 0.1=subtle, 0.5=strong.",
+    )
+    randomize_percent: float = InputField(
+        default=50.0,
+        ge=1.0,
+        le=100.0,
+        description="Percentage of embedding values to add noise to (1-100). Lower values create more selective noise patterns.",
+    )
+
+    @torch.no_grad()
+    def invoke(self, context: InvocationContext) -> ZImageConditioningOutput:
+        # Load conditioning data
+        cond_data = context.conditioning.load(self.conditioning.conditioning_name)
+        assert len(cond_data.conditionings) == 1, "Expected exactly one conditioning tensor"
+        z_image_conditioning = cond_data.conditionings[0]
+        assert isinstance(z_image_conditioning, ZImageConditioningInfo), "Expected ZImageConditioningInfo"
+
+        # Early return if strength is zero (no modification needed)
+        if self.strength == 0:
+            return ZImageConditioningOutput(conditioning=self.conditioning)
+
+        # Clone embeddings to avoid modifying the original
+        prompt_embeds = z_image_conditioning.prompt_embeds.clone()
+
+        # Calculate actual noise strength based on embedding statistics
+        # This auto-calibration ensures consistent results across different prompts
+        embed_std = torch.std(prompt_embeds).item()
+        actual_strength = self.strength * embed_std
+
+        # Generate deterministic noise using the seed
+        generator = torch.Generator(device=prompt_embeds.device)
+        generator.manual_seed(self.seed)
+        noise = torch.rand(
+            prompt_embeds.shape, generator=generator, device=prompt_embeds.device, dtype=prompt_embeds.dtype
+        )
+        noise = noise * 2 - 1  # Scale to [-1, 1)
+        noise = noise * actual_strength
+
+        # Create selective mask for noise application
+        generator.manual_seed(self.seed + 1)
+        noise_mask = torch.bernoulli(
+            torch.ones_like(prompt_embeds) * (self.randomize_percent / 100.0),
+            generator=generator,
+        ).bool()
+
+        # Apply noise only to masked positions
+        prompt_embeds = prompt_embeds + (noise * noise_mask)
+
+        # Save modified conditioning
+        new_conditioning = ZImageConditioningInfo(prompt_embeds=prompt_embeds)
+        conditioning_data = ConditioningFieldData(conditionings=[new_conditioning])
+        conditioning_name = context.conditioning.save(conditioning_data)
+
+        return ZImageConditioningOutput(
+            conditioning=ZImageConditioningField(
+                conditioning_name=conditioning_name,
+                mask=self.conditioning.mask,
+            )
+        )
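The core of the node is a small amount of tensor math: calibrate the noise scale to the embedding's standard deviation, draw seed-deterministic uniform noise, and apply it to a random subset of positions. A standalone sketch with dummy values follows; the embedding shape and the constants are illustrative, not taken from the node.

import torch

prompt_embeds = torch.randn(1, 512, 2560)  # dummy stand-in for a Z-Image text embedding
seed, strength, randomize_percent = 123, 0.1, 50.0

actual_strength = strength * prompt_embeds.std().item()  # auto-calibrate to the embedding scale

g = torch.Generator().manual_seed(seed)
noise = (torch.rand(prompt_embeds.shape, generator=g) * 2 - 1) * actual_strength

g.manual_seed(seed + 1)
mask = torch.bernoulli(torch.full_like(prompt_embeds, randomize_percent / 100.0), generator=g).bool()

noisy_embeds = prompt_embeds + noise * mask  # same shape, perturbed at roughly half the positions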
invokeai/app/services/config/config_default.py

@@ -85,6 +85,7 @@ class InvokeAIAppConfig(BaseSettings):
         max_cache_ram_gb: The maximum amount of CPU RAM to use for model caching in GB. If unset, the limit will be configured based on the available RAM. In most cases, it is recommended to leave this unset.
         max_cache_vram_gb: The amount of VRAM to use for model caching in GB. If unset, the limit will be configured based on the available VRAM and the device_working_mem_gb. In most cases, it is recommended to leave this unset.
         log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.
+        model_cache_keep_alive_min: How long to keep models in cache after last use, in minutes. A value of 0 (the default) means models are kept in cache indefinitely. If no model generations occur within the timeout period, the model cache is cleared using the same logic as the 'Clear Model Cache' button.
         device_working_mem_gb: The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.
         enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.
         keep_ram_copy_of_weights: Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high.
@@ -165,9 +166,10 @@ class InvokeAIAppConfig(BaseSettings):
     max_cache_ram_gb: Optional[float] = Field(default=None, gt=0, description="The maximum amount of CPU RAM to use for model caching in GB. If unset, the limit will be configured based on the available RAM. In most cases, it is recommended to leave this unset.")
     max_cache_vram_gb: Optional[float] = Field(default=None, ge=0, description="The amount of VRAM to use for model caching in GB. If unset, the limit will be configured based on the available VRAM and the device_working_mem_gb. In most cases, it is recommended to leave this unset.")
     log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.")
+    model_cache_keep_alive_min: float = Field(default=0, ge=0, description="How long to keep models in cache after last use, in minutes. A value of 0 (the default) means models are kept in cache indefinitely. If no model generations occur within the timeout period, the model cache is cleared using the same logic as the 'Clear Model Cache' button.")
     device_working_mem_gb: float = Field(default=3, description="The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.")
     enable_partial_loading: bool = Field(default=False, description="Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.")
-    keep_ram_copy_of_weights: bool = Field(default=True,
+    keep_ram_copy_of_weights: bool = Field(default=True, description="Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high.")
     # Deprecated CACHE configs
     ram: Optional[float] = Field(default=None, gt=0, description="DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_ram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.")
     vram: Optional[float] = Field(default=None, ge=0, description="DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_vram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.")
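The new model_cache_keep_alive_min setting drives an idle timeout in the model manager (the wiring lives in model_manager_default.py and model_cache.py, which are not shown in full here). Conceptually it behaves like a simple idle timer; the following is a hedged sketch of the idea, not the actual code.

import time

class IdleCacheTimer:
    """Track the last model use and report when the keep-alive window has lapsed."""

    def __init__(self, keep_alive_min: float) -> None:
        self.keep_alive_s = keep_alive_min * 60.0
        self.last_use = time.monotonic()

    def touch(self) -> None:
        self.last_use = time.monotonic()

    def expired(self) -> bool:
        # 0 disables the timeout, matching the config default.
        return self.keep_alive_s > 0 and (time.monotonic() - self.last_use) > self.keep_alive_s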
invokeai/app/services/invocation_stats/invocation_stats_common.py

@@ -14,7 +14,7 @@ class NodeExecutionStatsSummary:
     node_type: str
     num_calls: int
     time_used_seconds: float
-
+    delta_vram_gb: float
 
 
 @dataclass
@@ -58,10 +58,10 @@ class InvocationStatsSummary:
     def __str__(self) -> str:
         _str = ""
         _str = f"Graph stats: {self.graph_stats.graph_execution_state_id}\n"
-        _str += f"{'Node':>30} {'Calls':>7} {'Seconds':>9} {'VRAM
+        _str += f"{'Node':>30} {'Calls':>7} {'Seconds':>9} {'VRAM Change':+>10}\n"
 
         for summary in self.node_stats:
-            _str += f"{summary.node_type:>30} {summary.num_calls:>7} {summary.time_used_seconds:>8.3f}s {summary.
+            _str += f"{summary.node_type:>30} {summary.num_calls:>7} {summary.time_used_seconds:>8.3f}s {summary.delta_vram_gb:+10.3f}G\n"
 
         _str += f"TOTAL GRAPH EXECUTION TIME: {self.graph_stats.execution_time_seconds:7.3f}s\n"
 
@@ -100,7 +100,7 @@ class NodeExecutionStats:
     start_ram_gb: float  # GB
     end_ram_gb: float  # GB
 
-
+    delta_vram_gb: float  # GB
 
     def total_time(self) -> float:
         return self.end_time - self.start_time
@@ -174,9 +174,9 @@ class GraphExecutionStats:
         for node_type, node_type_stats_list in node_stats_by_type.items():
             num_calls = len(node_type_stats_list)
             time_used = sum([n.total_time() for n in node_type_stats_list])
-
+            delta_vram = max([n.delta_vram_gb for n in node_type_stats_list])
             summary = NodeExecutionStatsSummary(
-                node_type=node_type, num_calls=num_calls, time_used_seconds=time_used,
+                node_type=node_type, num_calls=num_calls, time_used_seconds=time_used, delta_vram_gb=delta_vram
             )
             summaries.append(summary)
 
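With delta_vram_gb in place, the per-node stats table gains a signed VRAM column. A quick illustration of the new row formatting with made-up values:

node_type, num_calls, time_used_seconds, delta_vram_gb = "z_image_denoise", 1, 12.345, 0.512

print(f"{'Node':>30} {'Calls':>7} {'Seconds':>9} {'VRAM Change':+>10}")
print(f"{node_type:>30} {num_calls:>7} {time_used_seconds:>8.3f}s {delta_vram_gb:+10.3f}G")
# Prints a right-aligned row ending in "+0.512G"; exact spacing follows the format specs above.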