InvokeAI 6.10.0rc2-py3-none-any.whl → 6.11.0rc1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invokeai/app/api/routers/model_manager.py +43 -1
- invokeai/app/invocations/fields.py +1 -1
- invokeai/app/invocations/flux2_denoise.py +499 -0
- invokeai/app/invocations/flux2_klein_model_loader.py +222 -0
- invokeai/app/invocations/flux2_klein_text_encoder.py +222 -0
- invokeai/app/invocations/flux2_vae_decode.py +106 -0
- invokeai/app/invocations/flux2_vae_encode.py +88 -0
- invokeai/app/invocations/flux_denoise.py +50 -3
- invokeai/app/invocations/flux_lora_loader.py +1 -1
- invokeai/app/invocations/ideal_size.py +6 -1
- invokeai/app/invocations/metadata.py +4 -0
- invokeai/app/invocations/metadata_linked.py +47 -0
- invokeai/app/invocations/model.py +1 -0
- invokeai/app/invocations/z_image_denoise.py +8 -3
- invokeai/app/invocations/z_image_image_to_latents.py +9 -1
- invokeai/app/invocations/z_image_latents_to_image.py +9 -1
- invokeai/app/invocations/z_image_seed_variance_enhancer.py +110 -0
- invokeai/app/services/config/config_default.py +3 -1
- invokeai/app/services/invocation_stats/invocation_stats_common.py +6 -6
- invokeai/app/services/invocation_stats/invocation_stats_default.py +9 -4
- invokeai/app/services/model_manager/model_manager_default.py +7 -0
- invokeai/app/services/model_records/model_records_base.py +4 -2
- invokeai/app/services/shared/invocation_context.py +15 -0
- invokeai/app/services/shared/sqlite/sqlite_util.py +2 -0
- invokeai/app/services/shared/sqlite_migrator/migrations/migration_25.py +61 -0
- invokeai/app/util/step_callback.py +42 -0
- invokeai/backend/flux/denoise.py +239 -204
- invokeai/backend/flux/dype/__init__.py +18 -0
- invokeai/backend/flux/dype/base.py +226 -0
- invokeai/backend/flux/dype/embed.py +116 -0
- invokeai/backend/flux/dype/presets.py +141 -0
- invokeai/backend/flux/dype/rope.py +110 -0
- invokeai/backend/flux/extensions/dype_extension.py +91 -0
- invokeai/backend/flux/util.py +35 -1
- invokeai/backend/flux2/__init__.py +4 -0
- invokeai/backend/flux2/denoise.py +261 -0
- invokeai/backend/flux2/ref_image_extension.py +294 -0
- invokeai/backend/flux2/sampling_utils.py +209 -0
- invokeai/backend/model_manager/configs/factory.py +19 -1
- invokeai/backend/model_manager/configs/main.py +395 -3
- invokeai/backend/model_manager/configs/qwen3_encoder.py +116 -7
- invokeai/backend/model_manager/configs/vae.py +104 -2
- invokeai/backend/model_manager/load/load_default.py +0 -1
- invokeai/backend/model_manager/load/model_cache/model_cache.py +107 -2
- invokeai/backend/model_manager/load/model_loaders/flux.py +1007 -2
- invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py +0 -1
- invokeai/backend/model_manager/load/model_loaders/z_image.py +121 -28
- invokeai/backend/model_manager/starter_models.py +128 -0
- invokeai/backend/model_manager/taxonomy.py +31 -4
- invokeai/backend/model_manager/util/select_hf_files.py +3 -2
- invokeai/backend/util/vae_working_memory.py +0 -2
- invokeai/frontend/web/dist/assets/App-ClpIJstk.js +161 -0
- invokeai/frontend/web/dist/assets/{browser-ponyfill-BP0RxJ4G.js → browser-ponyfill-Cw07u5G1.js} +1 -1
- invokeai/frontend/web/dist/assets/{index-B44qKjrs.js → index-DSKM8iGj.js} +69 -69
- invokeai/frontend/web/dist/index.html +1 -1
- invokeai/frontend/web/dist/locales/en.json +58 -5
- invokeai/frontend/web/dist/locales/it.json +2 -1
- invokeai/version/invokeai_version.py +1 -1
- {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/METADATA +7 -1
- {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/RECORD +66 -49
- {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/WHEEL +1 -1
- invokeai/frontend/web/dist/assets/App-DllqPQ3j.js +0 -161
- {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/entry_points.txt +0 -0
- {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/licenses/LICENSE +0 -0
- {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
- {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
- {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/top_level.txt +0 -0
invokeai/app/util/step_callback.py CHANGED

@@ -93,6 +93,46 @@ COGVIEW4_LATENT_RGB_FACTORS = [
     [-0.00955853, -0.00980067, -0.00977842],
 ]

+# FLUX.2 uses 32 latent channels. Since we don't have proper factors yet,
+# we extend FLUX factors with zeros for preview approximation.
+FLUX2_LATENT_RGB_FACTORS = [
+    # R G B
+    # First 16 channels (from FLUX)
+    [0.0118, 0.0024, 0.0017],
+    [-0.0074, -0.0108, -0.0003],
+    [0.0056, 0.0291, 0.0768],
+    [0.0342, -0.0681, -0.0427],
+    [-0.0258, 0.0092, 0.0463],
+    [0.0863, 0.0784, 0.0547],
+    [-0.0017, 0.0402, 0.0158],
+    [0.0501, 0.1058, 0.1152],
+    [-0.0209, -0.0218, -0.0329],
+    [-0.0314, 0.0083, 0.0896],
+    [0.0851, 0.0665, -0.0472],
+    [-0.0534, 0.0238, -0.0024],
+    [0.0452, -0.0026, 0.0048],
+    [0.0892, 0.0831, 0.0881],
+    [-0.1117, -0.0304, -0.0789],
+    [0.0027, -0.0479, -0.0043],
+    # Additional 16 channels (zeros as placeholder)
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+]
+

 def sample_to_lowres_estimated_image(
     samples: torch.Tensor, latent_rgb_factors: torch.Tensor, smooth_matrix: Optional[torch.Tensor] = None

@@ -164,6 +204,8 @@ def diffusion_step_callback(
         latent_rgb_factors = COGVIEW4_LATENT_RGB_FACTORS
     elif base_model == BaseModelType.Flux:
         latent_rgb_factors = FLUX_LATENT_RGB_FACTORS
+    elif base_model == BaseModelType.Flux2:
+        latent_rgb_factors = FLUX2_LATENT_RGB_FACTORS
     elif base_model == BaseModelType.ZImage:
         # Z-Image uses FLUX-compatible VAE with 16 latent channels
         latent_rgb_factors = FLUX_LATENT_RGB_FACTORS
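For context on how these factor tables are consumed, the preview pass essentially projects each latent channel onto RGB with a per-channel weight, so the all-zero placeholder rows simply contribute nothing. The following is a minimal standalone sketch of that projection, not the package's actual sample_to_lowres_estimated_image implementation; the shapes and the stand-in factor values are assumptions for illustration.

import torch

def approximate_rgb_preview(latents: torch.Tensor, rgb_factors: torch.Tensor) -> torch.Tensor:
    """Project latents of shape (C, H, W) onto RGB using per-channel factors of shape (C, 3)."""
    # Weighted sum over the channel axis: each latent channel adds its (R, G, B) contribution.
    rgb = torch.einsum("chw,cr->hwr", latents, rgb_factors)
    # Rescale to [0, 1] purely for display.
    return (rgb - rgb.min()) / (rgb.max() - rgb.min() + 1e-8)

# A 32-channel FLUX.2-style latent at 1/8th the resolution of a 1024x1024 image.
# `factors` stands in for the FLUX2_LATENT_RGB_FACTORS table added above: the last
# 16 rows are zeros, so only the first 16 channels influence the preview.
factors = torch.zeros(32, 3)
factors[:16] = torch.randn(16, 3) * 0.05
latents = torch.randn(32, 128, 128)
print(approximate_rgb_preview(latents, factors).shape)  # torch.Size([128, 128, 3])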
invokeai/backend/flux/denoise.py CHANGED

@@ -7,6 +7,7 @@ from diffusers.schedulers.scheduling_utils import SchedulerMixin
 from tqdm import tqdm

 from invokeai.backend.flux.controlnet.controlnet_flux_output import ControlNetFluxOutput, sum_controlnet_flux_outputs
+from invokeai.backend.flux.extensions.dype_extension import DyPEExtension
 from invokeai.backend.flux.extensions.instantx_controlnet_extension import InstantXControlNetExtension
 from invokeai.backend.flux.extensions.regional_prompting_extension import RegionalPromptingExtension
 from invokeai.backend.flux.extensions.xlabs_controlnet_extension import XLabsControlNetExtension
@@ -37,6 +38,8 @@ def denoise(
     # extra img tokens (sequence-wise) - for Kontext conditioning
     img_cond_seq: torch.Tensor | None = None,
     img_cond_seq_ids: torch.Tensor | None = None,
+    # DyPE extension for high-resolution generation
+    dype_extension: DyPEExtension | None = None,
     # Optional scheduler for alternative sampling methods
     scheduler: SchedulerMixin | None = None,
 ):
@@ -74,30 +77,206 @@ def denoise(
     # Store original sequence length for slicing predictions
     original_seq_len = img.shape[1]

-    #
+    # DyPE: Patch model with DyPE-aware position embedder
+    dype_embedder = None
+    original_pe_embedder = None
+    if dype_extension is not None:
+        dype_embedder, original_pe_embedder = dype_extension.patch_model(model)
+
+    try:
+        # Track the actual step for user-facing progress (accounts for Heun's double steps)
+        user_step = 0
+
+        if use_scheduler:
+            # Use diffusers scheduler for stepping
+            # Use tqdm with total_steps (user-facing steps) not num_scheduler_steps (internal steps)
+            # This ensures progress bar shows 1/8, 2/8, etc. even when scheduler uses more internal steps
+            pbar = tqdm(total=total_steps, desc="Denoising")
+            for step_index in range(num_scheduler_steps):
+                timestep = scheduler.timesteps[step_index]
+                # Convert scheduler timestep (0-1000) to normalized (0-1) for the model
+                t_curr = timestep.item() / scheduler.config.num_train_timesteps
+                t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
+
+                # DyPE: Update step state for timestep-dependent scaling
+                if dype_extension is not None and dype_embedder is not None:
+                    dype_extension.update_step_state(
+                        embedder=dype_embedder,
+                        timestep=t_curr,
+                        timestep_index=user_step,
+                        total_steps=total_steps,
+                    )

+                # For Heun scheduler, track if we're in first or second order step
+                is_heun = hasattr(scheduler, "state_in_first_order")
+                in_first_order = scheduler.state_in_first_order if is_heun else True
+
+                # Run ControlNet models
+                controlnet_residuals: list[ControlNetFluxOutput] = []
+                for controlnet_extension in controlnet_extensions:
+                    controlnet_residuals.append(
+                        controlnet_extension.run_controlnet(
+                            timestep_index=user_step,
+                            total_num_timesteps=total_steps,
+                            img=img,
+                            img_ids=img_ids,
+                            txt=pos_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
+                            txt_ids=pos_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
+                            y=pos_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
+                            timesteps=t_vec,
+                            guidance=guidance_vec,
+                        )
+                    )
+
+                merged_controlnet_residuals = sum_controlnet_flux_outputs(controlnet_residuals)
+
+                # Prepare input for model
+                img_input = img
+                img_input_ids = img_ids
+
+                if img_cond is not None:
+                    img_input = torch.cat((img_input, img_cond), dim=-1)
+
+                if img_cond_seq is not None:
+                    assert img_cond_seq_ids is not None
+                    img_input = torch.cat((img_input, img_cond_seq), dim=1)
+                    img_input_ids = torch.cat((img_input_ids, img_cond_seq_ids), dim=1)
+
+                pred = model(
+                    img=img_input,
+                    img_ids=img_input_ids,
+                    txt=pos_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
+                    txt_ids=pos_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
+                    y=pos_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
+                    timesteps=t_vec,
+                    guidance=guidance_vec,
+                    timestep_index=user_step,
+                    total_num_timesteps=total_steps,
+                    controlnet_double_block_residuals=merged_controlnet_residuals.double_block_residuals,
+                    controlnet_single_block_residuals=merged_controlnet_residuals.single_block_residuals,
+                    ip_adapter_extensions=pos_ip_adapter_extensions,
+                    regional_prompting_extension=pos_regional_prompting_extension,
+                )
+
+                if img_cond_seq is not None:
+                    pred = pred[:, :original_seq_len]
+
+                # Get CFG scale for current user step
+                step_cfg_scale = cfg_scale[min(user_step, len(cfg_scale) - 1)]
+
+                if not math.isclose(step_cfg_scale, 1.0):
+                    if neg_regional_prompting_extension is None:
+                        raise ValueError("Negative text conditioning is required when cfg_scale is not 1.0.")
+
+                    neg_img_input = img
+                    neg_img_input_ids = img_ids
+
+                    if img_cond is not None:
+                        neg_img_input = torch.cat((neg_img_input, img_cond), dim=-1)
+
+                    if img_cond_seq is not None:
+                        neg_img_input = torch.cat((neg_img_input, img_cond_seq), dim=1)
+                        neg_img_input_ids = torch.cat((neg_img_input_ids, img_cond_seq_ids), dim=1)
+
+                    neg_pred = model(
+                        img=neg_img_input,
+                        img_ids=neg_img_input_ids,
+                        txt=neg_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
+                        txt_ids=neg_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
+                        y=neg_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
+                        timesteps=t_vec,
+                        guidance=guidance_vec,
+                        timestep_index=user_step,
+                        total_num_timesteps=total_steps,
+                        controlnet_double_block_residuals=None,
+                        controlnet_single_block_residuals=None,
+                        ip_adapter_extensions=neg_ip_adapter_extensions,
+                        regional_prompting_extension=neg_regional_prompting_extension,
+                    )
+
+                    if img_cond_seq is not None:
+                        neg_pred = neg_pred[:, :original_seq_len]
+                    pred = neg_pred + step_cfg_scale * (pred - neg_pred)
+
+                # Use scheduler.step() for the update
+                step_output = scheduler.step(model_output=pred, timestep=timestep, sample=img)
+                img = step_output.prev_sample
+
+                # Get t_prev for inpainting (next sigma value)
+                if step_index + 1 < len(scheduler.sigmas):
+                    t_prev = scheduler.sigmas[step_index + 1].item()
+                else:
+                    t_prev = 0.0
+
+                if inpaint_extension is not None:
+                    img = inpaint_extension.merge_intermediate_latents_with_init_latents(img, t_prev)
+
+                # For Heun, only increment user step after second-order step completes
+                if is_heun:
+                    if not in_first_order:
+                        # Second order step completed
+                        user_step += 1
+                        # Only call step_callback if we haven't exceeded total_steps
+                        if user_step <= total_steps:
+                            pbar.update(1)
+                            preview_img = img - t_curr * pred
+                            if inpaint_extension is not None:
+                                preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(
+                                    preview_img, 0.0
+                                )
+                            step_callback(
+                                PipelineIntermediateState(
+                                    step=user_step,
+                                    order=2,
+                                    total_steps=total_steps,
+                                    timestep=int(t_curr * 1000),
+                                    latents=preview_img,
+                                ),
+                            )
+                else:
+                    # For LCM and other first-order schedulers
+                    user_step += 1
+                    # Only call step_callback if we haven't exceeded total_steps
+                    # (LCM scheduler may have more internal steps than user-facing steps)
+                    if user_step <= total_steps:
+                        pbar.update(1)
+                        preview_img = img - t_curr * pred
+                        if inpaint_extension is not None:
+                            preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(
+                                preview_img, 0.0
+                            )
+                        step_callback(
+                            PipelineIntermediateState(
+                                step=user_step,
+                                order=1,
+                                total_steps=total_steps,
+                                timestep=int(t_curr * 1000),
+                                latents=preview_img,
+                            ),
+                        )
+
+            pbar.close()
+            return img
+
+        # Original Euler implementation (when scheduler is None)
+        for step_index, (t_curr, t_prev) in tqdm(list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True)))):
+            # DyPE: Update step state for timestep-dependent scaling
+            if dype_extension is not None and dype_embedder is not None:
+                dype_extension.update_step_state(
+                    embedder=dype_embedder,
+                    timestep=t_curr,
+                    timestep_index=step_index,
+                    total_steps=total_steps,
+                )

-            is_heun = hasattr(scheduler, "state_in_first_order")
-            in_first_order = scheduler.state_in_first_order if is_heun else True
+            t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)

-            # Run ControlNet models
+            # Run ControlNet models.
             controlnet_residuals: list[ControlNetFluxOutput] = []
             for controlnet_extension in controlnet_extensions:
                 controlnet_residuals.append(
                     controlnet_extension.run_controlnet(
-                        timestep_index=
+                        timestep_index=step_index,
                         total_num_timesteps=total_steps,
                         img=img,
                         img_ids=img_ids,
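Two pieces of bookkeeping in the new scheduler branch are easy to miss: the diffusers timestep (on a 0-1000 scale) is normalized to the 0-1 value the FLUX transformer expects, and the user-facing step counter only advances once a Heun-style scheduler has finished both of its internal sub-steps. The following is a toy, self-contained sketch of that accounting; the step counts and the alternating first/second-order pattern are assumptions for illustration, whereas the real loop reads this state from the diffusers scheduler object.

# Hypothetical numbers: 8 user-facing steps, a Heun-like scheduler running two
# internal sub-steps per user step (16 internal steps total).
num_train_timesteps = 1000
total_steps = 8
internal_timesteps = [1000 - i * (1000 / 16) for i in range(16)]

user_step = 0
for step_index, timestep in enumerate(internal_timesteps):
    # Same normalization as the diff: scheduler timestep (0-1000) -> 0-1 for the model.
    t_curr = timestep / num_train_timesteps

    # A Heun-style scheduler alternates first- and second-order sub-steps; only the
    # second one completes a user-visible step, so the progress bar still reads 1/8..8/8.
    in_first_order = step_index % 2 == 0
    if not in_first_order:
        user_step += 1
        if user_step <= total_steps:
            print(f"user step {user_step}/{total_steps} at t={t_curr:.3f}")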
@@ -109,17 +288,25 @@ def denoise(
                     )
                 )

+            # Merge the ControlNet residuals from multiple ControlNets.
+            # TODO(ryand): We may want to calculate the sum just-in-time to keep peak memory low. Keep in mind, that the
+            # controlnet_residuals datastructure is efficient in that it likely contains multiple references to the same
+            # tensors. Calculating the sum materializes each tensor into its own instance.
             merged_controlnet_residuals = sum_controlnet_flux_outputs(controlnet_residuals)

-            # Prepare input for model
+            # Prepare input for model - concatenate fresh each step
             img_input = img
             img_input_ids = img_ids

+            # Add channel-wise conditioning (for ControlNet, FLUX Fill, etc.)
             if img_cond is not None:
                 img_input = torch.cat((img_input, img_cond), dim=-1)

+            # Add sequence-wise conditioning (for Kontext)
             if img_cond_seq is not None:
-                assert img_cond_seq_ids is not None
+                assert img_cond_seq_ids is not None, (
+                    "You need to provide either both or neither of the sequence conditioning"
+                )
                 img_input = torch.cat((img_input, img_cond_seq), dim=1)
                 img_input_ids = torch.cat((img_input_ids, img_cond_seq_ids), dim=1)

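The two concatenations above operate on different axes of the packed latent sequence: channel-wise conditioning widens every token, while sequence-wise (Kontext) conditioning appends extra tokens. A toy shape check makes the distinction concrete; the dimensions are illustrative, not the model's real sizes.

import torch

batch, seq_len, channels = 1, 4096, 64          # illustrative packed-latent shape
img = torch.randn(batch, seq_len, channels)

# Channel-wise conditioning (ControlNet, FLUX Fill): widens each token.
img_cond = torch.randn(batch, seq_len, 320)
channel_wise = torch.cat((img, img_cond), dim=-1)
print(channel_wise.shape)   # torch.Size([1, 4096, 384])

# Sequence-wise conditioning (Kontext reference images): appends extra tokens.
img_cond_seq = torch.randn(batch, 1024, channels)
sequence_wise = torch.cat((img, img_cond_seq), dim=1)
print(sequence_wise.shape)  # torch.Size([1, 5120, 64])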
@@ -131,7 +318,7 @@ def denoise(
                 y=pos_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
                 timesteps=t_vec,
                 guidance=guidance_vec,
-                timestep_index=
+                timestep_index=step_index,
                 total_num_timesteps=total_steps,
                 controlnet_double_block_residuals=merged_controlnet_residuals.double_block_residuals,
                 controlnet_single_block_residuals=merged_controlnet_residuals.single_block_residuals,
@@ -139,22 +326,33 @@ def denoise(
                 regional_prompting_extension=pos_regional_prompting_extension,
             )

+            # Slice prediction to only include the main image tokens
             if img_cond_seq is not None:
                 pred = pred[:, :original_seq_len]

-            step_cfg_scale = cfg_scale[min(user_step, len(cfg_scale) - 1)]
+            step_cfg_scale = cfg_scale[step_index]

+            # If step_cfg_scale, is 1.0, then we don't need to run the negative prediction.
             if not math.isclose(step_cfg_scale, 1.0):
+                # TODO(ryand): Add option to run positive and negative predictions in a single batch for better performance
+                # on systems with sufficient VRAM.
+
                 if neg_regional_prompting_extension is None:
                     raise ValueError("Negative text conditioning is required when cfg_scale is not 1.0.")

+                # For negative prediction with Kontext, we need to include the reference images
+                # to maintain consistency between positive and negative passes. Without this,
+                # CFG would create artifacts as the attention mechanism would see different
+                # spatial structures in each pass
                 neg_img_input = img
                 neg_img_input_ids = img_ids

+                # Add channel-wise conditioning for negative pass if present
                 if img_cond is not None:
                     neg_img_input = torch.cat((neg_img_input, img_cond), dim=-1)

+                # Add sequence-wise conditioning (Kontext) for negative pass
+                # This ensures reference images are processed consistently
                 if img_cond_seq is not None:
                     neg_img_input = torch.cat((neg_img_input, img_cond_seq), dim=1)
                     neg_img_input_ids = torch.cat((neg_img_input_ids, img_cond_seq_ids), dim=1)
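The negative pass prepared above feeds the classifier-free-guidance combination applied after the model call (pred = neg_pred + step_cfg_scale * (pred - neg_pred)), with both predictions first sliced back to the main image tokens. This can be sanity-checked in isolation; the tensor sizes and guidance scale below are illustrative only.

import torch

step_cfg_scale = 3.5                      # illustrative guidance scale
original_seq_len = 4096

# Positive and negative predictions include any Kontext reference tokens,
# so both are sliced back to the main image tokens before mixing.
pred = torch.randn(1, 5120, 64)
neg_pred = torch.randn(1, 5120, 64)
pred = pred[:, :original_seq_len]
neg_pred = neg_pred[:, :original_seq_len]

# Classifier-free guidance: move from the unconditional prediction toward the
# conditional one, scaled by step_cfg_scale (a scale of 1.0 reduces to pred itself).
guided = neg_pred + step_cfg_scale * (pred - neg_pred)
print(guided.shape)  # torch.Size([1, 4096, 64])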
@@ -167,7 +365,7 @@ def denoise(
                     y=neg_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
                     timesteps=t_vec,
                     guidance=guidance_vec,
-                    timestep_index=
+                    timestep_index=step_index,
                     total_num_timesteps=total_steps,
                     controlnet_double_block_residuals=None,
                     controlnet_single_block_residuals=None,
@@ -175,194 +373,31 @@ def denoise(
                     regional_prompting_extension=neg_regional_prompting_extension,
                 )

+                # Slice negative prediction to match main image tokens
                 if img_cond_seq is not None:
                     neg_pred = neg_pred[:, :original_seq_len]
                 pred = neg_pred + step_cfg_scale * (pred - neg_pred)

-            img = step_output.prev_sample
-
-            # Get t_prev for inpainting (next sigma value)
-            if step_index + 1 < len(scheduler.sigmas):
-                t_prev = scheduler.sigmas[step_index + 1].item()
-            else:
-                t_prev = 0.0
+            preview_img = img - t_curr * pred
+            img = img + (t_prev - t_curr) * pred

             if inpaint_extension is not None:
                 img = inpaint_extension.merge_intermediate_latents_with_init_latents(img, t_prev)
-                        if inpaint_extension is not None:
-                            preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(
-                                preview_img, 0.0
-                            )
-                        step_callback(
-                            PipelineIntermediateState(
-                                step=user_step,
-                                order=2,
-                                total_steps=total_steps,
-                                timestep=int(t_curr * 1000),
-                                latents=preview_img,
-                            ),
-                        )
-            else:
-                # For LCM and other first-order schedulers
-                user_step += 1
-                # Only call step_callback if we haven't exceeded total_steps
-                # (LCM scheduler may have more internal steps than user-facing steps)
-                if user_step <= total_steps:
-                    pbar.update(1)
-                    preview_img = img - t_curr * pred
-                    if inpaint_extension is not None:
-                        preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(preview_img, 0.0)
-                    step_callback(
-                        PipelineIntermediateState(
-                            step=user_step,
-                            order=1,
-                            total_steps=total_steps,
-                            timestep=int(t_curr * 1000),
-                            latents=preview_img,
-                        ),
-                    )
-
-        pbar.close()
-        return img
-
-    # Original Euler implementation (when scheduler is None)
-    for step_index, (t_curr, t_prev) in tqdm(list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True)))):
-        t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
-
-        # Run ControlNet models.
-        controlnet_residuals: list[ControlNetFluxOutput] = []
-        for controlnet_extension in controlnet_extensions:
-            controlnet_residuals.append(
-                controlnet_extension.run_controlnet(
-                    timestep_index=step_index,
-                    total_num_timesteps=total_steps,
-                    img=img,
-                    img_ids=img_ids,
-                    txt=pos_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
-                    txt_ids=pos_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
-                    y=pos_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
-                    timesteps=t_vec,
-                    guidance=guidance_vec,
-                )
+                preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(preview_img, 0.0)
+
+            step_callback(
+                PipelineIntermediateState(
+                    step=step_index + 1,
+                    order=1,
+                    total_steps=total_steps,
+                    timestep=int(t_curr),
+                    latents=preview_img,
+                ),
             )

-        # TODO(ryand): We may want to calculate the sum just-in-time to keep peak memory low. Keep in mind, that the
-        # controlnet_residuals datastructure is efficient in that it likely contains multiple references to the same
-        # tensors. Calculating the sum materializes each tensor into its own instance.
-        merged_controlnet_residuals = sum_controlnet_flux_outputs(controlnet_residuals)
-
-        # Prepare input for model - concatenate fresh each step
-        img_input = img
-        img_input_ids = img_ids
-
-        # Add channel-wise conditioning (for ControlNet, FLUX Fill, etc.)
-        if img_cond is not None:
-            img_input = torch.cat((img_input, img_cond), dim=-1)
-
-        # Add sequence-wise conditioning (for Kontext)
-        if img_cond_seq is not None:
-            assert img_cond_seq_ids is not None, (
-                "You need to provide either both or neither of the sequence conditioning"
-            )
-            img_input = torch.cat((img_input, img_cond_seq), dim=1)
-            img_input_ids = torch.cat((img_input_ids, img_cond_seq_ids), dim=1)
-
-        pred = model(
-            img=img_input,
-            img_ids=img_input_ids,
-            txt=pos_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
-            txt_ids=pos_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
-            y=pos_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
-            timesteps=t_vec,
-            guidance=guidance_vec,
-            timestep_index=step_index,
-            total_num_timesteps=total_steps,
-            controlnet_double_block_residuals=merged_controlnet_residuals.double_block_residuals,
-            controlnet_single_block_residuals=merged_controlnet_residuals.single_block_residuals,
-            ip_adapter_extensions=pos_ip_adapter_extensions,
-            regional_prompting_extension=pos_regional_prompting_extension,
-        )
-
-        # Slice prediction to only include the main image tokens
-        if img_cond_seq is not None:
-            pred = pred[:, :original_seq_len]
-
-        step_cfg_scale = cfg_scale[step_index]
-
-        # If step_cfg_scale, is 1.0, then we don't need to run the negative prediction.
-        if not math.isclose(step_cfg_scale, 1.0):
-            # TODO(ryand): Add option to run positive and negative predictions in a single batch for better performance
-            # on systems with sufficient VRAM.
-
-            if neg_regional_prompting_extension is None:
-                raise ValueError("Negative text conditioning is required when cfg_scale is not 1.0.")
-
-            # For negative prediction with Kontext, we need to include the reference images
-            # to maintain consistency between positive and negative passes. Without this,
-            # CFG would create artifacts as the attention mechanism would see different
-            # spatial structures in each pass
-            neg_img_input = img
-            neg_img_input_ids = img_ids
-
-            # Add channel-wise conditioning for negative pass if present
-            if img_cond is not None:
-                neg_img_input = torch.cat((neg_img_input, img_cond), dim=-1)
-
-            # Add sequence-wise conditioning (Kontext) for negative pass
-            # This ensures reference images are processed consistently
-            if img_cond_seq is not None:
-                neg_img_input = torch.cat((neg_img_input, img_cond_seq), dim=1)
-                neg_img_input_ids = torch.cat((neg_img_input_ids, img_cond_seq_ids), dim=1)
-
-            neg_pred = model(
-                img=neg_img_input,
-                img_ids=neg_img_input_ids,
-                txt=neg_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
-                txt_ids=neg_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
-                y=neg_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
-                timesteps=t_vec,
-                guidance=guidance_vec,
-                timestep_index=step_index,
-                total_num_timesteps=total_steps,
-                controlnet_double_block_residuals=None,
-                controlnet_single_block_residuals=None,
-                ip_adapter_extensions=neg_ip_adapter_extensions,
-                regional_prompting_extension=neg_regional_prompting_extension,
-            )
+            return img

-        preview_img = img - t_curr * pred
-        img = img + (t_prev - t_curr) * pred
-
-        if inpaint_extension is not None:
-            img = inpaint_extension.merge_intermediate_latents_with_init_latents(img, t_prev)
-            preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(preview_img, 0.0)
-
-        step_callback(
-            PipelineIntermediateState(
-                step=step_index + 1,
-                order=1,
-                total_steps=total_steps,
-                timestep=int(t_curr),
-                latents=preview_img,
-            ),
-        )
-
-    return img
+    finally:
+        # DyPE: Restore original position embedder
+        if original_pe_embedder is not None:
+            DyPEExtension.restore_model(model, original_pe_embedder)
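The Euler branch retained above follows the rectified-flow convention used throughout the FLUX code: the model output approximates a velocity, so moving from t_curr to t_prev is a single Euler step, and extrapolating all the way to t=0 yields the preview latents shown in the UI. A minimal numeric sketch of just those two lines, with made-up values and shapes:

import torch

t_curr, t_prev = 0.75, 0.50              # two adjacent timesteps from the schedule
img = torch.randn(1, 4096, 64)           # current noisy latents (illustrative shape)
pred = torch.randn(1, 4096, 64)          # model's velocity prediction at t_curr

# Preview: extrapolate straight to t=0 for the progress image.
preview_img = img - t_curr * pred

# Euler step: advance the latents from t_curr to t_prev along the predicted velocity.
img = img + (t_prev - t_curr) * pred
print(preview_img.shape, img.shape)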
invokeai/backend/flux/dype/__init__.py ADDED

@@ -0,0 +1,18 @@
+"""Dynamic Position Extrapolation (DyPE) for FLUX models.
+
+DyPE enables high-resolution image generation (4K+) with pretrained FLUX models
+by dynamically scaling RoPE position embeddings during the denoising process.
+
+Based on: https://github.com/wildminder/ComfyUI-DyPE
+"""
+
+from invokeai.backend.flux.dype.base import DyPEConfig
+from invokeai.backend.flux.dype.embed import DyPEEmbedND
+from invokeai.backend.flux.dype.presets import DyPEPreset, get_dype_config_for_resolution
+
+__all__ = [
+    "DyPEConfig",
+    "DyPEEmbedND",
+    "DyPEPreset",
+    "get_dype_config_for_resolution",
+]