InvokeAI 6.10.0rc2__py3-none-any.whl → 6.11.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. invokeai/app/api/routers/model_manager.py +43 -1
  2. invokeai/app/invocations/fields.py +1 -1
  3. invokeai/app/invocations/flux2_denoise.py +499 -0
  4. invokeai/app/invocations/flux2_klein_model_loader.py +222 -0
  5. invokeai/app/invocations/flux2_klein_text_encoder.py +222 -0
  6. invokeai/app/invocations/flux2_vae_decode.py +106 -0
  7. invokeai/app/invocations/flux2_vae_encode.py +88 -0
  8. invokeai/app/invocations/flux_denoise.py +50 -3
  9. invokeai/app/invocations/flux_lora_loader.py +1 -1
  10. invokeai/app/invocations/ideal_size.py +6 -1
  11. invokeai/app/invocations/metadata.py +4 -0
  12. invokeai/app/invocations/metadata_linked.py +47 -0
  13. invokeai/app/invocations/model.py +1 -0
  14. invokeai/app/invocations/z_image_denoise.py +8 -3
  15. invokeai/app/invocations/z_image_image_to_latents.py +9 -1
  16. invokeai/app/invocations/z_image_latents_to_image.py +9 -1
  17. invokeai/app/invocations/z_image_seed_variance_enhancer.py +110 -0
  18. invokeai/app/services/config/config_default.py +3 -1
  19. invokeai/app/services/invocation_stats/invocation_stats_common.py +6 -6
  20. invokeai/app/services/invocation_stats/invocation_stats_default.py +9 -4
  21. invokeai/app/services/model_manager/model_manager_default.py +7 -0
  22. invokeai/app/services/model_records/model_records_base.py +4 -2
  23. invokeai/app/services/shared/invocation_context.py +15 -0
  24. invokeai/app/services/shared/sqlite/sqlite_util.py +2 -0
  25. invokeai/app/services/shared/sqlite_migrator/migrations/migration_25.py +61 -0
  26. invokeai/app/util/step_callback.py +42 -0
  27. invokeai/backend/flux/denoise.py +239 -204
  28. invokeai/backend/flux/dype/__init__.py +18 -0
  29. invokeai/backend/flux/dype/base.py +226 -0
  30. invokeai/backend/flux/dype/embed.py +116 -0
  31. invokeai/backend/flux/dype/presets.py +141 -0
  32. invokeai/backend/flux/dype/rope.py +110 -0
  33. invokeai/backend/flux/extensions/dype_extension.py +91 -0
  34. invokeai/backend/flux/util.py +35 -1
  35. invokeai/backend/flux2/__init__.py +4 -0
  36. invokeai/backend/flux2/denoise.py +261 -0
  37. invokeai/backend/flux2/ref_image_extension.py +294 -0
  38. invokeai/backend/flux2/sampling_utils.py +209 -0
  39. invokeai/backend/model_manager/configs/factory.py +19 -1
  40. invokeai/backend/model_manager/configs/main.py +395 -3
  41. invokeai/backend/model_manager/configs/qwen3_encoder.py +116 -7
  42. invokeai/backend/model_manager/configs/vae.py +104 -2
  43. invokeai/backend/model_manager/load/load_default.py +0 -1
  44. invokeai/backend/model_manager/load/model_cache/model_cache.py +107 -2
  45. invokeai/backend/model_manager/load/model_loaders/flux.py +1007 -2
  46. invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py +0 -1
  47. invokeai/backend/model_manager/load/model_loaders/z_image.py +121 -28
  48. invokeai/backend/model_manager/starter_models.py +128 -0
  49. invokeai/backend/model_manager/taxonomy.py +31 -4
  50. invokeai/backend/model_manager/util/select_hf_files.py +3 -2
  51. invokeai/backend/util/vae_working_memory.py +0 -2
  52. invokeai/frontend/web/dist/assets/App-ClpIJstk.js +161 -0
  53. invokeai/frontend/web/dist/assets/{browser-ponyfill-BP0RxJ4G.js → browser-ponyfill-Cw07u5G1.js} +1 -1
  54. invokeai/frontend/web/dist/assets/{index-B44qKjrs.js → index-DSKM8iGj.js} +69 -69
  55. invokeai/frontend/web/dist/index.html +1 -1
  56. invokeai/frontend/web/dist/locales/en.json +58 -5
  57. invokeai/frontend/web/dist/locales/it.json +2 -1
  58. invokeai/version/invokeai_version.py +1 -1
  59. {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/METADATA +7 -1
  60. {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/RECORD +66 -49
  61. {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/WHEEL +1 -1
  62. invokeai/frontend/web/dist/assets/App-DllqPQ3j.js +0 -161
  63. {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/entry_points.txt +0 -0
  64. {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/licenses/LICENSE +0 -0
  65. {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/licenses/LICENSE-SD1+SD2.txt +0 -0
  66. {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/licenses/LICENSE-SDXL.txt +0 -0
  67. {invokeai-6.10.0rc2.dist-info → invokeai-6.11.0rc1.dist-info}/top_level.txt +0 -0
@@ -93,6 +93,46 @@ COGVIEW4_LATENT_RGB_FACTORS = [
     [-0.00955853, -0.00980067, -0.00977842],
 ]

+# FLUX.2 uses 32 latent channels. Since we don't have proper factors yet,
+# we extend FLUX factors with zeros for preview approximation.
+FLUX2_LATENT_RGB_FACTORS = [
+    #   R        G        B
+    # First 16 channels (from FLUX)
+    [0.0118, 0.0024, 0.0017],
+    [-0.0074, -0.0108, -0.0003],
+    [0.0056, 0.0291, 0.0768],
+    [0.0342, -0.0681, -0.0427],
+    [-0.0258, 0.0092, 0.0463],
+    [0.0863, 0.0784, 0.0547],
+    [-0.0017, 0.0402, 0.0158],
+    [0.0501, 0.1058, 0.1152],
+    [-0.0209, -0.0218, -0.0329],
+    [-0.0314, 0.0083, 0.0896],
+    [0.0851, 0.0665, -0.0472],
+    [-0.0534, 0.0238, -0.0024],
+    [0.0452, -0.0026, 0.0048],
+    [0.0892, 0.0831, 0.0881],
+    [-0.1117, -0.0304, -0.0789],
+    [0.0027, -0.0479, -0.0043],
+    # Additional 16 channels (zeros as placeholder)
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+    [0.0, 0.0, 0.0],
+]
+

 def sample_to_lowres_estimated_image(
     samples: torch.Tensor, latent_rgb_factors: torch.Tensor, smooth_matrix: Optional[torch.Tensor] = None
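The factor table above is consumed by sample_to_lowres_estimated_image, which projects each latent channel onto R/G/B, so the 16 all-zero rows simply mean the extra FLUX.2 channels contribute nothing to the preview. A minimal sketch of that projection (illustrative only, not the actual InvokeAI implementation; names are made up):

    import torch

    def approximate_preview(latents: torch.Tensor, rgb_factors: list[list[float]]) -> torch.Tensor:
        # latents: (channels, height, width); rgb_factors: (channels, 3)
        factors = torch.tensor(rgb_factors, dtype=latents.dtype, device=latents.device)
        # Each latent channel contributes a weighted amount to R, G and B.
        # All-zero rows (the 16 placeholder FLUX.2 channels) are ignored.
        rgb = torch.einsum("chw,cr->rhw", latents, factors)
        # Rescale to [0, 1] for a rough visual preview.
        return (rgb - rgb.min()) / (rgb.max() - rgb.min() + 1e-8)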
@@ -164,6 +204,8 @@ def diffusion_step_callback(
         latent_rgb_factors = COGVIEW4_LATENT_RGB_FACTORS
     elif base_model == BaseModelType.Flux:
         latent_rgb_factors = FLUX_LATENT_RGB_FACTORS
+    elif base_model == BaseModelType.Flux2:
+        latent_rgb_factors = FLUX2_LATENT_RGB_FACTORS
     elif base_model == BaseModelType.ZImage:
         # Z-Image uses FLUX-compatible VAE with 16 latent channels
         latent_rgb_factors = FLUX_LATENT_RGB_FACTORS
@@ -7,6 +7,7 @@ from diffusers.schedulers.scheduling_utils import SchedulerMixin
 from tqdm import tqdm

 from invokeai.backend.flux.controlnet.controlnet_flux_output import ControlNetFluxOutput, sum_controlnet_flux_outputs
+from invokeai.backend.flux.extensions.dype_extension import DyPEExtension
 from invokeai.backend.flux.extensions.instantx_controlnet_extension import InstantXControlNetExtension
 from invokeai.backend.flux.extensions.regional_prompting_extension import RegionalPromptingExtension
 from invokeai.backend.flux.extensions.xlabs_controlnet_extension import XLabsControlNetExtension
@@ -37,6 +38,8 @@ def denoise(
     # extra img tokens (sequence-wise) - for Kontext conditioning
     img_cond_seq: torch.Tensor | None = None,
     img_cond_seq_ids: torch.Tensor | None = None,
+    # DyPE extension for high-resolution generation
+    dype_extension: DyPEExtension | None = None,
     # Optional scheduler for alternative sampling methods
     scheduler: SchedulerMixin | None = None,
 ):
@@ -74,30 +77,206 @@ def denoise(
     # Store original sequence length for slicing predictions
     original_seq_len = img.shape[1]

-    # Track the actual step for user-facing progress (accounts for Heun's double steps)
-    user_step = 0
+    # DyPE: Patch model with DyPE-aware position embedder
+    dype_embedder = None
+    original_pe_embedder = None
+    if dype_extension is not None:
+        dype_embedder, original_pe_embedder = dype_extension.patch_model(model)
+
+    try:
+        # Track the actual step for user-facing progress (accounts for Heun's double steps)
+        user_step = 0
+
+        if use_scheduler:
+            # Use diffusers scheduler for stepping
+            # Use tqdm with total_steps (user-facing steps) not num_scheduler_steps (internal steps)
+            # This ensures progress bar shows 1/8, 2/8, etc. even when scheduler uses more internal steps
+            pbar = tqdm(total=total_steps, desc="Denoising")
+            for step_index in range(num_scheduler_steps):
+                timestep = scheduler.timesteps[step_index]
+                # Convert scheduler timestep (0-1000) to normalized (0-1) for the model
+                t_curr = timestep.item() / scheduler.config.num_train_timesteps
+                t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
+
+                # DyPE: Update step state for timestep-dependent scaling
+                if dype_extension is not None and dype_embedder is not None:
+                    dype_extension.update_step_state(
+                        embedder=dype_embedder,
+                        timestep=t_curr,
+                        timestep_index=user_step,
+                        total_steps=total_steps,
+                    )

-    if use_scheduler:
-        # Use diffusers scheduler for stepping
-        # Use tqdm with total_steps (user-facing steps) not num_scheduler_steps (internal steps)
-        # This ensures progress bar shows 1/8, 2/8, etc. even when scheduler uses more internal steps
-        pbar = tqdm(total=total_steps, desc="Denoising")
-        for step_index in range(num_scheduler_steps):
-            timestep = scheduler.timesteps[step_index]
-            # Convert scheduler timestep (0-1000) to normalized (0-1) for the model
-            t_curr = timestep.item() / scheduler.config.num_train_timesteps
-            t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
+                # For Heun scheduler, track if we're in first or second order step
+                is_heun = hasattr(scheduler, "state_in_first_order")
+                in_first_order = scheduler.state_in_first_order if is_heun else True
+
+                # Run ControlNet models
+                controlnet_residuals: list[ControlNetFluxOutput] = []
+                for controlnet_extension in controlnet_extensions:
+                    controlnet_residuals.append(
+                        controlnet_extension.run_controlnet(
+                            timestep_index=user_step,
+                            total_num_timesteps=total_steps,
+                            img=img,
+                            img_ids=img_ids,
+                            txt=pos_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
+                            txt_ids=pos_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
+                            y=pos_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
+                            timesteps=t_vec,
+                            guidance=guidance_vec,
+                        )
+                    )
+
+                merged_controlnet_residuals = sum_controlnet_flux_outputs(controlnet_residuals)
+
+                # Prepare input for model
+                img_input = img
+                img_input_ids = img_ids
+
+                if img_cond is not None:
+                    img_input = torch.cat((img_input, img_cond), dim=-1)
+
+                if img_cond_seq is not None:
+                    assert img_cond_seq_ids is not None
+                    img_input = torch.cat((img_input, img_cond_seq), dim=1)
+                    img_input_ids = torch.cat((img_input_ids, img_cond_seq_ids), dim=1)
+
+                pred = model(
+                    img=img_input,
+                    img_ids=img_input_ids,
+                    txt=pos_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
+                    txt_ids=pos_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
+                    y=pos_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
+                    timesteps=t_vec,
+                    guidance=guidance_vec,
+                    timestep_index=user_step,
+                    total_num_timesteps=total_steps,
+                    controlnet_double_block_residuals=merged_controlnet_residuals.double_block_residuals,
+                    controlnet_single_block_residuals=merged_controlnet_residuals.single_block_residuals,
+                    ip_adapter_extensions=pos_ip_adapter_extensions,
+                    regional_prompting_extension=pos_regional_prompting_extension,
+                )
+
+                if img_cond_seq is not None:
+                    pred = pred[:, :original_seq_len]
+
+                # Get CFG scale for current user step
+                step_cfg_scale = cfg_scale[min(user_step, len(cfg_scale) - 1)]
+
+                if not math.isclose(step_cfg_scale, 1.0):
+                    if neg_regional_prompting_extension is None:
+                        raise ValueError("Negative text conditioning is required when cfg_scale is not 1.0.")
+
+                    neg_img_input = img
+                    neg_img_input_ids = img_ids
+
+                    if img_cond is not None:
+                        neg_img_input = torch.cat((neg_img_input, img_cond), dim=-1)
+
+                    if img_cond_seq is not None:
+                        neg_img_input = torch.cat((neg_img_input, img_cond_seq), dim=1)
+                        neg_img_input_ids = torch.cat((neg_img_input_ids, img_cond_seq_ids), dim=1)
+
+                    neg_pred = model(
+                        img=neg_img_input,
+                        img_ids=neg_img_input_ids,
+                        txt=neg_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
+                        txt_ids=neg_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
+                        y=neg_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
+                        timesteps=t_vec,
+                        guidance=guidance_vec,
+                        timestep_index=user_step,
+                        total_num_timesteps=total_steps,
+                        controlnet_double_block_residuals=None,
+                        controlnet_single_block_residuals=None,
+                        ip_adapter_extensions=neg_ip_adapter_extensions,
+                        regional_prompting_extension=neg_regional_prompting_extension,
+                    )
+
+                    if img_cond_seq is not None:
+                        neg_pred = neg_pred[:, :original_seq_len]
+                    pred = neg_pred + step_cfg_scale * (pred - neg_pred)
+
+                # Use scheduler.step() for the update
+                step_output = scheduler.step(model_output=pred, timestep=timestep, sample=img)
+                img = step_output.prev_sample
+
+                # Get t_prev for inpainting (next sigma value)
+                if step_index + 1 < len(scheduler.sigmas):
+                    t_prev = scheduler.sigmas[step_index + 1].item()
+                else:
+                    t_prev = 0.0
+
+                if inpaint_extension is not None:
+                    img = inpaint_extension.merge_intermediate_latents_with_init_latents(img, t_prev)
+
+                # For Heun, only increment user step after second-order step completes
+                if is_heun:
+                    if not in_first_order:
+                        # Second order step completed
+                        user_step += 1
+                        # Only call step_callback if we haven't exceeded total_steps
+                        if user_step <= total_steps:
+                            pbar.update(1)
+                            preview_img = img - t_curr * pred
+                            if inpaint_extension is not None:
+                                preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(
+                                    preview_img, 0.0
+                                )
+                            step_callback(
+                                PipelineIntermediateState(
+                                    step=user_step,
+                                    order=2,
+                                    total_steps=total_steps,
+                                    timestep=int(t_curr * 1000),
+                                    latents=preview_img,
+                                ),
+                            )
+                else:
+                    # For LCM and other first-order schedulers
+                    user_step += 1
+                    # Only call step_callback if we haven't exceeded total_steps
+                    # (LCM scheduler may have more internal steps than user-facing steps)
+                    if user_step <= total_steps:
+                        pbar.update(1)
+                        preview_img = img - t_curr * pred
+                        if inpaint_extension is not None:
+                            preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(
+                                preview_img, 0.0
+                            )
+                        step_callback(
+                            PipelineIntermediateState(
+                                step=user_step,
+                                order=1,
+                                total_steps=total_steps,
+                                timestep=int(t_curr * 1000),
+                                latents=preview_img,
+                            ),
+                        )
+
+            pbar.close()
+            return img
+
+        # Original Euler implementation (when scheduler is None)
+        for step_index, (t_curr, t_prev) in tqdm(list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True)))):
+            # DyPE: Update step state for timestep-dependent scaling
+            if dype_extension is not None and dype_embedder is not None:
+                dype_extension.update_step_state(
+                    embedder=dype_embedder,
+                    timestep=t_curr,
+                    timestep_index=step_index,
+                    total_steps=total_steps,
+                )

-            # For Heun scheduler, track if we're in first or second order step
-            is_heun = hasattr(scheduler, "state_in_first_order")
-            in_first_order = scheduler.state_in_first_order if is_heun else True
+            t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)

-            # Run ControlNet models
+            # Run ControlNet models.
             controlnet_residuals: list[ControlNetFluxOutput] = []
             for controlnet_extension in controlnet_extensions:
                 controlnet_residuals.append(
                     controlnet_extension.run_controlnet(
-                        timestep_index=user_step,
+                        timestep_index=step_index,
                         total_num_timesteps=total_steps,
                         img=img,
                         img_ids=img_ids,
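Both code paths rely on the same two small formulas that appear in the lines above and in the Euler branch below: the model output pred is treated as a flow-matching velocity, so the step preview extrapolates straight to the clean latent, the Euler branch advances by the timestep difference, and classifier-free guidance blends the positive and negative predictions. A compact restatement of those formulas (variable names are illustrative, not the module's API):

    def euler_step(x, velocity, t_curr: float, t_prev: float):
        # img = img + (t_prev - t_curr) * pred  -- one rectified-flow Euler step
        x_next = x + (t_prev - t_curr) * velocity
        # preview_img = img - t_curr * pred  -- extrapolate to t=0 for the progress preview
        x0_preview = x - t_curr * velocity
        return x_next, x0_preview

    def apply_cfg(pos_pred, neg_pred, cfg_scale: float):
        # pred = neg_pred + step_cfg_scale * (pred - neg_pred)
        return neg_pred + cfg_scale * (pos_pred - neg_pred)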
@@ -109,17 +288,25 @@
                     )
                 )

+            # Merge the ControlNet residuals from multiple ControlNets.
+            # TODO(ryand): We may want to calculate the sum just-in-time to keep peak memory low. Keep in mind, that the
+            # controlnet_residuals datastructure is efficient in that it likely contains multiple references to the same
+            # tensors. Calculating the sum materializes each tensor into its own instance.
             merged_controlnet_residuals = sum_controlnet_flux_outputs(controlnet_residuals)

-            # Prepare input for model
+            # Prepare input for model - concatenate fresh each step
             img_input = img
             img_input_ids = img_ids

+            # Add channel-wise conditioning (for ControlNet, FLUX Fill, etc.)
             if img_cond is not None:
                 img_input = torch.cat((img_input, img_cond), dim=-1)

+            # Add sequence-wise conditioning (for Kontext)
             if img_cond_seq is not None:
-                assert img_cond_seq_ids is not None
+                assert img_cond_seq_ids is not None, (
+                    "You need to provide either both or neither of the sequence conditioning"
+                )
                 img_input = torch.cat((img_input, img_cond_seq), dim=1)
                 img_input_ids = torch.cat((img_input_ids, img_cond_seq_ids), dim=1)

@@ -131,7 +318,7 @@
                 y=pos_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
                 timesteps=t_vec,
                 guidance=guidance_vec,
-                timestep_index=user_step,
+                timestep_index=step_index,
                 total_num_timesteps=total_steps,
                 controlnet_double_block_residuals=merged_controlnet_residuals.double_block_residuals,
                 controlnet_single_block_residuals=merged_controlnet_residuals.single_block_residuals,
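The two torch.cat calls in the conditioning code above differ only in the axis they extend: channel-wise conditioning (FLUX Fill, ControlNet) widens each token along dim=-1, while sequence-wise conditioning (Kontext reference images) appends extra tokens along dim=1 and is sliced back off the prediction afterwards. A shape-only illustration (the sizes are made up for the example):

    import torch

    img = torch.zeros(1, 4096, 64)           # packed latent tokens: (batch, seq, features)
    img_cond = torch.zeros(1, 4096, 64)      # channel-wise conditioning, same sequence length
    img_cond_seq = torch.zeros(1, 1024, 64)  # sequence-wise conditioning (reference tokens)

    channel_wise = torch.cat((img, img_cond), dim=-1)      # -> (1, 4096, 128): wider tokens
    sequence_wise = torch.cat((img, img_cond_seq), dim=1)  # -> (1, 5120, 64): more tokens
    # After the model call, only the first 4096 tokens (original_seq_len) are kept.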
@@ -139,22 +326,33 @@
                 regional_prompting_extension=pos_regional_prompting_extension,
             )

+            # Slice prediction to only include the main image tokens
             if img_cond_seq is not None:
                 pred = pred[:, :original_seq_len]

-            # Get CFG scale for current user step
-            step_cfg_scale = cfg_scale[min(user_step, len(cfg_scale) - 1)]
+            step_cfg_scale = cfg_scale[step_index]

+            # If step_cfg_scale, is 1.0, then we don't need to run the negative prediction.
             if not math.isclose(step_cfg_scale, 1.0):
+                # TODO(ryand): Add option to run positive and negative predictions in a single batch for better performance
+                # on systems with sufficient VRAM.
+
                 if neg_regional_prompting_extension is None:
                     raise ValueError("Negative text conditioning is required when cfg_scale is not 1.0.")

+                # For negative prediction with Kontext, we need to include the reference images
+                # to maintain consistency between positive and negative passes. Without this,
+                # CFG would create artifacts as the attention mechanism would see different
+                # spatial structures in each pass
                 neg_img_input = img
                 neg_img_input_ids = img_ids

+                # Add channel-wise conditioning for negative pass if present
                 if img_cond is not None:
                     neg_img_input = torch.cat((neg_img_input, img_cond), dim=-1)

+                # Add sequence-wise conditioning (Kontext) for negative pass
+                # This ensures reference images are processed consistently
                 if img_cond_seq is not None:
                     neg_img_input = torch.cat((neg_img_input, img_cond_seq), dim=1)
                     neg_img_input_ids = torch.cat((neg_img_input_ids, img_cond_seq_ids), dim=1)
@@ -167,7 +365,7 @@
                     y=neg_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
                     timesteps=t_vec,
                     guidance=guidance_vec,
-                    timestep_index=user_step,
+                    timestep_index=step_index,
                     total_num_timesteps=total_steps,
                     controlnet_double_block_residuals=None,
                     controlnet_single_block_residuals=None,
@@ -175,194 +373,31 @@
                     regional_prompting_extension=neg_regional_prompting_extension,
                 )

+                # Slice negative prediction to match main image tokens
                 if img_cond_seq is not None:
                     neg_pred = neg_pred[:, :original_seq_len]
                 pred = neg_pred + step_cfg_scale * (pred - neg_pred)

-            # Use scheduler.step() for the update
-            step_output = scheduler.step(model_output=pred, timestep=timestep, sample=img)
-            img = step_output.prev_sample
-
-            # Get t_prev for inpainting (next sigma value)
-            if step_index + 1 < len(scheduler.sigmas):
-                t_prev = scheduler.sigmas[step_index + 1].item()
-            else:
-                t_prev = 0.0
+            preview_img = img - t_curr * pred
+            img = img + (t_prev - t_curr) * pred

             if inpaint_extension is not None:
                 img = inpaint_extension.merge_intermediate_latents_with_init_latents(img, t_prev)
-
-            # For Heun, only increment user step after second-order step completes
-            if is_heun:
-                if not in_first_order:
-                    # Second order step completed
-                    user_step += 1
-                    # Only call step_callback if we haven't exceeded total_steps
-                    if user_step <= total_steps:
-                        pbar.update(1)
-                        preview_img = img - t_curr * pred
-                        if inpaint_extension is not None:
-                            preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(
-                                preview_img, 0.0
-                            )
-                        step_callback(
-                            PipelineIntermediateState(
-                                step=user_step,
-                                order=2,
-                                total_steps=total_steps,
-                                timestep=int(t_curr * 1000),
-                                latents=preview_img,
-                            ),
-                        )
-                else:
-                    # For LCM and other first-order schedulers
-                    user_step += 1
-                    # Only call step_callback if we haven't exceeded total_steps
-                    # (LCM scheduler may have more internal steps than user-facing steps)
-                    if user_step <= total_steps:
-                        pbar.update(1)
-                        preview_img = img - t_curr * pred
-                        if inpaint_extension is not None:
-                            preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(preview_img, 0.0)
-                        step_callback(
-                            PipelineIntermediateState(
-                                step=user_step,
-                                order=1,
-                                total_steps=total_steps,
-                                timestep=int(t_curr * 1000),
-                                latents=preview_img,
-                            ),
-                        )
-
-        pbar.close()
-        return img
-
-    # Original Euler implementation (when scheduler is None)
-    for step_index, (t_curr, t_prev) in tqdm(list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True)))):
-        t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
-
-        # Run ControlNet models.
-        controlnet_residuals: list[ControlNetFluxOutput] = []
-        for controlnet_extension in controlnet_extensions:
-            controlnet_residuals.append(
-                controlnet_extension.run_controlnet(
-                    timestep_index=step_index,
-                    total_num_timesteps=total_steps,
-                    img=img,
-                    img_ids=img_ids,
-                    txt=pos_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
-                    txt_ids=pos_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
-                    y=pos_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
-                    timesteps=t_vec,
-                    guidance=guidance_vec,
-                )
+                preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(preview_img, 0.0)
+
+            step_callback(
+                PipelineIntermediateState(
+                    step=step_index + 1,
+                    order=1,
+                    total_steps=total_steps,
+                    timestep=int(t_curr),
+                    latents=preview_img,
+                ),
             )

-        # Merge the ControlNet residuals from multiple ControlNets.
-        # TODO(ryand): We may want to calculate the sum just-in-time to keep peak memory low. Keep in mind, that the
-        # controlnet_residuals datastructure is efficient in that it likely contains multiple references to the same
-        # tensors. Calculating the sum materializes each tensor into its own instance.
-        merged_controlnet_residuals = sum_controlnet_flux_outputs(controlnet_residuals)
-
-        # Prepare input for model - concatenate fresh each step
-        img_input = img
-        img_input_ids = img_ids
-
-        # Add channel-wise conditioning (for ControlNet, FLUX Fill, etc.)
-        if img_cond is not None:
-            img_input = torch.cat((img_input, img_cond), dim=-1)
-
-        # Add sequence-wise conditioning (for Kontext)
-        if img_cond_seq is not None:
-            assert img_cond_seq_ids is not None, (
-                "You need to provide either both or neither of the sequence conditioning"
-            )
-            img_input = torch.cat((img_input, img_cond_seq), dim=1)
-            img_input_ids = torch.cat((img_input_ids, img_cond_seq_ids), dim=1)
-
-        pred = model(
-            img=img_input,
-            img_ids=img_input_ids,
-            txt=pos_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
-            txt_ids=pos_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
-            y=pos_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
-            timesteps=t_vec,
-            guidance=guidance_vec,
-            timestep_index=step_index,
-            total_num_timesteps=total_steps,
-            controlnet_double_block_residuals=merged_controlnet_residuals.double_block_residuals,
-            controlnet_single_block_residuals=merged_controlnet_residuals.single_block_residuals,
-            ip_adapter_extensions=pos_ip_adapter_extensions,
-            regional_prompting_extension=pos_regional_prompting_extension,
-        )
-
-        # Slice prediction to only include the main image tokens
-        if img_cond_seq is not None:
-            pred = pred[:, :original_seq_len]
-
-        step_cfg_scale = cfg_scale[step_index]
-
-        # If step_cfg_scale, is 1.0, then we don't need to run the negative prediction.
-        if not math.isclose(step_cfg_scale, 1.0):
-            # TODO(ryand): Add option to run positive and negative predictions in a single batch for better performance
-            # on systems with sufficient VRAM.
-
-            if neg_regional_prompting_extension is None:
-                raise ValueError("Negative text conditioning is required when cfg_scale is not 1.0.")
-
-            # For negative prediction with Kontext, we need to include the reference images
-            # to maintain consistency between positive and negative passes. Without this,
-            # CFG would create artifacts as the attention mechanism would see different
-            # spatial structures in each pass
-            neg_img_input = img
-            neg_img_input_ids = img_ids
-
-            # Add channel-wise conditioning for negative pass if present
-            if img_cond is not None:
-                neg_img_input = torch.cat((neg_img_input, img_cond), dim=-1)
-
-            # Add sequence-wise conditioning (Kontext) for negative pass
-            # This ensures reference images are processed consistently
-            if img_cond_seq is not None:
-                neg_img_input = torch.cat((neg_img_input, img_cond_seq), dim=1)
-                neg_img_input_ids = torch.cat((neg_img_input_ids, img_cond_seq_ids), dim=1)
-
-            neg_pred = model(
-                img=neg_img_input,
-                img_ids=neg_img_input_ids,
-                txt=neg_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
-                txt_ids=neg_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
-                y=neg_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
-                timesteps=t_vec,
-                guidance=guidance_vec,
-                timestep_index=step_index,
-                total_num_timesteps=total_steps,
-                controlnet_double_block_residuals=None,
-                controlnet_single_block_residuals=None,
-                ip_adapter_extensions=neg_ip_adapter_extensions,
-                regional_prompting_extension=neg_regional_prompting_extension,
-            )
+        return img

-        # Slice negative prediction to match main image tokens
-        if img_cond_seq is not None:
-            neg_pred = neg_pred[:, :original_seq_len]
-        pred = neg_pred + step_cfg_scale * (pred - neg_pred)
-
-        preview_img = img - t_curr * pred
-        img = img + (t_prev - t_curr) * pred
-
-        if inpaint_extension is not None:
-            img = inpaint_extension.merge_intermediate_latents_with_init_latents(img, t_prev)
-            preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(preview_img, 0.0)
-
-        step_callback(
-            PipelineIntermediateState(
-                step=step_index + 1,
-                order=1,
-                total_steps=total_steps,
-                timestep=int(t_curr),
-                latents=preview_img,
-            ),
-        )
-
-    return img
+    finally:
+        # DyPE: Restore original position embedder
+        if original_pe_embedder is not None:
+            DyPEExtension.restore_model(model, original_pe_embedder)
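Taken together, these hunks follow a patch / per-step update / restore pattern for DyPE: the model's position embedder is swapped before the denoising loop, rescaled on every step, and always restored in the finally block. A condensed sketch of that lifecycle, using only the calls visible in this diff (the loop body itself is elided):

    from invokeai.backend.flux.extensions.dype_extension import DyPEExtension

    def denoise_with_dype(model, dype_extension, timesteps, total_steps):
        dype_embedder, original_pe_embedder = None, None
        if dype_extension is not None:
            # Swap the model's RoPE position embedder for a DyPE-aware one.
            dype_embedder, original_pe_embedder = dype_extension.patch_model(model)
        try:
            for step_index, t_curr in enumerate(timesteps):
                if dype_extension is not None and dype_embedder is not None:
                    # Rescale position embeddings for the current timestep.
                    dype_extension.update_step_state(
                        embedder=dype_embedder,
                        timestep=t_curr,
                        timestep_index=step_index,
                        total_steps=total_steps,
                    )
                ...  # model forward, CFG, and sampler update as in the hunks above
        finally:
            if original_pe_embedder is not None:
                # Restore the original embedder even if denoising raises.
                DyPEExtension.restore_model(model, original_pe_embedder)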
@@ -0,0 +1,18 @@
+"""Dynamic Position Extrapolation (DyPE) for FLUX models.
+
+DyPE enables high-resolution image generation (4K+) with pretrained FLUX models
+by dynamically scaling RoPE position embeddings during the denoising process.
+
+Based on: https://github.com/wildminder/ComfyUI-DyPE
+"""
+
+from invokeai.backend.flux.dype.base import DyPEConfig
+from invokeai.backend.flux.dype.embed import DyPEEmbedND
+from invokeai.backend.flux.dype.presets import DyPEPreset, get_dype_config_for_resolution
+
+__all__ = [
+    "DyPEConfig",
+    "DyPEEmbedND",
+    "DyPEPreset",
+    "get_dype_config_for_resolution",
+]
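The new package re-exports its public surface from invokeai.backend.flux.dype. The diff does not show the signatures behind these names, so the call below is only an assumed usage sketch; the resolution keywords passed to get_dype_config_for_resolution are a guess based on the helper's name:

    from invokeai.backend.flux.dype import DyPEConfig, DyPEPreset, get_dype_config_for_resolution

    # Hypothetical call shape -- only the exported names are confirmed by this diff.
    config: DyPEConfig = get_dype_config_for_resolution(width=4096, height=4096)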