diffusers 0.31.0__py3-none-any.whl → 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214)
  1. diffusers/__init__.py +66 -5
  2. diffusers/callbacks.py +56 -3
  3. diffusers/configuration_utils.py +1 -1
  4. diffusers/dependency_versions_table.py +1 -1
  5. diffusers/image_processor.py +25 -17
  6. diffusers/loaders/__init__.py +22 -3
  7. diffusers/loaders/ip_adapter.py +538 -15
  8. diffusers/loaders/lora_base.py +124 -118
  9. diffusers/loaders/lora_conversion_utils.py +318 -3
  10. diffusers/loaders/lora_pipeline.py +1688 -368
  11. diffusers/loaders/peft.py +379 -0
  12. diffusers/loaders/single_file_model.py +71 -4
  13. diffusers/loaders/single_file_utils.py +519 -9
  14. diffusers/loaders/textual_inversion.py +3 -3
  15. diffusers/loaders/transformer_flux.py +181 -0
  16. diffusers/loaders/transformer_sd3.py +89 -0
  17. diffusers/loaders/unet.py +17 -4
  18. diffusers/models/__init__.py +47 -14
  19. diffusers/models/activations.py +22 -9
  20. diffusers/models/attention.py +13 -4
  21. diffusers/models/attention_flax.py +1 -1
  22. diffusers/models/attention_processor.py +2059 -281
  23. diffusers/models/autoencoders/__init__.py +5 -0
  24. diffusers/models/autoencoders/autoencoder_dc.py +620 -0
  25. diffusers/models/autoencoders/autoencoder_kl.py +2 -1
  26. diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
  27. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +36 -27
  28. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
  29. diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
  30. diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
  31. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
  32. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  33. diffusers/models/autoencoders/vae.py +18 -5
  34. diffusers/models/controlnet.py +47 -802
  35. diffusers/models/controlnet_flux.py +29 -495
  36. diffusers/models/controlnet_sd3.py +25 -379
  37. diffusers/models/controlnet_sparsectrl.py +46 -718
  38. diffusers/models/controlnets/__init__.py +23 -0
  39. diffusers/models/controlnets/controlnet.py +872 -0
  40. diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
  41. diffusers/models/controlnets/controlnet_flux.py +536 -0
  42. diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
  43. diffusers/models/controlnets/controlnet_sd3.py +489 -0
  44. diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
  45. diffusers/models/controlnets/controlnet_union.py +832 -0
  46. diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
  47. diffusers/models/controlnets/multicontrolnet.py +183 -0
  48. diffusers/models/embeddings.py +838 -43
  49. diffusers/models/model_loading_utils.py +88 -6
  50. diffusers/models/modeling_flax_utils.py +1 -1
  51. diffusers/models/modeling_utils.py +74 -28
  52. diffusers/models/normalization.py +78 -13
  53. diffusers/models/transformers/__init__.py +5 -0
  54. diffusers/models/transformers/auraflow_transformer_2d.py +2 -2
  55. diffusers/models/transformers/cogvideox_transformer_3d.py +46 -11
  56. diffusers/models/transformers/dit_transformer_2d.py +1 -1
  57. diffusers/models/transformers/latte_transformer_3d.py +4 -4
  58. diffusers/models/transformers/pixart_transformer_2d.py +1 -1
  59. diffusers/models/transformers/sana_transformer.py +488 -0
  60. diffusers/models/transformers/stable_audio_transformer.py +1 -1
  61. diffusers/models/transformers/transformer_2d.py +1 -1
  62. diffusers/models/transformers/transformer_allegro.py +422 -0
  63. diffusers/models/transformers/transformer_cogview3plus.py +1 -1
  64. diffusers/models/transformers/transformer_flux.py +30 -9
  65. diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
  66. diffusers/models/transformers/transformer_ltx.py +469 -0
  67. diffusers/models/transformers/transformer_mochi.py +499 -0
  68. diffusers/models/transformers/transformer_sd3.py +105 -17
  69. diffusers/models/transformers/transformer_temporal.py +1 -1
  70. diffusers/models/unets/unet_1d_blocks.py +1 -1
  71. diffusers/models/unets/unet_2d.py +8 -1
  72. diffusers/models/unets/unet_2d_blocks.py +88 -21
  73. diffusers/models/unets/unet_2d_condition.py +1 -1
  74. diffusers/models/unets/unet_3d_blocks.py +9 -7
  75. diffusers/models/unets/unet_motion_model.py +5 -5
  76. diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
  77. diffusers/models/unets/unet_stable_cascade.py +2 -2
  78. diffusers/models/unets/uvit_2d.py +1 -1
  79. diffusers/models/upsampling.py +8 -0
  80. diffusers/pipelines/__init__.py +34 -0
  81. diffusers/pipelines/allegro/__init__.py +48 -0
  82. diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
  83. diffusers/pipelines/allegro/pipeline_output.py +23 -0
  84. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +8 -2
  85. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1 -1
  86. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +0 -6
  87. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +8 -8
  88. diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
  89. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -8
  90. diffusers/pipelines/auto_pipeline.py +53 -6
  91. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  92. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +50 -22
  93. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +51 -20
  94. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +69 -21
  95. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +47 -21
  96. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +1 -1
  97. diffusers/pipelines/controlnet/__init__.py +86 -80
  98. diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
  99. diffusers/pipelines/controlnet/pipeline_controlnet.py +11 -2
  100. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +1 -2
  101. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +1 -2
  102. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +1 -2
  103. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +3 -3
  104. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -3
  105. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
  106. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
  107. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
  108. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +5 -1
  109. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +53 -19
  110. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
  111. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -8
  112. diffusers/pipelines/flux/__init__.py +13 -1
  113. diffusers/pipelines/flux/modeling_flux.py +47 -0
  114. diffusers/pipelines/flux/pipeline_flux.py +204 -29
  115. diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
  116. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
  117. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
  118. diffusers/pipelines/flux/pipeline_flux_controlnet.py +49 -27
  119. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +40 -30
  120. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +78 -56
  121. diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
  122. diffusers/pipelines/flux/pipeline_flux_img2img.py +33 -27
  123. diffusers/pipelines/flux/pipeline_flux_inpaint.py +36 -29
  124. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
  125. diffusers/pipelines/flux/pipeline_output.py +16 -0
  126. diffusers/pipelines/hunyuan_video/__init__.py +48 -0
  127. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
  128. diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
  129. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +5 -1
  130. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
  131. diffusers/pipelines/kolors/text_encoder.py +2 -2
  132. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  133. diffusers/pipelines/ltx/__init__.py +50 -0
  134. diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
  135. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
  136. diffusers/pipelines/ltx/pipeline_output.py +20 -0
  137. diffusers/pipelines/lumina/pipeline_lumina.py +1 -8
  138. diffusers/pipelines/mochi/__init__.py +48 -0
  139. diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
  140. diffusers/pipelines/mochi/pipeline_output.py +20 -0
  141. diffusers/pipelines/pag/__init__.py +7 -0
  142. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -2
  143. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1 -2
  144. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1 -3
  145. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1 -3
  146. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +5 -1
  147. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +6 -13
  148. diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
  149. diffusers/pipelines/pag/pipeline_pag_sd_3.py +6 -6
  150. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
  151. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +3 -0
  152. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
  153. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  154. diffusers/pipelines/pipeline_loading_utils.py +25 -4
  155. diffusers/pipelines/pipeline_utils.py +35 -6
  156. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +6 -13
  157. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +6 -13
  158. diffusers/pipelines/sana/__init__.py +47 -0
  159. diffusers/pipelines/sana/pipeline_output.py +21 -0
  160. diffusers/pipelines/sana/pipeline_sana.py +884 -0
  161. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -3
  163. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +216 -20
  164. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +62 -9
  165. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +57 -8
  166. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
  167. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +0 -8
  168. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +0 -8
  169. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +0 -8
  170. diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
  171. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  172. diffusers/quantizers/auto.py +14 -1
  173. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -1
  174. diffusers/quantizers/gguf/__init__.py +1 -0
  175. diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
  176. diffusers/quantizers/gguf/utils.py +456 -0
  177. diffusers/quantizers/quantization_config.py +280 -2
  178. diffusers/quantizers/torchao/__init__.py +15 -0
  179. diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
  180. diffusers/schedulers/scheduling_ddpm.py +2 -6
  181. diffusers/schedulers/scheduling_ddpm_parallel.py +2 -6
  182. diffusers/schedulers/scheduling_deis_multistep.py +28 -9
  183. diffusers/schedulers/scheduling_dpmsolver_multistep.py +35 -9
  184. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +35 -8
  185. diffusers/schedulers/scheduling_dpmsolver_sde.py +4 -4
  186. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +48 -10
  187. diffusers/schedulers/scheduling_euler_discrete.py +4 -4
  188. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
  189. diffusers/schedulers/scheduling_heun_discrete.py +4 -4
  190. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +4 -4
  191. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +4 -4
  192. diffusers/schedulers/scheduling_lcm.py +2 -6
  193. diffusers/schedulers/scheduling_lms_discrete.py +4 -4
  194. diffusers/schedulers/scheduling_repaint.py +1 -1
  195. diffusers/schedulers/scheduling_sasolver.py +28 -9
  196. diffusers/schedulers/scheduling_tcd.py +2 -6
  197. diffusers/schedulers/scheduling_unipc_multistep.py +53 -8
  198. diffusers/training_utils.py +16 -2
  199. diffusers/utils/__init__.py +5 -0
  200. diffusers/utils/constants.py +1 -0
  201. diffusers/utils/dummy_pt_objects.py +180 -0
  202. diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
  203. diffusers/utils/dynamic_modules_utils.py +3 -3
  204. diffusers/utils/hub_utils.py +31 -39
  205. diffusers/utils/import_utils.py +67 -0
  206. diffusers/utils/peft_utils.py +3 -0
  207. diffusers/utils/testing_utils.py +56 -1
  208. diffusers/utils/torch_utils.py +3 -0
  209. {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/METADATA +69 -69
  210. {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/RECORD +214 -162
  211. {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
  212. {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
  213. {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
  214. {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/flux/pipeline_flux_img2img.py

@@ -20,7 +20,7 @@ import torch
 from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FluxLoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import FluxTransformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
@@ -159,7 +159,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
+class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
     r"""
     The Flux pipeline for image inpainting.
 
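Note: adding `FromSingleFileMixin` is what gives the pipeline a `from_single_file` constructor for loading monolithic checkpoints. A minimal sketch of what this enables (the checkpoint path is illustrative, not part of this diff):

    import torch
    from diffusers import FluxImg2ImgPipeline

    # Hypothetical local checkpoint; any Flux-format single-file .safetensors should work here.
    pipe = FluxImg2ImgPipeline.from_single_file(
        "flux1-dev.safetensors",
        torch_dtype=torch.bfloat16,
    )
    pipe.to("cuda")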
@@ -212,13 +212,15 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
             scheduler=scheduler,
         )
         self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
-        self.default_sample_size = 64
+        self.default_sample_size = 128
 
     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
     def _get_t5_prompt_embeds(
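Why both edits above go together: the Flux VAE config has four `block_out_channels` entries but only three downsampling stages, so `2 ** len(...)` overstated the compression as 16; the true pixel-to-latent factor is 8, and the extra ×2 on the processor re-encodes the 2×2 patch packing. A quick check of the arithmetic (the `block_out_channels` tuple is the Flux VAE default, assumed here):

    block_out_channels = (128, 256, 512, 512)  # Flux AutoencoderKL default

    old_scale = 2 ** len(block_out_channels)        # 16 -- overcounts by one stage
    new_scale = 2 ** (len(block_out_channels) - 1)  # 8  -- actual VAE compression

    # The image processor now rounds inputs to multiples of new_scale * 2 == 16,
    # since packed latents consume 2x2 latent patches. default_sample_size = 128
    # likewise means 128 latent pixels, i.e. a 1024x1024 default image.
    assert new_scale * 2 == 16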
@@ -437,8 +439,10 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
 
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
+            )
 
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
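The hard error becomes a soft warning because the image processor can now fix up the size itself. Roughly what happens for an off-grid request (a sketch of the rounding, not the pipeline's exact resize code):

    vae_scale_factor = 8
    multiple = vae_scale_factor * 2  # 16

    height, width = 1000, 768
    if height % multiple != 0 or width % multiple != 0:
        # mirrors the new logger.warning branch: dimensions are floored to the grid
        height = (height // multiple) * multiple  # 992
        width = (width // multiple) * multiple    # 768, already valid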
@@ -477,9 +481,9 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
     @staticmethod
     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
     def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
-        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
-        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
-        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+        latent_image_ids = torch.zeros(height, width, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
 
         latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
 
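The halving moves out of `_prepare_latent_image_ids`: callers now pass the patch-grid size directly (see the `prepare_latents` hunk below), and the helper emits one (0, row, col) position id per packed patch. A shape sketch for a 1024×1024 image, whose 128×128 latent packs into a 64×64 patch grid:

    import torch

    height = width = 64  # patch grid: 1024 px / 8 (VAE) / 2 (patch)
    ids = torch.zeros(height, width, 3)
    ids[..., 1] += torch.arange(height)[:, None]  # row index per patch
    ids[..., 2] += torch.arange(width)[None, :]   # column index per patch
    ids = ids.reshape(height * width, 3)          # 4096 rows, one per image token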
@@ -503,13 +507,15 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape
 
-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
 
-        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)
 
-        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+        latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
 
         return latents
 
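`_pack_latents` (unchanged, so not in this hunk) is the inverse operation: each 2×2 latent neighborhood folds into the channel axis, quadrupling channels and quartering the token count. A shape-only round trip under assumed Flux dimensions (16 latent channels, 1024×1024 image):

    import torch

    b, c, h, w = 1, 16, 128, 128  # latent grid for a 1024x1024 image
    x = torch.randn(b, c, h, w)

    # pack: (b, c, h, w) -> (b, h/2 * w/2, 4c), mirroring _pack_latents
    packed = (
        x.view(b, c, h // 2, 2, w // 2, 2)
        .permute(0, 2, 4, 1, 3, 5)
        .reshape(b, (h // 2) * (w // 2), c * 4)
    )

    # unpack: (b, h/2 * w/2, 4c) -> (b, c, h, w), mirroring the patched _unpack_latents
    out = (
        packed.view(b, h // 2, w // 2, c, 2, 2)
        .permute(0, 3, 1, 4, 2, 5)
        .reshape(b, c, h, w)
    )
    assert torch.equal(x, out)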
@@ -532,11 +538,12 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )
 
-        height = 2 * (int(height) // self.vae_scale_factor)
-        width = 2 * (int(width) // self.vae_scale_factor)
-
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         shape = (batch_size, num_channels_latents, height, width)
-        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
 
         if latents is not None:
             return latents.to(device=device, dtype=dtype), latent_image_ids
@@ -586,7 +593,7 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         width: Optional[int] = None,
         strength: float = 0.6,
         num_inference_steps: int = 28,
-        timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         guidance_scale: float = 7.0,
         num_images_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
@@ -629,10 +636,10 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
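From the caller's side, the schedule is now specified in sigma space rather than as integer timesteps. A hypothetical call continuing from a loaded pipeline (`pipe` and `init_image` are assumed from earlier setup; the sigma values just mimic the default schedule for four steps):

    import numpy as np

    custom_sigmas = np.linspace(1.0, 1 / 4, 4).tolist()  # [1.0, 0.75, 0.5, 0.25]

    image = pipe(
        prompt="a misty forest at dawn",
        image=init_image,
        strength=0.6,
        num_inference_steps=4,
        sigmas=custom_sigmas,
    ).images[0]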
@@ -735,8 +742,8 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         )
 
         # 4.Prepare timesteps
-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
-        image_seq_len = (int(height) // self.vae_scale_factor) * (int(width) // self.vae_scale_factor)
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
         mu = calculate_shift(
             image_seq_len,
             self.scheduler.config.base_image_seq_len,
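`image_seq_len` now counts packed tokens (one per 2×2 latent patch) instead of raw latent pixels, and that count drives the dynamic flow-matching shift. Worked numbers for 1024×1024, using values assumed from the FLUX.1-dev scheduler config (base_image_seq_len=256, max_image_seq_len=4096, base_shift=0.5, max_shift=1.15), not from this diff:

    height = width = 1024
    vae_scale_factor = 8

    # one token per 2x2 latent patch: 64 * 64 = 4096
    image_seq_len = (height // vae_scale_factor // 2) * (width // vae_scale_factor // 2)

    # calculate_shift interpolates mu linearly between the two anchor points
    base_seq, max_seq, base_shift, max_shift = 256, 4096, 0.5, 1.15
    m = (max_shift - base_shift) / (max_seq - base_seq)
    mu = image_seq_len * m + (base_shift - m * base_seq)  # 1.15 at the 4096-token maximum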
@@ -748,8 +755,7 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
             self.scheduler,
             num_inference_steps,
             device,
-            timesteps,
-            sigmas,
+            sigmas=sigmas,
             mu=mu,
         )
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
diffusers/pipelines/flux/pipeline_flux_inpaint.py

@@ -209,11 +209,13 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
             scheduler=scheduler,
         )
         self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.mask_processor = VaeImageProcessor(
-            vae_scale_factor=self.vae_scale_factor,
+            vae_scale_factor=self.vae_scale_factor * 2,
             vae_latent_channels=self.vae.config.latent_channels,
             do_normalize=False,
             do_binarize=True,
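The mask processor follows the same ×2 rule but binarizes instead of normalizing, so masks stay in {0, 1} on the same 16-pixel grid as the image (the constructor continues past this hunk). A standalone sketch with the arguments spelled out as literals (16 latent channels is the Flux VAE value, assumed here):

    from PIL import Image
    from diffusers.image_processor import VaeImageProcessor

    mask_processor = VaeImageProcessor(
        vae_scale_factor=16,     # 8 (VAE) * 2 (patch packing)
        vae_latent_channels=16,  # Flux AutoencoderKL latent channels
        do_normalize=False,      # keep values in [0, 1] instead of [-1, 1]
        do_binarize=True,        # threshold to a hard 0/1 mask
    )
    mask = mask_processor.preprocess(Image.open("mask.png"), height=1024, width=1024)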
@@ -222,7 +224,7 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
-        self.default_sample_size = 64
+        self.default_sample_size = 128
 
     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
     def _get_t5_prompt_embeds(
@@ -445,8 +447,10 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
 
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
+            )
 
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -498,9 +502,9 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
     @staticmethod
     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
     def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
-        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
-        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
-        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+        latent_image_ids = torch.zeros(height, width, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
 
         latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
 
@@ -524,13 +528,15 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape
 
-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
 
-        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)
 
-        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+        latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
 
         return latents
 
@@ -553,11 +559,12 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )
 
-        height = 2 * (int(height) // self.vae_scale_factor)
-        width = 2 * (int(width) // self.vae_scale_factor)
-
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         shape = (batch_size, num_channels_latents, height, width)
-        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
 
         image = image.to(device=device, dtype=dtype)
         image_latents = self._encode_vae_image(image=image, generator=generator)
@@ -598,8 +605,10 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         device,
         generator,
     ):
-        height = 2 * (int(height) // self.vae_scale_factor)
-        width = 2 * (int(width) // self.vae_scale_factor)
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         # resize the mask to latents shape as we concatenate the mask to the latents
         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
         # and half precision
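Since the mask is later packed and concatenated with the image latents, it is first resized to the pre-packing latent grid computed above. A shape sketch under the same 1024×1024 assumption (torch interpolation stands in for the pipeline's resize; a sketch, not the exact code):

    import torch
    import torch.nn.functional as F

    mask = torch.rand(1, 1, 1024, 1024)       # binarized mask in pixel space
    latent_h = 2 * (1024 // (8 * 2))           # 128
    latent_w = 2 * (1024 // (8 * 2))           # 128
    mask = F.interpolate(mask, size=(latent_h, latent_w))  # (1, 1, 128, 128)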
@@ -637,7 +646,6 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
 
         # aligning device to prevent device errors when concating it with the latent model input
         masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
-
         masked_image_latents = self._pack_latents(
             masked_image_latents,
             batch_size,
@@ -685,7 +693,7 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         padding_mask_crop: Optional[int] = None,
         strength: float = 0.6,
         num_inference_steps: int = 28,
-        timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         guidance_scale: float = 7.0,
         num_images_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
@@ -745,10 +753,10 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -865,8 +873,8 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         )
 
         # 4.Prepare timesteps
-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
-        image_seq_len = (int(height) // self.vae_scale_factor) * (int(width) // self.vae_scale_factor)
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
         mu = calculate_shift(
             image_seq_len,
             self.scheduler.config.base_image_seq_len,
@@ -878,8 +886,7 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
             self.scheduler,
             num_inference_steps,
             device,
-            timesteps,
-            sigmas,
+            sigmas=sigmas,
             mu=mu,
         )
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)