diffusers 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (389)
  1. diffusers/__init__.py +186 -3
  2. diffusers/configuration_utils.py +40 -12
  3. diffusers/dependency_versions_table.py +9 -2
  4. diffusers/hooks/__init__.py +9 -0
  5. diffusers/hooks/faster_cache.py +653 -0
  6. diffusers/hooks/group_offloading.py +793 -0
  7. diffusers/hooks/hooks.py +236 -0
  8. diffusers/hooks/layerwise_casting.py +245 -0
  9. diffusers/hooks/pyramid_attention_broadcast.py +311 -0
  10. diffusers/loaders/__init__.py +6 -0
  11. diffusers/loaders/ip_adapter.py +38 -30
  12. diffusers/loaders/lora_base.py +198 -28
  13. diffusers/loaders/lora_conversion_utils.py +679 -44
  14. diffusers/loaders/lora_pipeline.py +1963 -801
  15. diffusers/loaders/peft.py +169 -84
  16. diffusers/loaders/single_file.py +17 -2
  17. diffusers/loaders/single_file_model.py +53 -5
  18. diffusers/loaders/single_file_utils.py +653 -75
  19. diffusers/loaders/textual_inversion.py +9 -9
  20. diffusers/loaders/transformer_flux.py +8 -9
  21. diffusers/loaders/transformer_sd3.py +120 -39
  22. diffusers/loaders/unet.py +22 -32
  23. diffusers/models/__init__.py +22 -0
  24. diffusers/models/activations.py +9 -9
  25. diffusers/models/attention.py +0 -1
  26. diffusers/models/attention_processor.py +163 -25
  27. diffusers/models/auto_model.py +169 -0
  28. diffusers/models/autoencoders/__init__.py +2 -0
  29. diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
  30. diffusers/models/autoencoders/autoencoder_dc.py +106 -4
  31. diffusers/models/autoencoders/autoencoder_kl.py +0 -4
  32. diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
  33. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
  34. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
  35. diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
  36. diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
  37. diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
  38. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
  39. diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
  40. diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
  41. diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
  42. diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
  43. diffusers/models/autoencoders/vae.py +31 -141
  44. diffusers/models/autoencoders/vq_model.py +3 -0
  45. diffusers/models/cache_utils.py +108 -0
  46. diffusers/models/controlnets/__init__.py +1 -0
  47. diffusers/models/controlnets/controlnet.py +3 -8
  48. diffusers/models/controlnets/controlnet_flux.py +14 -42
  49. diffusers/models/controlnets/controlnet_sd3.py +58 -34
  50. diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
  51. diffusers/models/controlnets/controlnet_union.py +27 -18
  52. diffusers/models/controlnets/controlnet_xs.py +7 -46
  53. diffusers/models/controlnets/multicontrolnet_union.py +196 -0
  54. diffusers/models/embeddings.py +18 -7
  55. diffusers/models/model_loading_utils.py +122 -80
  56. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  57. diffusers/models/modeling_flax_utils.py +1 -1
  58. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  59. diffusers/models/modeling_utils.py +617 -272
  60. diffusers/models/normalization.py +67 -14
  61. diffusers/models/resnet.py +1 -1
  62. diffusers/models/transformers/__init__.py +6 -0
  63. diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
  64. diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
  65. diffusers/models/transformers/consisid_transformer_3d.py +789 -0
  66. diffusers/models/transformers/dit_transformer_2d.py +5 -19
  67. diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
  68. diffusers/models/transformers/latte_transformer_3d.py +20 -15
  69. diffusers/models/transformers/lumina_nextdit2d.py +3 -1
  70. diffusers/models/transformers/pixart_transformer_2d.py +4 -19
  71. diffusers/models/transformers/prior_transformer.py +5 -1
  72. diffusers/models/transformers/sana_transformer.py +144 -40
  73. diffusers/models/transformers/stable_audio_transformer.py +5 -20
  74. diffusers/models/transformers/transformer_2d.py +7 -22
  75. diffusers/models/transformers/transformer_allegro.py +9 -17
  76. diffusers/models/transformers/transformer_cogview3plus.py +6 -17
  77. diffusers/models/transformers/transformer_cogview4.py +462 -0
  78. diffusers/models/transformers/transformer_easyanimate.py +527 -0
  79. diffusers/models/transformers/transformer_flux.py +68 -110
  80. diffusers/models/transformers/transformer_hunyuan_video.py +409 -49
  81. diffusers/models/transformers/transformer_ltx.py +53 -35
  82. diffusers/models/transformers/transformer_lumina2.py +548 -0
  83. diffusers/models/transformers/transformer_mochi.py +6 -17
  84. diffusers/models/transformers/transformer_omnigen.py +469 -0
  85. diffusers/models/transformers/transformer_sd3.py +56 -86
  86. diffusers/models/transformers/transformer_temporal.py +5 -11
  87. diffusers/models/transformers/transformer_wan.py +469 -0
  88. diffusers/models/unets/unet_1d.py +3 -1
  89. diffusers/models/unets/unet_2d.py +21 -20
  90. diffusers/models/unets/unet_2d_blocks.py +19 -243
  91. diffusers/models/unets/unet_2d_condition.py +4 -6
  92. diffusers/models/unets/unet_3d_blocks.py +14 -127
  93. diffusers/models/unets/unet_3d_condition.py +8 -12
  94. diffusers/models/unets/unet_i2vgen_xl.py +5 -13
  95. diffusers/models/unets/unet_kandinsky3.py +0 -4
  96. diffusers/models/unets/unet_motion_model.py +20 -114
  97. diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
  98. diffusers/models/unets/unet_stable_cascade.py +8 -35
  99. diffusers/models/unets/uvit_2d.py +1 -4
  100. diffusers/optimization.py +2 -2
  101. diffusers/pipelines/__init__.py +57 -8
  102. diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
  103. diffusers/pipelines/amused/pipeline_amused.py +15 -2
  104. diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
  105. diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
  106. diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
  107. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
  108. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
  109. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
  110. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
  111. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
  112. diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
  113. diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
  114. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
  115. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
  116. diffusers/pipelines/auto_pipeline.py +35 -14
  117. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  118. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
  119. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
  120. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
  121. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
  122. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
  123. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
  124. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
  125. diffusers/pipelines/cogview4/__init__.py +49 -0
  126. diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
  127. diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
  128. diffusers/pipelines/cogview4/pipeline_output.py +21 -0
  129. diffusers/pipelines/consisid/__init__.py +49 -0
  130. diffusers/pipelines/consisid/consisid_utils.py +357 -0
  131. diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
  132. diffusers/pipelines/consisid/pipeline_output.py +20 -0
  133. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
  134. diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
  135. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
  136. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
  137. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
  138. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
  139. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
  140. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
  141. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
  142. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
  143. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
  144. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
  145. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
  146. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
  147. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
  148. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
  149. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
  150. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
  151. diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
  152. diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
  153. diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
  154. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
  155. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
  156. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
  157. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
  158. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
  159. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
  160. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
  161. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  162. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
  163. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
  164. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
  165. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
  166. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
  167. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
  168. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
  169. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
  170. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  171. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  172. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
  173. diffusers/pipelines/dit/pipeline_dit.py +15 -2
  174. diffusers/pipelines/easyanimate/__init__.py +52 -0
  175. diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
  176. diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
  177. diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
  178. diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
  179. diffusers/pipelines/flux/pipeline_flux.py +53 -21
  180. diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
  181. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
  182. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
  183. diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
  184. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
  185. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
  186. diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
  187. diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
  188. diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
  189. diffusers/pipelines/free_noise_utils.py +3 -3
  190. diffusers/pipelines/hunyuan_video/__init__.py +4 -0
  191. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
  192. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
  193. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
  194. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
  195. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
  196. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
  197. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  198. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
  199. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
  200. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
  201. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
  202. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
  203. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
  204. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
  205. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
  206. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
  207. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
  208. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
  209. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
  210. diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
  211. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
  212. diffusers/pipelines/kolors/text_encoder.py +7 -34
  213. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
  214. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
  215. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
  216. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
  217. diffusers/pipelines/latte/pipeline_latte.py +36 -7
  218. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
  219. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
  220. diffusers/pipelines/ltx/__init__.py +2 -0
  221. diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
  222. diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
  223. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
  224. diffusers/pipelines/lumina/__init__.py +2 -2
  225. diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
  226. diffusers/pipelines/lumina2/__init__.py +48 -0
  227. diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
  228. diffusers/pipelines/marigold/__init__.py +2 -0
  229. diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
  230. diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
  231. diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
  232. diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
  233. diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
  234. diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
  235. diffusers/pipelines/omnigen/__init__.py +50 -0
  236. diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
  237. diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
  238. diffusers/pipelines/onnx_utils.py +5 -3
  239. diffusers/pipelines/pag/pag_utils.py +1 -1
  240. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
  241. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
  242. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
  243. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
  244. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
  245. diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
  246. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
  247. diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
  248. diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
  249. diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
  250. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
  251. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
  252. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
  253. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
  254. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
  255. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
  256. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
  257. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
  258. diffusers/pipelines/pia/pipeline_pia.py +13 -1
  259. diffusers/pipelines/pipeline_flax_utils.py +7 -7
  260. diffusers/pipelines/pipeline_loading_utils.py +193 -83
  261. diffusers/pipelines/pipeline_utils.py +221 -106
  262. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
  263. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
  264. diffusers/pipelines/sana/__init__.py +2 -0
  265. diffusers/pipelines/sana/pipeline_sana.py +183 -58
  266. diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
  267. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
  268. diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
  269. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
  270. diffusers/pipelines/shap_e/renderer.py +6 -6
  271. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
  272. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
  273. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
  274. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
  275. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
  276. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
  277. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
  278. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
  279. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  280. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
  281. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
  282. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
  283. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
  284. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
  285. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
  286. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
  287. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
  288. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
  289. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
  290. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
  291. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
  292. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
  293. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
  294. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
  295. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
  296. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
  297. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
  298. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
  299. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
  300. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
  301. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
  302. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
  303. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
  304. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
  305. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
  306. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  307. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
  308. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
  309. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
  310. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
  311. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
  312. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
  313. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
  314. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
  315. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
  316. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
  317. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
  318. diffusers/pipelines/transformers_loading_utils.py +121 -0
  319. diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
  320. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
  321. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
  322. diffusers/pipelines/wan/__init__.py +51 -0
  323. diffusers/pipelines/wan/pipeline_output.py +20 -0
  324. diffusers/pipelines/wan/pipeline_wan.py +593 -0
  325. diffusers/pipelines/wan/pipeline_wan_i2v.py +722 -0
  326. diffusers/pipelines/wan/pipeline_wan_video2video.py +725 -0
  327. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
  328. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
  329. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
  330. diffusers/quantizers/auto.py +5 -1
  331. diffusers/quantizers/base.py +5 -9
  332. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
  333. diffusers/quantizers/bitsandbytes/utils.py +30 -20
  334. diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
  335. diffusers/quantizers/gguf/utils.py +4 -2
  336. diffusers/quantizers/quantization_config.py +59 -4
  337. diffusers/quantizers/quanto/__init__.py +1 -0
  338. diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
  339. diffusers/quantizers/quanto/utils.py +60 -0
  340. diffusers/quantizers/torchao/__init__.py +1 -1
  341. diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
  342. diffusers/schedulers/__init__.py +2 -1
  343. diffusers/schedulers/scheduling_consistency_models.py +1 -2
  344. diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
  345. diffusers/schedulers/scheduling_ddpm.py +2 -3
  346. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
  347. diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
  348. diffusers/schedulers/scheduling_edm_euler.py +45 -10
  349. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
  350. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
  351. diffusers/schedulers/scheduling_heun_discrete.py +1 -1
  352. diffusers/schedulers/scheduling_lcm.py +1 -2
  353. diffusers/schedulers/scheduling_lms_discrete.py +1 -1
  354. diffusers/schedulers/scheduling_repaint.py +5 -1
  355. diffusers/schedulers/scheduling_scm.py +265 -0
  356. diffusers/schedulers/scheduling_tcd.py +1 -2
  357. diffusers/schedulers/scheduling_utils.py +2 -1
  358. diffusers/training_utils.py +14 -7
  359. diffusers/utils/__init__.py +10 -2
  360. diffusers/utils/constants.py +13 -1
  361. diffusers/utils/deprecation_utils.py +1 -1
  362. diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
  363. diffusers/utils/dummy_gguf_objects.py +17 -0
  364. diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
  365. diffusers/utils/dummy_pt_objects.py +233 -0
  366. diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
  367. diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
  368. diffusers/utils/dummy_torchao_objects.py +17 -0
  369. diffusers/utils/dynamic_modules_utils.py +1 -1
  370. diffusers/utils/export_utils.py +28 -3
  371. diffusers/utils/hub_utils.py +52 -102
  372. diffusers/utils/import_utils.py +121 -221
  373. diffusers/utils/loading_utils.py +14 -1
  374. diffusers/utils/logging.py +1 -2
  375. diffusers/utils/peft_utils.py +6 -14
  376. diffusers/utils/remote_utils.py +425 -0
  377. diffusers/utils/source_code_parsing_utils.py +52 -0
  378. diffusers/utils/state_dict_utils.py +15 -1
  379. diffusers/utils/testing_utils.py +243 -13
  380. diffusers/utils/torch_utils.py +10 -0
  381. diffusers/utils/typing_utils.py +91 -0
  382. diffusers/video_processor.py +1 -1
  383. {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/METADATA +76 -44
  384. diffusers-0.33.0.dist-info/RECORD +608 -0
  385. {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/WHEEL +1 -1
  386. diffusers-0.32.1.dist-info/RECORD +0 -550
  387. {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/LICENSE +0 -0
  388. {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/entry_points.txt +0 -0
  389. {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/top_level.txt +0 -0
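
Beyond the Flux pipeline changes excerpted below, the file list shows this release's headline additions: a new diffusers/hooks module (group offloading, layerwise casting, pyramid attention broadcast, FasterCache), new model families (Wan, Lumina 2, OmniGen, CogView4, ConsisID, EasyAnimate, SANA Sprint, SkyReels/HunyuanVideo image-to-video), an AutoModel loader, a Quanto quantization backend, and remote VAE utilities. As one illustration, a minimal sketch of the new layerwise-casting hook applied to a Flux transformer; the method and keyword names follow the 0.33 API as I recall it, so treat the exact signature as an assumption:

    import torch
    from diffusers import FluxPipeline

    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
    )
    # Store transformer weights in fp8 and upcast them to bf16 on the fly
    # during each forward pass, roughly halving the transformer's memory.
    pipe.transformer.enable_layerwise_casting(
        storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16
    )
    pipe.to("cuda")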
@@ -82,7 +82,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.16,
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
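
This corrects the default `max_shift` to the value Flux scheduler configs actually ship with (1.15). For reference, `calculate_shift` interpolates the flow-matching shift parameter `mu` linearly in the latent token count; a sketch reconstructed from the hunk above (the final `return` is implied by the definitions of `m` and `b`):

    def calculate_shift(
        image_seq_len: int,
        base_seq_len: int = 256,
        max_seq_len: int = 4096,
        base_shift: float = 0.5,
        max_shift: float = 1.15,
    ) -> float:
        # mu rises linearly from base_shift at 256 tokens to max_shift at 4096 tokens
        m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
        b = base_shift - m * base_seq_len
        return image_seq_len * m + b

    # A 1024x1024 image packs into (1024 // 8 // 2) ** 2 = 4096 latent tokens,
    # so mu hits the ceiling: calculate_shift(4096) == 1.15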
@@ -221,15 +221,16 @@ class FluxFillPipeline(
             transformer=transformer,
             scheduler=scheduler,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
+        self.latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor * 2, vae_latent_channels=self.latent_channels
+        )
         self.mask_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor * 2,
-            vae_latent_channels=self.vae.config.latent_channels,
+            vae_latent_channels=self.latent_channels,
             do_normalize=False,
             do_binarize=True,
             do_convert_grayscale=True,
@@ -494,10 +495,38 @@ class FluxFillPipeline(
 
         return prompt_embeds, pooled_prompt_embeds, text_ids
 
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        return image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
     def check_inputs(
         self,
         prompt,
         prompt_2,
+        strength,
         height,
         width,
         prompt_embeds=None,
@@ -508,6 +537,9 @@ class FluxFillPipeline(
         mask_image=None,
         masked_image_latents=None,
     ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
         if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
             logger.warning(
                 f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
@@ -625,9 +657,11 @@ class FluxFillPipeline(
         """
         self.vae.disable_tiling()
 
-    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_latents
+    # Copied from diffusers.pipelines.flux.pipeline_flux_img2img.FluxImg2ImgPipeline.prepare_latents
     def prepare_latents(
         self,
+        image,
+        timestep,
         batch_size,
         num_channels_latents,
         height,
@@ -637,28 +671,41 @@ class FluxFillPipeline(
         generator,
         latents=None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         # VAE applies 8x compression on images but we must also account for packing which requires
         # latent height and width to be divisible by 2.
         height = 2 * (int(height) // (self.vae_scale_factor * 2))
         width = 2 * (int(width) // (self.vae_scale_factor * 2))
-
         shape = (batch_size, num_channels_latents, height, width)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
 
         if latents is not None:
-            latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
             return latents.to(device=device, dtype=dtype), latent_image_ids
 
-        if isinstance(generator, list) and len(generator) != batch_size:
+        image = image.to(device=device, dtype=dtype)
+        if image.shape[1] != self.latent_channels:
+            image_latents = self._encode_vae_image(image=image, generator=generator)
+        else:
+            image_latents = image
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
             raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
             )
+        else:
+            image_latents = torch.cat([image_latents], dim=0)
 
-        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        latents = self.scheduler.scale_noise(image_latents, timestep, noise)
         latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
-
-        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
-
         return latents, latent_image_ids
 
     @property
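
The key behavioral change is in the last lines of this hunk: instead of starting from pure Gaussian noise, the initial latents are now the encoded input image partially re-noised to the strength-adjusted timestep via `scheduler.scale_noise`. For `FlowMatchEulerDiscreteScheduler` that call is, in essence, a linear blend; a simplified sketch (the real scheduler looks up `sigma_t` from the passed timestep):

    import torch

    def scale_noise_sketch(image_latents: torch.Tensor, noise: torch.Tensor, sigma_t: float) -> torch.Tensor:
        # Flow-matching forward process: blend the clean latents toward noise.
        # sigma_t == 1.0 (i.e. strength == 1.0) reduces to pure noise, ignoring the image.
        return sigma_t * noise + (1.0 - sigma_t) * image_latents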
@@ -688,6 +735,7 @@ class FluxFillPipeline(
         masked_image_latents: Optional[torch.FloatTensor] = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
+        strength: float = 1.0,
         num_inference_steps: int = 50,
         sigmas: Optional[List[float]] = None,
         guidance_scale: float = 30.0,
@@ -732,6 +780,12 @@ class FluxFillPipeline(
                 The height in pixels of the generated image. This is set to 1024 by default for the best results.
             width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
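
Taken together, these hunks give `FluxFillPipeline` img2img-style control over how much of the input survives denoising. A minimal usage sketch; the checkpoint name is the published FLUX.1 Fill repo, while the image URLs are placeholders:

    import torch
    from diffusers import FluxFillPipeline
    from diffusers.utils import load_image

    pipe = FluxFillPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16
    ).to("cuda")

    image = load_image("https://example.com/input.png")  # placeholder URL
    mask = load_image("https://example.com/mask.png")    # white = region to repaint

    result = pipe(
        prompt="a red brick wall",
        image=image,
        mask_image=mask,
        strength=0.85,  # new in 0.33: values < 1.0 preserve more of the input
        num_inference_steps=50,
        guidance_scale=30.0,
    ).images[0]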
@@ -739,7 +793,7 @@ class FluxFillPipeline(
                 Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                 their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                 will be used.
-            guidance_scale (`float`, *optional*, defaults to 7.0):
+            guidance_scale (`float`, *optional*, defaults to 30.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
@@ -795,6 +849,7 @@ class FluxFillPipeline(
         self.check_inputs(
             prompt,
             prompt_2,
+            strength,
             height,
             width,
             prompt_embeds=prompt_embeds,
@@ -810,6 +865,9 @@ class FluxFillPipeline(
         self._joint_attention_kwargs = joint_attention_kwargs
         self._interrupt = False
 
+        init_image = self.image_processor.preprocess(image, height=height, width=width)
+        init_image = init_image.to(dtype=torch.float32)
+
         # 2. Define call parameters
         if prompt is not None and isinstance(prompt, str):
             batch_size = 1
@@ -839,9 +897,37 @@ class FluxFillPipeline(
             lora_scale=lora_scale,
         )
 
-        # 4. Prepare latent variables
+        # 4. Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            sigmas=sigmas,
+            mu=mu,
+        )
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 5. Prepare latent variables
         num_channels_latents = self.vae.config.latent_channels
         latents, latent_image_ids = self.prepare_latents(
+            init_image,
+            latent_timestep,
             batch_size * num_images_per_prompt,
             num_channels_latents,
             height,
@@ -852,17 +938,16 @@ class FluxFillPipeline(
             latents,
         )
 
-        # 5. Prepare mask and masked image latents
+        # 6. Prepare mask and masked image latents
         if masked_image_latents is not None:
             masked_image_latents = masked_image_latents.to(latents.device)
         else:
-            image = self.image_processor.preprocess(image, height=height, width=width)
             mask_image = self.mask_processor.preprocess(mask_image, height=height, width=width)
 
-            masked_image = image * (1 - mask_image)
+            masked_image = init_image * (1 - mask_image)
             masked_image = masked_image.to(device=device, dtype=prompt_embeds.dtype)
 
-            height, width = image.shape[-2:]
+            height, width = init_image.shape[-2:]
             mask, masked_image_latents = self.prepare_mask_latents(
                 mask_image,
                 masked_image,
@@ -877,23 +962,6 @@ class FluxFillPipeline(
             )
             masked_image_latents = torch.cat((masked_image_latents, mask), dim=-1)
 
-        # 6. Prepare timesteps
-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
-        image_seq_len = latents.shape[1]
-        mu = calculate_shift(
-            image_seq_len,
-            self.scheduler.config.base_image_seq_len,
-            self.scheduler.config.max_image_seq_len,
-            self.scheduler.config.base_shift,
-            self.scheduler.config.max_shift,
-        )
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler,
-            num_inference_steps,
-            device,
-            sigmas=sigmas,
-            mu=mu,
-        )
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         self._num_timesteps = len(timesteps)
 
@@ -17,10 +17,17 @@ from typing import Any, Callable, Dict, List, Optional, Union
 
 import numpy as np
 import torch
-from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
+from ...loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import FluxTransformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
@@ -77,7 +84,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.16,
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
@@ -159,7 +166,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
+class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin, FluxIPAdapterMixin):
     r"""
     The Flux pipeline for image inpainting.
 
@@ -186,8 +193,8 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
     """
 
-    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
-    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
     _callback_tensor_inputs = ["latents", "prompt_embeds"]
 
     def __init__(
@@ -199,6 +206,8 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         text_encoder_2: T5EncoderModel,
         tokenizer_2: T5TokenizerFast,
         transformer: FluxTransformer2DModel,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
     ):
         super().__init__()
 
@@ -210,13 +219,16 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
             tokenizer_2=tokenizer_2,
             transformer=transformer,
             scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
+        self.latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor * 2, vae_latent_channels=self.latent_channels
+        )
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
@@ -397,6 +409,55 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
 
         return prompt_embeds, pooled_prompt_embeds, text_ids
 
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        image_embeds = self.image_encoder(image).image_embeds
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        return image_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
+    ):
+        image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != self.transformer.encoder_hid_proj.num_ip_adapters:
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
+                )
+
+            for single_ip_adapter_image in ip_adapter_image:
+                single_image_embeds = self.encode_image(single_ip_adapter_image, device, 1)
+                image_embeds.append(single_image_embeds[None, :])
+        else:
+            if not isinstance(ip_adapter_image_embeds, list):
+                ip_adapter_image_embeds = [ip_adapter_image_embeds]
+
+            if len(ip_adapter_image_embeds) != self.transformer.encoder_hid_proj.num_ip_adapters:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got {len(ip_adapter_image_embeds)} image embeds and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
+                )
+
+            for single_image_embeds in ip_adapter_image_embeds:
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for single_image_embeds in image_embeds:
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
     # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
     def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
         if isinstance(generator, list):
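
With `FluxIPAdapterMixin` now among the class's bases (see the class-definition hunk above), the img2img pipeline loads Flux IP-Adapters the same way the text-to-image pipeline does. A minimal sketch; the adapter repo, weight file, and CLIP encoder names mirror the common XLabs example but are assumptions here:

    import torch
    from diffusers import FluxImg2ImgPipeline
    from diffusers.utils import load_image

    pipe = FluxImg2ImgPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
    ).to("cuda")

    # Adapter and encoder names are assumptions; substitute the repo you actually use.
    pipe.load_ip_adapter(
        "XLabs-AI/flux-ip-adapter",
        weight_name="ip_adapter.safetensors",
        image_encoder_pretrained_model_name_or_path="openai/clip-vit-large-patch14",
    )
    pipe.set_ip_adapter_scale(1.0)

    init = load_image("https://example.com/init.png")        # placeholder URLs
    style_ref = load_image("https://example.com/style.png")

    out = pipe(
        prompt="a watercolor landscape",
        image=init,
        strength=0.8,
        ip_adapter_image=style_ref,
    ).images[0]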
@@ -431,8 +492,12 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         strength,
         height,
         width,
+        negative_prompt=None,
+        negative_prompt_2=None,
         prompt_embeds=None,
+        negative_prompt_embeds=None,
         pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
@@ -470,10 +535,33 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
             raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
 
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
         if prompt_embeds is not None and pooled_prompt_embeds is None:
             raise ValueError(
                 "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
             )
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
 
         if max_sequence_length is not None and max_sequence_length > 512:
             raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
@@ -549,7 +637,10 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
             return latents.to(device=device, dtype=dtype), latent_image_ids
 
         image = image.to(device=device, dtype=dtype)
-        image_latents = self._encode_vae_image(image=image, generator=generator)
+        if image.shape[1] != self.latent_channels:
+            image_latents = self._encode_vae_image(image=image, generator=generator)
+        else:
+            image_latents = image
         if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
             # expand init_latents for batch_size
             additional_image_per_prompt = batch_size // image_latents.shape[0]
@@ -588,6 +679,9 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         self,
         prompt: Union[str, List[str]] = None,
         prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        true_cfg_scale: float = 1.0,
         image: PipelineImageInput = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
@@ -600,6 +694,12 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         latents: Optional[torch.FloatTensor] = None,
         prompt_embeds: Optional[torch.FloatTensor] = None,
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        negative_ip_adapter_image: Optional[PipelineImageInput] = None,
+        negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -661,6 +761,17 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
             pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            negative_ip_adapter_image:
+                (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -699,8 +810,12 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
             strength,
             height,
             width,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
             prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
             pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
             callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
             max_sequence_length=max_sequence_length,
         )
@@ -726,6 +841,7 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         lora_scale = (
             self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
         )
+        do_true_cfg = true_cfg_scale > 1 and negative_prompt is not None
         (
             prompt_embeds,
             pooled_prompt_embeds,
@@ -740,16 +856,31 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
             max_sequence_length=max_sequence_length,
             lora_scale=lora_scale,
         )
+        if do_true_cfg:
+            (
+                negative_prompt_embeds,
+                negative_pooled_prompt_embeds,
+                _,
+            ) = self.encode_prompt(
+                prompt=negative_prompt,
+                prompt_2=negative_prompt_2,
+                prompt_embeds=negative_prompt_embeds,
+                pooled_prompt_embeds=negative_pooled_prompt_embeds,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                lora_scale=lora_scale,
+            )
 
         # 4.Prepare timesteps
         sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
         image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
         mu = calculate_shift(
             image_seq_len,
-            self.scheduler.config.base_image_seq_len,
-            self.scheduler.config.max_image_seq_len,
-            self.scheduler.config.base_shift,
-            self.scheduler.config.max_shift,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
         )
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler,
@@ -793,12 +924,43 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         else:
             guidance = None
 
+        if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and (
+            negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
+        ):
+            negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
+        elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
+            negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
+        ):
+            ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
+
+        if self.joint_attention_kwargs is None:
+            self._joint_attention_kwargs = {}
+
+        image_embeds = None
+        negative_image_embeds = None
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+            )
+        if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None:
+            negative_image_embeds = self.prepare_ip_adapter_image_embeds(
+                negative_ip_adapter_image,
+                negative_ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+            )
+
         # 6. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
 
+                if image_embeds is not None:
+                    self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)
                 noise_pred = self.transformer(
@@ -813,6 +975,22 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
                     return_dict=False,
                 )[0]
 
+                if do_true_cfg:
+                    if negative_image_embeds is not None:
+                        self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
+                    neg_noise_pred = self.transformer(
+                        hidden_states=latents,
+                        timestep=timestep / 1000,
+                        guidance=guidance,
+                        pooled_projections=negative_pooled_prompt_embeds,
+                        encoder_hidden_states=negative_prompt_embeds,
+                        txt_ids=text_ids,
+                        img_ids=latent_image_ids,
+                        joint_attention_kwargs=self.joint_attention_kwargs,
+                        return_dict=False,
+                    )[0]
+                    noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
+
                 # compute the previous noisy sample x_t -> x_t-1
                 latents_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
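
The new `true_cfg_scale` path runs a second forward pass on the negative embeddings and combines the two predictions as classic classifier-free guidance, noise = neg + s * (pos - neg), rather than relying only on Flux's embedded `guidance` input; enabling it roughly doubles per-step compute. A minimal usage sketch (prompts and URL are illustrative):

    import torch
    from diffusers import FluxImg2ImgPipeline
    from diffusers.utils import load_image

    pipe = FluxImg2ImgPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
    ).to("cuda")
    init = load_image("https://example.com/init.png")  # placeholder URL

    out = pipe(
        prompt="a cinematic portrait, soft window light",
        negative_prompt="blurry, low quality",
        true_cfg_scale=3.5,  # > 1.0 with a negative prompt enables the extra pass
        image=init,
        strength=0.7,
    ).images[0]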