diffusers 0.32.2__py3-none-any.whl → 0.33.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (389)
  1. diffusers/__init__.py +186 -3
  2. diffusers/configuration_utils.py +40 -12
  3. diffusers/dependency_versions_table.py +9 -2
  4. diffusers/hooks/__init__.py +9 -0
  5. diffusers/hooks/faster_cache.py +653 -0
  6. diffusers/hooks/group_offloading.py +793 -0
  7. diffusers/hooks/hooks.py +236 -0
  8. diffusers/hooks/layerwise_casting.py +245 -0
  9. diffusers/hooks/pyramid_attention_broadcast.py +311 -0
  10. diffusers/loaders/__init__.py +6 -0
  11. diffusers/loaders/ip_adapter.py +38 -30
  12. diffusers/loaders/lora_base.py +121 -86
  13. diffusers/loaders/lora_conversion_utils.py +504 -44
  14. diffusers/loaders/lora_pipeline.py +1769 -181
  15. diffusers/loaders/peft.py +167 -57
  16. diffusers/loaders/single_file.py +17 -2
  17. diffusers/loaders/single_file_model.py +53 -5
  18. diffusers/loaders/single_file_utils.py +646 -72
  19. diffusers/loaders/textual_inversion.py +9 -9
  20. diffusers/loaders/transformer_flux.py +8 -9
  21. diffusers/loaders/transformer_sd3.py +120 -39
  22. diffusers/loaders/unet.py +20 -7
  23. diffusers/models/__init__.py +22 -0
  24. diffusers/models/activations.py +9 -9
  25. diffusers/models/attention.py +0 -1
  26. diffusers/models/attention_processor.py +163 -25
  27. diffusers/models/auto_model.py +169 -0
  28. diffusers/models/autoencoders/__init__.py +2 -0
  29. diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
  30. diffusers/models/autoencoders/autoencoder_dc.py +106 -4
  31. diffusers/models/autoencoders/autoencoder_kl.py +0 -4
  32. diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
  33. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
  34. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
  35. diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
  36. diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
  37. diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
  38. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
  39. diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
  40. diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
  41. diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
  42. diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
  43. diffusers/models/autoencoders/vae.py +31 -141
  44. diffusers/models/autoencoders/vq_model.py +3 -0
  45. diffusers/models/cache_utils.py +108 -0
  46. diffusers/models/controlnets/__init__.py +1 -0
  47. diffusers/models/controlnets/controlnet.py +3 -8
  48. diffusers/models/controlnets/controlnet_flux.py +14 -42
  49. diffusers/models/controlnets/controlnet_sd3.py +58 -34
  50. diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
  51. diffusers/models/controlnets/controlnet_union.py +27 -18
  52. diffusers/models/controlnets/controlnet_xs.py +7 -46
  53. diffusers/models/controlnets/multicontrolnet_union.py +196 -0
  54. diffusers/models/embeddings.py +18 -7
  55. diffusers/models/model_loading_utils.py +122 -80
  56. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  57. diffusers/models/modeling_flax_utils.py +1 -1
  58. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  59. diffusers/models/modeling_utils.py +617 -272
  60. diffusers/models/normalization.py +67 -14
  61. diffusers/models/resnet.py +1 -1
  62. diffusers/models/transformers/__init__.py +6 -0
  63. diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
  64. diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
  65. diffusers/models/transformers/consisid_transformer_3d.py +789 -0
  66. diffusers/models/transformers/dit_transformer_2d.py +5 -19
  67. diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
  68. diffusers/models/transformers/latte_transformer_3d.py +20 -15
  69. diffusers/models/transformers/lumina_nextdit2d.py +3 -1
  70. diffusers/models/transformers/pixart_transformer_2d.py +4 -19
  71. diffusers/models/transformers/prior_transformer.py +5 -1
  72. diffusers/models/transformers/sana_transformer.py +144 -40
  73. diffusers/models/transformers/stable_audio_transformer.py +5 -20
  74. diffusers/models/transformers/transformer_2d.py +7 -22
  75. diffusers/models/transformers/transformer_allegro.py +9 -17
  76. diffusers/models/transformers/transformer_cogview3plus.py +6 -17
  77. diffusers/models/transformers/transformer_cogview4.py +462 -0
  78. diffusers/models/transformers/transformer_easyanimate.py +527 -0
  79. diffusers/models/transformers/transformer_flux.py +68 -110
  80. diffusers/models/transformers/transformer_hunyuan_video.py +404 -46
  81. diffusers/models/transformers/transformer_ltx.py +53 -35
  82. diffusers/models/transformers/transformer_lumina2.py +548 -0
  83. diffusers/models/transformers/transformer_mochi.py +6 -17
  84. diffusers/models/transformers/transformer_omnigen.py +469 -0
  85. diffusers/models/transformers/transformer_sd3.py +56 -86
  86. diffusers/models/transformers/transformer_temporal.py +5 -11
  87. diffusers/models/transformers/transformer_wan.py +469 -0
  88. diffusers/models/unets/unet_1d.py +3 -1
  89. diffusers/models/unets/unet_2d.py +21 -20
  90. diffusers/models/unets/unet_2d_blocks.py +19 -243
  91. diffusers/models/unets/unet_2d_condition.py +4 -6
  92. diffusers/models/unets/unet_3d_blocks.py +14 -127
  93. diffusers/models/unets/unet_3d_condition.py +8 -12
  94. diffusers/models/unets/unet_i2vgen_xl.py +5 -13
  95. diffusers/models/unets/unet_kandinsky3.py +0 -4
  96. diffusers/models/unets/unet_motion_model.py +20 -114
  97. diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
  98. diffusers/models/unets/unet_stable_cascade.py +8 -35
  99. diffusers/models/unets/uvit_2d.py +1 -4
  100. diffusers/optimization.py +2 -2
  101. diffusers/pipelines/__init__.py +57 -8
  102. diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
  103. diffusers/pipelines/amused/pipeline_amused.py +15 -2
  104. diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
  105. diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
  106. diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
  107. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
  108. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
  109. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
  110. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
  111. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
  112. diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
  113. diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
  114. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
  115. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
  116. diffusers/pipelines/auto_pipeline.py +35 -14
  117. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  118. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
  119. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
  120. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
  121. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
  122. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
  123. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
  124. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
  125. diffusers/pipelines/cogview4/__init__.py +49 -0
  126. diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
  127. diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
  128. diffusers/pipelines/cogview4/pipeline_output.py +21 -0
  129. diffusers/pipelines/consisid/__init__.py +49 -0
  130. diffusers/pipelines/consisid/consisid_utils.py +357 -0
  131. diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
  132. diffusers/pipelines/consisid/pipeline_output.py +20 -0
  133. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
  134. diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
  135. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
  136. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
  137. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
  138. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
  139. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
  140. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
  141. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
  142. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
  143. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
  144. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
  145. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
  146. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
  147. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
  148. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
  149. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
  150. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
  151. diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
  152. diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
  153. diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
  154. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
  155. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
  156. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
  157. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
  158. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
  159. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
  160. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
  161. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  162. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
  163. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
  164. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
  165. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
  166. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
  167. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
  168. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
  169. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
  170. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  171. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  172. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
  173. diffusers/pipelines/dit/pipeline_dit.py +15 -2
  174. diffusers/pipelines/easyanimate/__init__.py +52 -0
  175. diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
  176. diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
  177. diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
  178. diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
  179. diffusers/pipelines/flux/pipeline_flux.py +53 -21
  180. diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
  181. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
  182. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
  183. diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
  184. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
  185. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
  186. diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
  187. diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
  188. diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
  189. diffusers/pipelines/free_noise_utils.py +3 -3
  190. diffusers/pipelines/hunyuan_video/__init__.py +4 -0
  191. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
  192. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
  193. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
  194. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
  195. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
  196. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
  197. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  198. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
  199. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
  200. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
  201. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
  202. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
  203. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
  204. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
  205. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
  206. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
  207. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
  208. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
  209. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
  210. diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
  211. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
  212. diffusers/pipelines/kolors/text_encoder.py +7 -34
  213. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
  214. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
  215. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
  216. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
  217. diffusers/pipelines/latte/pipeline_latte.py +36 -7
  218. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
  219. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
  220. diffusers/pipelines/ltx/__init__.py +2 -0
  221. diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
  222. diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
  223. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
  224. diffusers/pipelines/lumina/__init__.py +2 -2
  225. diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
  226. diffusers/pipelines/lumina2/__init__.py +48 -0
  227. diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
  228. diffusers/pipelines/marigold/__init__.py +2 -0
  229. diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
  230. diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
  231. diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
  232. diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
  233. diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
  234. diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
  235. diffusers/pipelines/omnigen/__init__.py +50 -0
  236. diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
  237. diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
  238. diffusers/pipelines/onnx_utils.py +5 -3
  239. diffusers/pipelines/pag/pag_utils.py +1 -1
  240. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
  241. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
  242. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
  243. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
  244. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
  245. diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
  246. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
  247. diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
  248. diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
  249. diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
  250. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
  251. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
  252. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
  253. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
  254. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
  255. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
  256. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
  257. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
  258. diffusers/pipelines/pia/pipeline_pia.py +13 -1
  259. diffusers/pipelines/pipeline_flax_utils.py +7 -7
  260. diffusers/pipelines/pipeline_loading_utils.py +193 -83
  261. diffusers/pipelines/pipeline_utils.py +221 -106
  262. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
  263. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
  264. diffusers/pipelines/sana/__init__.py +2 -0
  265. diffusers/pipelines/sana/pipeline_sana.py +183 -58
  266. diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
  267. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
  268. diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
  269. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
  270. diffusers/pipelines/shap_e/renderer.py +6 -6
  271. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
  272. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
  273. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
  274. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
  275. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
  276. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
  277. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
  278. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
  279. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  280. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
  281. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
  282. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
  283. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
  284. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
  285. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
  286. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
  287. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
  288. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
  289. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
  290. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
  291. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
  292. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
  293. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
  294. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
  295. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
  296. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
  297. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
  298. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
  299. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
  300. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
  301. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
  302. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
  303. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
  304. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
  305. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
  306. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  307. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
  308. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
  309. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
  310. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
  311. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
  312. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
  313. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
  314. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
  315. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
  316. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
  317. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
  318. diffusers/pipelines/transformers_loading_utils.py +121 -0
  319. diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
  320. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
  321. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
  322. diffusers/pipelines/wan/__init__.py +51 -0
  323. diffusers/pipelines/wan/pipeline_output.py +20 -0
  324. diffusers/pipelines/wan/pipeline_wan.py +595 -0
  325. diffusers/pipelines/wan/pipeline_wan_i2v.py +724 -0
  326. diffusers/pipelines/wan/pipeline_wan_video2video.py +727 -0
  327. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
  328. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
  329. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
  330. diffusers/quantizers/auto.py +5 -1
  331. diffusers/quantizers/base.py +5 -9
  332. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
  333. diffusers/quantizers/bitsandbytes/utils.py +30 -20
  334. diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
  335. diffusers/quantizers/gguf/utils.py +4 -2
  336. diffusers/quantizers/quantization_config.py +59 -4
  337. diffusers/quantizers/quanto/__init__.py +1 -0
  338. diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
  339. diffusers/quantizers/quanto/utils.py +60 -0
  340. diffusers/quantizers/torchao/__init__.py +1 -1
  341. diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
  342. diffusers/schedulers/__init__.py +2 -1
  343. diffusers/schedulers/scheduling_consistency_models.py +1 -2
  344. diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
  345. diffusers/schedulers/scheduling_ddpm.py +2 -3
  346. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
  347. diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
  348. diffusers/schedulers/scheduling_edm_euler.py +45 -10
  349. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
  350. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
  351. diffusers/schedulers/scheduling_heun_discrete.py +1 -1
  352. diffusers/schedulers/scheduling_lcm.py +1 -2
  353. diffusers/schedulers/scheduling_lms_discrete.py +1 -1
  354. diffusers/schedulers/scheduling_repaint.py +5 -1
  355. diffusers/schedulers/scheduling_scm.py +265 -0
  356. diffusers/schedulers/scheduling_tcd.py +1 -2
  357. diffusers/schedulers/scheduling_utils.py +2 -1
  358. diffusers/training_utils.py +14 -7
  359. diffusers/utils/__init__.py +9 -1
  360. diffusers/utils/constants.py +13 -1
  361. diffusers/utils/deprecation_utils.py +1 -1
  362. diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
  363. diffusers/utils/dummy_gguf_objects.py +17 -0
  364. diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
  365. diffusers/utils/dummy_pt_objects.py +233 -0
  366. diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
  367. diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
  368. diffusers/utils/dummy_torchao_objects.py +17 -0
  369. diffusers/utils/dynamic_modules_utils.py +1 -1
  370. diffusers/utils/export_utils.py +28 -3
  371. diffusers/utils/hub_utils.py +52 -102
  372. diffusers/utils/import_utils.py +121 -221
  373. diffusers/utils/loading_utils.py +2 -1
  374. diffusers/utils/logging.py +1 -2
  375. diffusers/utils/peft_utils.py +6 -14
  376. diffusers/utils/remote_utils.py +425 -0
  377. diffusers/utils/source_code_parsing_utils.py +52 -0
  378. diffusers/utils/state_dict_utils.py +15 -1
  379. diffusers/utils/testing_utils.py +243 -13
  380. diffusers/utils/torch_utils.py +10 -0
  381. diffusers/utils/typing_utils.py +91 -0
  382. diffusers/video_processor.py +1 -1
  383. {diffusers-0.32.2.dist-info → diffusers-0.33.1.dist-info}/METADATA +21 -4
  384. diffusers-0.33.1.dist-info/RECORD +608 -0
  385. {diffusers-0.32.2.dist-info → diffusers-0.33.1.dist-info}/WHEEL +1 -1
  386. diffusers-0.32.2.dist-info/RECORD +0 -550
  387. {diffusers-0.32.2.dist-info → diffusers-0.33.1.dist-info}/LICENSE +0 -0
  388. {diffusers-0.32.2.dist-info → diffusers-0.33.1.dist-info}/entry_points.txt +0 -0
  389. {diffusers-0.32.2.dist-info → diffusers-0.33.1.dist-info}/top_level.txt +0 -0
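Most of the new surface area in this release is visible straight from the list above: a new diffusers/hooks/ package (group offloading, layerwise casting, pyramid attention broadcast, FasterCache), new model and pipeline families (CogView4, ConsisID, EasyAnimate, Lumina2, OmniGen, SANA Sprint, Wan), an AutoModel entry point, and a Quanto quantizer backend. The detailed hunks below cover only the three Flux ControlNet pipelines. As orientation, here is a minimal sketch of how the hook-based memory optimizations are typically enabled in 0.33; the exact method and keyword names are assumptions drawn from the 0.33 documentation, not from this diff:

    import torch
    from diffusers import FluxPipeline

    pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

    # Layerwise casting (hooks/layerwise_casting.py): store weights in FP8 and
    # upcast each layer to bfloat16 only for its forward pass.
    pipe.transformer.enable_layerwise_casting(
        storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16
    )

    # Group offloading (hooks/group_offloading.py): shuttle groups of layers
    # between CPU and GPU instead of offloading entire models.
    pipe.transformer.enable_group_offload(
        onload_device=torch.device("cuda"),
        offload_device=torch.device("cpu"),
        offload_type="leaf_level",
    )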
diffusers/pipelines/flux/pipeline_flux_controlnet.py

@@ -18,14 +18,16 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
 from transformers import (
+    CLIPImageProcessor,
     CLIPTextModel,
     CLIPTokenizer,
+    CLIPVisionModelWithProjection,
     T5EncoderModel,
     T5TokenizerFast,
 )
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
+from ...loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.controlnets.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
 from ...models.transformers import FluxTransformer2DModel
@@ -61,6 +63,7 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import FluxControlNetPipeline
         >>> from diffusers import FluxControlNetModel
 
+        >>> base_model = "black-forest-labs/FLUX.1-dev"
         >>> controlnet_model = "InstantX/FLUX.1-dev-controlnet-canny"
         >>> controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
         >>> pipe = FluxControlNetPipeline.from_pretrained(
@@ -89,7 +92,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.16,
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
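The only change in this hunk is the default max_shift (1.16 → 1.15), which nudges the flow-matching timestep schedule; the same default change recurs in the img2img and inpaint variants below. For reference, the function interpolates the shift mu linearly in the image sequence length. A worked example with the new defaults (not part of the diff):

    base_seq_len, max_seq_len = 256, 4096
    base_shift, max_shift = 0.5, 1.15

    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)  # slope
    b = base_shift - m * base_seq_len                            # intercept

    image_seq_len = 4096  # e.g. a 1024x1024 image packed into 2x2 latent patches
    mu = image_seq_len * m + b  # = 1.15: max_shift is reached at max_seq_len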
@@ -171,7 +174,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
+class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin, FluxIPAdapterMixin):
     r"""
     The Flux pipeline for text-to-image generation.
 
@@ -198,9 +201,9 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
             [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
     """
 
-    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
-    _optional_components = []
-    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "control_image"]
 
     def __init__(
         self,
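With FluxIPAdapterMixin in the base classes and image_encoder/feature_extractor registered as optional components, the ControlNet pipeline gains the same load_ip_adapter()/set_ip_adapter_scale() flow as the plain FluxPipeline. A minimal sketch; the checkpoint and weight names are assumptions borrowed from the Flux IP-Adapter docs, not from this diff:

    pipe = FluxControlNetPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16
    )
    pipe.load_ip_adapter(
        "XLabs-AI/flux-ip-adapter",
        weight_name="ip_adapter.safetensors",
        image_encoder_pretrained_model_name_or_path="openai/clip-vit-large-patch14",
    )
    pipe.set_ip_adapter_scale(0.6)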
@@ -214,6 +217,8 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
         controlnet: Union[
             FluxControlNetModel, List[FluxControlNetModel], Tuple[FluxControlNetModel], FluxMultiControlNetModel
         ],
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
     ):
         super().__init__()
         if isinstance(controlnet, (list, tuple)):
@@ -228,10 +233,10 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
             transformer=transformer,
             scheduler=scheduler,
             controlnet=controlnet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
@@ -415,14 +420,67 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
 
         return prompt_embeds, pooled_prompt_embeds, text_ids
 
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        image_embeds = self.image_encoder(image).image_embeds
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        return image_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
+    ):
+        image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != self.transformer.encoder_hid_proj.num_ip_adapters:
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
+                )
+
+            for single_ip_adapter_image in ip_adapter_image:
+                single_image_embeds = self.encode_image(single_ip_adapter_image, device, 1)
+                image_embeds.append(single_image_embeds[None, :])
+        else:
+            if not isinstance(ip_adapter_image_embeds, list):
+                ip_adapter_image_embeds = [ip_adapter_image_embeds]
+
+            if len(ip_adapter_image_embeds) != self.transformer.encoder_hid_proj.num_ip_adapters:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got {len(ip_adapter_image_embeds)} image embeds and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
+                )
+
+            for single_image_embeds in ip_adapter_image_embeds:
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for single_image_embeds in image_embeds:
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
     def check_inputs(
         self,
         prompt,
         prompt_2,
         height,
         width,
+        negative_prompt=None,
+        negative_prompt_2=None,
         prompt_embeds=None,
+        negative_prompt_embeds=None,
         pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
@@ -457,10 +515,33 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
         elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
             raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
 
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
         if prompt_embeds is not None and pooled_prompt_embeds is None:
             raise ValueError(
                 "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
             )
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
 
         if max_sequence_length is not None and max_sequence_length > 512:
             raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
@@ -599,6 +680,9 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
         self,
         prompt: Union[str, List[str]] = None,
         prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        true_cfg_scale: float = 1.0,
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 28,
@@ -614,6 +698,12 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
         latents: Optional[torch.FloatTensor] = None,
         prompt_embeds: Optional[torch.FloatTensor] = None,
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        negative_ip_adapter_image: Optional[PipelineImageInput] = None,
+        negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -681,6 +771,17 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
             pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            negative_ip_adapter_image:
+                (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -729,8 +830,12 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
             prompt_2,
             height,
             width,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
             prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
             pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
             callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
             max_sequence_length=max_sequence_length,
         )
@@ -754,6 +859,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
         lora_scale = (
             self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
         )
+        do_true_cfg = true_cfg_scale > 1 and negative_prompt is not None
         (
             prompt_embeds,
             pooled_prompt_embeds,
@@ -768,6 +874,21 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
             max_sequence_length=max_sequence_length,
             lora_scale=lora_scale,
         )
+        if do_true_cfg:
+            (
+                negative_prompt_embeds,
+                negative_pooled_prompt_embeds,
+                _,
+            ) = self.encode_prompt(
+                prompt=negative_prompt,
+                prompt_2=negative_prompt_2,
+                prompt_embeds=negative_prompt_embeds,
+                pooled_prompt_embeds=negative_pooled_prompt_embeds,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                lora_scale=lora_scale,
+            )
 
         # 3. Prepare control image
         num_channels_latents = self.transformer.config.in_channels // 4
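True classifier-free guidance is opt-in: it only takes effect when true_cfg_scale > 1 and a negative prompt is supplied, and it costs a second transformer forward per step. A hedged usage sketch:

    image = pipe(
        prompt="a photo of a cat",
        negative_prompt="blurry, low quality",
        true_cfg_scale=4.0,  # > 1 enables the extra negative-branch pass
        control_image=control_image,
        num_inference_steps=28,
    ).images[0]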
@@ -876,10 +997,10 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
         image_seq_len = latents.shape[1]
         mu = calculate_shift(
             image_seq_len,
-            self.scheduler.config.base_image_seq_len,
-            self.scheduler.config.max_image_seq_len,
-            self.scheduler.config.base_shift,
-            self.scheduler.config.max_shift,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
         )
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler,
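In 0.32 these values were read as attributes, which raises AttributeError on scheduler configs saved before the keys existed; scheduler.config is a FrozenDict, so dict-style .get() with the documented defaults degrades gracefully instead. The same pattern is applied in the img2img and inpaint variants below. Sketch (not part of the diff):

    max_shift = pipe.scheduler.config.get("max_shift", 1.15)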
@@ -901,12 +1022,43 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
         ]
         controlnet_keep.append(keeps[0] if isinstance(self.controlnet, FluxControlNetModel) else keeps)
 
+        if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and (
+            negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
+        ):
+            negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
+        elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
+            negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
+        ):
+            ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
+
+        if self.joint_attention_kwargs is None:
+            self._joint_attention_kwargs = {}
+
+        image_embeds = None
+        negative_image_embeds = None
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+            )
+        if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None:
+            negative_image_embeds = self.prepare_ip_adapter_image_embeds(
+                negative_ip_adapter_image,
+                negative_ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+            )
+
         # 7. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
 
+                if image_embeds is not None:
+                    self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)
@@ -962,6 +1114,25 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
                     controlnet_blocks_repeat=controlnet_blocks_repeat,
                 )[0]
 
+                if do_true_cfg:
+                    if negative_image_embeds is not None:
+                        self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
+                    neg_noise_pred = self.transformer(
+                        hidden_states=latents,
+                        timestep=timestep / 1000,
+                        guidance=guidance,
+                        pooled_projections=negative_pooled_prompt_embeds,
+                        encoder_hidden_states=negative_prompt_embeds,
+                        controlnet_block_samples=controlnet_block_samples,
+                        controlnet_single_block_samples=controlnet_single_block_samples,
+                        txt_ids=text_ids,
+                        img_ids=latent_image_ids,
+                        joint_attention_kwargs=self.joint_attention_kwargs,
+                        return_dict=False,
+                        controlnet_blocks_repeat=controlnet_blocks_repeat,
+                    )[0]
+                    noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
+
                 # compute the previous noisy sample x_t -> x_t-1
                 latents_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
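The combination is the standard classifier-free guidance extrapolation with true_cfg_scale as the weight w: noise_pred = neg + w * (pos - neg) = (1 - w) * neg + w * pos. With w = 1 it reduces to the positive prediction, and larger w pushes the sample further from the negative branch. Note that both branches reuse the same ControlNet residuals (controlnet_block_samples), which are computed once per step.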
@@ -979,6 +1150,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
 
                 latents = callback_outputs.pop("latents", latents)
                 prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                control_image = callback_outputs.pop("control_image", control_image)
 
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py

@@ -87,7 +87,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.16,
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
@@ -198,7 +198,7 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
 
     model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
     _optional_components = []
-    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "control_image"]
 
     def __init__(
         self,
@@ -227,9 +227,7 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
             scheduler=scheduler,
             controlnet=controlnet,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
@@ -535,7 +533,6 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
 
         return latents
 
-    # Copied from diffusers.pipelines.flux.pipeline_flux_img2img.FluxImg2ImgPipeline.prepare_latents
     def prepare_latents(
         self,
         image,
@@ -864,10 +861,10 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
         image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
         mu = calculate_shift(
             image_seq_len,
-            self.scheduler.config.base_image_seq_len,
-            self.scheduler.config.max_image_seq_len,
-            self.scheduler.config.base_shift,
-            self.scheduler.config.max_shift,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
         )
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler,
@@ -975,6 +972,7 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
 
                 latents = callback_outputs.pop("latents", latents)
                 prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                control_image = callback_outputs.pop("control_image", control_image)
 
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py

@@ -89,7 +89,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.16,
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
@@ -200,7 +200,7 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
 
     model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
     _optional_components = []
-    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "control_image", "mask", "masked_image_latents"]
 
     def __init__(
         self,
@@ -230,15 +230,14 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
             controlnet=controlnet,
         )
 
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
+        latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
         self.mask_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor * 2,
-            vae_latent_channels=self.vae.config.latent_channels,
+            vae_latent_channels=latent_channels,
             do_normalize=False,
             do_binarize=True,
             do_convert_grayscale=True,
@@ -508,7 +507,7 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
         if padding_mask_crop is not None:
             if not isinstance(image, PIL.Image.Image):
                 raise ValueError(
-                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
+                    f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
                 )
             if not isinstance(mask_image, PIL.Image.Image):
                 raise ValueError(
@@ -516,7 +515,7 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
                     f" {type(mask_image)}."
                 )
             if output_type != "pil":
-                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")
 
         if max_sequence_length is not None and max_sequence_length > 512:
             raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
@@ -562,7 +561,6 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
 
         return latents
 
-    # Copied from diffusers.pipelines.flux.pipeline_flux_inpaint.FluxInpaintPipeline.prepare_latents
     def prepare_latents(
         self,
         image,
@@ -615,7 +613,6 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
         latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
         return latents, noise, image_latents, latent_image_ids
 
-    # Copied from diffusers.pipelines.flux.pipeline_flux_inpaint.FluxInpaintPipeline.prepare_mask_latents
     def prepare_mask_latents(
         self,
         mask,
@@ -931,8 +928,8 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
         if isinstance(self.controlnet, FluxControlNetModel):
             control_image = self.prepare_image(
                 image=control_image,
-                width=height,
-                height=width,
+                width=width,
+                height=height,
                 batch_size=batch_size * num_images_per_prompt,
                 num_images_per_prompt=num_images_per_prompt,
                 device=device,
@@ -1017,10 +1014,10 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
         )
         mu = calculate_shift(
             image_seq_len,
-            self.scheduler.config.base_image_seq_len,
-            self.scheduler.config.max_image_seq_len,
-            self.scheduler.config.base_shift,
-            self.scheduler.config.max_shift,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
         )
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler,
@@ -1179,6 +1176,9 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
 
                 latents = callback_outputs.pop("latents", latents)
                 prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                control_image = callback_outputs.pop("control_image", control_image)
+                mask = callback_outputs.pop("mask", mask)
+                masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
 
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
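With mask and masked_image_latents (plus control_image) now registered in _callback_tensor_inputs, step callbacks can inspect or replace these tensors mid-denoising. A minimal sketch of the callback protocol; the callback body is illustrative:

    def on_step_end(pipeline, step, timestep, callback_kwargs):
        # any tensor named in callback_on_step_end_tensor_inputs arrives here
        mask = callback_kwargs["mask"]
        return {"mask": mask}  # returned entries overwrite the pipeline's tensors

    image = pipe(
        prompt=prompt,
        image=init_image,
        mask_image=mask_image,
        control_image=control_image,
        callback_on_step_end=on_step_end,
        callback_on_step_end_tensor_inputs=["mask"],
    ).images[0]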