diffusers-0.31.0-py3-none-any.whl → diffusers-0.32.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- diffusers/__init__.py +66 -5
- diffusers/callbacks.py +56 -3
- diffusers/configuration_utils.py +1 -1
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +25 -17
- diffusers/loaders/__init__.py +22 -3
- diffusers/loaders/ip_adapter.py +538 -15
- diffusers/loaders/lora_base.py +124 -118
- diffusers/loaders/lora_conversion_utils.py +318 -3
- diffusers/loaders/lora_pipeline.py +1688 -368
- diffusers/loaders/peft.py +379 -0
- diffusers/loaders/single_file_model.py +71 -4
- diffusers/loaders/single_file_utils.py +519 -9
- diffusers/loaders/textual_inversion.py +3 -3
- diffusers/loaders/transformer_flux.py +181 -0
- diffusers/loaders/transformer_sd3.py +89 -0
- diffusers/loaders/unet.py +17 -4
- diffusers/models/__init__.py +47 -14
- diffusers/models/activations.py +22 -9
- diffusers/models/attention.py +13 -4
- diffusers/models/attention_flax.py +1 -1
- diffusers/models/attention_processor.py +2059 -281
- diffusers/models/autoencoders/__init__.py +5 -0
- diffusers/models/autoencoders/autoencoder_dc.py +620 -0
- diffusers/models/autoencoders/autoencoder_kl.py +2 -1
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +36 -27
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
- diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
- diffusers/models/autoencoders/vae.py +18 -5
- diffusers/models/controlnet.py +47 -802
- diffusers/models/controlnet_flux.py +29 -495
- diffusers/models/controlnet_sd3.py +25 -379
- diffusers/models/controlnet_sparsectrl.py +46 -718
- diffusers/models/controlnets/__init__.py +23 -0
- diffusers/models/controlnets/controlnet.py +872 -0
- diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
- diffusers/models/controlnets/controlnet_flux.py +536 -0
- diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
- diffusers/models/controlnets/controlnet_sd3.py +489 -0
- diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
- diffusers/models/controlnets/controlnet_union.py +832 -0
- diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
- diffusers/models/controlnets/multicontrolnet.py +183 -0
- diffusers/models/embeddings.py +838 -43
- diffusers/models/model_loading_utils.py +88 -6
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +74 -28
- diffusers/models/normalization.py +78 -13
- diffusers/models/transformers/__init__.py +5 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +2 -2
- diffusers/models/transformers/cogvideox_transformer_3d.py +46 -11
- diffusers/models/transformers/dit_transformer_2d.py +1 -1
- diffusers/models/transformers/latte_transformer_3d.py +4 -4
- diffusers/models/transformers/pixart_transformer_2d.py +1 -1
- diffusers/models/transformers/sana_transformer.py +488 -0
- diffusers/models/transformers/stable_audio_transformer.py +1 -1
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +422 -0
- diffusers/models/transformers/transformer_cogview3plus.py +1 -1
- diffusers/models/transformers/transformer_flux.py +30 -9
- diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
- diffusers/models/transformers/transformer_ltx.py +469 -0
- diffusers/models/transformers/transformer_mochi.py +499 -0
- diffusers/models/transformers/transformer_sd3.py +105 -17
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +8 -1
- diffusers/models/unets/unet_2d_blocks.py +88 -21
- diffusers/models/unets/unet_2d_condition.py +1 -1
- diffusers/models/unets/unet_3d_blocks.py +9 -7
- diffusers/models/unets/unet_motion_model.py +5 -5
- diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
- diffusers/models/unets/unet_stable_cascade.py +2 -2
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +8 -0
- diffusers/pipelines/__init__.py +34 -0
- diffusers/pipelines/allegro/__init__.py +48 -0
- diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
- diffusers/pipelines/allegro/pipeline_output.py +23 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +8 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1 -1
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +0 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +8 -8
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -8
- diffusers/pipelines/auto_pipeline.py +53 -6
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +50 -22
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +51 -20
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +69 -21
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +47 -21
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +1 -1
- diffusers/pipelines/controlnet/__init__.py +86 -80
- diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
- diffusers/pipelines/controlnet/pipeline_controlnet.py +11 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +3 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +5 -1
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +53 -19
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -8
- diffusers/pipelines/flux/__init__.py +13 -1
- diffusers/pipelines/flux/modeling_flux.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +204 -29
- diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +49 -27
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +40 -30
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +78 -56
- diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
- diffusers/pipelines/flux/pipeline_flux_img2img.py +33 -27
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +36 -29
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
- diffusers/pipelines/flux/pipeline_output.py +16 -0
- diffusers/pipelines/hunyuan_video/__init__.py +48 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
- diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +5 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
- diffusers/pipelines/kolors/text_encoder.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/ltx/__init__.py +50 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
- diffusers/pipelines/ltx/pipeline_output.py +20 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +1 -8
- diffusers/pipelines/mochi/__init__.py +48 -0
- diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
- diffusers/pipelines/mochi/pipeline_output.py +20 -0
- diffusers/pipelines/pag/__init__.py +7 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +5 -1
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +6 -13
- diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +3 -0
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
- diffusers/pipelines/pipeline_flax_utils.py +1 -1
- diffusers/pipelines/pipeline_loading_utils.py +25 -4
- diffusers/pipelines/pipeline_utils.py +35 -6
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +6 -13
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +6 -13
- diffusers/pipelines/sana/__init__.py +47 -0
- diffusers/pipelines/sana/pipeline_output.py +21 -0
- diffusers/pipelines/sana/pipeline_sana.py +884 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +216 -20
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +62 -9
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +57 -8
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +0 -8
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +0 -8
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +0 -8
- diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/quantizers/auto.py +14 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -1
- diffusers/quantizers/gguf/__init__.py +1 -0
- diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
- diffusers/quantizers/gguf/utils.py +456 -0
- diffusers/quantizers/quantization_config.py +280 -2
- diffusers/quantizers/torchao/__init__.py +15 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
- diffusers/schedulers/scheduling_ddpm.py +2 -6
- diffusers/schedulers/scheduling_ddpm_parallel.py +2 -6
- diffusers/schedulers/scheduling_deis_multistep.py +28 -9
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +35 -9
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +35 -8
- diffusers/schedulers/scheduling_dpmsolver_sde.py +4 -4
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +48 -10
- diffusers/schedulers/scheduling_euler_discrete.py +4 -4
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
- diffusers/schedulers/scheduling_heun_discrete.py +4 -4
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +4 -4
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +4 -4
- diffusers/schedulers/scheduling_lcm.py +2 -6
- diffusers/schedulers/scheduling_lms_discrete.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +1 -1
- diffusers/schedulers/scheduling_sasolver.py +28 -9
- diffusers/schedulers/scheduling_tcd.py +2 -6
- diffusers/schedulers/scheduling_unipc_multistep.py +53 -8
- diffusers/training_utils.py +16 -2
- diffusers/utils/__init__.py +5 -0
- diffusers/utils/constants.py +1 -0
- diffusers/utils/dummy_pt_objects.py +180 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dynamic_modules_utils.py +3 -3
- diffusers/utils/hub_utils.py +31 -39
- diffusers/utils/import_utils.py +67 -0
- diffusers/utils/peft_utils.py +3 -0
- diffusers/utils/testing_utils.py +56 -1
- diffusers/utils/torch_utils.py +3 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/METADATA +69 -69
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/RECORD +214 -162
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/flux/pipeline_flux_img2img.py (+33 -27)

@@ -20,7 +20,7 @@ import torch
 from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FluxLoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import FluxTransformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
@@ -159,7 +159,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
+class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
     r"""
     The Flux pipeline for image inpainting.
 
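With `FromSingleFileMixin` in the base classes, the pipeline gains the `from_single_file` constructor, so it can be built from a single checkpoint file rather than only the multi-folder repo layout. A minimal sketch (the checkpoint URL below is illustrative, not taken from this diff):

```python
import torch
from diffusers import FluxImg2ImgPipeline

# Hypothetical single-file checkpoint; any Flux-format .safetensors file should work.
ckpt_url = "https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors"

pipe = FluxImg2ImgPipeline.from_single_file(ckpt_url, torch_dtype=torch.bfloat16)
pipe.to("cuda")
```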
@@ -212,13 +212,15 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
             scheduler=scheduler,
         )
         self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
-        self.default_sample_size = 64
+        self.default_sample_size = 128
 
     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
     def _get_t5_prompt_embeds(
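This scale-factor change is the core of the Flux refactor: `vae_scale_factor` now reflects the VAE's actual 8x spatial compression, and the extra factor of 2 for the 2x2 patch packing is applied explicitly wherever it matters. A worked sketch of the arithmetic, assuming the Flux VAE's `block_out_channels` has four entries:

```python
# Sketch of the corrected scale-factor arithmetic. The four-entry
# block_out_channels below is an assumption matching the Flux VAE config.
block_out_channels = [128, 256, 512, 512]

old_scale = 2 ** len(block_out_channels)        # 16 (0.31.0)
new_scale = 2 ** (len(block_out_channels) - 1)  # 8  (0.32.0): three downsamples -> 8x compression

# On top of the 8x VAE compression, latents are packed into 2x2 patches, so input
# images must be divisible by 8 * 2 = 16; VaeImageProcessor enforces this via
# vae_scale_factor=new_scale * 2. Note the old (wrong) factor and the new
# packing-aware factor coincide, which is why behavior is preserved.
assert old_scale == new_scale * 2 == 16
```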
@@ -437,8 +439,10 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
 
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
+            )
 
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -477,9 +481,9 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
     @staticmethod
     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
     def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
-        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
-        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
-        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+        latent_image_ids = torch.zeros(height, width, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
 
         latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
 
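`_prepare_latent_image_ids` no longer halves internally; callers now pass the patch-grid size (`height // 2`, `width // 2`) directly. A small sketch of the ids this produces for a toy grid (sizes are illustrative):

```python
import torch

# Positional ids for a 4x6 patch grid (illustrative size). Column 0 stays zero;
# columns 1 and 2 hold the row and column index of each 2x2 patch.
height, width = 4, 6

latent_image_ids = torch.zeros(height, width, 3)
latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]

# Flattened to one id per packed latent token.
latent_image_ids = latent_image_ids.reshape(height * width, 3)
print(latent_image_ids.shape)  # torch.Size([24, 3])
print(latent_image_ids[:3])    # tensor([[0., 0., 0.], [0., 0., 1.], [0., 0., 2.]])
```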
@@ -503,13 +507,15 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape
 
-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
 
-        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)
 
-        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+        latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
 
         return latents
 
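To see what `_unpack_latents` mirrors, here is a round-trip sketch of the 2x2 patch packing under the new arithmetic, assuming a 1024x1024 image, `vae_scale_factor = 8`, and 16 latent channels:

```python
import torch

vae_scale_factor = 8
height = 2 * (1024 // (vae_scale_factor * 2))   # 128 latent rows
width = 2 * (1024 // (vae_scale_factor * 2))    # 128 latent cols

latents = torch.randn(1, 16, height, width)     # (B, C, H, W) from the VAE

# pack: each 2x2 latent neighborhood becomes one token of dim C * 4
packed = latents.view(1, 16, height // 2, 2, width // 2, 2)
packed = packed.permute(0, 2, 4, 1, 3, 5).reshape(1, (height // 2) * (width // 2), 64)
print(packed.shape)  # torch.Size([1, 4096, 64]) -- image_seq_len = 4096

# unpack: mirror of _unpack_latents
unpacked = packed.view(1, height // 2, width // 2, 16, 2, 2)
unpacked = unpacked.permute(0, 3, 1, 4, 2, 5).reshape(1, 16, height, width)
assert torch.equal(latents, unpacked)
```

The sequence length 4096 here is exactly what the new `image_seq_len = (height // vae_scale_factor // 2) * (width // vae_scale_factor // 2)` computes in the timestep-preparation hunk below.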
@@ -532,11 +538,12 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )
 
-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
-
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         shape = (batch_size, num_channels_latents, height, width)
-        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
 
         if latents is not None:
             return latents.to(device=device, dtype=dtype), latent_image_ids
@@ -586,7 +593,7 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         width: Optional[int] = None,
         strength: float = 0.6,
         num_inference_steps: int = 28,
-        timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         guidance_scale: float = 7.0,
         num_images_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
@@ -629,10 +636,10 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -735,8 +742,8 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         )
 
         # 4.Prepare timesteps
-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
-        image_seq_len = (int(height) // self.vae_scale_factor) * (int(width) // self.vae_scale_factor)
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
         mu = calculate_shift(
             image_seq_len,
             self.scheduler.config.base_image_seq_len,
@@ -748,8 +755,7 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
             self.scheduler,
             num_inference_steps,
             device,
-            timesteps,
-            sigmas,
+            sigmas=sigmas,
             mu=mu,
         )
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
diffusers/pipelines/flux/pipeline_flux_inpaint.py (+36 -29)

@@ -209,11 +209,13 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
             scheduler=scheduler,
         )
         self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.mask_processor = VaeImageProcessor(
-            vae_scale_factor=self.vae_scale_factor,
+            vae_scale_factor=self.vae_scale_factor * 2,
             vae_latent_channels=self.vae.config.latent_channels,
             do_normalize=False,
             do_binarize=True,
@@ -222,7 +224,7 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
-        self.default_sample_size = 64
+        self.default_sample_size = 128
 
     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
     def _get_t5_prompt_embeds(
@@ -445,8 +447,10 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
 
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
+            )
 
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -498,9 +502,9 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
     @staticmethod
     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
     def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
-        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
-        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
-        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+        latent_image_ids = torch.zeros(height, width, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
 
         latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
 
@@ -524,13 +528,15 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape
 
-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
 
-        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)
 
-        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+        latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
 
         return latents
 
@@ -553,11 +559,12 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )
 
-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
-
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         shape = (batch_size, num_channels_latents, height, width)
-        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
 
         image = image.to(device=device, dtype=dtype)
         image_latents = self._encode_vae_image(image=image, generator=generator)
@@ -598,8 +605,10 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         device,
         generator,
     ):
-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         # resize the mask to latents shape as we concatenate the mask to the latents
         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
         # and half precision
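The same even-dimension guarantee matters in `prepare_mask_latents`: the mask must land on the latent grid before it is packed alongside the latents. A small sketch of the resize step under the new arithmetic (sizes illustrative; the channel handling and packing that follow in the real method are omitted):

```python
import torch
import torch.nn.functional as F

# Illustrative sizes: a 1024x1024 image with vae_scale_factor = 8.
vae_scale_factor = 8
height = 2 * (1024 // (vae_scale_factor * 2))  # 128, guaranteed even
width = 2 * (1024 // (vae_scale_factor * 2))   # 128, guaranteed even

mask = torch.ones(1, 1, 1024, 1024)               # binary mask at image resolution
mask = F.interpolate(mask, size=(height, width))  # downsampled to the latent grid
print(mask.shape)  # torch.Size([1, 1, 128, 128])
```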
@@ -637,7 +646,6 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
 
         # aligning device to prevent device errors when concating it with the latent model input
         masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
-
         masked_image_latents = self._pack_latents(
             masked_image_latents,
             batch_size,
@@ -685,7 +693,7 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         padding_mask_crop: Optional[int] = None,
         strength: float = 0.6,
         num_inference_steps: int = 28,
-        timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         guidance_scale: float = 7.0,
         num_images_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
@@ -745,10 +753,10 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -865,8 +873,8 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
         )
 
         # 4.Prepare timesteps
-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
-        image_seq_len = (int(height) // self.vae_scale_factor) * (int(width) // self.vae_scale_factor)
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
         mu = calculate_shift(
             image_seq_len,
             self.scheduler.config.base_image_seq_len,
@@ -878,8 +886,7 @@ class FluxInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
             self.scheduler,
             num_inference_steps,
             device,
-            timesteps,
-            sigmas,
+            sigmas=sigmas,
             mu=mu,
         )
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)