diffusers 0.27.2__py3-none-any.whl → 0.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +18 -1
- diffusers/callbacks.py +156 -0
- diffusers/commands/env.py +110 -6
- diffusers/configuration_utils.py +16 -11
- diffusers/dependency_versions_table.py +2 -1
- diffusers/image_processor.py +158 -45
- diffusers/loaders/__init__.py +2 -5
- diffusers/loaders/autoencoder.py +4 -4
- diffusers/loaders/controlnet.py +4 -4
- diffusers/loaders/ip_adapter.py +80 -22
- diffusers/loaders/lora.py +134 -20
- diffusers/loaders/lora_conversion_utils.py +46 -43
- diffusers/loaders/peft.py +4 -3
- diffusers/loaders/single_file.py +401 -170
- diffusers/loaders/single_file_model.py +290 -0
- diffusers/loaders/single_file_utils.py +616 -672
- diffusers/loaders/textual_inversion.py +41 -20
- diffusers/loaders/unet.py +168 -115
- diffusers/loaders/unet_loader_utils.py +163 -0
- diffusers/models/__init__.py +2 -0
- diffusers/models/activations.py +11 -3
- diffusers/models/attention.py +10 -11
- diffusers/models/attention_processor.py +367 -148
- diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
- diffusers/models/autoencoders/autoencoder_kl.py +18 -19
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
- diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
- diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
- diffusers/models/autoencoders/vae.py +23 -24
- diffusers/models/controlnet.py +12 -9
- diffusers/models/controlnet_flax.py +4 -4
- diffusers/models/controlnet_xs.py +1915 -0
- diffusers/models/downsampling.py +17 -18
- diffusers/models/embeddings.py +147 -24
- diffusers/models/model_loading_utils.py +149 -0
- diffusers/models/modeling_flax_pytorch_utils.py +2 -1
- diffusers/models/modeling_flax_utils.py +4 -4
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +118 -98
- diffusers/models/resnet.py +18 -23
- diffusers/models/transformer_temporal.py +3 -3
- diffusers/models/transformers/dual_transformer_2d.py +4 -4
- diffusers/models/transformers/prior_transformer.py +7 -7
- diffusers/models/transformers/t5_film_transformer.py +17 -19
- diffusers/models/transformers/transformer_2d.py +272 -156
- diffusers/models/transformers/transformer_temporal.py +10 -10
- diffusers/models/unets/unet_1d.py +5 -5
- diffusers/models/unets/unet_1d_blocks.py +29 -29
- diffusers/models/unets/unet_2d.py +6 -6
- diffusers/models/unets/unet_2d_blocks.py +137 -128
- diffusers/models/unets/unet_2d_condition.py +19 -15
- diffusers/models/unets/unet_2d_condition_flax.py +6 -5
- diffusers/models/unets/unet_3d_blocks.py +79 -77
- diffusers/models/unets/unet_3d_condition.py +13 -9
- diffusers/models/unets/unet_i2vgen_xl.py +14 -13
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +114 -14
- diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
- diffusers/models/unets/unet_stable_cascade.py +16 -13
- diffusers/models/upsampling.py +17 -20
- diffusers/models/vq_model.py +16 -15
- diffusers/pipelines/__init__.py +25 -3
- diffusers/pipelines/amused/pipeline_amused.py +12 -12
- diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
- diffusers/pipelines/animatediff/pipeline_output.py +3 -2
- diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
- diffusers/pipelines/auto_pipeline.py +21 -17
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
- diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
- diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
- diffusers/pipelines/controlnet_xs/__init__.py +68 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
- diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -18
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
- diffusers/pipelines/dit/pipeline_dit.py +3 -0
- diffusers/pipelines/free_init_utils.py +39 -38
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
- diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
- diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
- diffusers/pipelines/marigold/__init__.py +50 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
- diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
- diffusers/pipelines/pia/pipeline_pia.py +39 -125
- diffusers/pipelines/pipeline_flax_utils.py +4 -4
- diffusers/pipelines/pipeline_loading_utils.py +268 -23
- diffusers/pipelines/pipeline_utils.py +266 -37
- diffusers/pipelines/pixart_alpha/__init__.py +8 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
- diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
- diffusers/pipelines/shap_e/renderer.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +18 -18
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
- diffusers/pipelines/stable_diffusion/__init__.py +0 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
- diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -39
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
- diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
- diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
- diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
- diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
- diffusers/schedulers/__init__.py +2 -2
- diffusers/schedulers/deprecated/__init__.py +1 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
- diffusers/schedulers/scheduling_amused.py +5 -5
- diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
- diffusers/schedulers/scheduling_consistency_models.py +20 -26
- diffusers/schedulers/scheduling_ddim.py +22 -24
- diffusers/schedulers/scheduling_ddim_flax.py +2 -1
- diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
- diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
- diffusers/schedulers/scheduling_ddpm.py +20 -22
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
- diffusers/schedulers/scheduling_deis_multistep.py +42 -42
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +103 -77
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
- diffusers/schedulers/scheduling_dpmsolver_sde.py +23 -23
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +86 -65
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +75 -54
- diffusers/schedulers/scheduling_edm_euler.py +50 -31
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +23 -29
- diffusers/schedulers/scheduling_euler_discrete.py +160 -68
- diffusers/schedulers/scheduling_heun_discrete.py +57 -39
- diffusers/schedulers/scheduling_ipndm.py +8 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +19 -19
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +19 -19
- diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
- diffusers/schedulers/scheduling_lcm.py +21 -23
- diffusers/schedulers/scheduling_lms_discrete.py +24 -26
- diffusers/schedulers/scheduling_pndm.py +20 -20
- diffusers/schedulers/scheduling_repaint.py +20 -20
- diffusers/schedulers/scheduling_sasolver.py +55 -54
- diffusers/schedulers/scheduling_sde_ve.py +19 -19
- diffusers/schedulers/scheduling_tcd.py +39 -30
- diffusers/schedulers/scheduling_unclip.py +15 -15
- diffusers/schedulers/scheduling_unipc_multistep.py +111 -41
- diffusers/schedulers/scheduling_utils.py +14 -5
- diffusers/schedulers/scheduling_utils_flax.py +3 -3
- diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
- diffusers/training_utils.py +56 -1
- diffusers/utils/__init__.py +7 -0
- diffusers/utils/doc_utils.py +1 -0
- diffusers/utils/dummy_pt_objects.py +30 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
- diffusers/utils/dynamic_modules_utils.py +24 -11
- diffusers/utils/hub_utils.py +3 -2
- diffusers/utils/import_utils.py +91 -0
- diffusers/utils/loading_utils.py +2 -2
- diffusers/utils/logging.py +1 -1
- diffusers/utils/peft_utils.py +32 -5
- diffusers/utils/state_dict_utils.py +11 -2
- diffusers/utils/testing_utils.py +71 -6
- diffusers/utils/torch_utils.py +1 -0
- diffusers/video_processor.py +113 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/METADATA +47 -47
- diffusers-0.28.0.dist-info/RECORD +414 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/WHEEL +1 -1
- diffusers-0.27.2.dist-info/RECORD +0 -399
- {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
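Most of the churn above is docstring and type-annotation cleanup, but the listing also introduces several entirely new modules: diffusers/callbacks.py, diffusers/video_processor.py, diffusers/models/controlnet_xs.py, diffusers/loaders/single_file_model.py, diffusers/loaders/unet_loader_utils.py, the marigold and controlnet_xs pipeline packages, and the PixArt-Sigma pipeline. A minimal post-upgrade sanity check is sketched below; it imports only modules named in the listing and makes no assumptions about the classes they export.

```python
# Sketch: verify the 0.28.0 upgrade exposes the new modules named in the listing above.
import importlib

import diffusers

print(diffusers.__version__)  # expected: 0.28.0

new_modules = [
    "diffusers.callbacks",
    "diffusers.video_processor",
    "diffusers.models.controlnet_xs",
    "diffusers.loaders.single_file_model",
    "diffusers.loaders.unet_loader_utils",
    "diffusers.pipelines.marigold",
    "diffusers.pipelines.controlnet_xs",
    "diffusers.pipelines.pixart_alpha.pipeline_pixart_sigma",
]
for name in new_modules:
    importlib.import_module(name)  # raises ImportError if a module is missing
    print(f"ok: {name}")
```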
diffusers/pipelines/audioldm2/pipeline_audioldm2.py

@@ -27,6 +27,8 @@ from transformers import (
     T5EncoderModel,
     T5Tokenizer,
     T5TokenizerFast,
+    VitsModel,
+    VitsTokenizer,
 )

 from ...models import AutoencoderKL

@@ -79,6 +81,37 @@ EXAMPLE_DOC_STRING = """
         >>> # save the best audio sample (index 0) as a .wav file
         >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
         ```
+        ```
+        #Using AudioLDM2 for Text To Speech
+        >>> import scipy
+        >>> import torch
+        >>> from diffusers import AudioLDM2Pipeline
+
+        >>> repo_id = "anhnct/audioldm2_gigaspeech"
+        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> # define the prompts
+        >>> prompt = "A female reporter is speaking"
+        >>> transcript = "wish you have a good day"
+
+        >>> # set the seed for generator
+        >>> generator = torch.Generator("cuda").manual_seed(0)
+
+        >>> # run the generation
+        >>> audio = pipe(
+        ...     prompt,
+        ...     transcription=transcript,
+        ...     num_inference_steps=200,
+        ...     audio_length_in_s=10.0,
+        ...     num_waveforms_per_prompt=2,
+        ...     generator=generator,
+        ...     max_new_tokens=512, #Must set max_new_tokens equa to 512 for TTS
+        ... ).audios
+
+        >>> # save the best audio sample (index 0) as a .wav file
+        >>> scipy.io.wavfile.write("tts.wav", rate=16000, data=audio[0])
+        ```
 """


@@ -116,20 +149,23 @@ class AudioLDM2Pipeline(DiffusionPipeline):
             specifically the [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant. The
             text branch is used to encode the text prompt to a prompt embedding. The full audio-text model is used to
             rank generated waveforms against the text prompt by computing similarity scores.
-        text_encoder_2 ([`~transformers.T5EncoderModel`]):
+        text_encoder_2 ([`~transformers.T5EncoderModel`, `~transformers.VitsModel`]):
             Second frozen text-encoder. AudioLDM2 uses the encoder of
             [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
-            [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) variant.
+            [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) variant. Second frozen text-encoder use
+            for TTS. AudioLDM2 uses the encoder of
+            [Vits](https://huggingface.co/docs/transformers/model_doc/vits#transformers.VitsModel).
         projection_model ([`AudioLDM2ProjectionModel`]):
             A trained model used to linearly project the hidden-states from the first and second text encoder models
             and insert learned SOS and EOS token embeddings. The projected hidden-states from the two text encoders are
-            concatenated to give the input to the language model.
+            concatenated to give the input to the language model. A Learned Position Embedding for the Vits
+            hidden-states
         language_model ([`~transformers.GPT2Model`]):
             An auto-regressive language model used to generate a sequence of hidden-states conditioned on the projected
             outputs from the two text encoders.
         tokenizer ([`~transformers.RobertaTokenizer`]):
             Tokenizer to tokenize text for the first frozen text-encoder.
-        tokenizer_2 ([`~transformers.T5Tokenizer`]):
+        tokenizer_2 ([`~transformers.T5Tokenizer`, `~transformers.VitsTokenizer`]):
             Tokenizer to tokenize text for the second frozen text-encoder.
         feature_extractor ([`~transformers.ClapFeatureExtractor`]):
             Feature extractor to pre-process generated audio waveforms to log-mel spectrograms for automatic scoring.

@@ -146,11 +182,11 @@ class AudioLDM2Pipeline(DiffusionPipeline):
         self,
         vae: AutoencoderKL,
         text_encoder: ClapModel,
-        text_encoder_2: T5EncoderModel,
+        text_encoder_2: Union[T5EncoderModel, VitsModel],
         projection_model: AudioLDM2ProjectionModel,
         language_model: GPT2Model,
         tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast],
-        tokenizer_2: Union[T5Tokenizer, T5TokenizerFast],
+        tokenizer_2: Union[T5Tokenizer, T5TokenizerFast, VitsTokenizer],
         feature_extractor: ClapFeatureExtractor,
         unet: AudioLDM2UNet2DConditionModel,
         scheduler: KarrasDiffusionSchedulers,

@@ -237,7 +273,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
         Generates a sequence of hidden-states from the language model, conditioned on the embedding inputs.

         Parameters:
-            inputs_embeds (`torch.
+            inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                 The sequence used as a prompt for the generation.
             max_new_tokens (`int`):
                 Number of new tokens to generate.

@@ -246,7 +282,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
             function of the model.

         Return:
-            `inputs_embeds (`torch.
+            `inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                 The sequence of generated hidden-states.
         """
         max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens

@@ -273,11 +309,12 @@ class AudioLDM2Pipeline(DiffusionPipeline):
         device,
         num_waveforms_per_prompt,
         do_classifier_free_guidance,
+        transcription=None,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.
-        negative_prompt_embeds: Optional[torch.
-        generated_prompt_embeds: Optional[torch.
-        negative_generated_prompt_embeds: Optional[torch.
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        generated_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_generated_prompt_embeds: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.LongTensor] = None,
         negative_attention_mask: Optional[torch.LongTensor] = None,
         max_new_tokens: Optional[int] = None,

@@ -288,6 +325,8 @@ class AudioLDM2Pipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 prompt to be encoded
+            transcription (`str` or `List[str]`):
+                transcription of text to speech
             device (`torch.device`):
                 torch device
             num_waveforms_per_prompt (`int`):

@@ -298,18 +337,18 @@ class AudioLDM2Pipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the audio generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-computed text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, *e.g.*
                 prompt weighting. If not provided, text embeddings will be computed from `prompt` input argument.
-            negative_prompt_embeds (`torch.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-computed negative text embeddings from the Flan T5 model. Can be used to easily tweak text inputs,
                 *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
                 `negative_prompt` input argument.
-            generated_prompt_embeds (`torch.
+            generated_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs,
                 *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
                 argument.
-            negative_generated_prompt_embeds (`torch.
+            negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text
                 inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
                 `negative_prompt` input argument.

@@ -322,11 +361,11 @@ class AudioLDM2Pipeline(DiffusionPipeline):
             max_new_tokens (`int`, *optional*, defaults to None):
                 The number of new tokens to generate with the GPT2 language model.
         Returns:
-            prompt_embeds (`torch.
+            prompt_embeds (`torch.Tensor`):
                 Text embeddings from the Flan T5 model.
             attention_mask (`torch.LongTensor`):
                 Attention mask to be applied to the `prompt_embeds`.
-            generated_prompt_embeds (`torch.
+            generated_prompt_embeds (`torch.Tensor`):
                 Text embeddings generated from the GPT2 langauge model.

         Example:

@@ -368,16 +407,26 @@ class AudioLDM2Pipeline(DiffusionPipeline):

         # Define tokenizers and text encoders
         tokenizers = [self.tokenizer, self.tokenizer_2]
-
+        is_vits_text_encoder = isinstance(self.text_encoder_2, VitsModel)
+
+        if is_vits_text_encoder:
+            text_encoders = [self.text_encoder, self.text_encoder_2.text_encoder]
+        else:
+            text_encoders = [self.text_encoder, self.text_encoder_2]

         if prompt_embeds is None:
             prompt_embeds_list = []
             attention_mask_list = []

             for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+                use_prompt = isinstance(
+                    tokenizer, (RobertaTokenizer, RobertaTokenizerFast, T5Tokenizer, T5TokenizerFast)
+                )
                 text_inputs = tokenizer(
-                    prompt,
-                    padding="max_length"
+                    prompt if use_prompt else transcription,
+                    padding="max_length"
+                    if isinstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast, VitsTokenizer))
+                    else True,
                     max_length=tokenizer.model_max_length,
                     truncation=True,
                     return_tensors="pt",

@@ -407,6 +456,18 @@ class AudioLDM2Pipeline(DiffusionPipeline):
                     prompt_embeds = prompt_embeds[:, None, :]
                     # make sure that we attend to this single hidden-state
                     attention_mask = attention_mask.new_ones((batch_size, 1))
+                elif is_vits_text_encoder:
+                    # Add end_token_id and attention mask in the end of sequence phonemes
+                    for text_input_id, text_attention_mask in zip(text_input_ids, attention_mask):
+                        for idx, phoneme_id in enumerate(text_input_id):
+                            if phoneme_id == 0:
+                                text_input_id[idx] = 182
+                                text_attention_mask[idx] = 1
+                                break
+                    prompt_embeds = text_encoder(
+                        text_input_ids, attention_mask=attention_mask, padding_mask=attention_mask.unsqueeze(-1)
+                    )
+                    prompt_embeds = prompt_embeds[0]
                 else:
                     prompt_embeds = text_encoder(
                         text_input_ids,

@@ -485,7 +546,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
                 uncond_tokens,
                 padding="max_length",
                 max_length=tokenizer.model_max_length
-                if isinstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast))
+                if isinstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast, VitsTokenizer))
                 else max_length,
                 truncation=True,
                 return_tensors="pt",

@@ -503,6 +564,15 @@ class AudioLDM2Pipeline(DiffusionPipeline):
                     negative_prompt_embeds = negative_prompt_embeds[:, None, :]
                     # make sure that we attend to this single hidden-state
                     negative_attention_mask = negative_attention_mask.new_ones((batch_size, 1))
+                elif is_vits_text_encoder:
+                    negative_prompt_embeds = torch.zeros(
+                        batch_size,
+                        tokenizer.model_max_length,
+                        text_encoder.config.hidden_size,
+                    ).to(dtype=self.text_encoder_2.dtype, device=device)
+                    negative_attention_mask = torch.zeros(batch_size, tokenizer.model_max_length).to(
+                        dtype=self.text_encoder_2.dtype, device=device
+                    )
                 else:
                     negative_prompt_embeds = text_encoder(
                         uncond_input_ids,

@@ -623,6 +693,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
         audio_length_in_s,
         vocoder_upsample_factor,
         callback_steps,
+        transcription=None,
         negative_prompt=None,
         prompt_embeds=None,
         negative_prompt_embeds=None,

@@ -690,6 +761,14 @@ class AudioLDM2Pipeline(DiffusionPipeline):
                     f"`attention_mask: {attention_mask.shape} != `prompt_embeds` {prompt_embeds.shape}"
                 )

+        if transcription is None:
+            if self.text_encoder_2.config.model_type == "vits":
+                raise ValueError("Cannot forward without transcription. Please make sure to" " have transcription")
+        elif transcription is not None and (
+            not isinstance(transcription, str) and not isinstance(transcription, list)
+        ):
+            raise ValueError(f"`transcription` has to be of type `str` or `list` but is {type(transcription)}")
+
         if generated_prompt_embeds is not None and negative_generated_prompt_embeds is not None:
             if generated_prompt_embeds.shape != negative_generated_prompt_embeds.shape:
                 raise ValueError(

@@ -711,8 +790,8 @@ class AudioLDM2Pipeline(DiffusionPipeline):
         shape = (
             batch_size,
             num_channels_latents,
-            height // self.vae_scale_factor,
-            self.vocoder.config.model_in_dim // self.vae_scale_factor,
+            int(height) // self.vae_scale_factor,
+            int(self.vocoder.config.model_in_dim) // self.vae_scale_factor,
         )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(

@@ -734,6 +813,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,
+        transcription: Union[str, List[str]] = None,
         audio_length_in_s: Optional[float] = None,
         num_inference_steps: int = 200,
         guidance_scale: float = 3.5,

@@ -741,16 +821,16 @@ class AudioLDM2Pipeline(DiffusionPipeline):
         num_waveforms_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.
-        prompt_embeds: Optional[torch.
-        negative_prompt_embeds: Optional[torch.
-        generated_prompt_embeds: Optional[torch.
-        negative_generated_prompt_embeds: Optional[torch.
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        generated_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_generated_prompt_embeds: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.LongTensor] = None,
         negative_attention_mask: Optional[torch.LongTensor] = None,
         max_new_tokens: Optional[int] = None,
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: Optional[int] = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         output_type: Optional[str] = "np",

@@ -761,6 +841,8 @@ class AudioLDM2Pipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
+            transcription (`str` or `List[str]`, *optional*):\
+                The transcript for text to speech.
             audio_length_in_s (`int`, *optional*, defaults to 10.24):
                 The length of the generated audio sample in seconds.
             num_inference_steps (`int`, *optional*, defaults to 200):

@@ -783,21 +865,21 @@ class AudioLDM2Pipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for spectrogram
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            generated_prompt_embeds (`torch.
+            generated_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs,
                 *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
                 argument.
-            negative_generated_prompt_embeds (`torch.
+            negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text
                 inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
                 `negative_prompt` input argument.

@@ -815,7 +897,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.

@@ -857,6 +939,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
             audio_length_in_s,
             vocoder_upsample_factor,
             callback_steps,
+            transcription,
             negative_prompt,
             prompt_embeds,
             negative_prompt_embeds,

@@ -886,6 +969,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
             device,
             num_waveforms_per_prompt,
             do_classifier_free_guidance,
+            transcription,
             negative_prompt,
             prompt_embeds=prompt_embeds,
             negative_prompt_embeds=negative_prompt_embeds,
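Beyond the new text-to-speech path, most of the hunks above only loosen signature and docstring type hints to plain `torch.Tensor`; the public call surface is otherwise unchanged. A small sketch of a caller that relies on the updated `callback` annotation follows (the checkpoint id and prompt are illustrative assumptions, not something this diff prescribes):

```python
import torch
from diffusers import AudioLDM2Pipeline


def on_step(step: int, timestep: int, latents: torch.Tensor) -> None:
    # In 0.28.0 the callback's latents argument is annotated as a plain torch.Tensor.
    print(f"step={step} timestep={timestep} latents={tuple(latents.shape)}")


# Assumed checkpoint id; any AudioLDM2 checkpoint should work the same way.
pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

audio = pipe(
    "Techno music with a strong, upbeat tempo and high melodic riffs",
    num_inference_steps=200,
    audio_length_in_s=10.0,
    callback=on_step,
    callback_steps=50,
).audios[0]
```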
diffusers/pipelines/auto_pipeline.py

@@ -45,7 +45,8 @@ from .kandinsky2_2 import (
 )
 from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
 from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
-from .pixart_alpha import PixArtAlphaPipeline
+from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
+from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline
 from .stable_diffusion import (
     StableDiffusionImg2ImgPipeline,
     StableDiffusionInpaintPipeline,

@@ -70,8 +71,10 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("stable-diffusion-controlnet", StableDiffusionControlNetPipeline),
         ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetPipeline),
         ("wuerstchen", WuerstchenCombinedPipeline),
+        ("cascade", StableCascadeCombinedPipeline),
         ("lcm", LatentConsistencyModelPipeline),
-        ("pixart", PixArtAlphaPipeline),
+        ("pixart-alpha", PixArtAlphaPipeline),
+        ("pixart-sigma", PixArtSigmaPipeline),
     ]
 )

@@ -106,6 +109,7 @@ _AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict(
         ("kandinsky", KandinskyPipeline),
         ("kandinsky22", KandinskyV22Pipeline),
         ("wuerstchen", WuerstchenDecoderPipeline),
+        ("cascade", StableCascadeDecoderPipeline),
     ]
 )
 _AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict(

@@ -213,7 +217,7 @@ class AutoPipelineForText2Image(ConfigMixin):
         ```

         Parameters:
-
+            pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
                 Can be either:

                     - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline

@@ -230,9 +234,9 @@ class AutoPipelineForText2Image(ConfigMixin):
             cache_dir (`Union[str, os.PathLike]`, *optional*):
                 Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
                 is not used.
-            resume_download
-
-
+            resume_download:
+                Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+                of Diffusers.
             proxies (`Dict[str, str]`, *optional*):
                 A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.

@@ -307,7 +311,7 @@ class AutoPipelineForText2Image(ConfigMixin):
         """
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download",
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)
         local_files_only = kwargs.pop("local_files_only", False)

@@ -486,7 +490,7 @@ class AutoPipelineForImage2Image(ConfigMixin):
         ```

         Parameters:
-
+            pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
                 Can be either:

                     - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline

@@ -503,9 +507,9 @@ class AutoPipelineForImage2Image(ConfigMixin):
             cache_dir (`Union[str, os.PathLike]`, *optional*):
                 Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
                 is not used.
-            resume_download
-
-
+            resume_download:
+                Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+                of Diffusers.
             proxies (`Dict[str, str]`, *optional*):
                 A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.

@@ -580,7 +584,7 @@ class AutoPipelineForImage2Image(ConfigMixin):
         """
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download",
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)
         local_files_only = kwargs.pop("local_files_only", False)

@@ -762,7 +766,7 @@ class AutoPipelineForInpainting(ConfigMixin):
         ```

         Parameters:
-
+            pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
                 Can be either:

                     - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline

@@ -779,9 +783,9 @@ class AutoPipelineForInpainting(ConfigMixin):
             cache_dir (`Union[str, os.PathLike]`, *optional*):
                 Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
                 is not used.
-            resume_download
-
-
+            resume_download:
+                Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+                of Diffusers.
             proxies (`Dict[str, str]`, *optional*):
                 A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.

@@ -856,7 +860,7 @@ class AutoPipelineForInpainting(ConfigMixin):
         """
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download",
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)
         local_files_only = kwargs.pop("local_files_only", False)
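In practice, the mapping changes mean the auto pipelines can now resolve Stable Cascade and PixArt-Sigma checkpoints by their pipeline class, and `resume_download` is still accepted but ignored. A minimal sketch, with the checkpoint id as an assumption rather than something the diff pins down:

```python
import torch
from diffusers import AutoPipelineForText2Image

# PixArt-Sigma checkpoints now resolve through the "pixart-sigma" entry added above.
pipe = AutoPipelineForText2Image.from_pretrained(
    "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",  # assumed checkpoint id
    torch_dtype=torch.float16,
).to("cuda")

# A `resume_download` kwarg would be popped and ignored here; downloads resume by default.
image = pipe("an isometric voxel city at dusk").images[0]
image.save("city.png")
```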
diffusers/pipelines/blip_diffusion/blip_image_processing.py

@@ -298,7 +298,7 @@ class BlipImageProcessor(BaseImageProcessor):
         return encoded_outputs

     # Follows diffusers.VaeImageProcessor.postprocess
-    def postprocess(self, sample: torch.
+    def postprocess(self, sample: torch.Tensor, output_type: str = "pil"):
         if output_type not in ["pt", "np", "pil"]:
             raise ValueError(
                 f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']"
diffusers/pipelines/blip_diffusion/modeling_blip2.py

@@ -117,7 +117,7 @@ class Blip2VisionEmbeddings(nn.Module):

         self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

-    def forward(self, pixel_values: torch.
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
         batch_size = pixel_values.shape[0]
         target_dtype = self.patch_embedding.weight.dtype
         patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]

@@ -376,7 +376,7 @@ class Blip2VisionModel(Blip2PreTrainedModel):
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig)
     def forward(
         self,
-        pixel_values: Optional[torch.
+        pixel_values: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,

@@ -524,15 +524,15 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
         return_dict=None,
     ):
         r"""
-        encoder_hidden_states (`torch.
+        encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
             Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
             the model is configured as a decoder.
-        encoder_attention_mask (`torch.
+        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, `optional`):
             Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
             the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
-        past_key_values (`tuple(tuple(torch.
+        past_key_values (`tuple(tuple(torch.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of:
             shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and
             value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are
             used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key
diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py

@@ -186,7 +186,7 @@ class ContextCLIPTextEmbeddings(nn.Module):
         ctx_begin_pos: list,
         input_ids: Optional[torch.LongTensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.
+        inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         if ctx_embeddings is None:
             ctx_len = 0
diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py

@@ -191,7 +191,7 @@ class BlipDiffusionPipeline(DiffusionPipeline):
         reference_image: PIL.Image.Image,
         source_subject_category: List[str],
         target_subject_category: List[str],
-        latents: Optional[torch.
+        latents: Optional[torch.Tensor] = None,
         guidance_scale: float = 7.5,
         height: int = 512,
         width: int = 512,

@@ -215,7 +215,7 @@ class BlipDiffusionPipeline(DiffusionPipeline):
                 The source subject category.
             target_subject_category (`List[str]`):
                 The target subject category.
-            latents (`torch.
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by random sampling.
diffusers/pipelines/consistency_models/pipeline_consistency_models.py

@@ -105,7 +105,7 @@ class ConsistencyModelPipeline(DiffusionPipeline):
         return latents

     # Follows diffusers.VaeImageProcessor.postprocess
-    def postprocess_image(self, sample: torch.
+    def postprocess_image(self, sample: torch.Tensor, output_type: str = "pil"):
         if output_type not in ["pt", "np", "pil"]:
             raise ValueError(
                 f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']"

@@ -173,10 +173,10 @@ class ConsistencyModelPipeline(DiffusionPipeline):
         num_inference_steps: int = 1,
         timesteps: List[int] = None,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
     ):
         r"""

@@ -195,7 +195,7 @@ class ConsistencyModelPipeline(DiffusionPipeline):
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.

@@ -205,7 +205,7 @@ class ConsistencyModelPipeline(DiffusionPipeline):
                 Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
diffusers/pipelines/controlnet/multicontrolnet.py

@@ -31,7 +31,7 @@ class MultiControlNetModel(ModelMixin):

     def forward(
         self,
-        sample: torch.
+        sample: torch.Tensor,
         timestep: Union[torch.Tensor, float, int],
         encoder_hidden_states: torch.Tensor,
         controlnet_cond: List[torch.tensor],

@@ -100,20 +100,16 @@ class MultiControlNetModel(ModelMixin):
             variant (`str`, *optional*):
                 If specified, weights are saved in the format pytorch_model.<variant>.bin.
         """
-        idx
-
-        for controlnet in self.nets:
+        for idx, controlnet in enumerate(self.nets):
+            suffix = "" if idx == 0 else f"_{idx}"
             controlnet.save_pretrained(
-
+                save_directory + suffix,
                 is_main_process=is_main_process,
                 save_function=save_function,
                 safe_serialization=safe_serialization,
                 variant=variant,
             )

-            idx += 1
-            model_path_to_save = model_path_to_save + f"_{idx}"
-
     @classmethod
     def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs):
         r"""
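The rewritten `save_pretrained` keeps the existing on-disk layout, writing the first ControlNet to `save_directory` and each subsequent one to `save_directory_1`, `save_directory_2`, and so on, while dropping the manual index bookkeeping. A round-trip sketch follows (the checkpoint ids are assumptions):

```python
from diffusers import ControlNetModel
from diffusers.pipelines.controlnet import MultiControlNetModel

# Wrap two single ControlNets into one MultiControlNetModel (assumed checkpoint ids).
canny = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
depth = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth")
multi = MultiControlNetModel([canny, depth])

# Writes ./multi-controlnet (idx 0, empty suffix) and ./multi-controlnet_1 (idx 1).
multi.save_pretrained("multi-controlnet")

# from_pretrained walks the suffixed directories back into a MultiControlNetModel.
restored = MultiControlNetModel.from_pretrained("multi-controlnet")
print(len(restored.nets))  # 2
```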