diffusers 0.28.2__py3-none-any.whl → 0.29.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +9 -1
- diffusers/commands/env.py +1 -5
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +2 -1
- diffusers/loaders/__init__.py +2 -2
- diffusers/loaders/lora.py +406 -140
- diffusers/loaders/lora_conversion_utils.py +7 -1
- diffusers/loaders/single_file.py +1 -1
- diffusers/loaders/single_file_model.py +5 -0
- diffusers/loaders/single_file_utils.py +242 -2
- diffusers/loaders/unet.py +307 -272
- diffusers/models/__init__.py +5 -3
- diffusers/models/attention.py +125 -1
- diffusers/models/attention_processor.py +169 -1
- diffusers/models/autoencoders/__init__.py +1 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
- diffusers/models/autoencoders/autoencoder_kl.py +17 -6
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -2
- diffusers/models/autoencoders/consistency_decoder_vae.py +9 -9
- diffusers/models/autoencoders/vq_model.py +182 -0
- diffusers/models/controlnet_xs.py +6 -6
- diffusers/models/embeddings.py +112 -84
- diffusers/models/model_loading_utils.py +55 -0
- diffusers/models/modeling_utils.py +128 -17
- diffusers/models/normalization.py +11 -6
- diffusers/models/transformers/__init__.py +1 -0
- diffusers/models/transformers/dual_transformer_2d.py +5 -4
- diffusers/models/transformers/hunyuan_transformer_2d.py +149 -2
- diffusers/models/transformers/prior_transformer.py +5 -5
- diffusers/models/transformers/transformer_2d.py +2 -2
- diffusers/models/transformers/transformer_sd3.py +344 -0
- diffusers/models/transformers/transformer_temporal.py +12 -10
- diffusers/models/unets/unet_1d.py +3 -3
- diffusers/models/unets/unet_2d.py +3 -3
- diffusers/models/unets/unet_2d_condition.py +4 -15
- diffusers/models/unets/unet_3d_condition.py +5 -17
- diffusers/models/unets/unet_i2vgen_xl.py +4 -4
- diffusers/models/unets/unet_motion_model.py +4 -4
- diffusers/models/unets/unet_spatio_temporal_condition.py +3 -3
- diffusers/models/vq_model.py +8 -165
- diffusers/pipelines/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +4 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +4 -3
- diffusers/pipelines/controlnet/pipeline_controlnet.py +4 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +4 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +4 -3
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +4 -3
- diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +4 -3
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +24 -5
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +4 -3
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +4 -3
- diffusers/pipelines/marigold/marigold_image_processing.py +35 -20
- diffusers/pipelines/pia/pipeline_pia.py +4 -3
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +17 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -6
- diffusers/pipelines/stable_diffusion_3/__init__.py +52 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +886 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +923 -0
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +4 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +10 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +4 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +4 -3
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +4 -3
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +4 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +4 -3
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +4 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -3
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +4 -3
- diffusers/schedulers/__init__.py +2 -0
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -3
- diffusers/schedulers/scheduling_edm_euler.py +2 -4
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +287 -0
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/training_utils.py +4 -4
- diffusers/utils/__init__.py +3 -0
- diffusers/utils/constants.py +2 -0
- diffusers/utils/dummy_pt_objects.py +30 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +30 -0
- diffusers/utils/dynamic_modules_utils.py +15 -13
- diffusers/utils/hub_utils.py +106 -0
- diffusers/utils/import_utils.py +0 -1
- diffusers/utils/logging.py +3 -1
- diffusers/utils/state_dict_utils.py +2 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/METADATA +45 -45
- {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/RECORD +108 -111
- {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/WHEEL +1 -1
- diffusers/models/dual_transformer_2d.py +0 -20
- diffusers/models/prior_transformer.py +0 -12
- diffusers/models/t5_film_transformer.py +0 -70
- diffusers/models/transformer_2d.py +0 -25
- diffusers/models/transformer_temporal.py +0 -34
- diffusers/models/unet_1d.py +0 -26
- diffusers/models/unet_1d_blocks.py +0 -203
- diffusers/models/unet_2d.py +0 -27
- diffusers/models/unet_2d_blocks.py +0 -375
- diffusers/models/unet_2d_condition.py +0 -25
- {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/LICENSE +0 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py
CHANGED
@@ -455,9 +455,10 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
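This same change recurs across the pipeline hunks below: the PEFT unscaling of the text encoder's LoRA layers is now wrapped in an `if self.text_encoder is not None:` guard, so pipelines constructed without a text encoder no longer fail here. A minimal sketch of the guarded pattern, assuming a pipeline object `pipe` and a `lora_scale` float as in the hunk (the helper name `_maybe_unscale_text_encoder` is hypothetical; `LoraLoaderMixin`, `USE_PEFT_BACKEND`, and `unscale_lora_layers` are the real diffusers names used above):

from diffusers.loaders import LoraLoaderMixin
from diffusers.utils import USE_PEFT_BACKEND, unscale_lora_layers

def _maybe_unscale_text_encoder(pipe, lora_scale):
    # Hypothetical helper mirroring the 0.29.0 behavior: skip unscaling
    # entirely when the pipeline was built with text_encoder=None.
    if pipe.text_encoder is not None:
        if isinstance(pipe, LoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(pipe.text_encoder, lora_scale)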
diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py
CHANGED
@@ -85,10 +85,9 @@ EXAMPLE_DOC_STRING = """
 
         >>> init_image = download_image(img_url).resize((768, 768))
 
-        >>> pipe = StableDiffusionDiffEditPipeline.from_pretrained(
+        >>> pipeline = StableDiffusionDiffEditPipeline.from_pretrained(
         ...     "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
         ... )
-        >>> pipe = pipe.to("cuda")
 
         >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
         >>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
@@ -97,9 +96,9 @@ EXAMPLE_DOC_STRING = """
         >>> mask_prompt = "A bowl of fruits"
         >>> prompt = "A bowl of pears"
 
-        >>> mask_image = pipe.generate_mask(image=init_image, source_prompt=prompt, target_prompt=mask_prompt)
-        >>> image_latents = pipe.invert(image=init_image, prompt=mask_prompt).latents
-        >>> image = pipe(prompt=prompt, mask_image=mask_image, image_latents=image_latents).images[0]
+        >>> mask_image = pipeline.generate_mask(image=init_image, source_prompt=prompt, target_prompt=mask_prompt)
+        >>> image_latents = pipeline.invert(image=init_image, prompt=mask_prompt).latents
+        >>> image = pipeline(prompt=prompt, mask_image=mask_image, image_latents=image_latents).images[0]
         ```
 """
 
@@ -122,10 +121,9 @@ EXAMPLE_INVERT_DOC_STRING = """
 
         >>> init_image = download_image(img_url).resize((768, 768))
 
-        >>> pipe = StableDiffusionDiffEditPipeline.from_pretrained(
+        >>> pipeline = StableDiffusionDiffEditPipeline.from_pretrained(
         ...     "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
         ... )
-        >>> pipe = pipe.to("cuda")
 
         >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
         >>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
@@ -133,7 +131,7 @@ EXAMPLE_INVERT_DOC_STRING = """
 
         >>> prompt = "A bowl of fruits"
 
-        >>> inverted_latents = pipe.invert(image=init_image, prompt=prompt).latents
+        >>> inverted_latents = pipeline.invert(image=init_image, prompt=prompt).latents
         ```
 """
 
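Put together, the refreshed DiffEdit docstrings above now use a single `pipeline` variable end to end. A sketch of the full flow assembled from those examples (the input image path is a placeholder; the original docstring loads the image from a URL via a `download_image` helper that is not part of this diff):

import torch
from PIL import Image
from diffusers import DDIMInverseScheduler, DDIMScheduler, StableDiffusionDiffEditPipeline

pipeline = StableDiffusionDiffEditPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
)
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)

init_image = Image.open("fruit_bowl.png").resize((768, 768))  # placeholder input image
mask_prompt = "A bowl of fruits"
prompt = "A bowl of pears"

mask_image = pipeline.generate_mask(image=init_image, source_prompt=prompt, target_prompt=mask_prompt)
image_latents = pipeline.invert(image=init_image, prompt=mask_prompt).latents
image = pipeline(prompt=prompt, mask_image=mask_image, image_latents=image_latents).images[0]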
@@ -582,9 +580,10 @@ class StableDiffusionDiffEditPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
CHANGED
@@ -381,9 +381,10 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
CHANGED
@@ -406,9 +406,10 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
CHANGED
@@ -355,9 +355,10 @@ class StableDiffusionKDiffusionPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py
CHANGED
@@ -455,9 +455,10 @@ class StableDiffusionLDM3DPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py
CHANGED
@@ -427,9 +427,10 @@ class StableDiffusionPanoramaPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py
CHANGED
@@ -370,9 +370,10 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
CHANGED
@@ -472,9 +472,10 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
CHANGED
@@ -315,9 +315,10 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
CHANGED
@@ -350,9 +350,10 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
CHANGED
@@ -963,9 +963,10 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/unidiffuser/modeling_uvit.py
CHANGED
@@ -9,8 +9,8 @@ from ...models import ModelMixin
 from ...models.attention import FeedForward
 from ...models.attention_processor import Attention
 from ...models.embeddings import TimestepEmbedding, Timesteps, get_2d_sincos_pos_embed
+from ...models.modeling_outputs import Transformer2DModelOutput
 from ...models.normalization import AdaLayerNorm
-from ...models.transformers.transformer_2d import Transformer2DModelOutput
 from ...utils import logging
 
 
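The only substantive change here is the import path: `Transformer2DModelOutput` now comes from `diffusers.models.modeling_outputs` rather than from the `transformer_2d` module. Code outside the library that used the old location would migrate the same way (an absolute-import sketch of the relative imports above):

# before, the 0.28.2 location:
# from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput

# after, the 0.29.0 location:
from diffusers.models.modeling_outputs import Transformer2DModelOutput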
diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
CHANGED
@@ -554,9 +554,10 @@ class UniDiffuserPipeline(DiffusionPipeline):
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/schedulers/__init__.py
CHANGED
@@ -56,6 +56,7 @@ else:
     _import_structure["scheduling_edm_euler"] = ["EDMEulerScheduler"]
     _import_structure["scheduling_euler_ancestral_discrete"] = ["EulerAncestralDiscreteScheduler"]
     _import_structure["scheduling_euler_discrete"] = ["EulerDiscreteScheduler"]
+    _import_structure["scheduling_flow_match_euler_discrete"] = ["FlowMatchEulerDiscreteScheduler"]
     _import_structure["scheduling_heun_discrete"] = ["HeunDiscreteScheduler"]
     _import_structure["scheduling_ipndm"] = ["IPNDMScheduler"]
     _import_structure["scheduling_k_dpm_2_ancestral_discrete"] = ["KDPM2AncestralDiscreteScheduler"]
@@ -151,6 +152,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     from .scheduling_edm_euler import EDMEulerScheduler
     from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
     from .scheduling_euler_discrete import EulerDiscreteScheduler
+    from .scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
     from .scheduling_heun_discrete import HeunDiscreteScheduler
     from .scheduling_ipndm import IPNDMScheduler
     from .scheduling_k_dpm_2_ancestral_discrete import KDPM2AncestralDiscreteScheduler
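These two hunks wire the new scheduler into the package's lazy-import machinery, so it is importable both lazily and under type checking. A quick smoke test, assuming diffusers 0.29.0 is installed:

# FlowMatchEulerDiscreteScheduler is exported from the subpackage (and, per the
# diffusers/__init__.py change in the file list above, from the package root).
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=1.0)
print(len(scheduler.timesteps))  # 1000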
diffusers/schedulers/scheduling_dpmsolver_sde.py
CHANGED
@@ -370,7 +370,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
         timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sig_proposed])
         return timesteps
 
-    # copied from diffusers.schedulers.scheduling_euler_discrete._sigma_to_t
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -394,7 +394,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
         t = t.reshape(sigma.shape)
         return t
 
-    # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras
+    # copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor:
         """Constructs the noise schedule of Karras et al. (2022)."""
 
diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
CHANGED
@@ -243,13 +243,13 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
 
         self.num_inference_steps = num_inference_steps
 
-        ramp = np.linspace(0, 1, self.num_inference_steps)
+        ramp = torch.linspace(0, 1, self.num_inference_steps)
         if self.config.sigma_schedule == "karras":
             sigmas = self._compute_karras_sigmas(ramp)
         elif self.config.sigma_schedule == "exponential":
             sigmas = self._compute_exponential_sigmas(ramp)
 
-        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
+        sigmas = sigmas.to(dtype=torch.float32, device=device)
         self.timesteps = self.precondition_noise(sigmas)
 
         if self.config.final_sigmas_type == "sigma_min":
@@ -283,7 +283,6 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
         min_inv_rho = sigma_min ** (1 / rho)
         max_inv_rho = sigma_max ** (1 / rho)
         sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
-
         return sigmas
 
     # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_exponential_sigmas
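For reference, `_compute_karras_sigmas` (whose tail is visible in this hunk) implements the rho-schedule from Karras et al. (2022); with `ramp` now produced by `torch.linspace`, the computation stays in torch end to end. A standalone sketch of the same formula (the `sigma_min`/`sigma_max`/`rho` defaults here are illustrative, not read from this diff):

import torch

def karras_sigmas(n, sigma_min=0.002, sigma_max=80.0, rho=7.0):
    # rho-schedule from Karras et al. (2022), matching the hunk above
    ramp = torch.linspace(0, 1, n)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

sigmas = karras_sigmas(25)  # descends from sigma_max to sigma_min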
diffusers/schedulers/scheduling_edm_euler.py
CHANGED
@@ -16,7 +16,6 @@ import math
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 
-import numpy as np
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
@@ -210,13 +209,13 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
         """
         self.num_inference_steps = num_inference_steps
 
-        ramp = np.linspace(0, 1, self.num_inference_steps)
+        ramp = torch.linspace(0, 1, self.num_inference_steps)
         if self.config.sigma_schedule == "karras":
             sigmas = self._compute_karras_sigmas(ramp)
         elif self.config.sigma_schedule == "exponential":
             sigmas = self._compute_exponential_sigmas(ramp)
 
-        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
+        sigmas = sigmas.to(dtype=torch.float32, device=device)
         self.timesteps = self.precondition_noise(sigmas)
 
         self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
@@ -234,7 +233,6 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
         min_inv_rho = sigma_min ** (1 / rho)
         max_inv_rho = sigma_max ** (1 / rho)
         sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
-
         return sigmas
 
     def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor:
diffusers/schedulers/scheduling_flow_match_euler_discrete.py
ADDED
@@ -0,0 +1,287 @@
+# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+
+    prev_sample: torch.FloatTensor
+
+
+class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Euler scheduler.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        shift (`float`, defaults to 1.0):
+            The shift value for the timestep schedule.
+    """
+
+    _compatibles = []
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        shift: float = 1.0,
+    ):
+        timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)
+
+        sigmas = timesteps / num_train_timesteps
+        sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
+
+        self.timesteps = sigmas * num_train_timesteps
+
+        self._step_index = None
+        self._begin_index = None
+
+        self.sigmas = sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+        self.sigma_min = self.sigmas[-1].item()
+        self.sigma_max = self.sigmas[0].item()
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_noise(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        noise: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        """
+        Foward process in flow-matching
+
+        Args:
+            sample (`torch.FloatTensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.FloatTensor`:
+                A scaled input sample.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+        sample = sigma * noise + (1.0 - sigma) * sample
+
+        return sample
+
+    def _sigma_to_t(self, sigma):
+        return sigma * self.config.num_train_timesteps
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+
+        timesteps = np.linspace(
+            self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps
+        )
+
+        sigmas = timesteps / self.config.num_train_timesteps
+        sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas)
+        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
+
+        timesteps = sigmas * self.config.num_train_timesteps
+        self.timesteps = timesteps.to(device=device)
+        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
+
+        self._step_index = None
+        self._begin_index = None
+
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        sample: torch.FloatTensor,
+        s_churn: float = 0.0,
+        s_tmin: float = 0.0,
+        s_tmax: float = float("inf"),
+        s_noise: float = 1.0,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                A current instance of a sample created by the diffusion process.
+            s_churn (`float`):
+            s_tmin  (`float`):
+            s_tmax  (`float`):
+            s_noise (`float`, defaults to 1.0):
+                Scaling factor for noise added to the sample.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
+                tuple.
+
+        Returns:
+            [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
+        """
+
+        if (
+            isinstance(timestep, int)
+            or isinstance(timestep, torch.IntTensor)
+            or isinstance(timestep, torch.LongTensor)
+        ):
+            raise ValueError(
+                (
+                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                    " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+                    " one of the `scheduler.timesteps` as a timestep."
+                ),
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Upcast to avoid precision issues when computing prev_sample
+        sample = sample.to(torch.float32)
+
+        sigma = self.sigmas[self.step_index]
+
+        gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
+
+        noise = randn_tensor(
+            model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator
+        )
+
+        eps = noise * s_noise
+        sigma_hat = sigma * (gamma + 1)
+
+        if gamma > 0:
+            sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        # NOTE: "original_sample" should not be an expected prediction_type but is left in for
+        # backwards compatibility
+
+        # if self.config.prediction_type == "vector_field":
+
+        denoised = sample - model_output * sigma
+        # 2. Convert to an ODE derivative
+        derivative = (sample - denoised) / sigma_hat
+
+        dt = self.sigmas[self.step_index + 1] - sigma_hat
+
+        prev_sample = sample + derivative * dt
+        # Cast sample back to model compatible dtype
+        prev_sample = prev_sample.to(model_output.dtype)
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
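Taken together, the new file defines a discrete Euler solver for flow matching: `set_timesteps` builds a shifted linear sigma schedule and `step` takes one Euler step in sigma. A minimal denoising loop against it might look like this sketch (the zero `model_output` is a stand-in for a network predicting the flow-matching vector field; in this release the real consumer is the new Stable Diffusion 3 pipeline listed above):

import torch
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=1.0)
scheduler.set_timesteps(num_inference_steps=28)

sample = torch.randn(1, 4, 64, 64)  # start from noise; the shape is arbitrary to the scheduler

for t in scheduler.timesteps:
    model_output = torch.zeros_like(sample)  # stand-in for the model's vector-field prediction
    sample = scheduler.step(model_output, t, sample).prev_sample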
diffusers/schedulers/scheduling_lms_discrete.py
CHANGED
@@ -324,7 +324,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
         else:
             self._step_index = self._begin_index
 
-    # copied from diffusers.schedulers.scheduling_euler_discrete._sigma_to_t
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -348,7 +348,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
         t = t.reshape(sigma.shape)
         return t
 
-    # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras
+    # copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor:
         """Constructs the noise schedule of Karras et al. (2022)."""
 