diffusers 0.28.2__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (118)
  1. diffusers/__init__.py +9 -1
  2. diffusers/commands/env.py +1 -5
  3. diffusers/dependency_versions_table.py +1 -1
  4. diffusers/image_processor.py +2 -1
  5. diffusers/loaders/__init__.py +2 -2
  6. diffusers/loaders/lora.py +406 -140
  7. diffusers/loaders/lora_conversion_utils.py +7 -1
  8. diffusers/loaders/single_file.py +1 -1
  9. diffusers/loaders/single_file_model.py +5 -0
  10. diffusers/loaders/single_file_utils.py +242 -2
  11. diffusers/loaders/unet.py +307 -272
  12. diffusers/models/__init__.py +5 -3
  13. diffusers/models/attention.py +125 -1
  14. diffusers/models/attention_processor.py +169 -1
  15. diffusers/models/autoencoders/__init__.py +1 -0
  16. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  17. diffusers/models/autoencoders/autoencoder_kl.py +17 -6
  18. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -2
  19. diffusers/models/autoencoders/consistency_decoder_vae.py +9 -9
  20. diffusers/models/autoencoders/vq_model.py +182 -0
  21. diffusers/models/controlnet_xs.py +6 -6
  22. diffusers/models/embeddings.py +112 -84
  23. diffusers/models/model_loading_utils.py +55 -0
  24. diffusers/models/modeling_utils.py +128 -17
  25. diffusers/models/normalization.py +11 -6
  26. diffusers/models/transformers/__init__.py +1 -0
  27. diffusers/models/transformers/dual_transformer_2d.py +5 -4
  28. diffusers/models/transformers/hunyuan_transformer_2d.py +149 -2
  29. diffusers/models/transformers/prior_transformer.py +5 -5
  30. diffusers/models/transformers/transformer_2d.py +2 -2
  31. diffusers/models/transformers/transformer_sd3.py +344 -0
  32. diffusers/models/transformers/transformer_temporal.py +12 -10
  33. diffusers/models/unets/unet_1d.py +3 -3
  34. diffusers/models/unets/unet_2d.py +3 -3
  35. diffusers/models/unets/unet_2d_condition.py +4 -15
  36. diffusers/models/unets/unet_3d_condition.py +5 -17
  37. diffusers/models/unets/unet_i2vgen_xl.py +4 -4
  38. diffusers/models/unets/unet_motion_model.py +4 -4
  39. diffusers/models/unets/unet_spatio_temporal_condition.py +3 -3
  40. diffusers/models/vq_model.py +8 -165
  41. diffusers/pipelines/__init__.py +2 -0
  42. diffusers/pipelines/animatediff/pipeline_animatediff.py +4 -3
  43. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +4 -3
  44. diffusers/pipelines/controlnet/pipeline_controlnet.py +4 -3
  45. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +4 -3
  46. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +4 -3
  47. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +4 -3
  48. diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
  49. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +4 -3
  50. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +4 -3
  51. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +4 -3
  52. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +4 -3
  53. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +4 -3
  54. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +24 -5
  55. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +4 -3
  56. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +4 -3
  57. diffusers/pipelines/marigold/marigold_image_processing.py +35 -20
  58. diffusers/pipelines/pia/pipeline_pia.py +4 -3
  59. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
  60. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
  61. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +17 -17
  62. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -3
  63. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  64. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -3
  65. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -3
  66. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +4 -3
  67. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +4 -3
  68. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -6
  69. diffusers/pipelines/stable_diffusion_3/__init__.py +52 -0
  70. diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
  71. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +886 -0
  72. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +923 -0
  73. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +4 -3
  74. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +10 -11
  75. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +4 -3
  76. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +4 -3
  77. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +4 -3
  78. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +4 -3
  79. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +4 -3
  80. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +4 -3
  81. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +4 -3
  82. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +4 -3
  83. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +4 -3
  84. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -3
  85. diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
  86. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +4 -3
  87. diffusers/schedulers/__init__.py +2 -0
  88. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
  89. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -3
  90. diffusers/schedulers/scheduling_edm_euler.py +2 -4
  91. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +287 -0
  92. diffusers/schedulers/scheduling_lms_discrete.py +2 -2
  93. diffusers/training_utils.py +4 -4
  94. diffusers/utils/__init__.py +3 -0
  95. diffusers/utils/constants.py +2 -0
  96. diffusers/utils/dummy_pt_objects.py +30 -0
  97. diffusers/utils/dummy_torch_and_transformers_objects.py +30 -0
  98. diffusers/utils/dynamic_modules_utils.py +15 -13
  99. diffusers/utils/hub_utils.py +106 -0
  100. diffusers/utils/import_utils.py +0 -1
  101. diffusers/utils/logging.py +3 -1
  102. diffusers/utils/state_dict_utils.py +2 -0
  103. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/METADATA +45 -45
  104. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/RECORD +108 -111
  105. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/WHEEL +1 -1
  106. diffusers/models/dual_transformer_2d.py +0 -20
  107. diffusers/models/prior_transformer.py +0 -12
  108. diffusers/models/t5_film_transformer.py +0 -70
  109. diffusers/models/transformer_2d.py +0 -25
  110. diffusers/models/transformer_temporal.py +0 -34
  111. diffusers/models/unet_1d.py +0 -26
  112. diffusers/models/unet_1d_blocks.py +0 -203
  113. diffusers/models/unet_2d.py +0 -27
  114. diffusers/models/unet_2d_blocks.py +0 -375
  115. diffusers/models/unet_2d_condition.py +0 -25
  116. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/LICENSE +0 -0
  117. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/entry_points.txt +0 -0
  118. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py
@@ -455,9 +455,10 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
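The same change recurs throughout the pipeline hunks below: unscaling the LoRA layers is now guarded by a check that the pipeline actually has a text encoder, which would otherwise fail on pipelines constructed with text_encoder=None. A minimal standalone sketch of the pattern (the helper name _maybe_unscale_lora is hypothetical; the guarded calls mirror the diff above):

    from diffusers.loaders import LoraLoaderMixin
    from diffusers.utils import USE_PEFT_BACKEND, unscale_lora_layers

    def _maybe_unscale_lora(pipe, lora_scale):
        # Skip entirely when the pipeline was built without a text encoder.
        if pipe.text_encoder is not None:
            if isinstance(pipe, LoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(pipe.text_encoder, lora_scale)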
diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py
@@ -85,10 +85,9 @@ EXAMPLE_DOC_STRING = """
 
         >>> init_image = download_image(img_url).resize((768, 768))
 
-        >>> pipe = StableDiffusionDiffEditPipeline.from_pretrained(
+        >>> pipeline = StableDiffusionDiffEditPipeline.from_pretrained(
         ...     "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
         ... )
-        >>> pipe = pipe.to("cuda")
 
         >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
         >>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
@@ -97,9 +96,9 @@ EXAMPLE_DOC_STRING = """
         >>> mask_prompt = "A bowl of fruits"
         >>> prompt = "A bowl of pears"
 
-        >>> mask_image = pipe.generate_mask(image=init_image, source_prompt=prompt, target_prompt=mask_prompt)
-        >>> image_latents = pipe.invert(image=init_image, prompt=mask_prompt).latents
-        >>> image = pipe(prompt=prompt, mask_image=mask_image, image_latents=image_latents).images[0]
+        >>> mask_image = pipeline.generate_mask(image=init_image, source_prompt=prompt, target_prompt=mask_prompt)
+        >>> image_latents = pipeline.invert(image=init_image, prompt=mask_prompt).latents
+        >>> image = pipeline(prompt=prompt, mask_image=mask_image, image_latents=image_latents).images[0]
         ```
         """
 
@@ -122,10 +121,9 @@ EXAMPLE_INVERT_DOC_STRING = """
 
         >>> init_image = download_image(img_url).resize((768, 768))
 
-        >>> pipe = StableDiffusionDiffEditPipeline.from_pretrained(
+        >>> pipeline = StableDiffusionDiffEditPipeline.from_pretrained(
         ...     "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
         ... )
-        >>> pipe = pipe.to("cuda")
 
         >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
         >>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
@@ -133,7 +131,7 @@ EXAMPLE_INVERT_DOC_STRING = """
 
         >>> prompt = "A bowl of fruits"
 
-        >>> inverted_latents = pipe.invert(image=init_image, prompt=prompt).latents
+        >>> inverted_latents = pipeline.invert(image=init_image, prompt=prompt).latents
         ```
         """
 
@@ -582,9 +580,10 @@ class StableDiffusionDiffEditPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
@@ -381,9 +381,10 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
@@ -406,9 +406,10 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
@@ -355,9 +355,10 @@ class StableDiffusionKDiffusionPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py
@@ -455,9 +455,10 @@ class StableDiffusionLDM3DPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py
@@ -427,9 +427,10 @@ class StableDiffusionPanoramaPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py
@@ -370,9 +370,10 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
@@ -472,9 +472,10 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
@@ -315,9 +315,10 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
@@ -350,9 +350,10 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
@@ -963,9 +963,10 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/unidiffuser/modeling_uvit.py
@@ -9,8 +9,8 @@ from ...models import ModelMixin
 from ...models.attention import FeedForward
 from ...models.attention_processor import Attention
 from ...models.embeddings import TimestepEmbedding, Timesteps, get_2d_sincos_pos_embed
+from ...models.modeling_outputs import Transformer2DModelOutput
 from ...models.normalization import AdaLayerNorm
-from ...models.transformers.transformer_2d import Transformer2DModelOutput
 from ...utils import logging
 
 
diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
@@ -554,9 +554,10 @@ class UniDiffuserPipeline(DiffusionPipeline):
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/schedulers/__init__.py
@@ -56,6 +56,7 @@ else:
     _import_structure["scheduling_edm_euler"] = ["EDMEulerScheduler"]
     _import_structure["scheduling_euler_ancestral_discrete"] = ["EulerAncestralDiscreteScheduler"]
     _import_structure["scheduling_euler_discrete"] = ["EulerDiscreteScheduler"]
+    _import_structure["scheduling_flow_match_euler_discrete"] = ["FlowMatchEulerDiscreteScheduler"]
     _import_structure["scheduling_heun_discrete"] = ["HeunDiscreteScheduler"]
     _import_structure["scheduling_ipndm"] = ["IPNDMScheduler"]
     _import_structure["scheduling_k_dpm_2_ancestral_discrete"] = ["KDPM2AncestralDiscreteScheduler"]
@@ -151,6 +152,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     from .scheduling_edm_euler import EDMEulerScheduler
     from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
     from .scheduling_euler_discrete import EulerDiscreteScheduler
+    from .scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
     from .scheduling_heun_discrete import HeunDiscreteScheduler
     from .scheduling_ipndm import IPNDMScheduler
     from .scheduling_k_dpm_2_ancestral_discrete import KDPM2AncestralDiscreteScheduler
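With both registrations in place, the new scheduler is importable from the package root. A quick smoke test, assuming diffusers 0.29.0 is installed:

    from diffusers import FlowMatchEulerDiscreteScheduler

    scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=1.0)
    scheduler.set_timesteps(num_inference_steps=28)
    print(scheduler.timesteps.shape)  # torch.Size([28])
    print(scheduler.sigmas.shape)     # torch.Size([29]); a trailing zero sigma is appended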
diffusers/schedulers/scheduling_dpmsolver_sde.py
@@ -370,7 +370,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
         timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sig_proposed])
         return timesteps
 
-    # copied from diffusers.schedulers.scheduling_euler_discrete._sigma_to_t
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -394,7 +394,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
         t = t.reshape(sigma.shape)
         return t
 
-    # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras
+    # copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor:
         """Constructs the noise schedule of Karras et al. (2022)."""
 
diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
@@ -243,13 +243,13 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
 
         self.num_inference_steps = num_inference_steps
 
-        ramp = np.linspace(0, 1, self.num_inference_steps)
+        ramp = torch.linspace(0, 1, self.num_inference_steps)
         if self.config.sigma_schedule == "karras":
             sigmas = self._compute_karras_sigmas(ramp)
         elif self.config.sigma_schedule == "exponential":
             sigmas = self._compute_exponential_sigmas(ramp)
 
-        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
+        sigmas = sigmas.to(dtype=torch.float32, device=device)
         self.timesteps = self.precondition_noise(sigmas)
 
         if self.config.final_sigmas_type == "sigma_min":
@@ -283,7 +283,6 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
         min_inv_rho = sigma_min ** (1 / rho)
         max_inv_rho = sigma_max ** (1 / rho)
         sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
-
         return sigmas
 
     # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_exponential_sigmas
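For reference, this helper implements the rho-spaced noise schedule of Karras et al. (2022), eq. (5). A self-contained sketch, with sigma_min, sigma_max, and rho assumed to take the usual EDM defaults (they are config values in the actual scheduler):

    import torch

    def karras_sigmas(n: int, sigma_min: float = 0.002, sigma_max: float = 80.0, rho: float = 7.0) -> torch.Tensor:
        # Interpolate between sigma_max and sigma_min in 1/rho-power space,
        # mirroring the ramp-based helper above, now entirely in torch.
        ramp = torch.linspace(0, 1, n)
        min_inv_rho = sigma_min ** (1 / rho)
        max_inv_rho = sigma_max ** (1 / rho)
        return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho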
diffusers/schedulers/scheduling_edm_euler.py
@@ -16,7 +16,6 @@ import math
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 
-import numpy as np
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
@@ -210,13 +209,13 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
         """
         self.num_inference_steps = num_inference_steps
 
-        ramp = np.linspace(0, 1, self.num_inference_steps)
+        ramp = torch.linspace(0, 1, self.num_inference_steps)
         if self.config.sigma_schedule == "karras":
             sigmas = self._compute_karras_sigmas(ramp)
         elif self.config.sigma_schedule == "exponential":
             sigmas = self._compute_exponential_sigmas(ramp)
 
-        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
+        sigmas = sigmas.to(dtype=torch.float32, device=device)
         self.timesteps = self.precondition_noise(sigmas)
 
         self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
@@ -234,7 +233,6 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
         min_inv_rho = sigma_min ** (1 / rho)
         max_inv_rho = sigma_max ** (1 / rho)
         sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
-
         return sigmas
 
     def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor:
diffusers/schedulers/scheduling_flow_match_euler_discrete.py (new file)
@@ -0,0 +1,287 @@
+# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+
+    prev_sample: torch.FloatTensor
+
+
+class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Euler scheduler.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        shift (`float`, defaults to 1.0):
+            The shift value for the timestep schedule.
+    """
+
+    _compatibles = []
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        shift: float = 1.0,
+    ):
+        timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)
+
+        sigmas = timesteps / num_train_timesteps
+        sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
+
+        self.timesteps = sigmas * num_train_timesteps
+
+        self._step_index = None
+        self._begin_index = None
+
+        self.sigmas = sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+        self.sigma_min = self.sigmas[-1].item()
+        self.sigma_max = self.sigmas[0].item()
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def scale_noise(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        noise: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        """
+        Foward process in flow-matching
+
+        Args:
+            sample (`torch.FloatTensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+
+        Returns:
+            `torch.FloatTensor`:
+                A scaled input sample.
+        """
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        sigma = self.sigmas[self.step_index]
+        sample = sigma * noise + (1.0 - sigma) * sample
+
+        return sample
+
+    def _sigma_to_t(self, sigma):
+        return sigma * self.config.num_train_timesteps
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+
+        timesteps = np.linspace(
+            self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps
+        )
+
+        sigmas = timesteps / self.config.num_train_timesteps
+        sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas)
+        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
+
+        timesteps = sigmas * self.config.num_train_timesteps
+        self.timesteps = timesteps.to(device=device)
+        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
+
+        self._step_index = None
+        self._begin_index = None
+
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        sample: torch.FloatTensor,
+        s_churn: float = 0.0,
+        s_tmin: float = 0.0,
+        s_tmax: float = float("inf"),
+        s_noise: float = 1.0,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                A current instance of a sample created by the diffusion process.
+            s_churn (`float`):
+            s_tmin (`float`):
+            s_tmax (`float`):
+            s_noise (`float`, defaults to 1.0):
+                Scaling factor for noise added to the sample.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
+                tuple.
+
+        Returns:
+            [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
+        """
+
+        if (
+            isinstance(timestep, int)
+            or isinstance(timestep, torch.IntTensor)
+            or isinstance(timestep, torch.LongTensor)
+        ):
+            raise ValueError(
+                (
+                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                    " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+                    " one of the `scheduler.timesteps` as a timestep."
+                ),
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Upcast to avoid precision issues when computing prev_sample
+        sample = sample.to(torch.float32)
+
+        sigma = self.sigmas[self.step_index]
+
+        gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
+
+        noise = randn_tensor(
+            model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator
+        )
+
+        eps = noise * s_noise
+        sigma_hat = sigma * (gamma + 1)
+
+        if gamma > 0:
+            sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        # NOTE: "original_sample" should not be an expected prediction_type but is left in for
+        # backwards compatibility
+
+        # if self.config.prediction_type == "vector_field":
+
+        denoised = sample - model_output * sigma
+        # 2. Convert to an ODE derivative
+        derivative = (sample - denoised) / sigma_hat
+
+        dt = self.sigmas[self.step_index + 1] - sigma_hat
+
+        prev_sample = sample + derivative * dt
+        # Cast sample back to model compatible dtype
+        prev_sample = prev_sample.to(model_output.dtype)
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
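A minimal denoising loop with the new scheduler might look as follows; `model` stands in for any network trained to predict the flow-matching vector field, and the latent shape and shift value are illustrative, not prescribed by the scheduler:

    import torch
    from diffusers import FlowMatchEulerDiscreteScheduler

    scheduler = FlowMatchEulerDiscreteScheduler(shift=3.0)  # shift > 1 concentrates steps at higher noise levels
    scheduler.set_timesteps(num_inference_steps=28)

    sample = torch.randn(1, 16, 64, 64)  # hypothetical latent shape
    for t in scheduler.timesteps:
        model_output = model(sample, t)  # assumed vector-field predictor
        sample = scheduler.step(model_output, t, sample).prev_sample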
diffusers/schedulers/scheduling_lms_discrete.py
@@ -324,7 +324,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
         else:
             self._step_index = self._begin_index
 
-    # copied from diffusers.schedulers.scheduling_euler_discrete._sigma_to_t
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -348,7 +348,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
         t = t.reshape(sigma.shape)
         return t
 
-    # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras
+    # copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor:
         """Constructs the noise schedule of Karras et al. (2022)."""
 