diffusers 0.19.3__py3-none-any.whl → 0.20.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (114) hide show
  1. diffusers/__init__.py +3 -1
  2. diffusers/commands/fp16_safetensors.py +2 -7
  3. diffusers/configuration_utils.py +23 -1
  4. diffusers/dependency_versions_table.py +1 -1
  5. diffusers/loaders.py +62 -64
  6. diffusers/models/__init__.py +1 -0
  7. diffusers/models/activations.py +2 -0
  8. diffusers/models/attention.py +45 -1
  9. diffusers/models/autoencoder_tiny.py +193 -0
  10. diffusers/models/controlnet.py +1 -1
  11. diffusers/models/embeddings.py +56 -0
  12. diffusers/models/lora.py +0 -6
  13. diffusers/models/modeling_flax_utils.py +28 -2
  14. diffusers/models/modeling_utils.py +33 -16
  15. diffusers/models/transformer_2d.py +26 -9
  16. diffusers/models/unet_1d.py +2 -2
  17. diffusers/models/unet_2d_blocks.py +106 -56
  18. diffusers/models/unet_2d_condition.py +20 -5
  19. diffusers/models/vae.py +106 -1
  20. diffusers/pipelines/__init__.py +1 -0
  21. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +10 -3
  22. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +10 -3
  23. diffusers/pipelines/audioldm/pipeline_audioldm.py +1 -1
  24. diffusers/pipelines/auto_pipeline.py +33 -43
  25. diffusers/pipelines/controlnet/multicontrolnet.py +4 -2
  26. diffusers/pipelines/controlnet/pipeline_controlnet.py +20 -4
  27. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +15 -7
  28. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +14 -4
  29. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +157 -10
  30. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -10
  31. diffusers/pipelines/deepfloyd_if/pipeline_if.py +1 -1
  32. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +1 -1
  33. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +1 -1
  34. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +1 -1
  35. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +1 -1
  36. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +1 -1
  37. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +43 -2
  38. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +44 -2
  39. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +1 -1
  40. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +1 -1
  41. diffusers/pipelines/pipeline_flax_utils.py +41 -4
  42. diffusers/pipelines/pipeline_utils.py +60 -16
  43. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +2 -2
  44. diffusers/pipelines/stable_diffusion/__init__.py +1 -0
  45. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +81 -37
  46. diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +10 -3
  47. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +10 -3
  48. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +10 -3
  49. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +10 -3
  50. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +12 -5
  51. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py +832 -0
  52. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +10 -3
  53. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +10 -3
  54. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +10 -3
  55. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +9 -2
  56. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +17 -8
  57. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +10 -3
  58. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +10 -3
  59. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +10 -3
  60. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +10 -3
  61. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +10 -3
  62. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +10 -3
  63. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +10 -3
  64. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +10 -3
  65. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +3 -5
  66. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +75 -3
  67. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +76 -6
  68. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +1 -2
  69. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +10 -3
  70. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +10 -3
  71. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +11 -4
  72. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +1 -1
  73. diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +131 -28
  74. diffusers/schedulers/scheduling_consistency_models.py +70 -57
  75. diffusers/schedulers/scheduling_ddim.py +76 -71
  76. diffusers/schedulers/scheduling_ddim_inverse.py +76 -44
  77. diffusers/schedulers/scheduling_ddim_parallel.py +11 -8
  78. diffusers/schedulers/scheduling_ddpm.py +68 -67
  79. diffusers/schedulers/scheduling_ddpm_parallel.py +18 -15
  80. diffusers/schedulers/scheduling_deis_multistep.py +93 -85
  81. diffusers/schedulers/scheduling_dpmsolver_multistep.py +118 -120
  82. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +116 -109
  83. diffusers/schedulers/scheduling_dpmsolver_sde.py +57 -43
  84. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +122 -121
  85. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +54 -44
  86. diffusers/schedulers/scheduling_euler_discrete.py +63 -56
  87. diffusers/schedulers/scheduling_heun_discrete.py +57 -45
  88. diffusers/schedulers/scheduling_ipndm.py +27 -22
  89. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +54 -41
  90. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +52 -41
  91. diffusers/schedulers/scheduling_karras_ve.py +55 -45
  92. diffusers/schedulers/scheduling_lms_discrete.py +58 -52
  93. diffusers/schedulers/scheduling_pndm.py +77 -62
  94. diffusers/schedulers/scheduling_repaint.py +56 -38
  95. diffusers/schedulers/scheduling_sde_ve.py +62 -50
  96. diffusers/schedulers/scheduling_sde_vp.py +32 -11
  97. diffusers/schedulers/scheduling_unclip.py +3 -3
  98. diffusers/schedulers/scheduling_unipc_multistep.py +131 -91
  99. diffusers/schedulers/scheduling_utils.py +41 -35
  100. diffusers/schedulers/scheduling_utils_flax.py +8 -2
  101. diffusers/schedulers/scheduling_vq_diffusion.py +39 -68
  102. diffusers/utils/__init__.py +2 -2
  103. diffusers/utils/dummy_pt_objects.py +15 -0
  104. diffusers/utils/dummy_torch_and_transformers_objects.py +15 -0
  105. diffusers/utils/hub_utils.py +105 -2
  106. diffusers/utils/import_utils.py +0 -4
  107. diffusers/utils/pil_utils.py +19 -0
  108. {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/METADATA +5 -7
  109. {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/RECORD +113 -112
  110. {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/WHEEL +1 -1
  111. {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/entry_points.txt +0 -1
  112. diffusers/models/cross_attention.py +0 -94
  113. {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/LICENSE +0 -0
  114. {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/top_level.txt +0 -0
@@ -29,11 +29,11 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
29
29
  @dataclass
30
30
  class CMStochasticIterativeSchedulerOutput(BaseOutput):
31
31
  """
32
- Output class for the scheduler's step function output.
32
+ Output class for the scheduler's `step` function.
33
33
 
34
34
  Args:
35
35
  prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
36
- Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
36
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
37
37
  denoising loop.
38
38
  """
39
39
 
@@ -42,38 +42,32 @@ class CMStochasticIterativeSchedulerOutput(BaseOutput):
42
42
 
43
43
  class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
44
44
  """
45
- Multistep and onestep sampling for consistency models from Song et al. 2023 [1]. This implements Algorithm 1 in the
46
- paper [1].
45
+ Multistep and onestep sampling for consistency models.
47
46
 
48
- [1] Song, Yang and Dhariwal, Prafulla and Chen, Mark and Sutskever, Ilya. "Consistency Models"
49
- https://arxiv.org/pdf/2303.01469 [2] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based
50
- Generative Models." https://arxiv.org/abs/2206.00364
51
-
52
- [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
53
- function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
54
- [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
55
- [`~SchedulerMixin.from_pretrained`] functions.
47
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
48
+ methods the library implements for all schedulers such as loading and saving.
56
49
 
57
50
  Args:
58
- num_train_timesteps (`int`): number of diffusion steps used to train the model.
59
- sigma_min (`float`):
60
- Minimum noise magnitude in the sigma schedule. This was set to 0.002 in the original implementation.
61
- sigma_max (`float`):
62
- Maximum noise magnitude in the sigma schedule. This was set to 80.0 in the original implementation.
63
- sigma_data (`float`):
64
- The standard deviation of the data distribution, following the EDM paper [2]. This was set to 0.5 in the
65
- original implementation, which is also the original value suggested in the EDM paper.
66
- s_noise (`float`):
51
+ num_train_timesteps (`int`, defaults to 40):
52
+ The number of diffusion steps to train the model.
53
+ sigma_min (`float`, defaults to 0.002):
54
+ Minimum noise magnitude in the sigma schedule. Defaults to 0.002 from the original implementation.
55
+ sigma_max (`float`, defaults to 80.0):
56
+ Maximum noise magnitude in the sigma schedule. Defaults to 80.0 from the original implementation.
57
+ sigma_data (`float`, defaults to 0.5):
58
+ The standard deviation of the data distribution from the EDM
59
+ [paper](https://huggingface.co/papers/2206.00364). Defaults to 0.5 from the original implementation.
60
+ s_noise (`float`, defaults to 1.0):
67
61
  The amount of additional noise to counteract loss of detail during sampling. A reasonable range is [1.000,
68
- 1.011]. This was set to 1.0 in the original implementation.
69
- rho (`float`):
70
- The rho parameter used for calculating the Karras sigma schedule, introduced in the EDM paper [2]. This was
71
- set to 7.0 in the original implementation, which is also the original value suggested in the EDM paper.
72
- clip_denoised (`bool`):
73
- Whether to clip the denoised outputs to `(-1, 1)`. Defaults to `True`.
62
+ 1.011]. Defaults to 1.0 from the original implementation.
63
+ rho (`float`, defaults to 7.0):
64
+ The parameter for calculating the Karras sigma schedule from the EDM
65
+ [paper](https://huggingface.co/papers/2206.00364). Defaults to 7.0 from the original implementation.
66
+ clip_denoised (`bool`, defaults to `True`):
67
+ Whether to clip the denoised outputs to `(-1, 1)`.
74
68
  timesteps (`List` or `np.ndarray` or `torch.Tensor`, *optional*):
75
- Optionally, an explicit timestep schedule can be specified. The timesteps are expected to be in increasing
76
- order.
69
+ An explicit timestep schedule that can be optionally specified. The timesteps are expected to be in
70
+ increasing order.
77
71
  """
78
72
 
79
73
  order = 1
@@ -114,13 +108,17 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
114
108
  self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor]
115
109
  ) -> torch.FloatTensor:
116
110
  """
117
- Scales the consistency model input by `(sigma**2 + sigma_data**2) ** 0.5`, following the EDM model.
111
+ Scales the consistency model input by `(sigma**2 + sigma_data**2) ** 0.5`.
118
112
 
119
113
  Args:
120
- sample (`torch.FloatTensor`): input sample
121
- timestep (`float` or `torch.FloatTensor`): the current timestep in the diffusion chain
114
+ sample (`torch.FloatTensor`):
115
+ The input sample.
116
+ timestep (`float` or `torch.FloatTensor`):
117
+ The current timestep in the diffusion chain.
118
+
122
119
  Returns:
123
- `torch.FloatTensor`: scaled input sample
120
+ `torch.FloatTensor`:
121
+ A scaled input sample.
124
122
  """
125
123
  # Get sigma corresponding to timestep
126
124
  if isinstance(timestep, torch.Tensor):
@@ -135,12 +133,15 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
135
133
 
136
134
  def sigma_to_t(self, sigmas: Union[float, np.ndarray]):
137
135
  """
138
- Gets scaled timesteps from the Karras sigmas, for input to the consistency model.
136
+ Gets scaled timesteps from the Karras sigmas for input to the consistency model.
139
137
 
140
138
  Args:
141
- sigmas (`float` or `np.ndarray`): single Karras sigma or array of Karras sigmas
139
+ sigmas (`float` or `np.ndarray`):
140
+ A single Karras sigma or an array of Karras sigmas.
141
+
142
142
  Returns:
143
- `float` or `np.ndarray`: scaled input timestep or scaled input timestep array
143
+ `float` or `np.ndarray`:
144
+ A scaled input timestep or scaled input timestep array.
144
145
  """
145
146
  if not isinstance(sigmas, np.ndarray):
146
147
  sigmas = np.array(sigmas, dtype=np.float64)
@@ -156,17 +157,17 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
156
157
  timesteps: Optional[List[int]] = None,
157
158
  ):
158
159
  """
159
- Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
160
+ Sets the timesteps used for the diffusion chain (to be run before inference).
160
161
 
161
162
  Args:
162
163
  num_inference_steps (`int`):
163
- the number of diffusion steps used when generating samples with a pre-trained model.
164
- device (`str` or `torch.device`, optional):
165
- the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
166
- timesteps (`List[int]`, optional):
167
- custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
168
- timestep spacing strategy of equal spacing between timesteps is used. If passed, `num_inference_steps`
169
- must be `None`.
164
+ The number of diffusion steps used when generating samples with a pre-trained model.
165
+ device (`str` or `torch.device`, *optional*):
166
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
167
+ timesteps (`List[int]`, *optional*):
168
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
169
+ timestep spacing strategy of equal spacing between timesteps is used. If `timesteps` is passed,
170
+ `num_inference_steps` must be `None`.
170
171
  """
171
172
  if num_inference_steps is None and timesteps is None:
172
173
  raise ValueError("Exactly one of `num_inference_steps` or `timesteps` must be supplied.")
@@ -241,17 +242,22 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
241
242
 
242
243
  def get_scalings_for_boundary_condition(self, sigma):
243
244
  """
244
- Gets the scalings used in the consistency model parameterization, following Appendix C of the original paper.
245
- This enforces the consistency model boundary condition.
245
+ Gets the scalings used in the consistency model parameterization (from Appendix C of the
246
+ [paper](https://huggingface.co/papers/2303.01469)) to enforce the boundary condition.
246
247
 
247
- Note that `epsilon` in the equations for c_skip and c_out is set to sigma_min.
248
+ <Tip>
249
+
250
+ `epsilon` in the equations for `c_skip` and `c_out` is set to `sigma_min`.
251
+
252
+ </Tip>
248
253
 
249
254
  Args:
250
255
  sigma (`torch.FloatTensor`):
251
256
  The current sigma in the Karras sigma schedule.
257
+
252
258
  Returns:
253
259
  `tuple`:
254
- A two-element tuple where c_skip (which weights the current sample) is the first element and c_out
260
+ A two-element tuple where `c_skip` (which weights the current sample) is the first element and `c_out`
255
261
  (which weights the consistency model output) is the second element.
256
262
  """
257
263
  sigma_min = self.config.sigma_min
@@ -270,20 +276,27 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
270
276
  return_dict: bool = True,
271
277
  ) -> Union[CMStochasticIterativeSchedulerOutput, Tuple]:
272
278
  """
273
- Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
279
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
274
280
  process from the learned model outputs (most often the predicted noise).
275
281
 
276
282
  Args:
277
- model_output (`torch.FloatTensor`): direct output from learned diffusion model.
278
- timestep (`float`): current timestep in the diffusion chain.
283
+ model_output (`torch.FloatTensor`):
284
+ The direct output from the learned diffusion model.
285
+ timestep (`float`):
286
+ The current timestep in the diffusion chain.
279
287
  sample (`torch.FloatTensor`):
280
- current instance of sample being created by diffusion process.
281
- generator (`torch.Generator`, *optional*): Random number generator.
282
- return_dict (`bool`): option for returning tuple rather than EulerDiscreteSchedulerOutput class
288
+ A current instance of a sample created by the diffusion process.
289
+ generator (`torch.Generator`, *optional*):
290
+ A random number generator.
291
+ return_dict (`bool`, *optional*, defaults to `True`):
292
+ Whether or not to return a
293
+ [`~schedulers.scheduling_consistency_models.CMStochasticIterativeSchedulerOutput`] or `tuple`.
294
+
283
295
  Returns:
284
- [`~schedulers.scheduling_utils.CMStochasticIterativeSchedulerOutput`] or `tuple`:
285
- [`~schedulers.scheduling_utils.CMStochasticIterativeSchedulerOutput`] if `return_dict` is True, otherwise a
286
- `tuple`. When returning a tuple, the first element is the sample tensor.
296
+ [`~schedulers.scheduling_consistency_models.CMStochasticIterativeSchedulerOutput`] or `tuple`:
297
+ If `return_dict` is `True`,
298
+ [`~schedulers.scheduling_consistency_models.CMStochasticIterativeSchedulerOutput`] is returned,
299
+ otherwise a tuple is returned where the first element is the sample tensor.
287
300
  """
288
301
 
289
302
  if (
@@ -31,14 +31,14 @@ from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
31
31
  # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
32
32
  class DDIMSchedulerOutput(BaseOutput):
33
33
  """
34
- Output class for the scheduler's step function output.
34
+ Output class for the scheduler's `step` function.
35
35
 
36
36
  Args:
37
37
  prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
38
- Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
38
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
39
39
  denoising loop.
40
40
  pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
41
- The predicted denoised sample (x_{0}) based on the model output from the current timestep.
41
+ The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
42
42
  `pred_original_sample` can be used to preview progress or for guidance.
43
43
  """
44
44
 
@@ -129,57 +129,53 @@ def rescale_zero_terminal_snr(betas):
129
129
 
130
130
  class DDIMScheduler(SchedulerMixin, ConfigMixin):
131
131
  """
132
- Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
133
- diffusion probabilistic models (DDPMs) with non-Markovian guidance.
132
+ `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
133
+ non-Markovian guidance.
134
134
 
135
- [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
136
- function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
137
- [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
138
- [`~SchedulerMixin.from_pretrained`] functions.
139
-
140
- For more details, see the original paper: https://arxiv.org/abs/2010.02502
135
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
136
+ methods the library implements for all schedulers such as loading and saving.
141
137
 
142
138
  Args:
143
- num_train_timesteps (`int`): number of diffusion steps used to train the model.
144
- beta_start (`float`): the starting `beta` value of inference.
145
- beta_end (`float`): the final `beta` value.
146
- beta_schedule (`str`):
147
- the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
139
+ num_train_timesteps (`int`, defaults to 1000):
140
+ The number of diffusion steps to train the model.
141
+ beta_start (`float`, defaults to 0.0001):
142
+ The starting `beta` value of inference.
143
+ beta_end (`float`, defaults to 0.02):
144
+ The final `beta` value.
145
+ beta_schedule (`str`, defaults to `"linear"`):
146
+ The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
148
147
  `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
149
- trained_betas (`np.ndarray`, optional):
150
- option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
151
- clip_sample (`bool`, default `True`):
152
- option to clip predicted sample for numerical stability.
153
- clip_sample_range (`float`, default `1.0`):
154
- the maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
155
- set_alpha_to_one (`bool`, default `True`):
156
- each diffusion step uses the value of alphas product at that step and at the previous one. For the final
157
- step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
158
- otherwise it uses the value of alpha at step 0.
159
- steps_offset (`int`, default `0`):
160
- an offset added to the inference steps. You can use a combination of `offset=1` and
161
- `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
162
- stable diffusion.
163
- prediction_type (`str`, default `epsilon`, optional):
164
- prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
165
- process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
166
- https://imagen.research.google/video/paper.pdf)
167
- thresholding (`bool`, default `False`):
168
- whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
169
- Note that the thresholding method is unsuitable for latent-space diffusion models (such as
170
- stable-diffusion).
171
- dynamic_thresholding_ratio (`float`, default `0.995`):
172
- the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen
173
- (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`.
174
- sample_max_value (`float`, default `1.0`):
175
- the threshold value for dynamic thresholding. Valid only when `thresholding=True`.
176
- timestep_spacing (`str`, default `"leading"`):
177
- The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample
178
- Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information.
179
- rescale_betas_zero_snr (`bool`, default `False`):
180
- whether to rescale the betas to have zero terminal SNR (proposed by https://arxiv.org/pdf/2305.08891.pdf).
181
- This can enable the model to generate very bright and dark samples instead of limiting it to samples with
182
- medium brightness. Loosely related to
148
+ trained_betas (`np.ndarray`, *optional*):
149
+ Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
150
+ clip_sample (`bool`, defaults to `True`):
151
+ Clip the predicted sample for numerical stability.
152
+ clip_sample_range (`float`, defaults to 1.0):
153
+ The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
154
+ set_alpha_to_one (`bool`, defaults to `True`):
155
+ Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
156
+ there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
157
+ otherwise it uses the alpha value at step 0.
158
+ steps_offset (`int`, defaults to 0):
159
+ An offset added to the inference steps. You can use a combination of `offset=1` and
160
+ `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
161
+ Diffusion.
162
+ prediction_type (`str`, defaults to `epsilon`, *optional*):
163
+ Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
164
+ `sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
165
+ Video](https://imagen.research.google/video/paper.pdf) paper).
166
+ thresholding (`bool`, defaults to `False`):
167
+ Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
168
+ as Stable Diffusion.
169
+ dynamic_thresholding_ratio (`float`, defaults to 0.995):
170
+ The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
171
+ sample_max_value (`float`, defaults to 1.0):
172
+ The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
173
+ timestep_spacing (`str`, defaults to `"leading"`):
174
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
175
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
176
+ rescale_betas_zero_snr (`bool`, defaults to `False`):
177
+ Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
178
+ dark samples instead of limiting it to samples with medium brightness. Loosely related to
183
179
  [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
184
180
  """
185
181
 
@@ -246,11 +242,14 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
246
242
  current timestep.
247
243
 
248
244
  Args:
249
- sample (`torch.FloatTensor`): input sample
250
- timestep (`int`, optional): current timestep
245
+ sample (`torch.FloatTensor`):
246
+ The input sample.
247
+ timestep (`int`, *optional*):
248
+ The current timestep in the diffusion chain.
251
249
 
252
250
  Returns:
253
- `torch.FloatTensor`: scaled input sample
251
+ `torch.FloatTensor`:
252
+ A scaled input sample.
254
253
  """
255
254
  return sample
256
255
 
@@ -301,11 +300,11 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
301
300
 
302
301
  def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
303
302
  """
304
- Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
303
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
305
304
 
306
305
  Args:
307
306
  num_inference_steps (`int`):
308
- the number of diffusion steps used when generating samples with a pre-trained model.
307
+ The number of diffusion steps used when generating samples with a pre-trained model.
309
308
  """
310
309
 
311
310
  if num_inference_steps > self.config.num_train_timesteps:
@@ -356,29 +355,35 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
356
355
  return_dict: bool = True,
357
356
  ) -> Union[DDIMSchedulerOutput, Tuple]:
358
357
  """
359
- Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
358
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
360
359
  process from the learned model outputs (most often the predicted noise).
361
360
 
362
361
  Args:
363
- model_output (`torch.FloatTensor`): direct output from learned diffusion model.
364
- timestep (`int`): current discrete timestep in the diffusion chain.
362
+ model_output (`torch.FloatTensor`):
363
+ The direct output from the learned diffusion model.
364
+ timestep (`int`):
365
+ The current discrete timestep in the diffusion chain.
365
366
  sample (`torch.FloatTensor`):
366
- current instance of sample being created by diffusion process.
367
- eta (`float`): weight of noise for added noise in diffusion step.
368
- use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
369
- predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
370
- `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
371
- coincide with the one provided as input and `use_clipped_model_output` will have not effect.
372
- generator: random number generator.
373
- variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we
374
- can directly provide the noise for the variance itself. This is useful for methods such as
375
- CycleDiffusion. (https://arxiv.org/abs/2210.05559)
376
- return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
367
+ A current instance of a sample created by the diffusion process.
368
+ eta (`float`):
369
+ The weight of noise for added noise in diffusion step.
370
+ use_clipped_model_output (`bool`, defaults to `False`):
371
+ If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
372
+ because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
373
+ clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
374
+ `use_clipped_model_output` has no effect.
375
+ generator (`torch.Generator`, *optional*):
376
+ A random number generator.
377
+ variance_noise (`torch.FloatTensor`):
378
+ Alternative to generating noise with `generator` by directly providing the noise for the variance
379
+ itself. Useful for methods such as [`CycleDiffusion`].
380
+ return_dict (`bool`, *optional*, defaults to `True`):
381
+ Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`.
377
382
 
378
383
  Returns:
379
384
  [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`:
380
- [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
381
- returning a tuple, the first element is the sample tensor.
385
+ If `return_dict` is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a
386
+ tuple is returned where the first element is the sample tensor.
382
387
 
383
388
  """
384
389
  if self.num_inference_steps is None:
@@ -30,14 +30,14 @@ from diffusers.utils import BaseOutput, deprecate
30
30
  # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
31
31
  class DDIMSchedulerOutput(BaseOutput):
32
32
  """
33
- Output class for the scheduler's step function output.
33
+ Output class for the scheduler's `step` function output.
34
34
 
35
35
  Args:
36
36
  prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
37
- Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
37
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
38
38
  denoising loop.
39
39
  pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
40
- The predicted denoised sample (x_{0}) based on the model output from the current timestep.
40
+ The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
41
41
  `pred_original_sample` can be used to preview progress or for guidance.
42
42
  """
43
43
 
@@ -129,47 +129,45 @@ def rescale_zero_terminal_snr(betas):
129
129
 
130
130
  class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
131
131
  """
132
- DDIMInverseScheduler is the reverse scheduler of [`DDIMScheduler`].
132
+ `DDIMInverseScheduler` is the reverse scheduler of [`DDIMScheduler`].
133
133
 
134
- [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
135
- function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
136
- [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
137
- [`~SchedulerMixin.from_pretrained`] functions.
138
-
139
- For more details, see the original paper: https://arxiv.org/abs/2010.02502
134
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
135
+ methods the library implements for all schedulers such as loading and saving.
140
136
 
141
137
  Args:
142
- num_train_timesteps (`int`): number of diffusion steps used to train the model.
143
- beta_start (`float`): the starting `beta` value of inference.
144
- beta_end (`float`): the final `beta` value.
145
- beta_schedule (`str`):
146
- the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
138
+ num_train_timesteps (`int`, defaults to 1000):
139
+ The number of diffusion steps to train the model.
140
+ beta_start (`float`, defaults to 0.0001):
141
+ The starting `beta` value of inference.
142
+ beta_end (`float`, defaults to 0.02):
143
+ The final `beta` value.
144
+ beta_schedule (`str`, defaults to `"linear"`):
145
+ The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
147
146
  `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
148
- trained_betas (`np.ndarray`, optional):
149
- option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
150
- clip_sample (`bool`, default `True`):
151
- option to clip predicted sample for numerical stability.
152
- clip_sample_range (`float`, default `1.0`):
153
- the maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
154
- set_alpha_to_zero (`bool`, default `True`):
155
- each diffusion step uses the value of alphas product at that step and at the previous one. For the final
156
- step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `0`,
157
- otherwise it uses the value of alpha at step `num_train_timesteps - 1`.
158
- steps_offset (`int`, default `0`):
159
- an offset added to the inference steps. You can use a combination of `offset=1` and
160
- `set_alpha_to_zero=False`, to make the last step use step `num_train_timesteps - 1` for the previous alpha
147
+ trained_betas (`np.ndarray`, *optional*):
148
+ Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
149
+ clip_sample (`bool`, defaults to `True`):
150
+ Clip the predicted sample for numerical stability.
151
+ clip_sample_range (`float`, defaults to 1.0):
152
+ The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
153
+ set_alpha_to_one (`bool`, defaults to `True`):
154
+ Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
155
+ there is no previous alpha. When this option is `True` the previous alpha product is fixed to 0, otherwise
156
+ it uses the alpha value at step `num_train_timesteps - 1`.
157
+ steps_offset (`int`, defaults to 0):
158
+ An offset added to the inference steps. You can use a combination of `offset=1` and
159
+ `set_alpha_to_one=False` to make the last step use `num_train_timesteps - 1` for the previous alpha
161
160
  product.
162
- prediction_type (`str`, default `epsilon`, optional):
163
- prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
164
- process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
165
- https://imagen.research.google/video/paper.pdf)
166
- timestep_spacing (`str`, default `"leading"`):
167
- The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample
168
- Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information.
169
- rescale_betas_zero_snr (`bool`, default `False`):
170
- whether to rescale the betas to have zero terminal SNR (proposed by https://arxiv.org/pdf/2305.08891.pdf).
171
- This can enable the model to generate very bright and dark samples instead of limiting it to samples with
172
- medium brightness. Loosely related to
161
+ prediction_type (`str`, defaults to `epsilon`, *optional*):
162
+ Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
163
+ `sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
164
+ Video](https://imagen.research.google/video/paper.pdf) paper).
165
+ timestep_spacing (`str`, defaults to `"leading"`):
166
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
167
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) paper for more information.
168
+ rescale_betas_zero_snr (`bool`, defaults to `False`):
169
+ Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
170
+ dark samples instead of limiting it to samples with medium brightness. Loosely related to
173
171
  [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
174
172
  """
175
173
 
@@ -243,21 +241,24 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
243
241
  current timestep.
244
242
 
245
243
  Args:
246
- sample (`torch.FloatTensor`): input sample
247
- timestep (`int`, optional): current timestep
244
+ sample (`torch.FloatTensor`):
245
+ The input sample.
246
+ timestep (`int`, *optional*):
247
+ The current timestep in the diffusion chain.
248
248
 
249
249
  Returns:
250
- `torch.FloatTensor`: scaled input sample
250
+ `torch.FloatTensor`:
251
+ A scaled input sample.
251
252
  """
252
253
  return sample
253
254
 
254
255
  def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
255
256
  """
256
- Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
257
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
257
258
 
258
259
  Args:
259
260
  num_inference_steps (`int`):
260
- the number of diffusion steps used when generating samples with a pre-trained model.
261
+ The number of diffusion steps used when generating samples with a pre-trained model.
261
262
  """
262
263
 
263
264
  if num_inference_steps > self.config.num_train_timesteps:
@@ -302,6 +303,37 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
302
303
  variance_noise: Optional[torch.FloatTensor] = None,
303
304
  return_dict: bool = True,
304
305
  ) -> Union[DDIMSchedulerOutput, Tuple]:
306
+ """
307
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
308
+ process from the learned model outputs (most often the predicted noise).
309
+
310
+ Args:
311
+ model_output (`torch.FloatTensor`):
312
+ The direct output from the learned diffusion model.
313
+ timestep (`float`):
314
+ The current discrete timestep in the diffusion chain.
315
+ sample (`torch.FloatTensor`):
316
+ A current instance of a sample created by the diffusion process.
317
+ eta (`float`):
318
+ The weight of the noise added in each diffusion step.
319
+ use_clipped_model_output (`bool`, defaults to `False`):
320
+ If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
321
+ because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
322
+ clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
323
+ `use_clipped_model_output` has no effect.
324
+ variance_noise (`torch.FloatTensor`):
325
+ Alternative to generating noise with `generator` by directly providing the noise for the variance
326
+ itself. Useful for methods such as [`CycleDiffusion`].
327
+ return_dict (`bool`, *optional*, defaults to `True`):
328
+ Whether or not to return a [`~schedulers.scheduling_ddim_inverse.DDIMSchedulerOutput`] or
329
+ `tuple`.
330
+
331
+ Returns:
332
+ [`~schedulers.scheduling_ddim_inverse.DDIMSchedulerOutput`] or `tuple`:
333
+ If `return_dict` is `True`, [`~schedulers.scheduling_ddim_inverse.DDIMSchedulerOutput`] is
334
+ returned, otherwise a tuple is returned where the first element is the sample tensor.
335
+
336
+ """
305
337
  # 1. get previous step value (=t+1)
306
338
  prev_timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps
307
339