diffusers 0.18.0__py3-none-any.whl → 0.18.2__py3-none-any.whl

diffusers/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.18.0"
+ __version__ = "0.18.2"
 
  from .configuration_utils import ConfigMixin
  from .utils import (
diffusers/configuration_utils.py CHANGED
@@ -607,7 +607,7 @@ def register_to_config(init):
 
  # Take note of the parameters that were not present in the loaded config
  if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
- new_kwargs["_use_default_values"] = set(new_kwargs.keys()) - set(init_kwargs)
+ new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
 
  new_kwargs = {**config_init_kwargs, **new_kwargs}
  getattr(self, "register_to_config")(**new_kwargs)
@@ -655,7 +655,7 @@ def flax_register_to_config(cls):
 
  # Take note of the parameters that were not present in the loaded config
  if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
- new_kwargs["_use_default_values"] = set(new_kwargs.keys()) - set(init_kwargs)
+ new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
 
  getattr(self, "register_to_config")(**new_kwargs)
  original_init(self, *args, **kwargs)
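Context for the change above: the registered config is later serialized to JSON (for example when `save_config` / `save_pretrained` writes `config.json`), and a Python `set` is not JSON-serializable while a `list` is. A minimal illustration with the standard `json` module (not the diffusers config machinery):

    import json

    defaulted = {"sample_size", "in_channels"} - {"sample_size"}

    # json.dumps({"_use_default_values": defaulted})              # TypeError: Object of type set is not JSON serializable
    print(json.dumps({"_use_default_values": list(defaulted)}))   # {"_use_default_values": ["in_channels"]}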
diffusers/loaders.py CHANGED
@@ -177,7 +177,7 @@ class UNet2DConditionLoadersMixin:
 
  if use_safetensors and not is_safetensors_available():
  raise ValueError(
- "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
+ "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
  )
 
  allow_pickle = False
@@ -589,7 +589,7 @@ class TextualInversionLoaderMixin:
 
  if use_safetensors and not is_safetensors_available():
  raise ValueError(
- "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
+ "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
  )
 
  allow_pickle = False
@@ -806,7 +806,7 @@ class LoraLoaderMixin:
 
  if use_safetensors and not is_safetensors_available():
  raise ValueError(
- "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
+ "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
  )
 
  allow_pickle = False
@@ -1054,7 +1054,7 @@ class LoraLoaderMixin:
 
  if use_safetensors and not is_safetensors_available():
  raise ValueError(
- "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
+ "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
  )
 
  allow_pickle = False
@@ -1394,7 +1394,7 @@ class FromSingleFileMixin:
  use_auth_token = kwargs.pop("use_auth_token", None)
  revision = kwargs.pop("revision", None)
  extract_ema = kwargs.pop("extract_ema", False)
- image_size = kwargs.pop("image_size", 512)
+ image_size = kwargs.pop("image_size", None)
  scheduler_type = kwargs.pop("scheduler_type", "pndm")
  num_in_channels = kwargs.pop("num_in_channels", None)
  upcast_attention = kwargs.pop("upcast_attention", None)
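With `image_size` now defaulting to `None`, `from_single_file` leaves the resolution to be inferred by the conversion logic (the conversion changes further down pick 1024 for SDXL checkpoints, for instance) instead of hard-coding 512. A usage sketch; the local checkpoint path is hypothetical:

    from diffusers import StableDiffusionPipeline

    # Hypothetical local file; pass image_size explicitly only to override the inferred value.
    pipe = StableDiffusionPipeline.from_single_file("./v1-5-pruned-emaonly.safetensors")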
diffusers/models/attention_flax.py CHANGED
@@ -152,6 +152,7 @@ class FlaxAttention(nn.Module):
  self.value = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_v")
 
  self.proj_attn = nn.Dense(self.query_dim, dtype=self.dtype, name="to_out_0")
+ self.dropout_layer = nn.Dropout(rate=self.dropout)
 
  def reshape_heads_to_batch_dim(self, tensor):
  batch_size, seq_len, dim = tensor.shape
@@ -214,7 +215,7 @@ class FlaxAttention(nn.Module):
 
  hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
  hidden_states = self.proj_attn(hidden_states)
- return hidden_states
+ return self.dropout_layer(hidden_states, deterministic=deterministic)
 
 
  class FlaxBasicTransformerBlock(nn.Module):
@@ -260,6 +261,7 @@ class FlaxBasicTransformerBlock(nn.Module):
  self.norm1 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
  self.norm2 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
  self.norm3 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
+ self.dropout_layer = nn.Dropout(rate=self.dropout)
 
  def __call__(self, hidden_states, context, deterministic=True):
  # self attention
@@ -280,7 +282,7 @@ class FlaxBasicTransformerBlock(nn.Module):
  hidden_states = self.ff(self.norm3(hidden_states), deterministic=deterministic)
  hidden_states = hidden_states + residual
 
- return hidden_states
+ return self.dropout_layer(hidden_states, deterministic=deterministic)
 
 
  class FlaxTransformer2DModel(nn.Module):
@@ -356,6 +358,8 @@ class FlaxTransformer2DModel(nn.Module):
  dtype=self.dtype,
  )
 
+ self.dropout_layer = nn.Dropout(rate=self.dropout)
+
  def __call__(self, hidden_states, context, deterministic=True):
  batch, height, width, channels = hidden_states.shape
  residual = hidden_states
@@ -378,7 +382,7 @@ class FlaxTransformer2DModel(nn.Module):
  hidden_states = self.proj_out(hidden_states)
 
  hidden_states = hidden_states + residual
- return hidden_states
+ return self.dropout_layer(hidden_states, deterministic=deterministic)
 
 
  class FlaxFeedForward(nn.Module):
@@ -409,7 +413,7 @@ class FlaxFeedForward(nn.Module):
  self.net_2 = nn.Dense(self.dim, dtype=self.dtype)
 
  def __call__(self, hidden_states, deterministic=True):
- hidden_states = self.net_0(hidden_states)
+ hidden_states = self.net_0(hidden_states, deterministic=deterministic)
  hidden_states = self.net_2(hidden_states)
  return hidden_states
 
@@ -434,8 +438,9 @@ class FlaxGEGLU(nn.Module):
  def setup(self):
  inner_dim = self.dim * 4
  self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype)
+ self.dropout_layer = nn.Dropout(rate=self.dropout)
 
  def __call__(self, hidden_states, deterministic=True):
  hidden_states = self.proj(hidden_states)
  hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=2)
- return hidden_linear * nn.gelu(hidden_gelu)
+ return self.dropout_layer(hidden_linear * nn.gelu(hidden_gelu), deterministic=deterministic)
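All of the Flax blocks above now route their outputs through `nn.Dropout`, gated by the existing `deterministic` flag. A minimal sketch of that pattern in plain `flax.linen` (a stand-in module, not the diffusers classes): dropout is a no-op when `deterministic=True` and needs a "dropout" PRNG key otherwise.

    import jax
    import flax.linen as nn

    class Block(nn.Module):
        rate: float = 0.1

        @nn.compact
        def __call__(self, x, deterministic=True):
            return nn.Dropout(rate=self.rate)(x, deterministic=deterministic)

    x = jax.numpy.ones((1, 4))
    block = Block()
    variables = block.init(jax.random.PRNGKey(0), x)  # deterministic by default, no dropout rng needed
    y = block.apply(variables, x, deterministic=False, rngs={"dropout": jax.random.PRNGKey(1)})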
diffusers/models/modeling_utils.py CHANGED
@@ -456,7 +456,7 @@ class ModelMixin(torch.nn.Module):
 
  if use_safetensors and not is_safetensors_available():
  raise ValueError(
- "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
+ "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
  )
 
  allow_pickle = False
diffusers/pipelines/pipeline_utils.py CHANGED
@@ -204,7 +204,7 @@ def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLi
  transformers_index_format = r"\d{5}-of-\d{5}"
 
  if variant is not None:
- # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetenstors`
+ # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetensors`
  variant_file_re = re.compile(
  rf"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$"
  )
@@ -213,7 +213,7 @@ def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLi
  rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.{variant}\.json$"
  )
 
- # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetenstors`
+ # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetensors`
  non_variant_file_re = re.compile(
  rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$"
  )
@@ -1168,7 +1168,7 @@ class DiffusionPipeline(ConfigMixin):
 
  if use_safetensors and not is_safetensors_available():
  raise ValueError(
- "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
+ "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
  )
 
  allow_pickle = False
@@ -1213,6 +1213,15 @@ class DiffusionPipeline(ConfigMixin):
  filenames = {sibling.rfilename for sibling in info.siblings}
  model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant)
 
+ if len(variant_filenames) == 0 and variant is not None:
+ deprecation_message = (
+ f"You are trying to load the model files of the `variant={variant}`, but no such modeling files are available."
+ f"The default model files: {model_filenames} will be loaded instead. Make sure to not load from `variant={variant}`"
+ "if such variant modeling files are not available. Doing so will lead to an error in v0.22.0 as defaulting to non-variant"
+ "modeling files is deprecated."
+ )
+ deprecate("no variant default", "0.22.0", deprecation_message, standard_warn=False)
+
  # remove ignored filenames
  model_filenames = set(model_filenames) - set(ignore_filenames)
  variant_filenames = set(variant_filenames) - set(ignore_filenames)
@@ -1302,7 +1311,7 @@ class DiffusionPipeline(ConfigMixin):
  snapshot_folder = Path(config_file).parent
  pipeline_is_cached = all((snapshot_folder / f).is_file() for f in expected_files)
 
- if pipeline_is_cached:
+ if pipeline_is_cached and not force_download:
  # if the pipeline is cached, we can directly return it
  # else call snapshot_download
  return snapshot_folder
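The `and not force_download` guard above means an explicit `force_download=True` now re-fetches a pipeline even when a complete snapshot is already cached. A usage sketch (the repo id is only an example):

    from diffusers import DiffusionPipeline

    # Re-downloads the files instead of returning the cached snapshot folder.
    pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", force_download=True)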
diffusers/pipelines/stable_diffusion/convert_from_ckpt.py CHANGED
@@ -24,6 +24,7 @@ from transformers import (
  AutoFeatureExtractor,
  BertTokenizerFast,
  CLIPImageProcessor,
+ CLIPTextConfig,
  CLIPTextModel,
  CLIPTextModelWithProjection,
  CLIPTokenizer,
@@ -48,7 +49,7 @@ from ...schedulers import (
  PNDMScheduler,
  UnCLIPScheduler,
  )
- from ...utils import is_omegaconf_available, is_safetensors_available, logging
+ from ...utils import is_accelerate_available, is_omegaconf_available, is_safetensors_available, logging
  from ...utils.import_utils import BACKENDS_MAPPING
  from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
  from ..paint_by_example import PaintByExampleImageEncoder
@@ -57,6 +58,10 @@ from .safety_checker import StableDiffusionSafetyChecker
  from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
 
 
+ if is_accelerate_available():
+ from accelerate import init_empty_weights
+ from accelerate.utils import set_module_tensor_to_device
+
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
 
 
@@ -391,8 +396,8 @@ def convert_ldm_unet_checkpoint(
 
  # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
  if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
- print(f"Checkpoint {path} has both EMA and non-EMA weights.")
- print(
+ logger.warning(f"Checkpoint {path} has both EMA and non-EMA weights.")
+ logger.warning(
  "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
  " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
  )
@@ -402,7 +407,7 @@ def convert_ldm_unet_checkpoint(
  unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
  else:
  if sum(k.startswith("model_ema") for k in keys) > 100:
- print(
+ logger.warning(
  "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
  " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
  )
@@ -770,11 +775,12 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
 
 
  def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
- text_model = (
- CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
- if text_encoder is None
- else text_encoder
- )
+ if text_encoder is None:
+ config_name = "openai/clip-vit-large-patch14"
+ config = CLIPTextConfig.from_pretrained(config_name)
+
+ with init_empty_weights():
+ text_model = CLIPTextModel(config)
 
  keys = list(checkpoint.keys())
 
@@ -787,7 +793,8 @@ def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder
  if key.startswith(prefix):
  text_model_dict[key[len(prefix + ".") :]] = checkpoint[key]
 
- text_model.load_state_dict(text_model_dict)
+ for param_name, param in text_model_dict.items():
+ set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
 
  return text_model
 
@@ -884,14 +891,26 @@ def convert_paint_by_example_checkpoint(checkpoint):
  return model
 
 
- def convert_open_clip_checkpoint(checkpoint, prefix="cond_stage_model.model."):
+ def convert_open_clip_checkpoint(
+ checkpoint, config_name, prefix="cond_stage_model.model.", has_projection=False, **config_kwargs
+ ):
  # text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
- text_model = CLIPTextModelWithProjection.from_pretrained(
- "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
- )
+ # text_model = CLIPTextModelWithProjection.from_pretrained(
+ # "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
+ # )
+ config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs)
+
+ with init_empty_weights():
+ text_model = CLIPTextModelWithProjection(config) if has_projection else CLIPTextModel(config)
 
  keys = list(checkpoint.keys())
 
+ keys_to_ignore = []
+ if config_name == "stabilityai/stable-diffusion-2" and config.num_hidden_layers == 23:
+ # make sure to remove all keys > 22
+ keys_to_ignore += [k for k in keys if k.startswith("cond_stage_model.model.transformer.resblocks.23")]
+ keys_to_ignore += ["cond_stage_model.model.text_projection"]
+
  text_model_dict = {}
 
  if prefix + "text_projection" in checkpoint:
@@ -902,8 +921,8 @@ def convert_open_clip_checkpoint(checkpoint, prefix="cond_stage_model.model."):
  text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
 
  for key in keys:
- # if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer
- # continue
+ if key in keys_to_ignore:
+ continue
  if key[len(prefix) :] in textenc_conversion_map:
  if key.endswith("text_projection"):
  value = checkpoint[key].T
@@ -931,7 +950,8 @@ def convert_open_clip_checkpoint(checkpoint, prefix="cond_stage_model.model."):
 
  text_model_dict[new_key] = checkpoint[key]
 
- text_model.load_state_dict(text_model_dict)
+ for param_name, param in text_model_dict.items():
+ set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
 
  return text_model
 
@@ -1061,7 +1081,7 @@ def convert_controlnet_checkpoint(
  def download_from_original_stable_diffusion_ckpt(
  checkpoint_path: str,
  original_config_file: str = None,
- image_size: int = 512,
+ image_size: Optional[int] = None,
  prediction_type: str = None,
  model_type: str = None,
  extract_ema: bool = False,
@@ -1144,6 +1164,7 @@ def download_from_original_stable_diffusion_ckpt(
  LDMTextToImagePipeline,
  PaintByExamplePipeline,
  StableDiffusionControlNetPipeline,
+ StableDiffusionInpaintPipeline,
  StableDiffusionPipeline,
  StableDiffusionXLImg2ImgPipeline,
  StableDiffusionXLPipeline,
@@ -1166,12 +1187,9 @@ def download_from_original_stable_diffusion_ckpt(
  if not is_safetensors_available():
  raise ValueError(BACKENDS_MAPPING["safetensors"][1])
 
- from safetensors import safe_open
+ from safetensors.torch import load_file as safe_load
 
- checkpoint = {}
- with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
- for key in f.keys():
- checkpoint[key] = f.get_tensor(key)
+ checkpoint = safe_load(checkpoint_path, device="cpu")
  else:
  if device is None:
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -1183,7 +1201,7 @@ def download_from_original_stable_diffusion_ckpt(
  if "global_step" in checkpoint:
  global_step = checkpoint["global_step"]
  else:
- print("global_step key not found in model")
+ logger.debug("global_step key not found in model")
  global_step = None
 
  # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
@@ -1230,8 +1248,15 @@ def download_from_original_stable_diffusion_ckpt(
  model_type = "SDXL"
  else:
  model_type = "SDXL-Refiner"
+ if image_size is None:
+ image_size = 1024
 
- if num_in_channels is not None:
+ if num_in_channels is None and pipeline_class == StableDiffusionInpaintPipeline:
+ num_in_channels = 9
+ elif num_in_channels is None:
+ num_in_channels = 4
+
+ if "unet_config" in original_config.model.params:
  original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
 
  if (
@@ -1263,7 +1288,6 @@ def download_from_original_stable_diffusion_ckpt(
  num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000
 
  if model_type in ["SDXL", "SDXL-Refiner"]:
- image_size = 1024
  scheduler_dict = {
  "beta_schedule": "scaled_linear",
  "beta_start": 0.00085,
@@ -1279,7 +1303,6 @@ def download_from_original_stable_diffusion_ckpt(
  }
  scheduler = EulerDiscreteScheduler.from_config(scheduler_dict)
  scheduler_type = "euler"
- vae_path = "stabilityai/sdxl-vae"
  else:
  beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02
  beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085
@@ -1318,25 +1341,45 @@ def download_from_original_stable_diffusion_ckpt(
  # Convert the UNet2DConditionModel model.
  unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
  unet_config["upcast_attention"] = upcast_attention
- unet = UNet2DConditionModel(**unet_config)
+ with init_empty_weights():
+ unet = UNet2DConditionModel(**unet_config)
 
  converted_unet_checkpoint = convert_ldm_unet_checkpoint(
  checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
  )
- unet.load_state_dict(converted_unet_checkpoint)
+
+ for param_name, param in converted_unet_checkpoint.items():
+ set_module_tensor_to_device(unet, param_name, "cpu", value=param)
 
  # Convert the VAE model.
  if vae_path is None:
  vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
  converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
 
- vae = AutoencoderKL(**vae_config)
- vae.load_state_dict(converted_vae_checkpoint)
+ if (
+ "model" in original_config
+ and "params" in original_config.model
+ and "scale_factor" in original_config.model.params
+ ):
+ vae_scaling_factor = original_config.model.params.scale_factor
+ else:
+ vae_scaling_factor = 0.18215 # default SD scaling factor
+
+ vae_config["scaling_factor"] = vae_scaling_factor
+
+ with init_empty_weights():
+ vae = AutoencoderKL(**vae_config)
+
+ for param_name, param in converted_vae_checkpoint.items():
+ set_module_tensor_to_device(vae, param_name, "cpu", value=param)
  else:
  vae = AutoencoderKL.from_pretrained(vae_path)
 
  if model_type == "FrozenOpenCLIPEmbedder":
- text_model = convert_open_clip_checkpoint(checkpoint)
+ config_name = "stabilityai/stable-diffusion-2"
+ config_kwargs = {"subfolder": "text_encoder"}
+
+ text_model = convert_open_clip_checkpoint(checkpoint, config_name, **config_kwargs)
  tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")
 
  if stable_unclip is None:
@@ -1469,7 +1512,12 @@ def download_from_original_stable_diffusion_ckpt(
  tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
  text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
  tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
- text_encoder_2 = convert_open_clip_checkpoint(checkpoint, prefix="conditioner.embedders.1.model.")
+
+ config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+ config_kwargs = {"projection_dim": 1280}
+ text_encoder_2 = convert_open_clip_checkpoint(
+ checkpoint, config_name, prefix="conditioner.embedders.1.model.", has_projection=True, **config_kwargs
+ )
 
  pipe = StableDiffusionXLPipeline(
  vae=vae,
@@ -1485,7 +1533,12 @@ def download_from_original_stable_diffusion_ckpt(
  tokenizer = None
  text_encoder = None
  tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
- text_encoder_2 = convert_open_clip_checkpoint(checkpoint, prefix="conditioner.embedders.0.model.")
+
+ config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+ config_kwargs = {"projection_dim": 1280}
+ text_encoder_2 = convert_open_clip_checkpoint(
+ checkpoint, config_name, prefix="conditioner.embedders.0.model.", has_projection=True, **config_kwargs
+ )
 
  pipe = StableDiffusionXLImg2ImgPipeline(
  vae=vae,
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py CHANGED
@@ -24,7 +24,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
  from ...configuration_utils import FrozenDict
  from ...image_processor import VaeImageProcessor
- from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, UNet2DConditionModel
  from ...schedulers import KarrasDiffusionSchedulers
  from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor
@@ -153,7 +153,9 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
  return mask, masked_image
 
 
- class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+ class StableDiffusionInpaintPipeline(
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+ ):
  r"""
  Pipeline for text-guided image inpainting using Stable Diffusion.
 
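With `FromSingleFileMixin` added to the base classes (and the `num_in_channels = 9` default in the conversion code above), an inpainting checkpoint can be loaded straight from a single original-format file. A sketch; the file path is hypothetical:

    from diffusers import StableDiffusionInpaintPipeline

    # Hypothetical local checkpoint; 9 UNet input channels are assumed for inpainting models.
    pipe = StableDiffusionInpaintPipeline.from_single_file("./sd-v1-5-inpainting.ckpt")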
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py CHANGED
@@ -748,15 +748,19 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMi
  # make sure the VAE is in float32 mode, as it overflows in float16
  self.vae.to(dtype=torch.float32)
 
- use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
- AttnProcessor2_0,
- XFormersAttnProcessor,
- LoRAXFormersAttnProcessor,
- LoRAAttnProcessor2_0,
- ]
+ use_torch_2_0_or_xformers = isinstance(
+ self.vae.decoder.mid_block.attentions[0].processor,
+ (
+ AttnProcessor2_0,
+ XFormersAttnProcessor,
+ LoRAXFormersAttnProcessor,
+ LoRAAttnProcessor2_0,
+ ),
+ )
+
  # if xformers or torch_2_0 is used attention block does not need
  # to be in float32 which can save lots of memory
- if not use_torch_2_0_or_xformers:
+ if use_torch_2_0_or_xformers:
  self.vae.post_quant_conv.to(latents.dtype)
  self.vae.decoder.conv_in.to(latents.dtype)
  self.vae.decoder.mid_block.to(latents.dtype)
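The `isinstance` rewrite above is also a behavior fix: the old code asked whether a processor *instance* was contained in a list of *classes*, which is always `False`, so combined with the `not` the re-cast to the latents dtype ran unconditionally. A minimal illustration of the difference (stand-in class, not the real processor):

    class AttnProcessor2_0:  # stand-in for the diffusers attention processor class
        pass

    proc = AttnProcessor2_0()

    print(proc in [AttnProcessor2_0])             # False: compares the instance against the class object
    print(isinstance(proc, (AttnProcessor2_0,)))  # True: what the patched code checks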
diffusers/pipelines/stable_diffusion_xl/__init__.py CHANGED
@@ -8,7 +8,6 @@ from ...utils import BaseOutput, is_invisible_watermark_available, is_torch_avai
 
 
  @dataclass
- # Copied from diffusers.pipelines.stable_diffusion.__init__.StableDiffusionPipelineOutput with StableDiffusion->StableDiffusionXL
  class StableDiffusionXLPipelineOutput(BaseOutput):
  """
  Output class for Stable Diffusion pipelines.
@@ -17,13 +16,9 @@ class StableDiffusionXLPipelineOutput(BaseOutput):
  images (`List[PIL.Image.Image]` or `np.ndarray`)
  List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
  num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
- nsfw_content_detected (`List[bool]`)
- List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
- (nsfw) content, or `None` if safety checking could not be performed.
  """
 
  images: Union[List[PIL.Image.Image], np.ndarray]
- nsfw_content_detected: Optional[List[bool]]
 
 
  if is_transformers_available() and is_torch_available() and is_invisible_watermark_available():
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py CHANGED
@@ -129,9 +129,11 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
  self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
  self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+ self.default_sample_size = self.unet.config.sample_size
 
  self.watermark = StableDiffusionXLWatermarker()
 
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
  def enable_vae_slicing(self):
  r"""
  Enable sliced VAE decoding.
@@ -141,6 +143,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  """
  self.vae.enable_slicing()
 
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
  def disable_vae_slicing(self):
  r"""
  Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
@@ -148,6 +151,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  """
  self.vae.disable_slicing()
 
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
  def enable_vae_tiling(self):
  r"""
  Enable tiled VAE decoding.
@@ -157,6 +161,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  """
  self.vae.enable_tiling()
 
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
  def disable_vae_tiling(self):
  r"""
  Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
@@ -183,7 +188,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  self.to("cpu", silence_dtype_warnings=True)
  torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
 
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.text_encoder_2, self.vae]:
  cpu_offload(cpu_offloaded_model, device)
 
  def enable_model_cpu_offload(self, gpu_id=0):
@@ -217,6 +222,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  self.final_offload_hook = hook
 
  @property
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
  def _execution_device(self):
  r"""
  Returns the device on which the pipeline's models will be executed. After calling
@@ -237,12 +243,14 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  def encode_prompt(
  self,
  prompt,
- device,
- num_images_per_prompt,
- do_classifier_free_guidance,
+ device: Optional[torch.device] = None,
+ num_images_per_prompt: int = 1,
+ do_classifier_free_guidance: bool = True,
  negative_prompt=None,
  prompt_embeds: Optional[torch.FloatTensor] = None,
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
  lora_scale: Optional[float] = None,
  ):
  r"""
@@ -268,9 +276,18 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+ input argument.
  lora_scale (`float`, *optional*):
  A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
  """
+ device = device or self._execution_device
+
  # set lora scale so that monkey patched LoRA
  # function of text encoder can correctly access it
  if lora_scale is not None and isinstance(self, LoraLoaderMixin):
@@ -399,6 +416,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
 
  negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
 
+ bs_embed = pooled_prompt_embeds.shape[0]
  pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
  bs_embed * num_images_per_prompt, -1
  )
@@ -408,20 +426,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
 
  return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
 
- def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is None:
- has_nsfw_concept = None
- else:
- if torch.is_tensor(image):
- feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
- else:
- feature_extractor_input = self.image_processor.numpy_to_pil(image)
- safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
- image, has_nsfw_concept = self.safety_checker(
- images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
- )
- return image, has_nsfw_concept
-
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
  def prepare_extra_step_kwargs(self, generator, eta):
  # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
  # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -448,6 +453,8 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  negative_prompt=None,
  prompt_embeds=None,
  negative_prompt_embeds=None,
+ pooled_prompt_embeds=None,
+ negative_pooled_prompt_embeds=None,
  ):
  if height % 8 != 0 or width % 8 != 0:
  raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
@@ -486,6 +493,17 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  f" {negative_prompt_embeds.shape}."
  )
 
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
+ raise ValueError(
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+ )
+
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+ raise ValueError(
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+ )
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
  shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
  if isinstance(generator, list) and len(generator) != batch_size:
@@ -535,6 +553,8 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  latents: Optional[torch.FloatTensor] = None,
  prompt_embeds: Optional[torch.FloatTensor] = None,
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
  callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -588,6 +608,13 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+ input argument.
  output_type (`str`, *optional*, defaults to `"pil"`):
  The output format of the generate image. Choose between
  [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -626,15 +653,23 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
  """
  # 0. Default height and width to unet
- height = height or self.unet.config.sample_size * self.vae_scale_factor
- width = width or self.unet.config.sample_size * self.vae_scale_factor
+ height = height or self.default_sample_size * self.vae_scale_factor
+ width = width or self.default_sample_size * self.vae_scale_factor
 
  original_size = original_size or (height, width)
  target_size = target_size or (height, width)
 
  # 1. Check inputs. Raise error if not correct
  self.check_inputs(
- prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+ prompt,
+ height,
+ width,
+ callback_steps,
+ negative_prompt,
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
  )
 
  # 2. Define call parameters
@@ -669,6 +704,8 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  negative_prompt,
  prompt_embeds=prompt_embeds,
  negative_prompt_embeds=negative_prompt_embeds,
+ pooled_prompt_embeds=pooled_prompt_embeds,
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
  lora_scale=text_encoder_lora_scale,
  )
 
@@ -749,15 +786,18 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  # make sure the VAE is in float32 mode, as it overflows in float16
  self.vae.to(dtype=torch.float32)
 
- use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
- AttnProcessor2_0,
- XFormersAttnProcessor,
- LoRAXFormersAttnProcessor,
- LoRAAttnProcessor2_0,
- ]
+ use_torch_2_0_or_xformers = isinstance(
+ self.vae.decoder.mid_block.attentions[0].processor,
+ (
+ AttnProcessor2_0,
+ XFormersAttnProcessor,
+ LoRAXFormersAttnProcessor,
+ LoRAAttnProcessor2_0,
+ ),
+ )
  # if xformers or torch_2_0 is used attention block does not need
  # to be in float32 which can save lots of memory
- if not use_torch_2_0_or_xformers:
+ if use_torch_2_0_or_xformers:
  self.vae.post_quant_conv.to(latents.dtype)
  self.vae.decoder.conv_in.to(latents.dtype)
  self.vae.decoder.mid_block.to(latents.dtype)
@@ -765,27 +805,19 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
  latents = latents.float()
 
  if not output_type == "latent":
- # CHECK there is problem here (PVP)
  image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
- has_nsfw_concept = None
  else:
  image = latents
- has_nsfw_concept = None
- return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=None)
-
- if has_nsfw_concept is None:
- do_denormalize = [True] * image.shape[0]
- else:
- do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+ return StableDiffusionXLPipelineOutput(images=image)
 
  image = self.watermark.apply_watermark(image)
- image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type)
 
  # Offload last model to CPU
  if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
  self.final_offload_hook.offload()
 
  if not return_dict:
- return (image, has_nsfw_concept)
+ return (image,)
 
- return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionXLPipelineOutput(images=image)
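Because `encode_prompt` now also returns the pooled embeddings and `__call__` accepts them, prompt-embedding workflows pass all four tensors through. A hedged sketch of the call pattern (model id and prompt are illustrative):

    import torch
    from diffusers import StableDiffusionXLPipeline

    pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
    ).to("cuda")

    # encode_prompt returns (prompt_embeds, negative_prompt_embeds,
    # pooled_prompt_embeds, negative_pooled_prompt_embeds)
    prompt_embeds, neg_embeds, pooled, neg_pooled = pipe.encode_prompt("an astronaut riding a horse")

    image = pipe(
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=neg_embeds,
        pooled_prompt_embeds=pooled,              # required whenever prompt_embeds is passed
        negative_pooled_prompt_embeds=neg_pooled,
    ).images[0]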
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py CHANGED
@@ -140,6 +140,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
 
  self.watermark = StableDiffusionXLWatermarker()
 
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
  def enable_vae_slicing(self):
  r"""
  Enable sliced VAE decoding.
@@ -149,6 +150,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  """
  self.vae.enable_slicing()
 
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
  def disable_vae_slicing(self):
  r"""
  Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
@@ -156,6 +158,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  """
  self.vae.disable_slicing()
 
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
  def enable_vae_tiling(self):
  r"""
  Enable tiled VAE decoding.
@@ -165,6 +168,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  """
  self.vae.enable_tiling()
 
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
  def disable_vae_tiling(self):
  r"""
  Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
@@ -172,6 +176,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  """
  self.vae.disable_tiling()
 
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.enable_sequential_cpu_offload
  def enable_sequential_cpu_offload(self, gpu_id=0):
  r"""
  Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
@@ -191,9 +196,10 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  self.to("cpu", silence_dtype_warnings=True)
  torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
 
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.text_encoder_2, self.vae]:
  cpu_offload(cpu_offloaded_model, device)
 
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.enable_model_cpu_offload
  def enable_model_cpu_offload(self, gpu_id=0):
  r"""
  Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -225,6 +231,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  self.final_offload_hook = hook
 
  @property
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
  def _execution_device(self):
  r"""
  Returns the device on which the pipeline's models will be executed. After calling
@@ -242,15 +249,18 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  return torch.device(module._hf_hook.execution_device)
  return self.device
 
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
  def encode_prompt(
  self,
  prompt,
- device,
- num_images_per_prompt,
- do_classifier_free_guidance,
+ device: Optional[torch.device] = None,
+ num_images_per_prompt: int = 1,
+ do_classifier_free_guidance: bool = True,
  negative_prompt=None,
  prompt_embeds: Optional[torch.FloatTensor] = None,
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
  lora_scale: Optional[float] = None,
  ):
  r"""
@@ -276,9 +286,18 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+ input argument.
  lora_scale (`float`, *optional*):
  A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
  """
+ device = device or self._execution_device
+
  # set lora scale so that monkey patched LoRA
  # function of text encoder can correctly access it
  if lora_scale is not None and isinstance(self, LoraLoaderMixin):
@@ -327,13 +346,11 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  text_input_ids.to(device),
  output_hidden_states=True,
  )
+
  # We are only ALWAYS interested in the pooled output of the final text encoder
  pooled_prompt_embeds = prompt_embeds[0]
-
  prompt_embeds = prompt_embeds.hidden_states[-2]
 
- prompt_embeds = prompt_embeds
-
  bs_embed, seq_len, _ = prompt_embeds.shape
  # duplicate text embeddings for each generation per prompt, using mps friendly method
  prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
@@ -349,10 +366,9 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  negative_prompt_embeds = torch.zeros_like(prompt_embeds)
  negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
  elif do_classifier_free_guidance and negative_prompt_embeds is None:
+ negative_prompt = negative_prompt or ""
  uncond_tokens: List[str]
- if negative_prompt is None:
- uncond_tokens = [""] * batch_size
- elif prompt is not None and type(prompt) is not type(negative_prompt):
+ if prompt is not None and type(prompt) is not type(negative_prompt):
  raise TypeError(
  f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
  f" {type(prompt)}."
@@ -389,7 +405,6 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  )
  # We are only ALWAYS interested in the pooled output of the final text encoder
  negative_pooled_prompt_embeds = negative_prompt_embeds[0]
-
  negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
 
  if do_classifier_free_guidance:
@@ -411,6 +426,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
 
  negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
 
+ bs_embed = pooled_prompt_embeds.shape[0]
  pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
  bs_embed * num_images_per_prompt, -1
  )
@@ -420,20 +436,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
 
  return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
 
- def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is None:
- has_nsfw_concept = None
- else:
- if torch.is_tensor(image):
- feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
- else:
- feature_extractor_input = self.image_processor.numpy_to_pil(image)
- safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
- image, has_nsfw_concept = self.safety_checker(
- images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
- )
- return image, has_nsfw_concept
-
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
  def prepare_extra_step_kwargs(self, generator, eta):
  # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
  # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -624,6 +627,8 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  latents: Optional[torch.FloatTensor] = None,
  prompt_embeds: Optional[torch.FloatTensor] = None,
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
  callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -683,6 +688,13 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+ input argument.
  output_type (`str`, *optional*, defaults to `"pil"`):
  The output format of the generate image. Choose between
  [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -759,6 +771,8 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  negative_prompt,
  prompt_embeds=prompt_embeds,
  negative_prompt_embeds=negative_prompt_embeds,
+ pooled_prompt_embeds=pooled_prompt_embeds,
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
  lora_scale=text_encoder_lora_scale,
  )
 
@@ -845,15 +859,18 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
  # make sure the VAE is in float32 mode, as it overflows in float16
  self.vae.to(dtype=torch.float32)
 
- use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
- AttnProcessor2_0,
- XFormersAttnProcessor,
- LoRAXFormersAttnProcessor,
- LoRAAttnProcessor2_0,
- ]
+ use_torch_2_0_or_xformers = isinstance(
+ self.vae.decoder.mid_block.attentions[0].processor,
+ (
+ AttnProcessor2_0,
+ XFormersAttnProcessor,
+ LoRAXFormersAttnProcessor,
+ LoRAAttnProcessor2_0,
+ ),
+ )
  # if xformers or torch_2_0 is used attention block does not need
  # to be in float32 which can save lots of memory
- if not use_torch_2_0_or_xformers:
+ if use_torch_2_0_or_xformers:
  self.vae.post_quant_conv.to(latents.dtype)
  self.vae.decoder.conv_in.to(latents.dtype)
  self.vae.decoder.mid_block.to(latents.dtype)
@@ -862,24 +879,18 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
 
  if not output_type == "latent":
  image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
- has_nsfw_concept = None
  else:
  image = latents
- return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=None)
-
- if has_nsfw_concept is None:
- do_denormalize = [True] * image.shape[0]
- else:
- do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+ return StableDiffusionXLPipelineOutput(images=image)
 
  image = self.watermark.apply_watermark(image)
- image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+ image = self.image_processor.postprocess(image, output_type=output_type)
 
  # Offload last model to CPU
  if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
  self.final_offload_hook.offload()
 
  if not return_dict:
- return (image, has_nsfw_concept)
+ return (image,)
 
- return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+ return StableDiffusionXLPipelineOutput(images=image)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: diffusers
- Version: 0.18.0
+ Version: 0.18.2
  Summary: Diffusers
  Home-page: https://github.com/huggingface/diffusers
  Author: The HuggingFace team
@@ -1,9 +1,9 @@
- diffusers/__init__.py,sha256=Wcm-mUVXAJwULWpuWcyFbdceTZdVb6gqW0NG8YGD30A,9329
- diffusers/configuration_utils.py,sha256=xBuxUFnruv-0Y9obZvbvM3-0l9MRel1J--8V46WTB98,30357
+ diffusers/__init__.py,sha256=Vtoe0ie8nREHRwBNNwzbyQ2rwqLTcB4399y6DBFTOok,9329
+ diffusers/configuration_utils.py,sha256=--Nwf_FViQXq71M8PcgUUjT_YoLV1WYqV49Fnk-amkk,30369
  diffusers/dependency_versions_check.py,sha256=T2AQMFfOGMCULAqRAE8zf1VE5j7GFxxs7SfEuhTY4lA,1756
  diffusers/dependency_versions_table.py,sha256=TnzJqBXnJYMXeMw61Lgq_QlTkjWydwOKDIKbV0RXG6Q,1446
  diffusers/image_processor.py,sha256=VqdToqZY-Xdb0sqibwVn1A9gdGOU3OvgQpr67mnMWGg,13700
- diffusers/loaders.py,sha256=sbotPO3y1mRXF4byG9DMwmqWeD_wWNYBIycY1qgUCuI,75164
+ diffusers/loaders.py,sha256=9trJ4QdgKOmfqguAKHq73fu5VDjw13krtgyJq7AnpQw,75161
  diffusers/optimization.py,sha256=KZpFO98pzgt1l-etti_7k5c-EK9WEY3-XossN6VEGrs,14546
  diffusers/pipeline_utils.py,sha256=dJVuXQ_ZBHkW64dwPbIPM51QnqQKIp9-WSIhRQYlJg4,1147
  diffusers/training_utils.py,sha256=TEuw7ro2RT35ujfMW2DKzb1KZpF4-HfuKSZ1NNnIIvI,13195
@@ -16,7 +16,7 @@ diffusers/experimental/rl/value_guided_sampling.py,sha256=iIhf1gc2QP7Jx4HrsoOyRC
  diffusers/models/__init__.py,sha256=MDG83d8C1YGGSnGNwi9sG6c33_FEaMGS3BVGnaqWJqQ,1446
  diffusers/models/activations.py,sha256=cWe7qw4wR626ADw-abcV3lI1v5Vim_R_eNMc5jPlaLo,297
  diffusers/models/attention.py,sha256=Nfmze9IvGR5a6ir9o0Z4DbAQ8repJxBo2_t4fDsnvHw,15197
- diffusers/models/attention_flax.py,sha256=IHc1OfaIfmlJ2xYHdZ2UGixO9m08ThBN-6C7fv1XEb0,17680
+ diffusers/models/attention_flax.py,sha256=6IOINRK5flDgnzsLiSLIfhBnDtdY9LyhcDIUXVS_Gag,18142
  diffusers/models/attention_processor.py,sha256=04g9405fWhb-C0xO9cnn-LfAMcSwxZ9fOzYrX98aa6A,70119
  diffusers/models/autoencoder_kl.py,sha256=qM2oRqJROHvA3PSwMDmNISQzK3oFmgJiRRzvHZw9dHQ,17913
  diffusers/models/controlnet.py,sha256=OzCVtpmlJXTfIze3Bmc6p7lGFirxvlI-MroHL7HQ5mQ,33086
@@ -28,7 +28,7 @@ diffusers/models/embeddings_flax.py,sha256=87ysODCdTERpYfH-EDhElOUyCAu8z6-xIQCqL
  diffusers/models/modeling_flax_pytorch_utils.py,sha256=yFQHU86DdvrzFLfkTbyZZ0_PWKrjnp08s46dD-wf_tw,4601
  diffusers/models/modeling_flax_utils.py,sha256=0ailGzoCLU5-81rn048e2UJEr0S1lHGBQGqpOJzWfWQ,26071
  diffusers/models/modeling_pytorch_flax_utils.py,sha256=5dt6mC956MYrIMp8Owvx8QQv8xsfik6vu0frgb_c6HE,6974
- diffusers/models/modeling_utils.py,sha256=_tRAf4PGPdH1gqHoYlPEiNoTuSpj9RejTxpue8BBvIA,46589
+ diffusers/models/modeling_utils.py,sha256=vHf-AWIwuTvyjtOCbTryupWmQLxiujNhBfVL0hmop_k,46588
  diffusers/models/prior_transformer.py,sha256=5A8Tgq4VXkjH0ib05kPHXPObekLYdrRwuCgnGvoMVN4,16574
  diffusers/models/resnet.py,sha256=y9FIuXYUTHYA3AFUeDBwiHJVu0crM0fMRnzEJ3ZtVf4,35294
  diffusers/models/resnet_flax.py,sha256=VKF-ti1jlH_GnlWRy9dY6ETc-W9ZitfQoNjmrFAQxuU,4021
@@ -50,7 +50,7 @@ diffusers/models/vq_model.py,sha256=_98GsNUGg3HxcC97zQSgxEPVuDNvn1DcJP6TCTpGLVE,
  diffusers/pipelines/__init__.py,sha256=pjJh4SXSHjSBtzzAsiuQp64YQ03xPMdgTzK-0-iV9Ew,7009
  diffusers/pipelines/onnx_utils.py,sha256=M-6GBVRFji_ik5x1CMxrz9r5oEBr9TTblqLsI1HfiS4,8282
  diffusers/pipelines/pipeline_flax_utils.py,sha256=CLjAhcwfBJ1xTbdRbyWHGdcd5uRJDoXDdxruuK2t2iM,25924
- diffusers/pipelines/pipeline_utils.py,sha256=NTjp1RgH4aSFNEgSslMQGDvE5Ij-XCy_eImcwiMBT-w,71753
+ diffusers/pipelines/pipeline_utils.py,sha256=2P6oTVvZcs33-LoWUQosYkdsl1bEKE3MfnQdhkjubRw,72464
  diffusers/pipelines/alt_diffusion/__init__.py,sha256=rCOBtGQ7xi3DahUXY8r5ICt_t6S0ogp4uDJL9q4avso,1346
  diffusers/pipelines/alt_diffusion/modeling_roberta_series.py,sha256=_UC4IxHAg2QAFtw4yCvo2eLIDBRmg2JvvtOr6k5PFC8,5580
  diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py,sha256=YgUvsnah_cIXNwzJgxE87fftWox7leXOY8lzZeph7c8,40641
@@ -127,7 +127,7 @@ diffusers/pipelines/spectrogram_diffusion/midi_utils.py,sha256=HmOSMSaKZlloW8J6m
  diffusers/pipelines/spectrogram_diffusion/notes_encoder.py,sha256=Yq3W0lkAMGhx5pGklTvomBHjqR1nAVALBcYlzZBSQ90,2921
  diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py,sha256=GAHovdMWMhmGgS02kFOaS7_Lq9AJmTxrBZC0VElHwBQ,8657
  diffusers/pipelines/stable_diffusion/__init__.py,sha256=nBYUiO6TbCsqNfImNCPi1aE-Q35Lc5r9B7qWb9TDjcM,6164
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py,sha256=Y4J394whXeBduWXJlcp04E71rBzD9PcCDPYd3ZETlrw,67209
+ diffusers/pipelines/stable_diffusion/convert_from_ckpt.py,sha256=wYHT2MGLa6LFcmlvtxgDCQ5tqZCqejur2hN-0YL0GsA,69501
  diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py,sha256=sA76ZiTUVOTiCMDss7z3nouqg8czJwBmhX7OPuYheWk,43554
  diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py,sha256=Pbprq5sXlbS6JPP44eOzzm0FrwsccrHoaXuFWY_Kx38,20922
  diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py,sha256=rjxVeyAmMTQiTiQF7Q9y3BYU45jCImKZGx37ir8zpM8,1257
@@ -145,7 +145,7 @@ diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py,sha2
  diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py,sha256=-k3M22p4KXDkUJAnxj4xU9VM_QBBqs_pV0XHlfYzsKk,80921
  diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py,sha256=xOvGZvBBvB1Ee_9B46bKpRK5C9SuxbNZhrQ44nvIYsQ,23137
147
147
  diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py,sha256=XPNN7lINZoezjS2ciifKfIvRXotUbB_dyByaAPOE3Vs,42149
148
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py,sha256=oWUXc2LgfjNFBroXqhu1JaK6FlXI84kKaxg2OtXdi7U,55092
148
+ diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py,sha256=0Q9m9yJ5rED3U1vyjmrHVft9jIqB35KkfDerc_VYJNg,55140
149
149
  diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py,sha256=gzklmzg25O3y48wNubrl_jzKwzZlgiperjyV6OqdvxU,42148
150
150
  diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py,sha256=g8yEk161s1CO8NDVm6ZTMdONtilXDIKQWI6dMnRk6Bs,40919
151
151
  diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py,sha256=UM-T14SgwlIk9is3QDQx7IsTWU-_ZsD1iA4rCO9-3Rk,31679
@@ -156,7 +156,7 @@ diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py,sha25
156
156
  diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py,sha256=cnwmjjTQA5lMd-pK-MhnwLF8bK-pwZu0S3xiHLO2WFk,43302
157
157
  diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py,sha256=_EtteuRKzWruht97EpQO7zrqoyhGyZUUgDFuw04mW5M,62705
158
158
  diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py,sha256=xiBT7l8nha5HLg2MEeOYozUwmNIUZzbW1mjgt6b7JU8,40491
159
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py,sha256=4__1bReiM-LDL1JUEZ4wUb2Pk02W9r1ZPkvj3FQ-A6U,38548
159
+ diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py,sha256=TOJge0fcAt10pg-hJdjhQdMi53pBzBrcx7x6vIGMZVc,38610
160
160
  diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py,sha256=ef5W_IynHqocHCWJcX32MUxRPvShH3spsv0RA_lBlTk,46256
161
161
  diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py,sha256=YSwcDOwL1sE6ItdYm1ZuYb3uZVAf-DzuhtndV_Auqzw,40305
162
162
  diffusers/pipelines/stable_diffusion/safety_checker.py,sha256=zLs3meGi6JiRYlHntPiBEaU9_JjYcZnzrPa5picFiG4,5734
@@ -165,9 +165,9 @@ diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py,sha256=RE
165
165
  diffusers/pipelines/stable_diffusion_safe/__init__.py,sha256=FAuvPLSYCLDzJ1d2GntTwQXpxgABEaoLrj5LdQOtxpA,2502
166
166
  diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py,sha256=yZMsvIdDAhF7maJOpc9UWSIUbWpIghOypzgo4-vAI0A,37886
167
167
  diffusers/pipelines/stable_diffusion_safe/safety_checker.py,sha256=lEXvS-_WCcVpje14hoajJG2Z4jlWs0UsID3IqWTnOys,5049
168
- diffusers/pipelines/stable_diffusion_xl/__init__.py,sha256=ORE7mQPSp_8k1V6Lzc85dJOQCJ5BOTFwzxOMvf7XRS8,1372
169
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py,sha256=Xnr-X09UTvzcz6d6kTxJZ3pyrCVEnH78b6x6H70JOh8,38907
170
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py,sha256=IGHWSxR6QrjG702DPFegBT0ZjjOTC5qh53vSPSHyTso,44219
168
+ diffusers/pipelines/stable_diffusion_xl/__init__.py,sha256=-RFjtUQxnCEPSF0Foq90HWIjyHblHOH0eHoNg1dqj68,953
169
+ diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py,sha256=MTG8Ym65rS3vvALVHWkHHP5cEKMcjpSnYHZuh_lwKcU,41570
170
+ diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py,sha256=USKMxXkFUeDJQr77Ecw1QlJieiet37KdQB53wWqpCc4,46168
171
171
  diffusers/pipelines/stable_diffusion_xl/watermark.py,sha256=22Pg7TXApd4oRBvyJDh5B5L6--Zj7hKaYj8dHSTsGzQ,1142
172
172
  diffusers/pipelines/stochastic_karras_ve/__init__.py,sha256=StxEhuNuCeEY3qv3ZIcBfXsaxDH3JmWeuHx1xCHnYRI,60
173
173
  diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py,sha256=zZn4jZ8iHJSsoMvStG3l4WvQ6wAtKjK0LDjLRQA3PLU,5669
@@ -254,9 +254,9 @@ diffusers/utils/outputs.py,sha256=l5RdKO6SRnnz7fsXsmmnkOyCf_0z35kwfkDbnhCFeAc,36
254
254
  diffusers/utils/pil_utils.py,sha256=F7M3QWYQyRcLNsS8876wgKqOnhzg8hNTPHQy6Q-jYj0,1423
255
255
  diffusers/utils/testing_utils.py,sha256=TiKwlhR4SvEIIkAOrF11qYNg27p_tVp0ifJgEW2mNAk,21197
256
256
  diffusers/utils/torch_utils.py,sha256=4gRMtlH81IrbYh_pfR0ZkDNbuxmVX03fmR6xrDTZIP0,3378
257
- diffusers-0.18.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
258
- diffusers-0.18.0.dist-info/METADATA,sha256=R-HlsAiR3JVN5Q8fP_WphGIxMb69nAaiYq7jycXXAvA,17540
259
- diffusers-0.18.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
260
- diffusers-0.18.0.dist-info/entry_points.txt,sha256=VULXr1th-UU5J0Ou_l0If6E4CY4HSSiMElweZ58u9H0,73
261
- diffusers-0.18.0.dist-info/top_level.txt,sha256=axJl2884vMSvhzrFrSoht36QXA_6gZN9cKtg4xOO72o,10
262
- diffusers-0.18.0.dist-info/RECORD,,
257
+ diffusers-0.18.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
258
+ diffusers-0.18.2.dist-info/METADATA,sha256=iO5QVnb_Ri2SVl5YaXfVn7P6JsyQJ2PiTcQ5aNNubxY,17540
259
+ diffusers-0.18.2.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
260
+ diffusers-0.18.2.dist-info/entry_points.txt,sha256=VULXr1th-UU5J0Ou_l0If6E4CY4HSSiMElweZ58u9H0,73
261
+ diffusers-0.18.2.dist-info/top_level.txt,sha256=axJl2884vMSvhzrFrSoht36QXA_6gZN9cKtg4xOO72o,10
262
+ diffusers-0.18.2.dist-info/RECORD,,
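The RECORD entries above pair each file path with a `sha256=<digest>` token and a byte size. Per the wheel spec (PEP 376/427), the digest is the urlsafe-base64 SHA-256 of the file contents with trailing `=` padding stripped; a small sketch for recomputing it, using an illustrative local path:

```python
import base64
import hashlib
from pathlib import Path


def record_hash(path):
    # RECORD digests: urlsafe-base64 SHA-256 of the file bytes, "=" padding removed.
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


# Illustrative path inside an unpacked 0.18.2 wheel; the value should match
# the entry recorded for diffusers/__init__.py above.
print(record_hash("diffusers/__init__.py"))
```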
{diffusers-0.18.0.dist-info → diffusers-0.18.2.dist-info}/WHEEL RENAMED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.38.4)
2
+ Generator: bdist_wheel (0.40.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5