PyPI - ai-edge-torch-nightly - Versions diffs - 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

Files changed (169) hide show

ai_edge_torch/generative/examples/stable_diffusion/diffusion.py CHANGED Viewed

@@ -13,14 +13,13 @@
 # limitations under the License.
 # ==============================================================================
-import torch
-from torch import nn
 import ai_edge_torch.generative.layers.builder as layers_builder
 import ai_edge_torch.generative.layers.model_config as layers_cfg
-import ai_edge_torch.generative.layers.unet.blocks_2d as blocks_2d
+from ai_edge_torch.generative.layers.unet import blocks_2d
 import ai_edge_torch.generative.layers.unet.model_config as unet_cfg
-import ai_edge_torch.generative.utilities.stable_diffusion_loader as stable_diffusion_loader
+from ai_edge_torch.generative.utilities import stable_diffusion_loader
+import torch
+from torch import nn
 _down_encoder_blocks_tensor_names = [
     stable_diffusion_loader.DownEncoderBlockTensorNames(
@@ -39,9 +38,15 @@ _down_encoder_blocks_tensor_names = [
         ],
         transformer_block_tensor_names=[
             stable_diffusion_loader.TransformerBlockTensorNames(
-                pre_conv_norm=f"model.diffusion_model.input_blocks.{i*3+j+1}.1.norm",
-                conv_in=f"model.diffusion_model.input_blocks.{i*3+j+1}.1.proj_in",
-                conv_out=f"model.diffusion_model.input_blocks.{i*3+j+1}.1.proj_out",
+                pre_conv_norm=(
+                    f"model.diffusion_model.input_blocks.{i*3+j+1}.1.norm"
+                ),
+                conv_in=(
+                    f"model.diffusion_model.input_blocks.{i*3+j+1}.1.proj_in"
+                ),
+                conv_out=(
+                    f"model.diffusion_model.input_blocks.{i*3+j+1}.1.proj_out"
+                ),
                 self_attention=stable_diffusion_loader.AttentionBlockTensorNames(
                     norm=f"model.diffusion_model.input_blocks.{i*3+j+1}.1.transformer_blocks.0.norm1",
                     q_proj=f"model.diffusion_model.input_blocks.{i*3+j+1}.1.transformer_blocks.0.attn1.to_q",
@@ -80,7 +85,9 @@ _mid_block_tensor_names = stable_diffusion_loader.MidBlockTensorNames(
             conv_1=f"model.diffusion_model.middle_block.{i}.in_layers.2",
             norm_2=f"model.diffusion_model.middle_block.{i}.out_layers.0",
             conv_2=f"model.diffusion_model.middle_block.{i}.out_layers.3",
-            time_embedding=f"model.diffusion_model.middle_block.{i}.emb_layers.1",
+            time_embedding=(
+                f"model.diffusion_model.middle_block.{i}.emb_layers.1"
+            ),
         )
         for i in [0, 2]
     ],
@@ -117,8 +124,12 @@ _up_decoder_blocks_tensor_names = [
     stable_diffusion_loader.SkipUpDecoderBlockTensorNames(
         residual_block_tensor_names=[
             stable_diffusion_loader.ResidualBlockTensorNames(
-                norm_1=f"model.diffusion_model.output_blocks.{i*3+j}.0.in_layers.0",
-                conv_1=f"model.diffusion_model.output_blocks.{i*3+j}.0.in_layers.2",
+                norm_1=(
+                    f"model.diffusion_model.output_blocks.{i*3+j}.0.in_layers.0"
+                ),
+                conv_1=(
+                    f"model.diffusion_model.output_blocks.{i*3+j}.0.in_layers.2"
+                ),
                 norm_2=f"model.diffusion_model.output_blocks.{i*3+j}.0.out_layers.0",
                 conv_2=f"model.diffusion_model.output_blocks.{i*3+j}.0.out_layers.3",
                 time_embedding=f"model.diffusion_model.output_blocks.{i*3+j}.0.emb_layers.1",
@@ -128,9 +139,15 @@ _up_decoder_blocks_tensor_names = [
         ],
         transformer_block_tensor_names=[
             stable_diffusion_loader.TransformerBlockTensorNames(
-                pre_conv_norm=f"model.diffusion_model.output_blocks.{i*3+j}.1.norm",
-                conv_in=f"model.diffusion_model.output_blocks.{i*3+j}.1.proj_in",
-                conv_out=f"model.diffusion_model.output_blocks.{i*3+j}.1.proj_out",
+                pre_conv_norm=(
+                    f"model.diffusion_model.output_blocks.{i*3+j}.1.norm"
+                ),
+                conv_in=(
+                    f"model.diffusion_model.output_blocks.{i*3+j}.1.proj_in"
+                ),
+                conv_out=(
+                    f"model.diffusion_model.output_blocks.{i*3+j}.1.proj_out"
+                ),
                 self_attention=stable_diffusion_loader.AttentionBlockTensorNames(
                     norm=f"model.diffusion_model.output_blocks.{i*3+j}.1.transformer_blocks.0.norm1",
                     q_proj=f"model.diffusion_model.output_blocks.{i*3+j}.1.transformer_blocks.0.attn1.to_q",
@@ -157,7 +174,9 @@ _up_decoder_blocks_tensor_names = [
         else None,
         upsample_conv=f"model.diffusion_model.output_blocks.{i*3+2}.2.conv"
         if 0 < i < 3
-        else (f"model.diffusion_model.output_blocks.2.1.conv" if i == 0 else None),
+        else (
+            f"model.diffusion_model.output_blocks.2.1.conv" if i == 0 else None
+        ),
     )
     for i in range(4)
 ]
@@ -176,6 +195,31 @@ TENSOR_NAMES = stable_diffusion_loader.DiffusionModelLoader.TensorNames(
 )
+def build_attention_config(
+    num_heads,
+    dim,
+    num_query_groups,
+    rotary_percentage=0.0,
+    qkv_transpose_before_split=True,
+    qkv_use_bias=False,
+    output_proj_use_bias=True,
+    enable_kv_cache=False,
+    qkv_fused_interleaved=False,
+):
+  return layers_cfg.AttentionConfig(
+      num_heads=num_heads,
+      head_dim=dim // num_heads,
+      num_query_groups=num_query_groups,
+      rotary_percentage=rotary_percentage,
+      qkv_transpose_before_split=qkv_transpose_before_split,
+      qkv_use_bias=qkv_use_bias,
+      output_proj_use_bias=output_proj_use_bias,
+      enable_kv_cache=enable_kv_cache,
+      qkv_fused_interleaved=qkv_fused_interleaved,
+  )
 class TimeEmbedding(nn.Module):
   def __init__(self, in_dim, out_dim):
@@ -248,17 +292,6 @@ class Diffusion(nn.Module):
         config.in_channels, block_out_channels[0], kernel_size=3, padding=1
     )
-    attention_config = layers_cfg.AttentionConfig(
-        num_heads=config.transformer_num_attention_heads,
-        num_query_groups=config.transformer_num_attention_heads,
-        rotary_percentage=0.0,
-        qkv_transpose_before_split=True,
-        qkv_use_bias=False,
-        output_proj_use_bias=True,
-        enable_kv_cache=False,
-        qkv_fused_interleaved=False,
-    )
     # Down encoders.
     down_encoders = []
     output_channel = block_out_channels[0]
@@ -293,14 +326,26 @@ class Diffusion(nn.Module):
                             dim=output_channel,
                             attention_batch_size=config.transformer_batch_size,
                             normalization_config=config.transformer_norm_config,
-                            attention_config=attention_config,
+                            attention_config=build_attention_config(
+                                num_heads=config.transformer_num_attention_heads,
+                                dim=output_channel,
+                                num_query_groups=config.transformer_num_attention_heads,
+                            ),
+                            enable_hlfb=False,
                         ),
                         cross_attention_block_config=unet_cfg.CrossAttentionBlock2DConfig(
                             query_dim=output_channel,
                             cross_dim=config.transformer_cross_attention_dim,
+                            hidden_dim=output_channel,
+                            output_dim=output_channel,
                             attention_batch_size=config.transformer_batch_size,
                             normalization_config=config.transformer_norm_config,
-                            attention_config=attention_config,
+                            attention_config=build_attention_config(
+                                num_heads=config.transformer_num_attention_heads,
+                                dim=output_channel,
+                                num_query_groups=config.transformer_num_attention_heads,
+                            ),
+                            enable_hlfb=False,
                         ),
                         pre_conv_normalization_config=config.transformer_pre_conv_norm_config,
                         feed_forward_block_config=unet_cfg.FeedForwardBlock2DConfig(
@@ -353,14 +398,26 @@ class Diffusion(nn.Module):
                     dim=mid_block_channels,
                     attention_batch_size=config.transformer_batch_size,
                     normalization_config=config.transformer_norm_config,
-                    attention_config=attention_config,
+                    attention_config=build_attention_config(
+                        num_heads=config.transformer_num_attention_heads,
+                        dim=mid_block_channels,
+                        num_query_groups=config.transformer_num_attention_heads,
+                    ),
+                    enable_hlfb=False,
                 ),
                 cross_attention_block_config=unet_cfg.CrossAttentionBlock2DConfig(
                     query_dim=mid_block_channels,
                     cross_dim=config.transformer_cross_attention_dim,
+                    hidden_dim=mid_block_channels,
+                    output_dim=mid_block_channels,
                     attention_batch_size=config.transformer_batch_size,
                     normalization_config=config.transformer_norm_config,
-                    attention_config=attention_config,
+                    attention_config=build_attention_config(
+                        num_heads=config.transformer_num_attention_heads,
+                        dim=mid_block_channels,
+                        num_query_groups=config.transformer_num_attention_heads,
+                    ),
+                    enable_hlfb=False,
                 ),
                 pre_conv_normalization_config=config.transformer_pre_conv_norm_config,
                 feed_forward_block_config=unet_cfg.FeedForwardBlock2DConfig(
@@ -414,14 +471,26 @@ class Diffusion(nn.Module):
                             dim=output_channel,
                             attention_batch_size=config.transformer_batch_size,
                             normalization_config=config.transformer_norm_config,
-                            attention_config=attention_config,
+                            attention_config=build_attention_config(
+                                num_heads=config.transformer_num_attention_heads,
+                                dim=output_channel,
+                                num_query_groups=config.transformer_num_attention_heads,
+                            ),
+                            enable_hlfb=False,
                         ),
                         cross_attention_block_config=unet_cfg.CrossAttentionBlock2DConfig(
                             query_dim=output_channel,
                             cross_dim=config.transformer_cross_attention_dim,
+                            hidden_dim=output_channel,
+                            output_dim=output_channel,
                             attention_batch_size=config.transformer_batch_size,
                             normalization_config=config.transformer_norm_config,
-                            attention_config=attention_config,
+                            attention_config=build_attention_config(
+                                num_heads=config.transformer_num_attention_heads,
+                                dim=output_channel,
+                                num_query_groups=config.transformer_num_attention_heads,
+                            ),
+                            enable_hlfb=False,
                         ),
                         pre_conv_normalization_config=config.transformer_pre_conv_norm_config,
                         feed_forward_block_config=unet_cfg.FeedForwardBlock2DConfig(
@@ -469,7 +538,10 @@ class Diffusion(nn.Module):
         layers_cfg.ActivationConfig(config.final_activation_type)
     )
     self.conv_out = nn.Conv2d(
-        reversed_block_out_channels[-1], config.out_channels, kernel_size=3, padding=1
+        reversed_block_out_channels[-1],
+        config.out_channels,
+        kernel_size=3,
+        padding=1,
     )
   @torch.inference_mode
@@ -490,12 +562,15 @@ class Diffusion(nn.Module):
     x = self.conv_in(latents)
     skip_connection_tensors = [x]
     for encoder in self.down_encoders:
-      x, hidden_states = encoder(x, time_emb, context, output_hidden_states=True)
+      x, hidden_states = encoder(
+          x, time_emb, context, output_hidden_states=True
+      )
       skip_connection_tensors.extend(hidden_states)
     x = self.mid_block(x, time_emb, context)
     for decoder in self.up_decoders:
       encoder_tensors = [
-          skip_connection_tensors.pop() for i in range(self.config.layers_per_block + 1)
+          skip_connection_tensors.pop()
+          for i in range(self.config.layers_per_block + 1)
       ]
       x = decoder(x, encoder_tensors, time_emb, context)
     x = self.final_norm(x)
@@ -512,7 +587,6 @@ def get_model_config(batch_size: int) -> unet_cfg.DiffusionModelConfig:
   Returns:
     The configuration of diffusion model of Stable Diffusion v1.5.
   """
   in_channels = 4
   out_channels = 4
@@ -529,7 +603,7 @@ def get_model_config(batch_size: int) -> unet_cfg.DiffusionModelConfig:
   # Transformer configs.
   transformer_num_attention_heads = 8
   transformer_batch_size = batch_size
-  transformer_cross_attention_dim = 768  # Embedding fomr CLIP model
+  transformer_cross_attention_dim = 768  # Embedding from CLIP model
   transformer_pre_conv_norm_config = layers_cfg.NormalizationConfig(
       layers_cfg.NormalizationType.GROUP_NORM, epsilon=1e-6, group_num=32
   )
@@ -571,3 +645,71 @@ def get_model_config(batch_size: int) -> unet_cfg.DiffusionModelConfig:
       final_norm_config=final_norm_config,
       final_activation_type=final_activation_type,
   )
+def get_fake_model_config(batch_size: int) -> unet_cfg.DiffusionModelConfig:
+  """Get fake configs for the Diffusion model of Stable Diffusion v1.5 for testing.
+  Args:
+    batch_size (int): the batch size of input.
+  Returns:
+    The configuration of diffusion model of Stable Diffusion v1.5.
+  """
+  in_channels = 4
+  out_channels = 4
+  block_out_channels = [2, 4, 8, 8]
+  layers_per_block = 1
+  downsample_padding = 1
+  # Residual configs.
+  residual_norm_config = layers_cfg.NormalizationConfig(
+      layers_cfg.NormalizationType.GROUP_NORM, group_num=2
+  )
+  residual_activation_type = layers_cfg.ActivationType.SILU
+  # Transformer configs.
+  transformer_num_attention_heads = 1
+  transformer_batch_size = batch_size
+  transformer_cross_attention_dim = 4  # Embedding from CLIP model
+  transformer_pre_conv_norm_config = layers_cfg.NormalizationConfig(
+      layers_cfg.NormalizationType.GROUP_NORM, epsilon=1e-6, group_num=2
+  )
+  transformer_norm_config = layers_cfg.NormalizationConfig(
+      layers_cfg.NormalizationType.LAYER_NORM
+  )
+  transformer_ff_activation_type = layers_cfg.ActivationType.GE_GLU
+  # Time embedding configs.
+  time_embedding_dim = 2
+  time_embedding_blocks_dim = 4
+  # Mid block configs.
+  mid_block_layers = 1
+  # Final layer configs.
+  final_norm_config = layers_cfg.NormalizationConfig(
+      layers_cfg.NormalizationType.GROUP_NORM, group_num=2
+  )
+  final_activation_type = layers_cfg.ActivationType.SILU
+  return unet_cfg.DiffusionModelConfig(
+      in_channels=in_channels,
+      out_channels=out_channels,
+      block_out_channels=block_out_channels,
+      layers_per_block=layers_per_block,
+      downsample_padding=downsample_padding,
+      residual_norm_config=residual_norm_config,
+      residual_activation_type=residual_activation_type,
+      transformer_batch_size=transformer_batch_size,
+      transformer_num_attention_heads=transformer_num_attention_heads,
+      transformer_cross_attention_dim=transformer_cross_attention_dim,
+      transformer_pre_conv_norm_config=transformer_pre_conv_norm_config,
+      transformer_norm_config=transformer_norm_config,
+      transformer_ff_activation_type=transformer_ff_activation_type,
+      mid_block_layers=mid_block_layers,
+      time_embedding_dim=time_embedding_dim,
+      time_embedding_blocks_dim=time_embedding_blocks_dim,
+      final_norm_config=final_norm_config,
+      final_activation_type=final_activation_type,
+  )

ai_edge_torch/generative/examples/stable_diffusion/encoder.py CHANGED Viewed

@@ -13,12 +13,11 @@
 # limitations under the License.
 # ==============================================================================
+from ai_edge_torch.generative.examples.stable_diffusion.attention import SelfAttention  # NOQA
 import torch
 from torch import nn
 from torch.nn import functional as F
-from ai_edge_torch.generative.examples.stable_diffusion.attention import SelfAttention  # NOQA
 class AttentionBlock(nn.Module):
@@ -50,7 +49,9 @@ class ResidualBlock(nn.Module):
     self.conv_1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
     self.groupnorm_2 = nn.GroupNorm(32, out_channels)
-    self.conv_2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
+    self.conv_2 = nn.Conv2d(
+        out_channels, out_channels, kernel_size=3, padding=1
+    )
     if in_channels == out_channels:
       self.residual_layer = nn.Identity()

ai_edge_torch/generative/examples/stable_diffusion/pipeline.py CHANGED Viewed

@@ -15,33 +15,44 @@
 import argparse
 import os
-from pathlib import Path
-from typing import Dict, Optional
+import pathlib
+from typing import Optional
+import ai_edge_torch
+from ai_edge_torch.generative.examples.stable_diffusion import samplers
+from ai_edge_torch.generative.examples.stable_diffusion import tokenizer
+from ai_edge_torch.generative.examples.stable_diffusion import util
 import numpy as np
 from PIL import Image
-from tqdm import tqdm
-import ai_edge_torch.generative.examples.stable_diffusion.samplers as samplers
-from ai_edge_torch.generative.examples.stable_diffusion.tokenizer import Tokenizer  # NOQA
-import ai_edge_torch.generative.examples.stable_diffusion.util as util
-from ai_edge_torch.model import TfLiteModel
+import tqdm
 arg_parser = argparse.ArgumentParser()
 arg_parser.add_argument(
     '--tokenizer_vocab_dir',
     type=str,
-    help='Directory to the tokenizer vocabulary files, which include `merges.txt` and `vocab.json`',
+    help=(
+        'Directory to the tokenizer vocabulary files, which include'
+        ' `merges.txt` and `vocab.json`'
+    ),
     required=True,
 )
 arg_parser.add_argument(
-    '--clip_ckpt', type=str, help='Path to CLIP TFLite tflite file', required=True
+    '--clip_ckpt',
+    type=str,
+    help='Path to CLIP TFLite tflite file',
+    required=True,
 )
 arg_parser.add_argument(
-    '--diffusion_ckpt', type=str, help='Path to diffusion tflite file', required=True
+    '--diffusion_ckpt',
+    type=str,
+    help='Path to diffusion tflite file',
+    required=True,
 )
 arg_parser.add_argument(
-    '--decoder_ckpt', type=str, help='Path to decoder tflite file', required=True
+    '--decoder_ckpt',
+    type=str,
+    help='Path to decoder tflite file',
+    required=True,
 )
 arg_parser.add_argument(
     '--output_path',
@@ -56,14 +67,29 @@ arg_parser.add_argument(
     help='The prompt to guide the image generation.',
 )
 arg_parser.add_argument(
-    '--n_inference_steps', default=20, type=int, help='The number of denoising steps.'
+    '--n_inference_steps',
+    default=20,
+    type=int,
+    help='The number of denoising steps.',
 )
 arg_parser.add_argument(
     '--sampler',
     default='k_euler',
     type=str,
     choices=['k_euler', 'k_euler_ancestral', 'k_lms'],
-    help='A sampler to be used to denoise the encoded image latents. Can be one of `k_lms, `k_euler`, or `k_euler_ancestral`.',
+    help=(
+        'A sampler to be used to denoise the encoded image latents. Can be one'
+        ' of `k_lms, `k_euler`, or `k_euler_ancestral`.'
+    ),
+)
+arg_parser.add_argument(
+    '--seed',
+    default=None,
+    type=int,
+    help=(
+        'A seed to make generation deterministic. A random number is used if'
+        ' unspecified.'
+    ),
 )
@@ -78,12 +104,12 @@ class StableDiffusion:
       diffusion_ckpt: str,
       decoder_ckpt: str
   ):
-    self.tokenizer = Tokenizer(tokenizer_vocab_dir)
-    self.clip = TfLiteModel.load(clip_ckpt)
-    self.decoder = TfLiteModel.load(decoder_ckpt)
-    self.diffusion = TfLiteModel.load(diffusion_ckpt)
+    self.tokenizer = tokenizer.Tokenizer(tokenizer_vocab_dir)
+    self.clip = ai_edge_torch.model.TfLiteModel.load(clip_ckpt)
+    self.decoder = ai_edge_torch.model.TfLiteModel.load(decoder_ckpt)
+    self.diffusion = ai_edge_torch.model.TfLiteModel.load(diffusion_ckpt)
     if encoder_ckpt is not None:
-      self.encoder = TfLiteModel.load(encoder_ckpt)
+      self.encoder = ai_edge_torch.model.TfLiteModel.load(encoder_ckpt)
 def run_tflite_pipeline(
@@ -101,38 +127,33 @@ def run_tflite_pipeline(
     input_image: Optional[Image.Image] = None,
 ):
   """Run stable diffusion pipeline with tflite model.
-  model:
-    StableDiffsuion model.
-  prompt:
-    The prompt to guide the image generation.
-  output_path:
-    The path to the generated output image.
-  uncond_prompt:
-    The prompt not to guide the image generation.
-  cfg_scale:
-    Guidance scale of classifier-free guidance. Higher guidance scale encourages to generate
-    images that are closely linked to the text `prompt`, usually at the expense of lower
-    image quality.
-  height:
-    The height in pixels of the generated image.
-  width:
-    The width in pixels of the generated image.
-  sampler:
-    A sampler to be used to denoise the encoded image latents. Can be one of `k_lms, `k_euler`,
-    or `k_euler_ancestral`.
-  n_inference_steps:
-    The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-    expense of slower inference. This parameter will be modulated by `strength`.
-  seed:
-    A seed to make generation deterministic.
-  strength:
-    Conceptually, indicates how much to transform the reference `input_image`. Must be between 0 and 1.
-    `input_image` will be used as a starting point, adding more noise to it the larger the `strength`.
-    The number of denoising steps depends on the amount of noise initially added. When `strength` is 1,
-    added noise will be maximum and the denoising process will run for the full number of iterations
-    specified in `n_inference_steps`. A value of 1, therefore, essentially ignores `input_image`.
-  input_image:
-    Image which is served as the starting point for the image generation.
+  Args:
+    model: StableDiffusion model.
+    prompt: The prompt to guide the image generation.
+    output_path: The path to the generated output image.
+    uncond_prompt: The prompt not to guide the image generation.
+    cfg_scale: Guidance scale of classifier-free guidance. Higher guidance scale
+      encourages to generate images that are closely linked to the text
+      `prompt`, usually at the expense of lower image quality.
+    height: The height in pixels of the generated image.
+    width: The width in pixels of the generated image.
+    sampler: A sampler to be used to denoise the encoded image latents. Can be
+      one of `k_lms`, `k_euler`, or `k_euler_ancestral`.
+    n_inference_steps: The number of denoising steps. More denoising steps
+      usually lead to a higher quality image at the expense of slower inference.
+      This parameter will be modulated by `strength`.
+    seed: A seed to make generation deterministic.
+    strength: Conceptually, indicates how much to transform the reference
+      `input_image`. Must be between 0 and 1. `input_image` will be used as a
+      starting point, adding more noise to it the larger the `strength`. The
+      number of denoising steps depends on the amount of noise initially added.
+      When `strength` is 1, added noise will be maximum and the denoising
+      process will run for the full number of iterations specified in
+      `n_inference_steps`. A value of 1, therefore, essentially ignores
+      `input_image`.
+    input_image: Image which is served as the starting point for the image
+      generation.
   """
   if not 0 < strength < 1:
     raise ValueError('strength must be between 0 and 1')
@@ -148,7 +169,9 @@ def run_tflite_pipeline(
   elif sampler == 'k_euler':
     sampler = samplers.KEulerSampler(n_inference_steps=n_inference_steps)
   elif sampler == 'k_euler_ancestral':
-    sampler = samplers.KEulerAncestralSampler(n_inference_steps=n_inference_steps)
+    sampler = samplers.KEulerAncestralSampler(
+        n_inference_steps=n_inference_steps
+    )
   else:
     raise ValueError(
         'Unknown sampler value %s. '
@@ -163,14 +186,15 @@ def run_tflite_pipeline(
   context = np.concatenate([cond_context, uncond_context], axis=0)
   noise_shape = (1, 4, height // 8, width // 8)
-  # Initialization starts from input_image if any, otherwise, starts from a random sampling.
+  # Initialization starts from input_image if any, otherwise, starts from a
+  # random sampling.
   if input_image:
     if not hasattr(model, 'encoder'):
       raise AttributeError(
-          'Stable Diffusion must be initialized with encoder to accept input_image.'
+          'Stable Diffusion must be initialized with encoder to accept'
+          ' input_image.'
       )
     input_image = input_image.resize((width, height))
-    input_image_np = np.array(input_image).astype(np.float32)
     input_image_np = util.rescale(input_image, (0, 255), (-1, 1))
     input_image_np = util.move_channel(input_image_np, to='first')
     encoder_noise = np.random.normal(size=noise_shape).astype(np.float32)
@@ -183,8 +207,8 @@ def run_tflite_pipeline(
     latents *= sampler.initial_scale
   # Diffusion process.
-  timesteps = tqdm(sampler.timesteps)
-  for i, timestep in enumerate(timesteps):
+  timesteps = tqdm.tqdm(sampler.timesteps)
+  for _, timestep in enumerate(timesteps):
     time_embedding = util.get_time_embedding(timestep)
     input_latents = latents * sampler.get_input_scale()
@@ -202,7 +226,7 @@ def run_tflite_pipeline(
   images = util.rescale(images, (-1, 1), (0, 255), clamp=True)
   images = util.move_channel(images, to='last')
   if not os.path.exists(output_path):
-    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
   Image.fromarray(images[0].astype(np.uint8)).save(output_path)
@@ -219,4 +243,5 @@ if __name__ == '__main__':
       output_path=args.output_path,
       sampler=args.sampler,
       n_inference_steps=args.n_inference_steps,
+      seed=args.seed,
   )

ai_edge_torch/generative/examples/stable_diffusion/samplers/k_euler.py CHANGED Viewed

@@ -13,10 +13,9 @@
 # limitations under the License.
 # ==============================================================================
-import numpy as np
 from ai_edge_torch.generative.examples.stable_diffusion import util
 from ai_edge_torch.generative.examples.stable_diffusion.samplers.sampler import SamplerInterface  # NOQA
+import numpy as np
 class KEulerSampler(SamplerInterface):
@@ -46,7 +45,9 @@ class KEulerSampler(SamplerInterface):
   def set_strength(self, strength=1):
     start_step = self.n_inference_steps - int(self.n_inference_steps * strength)
-    self.timesteps = np.linspace(self.n_training_steps - 1, 0, self.n_inference_steps)
+    self.timesteps = np.linspace(
+        self.n_training_steps - 1, 0, self.n_inference_steps
+    )
     self.timesteps = self.timesteps[start_step:]
     self.initial_scale = self.sigmas[start_step]
     self.step_count = start_step

ai_edge_torch/generative/examples/stable_diffusion/samplers/k_euler_ancestral.py CHANGED Viewed

@@ -13,10 +13,9 @@
 # limitations under the License.
 # ==============================================================================
-import numpy as np
 from ai_edge_torch.generative.examples.stable_diffusion import util
 from ai_edge_torch.generative.examples.stable_diffusion.samplers.sampler import SamplerInterface  # NOQA
+import numpy as np
 class KEulerAncestralSampler(SamplerInterface):
@@ -46,7 +45,9 @@ class KEulerAncestralSampler(SamplerInterface):
   def set_strength(self, strength=1):
     start_step = self.n_inference_steps - int(self.n_inference_steps * strength)
-    self.timesteps = np.linspace(self.n_training_steps - 1, 0, self.n_inference_steps)
+    self.timesteps = np.linspace(
+        self.n_training_steps - 1, 0, self.n_inference_steps
+    )
     self.timesteps = self.timesteps[start_step:]
     self.initial_scale = self.sigmas[start_step]
     self.step_count = start_step

ai_edge_torch/generative/examples/stable_diffusion/samplers/k_lms.py CHANGED Viewed

@@ -13,10 +13,9 @@
 # limitations under the License.
 # ==============================================================================
-import numpy as np
 from ai_edge_torch.generative.examples.stable_diffusion import util
 from ai_edge_torch.generative.examples.stable_diffusion.samplers.sampler import SamplerInterface  # NOQA
+import numpy as np
 class KLMSSampler(SamplerInterface):
@@ -48,7 +47,9 @@ class KLMSSampler(SamplerInterface):
   def set_strength(self, strength=1):
     start_step = self.n_inference_steps - int(self.n_inference_steps * strength)
-    self.timesteps = np.linspace(self.n_training_steps - 1, 0, self.n_inference_steps)
+    self.timesteps = np.linspace(
+        self.n_training_steps - 1, 0, self.n_inference_steps
+    )
     self.timesteps = self.timesteps[start_step:]
     self.initial_scale = self.sigmas[start_step]
     self.step_count = start_step

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl