PyPI - ai-edge-torch-nightly - Versions diffs - 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

Files changed (169) hide show

ai_edge_torch/generative/utilities/stable_diffusion_loader.py CHANGED Viewed

@@ -16,11 +16,10 @@
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
-import torch
 import ai_edge_torch.generative.layers.model_config as layers_config
 import ai_edge_torch.generative.layers.unet.model_config as unet_config
 import ai_edge_torch.generative.utilities.loader as loader
+import torch
 @dataclass
@@ -80,27 +79,35 @@ class TransformerBlockTensorNames:
 class MidBlockTensorNames:
   residual_block_tensor_names: List[ResidualBlockTensorNames]
   attention_block_tensor_names: Optional[List[AttentionBlockTensorNames]] = None
-  transformer_block_tensor_names: Optional[List[TransformerBlockTensorNames]] = None
+  transformer_block_tensor_names: Optional[
+      List[TransformerBlockTensorNames]
+  ] = None
 @dataclass
 class DownEncoderBlockTensorNames:
   residual_block_tensor_names: List[ResidualBlockTensorNames]
-  transformer_block_tensor_names: Optional[List[TransformerBlockTensorNames]] = None
+  transformer_block_tensor_names: Optional[
+      List[TransformerBlockTensorNames]
+  ] = None
   downsample_conv: str = None
 @dataclass
 class UpDecoderBlockTensorNames:
   residual_block_tensor_names: List[ResidualBlockTensorNames]
-  transformer_block_tensor_names: Optional[List[TransformerBlockTensorNames]] = None
+  transformer_block_tensor_names: Optional[
+      List[TransformerBlockTensorNames]
+  ] = None
   upsample_conv: str = None
 @dataclass
 class SkipUpDecoderBlockTensorNames:
   residual_block_tensor_names: List[ResidualBlockTensorNames]
-  transformer_block_tensor_names: Optional[List[TransformerBlockTensorNames]] = None
+  transformer_block_tensor_names: Optional[
+      List[TransformerBlockTensorNames]
+  ] = None
   upsample_conv: str = None
@@ -119,7 +126,9 @@ def _map_to_converted_state(
         converted_state[f"{converted_state_param}.weight"]
     )
   if f"{state_param}.bias" in state:
-    converted_state[f"{converted_state_param}.bias"] = state.pop(f"{state_param}.bias")
+    converted_state[f"{converted_state_param}.bias"] = state.pop(
+        f"{state_param}.bias"
+    )
     if squeeze_dims:
       converted_state[f"{converted_state_param}.bias"] = torch.squeeze(
           converted_state[f"{converted_state_param}.bias"]
@@ -220,25 +229,41 @@ class BaseLoader(loader.ModelLoader):
           f"{attention_layer_prefix}.v_projection",
           squeeze_dims=True,
       )
-      converted_state[f"{attention_layer_prefix}.qkv_projection.weight"] = torch.concat(
-          [
-              converted_state[f"{attention_layer_prefix}.q_projection.weight"],
-              converted_state[f"{attention_layer_prefix}.k_projection.weight"],
-              converted_state[f"{attention_layer_prefix}.v_projection.weight"],
-          ],
-          axis=0,
+      converted_state[f"{attention_layer_prefix}.qkv_projection.weight"] = (
+          torch.concat(
+              [
+                  converted_state[
+                      f"{attention_layer_prefix}.q_projection.weight"
+                  ],
+                  converted_state[
+                      f"{attention_layer_prefix}.k_projection.weight"
+                  ],
+                  converted_state[
+                      f"{attention_layer_prefix}.v_projection.weight"
+                  ],
+              ],
+              axis=0,
+          )
       )
       del converted_state[f"{attention_layer_prefix}.q_projection.weight"]
       del converted_state[f"{attention_layer_prefix}.k_projection.weight"]
       del converted_state[f"{attention_layer_prefix}.v_projection.weight"]
       if config.attention_config.qkv_use_bias:
-        converted_state[f"{attention_layer_prefix}.qkv_projection.bias"] = torch.concat(
-            [
-                converted_state[f"{attention_layer_prefix}.q_projection.bias"],
-                converted_state[f"{attention_layer_prefix}.k_projection.bias"],
-                converted_state[f"{attention_layer_prefix}.v_projection.bias"],
-            ],
-            axis=0,
+        converted_state[f"{attention_layer_prefix}.qkv_projection.bias"] = (
+            torch.concat(
+                [
+                    converted_state[
+                        f"{attention_layer_prefix}.q_projection.bias"
+                    ],
+                    converted_state[
+                        f"{attention_layer_prefix}.k_projection.bias"
+                    ],
+                    converted_state[
+                        f"{attention_layer_prefix}.v_projection.bias"
+                    ],
+                ],
+                axis=0,
+            )
         )
         del converted_state[f"{attention_layer_prefix}.q_projection.bias"]
         del converted_state[f"{attention_layer_prefix}.k_projection.bias"]
@@ -316,11 +341,17 @@ class BaseLoader(loader.ModelLoader):
       )
     else:
       _map_to_converted_state(
-          state, tensor_names.w1, converted_state, f"{converted_state_param_prefix}.w1"
+          state,
+          tensor_names.w1,
+          converted_state,
+          f"{converted_state_param_prefix}.w1",
       )
     _map_to_converted_state(
-        state, tensor_names.w2, converted_state, f"{converted_state_param_prefix}.w2"
+        state,
+        tensor_names.w2,
+        converted_state,
+        f"{converted_state_param_prefix}.w2",
     )
   def _map_transformer_block(
@@ -381,6 +412,7 @@ class BaseLoader(loader.ModelLoader):
   ):
     residual_block_config = unet_config.ResidualBlock2DConfig(
         in_channels=config.in_channels,
+        hidden_channels=config.in_channels,
         out_channels=config.in_channels,
         time_embedding_channels=config.time_embedding_channels,
         normalization_config=config.normalization_config,
@@ -435,6 +467,7 @@ class BaseLoader(loader.ModelLoader):
           f"{converted_state_param_prefix}.resnets.{i}",
           unet_config.ResidualBlock2DConfig(
               in_channels=input_channels,
+              hidden_channels=config.out_channels,
               out_channels=config.out_channels,
               time_embedding_channels=config.time_embedding_channels,
               normalization_config=config.normalization_config,
@@ -477,6 +510,7 @@ class BaseLoader(loader.ModelLoader):
           f"{converted_state_param_prefix}.resnets.{i}",
           unet_config.ResidualBlock2DConfig(
               in_channels=input_channels,
+              hidden_channels=config.out_channels,
               out_channels=config.out_channels,
               time_embedding_channels=config.time_embedding_channels,
               normalization_config=config.normalization_config,
@@ -509,9 +543,13 @@ class BaseLoader(loader.ModelLoader):
   ):
     for i in range(config.num_layers):
       res_skip_channels = (
-          config.in_channels if (i == config.num_layers - 1) else config.out_channels
+          config.in_channels
+          if (i == config.num_layers - 1)
+          else config.out_channels
+      )
+      resnet_in_channels = (
+          config.prev_out_channels if i == 0 else config.out_channels
       )
-      resnet_in_channels = config.prev_out_channels if i == 0 else config.out_channels
       self._map_residual_block(
           state,
           converted_state,
@@ -519,6 +557,7 @@ class BaseLoader(loader.ModelLoader):
           f"{converted_state_param_prefix}.resnets.{i}",
           unet_config.ResidualBlock2DConfig(
               in_channels=resnet_in_channels + res_skip_channels,
+              hidden_channels=config.out_channels,
               out_channels=config.out_channels,
               time_embedding_channels=config.time_embedding_channels,
               normalization_config=config.normalization_config,
@@ -559,11 +598,13 @@ class AutoEncoderModelLoader(BaseLoader):
     up_decoder_blocks_tensor_names: List[UpDecoderBlockTensorNames] = None
   def __init__(self, file_name: str, names: TensorNames):
-    """AutoEncoderModelLoader constructor. Can be used to load encoder and decoder models.
+    """AutoEncoderModelLoader constructor.
+    Can be used to load encoder and decoder models.
     Args:
-        file_name (str): Path to the checkpoint. Can be a directory or an
-          exact file.
+        file_name (str): Path to the checkpoint. Can be a directory or an exact
+          file.
         names (TensorNames): An instance of `TensorNames` to determine mappings.
     """
     self._file_name = file_name
@@ -582,7 +623,8 @@ class AutoEncoderModelLoader(BaseLoader):
     Returns:
         missing_keys (List[str]): a list of str containing the missing keys.
-        unexpected_keys (List[str]): a list of str containing the unexpected keys.
+        unexpected_keys (List[str]): a list of str containing the unexpected
+        keys.
     Raises:
         ValueError: If conversion results in unmapped tensors and strict mode is
@@ -599,9 +641,13 @@ class AutoEncoderModelLoader(BaseLoader):
           state, self._names.post_quant_conv, converted_state, "post_quant_conv"
       )
     if self._names.conv_in is not None:
-      _map_to_converted_state(state, self._names.conv_in, converted_state, "conv_in")
+      _map_to_converted_state(
+          state, self._names.conv_in, converted_state, "conv_in"
+      )
     if self._names.conv_out is not None:
-      _map_to_converted_state(state, self._names.conv_out, converted_state, "conv_out")
+      _map_to_converted_state(
+          state, self._names.conv_out, converted_state, "conv_out"
+      )
     if self._names.final_norm is not None:
       _map_to_converted_state(
           state, self._names.final_norm, converted_state, "final_norm"
@@ -614,7 +660,9 @@ class AutoEncoderModelLoader(BaseLoader):
         model.config.mid_block_config,
     )
-    reversed_block_out_channels = list(reversed(model.config.block_out_channels))
+    reversed_block_out_channels = list(
+        reversed(model.config.block_out_channels)
+    )
     block_out_channels = reversed_block_out_channels[0]
     for i, out_channels in enumerate(reversed_block_out_channels):
       prev_output_channel = block_out_channels
@@ -642,6 +690,31 @@ class AutoEncoderModelLoader(BaseLoader):
     return model.load_state_dict(converted_state, strict=strict)
+def build_attention_config(
+    num_heads,
+    dim,
+    num_query_groups,
+    rotary_percentage=0.0,
+    qkv_transpose_before_split=True,
+    qkv_use_bias=False,
+    output_proj_use_bias=True,
+    enable_kv_cache=False,
+    qkv_fused_interleaved=False,
+):
+  return layers_config.AttentionConfig(
+      num_heads=num_heads,
+      head_dim=dim // num_heads,
+      num_query_groups=num_query_groups,
+      rotary_percentage=rotary_percentage,
+      qkv_transpose_before_split=qkv_transpose_before_split,
+      qkv_use_bias=qkv_use_bias,
+      output_proj_use_bias=output_proj_use_bias,
+      enable_kv_cache=enable_kv_cache,
+      qkv_fused_interleaved=qkv_fused_interleaved,
+  )
 class DiffusionModelLoader(BaseLoader):
   @dataclass
@@ -655,11 +728,13 @@ class DiffusionModelLoader(BaseLoader):
     up_decoder_blocks_tensor_names: List[UpDecoderBlockTensorNames] = None
   def __init__(self, file_name: str, names: TensorNames):
-    """DiffusionModelLoader constructor. Can be used to load diffusion models of Stable Diffusion.
+    """DiffusionModelLoader constructor.
+    Can be used to load diffusion models of Stable Diffusion.
     Args:
-        file_name (str): Path to the checkpoint. Can be a directory or an
-          exact file.
+        file_name (str): Path to the checkpoint. Can be a directory or an exact
+          file.
         names (TensorNames): An instance of `TensorNames` to determine mappings.
     """
     self._file_name = file_name
@@ -678,7 +753,8 @@ class DiffusionModelLoader(BaseLoader):
     Returns:
         missing_keys (List[str]): a list of str containing the missing keys.
-        unexpected_keys (List[str]): a list of str containing the unexpected keys.
+        unexpected_keys (List[str]): a list of str containing the unexpected
+        keys.
     Raises:
         ValueError: If conversion results in unmapped tensors and strict mode is
@@ -690,20 +766,14 @@ class DiffusionModelLoader(BaseLoader):
     self._map_time_embedding(
         state, converted_state, "time_embedding", self._names.time_embedding
     )
-    _map_to_converted_state(state, self._names.conv_in, converted_state, "conv_in")
-    _map_to_converted_state(state, self._names.conv_out, converted_state, "conv_out")
     _map_to_converted_state(
-        state, self._names.final_norm, converted_state, "final_norm"
+        state, self._names.conv_in, converted_state, "conv_in"
     )
-    attention_config = layers_config.AttentionConfig(
-        num_heads=config.transformer_num_attention_heads,
-        num_query_groups=config.transformer_num_attention_heads,
-        rotary_percentage=0.0,
-        qkv_transpose_before_split=True,
-        qkv_use_bias=False,
-        output_proj_use_bias=True,
-        enable_kv_cache=False,
+    _map_to_converted_state(
+        state, self._names.conv_out, converted_state, "conv_out"
+    )
+    _map_to_converted_state(
+        state, self._names.final_norm, converted_state, "final_norm"
     )
     # Map down_encoders.
@@ -736,13 +806,23 @@ class DiffusionModelLoader(BaseLoader):
                 attention_block_config=unet_config.AttentionBlock2DConfig(
                     dim=output_channel,
                     normalization_config=config.transformer_norm_config,
-                    attention_config=attention_config,
+                    attention_config=build_attention_config(
+                        num_heads=config.transformer_num_attention_heads,
+                        dim=output_channel,
+                        num_query_groups=config.transformer_num_attention_heads,
+                    ),
                 ),
                 cross_attention_block_config=unet_config.CrossAttentionBlock2DConfig(
                     query_dim=output_channel,
                     cross_dim=config.transformer_cross_attention_dim,
+                    hidden_dim=output_channel,
+                    output_dim=output_channel,
                     normalization_config=config.transformer_norm_config,
-                    attention_config=attention_config,
+                    attention_config=build_attention_config(
+                        num_heads=config.transformer_num_attention_heads,
+                        dim=output_channel,
+                        num_query_groups=config.transformer_num_attention_heads,
+                    ),
                 ),
                 pre_conv_normalization_config=config.transformer_pre_conv_norm_config,
                 feed_forward_block_config=unet_config.FeedForwardBlock2DConfig(
@@ -794,13 +874,23 @@ class DiffusionModelLoader(BaseLoader):
             attention_block_config=unet_config.AttentionBlock2DConfig(
                 dim=mid_block_channels,
                 normalization_config=config.transformer_norm_config,
-                attention_config=attention_config,
+                attention_config=build_attention_config(
+                    num_heads=config.transformer_num_attention_heads,
+                    dim=mid_block_channels,
+                    num_query_groups=config.transformer_num_attention_heads,
+                ),
             ),
             cross_attention_block_config=unet_config.CrossAttentionBlock2DConfig(
                 query_dim=mid_block_channels,
                 cross_dim=config.transformer_cross_attention_dim,
+                hidden_dim=mid_block_channels,
+                output_dim=mid_block_channels,
                 normalization_config=config.transformer_norm_config,
-                attention_config=attention_config,
+                attention_config=build_attention_config(
+                    num_heads=config.transformer_num_attention_heads,
+                    dim=mid_block_channels,
+                    num_query_groups=config.transformer_num_attention_heads,
+                ),
             ),
             pre_conv_normalization_config=config.transformer_pre_conv_norm_config,
             feed_forward_block_config=unet_config.FeedForwardBlock2DConfig(
@@ -825,7 +915,9 @@ class DiffusionModelLoader(BaseLoader):
     )
     # Map up_decoders.
-    reversed_block_out_channels = list(reversed(model.config.block_out_channels))
+    reversed_block_out_channels = list(
+        reversed(model.config.block_out_channels)
+    )
     up_decoder_layers_per_block = config.layers_per_block + 1
     output_channel = reversed_block_out_channels[0]
     for i, block_out_channel in enumerate(reversed_block_out_channels):
@@ -857,13 +949,23 @@ class DiffusionModelLoader(BaseLoader):
                 attention_block_config=unet_config.AttentionBlock2DConfig(
                     dim=output_channel,
                     normalization_config=config.transformer_norm_config,
-                    attention_config=attention_config,
+                    attention_config=build_attention_config(
+                        num_heads=config.transformer_num_attention_heads,
+                        dim=output_channel,
+                        num_query_groups=config.transformer_num_attention_heads,
+                    ),
                 ),
                 cross_attention_block_config=unet_config.CrossAttentionBlock2DConfig(
                     query_dim=output_channel,
                     cross_dim=config.transformer_cross_attention_dim,
+                    hidden_dim=output_channel,
+                    output_dim=output_channel,
                     normalization_config=config.transformer_norm_config,
-                    attention_config=attention_config,
+                    attention_config=build_attention_config(
+                        num_heads=config.transformer_num_attention_heads,
+                        dim=output_channel,
+                        num_query_groups=config.transformer_num_attention_heads,
+                    ),
                 ),
                 pre_conv_normalization_config=config.transformer_pre_conv_norm_config,
                 feed_forward_block_config=unet_config.FeedForwardBlock2DConfig(
@@ -917,8 +1019,14 @@ class DiffusionModelLoader(BaseLoader):
       tensor_names: TimeEmbeddingTensorNames,
   ):
     _map_to_converted_state(
-        state, tensor_names.w1, converted_state, f"{converted_state_param_prefix}.w1"
+        state,
+        tensor_names.w1,
+        converted_state,
+        f"{converted_state_param_prefix}.w1",
     )
     _map_to_converted_state(
-        state, tensor_names.w2, converted_state, f"{converted_state_param_prefix}.w2"
+        state,
+        tensor_names.w2,
+        converted_state,
+        f"{converted_state_param_prefix}.w2",
     )

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl