ai-edge-torch-nightly 0.3.0.dev20240910__py3-none-any.whl → 0.3.0.dev20240913__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +31 -12
- ai_edge_torch/generative/examples/gemma/convert_to_tflite.py +25 -6
- ai_edge_torch/generative/examples/gemma/gemma.py +50 -30
- ai_edge_torch/generative/examples/gemma/gemma2.py +85 -58
- ai_edge_torch/generative/examples/{experimental/phi → phi}/convert_to_tflite.py +11 -12
- ai_edge_torch/generative/examples/{experimental/phi → phi}/phi2.py +46 -43
- ai_edge_torch/generative/examples/{experimental/gemma → smallm}/convert_to_tflite.py +12 -14
- ai_edge_torch/generative/examples/smallm/smallm.py +122 -0
- ai_edge_torch/generative/examples/stable_diffusion/clip.py +11 -5
- ai_edge_torch/generative/examples/t5/t5.py +35 -22
- ai_edge_torch/generative/examples/t5/t5_attention.py +18 -13
- ai_edge_torch/generative/examples/test_models/toy_model.py +15 -13
- ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +74 -33
- ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +25 -6
- ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +55 -34
- ai_edge_torch/generative/layers/attention.py +77 -73
- ai_edge_torch/generative/layers/builder.py +5 -3
- ai_edge_torch/generative/layers/kv_cache.py +163 -51
- ai_edge_torch/generative/layers/model_config.py +38 -19
- ai_edge_torch/generative/layers/normalization.py +158 -0
- ai_edge_torch/generative/layers/unet/blocks_2d.py +0 -2
- ai_edge_torch/generative/test/{test_experimental_ekv.py → test_kv_cache.py} +12 -24
- ai_edge_torch/generative/test/test_loader.py +1 -1
- ai_edge_torch/generative/test/test_model_conversion.py +72 -34
- ai_edge_torch/generative/test/test_model_conversion_large.py +51 -23
- ai_edge_torch/generative/test/utils.py +54 -0
- ai_edge_torch/generative/utilities/loader.py +15 -15
- ai_edge_torch/generative/utilities/t5_loader.py +21 -20
- ai_edge_torch/odml_torch/lowerings/__init__.py +1 -0
- ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +0 -1
- ai_edge_torch/odml_torch/lowerings/_layer_norm.py +78 -0
- ai_edge_torch/version.py +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/METADATA +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/RECORD +39 -45
- ai_edge_torch/generative/examples/experimental/gemma/gemma.py +0 -219
- ai_edge_torch/generative/examples/experimental/phi/__init__.py +0 -14
- ai_edge_torch/generative/examples/experimental/tiny_llama/__init__.py +0 -14
- ai_edge_torch/generative/examples/experimental/tiny_llama/convert_to_tflite.py +0 -87
- ai_edge_torch/generative/examples/experimental/tiny_llama/tiny_llama.py +0 -205
- ai_edge_torch/generative/examples/phi2/__init__.py +0 -14
- ai_edge_torch/generative/examples/phi2/convert_to_tflite.py +0 -67
- ai_edge_torch/generative/examples/phi2/phi2.py +0 -189
- ai_edge_torch/generative/examples/test_models/toy_model_with_external_kv_cache.py +0 -176
- /ai_edge_torch/generative/examples/{experimental → phi}/__init__.py +0 -0
- /ai_edge_torch/generative/examples/{experimental/gemma → smallm}/__init__.py +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/LICENSE +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/WHEEL +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/top_level.txt +0 -0
ai_edge_torch/generative/examples/{experimental/phi → phi}/phi2.py

@@ -12,26 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
-
-# Note: This is an experimental version of phi2 with external KV cache.
-# Please use with caution.
+
+"""Example of building a Phi-2 model."""
 
 import os
-
-from typing import Tuple
+import pathlib
 
+from ai_edge_torch.generative.layers import attention
 from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
-from ai_edge_torch.generative.layers.experimental import attention
-from ai_edge_torch.generative.layers.experimental import ekv_cache as kv_utils
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.utilities.loader as loading_utils
 import numpy as np
 import torch
 from torch import nn
 
-
 TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
     ff_up_proj="model.layers.{}.mlp.fc1",
     ff_down_proj="model.layers.{}.mlp.fc2",
@@ -52,7 +48,6 @@ class Phi2(nn.Module):
   def __init__(self, config: cfg.ModelConfig):
     super().__init__()
 
-    self.config = config
     # Construct model layers.
     self.lm_head = nn.Linear(
         config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
@@ -60,18 +55,20 @@ class Phi2(nn.Module):
     self.tok_embedding = nn.Embedding(
         config.vocab_size, config.embedding_dim, padding_idx=0
     )
+    # Phi-2 has only one block config.
+    block_config = config.block_config(0)
     self.transformer_blocks = nn.ModuleList(
-        attention.TransformerBlock(config) for _ in range(config.num_layers)
+        attention.TransformerBlock(block_config, config)
+        for _ in range(config.num_layers)
     )
     self.final_norm = builder.build_norm(
         config.embedding_dim,
         config.final_norm_config,
     )
+    attn_config = block_config.attn_config
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.kv_cache_max,
-        dim=int(
-            config.attn_config.rotary_percentage * config.attn_config.head_dim
-        ),
+        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
         base=10_000,
         condense_ratio=1,
         dtype=torch.float32,
@@ -89,13 +86,17 @@ class Phi2(nn.Module):
       self,
       tokens: torch.Tensor,
       input_pos: torch.Tensor,
-      kv_cache: kv_utils.EKVCache,
-  ) -> Tuple[torch.Tensor, kv_utils.EKVCache]:
+      kv_cache: kv_utils.KVCache,
+  ) -> dict[torch.Tensor, kv_utils.KVCache]:
     _, seq_len = tokens.size()
     assert self.config.max_seq_len >= seq_len, (
         f"Cannot forward sequence of length {seq_len}, max seq length is only"
         f" {self.config.max_seq_len}"
     )
+    assert len(self.transformer_blocks) == len(kv_cache.caches), (
+        "The number of transformer blocks and the number of KV cache entries"
+        " must be the same."
+    )
 
     cos, sin = self.rope_cache
     cos = cos.index_select(0, input_pos)
@@ -111,11 +112,11 @@ class Phi2(nn.Module):
       x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
       if kv_entry:
         updated_kv_entires.append(kv_entry)
-    updated_kv_cache = kv_utils.EKVCache(tuple(updated_kv_entires))
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
 
     x = self.final_norm(x)
-
-    return
+    logits = self.lm_head(x)  # (b, t, vocab_size)
+    return {"logits": logits, "kv_cache": updated_kv_cache}
 
 
 def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
@@ -143,17 +144,20 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       use_bias=True,
   )
   norm_config = cfg.NormalizationConfig(type=cfg.NormalizationType.LAYER_NORM)
+  block_config = cfg.TransformerBlockConfig(
+      attn_config=attn_config,
+      ff_config=ff_config,
+      pre_attention_norm_config=norm_config,
+      parallel_residual=True,
+  )
   config = cfg.ModelConfig(
       vocab_size=51200,
       num_layers=32,
       max_seq_len=2048,
       kv_cache_max_len=kv_cache_max_len,
       embedding_dim=2560,
-      attn_config=attn_config,
-      ff_config=ff_config,
-      pre_attention_norm_config=norm_config,
+      block_configs=block_config,
       final_norm_config=norm_config,
-      parallel_residual=True,
       lm_head_use_bias=True,
       enable_hlfb=True,
   )
@@ -165,43 +169,42 @@ def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
   config.vocab_size = 128
   config.num_layers = 2
   config.max_seq_len = 2 * kv_cache_max_len
-  config.ff_config.intermediate_size = 128
+  # Phi-2 has only one block config.
+  config.block_config(0).ff_config.intermediate_size = 128
   return config
 
 
-def build_model(
-    checkpoint_path: str, test_model: bool = False, **kwargs
-) -> nn.Module:
+def build_model(checkpoint_path: str, **kwargs) -> nn.Module:
   """Instantiates the model instance and load checkpoint if provided."""
-  config = (
-      get_fake_model_config(**kwargs)
-      if test_model
-      else get_model_config(**kwargs)
-  )
+  config = get_model_config(**kwargs)
   model = Phi2(config)
-
-
-    loader.load(model)
+  loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
+  loader.load(model)
   model.eval()
   return model
 
 
-def define_and_run(checkpoint_path: str, test_model: bool = False) -> None:
+def define_and_run(checkpoint_path: str) -> None:
   """Instantiates and runs a Phi-2 model."""
 
+  current_dir = pathlib.Path(__file__).parent.resolve()
+  phi2_goldens = torch.load(current_dir / "phi2_lm_logits.pt")
   kv_cache_max_len = 1024
-  model = build_model(
-      checkpoint_path, test_model=test_model, kv_cache_max_len=kv_cache_max_len
-  )
+  model = build_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
   idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
   tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
   tokens[0, :4] = idx
   input_pos = torch.arange(0, kv_cache_max_len)
-  kv = kv_utils.EKVCache.from_model_config(model.config)
-
-  print(
+  kv = kv_utils.KVCache.from_model_config(model.config)
+  output = model.forward(tokens, input_pos, kv)
+  print("comparing with goldens..")
+  assert torch.allclose(
+      phi2_goldens, output["logits"][0, idx.shape[1] - 1, :], atol=1e-02
+  )
 
 
 if __name__ == "__main__":
-  input_checkpoint_path = os.path.join(
+  input_checkpoint_path = os.path.join(
+      pathlib.Path.home(), "Downloads/llm_data/phi2"
+  )
   define_and_run(input_checkpoint_path)
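The phi2.py hunks above replace the experimental tuple-returning forward() with a dict return and an externally threaded KV cache. A minimal sketch of the new calling convention, pieced together only from the signatures shown in this diff; the checkpoint path is a placeholder and the token values are arbitrary:

import torch
from ai_edge_torch.generative.examples.phi import phi2
from ai_edge_torch.generative.layers import kv_cache as kv_utils

# Placeholder checkpoint directory; substitute a real Phi-2 checkpoint.
model = phi2.build_model("/path/to/phi2", kv_cache_max_len=1024)

# One cache entry per transformer block; forward() asserts this invariant.
kv = kv_utils.KVCache.from_model_config(model.config)

tokens = torch.full((1, 1024), 0, dtype=torch.long)
tokens[0, :4] = torch.tensor([1, 2, 3, 4])
input_pos = torch.arange(0, 1024)

# forward() now returns a dict instead of a (logits, cache) tuple.
output = model.forward(tokens, input_pos, kv)
logits, kv = output["logits"], output["kv_cache"]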
ai_edge_torch/generative/examples/{experimental/gemma → smallm}/convert_to_tflite.py

@@ -12,30 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-#
-# Note: This is an experimental version of Gemma with external KV cache.
-# Please use with caution.
 
+"""Example of converting SmalLM model to multi-signature tflite model."""
 
 import os
-
+import pathlib
 
 import ai_edge_torch
-from ai_edge_torch.generative.examples.experimental.gemma import gemma
-from ai_edge_torch.generative.layers.experimental import ekv_cache as kv_utils
+from ai_edge_torch.generative.examples.smallm import smallm
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.quantize import quant_recipes
 import torch
 
 
-def convert_gemma_to_tflite(
+def convert_smallm_to_tflite(
     checkpoint_path: str,
     prefill_seq_len: int = 512,
     kv_cache_max_len: int = 1024,
     quantize: bool = True,
 ):
-  """
+  """Converts SmalLM model to multi-signature tflite model.
 
-  tflite model.
   Args:
     checkpoint_path (str): The filepath to the model checkpoint, or directory
       holding the checkpoint.
@@ -46,7 +43,7 @@ def convert_gemma_to_tflite(
     quantize (bool, optional): Whether the model should be quanized. Defaults
      to True.
   """
-  pytorch_model = gemma.build_2b_model(
+  pytorch_model = smallm.build_model(
      checkpoint_path, kv_cache_max_len=kv_cache_max_len
  )
  # Tensors used to trace the model graph during conversion.
@@ -54,7 +51,7 @@ def convert_gemma_to_tflite(
   prefill_input_pos = torch.arange(0, prefill_seq_len)
   decode_token = torch.tensor([[0]], dtype=torch.long)
   decode_input_pos = torch.tensor([0], dtype=torch.int64)
-  kv = kv_utils.EKVCache.from_model_config(pytorch_model.config)
+  kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
 
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
   edge_model = (
@@ -78,11 +75,12 @@ def convert_gemma_to_tflite(
     )
     .convert(quant_config=quant_config)
   )
+  quant_suffix = 'q8' if quantize else 'f32'
   edge_model.export(
-      f'/tmp/
+      f'/tmp/smallm_{quant_suffix}_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
   )
 
 
 if __name__ == '__main__':
-
-
+  path = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/smallm')
+  convert_smallm_to_tflite(path)
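The converter above now pulls its model from the new smallm example and names the exported file after the quantization mode. A small usage sketch, assuming a hypothetical local checkpoint directory (the parameter defaults are the ones shown in the diff):

from ai_edge_torch.generative.examples.smallm import convert_to_tflite

# Placeholder path; the script's __main__ block defaults to
# ~/Downloads/llm_data/smallm.
convert_to_tflite.convert_smallm_to_tflite(
    "/path/to/smallm",
    prefill_seq_len=512,
    kv_cache_max_len=1024,
    quantize=True,  # writes /tmp/smallm_q8_seq512_ekv1024.tflite
)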
ai_edge_torch/generative/examples/smallm/smallm.py

@@ -0,0 +1,122 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Example of building a SmalLM model."""
+
+import copy
+import os
+import pathlib
+
+from ai_edge_torch.generative.examples.tiny_llama import tiny_llama
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
+import ai_edge_torch.generative.layers.model_config as cfg
+import ai_edge_torch.generative.utilities.loader as loading_utils
+import numpy as np
+import torch
+from torch import nn
+
+TENSOR_NAMES = copy.copy(tiny_llama.TENSOR_NAMES)
+# SmalLM re-uses the embedding as the head projection layer.
+TENSOR_NAMES.lm_head = None
+
+
+class SmalLM(tiny_llama.TinyLlama):
+  """A SmalLM model built from the Edge Generative API layers.
+
+  SmalLM shares the same architecture as TinyLlama, but with different model
+  sizes.
+  """
+
+  def __init__(self, config: cfg.ModelConfig):
+    super().__init__(config)
+    # SmalLM re-uses the embedding as the head projection layer.
+    self.lm_head.weight.data = self.tok_embedding.weight.data
+
+
+def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+  """Returns the model config for a SmalLM 135M model.
+
+  Args:
+    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
+      is 1024.
+
+  Returns:
+    The model config for a SmalLM model.
+  """
+  attn_config = cfg.AttentionConfig(
+      num_heads=9,
+      head_dim=64,
+      num_query_groups=3,
+      rotary_percentage=1.0,
+  )
+  ff_config = cfg.FeedForwardConfig(
+      type=cfg.FeedForwardType.GATED,
+      activation=cfg.ActivationConfig(cfg.ActivationType.SILU),
+      intermediate_size=1536,
+  )
+  norm_config = cfg.NormalizationConfig(type=cfg.NormalizationType.RMS_NORM)
+  block_config = cfg.TransformerBlockConfig(
+      attn_config=attn_config,
+      ff_config=ff_config,
+      pre_attention_norm_config=norm_config,
+      post_attention_norm_config=norm_config,
+  )
+  config = cfg.ModelConfig(
+      vocab_size=49152,
+      num_layers=30,
+      max_seq_len=2048,
+      embedding_dim=576,
+      kv_cache_max_len=kv_cache_max_len,
+      block_configs=block_config,
+      final_norm_config=norm_config,
+      enable_hlfb=True,
+  )
+  return config
+
+
+def build_model(checkpoint_path: str, **kwargs) -> nn.Module:
+  config = get_model_config(**kwargs)
+  model = SmalLM(config)
+  loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
+  # Since embedding and lm-head use the same weight, we need to set strict
+  # to False.
+  loader.load(model, strict=False)
+  model.eval()
+  return model
+
+
+def define_and_run(checkpoint_path: str) -> None:
+  """Instantiates and runs a SmalLM model."""
+
+  current_dir = pathlib.Path(__file__).parent.resolve()
+  smallm_goldens = torch.load(current_dir / "smallm_lm_logits.pt")
+  kv_cache_max_len = 1024
+  model = build_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
+  idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
+  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
+  tokens[0, :4] = idx
+  input_pos = torch.arange(0, kv_cache_max_len)
+  kv = kv_utils.KVCache.from_model_config(model.config)
+  output = model.forward(tokens, input_pos, kv)
+  assert torch.allclose(
+      smallm_goldens, output["logits"][0, idx.shape[1] - 1, :], atol=1e-05
+  )
+
+
+if __name__ == "__main__":
+  input_checkpoint_path = os.path.join(
+      pathlib.Path.home(), "Downloads/llm_data/smallm"
+  )
+  define_and_run(input_checkpoint_path)
ai_edge_torch/generative/examples/stable_diffusion/clip.py

@@ -61,8 +61,10 @@ class CLIP(nn.Module):
     )
 
     self.config = config
+    # CLIP has only one block config.
+    block_config = config.block_config(0)
     self.transformer_blocks = nn.ModuleList(
-        TransformerBlock(config) for _ in range(config.num_layers)
+        TransformerBlock(block_config, config) for _ in range(config.num_layers)
     )
     self.final_norm = builder.build_norm(
         config.embedding_dim, config.final_norm_config
@@ -112,15 +114,19 @@ def get_model_config() -> cfg.ModelConfig:
 
   norm_config = cfg.NormalizationConfig(type=cfg.NormalizationType.LAYER_NORM)
 
+  block_config = cfg.TransformerBlockConfig(
+      attn_config=attn_config,
+      ff_config=ff_config,
+      pre_attention_norm_config=norm_config,
+      post_attention_norm_config=norm_config,
+  )
+
   config = cfg.ModelConfig(
       vocab_size=vocab_size,
       num_layers=num_layers,
       max_seq_len=max_seq_len,
       embedding_dim=embedding_dim,
-      attn_config=attn_config,
-      ff_config=ff_config,
-      pre_attention_norm_config=norm_config,
-      post_attention_norm_config=norm_config,
+      block_configs=block_config,
       final_norm_config=norm_config,
       enable_hlfb=True,
   )
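Like the Phi-2 and SmalLM changes, the CLIP hunks above move per-layer attention, feed-forward, and norm settings out of ModelConfig into a TransformerBlockConfig that layers look up via config.block_config(0). A hedged sketch of the new config shape, using made-up dimensions purely for illustration:

import ai_edge_torch.generative.layers.model_config as cfg

# Hypothetical sizes; real models define their own values.
norm_config = cfg.NormalizationConfig(type=cfg.NormalizationType.RMS_NORM)
attn_config = cfg.AttentionConfig(
    num_heads=8, head_dim=64, num_query_groups=8, rotary_percentage=1.0
)
ff_config = cfg.FeedForwardConfig(
    type=cfg.FeedForwardType.GATED,
    activation=cfg.ActivationConfig(cfg.ActivationType.SILU),
    intermediate_size=1024,
)
block_config = cfg.TransformerBlockConfig(
    attn_config=attn_config,
    ff_config=ff_config,
    pre_attention_norm_config=norm_config,
)
config = cfg.ModelConfig(
    vocab_size=32000,
    num_layers=4,
    max_seq_len=2048,
    embedding_dim=512,
    kv_cache_max_len=1024,
    block_configs=block_config,
    final_norm_config=norm_config,
)
# Layers now read block-level settings through an accessor.
assert config.block_config(0).attn_config.num_heads == 8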
ai_edge_torch/generative/examples/t5/t5.py

@@ -52,9 +52,15 @@ class T5Stack(nn.Module):
     self.config = config
     self.embed_tokens = embed_tokens
     self.is_decoder = config.is_decoder
+    # T5 has only one block config.
+    block_config = config.block_config(0)
     self.transformer_blocks = nn.ModuleList([
-        EncoderDecoderBlock(config, has_relative_attention_bias=bool(idx == 0))
-        for idx in range(config.num_layers)
+        EncoderDecoderBlock(
+            block_config,
+            config,
+            has_relative_attention_bias=bool(idx == 0),
+        )
+        for idx in range(config.num_layers)
     ])
     self.final_norm = builder.build_norm(
         config.embedding_dim, config.final_norm_config
@@ -73,13 +79,11 @@ class T5Stack(nn.Module):
           torch.Tensor
       ] = None,  # should be for decoder case
   ):
-    input_shape = input_ids.size()
     inputs_embeds = self.embed_tokens(input_ids)
-    batch_size, seq_length = input_shape
     hidden_states = inputs_embeds
     position_bias = None
     encoder_decoder_position_bias = None
-    for i, layer_module in enumerate(self.transformer_blocks):
+    for _, layer_module in enumerate(self.transformer_blocks):
       # EncoderDecoderBlock.forward
       hidden_states, position_bias, encoder_decoder_position_bias = (
           layer_module(
|
|
111
115
|
|
112
116
|
encoder_config = copy.deepcopy(config)
|
113
117
|
encoder_config.is_decoder = False
|
114
|
-
|
118
|
+
# T5 has only one block config.
|
119
|
+
encoder_config.block_config(0).attn_config.enable_kv_cache = False
|
115
120
|
self.encoder = T5Stack(encoder_config, self.tok_embedding)
|
116
121
|
|
117
122
|
decoder_config = copy.deepcopy(config)
|
@@ -137,20 +142,22 @@ class T5(nn.Module):
         device=torch.device("cpu"),
     )
 
+    # T5 has only one block config.
+    attn_config = config.block_config(0).attn_config
     self.enc_rel_pos_mask = attn_utils.build_relative_position_buckets(
         bidirectional=True,
         query_length=config.kv_cache_max,
         key_length=config.kv_cache_max,
-        num_buckets=config.attn_config.relative_attention_num_buckets,
-        max_distance=config.attn_config.relative_attention_max_distance,
+        num_buckets=attn_config.relative_attention_num_buckets,
+        max_distance=attn_config.relative_attention_max_distance,
     )
 
     self.dec_rel_pos_mask = attn_utils.build_relative_position_buckets(
         bidirectional=False,
         query_length=config.kv_cache_max,
         key_length=config.kv_cache_max,
-        num_buckets=config.attn_config.relative_attention_num_buckets,
-        max_distance=config.attn_config.relative_attention_max_distance,
+        num_buckets=attn_config.relative_attention_num_buckets,
+        max_distance=attn_config.relative_attention_max_distance,
     )
 
   @torch.inference_mode
@@ -230,7 +237,8 @@ class T5Encoder(nn.Module):
 
     encoder_config = copy.deepcopy(config)
    encoder_config.is_decoder = False
-    encoder_config.attn_config.enable_kv_cache = False
+    # T5 has only one block config.
+    encoder_config.block_config(0).attn_config.enable_kv_cache = False
    self.encoder = T5Stack(encoder_config, self.tok_embedding)
 
    self.enc_attn_mask_cache = (
|
|
243
251
|
.unsqueeze(0)
|
244
252
|
)
|
245
253
|
|
254
|
+
# T5 has only one block config.
|
255
|
+
attn_config = config.block_config(0).attn_config
|
246
256
|
self.enc_rel_pos_mask = attn_utils.build_relative_position_buckets(
|
247
257
|
bidirectional=True,
|
248
258
|
query_length=config.kv_cache_max,
|
249
259
|
key_length=config.kv_cache_max,
|
250
|
-
num_buckets=
|
251
|
-
max_distance=
|
260
|
+
num_buckets=attn_config.relative_attention_num_buckets,
|
261
|
+
max_distance=attn_config.relative_attention_max_distance,
|
252
262
|
)
|
253
263
|
|
254
264
|
@torch.inference_mode
|
@@ -313,12 +323,14 @@ class T5Decoder(nn.Module):
         .unsqueeze(0)
     )
 
+    # T5 has only one block config.
+    attn_config = config.block_config(0).attn_config
     self.enc_rel_pos_mask = attn_utils.build_relative_position_buckets(
         bidirectional=True,
         query_length=config.kv_cache_max,
         key_length=config.kv_cache_max,
-        num_buckets=config.attn_config.relative_attention_num_buckets,
-        max_distance=config.attn_config.relative_attention_max_distance,
+        num_buckets=attn_config.relative_attention_num_buckets,
+        max_distance=attn_config.relative_attention_max_distance,
     )
 
     self.dec_attn_mask_cache = attn_utils.build_causal_mask_cache(
@@ -386,19 +398,20 @@ def get_model_config_t5() -> cfg.ModelConfig:
       type=cfg.NormalizationType.RMS_NORM,
       epsilon=1e-6,
   )
-
-  config = cfg.ModelConfig(
-      vocab_size=32128,
-      num_layers=12,
-      max_seq_len=512,
-      embedding_dim=768,
+  block_config = cfg.TransformerBlockConfig(
       attn_config=attn_config,
       relative_attention=True,
       ff_config=ff_config,
       pre_attention_norm_config=norm_config,
       post_attention_norm_config=norm_config,
+  )
+  config = cfg.ModelConfig(
+      vocab_size=32128,
+      num_layers=12,
+      max_seq_len=512,
+      embedding_dim=768,
+      block_configs=block_config,
       final_norm_config=norm_config,
-      parallel_residual=False,
       lm_head_use_bias=False,
       enable_hlfb=True,
   )
ai_edge_torch/generative/examples/t5/t5_attention.py

@@ -24,7 +24,6 @@ from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_
 from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_dot_product_attention_with_hlfb  # NOQA
 import torch
 from torch import nn
-import torch.nn.functional as F
 
 BATCH_SIZE = 1
 
@@ -32,13 +31,18 @@ BATCH_SIZE = 1
 class EncoderDecoderBlock(nn.Module):
 
   def __init__(
-      self,
+      self,
+      config: cfg.TransformerBlockConfig,
+      model_config: cfg.ModelConfig,
+      has_relative_attention_bias: bool = False,
   ) -> None:
     """Initialize an instance of the EncoderDecoderBlock.
 
     Args:
-      config (cfg.
-        block.
+      config (cfg.TransformerBlockConfig): the configuration object for this
+        transformer block.
+      model_config (cfg.ModelConfig): the configuration object for the model
+        this transformer block belongs to.
       has_relative_attention_bias (bool): whether the self attention block has
         relative bias.
     """
@@ -46,22 +50,22 @@ class EncoderDecoderBlock(nn.Module):
     super().__init__()
     self.atten_func = T5Attention(
         BATCH_SIZE,
-        config.embedding_dim,
+        model_config.embedding_dim,
         config.attn_config,
         config.pre_attention_norm_config,
-        config.kv_cache_max,
-        config.enable_hlfb,
+        model_config.kv_cache_max,
+        model_config.enable_hlfb,
         has_relative_attention_bias=has_relative_attention_bias,
     )
     # For a decoder, we add a cross attention.
-    if config.is_decoder:
+    if model_config.is_decoder:
       self.cross_atten_func = T5Attention(
           BATCH_SIZE,
-          config.embedding_dim,
+          model_config.embedding_dim,
           config.attn_config,
           config.pre_attention_norm_config,
-          config.kv_cache_max,
-          config.enable_hlfb,
+          model_config.kv_cache_max,
+          model_config.enable_hlfb,
           # Cross Attention does not have relative attention bias.
           has_relative_attention_bias=False,
       )
@@ -69,9 +73,10 @@ class EncoderDecoderBlock(nn.Module):
       self.cross_atten_func = None
 
     self.post_atten_norm = builder.build_norm(
-        config.embedding_dim, config.post_attention_norm_config
+        model_config.embedding_dim,
+        config.post_attention_norm_config,
     )
-    self.ff = builder.build_ff(config.embedding_dim, config.ff_config)
+    self.ff = builder.build_ff(model_config.embedding_dim, config.ff_config)
     self.config = config
 
   def forward(
|