PyPI - ai-edge-torch-nightly - Versions diffs - 0.3.0.dev20240910__py3-none-any.whl → 0.3.0.dev20240913__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.3.0.dev20240910__py3-none-any.whl → 0.3.0.dev20240913__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

ai_edge_torch/generative/layers/attention.py CHANGED Viewed

@@ -12,16 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# Common building blocks for Attention layer.
-from typing import Optional, Tuple
+"""Common building blocks for Attention layer."""
-import ai_edge_torch.generative.layers.builder as builder
-from ai_edge_torch.generative.layers.kv_cache import KVCache
+from typing import Optional, Tuple, Union
+from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
+from ai_edge_torch.generative.layers import scaled_dot_product_attention as sdpa
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
-from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_dot_product_attention  # NOQA
-from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_dot_product_attention_with_hlfb  # NOQA
 import torch
 from torch import nn
@@ -55,29 +55,35 @@ def _embed_rope(
 class TransformerBlock(nn.Module):
-  def __init__(self, config: cfg.ModelConfig) -> None:
+  def __init__(
+      self,
+      config: cfg.TransformerBlockConfig,
+      model_config: cfg.ModelConfig,
+  ) -> None:
     """Initialize an instance of the TransformerBlock.
     Args:
-      config (cfg.ModelConfig): the configuration object for this transformer
-        block.
+      config (cfg.TransformerBlockConfig): the configuration object for this
+        transformer block.
+      model_config (cfg.ModelConfig): the configuration object for the model
+        this transformer block belongs to.
     """
     super().__init__()
     self.pre_atten_norm = builder.build_norm(
-        config.embedding_dim, config.pre_attention_norm_config
+        model_config.embedding_dim,
+        config.pre_attention_norm_config,
     )
     self.atten_func = CausalSelfAttention(
-        config.batch_size,
-        config.embedding_dim,
+        model_config.batch_size,
+        model_config.embedding_dim,
         config.attn_config,
-        config.kv_cache_max,
-        config.enable_hlfb,
+        model_config.enable_hlfb,
     )
     self.post_atten_norm = builder.build_norm(
-        config.embedding_dim, config.post_attention_norm_config
+        model_config.embedding_dim,
+        config.post_attention_norm_config,
     )
-    self.ff = builder.build_ff(config.embedding_dim, config.ff_config)
+    self.ff = builder.build_ff(model_config.embedding_dim, config.ff_config)
     self.config = config
   def forward(
@@ -86,7 +92,8 @@ class TransformerBlock(nn.Module):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
-  ) -> torch.Tensor:
+      kv_cache: kv_utils.KVCacheEntry = None,
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the TransformerBlock.
     Args:
@@ -94,24 +101,34 @@ class TransformerBlock(nn.Module):
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): the optional kv cache entry.
     Returns:
-      output activation from this transformer block.
+      output activation from this transformer block, and updated kv cache (if
+      passed in).
     """
+    kv = None
     if self.config.parallel_residual:
       x_norm = self.pre_atten_norm(x)
-      attn_out = self.atten_func(x_norm, rope, mask, input_pos)
+      atten_func_out = self.atten_func(x_norm, rope, mask, input_pos, kv_cache)
+      if kv_cache is None:
+        attn_out = atten_func_out
+      else:
+        attn_out, kv = atten_func_out
       ff_out = self.ff(x_norm)
       output = x + attn_out + ff_out
     else:
       x_norm = self.pre_atten_norm(x)
-      attn_out = self.atten_func(x_norm, rope, mask, input_pos)
+      atten_func_out = self.atten_func(x_norm, rope, mask, input_pos, kv_cache)
+      if kv_cache is None:
+        attn_out = atten_func_out
+      else:
+        attn_out, kv = atten_func_out
       x = x + attn_out
       x_norm = self.post_atten_norm(x)
       output = x + self.ff(x_norm)
-    return output
+    return output if kv is None else (output, kv)
 class CausalSelfAttention(nn.Module):
@@ -121,7 +138,6 @@ class CausalSelfAttention(nn.Module):
       batch_size: int,
       dim: int,
       config: cfg.AttentionConfig,
-      kv_cache_max: int,
       enable_hlfb: bool,
   ) -> None:
     """Initialize an instance of CausalSelfAttention.
@@ -130,12 +146,9 @@ class CausalSelfAttention(nn.Module):
       batch_size (int): batch size of the input tensor.
       dim (int): causal attention's input/output dimmension.
       config (cfg.AttentionConfig): attention specific configurations.
-      kv_cache_max (int): determines the size of the KV Cache buffer, if
-        enabled.
       enable_hlfb (bool): whether hlfb is enabled or not.
     """
     super().__init__()
-    self.config = config
     self.kv_cache = None
     self.batch_size = batch_size
     qkv_shape = (
@@ -147,21 +160,13 @@ class CausalSelfAttention(nn.Module):
     self.output_projection = nn.Linear(
         output_shape, dim, bias=config.output_proj_use_bias
     )
-    # Build a k/v cache with size (batch_size, kv_cache_max, n_heads, head_dim).
-    if config.enable_kv_cache:
-      self.kv_cache = KVCache(
-          batch_size,
-          kv_cache_max,
-          config.num_query_groups,
-          config.head_dim,
-          enable_hlfb,
-      )
-    if enable_hlfb:
-      self.sdpa_func = scaled_dot_product_attention_with_hlfb
-    else:
-      self.sdpa_func = scaled_dot_product_attention
+    self.config = config
+    self.enable_hlfb = enable_hlfb
+    self.sdpa_func = (
+        sdpa.scaled_dot_product_attention_with_hlfb
+        if enable_hlfb
+        else sdpa.scaled_dot_product_attention
+    )
   def forward(
       self,
@@ -169,7 +174,8 @@ class CausalSelfAttention(nn.Module):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
-  ) -> torch.Tensor:
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the CausalSelfAttention layer, which can support
        MQA, GQA and MHA.
@@ -179,9 +185,11 @@ class CausalSelfAttention(nn.Module):
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.
     Returns:
-      output activation from this self attention layer.
+      output activation from this self attention layer, and the updated
+        KV Cach Entry (if passed in).
     """
     # Batch size, sequence length, embedding dimensionality.
     B, T, E = x.size()
@@ -224,9 +232,11 @@ class CausalSelfAttention(nn.Module):
     n_elem = int(self.config.rotary_percentage * self.config.head_dim)
     q, k = _embed_rope(q, k, n_elem, rope)
-    if self.kv_cache is not None:
-      # TODO(haoliang): Handle when execeeding max sequence length.
-      k, v = self.kv_cache.update_cache(input_pos, k, v)
+    if kv_cache is not None:
+      kv_cache = kv_utils.update(
+          kv_cache, input_pos, k, v, enable_hlfb=self.enable_hlfb
+      )
+      k, v = kv_cache.k_cache, kv_cache.v_cache
     y = self.sdpa_func(
         q,
@@ -240,7 +250,7 @@ class CausalSelfAttention(nn.Module):
     # Compute the output projection.
     y = self.output_projection(y)
-    return y
+    return y if kv_cache is None else (y, kv_cache)
 class SelfAttention(CausalSelfAttention):
@@ -251,16 +261,19 @@ class SelfAttention(CausalSelfAttention):
       x: torch.Tensor,
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       input_pos: Optional[torch.Tensor] = None,
-  ) -> torch.Tensor:
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the SelfAttention layer, which can support MQA, GQA and MHA.
     Args:
       x (torch.Tensor): the input tensor.
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.
     Returns:
-      output activation from this self attention layer.
+      output activation from this self attention layer, and the updated
+        KV Cach Entry (if passed in).
     """
     B, T, _ = x.size()
     return super().forward(
@@ -279,9 +292,8 @@ class CrossAttention(nn.Module):
       query_dim: int,
       cross_dim: int,
       config: cfg.AttentionConfig,
-      kv_cache_max: int,
       enable_hlfb: bool,
-  ) -> None:
+  ):
     """Initialize an instance of CrossAttention.
     Args:
@@ -289,8 +301,6 @@ class CrossAttention(nn.Module):
       query_dim (int): query tensor's dimension.
       cross_dim (int): cross attention's dimensions, for key and value tensors.
       config (cfg.AttentionConfig): attention specific configurations.
-      kv_cache_max (int): determines the size of the KV Cache buffer, if
-        enabled.
       enable_hlfb (bool): whether hlfb is enabled or not.
     """
     super().__init__()
@@ -309,21 +319,11 @@ class CrossAttention(nn.Module):
         query_dim, query_dim, bias=config.output_proj_use_bias
     )
-    self.kv_cache = None
-    # Build a k/v cache with size (batch_size, kv_cache_max, n_heads, head_dim).
-    if config.enable_kv_cache:
-      self.kv_cache = KVCache(
-          batch_size,
-          kv_cache_max,
-          config.num_query_groups,
-          self.config.head_dim,
-          enable_hlfb,
-      )
-    if enable_hlfb:
-      self.sdpa_func = scaled_dot_product_attention_with_hlfb
-    else:
-      self.sdpa_func = scaled_dot_product_attention
+    self.sdpa_func = (
+        sdpa.scaled_dot_product_attention_with_hlfb
+        if enable_hlfb
+        else sdpa.scaled_dot_product_attention
+    )
   def forward(
       self,
@@ -332,6 +332,7 @@ class CrossAttention(nn.Module):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
   ):
     """Forward function of the CrossAttention layer.
@@ -342,6 +343,7 @@ class CrossAttention(nn.Module):
       mask (torch.Tensor): the optional mask tensor can be broadcaseted to shape
         [B, n_heads, target_seq_len, source_seq_len].
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.
     Returns:
       output activation from this cross attention layer.
@@ -363,9 +365,11 @@ class CrossAttention(nn.Module):
     n_elem = int(self.config.rotary_percentage * self.config.head_dim)
     q, k = _embed_rope(q, k, n_elem, rope)
-    if self.kv_cache is not None:
-      # TODO(haoliang): Handle when execeeding max sequence length.
-      k, v = self.kv_cache.update_cache(input_pos, k, v)
+    if kv_cache is not None:
+      kv_cache = kv_utils.update(
+          kv_cache, input_pos, k, v, enable_hlfb=self.enable_hlfb
+      )
+      k, v = kv_cache.k_cache, kv_cache.v_cache
     if mask is None:
       mask = torch.zeros(
           (batch_size, 1, target_seq_len, source_seq_len), dtype=torch.float32
@@ -375,4 +379,4 @@ class CrossAttention(nn.Module):
     # Compute the output projection.
     y = self.output_projection(y)
-    return y
+    return y if kv_cache is None else (y, kv_cache)

ai_edge_torch/generative/layers/builder.py CHANGED Viewed

@@ -59,9 +59,11 @@ def build_norm(dim: int, config: cfg.NormalizationConfig):
         zero_centered_gamma=config.zero_centered,
     )
   elif config.type == cfg.NormalizationType.LAYER_NORM:
-    return nn.LayerNorm(dim, eps=config.epsilon)
+    return normalization.LayerNorm(dim, config.epsilon, config.enable_hlfb)
   elif config.type == cfg.NormalizationType.GROUP_NORM:
-    return nn.GroupNorm(config.group_num, dim, config.epsilon)
+    return normalization.GroupNorm(
+        config.group_num, dim, config.epsilon, config.enable_hlfb
+    )
   else:
     raise ValueError("Unsupported norm type.")
@@ -71,7 +73,7 @@ def build_ff(dim: int, config: cfg.FeedForwardConfig):
   Args:
     dim (int): dimension of the input tensor.
-    config (`ModelConfig` object): the model configuration.
+    config (`FeedForwardConfig` object): the model configuration.
   Returns:
     The constructed `nn.Module` feedforward layer.

ai_edge_torch/generative/layers/kv_cache.py CHANGED Viewed

@@ -12,72 +12,184 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# `nn.Module` which implements a KV cache.
-from ai_edge_torch.hlfb import StableHLOCompositeBuilder
+"""Utility functions for externalized KV Cache."""
+import dataclasses
+from typing import List, Tuple
+from ai_edge_torch import hlfb
+from ai_edge_torch.generative.layers import model_config
 import torch
-from torch import nn
+import torch.utils._pytree as pytree
+BATCH_SIZE = 1
-class KVCache(nn.Module):
-  def __init__(
-      self, batch_size, kv_cache_max, n_heads, head_dim, enable_hlfb=False
-  ):
-    """Initializes the KVCache layer.
+@dataclasses.dataclass
+class KVCacheEntry:
+  """A single cache entry that includes K and V caches.
-    Args:
-      batch_size (int): batch size. Currently only batch size 1 is supported.
-      kv_cache_max (int): the max length of KV cache.
-      n_heads (int): number of kv heads.
-      head_dim (int): the head dimension size.
-      enable_hlfb (bool): whether hlfb is enabled or not.
-    """
-    super().__init__()
-    cache_shape = (batch_size, kv_cache_max, n_heads, head_dim)
-    self.register_buffer("k_cache", torch.zeros(cache_shape), persistent=False)
-    self.register_buffer("v_cache", torch.zeros(cache_shape), persistent=False)
-    self.enable_hlfb = enable_hlfb
-    self.kv_cache_max = kv_cache_max
+  The chaches are built based on the provided config with the shape of
+  (batch_size=1, kv_cache_max, num_query_groups, head_dim).
+  """
-  def update_cache(self, input_pos, k_val, v_val):
-    """Update an entry in the KV cache.
+  k_cache: torch.Tensor
+  v_cache: torch.Tensor
-    Args:
-      input_pos (torch.Tensor): the input position.
-      k_val (torch.Tensor): the new `key` value.
-      v_val (torch.Tensor): the new `value` value.
+  @classmethod
+  def from_model_config(
+      cls,
+      kv_cache_max: int,
+      config: model_config.AttentionConfig,
+      dtype: torch.dtype = torch.float32,
+      device: torch.device = None,
+  ) -> "KVCacheEntry":
+    """Build an instance of the class based on model config."""
+    shape = (BATCH_SIZE, kv_cache_max, config.num_query_groups, config.head_dim)
+    k = torch.zeros(shape, dtype=dtype, device=device)
+    v = torch.zeros(shape, dtype=dtype, device=device)
+    obj = cls(k_cache=k, v_cache=v)
+    return obj
-    Returns:
-      The updated key and value tensor.
-    """
-    if self.enable_hlfb:
-      return self.update_cache_with_hlfb(input_pos, k_val, v_val)
-    updated_k = self.k_cache.index_copy_(1, input_pos, k_val)
-    updated_v = self.v_cache.index_copy_(1, input_pos, v_val)
-    # Here we need a clone otherwise dynamo export will fail.
-    return torch.clone(updated_k), torch.clone(updated_v)
+@dataclasses.dataclass
+class KVCache:
+  """A utility class for holding KV cache entries per layer."""
-  def update_cache_with_hlfb(self, input_pos, k_val, v_val):
-    """Update an entry in the KV cache and enable high-level function boundary.
+  caches: Tuple[KVCacheEntry, ...]
+  @classmethod
+  def from_model_config(
+      cls,
+      config: model_config.ModelConfig,
+      dtype: torch.dtype = torch.float32,
+      device: torch.device = None,
+  ) -> "KVCache":
+    """Build an instance of the class based on model config.
     Args:
-      input_pos (torch.Tensor): the input position.
-      k_val (torch.Tensor): the new `key` value.
-      v_val (torch.Tensor): the new `value` value.
+        config (ModelConfig): Model config used for building the cache.
+        dtype (torch.dtype, optional): The data type of the cache tensor.
+          Defaults to torch.float32.
+        device (torch.device, optional): The device placement of the cache
+          tensors. Defaults to None.
     Returns:
-      The updated key and value tensor.
+        KVCache: The created cache object.
     """
+    caches = [
+        KVCacheEntry.from_model_config(
+            config.kv_cache_max,
+            config.block_config(idx).attn_config,
+            dtype,
+            device,
+        )
+        for idx in range(config.num_layers)
+    ]
+    obj = cls(caches=tuple(caches))
+    return obj
-    builder = StableHLOCompositeBuilder(
-        name="odml.update_kv_cache", attr={"kv_cache_max": self.kv_cache_max}
-    )
-    k_cache, v_cache, input_pos, k_val, v_val = builder.mark_inputs(
-        self.k_cache, self.v_cache, input_pos, k_val, v_val
+  def flatten(self) -> List[torch.Tensor]:
+    """Flatten the cache entries into a list of tensors with order k_i, v_i."""
+    flattened, _ = _flatten_kvc(self)
+    return flattened
+def _flatten_kvc(kvc: KVCache) -> Tuple[List[str], List[str]]:
+  flattened = []
+  flat_names = []
+  none_names = []
+  for i, kv_entry in enumerate(kvc.caches):
+    flattened.append(kv_entry.k_cache)
+    flat_names.append(f"k_{i}")
+    flattened.append(kv_entry.v_cache)
+    flat_names.append(f"v_{i}")
+  return flattened, [flat_names, none_names]
+def _flatten_kvc_with_keys(kvc: KVCache) -> Tuple[List, List]:
+  flattened, (flat_names, none_names) = _flatten_kvc(kvc)
+  return [
+      (pytree.MappingKey(k), v) for k, v in zip(flat_names, flattened)
+  ], flat_names
+def _unflatten_kvc(
+    values: List[torch.Tensor], context: Tuple[List, List]
+) -> KVCache:
+  assert len(values) % 2 == 0, "Found odd number of K and V entries."
+  num_layers = len(values) // 2
+  flat_names = context[0]
+  kv_entries = []
+  for i in range(num_layers):
+    k_cache_idx = flat_names.index(f"k_{i}")
+    v_cache_idx = flat_names.index(f"v_{i}")
+    kv_entries.append(
+        KVCacheEntry(k_cache=values[k_cache_idx], v_cache=values[v_cache_idx])
     )
-    updated_k = k_cache.index_copy_(1, input_pos, k_val)
-    updated_v = v_cache.index_copy_(1, input_pos, v_val)
-    updated_k, updated_v = builder.mark_outputs(updated_k, updated_v)
-    return updated_k, updated_v
+  obj = KVCache(tuple(kv_entries))
+  return obj
+pytree.register_pytree_node(
+    KVCache,
+    _flatten_kvc,
+    _unflatten_kvc,
+    flatten_with_keys_fn=_flatten_kvc_with_keys,
+    serialized_type_name="",
+)
+def update(
+    cache: KVCacheEntry,
+    input_pos: torch.Tensor,
+    k_slice: torch.Tensor,
+    v_slice: torch.Tensor,
+    enable_hlfb: bool = True,
+) -> KVCacheEntry:
+  """Out of place update of Cache buffer.
+  Args:
+      cache (KVCacheEntry): The original cache buffer.
+      input_pos (torch.Tensor): The update slice positions.
+      k_slice (torch.Tensor): The K slice to be updated in the new cache.
+      v_slice (torch.Tensor): The V slice to be updated in the new cache.
+      enable_hlfb (bool, optional): Whether the op is annotated for export with
+        High Level Function Boundary. Defaults to True.
+  Returns:
+      KVCacheEntry: The updated KVCache entry based on the passed inputs.
+  """
+  update_func = _update_kv_hlfb_impl if enable_hlfb else _update_kv_base_impl
+  return update_func(cache, input_pos, k_slice, v_slice)
+def _update_kv_base_impl(
+    cache: KVCacheEntry,
+    input_pos: torch.Tensor,
+    k_slice: torch.Tensor,
+    v_slice: torch.Tensor,
+) -> KVCacheEntry:
+  """Update the cache buffer without High Level Function Boundary annotation."""
+  k = cache.k_cache.index_copy(1, input_pos, k_slice)
+  v = cache.v_cache.index_copy(1, input_pos, v_slice)
+  updated_cache = KVCacheEntry(k, v)
+  return updated_cache
+def _update_kv_hlfb_impl(
+    cache: KVCacheEntry,
+    input_pos: torch.Tensor,
+    k_slice: torch.Tensor,
+    v_slice: torch.Tensor,
+) -> KVCacheEntry:
+  """Update the cache buffer with High Level Function Boundary annotation."""
+  builder = hlfb.StableHLOCompositeBuilder(name="odml.update_external_kv_cache")
+  k_cache, v_cache, input_pos, k_slice, v_slice = builder.mark_inputs(
+      cache.k_cache, cache.v_cache, input_pos, k_slice, v_slice
+  )
+  k = k_cache.index_copy(1, input_pos, k_slice)
+  v = v_cache.index_copy(1, input_pos, v_slice)
+  k, v = builder.mark_outputs(k, v)
+  return KVCacheEntry(k, v)

ai_edge_torch/generative/layers/model_config.py CHANGED Viewed

@@ -16,7 +16,7 @@
 from dataclasses import dataclass
 from dataclasses import field
 import enum
-from typing import Optional, Sequence
+from typing import Optional, Sequence, Union
 @enum.unique
@@ -85,8 +85,8 @@ class AttentionConfig:
   relative_attention_max_distance: int = 0
   # Softcap on the output logits.
   logit_softcap: Optional[float] = None
-  # The types of attention used in the layers of the model.
-  attn_types: Optional[Sequence[AttentionType]] = None
+  # The type of attention.
+  attn_type: Optional[AttentionType] = None
   # The size of the sliding window used for local attention.
   sliding_window_size: Optional[int] = None
@@ -104,6 +104,7 @@ class NormalizationConfig:
   """Normalizater parameters."""
   type: NormalizationType = NormalizationType.NONE
+  enable_hlfb: bool = False
   epsilon: float = 1e-5
   zero_centered: bool = False
   # Number of groups used in group normalization.
@@ -129,13 +130,8 @@ class FeedForwardConfig:
 @dataclass
-class ModelConfig:
-  """Base configurations for building a transformer architecture."""
-  vocab_size: int
-  num_layers: int
-  max_seq_len: int
-  embedding_dim: int
+class TransformerBlockConfig:
+  """TransformerBlock module's parameters."""
   attn_config: AttentionConfig
   ff_config: FeedForwardConfig
@@ -147,15 +143,33 @@ class ModelConfig:
   post_attention_norm_config: NormalizationConfig = field(
       default_factory=NormalizationConfig
   )
+  # If set to True, only attn_config.pre_attention_norm is applied to the input
+  # and the decode's output is computed as `output = input + attn_out + ff_out`
+  # where attention and feed forward are called with pre_attention_norm's
+  # output.
+  parallel_residual: bool = False
+  # The Attention computation will include relative positional bias.
+  relative_attention: bool = False
+@dataclass
+class ModelConfig:
+  """Base configurations for building a transformer architecture."""
+  vocab_size: int
+  num_layers: int
+  max_seq_len: int
+  embedding_dim: int
+  # TransformerBlockConfig for each layer block. If a single
+  # TransformerBlockConfig is provided, it will be used for all layers.
+  block_configs: Union[TransformerBlockConfig, Sequence[TransformerBlockConfig]]
   # The normalization applied before LM head.
   final_norm_config: NormalizationConfig = field(
       default_factory=NormalizationConfig
   )
-  # If set to True, only pre_attention_norm is applied to the input and the
-  # decode's output is computed as `output = input + attn_out + ff_out` where
-  # attention and feed forward are called with pre_attention_norm's output.
-  parallel_residual: bool = False
   # Use bias term within LLM's HEAD.
   lm_head_use_bias: bool = False
   # Whether to turn on high-level function boundary.
@@ -164,9 +178,6 @@ class ModelConfig:
   # The maximum sequence length of the KV cache. Should not exceed max_seq_len.
   kv_cache_max_len: int = 0
-  # The Attention computation will include relative positional bias.
-  relative_attention: bool = False
   # Default batch size of the exported model. Default value is 1.
   batch_size: int = 1
@@ -177,5 +188,13 @@ class ModelConfig:
   def kv_cache_max(self) -> int:
     if self.kv_cache_max_len > 0:
       return self.kv_cache_max_len
-    else:
-      return self.max_seq_len
+    return self.max_seq_len
+  def block_config(self, idx: int) -> TransformerBlockConfig:
+    if isinstance(self.block_configs, TransformerBlockConfig):
+      return self.block_configs
+    if idx < 0 or idx >= len(self.block_configs):
+      raise ValueError(
+          f"Index {idx} is out of range for layer configs: {self.block_configs}"
+      )
+    return self.block_configs[idx]

ai-edge-torch-nightly 0.3.0.dev20240910__py3-none-any.whl → 0.3.0.dev20240913__py3-none-any.whl

ai-edge-torch-nightly 0.3.0.dev20240910__py3-none-any.whl → 0.3.0.dev20240913__py3-none-any.whl