PyPI - ai-edge-torch-nightly - Versions diffs - 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

Files changed (169) hide show

ai_edge_torch/generative/layers/attention.py CHANGED Viewed

@@ -12,20 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# Common building blocks for Attention layer.
-from typing import Optional, Tuple
+"""Common building blocks for Attention layer."""
-import torch
-from torch import nn
-import torch.nn.functional as F
+from typing import Optional, Tuple, Union
-import ai_edge_torch.generative.layers.builder as builder
-from ai_edge_torch.generative.layers.kv_cache import KVCache
+from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
+from ai_edge_torch.generative.layers import scaled_dot_product_attention as sdpa
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
-from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_dot_product_attention  # NOQA
-from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_dot_product_attention_with_hlfb  # NOQA
+import torch
+from torch import nn
 def _embed_rope(
@@ -57,29 +55,35 @@ def _embed_rope(
 class TransformerBlock(nn.Module):
-  def __init__(self, config: cfg.ModelConfig) -> None:
+  def __init__(
+      self,
+      config: cfg.TransformerBlockConfig,
+      model_config: cfg.ModelConfig,
+  ) -> None:
     """Initialize an instance of the TransformerBlock.
     Args:
-      config (cfg.ModelConfig): the configuration object
-        for this transformer block.
+      config (cfg.TransformerBlockConfig): the configuration object for this
+        transformer block.
+      model_config (cfg.ModelConfig): the configuration object for the model
+        this transformer block belongs to.
     """
     super().__init__()
     self.pre_atten_norm = builder.build_norm(
-        config.embedding_dim, config.pre_attention_norm_config
+        model_config.embedding_dim,
+        config.pre_attention_norm_config,
     )
     self.atten_func = CausalSelfAttention(
-        config.batch_size,
-        config.embedding_dim,
+        model_config.batch_size,
+        model_config.embedding_dim,
         config.attn_config,
-        config.kv_cache_max,
-        config.enable_hlfb,
+        model_config.enable_hlfb,
     )
-    self.pre_ff_norm = builder.build_norm(
-        config.embedding_dim, config.pre_ff_norm_config
+    self.post_atten_norm = builder.build_norm(
+        model_config.embedding_dim,
+        config.post_attention_norm_config,
     )
-    self.ff = builder.build_ff(config.embedding_dim, config.ff_config)
+    self.ff = builder.build_ff(model_config.embedding_dim, config.ff_config)
     self.config = config
   def forward(
@@ -88,7 +92,8 @@ class TransformerBlock(nn.Module):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
-  ) -> torch.Tensor:
+      kv_cache: kv_utils.KVCacheEntry = None,
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the TransformerBlock.
     Args:
@@ -96,24 +101,34 @@ class TransformerBlock(nn.Module):
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): the optional kv cache entry.
     Returns:
-      output activation from this transformer block.
+      output activation from this transformer block, and updated kv cache (if
+      passed in).
     """
+    kv = None
     if self.config.parallel_residual:
       x_norm = self.pre_atten_norm(x)
-      attn_out = self.atten_func(x_norm, rope, mask, input_pos)
+      atten_func_out = self.atten_func(x_norm, rope, mask, input_pos, kv_cache)
+      if kv_cache is None:
+        attn_out = atten_func_out
+      else:
+        attn_out, kv = atten_func_out
       ff_out = self.ff(x_norm)
       output = x + attn_out + ff_out
     else:
       x_norm = self.pre_atten_norm(x)
-      attn_out = self.atten_func(x_norm, rope, mask, input_pos)
+      atten_func_out = self.atten_func(x_norm, rope, mask, input_pos, kv_cache)
+      if kv_cache is None:
+        attn_out = atten_func_out
+      else:
+        attn_out, kv = atten_func_out
       x = x + attn_out
-      x_norm = self.pre_ff_norm(x)
+      x_norm = self.post_atten_norm(x)
       output = x + self.ff(x_norm)
-    return output
+    return output if kv is None else (output, kv)
 class CausalSelfAttention(nn.Module):
@@ -123,7 +138,6 @@ class CausalSelfAttention(nn.Module):
       batch_size: int,
       dim: int,
       config: cfg.AttentionConfig,
-      kv_cache_max: int,
       enable_hlfb: bool,
   ) -> None:
     """Initialize an instance of CausalSelfAttention.
@@ -132,33 +146,31 @@ class CausalSelfAttention(nn.Module):
       batch_size (int): batch size of the input tensor.
       dim (int): causal attention's input/output dimmension.
       config (cfg.AttentionConfig): attention specific configurations.
-      kv_cache_max (int): determines the size of the KV Cache buffer, if enabled.
       enable_hlfb (bool): whether hlfb is enabled or not.
     """
     super().__init__()
-    self.head_dim = dim // config.num_heads
-    shape = (config.num_heads + 2 * config.num_query_groups) * self.head_dim
-    # Key, query, value projections for all heads.
-    self.qkv_projection = nn.Linear(dim, shape, bias=config.qkv_use_bias)
-    self.output_projection = nn.Linear(dim, dim, bias=config.output_proj_use_bias)
-    self.config = config
     self.kv_cache = None
     self.batch_size = batch_size
-    # Build a k/v cache with size (batch_size, kv_cache_max, n_heads, head_dim).
-    if config.enable_kv_cache:
-      self.kv_cache = KVCache(
-          batch_size,
-          kv_cache_max,
-          config.num_query_groups,
-          self.head_dim,
-          enable_hlfb,
-      )
-    if enable_hlfb:
-      self.sdpa_func = scaled_dot_product_attention_with_hlfb
-    else:
-      self.sdpa_func = scaled_dot_product_attention
+    qkv_shape = (
+        config.num_heads + 2 * config.num_query_groups
+    ) * config.head_dim
+    output_shape = config.num_heads * config.head_dim
+    # Key, query, value projections for all heads.
+    self.qkv_projection = nn.Linear(dim, qkv_shape, bias=config.qkv_use_bias)
+    self.output_projection = nn.Linear(
+        output_shape, dim, bias=config.output_proj_use_bias
+    )
+    self.query_norm = builder.build_norm(
+        config.head_dim, config.query_norm_config
+    )
+    self.key_norm = builder.build_norm(config.head_dim, config.key_norm_config)
+    self.config = config
+    self.enable_hlfb = enable_hlfb
+    self.sdpa_func = (
+        sdpa.scaled_dot_product_attention_with_hlfb
+        if enable_hlfb
+        else sdpa.scaled_dot_product_attention
+    )
   def forward(
       self,
@@ -166,8 +178,10 @@ class CausalSelfAttention(nn.Module):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
-  ) -> torch.Tensor:
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the CausalSelfAttention layer, which can support
        MQA, GQA and MHA.
     Args:
@@ -175,15 +189,18 @@ class CausalSelfAttention(nn.Module):
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.
     Returns:
-      output activation from this self attention layer.
+      output activation from this self attention layer, and the updated
+        KV Cache Entry (if passed in).
     """
     # Batch size, sequence length, embedding dimensionality.
     B, T, E = x.size()
-    assert (
-        B == self.batch_size
-    ), "batch size of input tensor must match with the batch size specified in the model configuration."
+    assert B == self.batch_size, (
+        "batch size of input tensor must match with the batch size specified in"
+        " the model configuration."
+    )
     qkv = self.qkv_projection(x)
@@ -191,7 +208,7 @@ class CausalSelfAttention(nn.Module):
     q_per_kv = self.config.num_heads // self.config.num_query_groups
     # Each group has >=1 queries, 1 key, and 1 value.
     if self.config.qkv_transpose_before_split:
-      qkv = qkv.view(B, T, -1, self.head_dim)
+      qkv = qkv.view(B, T, -1, self.config.head_dim)
       q, k, v = qkv.split(
           (
               q_per_kv * self.config.num_query_groups,
@@ -203,27 +220,44 @@ class CausalSelfAttention(nn.Module):
     else:
       qkv = qkv.view(B, T, self.config.num_query_groups, -1)
       q, k, v = qkv.split(
-          (q_per_kv * self.head_dim, self.head_dim, self.head_dim), dim=-1
+          (
+              q_per_kv * self.config.head_dim,
+              self.config.head_dim,
+              self.config.head_dim,
+          ),
+          dim=-1,
       )
-    q = q.reshape(B, T, -1, self.head_dim)
-    k = k.reshape(B, T, -1, self.head_dim)
-    v = v.reshape(B, T, -1, self.head_dim)
+    q = self.query_norm(q)
+    k = self.key_norm(k)
+    q = q.reshape(B, T, -1, self.config.head_dim)
+    k = k.reshape(B, T, -1, self.config.head_dim)
+    v = v.reshape(B, T, -1, self.config.head_dim)
     # Compute rotary positional embedding for query and key.
-    n_elem = int(self.config.rotary_percentage * self.head_dim)
+    n_elem = int(self.config.rotary_percentage * self.config.head_dim)
     q, k = _embed_rope(q, k, n_elem, rope)
-    if self.kv_cache is not None:
-      # TODO(haoliang): Handle when execeeding max sequence length.
-      k, v = self.kv_cache.update_cache(input_pos, k, v)
-    y = self.sdpa_func(q, k, v, self.head_dim, mask=mask)
-    y = y.reshape(B, T, E)
+    if kv_cache is not None:
+      kv_cache = kv_utils.update(
+          kv_cache, input_pos, k, v, enable_hlfb=self.enable_hlfb
+      )
+      k, v = kv_cache.k_cache, kv_cache.v_cache
+    y = self.sdpa_func(
+        q,
+        k,
+        v,
+        self.config.head_dim,
+        mask=mask,
+        softcap=self.config.logit_softcap,
+    )
+    y = y.reshape(B, T, -1)
     # Compute the output projection.
     y = self.output_projection(y)
-    return y
+    return y if kv_cache is None else (y, kv_cache)
 class SelfAttention(CausalSelfAttention):
@@ -234,16 +268,19 @@ class SelfAttention(CausalSelfAttention):
       x: torch.Tensor,
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       input_pos: Optional[torch.Tensor] = None,
-  ) -> torch.Tensor:
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the SelfAttention layer, which can support MQA, GQA and MHA.
     Args:
       x (torch.Tensor): the input tensor.
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.
     Returns:
-      output activation from this self attention layer.
+      output activation from this self attention layer, and the updated
+        KV Cache Entry (if passed in).
     """
     B, T, _ = x.size()
     return super().forward(
@@ -261,46 +298,43 @@ class CrossAttention(nn.Module):
       batch_size: int,
       query_dim: int,
       cross_dim: int,
+      hidden_dim: int,
+      output_dim: int,
       config: cfg.AttentionConfig,
-      kv_cache_max: int,
       enable_hlfb: bool,
-  ) -> None:
+  ):
     """Initialize an instance of CrossAttention.
     Args:
       batch_size (int): batch size of the input tensor.
       query_dim (int): query tensor's dimension.
       cross_dim (int): cross attention's dimensions, for key and value tensors.
+      hidden_dim (int): hidden dimension that q, k, v tensors project to.
+      output_dim (int): output tensor's dimension.
       config (cfg.AttentionConfig): attention specific configurations.
-      kv_cache_max (int): determines the size of the KV Cache buffer, if enabled.
       enable_hlfb (bool): whether hlfb is enabled or not.
     """
     super().__init__()
     self.config = config
-    self.head_dim = query_dim // config.num_heads
     self.n_heads = config.num_heads
-    self.q_projection = nn.Linear(query_dim, query_dim, bias=config.qkv_use_bias)
-    self.k_projection = nn.Linear(cross_dim, query_dim, bias=config.qkv_use_bias)
-    self.v_projection = nn.Linear(cross_dim, query_dim, bias=config.qkv_use_bias)
+    self.q_projection = nn.Linear(
+        query_dim, hidden_dim, bias=config.qkv_use_bias
+    )
+    self.k_projection = nn.Linear(
+        cross_dim, hidden_dim, bias=config.qkv_use_bias
+    )
+    self.v_projection = nn.Linear(
+        cross_dim, hidden_dim, bias=config.qkv_use_bias
+    )
     self.output_projection = nn.Linear(
-        query_dim, query_dim, bias=config.output_proj_use_bias
+        hidden_dim, output_dim, bias=config.output_proj_use_bias
     )
-    self.kv_cache = None
-    # Build a k/v cache with size (batch_size, kv_cache_max, n_heads, head_dim).
-    if config.enable_kv_cache:
-      self.kv_cache = KVCache(
-          batch_size,
-          kv_cache_max,
-          config.num_query_groups,
-          self.head_dim,
-          enable_hlfb,
-      )
-    if enable_hlfb:
-      self.sdpa_func = scaled_dot_product_attention_with_hlfb
-    else:
-      self.sdpa_func = scaled_dot_product_attention
+    self.sdpa_func = (
+        sdpa.scaled_dot_product_attention_with_hlfb
+        if enable_hlfb
+        else sdpa.scaled_dot_product_attention
+    )
   def forward(
       self,
@@ -309,6 +343,7 @@ class CrossAttention(nn.Module):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
   ):
     """Forward function of the CrossAttention layer.
@@ -316,8 +351,10 @@ class CrossAttention(nn.Module):
       x (torch.Tensor): the target tensor, with shape [B, target_seq_len, ...].
       y (torch.Tensor): the source tensor, with shape [B, source_seq_len, ...].
       rope (Tuple[torch.Tensor, torch.Tensor]): the optional input rope tensor.
-      mask (torch.Tensor): the optional mask tensor can be broadcaseted to shape [B, n_heads, target_seq_len, source_seq_len].
+      mask (torch.Tensor): the optional mask tensor can be broadcasted to shape
+        [B, n_heads, target_seq_len, source_seq_len].
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.
     Returns:
       output activation from this cross attention layer.
@@ -330,25 +367,27 @@ class CrossAttention(nn.Module):
     k = self.k_projection(y)
     v = self.v_projection(y)
-    interim_shape = (batch_size, -1, self.n_heads, self.head_dim)
+    interim_shape = (batch_size, -1, self.n_heads, self.config.head_dim)
     q = q.view(interim_shape)
     k = k.view(interim_shape)
     v = v.view(interim_shape)
     # Compute rotary positional embedding for query and key.
-    n_elem = int(self.config.rotary_percentage * self.head_dim)
+    n_elem = int(self.config.rotary_percentage * self.config.head_dim)
     q, k = _embed_rope(q, k, n_elem, rope)
-    if self.kv_cache is not None:
-      # TODO(haoliang): Handle when execeeding max sequence length.
-      k, v = self.kv_cache.update_cache(input_pos, k, v)
+    if kv_cache is not None:
+      kv_cache = kv_utils.update(
+          kv_cache, input_pos, k, v, enable_hlfb=self.enable_hlfb
+      )
+      k, v = kv_cache.k_cache, kv_cache.v_cache
     if mask is None:
       mask = torch.zeros(
           (batch_size, 1, target_seq_len, source_seq_len), dtype=torch.float32
       )
-    y = self.sdpa_func(q, k, v, self.head_dim, mask=mask)
+    y = self.sdpa_func(q, k, v, self.config.head_dim, mask=mask)
     y = y.reshape(batch_size, target_seq_len, -1)
     # Compute the output projection.
     y = self.output_projection(y)
-    return y
+    return y if kv_cache is None else (y, kv_cache)

ai_edge_torch/generative/layers/attention_utils.py CHANGED Viewed

@@ -28,7 +28,9 @@ def build_rope_cache(
     dtype: torch.dtype = torch.float32,
     device: torch.device = None,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
-  """Precompute Rotary Positional Embedding Sin and Cos values for quick lookups
+  """Precomputes Rotary Positional Embeddings.
+  Precompute Rotary Positional Embedding Sin and Cos values for quick lookup
   during the inference.
   Args:
@@ -72,28 +74,64 @@ def build_causal_mask_cache(
   Returns:
       torch.Tensor: Causal attention mask.
   """
   if device is None:
     device = torch.device('cpu')
   mask = torch.full((size, size), float('-inf'), dtype=dtype, device=device)
   return torch.triu(mask, diagonal=1).unsqueeze(0).unsqueeze(0)
+def build_sliding_window_mask_cache(
+    size: int,
+    window_size: int,
+    dtype: torch.dtype = torch.float32,
+    device: torch.device = None,
+) -> torch.Tensor:
+  """Build a cache for a sliding window mask.
+  Args:
+      size (int): The size of the built mask cache.
+      window_size (int): The window size that is "seen" by a token.
+      dtype (torch.dtype, optional): Output tensor's data type. Defaults to
+        torch.float32.
+      device (torch.device, optional): Output tensor's device. Defaults to
+        None in which case "cpu" is used.
+  Returns:
+      torch.Tensor: Causal attention mask restricted to a sliding window.
+  """
+  mask = build_causal_mask_cache(size, dtype, device)
+  all_ones = torch.ones_like(mask)
+  window_size = min(size, window_size)
+  sliding_mask = torch.triu(all_ones, -1 * window_size + 1) * torch.tril(
+      all_ones, window_size - 1
+  )
+  return torch.where(sliding_mask == 1, mask, -2.3819763e38)
 def relative_position_bucket(
     relative_position: torch.Tensor,
     bidirectional: bool,
     num_buckets: int,
     max_distance: int,
 ) -> torch.Tensor:
-  """
-  Adapted from Mesh Tensorflow:
+  """Adapted from Mesh Tensorflow:
   https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
-  Translate relative position to a bucket number for relative attention. The relative position is defined as
-  memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
-  position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
-  small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
-  positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
-  This should allow for more graceful generalization to longer sequences than the model has been trained on
+  Translate relative position to a bucket number for relative attention. The
+  relative position is defined as
+  memory_position - query_position, i.e. the distance in tokens from the
+  attending position to the attended-to
+  position. If bidirectional=False, then positive relative positions are
+  invalid. We use smaller buckets for
+  small absolute relative_position and larger buckets for larger absolute
+  relative_positions. All relative
+  positions >=max_distance map to the same bucket. All relative positions
+  <=-max_distance map to the same bucket.
+  This should allow for more graceful generalization to longer sequences than
+  the model has been trained on
   Args:
       relative_position: an int32 Tensor
@@ -102,7 +140,8 @@ def relative_position_bucket(
       max_distance: an integer for max distance.
   Returns:
-      a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
+      a Tensor with the same shape as relative_position, containing int32 values
+      in the range [0, num_buckets)
   """
   relative_buckets = 0
   if bidirectional:
@@ -119,7 +158,8 @@ def relative_position_bucket(
   max_exact = num_buckets // 2
   is_small = relative_position < max_exact
-  # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
+  # The other half of the buckets are for logarithmically bigger bins in
+  # positions up to max_distance
   relative_position_if_large = max_exact + (
       torch.log(relative_position.float() / max_exact)
       / math.log(max_distance / max_exact)
@@ -148,7 +188,8 @@ def build_relative_position_buckets(
   Args:
     query_length: an integer of length of current query tensor.
     key_length: an integer of length of current key tensor.
-    bidirectional: a boolean - whether the attention is bidirectional, default is True.
+    bidirectional: a boolean - whether the attention is bidirectional, default
+      is True.
     num_buckets: an integer for number of buckets, default is 32.
     max_distance: an integer for max distance, default is 128.

ai_edge_torch/generative/layers/builder.py CHANGED Viewed

@@ -13,13 +13,14 @@
 # limitations under the License.
 # ==============================================================================
 # Builder class for individual components.
-import torch
-from torch import nn
-import torch.nn.functional as F
+from typing import Callable
 import ai_edge_torch.generative.layers.feed_forward as feed_forward
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.layers.normalization as normalization
+import torch
+from torch import nn
+import torch.nn.functional as F
 class GeGLU(nn.Module):
@@ -27,7 +28,6 @@ class GeGLU(nn.Module):
   GeGLU(x) = (xW+b) * GELU(xV+c)
   See: https://arxiv.org/abs/2002.05202v1
   """
   def __init__(self, d_in: int, d_out: int):
@@ -39,6 +39,21 @@ class GeGLU(nn.Module):
     return x * F.gelu(gate)
+class SwiGLU(nn.Module):
+  """SwiGLU is an activation function which is a variant of GLU.
+  SwiGLU is the same as SiLU_GLU, because the SiLU function is also known as the
+  swish function.
+  SwiGLU(x) = Swish(xW+b) * (xV+c)
+  See: https://paperswithcode.com/method/swiglu
+  """
+  def forward(self, x: torch.Tensor):
+    x, y = x.chunk(2, dim=-1)
+    return F.silu(x) * y
 def build_norm(dim: int, config: cfg.NormalizationConfig):
   """Builder function for normalizers.
@@ -61,9 +76,13 @@ def build_norm(dim: int, config: cfg.NormalizationConfig):
         zero_centered_gamma=config.zero_centered,
     )
   elif config.type == cfg.NormalizationType.LAYER_NORM:
-    return nn.LayerNorm(dim, eps=config.epsilon)
+    return normalization.LayerNorm(
+        dim, config.epsilon, config.enable_hlfb, config.use_input_shape
+    )
   elif config.type == cfg.NormalizationType.GROUP_NORM:
-    return nn.GroupNorm(config.group_num, dim, config.epsilon)
+    return normalization.GroupNorm(
+        config.group_num, dim, config.epsilon, config.enable_hlfb
+    )
   else:
     raise ValueError("Unsupported norm type.")
@@ -73,7 +92,7 @@ def build_ff(dim: int, config: cfg.FeedForwardConfig):
   Args:
     dim (int): dimension of the input tensor.
-    config (`ModelConfig` object): the model configuration.
+    config (`FeedForwardConfig` object): the model configuration.
   Returns:
     The constructed `nn.Module` feedforward layer.
@@ -91,11 +110,20 @@ def build_ff(dim: int, config: cfg.FeedForwardConfig):
   activation = get_activation(config.activation)
+  pre_ff_norm = build_norm(dim, config.pre_ff_norm_config)
+  post_ff_norm = build_norm(dim, config.post_ff_norm_config)
   return ff_module(
       dim=dim,
       hidden_dim=config.intermediate_size,
       activation=activation,
       use_bias=config.use_bias,
+      use_glu=(
+          config.activation.type == cfg.ActivationType.GE_GLU
+          or config.activation.type == cfg.ActivationType.SILU_GLU
+      ),
+      pre_ff_norm=pre_ff_norm,
+      post_ff_norm=post_ff_norm,
   )
@@ -127,5 +155,7 @@ def get_activation(config: cfg.ActivationConfig):
     return GeGLU(config.dim_in, config.dim_out)
   elif config.type == cfg.ActivationType.RELU:
     return F.relu
+  elif config.type == cfg.ActivationType.SILU_GLU:
+    return SwiGLU()
   else:
     raise ValueError("Unsupported activation type.")

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl