ai-edge-torch-nightly 0.3.0.dev20240910__py3-none-any.whl → 0.3.0.dev20240914__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. ai_edge_torch/_convert/conversion.py +2 -1
  2. ai_edge_torch/_convert/fx_passes/__init__.py +5 -41
  3. ai_edge_torch/_convert/fx_passes/build_aten_composite_pass.py +3 -4
  4. ai_edge_torch/_convert/fx_passes/build_interpolate_composite_pass.py +3 -4
  5. ai_edge_torch/_convert/fx_passes/inject_mlir_debuginfo_pass.py +3 -4
  6. ai_edge_torch/_convert/fx_passes/optimize_layout_transposes_pass/pass_body.py +4 -5
  7. ai_edge_torch/config.py +4 -1
  8. ai_edge_torch/fx_pass_base.py +101 -0
  9. ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +35 -16
  10. ai_edge_torch/generative/examples/gemma/convert_to_tflite.py +29 -10
  11. ai_edge_torch/generative/examples/gemma/gemma.py +52 -32
  12. ai_edge_torch/generative/examples/gemma/gemma2.py +87 -60
  13. ai_edge_torch/generative/examples/{experimental/gemma → openelm}/convert_to_tflite.py +16 -18
  14. ai_edge_torch/generative/examples/openelm/openelm.py +237 -0
  15. ai_edge_torch/generative/examples/{experimental/phi → phi}/convert_to_tflite.py +15 -16
  16. ai_edge_torch/generative/examples/{experimental/phi → phi}/phi2.py +48 -45
  17. ai_edge_torch/generative/examples/{experimental/tiny_llama → smollm}/convert_to_tflite.py +16 -17
  18. ai_edge_torch/generative/examples/smollm/smollm.py +131 -0
  19. ai_edge_torch/generative/examples/stable_diffusion/clip.py +12 -6
  20. ai_edge_torch/generative/examples/stable_diffusion/convert_to_tflite.py +1 -1
  21. ai_edge_torch/generative/examples/t5/convert_to_tflite.py +20 -20
  22. ai_edge_torch/generative/examples/t5/t5.py +43 -30
  23. ai_edge_torch/generative/examples/t5/t5_attention.py +18 -13
  24. ai_edge_torch/generative/examples/test_models/toy_model.py +15 -13
  25. ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +75 -34
  26. ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +29 -10
  27. ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +57 -36
  28. ai_edge_torch/generative/fx_passes/__init__.py +4 -4
  29. ai_edge_torch/generative/fx_passes/remove_sdpa_zero_mask_pass.py +3 -4
  30. ai_edge_torch/generative/layers/attention.py +84 -73
  31. ai_edge_torch/generative/layers/builder.py +38 -14
  32. ai_edge_torch/generative/layers/feed_forward.py +26 -8
  33. ai_edge_torch/generative/layers/kv_cache.py +163 -51
  34. ai_edge_torch/generative/layers/model_config.py +61 -33
  35. ai_edge_torch/generative/layers/normalization.py +158 -0
  36. ai_edge_torch/generative/layers/unet/blocks_2d.py +0 -2
  37. ai_edge_torch/generative/quantize/example.py +2 -2
  38. ai_edge_torch/generative/test/{test_experimental_ekv.py → test_kv_cache.py} +12 -24
  39. ai_edge_torch/generative/test/test_loader.py +1 -1
  40. ai_edge_torch/generative/test/test_model_conversion.py +77 -62
  41. ai_edge_torch/generative/test/test_model_conversion_large.py +61 -68
  42. ai_edge_torch/generative/test/test_quantize.py +5 -5
  43. ai_edge_torch/generative/test/utils.py +54 -0
  44. ai_edge_torch/generative/utilities/loader.py +28 -15
  45. ai_edge_torch/generative/utilities/t5_loader.py +21 -20
  46. ai_edge_torch/odml_torch/export.py +40 -0
  47. ai_edge_torch/odml_torch/lowerings/__init__.py +1 -0
  48. ai_edge_torch/odml_torch/lowerings/_basic.py +44 -0
  49. ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +0 -2
  50. ai_edge_torch/odml_torch/lowerings/_layer_norm.py +78 -0
  51. ai_edge_torch/version.py +1 -1
  52. {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/METADATA +1 -1
  53. {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/RECORD +59 -63
  54. ai_edge_torch/_convert/fx_passes/_pass_base.py +0 -53
  55. ai_edge_torch/_convert/fx_passes/canonicalize_pass.py +0 -35
  56. ai_edge_torch/generative/examples/experimental/gemma/gemma.py +0 -219
  57. ai_edge_torch/generative/examples/experimental/tiny_llama/__init__.py +0 -14
  58. ai_edge_torch/generative/examples/experimental/tiny_llama/tiny_llama.py +0 -205
  59. ai_edge_torch/generative/examples/phi2/__init__.py +0 -14
  60. ai_edge_torch/generative/examples/phi2/convert_to_tflite.py +0 -67
  61. ai_edge_torch/generative/examples/phi2/phi2.py +0 -189
  62. ai_edge_torch/generative/examples/test_models/toy_model_with_external_kv_cache.py +0 -176
  63. /ai_edge_torch/generative/examples/{experimental → openelm}/__init__.py +0 -0
  64. /ai_edge_torch/generative/examples/{experimental/gemma → phi}/__init__.py +0 -0
  65. /ai_edge_torch/generative/examples/{experimental/phi → smollm}/__init__.py +0 -0
  66. {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/LICENSE +0 -0
  67. {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/WHEEL +0 -0
  68. {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/top_level.txt +0 -0
ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py

@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# Example of building a TinyLlama model from the Edge Generative API layers.
+
+"""Example of building a TinyLlama model."""
 
 import os
-from pathlib import Path
+import pathlib
 
 from ai_edge_torch.generative.layers import attention
 from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.utilities.loader as loading_utils
@@ -42,13 +44,12 @@ TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
 )
 
 
-class TinyLLamma(nn.Module):
+class TinyLlama(nn.Module):
   """A TinyLlama model built from the Edge Generative API layers."""
 
   def __init__(self, config: cfg.ModelConfig):
     super().__init__()
 
-    self.config = config
     # Construct model layers.
     self.lm_head = nn.Linear(
         config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
@@ -56,18 +57,20 @@ class TinyLLamma(nn.Module):
     self.tok_embedding = nn.Embedding(
         config.vocab_size, config.embedding_dim, padding_idx=0
     )
+    # TinyLlama has only one block config.
+    block_config = config.block_config(0)
     self.transformer_blocks = nn.ModuleList(
-        attention.TransformerBlock(config) for _ in range(config.num_layers)
+        attention.TransformerBlock(block_config, config)
+        for _ in range(config.num_layers)
     )
     self.final_norm = builder.build_norm(
         config.embedding_dim,
         config.final_norm_config,
     )
+    attn_config = block_config.attn_config
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.kv_cache_max,
-        dim=int(
-            config.attn_config.rotary_percentage * config.attn_config.head_dim
-        ),
+        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
         base=10_000,
         condense_ratio=1,
         dtype=torch.float32,
@@ -80,16 +83,22 @@ class TinyLLamma(nn.Module):
     )
     self.config = config
 
-  # The model's forward function takes in additional k/v cache tensors
-  # and returns the updated k/v cache tensors to the caller.
-  # This can be eliminated if we handle k/v cache updates inside the model itself.
   @torch.inference_mode
-  def forward(self, idx: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor:
-    _, seq_len = idx.size()
+  def forward(
+      self,
+      tokens: torch.Tensor,
+      input_pos: torch.Tensor,
+      kv_cache: kv_utils.KVCache,
+  ) -> dict[torch.Tensor, kv_utils.KVCache]:
+    _, seq_len = tokens.size()
     assert self.config.max_seq_len >= seq_len, (
         f"Cannot forward sequence of length {seq_len}, max seq length is only"
        f" {self.config.max_seq_len}"
     )
+    assert len(self.transformer_blocks) == len(kv_cache.caches), (
+        "The number of transformer blocks and the number of KV cache entries"
+        " must be the same."
+    )
 
     cos, sin = self.rope_cache
     cos = cos.index_select(0, input_pos)
@@ -97,16 +106,20 @@ class TinyLLamma(nn.Module):
     mask = self.mask_cache.index_select(2, input_pos)
     mask = mask[:, :, :, : self.config.kv_cache_max]
 
-    # forward the model itself
-    x = self.tok_embedding(idx)  # token embeddings of shape (b, t, n_embd)
+    # token embeddings of shape (b, t, n_embd)
+    x = self.tok_embedding(tokens)
 
-    for _, block in enumerate(self.transformer_blocks):
-      x = block(x, (cos, sin), mask, input_pos)
+    updated_kv_entires = []
+    for i, block in enumerate(self.transformer_blocks):
+      kv_entry = kv_cache.caches[i] if kv_cache else None
+      x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
+      if kv_entry:
+        updated_kv_entires.append(kv_entry)
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
 
     x = self.final_norm(x)
-
-    res = self.lm_head(x)  # (b, t, vocab_size)
-    return res
+    logits = self.lm_head(x)  # (b, t, vocab_size)
+    return {"logits": logits, "kv_cache": updated_kv_cache}
 
 
 def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
@@ -131,55 +144,63 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       intermediate_size=5632,
   )
   norm_config = cfg.NormalizationConfig(type=cfg.NormalizationType.RMS_NORM)
+  block_config = cfg.TransformerBlockConfig(
+      attn_config=attn_config,
+      ff_config=ff_config,
+      pre_attention_norm_config=norm_config,
+      post_attention_norm_config=norm_config,
+  )
   config = cfg.ModelConfig(
       vocab_size=32000,
       num_layers=22,
       max_seq_len=2048,
       embedding_dim=2048,
       kv_cache_max_len=kv_cache_max_len,
-      attn_config=attn_config,
-      ff_config=ff_config,
-      pre_attention_norm_config=norm_config,
-      post_attention_norm_config=norm_config,
+      block_configs=block_config,
      final_norm_config=norm_config,
       enable_hlfb=True,
   )
   return config
 
 
-def get_fake_model_config() -> cfg.ModelConfig:
-  config = get_model_config()
+def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
+  config = get_model_config(**kwargs)
   config.vocab_size = 128
   config.num_layers = 2
-  config.ff_config.intermediate_size = 64
+  # TinyLlama has only one block config.
+  config.block_config(0).ff_config.intermediate_size = 64
   return config
 
 
 def build_model(checkpoint_path: str, **kwargs) -> nn.Module:
   config = get_model_config(**kwargs)
-  model = TinyLLamma(config)
+  model = TinyLlama(config)
   loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
   loader.load(model)
+  model.eval()
   return model
 
 
-def define_and_run() -> None:
+def define_and_run(checkpoint_path: str) -> None:
   """Instantiates and runs a TinyLlama model."""
 
-  current_dir = Path(__file__).parent.resolve()
+  current_dir = pathlib.Path(__file__).parent.resolve()
   tiny_llama_goldens = torch.load(current_dir / "tiny_llama_lm_logits.pt")
   kv_cache_max_len = 1024
-  checkpoint_path = os.path.join(Path.home(), "Downloads/llm_data/tiny_llama")
   model = build_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
   idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
+  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.int, device="cpu")
  tokens[0, :4] = idx
-  input_pos = torch.arange(0, kv_cache_max_len)
-  lm_logits = model.forward(tokens, input_pos)
+  input_pos = torch.arange(0, kv_cache_max_len, dtype=torch.int)
+  kv = kv_utils.KVCache.from_model_config(model.config)
+  output = model.forward(tokens, input_pos, kv)
   assert torch.allclose(
-      tiny_llama_goldens, lm_logits[0, idx.shape[1] - 1, :], atol=1e-05
+      tiny_llama_goldens, output["logits"][0, idx.shape[1] - 1, :], atol=1e-02
  )
 
 
 if __name__ == "__main__":
-  define_and_run()
+  input_checkpoint_path = os.path.join(
+      pathlib.Path.home(), "Downloads/llm_data/tiny_llama"
+  )
+  define_and_run(input_checkpoint_path)
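
The forward signature above makes the KV cache an explicit input and output, so callers now thread the cache through every call instead of relying on buffers hidden inside the attention layers. Below is a minimal sketch of what a prefill-plus-decode loop looks like against this API; the checkpoint path and the greedy argmax sampling are placeholders for illustration, not part of the package.

import torch

from ai_edge_torch.generative.examples.tiny_llama import tiny_llama
from ai_edge_torch.generative.layers import kv_cache as kv_utils

# Hypothetical checkpoint location; substitute a real TinyLlama checkpoint.
model = tiny_llama.build_model("/path/to/tiny_llama", kv_cache_max_len=1024)
kv = kv_utils.KVCache.from_model_config(model.config)

# Prefill: run the prompt once and keep the returned cache.
prompt = torch.tensor([[1, 2, 3, 4]], dtype=torch.int)
out = model.forward(prompt, torch.arange(0, 4, dtype=torch.int), kv)
kv = out["kv_cache"]

# Decode: feed one token at a time, carrying the cache forward.
token = out["logits"][:, -1, :].argmax(dim=-1, keepdim=True).to(torch.int)
for pos in range(4, 8):
  out = model.forward(token, torch.tensor([pos], dtype=torch.int), kv)
  kv = out["kv_cache"]
  token = out["logits"][:, -1, :].argmax(dim=-1, keepdim=True).to(torch.int)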
ai_edge_torch/generative/fx_passes/__init__.py

@@ -12,16 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-from ai_edge_torch._convert.fx_passes import CanonicalizePass
-from ai_edge_torch._convert.fx_passes import run_passes
-from ai_edge_torch.generative.fx_passes.remove_sdpa_zero_mask_pass import RemoveSDPACompositeZeroMaskPass  # NOQA
+from ai_edge_torch import fx_pass_base
+from ai_edge_torch.fx_pass_base import CanonicalizePass
+from ai_edge_torch.generative.fx_passes.remove_sdpa_zero_mask_pass import RemoveSDPACompositeZeroMaskPass
 import torch
 
 
 def run_generative_passes(
     exported_program: torch.export.ExportedProgram,
 ) -> torch.export.ExportedProgram:
-  return run_passes(
+  return fx_pass_base.run_passes(
       exported_program,
       [
           RemoveSDPACompositeZeroMaskPass(),
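
For context, run_generative_passes still operates on a torch.export.ExportedProgram; only the pass infrastructure moved to the top-level fx_pass_base module. A hedged sketch of invoking it directly (the Toy module and example inputs are made up for illustration; the conversion pipeline normally calls this for you):

import torch

from ai_edge_torch.generative.fx_passes import run_generative_passes


class Toy(torch.nn.Module):

  def forward(self, x):
    return torch.nn.functional.softmax(x, dim=-1)


ep = torch.export.export(Toy(), (torch.randn(1, 8),))
ep = run_generative_passes(ep)  # returns a transformed ExportedProgram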
ai_edge_torch/generative/fx_passes/remove_sdpa_zero_mask_pass.py

@@ -12,13 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch import lowertools
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassBase
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassResult
 import torch
 
 
-class RemoveSDPACompositeZeroMaskPass(ExportedProgramPassBase):
+class RemoveSDPACompositeZeroMaskPass(fx_pass_base.ExportedProgramPassBase):
 
   def is_zero_tensor_node(self, node: torch.fx.Node):
     return node.target == torch.ops.aten.zeros.default
@@ -48,4 +47,4 @@ class RemoveSDPACompositeZeroMaskPass(ExportedProgramPassBase):
 
     exported_program.graph_module.graph.lint()
     exported_program.graph_module.recompile()
-    return ExportedProgramPassResult(exported_program, True)
+    return fx_pass_base.ExportedProgramPassResult(exported_program, True)
ai_edge_torch/generative/layers/attention.py

@@ -12,16 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# Common building blocks for Attention layer.
 
-from typing import Optional, Tuple
+"""Common building blocks for Attention layer."""
 
-import ai_edge_torch.generative.layers.builder as builder
-from ai_edge_torch.generative.layers.kv_cache import KVCache
+from typing import Optional, Tuple, Union
+
+from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
+from ai_edge_torch.generative.layers import scaled_dot_product_attention as sdpa
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
-from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_dot_product_attention  # NOQA
-from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_dot_product_attention_with_hlfb  # NOQA
 import torch
 from torch import nn
 
@@ -55,29 +55,35 @@ def _embed_rope(
 
 class TransformerBlock(nn.Module):
 
-  def __init__(self, config: cfg.ModelConfig) -> None:
+  def __init__(
+      self,
+      config: cfg.TransformerBlockConfig,
+      model_config: cfg.ModelConfig,
+  ) -> None:
     """Initialize an instance of the TransformerBlock.
 
     Args:
-      config (cfg.ModelConfig): the configuration object for this transformer
-        block.
+      config (cfg.TransformerBlockConfig): the configuration object for this
+        transformer block.
+      model_config (cfg.ModelConfig): the configuration object for the model
+        this transformer block belongs to.
     """
-
     super().__init__()
     self.pre_atten_norm = builder.build_norm(
-        config.embedding_dim, config.pre_attention_norm_config
+        model_config.embedding_dim,
+        config.pre_attention_norm_config,
     )
     self.atten_func = CausalSelfAttention(
-        config.batch_size,
-        config.embedding_dim,
+        model_config.batch_size,
+        model_config.embedding_dim,
         config.attn_config,
-        config.kv_cache_max,
-        config.enable_hlfb,
+        model_config.enable_hlfb,
     )
     self.post_atten_norm = builder.build_norm(
-        config.embedding_dim, config.post_attention_norm_config
+        model_config.embedding_dim,
+        config.post_attention_norm_config,
     )
-    self.ff = builder.build_ff(config.embedding_dim, config.ff_config)
+    self.ff = builder.build_ff(model_config.embedding_dim, config.ff_config)
     self.config = config
 
   def forward(
@@ -86,7 +92,8 @@ class TransformerBlock(nn.Module):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
      mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
-  ) -> torch.Tensor:
+      kv_cache: kv_utils.KVCacheEntry = None,
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the TransformerBlock.
 
     Args:
@@ -94,24 +101,34 @@ class TransformerBlock(nn.Module):
      rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): the optional kv cache entry.
 
     Returns:
-      output activation from this transformer block.
+      output activation from this transformer block, and updated kv cache (if
+      passed in).
     """
-
+    kv = None
    if self.config.parallel_residual:
       x_norm = self.pre_atten_norm(x)
-      attn_out = self.atten_func(x_norm, rope, mask, input_pos)
+      atten_func_out = self.atten_func(x_norm, rope, mask, input_pos, kv_cache)
+      if kv_cache is None:
+        attn_out = atten_func_out
+      else:
+        attn_out, kv = atten_func_out
       ff_out = self.ff(x_norm)
       output = x + attn_out + ff_out
     else:
       x_norm = self.pre_atten_norm(x)
-      attn_out = self.atten_func(x_norm, rope, mask, input_pos)
+      atten_func_out = self.atten_func(x_norm, rope, mask, input_pos, kv_cache)
+      if kv_cache is None:
+        attn_out = atten_func_out
+      else:
+        attn_out, kv = atten_func_out
       x = x + attn_out
       x_norm = self.post_atten_norm(x)
       output = x + self.ff(x_norm)
 
-    return output
+    return output if kv is None else (output, kv)
 
 
 class CausalSelfAttention(nn.Module):
@@ -121,7 +138,6 @@ class CausalSelfAttention(nn.Module):
       batch_size: int,
       dim: int,
       config: cfg.AttentionConfig,
-      kv_cache_max: int,
       enable_hlfb: bool,
   ) -> None:
     """Initialize an instance of CausalSelfAttention.
@@ -130,12 +146,9 @@ class CausalSelfAttention(nn.Module):
       batch_size (int): batch size of the input tensor.
       dim (int): causal attention's input/output dimmension.
       config (cfg.AttentionConfig): attention specific configurations.
-      kv_cache_max (int): determines the size of the KV Cache buffer, if
-        enabled.
       enable_hlfb (bool): whether hlfb is enabled or not.
     """
     super().__init__()
-    self.config = config
     self.kv_cache = None
     self.batch_size = batch_size
     qkv_shape = (
@@ -147,21 +160,17 @@ class CausalSelfAttention(nn.Module):
     self.output_projection = nn.Linear(
         output_shape, dim, bias=config.output_proj_use_bias
     )
-
-    # Build a k/v cache with size (batch_size, kv_cache_max, n_heads, head_dim).
-    if config.enable_kv_cache:
-      self.kv_cache = KVCache(
-          batch_size,
-          kv_cache_max,
-          config.num_query_groups,
-          config.head_dim,
-          enable_hlfb,
-      )
-
-    if enable_hlfb:
-      self.sdpa_func = scaled_dot_product_attention_with_hlfb
-    else:
-      self.sdpa_func = scaled_dot_product_attention
+    self.query_norm = builder.build_norm(
+        config.head_dim, config.query_norm_config
+    )
+    self.key_norm = builder.build_norm(config.head_dim, config.key_norm_config)
+    self.config = config
+    self.enable_hlfb = enable_hlfb
+    self.sdpa_func = (
+        sdpa.scaled_dot_product_attention_with_hlfb
+        if enable_hlfb
+        else sdpa.scaled_dot_product_attention
+    )
 
   def forward(
       self,
@@ -169,7 +178,8 @@ class CausalSelfAttention(nn.Module):
      rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
-  ) -> torch.Tensor:
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the CausalSelfAttention layer, which can support
 
     MQA, GQA and MHA.
@@ -179,9 +189,11 @@ class CausalSelfAttention(nn.Module):
      rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.
 
     Returns:
-      output activation from this self attention layer.
+      output activation from this self attention layer, and the updated
+      KV Cach Entry (if passed in).
     """
     # Batch size, sequence length, embedding dimensionality.
     B, T, E = x.size()
@@ -216,6 +228,9 @@ class CausalSelfAttention(nn.Module):
         dim=-1,
     )
 
+    q = self.query_norm(q)
+    k = self.key_norm(k)
+
     q = q.reshape(B, T, -1, self.config.head_dim)
     k = k.reshape(B, T, -1, self.config.head_dim)
     v = v.reshape(B, T, -1, self.config.head_dim)
@@ -224,9 +239,11 @@ class CausalSelfAttention(nn.Module):
       n_elem = int(self.config.rotary_percentage * self.config.head_dim)
       q, k = _embed_rope(q, k, n_elem, rope)
 
-    if self.kv_cache is not None:
-      # TODO(haoliang): Handle when execeeding max sequence length.
-      k, v = self.kv_cache.update_cache(input_pos, k, v)
+    if kv_cache is not None:
+      kv_cache = kv_utils.update(
+          kv_cache, input_pos, k, v, enable_hlfb=self.enable_hlfb
+      )
+      k, v = kv_cache.k_cache, kv_cache.v_cache
 
     y = self.sdpa_func(
         q,
@@ -240,7 +257,7 @@
 
     # Compute the output projection.
     y = self.output_projection(y)
-    return y
+    return y if kv_cache is None else (y, kv_cache)
 
 
 class SelfAttention(CausalSelfAttention):
@@ -251,16 +268,19 @@ class SelfAttention(CausalSelfAttention):
      x: torch.Tensor,
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       input_pos: Optional[torch.Tensor] = None,
-  ) -> torch.Tensor:
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the SelfAttention layer, which can support MQA, GQA and MHA.
 
     Args:
      x (torch.Tensor): the input tensor.
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.
 
     Returns:
-      output activation from this self attention layer.
+      output activation from this self attention layer, and the updated
+      KV Cach Entry (if passed in).
     """
     B, T, _ = x.size()
     return super().forward(
@@ -279,9 +299,8 @@ class CrossAttention(nn.Module):
      query_dim: int,
       cross_dim: int,
       config: cfg.AttentionConfig,
-      kv_cache_max: int,
       enable_hlfb: bool,
-  ) -> None:
+  ):
     """Initialize an instance of CrossAttention.
 
     Args:
@@ -289,8 +308,6 @@ class CrossAttention(nn.Module):
      query_dim (int): query tensor's dimension.
       cross_dim (int): cross attention's dimensions, for key and value tensors.
       config (cfg.AttentionConfig): attention specific configurations.
-      kv_cache_max (int): determines the size of the KV Cache buffer, if
-        enabled.
       enable_hlfb (bool): whether hlfb is enabled or not.
     """
     super().__init__()
@@ -309,21 +326,11 @@ class CrossAttention(nn.Module):
        query_dim, query_dim, bias=config.output_proj_use_bias
     )
 
-    self.kv_cache = None
-    # Build a k/v cache with size (batch_size, kv_cache_max, n_heads, head_dim).
-    if config.enable_kv_cache:
-      self.kv_cache = KVCache(
-          batch_size,
-          kv_cache_max,
-          config.num_query_groups,
-          self.config.head_dim,
-          enable_hlfb,
-      )
-
-    if enable_hlfb:
-      self.sdpa_func = scaled_dot_product_attention_with_hlfb
-    else:
-      self.sdpa_func = scaled_dot_product_attention
+    self.sdpa_func = (
+        sdpa.scaled_dot_product_attention_with_hlfb
+        if enable_hlfb
+        else sdpa.scaled_dot_product_attention
+    )
 
   def forward(
       self,
@@ -332,6 +339,7 @@ class CrossAttention(nn.Module):
      rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
   ):
     """Forward function of the CrossAttention layer.
 
@@ -342,6 +350,7 @@ class CrossAttention(nn.Module):
      mask (torch.Tensor): the optional mask tensor can be broadcaseted to shape
         [B, n_heads, target_seq_len, source_seq_len].
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.
 
     Returns:
       output activation from this cross attention layer.
@@ -363,9 +372,11 @@ class CrossAttention(nn.Module):
      n_elem = int(self.config.rotary_percentage * self.config.head_dim)
       q, k = _embed_rope(q, k, n_elem, rope)
 
-    if self.kv_cache is not None:
-      # TODO(haoliang): Handle when execeeding max sequence length.
-      k, v = self.kv_cache.update_cache(input_pos, k, v)
+    if kv_cache is not None:
+      kv_cache = kv_utils.update(
+          kv_cache, input_pos, k, v, enable_hlfb=self.enable_hlfb
+      )
+      k, v = kv_cache.k_cache, kv_cache.v_cache
     if mask is None:
       mask = torch.zeros(
           (batch_size, 1, target_seq_len, source_seq_len), dtype=torch.float32
@@ -375,4 +386,4 @@
 
     # Compute the output projection.
     y = self.output_projection(y)
-    return y
+    return y if kv_cache is None else (y, kv_cache)
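
Because the per-layer cache is now passed in rather than owned by the layer, a block's return type depends on whether an entry was supplied. A rough sketch of exercising a single TransformerBlock with the fake TinyLlama config; the single-token shapes and the all-zero mask are illustrative choices, not taken from the package.

import torch

import ai_edge_torch.generative.layers.attention_utils as attn_utils
from ai_edge_torch.generative.examples.tiny_llama import tiny_llama
from ai_edge_torch.generative.layers import attention
from ai_edge_torch.generative.layers import kv_cache as kv_utils

config = tiny_llama.get_fake_model_config(kv_cache_max_len=16)
block_config = config.block_config(0)
block = attention.TransformerBlock(block_config, config)
cache = kv_utils.KVCache.from_model_config(config)

# One query token at position 0, plus the matching rope slice and a zero mask.
input_pos = torch.tensor([0], dtype=torch.int)
attn_config = block_config.attn_config
cos, sin = attn_utils.build_rope_cache(
    size=config.kv_cache_max,
    dim=int(attn_config.rotary_percentage * attn_config.head_dim),
    base=10_000,
    condense_ratio=1,
    dtype=torch.float32,
)
rope = (cos.index_select(0, input_pos), sin.index_select(0, input_pos))
mask = torch.zeros((1, 1, 1, config.kv_cache_max), dtype=torch.float32)
x = torch.zeros((1, 1, config.embedding_dim))

# Without a cache entry the block returns only the activation;
# with an entry it returns (activation, updated KVCacheEntry).
out = block(x, rope, mask, input_pos)
out, updated_entry = block(x, rope, mask, input_pos, cache.caches[0])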
ai_edge_torch/generative/layers/builder.py

@@ -13,6 +13,8 @@
 # limitations under the License.
 # ==============================================================================
 # Builder class for individual components.
+from typing import Callable
+
 import ai_edge_torch.generative.layers.feed_forward as feed_forward
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.layers.normalization as normalization
@@ -21,20 +23,34 @@ from torch import nn
 import torch.nn.functional as F
 
 
-class GeGLU(nn.Module):
-  """GeGLU is an activation function which is a variant of GELU.
+def build_glu(
+    act: Callable[[torch.Tensor], torch.Tensor], gate_is_front: bool = False
+) -> Callable[[torch.Tensor], torch.Tensor]:
+  """Builds an activation function with GLU (Gated Linear Unit).
+
+  If gate_is_front is True,
+    f(x) = act(x) * y
+  otherwise,
+    f(x) = x * act(y),
+  where x is the first half of the input and y is the second half of the input.
+
+  Args:
+    act (Callable[[torch.Tensor], torch.Tensor]): activation function to apply
+      to the gate.
+    gate_is_front: whether the gate is in front half of the input. Other part is
+      the output in GLU.
 
-  GeGLU(x) = (xW+b) * GELU(xV+c)
-  See: https://arxiv.org/abs/2002.05202v1
+  Returns:
+    A callable activation function with GLU.
   """
 
-  def __init__(self, d_in: int, d_out: int):
-    super().__init__()
-    self.proj = nn.Linear(d_in, d_out * 2)
+  def _glu(x):
+    x, y = x.chunk(2, dim=-1)
+    if gate_is_front:
+      return act(x) * y
+    return x * act(y)
 
-  def forward(self, x: torch.Tensor):
-    x, gate = self.proj(x).chunk(2, dim=-1)
-    return x * F.gelu(gate)
+  return _glu
 
 
 def build_norm(dim: int, config: cfg.NormalizationConfig):
@@ -59,9 +75,11 @@ def build_norm(dim: int, config: cfg.NormalizationConfig):
        zero_centered_gamma=config.zero_centered,
     )
   elif config.type == cfg.NormalizationType.LAYER_NORM:
-    return nn.LayerNorm(dim, eps=config.epsilon)
+    return normalization.LayerNorm(dim, config.epsilon, config.enable_hlfb)
   elif config.type == cfg.NormalizationType.GROUP_NORM:
-    return nn.GroupNorm(config.group_num, dim, config.epsilon)
+    return normalization.GroupNorm(
+        config.group_num, dim, config.epsilon, config.enable_hlfb
+    )
   else:
     raise ValueError("Unsupported norm type.")
 
@@ -71,7 +89,7 @@ def build_ff(dim: int, config: cfg.FeedForwardConfig):
 
   Args:
     dim (int): dimension of the input tensor.
-    config (`ModelConfig` object): the model configuration.
+    config (`FeedForwardConfig` object): the model configuration.
 
   Returns:
     The constructed `nn.Module` feedforward layer.
@@ -97,6 +115,10 @@ def build_ff(dim: int, config: cfg.FeedForwardConfig):
      hidden_dim=config.intermediate_size,
       activation=activation,
       use_bias=config.use_bias,
+      use_glu=(
+          config.activation.type == cfg.ActivationType.GE_GLU
+          or config.activation.type == cfg.ActivationType.SILU_GLU
+      ),
       pre_ff_norm=pre_ff_norm,
       post_ff_norm=post_ff_norm,
  )
@@ -127,8 +149,10 @@ def get_activation(config: cfg.ActivationConfig):
    # See: https://github.com/hendrycks/GELUs
     return lambda x: x * F.sigmoid(1.702 * x)
   elif config.type == cfg.ActivationType.GE_GLU:
-    return GeGLU(config.dim_in, config.dim_out)
+    return build_glu(F.gelu, config.gate_is_front)
   elif config.type == cfg.ActivationType.RELU:
     return F.relu
+  elif config.type == cfg.ActivationType.SILU_GLU:
+    return build_glu(F.silu, config.gate_is_front)
   else:
     raise ValueError("Unsupported activation type.")
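
Note that the projection the old GeGLU module owned now has to come from the feed-forward layer itself (hence the new use_glu flag passed to build_ff); build_glu only performs the chunk-and-gate step. A small sketch of the gating behaviour, with made-up tensor sizes:

import torch
import torch.nn.functional as F

from ai_edge_torch.generative.layers.builder import build_glu

geglu = build_glu(F.gelu)  # gate_is_front=False: x * gelu(y)
silu_glu = build_glu(F.silu, gate_is_front=True)  # silu(x) * y

h = torch.randn(2, 4, 16)  # last dim must be even; output is half its size
assert geglu(h).shape == (2, 4, 8)

# Matches the removed GeGLU module once the preceding Linear doubles the width.
x, y = h.chunk(2, dim=-1)
assert torch.allclose(geglu(h), x * F.gelu(y))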