ai-edge-torch-nightly 0.3.0.dev20240813__py3-none-any.whl → 0.3.0.dev20240817__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)
  1. ai_edge_torch/generative/examples/experimental/gemma/gemma.py +2 -2
  2. ai_edge_torch/generative/examples/experimental/tiny_llama/tiny_llama.py +2 -2
  3. ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +67 -0
  4. ai_edge_torch/generative/examples/gemma/gemma.py +3 -2
  5. ai_edge_torch/generative/examples/gemma/gemma2.py +250 -0
  6. ai_edge_torch/generative/examples/stable_diffusion/clip.py +2 -2
  7. ai_edge_torch/generative/examples/t5/t5.py +4 -4
  8. ai_edge_torch/generative/examples/t5/t5_attention.py +3 -3
  9. ai_edge_torch/generative/examples/test_models/toy_model.py +1 -1
  10. ai_edge_torch/generative/examples/test_models/toy_model_with_external_kv_cache.py +1 -1
  11. ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +1 -1
  12. ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +2 -2
  13. ai_edge_torch/generative/layers/attention.py +12 -5
  14. ai_edge_torch/generative/layers/attention_utils.py +30 -0
  15. ai_edge_torch/generative/layers/builder.py +5 -0
  16. ai_edge_torch/generative/layers/feed_forward.py +15 -3
  17. ai_edge_torch/generative/layers/model_config.py +35 -13
  18. ai_edge_torch/generative/layers/scaled_dot_product_attention.py +25 -9
  19. ai_edge_torch/generative/test/test_model_conversion.py +29 -1
  20. ai_edge_torch/generative/utilities/loader.py +29 -7
  21. ai_edge_torch/generative/utilities/t5_loader.py +8 -8
  22. ai_edge_torch/hlfb/test/test_mark_pattern.py +32 -8
  23. ai_edge_torch/hlfb/test/test_stablehlo_composite_builder.py +5 -0
  24. ai_edge_torch/lowertools/__init__.py +1 -0
  25. ai_edge_torch/lowertools/odml_torch_utils.py +3 -0
  26. ai_edge_torch/lowertools/test_utils.py +60 -0
  27. ai_edge_torch/version.py +1 -1
  28. {ai_edge_torch_nightly-0.3.0.dev20240813.dist-info → ai_edge_torch_nightly-0.3.0.dev20240817.dist-info}/METADATA +1 -1
  29. {ai_edge_torch_nightly-0.3.0.dev20240813.dist-info → ai_edge_torch_nightly-0.3.0.dev20240817.dist-info}/RECORD +32 -29
  30. {ai_edge_torch_nightly-0.3.0.dev20240813.dist-info → ai_edge_torch_nightly-0.3.0.dev20240817.dist-info}/LICENSE +0 -0
  31. {ai_edge_torch_nightly-0.3.0.dev20240813.dist-info → ai_edge_torch_nightly-0.3.0.dev20240817.dist-info}/WHEEL +0 -0
  32. {ai_edge_torch_nightly-0.3.0.dev20240813.dist-info → ai_edge_torch_nightly-0.3.0.dev20240817.dist-info}/top_level.txt +0 -0

ai_edge_torch/generative/examples/experimental/gemma/gemma.py
@@ -40,7 +40,7 @@ TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
  attn_value_proj="model.layers.{}.self_attn.v_proj",
  attn_output_proj="model.layers.{}.self_attn.o_proj",
  pre_attn_norm="model.layers.{}.input_layernorm",
- pre_ff_norm="model.layers.{}.post_attention_layernorm",
+ post_attn_norm="model.layers.{}.post_attention_layernorm",
  embedding="model.embed_tokens",
  final_norm="model.norm",
  lm_head=None,
@@ -150,7 +150,7 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
  attn_config=attn_config,
  ff_config=ff_config,
  pre_attention_norm_config=norm_config,
- pre_ff_norm_config=norm_config,
+ post_attention_norm_config=norm_config,
  final_norm_config=norm_config,
  parallel_residual=False,
  lm_head_use_bias=False,

ai_edge_torch/generative/examples/experimental/tiny_llama/tiny_llama.py
@@ -41,7 +41,7 @@ TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
  attn_value_proj="model.layers.{}.self_attn.v_proj",
  attn_output_proj="model.layers.{}.self_attn.o_proj",
  pre_attn_norm="model.layers.{}.input_layernorm",
- pre_ff_norm="model.layers.{}.post_attention_layernorm",
+ post_attn_norm="model.layers.{}.post_attention_layernorm",
  embedding="model.embed_tokens",
  final_norm="model.norm",
  lm_head="lm_head",
@@ -142,7 +142,7 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
  attn_config=attn_config,
  ff_config=ff_config,
  pre_attention_norm_config=norm_config,
- pre_ff_norm_config=norm_config,
+ post_attention_norm_config=norm_config,
  final_norm_config=norm_config,
  enable_hlfb=True,
  )

ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py (new file)
@@ -0,0 +1,67 @@
+ # Copyright 2024 The AI Edge Torch Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
+ import os
+ from pathlib import Path
+
+ import ai_edge_torch
+ from ai_edge_torch.generative.examples.gemma import gemma2
+ from ai_edge_torch.generative.quantize import quant_recipes
+ import torch
+
+
+ def convert_gemma_to_tflite(
+     checkpoint_path: str,
+     prefill_seq_len: int = 512,
+     kv_cache_max_len: int = 1024,
+     quantize: bool = True,
+ ):
+   """Converting a Gemma 2 2B model to multi-signature
+   tflite model.
+
+   Args:
+     checkpoint_path (str): The filepath to the model checkpoint, or directory holding the checkpoint.
+     prefill_seq_len (int, optional): The maximum size of prefill input tensor.
+       Defaults to 512.
+     kv_cache_max_len (int, optional): The maximum size of KV cache buffer,
+       including both prefill and decode. Defaults to 1024.
+     quantize (bool, optional): Whether the model should be quanized.
+       Defaults to True.
+   """
+   pytorch_model = gemma2.build_2b_model(
+       checkpoint_path, kv_cache_max_len=kv_cache_max_len
+   )
+   # Tensors used to trace the model graph during conversion.
+   prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.long)
+   prefill_input_pos = torch.arange(0, prefill_seq_len)
+   decode_token = torch.tensor([[0]], dtype=torch.long)
+   decode_input_pos = torch.tensor([0], dtype=torch.int64)
+
+   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
+   edge_model = (
+       ai_edge_torch.signature(
+           'prefill', pytorch_model, (prefill_tokens, prefill_input_pos)
+       )
+       .signature('decode', pytorch_model, (decode_token, decode_input_pos))
+       .convert(quant_config=quant_config)
+   )
+   edge_model.export(
+       f'/tmp/gemma2_seq{prefill_seq_len}_kv{kv_cache_max_len}.tflite'
+   )
+
+
+ if __name__ == '__main__':
+   checkpoint_path = os.path.join(Path.home(), 'Downloads/llm_data/gemma2-2b')
+   convert_gemma_to_tflite(checkpoint_path)
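
Note: the script above exports a single .tflite file with two signatures, 'prefill' and 'decode'. As a rough illustration only (not part of this diff, assuming TensorFlow is installed and the default export path above), the exported signatures can be inspected and driven with the TF Lite interpreter; the exact input names depend on the exported signature, so check get_signature_list() before calling:

import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(
    model_path='/tmp/gemma2_seq512_kv1024.tflite'
)
# Lists the 'prefill' and 'decode' signatures with their input/output names.
print(interpreter.get_signature_list())

prefill = interpreter.get_signature_runner('prefill')
tokens = np.zeros((1, 512), dtype=np.int64)
input_pos = np.arange(512, dtype=np.int64)
# Call with keyword arguments matching the names printed above, e.g.:
# logits = prefill(tokens=tokens, input_pos=input_pos)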

ai_edge_torch/generative/examples/gemma/gemma.py
@@ -35,7 +35,7 @@ TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
  attn_value_proj="model.layers.{}.self_attn.v_proj",
  attn_output_proj="model.layers.{}.self_attn.o_proj",
  pre_attn_norm="model.layers.{}.input_layernorm",
- pre_ff_norm="model.layers.{}.post_attention_layernorm",
+ post_attn_norm="model.layers.{}.post_attention_layernorm",
  embedding="model.embed_tokens",
  final_norm="model.norm",
  lm_head=None,
@@ -138,7 +138,7 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
  attn_config=attn_config,
  ff_config=ff_config,
  pre_attention_norm_config=norm_config,
- pre_ff_norm_config=norm_config,
+ post_attention_norm_config=norm_config,
  final_norm_config=norm_config,
  parallel_residual=False,
  lm_head_use_bias=False,
@@ -160,6 +160,7 @@ def build_2b_model(checkpoint_path, **kwargs) -> nn.Module:
  # since embedding and lm-head use the same weight, we need to set strict
  # to False.
  loader.load(model, strict=False)
+ model.eval()
  return model



ai_edge_torch/generative/examples/gemma/gemma2.py (new file)
@@ -0,0 +1,250 @@
+ # Copyright 2024 The AI Edge Torch Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+ # Example of building the Gemma2 2B model.
+
+ import os
+ from pathlib import Path
+ from typing import Optional, Tuple
+
+ from ai_edge_torch.generative.layers.attention import TransformerBlock
+ import ai_edge_torch.generative.layers.attention_utils as attn_utils
+ import ai_edge_torch.generative.layers.builder as builder
+ import ai_edge_torch.generative.layers.model_config as cfg
+ import ai_edge_torch.generative.utilities.loader as loading_utils
+ import numpy as np
+ import torch
+ import torch.nn as nn
+
+ TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
+     ff_up_proj="model.layers.{}.mlp.up_proj",
+     ff_down_proj="model.layers.{}.mlp.down_proj",
+     ff_gate_proj="model.layers.{}.mlp.gate_proj",
+     attn_fused_qkv_proj="model.layers.{}.self_attn.qkv_proj",
+     attn_output_proj="model.layers.{}.self_attn.o_proj",
+     pre_attn_norm="model.layers.{}.input_layernorm",
+     post_attn_norm="model.layers.{}.post_attention_layernorm",
+     pre_ff_norm="model.layers.{}.pre_feedforward_layernorm",
+     post_ff_norm="model.layers.{}.post_feedforward_layernorm",
+     embedding="embedder",
+     final_norm="model.norm",
+     lm_head=None,
+ )
+
+
+ class Gemma2Block(TransformerBlock):
+
+   def forward(
+       self,
+       x: torch.Tensor,
+       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+       mask: Optional[torch.Tensor] = None,
+       input_pos: Optional[torch.Tensor] = None,
+   ) -> torch.Tensor:
+     """Forward function of the Gemma2Block.
+
+     Exactly the same as TransformerBlock but we call the post-attention norm
+     immediately after attention and not after the residual pointwise addition.
+
+     Args:
+       x (torch.Tensor): the input tensor.
+       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
+       mask (torch.Tensor): the optional mask tensor.
+       input_pos (torch.Tensor): the optional input position tensor.
+
+     Returns:
+       output activation from this transformer block.
+     """
+
+     x_norm = self.pre_atten_norm(x)
+     attn_out = self.atten_func(x_norm, rope, mask, input_pos)
+     attn_out_norm = self.post_atten_norm(attn_out)
+     x = x + attn_out_norm
+     output = x + self.ff(x)
+     return output
+
+
+ class Gemma2(nn.Module):
+
+   def __init__(self, config: cfg.ModelConfig):
+     super().__init__()
+
+     self.config = config
+     # Construct model layers.
+     self.tok_embedding = nn.Embedding(
+         config.vocab_size, config.embedding_dim, padding_idx=0
+     )
+     self.lm_head = nn.Linear(
+         config.embedding_dim,
+         config.vocab_size,
+         bias=config.lm_head_use_bias,
+     )
+     # Gemma re-uses the embedding as the head projection layer.
+     self.lm_head.weight.data = self.tok_embedding.weight.data
+     self.transformer_blocks = nn.ModuleList(
+         Gemma2Block(config) for _ in range(config.num_layers)
+     )
+     self.final_norm = builder.build_norm(
+         config.embedding_dim,
+         config.final_norm_config,
+     )
+     self.rope_cache = attn_utils.build_rope_cache(
+         size=config.kv_cache_max,
+         dim=int(
+             config.attn_config.rotary_percentage * config.attn_config.head_dim
+         ),
+         base=10_000,
+         condense_ratio=1,
+         dtype=torch.float32,
+         device=torch.device("cpu"),
+     )
+     self.mask_cache = attn_utils.build_causal_mask_cache(
+         size=config.kv_cache_max,
+         dtype=torch.float32,
+         device=torch.device("cpu"),
+     )
+
+     self.sliding_window_mask_cache = attn_utils.build_sliding_window_mask_cache(
+         size=config.kv_cache_max,
+         window_size=self.config.attn_config.sliding_window_size,
+         dtype=torch.float32,
+         device=torch.device("cpu"),
+     )
+
+     self.config = config
+
+   def get_attention_mask(
+       self, idx: int, input_pos: torch.Tensor
+   ) -> torch.Tensor:
+     if self.config.attn_config.attn_types:
+       if (
+           self.config.attn_config.attn_types[idx]
+           == cfg.AttentionType.LOCAL_SLIDING
+       ):
+         return self.sliding_window_mask_cache.index_select(2, input_pos)
+
+     return self.mask_cache.index_select(2, input_pos)
+
+   @torch.inference_mode
+   def forward(self, idx: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor:
+     B, T = idx.size()
+     assert self.config.max_seq_len >= T, (
+         f"Cannot forward sequence of length {T}, max seq length is only"
+         f" {self.config.max_seq_len}"
+     )
+
+     cos, sin = self.rope_cache
+     cos = cos.index_select(0, input_pos)
+     sin = sin.index_select(0, input_pos)
+
+     # token embeddings of shape (b, t, n_embd)
+     x = self.tok_embedding(idx)
+     x = x * (self.config.embedding_dim**0.5)
+
+     for i, block in enumerate(self.transformer_blocks):
+       mask = self.get_attention_mask(i, input_pos)
+       x = block(x, (cos, sin), mask, input_pos)
+
+     x = self.final_norm(x)
+     res = self.lm_head(x)  # (b, t, vocab_size)
+     if self.config.final_logit_softcap is not None:
+       res = res / self.config.final_logit_softcap
+       res = torch.tanh(res)
+       res = res * self.config.final_logit_softcap
+     return res
+
+
+ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+   attn_config = cfg.AttentionConfig(
+       num_heads=8,
+       head_dim=256,
+       num_query_groups=4,
+       rotary_percentage=1.0,
+       qkv_transpose_before_split=True,
+       logit_softcap=50.0,
+       sliding_window_size=4096,
+       attn_types=[cfg.AttentionType.GLOBAL, cfg.AttentionType.LOCAL_SLIDING]
+       * 13,
+   )
+
+   norm_config = cfg.NormalizationConfig(
+       type=cfg.NormalizationType.RMS_NORM,
+       epsilon=1e-6,
+       zero_centered=True,
+   )
+   ff_config = cfg.FeedForwardConfig(
+       type=cfg.FeedForwardType.GATED,
+       activation=cfg.ActivationConfig(cfg.ActivationType.GELU_TANH),
+       intermediate_size=9216,
+       pre_ff_norm_config=norm_config,
+       post_ff_norm_config=norm_config,
+   )
+   config = cfg.ModelConfig(
+       vocab_size=256000,
+       num_layers=26,
+       max_seq_len=8192,
+       embedding_dim=2304,
+       kv_cache_max_len=kv_cache_max_len,
+       attn_config=attn_config,
+       ff_config=ff_config,
+       pre_attention_norm_config=norm_config,
+       post_attention_norm_config=norm_config,
+       final_norm_config=norm_config,
+       parallel_residual=False,
+       lm_head_use_bias=False,
+       enable_hlfb=False,
+       final_logit_softcap=30.0,
+   )
+   return config
+
+
+ def get_fake_model_config_2b_for_test() -> cfg.ModelConfig:
+   config = get_model_config_2b()
+   config.num_layers = 2
+   return config
+
+
+ def build_2b_model(checkpoint_path, **kwargs) -> nn.Module:
+   config = get_model_config_2b(**kwargs)
+   model = Gemma2(config)
+   loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
+   # since embedding and lm-head use the same weight, we need to set strict
+   # to False.
+   loader.load(model, strict=False)
+   model.eval()
+   return model
+
+
+ def define_and_run_2b() -> None:
+   current_dir = Path(__file__).parent.resolve()
+   gemma2_goldens = torch.load(current_dir / "gemma2it_2b_golden.pt")
+   print("Running GEMMA 2")
+   kv_cache_max_len = 1024
+   checkpoint_path = os.path.join(Path.home(), "Downloads/llm_data/gemma2-2b")
+   model = build_2b_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
+   toks = torch.from_numpy(
+       np.array([2, 651, 9456, 576, 573, 3520, 3858, 603, 235248])
+   )
+   tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
+   tokens[0, :9] = toks
+   input_pos = torch.arange(0, kv_cache_max_len)
+   out = model.forward(tokens, input_pos)
+   out_final = out[0, 8, :]
+   assert torch.allclose(gemma2_goldens, out_final, atol=1e-04)
+   print(out)
+
+
+ if __name__ == "__main__":
+   torch.set_printoptions(sci_mode=True)
+   define_and_run_2b()
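
Note: in Gemma2.forward above, the final logits are soft-capped with a tanh before being returned (final_logit_softcap=30.0 in the 2B config; the per-layer attention scores use the same trick with logit_softcap=50.0). The three inline lines are equivalent to this small helper, shown as a sketch for illustration only and not part of the package:

import torch

def soft_cap(logits: torch.Tensor, cap: float) -> torch.Tensor:
  # Scale down, squash with tanh, scale back up: values stay in (-cap, cap)
  # while small logits pass through nearly unchanged.
  return cap * torch.tanh(logits / cap)

# soft_cap(res, 30.0) matches the res / cap -> tanh -> * cap sequence above.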

ai_edge_torch/generative/examples/stable_diffusion/clip.py
@@ -35,7 +35,7 @@ TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
  pre_attn_norm=(
  "cond_stage_model.transformer.text_model.encoder.layers.{}.layer_norm1"
  ),
- pre_ff_norm=(
+ post_attn_norm=(
  "cond_stage_model.transformer.text_model.encoder.layers.{}.layer_norm2"
  ),
  embedding=(
@@ -120,7 +120,7 @@ def get_model_config() -> cfg.ModelConfig:
  attn_config=attn_config,
  ff_config=ff_config,
  pre_attention_norm_config=norm_config,
- pre_ff_norm_config=norm_config,
+ post_attention_norm_config=norm_config,
  final_norm_config=norm_config,
  enable_hlfb=True,
  )

ai_edge_torch/generative/examples/t5/t5.py
@@ -38,7 +38,7 @@ ENCDEC_TENSOR_NAMES = {
  "{prefix}.block.0.layer.0.SelfAttention.relative_attention_bias"
  ),
  "pre_attn_norm": "{prefix}.block.{}.layer.0.layer_norm",
- "pre_ff_norm": "{prefix}.block.{}.layer.1.layer_norm",
+ "post_attn_norm": "{prefix}.block.{}.layer.1.layer_norm",
  "final_norm": "{prefix}.final_layer_norm",
  }

@@ -396,7 +396,7 @@ def get_model_config_t5() -> cfg.ModelConfig:
  relative_attention=True,
  ff_config=ff_config,
  pre_attention_norm_config=norm_config,
- pre_ff_norm_config=norm_config,
+ post_attention_norm_config=norm_config,
  final_norm_config=norm_config,
  parallel_residual=False,
  lm_head_use_bias=False,
@@ -419,7 +419,7 @@ def build_t5_model(checkpoint_path: str) -> nn.Module:
  "cross_attn_value_proj": "{prefix}.block.{}.layer.1.EncDecAttention.v",
  "cross_attn_output_proj": "{prefix}.block.{}.layer.1.EncDecAttention.o",
  # In the decoder, the FF is layer 2 in the Transformer block
- "pre_ff_norm": "{prefix}.block.{}.layer.2.layer_norm",
+ "post_attn_norm": "{prefix}.block.{}.layer.2.layer_norm",
  # In the decoder, the cross attention is layer 1 in the Transformer block
  "pre_cross_attn_norm": "{prefix}.block.{}.layer.1.layer_norm",
  }
@@ -475,7 +475,7 @@ def build_t5_decoder_model(
  "cross_attn_value_proj": "{prefix}.block.{}.layer.1.EncDecAttention.v",
  "cross_attn_output_proj": "{prefix}.block.{}.layer.1.EncDecAttention.o",
  # In the decoder, the FF is layer 2 in the Transformer block
- "pre_ff_norm": "{prefix}.block.{}.layer.2.layer_norm",
+ "post_attn_norm": "{prefix}.block.{}.layer.2.layer_norm",
  # In the decoder, the cross attention is layer 1 in the Transformer block
  "pre_cross_attn_norm": "{prefix}.block.{}.layer.1.layer_norm",
  }

ai_edge_torch/generative/examples/t5/t5_attention.py
@@ -68,8 +68,8 @@ class EncoderDecoderBlock(nn.Module):
  else:
  self.cross_atten_func = None

- self.pre_ff_norm = builder.build_norm(
- config.embedding_dim, config.pre_ff_norm_config
+ self.post_atten_norm = builder.build_norm(
+ config.embedding_dim, config.post_attention_norm_config
  )
  self.ff = builder.build_ff(config.embedding_dim, config.ff_config)
  self.config = config
@@ -118,7 +118,7 @@ class EncoderDecoderBlock(nn.Module):
  )
  attn_out = hidden_states + attn_out

- forwarded = self.pre_ff_norm(attn_out)
+ forwarded = self.post_atten_norm(attn_out)
  forwarded = self.ff(forwarded)
  hidden_states = attn_out + forwarded


ai_edge_torch/generative/examples/test_models/toy_model.py
@@ -93,7 +93,7 @@ def get_model_config() -> cfg.ModelConfig:
  attn_config=attn_config,
  ff_config=ff_config,
  pre_attention_norm_config=norm_config,
- pre_ff_norm_config=norm_config,
+ post_attention_norm_config=norm_config,
  final_norm_config=norm_config,
  )
  return config

ai_edge_torch/generative/examples/test_models/toy_model_with_external_kv_cache.py
@@ -107,7 +107,7 @@ def get_model_config() -> cfg.ModelConfig:
  attn_config=attn_config,
  ff_config=ff_config,
  pre_attention_norm_config=norm_config,
- pre_ff_norm_config=norm_config,
+ post_attention_norm_config=norm_config,
  final_norm_config=norm_config,
  enable_hlfb=True,
  )

ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py
@@ -94,7 +94,7 @@ def get_model_config() -> cfg.ModelConfig:
  attn_config=attn_config,
  ff_config=ff_config,
  pre_attention_norm_config=norm_config,
- pre_ff_norm_config=norm_config,
+ post_attention_norm_config=norm_config,
  final_norm_config=norm_config,
  enable_hlfb=True,
  )

ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py
@@ -35,7 +35,7 @@ TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
  attn_value_proj="model.layers.{}.self_attn.v_proj",
  attn_output_proj="model.layers.{}.self_attn.o_proj",
  pre_attn_norm="model.layers.{}.input_layernorm",
- pre_ff_norm="model.layers.{}.post_attention_layernorm",
+ post_attn_norm="model.layers.{}.post_attention_layernorm",
  embedding="model.embed_tokens",
  final_norm="model.norm",
  lm_head="lm_head",
@@ -130,7 +130,7 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
  attn_config=attn_config,
  ff_config=ff_config,
  pre_attention_norm_config=norm_config,
- pre_ff_norm_config=norm_config,
+ post_attention_norm_config=norm_config,
  final_norm_config=norm_config,
  enable_hlfb=True,
  )

ai_edge_torch/generative/layers/attention.py
@@ -74,8 +74,8 @@ class TransformerBlock(nn.Module):
  config.kv_cache_max,
  config.enable_hlfb,
  )
- self.pre_ff_norm = builder.build_norm(
- config.embedding_dim, config.pre_ff_norm_config
+ self.post_atten_norm = builder.build_norm(
+ config.embedding_dim, config.post_attention_norm_config
  )
  self.ff = builder.build_ff(config.embedding_dim, config.ff_config)
  self.config = config
@@ -108,7 +108,7 @@ class TransformerBlock(nn.Module):
  x_norm = self.pre_atten_norm(x)
  attn_out = self.atten_func(x_norm, rope, mask, input_pos)
  x = x + attn_out
- x_norm = self.pre_ff_norm(x)
+ x_norm = self.post_atten_norm(x)
  output = x + self.ff(x_norm)

  return output
@@ -228,8 +228,15 @@ class CausalSelfAttention(nn.Module):
  # TODO(haoliang): Handle when execeeding max sequence length.
  k, v = self.kv_cache.update_cache(input_pos, k, v)

- y = self.sdpa_func(q, k, v, self.config.head_dim, mask=mask)
- y = y.reshape(B, T, E)
+ y = self.sdpa_func(
+ q,
+ k,
+ v,
+ self.config.head_dim,
+ mask=mask,
+ softcap=self.config.logit_softcap,
+ )
+ y = y.reshape(B, T, -1)

  # Compute the output projection.
  y = self.output_projection(y)
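
Note: CausalSelfAttention now forwards config.logit_softcap to sdpa_func; the capping itself lives in scaled_dot_product_attention.py (+25 -9 in this release, hunk not shown here). For orientation only, here is a generic sketch of how a logit softcap is typically folded into scaled dot-product attention; it is not claimed to match the library's implementation line for line:

import math
from typing import Optional
import torch

def sdpa_with_softcap(
    q: torch.Tensor,  # (B, n_heads, T, head_dim)
    k: torch.Tensor,
    v: torch.Tensor,
    head_dim: int,
    mask: Optional[torch.Tensor] = None,
    softcap: Optional[float] = None,
) -> torch.Tensor:
  scores = q @ k.transpose(-2, -1) / math.sqrt(head_dim)
  if softcap is not None:
    # Cap the attention logits before masking and softmax.
    scores = softcap * torch.tanh(scores / softcap)
  if mask is not None:
    scores = scores + mask
  return torch.softmax(scores, dim=-1) @ v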

ai_edge_torch/generative/layers/attention_utils.py
@@ -74,12 +74,42 @@ def build_causal_mask_cache(
  Returns:
  torch.Tensor: Causal attention mask.
  """
+
  if device is None:
  device = torch.device('cpu')
  mask = torch.full((size, size), float('-inf'), dtype=dtype, device=device)
  return torch.triu(mask, diagonal=1).unsqueeze(0).unsqueeze(0)


+ def build_sliding_window_mask_cache(
+ size: int,
+ window_size: int,
+ dtype: torch.dtype = torch.float32,
+ device: torch.device = None,
+ ) -> torch.Tensor:
+ """Build a cache for a sliding window mask.
+
+ Args:
+ size (int): The size of the built mask cache.
+ window_size (int): The window size that is "seen" by a token.
+ dtype (torch.dtype, optional): Output tensor's data type. Defaults to
+ torch.float32.
+ device (torch.device, optional): Output tensor's data type. Defaults to
+ None in which case "cpu" is used.
+
+ Returns:
+ torch.Tensor: Causal attention mask.
+ """
+
+ mask = build_causal_mask_cache(size, dtype, device)
+ all_ones = torch.ones_like(mask)
+ window_size = min(size, window_size)
+ sliding_mask = torch.triu(all_ones, -1 * window_size + 1) * torch.tril(
+ all_ones, window_size - 1
+ )
+ return torch.where(sliding_mask == 1, mask, -2.3819763e38)
+
+
  def relative_position_bucket(
  relative_position: torch.Tensor,
  bidirectional: bool,
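
Note: a quick way to see what build_sliding_window_mask_cache produces (illustrative values only; Gemma2 above builds it with size=kv_cache_max and window_size=4096):

import ai_edge_torch.generative.layers.attention_utils as attn_utils

mask = attn_utils.build_sliding_window_mask_cache(size=6, window_size=3)
print(mask.shape)  # torch.Size([1, 1, 6, 6])
# mask[0, 0] is 0.0 where a token may attend (itself plus the previous
# window_size - 1 positions) and a large negative number everywhere else,
# so those positions vanish after the softmax.
print(mask[0, 0])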

ai_edge_torch/generative/layers/builder.py
@@ -89,11 +89,16 @@ def build_ff(dim: int, config: cfg.FeedForwardConfig):

  activation = get_activation(config.activation)

+ pre_ff_norm = build_norm(dim, config.pre_ff_norm_config)
+ post_ff_norm = build_norm(dim, config.post_ff_norm_config)
+
  return ff_module(
  dim=dim,
  hidden_dim=config.intermediate_size,
  activation=activation,
  use_bias=config.use_bias,
+ pre_ff_norm=pre_ff_norm,
+ post_ff_norm=post_ff_norm,
  )



ai_edge_torch/generative/layers/feed_forward.py
@@ -14,7 +14,7 @@
  # ==============================================================================
  # Common building blocks for FeedForward layers.

- from typing import Callable
+ from typing import Callable, Optional

  import torch
  from torch import nn
@@ -30,6 +30,8 @@ class SequentialFeedForward(nn.Module):
  hidden_dim: int,
  activation: Callable[[torch.Tensor], torch.Tensor],
  use_bias=False,
+ pre_ff_norm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
+ post_ff_norm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
  ):
  """Init function for feedforward layer.

@@ -41,6 +43,8 @@ class SequentialFeedForward(nn.Module):
  self.act = activation
  self.w1 = nn.Linear(dim, hidden_dim, bias=use_bias)
  self.w2 = nn.Linear(hidden_dim, dim, bias=use_bias)
+ self.pre_ff_norm = pre_ff_norm if pre_ff_norm else lambda x: x
+ self.post_ff_norm = post_ff_norm if post_ff_norm else lambda x: x

  def forward(self, x):
  """Forward pass for Feedforward layer.
@@ -51,7 +55,9 @@ class SequentialFeedForward(nn.Module):
  Returns:
  torch.Tensor: output tensor after feedforward.
  """
- return self.w2(self.act(self.w1(x)))
+ x_norm = self.pre_ff_norm(x)
+ out = self.w2(self.act(self.w1(x_norm)))
+ return self.post_ff_norm(out)


  class GatedFeedForward(nn.Module):
@@ -66,6 +72,8 @@ class GatedFeedForward(nn.Module):
  hidden_dim: int,
  activation: Callable[[torch.Tensor], torch.Tensor],
  use_bias=False,
+ pre_ff_norm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
+ post_ff_norm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
  ):
  """Init function for feedforward layer.

@@ -78,6 +86,8 @@ class GatedFeedForward(nn.Module):
  self.w1 = nn.Linear(dim, hidden_dim, bias=use_bias)
  self.w2 = nn.Linear(hidden_dim, dim, bias=use_bias)
  self.w3 = nn.Linear(dim, hidden_dim, bias=use_bias)
+ self.pre_ff_norm = pre_ff_norm if pre_ff_norm else lambda x: x
+ self.post_ff_norm = post_ff_norm if post_ff_norm else lambda x: x

  def forward(self, x):
  """Forward pass for Feedforward layer.
@@ -88,4 +98,6 @@ class GatedFeedForward(nn.Module):
  Returns:
  torch.Tensor: output tensor after feedforward.
  """
- return self.w2(self.act(self.w1(x)) * self.w3(x))
+ x_norm = self.pre_ff_norm(x)
+ out = self.w2(self.act(self.w1(x_norm)) * self.w3(x_norm))
+ return self.post_ff_norm(out)
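
Note: both feed-forward variants now accept optional pre_ff_norm / post_ff_norm callables and fall back to identity lambdas, so call sites that pass nothing keep the previous behavior; builder.build_ff (above) is what wires the configured norms in. A minimal stand-alone sketch of the new constructor arguments, using torch.nn.LayerNorm as a stand-in for the builder-produced RMSNorm:

import torch
from torch import nn
from ai_edge_torch.generative.layers.feed_forward import GatedFeedForward

x = torch.randn(2, 4, 8)

# No norms supplied: computes w2(act(w1(x)) * w3(x)) exactly as before.
ff = GatedFeedForward(dim=8, hidden_dim=16, activation=nn.functional.silu)
print(ff(x).shape)  # torch.Size([2, 4, 8])

# With norms supplied, the block becomes post_ff_norm(core(pre_ff_norm(x))).
ff_normed = GatedFeedForward(
    dim=8,
    hidden_dim=16,
    activation=nn.functional.silu,
    pre_ff_norm=nn.LayerNorm(8),
    post_ff_norm=nn.LayerNorm(8),
)
print(ff_normed(x).shape)  # torch.Size([2, 4, 8])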