PyPI - ai-edge-torch-nightly - Versions diffs - 0.3.0.dev20240913__py3-none-any.whl → 0.3.0.dev20240915__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.3.0.dev20240913__py3-none-any.whl → 0.3.0.dev20240915__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

ai_edge_torch/generative/examples/phi/convert_to_tflite.py CHANGED Viewed

@@ -47,10 +47,10 @@ def convert_phi2_to_tflite(
       checkpoint_path, kv_cache_max_len=kv_cache_max_len
   )
   # Tensors used to trace the model graph during conversion.
-  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.long)
-  prefill_input_pos = torch.arange(0, prefill_seq_len)
-  decode_token = torch.tensor([[0]], dtype=torch.long)
-  decode_input_pos = torch.tensor([0], dtype=torch.int64)
+  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.int)
+  prefill_input_pos = torch.arange(0, prefill_seq_len, dtype=torch.int)
+  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
   kv = kv_cache.KVCache.from_model_config(pytorch_model.config)
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None

ai_edge_torch/generative/examples/phi/phi2.py CHANGED Viewed

@@ -192,9 +192,9 @@ def define_and_run(checkpoint_path: str) -> None:
   kv_cache_max_len = 1024
   model = build_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
   idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
+  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.int, device="cpu")
   tokens[0, :4] = idx
-  input_pos = torch.arange(0, kv_cache_max_len)
+  input_pos = torch.arange(0, kv_cache_max_len, dtype=torch.int)
   kv = kv_utils.KVCache.from_model_config(model.config)
   output = model.forward(tokens, input_pos, kv)
   print("comparing with goldens..")

ai_edge_torch/generative/examples/smollm/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================

ai_edge_torch/generative/examples/{smallm → smollm}/convert_to_tflite.py RENAMED Viewed

@@ -13,25 +13,25 @@
 # limitations under the License.
 # ==============================================================================
-"""Example of converting SmalLM model to multi-signature tflite model."""
+"""Example of converting SmolLM model to multi-signature tflite model."""
 import os
 import pathlib
 import ai_edge_torch
-from ai_edge_torch.generative.examples.smallm import smallm
+from ai_edge_torch.generative.examples.smollm import smollm
 from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.quantize import quant_recipes
 import torch
-def convert_smallm_to_tflite(
+def convert_smollm_to_tflite(
     checkpoint_path: str,
     prefill_seq_len: int = 512,
     kv_cache_max_len: int = 1024,
     quantize: bool = True,
 ):
-  """Converts SmalLM model to multi-signature tflite model.
+  """Converts SmolLM model to multi-signature tflite model.
   Args:
       checkpoint_path (str): The filepath to the model checkpoint, or directory
@@ -43,14 +43,14 @@ def convert_smallm_to_tflite(
       quantize (bool, optional): Whether the model should be quanized. Defaults
         to True.
   """
-  pytorch_model = smallm.build_model(
+  pytorch_model = smollm.build_model(
       checkpoint_path, kv_cache_max_len=kv_cache_max_len
   )
   # Tensors used to trace the model graph during conversion.
-  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.long)
-  prefill_input_pos = torch.arange(0, prefill_seq_len)
-  decode_token = torch.tensor([[0]], dtype=torch.long)
-  decode_input_pos = torch.tensor([0], dtype=torch.int64)
+  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.int)
+  prefill_input_pos = torch.arange(0, prefill_seq_len, dtype=torch.int)
+  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
   kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
@@ -77,10 +77,10 @@ def convert_smallm_to_tflite(
   )
   quant_suffix = 'q8' if quantize else 'f32'
   edge_model.export(
-      f'/tmp/smallm_{quant_suffix}_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
+      f'/tmp/smollm_{quant_suffix}_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
   )
 if __name__ == '__main__':
-  path = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/smallm')
-  convert_smallm_to_tflite(path)
+  path = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/smollm')
+  convert_smollm_to_tflite(path)

ai_edge_torch/generative/examples/{smallm/smallm.py → smollm/smollm.py} RENAMED Viewed

@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
-"""Example of building a SmalLM model."""
+"""Example of building a SmolLM model."""
 import copy
 import os
@@ -28,32 +28,32 @@ import torch
 from torch import nn
 TENSOR_NAMES = copy.copy(tiny_llama.TENSOR_NAMES)
-# SmalLM re-uses the embedding as the head projection layer.
+# SmolLM re-uses the embedding as the head projection layer.
 TENSOR_NAMES.lm_head = None
-class SmalLM(tiny_llama.TinyLlama):
-  """A SmalLM model built from the Edge Generative API layers.
+class SmolLM(tiny_llama.TinyLlama):
+  """A SmolLM model built from the Edge Generative API layers.
-  SmalLM shares the same architecture as TinyLlama, but with different model
+  SmolLM shares the same architecture as TinyLlama, but with different model
   sizes.
   """
   def __init__(self, config: cfg.ModelConfig):
     super().__init__(config)
-    # SmalLM re-uses the embedding as the head projection layer.
+    # SmolLM re-uses the embedding as the head projection layer.
     self.lm_head.weight.data = self.tok_embedding.weight.data
 def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
-  """Returns the model config for a SmalLM 135M model.
+  """Returns the model config for a SmolLM 135M model.
   Args:
     kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
       is 1024.
   Returns:
-    The model config for a SmalLM model.
+    The model config for a SmolLM model.
   """
   attn_config = cfg.AttentionConfig(
       num_heads=9,
@@ -86,9 +86,18 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
   return config
+def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
+  config = get_model_config(**kwargs)
+  config.vocab_size = 128
+  config.num_layers = 2
+  # SmolLM has only one block config.
+  config.block_config(0).ff_config.intermediate_size = 64
+  return config
 def build_model(checkpoint_path: str, **kwargs) -> nn.Module:
   config = get_model_config(**kwargs)
-  model = SmalLM(config)
+  model = SmolLM(config)
   loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
   # Since embedding and lm-head use the same weight, we need to set strict
   # to False.
@@ -98,25 +107,25 @@ def build_model(checkpoint_path: str, **kwargs) -> nn.Module:
 def define_and_run(checkpoint_path: str) -> None:
-  """Instantiates and runs a SmalLM model."""
+  """Instantiates and runs a SmolLM model."""
   current_dir = pathlib.Path(__file__).parent.resolve()
-  smallm_goldens = torch.load(current_dir / "smallm_lm_logits.pt")
+  smollm_goldens = torch.load(current_dir / "smollm_lm_logits.pt")
   kv_cache_max_len = 1024
   model = build_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
   idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
+  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.int, device="cpu")
   tokens[0, :4] = idx
-  input_pos = torch.arange(0, kv_cache_max_len)
+  input_pos = torch.arange(0, kv_cache_max_len, dtype=torch.int)
   kv = kv_utils.KVCache.from_model_config(model.config)
   output = model.forward(tokens, input_pos, kv)
   assert torch.allclose(
-      smallm_goldens, output["logits"][0, idx.shape[1] - 1, :], atol=1e-05
+      smollm_goldens, output["logits"][0, idx.shape[1] - 1, :], atol=1e-05
   )
 if __name__ == "__main__":
   input_checkpoint_path = os.path.join(
-      pathlib.Path.home(), "Downloads/llm_data/smallm"
+      pathlib.Path.home(), "Downloads/llm_data/smollm"
   )
   define_and_run(input_checkpoint_path)

ai_edge_torch/generative/examples/stable_diffusion/clip.py CHANGED Viewed

@@ -76,7 +76,7 @@ class CLIP(nn.Module):
   @torch.inference_mode
   def forward(self, tokens: torch.LongTensor) -> torch.FloatTensor:
-    tokens = tokens.type(torch.long)
+    tokens = tokens.type(torch.int)
     state = self.tok_embedding(tokens) + self.tok_embedding_position
     for layer in self.transformer_blocks:

ai_edge_torch/generative/examples/stable_diffusion/convert_to_tflite.py CHANGED Viewed

@@ -94,7 +94,7 @@ def convert_stable_diffusion_to_tflite(
   n_tokens = 77
   timestamp = 0
   len_prompt = 1
-  prompt_tokens = torch.full((1, n_tokens), 0, dtype=torch.long)
+  prompt_tokens = torch.full((1, n_tokens), 0, dtype=torch.int)
   input_image = torch.full(
       (1, 3, image_height, image_width), 0, dtype=torch.float32
   )

ai_edge_torch/generative/examples/t5/convert_to_tflite.py CHANGED Viewed

@@ -29,24 +29,24 @@ def convert_t5_to_tflite_singlesig(checkpoint_path: str):
   # encoder
   seq_len = 512
-  prefill_e_tokens = torch.full((1, seq_len), 0, dtype=torch.long)
+  prefill_e_tokens = torch.full((1, seq_len), 0, dtype=torch.int)
   prompt_e_token = [1, 2, 3, 4, 5, 6]
   prefill_e_tokens[0, : len(prompt_e_token)] = torch.tensor(
-      prompt_e_token, dtype=torch.long
+      prompt_e_token, dtype=torch.int
   )
-  prefill_e_input_pos = torch.arange(0, seq_len)
-  prefill_d_tokens = torch.full((1, seq_len), 0, dtype=torch.long)
+  prefill_e_input_pos = torch.arange(0, seq_len, dtype=torch.int)
+  prefill_d_tokens = torch.full((1, seq_len), 0, dtype=torch.int)
   prompt_d_token = [1, 2, 3, 4, 5, 6]
   prefill_d_tokens[0, : len(prompt_d_token)] = torch.tensor(
-      prompt_d_token, dtype=torch.long
+      prompt_d_token, dtype=torch.int
   )
-  prefill_d_input_pos = torch.arange(0, seq_len)
+  prefill_d_input_pos = torch.arange(0, seq_len, dtype=torch.int)
   # decoder
-  decode_token = torch.tensor([[1]], dtype=torch.long)
-  decode_input_pos = torch.tensor([0], dtype=torch.int64)
-  decode_d_token = torch.tensor([[1]], dtype=torch.long)
-  decode_d_input_pos = torch.tensor([0], dtype=torch.int64)
+  decode_token = torch.tensor([[1]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
+  decode_d_token = torch.tensor([[1]], dtype=torch.int)
+  decode_d_input_pos = torch.tensor([0], dtype=torch.int)
   # Pad mask for self attention only on "real" tokens.
   # Pad with `-inf` for any tokens indices that aren't desired.
@@ -81,24 +81,24 @@ def convert_t5_to_tflite_multisig(checkpoint_path: str):
   # encoder
   seq_len = 512
-  prefill_e_tokens = torch.full((1, seq_len), 0, dtype=torch.long)
+  prefill_e_tokens = torch.full((1, seq_len), 0, dtype=torch.int)
   prompt_e_token = [1, 2, 3, 4, 5, 6]
   prefill_e_tokens[0, : len(prompt_e_token)] = torch.tensor(
-      prompt_e_token, dtype=torch.long
+      prompt_e_token, dtype=torch.int
   )
-  prefill_e_input_pos = torch.arange(0, seq_len)
-  prefill_d_tokens = torch.full((1, seq_len), 0, dtype=torch.long)
+  prefill_e_input_pos = torch.arange(0, seq_len, dtype=torch.int)
+  prefill_d_tokens = torch.full((1, seq_len), 0, dtype=torch.int)
   prompt_d_token = [1, 2, 3, 4, 5, 6]
   prefill_d_tokens[0, : len(prompt_d_token)] = torch.tensor(
-      prompt_d_token, dtype=torch.long
+      prompt_d_token, dtype=torch.int
   )
-  prefill_d_input_pos = torch.arange(0, seq_len)
+  prefill_d_input_pos = torch.arange(0, seq_len, dtype=torch.int)
   # decoder
-  decode_token = torch.tensor([[1]], dtype=torch.long)
-  decode_input_pos = torch.tensor([0], dtype=torch.int64)
-  decode_d_token = torch.tensor([[1]], dtype=torch.long)
-  decode_d_input_pos = torch.tensor([0], dtype=torch.int64)
+  decode_token = torch.tensor([[1]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
+  decode_d_token = torch.tensor([[1]], dtype=torch.int)
+  decode_d_input_pos = torch.tensor([0], dtype=torch.int)
   # Pad mask for self attention only on "real" tokens.
   # Pad with `-inf` for any tokens indices that aren't desired.

ai_edge_torch/generative/examples/t5/t5.py CHANGED Viewed

@@ -601,12 +601,12 @@ def define_and_run_t5(checkpoint_path: str) -> None:
   model = build_t5_model(checkpoint_path)
   idx = get_sample_encoder_input_ids()
-  tokens = torch.full((1, 512), 0, dtype=torch.long, device="cpu")
+  tokens = torch.full((1, 512), 0, dtype=torch.int, device="cpu")
   tokens[0, :77] = idx
-  input_pos = torch.arange(0, 512)
+  input_pos = torch.arange(0, 512, dtype=torch.int)
-  decode_d_token = torch.tensor([[0]], dtype=torch.int64)
-  decode_d_input_pos = torch.tensor([0], dtype=torch.int64)
+  decode_d_token = torch.tensor([[0]], dtype=torch.int)
+  decode_d_input_pos = torch.tensor([0], dtype=torch.int)
   pad_mask = torch.zeros([model.config.kv_cache_max], dtype=torch.float32)
   pad_mask[77:] = float("-inf")
   lm_logits = model.forward(
@@ -633,12 +633,12 @@ def define_and_run_t5_split(checkpoint_path: str) -> None:
   )
   idx = get_sample_encoder_input_ids()
-  tokens = torch.full((1, 512), 0, dtype=torch.long, device="cpu")
+  tokens = torch.full((1, 512), 0, dtype=torch.int, device="cpu")
   tokens[0, :77] = idx
-  input_pos = torch.arange(0, 512)
+  input_pos = torch.arange(0, 512, dtype=torch.int)
-  decode_d_token = torch.tensor([[0]], dtype=torch.int64)
-  decode_d_input_pos = torch.tensor([0], dtype=torch.int64)
+  decode_d_token = torch.tensor([[0]], dtype=torch.int)
+  decode_d_input_pos = torch.tensor([0], dtype=torch.int)
   pad_mask = torch.zeros(
       [t5_encoder_model.config.kv_cache_max], dtype=torch.float32
   )

ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py CHANGED Viewed

@@ -124,13 +124,13 @@ def get_model_config() -> cfg.ModelConfig:
 def get_sample_prefill_inputs() -> Tuple[torch.Tensor, torch.Tensor]:
-  tokens = torch.unsqueeze(torch.arange(0, 100), 0)
-  input_pos = torch.arange(0, 100)
+  tokens = torch.unsqueeze(torch.arange(0, 100, dtype=torch.int), 0)
+  input_pos = torch.arange(0, 100, dtype=torch.int)
   return tokens, input_pos
 def get_sample_decode_inputs() -> Tuple[torch.Tensor, torch.Tensor]:
-  tokens = torch.tensor([[1]], dtype=torch.long)
+  tokens = torch.tensor([[1]], dtype=torch.int)
   input_pos = torch.tensor([10])
   return tokens, input_pos

ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py CHANGED Viewed

@@ -47,10 +47,10 @@ def convert_tiny_llama_to_tflite(
       checkpoint_path, kv_cache_max_len=kv_cache_max_len
   )
   # Tensors used to trace the model graph during conversion.
-  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.long)
-  prefill_input_pos = torch.arange(0, prefill_seq_len)
-  decode_token = torch.tensor([[0]], dtype=torch.long)
-  decode_input_pos = torch.tensor([0], dtype=torch.int64)
+  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.int)
+  prefill_input_pos = torch.arange(0, prefill_seq_len, dtype=torch.int)
+  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
   kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None

ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py CHANGED Viewed

@@ -189,9 +189,9 @@ def define_and_run(checkpoint_path: str) -> None:
   kv_cache_max_len = 1024
   model = build_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
   idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
+  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.int, device="cpu")
   tokens[0, :4] = idx
-  input_pos = torch.arange(0, kv_cache_max_len)
+  input_pos = torch.arange(0, kv_cache_max_len, dtype=torch.int)
   kv = kv_utils.KVCache.from_model_config(model.config)
   output = model.forward(tokens, input_pos, kv)
   assert torch.allclose(

ai_edge_torch/generative/fx_passes/__init__.py CHANGED Viewed

@@ -12,16 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-from ai_edge_torch._convert.fx_passes import CanonicalizePass
-from ai_edge_torch._convert.fx_passes import run_passes
-from ai_edge_torch.generative.fx_passes.remove_sdpa_zero_mask_pass import RemoveSDPACompositeZeroMaskPass  # NOQA
+from ai_edge_torch import fx_pass_base
+from ai_edge_torch.fx_pass_base import CanonicalizePass
+from ai_edge_torch.generative.fx_passes.remove_sdpa_zero_mask_pass import RemoveSDPACompositeZeroMaskPass
 import torch
 def run_generative_passes(
     exported_program: torch.export.ExportedProgram,
 ) -> torch.export.ExportedProgram:
-  return run_passes(
+  return fx_pass_base.run_passes(
       exported_program,
       [
           RemoveSDPACompositeZeroMaskPass(),

ai_edge_torch/generative/fx_passes/remove_sdpa_zero_mask_pass.py CHANGED Viewed

@@ -12,13 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch import lowertools
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassBase
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassResult
 import torch
-class RemoveSDPACompositeZeroMaskPass(ExportedProgramPassBase):
+class RemoveSDPACompositeZeroMaskPass(fx_pass_base.ExportedProgramPassBase):
   def is_zero_tensor_node(self, node: torch.fx.Node):
     return node.target == torch.ops.aten.zeros.default
@@ -48,4 +47,4 @@ class RemoveSDPACompositeZeroMaskPass(ExportedProgramPassBase):
     exported_program.graph_module.graph.lint()
     exported_program.graph_module.recompile()
-    return ExportedProgramPassResult(exported_program, True)
+    return fx_pass_base.ExportedProgramPassResult(exported_program, True)

ai_edge_torch/generative/layers/attention.py CHANGED Viewed

@@ -160,6 +160,10 @@ class CausalSelfAttention(nn.Module):
     self.output_projection = nn.Linear(
         output_shape, dim, bias=config.output_proj_use_bias
     )
+    self.query_norm = builder.build_norm(
+        config.head_dim, config.query_norm_config
+    )
+    self.key_norm = builder.build_norm(config.head_dim, config.key_norm_config)
     self.config = config
     self.enable_hlfb = enable_hlfb
     self.sdpa_func = (
@@ -224,6 +228,9 @@ class CausalSelfAttention(nn.Module):
           dim=-1,
       )
+    q = self.query_norm(q)
+    k = self.key_norm(k)
     q = q.reshape(B, T, -1, self.config.head_dim)
     k = k.reshape(B, T, -1, self.config.head_dim)
     v = v.reshape(B, T, -1, self.config.head_dim)

ai_edge_torch/generative/layers/builder.py CHANGED Viewed

@@ -13,6 +13,8 @@
 # limitations under the License.
 # ==============================================================================
 # Builder class for individual components.
+from typing import Callable
 import ai_edge_torch.generative.layers.feed_forward as feed_forward
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.layers.normalization as normalization
@@ -21,20 +23,34 @@ from torch import nn
 import torch.nn.functional as F
-class GeGLU(nn.Module):
-  """GeGLU is an activation function which is a variant of GELU.
+def build_glu(
+    act: Callable[[torch.Tensor], torch.Tensor], gate_is_front: bool = False
+) -> Callable[[torch.Tensor], torch.Tensor]:
+  """Builds an activation function with GLU (Gated Linear Unit).
+  If gate_is_front is True,
+    f(x) = act(x) * y
+  otherwise,
+    f(x) = x * act(y),
+  where x is the first half of the input and y is the second half of the input.
-  GeGLU(x) = (xW+b) * GELU(xV+c)
-  See: https://arxiv.org/abs/2002.05202v1
+  Args:
+    act (Callable[[torch.Tensor], torch.Tensor]): activation function to apply
+      to the gate.
+    gate_is_front: whether the gate is in front half of the input. Other part is
+      the output in GLU.
+  Returns:
+    A callable activation function with GLU.
   """
-  def __init__(self, d_in: int, d_out: int):
-    super().__init__()
-    self.proj = nn.Linear(d_in, d_out * 2)
+  def _glu(x):
+    x, y = x.chunk(2, dim=-1)
+    if gate_is_front:
+      return act(x) * y
+    return x * act(y)
-  def forward(self, x: torch.Tensor):
-    x, gate = self.proj(x).chunk(2, dim=-1)
-    return x * F.gelu(gate)
+  return _glu
 def build_norm(dim: int, config: cfg.NormalizationConfig):
@@ -99,6 +115,10 @@ def build_ff(dim: int, config: cfg.FeedForwardConfig):
       hidden_dim=config.intermediate_size,
       activation=activation,
       use_bias=config.use_bias,
+      use_glu=(
+          config.activation.type == cfg.ActivationType.GE_GLU
+          or config.activation.type == cfg.ActivationType.SILU_GLU
+      ),
       pre_ff_norm=pre_ff_norm,
       post_ff_norm=post_ff_norm,
   )
@@ -129,8 +149,10 @@ def get_activation(config: cfg.ActivationConfig):
     # See: https://github.com/hendrycks/GELUs
     return lambda x: x * F.sigmoid(1.702 * x)
   elif config.type == cfg.ActivationType.GE_GLU:
-    return GeGLU(config.dim_in, config.dim_out)
+    return build_glu(F.gelu, config.gate_is_front)
   elif config.type == cfg.ActivationType.RELU:
     return F.relu
+  elif config.type == cfg.ActivationType.SILU_GLU:
+    return build_glu(F.silu, config.gate_is_front)
   else:
     raise ValueError("Unsupported activation type.")

ai_edge_torch/generative/layers/feed_forward.py CHANGED Viewed

@@ -30,18 +30,27 @@ class SequentialFeedForward(nn.Module):
       hidden_dim: int,
       activation: Callable[[torch.Tensor], torch.Tensor],
       use_bias=False,
+      use_glu=False,
       pre_ff_norm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
       post_ff_norm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
   ):
     """Init function for feedforward layer.
-    Args: dim(int): embedding size. hidden_dim(int): hidden dim size of the
-    feedforward layer. activation(Callable): activation function used in this
-    block. use_bias(Boolean): whether to use bias. Default is false.
+    Args:
+      dim (int): embedding size.
+      hidden_dim (int): hidden dim size of the feedforward layer.
+      activation (Callable): activation function used in this block.
+      use_bias (Boolean): whether to use bias. Default is false.
+      use_glu (Boolean): whether to use glu in activation. Default is false.
+      pre_ff_norm (Callable): pre feedforward norm. Default is None.
+      post_ff_norm (Callable): post feedforward norm. Default is None.
     """
     super().__init__()
     self.act = activation
-    self.w1 = nn.Linear(dim, hidden_dim, bias=use_bias)
+    if use_glu:
+      self.w1 = nn.Linear(dim, hidden_dim * 2, bias=use_bias)
+    else:
+      self.w1 = nn.Linear(dim, hidden_dim, bias=use_bias)
     self.w2 = nn.Linear(hidden_dim, dim, bias=use_bias)
     self.pre_ff_norm = pre_ff_norm if pre_ff_norm else lambda x: x
     self.post_ff_norm = post_ff_norm if post_ff_norm else lambda x: x
@@ -72,18 +81,27 @@ class GatedFeedForward(nn.Module):
       hidden_dim: int,
       activation: Callable[[torch.Tensor], torch.Tensor],
       use_bias=False,
+      use_glu=False,
       pre_ff_norm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
       post_ff_norm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
   ):
     """Init function for feedforward layer.
-    Args: dim(int): embedding size. hidden_dim(int): hidden dim size of the
-    feedforward layer. activation(Callable): activation function used in this
-    block. use_bias(Boolean): whether to use bias. Default is false.
+    Args:
+      dim (int): embedding size.
+      hidden_dim (int): hidden dim size of the feedforward layer.
+      activation (Callable): activation function used in this block.
+      use_bias (Boolean): whether to use bias. Default is false.
+      use_glu (Boolean): whether to use glu in activation. Default is false.
+      pre_ff_norm (Callable): pre feedforward norm. Default is None.
+      post_ff_norm (Callable): post feedforward norm. Default is None.
     """
     super().__init__()
     self.act = activation
-    self.w1 = nn.Linear(dim, hidden_dim, bias=use_bias)
+    if use_glu:
+      self.w1 = nn.Linear(dim, hidden_dim * 2, bias=use_bias)
+    else:
+      self.w1 = nn.Linear(dim, hidden_dim, bias=use_bias)
     self.w2 = nn.Linear(hidden_dim, dim, bias=use_bias)
     self.w3 = nn.Linear(dim, hidden_dim, bias=use_bias)
     self.pre_ff_norm = pre_ff_norm if pre_ff_norm else lambda x: x

ai_edge_torch/generative/layers/kv_cache.py CHANGED Viewed

@@ -172,8 +172,8 @@ def _update_kv_base_impl(
     v_slice: torch.Tensor,
 ) -> KVCacheEntry:
   """Update the cache buffer without High Level Function Boundary annotation."""
-  k = cache.k_cache.index_copy(1, input_pos, k_slice)
-  v = cache.v_cache.index_copy(1, input_pos, v_slice)
+  k = cache.k_cache.index_copy(1, input_pos.to(torch.long), k_slice)
+  v = cache.v_cache.index_copy(1, input_pos.to(torch.long), v_slice)
   updated_cache = KVCacheEntry(k, v)
   return updated_cache
@@ -189,7 +189,7 @@ def _update_kv_hlfb_impl(
   k_cache, v_cache, input_pos, k_slice, v_slice = builder.mark_inputs(
       cache.k_cache, cache.v_cache, input_pos, k_slice, v_slice
   )
-  k = k_cache.index_copy(1, input_pos, k_slice)
-  v = v_cache.index_copy(1, input_pos, v_slice)
+  k = k_cache.index_copy(1, input_pos.to(torch.long), k_slice)
+  v = v_cache.index_copy(1, input_pos.to(torch.long), v_slice)
   k, v = builder.mark_outputs(k, v)
   return KVCacheEntry(k, v)

ai-edge-torch-nightly 0.3.0.dev20240913__py3-none-any.whl → 0.3.0.dev20240915__py3-none-any.whl

ai-edge-torch-nightly 0.3.0.dev20240913__py3-none-any.whl → 0.3.0.dev20240915__py3-none-any.whl