ai-edge-torch-nightly 0.3.0.dev20240910__py3-none-any.whl → 0.3.0.dev20240914__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- ai_edge_torch/_convert/conversion.py +2 -1
- ai_edge_torch/_convert/fx_passes/__init__.py +5 -41
- ai_edge_torch/_convert/fx_passes/build_aten_composite_pass.py +3 -4
- ai_edge_torch/_convert/fx_passes/build_interpolate_composite_pass.py +3 -4
- ai_edge_torch/_convert/fx_passes/inject_mlir_debuginfo_pass.py +3 -4
- ai_edge_torch/_convert/fx_passes/optimize_layout_transposes_pass/pass_body.py +4 -5
- ai_edge_torch/config.py +4 -1
- ai_edge_torch/fx_pass_base.py +101 -0
- ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +35 -16
- ai_edge_torch/generative/examples/gemma/convert_to_tflite.py +29 -10
- ai_edge_torch/generative/examples/gemma/gemma.py +52 -32
- ai_edge_torch/generative/examples/gemma/gemma2.py +87 -60
- ai_edge_torch/generative/examples/{experimental/gemma → openelm}/convert_to_tflite.py +16 -18
- ai_edge_torch/generative/examples/openelm/openelm.py +237 -0
- ai_edge_torch/generative/examples/{experimental/phi → phi}/convert_to_tflite.py +15 -16
- ai_edge_torch/generative/examples/{experimental/phi → phi}/phi2.py +48 -45
- ai_edge_torch/generative/examples/{experimental/tiny_llama → smollm}/convert_to_tflite.py +16 -17
- ai_edge_torch/generative/examples/smollm/smollm.py +131 -0
- ai_edge_torch/generative/examples/stable_diffusion/clip.py +12 -6
- ai_edge_torch/generative/examples/stable_diffusion/convert_to_tflite.py +1 -1
- ai_edge_torch/generative/examples/t5/convert_to_tflite.py +20 -20
- ai_edge_torch/generative/examples/t5/t5.py +43 -30
- ai_edge_torch/generative/examples/t5/t5_attention.py +18 -13
- ai_edge_torch/generative/examples/test_models/toy_model.py +15 -13
- ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +75 -34
- ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +29 -10
- ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +57 -36
- ai_edge_torch/generative/fx_passes/__init__.py +4 -4
- ai_edge_torch/generative/fx_passes/remove_sdpa_zero_mask_pass.py +3 -4
- ai_edge_torch/generative/layers/attention.py +84 -73
- ai_edge_torch/generative/layers/builder.py +38 -14
- ai_edge_torch/generative/layers/feed_forward.py +26 -8
- ai_edge_torch/generative/layers/kv_cache.py +163 -51
- ai_edge_torch/generative/layers/model_config.py +61 -33
- ai_edge_torch/generative/layers/normalization.py +158 -0
- ai_edge_torch/generative/layers/unet/blocks_2d.py +0 -2
- ai_edge_torch/generative/quantize/example.py +2 -2
- ai_edge_torch/generative/test/{test_experimental_ekv.py → test_kv_cache.py} +12 -24
- ai_edge_torch/generative/test/test_loader.py +1 -1
- ai_edge_torch/generative/test/test_model_conversion.py +77 -62
- ai_edge_torch/generative/test/test_model_conversion_large.py +61 -68
- ai_edge_torch/generative/test/test_quantize.py +5 -5
- ai_edge_torch/generative/test/utils.py +54 -0
- ai_edge_torch/generative/utilities/loader.py +28 -15
- ai_edge_torch/generative/utilities/t5_loader.py +21 -20
- ai_edge_torch/odml_torch/export.py +40 -0
- ai_edge_torch/odml_torch/lowerings/__init__.py +1 -0
- ai_edge_torch/odml_torch/lowerings/_basic.py +44 -0
- ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +0 -2
- ai_edge_torch/odml_torch/lowerings/_layer_norm.py +78 -0
- ai_edge_torch/version.py +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/METADATA +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/RECORD +59 -63
- ai_edge_torch/_convert/fx_passes/_pass_base.py +0 -53
- ai_edge_torch/_convert/fx_passes/canonicalize_pass.py +0 -35
- ai_edge_torch/generative/examples/experimental/gemma/gemma.py +0 -219
- ai_edge_torch/generative/examples/experimental/tiny_llama/__init__.py +0 -14
- ai_edge_torch/generative/examples/experimental/tiny_llama/tiny_llama.py +0 -205
- ai_edge_torch/generative/examples/phi2/__init__.py +0 -14
- ai_edge_torch/generative/examples/phi2/convert_to_tflite.py +0 -67
- ai_edge_torch/generative/examples/phi2/phi2.py +0 -189
- ai_edge_torch/generative/examples/test_models/toy_model_with_external_kv_cache.py +0 -176
- /ai_edge_torch/generative/examples/{experimental → openelm}/__init__.py +0 -0
- /ai_edge_torch/generative/examples/{experimental/gemma → phi}/__init__.py +0 -0
- /ai_edge_torch/generative/examples/{experimental/phi → smollm}/__init__.py +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/LICENSE +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/WHEEL +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/top_level.txt +0 -0
ai_edge_torch/generative/examples/gemma/gemma2.py

```diff
@@ -12,14 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
+
+"""Example of building a Gemma2 model."""
 
 import os
-
+import pathlib
 from typing import Optional, Tuple
 
 from ai_edge_torch.generative.layers import attention
 from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.utilities.loader as loading_utils
@@ -51,7 +53,8 @@ class Gemma2Block(attention.TransformerBlock):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
-
+      kv_cache: kv_utils.KVCacheEntry = None,
+  ) -> Tuple[torch.Tensor, Optional[kv_utils.KVCacheEntry]]:
     """Forward function of the Gemma2Block.
 
     Exactly the same as TransformerBlock but we call the post-attention norm
@@ -62,17 +65,19 @@ class Gemma2Block(attention.TransformerBlock):
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): the optional kv cache entry.
 
     Returns:
-      output activation from this transformer block
+      output activation from this transformer block, and updated kv cache (if
+      passed in).
     """
 
     x_norm = self.pre_atten_norm(x)
-    attn_out = self.atten_func(x_norm, rope, mask, input_pos)
+    attn_out, kv = self.atten_func(x_norm, rope, mask, input_pos, kv_cache)
     attn_out_norm = self.post_atten_norm(attn_out)
     x = x + attn_out_norm
     output = x + self.ff(x)
-    return output
+    return output, kv
 
 
 class Gemma2(nn.Module):
@@ -81,7 +86,6 @@ class Gemma2(nn.Module):
   def __init__(self, config: cfg.ModelConfig):
     super().__init__()
 
-    self.config = config
     # Construct model layers.
     self.tok_embedding = nn.Embedding(
         config.vocab_size, config.embedding_dim, padding_idx=0
@@ -91,20 +95,22 @@ class Gemma2(nn.Module):
         config.vocab_size,
         bias=config.lm_head_use_bias,
     )
-    #
+    # Gemma2 re-uses the embedding as the head projection layer.
    self.lm_head.weight.data = self.tok_embedding.weight.data
     self.transformer_blocks = nn.ModuleList(
-        Gemma2Block(config)
+        Gemma2Block(config.block_config(idx), config)
+        for idx in range(config.num_layers)
     )
     self.final_norm = builder.build_norm(
         config.embedding_dim,
         config.final_norm_config,
     )
+    # Gemma2 has same hyper parameters for each layer except for attention
+    # types. Use the first layer.
+    attn_config = config.block_config(0).attn_config
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.kv_cache_max,
-        dim=int(
-            config.attn_config.rotary_percentage * config.attn_config.head_dim
-        ),
+        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
         base=10_000,
         condense_ratio=1,
         dtype=torch.float32,
@@ -115,47 +121,56 @@ class Gemma2(nn.Module):
         dtype=torch.float32,
         device=torch.device("cpu"),
     )
-
     self.sliding_window_mask_cache = attn_utils.build_sliding_window_mask_cache(
         size=config.kv_cache_max,
-        window_size=
+        window_size=attn_config.sliding_window_size,
         dtype=torch.float32,
         device=torch.device("cpu"),
     )
-
     self.config = config
 
   def get_attention_mask(
-      self,
+      self, attn_type: cfg.AttentionType, input_pos: torch.Tensor
   ) -> torch.Tensor:
-    if
-
-        self.config.attn_config.attn_types[idx]
-        == cfg.AttentionType.LOCAL_SLIDING
-    ):
-      return self.sliding_window_mask_cache.index_select(2, input_pos)
-
+    if attn_type == cfg.AttentionType.LOCAL_SLIDING:
+      return self.sliding_window_mask_cache.index_select(2, input_pos)
     return self.mask_cache.index_select(2, input_pos)
 
   @torch.inference_mode
-  def forward(
-
+  def forward(
+      self,
+      tokens: torch.Tensor,
+      input_pos: torch.Tensor,
+      kv_cache: kv_utils.KVCache,
+  ) -> dict[torch.Tensor, kv_utils.KVCache]:
+    _, seq_len = tokens.size()
     assert self.config.max_seq_len >= seq_len, (
         f"Cannot forward sequence of length {seq_len}, max seq length is only"
         f" {self.config.max_seq_len}"
     )
+    assert len(self.transformer_blocks) == len(kv_cache.caches), (
+        "The number of transformer blocks and the number of KV cache entries"
+        " must be the same."
+    )
 
     cos, sin = self.rope_cache
     cos = cos.index_select(0, input_pos)
     sin = sin.index_select(0, input_pos)
 
     # token embeddings of shape (b, t, n_embd)
-    x = self.tok_embedding(
+    x = self.tok_embedding(tokens)
     x = x * (self.config.embedding_dim**0.5)
 
+    updated_kv_entires = []
     for i, block in enumerate(self.transformer_blocks):
-      mask = self.get_attention_mask(
-
+      mask = self.get_attention_mask(
+          block.config.attn_config.attn_type, input_pos
+      )
+      kv_entry = kv_cache.caches[i] if kv_cache else None
+      x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
+      if kv_entry:
+        updated_kv_entires.append(kv_entry)
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
 
     x = self.final_norm(x)
     res = self.lm_head(x)  # (b, t, vocab_size)
@@ -163,7 +178,8 @@ class Gemma2(nn.Module):
       res = res / self.config.final_logit_softcap
       res = torch.tanh(res)
       res = res * self.config.final_logit_softcap
-
+
+    return {"logits": res, "kv_cache": updated_kv_cache}
 
 
 def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
@@ -176,18 +192,6 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
   Returns:
     The model config for a Gemma 2B model.
   """
-  attn_config = cfg.AttentionConfig(
-      num_heads=8,
-      head_dim=256,
-      num_query_groups=4,
-      rotary_percentage=1.0,
-      qkv_transpose_before_split=True,
-      logit_softcap=50.0,
-      sliding_window_size=4096,
-      attn_types=[cfg.AttentionType.GLOBAL, cfg.AttentionType.LOCAL_SLIDING]
-      * 13,
-  )
-
   norm_config = cfg.NormalizationConfig(
       type=cfg.NormalizationType.RMS_NORM,
       epsilon=1e-6,
@@ -200,18 +204,38 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       pre_ff_norm_config=norm_config,
       post_ff_norm_config=norm_config,
   )
+
+  def get_block_config(idx: int) -> cfg.TransformerBlockConfig:
+    attn_config = cfg.AttentionConfig(
+        num_heads=8,
+        head_dim=256,
+        num_query_groups=4,
+        rotary_percentage=1.0,
+        qkv_transpose_before_split=True,
+        logit_softcap=50.0,
+        sliding_window_size=4096,
+        attn_type=(
+            cfg.AttentionType.GLOBAL
+            if idx % 2 == 0
+            else cfg.AttentionType.LOCAL_SLIDING
+        ),
+    )
+    return cfg.TransformerBlockConfig(
+        attn_config=attn_config,
+        ff_config=ff_config,
+        pre_attention_norm_config=norm_config,
+        post_attention_norm_config=norm_config,
+    )
+
+  num_layers = 26
   config = cfg.ModelConfig(
       vocab_size=256000,
-      num_layers=
+      num_layers=num_layers,
       max_seq_len=8192,
       embedding_dim=2304,
       kv_cache_max_len=kv_cache_max_len,
-
-      ff_config=ff_config,
-      pre_attention_norm_config=norm_config,
-      post_attention_norm_config=norm_config,
+      block_configs=[get_block_config(i) for i in range(num_layers)],
       final_norm_config=norm_config,
-      parallel_residual=False,
       lm_head_use_bias=False,
       enable_hlfb=True,
       final_logit_softcap=30.0,
@@ -221,14 +245,16 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
 
 def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
   config = get_model_config_2b(kv_cache_max_len)
-  config.attn_config.num_heads = 4
-  config.attn_config.head_dim = 64
-  config.attn_config.sliding_window_size = 64
-  config.ff_config.intermediate_size = 128
   config.vocab_size = 128
   config.num_layers = 2
   config.max_seq_len = 2 * kv_cache_max_len
   config.embedding_dim = 128
+  config.block_configs = config.block_configs[: config.num_layers]
+  for block_config in config.block_configs:
+    block_config.attn_config.num_heads = 4
+    block_config.attn_config.head_dim = 64
+    block_config.attn_config.sliding_window_size = 64
+    block_config.ff_config.intermediate_size = 128
   return config
 
 
@@ -236,33 +262,34 @@ def build_2b_model(checkpoint_path: str, **kwargs) -> nn.Module:
   config = get_model_config_2b(**kwargs)
   model = Gemma2(config)
   loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
-  #
+  # Since embedding and lm-head use the same weight, we need to set strict
   # to False.
   loader.load(model, strict=False)
   model.eval()
   return model
 
 
-def define_and_run_2b() -> None:
+def define_and_run_2b(checkpoint_path: str) -> None:
   """Instantiates and runs a Gemma2 2B model."""
 
-  current_dir = Path(__file__).parent.resolve()
+  current_dir = pathlib.Path(__file__).parent.resolve()
   gemma2_goldens = torch.load(current_dir / "gemma2it_2b_golden.pt")
   print("Running GEMMA 2")
   kv_cache_max_len = 1024
-  checkpoint_path = os.path.join(Path.home(), "Downloads/llm_data/gemma2-2b")
   model = build_2b_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
   toks = torch.from_numpy(
      np.array([2, 651, 9456, 576, 573, 3520, 3858, 603, 235248])
   )
-  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.
+  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.int, device="cpu")
   tokens[0, :9] = toks
-  input_pos = torch.arange(0, kv_cache_max_len)
-
-
+  input_pos = torch.arange(0, kv_cache_max_len, dtype=torch.int)
+  kv = kv_utils.KVCache.from_model_config(model.config)
+  out = model.forward(tokens, input_pos, kv)
+  out_final = out["logits"][0, 8, :]
   assert torch.allclose(gemma2_goldens, out_final, atol=1e-04)
 
 
 if __name__ == "__main__":
   torch.set_printoptions(sci_mode=True)
-
+  path = os.path.join(pathlib.Path.home(), "Downloads/llm_data/gemma2-2b")
+  define_and_run_2b(path)
```
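The gemma2.py changes above move the model to an externalized KV cache: `forward` now takes a `kv_utils.KVCache` and returns a dict with the logits and the updated cache. A minimal sketch of the new calling convention, assuming a local gemma2-2b checkpoint; the path and the single greedy decode step are illustrative, not part of the package:

```python
import os
import pathlib

import torch

from ai_edge_torch.generative.examples.gemma import gemma2
from ai_edge_torch.generative.layers import kv_cache as kv_utils

# Assumed checkpoint location, mirroring the example's __main__ block.
checkpoint = os.path.join(pathlib.Path.home(), "Downloads/llm_data/gemma2-2b")
model = gemma2.build_2b_model(checkpoint, kv_cache_max_len=1024)

# Prefill: the forward pass now returns both logits and the updated cache.
tokens = torch.tensor([[2, 651, 9456]], dtype=torch.int)
input_pos = torch.arange(0, tokens.shape[1], dtype=torch.int)
kv = kv_utils.KVCache.from_model_config(model.config)
out = model.forward(tokens, input_pos, kv)
logits, kv = out["logits"], out["kv_cache"]

# Decode one step, feeding the updated cache back in (greedy pick is illustrative).
next_token = logits[0, -1, :].argmax().reshape(1, 1).to(torch.int)
next_pos = torch.tensor([tokens.shape[1]], dtype=torch.int)
out = model.forward(next_token, next_pos, kv)
```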
ai_edge_torch/generative/examples/{experimental/gemma → openelm}/convert_to_tflite.py

```diff
@@ -12,30 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-#
-# Note: This is an experimental version of Gemma with external KV cache.
-# Please use with caution.
 
+"""Example of converting OpenELM model to multi-signature tflite model."""
 
 import os
-
+import pathlib
 
 import ai_edge_torch
-from ai_edge_torch.generative.examples.
-from ai_edge_torch.generative.layers
+from ai_edge_torch.generative.examples.openelm import openelm
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.quantize import quant_recipes
 import torch
 
 
-def
+def convert_openelm_to_tflite(
     checkpoint_path: str,
     prefill_seq_len: int = 512,
     kv_cache_max_len: int = 1024,
     quantize: bool = True,
 ):
-  """
+  """Converts OpenELM model to multi-signature tflite model.
 
-  tflite model.
   Args:
     checkpoint_path (str): The filepath to the model checkpoint, or directory
       holding the checkpoint.
@@ -46,15 +43,15 @@ def convert_gemma_to_tflite(
     quantize (bool, optional): Whether the model should be quanized. Defaults
       to True.
   """
-  pytorch_model =
+  pytorch_model = openelm.build_model(
       checkpoint_path, kv_cache_max_len=kv_cache_max_len
   )
   # Tensors used to trace the model graph during conversion.
-  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.
-  prefill_input_pos = torch.arange(0, prefill_seq_len)
-  decode_token = torch.tensor([[0]], dtype=torch.
-  decode_input_pos = torch.tensor([0], dtype=torch.
-  kv = kv_utils.
+  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.int)
+  prefill_input_pos = torch.arange(0, prefill_seq_len, dtype=torch.int)
+  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
+  kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
 
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
   edge_model = (
@@ -78,11 +75,12 @@ def convert_gemma_to_tflite(
     )
     .convert(quant_config=quant_config)
   )
+  quant_suffix = 'q8' if quantize else 'f32'
   edge_model.export(
-      f'/tmp/
+      f'/tmp/openelm_{quant_suffix}_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
   )
 
 
 if __name__ == '__main__':
-
-
+  path = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/openelm')
+  convert_openelm_to_tflite(path)
```
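For reference, a small usage sketch for the converter above; the checkpoint location mirrors the `__main__` block and the argument values are illustrative:

```python
import os
import pathlib

from ai_edge_torch.generative.examples.openelm import convert_to_tflite

checkpoint = os.path.join(pathlib.Path.home(), "Downloads/llm_data/openelm")
# With these arguments the script writes
# /tmp/openelm_q8_seq1024_ekv1280.tflite (quantize=False yields an f32 suffix).
convert_to_tflite.convert_openelm_to_tflite(
    checkpoint,
    prefill_seq_len=1024,
    kv_cache_max_len=1280,
    quantize=True,
)
```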
ai_edge_torch/generative/examples/openelm/openelm.py (new file)

```diff
@@ -0,0 +1,237 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Example of building an OpenELM model."""
+
+import os
+import pathlib
+
+from ai_edge_torch.generative.layers import attention
+from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
+import ai_edge_torch.generative.layers.attention_utils as attn_utils
+import ai_edge_torch.generative.layers.model_config as cfg
+import ai_edge_torch.generative.utilities.loader as loading_utils
+import numpy as np
+import torch
+from torch import nn
+
+TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
+    ff_up_proj="transformer.layers.{}.ffn.proj_1",
+    ff_down_proj="transformer.layers.{}.ffn.proj_2",
+    attn_fused_qkv_proj="transformer.layers.{}.attn.qkv_proj",
+    attn_query_norm="transformer.layers.{}.attn.q_norm",
+    attn_key_norm="transformer.layers.{}.attn.k_norm",
+    attn_output_proj="transformer.layers.{}.attn.out_proj",
+    pre_attn_norm="transformer.layers.{}.attn_norm",
+    pre_ff_norm="transformer.layers.{}.ffn_norm",
+    embedding="transformer.token_embeddings",
+    final_norm="transformer.norm",
+    lm_head=None,
+)
+
+
+class OpenELM(nn.Module):
+  """An OpenELM model built from the Edge Generative API layers."""
+
+  def __init__(self, config: cfg.ModelConfig):
+    super().__init__()
+
+    # Construct model layers.
+    self.tok_embedding = nn.Embedding(
+        config.vocab_size, config.embedding_dim, padding_idx=0
+    )
+    self.lm_head = nn.Linear(
+        config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
+    )
+    # OpenELM re-uses the embedding as the head projection layer.
+    self.lm_head.weight.data = self.tok_embedding.weight.data
+    self.transformer_blocks = nn.ModuleList(
+        attention.TransformerBlock(config.block_config(idx), config)
+        for idx in range(config.num_layers)
+    )
+    self.final_norm = builder.build_norm(
+        config.embedding_dim,
+        config.final_norm_config,
+    )
+    # OpenELM has same hyper parameters for rotary_percentage and head_dim for
+    # each layer block. Use the first block.
+    attn_config = config.block_config(0).attn_config
+    self.rope_cache = attn_utils.build_rope_cache(
+        size=config.kv_cache_max,
+        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
+        base=10_000,
+        condense_ratio=1,
+        dtype=torch.float32,
+        device=torch.device("cpu"),
+    )
+    self.mask_cache = attn_utils.build_causal_mask_cache(
+        size=config.kv_cache_max,
+        dtype=torch.float32,
+        device=torch.device("cpu"),
+    )
+    self.config = config
+
+  @torch.inference_mode
+  def forward(
+      self,
+      tokens: torch.Tensor,
+      input_pos: torch.Tensor,
+      kv_cache: kv_utils.KVCache,
+  ) -> dict[torch.Tensor, kv_utils.KVCache]:
+    _, seq_len = tokens.size()
+    assert self.config.max_seq_len >= seq_len, (
+        f"Cannot forward sequence of length {seq_len}, max seq length is only"
+        f" {self.config.max_seq_len}"
+    )
+    assert len(self.transformer_blocks) == len(kv_cache.caches), (
+        "The number of transformer blocks and the number of KV cache entries"
+        " must be the same."
+    )
+
+    cos, sin = self.rope_cache
+    cos = cos.index_select(0, input_pos)
+    sin = sin.index_select(0, input_pos)
+    mask = self.mask_cache.index_select(2, input_pos)
+    mask = mask[:, :, :, : self.config.kv_cache_max]
+
+    # token embeddings of shape (b, t, n_embd)
+    x = self.tok_embedding(tokens)
+
+    updated_kv_entires = []
+    for i, block in enumerate(self.transformer_blocks):
+      kv_entry = kv_cache.caches[i] if kv_cache else None
+      x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
+      if kv_entry:
+        updated_kv_entires.append(kv_entry)
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
+
+    x = self.final_norm(x)
+    logits = self.lm_head(x)  # (b, t, vocab_size)
+    return {"logits": logits, "kv_cache": updated_kv_cache}
+
+
+def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+  """Returns the model config for an OpenELM model.
+
+  Args:
+    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
+      is 1024.
+
+  Returns:
+    The model config for an OpenELM model.
+  """
+  norm_config = cfg.NormalizationConfig(
+      type=cfg.NormalizationType.RMS_NORM, epsilon=1e-6
+  )
+  num_heads = [12] * 4 + [16] * 14 + [20] * 12 + [24] * 6
+  num_query_groups = [3] * 4 + [4] * 14 + [5] * 12 + [6] * 6
+
+  def make_divisible(v, d):
+    """Ensures that all layers have a channel number that is divisible by d."""
+    new_v = int(v + d / 2) // d * d
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+      new_v += d
+    return new_v
+
+  # The way to get intermediate size is from
+  # https://huggingface.co/apple/OpenELM-3B/blob/main/modeling_openelm.py
+  def get_intermediate_size(idx: int) -> int:
+    return make_divisible((0.5 + 0.1 * idx) * 3072, 256)
+
+  def get_block_config(idx: int) -> cfg.TransformerBlockConfig:
+    return cfg.TransformerBlockConfig(
+        attn_config=cfg.AttentionConfig(
+            num_heads=num_heads[idx],
+            head_dim=128,
+            num_query_groups=num_query_groups[idx],
+            rotary_percentage=1.0,
+            qkv_transpose_before_split=True,
+            query_norm_config=norm_config,
+            key_norm_config=norm_config,
+        ),
+        ff_config=cfg.FeedForwardConfig(
+            type=cfg.FeedForwardType.SEQUENTIAL,
+            activation=cfg.ActivationConfig(
+                cfg.ActivationType.SILU_GLU, gate_is_front=True
+            ),
+            intermediate_size=get_intermediate_size(idx),
+            pre_ff_norm_config=norm_config,
+        ),
+        pre_attention_norm_config=norm_config,
+    )
+
+  num_layers = 36
+  config = cfg.ModelConfig(
+      vocab_size=32000,
+      num_layers=num_layers,
+      max_seq_len=2048,
+      embedding_dim=3072,
+      kv_cache_max_len=kv_cache_max_len,
+      block_configs=[get_block_config(i) for i in range(num_layers)],
+      final_norm_config=norm_config,
+  )
+  return config
+
+
+def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
+  config = get_model_config(kv_cache_max_len)
+  config.vocab_size = 128
+  config.num_layers = 2
+  config.max_seq_len = 2 * kv_cache_max_len
+  config.embedding_dim = 128
+  config.block_configs = config.block_configs[: config.num_layers]
+  for block_config in config.block_configs:
+    block_config.attn_config.num_heads = 3
+    block_config.attn_config.head_dim = 64
+    block_config.ff_config.intermediate_size = 128
+  return config
+
+
+def build_model(checkpoint_path: str, **kwargs) -> nn.Module:
+  config = get_model_config(**kwargs)
+  model = OpenELM(config)
+  loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
+  # Since embedding and lm-head use the same weight, we need to set strict
+  # to False.
+  loader.load(model, strict=False)
+  model.eval()
+  return model
+
+
+def define_and_run(checkpoint_path: str) -> None:
+  """Instantiates and runs an OpenELM model."""
+
+  current_dir = pathlib.Path(__file__).parent.resolve()
+  openelm_goldens = torch.load(current_dir / "openelm_lm_logits.pt")
+  kv_cache_max_len = 1024
+  model = build_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
+  idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
+  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.int, device="cpu")
+  tokens[0, :4] = idx
+  input_pos = torch.arange(0, kv_cache_max_len, dtype=torch.int)
+  kv = kv_utils.KVCache.from_model_config(model.config)
+  output = model.forward(tokens, input_pos, kv)
+  assert torch.allclose(
+      openelm_goldens, output["logits"][0, idx.shape[1] - 1, :], atol=1e-05
+  )
+
+
+if __name__ == "__main__":
+  input_checkpoint_path = os.path.join(
+      pathlib.Path.home(), "Downloads/llm_data/openelm"
+  )
+  define_and_run(input_checkpoint_path)
```
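The new openelm.py gives each of the 36 blocks its own head count and FFN width. A standalone rendering of the intermediate-size schedule used above (the helpers are copied here only for inspection; the printed values follow from the formula in the file):

```python
def make_divisible(v: float, d: int) -> int:
  """Rounds v to a multiple of d without going down by more than 10%."""
  new_v = int(v + d / 2) // d * d
  if new_v < 0.9 * v:
    new_v += d
  return new_v


def get_intermediate_size(idx: int) -> int:
  return make_divisible((0.5 + 0.1 * idx) * 3072, 256)


# First and last blocks of the 36-layer config:
print(get_intermediate_size(0))   # 1536
print(get_intermediate_size(35))  # 12288
```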
ai_edge_torch/generative/examples/{experimental/phi → phi}/convert_to_tflite.py

```diff
@@ -12,16 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
-
-# Please use with caution.
+
+"""Example of converting a Phi-2 model to multi-signature tflite model."""
 
 import os
-
+import pathlib
 
 import ai_edge_torch
-from ai_edge_torch.generative.examples.
-from ai_edge_torch.generative.layers
+from ai_edge_torch.generative.examples.phi import phi2
+from ai_edge_torch.generative.layers import kv_cache
 from ai_edge_torch.generative.quantize import quant_recipes
 import torch
 
@@ -32,9 +31,8 @@ def convert_phi2_to_tflite(
     kv_cache_max_len: int = 1024,
     quantize: bool = True,
 ):
-  """
+  """Converts a Phi-2 model to multi-signature tflite model.
 
-  tflite model.
   Args:
     checkpoint_path (str): The filepath to the model checkpoint, or directory
       holding the checkpoint.
@@ -49,11 +47,11 @@ def convert_phi2_to_tflite(
       checkpoint_path, kv_cache_max_len=kv_cache_max_len
   )
   # Tensors used to trace the model graph during conversion.
-  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.
-  prefill_input_pos = torch.arange(0, prefill_seq_len)
-  decode_token = torch.tensor([[0]], dtype=torch.
-  decode_input_pos = torch.tensor([0], dtype=torch.
-  kv =
+  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.int)
+  prefill_input_pos = torch.arange(0, prefill_seq_len, dtype=torch.int)
+  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
+  kv = kv_cache.KVCache.from_model_config(pytorch_model.config)
 
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
   edge_model = (
@@ -77,11 +75,12 @@ def convert_phi2_to_tflite(
     )
     .convert(quant_config=quant_config)
   )
+  quant_suffix = 'q8' if quantize else 'f32'
   edge_model.export(
-      f'/tmp/
+      f'/tmp/phi2_{quant_suffix}_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
   )
 
 
 if __name__ == '__main__':
-
-  convert_phi2_to_tflite(
+  path = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/phi2')
+  convert_phi2_to_tflite(path)
```
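And the equivalent usage sketch for the Phi-2 converter; the checkpoint path mirrors the `__main__` block and the explicit arguments are illustrative:

```python
import os
import pathlib

from ai_edge_torch.generative.examples.phi import convert_to_tflite

checkpoint = os.path.join(pathlib.Path.home(), "Downloads/llm_data/phi2")
# Writes /tmp/phi2_q8_seq512_ekv1024.tflite.
convert_to_tflite.convert_phi2_to_tflite(
    checkpoint, prefill_seq_len=512, kv_cache_max_len=1024, quantize=True
)
```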