ai-edge-torch-nightly 0.2.0.dev20240718__py3-none-any.whl → 0.2.0.dev20240720__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ai-edge-torch-nightly might be problematic.

Files changed (23)
  1. ai_edge_torch/convert/conversion_utils.py +39 -18
  2. ai_edge_torch/convert/test/test_convert.py +106 -0
  3. ai_edge_torch/generative/examples/experimental/__init__.py +14 -0
  4. ai_edge_torch/generative/examples/experimental/gemma/__init__.py +14 -0
  5. ai_edge_torch/generative/examples/experimental/gemma/convert_to_tflite.py +87 -0
  6. ai_edge_torch/generative/examples/experimental/gemma/gemma.py +195 -0
  7. ai_edge_torch/generative/examples/experimental/phi/__init__.py +14 -0
  8. ai_edge_torch/generative/examples/experimental/phi/convert_to_tflite.py +84 -0
  9. ai_edge_torch/generative/examples/experimental/phi/phi2.py +184 -0
  10. ai_edge_torch/generative/examples/experimental/tiny_llama/__init__.py +14 -0
  11. ai_edge_torch/generative/examples/experimental/tiny_llama/convert_to_tflite.py +89 -0
  12. ai_edge_torch/generative/examples/experimental/tiny_llama/tiny_llama.py +185 -0
  13. ai_edge_torch/generative/examples/gemma/gemma.py +6 -2
  14. ai_edge_torch/generative/examples/phi2/phi2.py +5 -2
  15. ai_edge_torch/generative/examples/t5/t5.py +5 -2
  16. ai_edge_torch/generative/examples/test_models/toy_model_with_external_kv_cache.py +42 -27
  17. ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +6 -2
  18. ai_edge_torch/generative/test/test_experimental_ekv.py +122 -0
  19. {ai_edge_torch_nightly-0.2.0.dev20240718.dist-info → ai_edge_torch_nightly-0.2.0.dev20240720.dist-info}/METADATA +1 -1
  20. {ai_edge_torch_nightly-0.2.0.dev20240718.dist-info → ai_edge_torch_nightly-0.2.0.dev20240720.dist-info}/RECORD +23 -12
  21. {ai_edge_torch_nightly-0.2.0.dev20240718.dist-info → ai_edge_torch_nightly-0.2.0.dev20240720.dist-info}/LICENSE +0 -0
  22. {ai_edge_torch_nightly-0.2.0.dev20240718.dist-info → ai_edge_torch_nightly-0.2.0.dev20240720.dist-info}/WHEEL +0 -0
  23. {ai_edge_torch_nightly-0.2.0.dev20240718.dist-info → ai_edge_torch_nightly-0.2.0.dev20240720.dist-info}/top_level.txt +0 -0
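The bulk of this release adds experimental Gemma, Phi-2, and TinyLlama examples whose prefill and decode signatures pass the KV cache around explicitly (EKVCache) instead of mutating buffers inside the model. As a rough usage sketch based on the code added in this diff (the checkpoint location is a placeholder and must point to a real TinyLlama checkpoint), the new experimental TinyLlama converter can be driven like this:

import os
from pathlib import Path

from ai_edge_torch.generative.examples.experimental.tiny_llama import convert_to_tflite

# Placeholder path; replace with the directory holding your TinyLlama checkpoint.
checkpoint_path = os.path.join(Path.home(), 'Downloads/llm_data/tiny_llama')

# Exports /tmp/tiny_llama_seq512_ekv1024.tflite with 'prefill' and 'decode'
# signatures, each taking the external KV cache as an explicit input.
convert_to_tflite.convert_tiny_llama_to_tflite(
    checkpoint_path,
    prefill_seq_len=512,
    kv_cache_max_len=1024,
    quantize=True,
)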
ai_edge_torch/generative/examples/experimental/phi/phi2.py
@@ -0,0 +1,184 @@
+ # Copyright 2024 The AI Edge Torch Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+ # Example of building phi-2 model from the Edge Generative API layers.
+ #
+ # Note: This is an experimental version of phi2 with external KV cache.
+ # Please use with caution.
+
+
+ import os
+ from pathlib import Path
+ from typing import Tuple
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+
+ import ai_edge_torch.generative.layers.attention_utils as attn_utils
+ import ai_edge_torch.generative.layers.builder as builder
+ from ai_edge_torch.generative.layers.experimental import ekv_cache as kv_utils
+ from ai_edge_torch.generative.layers.experimental.attention import TransformerBlock # NOQA
+ import ai_edge_torch.generative.layers.model_config as cfg
+ import ai_edge_torch.generative.utilities.loader as loading_utils
+
+ TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
+     ff_up_proj="model.layers.{}.mlp.fc1",
+     ff_down_proj="model.layers.{}.mlp.fc2",
+     attn_query_proj="model.layers.{}.self_attn.q_proj",
+     attn_key_proj="model.layers.{}.self_attn.k_proj",
+     attn_value_proj="model.layers.{}.self_attn.v_proj",
+     attn_output_proj="model.layers.{}.self_attn.dense",
+     pre_attn_norm="model.layers.{}.input_layernorm",
+     embedding="model.embed_tokens",
+     final_norm="model.final_layernorm",
+     lm_head="lm_head",
+ )
+
+
+ class Phi2(nn.Module):
+
+   def __init__(self, config: cfg.ModelConfig):
+     super().__init__()
+
+     self.config = config
+     # Construct model layers.
+     self.lm_head = nn.Linear(
+         config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
+     )
+     self.tok_embedding = nn.Embedding(
+         config.vocab_size, config.embedding_dim, padding_idx=0
+     )
+     self.transformer_blocks = nn.ModuleList(
+         TransformerBlock(config) for _ in range(config.num_layers)
+     )
+     self.final_norm = builder.build_norm(
+         config.embedding_dim,
+         config.final_norm_config,
+     )
+     self.rope_cache = attn_utils.build_rope_cache(
+         size=config.kv_cache_max,
+         dim=int(config.attn_config.rotary_percentage * config.head_dim),
+         base=10_000,
+         condense_ratio=1,
+         dtype=torch.float32,
+         device=torch.device("cpu"),
+     )
+     self.mask_cache = attn_utils.build_causal_mask_cache(
+         size=config.kv_cache_max, dtype=torch.float32, device=torch.device("cpu")
+     )
+     self.config = config
+
+   @torch.inference_mode
+   def forward(
+       self,
+       tokens: torch.Tensor,
+       input_pos: torch.Tensor,
+       kv_cache: kv_utils.EKVCache,
+   ) -> Tuple[torch.Tensor, kv_utils.EKVCache]:
+     B, T = tokens.size()
+     assert (
+         self.config.max_seq_len >= T
+     ), f"Cannot forward sequence of length {T}, max seq length is only {self.config.max_seq_len}"
+
+     cos, sin = self.rope_cache
+     cos = cos.index_select(0, input_pos)
+     sin = sin.index_select(0, input_pos)
+     mask = self.mask_cache.index_select(2, input_pos)
+     mask = mask[:, :, :, : self.config.kv_cache_max]
+
+     x = self.tok_embedding(tokens)
+
+     updated_kv_entires = []
+     for i, block in enumerate(self.transformer_blocks):
+       kv_entry = kv_cache.caches[i] if kv_cache else None
+       x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
+       if kv_entry:
+         updated_kv_entires.append(kv_entry)
+     updated_kv_cache = kv_utils.EKVCache(tuple(updated_kv_entires))
+
+     x = self.final_norm(x)
+     res = self.lm_head(x) # (b, t, vocab_size)
+     return res, updated_kv_cache
+
+
+ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+   attn_config = cfg.AttentionConfig(
+       num_heads=32,
+       num_query_groups=32,
+       rotary_percentage=0.4,
+       qkv_use_bias=True,
+       output_proj_use_bias=True,
+   )
+   ff_config = cfg.FeedForwardConfig(
+       type=cfg.FeedForwardType.SEQUENTIAL,
+       activation=cfg.ActivationConfig(cfg.ActivationType.GELU_TANH),
+       intermediate_size=10240,
+       use_bias=True,
+   )
+   norm_config = cfg.NormalizationConfig(type=cfg.NormalizationType.LAYER_NORM)
+   config = cfg.ModelConfig(
+       vocab_size=51200,
+       num_layers=32,
+       max_seq_len=2048,
+       kv_cache_max_len=kv_cache_max_len,
+       embedding_dim=2560,
+       attn_config=attn_config,
+       ff_config=ff_config,
+       pre_attention_norm_config=norm_config,
+       final_norm_config=norm_config,
+       parallel_residual=True,
+       lm_head_use_bias=True,
+       enable_hlfb=True,
+   )
+   return config
+
+
+ def get_fake_model_config_for_test(**kwargs) -> cfg.ModelConfig:
+   config = get_model_config(**kwargs)
+   config.num_layers = 2
+   return config
+
+
+ def build_model(checkpoint_path, test_model=False, **kwargs) -> nn.Module:
+   config = (
+       get_fake_model_config_for_test(**kwargs)
+       if test_model
+       else get_model_config(**kwargs)
+   )
+   model = Phi2(config)
+   if checkpoint_path is not None:
+     loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
+     loader.load(model)
+   model.eval()
+   return model
+
+
+ def define_and_run(checkpoint_path, test_model=False) -> None:
+   kv_cache_max_len = 1024
+   model = build_model(
+       checkpoint_path, test_model=test_model, kv_cache_max_len=kv_cache_max_len
+   )
+   idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
+   tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
+   tokens[0, :4] = idx
+   input_pos = torch.arange(0, kv_cache_max_len)
+   kv = kv_utils.EKVCache.from_model_config(model.config)
+   print("running an inference")
+   print(model.forward(tokens, input_pos, kv))
+
+
+ if __name__ == "__main__":
+   checkpoint_path = os.path.join(Path.home(), "Downloads/phi2")
+   define_and_run(checkpoint_path)
ai_edge_torch/generative/examples/experimental/tiny_llama/__init__.py
@@ -0,0 +1,14 @@
+ # Copyright 2024 The AI Edge Torch Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
ai_edge_torch/generative/examples/experimental/tiny_llama/convert_to_tflite.py
@@ -0,0 +1,89 @@
+ # Copyright 2024 The AI Edge Torch Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+ #
+ # Note: This is an experimental version of TinyLlama with external KV cache.
+ # Please use with caution.
+
+
+ import os
+ from pathlib import Path
+
+ import torch
+
+ import ai_edge_torch
+ from ai_edge_torch.generative.examples.experimental.tiny_llama import tiny_llama # NOQA
+ from ai_edge_torch.generative.layers.experimental import ekv_cache as kv_utils
+ from ai_edge_torch.generative.quantize import quant_recipes
+
+
+ def convert_tiny_llama_to_tflite(
+     checkpoint_path: str,
+     prefill_seq_len: int = 512,
+     kv_cache_max_len: int = 1024,
+     quantize: bool = True,
+ ):
+   """An example method for converting TinyLlama model to multi-signature
+   tflite model.
+
+   Args:
+       checkpoint_path (str): The filepath to the model checkpoint, or directory
+         holding the checkpoint.
+       prefill_seq_len (int, optional): The maximum size of prefill input tensor.
+         Defaults to 512.
+       kv_cache_max_len (int, optional): The maximum size of KV cache buffer,
+         including both prefill and decode. Defaults to 1024.
+       quantize (bool, optional): Whether the model should be quanized.
+         Defaults to True.
+   """
+   pytorch_model = tiny_llama.build_model(
+       checkpoint_path, kv_cache_max_len=kv_cache_max_len
+   )
+   # Tensors used to trace the model graph during conversion.
+   prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.long)
+   prefill_input_pos = torch.arange(0, prefill_seq_len)
+   decode_token = torch.tensor([[0]], dtype=torch.long)
+   decode_input_pos = torch.tensor([0], dtype=torch.int64)
+   kv = kv_utils.EKVCache.from_model_config(pytorch_model.config)
+
+   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
+   edge_model = (
+       ai_edge_torch.signature(
+           'prefill',
+           pytorch_model,
+           sample_kwargs={
+               'tokens': prefill_tokens,
+               'input_pos': prefill_input_pos,
+               'kv_cache': kv,
+           },
+       )
+       .signature(
+           'decode',
+           pytorch_model,
+           sample_kwargs={
+               'tokens': decode_token,
+               'input_pos': decode_input_pos,
+               'kv_cache': kv,
+           },
+       )
+       .convert(quant_config=quant_config)
+   )
+   edge_model.export(
+       f'/tmp/tiny_llama_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
+   )
+
+
+ if __name__ == '__main__':
+   checkpoint_path = os.path.join(Path.home(), 'Downloads/llm_data/tiny_llama')
+   convert_tiny_llama_to_tflite(checkpoint_path)
ai_edge_torch/generative/examples/experimental/tiny_llama/tiny_llama.py
@@ -0,0 +1,185 @@
+ # Copyright 2024 The AI Edge Torch Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+ # Example of building a TinyLlama model from the Edge Generative API layers.
+ #
+ # Note: This is an experimental version of TinyLlama with external KV cache.
+ # Please use with caution.
+
+
+ import os
+ from pathlib import Path
+ from typing import Tuple
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+
+ import ai_edge_torch.generative.layers.attention_utils as attn_utils
+ import ai_edge_torch.generative.layers.builder as builder
+ from ai_edge_torch.generative.layers.experimental import ekv_cache as kv_utils
+ from ai_edge_torch.generative.layers.experimental.attention import TransformerBlock # NOQA
+ import ai_edge_torch.generative.layers.model_config as cfg
+ import ai_edge_torch.generative.utilities.loader as loading_utils
+
+ TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
+     ff_up_proj="model.layers.{}.mlp.up_proj",
+     ff_down_proj="model.layers.{}.mlp.down_proj",
+     ff_gate_proj="model.layers.{}.mlp.gate_proj",
+     attn_query_proj="model.layers.{}.self_attn.q_proj",
+     attn_key_proj="model.layers.{}.self_attn.k_proj",
+     attn_value_proj="model.layers.{}.self_attn.v_proj",
+     attn_output_proj="model.layers.{}.self_attn.o_proj",
+     pre_attn_norm="model.layers.{}.input_layernorm",
+     pre_ff_norm="model.layers.{}.post_attention_layernorm",
+     embedding="model.embed_tokens",
+     final_norm="model.norm",
+     lm_head="lm_head",
+ )
+
+
+ class TinyLLamma(nn.Module):
+
+   def __init__(self, config: cfg.ModelConfig):
+     super().__init__()
+
+     self.config = config
+     # Construct model layers.
+     self.lm_head = nn.Linear(
+         config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
+     )
+     self.tok_embedding = nn.Embedding(
+         config.vocab_size, config.embedding_dim, padding_idx=0
+     )
+     self.transformer_blocks = nn.ModuleList(
+         TransformerBlock(config) for _ in range(config.num_layers)
+     )
+     self.final_norm = builder.build_norm(
+         config.embedding_dim,
+         config.final_norm_config,
+     )
+     self.rope_cache = attn_utils.build_rope_cache(
+         size=config.kv_cache_max,
+         dim=int(config.attn_config.rotary_percentage * config.head_dim),
+         base=10_000,
+         condense_ratio=1,
+         dtype=torch.float32,
+         device=torch.device("cpu"),
+     )
+     self.mask_cache = attn_utils.build_causal_mask_cache(
+         size=config.kv_cache_max, dtype=torch.float32, device=torch.device("cpu")
+     )
+     self.config = config
+
+   @torch.inference_mode
+   def forward(
+       self,
+       tokens: torch.Tensor,
+       input_pos: torch.Tensor,
+       kv_cache: kv_utils.EKVCache,
+   ) -> Tuple[torch.Tensor, kv_utils.EKVCache]:
+     B, T = tokens.size()
+     assert (
+         self.config.max_seq_len >= T
+     ), f"Cannot forward sequence of length {T}, max seq length is only {self.config.max_seq_len}"
+
+     cos, sin = self.rope_cache
+     cos = cos.index_select(0, input_pos)
+     sin = sin.index_select(0, input_pos)
+     mask = self.mask_cache.index_select(2, input_pos)
+     mask = mask[:, :, :, : self.config.kv_cache_max]
+
+     # token embeddings of shape (b, t, n_embd)
+     x = self.tok_embedding(tokens)
+
+     updated_kv_entires = []
+     for i, block in enumerate(self.transformer_blocks):
+       kv_entry = kv_cache.caches[i] if kv_cache else None
+       x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
+       if kv_entry:
+         updated_kv_entires.append(kv_entry)
+     updated_kv_cache = kv_utils.EKVCache(tuple(updated_kv_entires))
+
+     x = self.final_norm(x)
+     res = self.lm_head(x) # (b, t, vocab_size)
+     return res, updated_kv_cache
+
+
+ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+   attn_config = cfg.AttentionConfig(
+       num_heads=32,
+       num_query_groups=4,
+       rotary_percentage=1.0,
+   )
+   ff_config = cfg.FeedForwardConfig(
+       type=cfg.FeedForwardType.GATED,
+       activation=cfg.ActivationConfig(cfg.ActivationType.SILU),
+       intermediate_size=5632,
+   )
+   norm_config = cfg.NormalizationConfig(type=cfg.NormalizationType.RMS_NORM)
+   config = cfg.ModelConfig(
+       vocab_size=32000,
+       num_layers=22,
+       max_seq_len=2048,
+       embedding_dim=2048,
+       kv_cache_max_len=kv_cache_max_len,
+       attn_config=attn_config,
+       ff_config=ff_config,
+       pre_attention_norm_config=norm_config,
+       pre_ff_norm_config=norm_config,
+       final_norm_config=norm_config,
+       enable_hlfb=True,
+   )
+   return config
+
+
+ def get_fake_model_config_for_test(**kwargs) -> cfg.ModelConfig:
+   config = get_model_config(**kwargs)
+   config.vocab_size = 128
+   config.num_layers = 2
+   config.ff_config.intermediate_size = 256
+   return config
+
+
+ def build_model(checkpoint_path, test_model=False, **kwargs) -> nn.Module:
+   config = (
+       get_fake_model_config_for_test(**kwargs)
+       if test_model
+       else get_model_config(**kwargs)
+   )
+   model = TinyLLamma(config)
+   if checkpoint_path is not None:
+     loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
+     loader.load(model)
+   model.eval()
+   return model
+
+
+ def define_and_run(checkpoint_path, test_model=False) -> None:
+   kv_cache_max_len = 1024
+   model = build_model(
+       checkpoint_path, test_model=test_model, kv_cache_max_len=kv_cache_max_len
+   )
+   idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
+   tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
+   tokens[0, :4] = idx
+   input_pos = torch.arange(0, kv_cache_max_len)
+   kv = kv_utils.EKVCache.from_model_config(model.config)
+   print("running an inference")
+   print(model.forward(tokens, input_pos, kv))
+
+
+ if __name__ == "__main__":
+   checkpoint_path = os.path.join(Path.home(), "Downloads/tiny_llama")
+   define_and_run(checkpoint_path)
ai_edge_torch/generative/examples/gemma/gemma.py
@@ -159,6 +159,9 @@ def build_2b_model(checkpoint_path, **kwargs) -> nn.Module:


  def define_and_run_2b() -> None:
+   current_dir = Path(__file__).parent.resolve()
+   gemma_goldens = torch.load(current_dir / "gemma_lm_logits.pt")
+
    kv_cache_max_len = 1024
    checkpoint_path = os.path.join(Path.home(), "Downloads/llm_data/gemma-2b")
    model = build_2b_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
@@ -166,8 +169,9 @@ def define_and_run_2b() -> None:
    tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
    tokens[0, :4] = idx
    input_pos = torch.arange(0, kv_cache_max_len)
-   print("running an inference")
-   print(model.forward(tokens, input_pos))
+   lm_logits = model.forward(tokens, input_pos)
+   print("comparing with goldens..")
+   assert torch.allclose(gemma_goldens, lm_logits[0, idx.shape[1] - 1, :], atol=1e-05)


  if __name__ == "__main__":
ai_edge_torch/generative/examples/phi2/phi2.py
@@ -149,6 +149,8 @@ def build_model(checkpoint_path, **kwargs) -> nn.Module:


  def define_and_run() -> None:
+   current_dir = Path(__file__).parent.resolve()
+   phi2_goldens = torch.load(current_dir / "phi2_lm_logits.pt")
    kv_cache_max_len = 1024
    checkpoint_path = os.path.join(Path.home(), "Downloads/llm_data/phi2")
    model = build_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
@@ -156,8 +158,9 @@ def define_and_run() -> None:
    tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
    tokens[0, :4] = idx
    input_pos = torch.arange(0, kv_cache_max_len)
-   print("running an inference")
-   print(model.forward(tokens, input_pos))
+   lm_logits = model.forward(tokens, input_pos)
+   print("comparing with goldens..")
+   assert torch.allclose(phi2_goldens, lm_logits[0, idx.shape[1] - 1, :], atol=1e-05)


  if __name__ == "__main__":
ai_edge_torch/generative/examples/t5/t5.py
@@ -557,7 +557,8 @@ def get_sample_encoder_input_ids() -> torch.Tensor:


  def define_and_run_t5(checkpoint_path: str) -> None:
-   t5_goldens = torch.load("t5_lm_logits.pt")
+   current_dir = Path(__file__).parent.resolve()
+   t5_goldens = torch.load(current_dir / "t5_lm_logits.pt")

    model = build_t5_model(checkpoint_path)

@@ -579,7 +580,9 @@ def define_and_run_t5(checkpoint_path: str) -> None:

  # TODO(haoliang): Move those tests.
  def define_and_run_t5_split(checkpoint_path: str) -> None:
-   t5_goldens = torch.load("t5_lm_logits.pt")
+   current_dir = Path(__file__).parent.resolve()
+   t5_goldens = torch.load(current_dir / "t5_lm_logits.pt")
+
    config = get_model_config_t5()
    embedding_layer = nn.Embedding(config.vocab_size, config.embedding_dim, padding_idx=0)
    t5_encoder_model = build_t5_encoder_model(config, embedding_layer, checkpoint_path)
ai_edge_torch/generative/examples/test_models/toy_model_with_external_kv_cache.py
@@ -14,9 +14,8 @@
  # ==============================================================================
  # A toy example which has basic transformer block (w/ externalized KV-Cache).

- from typing import List, Tuple
+ from typing import Tuple

- import numpy as np
  import torch
  import torch.nn as nn
  import torch_xla
@@ -24,6 +23,7 @@ import torch_xla
  import ai_edge_torch
  import ai_edge_torch.generative.layers.attention_utils as attn_utils
  import ai_edge_torch.generative.layers.builder as builder
+ from ai_edge_torch.generative.layers.experimental import ekv_cache as kv_utils
  from ai_edge_torch.generative.layers.experimental.attention import TransformerBlock # NOQA
  import ai_edge_torch.generative.layers.model_config as cfg

@@ -60,27 +60,27 @@ class ToyModelWithExternalKV(torch.nn.Module):

    def forward(
        self,
-       idx: torch.Tensor,
+       tokens: torch.Tensor,
        input_pos: torch.Tensor,
-       k_caches: torch.Tensor,
-       v_caches: torch.Tensor,
-   ) -> (torch.Tensor, torch.Tensor, torch.Tensor):
-     x = self.tok_embedding(idx)
+       kv_cache: kv_utils.EKVCache,
+   ) -> Tuple[torch.Tensor, kv_utils.EKVCache]:
+     x = self.tok_embedding(tokens)
      cos, sin = self.rope_cache
      cos = cos.index_select(0, input_pos)
      sin = sin.index_select(0, input_pos)
      mask = self.mask_cache.index_select(2, input_pos)
      mask = mask[:, :, :, : self.config.max_seq_len]

+     updated_kv_entires = []
      for i, block in enumerate(self.transformer_blocks):
-       input_k, input_v = k_caches[i], v_caches[i]
-       x, (updated_k, updated_v) = block(
-           x, (cos, sin), mask, input_pos, (input_k, input_v)
-       )
-       k_caches[i], v_caches[i] = updated_k, updated_v
+       kv_entry = kv_cache.caches[i] if kv_cache else None
+       x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
+       if kv_entry:
+         updated_kv_entires.append(kv_entry)

      x = self.final_norm(x)
-     return self.lm_head(x), k_caches, v_caches
+     updated_kv_cache = kv_utils.EKVCache(tuple(updated_kv_entires))
+     return self.lm_head(x), updated_kv_cache


  def _export_stablehlo_mlir(model, args):
@@ -115,15 +115,15 @@ def get_model_config() -> cfg.ModelConfig:


  def get_sample_prefill_inputs() -> Tuple[torch.Tensor, torch.Tensor]:
-   idx = torch.unsqueeze(torch.arange(0, 100), 0)
+   tokens = torch.unsqueeze(torch.arange(0, 100), 0)
    input_pos = torch.arange(0, 100)
-   return idx, input_pos
+   return tokens, input_pos


  def get_sample_decode_inputs() -> Tuple[torch.Tensor, torch.Tensor]:
-   idx = torch.tensor([[1]], dtype=torch.long)
+   tokens = torch.tensor([[1]], dtype=torch.long)
    input_pos = torch.tensor([10])
-   return idx, input_pos
+   return tokens, input_pos


  def define_and_run() -> None:
@@ -131,16 +131,16 @@ def define_and_run() -> None:

    config = get_model_config()
    model = ToyModelWithExternalKV(config)
+   model.eval()
    print('running an inference')
-   k_caches = torch.zeros((2, 1, 100, 4, 4), dtype=torch.float32)
-   v_caches = torch.zeros((2, 1, 100, 4, 4), dtype=torch.float32)
+   kv = kv_utils.EKVCache.from_model_config(config)

-   idx, input_pos = get_sample_prefill_inputs()
-   decode_idx, decode_input_pos = get_sample_decode_inputs()
-   print(model.forward(idx, input_pos, k_caches, v_caches))
+   tokens, input_pos = get_sample_prefill_inputs()
+   decode_token, decode_input_pos = get_sample_decode_inputs()
+   print(model.forward(tokens, input_pos, kv))

    if dump_mlir:
-     mlir_text = _export_stablehlo_mlir(model, (idx, input_pos, k_caches, v_caches))
+     mlir_text = _export_stablehlo_mlir(model, (tokens, input_pos, kv))
      with open('/tmp/toy_model_with_external_kv.stablehlo.mlir', 'w') as f:
        f.write(mlir_text)

@@ -149,13 +149,28 @@ def define_and_run() -> None:
    # in dynamic update slice op.
    print('converting toy model to tflite with 2 signatures (prefill + decode)')
    edge_model = (
-       ai_edge_torch.signature('prefill', model, (idx, input_pos, k_caches, v_caches))
-       .signature('decode', model, (decode_idx, decode_input_pos, k_caches, v_caches))
+       ai_edge_torch.signature(
+           'prefill',
+           model,
+           sample_kwargs={
+               'tokens': tokens,
+               'input_pos': input_pos,
+               'kv_cache': kv,
+           },
+       )
+       .signature(
+           'decode',
+           model,
+           sample_kwargs={
+               'tokens': decode_token,
+               'input_pos': decode_input_pos,
+               'kv_cache': kv,
+           },
+       )
      .convert()
    )
    edge_model.export('/tmp/toy_external_kv_cache.tflite')


  if __name__ == '__main__':
-   with torch.inference_mode():
-     define_and_run()
+   define_and_run()