PyPI - ai-edge-torch-nightly - Versions diffs - 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

Files changed (169) hide show

ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py CHANGED Viewed

@@ -13,54 +13,56 @@
 # limitations under the License.
 # ==============================================================================
-import os
-from pathlib import Path
+"""Example of converting TinyLlama model to multi-signature tflite model."""
-import torch
+import os
+import pathlib
-import ai_edge_torch
+from absl import app
+from absl import flags
 from ai_edge_torch.generative.examples.tiny_llama import tiny_llama
-from ai_edge_torch.generative.quantize import quant_recipes
+from ai_edge_torch.generative.utilities import converter
+_CHECKPOINT_PATH = flags.DEFINE_string(
+    'checkpoint_path',
+    os.path.join(pathlib.Path.home(), 'Downloads/llm_data/tiny_llama'),
+    'The path to the model checkpoint, or directory holding the checkpoint.',
+)
+_TFLITE_PATH = flags.DEFINE_string(
+    'tflite_path',
+    '/tmp/',
+    'The tflite file path to export.',
+)
+_PREFILL_SEQ_LEN = flags.DEFINE_integer(
+    'prefill_seq_len',
+    1024,
+    'The maximum size of prefill input tensor.',
+)
+_KV_CACHE_MAX_LEN = flags.DEFINE_integer(
+    'kv_cache_max_len',
+    1280,
+    'The maximum size of KV cache buffer, including both prefill and decode.',
+)
+_QUANTIZE = flags.DEFINE_bool(
+    'quantize',
+    True,
+    'Whether the model should be quantized.',
+)
-def convert_tiny_llama_to_tflite(
-    checkpoint_path: str,
-    prefill_seq_len: int = 512,
-    kv_cache_max_len: int = 1024,
-    quantize: bool = True,
-):
-  """An example method for converting TinyLlama model to multi-signature
-  tflite model.
-  Args:
-      checkpoint_path (str): The filepath to the model checkpoint, or directory holding the checkpoint.
-      prefill_seq_len (int, optional): The maximum size of prefill input tensor.
-        Defaults to 512.
-      kv_cache_max_len (int, optional): The maximum size of KV cache buffer,
-        including both prefill and decode. Defaults to 1024.
-      quantize (bool, optional): Whether the model should be quanized.
-        Defaults to True.
-  """
+def main(_):
   pytorch_model = tiny_llama.build_model(
-      checkpoint_path, kv_cache_max_len=kv_cache_max_len
+      _CHECKPOINT_PATH.value, kv_cache_max_len=_KV_CACHE_MAX_LEN.value
   )
-  # Tensors used to trace the model graph during conversion.
-  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.long)
-  prefill_input_pos = torch.arange(0, prefill_seq_len)
-  decode_token = torch.tensor([[0]], dtype=torch.long)
-  decode_input_pos = torch.tensor([0], dtype=torch.int64)
-  quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
-  edge_model = (
-      ai_edge_torch.signature(
-          'prefill', pytorch_model, (prefill_tokens, prefill_input_pos)
-      )
-      .signature('decode', pytorch_model, (decode_token, decode_input_pos))
-      .convert(quant_config=quant_config)
+  quant_suffix = 'q8' if _QUANTIZE.value else 'f32'
+  output_filename = f'tinyllama_{quant_suffix}_seq{_PREFILL_SEQ_LEN.value}_ekv{_KV_CACHE_MAX_LEN.value}.tflite'
+  converter.convert_to_tflite(
+      pytorch_model,
+      tflite_path=os.path.join(_TFLITE_PATH.value, output_filename),
+      prefill_seq_len=_PREFILL_SEQ_LEN.value,
+      quantize=_QUANTIZE.value,
   )
-  edge_model.export(f'/tmp/tiny_llama_seq{prefill_seq_len}_kv{kv_cache_max_len}.tflite')
 if __name__ == '__main__':
-  checkpoint_path = os.path.join(Path.home(), 'Downloads/llm_data/tiny_llama')
-  convert_tiny_llama_to_tflite(checkpoint_path)
+  app.run(main)

ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py CHANGED Viewed

@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# Example of building a TinyLlama model from the Edge Generative API layers.
-import os
-from pathlib import Path
+"""Example of building a TinyLlama model."""
-import numpy as np
-import torch
-import torch.nn as nn
-from ai_edge_torch.generative.layers.attention import TransformerBlock
+from ai_edge_torch.generative.layers import attention
+from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
-import ai_edge_torch.generative.layers.builder as builder
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.utilities.loader as loading_utils
+import torch
+from torch import nn
 TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
     ff_up_proj="model.layers.{}.mlp.up_proj",
@@ -36,19 +33,19 @@ TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
     attn_value_proj="model.layers.{}.self_attn.v_proj",
     attn_output_proj="model.layers.{}.self_attn.o_proj",
     pre_attn_norm="model.layers.{}.input_layernorm",
-    pre_ff_norm="model.layers.{}.post_attention_layernorm",
+    post_attn_norm="model.layers.{}.post_attention_layernorm",
     embedding="model.embed_tokens",
     final_norm="model.norm",
     lm_head="lm_head",
 )
-class TinyLLamma(nn.Module):
+class TinyLlama(nn.Module):
+  """A TinyLlama model built from the Edge Generative API layers."""
   def __init__(self, config: cfg.ModelConfig):
     super().__init__()
-    self.config = config
     # Construct model layers.
     self.lm_head = nn.Linear(
         config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
@@ -56,35 +53,48 @@ class TinyLLamma(nn.Module):
     self.tok_embedding = nn.Embedding(
         config.vocab_size, config.embedding_dim, padding_idx=0
     )
+    # TinyLlama has only one block config.
+    block_config = config.block_config(0)
     self.transformer_blocks = nn.ModuleList(
-        TransformerBlock(config) for _ in range(config.num_layers)
+        attention.TransformerBlock(block_config, config)
+        for _ in range(config.num_layers)
     )
     self.final_norm = builder.build_norm(
         config.embedding_dim,
         config.final_norm_config,
     )
+    attn_config = block_config.attn_config
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.kv_cache_max,
-        dim=int(config.attn_config.rotary_percentage * config.head_dim),
+        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
         base=10_000,
         condense_ratio=1,
         dtype=torch.float32,
         device=torch.device("cpu"),
     )
     self.mask_cache = attn_utils.build_causal_mask_cache(
-        size=config.kv_cache_max, dtype=torch.float32, device=torch.device("cpu")
+        size=config.kv_cache_max,
+        dtype=torch.float32,
+        device=torch.device("cpu"),
     )
     self.config = config
-  # The model's forward function takes in additional k/v cache tensors
-  # and returns the updated k/v cache tensors to the caller.
-  # This can be eliminated if we handle k/v cache updates inside the model itself.
   @torch.inference_mode
-  def forward(self, idx: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor:
-    B, T = idx.size()
-    assert (
-        self.config.max_seq_len >= T
-    ), f"Cannot forward sequence of length {T}, max seq length is only {self.config.max_seq_len}"
+  def forward(
+      self,
+      tokens: torch.Tensor,
+      input_pos: torch.Tensor,
+      kv_cache: kv_utils.KVCache,
+  ) -> dict[torch.Tensor, kv_utils.KVCache]:
+    _, seq_len = tokens.size()
+    assert self.config.max_seq_len >= seq_len, (
+        f"Cannot forward sequence of length {seq_len}, max seq length is only"
+        f" {self.config.max_seq_len}"
+    )
+    assert len(self.transformer_blocks) == len(kv_cache.caches), (
+        "The number of transformer blocks and the number of KV cache entries"
+        " must be the same."
+    )
     cos, sin = self.rope_cache
     cos = cos.index_select(0, input_pos)
@@ -92,21 +102,35 @@ class TinyLLamma(nn.Module):
     mask = self.mask_cache.index_select(2, input_pos)
     mask = mask[:, :, :, : self.config.kv_cache_max]
-    # forward the model itself
-    x = self.tok_embedding(idx)  # token embeddings of shape (b, t, n_embd)
+    # token embeddings of shape (b, t, n_embd)
+    x = self.tok_embedding(tokens)
+    updated_kv_entires = []
     for i, block in enumerate(self.transformer_blocks):
-      x = block(x, (cos, sin), mask, input_pos)
+      kv_entry = kv_cache.caches[i] if kv_cache else None
+      x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
+      if kv_entry:
+        updated_kv_entires.append(kv_entry)
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
     x = self.final_norm(x)
-    res = self.lm_head(x)  # (b, t, vocab_size)
-    return res
+    logits = self.lm_head(x)  # (b, t, vocab_size)
+    return {"logits": logits, "kv_cache": updated_kv_cache}
 def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+  """Returns the model config for a TinyLlama model.
+  Args:
+    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
+      is 1024.
+  Returns:
+    The model config for a TinyLlama model.
+  """
   attn_config = cfg.AttentionConfig(
       num_heads=32,
+      head_dim=64,
       num_query_groups=4,
       rotary_percentage=1.0,
   )
@@ -116,49 +140,38 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       intermediate_size=5632,
   )
   norm_config = cfg.NormalizationConfig(type=cfg.NormalizationType.RMS_NORM)
+  block_config = cfg.TransformerBlockConfig(
+      attn_config=attn_config,
+      ff_config=ff_config,
+      pre_attention_norm_config=norm_config,
+      post_attention_norm_config=norm_config,
+  )
   config = cfg.ModelConfig(
       vocab_size=32000,
       num_layers=22,
       max_seq_len=2048,
       embedding_dim=2048,
       kv_cache_max_len=kv_cache_max_len,
-      attn_config=attn_config,
-      ff_config=ff_config,
-      pre_attention_norm_config=norm_config,
-      pre_ff_norm_config=norm_config,
+      block_configs=block_config,
       final_norm_config=norm_config,
       enable_hlfb=True,
   )
   return config
-def get_fake_model_config_for_test() -> cfg.ModelConfig:
-  config = get_model_config()
+def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
+  config = get_model_config(**kwargs)
   config.vocab_size = 128
   config.num_layers = 2
-  config.ff_config.intermediate_size = 256
+  # TinyLlama has only one block config.
+  config.block_config(0).ff_config.intermediate_size = 64
   return config
-def build_model(checkpoint_path, **kwargs) -> nn.Module:
+def build_model(checkpoint_path: str, **kwargs) -> nn.Module:
   config = get_model_config(**kwargs)
-  model = TinyLLamma(config)
+  model = TinyLlama(config)
   loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
   loader.load(model)
+  model.eval()
   return model
-def define_and_run() -> None:
-  kv_cache_max_len = 1024
-  checkpoint_path = os.path.join(Path.home(), "Downloads/llm_data/tiny_llama")
-  model = build_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
-  idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
-  tokens[0, :4] = idx
-  input_pos = torch.arange(0, kv_cache_max_len)
-  print("running an inference")
-  print(model.forward(tokens, input_pos))
-if __name__ == "__main__":
-  define_and_run()

ai_edge_torch/generative/examples/tiny_llama/verify.py ADDED Viewed

@@ -0,0 +1,64 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Verifies the reauthored TinyLlama-1.1B model."""
+import logging
+import pathlib
+from absl import app
+from absl import flags
+from ai_edge_torch.generative.examples.tiny_llama import tiny_llama
+from ai_edge_torch.generative.utilities import verifier
+import transformers
+_PROMPTS = flags.DEFINE_multi_string(
+    "prompts",
+    "Show me the program to add 2 and 3.",
+    "The input prompts to generate answers.",
+)
+def main(_):
+  checkpoint = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+  logging.info("Loading the original model from: %s", checkpoint)
+  wrapper_model = verifier.ModelWrapper(
+      model=transformers.AutoModelForCausalLM.from_pretrained(
+          checkpoint, trust_remote_code=True
+      ),
+  )
+  # Locate the cached dir.
+  cached_config_file = transformers.utils.cached_file(
+      checkpoint, transformers.utils.CONFIG_NAME
+  )
+  reauthored_checkpoint = pathlib.Path(cached_config_file).parent
+  logging.info("Building the reauthored model from: %s", reauthored_checkpoint)
+  reauthored_model = tiny_llama.build_model(reauthored_checkpoint)
+  logging.info("Loading the tokenizer from: %s", checkpoint)
+  tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
+  verifier.verify_reauthored_model(
+      original_model=wrapper_model,
+      reauthored_model=reauthored_model,
+      tokenizer=tokenizer,
+      generate_prompts=_PROMPTS.value,
+      atol=1e-04,
+  )
+if __name__ == "__main__":
+  app.run(main)

ai_edge_torch/generative/fx_passes/__init__.py CHANGED Viewed

@@ -12,17 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+from ai_edge_torch import fx_pass_base
+from ai_edge_torch.fx_pass_base import CanonicalizePass
+from ai_edge_torch.generative.fx_passes.remove_sdpa_zero_mask_pass import RemoveSDPACompositeZeroMaskPass
 import torch
-from ai_edge_torch.convert.fx_passes import CanonicalizePass
-from ai_edge_torch.convert.fx_passes import run_passes
-from ai_edge_torch.generative.fx_passes.remove_sdpa_zero_mask_pass import RemoveSDPACompositeZeroMaskPass  # NOQA
 def run_generative_passes(
     exported_program: torch.export.ExportedProgram,
 ) -> torch.export.ExportedProgram:
-  return run_passes(
+  return fx_pass_base.run_passes(
       exported_program,
       [
           RemoveSDPACompositeZeroMaskPass(),

ai_edge_torch/generative/fx_passes/remove_sdpa_zero_mask_pass.py CHANGED Viewed

@@ -12,13 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+from ai_edge_torch import fx_pass_base
+from ai_edge_torch import lowertools
 import torch
-from ai_edge_torch.convert.fx_passes._pass_base import ExportedProgramPassBase
-from ai_edge_torch.convert.fx_passes._pass_base import ExportedProgramPassResult  # NOQA
-class RemoveSDPACompositeZeroMaskPass(ExportedProgramPassBase):
+class RemoveSDPACompositeZeroMaskPass(fx_pass_base.ExportedProgramPassBase):
   def is_zero_tensor_node(self, node: torch.fx.Node):
     return node.target == torch.ops.aten.zeros.default
@@ -28,7 +27,7 @@ class RemoveSDPACompositeZeroMaskPass(ExportedProgramPassBase):
     for node in graph.nodes:
       if not (
           node.op == "call_function"
-          and node.target == torch.ops.xla.mark_tensor.default
+          and node.target == lowertools.mark_tensor_op
       ):
         continue
@@ -36,7 +35,11 @@ class RemoveSDPACompositeZeroMaskPass(ExportedProgramPassBase):
       # Composite info:
       # - name: odml.scaled_dot_product_attention
       # - inputs: q, k, v, mask
-      if name == "odml.scaled_dot_product_attention" and is_input and io_position == 3:
+      if (
+          name == "odml.scaled_dot_product_attention"
+          and is_input
+          and io_position == 3
+      ):
         if self.is_zero_tensor_node(source):
           # Remove the mark_tensor call on the mask input by
           # replacing the target with an identity function.
@@ -44,4 +47,4 @@ class RemoveSDPACompositeZeroMaskPass(ExportedProgramPassBase):
     exported_program.graph_module.graph.lint()
     exported_program.graph_module.recompile()
-    return ExportedProgramPassResult(exported_program, True)
+    return fx_pass_base.ExportedProgramPassResult(exported_program, True)

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl