PyPI - ai-edge-torch-nightly - Versions diffs - 0.3.0.dev20240926__py3-none-any.whl → 0.3.0.dev20240929__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.3.0.dev20240926__py3-none-any.whl → 0.3.0.dev20240929__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

ai_edge_torch/generative/examples/test_models/toy_model.py CHANGED Viewed

@@ -44,13 +44,10 @@ class ToySingleLayerModel(torch.nn.Module):
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.max_seq_len,
         dim=int(attn_config.rotary_percentage * attn_config.head_dim),
-        base=10_000,
-        condense_ratio=1,
-        dtype=torch.float32,
-        device=torch.device('cpu'),
+        base=attn_config.rotary_base,
     )
     self.mask_cache = attn_utils.build_causal_mask_cache(
-        size=config.max_seq_len, dtype=torch.float32, device=torch.device('cpu')
+        size=config.max_seq_len,
     )
     self.config = config
@@ -93,13 +90,10 @@ class ToySingleLayerModelWeightSharing(torch.nn.Module):
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.max_seq_len,
         dim=int(attn_config.rotary_percentage * attn_config.head_dim),
-        base=10_000,
-        condense_ratio=1,
-        dtype=torch.float32,
-        device=torch.device('cpu'),
+        base=attn_config.rotary_base,
     )
     self.mask_cache = attn_utils.build_causal_mask_cache(
-        size=config.max_seq_len, dtype=torch.float32, device=torch.device('cpu')
+        size=config.max_seq_len,
     )
     self.config = config
@@ -124,6 +118,7 @@ def get_model_config() -> cfg.ModelConfig:
       num_heads=32,
       head_dim=4,
       num_query_groups=4,
+      rotary_base=10000,
       rotary_percentage=1.0,
       enable_kv_cache=False,
   )

ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py CHANGED Viewed

@@ -51,13 +51,10 @@ class ToyModelWithKVCache(torch.nn.Module):
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.max_seq_len,
         dim=int(attn_config.rotary_percentage * attn_config.head_dim),
-        base=10_000,
-        condense_ratio=1,
-        dtype=torch.float32,
-        device=torch.device('cpu'),
+        base=attn_config.rotary_base,
     )
     self.mask_cache = attn_utils.build_causal_mask_cache(
-        size=config.max_seq_len, dtype=torch.float32, device=torch.device('cpu')
+        size=config.max_seq_len,
     )
     self.config = config
@@ -91,6 +88,7 @@ def get_model_config() -> cfg.ModelConfig:
       num_heads=32,
       head_dim=4,
       num_query_groups=4,
+      rotary_base=10000,
       rotary_percentage=1.0,
   )
   ff_config = cfg.FeedForwardConfig(

ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py CHANGED Viewed

@@ -67,15 +67,10 @@ class TinyLlama(nn.Module):
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.kv_cache_max,
         dim=int(attn_config.rotary_percentage * attn_config.head_dim),
-        base=10_000,
-        condense_ratio=1,
-        dtype=torch.float32,
-        device=torch.device("cpu"),
+        base=attn_config.rotary_base,
     )
     self.mask_cache = attn_utils.build_causal_mask_cache(
         size=config.kv_cache_max,
-        dtype=torch.float32,
-        device=torch.device("cpu"),
     )
     self.config = config
@@ -132,6 +127,7 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       num_heads=32,
       head_dim=64,
       num_query_groups=4,
+      rotary_base=10000,
       rotary_percentage=1.0,
   )
   ff_config = cfg.FeedForwardConfig(

ai_edge_torch/generative/examples/tiny_llama/verify.py CHANGED Viewed

@@ -21,6 +21,7 @@ import pathlib
 from absl import app
 from absl import flags
 from ai_edge_torch.generative.examples.tiny_llama import tiny_llama
+from ai_edge_torch.generative.utilities import transformers_verifier
 from ai_edge_torch.generative.utilities import verifier
 import transformers
@@ -30,16 +31,20 @@ _PROMPTS = flags.DEFINE_multi_string(
     "Show me the program to add 2 and 3.",
     "The input prompts to generate answers.",
 )
+_MAX_NEW_TOKENS = flags.DEFINE_integer(
+    "max_new_tokens",
+    30,
+    "The maximum size of the generated tokens.",
+)
 def main(_):
   checkpoint = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
   logging.info("Loading the original model from: %s", checkpoint)
-  wrapper_model = verifier.ModelWrapper(
-      model=transformers.AutoModelForCausalLM.from_pretrained(
-          checkpoint, trust_remote_code=True
-      ),
+  original_model = transformers.AutoModelForCausalLM.from_pretrained(
+      checkpoint, trust_remote_code=True
   )
   # Locate the cached dir.
   cached_config_file = transformers.utils.cached_file(
       checkpoint, transformers.utils.CONFIG_NAME
@@ -52,10 +57,13 @@ def main(_):
   tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
   verifier.verify_reauthored_model(
-      original_model=wrapper_model,
-      reauthored_model=reauthored_model,
-      tokenizer=tokenizer,
+      original_model=transformers_verifier.TransformersModelWrapper(
+          original_model
+      ),
+      reauthored_model=verifier.ReauthoredModelWrapper(reauthored_model),
+      tokenizer=verifier.TokenizerWrapper(tokenizer),
       generate_prompts=_PROMPTS.value,
+      max_new_tokens=_MAX_NEW_TOKENS.value,
       atol=1e-04,
   )

ai_edge_torch/generative/layers/model_config.py CHANGED Viewed

@@ -83,6 +83,8 @@ class AttentionConfig:
   # Used to determine number of groups in grouped query attention (GQA)
   # https://arxiv.org/pdf/2305.13245.pdf
   num_query_groups: Optional[int]
+  # Base of rotary positional embedding.
+  rotary_base: int = 10_000
   # Percentage of Rotary Positional Embedding added Q and K projections.
   rotary_percentage: Optional[float] = None
   # Whether to transpose the query groups of qkv bundled tensor before

ai_edge_torch/generative/test/test_model_conversion_large.py CHANGED Viewed

@@ -19,9 +19,11 @@ import ai_edge_torch
 from ai_edge_torch import config as ai_edge_config
 from ai_edge_torch.generative.examples.gemma import gemma1
 from ai_edge_torch.generative.examples.gemma import gemma2
+from ai_edge_torch.generative.examples.llama import llama
 from ai_edge_torch.generative.examples.openelm import openelm
 from ai_edge_torch.generative.examples.phi import phi2
 from ai_edge_torch.generative.examples.phi import phi3
+from ai_edge_torch.generative.examples.qwen import qwen
 from ai_edge_torch.generative.examples.smollm import smollm
 from ai_edge_torch.generative.examples.stable_diffusion import clip as sd_clip
 from ai_edge_torch.generative.examples.stable_diffusion import decoder as sd_decoder
@@ -102,6 +104,15 @@ class TestModelConversion(googletest.TestCase):
     pytorch_model = gemma2.Gemma2(config).eval()
     self._test_model(config, pytorch_model, "prefill", atol=1e-4, rtol=1e-5)
+  @googletest.skipIf(
+      ai_edge_config.Config.use_torch_xla,
+      reason="tests with custom ops are not supported on oss",
+  )
+  def test_llama(self):
+    config = llama.get_fake_model_config()
+    pytorch_model = llama.Llama(config).eval()
+    self._test_model(config, pytorch_model, "prefill", atol=1e-3, rtol=1e-5)
   @googletest.skipIf(
       ai_edge_config.Config.use_torch_xla,
       reason="tests with custom ops are not supported on oss",
@@ -142,6 +153,15 @@ class TestModelConversion(googletest.TestCase):
     pytorch_model = openelm.OpenELM(config).eval()
     self._test_model(config, pytorch_model, "prefill", atol=1e-4, rtol=1e-5)
+  @googletest.skipIf(
+      ai_edge_config.Config.use_torch_xla,
+      reason="tests with custom ops are not supported on oss",
+  )
+  def test_qwen(self):
+    config = qwen.get_fake_model_config()
+    pytorch_model = qwen.Qwen(config).eval()
+    self._test_model(config, pytorch_model, "prefill", atol=1e-3, rtol=1e-5)
   @googletest.skipIf(
       ai_edge_config.Config.use_torch_xla,
       reason="tests with custom ops are not supported on oss",

ai_edge_torch/generative/utilities/transformers_verifier.py ADDED Viewed

@@ -0,0 +1,42 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for the models predefined in HuggingFace transformers."""
+from typing import cast
+from ai_edge_torch.generative.utilities import verifier
+import torch
+import transformers
+class TransformersModelWrapper(verifier.ModelWrapper):
+  """A wrapper for the model predefined in HuggingFace transformers.
+  Verifier expects forward() to return logits while Transformers models return
+  an object with `logits` field.
+  Transformers models get `max_new_tokens` settings for generate() via
+  GenerationConfig.
+  """
+  def forward(self, tokens: torch.Tensor) -> torch.Tensor:
+    return self.model.forward(tokens).logits
+  def generate(
+      self, inputs: torch.Tensor, max_new_tokens: int
+  ) -> torch.IntTensor:
+    gen_config = transformers.GenerationConfig(max_new_tokens=max_new_tokens)
+    return self.model.generate(inputs=inputs, generation_config=gen_config)

ai_edge_torch/generative/utilities/verifier.py CHANGED Viewed

@@ -16,111 +16,129 @@
 """Common utility functions to verify the reauthored models."""
 import logging
-from typing import List, Optional, Union
+from typing import List
 from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import torch
-import transformers
 class ModelWrapper(torch.nn.Module):
-  """A wrapper for the model to be verified, this could be a HuggingFace model
+  """A wrapper for the model to be verified.
-  or a regular PyTorch model.
+  It unifies the interface of forward() and generate() of models for the
+  verification to call.
   """
-  def __init__(
-      self,
-      model: torch.nn.Module,
-      model_format: str = "huggingface",
-      hf_generation_config: Optional[transformers.GenerationConfig] = None,
-  ):
+  def __init__(self, model: torch.nn.Module):
     """Initializes the wrapper.
     Args:
-      model (torch.nn.Module): The original model. This could be a model built
-        from HuggingFace transformers, or a regular PyTorch model.
-      model_format (str): The format of the model. It should be either
-        "huggingface" or "pytorch".
-      hf_generation_config (transformers.GenerationConfig): The HuggingFace
-        generation config. This config will only be used if the underlying model
-        is built from HuggingFace transformers.
+      model (torch.nn.Module): The model which might have different interfaces
+        of forward() and generate(). It could be a model built from HuggingFace
+        transformers, a regular PyTorch model, or a model re-authored with
+        ai_edge_torch Generative API.
     """
     super().__init__()
     self.model = model
-    self.model_format = model_format
-    self.hf_generation_config = hf_generation_config
+  def forward(self, tokens: torch.Tensor) -> torch.Tensor:
+    """Gets output logits by forwarding the input tokens.
+    Args:
+      tokens (torch.Tensor): The input tokens to forward. Its dimension is
+        expected to be (batch_size=1, kv_cache_max_len).
+    Returns:
+      The output logits.
+    """
+    raise NotImplementedError("forward() is not implemented.")
   def generate(
-      self, inputs: torch.Tensor
-  ) -> Union[transformers.utils.ModelOutput, torch.LongTensor]:
-    if self.model_format == "huggingface":
-      return self.model.generate(
-          inputs=inputs, generation_config=self.hf_generation_config
-      )
-    else:
-      raise NotImplementedError(
-          "generate() is not implemented for model format: %s"
-          % self.model_format
-      )
+      self, prompts: torch.Tensor, max_new_tokens: int
+  ) -> torch.IntTensor:
+    """Returns the response token IDs to the given prompts tensor.
-  def forward(
-      self,
-      inputs: torch.Tensor,
-  ):
-    return self.model.forward(inputs)
+    The maximum number of tokens to generate might be set by subclasses.
+    Args:
+      prompts (torch.Tensor): The input token IDs to generate with. Its shape is
+        expected to be (batch_size=1, input_ids_len).
+      max_new_tokens (int): The maximum number of response token IDs to
+        generate.
+    Returns:
+      The tensor of response token IDs with shape of (batch_size=1,
+      response_ids_len).
+    """
+    raise NotImplementedError("generate() is not implemented.")
-def forward(
-    model: torch.nn.Module,
-    tokens: torch.Tensor,
-    kv_cache: kv_utils.KVCache,
-) -> tuple[torch.Tensor, kv_utils.KVCache]:
-  """Forwards the model reauthored with ai_edge_torch Generative API.
-  Args:
-    model (torch.nn.Module): The model to forward. It should be a model built
-      with ai_edge_torch Generative API.
-    tokens (torch.Tensor): The input tokens to forward.
-    kv_cache (KVCache): The KV cache to forward.
+class ReauthoredModelWrapper(ModelWrapper):
+  """A wrapper for the model reauthored with ai_edge_torch Generative API."""
-  Returns:
-    The output logits and the updated KV cache.
-  """
-  input_pos = torch.arange(0, tokens.shape[1], dtype=torch.int)
-  output = model.forward(tokens, input_pos, kv_cache)
-  return output["logits"], output["kv_cache"]
+  def _init_kv_cache(self):
+    """Returns an initialized KV cache."""
+    return kv_utils.KVCache.from_model_config(self.model.config)
+  def _forward_with_kv_cache(
+      self,
+      tokens: torch.Tensor,
+      kv_cache: kv_utils.KVCache,
+  ) -> tuple[torch.Tensor, kv_utils.KVCache]:
+    """Forwards the model and updates an external KV cache.
-def generate(
-    model: torch.nn.Module, prompts: torch.Tensor, response_len: int
-) -> torch.Tensor:
-  """Generates the response to the prompts.
+    Args:
+      tokens (torch.Tensor): The input tokens to forward.
+      kv_cache (KVCache): The KV cache to forward.
-  It appends tokens output by the model to the prompts and feeds them back to
-  the model up to decode_len.
+    Returns:
+      The output logits and the updated KV cache.
+    """
+    input_pos = torch.arange(0, tokens.shape[1], dtype=torch.int)
+    output = self.model.forward(tokens, input_pos, kv_cache)
+    return output["logits"], output["kv_cache"]
-  Args:
-    model (torch.nn.Module): The model to generate. It should be a model built
-      with ai_edge_torch Generative API.
-    prompts (torch.Tensor): The prompts to generate.
-    response_len (int): The number of tokens to generate.
+  def forward(self, tokens: torch.Tensor) -> torch.Tensor:
+    logits, _ = self._forward_with_kv_cache(tokens, self._init_kv_cache())
+    return logits
-  Returns:
-    The generated tokens.
-  """
-  input_ids = prompts[0].int().tolist()
-  kv_cache = kv_utils.KVCache.from_model_config(model.config)
-  for _ in range(response_len - len(input_ids)):
-    logits, kv_cache = forward(model, torch.tensor([input_ids]), kv_cache)
-    generated_token = logits[0][-1].argmax().item()
-    input_ids.append(generated_token)
-  return torch.tensor([input_ids])
+  def generate(
+      self, prompts: torch.Tensor, max_new_tokens: int
+  ) -> torch.IntTensor:
+    input_ids = prompts[0].int().tolist()
+    kv_cache = self._init_kv_cache()
+    for _ in range(max_new_tokens):
+      tokens = torch.tensor([input_ids])
+      logits, kv_cache = self._forward_with_kv_cache(tokens, kv_cache)
+      generated_token = logits[0][-1].argmax().item()
+      input_ids.append(generated_token)
+    return torch.tensor([input_ids])
+class TokenizerWrapper(torch.nn.Module):
+  """A wrapper for the tokenizer used for verification."""
+  def __init__(self, tokenizer: torch.nn.Module):
+    """Initializes the wrapper.
+    Args:
+      tokenizer (torch.nn.Module): The tokenizer to wrap.
+    """
+    super().__init__()
+    self.tokenizer = tokenizer
+  def encode(self, prompts: str) -> torch.Tensor:
+    """Encodes the prompts to token IDs."""
+    return self.tokenizer.encode(prompts, return_tensors="pt")
+  def decode(self, token_ids: torch.Tensor) -> str:
+    """Decodes the token IDs to a string."""
+    return self.tokenizer.decode(token_ids)
 def verify_with_input_ids(
     original_model: ModelWrapper,
-    reauthored_model: torch.nn.Module,
+    reauthored_model: ReauthoredModelWrapper,
     input_ids: List[int],
     kv_cache_max_len: int = 1024,
     rtol: float = 1e-05,
@@ -132,8 +150,8 @@ def verify_with_input_ids(
   Args:
     original_model (ModelWrapper): The original model.
-    reauthored_model (torch.nn.Module): The model reauthored with ai_edge_torch
-      Generative API.
+    reauthored_model (ReauthoredModelWrapper): The model reauthored with
+      ai_edge_torch Generative API.
     input_ids (List[int]): The input token IDs to forward with.
     kv_cache_max_len (int): The maximum sequence length of the KV cache.
     rtol (float): The relative tolerance for the comparison.
@@ -147,13 +165,12 @@ def verify_with_input_ids(
   logging.info("Forwarding the original model...")
   outputs_original = original_model.forward(tokens)
-  logits_original = outputs_original.logits[0, len(input_ids) - 1, :]
+  logits_original = outputs_original[0, len(input_ids) - 1, :]
   logging.info("logits_original: %s", logits_original)
   logging.info("Forwarding the reauthored model...")
-  kv_cache = kv_utils.KVCache.from_model_config(reauthored_model.config)
-  outputs_reauthored = forward(reauthored_model, tokens, kv_cache)
-  logits_reauthored = outputs_reauthored[0][0, len(input_ids) - 1, :]
+  outputs_reauthored = reauthored_model.forward(tokens)
+  logits_reauthored = outputs_reauthored[0, len(input_ids) - 1, :]
   logging.info("logits_reauthored: %s", logits_reauthored)
   return torch.allclose(
@@ -163,9 +180,10 @@ def verify_with_input_ids(
 def verify_model_with_prompts(
     original_model: ModelWrapper,
-    reauthored_model: torch.nn.Module,
-    tokenizer: torch.nn.Module,
+    reauthored_model: ReauthoredModelWrapper,
+    tokenizer: TokenizerWrapper,
     prompts: str,
+    max_new_tokens: int,
 ) -> bool:
   """Verifies if the model reauthored generates the same answer of the oringal.
@@ -174,24 +192,24 @@ def verify_model_with_prompts(
   Args:
     original_model (ModelWrapper): The original model.
-    reauthored_model (torch.nn.Module): The model reauthored with ai_edge_torch
-      Generative API.
-    tokenizer (torch.nn.Module): The tokenizer.
+    reauthored_model (ReauthoredModelWrapper): The model reauthored with
+      ai_edge_torch Generative API.
+    tokenizer (TokenizerWrapper): The tokenizer.
     prompts (str): The input prompts to generate answers.
+    max_new_tokens (int): The maximum number of new tokens to generate.
   Returns:
     True if the model reauthored generates the same answer of the original.
   """
-  prompt_tokens = tokenizer.encode(prompts, return_tensors="pt")
+  prompt_tokens = tokenizer.encode(prompts)
   logging.info("Generating answer with the original model...")
-  outputs_original = original_model.generate(prompt_tokens)
+  outputs_original = original_model.generate(prompt_tokens, max_new_tokens)
   response_original = tokenizer.decode(outputs_original[0])
   logging.info("outputs_from_original_model: [[%s]]", response_original)
   logging.info("Generating answer with the reauthored model...")
-  generate_len = len(outputs_original[0])
-  outputs_reauthored = generate(reauthored_model, prompt_tokens, generate_len)
+  outputs_reauthored = reauthored_model.generate(prompt_tokens, max_new_tokens)
   response_reauthored = tokenizer.decode(outputs_reauthored[0])
   logging.info("outputs from reauthored model: [[%s]]", response_reauthored)
@@ -200,9 +218,10 @@ def verify_model_with_prompts(
 def verify_reauthored_model(
     original_model: ModelWrapper,
-    reauthored_model: torch.nn.Module,
-    tokenizer: torch.nn.Module,
+    reauthored_model: ReauthoredModelWrapper,
+    tokenizer: TokenizerWrapper,
     generate_prompts: List[str],
+    max_new_tokens: int = 30,
     forward_input_ids: List[List[int]] = [[1, 2, 3, 4]],
     rtol: float = 1e-05,
     atol: float = 1e-05,
@@ -219,10 +238,11 @@ def verify_reauthored_model(
   Args:
     original_model (ModelWrapper): The original model.
-    reauthored_model (torch.nn.Module): The model reauthored with ai_edge_torch
-      Generative API.
-    tokenizer (torch.nn.Module): The tokenizer.
+    reauthored_model (ReauthoredModelWrapper): The model reauthored with
+      ai_edge_torch Generative API.
+    tokenizer (TokenizerWrapper): The tokenizer.
     generate_prompts (List[str]): List of the input prompts to generate answers.
+    max_new_tokens (int): The maximum number of new tokens to generate.
     forward_input_ids (List[torch.Tensor]): List if ihe input token IDs to
       forward with.
     rtol (float): The relative tolerance for the comparison.
@@ -235,13 +255,13 @@ def verify_reauthored_model(
     ):
       logging.info("PASS")
     else:
-      logging.info("FAILED")
+      logging.error("FAILED")
   for prompts in generate_prompts:
     logging.info("Verifying the reauthored model with prompts:%s", prompts)
     if verify_model_with_prompts(
-        original_model, reauthored_model, tokenizer, prompts
+        original_model, reauthored_model, tokenizer, prompts, max_new_tokens
     ):
       logging.info("PASS")
     else:
-      logging.info("FAILED")
+      logging.error("FAILED")

ai_edge_torch/version.py CHANGED Viewed

@@ -13,4 +13,4 @@
 # limitations under the License.
 # ==============================================================================
-__version__ = "0.3.0.dev20240926"
+__version__ = "0.3.0.dev20240929"

{ai_edge_torch_nightly-0.3.0.dev20240926.dist-info → ai_edge_torch_nightly-0.3.0.dev20240929.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ai-edge-torch-nightly
-Version: 0.3.0.dev20240926
+Version: 0.3.0.dev20240929
 Summary: Supporting PyTorch models with the Google AI Edge TFLite runtime.
 Home-page: https://github.com/google-ai-edge/ai-edge-torch
 Keywords: On-Device ML,AI,Google,TFLite,PyTorch,LLMs,GenAI

ai-edge-torch-nightly 0.3.0.dev20240926__py3-none-any.whl → 0.3.0.dev20240929__py3-none-any.whl

ai-edge-torch-nightly 0.3.0.dev20240926__py3-none-any.whl → 0.3.0.dev20240929__py3-none-any.whl