PyPI - ai-edge-torch-nightly - Versions diffs - 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

Files changed (169) hide show

ai_edge_torch/generative/quantize/quant_recipe.py CHANGED Viewed

@@ -25,7 +25,8 @@ class LayerQuantRecipe:
   """Quantization recipe for a single Edge Generative API layer (e.g. Attention).
   Generic layer-scoped quantization recipe that specifies how this layer should
-  be quantized by the Edge Generative API. This is applicable to layers implemented
+  be quantized by the Edge Generative API. This is applicable to layers
+  implemented
   in ai_edge_torch/generative/layers/. Combinations of attributes that are not
   supported during runtime will be detected when .verify() is called.
@@ -74,7 +75,8 @@ class LayerQuantRecipe:
     if not is_valid:
       raise ValueError(
-          'Unsupported LayerQuantRecipe configuration. See get_supported_recipe_matrix()'
+          'Unsupported LayerQuantRecipe configuration. See'
+          ' get_supported_recipe_matrix()'
       )
@@ -82,7 +84,8 @@ class LayerQuantRecipe:
 class GenerativeQuantRecipe:
   """Quantization recipe for a model composed of the Edge Generative API layers.
-  Some layers can be specified with different `LayerQuantRecipe` for each block by
+  Some layers can be specified with different `LayerQuantRecipe` for each block
+  by
   providing a dictionary keyed by the TransformerBlock index, e.g. attention
   and feedforward. For example,
@@ -101,11 +104,11 @@ class GenerativeQuantRecipe:
     default: The quantization recipe for global scope of the model.
     embedding: Recipe for the embedding table.
     attention: Recipe for the attention blocks. This could be specified with
-      different LayerQuantRecipe for each block by providing a dictionary
-      keyed by the TransformerBlock index.
+      different LayerQuantRecipe for each block by providing a dictionary keyed
+      by the TransformerBlock index.
     feedforward: Recipe for the feedforward layers. This could be specified with
-      different LayerQuantRecipe for each block by providing a dictionary
-      keyed by the TransformerBlock index.
+      different LayerQuantRecipe for each block by providing a dictionary keyed
+      by the TransformerBlock index.
   """
   default: Optional[LayerQuantRecipe] = None

ai_edge_torch/generative/quantize/quant_recipe_utils.py CHANGED Viewed

@@ -16,7 +16,8 @@
 """Helper functions to construct custom quantization recipes.
 These are intended for more advanced users who want to configure their own
-quantization recipes. For pre-constructed recipes, use `quant_recipes.py` instead.
+quantization recipes. For pre-constructed recipes, use `quant_recipes.py`
+instead.
 Typical usage example:
@@ -41,6 +42,16 @@ def create_layer_quant_int8_dynamic() -> quant_recipe.LayerQuantRecipe:
   )
+def create_layer_quant_int8_weight_only() -> quant_recipe.LayerQuantRecipe:
+  return quant_recipe.LayerQuantRecipe(
+      activation_dtype=quant_attrs.Dtype.FP32,
+      weight_dtype=quant_attrs.Dtype.INT8,
+      mode=quant_attrs.Mode.WEIGHT_ONLY,
+      algorithm=quant_attrs.Algorithm.MIN_MAX,
+      granularity=quant_attrs.Granularity.CHANNELWISE,
+  )
 def create_layer_quant_fp16() -> quant_recipe.LayerQuantRecipe:
   return quant_recipe.LayerQuantRecipe(
       activation_dtype=quant_attrs.Dtype.FP32,

ai_edge_torch/generative/quantize/quant_recipes.py CHANGED Viewed

@@ -40,6 +40,14 @@ def full_int8_dynamic_recipe() -> quant_config.QuantConfig:
   )
+def full_int8_weight_only_recipe() -> quant_config.QuantConfig:
+  return quant_config.QuantConfig(
+      generative_recipe=quant_recipe.GenerativeQuantRecipe(
+          default=quant_recipe_utils.create_layer_quant_int8_weight_only(),
+      )
+  )
 def full_fp16_recipe() -> quant_config.QuantConfig:
   return quant_config.QuantConfig(
       generative_recipe=quant_recipe.GenerativeQuantRecipe(

ai_edge_torch/generative/test/test_kv_cache.py ADDED Viewed

@@ -0,0 +1,120 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A suite of tests to validate KV Cache layer."""
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
+import ai_edge_torch.generative.layers.model_config as cfg
+import torch
+from absl.testing import absltest as googletest
+class TestKVLayers(googletest.TestCase):
+  def _get_test_config(
+      self, num_layers, head_dim, num_query_groups, kv_cache_max_len
+  ):
+    attn_config = cfg.AttentionConfig(
+        num_heads=1, head_dim=head_dim, num_query_groups=num_query_groups
+    )
+    block_config = cfg.TransformerBlockConfig(
+        attn_config=attn_config, ff_config=None
+    )
+    config = cfg.ModelConfig(
+        kv_cache_max_len=kv_cache_max_len,
+        embedding_dim=head_dim,
+        block_configs=block_config,
+        num_layers=num_layers,
+        max_seq_len=None,
+        vocab_size=None,
+    )
+    return config
+  def test_cache_udpate(self):
+    N = 1
+    HEAD_DIM = 2
+    NUM_QG = 1
+    KV_LEN = 4
+    config = self._get_test_config(
+        num_layers=N,
+        head_dim=HEAD_DIM,
+        num_query_groups=NUM_QG,
+        kv_cache_max_len=KV_LEN,
+    )
+    kv = kv_utils.KVCache.from_model_config(config)
+    entry = kv.caches[0]
+    # single-slice update
+    input_pos = torch.tensor([1])
+    k_slice = v_slice = torch.full(
+        (1, 1, NUM_QG, HEAD_DIM), 5, dtype=torch.float
+    )
+    updated_entry = kv_utils.update(entry, input_pos, k_slice, v_slice)
+    self.assertEqual(
+        updated_entry.k_cache.numpy().flatten().tolist(),
+        [0, 0, 5, 5, 0, 0, 0, 0],
+    )
+    self.assertEqual(
+        updated_entry.v_cache.numpy().flatten().tolist(),
+        [0, 0, 5, 5, 0, 0, 0, 0],
+    )
+    # multi-slice update
+    input_pos = torch.tensor([0, 3])
+    k_slice = v_slice = torch.full(
+        (1, 2, NUM_QG, HEAD_DIM), 7, dtype=torch.float
+    )
+    updated_entry = kv_utils.update(entry, input_pos, k_slice, v_slice)
+    self.assertEqual(
+        updated_entry.k_cache.numpy().flatten().tolist(),
+        [7, 7, 0, 0, 0, 0, 7, 7],
+    )
+    self.assertEqual(
+        updated_entry.v_cache.numpy().flatten().tolist(),
+        [7, 7, 0, 0, 0, 0, 7, 7],
+    )
+  def test_serialization(self):
+    class TestModel(torch.nn.Module):
+      def forward(self, kv: kv_utils.KVCache) -> kv_utils.KVCache:
+        updated_kv_entries = [
+            kv_utils.KVCacheEntry(
+                torch.zeros_like(entry.k_cache), torch.zeros_like(entry.v_cache)
+            )
+            for entry in kv.caches
+        ]
+        return kv_utils.KVCache(updated_kv_entries)
+    N = 1
+    HEAD_DIM = 2
+    NUM_QG = 1
+    KV_LEN = 4
+    config = self._get_test_config(
+        num_layers=N,
+        head_dim=HEAD_DIM,
+        num_query_groups=NUM_QG,
+        kv_cache_max_len=KV_LEN,
+    )
+    kv = kv_utils.KVCache.from_model_config(config)
+    model = TestModel()
+    exported_program = torch.export.export(model, (kv,))
+    input_specs = exported_program.graph_signature.input_specs
+    self.assertEqual(len(input_specs), 2)
+    self.assertEqual(input_specs[0].arg.name, "kv_k_0")
+    self.assertEqual(input_specs[1].arg.name, "kv_v_0")
+if __name__ == "__main__":
+  googletest.main()

ai_edge_torch/generative/test/{loader_test.py → test_loader.py} RENAMED Viewed

@@ -16,16 +16,16 @@
 import os
 import tempfile
-import unittest
+from ai_edge_torch.generative.examples.tiny_llama import tiny_llama
+from ai_edge_torch.generative.utilities import loader as loading_utils
 import safetensors.torch
 import torch
-from ai_edge_torch.generative.examples.tiny_llama import tiny_llama
-from ai_edge_torch.generative.utilities import loader as loading_utils
+from absl.testing import absltest as googletest
-class TestLoader(unittest.TestCase):
+class TestLoader(googletest.TestCase):
   """Unit tests that check weight loader."""
   def test_load_safetensors(self):
@@ -59,7 +59,9 @@ class TestLoader(unittest.TestCase):
           "model.layers.0.mlp.down_proj.weight": torch.randn((2048, 5632)),
           "model.layers.0.mlp.gate_proj.weight": torch.randn((5632, 2048)),
           "model.layers.0.mlp.up_proj.weight": torch.randn((5632, 2048)),
-          "model.layers.0.post_attention_layernorm.weight": torch.randn((2048,)),
+          "model.layers.0.post_attention_layernorm.weight": torch.randn((
+              2048,
+          )),
           "model.layers.0.self_attn.k_proj.weight": torch.randn((256, 2048)),
           "model.layers.0.self_attn.o_proj.weight": torch.randn((2048, 2048)),
           "model.layers.0.self_attn.q_proj.weight": torch.randn((2048, 2048)),
@@ -69,7 +71,7 @@ class TestLoader(unittest.TestCase):
       safetensors.torch.save_file(test_weights, file_path)
       cfg = tiny_llama.get_model_config()
       cfg.num_layers = 1
-      model = tiny_llama.TinyLLamma(cfg)
+      model = tiny_llama.TinyLlama(cfg)
       loader = loading_utils.ModelLoader(file_path, tiny_llama.TENSOR_NAMES)
       # if returns successfully, it means all the tensors were initiallized.
@@ -77,4 +79,4 @@ class TestLoader(unittest.TestCase):
 if __name__ == "__main__":
-  unittest.main()
+  googletest.main()

ai_edge_torch/generative/test/test_model_conversion.py CHANGED Viewed

@@ -12,224 +12,160 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# Testing model conversion for a few gen-ai models.
-import copy
-import os
-import tempfile
-import unittest
-import numpy as np
-import torch
+"""Testing model conversion for a few gen-ai models."""
 import ai_edge_torch
-from ai_edge_torch.generative.examples.gemma import gemma
-from ai_edge_torch.generative.examples.phi2 import phi2
-from ai_edge_torch.generative.examples.test_models import toy_model_with_kv_cache  # NOQA
+from ai_edge_torch import config as ai_edge_config
+from ai_edge_torch.generative.examples.test_models import toy_model_with_kv_cache
 from ai_edge_torch.generative.examples.tiny_llama import tiny_llama
-from ai_edge_torch.testing import model_coverage
+from ai_edge_torch.generative.layers import kv_cache
+from ai_edge_torch.generative.test import utils as test_utils
+import numpy as np
+import torch
+from absl.testing import absltest as googletest
+from ai_edge_litert import interpreter
-class TestModelConversion(unittest.TestCase):
+class TestModelConversion(googletest.TestCase):
   """Unit tests that check for model conversion and correctness."""
-  def test_toy_model_with_kv_cache(self):
-    config = toy_model_with_kv_cache.get_model_config()
-    pytorch_model = toy_model_with_kv_cache.ToyModelWithKV(config)
-    idx, input_pos = torch.tensor([[1]], dtype=torch.long), torch.tensor(
-        [10], dtype=torch.int64
+  def setUp(self):
+    super().setUp()
+    # Builder function for an Interpreter that supports custom ops.
+    self._interpreter_builder = (
+        lambda tflite_model: lambda: interpreter.InterpreterWithCustomOps(
+            custom_op_registerers=["GenAIOpsRegisterer"],
+            model_content=tflite_model,
+            experimental_default_delegate_latest_features=True,
+        )
     )
-    edge_model = ai_edge_torch.convert(pytorch_model, (idx, input_pos))
-    # TODO(b/338288901): re-enable test to check output tensors.
-    skip_output_check = True
-    if skip_output_check is False:
-      self.assertTrue(
-          model_coverage.compare_tflite_torch(
-              edge_model,
-              pytorch_model,
-              (idx, input_pos),
-              num_valid_inputs=1,
-              atol=1e-5,
-              rtol=1e-5,
-          )
-      )
-  def test_toy_model_with_multi_batches(self):
-    config = toy_model_with_kv_cache.get_model_config()
-    config.batch_size = 2
-    pytorch_model = toy_model_with_kv_cache.ToyModelWithKV(config)
-    idx, input_pos = torch.tensor([[1], [2]], dtype=torch.long), torch.tensor(
-        [10], dtype=torch.int64
+  def _test_model_with_kv_cache(self, config, pytorch_model):
+    tokens, input_pos = torch.tensor([[1]], dtype=torch.int), torch.tensor(
+        [10], dtype=torch.int
+    )
+    kv = kv_cache.KVCache.from_model_config(config)
+    edge_model = ai_edge_torch.convert(
+        pytorch_model,
+        sample_kwargs={
+            "tokens": tokens,
+            "input_pos": input_pos,
+            "kv_cache": kv,
+        },
+    )
+    edge_model.set_interpreter_builder(
+        self._interpreter_builder(edge_model.tflite_model())
     )
-    edge_model = ai_edge_torch.convert(pytorch_model, (idx, input_pos))
-    # TODO(b/338288901): re-enable test to check output tensors.
-    skip_output_check = True
-    if skip_output_check is False:
-      self.assertTrue(
-          model_coverage.compare_tflite_torch(
-              edge_model,
-              pytorch_model,
-              (idx, input_pos),
-              num_valid_inputs=1,
-              atol=1e-5,
-              rtol=1e-5,
-          )
-      )
+    self.assertTrue(
+        test_utils.compare_tflite_torch(
+            edge_model,
+            pytorch_model,
+            tokens,
+            input_pos,
+            kv,
+            signature_name="serving_default",
+            atol=1e-5,
+            rtol=1e-5,
+        )
+    )
+  @googletest.skipIf(
+      ai_edge_config.Config.use_torch_xla,
+      reason="tests with custom ops are not supported on oss",
+  )
+  def test_toy_model_with_kv_cache(self):
+    config = toy_model_with_kv_cache.get_model_config()
+    pytorch_model = toy_model_with_kv_cache.ToyModelWithKVCache(config).eval()
+    self._test_model_with_kv_cache(config, pytorch_model)
+  @googletest.skipIf(
+      ai_edge_config.Config.use_torch_xla,
+      reason="tests with custom ops are not supported on oss",
+  )
   def test_toy_model_with_kv_cache_with_hlfb(self):
     config = toy_model_with_kv_cache.get_model_config()
     config.enable_hlfb = True
-    pytorch_model = toy_model_with_kv_cache.ToyModelWithKV(config)
-    idx, input_pos = torch.tensor([[1]], dtype=torch.long), torch.tensor(
-        [10], dtype=torch.int64
-    )
-    edge_model = ai_edge_torch.convert(pytorch_model, (idx, input_pos))
-    # TODO(b/338288901): re-enable test to check output tensors.
-    skip_output_check = True
-    if skip_output_check is False:
-      self.assertTrue(
-          model_coverage.compare_tflite_torch(
-              edge_model,
-              pytorch_model,
-              (idx, input_pos),
-              num_valid_inputs=1,
-              atol=1e-5,
-              rtol=1e-5,
-          )
-      )
-  def test_tiny_llama(self):
-    self.skipTest("b/338288901")
-    config = tiny_llama.get_fake_model_config_for_test()
-    pytorch_model = tiny_llama.TinyLLamma(config)
-    idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-    tokens = torch.full((1, 10), 0, dtype=torch.long, device="cpu")
-    tokens[0, :4] = idx
-    input_pos = torch.arange(0, 10)
-    edge_model = ai_edge_torch.convert(pytorch_model, (tokens, input_pos))
-    # TODO(b/338288901): re-enable test to check output tensors.
-    skip_output_check = True
-    if skip_output_check is False:
-      self.assertTrue(
-          model_coverage.compare_tflite_torch(
-              edge_model,
-              pytorch_model,
-              (tokens, input_pos),
-              num_valid_inputs=1,
-              atol=1e-5,
-              rtol=1e-5,
-          )
-      )
-  def test_tiny_llama_multisig(self):
-    config = tiny_llama.get_fake_model_config_for_test()
-    pytorch_model = tiny_llama.TinyLLamma(config)
+    pytorch_model = toy_model_with_kv_cache.ToyModelWithKVCache(config).eval()
+    self._test_model_with_kv_cache(config, pytorch_model)
+  def _test_multisig_model(self, config, pytorch_model, atol, rtol):
     # prefill
     seq_len = 10
-    prefill_tokens = torch.full((1, seq_len), 0, dtype=torch.long, device="cpu")
+    prefill_tokens = torch.full((1, seq_len), 0, dtype=torch.int, device="cpu")
     prompt_token = torch.from_numpy(np.array([1, 2, 3, 4]))
     prefill_tokens[0, : len(prompt_token)] = prompt_token
-    prefill_input_pos = torch.arange(0, seq_len)
+    prefill_input_pos = torch.arange(0, seq_len, dtype=torch.int)
     # decode
-    decode_token = torch.tensor([[1]], dtype=torch.long)
-    decode_input_pos = torch.tensor([5], dtype=torch.int64)
+    decode_token = torch.tensor([[1]], dtype=torch.int)
+    decode_input_pos = torch.tensor([5], dtype=torch.int)
+    kv = kv_cache.KVCache.from_model_config(config)
     edge_model = (
         ai_edge_torch.signature(
-            "prefill", pytorch_model, (prefill_tokens, prefill_input_pos)
+            "prefill",
+            pytorch_model,
+            sample_kwargs={
+                "tokens": prefill_tokens,
+                "input_pos": prefill_input_pos,
+                "kv_cache": kv,
+            },
+        )
+        .signature(
+            "decode",
+            pytorch_model,
+            sample_kwargs={
+                "tokens": decode_token,
+                "input_pos": decode_input_pos,
+                "kv_cache": kv,
+            },
         )
-        .signature("decode", pytorch_model, (decode_token, decode_input_pos))
         .convert()
     )
+    edge_model.set_interpreter_builder(
+        self._interpreter_builder(edge_model.tflite_model())
+    )
-    # TODO(b/338288901): re-enable test to check output tensors.
-    skip_output_check = True
-    if skip_output_check is False:
-      copied_model = copy.deepcopy(pytorch_model)
-      self.assertTrue(
-          model_coverage.compare_tflite_torch(
-              edge_model,
-              pytorch_model,
-              (prefill_tokens, prefill_input_pos),
-              signature_name="prefill",
-              num_valid_inputs=1,
-          )
-      )
-      self.assertTrue(
-          model_coverage.compare_tflite_torch(
-              edge_model,
-              copied_model,
-              (decode_token, decode_input_pos),
-              signature_name="decode",
-              num_valid_inputs=1,
-          )
-      )
-  def test_gemma(self):
-    self.skipTest("b/338288901")
-    config = gemma.get_fake_model_config_2b_for_test()
-    model = gemma.Gemma(config)
-    idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-    tokens = torch.full((1, 10), 0, dtype=torch.long, device="cpu")
-    tokens[0, :4] = idx
-    input_pos = torch.arange(0, 10)
-    edge_model = ai_edge_torch.convert(model, (tokens, input_pos))
-    # TODO(b/338288901): re-enable test to check output tensors.
-    skip_output_check = True
-    if skip_output_check is False:
-      # TODO(talumbau, haoliang): debug numerical diff.
-      self.assertTrue(
-          model_coverage.compare_tflite_torch(
-              edge_model,
-              model,
-              (tokens, input_pos),
-              num_valid_inputs=1,
-              atol=1e-2,
-              rtol=1e-5,
-          )
-      )
-  def test_phi2(self):
-    self.skipTest("b/338288901")
-    config = phi2.get_fake_model_config_for_test()
-    pytorch_model = phi2.Phi2(config)
-    idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-    tokens = torch.full((1, 10), 0, dtype=torch.long, device="cpu")
-    tokens[0, :4] = idx
-    input_pos = torch.arange(0, 10)
-    edge_model = ai_edge_torch.convert(pytorch_model, (tokens, input_pos))
-    # TODO(b/338288901): re-enable test to check output tensors.
-    skip_output_check = True
-    if skip_output_check is False:
-      self.assertTrue(
-          model_coverage.compare_tflite_torch(
-              edge_model,
-              pytorch_model,
-              (tokens, input_pos),
-              num_valid_inputs=1,
-              atol=1e-5,
-              rtol=1e-5,
-          )
-      )
+    self.assertTrue(
+        test_utils.compare_tflite_torch(
+            edge_model,
+            pytorch_model,
+            prefill_tokens,
+            prefill_input_pos,
+            kv,
+            signature_name="prefill",
+            atol=atol,
+            rtol=atol,
+        )
+    )
+    self.assertTrue(
+        test_utils.compare_tflite_torch(
+            edge_model,
+            pytorch_model,
+            decode_token,
+            decode_input_pos,
+            kv,
+            signature_name="decode",
+            atol=atol,
+            rtol=atol,
+        )
+    )
+  @googletest.skipIf(
+      ai_edge_config.Config.use_torch_xla,
+      reason="tests with custom ops are not supported on oss",
+  )
+  def test_tiny_llama_multisig(self):
+    config = tiny_llama.get_fake_model_config()
+    pytorch_model = tiny_llama.TinyLlama(config).eval()
+    self._test_multisig_model(config, pytorch_model, atol=1e-5, rtol=1e-5)
 if __name__ == "__main__":
-  unittest.main()
+  googletest.main()

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl