ai-edge-torch-nightly 0.3.0.dev20240909__py3-none-any.whl → 0.3.0.dev20240913__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. ai_edge_torch/_convert/test/test_convert.py +35 -13
  2. ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +31 -12
  3. ai_edge_torch/generative/examples/gemma/convert_to_tflite.py +25 -6
  4. ai_edge_torch/generative/examples/gemma/gemma.py +50 -30
  5. ai_edge_torch/generative/examples/gemma/gemma2.py +85 -58
  6. ai_edge_torch/generative/examples/{experimental/phi → phi}/convert_to_tflite.py +11 -12
  7. ai_edge_torch/generative/examples/{experimental/phi → phi}/phi2.py +46 -43
  8. ai_edge_torch/generative/examples/{experimental/gemma → smallm}/convert_to_tflite.py +12 -14
  9. ai_edge_torch/generative/examples/smallm/smallm.py +122 -0
  10. ai_edge_torch/generative/examples/stable_diffusion/clip.py +11 -5
  11. ai_edge_torch/generative/examples/t5/t5.py +35 -22
  12. ai_edge_torch/generative/examples/t5/t5_attention.py +18 -13
  13. ai_edge_torch/generative/examples/test_models/toy_model.py +15 -13
  14. ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +74 -33
  15. ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +25 -6
  16. ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +55 -34
  17. ai_edge_torch/generative/layers/attention.py +77 -73
  18. ai_edge_torch/generative/layers/builder.py +5 -3
  19. ai_edge_torch/generative/layers/kv_cache.py +163 -51
  20. ai_edge_torch/generative/layers/model_config.py +38 -19
  21. ai_edge_torch/generative/layers/normalization.py +158 -0
  22. ai_edge_torch/generative/layers/unet/blocks_2d.py +0 -2
  23. ai_edge_torch/generative/test/{test_experimental_ekv.py → test_kv_cache.py} +12 -24
  24. ai_edge_torch/generative/test/test_loader.py +1 -1
  25. ai_edge_torch/generative/test/test_model_conversion.py +72 -34
  26. ai_edge_torch/generative/test/test_model_conversion_large.py +51 -23
  27. ai_edge_torch/generative/test/utils.py +54 -0
  28. ai_edge_torch/generative/utilities/loader.py +15 -15
  29. ai_edge_torch/generative/utilities/t5_loader.py +21 -20
  30. ai_edge_torch/odml_torch/lowerings/__init__.py +1 -0
  31. ai_edge_torch/odml_torch/lowerings/_convolution.py +196 -74
  32. ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +0 -2
  33. ai_edge_torch/odml_torch/lowerings/_layer_norm.py +78 -0
  34. ai_edge_torch/version.py +1 -1
  35. {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/METADATA +1 -1
  36. {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/RECORD +41 -47
  37. ai_edge_torch/generative/examples/experimental/gemma/gemma.py +0 -219
  38. ai_edge_torch/generative/examples/experimental/phi/__init__.py +0 -14
  39. ai_edge_torch/generative/examples/experimental/tiny_llama/__init__.py +0 -14
  40. ai_edge_torch/generative/examples/experimental/tiny_llama/convert_to_tflite.py +0 -87
  41. ai_edge_torch/generative/examples/experimental/tiny_llama/tiny_llama.py +0 -205
  42. ai_edge_torch/generative/examples/phi2/__init__.py +0 -14
  43. ai_edge_torch/generative/examples/phi2/convert_to_tflite.py +0 -67
  44. ai_edge_torch/generative/examples/phi2/phi2.py +0 -189
  45. ai_edge_torch/generative/examples/test_models/toy_model_with_external_kv_cache.py +0 -176
  46. /ai_edge_torch/generative/examples/{experimental → phi}/__init__.py +0 -0
  47. /ai_edge_torch/generative/examples/{experimental/gemma → smallm}/__init__.py +0 -0
  48. {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/LICENSE +0 -0
  49. {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/WHEEL +0 -0
  50. {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/top_level.txt +0 -0
--- a/ai_edge_torch/_convert/test/test_convert.py
+++ b/ai_edge_torch/_convert/test/test_convert.py
@@ -25,6 +25,7 @@ from ai_edge_torch.testing import model_coverage
 import numpy as np
 import tensorflow as tf
 import torch
+from torch import nn
 import torchvision
 
 from absl.testing import absltest as googletest
@@ -51,7 +52,7 @@ class TestConvert(googletest.TestCase):
   def test_convert_add(self):
     """Tests conversion of a simple Add module."""
 
-    class Add(torch.nn.Module):
+    class Add(nn.Module):
 
       def forward(self, a, b):
         return a + b
@@ -70,7 +71,7 @@ class TestConvert(googletest.TestCase):
   def test_convert_dot_add(self):
     """Tests conversion of a matrix multiplication followed by an add."""
 
-    class DotAdd(torch.nn.Module):
+    class DotAdd(nn.Module):
 
       def forward(self, a, b, c):
         return a @ b + c
@@ -99,7 +100,7 @@ class TestConvert(googletest.TestCase):
   def test_signature_args_ordering(self):
     """Tests conversion of a model with more than 10 arguments."""
 
-    class AddChainWith11Args(torch.nn.Module):
+    class AddChainWith11Args(nn.Module):
       """A model with 11 arguments."""
 
       def forward(
@@ -152,7 +153,7 @@ class TestConvert(googletest.TestCase):
   def test_multi_output_model(self):
     """Tests conversion of a model that returns multiple outputs."""
 
-    class BasicAddModelWithMultipleOutputs(torch.nn.Module):
+    class BasicAddModelWithMultipleOutputs(nn.Module):
       """A model that returns multiple outputs."""
 
       def forward(self, arg0, arg1):
@@ -176,7 +177,7 @@ class TestConvert(googletest.TestCase):
   def test_12_outputs_model(self):
     """Tests conversion of a model that returns more than 10 outputs."""
 
-    class BasicAddModelWithMultipleOutputs(torch.nn.Module):
+    class BasicAddModelWithMultipleOutputs(nn.Module):
       """A model that returns multiple outputs."""
 
      def forward(self, arg0, arg1):
@@ -245,7 +246,7 @@ class TestConvert(googletest.TestCase):
   def test_convert_add_converter_flags(self):
     """Tests conversion of an add module setting a tflite converter flag."""
 
-    class Add(torch.nn.Module):
+    class Add(nn.Module):
 
       def forward(self, a, b):
         return a + b
@@ -267,6 +268,27 @@ class TestConvert(googletest.TestCase):
     )
     self.assertTrue(os.path.isdir(ir_dump_path))
 
+  def test_convert_conv_transpose_batch_norm(self):
+    """Tests conversion of a model with ConvTranspose2d and BatchNorm2d."""
+
+    channels = 2
+    size = 2
+    torch_model = nn.Sequential(
+        nn.ConvTranspose2d(
+            channels, channels, 1, stride=2, dilation=1, bias=False
+        ),
+        nn.BatchNorm2d(channels),
+    )
+
+    torch_model.eval()
+    sample_input = (torch.rand(1, channels, size, size),)
+    edge_model = ai_edge_torch.convert(torch_model, sample_input)
+
+    result = model_coverage.compare_tflite_torch(
+        edge_model, torch_model, sample_input
+    )
+    self.assertTrue(result)
+
   @googletest.skipIf(
       not config.Config.use_torch_xla,
       reason="Shape polymorphism is not yet support with odml_torch.",
@@ -274,7 +296,7 @@ class TestConvert(googletest.TestCase):
   def test_convert_model_with_dynamic_batch(self):
     """Test converting a simple model with dynamic batch size."""
 
-    class SampleModel(torch.nn.Module):
+    class SampleModel(nn.Module):
 
       def __init__(self):
         super().__init__()
@@ -304,7 +326,7 @@ class TestConvert(googletest.TestCase):
   def test_convert_model_with_kwargs(self):
     """Test converting a simple model with sample_kwargs."""
 
-    class SampleModel(torch.nn.Module):
+    class SampleModel(nn.Module):
 
       def forward(self, x, y):
         return x + y
@@ -323,7 +345,7 @@ class TestConvert(googletest.TestCase):
   def test_convert_model_with_args_kwargs(self):
     """Test converting a simple model with both sample_args and sample_kwargs."""
 
-    class SampleModel(torch.nn.Module):
+    class SampleModel(nn.Module):
 
       def forward(self, x, y):
         return x + y
@@ -343,7 +365,7 @@ class TestConvert(googletest.TestCase):
   def test_convert_model_with_args_nested_kwargs_1(self):
     """Test converting a simple model with both sample_args and nested sample_kwargs."""
 
-    class SampleModel(torch.nn.Module):
+    class SampleModel(nn.Module):
 
       def forward(self, x: torch.Tensor, y: torch.Tensor, z: TestContainer1):
         return x + y + z.data_1 + z.data_2[0] + z.data_2[1]
@@ -370,7 +392,7 @@ class TestConvert(googletest.TestCase):
   def test_convert_model_with_args_nested_kwargs_2(self):
     """Test converting a simple model with both sample_args and nested sample_kwargs."""
 
-    class SampleModel(torch.nn.Module):
+    class SampleModel(nn.Module):
 
       def forward(self, x, y, z):
         return x + y + z.data_1 + z.data_2[0][0] + z.data_2[1]
@@ -397,7 +419,7 @@ class TestConvert(googletest.TestCase):
   def test_convert_model_with_args_nested_kwargs_3(self):
     """Test converting a simple model with both sample_args and nested sample_kwargs."""
 
-    class SampleModel(torch.nn.Module):
+    class SampleModel(nn.Module):
 
       def forward(self, x, y, z):
         return x + y + z.data_1 + z.data_2[0]["foo"] + z.data_2[1]
@@ -424,7 +446,7 @@ class TestConvert(googletest.TestCase):
   def test_convert_model_non_flat_output_dict(self):
     """Test converting a model with non-flat output structure."""
 
-    class SampleModel(torch.nn.Module):
+    class SampleModel(nn.Module):
 
       def forward(self, x, y, z):
         return {"x": x, "y": TestContainer1(data_1=y, data_2=[y, z])}
--- a/ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py
+++ b/ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py
@@ -13,32 +13,35 @@
 # limitations under the License.
 # ==============================================================================
 
+"""Example of converting a Gemma2 model to multi-signature tflite model."""
+
 import os
-from pathlib import Path
+import pathlib
 
 import ai_edge_torch
 from ai_edge_torch.generative.examples.gemma import gemma2
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.quantize import quant_recipes
 import torch
 
 
-def convert_gemma_to_tflite(
+def convert_gemma2_to_tflite(
     checkpoint_path: str,
     prefill_seq_len: int = 512,
     kv_cache_max_len: int = 1024,
     quantize: bool = True,
 ):
-  """Converting a Gemma 2 2B model to multi-signature
-  tflite model.
+  """Converts a Gemma2 2B model to multi-signature tflite model.
 
   Args:
-    checkpoint_path (str): The filepath to the model checkpoint, or directory holding the checkpoint.
+    checkpoint_path (str): The filepath to the model checkpoint, or directory
+      holding the checkpoint.
     prefill_seq_len (int, optional): The maximum size of prefill input tensor.
       Defaults to 512.
     kv_cache_max_len (int, optional): The maximum size of KV cache buffer,
       including both prefill and decode. Defaults to 1024.
-    quantize (bool, optional): Whether the model should be quanized.
-      Defaults to True.
+    quantize (bool, optional): Whether the model should be quanized. Defaults
+      to True.
   """
   pytorch_model = gemma2.build_2b_model(
       checkpoint_path, kv_cache_max_len=kv_cache_max_len
@@ -48,20 +51,36 @@ def convert_gemma_to_tflite(
   prefill_input_pos = torch.arange(0, prefill_seq_len)
   decode_token = torch.tensor([[0]], dtype=torch.long)
   decode_input_pos = torch.tensor([0], dtype=torch.int64)
+  kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
 
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
   edge_model = (
       ai_edge_torch.signature(
-          'prefill', pytorch_model, (prefill_tokens, prefill_input_pos)
+          'prefill',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': prefill_tokens,
+              'input_pos': prefill_input_pos,
+              'kv_cache': kv,
+          },
+      )
+      .signature(
+          'decode',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': decode_token,
+              'input_pos': decode_input_pos,
+              'kv_cache': kv,
+          },
       )
-      .signature('decode', pytorch_model, (decode_token, decode_input_pos))
       .convert(quant_config=quant_config)
   )
+  quant_suffix = 'q8' if quantize else 'f32'
   edge_model.export(
-      f'/tmp/gemma2_seq{prefill_seq_len}_kv{kv_cache_max_len}.tflite'
+      f'/tmp/gemma2_{quant_suffix}_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
   )
 
 
 if __name__ == '__main__':
-  checkpoint_path = os.path.join(Path.home(), 'Downloads/llm_data/gemma2-2b')
-  convert_gemma_to_tflite(checkpoint_path)
+  path = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/gemma2-2b')
+  convert_gemma2_to_tflite(path)
--- a/ai_edge_torch/generative/examples/gemma/convert_to_tflite.py
+++ b/ai_edge_torch/generative/examples/gemma/convert_to_tflite.py
@@ -13,11 +13,14 @@
 # limitations under the License.
 # ==============================================================================
 
+"""Example of converting a Gemma model to multi-signature tflite model."""
+
 import os
-from pathlib import Path
+import pathlib
 
 import ai_edge_torch
 from ai_edge_torch.generative.examples.gemma import gemma
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.quantize import quant_recipes
 import torch
 
@@ -48,20 +51,36 @@ def convert_gemma_to_tflite(
   prefill_input_pos = torch.arange(0, prefill_seq_len)
   decode_token = torch.tensor([[0]], dtype=torch.long)
   decode_input_pos = torch.tensor([0], dtype=torch.int64)
+  kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
 
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
   edge_model = (
      ai_edge_torch.signature(
-          'prefill', pytorch_model, (prefill_tokens, prefill_input_pos)
+          'prefill',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': prefill_tokens,
+              'input_pos': prefill_input_pos,
+              'kv_cache': kv,
+          },
+      )
+      .signature(
+          'decode',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': decode_token,
+              'input_pos': decode_input_pos,
+              'kv_cache': kv,
+          },
      )
-      .signature('decode', pytorch_model, (decode_token, decode_input_pos))
      .convert(quant_config=quant_config)
  )
+  quant_suffix = 'q8' if quantize else 'f32'
   edge_model.export(
-      f'/tmp/gemma_seq{prefill_seq_len}_kv{kv_cache_max_len}.tflite'
+      f'/tmp/gemma_{quant_suffix}_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
   )
 
 
 if __name__ == '__main__':
-  checkpoint_path = os.path.join(Path.home(), 'Downloads/llm_data/gemma-2b')
-  convert_gemma_to_tflite(checkpoint_path)
+  path = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/gemma-2b')
+  convert_gemma_to_tflite(path)
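Note: both converter scripts now follow the same shape: a KVCache is built once from the model config and passed through sample_kwargs, so 'prefill' and 'decode' are exported as two signatures over the same weights, with the cache as an explicit input and output. A hedged sketch of sanity-checking the two entry points in PyTorch before converting, reusing the sample tensors defined in the scripts (the zero-valued decode position is a tracing placeholder, not a real generation schedule; the dict return value is shown in the gemma.py diff below):

    # Prefill: run the whole prompt against a fresh cache.
    out = pytorch_model(
        tokens=prefill_tokens, input_pos=prefill_input_pos, kv_cache=kv
    )
    # Decode: one step, resuming from the cache that prefill returned.
    out = pytorch_model(
        tokens=decode_token, input_pos=decode_input_pos, kv_cache=out['kv_cache']
    )
    print(out['logits'].shape)  # (1, 1, vocab_size) for a single decode step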
--- a/ai_edge_torch/generative/examples/gemma/gemma.py
+++ b/ai_edge_torch/generative/examples/gemma/gemma.py
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# Example of building a Gemma model.
+
+"""Example of building a Gemma model."""
 
 import os
-from pathlib import Path
+import pathlib
 
 from ai_edge_torch.generative.layers import attention
 from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.utilities.loader as loading_utils
@@ -48,7 +50,6 @@ class Gemma(nn.Module):
   def __init__(self, config: cfg.ModelConfig):
     super().__init__()
 
-    self.config = config
     # Construct model layers.
     self.tok_embedding = nn.Embedding(
         config.vocab_size, config.embedding_dim, padding_idx=0
@@ -60,18 +61,20 @@ class Gemma(nn.Module):
     )
     # Gemma re-uses the embedding as the head projection layer.
     self.lm_head.weight.data = self.tok_embedding.weight.data
+    # Gemma has only one block config.
+    block_config = config.block_config(0)
     self.transformer_blocks = nn.ModuleList(
-        attention.TransformerBlock(config) for _ in range(config.num_layers)
+        attention.TransformerBlock(block_config, config)
+        for _ in range(config.num_layers)
     )
     self.final_norm = builder.build_norm(
         config.embedding_dim,
         config.final_norm_config,
     )
+    attn_config = block_config.attn_config
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.kv_cache_max,
-        dim=int(
-            config.attn_config.rotary_percentage * config.attn_config.head_dim
-        ),
+        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
         base=10_000,
         condense_ratio=1,
         dtype=torch.float32,
@@ -84,16 +87,22 @@ class Gemma(nn.Module):
     )
     self.config = config
 
-  # The model's forward function takes in additional k/v cache tensors
-  # and returns the updated k/v cache tensors to the caller.
-  # This can be eliminated if we handle k/v cache updates inside the model itself.
   @torch.inference_mode
-  def forward(self, idx: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor:
-    _, seq_len = idx.size()
+  def forward(
+      self,
+      tokens: torch.Tensor,
+      input_pos: torch.Tensor,
+      kv_cache: kv_utils.KVCache,
+  ) -> dict[torch.Tensor, kv_utils.KVCache]:
+    _, seq_len = tokens.size()
     assert self.config.max_seq_len >= seq_len, (
         f"Cannot forward sequence of length {seq_len}, max seq length is only"
         f" {self.config.max_seq_len}"
     )
+    assert len(self.transformer_blocks) == len(kv_cache.caches), (
+        "The number of transformer blocks and the number of KV cache entries"
+        " must be the same."
+    )
 
     cos, sin = self.rope_cache
     cos = cos.index_select(0, input_pos)
@@ -102,15 +111,20 @@ class Gemma(nn.Module):
     mask = mask[:, :, :, : self.config.kv_cache_max]
 
     # token embeddings of shape (b, t, n_embd)
-    x = self.tok_embedding(idx)
+    x = self.tok_embedding(tokens)
     x = x * (self.config.embedding_dim**0.5)
 
-    for _, block in enumerate(self.transformer_blocks):
-      x = block(x, (cos, sin), mask, input_pos)
+    updated_kv_entires = []
+    for i, block in enumerate(self.transformer_blocks):
+      kv_entry = kv_cache.caches[i] if kv_cache else None
+      x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
+      if kv_entry:
+        updated_kv_entires.append(kv_entry)
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
 
     x = self.final_norm(x)
-    res = self.lm_head(x)  # (b, t, vocab_size)
-    return res
+    logits = self.lm_head(x)  # (b, t, vocab_size)
+    return {"logits": logits, "kv_cache": updated_kv_cache}
 
 
 def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
@@ -139,18 +153,20 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       epsilon=1e-6,
       zero_centered=True,
   )
+  block_config = cfg.TransformerBlockConfig(
+      attn_config=attn_config,
+      ff_config=ff_config,
+      pre_attention_norm_config=norm_config,
+      post_attention_norm_config=norm_config,
+  )
   config = cfg.ModelConfig(
       vocab_size=256000,
       num_layers=18,
       max_seq_len=8192,
       embedding_dim=2048,
       kv_cache_max_len=kv_cache_max_len,
-      attn_config=attn_config,
-      ff_config=ff_config,
-      pre_attention_norm_config=norm_config,
-      post_attention_norm_config=norm_config,
+      block_configs=block_config,
       final_norm_config=norm_config,
-      parallel_residual=False,
       lm_head_use_bias=False,
       enable_hlfb=True,
   )
@@ -159,7 +175,8 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
 
 def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
   config = get_model_config_2b(kv_cache_max_len)
-  config.ff_config.intermediate_size = 128
+  # Gemma has only one block config.
+  config.block_config(0).ff_config.intermediate_size = 128
   config.vocab_size = 128
   config.num_layers = 2
   config.max_seq_len = 2 * kv_cache_max_len
@@ -170,32 +187,35 @@ def build_2b_model(checkpoint_path: str, **kwargs) -> nn.Module:
   config = get_model_config_2b(**kwargs)
   model = Gemma(config)
   loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
-  # since embedding and lm-head use the same weight, we need to set strict
+  # Since embedding and lm-head use the same weight, we need to set strict
   # to False.
   loader.load(model, strict=False)
   model.eval()
   return model
 
 
-def define_and_run_2b() -> None:
+def define_and_run_2b(checkpoint_path: str) -> None:
   """Instantiates and runs a Gemma 2B model."""
 
-  current_dir = Path(__file__).parent.resolve()
+  current_dir = pathlib.Path(__file__).parent.resolve()
   gemma_goldens = torch.load(current_dir / "gemma_lm_logits.pt")
 
   kv_cache_max_len = 1024
-  checkpoint_path = os.path.join(Path.home(), "Downloads/llm_data/gemma-2b")
   model = build_2b_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
   idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
   tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
   tokens[0, :4] = idx
   input_pos = torch.arange(0, kv_cache_max_len)
-  lm_logits = model.forward(tokens, input_pos)
+  kv = kv_utils.KVCache.from_model_config(model.config)
+  output = model.forward(tokens, input_pos, kv)
   print("comparing with goldens..")
   assert torch.allclose(
-      gemma_goldens, lm_logits[0, idx.shape[1] - 1, :], atol=1e-05
+      gemma_goldens, output["logits"][0, idx.shape[1] - 1, :], atol=1e-02
   )
 
 
 if __name__ == "__main__":
-  define_and_run_2b()
+  input_checkpoint_path = os.path.join(
+      pathlib.Path.home(), "Downloads/llm_data/gemma-2b"
+  )
+  define_and_run_2b(input_checkpoint_path)
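Note: with the reworked forward, the KV cache round-trips through every call as a value rather than living in mutable module state, so generation becomes a pure loop over (logits, kv_cache) pairs. A minimal greedy-decoding sketch against the new interface (checkpoint location as used elsewhere in this diff; the prompt, step count, and argmax sampling are illustrative):

    import os
    import pathlib

    import torch
    from ai_edge_torch.generative.examples.gemma import gemma
    from ai_edge_torch.generative.layers import kv_cache as kv_utils

    checkpoint = os.path.join(pathlib.Path.home(), "Downloads/llm_data/gemma-2b")
    model = gemma.build_2b_model(checkpoint, kv_cache_max_len=1024)

    # Prefill the prompt against a fresh cache, then feed each returned
    # cache into the next decode step.
    kv = kv_utils.KVCache.from_model_config(model.config)
    tokens = torch.tensor([[1, 2, 3, 4]], dtype=torch.long)
    out = model.forward(tokens, torch.arange(0, 4), kv)
    pos = 4
    for _ in range(8):
      next_token = out["logits"][:, -1, :].argmax(dim=-1, keepdim=True)
      out = model.forward(next_token, torch.tensor([pos]), out["kv_cache"])
      pos += 1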