PyPI - ai-edge-torch-nightly - Versions diffs - 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

Files changed (169) hide show

ai_edge_torch/generative/examples/t5/t5_attention.py CHANGED Viewed

@@ -16,16 +16,14 @@
 from typing import Optional, Tuple
-import torch
-from torch import nn
-import torch.nn.functional as F
 from ai_edge_torch.generative.layers.attention import CrossAttention
 import ai_edge_torch.generative.layers.builder as builder
 from ai_edge_torch.generative.layers.kv_cache import KVCache
 import ai_edge_torch.generative.layers.model_config as cfg
 from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_dot_product_attention  # NOQA
 from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_dot_product_attention_with_hlfb  # NOQA
+import torch
+from torch import nn
 BATCH_SIZE = 1
@@ -33,46 +31,52 @@ BATCH_SIZE = 1
 class EncoderDecoderBlock(nn.Module):
   def __init__(
-      self, config: cfg.ModelConfig, has_relative_attention_bias: bool = False
+      self,
+      config: cfg.TransformerBlockConfig,
+      model_config: cfg.ModelConfig,
+      has_relative_attention_bias: bool = False,
   ) -> None:
     """Initialize an instance of the EncoderDecoderBlock.
     Args:
-      config (cfg.ModelConfig): the configuration object
-        for this transformer block.
-      has_relative_attention_bias (bool): whether the
-        self attention block has relative bias.
+      config (cfg.TransformerBlockConfig): the configuration object for this
+        transformer block.
+      model_config (cfg.ModelConfig): the configuration object for the model
+        this transformer block belongs to.
+      has_relative_attention_bias (bool): whether the self attention block has
+        relative bias.
     """
     super().__init__()
     self.atten_func = T5Attention(
         BATCH_SIZE,
-        config.embedding_dim,
+        model_config.embedding_dim,
         config.attn_config,
         config.pre_attention_norm_config,
-        config.kv_cache_max,
-        config.enable_hlfb,
+        model_config.kv_cache_max,
+        model_config.enable_hlfb,
         has_relative_attention_bias=has_relative_attention_bias,
     )
     # For a decoder, we add a cross attention.
-    if config.is_decoder:
+    if model_config.is_decoder:
       self.cross_atten_func = T5Attention(
           BATCH_SIZE,
-          config.embedding_dim,
+          model_config.embedding_dim,
           config.attn_config,
           config.pre_attention_norm_config,
-          config.kv_cache_max,
-          config.enable_hlfb,
+          model_config.kv_cache_max,
+          model_config.enable_hlfb,
           # Cross Attention does not have relative attention bias.
           has_relative_attention_bias=False,
       )
     else:
       self.cross_atten_func = None
-    self.pre_ff_norm = builder.build_norm(
-        config.embedding_dim, config.pre_ff_norm_config
+    self.post_atten_norm = builder.build_norm(
+        model_config.embedding_dim,
+        config.post_attention_norm_config,
     )
-    self.ff = builder.build_ff(config.embedding_dim, config.ff_config)
+    self.ff = builder.build_ff(model_config.embedding_dim, config.ff_config)
     self.config = config
   def forward(
@@ -119,7 +123,7 @@ class EncoderDecoderBlock(nn.Module):
       )
       attn_out = hidden_states + attn_out
-    forwarded = self.pre_ff_norm(attn_out)
+    forwarded = self.post_atten_norm(attn_out)
     forwarded = self.ff(forwarded)
     hidden_states = attn_out + forwarded
@@ -144,8 +148,10 @@ class T5Attention(CrossAttention):
     Args:
       dim (int): causal attention's input/output dimmension.
       config (cfg.AttentionConfig): attention specific configurations.
-      norm_config (cfg.NormalizationConfig): normalization configure before attention.
-      kv_cache_max (int): determines the size of the KV Cache buffer, if enabled.
+      norm_config (cfg.NormalizationConfig): normalization configure before
+        attention.
+      kv_cache_max (int): determines the size of the KV Cache buffer, if
+        enabled.
       enable_hlfb (bool): whether hlfb is enabled or not.
       has_relative_attention_bias (bool): whether we compute relative bias.
     """
@@ -181,9 +187,13 @@ class T5Attention(CrossAttention):
     """
     x = self.pre_atten_norm(x)
-    B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
+    B, T, C = (
+        x.size()
+    )  # batch size, sequence length, embedding dimensionality (n_embd)
     query_states = self.q_projection(x)
-    query_states = query_states.reshape(B, T, -1, self.head_dim)  # (B, T, nh_q, hs)
+    query_states = query_states.reshape(
+        B, T, -1, self.config.head_dim
+    )  # (B, T, nh_q, hs)
     if key_value_states is not None:
       (
@@ -195,13 +205,13 @@ class T5Attention(CrossAttention):
       )  # batch size, sequence length, embedding dimensionality (n_embd)
       key_states = self.k_projection(key_value_states)
       value_states = self.v_projection(key_value_states)
-      key_states = key_states.reshape(kvB, kvT, -1, self.head_dim)
-      value_states = value_states.reshape(kvB, kvT, -1, self.head_dim)
+      key_states = key_states.reshape(kvB, kvT, -1, self.config.head_dim)
+      value_states = value_states.reshape(kvB, kvT, -1, self.config.head_dim)
     else:
       key_states = self.k_projection(x)
       value_states = self.v_projection(x)
-      key_states = key_states.reshape(B, T, -1, self.head_dim)
-      value_states = value_states.reshape(B, T, -1, self.head_dim)
+      key_states = key_states.reshape(B, T, -1, self.config.head_dim)
+      value_states = value_states.reshape(B, T, -1, self.config.head_dim)
     if key_value_states is None and self.kv_cache is not None:
       key_states, value_states = self.kv_cache.update_cache(
@@ -218,12 +228,17 @@ class T5Attention(CrossAttention):
             0
         )  # shape (1, num_heads, query_length, key_length)
       else:
-        # position_bias = torch.zeros(B, self.n_heads, T, self.head_dim, dtype=torch.float32)
+        # position_bias = torch.zeros(B, self.n_heads, T, self.config.head_dim, dtype=torch.float32)
         position_bias = torch.zeros_like(mask, dtype=torch.float32)
     mask = mask + position_bias
     y = self.sdpa_func(
-        query_states, key_states, value_states, self.head_dim, mask=mask, scale=1.0
+        query_states,
+        key_states,
+        value_states,
+        self.config.head_dim,
+        mask=mask,
+        scale=1.0,
     )
     y = y.reshape(B, T, C)  # re-assemble all head outputs side by side
     # output projection

ai_edge_torch/generative/examples/test_models/convert_toy_model.py ADDED Viewed

@@ -0,0 +1,105 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# A toy example which has a single-layer transformer block.
+from absl import app
+import ai_edge_torch
+from ai_edge_torch import lowertools
+from ai_edge_torch.generative.examples.test_models import toy_model
+from ai_edge_torch.generative.examples.test_models import toy_model_with_kv_cache
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
+import torch
+KV_CACHE_MAX_LEN = 100
+def convert_toy_model(_) -> None:
+  """Converts a toy model to tflite."""
+  model = toy_model.ToySingleLayerModel(toy_model.get_model_config())
+  idx = torch.unsqueeze(torch.arange(0, KV_CACHE_MAX_LEN), 0)
+  input_pos = torch.arange(0, KV_CACHE_MAX_LEN)
+  print('running an inference')
+  print(
+      model.forward(
+          idx,
+          input_pos,
+      )
+  )
+  # Convert model to tflite.
+  print('converting model to tflite')
+  edge_model = ai_edge_torch.convert(
+      model,
+      (
+          idx,
+          input_pos,
+      ),
+  )
+  edge_model.export('/tmp/toy_model.tflite')
+def _export_stablehlo_mlir(model, args):
+  ep = torch.export.export(model, args)
+  return lowertools.exported_program_to_mlir_text(ep)
+def convert_toy_model_with_kv_cache(_) -> None:
+  """Converts a toy model with kv cache to tflite."""
+  dump_mlir = False
+  config = toy_model_with_kv_cache.get_model_config()
+  model = toy_model_with_kv_cache.ToyModelWithKVCache(config)
+  model.eval()
+  print('running an inference')
+  kv = kv_utils.KVCache.from_model_config(config)
+  tokens, input_pos = toy_model_with_kv_cache.get_sample_prefill_inputs()
+  decode_token, decode_input_pos = (
+      toy_model_with_kv_cache.get_sample_decode_inputs()
+  )
+  print(model.forward(tokens, input_pos, kv))
+  if dump_mlir:
+    mlir_text = _export_stablehlo_mlir(model, (tokens, input_pos, kv))
+    with open('/tmp/toy_model_with_external_kv.stablehlo.mlir', 'w') as f:
+      f.write(mlir_text)
+  # Convert model to tflite with 2 signatures (prefill + decode).
+  print('converting toy model to tflite with 2 signatures (prefill + decode)')
+  edge_model = (
+      ai_edge_torch.signature(
+          'prefill',
+          model,
+          sample_kwargs={
+              'tokens': tokens,
+              'input_pos': input_pos,
+              'kv_cache': kv,
+          },
+      )
+      .signature(
+          'decode',
+          model,
+          sample_kwargs={
+              'tokens': decode_token,
+              'input_pos': decode_input_pos,
+              'kv_cache': kv,
+          },
+      )
+      .convert()
+  )
+  edge_model.export('/tmp/toy_external_kv_cache.tflite')
+if __name__ == '__main__':
+  app.run(convert_toy_model)

ai_edge_torch/generative/examples/test_models/toy_model.py CHANGED Viewed

@@ -15,15 +15,12 @@
 # A toy example which has a single-layer transformer block.
 from typing import Tuple
-import numpy as np
-import torch
-import torch.nn as nn
-import ai_edge_torch
+from ai_edge_torch.generative.layers import builder
 from ai_edge_torch.generative.layers.attention import TransformerBlock
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
-import ai_edge_torch.generative.layers.builder as builder
 import ai_edge_torch.generative.layers.model_config as cfg
+import torch
+from torch import nn
 RoPECache = Tuple[torch.Tensor, torch.Tensor]
 KV_CACHE_MAX_LEN = 100
@@ -37,14 +34,16 @@ class ToySingleLayerModel(torch.nn.Module):
         config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
     )
     self.tok_embedding = nn.Embedding(config.vocab_size, config.embedding_dim)
-    self.transformer_block = TransformerBlock(config)
+    self.transformer_block = TransformerBlock(config.block_config(0), config)
     self.final_norm = builder.build_norm(
         config.embedding_dim,
         config.final_norm_config,
     )
+    # Toy model has only one block config.
+    attn_config = config.block_config(0).attn_config
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.max_seq_len,
-        dim=int(config.attn_config.rotary_percentage * config.head_dim),
+        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
         base=10_000,
         condense_ratio=1,
         dtype=torch.float32,
@@ -70,9 +69,63 @@ class ToySingleLayerModel(torch.nn.Module):
     return self.lm_head(x)
+class ToySingleLayerModelWeightSharing(torch.nn.Module):
+  def __init__(self, config: cfg.ModelConfig) -> None:
+    super().__init__()
+    self.lm_head = nn.Linear(
+        config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
+    )
+    self.tok_embedding = nn.Embedding(config.vocab_size, config.embedding_dim)
+    self.lm_head = nn.Linear(
+        config.embedding_dim,
+        config.vocab_size,
+        bias=config.lm_head_use_bias,
+    )
+    self.lm_head.weight.data = self.tok_embedding.weight.data
+    self.transformer_block = TransformerBlock(config.block_config(0), config)
+    self.final_norm = builder.build_norm(
+        config.embedding_dim,
+        config.final_norm_config,
+    )
+    # Toy model has only one block config.
+    attn_config = config.block_config(0).attn_config
+    self.rope_cache = attn_utils.build_rope_cache(
+        size=config.max_seq_len,
+        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
+        base=10_000,
+        condense_ratio=1,
+        dtype=torch.float32,
+        device=torch.device('cpu'),
+    )
+    self.mask_cache = attn_utils.build_causal_mask_cache(
+        size=config.max_seq_len, dtype=torch.float32, device=torch.device('cpu')
+    )
+    self.config = config
+  @torch.inference_mode
+  def forward(self, idx: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor:
+    x = self.tok_embedding(idx)
+    cos, sin = self.rope_cache
+    cos = cos.index_select(0, input_pos)
+    sin = sin.index_select(0, input_pos)
+    mask = self.mask_cache.index_select(2, input_pos)
+    mask = mask[:, :, :, : self.config.max_seq_len]
+    x = self.transformer_block(x, (cos, sin), mask, input_pos)
+    x = self.final_norm(x)
+    res = self.lm_head(x)
+    return res
 def get_model_config() -> cfg.ModelConfig:
   attn_config = cfg.AttentionConfig(
-      num_heads=32, num_query_groups=4, rotary_percentage=1.0, enable_kv_cache=False
+      num_heads=32,
+      head_dim=4,
+      num_query_groups=4,
+      rotary_percentage=1.0,
+      enable_kv_cache=False,
   )
   ff_config = cfg.FeedForwardConfig(
       type=cfg.FeedForwardType.GATED,
@@ -80,43 +133,18 @@ def get_model_config() -> cfg.ModelConfig:
       intermediate_size=256,
   )
   norm_config = cfg.NormalizationConfig(type=cfg.NormalizationType.RMS_NORM)
+  block_config = cfg.TransformerBlockConfig(
+      attn_config=attn_config,
+      ff_config=ff_config,
+      pre_attention_norm_config=norm_config,
+      post_attention_norm_config=norm_config,
+  )
   config = cfg.ModelConfig(
       vocab_size=400,
       num_layers=1,
       max_seq_len=KV_CACHE_MAX_LEN,
       embedding_dim=128,
-      attn_config=attn_config,
-      ff_config=ff_config,
-      pre_attention_norm_config=norm_config,
-      pre_ff_norm_config=norm_config,
+      block_configs=block_config,
       final_norm_config=norm_config,
   )
   return config
-def define_and_run() -> None:
-  model = ToySingleLayerModel(get_model_config())
-  idx = torch.unsqueeze(torch.arange(0, KV_CACHE_MAX_LEN), 0)
-  input_pos = torch.arange(0, KV_CACHE_MAX_LEN)
-  print('running an inference')
-  print(
-      model.forward(
-          idx,
-          input_pos,
-      )
-  )
-  # Convert model to tflite.
-  print('converting model to tflite')
-  edge_model = ai_edge_torch.convert(
-      model,
-      (
-          idx,
-          input_pos,
-      ),
-  )
-  edge_model.export('/tmp/toy_model.tflite')
-if __name__ == '__main__':
-  define_and_run()

ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py CHANGED Viewed

@@ -12,24 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# A toy example which has basic transformer block (w/ KV-Cache).
-from typing import List, Tuple
-import numpy as np
-import torch
-import torch.nn as nn
-import torch_xla
+"""A toy example which has basic transformer block (w/ externalized KV-Cache)."""
+from typing import Tuple
-import ai_edge_torch
-from ai_edge_torch.generative.layers.attention import TransformerBlock
+from absl import app
+from ai_edge_torch.generative.layers import attention
+from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
-import ai_edge_torch.generative.layers.builder as builder
 import ai_edge_torch.generative.layers.model_config as cfg
+import torch
+from torch import nn
 RoPECache = Tuple[torch.Tensor, torch.Tensor]
-class ToyModelWithKV(torch.nn.Module):
+class ToyModelWithKVCache(torch.nn.Module):
   def __init__(self, config: cfg.ModelConfig) -> None:
     super().__init__()
@@ -37,16 +37,20 @@ class ToyModelWithKV(torch.nn.Module):
         config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
     )
     self.tok_embedding = nn.Embedding(config.vocab_size, config.embedding_dim)
+    # Toy model has only one block config.
+    block_config = config.block_config(0)
     self.transformer_blocks = nn.ModuleList(
-        TransformerBlock(config) for _ in range(config.num_layers)
+        attention.TransformerBlock(block_config, config)
+        for _ in range(config.num_layers)
     )
     self.final_norm = builder.build_norm(
         config.embedding_dim,
         config.final_norm_config,
     )
+    attn_config = block_config.attn_config
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.max_seq_len,
-        dim=int(config.attn_config.rotary_percentage * config.head_dim),
+        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
         base=10_000,
         condense_ratio=1,
         dtype=torch.float32,
@@ -57,29 +61,37 @@ class ToyModelWithKV(torch.nn.Module):
     )
     self.config = config
-  @torch.inference_mode
-  def forward(self, idx: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor:
-    x = self.tok_embedding(idx)
+  def forward(
+      self,
+      tokens: torch.Tensor,
+      input_pos: torch.Tensor,
+      kv_cache: kv_utils.KVCache,
+  ) -> Tuple[torch.Tensor, kv_utils.KVCache]:
+    x = self.tok_embedding(tokens)
     cos, sin = self.rope_cache
     cos = cos.index_select(0, input_pos)
     sin = sin.index_select(0, input_pos)
     mask = self.mask_cache.index_select(2, input_pos)
     mask = mask[:, :, :, : self.config.max_seq_len]
-    for i, block in enumerate(self.transformer_blocks):
-      x = block(x, (cos, sin), mask, input_pos)
-    x = self.final_norm(x)
-    return self.lm_head(x)
+    updated_kv_entires = []
+    for i, block in enumerate(self.transformer_blocks):
+      kv_entry = kv_cache.caches[i] if kv_cache else None
+      x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
+      if kv_entry:
+        updated_kv_entires.append(kv_entry)
-def _export_stablehlo_mlir(model, args):
-  ep = torch.export.export(model, args)
-  stablehlo_gm = torch_xla.stablehlo.exported_program_to_stablehlo(ep)
-  return stablehlo_gm.get_stablehlo_text()
+    x = self.final_norm(x)
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
+    return {'logits': self.lm_head(x), 'kv_cache': updated_kv_cache}
 def get_model_config() -> cfg.ModelConfig:
   attn_config = cfg.AttentionConfig(
-      num_heads=32, num_query_groups=4, rotary_percentage=1.0
+      num_heads=32,
+      head_dim=4,
+      num_query_groups=4,
+      rotary_percentage=1.0,
   )
   ff_config = cfg.FeedForwardConfig(
       type=cfg.FeedForwardType.GATED,
@@ -87,15 +99,18 @@ def get_model_config() -> cfg.ModelConfig:
       intermediate_size=256,
   )
   norm_config = cfg.NormalizationConfig(type=cfg.NormalizationType.RMS_NORM)
+  block_config = cfg.TransformerBlockConfig(
+      attn_config=attn_config,
+      ff_config=ff_config,
+      pre_attention_norm_config=norm_config,
+      post_attention_norm_config=norm_config,
+  )
   config = cfg.ModelConfig(
       vocab_size=150,
       num_layers=2,
-      max_seq_len=500,
+      max_seq_len=100,
       embedding_dim=128,
-      attn_config=attn_config,
-      ff_config=ff_config,
-      pre_attention_norm_config=norm_config,
-      pre_ff_norm_config=norm_config,
+      block_configs=block_config,
       final_norm_config=norm_config,
       enable_hlfb=True,
   )
@@ -103,41 +118,12 @@ def get_model_config() -> cfg.ModelConfig:
 def get_sample_prefill_inputs() -> Tuple[torch.Tensor, torch.Tensor]:
-  idx = torch.unsqueeze(torch.arange(0, 100), 0)
-  input_pos = torch.arange(0, 100)
-  return idx, input_pos
+  tokens = torch.unsqueeze(torch.arange(0, 100, dtype=torch.int), 0)
+  input_pos = torch.arange(0, 100, dtype=torch.int)
+  return tokens, input_pos
 def get_sample_decode_inputs() -> Tuple[torch.Tensor, torch.Tensor]:
-  idx = torch.tensor([[1]], dtype=torch.long)
-  input_pos = torch.tensor([10], dtype=torch.int64)
-  return idx, input_pos
-def define_and_run() -> None:
-  dump_mlir = False
-  config = get_model_config()
-  model = ToyModelWithKV(config)
-  print('running an inference')
-  idx, input_pos = get_sample_prefill_inputs()
-  decode_idx, decode_input_pos = get_sample_decode_inputs()
-  print(model.forward(idx, input_pos))
-  if dump_mlir:
-    mlir_text = _export_stablehlo_mlir(model, (idx, input_pos))
-    with open('/tmp/toy_model_with_kv.stablehlo.mlir', 'w') as f:
-      f.write(mlir_text)
-  # Convert model to tflite with 2 signatures (prefill + decode).
-  print('converting toy model to tflite with 2 signatures (prefill + decode)')
-  edge_model = (
-      ai_edge_torch.signature('prefill', model, (idx, input_pos))
-      .signature('decode', model, (decode_idx, decode_input_pos))
-      .convert()
-  )
-  edge_model.export('/tmp/toy_kv_cache.tflite')
-if __name__ == '__main__':
-  define_and_run()
+  tokens = torch.tensor([[1]], dtype=torch.int)
+  input_pos = torch.tensor([10])
+  return tokens, input_pos

ai_edge_torch/generative/examples/tiny_llama/__init__.py CHANGED Viewed

@@ -0,0 +1,14 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl