ai-edge-torch-nightly 0.3.0.dev20240909__py3-none-any.whl → 0.3.0.dev20240911__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_edge_torch/_convert/test/test_convert.py +35 -13
- ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +31 -12
- ai_edge_torch/generative/examples/gemma/convert_to_tflite.py +25 -6
- ai_edge_torch/generative/examples/gemma/gemma.py +34 -18
- ai_edge_torch/generative/examples/gemma/gemma2.py +38 -17
- ai_edge_torch/generative/examples/{experimental/phi → phi}/convert_to_tflite.py +11 -12
- ai_edge_torch/generative/examples/{experimental/phi → phi}/phi2.py +31 -33
- ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +58 -25
- ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +25 -6
- ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +38 -22
- ai_edge_torch/generative/layers/attention.py +60 -63
- ai_edge_torch/generative/layers/kv_cache.py +160 -51
- ai_edge_torch/generative/test/{test_experimental_ekv.py → test_kv_cache.py} +8 -22
- ai_edge_torch/generative/test/test_model_conversion.py +71 -33
- ai_edge_torch/generative/test/test_model_conversion_large.py +51 -23
- ai_edge_torch/generative/test/utils.py +54 -0
- ai_edge_torch/odml_torch/lowerings/_convolution.py +196 -74
- ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +0 -1
- ai_edge_torch/version.py +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240911.dist-info}/METADATA +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240911.dist-info}/RECORD +25 -35
- ai_edge_torch/generative/examples/experimental/gemma/__init__.py +0 -14
- ai_edge_torch/generative/examples/experimental/gemma/convert_to_tflite.py +0 -88
- ai_edge_torch/generative/examples/experimental/gemma/gemma.py +0 -219
- ai_edge_torch/generative/examples/experimental/phi/__init__.py +0 -14
- ai_edge_torch/generative/examples/experimental/tiny_llama/__init__.py +0 -14
- ai_edge_torch/generative/examples/experimental/tiny_llama/convert_to_tflite.py +0 -87
- ai_edge_torch/generative/examples/experimental/tiny_llama/tiny_llama.py +0 -205
- ai_edge_torch/generative/examples/phi2/__init__.py +0 -14
- ai_edge_torch/generative/examples/phi2/convert_to_tflite.py +0 -67
- ai_edge_torch/generative/examples/phi2/phi2.py +0 -189
- ai_edge_torch/generative/examples/test_models/toy_model_with_external_kv_cache.py +0 -176
- /ai_edge_torch/generative/examples/{experimental → phi}/__init__.py +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240911.dist-info}/LICENSE +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240911.dist-info}/WHEEL +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240911.dist-info}/top_level.txt +0 -0
ai_edge_torch/generative/layers/attention.py

@@ -12,16 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# Common building blocks for Attention layer.

-
+"""Common building blocks for Attention layer."""

-import
-
+from typing import Optional, Tuple, Union
+
+from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
+from ai_edge_torch.generative.layers import scaled_dot_product_attention as sdpa
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
-from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_dot_product_attention # NOQA
-from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_dot_product_attention_with_hlfb # NOQA
 import torch
 from torch import nn

@@ -62,7 +62,6 @@ class TransformerBlock(nn.Module):
       config (cfg.ModelConfig): the configuration object for this transformer
         block.
     """
-
     super().__init__()
     self.pre_atten_norm = builder.build_norm(
         config.embedding_dim, config.pre_attention_norm_config
@@ -71,7 +70,6 @@ class TransformerBlock(nn.Module):
         config.batch_size,
         config.embedding_dim,
         config.attn_config,
-        config.kv_cache_max,
         config.enable_hlfb,
     )
     self.post_atten_norm = builder.build_norm(
@@ -86,7 +84,8 @@ class TransformerBlock(nn.Module):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
      mask: Optional[torch.Tensor] = None,
      input_pos: Optional[torch.Tensor] = None,
-
+      kv_cache: kv_utils.KVCacheEntry = None,
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the TransformerBlock.

     Args:
@@ -94,24 +93,34 @@
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): the optional kv cache entry.

     Returns:
-      output activation from this transformer block
+      output activation from this transformer block, and updated kv cache (if
+      passed in).
     """
-
+    kv = None
     if self.config.parallel_residual:
       x_norm = self.pre_atten_norm(x)
-
+      atten_func_out = self.atten_func(x_norm, rope, mask, input_pos, kv_cache)
+      if kv_cache is None:
+        attn_out = atten_func_out
+      else:
+        attn_out, kv = atten_func_out
       ff_out = self.ff(x_norm)
       output = x + attn_out + ff_out
     else:
       x_norm = self.pre_atten_norm(x)
-
+      atten_func_out = self.atten_func(x_norm, rope, mask, input_pos, kv_cache)
+      if kv_cache is None:
+        attn_out = atten_func_out
+      else:
+        attn_out, kv = atten_func_out
       x = x + attn_out
       x_norm = self.post_atten_norm(x)
       output = x + self.ff(x_norm)

-    return output
+    return output if kv is None else (output, kv)


 class CausalSelfAttention(nn.Module):
@@ -121,7 +130,6 @@ class CausalSelfAttention(nn.Module):
       batch_size: int,
       dim: int,
       config: cfg.AttentionConfig,
-      kv_cache_max: int,
       enable_hlfb: bool,
   ) -> None:
     """Initialize an instance of CausalSelfAttention.
@@ -130,8 +138,6 @@ class CausalSelfAttention(nn.Module):
       batch_size (int): batch size of the input tensor.
       dim (int): causal attention's input/output dimmension.
       config (cfg.AttentionConfig): attention specific configurations.
-      kv_cache_max (int): determines the size of the KV Cache buffer, if
-        enabled.
       enable_hlfb (bool): whether hlfb is enabled or not.
     """
     super().__init__()
@@ -147,21 +153,13 @@ class CausalSelfAttention(nn.Module):
     self.output_projection = nn.Linear(
         output_shape, dim, bias=config.output_proj_use_bias
     )
-
-
-
-
-
-
-
-        config.head_dim,
-        enable_hlfb,
-    )
-
-    if enable_hlfb:
-      self.sdpa_func = scaled_dot_product_attention_with_hlfb
-    else:
-      self.sdpa_func = scaled_dot_product_attention
+    self.config = config
+    self.enable_hlfb = enable_hlfb
+    self.sdpa_func = (
+        sdpa.scaled_dot_product_attention_with_hlfb
+        if enable_hlfb
+        else sdpa.scaled_dot_product_attention
+    )

   def forward(
       self,
@@ -169,7 +167,8 @@ class CausalSelfAttention(nn.Module):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
-
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the CausalSelfAttention layer, which can support

     MQA, GQA and MHA.
@@ -179,9 +178,11 @@ class CausalSelfAttention(nn.Module):
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.

     Returns:
-      output activation from this self attention layer
+      output activation from this self attention layer, and the updated
+      KV Cach Entry (if passed in).
     """
     # Batch size, sequence length, embedding dimensionality.
     B, T, E = x.size()
@@ -224,9 +225,11 @@ class CausalSelfAttention(nn.Module):
       n_elem = int(self.config.rotary_percentage * self.config.head_dim)
       q, k = _embed_rope(q, k, n_elem, rope)

-    if
-
-
+    if kv_cache is not None:
+      kv_cache = kv_utils.update(
+          kv_cache, input_pos, k, v, enable_hlfb=self.enable_hlfb
+      )
+      k, v = kv_cache.k_cache, kv_cache.v_cache

     y = self.sdpa_func(
         q,
@@ -240,7 +243,7 @@ class CausalSelfAttention(nn.Module):

     # Compute the output projection.
     y = self.output_projection(y)
-    return y
+    return y if kv_cache is None else (y, kv_cache)


 class SelfAttention(CausalSelfAttention):
@@ -251,16 +254,19 @@ class SelfAttention(CausalSelfAttention):
       x: torch.Tensor,
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       input_pos: Optional[torch.Tensor] = None,
-
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the SelfAttention layer, which can support MQA, GQA and MHA.

     Args:
       x (torch.Tensor): the input tensor.
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.

     Returns:
-      output activation from this self attention layer
+      output activation from this self attention layer, and the updated
+      KV Cach Entry (if passed in).
     """
     B, T, _ = x.size()
     return super().forward(
@@ -279,9 +285,8 @@ class CrossAttention(nn.Module):
       query_dim: int,
       cross_dim: int,
       config: cfg.AttentionConfig,
-      kv_cache_max: int,
       enable_hlfb: bool,
-  )
+  ):
     """Initialize an instance of CrossAttention.

     Args:
@@ -289,8 +294,6 @@ class CrossAttention(nn.Module):
       query_dim (int): query tensor's dimension.
       cross_dim (int): cross attention's dimensions, for key and value tensors.
       config (cfg.AttentionConfig): attention specific configurations.
-      kv_cache_max (int): determines the size of the KV Cache buffer, if
-        enabled.
       enable_hlfb (bool): whether hlfb is enabled or not.
     """
     super().__init__()
@@ -309,21 +312,11 @@ class CrossAttention(nn.Module):
         query_dim, query_dim, bias=config.output_proj_use_bias
     )

-    self.
-
-
-
-
-        kv_cache_max,
-        config.num_query_groups,
-        self.config.head_dim,
-        enable_hlfb,
-    )
-
-    if enable_hlfb:
-      self.sdpa_func = scaled_dot_product_attention_with_hlfb
-    else:
-      self.sdpa_func = scaled_dot_product_attention
+    self.sdpa_func = (
+        sdpa.scaled_dot_product_attention_with_hlfb
+        if enable_hlfb
+        else sdpa.scaled_dot_product_attention
+    )

   def forward(
       self,
@@ -332,6 +325,7 @@ class CrossAttention(nn.Module):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
   ):
     """Forward function of the CrossAttention layer.

@@ -342,6 +336,7 @@ class CrossAttention(nn.Module):
       mask (torch.Tensor): the optional mask tensor can be broadcaseted to shape
         [B, n_heads, target_seq_len, source_seq_len].
       input_pos (torch.Tensor): the optional input position tensor.
+      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.

     Returns:
       output activation from this cross attention layer.
@@ -363,9 +358,11 @@ class CrossAttention(nn.Module):
       n_elem = int(self.config.rotary_percentage * self.config.head_dim)
       q, k = _embed_rope(q, k, n_elem, rope)

-    if
-
-
+    if kv_cache is not None:
+      kv_cache = kv_utils.update(
+          kv_cache, input_pos, k, v, enable_hlfb=self.enable_hlfb
+      )
+      k, v = kv_cache.k_cache, kv_cache.v_cache
     if mask is None:
       mask = torch.zeros(
           (batch_size, 1, target_seq_len, source_seq_len), dtype=torch.float32
@@ -375,4 +372,4 @@ class CrossAttention(nn.Module):

     # Compute the output projection.
     y = self.output_projection(y)
-    return y
+    return y if kv_cache is None else (y, kv_cache)
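With this change the attention layers no longer own KV buffers: callers pass a kv_utils.KVCacheEntry into forward and, when they do, get back an (output, updated_entry) tuple instead of a bare tensor. A minimal sketch of that caller-side convention, using a hypothetical ToyAttention stand-in (not part of the package) and illustrative shapes:

import torch
from torch import nn

from ai_edge_torch.generative.layers import kv_cache as kv_utils


class ToyAttention(nn.Module):
  """Hypothetical stand-in that mimics the new return convention."""

  def forward(self, x, rope=None, mask=None, input_pos=None, kv_cache=None):
    y = x  # A real layer would compute attention here.
    # Plain tensor out when no cache is passed, (tensor, entry) otherwise.
    return y if kv_cache is None else (y, kv_cache)


layer = ToyAttention()
x = torch.zeros(1, 4, 8)

y = layer(x)  # No cache entry: output tensor only, as before.

entry = kv_utils.KVCacheEntry(
    k_cache=torch.zeros(1, 16, 1, 4), v_cache=torch.zeros(1, 16, 1, 4)
)
y, entry = layer(x, input_pos=torch.tensor([0]), kv_cache=entry)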
ai_edge_torch/generative/layers/kv_cache.py

@@ -12,72 +12,181 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# `nn.Module` which implements a KV cache.

-
+"""Utility functions for externalized KV Cache."""
+
+import dataclasses
+from typing import List, Tuple
+
+from ai_edge_torch import hlfb
+from ai_edge_torch.generative.layers import model_config
 import torch
-
+import torch.utils._pytree as pytree


-
+@dataclasses.dataclass
+class KVCacheEntry:
+  """A single cache entry that includes K and V caches.

-
-
-
-    """Initializes the KVCache layer.
+  The chaches are built based on the provided config with the shape of
+  (batch_size=1, kv_cache_max, num_query_groups, head_dim).
+  """

-
-
-      kv_cache_max (int): the max length of KV cache.
-      n_heads (int): number of kv heads.
-      head_dim (int): the head dimension size.
-      enable_hlfb (bool): whether hlfb is enabled or not.
-    """
-    super().__init__()
-    cache_shape = (batch_size, kv_cache_max, n_heads, head_dim)
-    self.register_buffer("k_cache", torch.zeros(cache_shape), persistent=False)
-    self.register_buffer("v_cache", torch.zeros(cache_shape), persistent=False)
-    self.enable_hlfb = enable_hlfb
-    self.kv_cache_max = kv_cache_max
+  k_cache: torch.Tensor
+  v_cache: torch.Tensor

-
-
+  @classmethod
+  def from_model_config(
+      cls,
+      config: model_config.ModelConfig,
+      dtype: torch.dtype = torch.float32,
+      device: torch.device = None,
+  ) -> "KVCacheEntry":
+    """Build an instance of the class based on model config."""
+    shape = (
+        1,  # Batch dimmension.
+        config.kv_cache_max,
+        config.attn_config.num_query_groups,
+        config.attn_config.head_dim,
+    )
+    k = torch.zeros(shape, dtype=dtype, device=device)
+    v = torch.zeros(shape, dtype=dtype, device=device)
+    obj = cls(k_cache=k, v_cache=v)
+    return obj

-    Args:
-      input_pos (torch.Tensor): the input position.
-      k_val (torch.Tensor): the new `key` value.
-      v_val (torch.Tensor): the new `value` value.

-
-
-
-    if self.enable_hlfb:
-      return self.update_cache_with_hlfb(input_pos, k_val, v_val)
+@dataclasses.dataclass
+class KVCache:
+  """A utility class for holding KV cache entries per layer."""

-
-    updated_v = self.v_cache.index_copy_(1, input_pos, v_val)
-    # Here we need a clone otherwise dynamo export will fail.
-    return torch.clone(updated_k), torch.clone(updated_v)
+  caches: Tuple[KVCacheEntry, ...]

-
-
+  @classmethod
+  def from_model_config(
+      cls,
+      config: model_config.ModelConfig,
+      dtype: torch.dtype = torch.float32,
+      device: torch.device = None,
+  ) -> "KVCache":
+    """Build an instance of the class based on model config.

     Args:
-
-
-
+      config (ModelConfig): Model config used for building the cache.
+      dtype (torch.dtype, optional): The data type of the cache tensor.
+        Defaults to torch.float32.
+      device (torch.device, optional): The device placement of the cache
+        tensors. Defaults to None.

     Returns:
-
+      KVCache: The created cache object.
     """
+    caches = [
+        KVCacheEntry.from_model_config(config, dtype, device)
+        for _ in range(config.num_layers)
+    ]
+    obj = cls(caches=tuple(caches))
+    return obj

-
-
-    )
-
-
+  def flatten(self) -> List[torch.Tensor]:
+    """Flatten the cache entries into a list of tensors with order k_i, v_i."""
+    flattened, _ = _flatten_kvc(self)
+    return flattened
+
+
+def _flatten_kvc(kvc: KVCache) -> Tuple[List[str], List[str]]:
+  flattened = []
+  flat_names = []
+  none_names = []
+  for i, kv_entry in enumerate(kvc.caches):
+    flattened.append(kv_entry.k_cache)
+    flat_names.append(f"k_{i}")
+    flattened.append(kv_entry.v_cache)
+    flat_names.append(f"v_{i}")
+  return flattened, [flat_names, none_names]
+
+
+def _flatten_kvc_with_keys(kvc: KVCache) -> Tuple[List, List]:
+  flattened, (flat_names, none_names) = _flatten_kvc(kvc)
+  return [
+      (pytree.MappingKey(k), v) for k, v in zip(flat_names, flattened)
+  ], flat_names
+
+
+def _unflatten_kvc(
+    values: List[torch.Tensor], context: Tuple[List, List]
+) -> KVCache:
+  assert len(values) % 2 == 0, "Found odd number of K and V entries."
+  num_layers = len(values) // 2
+  flat_names = context[0]
+  kv_entries = []
+  for i in range(num_layers):
+    k_cache_idx = flat_names.index(f"k_{i}")
+    v_cache_idx = flat_names.index(f"v_{i}")
+    kv_entries.append(
+        KVCacheEntry(k_cache=values[k_cache_idx], v_cache=values[v_cache_idx])
     )
-
-
-
-
+  obj = KVCache(tuple(kv_entries))
+  return obj
+
+
+pytree.register_pytree_node(
+    KVCache,
+    _flatten_kvc,
+    _unflatten_kvc,
+    flatten_with_keys_fn=_flatten_kvc_with_keys,
+    serialized_type_name="",
+)
+
+
+def update(
+    cache: KVCacheEntry,
+    input_pos: torch.Tensor,
+    k_slice: torch.Tensor,
+    v_slice: torch.Tensor,
+    enable_hlfb: bool = True,
+) -> KVCacheEntry:
+  """Out of place update of Cache buffer.
+
+  Args:
+    cache (KVCacheEntry): The original cache buffer.
+    input_pos (torch.Tensor): The update slice positions.
+    k_slice (torch.Tensor): The K slice to be updated in the new cache.
+    v_slice (torch.Tensor): The V slice to be updated in the new cache.
+    enable_hlfb (bool, optional): Whether the op is annotated for export with
+      High Level Function Boundary. Defaults to True.
+
+  Returns:
+    KVCacheEntry: The updated KVCache entry based on the passed inputs.
+  """
+  update_func = _update_kv_hlfb_impl if enable_hlfb else _update_kv_base_impl
+  return update_func(cache, input_pos, k_slice, v_slice)
+
+
+def _update_kv_base_impl(
+    cache: KVCacheEntry,
+    input_pos: torch.Tensor,
+    k_slice: torch.Tensor,
+    v_slice: torch.Tensor,
+) -> KVCacheEntry:
+  """Update the cache buffer without High Level Function Boundary annotation."""
+  k = cache.k_cache.index_copy(1, input_pos, k_slice)
+  v = cache.v_cache.index_copy(1, input_pos, v_slice)
+  updated_cache = KVCacheEntry(k, v)
+  return updated_cache
+
+
+def _update_kv_hlfb_impl(
+    cache: KVCacheEntry,
+    input_pos: torch.Tensor,
+    k_slice: torch.Tensor,
+    v_slice: torch.Tensor,
+) -> KVCacheEntry:
+  """Update the cache buffer with High Level Function Boundary annotation."""
+  builder = hlfb.StableHLOCompositeBuilder(name="odml.update_external_kv_cache")
+  k_cache, v_cache, input_pos, k_slice, v_slice = builder.mark_inputs(
+      cache.k_cache, cache.v_cache, input_pos, k_slice, v_slice
+  )
+  k = k_cache.index_copy(1, input_pos, k_slice)
+  v = v_cache.index_copy(1, input_pos, v_slice)
+  k, v = builder.mark_outputs(k, v)
+  return KVCacheEntry(k, v)
ai_edge_torch/generative/test/{test_experimental_ekv.py → test_kv_cache.py}

@@ -12,19 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# A suite of tests to validate experimental external KV Cache layers and models.

-
-
-from ai_edge_torch.generative.
-from ai_edge_torch.generative.layers.experimental import ekv_cache as kv_utils
+"""A suite of tests to validate KV Cache layer."""
+
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.model_config as cfg
 import torch

 from absl.testing import absltest as googletest


-class
+class TestKVLayers(googletest.TestCase):

   def _get_test_config(
       self, num_layers, head_dim, num_query_groups, kv_cache_max_len
@@ -54,7 +52,7 @@ class TestExternalKVLayers(googletest.TestCase):
         num_query_groups=NUM_QG,
         kv_cache_max_len=KV_LEN,
     )
-    kv = kv_utils.
+    kv = kv_utils.KVCache.from_model_config(config)
     entry = kv.caches[0]
     # single-slice update
     input_pos = torch.tensor([1])
@@ -88,14 +86,14 @@ class TestExternalKVLayers(googletest.TestCase):
   def test_serialization(self):
     class TestModel(torch.nn.Module):

-      def forward(self, kv: kv_utils.
+      def forward(self, kv: kv_utils.KVCache) -> kv_utils.KVCache:
         updated_kv_entries = [
             kv_utils.KVCacheEntry(
                 torch.zeros_like(entry.k_cache), torch.zeros_like(entry.v_cache)
             )
             for entry in kv.caches
         ]
-        return kv_utils.
+        return kv_utils.KVCache(updated_kv_entries)

     N = 1
     HEAD_DIM = 2
@@ -107,7 +105,7 @@ class TestExternalKVLayers(googletest.TestCase):
         num_query_groups=NUM_QG,
         kv_cache_max_len=KV_LEN,
     )
-    kv = kv_utils.
+    kv = kv_utils.KVCache.from_model_config(config)
     model = TestModel()
     exported_program = torch.export.export(model, (kv,))
     input_specs = exported_program.graph_signature.input_specs
@@ -116,17 +114,5 @@ class TestExternalKVLayers(googletest.TestCase):
     self.assertEqual(input_specs[1].arg.name, "kv_v_0")


-class TestExternalKVModels(googletest.TestCase):
-
-  def test_can_build_gemma(self):
-    gemma.define_and_run_2b(checkpoint_path=None, test_model=True)
-
-  def test_can_build_phi2(self):
-    phi2.define_and_run(checkpoint_path=None, test_model=True)
-
-  def test_can_build_tinyllama(self):
-    tiny_llama.define_and_run(checkpoint_path=None, test_model=True)
-
-
 if __name__ == "__main__":
   googletest.main()
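The serialization test above works because KVCache is registered as a pytree node, so torch.export sees one k_i/v_i tensor leaf per layer. A short sketch of that flattening behavior, with illustrative shapes and a single layer:

import torch
import torch.utils._pytree as pytree

from ai_edge_torch.generative.layers import kv_cache as kv_utils

entry = kv_utils.KVCacheEntry(
    k_cache=torch.zeros(1, 16, 1, 4), v_cache=torch.zeros(1, 16, 1, 4)
)
kv = kv_utils.KVCache(caches=(entry,))

flat = kv.flatten()  # [k_0, v_0] tensors, in layer order.

# The registered pytree node round-trips through flatten/unflatten, which is
# what lets a KVCache be passed directly as a torch.export input.
leaves, spec = pytree.tree_flatten(kv)
rebuilt = pytree.tree_unflatten(leaves, spec)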