ai-edge-torch-nightly 0.3.0.dev20240909__py3-none-any.whl → 0.3.0.dev20240913__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_edge_torch/_convert/test/test_convert.py +35 -13
- ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +31 -12
- ai_edge_torch/generative/examples/gemma/convert_to_tflite.py +25 -6
- ai_edge_torch/generative/examples/gemma/gemma.py +50 -30
- ai_edge_torch/generative/examples/gemma/gemma2.py +85 -58
- ai_edge_torch/generative/examples/{experimental/phi → phi}/convert_to_tflite.py +11 -12
- ai_edge_torch/generative/examples/{experimental/phi → phi}/phi2.py +46 -43
- ai_edge_torch/generative/examples/{experimental/gemma → smallm}/convert_to_tflite.py +12 -14
- ai_edge_torch/generative/examples/smallm/smallm.py +122 -0
- ai_edge_torch/generative/examples/stable_diffusion/clip.py +11 -5
- ai_edge_torch/generative/examples/t5/t5.py +35 -22
- ai_edge_torch/generative/examples/t5/t5_attention.py +18 -13
- ai_edge_torch/generative/examples/test_models/toy_model.py +15 -13
- ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +74 -33
- ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +25 -6
- ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +55 -34
- ai_edge_torch/generative/layers/attention.py +77 -73
- ai_edge_torch/generative/layers/builder.py +5 -3
- ai_edge_torch/generative/layers/kv_cache.py +163 -51
- ai_edge_torch/generative/layers/model_config.py +38 -19
- ai_edge_torch/generative/layers/normalization.py +158 -0
- ai_edge_torch/generative/layers/unet/blocks_2d.py +0 -2
- ai_edge_torch/generative/test/{test_experimental_ekv.py → test_kv_cache.py} +12 -24
- ai_edge_torch/generative/test/test_loader.py +1 -1
- ai_edge_torch/generative/test/test_model_conversion.py +72 -34
- ai_edge_torch/generative/test/test_model_conversion_large.py +51 -23
- ai_edge_torch/generative/test/utils.py +54 -0
- ai_edge_torch/generative/utilities/loader.py +15 -15
- ai_edge_torch/generative/utilities/t5_loader.py +21 -20
- ai_edge_torch/odml_torch/lowerings/__init__.py +1 -0
- ai_edge_torch/odml_torch/lowerings/_convolution.py +196 -74
- ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +0 -2
- ai_edge_torch/odml_torch/lowerings/_layer_norm.py +78 -0
- ai_edge_torch/version.py +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/METADATA +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/RECORD +41 -47
- ai_edge_torch/generative/examples/experimental/gemma/gemma.py +0 -219
- ai_edge_torch/generative/examples/experimental/phi/__init__.py +0 -14
- ai_edge_torch/generative/examples/experimental/tiny_llama/__init__.py +0 -14
- ai_edge_torch/generative/examples/experimental/tiny_llama/convert_to_tflite.py +0 -87
- ai_edge_torch/generative/examples/experimental/tiny_llama/tiny_llama.py +0 -205
- ai_edge_torch/generative/examples/phi2/__init__.py +0 -14
- ai_edge_torch/generative/examples/phi2/convert_to_tflite.py +0 -67
- ai_edge_torch/generative/examples/phi2/phi2.py +0 -189
- ai_edge_torch/generative/examples/test_models/toy_model_with_external_kv_cache.py +0 -176
- /ai_edge_torch/generative/examples/{experimental → phi}/__init__.py +0 -0
- /ai_edge_torch/generative/examples/{experimental/gemma → smallm}/__init__.py +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/LICENSE +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/WHEEL +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240909.dist-info → ai_edge_torch_nightly-0.3.0.dev20240913.dist-info}/top_level.txt +0 -0
ai_edge_torch/generative/layers/kv_cache.py
@@ -12,72 +12,184 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# `nn.Module` which implements a KV cache.
 
-
+"""Utility functions for externalized KV Cache."""
+
+import dataclasses
+from typing import List, Tuple
+
+from ai_edge_torch import hlfb
+from ai_edge_torch.generative.layers import model_config
 import torch
-
+import torch.utils._pytree as pytree
 
+BATCH_SIZE = 1
 
-class KVCache(nn.Module):
 
-
-
-
-  """Initializes the KVCache layer.
+@dataclasses.dataclass
+class KVCacheEntry:
+  """A single cache entry that includes K and V caches.
 
-
-
-
-      n_heads (int): number of kv heads.
-      head_dim (int): the head dimension size.
-      enable_hlfb (bool): whether hlfb is enabled or not.
-    """
-    super().__init__()
-    cache_shape = (batch_size, kv_cache_max, n_heads, head_dim)
-    self.register_buffer("k_cache", torch.zeros(cache_shape), persistent=False)
-    self.register_buffer("v_cache", torch.zeros(cache_shape), persistent=False)
-    self.enable_hlfb = enable_hlfb
-    self.kv_cache_max = kv_cache_max
+  The chaches are built based on the provided config with the shape of
+  (batch_size=1, kv_cache_max, num_query_groups, head_dim).
+  """
 
-
-
+  k_cache: torch.Tensor
+  v_cache: torch.Tensor
 
-
-
-
-
+  @classmethod
+  def from_model_config(
+      cls,
+      kv_cache_max: int,
+      config: model_config.AttentionConfig,
+      dtype: torch.dtype = torch.float32,
+      device: torch.device = None,
+  ) -> "KVCacheEntry":
+    """Build an instance of the class based on model config."""
+    shape = (BATCH_SIZE, kv_cache_max, config.num_query_groups, config.head_dim)
+    k = torch.zeros(shape, dtype=dtype, device=device)
+    v = torch.zeros(shape, dtype=dtype, device=device)
+    obj = cls(k_cache=k, v_cache=v)
+    return obj
 
-    Returns:
-      The updated key and value tensor.
-    """
-    if self.enable_hlfb:
-      return self.update_cache_with_hlfb(input_pos, k_val, v_val)
 
-
-
-
-    return torch.clone(updated_k), torch.clone(updated_v)
+@dataclasses.dataclass
+class KVCache:
+  """A utility class for holding KV cache entries per layer."""
 
-
-
+  caches: Tuple[KVCacheEntry, ...]
+
+  @classmethod
+  def from_model_config(
+      cls,
+      config: model_config.ModelConfig,
+      dtype: torch.dtype = torch.float32,
+      device: torch.device = None,
+  ) -> "KVCache":
+    """Build an instance of the class based on model config.
 
     Args:
-
-
-
+      config (ModelConfig): Model config used for building the cache.
+      dtype (torch.dtype, optional): The data type of the cache tensor.
+        Defaults to torch.float32.
+      device (torch.device, optional): The device placement of the cache
+        tensors. Defaults to None.
 
     Returns:
-
+      KVCache: The created cache object.
     """
+    caches = [
+        KVCacheEntry.from_model_config(
+            config.kv_cache_max,
+            config.block_config(idx).attn_config,
+            dtype,
+            device,
+        )
+        for idx in range(config.num_layers)
+    ]
+    obj = cls(caches=tuple(caches))
+    return obj
 
-
-
-    )
-
-
+  def flatten(self) -> List[torch.Tensor]:
+    """Flatten the cache entries into a list of tensors with order k_i, v_i."""
+    flattened, _ = _flatten_kvc(self)
+    return flattened
+
+
+def _flatten_kvc(kvc: KVCache) -> Tuple[List[str], List[str]]:
+  flattened = []
+  flat_names = []
+  none_names = []
+  for i, kv_entry in enumerate(kvc.caches):
+    flattened.append(kv_entry.k_cache)
+    flat_names.append(f"k_{i}")
+    flattened.append(kv_entry.v_cache)
+    flat_names.append(f"v_{i}")
+  return flattened, [flat_names, none_names]
+
+
+def _flatten_kvc_with_keys(kvc: KVCache) -> Tuple[List, List]:
+  flattened, (flat_names, none_names) = _flatten_kvc(kvc)
+  return [
+      (pytree.MappingKey(k), v) for k, v in zip(flat_names, flattened)
+  ], flat_names
+
+
+def _unflatten_kvc(
+    values: List[torch.Tensor], context: Tuple[List, List]
+) -> KVCache:
+  assert len(values) % 2 == 0, "Found odd number of K and V entries."
+  num_layers = len(values) // 2
+  flat_names = context[0]
+  kv_entries = []
+  for i in range(num_layers):
+    k_cache_idx = flat_names.index(f"k_{i}")
+    v_cache_idx = flat_names.index(f"v_{i}")
+    kv_entries.append(
+        KVCacheEntry(k_cache=values[k_cache_idx], v_cache=values[v_cache_idx])
     )
-
-
-
-
+  obj = KVCache(tuple(kv_entries))
+  return obj
+
+
+pytree.register_pytree_node(
+    KVCache,
+    _flatten_kvc,
+    _unflatten_kvc,
+    flatten_with_keys_fn=_flatten_kvc_with_keys,
+    serialized_type_name="",
+)
+
+
+def update(
+    cache: KVCacheEntry,
+    input_pos: torch.Tensor,
+    k_slice: torch.Tensor,
+    v_slice: torch.Tensor,
+    enable_hlfb: bool = True,
+) -> KVCacheEntry:
+  """Out of place update of Cache buffer.
+
+  Args:
+    cache (KVCacheEntry): The original cache buffer.
+    input_pos (torch.Tensor): The update slice positions.
+    k_slice (torch.Tensor): The K slice to be updated in the new cache.
+    v_slice (torch.Tensor): The V slice to be updated in the new cache.
+    enable_hlfb (bool, optional): Whether the op is annotated for export with
+      High Level Function Boundary. Defaults to True.
+
+  Returns:
+    KVCacheEntry: The updated KVCache entry based on the passed inputs.
+  """
+  update_func = _update_kv_hlfb_impl if enable_hlfb else _update_kv_base_impl
+  return update_func(cache, input_pos, k_slice, v_slice)
+
+
+def _update_kv_base_impl(
+    cache: KVCacheEntry,
+    input_pos: torch.Tensor,
+    k_slice: torch.Tensor,
+    v_slice: torch.Tensor,
+) -> KVCacheEntry:
+  """Update the cache buffer without High Level Function Boundary annotation."""
+  k = cache.k_cache.index_copy(1, input_pos, k_slice)
+  v = cache.v_cache.index_copy(1, input_pos, v_slice)
+  updated_cache = KVCacheEntry(k, v)
+  return updated_cache
+
+
+def _update_kv_hlfb_impl(
+    cache: KVCacheEntry,
+    input_pos: torch.Tensor,
+    k_slice: torch.Tensor,
+    v_slice: torch.Tensor,
+) -> KVCacheEntry:
+  """Update the cache buffer with High Level Function Boundary annotation."""
+  builder = hlfb.StableHLOCompositeBuilder(name="odml.update_external_kv_cache")
+  k_cache, v_cache, input_pos, k_slice, v_slice = builder.mark_inputs(
+      cache.k_cache, cache.v_cache, input_pos, k_slice, v_slice
+  )
+  k = k_cache.index_copy(1, input_pos, k_slice)
+  v = v_cache.index_copy(1, input_pos, v_slice)
+  k, v = builder.mark_outputs(k, v)
+  return KVCacheEntry(k, v)
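For orientation, here is a minimal usage sketch of the externalized KV cache API added above. The config values are illustrative placeholders that mirror the updated unit tests in this release, not any shipped model:

```python
# Sketch only: illustrative sizes, mirroring the updated test_kv_cache.py.
import torch

from ai_edge_torch.generative.layers import kv_cache as kv_utils
import ai_edge_torch.generative.layers.model_config as cfg

attn_config = cfg.AttentionConfig(
    num_heads=1, head_dim=4, num_query_groups=1
)
block_config = cfg.TransformerBlockConfig(
    attn_config=attn_config, ff_config=None
)
config = cfg.ModelConfig(
    vocab_size=None,
    num_layers=2,
    max_seq_len=None,
    embedding_dim=4,
    block_configs=block_config,
    kv_cache_max_len=8,
)

# One KVCacheEntry per layer, each shaped
# (batch_size=1, kv_cache_max, num_query_groups, head_dim).
kv = kv_utils.KVCache.from_model_config(config)

# Out-of-place update of layer 0 at position 1. enable_hlfb=False takes the
# plain index_copy path; enable_hlfb=True wraps the update in the
# "odml.update_external_kv_cache" composite for export.
input_pos = torch.tensor([1])
k_slice = torch.ones((1, 1, 1, 4))
v_slice = torch.ones((1, 1, 1, 4))
updated_entry = kv_utils.update(
    kv.caches[0], input_pos, k_slice, v_slice, enable_hlfb=False
)
```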
ai_edge_torch/generative/layers/model_config.py
@@ -16,7 +16,7 @@
 from dataclasses import dataclass
 from dataclasses import field
 import enum
-from typing import Optional, Sequence
+from typing import Optional, Sequence, Union
 
 
 @enum.unique
@@ -85,8 +85,8 @@ class AttentionConfig:
   relative_attention_max_distance: int = 0
   # Softcap on the output logits.
   logit_softcap: Optional[float] = None
-  # The
-
+  # The type of attention.
+  attn_type: Optional[AttentionType] = None
   # The size of the sliding window used for local attention.
   sliding_window_size: Optional[int] = None
 
@@ -104,6 +104,7 @@ class NormalizationConfig:
   """Normalizater parameters."""
 
   type: NormalizationType = NormalizationType.NONE
+  enable_hlfb: bool = False
   epsilon: float = 1e-5
   zero_centered: bool = False
   # Number of groups used in group normalization.
@@ -129,13 +130,8 @@ class FeedForwardConfig:
 
 
 @dataclass
-class
-  """
-
-  vocab_size: int
-  num_layers: int
-  max_seq_len: int
-  embedding_dim: int
+class TransformerBlockConfig:
+  """TransformerBlock module's parameters."""
 
   attn_config: AttentionConfig
   ff_config: FeedForwardConfig
@@ -147,15 +143,33 @@ class ModelConfig:
   post_attention_norm_config: NormalizationConfig = field(
       default_factory=NormalizationConfig
   )
+  # If set to True, only attn_config.pre_attention_norm is applied to the input
+  # and the decode's output is computed as `output = input + attn_out + ff_out`
+  # where attention and feed forward are called with pre_attention_norm's
+  # output.
+  parallel_residual: bool = False
+  # The Attention computation will include relative positional bias.
+  relative_attention: bool = False
+
+
+@dataclass
+class ModelConfig:
+  """Base configurations for building a transformer architecture."""
+
+  vocab_size: int
+  num_layers: int
+  max_seq_len: int
+  embedding_dim: int
+
+  # TransformerBlockConfig for each layer block. If a single
+  # TransformerBlockConfig is provided, it will be used for all layers.
+  block_configs: Union[TransformerBlockConfig, Sequence[TransformerBlockConfig]]
+
   # The normalization applied before LM head.
   final_norm_config: NormalizationConfig = field(
       default_factory=NormalizationConfig
   )
 
-  # If set to True, only pre_attention_norm is applied to the input and the
-  # decode's output is computed as `output = input + attn_out + ff_out` where
-  # attention and feed forward are called with pre_attention_norm's output.
-  parallel_residual: bool = False
   # Use bias term within LLM's HEAD.
   lm_head_use_bias: bool = False
   # Whether to turn on high-level function boundary.
@@ -164,9 +178,6 @@ class ModelConfig:
   # The maximum sequence length of the KV cache. Should not exceed max_seq_len.
   kv_cache_max_len: int = 0
 
-  # The Attention computation will include relative positional bias.
-  relative_attention: bool = False
-
   # Default batch size of the exported model. Default value is 1.
   batch_size: int = 1
 
@@ -177,5 +188,13 @@ class ModelConfig:
   def kv_cache_max(self) -> int:
     if self.kv_cache_max_len > 0:
       return self.kv_cache_max_len
-
-
+    return self.max_seq_len
+
+  def block_config(self, idx: int) -> TransformerBlockConfig:
+    if isinstance(self.block_configs, TransformerBlockConfig):
+      return self.block_configs
+    if idx < 0 or idx >= len(self.block_configs):
+      raise ValueError(
+          f"Index {idx} is out of range for layer configs: {self.block_configs}"
+      )
+    return self.block_configs[idx]
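The new `block_configs` field and `block_config(idx)` accessor let a model either share one `TransformerBlockConfig` across all layers or supply one per layer. A hedged sketch of the per-layer form follows; the sizes are placeholders, not taken from any released model:

```python
# Sketch only: per-layer block configs; field values are placeholders.
import ai_edge_torch.generative.layers.model_config as cfg

block_configs = [
    cfg.TransformerBlockConfig(
        attn_config=cfg.AttentionConfig(
            num_heads=8, head_dim=64, num_query_groups=1
        ),
        ff_config=None,
    )
    for _ in range(2)
]
config = cfg.ModelConfig(
    vocab_size=32000,
    num_layers=2,
    max_seq_len=1024,
    embedding_dim=512,
    block_configs=block_configs,  # a Sequence: one entry per layer
)

# Per-layer lookup; if a single TransformerBlockConfig is passed instead of a
# sequence, block_config(idx) returns that shared config for every idx.
attn_cfg_layer_0 = config.block_config(0).attn_config
```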
ai_edge_torch/generative/layers/normalization.py
@@ -14,7 +14,10 @@
 # ==============================================================================
 # Common normalization layers.
 
+from ai_edge_torch.hlfb import StableHLOCompositeBuilder
 import torch
+from torch import nn
+import torch.nn.functional as F
 
 
 # Implementation for RMSNorm from: https://arxiv.org/abs/1910.07467
@@ -58,3 +61,158 @@ class RMSNorm(torch.nn.Module):
       return output * (1 + self.weight)
     else:
       return output * self.weight
+
+
+class GroupNorm(torch.nn.Module):
+
+  def __init__(
+      self,
+      group_num: int,
+      dim: int,
+      eps: float = 1e-5,
+      enable_hlfb: bool = False,
+  ):
+    """Initialize the GroupNorm layer.
+
+    Args:
+      group_num (int): Number of groups to separate the channels into.
+      dim (int): Dimension of the input tensor.
+      eps (float): A small float value to ensure numerical stability (default:
+        1e-6).
+      enable_hlfb (bool): Whether to convert this normalization into a single
+        op.
+    """
+    super().__init__()
+    self.enable_hlfb = enable_hlfb
+    self.group_num = group_num
+    self.eps = eps
+    self.weight = torch.nn.Parameter(torch.ones(dim))
+    self.bias = torch.nn.Parameter(torch.ones(dim))
+
+  def forward(self, x):
+    """Running the forward pass of GroupNorm layer.
+
+    Args:
+      x (torch.Tensor): input tensor.
+
+    Returns:
+      torch.Tensor: output tensor after applying GroupNorm.
+    """
+    if self.enable_hlfb:
+      return group_norm_with_hlfb(
+          x,
+          self.weight,
+          self.bias,
+          self.group_num,
+          self.eps,
+      )
+    else:
+      return F.group_norm(x, self.group_num, self.weight, self.bias, self.eps)
+
+
+class LayerNorm(torch.nn.Module):
+
+  def __init__(self, dim: int, eps: float = 1e-5, enable_hlfb: bool = False):
+    """Initialize the LayerNorm layer.
+
+    Args:
+      dim (int): dimension of the input tensor.
+      eps (float): A small float value to ensure numerical stability (default:
+        1e-6).
+      enable_hlfb (bool): Whether to convert this normalization into a single
+        op.
+    """
+    super().__init__()
+    self.enable_hlfb = enable_hlfb
+    self.eps = eps
+    self.weight = torch.nn.Parameter(torch.ones(dim))
+    self.bias = torch.nn.Parameter(torch.ones(dim))
+
+  def forward(self, x):
+    """Running the forward pass of LayerNorm layer.
+
+    Args:
+      x (torch.Tensor): input tensor.
+
+    Returns:
+      torch.Tensor: output tensor after applying LayerNorm.
+    """
+    if self.enable_hlfb:
+      return layer_norm_with_hlfb(
+          x,
+          self.weight,
+          self.bias,
+          self.eps,
+      )
+    else:
+      return F.layer_norm(
+          x,
+          x.shape,
+          self.weight.broadcast_to(x.shape),
+          self.bias.broadcast_to(x.shape),
+          self.eps,
+      )
+
+
+def group_norm_with_hlfb(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    b: torch.Tensor,
+    num_groups: int,
+    eps: float,
+):
+  """Group Normalization with high-level function boundary enabled.
+
+  Args:
+    x (torch.Tensor): Input tensor for Group Normalization, with BCHW shape.
+    w (torch.Tensor): The weight tensor for the normalization.
+    b (torch.Tensor): The bias tensor for the normalization.
+    num_groups (int): Number of groups to separate the channels into.
+    eps (float): A small float value to ensure numerical stability.
+
+  Returns:
+    The output tensor of Group Normalization.
+  """
+  x = torch.permute(x, (0, 2, 3, 1))
+
+  builder = StableHLOCompositeBuilder(
+      name="odml.group_norm", attr={"num_groups": num_groups, "eps": eps}
+  )
+  x, w, b = builder.mark_inputs(x, w, b)
+  x = torch.permute(x, (0, 3, 1, 2))
+  y = F.group_norm(x, num_groups, weight=w, bias=b, eps=eps)
+  y = torch.permute(y, (0, 2, 3, 1))
+  y = builder.mark_outputs(y)
+
+  y = torch.permute(y, (0, 3, 1, 2))
+  return y
+
+
+def layer_norm_with_hlfb(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    b: torch.Tensor,
+    eps: float,
+):
+  """Layer Normalization with high-level function boundary enabled.
+
+  Args:
+    x (torch.Tensor): Input tensor for Layer Normalization.
+    w (torch.Tensor): The weight tensor for the normalization.
+    b (torch.Tensor): The bias tensor for the normalization.
+    eps (float): A small float value to ensure numerical stability.
+
+  Returns:
+    The output tensor of Layer Normalization.
+  """
+  builder = StableHLOCompositeBuilder(name="odml.layer_norm", attr={"eps": eps})
+  x, w, b = builder.mark_inputs(x, w, b)
+  y = F.layer_norm(
+      x,
+      x.shape,
+      weight=w.broadcast_to(x.shape),
+      bias=b.broadcast_to(x.shape),
+      eps=eps,
+  )
+  y = builder.mark_outputs(y)
+  return y
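A small sketch of the new normalization layers follows (tensor shapes are illustrative). With `enable_hlfb=True` the forward pass routes through `layer_norm_with_hlfb` / `group_norm_with_hlfb`, which wrap the op in an `odml.layer_norm` / `odml.group_norm` StableHLO composite during conversion; with the default `enable_hlfb=False` it falls back to the plain `torch.nn.functional` call shown in the diff:

```python
# Sketch only: exercising the functional (enable_hlfb=False) paths.
import torch

from ai_edge_torch.generative.layers import normalization

# LayerNorm normalizes over the full input shape, with weight/bias of size
# `dim` broadcast to the input.
layer_norm = normalization.LayerNorm(dim=8, eps=1e-5, enable_hlfb=False)
x = torch.randn(2, 8)
y = layer_norm(x)

# GroupNorm expects channel-first (BCHW) input; `dim` is the channel count.
group_norm = normalization.GroupNorm(group_num=4, dim=8, enable_hlfb=False)
img = torch.randn(1, 8, 16, 16)
z = group_norm(img)
```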
ai_edge_torch/generative/layers/unet/blocks_2d.py
@@ -122,7 +122,6 @@ class AttentionBlock2D(nn.Module):
         config.attention_batch_size,
         config.dim,
         config.attention_config,
-        0,
         enable_hlfb=config.enable_hlfb,
     )
 
@@ -180,7 +179,6 @@ class CrossAttentionBlock2D(nn.Module):
         config.query_dim,
         config.cross_dim,
         config.attention_config,
-        0,
         enable_hlfb=config.enable_hlfb,
     )
 
ai_edge_torch/generative/test/{test_experimental_ekv.py → test_kv_cache.py}
@@ -12,19 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# A suite of tests to validate experimental external KV Cache layers and models.
 
-
-
-from ai_edge_torch.generative.
-from ai_edge_torch.generative.layers.experimental import ekv_cache as kv_utils
+"""A suite of tests to validate KV Cache layer."""
+
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.model_config as cfg
 import torch
 
 from absl.testing import absltest as googletest
 
 
-class
+class TestKVLayers(googletest.TestCase):
 
   def _get_test_config(
       self, num_layers, head_dim, num_query_groups, kv_cache_max_len
@@ -32,14 +30,16 @@ class TestExternalKVLayers(googletest.TestCase):
     attn_config = cfg.AttentionConfig(
         num_heads=1, head_dim=head_dim, num_query_groups=num_query_groups
     )
+    block_config = cfg.TransformerBlockConfig(
+        attn_config=attn_config, ff_config=None
+    )
     config = cfg.ModelConfig(
         kv_cache_max_len=kv_cache_max_len,
         embedding_dim=head_dim,
-
+        block_configs=block_config,
         num_layers=num_layers,
         max_seq_len=None,
         vocab_size=None,
-        ff_config=None,
     )
     return config
 
@@ -54,7 +54,7 @@ class TestExternalKVLayers(googletest.TestCase):
         num_query_groups=NUM_QG,
         kv_cache_max_len=KV_LEN,
     )
-    kv = kv_utils.
+    kv = kv_utils.KVCache.from_model_config(config)
     entry = kv.caches[0]
     # single-slice update
     input_pos = torch.tensor([1])
@@ -88,14 +88,14 @@ class TestExternalKVLayers(googletest.TestCase):
   def test_serialization(self):
     class TestModel(torch.nn.Module):
 
-      def forward(self, kv: kv_utils.
+      def forward(self, kv: kv_utils.KVCache) -> kv_utils.KVCache:
         updated_kv_entries = [
             kv_utils.KVCacheEntry(
                 torch.zeros_like(entry.k_cache), torch.zeros_like(entry.v_cache)
            )
            for entry in kv.caches
        ]
-        return kv_utils.
+        return kv_utils.KVCache(updated_kv_entries)
 
     N = 1
     HEAD_DIM = 2
@@ -107,7 +107,7 @@ class TestExternalKVLayers(googletest.TestCase):
         num_query_groups=NUM_QG,
         kv_cache_max_len=KV_LEN,
     )
-    kv = kv_utils.
+    kv = kv_utils.KVCache.from_model_config(config)
     model = TestModel()
     exported_program = torch.export.export(model, (kv,))
     input_specs = exported_program.graph_signature.input_specs
@@ -116,17 +116,5 @@ class TestExternalKVLayers(googletest.TestCase):
     self.assertEqual(input_specs[1].arg.name, "kv_v_0")
 
 
-class TestExternalKVModels(googletest.TestCase):
-
-  def test_can_build_gemma(self):
-    gemma.define_and_run_2b(checkpoint_path=None, test_model=True)
-
-  def test_can_build_phi2(self):
-    phi2.define_and_run(checkpoint_path=None, test_model=True)
-
-  def test_can_build_tinyllama(self):
-    tiny_llama.define_and_run(checkpoint_path=None, test_model=True)
-
-
 if __name__ == "__main__":
   googletest.main()
ai_edge_torch/generative/test/test_loader.py
@@ -71,7 +71,7 @@ class TestLoader(googletest.TestCase):
     safetensors.torch.save_file(test_weights, file_path)
     cfg = tiny_llama.get_model_config()
     cfg.num_layers = 1
-    model = tiny_llama.
+    model = tiny_llama.TinyLlama(cfg)
 
     loader = loading_utils.ModelLoader(file_path, tiny_llama.TENSOR_NAMES)
     # if returns successfully, it means all the tensors were initiallized.