PyPI - ai-edge-torch-nightly - Versions diffs - 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

Files changed (169) hide show

ai_edge_torch/generative/layers/feed_forward.py CHANGED Viewed

@@ -14,11 +14,10 @@
 # ==============================================================================
 # Common building blocks for FeedForward layers.
-from typing import Callable
+from typing import Callable, Optional
 import torch
 from torch import nn
-import torch.nn.functional as F
 class SequentialFeedForward(nn.Module):
@@ -30,19 +29,30 @@ class SequentialFeedForward(nn.Module):
       hidden_dim: int,
       activation: Callable[[torch.Tensor], torch.Tensor],
       use_bias=False,
+      use_glu=False,
+      pre_ff_norm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
+      post_ff_norm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
   ):
     """Init function for feedforward layer.
     Args:
-      dim(int): embedding size.
-      hidden_dim(int): hidden dim size of the feedforward layer.
-      activation(Callable): activation function used in this block.
-      use_bias(Boolean): whether to use bias. Default is false.
+      dim (int): embedding size.
+      hidden_dim (int): hidden dim size of the feedforward layer.
+      activation (Callable): activation function used in this block.
+      use_bias (Boolean): whether to use bias. Default is false.
+      use_glu (Boolean): whether to use glu in activation. Default is false.
+      pre_ff_norm (Callable): pre feedforward norm. Default is None.
+      post_ff_norm (Callable): post feedforward norm. Default is None.
     """
     super().__init__()
     self.act = activation
-    self.w1 = nn.Linear(dim, hidden_dim, bias=use_bias)
+    if use_glu:
+      self.w1 = nn.Linear(dim, hidden_dim * 2, bias=use_bias)
+    else:
+      self.w1 = nn.Linear(dim, hidden_dim, bias=use_bias)
     self.w2 = nn.Linear(hidden_dim, dim, bias=use_bias)
+    self.pre_ff_norm = pre_ff_norm if pre_ff_norm else lambda x: x
+    self.post_ff_norm = post_ff_norm if post_ff_norm else lambda x: x
   def forward(self, x):
     """Forward pass for Feedforward layer.
@@ -53,7 +63,9 @@ class SequentialFeedForward(nn.Module):
     Returns:
       torch.Tensor: output tensor after feedforward.
     """
-    return self.w2(self.act(self.w1(x)))
+    x_norm = self.pre_ff_norm(x)
+    out = self.w2(self.act(self.w1(x_norm)))
+    return self.post_ff_norm(out)
 class GatedFeedForward(nn.Module):
@@ -68,20 +80,31 @@ class GatedFeedForward(nn.Module):
       hidden_dim: int,
       activation: Callable[[torch.Tensor], torch.Tensor],
       use_bias=False,
+      use_glu=False,
+      pre_ff_norm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
+      post_ff_norm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
   ):
     """Init function for feedforward layer.
     Args:
-      dim(int): embedding size.
-      hidden_dim(int): hidden dim size of the feedforward layer.
-      activation(Callable): activation function used in this block.
-      use_bias(Boolean): whether to use bias. Default is false.
+      dim (int): embedding size.
+      hidden_dim (int): hidden dim size of the feedforward layer.
+      activation (Callable): activation function used in this block.
+      use_bias (Boolean): whether to use bias. Default is false.
+      use_glu (Boolean): whether to use glu in activation. Default is false.
+      pre_ff_norm (Callable): pre feedforward norm. Default is None.
+      post_ff_norm (Callable): post feedforward norm. Default is None.
     """
     super().__init__()
     self.act = activation
-    self.w1 = nn.Linear(dim, hidden_dim, bias=use_bias)
+    if use_glu:
+      self.w1 = nn.Linear(dim, hidden_dim * 2, bias=use_bias)
+    else:
+      self.w1 = nn.Linear(dim, hidden_dim, bias=use_bias)
     self.w2 = nn.Linear(hidden_dim, dim, bias=use_bias)
     self.w3 = nn.Linear(dim, hidden_dim, bias=use_bias)
+    self.pre_ff_norm = pre_ff_norm if pre_ff_norm else lambda x: x
+    self.post_ff_norm = post_ff_norm if post_ff_norm else lambda x: x
   def forward(self, x):
     """Forward pass for Feedforward layer.
@@ -92,4 +115,6 @@ class GatedFeedForward(nn.Module):
     Returns:
       torch.Tensor: output tensor after feedforward.
     """
-    return self.w2(self.act(self.w1(x)) * self.w3(x))
+    x_norm = self.pre_ff_norm(x)
+    out = self.w2(self.act(self.w1(x_norm)) * self.w3(x_norm))
+    return self.post_ff_norm(out)

ai_edge_torch/generative/layers/kv_cache.py CHANGED Viewed

@@ -12,72 +12,184 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# `nn.Module` which implements a KV cache.
+"""Utility functions for externalized KV Cache."""
+import dataclasses
+from typing import List, Tuple
+from ai_edge_torch import hlfb
+from ai_edge_torch.generative.layers import model_config
 import torch
-from torch import nn
-import torch_xla
+import torch.utils._pytree as pytree
-from ai_edge_torch.hlfb import StableHLOCompositeBuilder
+BATCH_SIZE = 1
-class KVCache(nn.Module):
+@dataclasses.dataclass
+class KVCacheEntry:
+  """A single cache entry that includes K and V caches.
-  def __init__(self, batch_size, kv_cache_max, n_heads, head_dim, enable_hlfb=False):
-    """Initializes the KVCache layer.
+  The caches are built based on the provided config with the shape of
+  (batch_size=1, kv_cache_max, num_query_groups, head_dim).
+  """
-    Args:
-      batch_size (int): batch size. Currently only batch size 1 is supported.
-      kv_cache_max (int): the max length of KV cache.
-      n_heads (int): number of kv heads.
-      head_dim (int): the head dimension size.
-      enable_hlfb (bool): whether hlfb is enabled or not.
-    """
-    super().__init__()
-    cache_shape = (batch_size, kv_cache_max, n_heads, head_dim)
-    self.register_buffer("k_cache", torch.zeros(cache_shape), persistent=False)
-    self.register_buffer("v_cache", torch.zeros(cache_shape), persistent=False)
-    self.enable_hlfb = enable_hlfb
-    self.kv_cache_max = kv_cache_max
+  k_cache: torch.Tensor
+  v_cache: torch.Tensor
-  def update_cache(self, input_pos, k_val, v_val):
-    """Update an entry in the KV cache.
+  @classmethod
+  def from_model_config(
+      cls,
+      kv_cache_max: int,
+      config: model_config.AttentionConfig,
+      dtype: torch.dtype = torch.float32,
+      device: torch.device = None,
+  ) -> "KVCacheEntry":
+    """Build an instance of the class based on model config."""
+    shape = (BATCH_SIZE, kv_cache_max, config.num_query_groups, config.head_dim)
+    k = torch.zeros(shape, dtype=dtype, device=device)
+    v = torch.zeros(shape, dtype=dtype, device=device)
+    obj = cls(k_cache=k, v_cache=v)
+    return obj
-    Args:
-      input_pos (torch.Tensor): the input position.
-      k_val (torch.Tensor): the new `key` value.
-      v_val (torch.Tensor): the new `value` value.
-    Returns:
-      The updated key and value tensor.
-    """
-    if self.enable_hlfb:
-      return self.update_cache_with_hlfb(input_pos, k_val, v_val)
+@dataclasses.dataclass
+class KVCache:
+  """A utility class for holding KV cache entries per layer."""
-    updated_k = self.k_cache.index_copy_(1, input_pos, k_val)
-    updated_v = self.v_cache.index_copy_(1, input_pos, v_val)
-    # Here we need a clone otherwise dynamo export will fail.
-    return torch.clone(updated_k), torch.clone(updated_v)
+  caches: Tuple[KVCacheEntry, ...]
-  def update_cache_with_hlfb(self, input_pos, k_val, v_val):
-    """Update an entry in the KV cache and enable high-level function boundary.
+  @classmethod
+  def from_model_config(
+      cls,
+      config: model_config.ModelConfig,
+      dtype: torch.dtype = torch.float32,
+      device: torch.device = None,
+  ) -> "KVCache":
+    """Build an instance of the class based on model config.
     Args:
-      input_pos (torch.Tensor): the input position.
-      k_val (torch.Tensor): the new `key` value.
-      v_val (torch.Tensor): the new `value` value.
+        config (ModelConfig): Model config used for building the cache.
+        dtype (torch.dtype, optional): The data type of the cache tensor.
+          Defaults to torch.float32.
+        device (torch.device, optional): The device placement of the cache
+          tensors. Defaults to None.
     Returns:
-      The updated key and value tensor.
+        KVCache: The created cache object.
     """
+    caches = [
+        KVCacheEntry.from_model_config(
+            config.kv_cache_max,
+            config.block_config(idx).attn_config,
+            dtype,
+            device,
+        )
+        for idx in range(config.num_layers)
+    ]
+    obj = cls(caches=tuple(caches))
+    return obj
-    builder = StableHLOCompositeBuilder(
-        name="odml.update_kv_cache", attr={"kv_cache_max": self.kv_cache_max}
-    )
-    k_cache, v_cache, input_pos, k_val, v_val = builder.mark_inputs(
-        self.k_cache, self.v_cache, input_pos, k_val, v_val
+  def flatten(self) -> List[torch.Tensor]:
+    """Flatten the cache entries into a list of tensors with order k_i, v_i."""
+    flattened, _ = _flatten_kvc(self)
+    return flattened
+def _flatten_kvc(kvc: KVCache) -> Tuple[List[str], List[str]]:
+  flattened = []
+  flat_names = []
+  none_names = []
+  for i, kv_entry in enumerate(kvc.caches):
+    flattened.append(kv_entry.k_cache)
+    flat_names.append(f"k_{i}")
+    flattened.append(kv_entry.v_cache)
+    flat_names.append(f"v_{i}")
+  return flattened, [flat_names, none_names]
+def _flatten_kvc_with_keys(kvc: KVCache) -> Tuple[List, List]:
+  flattened, (flat_names, none_names) = _flatten_kvc(kvc)
+  return [
+      (pytree.MappingKey(k), v) for k, v in zip(flat_names, flattened)
+  ], flat_names
+def _unflatten_kvc(
+    values: List[torch.Tensor], context: Tuple[List, List]
+) -> KVCache:
+  assert len(values) % 2 == 0, "Found odd number of K and V entries."
+  num_layers = len(values) // 2
+  flat_names = context[0]
+  kv_entries = []
+  for i in range(num_layers):
+    k_cache_idx = flat_names.index(f"k_{i}")
+    v_cache_idx = flat_names.index(f"v_{i}")
+    kv_entries.append(
+        KVCacheEntry(k_cache=values[k_cache_idx], v_cache=values[v_cache_idx])
     )
-    updated_k = k_cache.index_copy_(1, input_pos, k_val)
-    updated_v = v_cache.index_copy_(1, input_pos, v_val)
-    updated_k, updated_v = builder.mark_outputs(updated_k, updated_v)
-    return updated_k, updated_v
+  obj = KVCache(tuple(kv_entries))
+  return obj
+pytree.register_pytree_node(
+    KVCache,
+    _flatten_kvc,
+    _unflatten_kvc,
+    flatten_with_keys_fn=_flatten_kvc_with_keys,
+    serialized_type_name="",
+)
+def update(
+    cache: KVCacheEntry,
+    input_pos: torch.Tensor,
+    k_slice: torch.Tensor,
+    v_slice: torch.Tensor,
+    enable_hlfb: bool = True,
+) -> KVCacheEntry:
+  """Out of place update of Cache buffer.
+  Args:
+      cache (KVCacheEntry): The original cache buffer.
+      input_pos (torch.Tensor): The update slice positions.
+      k_slice (torch.Tensor): The K slice to be updated in the new cache.
+      v_slice (torch.Tensor): The V slice to be updated in the new cache.
+      enable_hlfb (bool, optional): Whether the op is annotated for export with
+        High Level Function Boundary. Defaults to True.
+  Returns:
+      KVCacheEntry: The updated KVCache entry based on the passed inputs.
+  """
+  update_func = _update_kv_hlfb_impl if enable_hlfb else _update_kv_base_impl
+  return update_func(cache, input_pos, k_slice, v_slice)
+def _update_kv_base_impl(
+    cache: KVCacheEntry,
+    input_pos: torch.Tensor,
+    k_slice: torch.Tensor,
+    v_slice: torch.Tensor,
+) -> KVCacheEntry:
+  """Update the cache buffer without High Level Function Boundary annotation."""
+  k = cache.k_cache.index_copy(1, input_pos.to(torch.long), k_slice)
+  v = cache.v_cache.index_copy(1, input_pos.to(torch.long), v_slice)
+  updated_cache = KVCacheEntry(k, v)
+  return updated_cache
+def _update_kv_hlfb_impl(
+    cache: KVCacheEntry,
+    input_pos: torch.Tensor,
+    k_slice: torch.Tensor,
+    v_slice: torch.Tensor,
+) -> KVCacheEntry:
+  """Update the cache buffer with High Level Function Boundary annotation."""
+  builder = hlfb.StableHLOCompositeBuilder(name="odml.update_external_kv_cache")
+  k_cache, v_cache, input_pos, k_slice, v_slice = builder.mark_inputs(
+      cache.k_cache, cache.v_cache, input_pos, k_slice, v_slice
+  )
+  k = k_cache.index_copy(1, input_pos.to(torch.long), k_slice)
+  v = v_cache.index_copy(1, input_pos.to(torch.long), v_slice)
+  k, v = builder.mark_outputs(k, v)
+  return KVCacheEntry(k, v)

ai_edge_torch/generative/layers/model_config.py CHANGED Viewed

@@ -16,7 +16,7 @@
 from dataclasses import dataclass
 from dataclasses import field
 import enum
-from typing import Optional
+from typing import Optional, Sequence, Union
 @enum.unique
@@ -30,6 +30,7 @@ class ActivationType(enum.Enum):
   GELU_QUICK = enum.auto()
   GE_GLU = enum.auto()
   RELU = enum.auto()
+  SILU_GLU = enum.auto()
 @enum.unique
@@ -53,11 +54,32 @@ class FeedForwardType(enum.Enum):
   GATED = enum.auto()
+class AttentionType(enum.Enum):
+  GLOBAL = enum.auto()
+  LOCAL_SLIDING = enum.auto()
+@dataclass
+class NormalizationConfig:
+  """Normalizer parameters."""
+  type: NormalizationType = NormalizationType.NONE
+  enable_hlfb: bool = False
+  epsilon: float = 1e-5
+  zero_centered: bool = False
+  # Number of groups used in group normalization.
+  group_num: Optional[float] = None
+  # Whether to use the input shape to determine the dimension of normalization
+  # when type is LAYER_NORM.
+  use_input_shape: bool = True
 @dataclass
 class AttentionConfig:
-  """Attention moduel's parameters."""
+  """Attention module's parameters."""
   num_heads: int
+  head_dim: int
   # Used to determine number of groups in grouped query attention (GQA)
   # https://arxiv.org/pdf/2305.13245.pdf
   num_query_groups: Optional[int]
@@ -75,8 +97,22 @@ class AttentionConfig:
   # Whether to use bias with attention output projection.
   output_proj_use_bias: bool = False
   enable_kv_cache: bool = True
+  # The normalization applied to query projection's output.
+  query_norm_config: NormalizationConfig = field(
+      default_factory=NormalizationConfig
+  )
+  # The normalization applied to key projection's output.
+  key_norm_config: NormalizationConfig = field(
+      default_factory=NormalizationConfig
+  )
   relative_attention_num_buckets: int = 0
   relative_attention_max_distance: int = 0
+  # Softcap on the output logits.
+  logit_softcap: Optional[float] = None
+  # The type of attention.
+  attn_type: Optional[AttentionType] = None
+  # The size of the sliding window used for local attention.
+  sliding_window_size: Optional[int] = None
 @dataclass
@@ -95,17 +131,37 @@ class FeedForwardConfig:
   activation: ActivationConfig
   intermediate_size: int
   use_bias: bool = False
+  # The normalization applied to feed forward's input.
+  pre_ff_norm_config: NormalizationConfig = field(
+      default_factory=NormalizationConfig
+  )
+  # The normalization applied to feed forward's output.
+  post_ff_norm_config: NormalizationConfig = field(
+      default_factory=NormalizationConfig
+  )
 @dataclass
-class NormalizationConfig:
-  """Normalizater parameters."""
+class TransformerBlockConfig:
+  """TransformerBlock module's parameters."""
-  type: NormalizationType = NormalizationType.NONE
-  epsilon: float = 1e-5
-  zero_centered: bool = False
-  # Number of groups used in group normalization.
-  group_num: Optional[float] = None
+  attn_config: AttentionConfig
+  ff_config: FeedForwardConfig
+  # The normalization applied to attention's input.
+  pre_attention_norm_config: NormalizationConfig = field(
+      default_factory=NormalizationConfig
+  )
+  # The normalization applied to attention's output.
+  post_attention_norm_config: NormalizationConfig = field(
+      default_factory=NormalizationConfig
+  )
+  # If set to True, only pre_attention_norm_config is applied to the input and
+  # the decode's output is computed as `output = input + attn_out + ff_out`
+  # where attention and feed forward are called with the pre-attention norm's
+  # output.
+  parallel_residual: bool = False
+  # The Attention computation will include relative positional bias.
+  relative_attention: bool = False
 @dataclass
@@ -117,21 +173,15 @@ class ModelConfig:
   max_seq_len: int
   embedding_dim: int
-  attn_config: AttentionConfig
-  ff_config: FeedForwardConfig
-  # The normalization applied to attention's input.
-  pre_attention_norm_config: NormalizationConfig = field(
+  # TransformerBlockConfig for each layer block. If a single
+  # TransformerBlockConfig is provided, it will be used for all layers.
+  block_configs: Union[TransformerBlockConfig, Sequence[TransformerBlockConfig]]
+  # The normalization applied before LM head.
+  final_norm_config: NormalizationConfig = field(
       default_factory=NormalizationConfig
   )
-  # The normalization applied to feed forward's input.
-  pre_ff_norm_config: NormalizationConfig = field(default_factory=NormalizationConfig)
-  # The normalization applied before LM head.
-  final_norm_config: NormalizationConfig = field(default_factory=NormalizationConfig)
-  # If set to True, only pre_attention_norm is applied to the input and the
-  # decode's output is computed as `output = input + attn_out + ff_out` where
-  # attention and feed forward are called with pre_attention_norm's output.
-  parallel_residual: bool = False
   # Use bias term within LLM's HEAD.
   lm_head_use_bias: bool = False
   # Whether to turn on high-level function boundary.
@@ -140,19 +190,23 @@ class ModelConfig:
   # The maximum sequence length of the KV cache. Should not exceed max_seq_len.
   kv_cache_max_len: int = 0
-  # The Attention computation will include relative positional bias.
-  relative_attention: bool = False
   # Default batch size of the exported model. Default value is 1.
   batch_size: int = 1
+  # Softcap on the model output logits.
+  final_logit_softcap: Optional[float] = None
   @property
   def kv_cache_max(self) -> int:
     if self.kv_cache_max_len > 0:
       return self.kv_cache_max_len
-    else:
-      return self.max_seq_len
-  @property
-  def head_dim(self) -> int:
-    return self.embedding_dim // self.attn_config.num_heads
+    return self.max_seq_len
+  def block_config(self, idx: int) -> TransformerBlockConfig:
+    if isinstance(self.block_configs, TransformerBlockConfig):
+      return self.block_configs
+    if idx < 0 or idx >= len(self.block_configs):
+      raise ValueError(
+          f"Index {idx} is out of range for layer configs: {self.block_configs}"
+      )
+    return self.block_configs[idx]

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl