ai-edge-torch-nightly 0.2.0.dev20240610__py3-none-any.whl → 0.2.0.dev20240617__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ai-edge-torch-nightly might be problematic.

Files changed (30)
  1. ai_edge_torch/convert/conversion_utils.py +17 -5
  2. ai_edge_torch/convert/fx_passes/optimize_layout_transposes_pass/layout_partitioners/min_cut.py +19 -0
  3. ai_edge_torch/convert/fx_passes/optimize_layout_transposes_pass/pass_body.py +9 -2
  4. ai_edge_torch/generative/examples/stable_diffusion/convert_to_tflite.py +9 -6
  5. ai_edge_torch/generative/examples/stable_diffusion/decoder.py +33 -25
  6. ai_edge_torch/generative/examples/stable_diffusion/diffusion.py +523 -202
  7. ai_edge_torch/generative/examples/t5/t5_attention.py +10 -39
  8. ai_edge_torch/generative/layers/attention.py +154 -26
  9. ai_edge_torch/generative/layers/model_config.py +4 -0
  10. ai_edge_torch/generative/layers/unet/blocks_2d.py +473 -49
  11. ai_edge_torch/generative/layers/unet/builder.py +20 -2
  12. ai_edge_torch/generative/layers/unet/model_config.py +157 -5
  13. ai_edge_torch/generative/quantize/ai_edge_quantizer_glue/__init__.py +0 -0
  14. ai_edge_torch/generative/quantize/ai_edge_quantizer_glue/translate_recipe.py +164 -0
  15. ai_edge_torch/generative/quantize/quant_attrs.py +2 -0
  16. ai_edge_torch/generative/quantize/quant_recipe.py +49 -4
  17. ai_edge_torch/generative/quantize/quant_recipe_utils.py +2 -2
  18. ai_edge_torch/generative/quantize/quant_recipes.py +3 -3
  19. ai_edge_torch/generative/quantize/supported_schemes.py +2 -1
  20. ai_edge_torch/generative/test/test_model_conversion.py +24 -0
  21. ai_edge_torch/generative/test/test_quantize.py +75 -20
  22. ai_edge_torch/generative/utilities/stable_diffusion_loader.py +860 -0
  23. ai_edge_torch/generative/utilities/t5_loader.py +33 -17
  24. ai_edge_torch/quantize/quant_config.py +11 -15
  25. {ai_edge_torch_nightly-0.2.0.dev20240610.dist-info → ai_edge_torch_nightly-0.2.0.dev20240617.dist-info}/METADATA +1 -1
  26. {ai_edge_torch_nightly-0.2.0.dev20240610.dist-info → ai_edge_torch_nightly-0.2.0.dev20240617.dist-info}/RECORD +29 -27
  27. ai_edge_torch/generative/utilities/autoencoder_loader.py +0 -298
  28. {ai_edge_torch_nightly-0.2.0.dev20240610.dist-info → ai_edge_torch_nightly-0.2.0.dev20240617.dist-info}/LICENSE +0 -0
  29. {ai_edge_torch_nightly-0.2.0.dev20240610.dist-info → ai_edge_torch_nightly-0.2.0.dev20240617.dist-info}/WHEEL +0 -0
  30. {ai_edge_torch_nightly-0.2.0.dev20240610.dist-info → ai_edge_torch_nightly-0.2.0.dev20240617.dist-info}/top_level.txt +0 -0
ai_edge_torch/generative/examples/t5/t5_attention.py

@@ -20,6 +20,7 @@ import torch
  from torch import nn
  import torch.nn.functional as F

+ from ai_edge_torch.generative.layers.attention import CrossAttention
  import ai_edge_torch.generative.layers.builder as builder
  from ai_edge_torch.generative.layers.kv_cache import KVCache
  import ai_edge_torch.generative.layers.model_config as cfg
@@ -122,7 +123,7 @@ class EncoderDecoderBlock(nn.Module):
      return hidden_states, position_bias, encoder_decoder_position_bias


- class T5Attention(nn.Module):
+ class T5Attention(CrossAttention):

    def __init__(
        self,
@@ -138,51 +139,21 @@ class T5Attention(nn.Module):
      Args:
        dim (int): causal attention's input/output dimmension.
        config (cfg.AttentionConfig): attention specific configurations.
+       norm_config (cfg.NormalizationConfig): normalization configure before attention.
        kv_cache_max (int): determines the size of the KV Cache buffer, if enabled.
        enable_hlfb (bool): whether hlfb is enabled or not.
        has_relative_attention_bias (bool): whether we compute relative bias.
      """
-     super().__init__()
+     super().__init__(dim, dim, config, kv_cache_max, enable_hlfb)
      self.pre_atten_norm = builder.build_norm(dim, norm_config)

      self.has_relative_attention_bias = has_relative_attention_bias
      self.relative_attention_num_buckets = config.relative_attention_num_buckets
-     self.d_model = dim
-     self.head_dim = dim // config.num_heads
-     self.n_heads = config.num_heads
-     self.inner_dim = self.n_heads * self.head_dim
-
-     self.q = nn.Linear(self.d_model, self.inner_dim, bias=config.qkv_use_bias)
-     self.k = nn.Linear(self.d_model, self.inner_dim, bias=config.qkv_use_bias)
-     self.v = nn.Linear(self.d_model, self.inner_dim, bias=config.qkv_use_bias)
-     # output projection
-     self.proj = nn.Linear(
-         self.inner_dim, self.d_model, bias=config.output_proj_use_bias
-     )
-
      if self.has_relative_attention_bias:
        self.relative_attention_bias = nn.Embedding(
            self.relative_attention_num_buckets, self.n_heads
        )

-     self.config = config
-     self.kv_cache = None
-     # Build a k/v cache with size (batch_size, kv_cache_max, n_heads, head_dim).
-     # Now only supports a max batch_size of 1.
-     if config.enable_kv_cache:
-       self.kv_cache = KVCache(
-           1,
-           kv_cache_max,
-           config.num_query_groups,
-           self.head_dim,
-           enable_hlfb,
-       )
-
-     if enable_hlfb:
-       self.sdpa_func = scaled_dot_product_attention_with_hlfb
-     else:
-       self.sdpa_func = scaled_dot_product_attention
-
    def forward(
        self,
        x: torch.Tensor,
@@ -206,7 +177,7 @@ class T5Attention(nn.Module):

      x = self.pre_atten_norm(x)
      B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
-     query_states = self.q(x)
+     query_states = self.q_projection(x)
      query_states = query_states.reshape(B, T, -1, self.head_dim)  # (B, T, nh_q, hs)

      if key_value_states is not None:
@@ -217,13 +188,13 @@ class T5Attention(nn.Module):
        ) = (
            key_value_states.size()
        )  # batch size, sequence length, embedding dimensionality (n_embd)
-       key_states = self.k(key_value_states)
-       value_states = self.v(key_value_states)
+       key_states = self.k_projection(key_value_states)
+       value_states = self.v_projection(key_value_states)
        key_states = key_states.reshape(kvB, kvT, -1, self.head_dim)
        value_states = value_states.reshape(kvB, kvT, -1, self.head_dim)
      else:
-       key_states = self.k(x)
-       value_states = self.v(x)
+       key_states = self.k_projection(x)
+       value_states = self.v_projection(x)
        key_states = key_states.reshape(B, T, -1, self.head_dim)
        value_states = value_states.reshape(B, T, -1, self.head_dim)

@@ -251,5 +222,5 @@ class T5Attention(nn.Module):
      )
      y = y.reshape(B, T, C)  # re-assemble all head outputs side by side
      # output projection
-     y = self.proj(y)
+     y = self.output_projection(y)
      return y, position_bias
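The net effect of the t5_attention.py changes above is that T5Attention now inherits its q/k/v and output projections, KV cache, and SDPA dispatch from CrossAttention, keeping only the T5-specific pieces (pre-attention norm and relative attention bias). A hedged sketch of the attribute rename this implies when re-keying checkpoints; the key patterns are illustrative, not the actual tensor names handled in t5_loader.py:

    # Illustrative only: map old T5Attention attribute names to the new
    # CrossAttention-provided names when re-keying a state dict.
    _ATTR_RENAMES = {
        ".q.": ".q_projection.",
        ".k.": ".k_projection.",
        ".v.": ".v_projection.",
        ".proj.": ".output_projection.",
    }


    def rekey_state_dict(state_dict):
      """Return a copy of state_dict with old projection names rewritten."""
      rekeyed = {}
      for name, tensor in state_dict.items():
        for old, new in _ATTR_RENAMES.items():
          name = name.replace(old, new)
        rekeyed[name] = tensor
      return rekeyed
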
ai_edge_torch/generative/layers/attention.py

@@ -28,6 +28,33 @@ from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_
  from ai_edge_torch.generative.layers.scaled_dot_product_attention import scaled_dot_product_attention_with_hlfb  # NOQA


+ def _embed_rope(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     n_elem: int,
+     rope: Tuple[torch.Tensor, torch.Tensor],
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+   """Embed rotary positional embedding for query and key.
+
+   Args:
+     q (torch.Tensor): query tensor.
+     k (torch.Tensor): key tensor.
+     n_elem (int): number of elements to embed rotarty positional embedding.
+     rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
+   """
+   if n_elem > 0:
+     cos, sin = rope
+     q_roped = rotary_pos_emb.apply_rope(
+         q[..., :n_elem], cos.repeat(1, 2), sin.repeat(1, 2)
+     )
+     k_roped = rotary_pos_emb.apply_rope(
+         k[..., :n_elem], cos.repeat(1, 2), sin.repeat(1, 2)
+     )
+     q = torch.cat((q_roped, q[..., n_elem:]), dim=-1)
+     k = torch.cat((k_roped, k[..., n_elem:]), dim=-1)
+   return q, k
+
+
  class TransformerBlock(nn.Module):

    def __init__(self, config: cfg.ModelConfig) -> None:
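_embed_rope is a new module-level helper that factors the rotary-embedding step out of CausalSelfAttention.forward so the new CrossAttention layer can reuse it; it rotates only the first n_elem channels of each head and passes the rest through unchanged. A self-contained sketch of the same partial-rotation idea, where apply_rope below is a hypothetical stand-in for the library's rotary_pos_emb.apply_rope:

    from typing import Tuple

    import torch


    def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
      # Hypothetical stand-in for rotary_pos_emb.apply_rope (rotate-half convention).
      x1, x2 = x.chunk(2, dim=-1)
      rotated = torch.cat((-x2, x1), dim=-1)
      return x * cos + rotated * sin


    def partial_rope(q, k, n_elem, rope):
      # Rotate only the first n_elem channels; pass the remaining channels through.
      if n_elem <= 0:
        return q, k
      cos, sin = rope
      q = torch.cat((apply_rope(q[..., :n_elem], cos, sin), q[..., n_elem:]), dim=-1)
      k = torch.cat((apply_rope(k[..., :n_elem], cos, sin), k[..., n_elem:]), dim=-1)
      return q, k


    # Example: rotary_percentage = 0.5 applies RoPE to half of each 16-dim head.
    B, T, H, D = 1, 8, 4, 16
    n_elem = int(0.5 * D)
    theta = 1.0 / (10000 ** (torch.arange(0, n_elem, 2).float() / n_elem))
    freqs = torch.outer(torch.arange(T).float(), theta)  # (T, n_elem // 2)
    cos = torch.cos(freqs).repeat(1, 2).unsqueeze(1)      # (T, 1, n_elem)
    sin = torch.sin(freqs).repeat(1, 2).unsqueeze(1)      # (T, 1, n_elem)
    q, k = partial_rope(torch.randn(B, T, H, D), torch.randn(B, T, H, D), n_elem, (cos, sin))
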
@@ -43,6 +70,7 @@ class TransformerBlock(nn.Module):
          config.embedding_dim, config.pre_attention_norm_config
      )
      self.atten_func = CausalSelfAttention(
+         config.batch_size,
          config.embedding_dim,
          config.attn_config,
          config.kv_cache_max,
@@ -92,6 +120,7 @@ class CausalSelfAttention(nn.Module):

    def __init__(
        self,
+       batch_size: int,
        dim: int,
        config: cfg.AttentionConfig,
        kv_cache_max: int,
@@ -100,6 +129,7 @@ class CausalSelfAttention(nn.Module):
      """Initialize an instance of CausalSelfAttention.

      Args:
+       batch_size (int): batch size of the input tensor.
        dim (int): causal attention's input/output dimmension.
        config (cfg.AttentionConfig): attention specific configurations.
        kv_cache_max (int): determines the size of the KV Cache buffer, if enabled.
@@ -113,13 +143,12 @@ class CausalSelfAttention(nn.Module):
      self.output_projection = nn.Linear(dim, dim, bias=config.output_proj_use_bias)
      self.config = config
      self.kv_cache = None
+     self.batch_size = batch_size

      # Build a k/v cache with size (batch_size, kv_cache_max, n_heads, head_dim).
-     # Now only supports batch_size of 1.
-     # TODO(haoliang): support batch_size greater than 1.
      if config.enable_kv_cache:
        self.kv_cache = KVCache(
-           1,
+           batch_size,
            kv_cache_max,
            config.num_query_groups,
            self.head_dim,
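CausalSelfAttention now records the configured batch size and allocates its KV cache as (batch_size, kv_cache_max, n_heads, head_dim) instead of hard-coding a batch of 1, which is what allows exported models with batch sizes greater than one. A toy sketch of a cache with that layout; update below is a hypothetical stand-in for KVCache.update_cache:

    import torch


    class ToyKVCache:
      """Toy cache laid out as (batch_size, kv_cache_max, n_heads, head_dim)."""

      def __init__(self, batch_size, kv_cache_max, n_heads, head_dim):
        self.k = torch.zeros(batch_size, kv_cache_max, n_heads, head_dim)
        self.v = torch.zeros(batch_size, kv_cache_max, n_heads, head_dim)

      def update(self, input_pos, k_new, v_new):
        # Write new entries at the given sequence positions, return the full buffers.
        self.k[:, input_pos] = k_new
        self.v[:, input_pos] = v_new
        return self.k, self.v


    cache = ToyKVCache(batch_size=2, kv_cache_max=16, n_heads=4, head_dim=8)
    k_step = torch.randn(2, 1, 4, 8)  # one decode step for a batch of 2
    v_step = torch.randn(2, 1, 4, 8)
    k_all, v_all = cache.update(torch.tensor([5]), k_step, v_step)
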
@@ -152,42 +181,38 @@ class CausalSelfAttention(nn.Module):
      """
      # Batch size, sequence length, embedding dimensionality.
      B, T, E = x.size()
-     assert B == 1, "Currently only batch_size = 1 is supported."
+     assert (
+         B == self.batch_size
+     ), "batch size of input tensor must match with the batch size specified in the model configuration."

      qkv = self.qkv_projection(x)

      # Assemble into a number of query groups to support MHA, MQA and GQA.
      q_per_kv = self.config.num_heads // self.config.num_query_groups
-     total_qkv = q_per_kv + 2  # Each group has >=1 queries, 1 key, and 1 value.
+     # Each group has >=1 queries, 1 key, and 1 value.
      if self.config.qkv_transpose_before_split:
-       qkv = qkv.view(
-           B, T, total_qkv, self.config.num_query_groups, self.head_dim
-       )  # (B, T, total_qkv, num_query_groups, head_dim)
-       qkv_axis = -3
+       qkv = qkv.view(B, T, -1, self.head_dim)
+       q, k, v = qkv.split(
+           (
+               q_per_kv * self.config.num_query_groups,
+               self.config.num_query_groups,
+               self.config.num_query_groups,
+           ),
+           dim=-2,
+       )
      else:
-       qkv = qkv.view(
-           B, T, self.config.num_query_groups, total_qkv, self.head_dim
-       )  # (B, T, num_query_groups, total_qkv, head_dim)
-       qkv_axis = -2
+       qkv = qkv.view(B, T, self.config.num_query_groups, -1)
+       q, k, v = qkv.split(
+           (q_per_kv * self.head_dim, self.head_dim, self.head_dim), dim=-1
+       )

-     # Split batched computation into three.
-     q, k, v = qkv.split((q_per_kv, 1, 1), dim=qkv_axis)
      q = q.reshape(B, T, -1, self.head_dim)
      k = k.reshape(B, T, -1, self.head_dim)
      v = v.reshape(B, T, -1, self.head_dim)

      # Compute rotary positional embedding for query and key.
      n_elem = int(self.config.rotary_percentage * self.head_dim)
-     if n_elem > 0:
-       cos, sin = rope
-       q_roped = rotary_pos_emb.apply_rope(
-           q[..., :n_elem], cos.repeat(1, 2), sin.repeat(1, 2)
-       )
-       k_roped = rotary_pos_emb.apply_rope(
-           k[..., :n_elem], cos.repeat(1, 2), sin.repeat(1, 2)
-       )
-       q = torch.cat((q_roped, q[..., n_elem:]), dim=-1)
-       k = torch.cat((k_roped, k[..., n_elem:]), dim=-1)
+     q, k = _embed_rope(q, k, n_elem, rope)

      if self.kv_cache is not None:
        # TODO(haoliang): Handle when execeeding max sequence length.
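The fused QKV output is now split with explicit per-group sizes rather than reshaped onto a total_qkv axis; the same split covers MHA (num_query_groups == num_heads), GQA (1 < num_query_groups < num_heads), and MQA (num_query_groups == 1). A standalone sketch of the non-transposed branch with illustrative dimensions:

    import torch

    B, T = 1, 8
    num_heads, num_query_groups, head_dim = 8, 2, 16  # GQA: 4 query heads share each k/v head
    q_per_kv = num_heads // num_query_groups

    # Fused projection output: q_per_kv queries plus one key and one value per group.
    qkv = torch.randn(B, T, (num_heads + 2 * num_query_groups) * head_dim)

    qkv = qkv.view(B, T, num_query_groups, -1)
    q, k, v = qkv.split((q_per_kv * head_dim, head_dim, head_dim), dim=-1)
    q = q.reshape(B, T, -1, head_dim)  # (1, 8, 8, 16)
    k = k.reshape(B, T, -1, head_dim)  # (1, 8, 2, 16)
    v = v.reshape(B, T, -1, head_dim)  # (1, 8, 2, 16)
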
@@ -222,5 +247,108 @@ class SelfAttention(CausalSelfAttention):
      """
      B, T, _ = x.size()
      return super().forward(
-         x, rope=rope, mask=torch.zeros((B, T), dtype=torch.float32), input_pos=input_pos
+         x,
+         rope=rope,
+         mask=torch.zeros((B, 1, T, T), dtype=torch.float32),
+         input_pos=input_pos,
      )
+
+
+ class CrossAttention(nn.Module):
+
+   def __init__(
+       self,
+       batch_size: int,
+       query_dim: int,
+       cross_dim: int,
+       config: cfg.AttentionConfig,
+       kv_cache_max: int,
+       enable_hlfb: bool,
+   ) -> None:
+     """Initialize an instance of CrossAttention.
+
+     Args:
+       batch_size (int): batch size of the input tensor.
+       query_dim (int): query tensor's dimension.
+       cross_dim (int): cross attention's dimensions, for key and value tensors.
+       config (cfg.AttentionConfig): attention specific configurations.
+       kv_cache_max (int): determines the size of the KV Cache buffer, if enabled.
+       enable_hlfb (bool): whether hlfb is enabled or not.
+     """
+     super().__init__()
+     self.config = config
+     self.head_dim = query_dim // config.num_heads
+     self.n_heads = config.num_heads
+     self.q_projection = nn.Linear(query_dim, query_dim, bias=config.qkv_use_bias)
+     self.k_projection = nn.Linear(cross_dim, query_dim, bias=config.qkv_use_bias)
+     self.v_projection = nn.Linear(cross_dim, query_dim, bias=config.qkv_use_bias)
+     self.output_projection = nn.Linear(
+         query_dim, query_dim, bias=config.output_proj_use_bias
+     )
+
+     self.kv_cache = None
+     # Build a k/v cache with size (batch_size, kv_cache_max, n_heads, head_dim).
+     if config.enable_kv_cache:
+       self.kv_cache = KVCache(
+           batch_size,
+           kv_cache_max,
+           config.num_query_groups,
+           self.head_dim,
+           enable_hlfb,
+       )
+
+     if enable_hlfb:
+       self.sdpa_func = scaled_dot_product_attention_with_hlfb
+     else:
+       self.sdpa_func = scaled_dot_product_attention
+
+   def forward(
+       self,
+       x: torch.Tensor,
+       y: torch.Tensor,
+       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+       mask: Optional[torch.Tensor] = None,
+       input_pos: Optional[torch.Tensor] = None,
+   ):
+     """Forward function of the CrossAttention layer.
+
+     Args:
+       x (torch.Tensor): the target tensor, with shape [B, target_seq_len, ...].
+       y (torch.Tensor): the source tensor, with shape [B, source_seq_len, ...].
+       rope (Tuple[torch.Tensor, torch.Tensor]): the optional input rope tensor.
+       mask (torch.Tensor): the optional mask tensor can be broadcaseted to shape [B, n_heads, target_seq_len, source_seq_len].
+       input_pos (torch.Tensor): the optional input position tensor.
+
+     Returns:
+       output activation from this cross attention layer.
+     """
+     batch_size = x.size()[0]
+     target_seq_len = x.size()[1]
+     source_seq_len = y.size()[1]
+
+     q = self.q_projection(x)
+     k = self.k_projection(y)
+     v = self.v_projection(y)
+
+     interim_shape = (batch_size, -1, self.n_heads, self.head_dim)
+     q = q.view(interim_shape)
+     k = k.view(interim_shape)
+     v = v.view(interim_shape)
+
+     # Compute rotary positional embedding for query and key.
+     n_elem = int(self.config.rotary_percentage * self.head_dim)
+     q, k = _embed_rope(q, k, n_elem, rope)
+
+     if self.kv_cache is not None:
+       # TODO(haoliang): Handle when execeeding max sequence length.
+       k, v = self.kv_cache.update_cache(input_pos, k, v)
+     if mask is None:
+       mask = torch.zeros(
+           (batch_size, 1, target_seq_len, source_seq_len), dtype=torch.float32
+       )
+     y = self.sdpa_func(q, k, v, self.head_dim, mask=mask)
+     y = y.reshape(batch_size, target_seq_len, -1)
+
+     # Compute the output projection.
+     y = self.output_projection(y)
+     return y
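CrossAttention projects the target sequence x to queries and a separate source sequence y (for example, text-encoder states in the stable diffusion examples) to keys and values, with query_dim and cross_dim allowed to differ. The layer routes through the library's sdpa_func (with or without HLFB); the plain-torch sketch below only illustrates the shape flow, with made-up dimensions:

    import torch
    import torch.nn.functional as F
    from torch import nn

    B, target_len, source_len = 1, 64, 77
    query_dim, cross_dim, n_heads = 320, 768, 8
    head_dim = query_dim // n_heads

    q_proj = nn.Linear(query_dim, query_dim)
    k_proj = nn.Linear(cross_dim, query_dim)
    v_proj = nn.Linear(cross_dim, query_dim)
    out_proj = nn.Linear(query_dim, query_dim)

    x = torch.randn(B, target_len, query_dim)  # target, e.g. image latents
    y = torch.randn(B, source_len, cross_dim)  # source, e.g. text embeddings

    # (B, seq, dim) -> (B, n_heads, seq, head_dim) for scaled dot-product attention.
    q = q_proj(x).view(B, target_len, n_heads, head_dim).transpose(1, 2)
    k = k_proj(y).view(B, source_len, n_heads, head_dim).transpose(1, 2)
    v = v_proj(y).view(B, source_len, n_heads, head_dim).transpose(1, 2)

    out = F.scaled_dot_product_attention(q, k, v)  # (B, n_heads, target_len, head_dim)
    out = out_proj(out.transpose(1, 2).reshape(B, target_len, query_dim))
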
ai_edge_torch/generative/layers/model_config.py

@@ -27,6 +27,7 @@ class ActivationType(enum.Enum):
    SILU = enum.auto()
    GELU = enum.auto()
    GELU_TANH = enum.auto()
+   GELU_QUICK = enum.auto()
    GE_GLU = enum.auto()
    RELU = enum.auto()

@@ -138,6 +139,9 @@ class ModelConfig:
    # The Attention computation will include relative positional bias.
    relative_attention: bool = False

+   # Default batch size of the exported model. Default value is 1.
+   batch_size: int = 1
+
    @property
    def kv_cache_max(self) -> int:
      if self.kv_cache_max_len > 0:
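The GELU_QUICK activation added to ActivationType above is commonly the sigmoid approximation of GELU used by CLIP-style text encoders and the stable diffusion UNet; assuming that is the variant the layer builder maps it to, it amounts to:

    import torch


    def gelu_quick(x: torch.Tensor) -> torch.Tensor:
      # Quick GELU: x * sigmoid(1.702 * x), a cheap approximation of exact GELU.
      return x * torch.sigmoid(1.702 * x)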