PyPI - ai-edge-torch-nightly - Versions diffs - 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

Files changed (169) hide show

ai_edge_torch/generative/layers/normalization.py CHANGED Viewed

@@ -14,19 +14,22 @@
 # ==============================================================================
 # Common normalization layers.
+from ai_edge_torch.hlfb import StableHLOCompositeBuilder
 import torch
+from torch import nn
+import torch.nn.functional as F
 # Implementation for RMSNorm from: https://arxiv.org/abs/1910.07467
 class RMSNorm(torch.nn.Module):
   def __init__(self, dim: int, eps: float = 1e-6, zero_centered_gamma=False):
-    """
-    Initialize the RMSNorm layer.
+    """Initialize the RMSNorm layer.
     Args:
       dim (int): dimension of the input tensor.
-      eps (float): A small float value to ensure numerical stability (default: 1e-6).
+      eps (float): A small float value to ensure numerical stability (default:
+        1e-6).
     """
     super().__init__()
     self.eps = eps
@@ -34,8 +37,7 @@ class RMSNorm(torch.nn.Module):
     self.zero_centered_gamma = zero_centered_gamma
   def _norm(self, x):
-    """
-    Apply RMSNorm normalization.
+    """Apply RMSNorm normalization.
     Args:
       x (torch.Tensor): input tensor.
@@ -46,8 +48,7 @@ class RMSNorm(torch.nn.Module):
     return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
   def forward(self, x):
-    """
-    Running the forward pass of RMSNorm layer.
+    """Running the forward pass of RMSNorm layer.
     Args:
       x (torch.Tensor): input tensor.
@@ -60,3 +61,180 @@ class RMSNorm(torch.nn.Module):
       return output * (1 + self.weight)
     else:
       return output * self.weight
+class GroupNorm(torch.nn.Module):
+  def __init__(
+      self,
+      group_num: int,
+      dim: int,
+      eps: float = 1e-5,
+      enable_hlfb: bool = False,
+  ):
+    """Initialize the GroupNorm layer.
+    Args:
+      group_num (int): Number of groups to separate the channels into.
+      dim (int): Dimension of the input tensor.
+      eps (float): A small float value to ensure numerical stability (default:
+        1e-5).
+      enable_hlfb (bool): Whether to convert this normalization into a single
+        op.
+    """
+    super().__init__()
+    self.enable_hlfb = enable_hlfb
+    self.group_num = group_num
+    self.eps = eps
+    self.weight = torch.nn.Parameter(torch.ones(dim))
+    self.bias = torch.nn.Parameter(torch.ones(dim))
+  def forward(self, x):
+    """Running the forward pass of GroupNorm layer.
+    Args:
+      x (torch.Tensor): input tensor.
+    Returns:
+      torch.Tensor: output tensor after applying GroupNorm.
+    """
+    if self.enable_hlfb:
+      return group_norm_with_hlfb(
+          x,
+          self.weight,
+          self.bias,
+          self.group_num,
+          self.eps,
+      )
+    else:
+      return F.group_norm(x, self.group_num, self.weight, self.bias, self.eps)
+class LayerNorm(torch.nn.Module):
+  def __init__(
+      self,
+      dim: int,
+      eps: float = 1e-5,
+      enable_hlfb: bool = False,
+      use_input_shape: bool = True,
+  ):
+    """Initialize the LayerNorm layer.
+    Args:
+      dim (int): dimension of the input tensor.
+      eps (float): A small float value to ensure numerical stability (default:
+        1e-5).
+      enable_hlfb (bool): Whether to convert this normalization into a single
+        op.
+      use_input_shape (bool): Whether to use the input shape to determine the
+        dimension of normalization (default: True).
+    """
+    super().__init__()
+    self.enable_hlfb = enable_hlfb
+    self.use_input_shape = use_input_shape
+    self.eps = eps
+    self.weight = torch.nn.Parameter(torch.ones(dim))
+    self.bias = torch.nn.Parameter(torch.ones(dim))
+  def forward(self, x):
+    """Running the forward pass of LayerNorm layer.
+    Args:
+      x (torch.Tensor): input tensor.
+    Returns:
+      torch.Tensor: output tensor after applying LayerNorm.
+    """
+    if self.enable_hlfb:
+      return layer_norm_with_hlfb(
+          x, self.weight, self.bias, self.eps, self.use_input_shape
+      )
+    if self.use_input_shape:
+      normalized_shape = x.shape
+      weight = self.weight.broadcast_to(x.shape)
+      bias = self.bias.broadcast_to(x.shape)
+    else:
+      normalized_shape = self.weight.shape
+      weight = self.weight
+      bias = self.bias
+    return F.layer_norm(x, normalized_shape, weight, bias, self.eps)
+def group_norm_with_hlfb(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    b: torch.Tensor,
+    num_groups: int,
+    eps: float,
+):
+  """Group Normalization with high-level function boundary enabled.
+  Args:
+    x (torch.Tensor): Input tensor for Group Normalization, with BCHW shape.
+    w (torch.Tensor): The weight tensor for the normalization.
+    b (torch.Tensor): The bias tensor for the normalization.
+    num_groups (int): Number of groups to separate the channels into.
+    eps (float): A small float value to ensure numerical stability.
+  Returns:
+    The output tensor of Group Normalization.
+  """
+  x = torch.permute(x, (0, 2, 3, 1))
+  # TODO: b/366544750 - Change "reduction_axes" field as an array, rather than
+  # int32 when the bug is fixed.
+  builder = StableHLOCompositeBuilder(
+      name="odml.group_norm",
+      attr={
+          "num_groups": num_groups,
+          "epsilon": eps,
+          "reduction_axes": 3,
+          "channel_axis": 3,
+      },
+  )
+  x, w, b = builder.mark_inputs(x, w, b)
+  x = torch.permute(x, (0, 3, 1, 2))
+  y = F.group_norm(x, num_groups, weight=w, bias=b, eps=eps)
+  y = torch.permute(y, (0, 2, 3, 1))
+  y = builder.mark_outputs(y)
+  y = torch.permute(y, (0, 3, 1, 2))
+  return y
+def layer_norm_with_hlfb(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    b: torch.Tensor,
+    eps: float,
+    use_input_shape: bool,
+):
+  """Layer Normalization with high-level function boundary enabled.
+  Args:
+    x (torch.Tensor): Input tensor for Layer Normalization, with BCHW shape.
+    w (torch.Tensor): The weight tensor for the normalization.
+    b (torch.Tensor): The bias tensor for the normalization.
+    eps (float): A small float value to ensure numerical stability.
+    use_input_shape (bool): Whether to use the input shape to determine the
+      dimension of normalization.
+  Returns:
+    The output tensor of Layer Normalization.
+  """
+  builder = StableHLOCompositeBuilder(
+      name="odml.group_norm",
+      attr={"num_groups": 1, "epsilon": eps, "channel_axis": 1},
+  )
+  x, w, b = builder.mark_inputs(x, w, b)
+  if use_input_shape:
+    normalized_shape = x.shape
+    w = w.broadcast_to(x.shape)
+    b = b.broadcast_to(x.shape)
+  else:
+    normalized_shape = w.shape
+  y = F.layer_norm(x, normalized_shape, w, b, eps=eps)
+  y = builder.mark_outputs(y)
+  return y

ai_edge_torch/generative/layers/rotary_position_embedding.py CHANGED Viewed

@@ -16,13 +16,15 @@
 import torch
-def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+def apply_rope(
+    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+) -> torch.Tensor:
   """Computes rotary positional embedding.
   Args:
-    x(torch.Tensor): the input tensor.
-    cos(torch.Tensor): cosine value for the rope.
-    sin(torch.Tensor): sin value for the rope.
+    x: the input tensor.
+    cos: cosine value for the rope.
+    sin: sin value for the rope.
   Returns:
     output tensor of RoPE.

ai_edge_torch/generative/layers/scaled_dot_product_attention.py CHANGED Viewed

@@ -17,11 +17,10 @@
 import math
 from typing import Optional
+from ai_edge_torch.hlfb import StableHLOCompositeBuilder
 import torch
 import torch.nn.functional as F
-from ai_edge_torch.hlfb import StableHLOCompositeBuilder
 def scaled_dot_product_attention(
     q: torch.Tensor,
@@ -30,6 +29,7 @@ def scaled_dot_product_attention(
     head_size: int,
     mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,
+    softcap: Optional[float] = None,
 ):
   """Scaled dot product attention.
@@ -54,15 +54,26 @@ def scaled_dot_product_attention(
     # Handle the GQA case, where q.shape[1] % k.shape[1] == 0.
     k = k.repeat_interleave(q.shape[1] // k.shape[1], dim=1)
     v = v.repeat_interleave(q.shape[1] // v.shape[1], dim=1)
-  y = F.scaled_dot_product_attention(
-      q,
-      k,
-      v,
-      attn_mask=mask,
-      dropout_p=0.0,
-      is_causal=mask is None,
-      scale=scale,
-  )
+  if softcap is None:
+    y = F.scaled_dot_product_attention(
+        q,
+        k,
+        v,
+        attn_mask=mask,
+        dropout_p=0.0,
+        is_causal=mask is None,
+        scale=scale,
+    )
+  else:
+    q.mul_(scale)
+    scores = q @ k.transpose(-1, -2)
+    scores = scores / softcap
+    scores = torch.tanh(scores)
+    scores = scores * softcap
+    scores = scores + mask
+    out = F.softmax(scores.float(), dim=-1).type_as(q)
+    y = torch.matmul(out, v)
   return y.transpose(1, 2)
@@ -73,6 +84,7 @@ def scaled_dot_product_attention_with_hlfb(
     head_size: int,
     mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,
+    softcap: Optional[float] = None,
 ):
   """Scaled dot product attention with high-level function boundary enabled.
@@ -90,8 +102,13 @@ def scaled_dot_product_attention_with_hlfb(
   if scale is None:
     scale = 1.0 / math.sqrt(head_size)
+  attrs = {"scale": scale}
+  if softcap is not None:
+    attrs["logit_cap"] = softcap
   builder = StableHLOCompositeBuilder(
-      name="odml.scaled_dot_product_attention", attr={"scale": scale}
+      name="odml.scaled_dot_product_attention", attr=attrs
   )
   q, k, v, mask = builder.mark_inputs(q, k, v, mask)
@@ -102,15 +119,25 @@ def scaled_dot_product_attention_with_hlfb(
     # Handle the GQA case, where q.shape[1] % k.shape[1] == 0.
     k = k.repeat_interleave(q.shape[1] // k.shape[1], dim=1)
     v = v.repeat_interleave(q.shape[1] // v.shape[1], dim=1)
-  y = F.scaled_dot_product_attention(
-      q,
-      k,
-      v,
-      attn_mask=mask,
-      dropout_p=0.0,
-      is_causal=mask is None,
-      scale=scale,
-  )
+  if softcap is None:
+    y = F.scaled_dot_product_attention(
+        q,
+        k,
+        v,
+        attn_mask=mask,
+        dropout_p=0.0,
+        is_causal=mask is None,
+        scale=scale,
+    )
+  else:
+    q.mul_(scale)
+    scores = q @ k.transpose(-1, -2)
+    scores = scores / softcap
+    scores = torch.tanh(scores)
+    scores = scores * softcap
+    scores = scores + mask
+    out = F.softmax(scores.float(), dim=-1).type_as(q)
+    y = torch.matmul(out, v)
   result = y.transpose(1, 2)
   result = builder.mark_outputs(result)

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl

ai-edge-torch-nightly 0.2.0.dev20240714__py3-none-any.whl → 0.3.0.dev20240926__py3-none-any.whl