ai-edge-torch-nightly 0.5.0.dev20250408__py3-none-any.whl → 0.5.0.dev20250410__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- ai_edge_torch/_convert/conversion.py +1 -1
- ai_edge_torch/_convert/fx_passes/__init__.py +1 -0
- ai_edge_torch/_convert/fx_passes/cast_inputs_bf16_to_f32_pass.py +50 -0
- ai_edge_torch/_convert/test/test_convert.py +21 -0
- ai_edge_torch/generative/examples/gemma3/convert_gemma3_to_tflite.py +2 -2
- ai_edge_torch/generative/examples/gemma3/decoder.py +8 -9
- ai_edge_torch/generative/examples/gemma3/verify_util.py +4 -2
- ai_edge_torch/generative/layers/experimental/attention.py +10 -40
- ai_edge_torch/generative/layers/experimental/kv_cache.py +13 -283
- ai_edge_torch/generative/layers/experimental/scaled_dot_product_attention.py +6 -10
- ai_edge_torch/generative/layers/experimental/types.py +3 -0
- ai_edge_torch/generative/layers/kv_cache.py +81 -14
- ai_edge_torch/generative/layers/sdpa_with_kv_update.py +124 -0
- ai_edge_torch/generative/test/test_kv_cache.py +12 -19
- ai_edge_torch/generative/utilities/converter.py +8 -3
- ai_edge_torch/generative/utilities/export_config.py +3 -1
- ai_edge_torch/lowertools/odml_torch_utils.py +1 -0
- ai_edge_torch/odml_torch/lowerings/_basic.py +19 -0
- ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +0 -1
- ai_edge_torch/odml_torch/lowerings/utils.py +1 -0
- ai_edge_torch/version.py +1 -1
- {ai_edge_torch_nightly-0.5.0.dev20250408.dist-info → ai_edge_torch_nightly-0.5.0.dev20250410.dist-info}/METADATA +4 -2
- {ai_edge_torch_nightly-0.5.0.dev20250408.dist-info → ai_edge_torch_nightly-0.5.0.dev20250410.dist-info}/RECORD +26 -24
- {ai_edge_torch_nightly-0.5.0.dev20250408.dist-info → ai_edge_torch_nightly-0.5.0.dev20250410.dist-info}/LICENSE +0 -0
- {ai_edge_torch_nightly-0.5.0.dev20250408.dist-info → ai_edge_torch_nightly-0.5.0.dev20250410.dist-info}/WHEEL +0 -0
- {ai_edge_torch_nightly-0.5.0.dev20250408.dist-info → ai_edge_torch_nightly-0.5.0.dev20250410.dist-info}/top_level.txt +0 -0
--- a/ai_edge_torch/_convert/conversion.py
+++ b/ai_edge_torch/_convert/conversion.py
@@ -40,8 +40,8 @@ def _run_convert_passes(
       fx_passes.OptimizeLayoutTransposesPass(),
       fx_passes.CanonicalizePass(),
       fx_passes.BuildAtenCompositePass(),
-      fx_passes.CanonicalizePass(),
       fx_passes.RemoveNonUserOutputsPass(),
+      fx_passes.CastInputsBf16ToF32Pass(),
       fx_passes.CanonicalizePass(),
   ]
 
--- a/ai_edge_torch/_convert/fx_passes/__init__.py
+++ b/ai_edge_torch/_convert/fx_passes/__init__.py
@@ -17,6 +17,7 @@ from typing import Sequence, Union
 
 from ai_edge_torch._convert.fx_passes.build_aten_composite_pass import BuildAtenCompositePass
 from ai_edge_torch._convert.fx_passes.build_interpolate_composite_pass import BuildInterpolateCompositePass
+from ai_edge_torch._convert.fx_passes.cast_inputs_bf16_to_f32_pass import CastInputsBf16ToF32Pass
 from ai_edge_torch._convert.fx_passes.inject_mlir_debuginfo_pass import InjectMlirDebuginfoPass
 from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import OptimizeLayoutTransposesPass
 from ai_edge_torch._convert.fx_passes.remove_non_user_outputs_pass import RemoveNonUserOutputsPass
--- /dev/null
+++ b/ai_edge_torch/_convert/fx_passes/cast_inputs_bf16_to_f32_pass.py
@@ -0,0 +1,50 @@
+# Copyright 2025 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Pass to cast all inputs with torch.bfloat16 type to torch.float32."""
+
+
+from ai_edge_torch import fx_infra
+import torch
+
+
+def cast_f32(x):
+  return x.to(torch.float32)
+
+
+class CastInputsBf16ToF32Pass(fx_infra.ExportedProgramPassBase):
+  """This pass casts all inputs with torch.bfloat16 type to torch.float32."""
+
+  def call(self, exported_program: torch.export.ExportedProgram):
+    modified = False
+    for node in exported_program.graph.nodes:
+      if (
+          node.op == "placeholder"
+          and node.meta.get("val").dtype == torch.bfloat16
+      ):
+        if not node.users:
+          continue
+
+        modified = True
+        user = next(iter(node.users))
+        with exported_program.graph.inserting_before(user):
+          cast_node = exported_program.graph.call_function(
+              cast_f32,
+              (node,),
+          )
+          node.replace_all_uses_with(cast_node)
+          cast_node.replace_input_with(cast_node, node)
+
+    exported_program.graph_module.recompile()
+    return fx_infra.ExportedProgramPassResult(exported_program, modified)
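The new pass wraps every live bfloat16 placeholder in a `cast_f32` call before its first user, so bf16 survives in the model signature while downstream ops compute in float32. A minimal sketch of invoking it directly, assuming the pass is called via `.call()` as defined above and that the returned `ExportedProgramPassResult` exposes the two values passed to its constructor:

```python
import torch
from ai_edge_torch._convert import fx_passes


class AddOne(torch.nn.Module):

  def forward(self, x):
    return x + 1.0


# Export with a bf16 example input; the placeholder keeps dtype bfloat16.
ep = torch.export.export(
    AddOne().eval(), (torch.randn(4, 4).to(torch.bfloat16),)
)
result = fx_passes.CastInputsBf16ToF32Pass().call(ep)
# Every use of the bf16 placeholder now flows through a cast_f32 node
# (assumed result fields, mirroring the constructor above:
# result.exported_program, result.modified).
```

In normal use none of this is needed: `_run_convert_passes` above schedules the pass automatically during `ai_edge_torch.convert`.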
--- a/ai_edge_torch/_convert/test/test_convert.py
+++ b/ai_edge_torch/_convert/test/test_convert.py
@@ -553,6 +553,27 @@ class TestConvert(googletest.TestCase):
       self.fail(f"PT2E conversion failed: {err}")
     # pylint: enable=broad-except
 
+  def test_convert_model_with_bfloat16_inputs(self):
+    """Test converting a simple model with torch.bfloat16 input.
+
+    bf16 inputs would remain in converted model signature but be casted to f32
+    right after the model inputs.
+    """
+
+    class SampleModel(nn.Module):
+
+      def forward(self, x: torch.Tensor):
+        return (x + 1) * 1.2
+
+    model = SampleModel().eval()
+    args = (torch.randn(10, 10).to(torch.bfloat16),)
+    # pylint: disable=broad-except
+    try:
+      ai_edge_torch.convert(model, args)
+    except Exception as err:
+      self.fail(f"Conversion failed with bloat16 inputs: {err}")
+    # pylint: enable=broad-except
+
 
 if __name__ == "__main__":
   googletest.main()
--- a/ai_edge_torch/generative/examples/gemma3/convert_gemma3_to_tflite.py
+++ b/ai_edge_torch/generative/examples/gemma3/convert_gemma3_to_tflite.py
@@ -17,7 +17,7 @@
 
 from absl import app
 from ai_edge_torch.generative.examples.gemma3 import gemma3
-from ai_edge_torch.generative.layers
+from ai_edge_torch.generative.layers import kv_cache
 from ai_edge_torch.generative.utilities import converter
 from ai_edge_torch.generative.utilities import export_config
 import torch
@@ -58,7 +58,7 @@ def _create_export_config(
   )
   decode_mask = torch.triu(decode_mask, diagonal=1).unsqueeze(0).unsqueeze(0)
   export_config.decode_mask = decode_mask
-  export_config.
+  export_config.kvcache_layout = kv_cache.KV_LAYOUT_TRANSPOSED
   return export_config
 
 
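With the experimental cache module folded into `generative.layers.kv_cache`, the Gemma 3 example now opts into the transposed cache layout through the export config. A minimal sketch, assuming `ExportConfig` can be constructed with defaults as its use here suggests:

```python
from ai_edge_torch.generative.layers import kv_cache
from ai_edge_torch.generative.utilities import export_config as export_config_lib

config = export_config_lib.ExportConfig()
# Transposed layout: K stored as (batch, num_query_groups, seq, head_dim)
# and V as (batch, num_query_groups, head_dim, seq); see the removed
# KVCacheEntryTransposed shapes in the kv_cache.py hunk further down.
config.kvcache_layout = kv_cache.KV_LAYOUT_TRANSPOSED
```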
--- a/ai_edge_torch/generative/examples/gemma3/decoder.py
+++ b/ai_edge_torch/generative/examples/gemma3/decoder.py
@@ -18,9 +18,9 @@
 from typing import List, Optional, Tuple
 
 from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
 from ai_edge_torch.generative.layers.experimental import attention
-from ai_edge_torch.generative.layers.experimental import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
 from ai_edge_torch.generative.utilities import export_config as export_cfg
@@ -81,8 +81,8 @@ class DecoderBlock(attention.TransformerBlock):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
-      kv_cache: kv_utils.
-  ) -> Tuple[torch.Tensor, Optional[kv_utils.
+      kv_cache: kv_utils.KVCacheEntry = None,
+  ) -> Tuple[torch.Tensor, Optional[kv_utils.KVCacheEntry]]:
     """Forward function of the Gemma3Block.
 
     Exactly the same as TransformerBlock but we call the post-attention norm
@@ -241,13 +241,12 @@ class Decoder(nn.Module):
       self,
       tokens: torch.Tensor,
       input_pos: torch.Tensor,
-      kv_cache: kv_utils.
+      kv_cache: kv_utils.KVCache,
       input_embeds: Optional[torch.Tensor] = None,
       mask: Optional[torch.Tensor] = None,
       image_indices: Optional[torch.Tensor] = None,
       export_config: Optional[export_cfg.ExportConfig] = None,
-  ) -> dict[torch.Tensor, kv_utils.
-
+  ) -> dict[torch.Tensor, kv_utils.KVCache]:
     pixel_mask = None
     if input_embeds is None:
       # token embeddings of shape (b, t, n_embd)
@@ -287,10 +286,10 @@ class Decoder(nn.Module):
       rope: List[Tuple[torch.Tensor, torch.Tensor]],
       mask: torch.Tensor | List[torch.Tensor],
       input_pos: torch.Tensor,
-      kv_cache: kv_utils.
+      kv_cache: kv_utils.KVCache,
       pixel_mask: Optional[torch.Tensor] = None,
       export_config: Optional[export_cfg.ExportConfig] = None,
-  ) -> dict[torch.Tensor, kv_utils.
+  ) -> dict[torch.Tensor, kv_utils.KVCache]:
     """Forwards the model with input embeddings."""
     assert len(self.transformer_blocks) == len(kv_cache.caches), (
         "The number of transformer blocks and the number of KV cache entries"
@@ -326,7 +325,7 @@ class Decoder(nn.Module):
       x, kv_entry = block(x, rope[i], mask_entry, input_pos, kv_entry)
       if kv_entry:
         updated_kv_entries.append(kv_entry)
-    updated_kv_cache = kv_utils.
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entries))
     if export_config is not None:
       if (
           torch.numel(input_pos) > 1
--- a/ai_edge_torch/generative/examples/gemma3/verify_util.py
+++ b/ai_edge_torch/generative/examples/gemma3/verify_util.py
@@ -20,8 +20,8 @@ import os
 from typing import List, Optional, Tuple
 
 from ai_edge_torch.generative.examples.gemma3 import gemma3
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
-from ai_edge_torch.generative.layers.experimental import kv_cache as kv_utils
 from ai_edge_torch.generative.utilities.experimental import verifier
 from gemma import config as gemma_config
 from gemma import model as gemma_model
@@ -94,7 +94,9 @@ class UnifiedGemma3Wrapper(verifier.ReauthoredModelWrapper):
 
   def _init_kv_cache(self):
     """Returns an initialized KV cache."""
-    return kv_utils.
+    return kv_utils.KVCache.from_model_config(
+        self.model.model.config, kv_layout=kv_utils.KV_LAYOUT_TRANSPOSED
+    )
 
   def forward(
       self, tokens: torch.Tensor, pixel_values: torch.Tensor = None
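The unified `KVCache.from_model_config` takes the layout as a keyword, replacing the removed `KVCacheTransposed` class. A sketch, where `model_config` stands in for whatever `ModelConfig` instance the caller has:

```python
from ai_edge_torch.generative.layers import kv_cache as kv_utils

# model_config: placeholder for a generative-layers ModelConfig instance.
cache = kv_utils.KVCache.from_model_config(
    model_config, kv_layout=kv_utils.KV_LAYOUT_TRANSPOSED
)
```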
--- a/ai_edge_torch/generative/layers/experimental/attention.py
+++ b/ai_edge_torch/generative/layers/experimental/attention.py
@@ -22,9 +22,9 @@ at any time.
 from typing import Optional, Tuple, Union
 
 from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.layers import lora as lora_utils
-from ai_edge_torch.generative.layers
-from ai_edge_torch.generative.layers.experimental import scaled_dot_product_attention as sdpa
+from ai_edge_torch.generative.layers import sdpa_with_kv_update
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
 import torch
@@ -69,9 +69,9 @@ class TransformerBlock(nn.Module):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
-      kv_cache: kv_utils.
+      kv_cache: kv_utils.KVCacheEntry = None,
       lora: Optional[lora_utils.LoRAEntry] = None,
-  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the TransformerBlock.
 
     Args:
@@ -79,7 +79,7 @@ class TransformerBlock(nn.Module):
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
-      kv_cache (
+      kv_cache (KVCacheEntry): the optional kv cache entry.
       lora (LoRAEntry): the optional lora entry.
 
     Returns:
@@ -146,7 +146,6 @@ class CausalSelfAttention(nn.Module):
     self.key_norm = builder.build_norm(config.head_dim, config.key_norm_config)
     self.config = config
     self.enable_hlfb = enable_hlfb
-    self.sdpa_func = sdpa.scaled_dot_product_attention
 
   def forward(
       self,
@@ -154,9 +153,9 @@ class CausalSelfAttention(nn.Module):
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
-      kv_cache: Optional[kv_utils.
+      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
       lora: Optional[lora_utils.LoRAEntry] = None,
-  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.
+  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the CausalSelfAttention layer, which can support
 
     MQA, GQA and MHA.
@@ -166,8 +165,7 @@ class CausalSelfAttention(nn.Module):
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
-      kv_cache (
-        module.
+      kv_cache (KVCacheEntry): the KV cache entry corresponding to this module.
       lora (LoRAEntry): the optional lora entry.
 
     Returns:
@@ -221,36 +219,8 @@ class CausalSelfAttention(nn.Module):
       cos, sin = rope
      q, k = rotary_pos_emb.apply_rope_inline(q, k, cos, sin)
 
-
-
-    g = n // self.config.num_query_groups
-    # btnh -> bnth -> b(kg)th -> 1(bk)(gt)h
-    q = q.permute(0, 2, 1, 3).reshape(
-        1, b * self.config.num_query_groups, g * T, h
-    )
-
-    k = k.permute(0, 2, 1, 3).reshape(
-        1, -1, T, self.config.head_dim
-    )  # 1, bk, s, h
-    v = v.permute(0, 2, 3, 1).reshape(
-        1, -1, self.config.head_dim, T
-    )  # 1, bk, h, s
-
-    if kv_cache is not None:
-      kv_cache = kv_utils.update(kv_cache, input_pos, k, v)
-      k, v = kv_cache.k_cache, kv_cache.v_cache
-
-    sdpa_out = self.sdpa_func(
-        kv_cache,
-        q,
-        k,
-        v,
-        self.config.head_dim,
-        mask=mask,
-        softcap=self.config.logit_softcap,
-    )  # 1, bk, gt, h
-    sdpa_out = (
-        sdpa_out.reshape(B, -1, T, h).permute(0, 2, 1, 3).reshape(B, T, -1)
+    sdpa_out, kv_cache = sdpa_with_kv_update.sdpa_with_kv_update(
+        q, k, v, kv_cache, input_pos, mask, self.config
     )
 
     # Compute the output projection.
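The deleted block is the grouped-query packing for the transposed cache layout, now hidden behind the new `sdpa_with_kv_update` module listed at the top of this diff. A self-contained toy that replays the packing with hypothetical sizes, to make the shape comment `btnh -> bnth -> b(kg)th -> 1(bk)(gt)h` concrete:

```python
import torch

# Hypothetical sizes: batch 2, seq 4, 8 query heads, head_dim 16,
# 2 KV groups, so g = 4 query heads share each KV group.
b, T, n, h, groups = 2, 4, 8, 16, 2
g = n // groups

q = torch.randn(b, T, n, h)                                # b, t, n, h
packed = q.permute(0, 2, 1, 3).reshape(1, b * groups, g * T, h)
assert packed.shape == (1, b * groups, g * T, h)           # 1, (bk), (gt), h
```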
--- a/ai_edge_torch/generative/layers/experimental/kv_cache.py
+++ b/ai_edge_torch/generative/layers/experimental/kv_cache.py
@@ -18,303 +18,33 @@
 This is an experimental implementation and is subject to change at any time.
 """
 
-import dataclasses
-import functools
-from typing import Any, List, Tuple, Type
-from ai_edge_torch.generative.layers import model_config
-from ai_edge_torch.generative.layers.experimental import types
 from ai_edge_torch.generative.custom_ops import dynamic_update_slice as dus_utils
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import torch
-import torch.utils._pytree as pytree
-
-
-@dataclasses.dataclass
-class KVCacheEntryBase:
-  """A single cache entry that includes K and V caches.
-
-  The chaches are built based on the provided config with the shape of
-  (batch_size, kv_cache_max, num_query_groups, head_dim).
-  """
-
-  k_cache: torch.Tensor
-  v_cache: torch.Tensor
-
-  @classmethod
-  def _from_model_config(
-      cls,
-      k_shape: Tuple[int, ...],
-      v_shape: Tuple[int, ...],
-      dtype: torch.dtype = torch.float32,
-      device: torch.device = None,
-  ):
-    """Build an instance of the class based on model config."""
-    k = torch.zeros(k_shape, dtype=dtype, device=device)
-    v = torch.zeros(v_shape, dtype=dtype, device=device)
-    obj = cls(k_cache=k, v_cache=v)
-    return obj
-
-  @classmethod
-  def from_model_config(
-      cls,
-      kv_cache_max: int,
-      config: model_config.AttentionConfig,
-      dtype: torch.dtype = torch.float32,
-      device: torch.device = None,
-      batch_size: int = 1,
-  ):
-    """Build an instance of the class based on model config."""
-    shape = (batch_size, kv_cache_max, config.num_query_groups, config.head_dim)
-    return cls._from_model_config(shape, shape, dtype, device)
-
-
-@dataclasses.dataclass
-class KVCacheEntryBTNH(KVCacheEntryBase):
-  k_type = types.BTNH()
-  v_type = types.BTNH()
-
-
-@dataclasses.dataclass
-class KVCacheEntryTransposed(KVCacheEntryBase):
-
-  k_type = types.BNTH()
-  v_type = types.BNHT()
-
-  @classmethod
-  def from_model_config(
-      cls,
-      kv_cache_max: int,
-      config: model_config.AttentionConfig,
-      dtype: torch.dtype = torch.float32,
-      device: torch.device = None,
-      batch_size: int = 1,
-  ):
-    """Build an instance of the class based on model config."""
-    k_shape = (
-        batch_size,
-        config.num_query_groups,
-        kv_cache_max,
-        config.head_dim,
-    )  # b, k, s, h
-    v_shape = (
-        batch_size,
-        config.num_query_groups,
-        config.head_dim,
-        kv_cache_max,
-    )  # b, k, h, s
-    return cls._from_model_config(k_shape, v_shape, dtype, device)
-
-
-def _flatten_kv_entry(
-    kv_e: KVCacheEntryBase,
-) -> Tuple[List[torch.Tensor], Any]:
-  return ([kv_e.k_cache, kv_e.v_cache], None)
-
-
-def _unflatten_kv_entry(
-    kv_entry_ty: Type[KVCacheEntryBase],
-    values: List[torch.Tensor],
-    unused_context: Any,
-) -> KVCacheEntryBase:
-  return kv_entry_ty(*values)
-
-
-pytree.register_pytree_node(
-    KVCacheEntryTransposed,
-    _flatten_kv_entry,
-    functools.partial(_unflatten_kv_entry, KVCacheEntryTransposed),
-    serialized_type_name="",
-)
-
-pytree.register_pytree_node(
-    KVCacheEntryBase,
-    _flatten_kv_entry,
-    functools.partial(_unflatten_kv_entry, KVCacheEntryBase),
-    serialized_type_name="",
-)
-
-
-@dataclasses.dataclass
-class KVCacheBase:
-  """A utility class for holding KV cache entries per layer."""
-
-  caches: Tuple[KVCacheEntryBase, ...]
-
-  @classmethod
-  def _from_model_config(
-      cls,
-      kv_entry_cls,
-      config: model_config.ModelConfig,
-      dtype: torch.dtype = torch.float32,
-      device: torch.device = None,
-      batch_size: int = 1,
-  ):
-    caches = [
-        kv_entry_cls.from_model_config(
-            config.kv_cache_max,
-            config.block_config(idx).attn_config,
-            dtype,
-            device,
-            batch_size,
-        )
-        for idx in range(config.num_layers)
-    ]
-    obj = cls(caches=tuple(caches))
-    return obj
-
-  @classmethod
-  def from_model_config(
-      cls,
-      config: model_config.ModelConfig,
-      dtype: torch.dtype = torch.float32,
-      device: torch.device = None,
-      batch_size: int = 1,
-  ):
-    """Build an instance of the class based on model config.
-
-    Args:
-      config (ModelConfig): Model config used for building the cache.
-      dtype (torch.dtype, optional): The data type of the cache tensor.
-        Defaults to torch.float32.
-      device (torch.device, optional): The device placement of the cache
-        tensors. Defaults to None.
-      batch_size (int, optional): The batch size of the cache tensors.
-        Defaults to 1.
-
-    Returns:
-      KVCacheBase: The created cache object.
-    """
-    assert batch_size == 1, "Batch size must be 1 for KV Cache."
-    return cls._from_model_config(
-        KVCacheEntryBase,
-        config=config,
-        dtype=dtype,
-        device=device,
-        batch_size=batch_size,
-    )
-
-  def flatten(self) -> List[torch.Tensor]:
-    """Flatten the cache entries into a list of tensors with order k_i, v_i."""
-    flattened, _ = _flatten_kvc(self)
-    return flattened
-
-
-@dataclasses.dataclass
-class KVCacheBTNH(KVCacheBase):
-
-  @classmethod
-  def from_model_config(
-      cls,
-      config: model_config.ModelConfig,
-      dtype: torch.dtype = torch.float32,
-      device: torch.device = None,
-      batch_size: int = 1,
-  ):
-    return cls._from_model_config(
-        KVCacheEntryBTNH,
-        config=config,
-        dtype=dtype,
-        device=device,
-        batch_size=batch_size,
-    )
-
-
-@dataclasses.dataclass
-class KVCacheTransposed(KVCacheBase):
-
-  @classmethod
-  def from_model_config(
-      cls,
-      config: model_config.ModelConfig,
-      dtype: torch.dtype = torch.float32,
-      device: torch.device = None,
-      batch_size: int = 1,
-  ):
-    return cls._from_model_config(
-        KVCacheEntryTransposed,
-        config=config,
-        dtype=dtype,
-        device=device,
-        batch_size=batch_size,
-    )
-
-
-def _flatten_kvc(kvc: KVCacheBase) -> Tuple[List[str], List[str]]:
-  flattened = []
-  flat_names = []
-  none_names = []
-  for i, kv_entry in enumerate(kvc.caches):
-    flattened.append(kv_entry.k_cache)
-    flat_names.append(f"k_{i}")
-    flattened.append(kv_entry.v_cache)
-    flat_names.append(f"v_{i}")
-  return flattened, [flat_names, none_names]
-
-
-def _flatten_kvc_with_keys(kvc: KVCacheBase) -> Tuple[List, List]:
-  flattened, (flat_names, none_names) = _flatten_kvc(kvc)
-  return [
-      (pytree.MappingKey(k), v) for k, v in zip(flat_names, flattened)
-  ], flat_names
-
-
-def _unflatten_kvc(
-    kv_ty: Type[KVCacheBase],
-    kv_entry_type: Type[KVCacheEntryBase],
-    values: List[torch.Tensor],
-    context: Tuple[List, List],
-) -> KVCacheBase:
-  assert len(values) % 2 == 0, "Found odd number of K and V entries."
-  num_layers = len(values) // 2
-  flat_names = context[0]
-  kv_entries = []
-  for i in range(num_layers):
-    k_cache_idx = flat_names.index(f"k_{i}")
-    v_cache_idx = flat_names.index(f"v_{i}")
-    kv_entries.append(
-        kv_entry_type(k_cache=values[k_cache_idx], v_cache=values[v_cache_idx])
-    )
-  obj = kv_ty(tuple(kv_entries))
-  return obj
-
-
-pytree.register_pytree_node(
-    KVCacheTransposed,
-    _flatten_kvc,
-    functools.partial(
-        _unflatten_kvc, KVCacheTransposed, KVCacheEntryTransposed
-    ),
-    flatten_with_keys_fn=_flatten_kvc_with_keys,
-    serialized_type_name="",
-)
-
-pytree.register_pytree_node(
-    KVCacheBase,
-    _flatten_kvc,
-    functools.partial(_unflatten_kvc, KVCacheBase, KVCacheEntryBase),
-    flatten_with_keys_fn=_flatten_kvc_with_keys,
-    serialized_type_name="",
-)
 
 
 def update(
-    cache:
+    cache: kv_utils.KVCacheEntry,
     input_pos: torch.Tensor,
     k_slice: torch.Tensor,
     v_slice: torch.Tensor,
-) ->
+) -> kv_utils.KVCacheEntry:
   """Out of place update of Cache buffer.
 
   Args:
-    cache (
+    cache (kv_utils.KVCacheEntry): The original cache buffer.
     input_pos (torch.Tensor): The update slice positions.
     k_slice (torch.Tensor): The K slice to be updated in the new cache.
     v_slice (torch.Tensor): The V slice to be updated in the new cache.
 
   Returns:
-
+    kv_utils.KVCacheEntry: The updated KVCacheBase entry based on the passed
     inputs.
   """
-
+  assert (
+      cache.kv_layout == kv_utils.KV_LAYOUT_TRANSPOSED
+  ), "KV entry must have transposed layout."
+  update_kv_cache = _update_kv_impl_transposed
   return update_kv_cache(cache, input_pos, k_slice, v_slice)
 
 
@@ -338,12 +68,12 @@ def _get_slice_indices(
   return slice_indices
 
 
-def
-    cache:
+def _update_kv_impl_transposed(
+    cache: kv_utils.KVCacheEntry,
     input_pos: torch.Tensor,
     k_slice: torch.Tensor,
     v_slice: torch.Tensor,
-) ->
+) -> kv_utils.KVCacheEntry:
   """Update the cache buffer with High Level Function Boundary annotation."""
   cache_dim = 4
   k_ts_idx = 2
@@ -357,4 +87,4 @@ def _update_kv_impl(
   v = dus_utils.dynamic_update_slice(
       cache.v_cache, v_slice, [x for x in v_slice_indices]
   )
-  return
+  return kv_utils.KVCacheEntry(k, v, cache.kv_layout)
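After the trim, the experimental module keeps only the transposed out-of-place `update` built on `dynamic_update_slice`; entry construction, layouts, and pytree registration now come from the unified `generative.layers.kv_cache`. A usage sketch with placeholder sizes, assuming `KVCacheEntry` accepts `(k_cache, v_cache, kv_layout)` positionally, as the `return` statement above uses it:

```python
import torch
from ai_edge_torch.generative.layers import kv_cache as kv_utils
from ai_edge_torch.generative.layers.experimental import kv_cache as kv_exp

# Hypothetical sizes: batch 1, 2 KV groups, cache length 16, head_dim 8.
k_buf = torch.zeros(1, 2, 16, 8)  # b, k, s, h (transposed K layout)
v_buf = torch.zeros(1, 2, 8, 16)  # b, k, h, s (transposed V layout)
entry = kv_utils.KVCacheEntry(k_buf, v_buf, kv_utils.KV_LAYOUT_TRANSPOSED)

# Write a one-token K/V slice at position 0. The original entry is not
# mutated; a new entry comes back. The assert in update() rejects
# entries that do not carry the transposed layout.
k_slice = torch.randn(1, 2, 1, 8)
v_slice = torch.randn(1, 2, 8, 1)
new_entry = kv_exp.update(entry, torch.tensor([0]), k_slice, v_slice)
```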
--- a/ai_edge_torch/generative/layers/experimental/scaled_dot_product_attention.py
+++ b/ai_edge_torch/generative/layers/experimental/scaled_dot_product_attention.py
@@ -19,7 +19,7 @@ import math
 from typing import Optional
 
 from ai_edge_torch.generative.custom_ops import bmm_4d as bmm_lib
-from ai_edge_torch.generative.layers
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.layers.experimental import types
 from ai_edge_torch.hlfb import StableHLOCompositeBuilder
 from multipledispatch import dispatch
@@ -28,7 +28,7 @@ import torch.nn.functional as F
 
 
 def scaled_dot_product_attention(
-    kv: kv_utils.
+    kv: kv_utils.KVCacheEntry,
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
@@ -37,10 +37,10 @@ def scaled_dot_product_attention(
     scale: Optional[float] = None,
     softcap: Optional[float] = None,
 ):
-  if hasattr(kv, "
+  if hasattr(kv, "kv_layout"):
     return _sdpa(
-        kv.
-        kv.
+        kv.kv_layout[0](),  # key layout
+        kv.kv_layout[1](),  # value layout
         query=query,
         key=key,
         value=value,
@@ -49,10 +49,7 @@ def scaled_dot_product_attention(
         scale=scale,
         softcap=softcap,
     )
-  raise ValueError(
-      f"SDPA for K type {type(kv.caches[0].k_type)} and V type"
-      f" {type(kv.caches[0].v_type)} not supported."
-  )
+  raise ValueError("No kv_layout attribute found in kv.")
 
 
 @dispatch(types.BNTH, types.BNHT)
@@ -85,7 +82,6 @@ def _sdpa(k_type, v_type, *args, **kwargs):
   padded_logits = logits + mask
   padded_logits = padded_logits.reshape(1, bk, gt, s)
   probs = F.softmax(padded_logits, dim=-1).type_as(key)
-
   encoded = bmm_lib.bmm_4d(probs, value)
 
   return encoded  # 1, bk, gt, h
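The layout dispatch works because `kv.kv_layout` is a pair of dimension-order classes: indexing gives the K and V layout classes, and calling them yields instances that `multipledispatch` matches against the registered `_sdpa` overloads, such as the `(types.BNTH, types.BNHT)` transposed overload above. A sketch of the mechanism in isolation (the `which` function is illustrative, not part of the library):

```python
from multipledispatch import dispatch
from ai_edge_torch.generative.layers.experimental import types


@dispatch(types.BNTH, types.BNHT)
def which(k_type, v_type):
  # Chosen when K is laid out (b, n, t, h) and V is laid out (b, n, h, t).
  return "transposed"


print(which(types.BNTH(), types.BNHT()))  # -> "transposed"
```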
--- a/ai_edge_torch/generative/layers/experimental/types.py
+++ b/ai_edge_torch/generative/layers/experimental/types.py
@@ -62,6 +62,9 @@ class TensorDimensionMeta(type):
   def __repr__(cls):
     return f'{cls.__name__}'
 
+  def __iter__(cls):
+    return iter(getattr(cls, 'dimensions'))
+
 
 def create_tensor_dimension_order_class(dims: Tuple[TensorDims]):
   """Creates a TensorDimensionMeta class with the specified dimensions.
|