ai-edge-torch-nightly 0.5.0.dev20250423__py3-none-any.whl → 0.5.0.dev20250424__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -17,10 +17,10 @@

  from typing import List, Optional, Tuple

+ from ai_edge_torch.generative.layers import attention
  from ai_edge_torch.generative.layers import builder
  from ai_edge_torch.generative.layers import kv_cache as kv_utils
  import ai_edge_torch.generative.layers.attention_utils as attn_utils
- from ai_edge_torch.generative.layers.experimental import attention
  import ai_edge_torch.generative.layers.model_config as cfg
  import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
  from ai_edge_torch.generative.utilities import export_config as export_cfg
@@ -21,6 +21,7 @@ from ai_edge_torch.generative.layers import builder
  from ai_edge_torch.generative.layers import kv_cache as kv_utils
  from ai_edge_torch.generative.layers import lora as lora_utils
  from ai_edge_torch.generative.layers import scaled_dot_product_attention as sdpa
+ from ai_edge_torch.generative.layers import sdpa_with_kv_update
  import ai_edge_torch.generative.layers.model_config as cfg
  import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
  import torch
@@ -142,11 +143,6 @@ class CausalSelfAttention(nn.Module):
  self.key_norm = builder.build_norm(config.head_dim, config.key_norm_config)
  self.config = config
  self.enable_hlfb = enable_hlfb
- self.sdpa_func = (
- sdpa.scaled_dot_product_attention_with_hlfb
- if enable_hlfb
- else sdpa.scaled_dot_product_attention
- )

  def forward(
  self,
@@ -174,7 +170,7 @@ class CausalSelfAttention(nn.Module):
  KV Cach Entry (if passed in).
  """
  # Batch size, sequence length, embedding dimensionality.
- B, T, E = x.size()
+ B, T, _ = x.size()
  qkv = self.qkv_projection(x)

  # Assemble into a number of query groups to support MHA, MQA and GQA.
@@ -218,19 +214,9 @@ class CausalSelfAttention(nn.Module):
  cos, sin = rope
  q, k = rotary_pos_emb.apply_rope_inline(q, k, cos, sin)

- if kv_cache is not None:
- kv_cache = kv_utils.update(kv_cache, input_pos, k, v)
- k, v = kv_cache.k_cache, kv_cache.v_cache
-
- sdpa_out = self.sdpa_func(
- q,
- k,
- v,
- self.config.head_dim,
- mask=mask,
- softcap=self.config.logit_softcap,
+ sdpa_out, kv_cache = sdpa_with_kv_update.sdpa_with_kv_update(
+ q, k, v, kv_cache, input_pos, mask, self.config, self.enable_hlfb
  )
- sdpa_out = sdpa_out.reshape(B, T, -1)

  # Compute the output projection.
  y = self.output_projection(sdpa_out)
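
The hunk above replaces the inline "update the KV cache, then run scaled dot product attention" logic with a single call into sdpa_with_kv_update. As a rough illustration of that consolidated pattern only (not the library's API), here is a minimal self-contained PyTorch sketch; the function name, shapes, and the bare tuple cache are assumptions for the sketch:

import torch
import torch.nn.functional as F

def toy_sdpa_with_kv_update(q, k, v, k_cache, v_cache, input_pos, mask=None):
  # q, k, v: (batch, seq, heads, head_dim); caches: (batch, max_seq, heads, head_dim).
  # input_pos: 1-D long tensor of write positions; toy uses the same head count
  # for q and k/v, and no mask for brevity.
  k_cache = k_cache.index_copy(1, input_pos, k)
  v_cache = v_cache.index_copy(1, input_pos, v)
  # torch's SDPA expects (batch, heads, seq, head_dim).
  out = F.scaled_dot_product_attention(
      q.transpose(1, 2),
      k_cache.transpose(1, 2),
      v_cache.transpose(1, 2),
      attn_mask=mask,
  )
  # Back to (batch, seq, heads * head_dim), as the output projection expects.
  out = out.transpose(1, 2).reshape(q.shape[0], q.shape[1], -1)
  return out, (k_cache, v_cache)

q = torch.randn(1, 1, 8, 64)  # one decode step
k = torch.randn(1, 1, 8, 64)
v = torch.randn(1, 1, 8, 64)
k_cache = torch.zeros(1, 128, 8, 64)
v_cache = torch.zeros(1, 128, 8, 64)
out, (k_cache, v_cache) = toy_sdpa_with_kv_update(
    q, k, v, k_cache, v_cache, torch.tensor([5])
)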
@@ -12,16 +12,16 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  # ==============================================================================
- # Common utility functions for data loading etc.
- from dataclasses import dataclass
+
+ """Common utility functions for data loading etc."""
+
  from typing import Tuple
+
  from ai_edge_torch.generative.layers import kv_cache as kv_utils
  from ai_edge_torch.generative.layers import scaled_dot_product_attention as sdpa_default
  from ai_edge_torch.generative.layers.experimental import kv_cache as kv_utils_experimental
  from ai_edge_torch.generative.layers.experimental import scaled_dot_product_attention as sdpa
  import ai_edge_torch.generative.layers.model_config as cfg
- from ai_edge_torch.generative.utilities import types
- from multipledispatch import dispatch
  import torch


@@ -33,32 +33,27 @@ def sdpa_with_kv_update(
  input_pos: torch.Tensor,
  mask: torch.Tensor,
  config: cfg.AttentionConfig,
+ enable_hlfb: bool,
  ) -> Tuple[torch.Tensor, kv_utils.KVCacheEntry]:
- return sdpa_with_kv_update_impl(
- kv.kv_layout[0](), # key layout
- kv.kv_layout[1](), # value layout
- query=query,
- key=key,
- value=value,
- kv=kv,
- input_pos=input_pos,
- mask=mask,
- config=config,
+ """Wrapper function for scaled dot product attention with KV cache update."""
+ if kv is not None and kv.kv_layout == kv_utils.KV_LAYOUT_TRANSPOSED:
+ return _sdpa_with_kv_update_transposed(
+ query, key, value, kv, input_pos, mask, config
+ )
+ return _sdpa_with_kv_update_default(
+ query, key, value, kv, input_pos, mask, config, enable_hlfb
  )


- @dispatch(types.BNTH, types.BNHT)
- def sdpa_with_kv_update_impl(
- k_type, v_type, *args, **kwargs
+ def _sdpa_with_kv_update_transposed(
+ query: torch.Tensor,
+ key: torch.Tensor,
+ value: torch.Tensor,
+ kv: kv_utils.KVCacheEntry,
+ input_pos: torch.Tensor,
+ mask: torch.Tensor,
+ config: cfg.AttentionConfig,
  ) -> Tuple[torch.Tensor, kv_utils.KVCacheEntry]:
- query = kwargs["query"]
- key = kwargs["key"]
- value = kwargs["value"]
- kv = kwargs["kv"]
- input_pos = kwargs["input_pos"]
- mask = kwargs["mask"]
- config = kwargs["config"]
-
  # Transpose k/v to specific layout for GPU implementation.
  b, seq_len, n, h = query.shape
  g = n // config.num_query_groups
@@ -74,9 +69,8 @@ def sdpa_with_kv_update_impl(
  1, -1, config.head_dim, seq_len
  ) # 1, bk, h, s

- if kv is not None:
- kv = kv_utils_experimental.update(kv, input_pos, key, value)
- key, value = kv.k_cache, kv.v_cache
+ kv = kv_utils_experimental.update(kv, input_pos, key, value)
+ key, value = kv.k_cache, kv.v_cache

  sdpa_out = sdpa.scaled_dot_product_attention(
  kv,
@@ -95,24 +89,26 @@ def sdpa_with_kv_update_impl(
  return sdpa_out, kv


- @dispatch(object, object)
- def sdpa_with_kv_update_impl(
- k_type, v_type, *args, **kwargs
+ def _sdpa_with_kv_update_default(
+ query: torch.Tensor,
+ key: torch.Tensor,
+ value: torch.Tensor,
+ kv: kv_utils.KVCacheEntry,
+ input_pos: torch.Tensor,
+ mask: torch.Tensor,
+ config: cfg.AttentionConfig,
+ enable_hlfb: bool,
  ) -> Tuple[torch.Tensor, kv_utils.KVCacheEntry]:
- query = kwargs["query"]
- key = kwargs["key"]
- value = kwargs["value"]
- kv = kwargs["kv"]
- input_pos = kwargs["input_pos"]
- mask = kwargs["mask"]
- config = kwargs["config"]
-
  b, seq_len, _, _ = query.shape
  if kv is not None:
  kv = kv_utils.update(kv, input_pos, key, value)
  key, value = kv.k_cache, kv.v_cache

- sdpa_out = sdpa_default.scaled_dot_product_attention(
+ if enable_hlfb:
+ sdpa_func = sdpa_default.scaled_dot_product_attention_with_hlfb
+ else:
+ sdpa_func = sdpa_default.scaled_dot_product_attention
+ sdpa_out = sdpa_func(
  query,
  key,
  value,
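
For orientation, the transposed path above stores the cache in the GPU-oriented layouts implied by the removed @dispatch(types.BNTH, types.BNHT) signature: keys roughly as (1, batch*kv_heads, seq, head_dim) and values as (1, batch*kv_heads, head_dim, seq), versus the default (batch, seq, kv_heads, head_dim). A self-contained sketch of that reshaping with illustrative sizes; the exact layouts are an inference from this diff, not a confirmed spec:

import torch

b, s, n_kv, h = 2, 16, 4, 64            # batch, cached seq len, kv heads, head dim
k_default = torch.zeros(b, s, n_kv, h)  # default layout: (b, s, n_kv, h)
v_default = torch.zeros(b, s, n_kv, h)

# BNTH for keys: (1, b*n_kv, s, h); BNHT for values: (1, b*n_kv, h, s).
k_transposed = k_default.permute(0, 2, 1, 3).reshape(1, b * n_kv, s, h)
v_transposed = v_default.permute(0, 2, 3, 1).reshape(1, b * n_kv, h, s)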
@@ -41,7 +41,7 @@ class TestModelConversion(googletest.TestCase):
  )
  )

- def _get_params(self, enable_hlfb: bool):
+ def _get_params(self, enable_hlfb: bool, kv_layout: kv_cache.KVLayout):
  """Returns a model, edge model and the kwargs to use for testing."""
  config = toy_model_with_kv_cache.get_model_config()
  config.enable_hlfb = enable_hlfb
@@ -49,7 +49,7 @@ class TestModelConversion(googletest.TestCase):
  tokens, input_pos = torch.tensor([[1]], dtype=torch.int), torch.tensor(
  [10], dtype=torch.int
  )
- kv = kv_cache.KVCache.from_model_config(config)
+ kv = kv_cache.KVCache.from_model_config(config, kv_layout=kv_layout)
  kwargs = {
  "tokens": tokens,
  "input_pos": input_pos,
@@ -65,8 +65,12 @@ class TestModelConversion(googletest.TestCase):
  )
  return pytorch_model, edge_model, kwargs

- def _test_model_with_kv_cache(self, enable_hlfb: bool):
- pytorch_model, edge_model, kwargs = self._get_params(enable_hlfb)
+ def _test_model_with_kv_cache(
+ self,
+ enable_hlfb: bool = False,
+ kv_layout: kv_cache.KVLayout = kv_cache.KV_LAYOUT_DEFAULT,
+ ):
+ pytorch_model, edge_model, kwargs = self._get_params(enable_hlfb, kv_layout)

  self.assertTrue(
  test_utils.compare_tflite_torch(
@@ -95,13 +99,22 @@ class TestModelConversion(googletest.TestCase):
  def test_toy_model_with_kv_cache_with_hlfb(self):
  self._test_model_with_kv_cache(enable_hlfb=True)

+ @googletest.skipIf(
+ ai_edge_torch.config.in_oss,
+ reason="tests with custom ops are not supported in oss",
+ )
+ def test_toy_model_with_kv_cache_transposed(self):
+ self._test_model_with_kv_cache(kv_layout=kv_cache.KV_LAYOUT_TRANSPOSED)
+
  @googletest.skipIf(
  ai_edge_torch.config.in_oss,
  reason="tests with custom ops are not supported in oss",
  )
  def test_toy_model_has_dus_op(self):
  """Tests that the model has the dynamic update slice op."""
- _, edge_model, _ = self._get_params(enable_hlfb=True)
+ _, edge_model, _ = self._get_params(
+ enable_hlfb=True, kv_layout=kv_cache.KV_LAYOUT_DEFAULT
+ )
  interpreter_ = interpreter.InterpreterWithCustomOps(
  custom_op_registerers=["GenAIOpsRegisterer"],
  model_content=edge_model.tflite_model(),
@@ -112,7 +125,14 @@ class TestModelConversion(googletest.TestCase):
  op_names = [op["op_name"] for op in interpreter_._get_ops_details()]
  self.assertIn("DYNAMIC_UPDATE_SLICE", op_names)

- def _test_multisig_model(self, config, pytorch_model, atol, rtol):
+ def _test_multisig_model(
+ self,
+ config,
+ pytorch_model,
+ atol,
+ rtol,
+ kv_layout=kv_cache.KV_LAYOUT_DEFAULT,
+ ):
  # prefill
  seq_len = 10
  prefill_tokens = torch.zeros((1, seq_len), dtype=torch.int, device="cpu")
@@ -124,7 +144,7 @@ class TestModelConversion(googletest.TestCase):
  decode_token = torch.tensor([[1]], dtype=torch.int)
  decode_input_pos = torch.tensor([5], dtype=torch.int)

- kv = kv_cache.KVCache.from_model_config(config)
+ kv = kv_cache.KVCache.from_model_config(config, kv_layout=kv_layout)

  edge_model = (
  ai_edge_torch.signature(
@@ -160,7 +180,7 @@ class TestModelConversion(googletest.TestCase):
  kv,
  signature_name="prefill",
  atol=atol,
- rtol=atol,
+ rtol=rtol,
  )
  )

@@ -173,7 +193,7 @@ class TestModelConversion(googletest.TestCase):
  kv,
  signature_name="decode",
  atol=atol,
- rtol=atol,
+ rtol=rtol,
  )
  )

@@ -186,6 +206,21 @@ class TestModelConversion(googletest.TestCase):
  pytorch_model = tiny_llama.TinyLlama(config).eval()
  self._test_multisig_model(config, pytorch_model, atol=1e-5, rtol=1e-5)

+ @googletest.skipIf(
+ ai_edge_torch.config.in_oss,
+ reason="tests with custom ops are not supported in oss",
+ )
+ def test_tiny_llama_multisig_kv_layout_transposed(self):
+ config = tiny_llama.get_fake_model_config()
+ pytorch_model = tiny_llama.TinyLlama(config).eval()
+ self._test_multisig_model(
+ config,
+ pytorch_model,
+ atol=1e-5,
+ rtol=1e-5,
+ kv_layout=kv_cache.KV_LAYOUT_TRANSPOSED,
+ )
+

  if __name__ == "__main__":
  googletest.main()
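
The new tests exercise the kv_layout plumbing end to end. A hedged usage sketch follows, using only names that appear in this diff; how `config` is built is left abstract (the tests use a toy model config), and the helper name make_cache is hypothetical:

from ai_edge_torch.generative.layers import kv_cache

def make_cache(config, transposed=False):
  # Pick the transposed (GPU-oriented) cache layout or the default one.
  layout = (
      kv_cache.KV_LAYOUT_TRANSPOSED if transposed else kv_cache.KV_LAYOUT_DEFAULT
  )
  return kv_cache.KVCache.from_model_config(config, kv_layout=layout)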
ai_edge_torch/version.py CHANGED
@@ -13,4 +13,4 @@
  # limitations under the License.
  # ==============================================================================

- __version__ = "0.5.0.dev20250423"
+ __version__ = "0.5.0.dev20250424"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ai-edge-torch-nightly
- Version: 0.5.0.dev20250423
+ Version: 0.5.0.dev20250424
  Summary: Supporting PyTorch models with the Google AI Edge TFLite runtime.
  Home-page: https://github.com/google-ai-edge/ai-edge-torch
  Keywords: On-Device ML,AI,Google,TFLite,PyTorch,LLMs,GenAI
@@ -2,7 +2,7 @@ ai_edge_torch/__init__.py,sha256=8sPR_5uXJA4NEE0nIwNdSl-ADOJEoR8hAgYvBQDY70Y,120
  ai_edge_torch/_config.py,sha256=AiqhbcheF7j_ozIGDLC89k1we95aVgFDa-tR6h7UI0s,2529
  ai_edge_torch/conftest.py,sha256=r0GTrhMRhlmOGrrkvumHN8hkmyug6WvF60vWq8wRIBI,758
  ai_edge_torch/model.py,sha256=N-pNpTxzhaFGhWhnSGd70lBzb9VlEhTOq5mddU7bvvI,5542
- ai_edge_torch/version.py,sha256=DjzQwP8czvLmUu-dJhnWVQJHOuaOqJJKuH2_TOViMvg,706
+ ai_edge_torch/version.py,sha256=Nixp49eAXZPPMWEWkqpm_M4Mi_WGPx-I8q2noKuh0hw,706
  ai_edge_torch/_convert/__init__.py,sha256=hHLluseD2R0Hh4W6XZRIXY_dRQeYudjsrKGf6LZz65g,671
  ai_edge_torch/_convert/conversion.py,sha256=dOr3TUfF0UCvkmlUrMqKvgaN4jh3lJ9XFuO-sHaAmIw,5521
  ai_edge_torch/_convert/conversion_utils.py,sha256=Sr8qXVcTwc-ZnZmK7yxVrIOOp1S_vNrwzC0zUvLTI2o,2160
@@ -67,7 +67,7 @@ ai_edge_torch/generative/examples/gemma/verify_gemma2.py,sha256=IoBhEMwH07-tFm5-
  ai_edge_torch/generative/examples/gemma/verify_util.py,sha256=tR8RflXocDZqvuStyw9aFlzuiTllEC8rNnjrxms6_Is,5727
  ai_edge_torch/generative/examples/gemma3/__init__.py,sha256=JaAnrFoXTl3RJX97XspklkTyqOHVyAgRJsZtzNDd10c,671
  ai_edge_torch/generative/examples/gemma3/convert_gemma3_to_tflite.py,sha256=szssSBrIUYdNIoU7LHdAq7wCqgjaY6qbV8yvTgg796Q,2945
- ai_edge_torch/generative/examples/gemma3/decoder.py,sha256=n6ZQfqNEHuOhY7Pu21bb8Eax8yn2Sx5osTKJKmhonXY,15659
+ ai_edge_torch/generative/examples/gemma3/decoder.py,sha256=eXWE5CSX0KeUMsPevgsYOfvyajl9F1RFF4DCWhHcYPA,15646
  ai_edge_torch/generative/examples/gemma3/gemma3.py,sha256=GACDBI_MsFowR8A3wAWrpzradPYe-AUgB9ZjXaVBG-s,6485
  ai_edge_torch/generative/examples/gemma3/image_encoder.py,sha256=uRoLoBWzFtQz5wFZfPCxbkvZsgPAqSkUUsV3977GbYc,5184
  ai_edge_torch/generative/examples/gemma3/verify_gemma3.py,sha256=v8oNXFICmVOtQxfO7IhZ8GnbvotEkDi9lzYHjoQyOso,2464
@@ -150,7 +150,7 @@ ai_edge_torch/generative/examples/tiny_llama/verify.py,sha256=LRu6PSw7Lqu6HGbv1t
  ai_edge_torch/generative/fx_passes/__init__.py,sha256=PFSMsA1vfBfrV9ssBCkYJNl8Hx_bLdWjN01iyjPM5jE,1094
  ai_edge_torch/generative/fx_passes/remove_sdpa_zero_mask_pass.py,sha256=myGjal5A8yIBoqgArd2k40rZmCgD1Ya369KR7182bhI,2129
  ai_edge_torch/generative/layers/__init__.py,sha256=hHLluseD2R0Hh4W6XZRIXY_dRQeYudjsrKGf6LZz65g,671
- ai_edge_torch/generative/layers/attention.py,sha256=wLZ1jgUlcODBWgK3hnnhclHuuQDqYuGOZdYAI9EooOM,13247
+ ai_edge_torch/generative/layers/attention.py,sha256=uK1ih2kxPZherwi-pGSm8B--NNWnQ8npEAfgcjMIkEY,12964
  ai_edge_torch/generative/layers/attention_utils.py,sha256=zBVwlBUTs-nStIKCZG0ks5ra7tsqc9ShfakFJKH5rds,7344
  ai_edge_torch/generative/layers/builder.py,sha256=LXGuSHIx6QZAzLFm7aJvlzoMPgQwbXLFchGEKYwOOUA,5090
  ai_edge_torch/generative/layers/feed_forward.py,sha256=hdICat-8gW7-vxDAevJQ8NQ-mynllPiqLdXQMF6JMnc,4189
@@ -160,9 +160,8 @@ ai_edge_torch/generative/layers/model_config.py,sha256=nLXvTkDAIHJQ0PTaWODF8oxJQ
  ai_edge_torch/generative/layers/normalization.py,sha256=MbwH-n80Fob5YvjBzdqDjBizMHLzSJGYRDdbD-rL5C0,6174
  ai_edge_torch/generative/layers/rotary_position_embedding.py,sha256=975zR202MdIrILJ7blceAcxrNqX1ZCN0ECKG1gz-bV8,2655
  ai_edge_torch/generative/layers/scaled_dot_product_attention.py,sha256=vp8dVx6tOe99neJhpbrtIt5fvN5NFw19JVH1v0yi5Mg,4154
- ai_edge_torch/generative/layers/sdpa_with_kv_update.py,sha256=oo9h7pi0GcuylRgp2yUuvUJCrhj03aoWt_fP7EDP4LM,3775
+ ai_edge_torch/generative/layers/sdpa_with_kv_update.py,sha256=D4rATT2Ppa9Su7yuRHYnQPJ1dFvUDAyH1GrFnCed7p8,3810
  ai_edge_torch/generative/layers/experimental/__init__.py,sha256=nz-K0h8DfiATHzR6s1_bCw2akUmHWffU1bDRSkIzSqI,592
- ai_edge_torch/generative/layers/experimental/attention.py,sha256=XYbo1KlmiMEuwArye0Ul86jEsdxLr1RG-usRpidZiT8,8001
  ai_edge_torch/generative/layers/experimental/kv_cache.py,sha256=zgpFVftOfllvjh9-UEBSvUbm152SnQETn29rUMMMvAM,2978
  ai_edge_torch/generative/layers/experimental/scaled_dot_product_attention.py,sha256=YFcIGOkaNb-vvQKjI-G9-bC2Z1W0O_qRyIZPlsLl72U,2797
  ai_edge_torch/generative/layers/unet/__init__.py,sha256=hHLluseD2R0Hh4W6XZRIXY_dRQeYudjsrKGf6LZz65g,671
@@ -181,7 +180,7 @@ ai_edge_torch/generative/test/test_custom_dus.py,sha256=MjIhTvkTko872M35XMciobvI
  ai_edge_torch/generative/test/test_kv_cache.py,sha256=1sXN2RPntq0PP3IEy0NkvIbzQ0Y8JhPIwRSFwO9JLlE,5728
  ai_edge_torch/generative/test/test_loader.py,sha256=9mQUeeZKOVApOWSWl2cN9c10axZjMKM1-0Zd823CCS4,3449
  ai_edge_torch/generative/test/test_lora.py,sha256=6QIM6RLTc2HrodGpp_aS3OxM9Rco2KAzEnYgotkg41M,5310
- ai_edge_torch/generative/test/test_model_conversion.py,sha256=jfqkECCX7XKHeBAuDXrkwQJf0vM72eG3LMc5rluha84,6191
+ ai_edge_torch/generative/test/test_model_conversion.py,sha256=jSNJ0Eex6VYCkGn3FXbCOOJ2S3-F_QuwJctu3VycjR4,7200
  ai_edge_torch/generative/test/test_model_conversion_large.py,sha256=-v2Vj7Qdd3GyBn4k7BWVgyGzrbcL30Su3nxZYLtwkCs,14787
  ai_edge_torch/generative/test/test_quantize.py,sha256=bEJMhpQ9bIDUZVBXTW888728FcH-i3SyE4JSZZUgU0A,6071
  ai_edge_torch/generative/test/utils.py,sha256=tF6aCfAGJnc9dmzCnZCEOuKNVimfWOqscv9og0DDLHU,2656
@@ -245,8 +244,8 @@ ai_edge_torch/testing/__init__.py,sha256=_yGgvnBZWb7T3IN3mc4x1sS4vM96HZwM8pwIcPG
  ai_edge_torch/testing/export.py,sha256=k5mGDGzwc23Z4zaIVDs8CNh-oOt64gsf9MS9NjhbPy4,3293
  ai_edge_torch/testing/model_coverage/__init__.py,sha256=5P8J6Zk5YYtDvTBucFvB9NGSRI7Gw_24WnrbhXgycEE,765
  ai_edge_torch/testing/model_coverage/model_coverage.py,sha256=UPB448aMDUyC0HNYVqio2rcJPnDN0tBQMP08J6vPYew,4718
- ai_edge_torch_nightly-0.5.0.dev20250423.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
- ai_edge_torch_nightly-0.5.0.dev20250423.dist-info/METADATA,sha256=PGzcX4WVfFW0wE0TSKLAuRB94iemrNff4L8CL_VUMnQ,2051
- ai_edge_torch_nightly-0.5.0.dev20250423.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- ai_edge_torch_nightly-0.5.0.dev20250423.dist-info/top_level.txt,sha256=5KXRaF2hwkApYxf7Y8y_tVb9aulGTlbOoNdbx1aKRkE,14
- ai_edge_torch_nightly-0.5.0.dev20250423.dist-info/RECORD,,
+ ai_edge_torch_nightly-0.5.0.dev20250424.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+ ai_edge_torch_nightly-0.5.0.dev20250424.dist-info/METADATA,sha256=Gz8c2qvL6qiK7lrd001P55TXltKdycDvDaAq4d4Y-eQ,2051
+ ai_edge_torch_nightly-0.5.0.dev20250424.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ ai_edge_torch_nightly-0.5.0.dev20250424.dist-info/top_level.txt,sha256=5KXRaF2hwkApYxf7Y8y_tVb9aulGTlbOoNdbx1aKRkE,14
+ ai_edge_torch_nightly-0.5.0.dev20250424.dist-info/RECORD,,
@@ -1,231 +0,0 @@
- # Copyright 2024 The AI Edge Torch Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ==============================================================================
-
- """Common building blocks for a GPU-specific Attention layer.
-
- This is a temporary implemenation for the GPU. It is subject to change/removal
- at any time.
- """
-
- from typing import Optional, Tuple, Union
-
- from ai_edge_torch.generative.layers import builder
- from ai_edge_torch.generative.layers import kv_cache as kv_utils
- from ai_edge_torch.generative.layers import lora as lora_utils
- from ai_edge_torch.generative.layers import sdpa_with_kv_update
- import ai_edge_torch.generative.layers.model_config as cfg
- import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
- import torch
- from torch import nn
-
-
- class TransformerBlock(nn.Module):
-
- def __init__(
- self,
- config: cfg.TransformerBlockConfig,
- model_config: cfg.ModelConfig,
- ) -> None:
- """Initialize an instance of the TransformerBlock.
-
- Args:
- config (cfg.TransformerBlockConfig): the configuration object for this
- transformer block.
- model_config (cfg.ModelConfig): the configuration object for the model
- this transformer block belongs to.
- """
- super().__init__()
- self.pre_atten_norm = builder.build_norm(
- model_config.embedding_dim,
- config.pre_attention_norm_config,
- )
- self.atten_func = CausalSelfAttention(
- model_config.embedding_dim,
- config.attn_config,
- model_config.enable_hlfb,
- )
- self.post_atten_norm = builder.build_norm(
- model_config.embedding_dim,
- config.post_attention_norm_config,
- )
- self.ff = builder.build_ff(model_config.embedding_dim, config.ff_config)
- self.config = config
-
- def forward(
- self,
- x: torch.Tensor,
- rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
- mask: Optional[torch.Tensor] = None,
- input_pos: Optional[torch.Tensor] = None,
- kv_cache: kv_utils.KVCacheEntry = None,
- lora: Optional[lora_utils.LoRAEntry] = None,
- ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
- """Forward function of the TransformerBlock.
-
- Args:
- x (torch.Tensor): the input tensor.
- rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
- mask (torch.Tensor): the optional mask tensor.
- input_pos (torch.Tensor): the optional input position tensor.
- kv_cache (KVCacheEntry): the optional kv cache entry.
- lora (LoRAEntry): the optional lora entry.
-
- Returns:
- output activation from this transformer block, and updated kv cache (if
- passed in).
- """
- kv = None
- if self.config.parallel_residual:
- x_norm = self.pre_atten_norm(x)
- atten_func_out = self.atten_func(
- x_norm, rope, mask, input_pos, kv_cache, lora
- )
- if kv_cache is None:
- attn_out = atten_func_out
- else:
- attn_out, kv = atten_func_out
- ff_out = self.ff(x_norm)
- output = x + attn_out + ff_out
- else:
- x_norm = self.pre_atten_norm(x)
- atten_func_out = self.atten_func(
- x_norm, rope, mask, input_pos, kv_cache, lora
- )
- if kv_cache is None:
- attn_out = atten_func_out
- else:
- attn_out, kv = atten_func_out
- x = x + attn_out
- x_norm = self.post_atten_norm(x)
- output = x + self.ff(x_norm)
-
- return output if kv is None else (output, kv)
-
-
- class CausalSelfAttention(nn.Module):
-
- def __init__(
- self,
- dim: int,
- config: cfg.AttentionConfig,
- enable_hlfb: bool,
- ) -> None:
- """Initialize an instance of CausalSelfAttention.
-
- Args:
- dim (int): causal attention's input/output dimmension.
- config (cfg.AttentionConfig): attention specific configurations.
- enable_hlfb (bool): whether hlfb is enabled or not.
- """
- super().__init__()
- self.kv_cache = None
- qkv_shape = (
- config.num_heads + 2 * config.num_query_groups
- ) * config.head_dim
- output_shape = config.num_heads * config.head_dim
- # Key, query, value projections for all heads.
- self.qkv_projection = nn.Linear(dim, qkv_shape, bias=config.qkv_use_bias)
- self.output_projection = nn.Linear(
- output_shape, dim, bias=config.output_proj_use_bias
- )
- self.query_norm = builder.build_norm(
- config.head_dim, config.query_norm_config
- )
- self.key_norm = builder.build_norm(config.head_dim, config.key_norm_config)
- self.config = config
- self.enable_hlfb = enable_hlfb
-
- def forward(
- self,
- x: torch.Tensor,
- rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
- mask: Optional[torch.Tensor] = None,
- input_pos: Optional[torch.Tensor] = None,
- kv_cache: Optional[kv_utils.KVCacheEntry] = None,
- lora: Optional[lora_utils.LoRAEntry] = None,
- ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
- """Forward function of the CausalSelfAttention layer, which can support
-
- MQA, GQA and MHA.
-
- Args:
- x (torch.Tensor): the input tensor.
- rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
- mask (torch.Tensor): the optional mask tensor.
- input_pos (torch.Tensor): the optional input position tensor.
- kv_cache (KVCacheEntry): the KV cache entry corresponding to this module.
- lora (LoRAEntry): the optional lora entry.
-
- Returns:
- output activation from this self attention layer, and the updated
- KV Cach Entry (if passed in).
- """
- # Batch size, sequence length, embedding dimensionality.
- B, T, E = x.size()
-
- qkv = self.qkv_projection(x)
-
- # Assemble into a number of query groups to support MHA, MQA and GQA.
- q_per_kv = self.config.num_heads // self.config.num_query_groups
- # Each group has >=1 queries, 1 key, and 1 value.
- if self.config.qkv_transpose_before_split:
- qkv = qkv.view(B, T, -1, self.config.head_dim)
- q, k, v = qkv.split(
- (
- q_per_kv * self.config.num_query_groups,
- self.config.num_query_groups,
- self.config.num_query_groups,
- ),
- dim=-2,
- )
- else:
- qkv = qkv.view(B, T, self.config.num_query_groups, -1)
- q, k, v = qkv.split(
- (
- q_per_kv * self.config.head_dim,
- self.config.head_dim,
- self.config.head_dim,
- ),
- dim=-1,
- )
-
- if lora is not None:
- q += lora_utils.apply_lora(x, lora.attention.query, shape=q.shape)
- k += lora_utils.apply_lora(x, lora.attention.key, shape=k.shape)
- v += lora_utils.apply_lora(x, lora.attention.value, shape=v.shape)
-
- q = self.query_norm(q)
- k = self.key_norm(k)
-
- q = q.reshape(B, T, -1, self.config.head_dim)
- k = k.reshape(B, T, -1, self.config.head_dim)
- v = v.reshape(B, T, -1, self.config.head_dim)
-
- if rope is not None:
- # Compute rotary positional embedding for query and key.
- n_elem = int(self.config.rotary_percentage * self.config.head_dim)
- cos, sin = rope
- q, k = rotary_pos_emb.apply_rope_inline(q, k, cos, sin)
-
- sdpa_out, kv_cache = sdpa_with_kv_update.sdpa_with_kv_update(
- q, k, v, kv_cache, input_pos, mask, self.config
- )
-
- # Compute the output projection.
- y = self.output_projection(sdpa_out)
- if lora is not None:
- y += lora_utils.apply_lora(sdpa_out, lora.attention.output)
-
- return y if kv_cache is None else (y, kv_cache)