litert-torch-nightly 0.9.0.dev20260202-py3-none-any.whl → 0.9.0.dev20260203-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- litert_torch/generative/export_hf/core/attention.py +86 -8
- litert_torch/generative/export_hf/core/attention_test.py +7 -2
- litert_torch/generative/export_hf/core/cache.py +112 -64
- litert_torch/generative/export_hf/core/cache_base.py +19 -2
- litert_torch/generative/export_hf/core/export_lib.py +55 -6
- litert_torch/generative/export_hf/core/exportable_module.py +30 -34
- litert_torch/generative/export_hf/core/exportable_module_config.py +39 -0
- litert_torch/generative/export_hf/core/split_cache/attention.py +28 -5
- litert_torch/generative/export_hf/core/split_cache/cache.py +113 -33
- litert_torch/generative/export_hf/core/split_cache/exportable_module.py +21 -14
- litert_torch/generative/export_hf/export.py +35 -2
- litert_torch/version.py +1 -1
- {litert_torch_nightly-0.9.0.dev20260202.dist-info → litert_torch_nightly-0.9.0.dev20260203.dist-info}/METADATA +1 -1
- {litert_torch_nightly-0.9.0.dev20260202.dist-info → litert_torch_nightly-0.9.0.dev20260203.dist-info}/RECORD +18 -17
- {litert_torch_nightly-0.9.0.dev20260202.dist-info → litert_torch_nightly-0.9.0.dev20260203.dist-info}/WHEEL +0 -0
- {litert_torch_nightly-0.9.0.dev20260202.dist-info → litert_torch_nightly-0.9.0.dev20260203.dist-info}/entry_points.txt +0 -0
- {litert_torch_nightly-0.9.0.dev20260202.dist-info → litert_torch_nightly-0.9.0.dev20260203.dist-info}/licenses/LICENSE +0 -0
- {litert_torch_nightly-0.9.0.dev20260202.dist-info → litert_torch_nightly-0.9.0.dev20260203.dist-info}/top_level.txt +0 -0
litert_torch/generative/export_hf/core/attention.py

```diff
@@ -14,13 +14,83 @@
 # ==============================================================================
 """Optimized Attention layer for HuggingFace integration."""
 
-
-from
+import math
+from typing import Optional
 import jaxtyping as jt
+from litert_torch.generative.custom_ops import bmm_4d as bmm_lib
 import torch
+import torch.nn.functional as F
 import transformers
 
 
+def scaled_dot_product_attention_transposed(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    head_size: int,
+    k_ts_idx: int,
+    v_ts_idx: int,
+    mask: Optional[torch.Tensor] = None,
+    scale: Optional[float] = None,
+    softcap: Optional[float] = None,
+    alibi_bias: Optional[torch.Tensor] = None,
+):
+  """Scaled dot product attention with transposed key and value.
+
+  Args:
+    query: Query tensor, with shape [B, T, N, H].
+    key: Key tensor, with shape [B, T, KV_LEN, H].
+    value: Value tensor, with shape [B, T, H, KV_LEN].
+    head_size (int): head dimension.
+    mask (torch.Tensor): the optional mask tensor.
+    scale (float): the optional scale factor.
+    softcap (float): the optional softcap for the logits.
+    alibi_bias (torch.Tensor): optional alibi bias tensor.
+
+  Returns:
+    The output tensor of scaled_dot_product_attention_transposed.
+  """
+  if scale is None:
+    scale = 1.0 / math.sqrt(head_size)
+
+  if alibi_bias is not None:
+    alibi_bias = alibi_bias * scale
+    if mask is None:
+      mask = alibi_bias
+    else:
+      mask = mask + alibi_bias
+
+  query = query * scale
+
+  assert mask is not None, "Mask should not be None!"
+  t = mask.shape[2]
+  if k_ts_idx == 2:
+    bmm_fn = bmm_lib.bmm_4d
+  else:
+    assert k_ts_idx == 3, "k_ts_idx must be 2 or 3."
+    bmm_fn = lambda x, y: torch.einsum("abth,abhs->abts", x, y)
+  logits = bmm_fn(query, key)
+
+  _, bk, gt, s = logits.shape
+  g = gt // t
+  logits = logits.reshape((bk, g, t, s))
+  if softcap is not None:
+    logits = torch.tanh(logits / softcap)
+    logits = logits * softcap
+
+  padded_logits = logits + mask
+  padded_logits = padded_logits.reshape(1, bk, gt, s)
+  probs = F.softmax(padded_logits, dim=-1).type_as(key)
+  if v_ts_idx == 3:
+    bmm_fn = bmm_lib.bmm_4d
+  else:
+    assert v_ts_idx == 2, "v_ts_idx must be 2 or 3."
+    bmm_fn = lambda x, y: torch.einsum("abts,absh->abth", x, y)
+  encoded = bmm_fn(probs, value)
+
+  return encoded  # 1, bk, gt, h
+
+
 def transposed_attention(
     module: torch.nn.Module,
     query: jt.Float[torch.Tensor, "b n t h"],
```
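For orientation, here is a minimal pure-torch sketch, with made-up sizes, of the tensor layouts the new helper expects on its einsum fallback path, i.e. `k_ts_idx=3` (key stored as "1 BK H S") and `v_ts_idx=2` (value stored as "1 BK S H"). It is not the library code: scaling is folded into the logits, and mask/softcap handling is omitted.

```python
import torch

# Illustrative sizes: BK = batch * kv_heads, GT = query_groups * tokens,
# H = head dim, S = cache length.
bk, gt, h, s = 2, 8, 16, 32

query = torch.randn(1, bk, gt, h)   # query already reshaped to 1, BK, GT, H
key = torch.randn(1, bk, h, s)      # k_ts_idx == 3: layout "1 BK H S"
value = torch.randn(1, bk, s, h)    # v_ts_idx == 2: layout "1 BK S H"

# Fallback path from the diff: plain einsum instead of the bmm_4d custom op.
logits = torch.einsum("abth,abhs->abts", query, key)      # 1, BK, GT, S
probs = torch.softmax(logits / h**0.5, dim=-1)
encoded = torch.einsum("abts,absh->abth", probs, value)   # 1, BK, GT, H
print(encoded.shape)
```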
```diff
@@ -46,20 +116,28 @@ def transposed_attention(
   Returns:
     The attention output tensor.
   """
-  del kwargs  # Unused in this implementation but required by the interface.
 
   b, n, seq_len, h = query.shape
   g = getattr(module, "num_key_value_groups", 1)
   num_query_groups = n // g
   # bnth -> b(kg)th -> 1(bk)(gt)h
   query = query.reshape(1, b * num_query_groups, g * seq_len, h)
+  key_ts_idx: int | None = kwargs.get("k_ts_idx", None)
+  value_ts_idx: int | None = kwargs.get("v_ts_idx", None)
+  if key_ts_idx is None or value_ts_idx is None:
+    raise ValueError(
+        "Timestamp indices not passed to attention module. The model is not"
+        " passing the kwargs correctly."
+    )
 
   # 1, bk, gt, h
-  sdpa_out =
-      query,
-      key,
-      value,
-      h,
+  sdpa_out = scaled_dot_product_attention_transposed(
+      query=query,
+      key=key,
+      value=value,
+      head_size=h,
+      k_ts_idx=key_ts_idx,
+      v_ts_idx=value_ts_idx,
       mask=attention_mask,
       scale=scaling,
       softcap=softcap,
```
litert_torch/generative/export_hf/core/attention_test.py

```diff
@@ -71,7 +71,7 @@ class DummyAttentionModule(torch.nn.Module):
     self.scaling = scaling
     self.softcap = softcap
 
-  def forward(self, query, key, value, attention_mask):
+  def forward(self, query, key, value, attention_mask, **kwargs):
     attention_interface = modeling_utils.ALL_ATTENTION_FUNCTIONS[
         self.attention_implementation
     ]
```
```diff
@@ -84,6 +84,7 @@ class DummyAttentionModule(torch.nn.Module):
         attention_mask,
         scaling=self.scaling,
         softcap=self.softcap,
+        **kwargs,
     )[0]
 
 
```
```diff
@@ -139,8 +140,12 @@ class AttentionTest(parameterized.TestCase):
         scaling=scl,
         softcap=scp,
     )
+    attention_kwargs = {
+        'k_ts_idx': 2,
+        'v_ts_idx': 3,
+    }
     expected = attn(query, key, value, mask)
-    actual = test_attn(query, key, value, mask)
+    actual = test_attn(query, key, value, mask, **attention_kwargs)
     self.assertTrue(
         torch.allclose(
             expected, actual, rtol=1e-2, atol=1e-2, equal_nan=True
```
litert_torch/generative/export_hf/core/cache.py

```diff
@@ -25,18 +25,30 @@ Shape annotations used here:
 """
 
 from typing import Any, List, Optional, Tuple
+
+import jaxtyping as jt
 import litert_torch.generative.custom_ops.dynamic_update_slice as tfl_dus
+from litert_torch.generative.export_hf.core import exportable_module_config
 import litert_torch.generative.export_hf.core.cache_base as cache_base_lib
-import jaxtyping as jt
 import torch
 import torch.utils._pytree as pytree
 
+ExportableModuleConfig = exportable_module_config.ExportableModuleConfig
+
 
 # Shape annotations for the cache entries.
-KeyCache =
-
-
-
+KeyCache = (
+    jt.Shaped[torch.Tensor, "1 BK S H"] | jt.Shaped[torch.Tensor, "1 BK H S"]
+)
+KeySlice = (
+    jt.Shaped[torch.Tensor, "1 BK T H"] | jt.Shaped[torch.Tensor, "1 BK H T"]
+)
+ValueCache = (
+    jt.Shaped[torch.Tensor, "1 BK H S"] | jt.Shaped[torch.Tensor, "1 BK S H"]
+)
+ValueSlice = (
+    jt.Shaped[torch.Tensor, "1 BK H T"] | jt.Shaped[torch.Tensor, "1 BK T H"]
+)
 
 
 def _get_slice_indices(
```
```diff
@@ -77,15 +89,11 @@ def _update_kv_impl(
     k_slice: KeySlice,
     v_slice: ValueSlice,
     cache_position: jt.Int32[torch.Tensor, "T"],
-
+    k_ts_idx: int,
+    v_ts_idx: int,
 ):
   """Updates the cache buffer using tfl.dynamic_update_slice."""
   cache_dim = 4
-  k_ts_idx = 2  # K Cache shape is 1 BK S H
-  v_ts_idx = 3  # V Cache shape is 1 BK H S
-  if reverse_kv:
-    k_ts_idx = 3  # K Cache shape is 1 BK H S
-    v_ts_idx = 2  # V Cache shape is 1 BK S H
   positions = cache_position[0]  # The position of the first input token.
   k_slice_indices = _get_slice_indices(positions.clone(), cache_dim, k_ts_idx)
   v_slice_indices = _get_slice_indices(positions.clone(), cache_dim, v_ts_idx)
```
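As a point of reference, the slice write that `_update_kv_impl` performs through the `tfl.dynamic_update_slice` custom op can be emulated in plain torch as below. This is only an illustration of how `k_ts_idx`/`v_ts_idx` select the axis that `cache_position` indexes into, not the actual op, and all sizes are made up.

```python
import torch

bk, s, h, t = 2, 16, 8, 3           # illustrative: cache length S, slice length T
k_cache = torch.zeros(1, bk, s, h)  # k_ts_idx == 2: layout "1 BK S H"
v_cache = torch.zeros(1, bk, h, s)  # v_ts_idx == 3: layout "1 BK H S"
k_slice = torch.randn(1, bk, t, h)
v_slice = torch.randn(1, bk, h, t)
cache_position = torch.arange(4, 4 + t)  # positions of the incoming tokens

pos = int(cache_position[0])  # start offset along the sequence axis
# Equivalent of a dynamic update slice with start indices (0, 0, pos, 0) for K...
k_cache[:, :, pos:pos + t, :] = k_slice
# ...and (0, 0, 0, pos) for V, whose sequence axis is axis 3.
v_cache[:, :, :, pos:pos + t] = v_slice
```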
```diff
@@ -109,27 +117,26 @@ class LiteRTLMCacheLayer(cache_base_lib.LiteRTLMCacheLayerMixin):
       key_cache: KeyCache,
       value_cache: ValueCache,
       batch_size: int = 1,
-
+      k_ts_idx: int = 2,
+      v_ts_idx: int = 3,
       **kwargs,
   ):
     super().__init__()
     self.keys = key_cache
     self.values = value_cache
-    self.
+    self.k_ts_idx = k_ts_idx  # The index of the sequence dimension in K cache.
+    self.v_ts_idx = v_ts_idx  # The index of the sequence dimension in V cache.
+    assert k_ts_idx in [2, 3]
+    assert v_ts_idx in [2, 3]
    self.is_initialized = True
 
     self.k_cache_shape = self.keys.shape
     self.v_cache_shape = self.values.shape
-    self.max_cache_len =
-        self.v_cache_shape[2] if reverse_kv else self.k_cache_shape[2]
-    )
+    self.max_cache_len = self.v_cache_shape[self.v_ts_idx]
     self.batch_size = batch_size
-    self.
-
-
-    self.head_dim = (
-        self.v_cache_shape[3] if reverse_kv else self.k_cache_shape[3]
-    )
+    v_head_dim_idx = 3 if self.v_ts_idx == 2 else 2
+    self.head_dim = self.v_cache_shape[v_head_dim_idx]
+
     self.additional_states = kwargs.get("additional_states", None)
 
     self.cumulative_length = 0
```
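A small sketch of constructing a layer with a transposed key cache under the new constructor. The import path `...export_hf.core.cache` is assumed from the file being diffed, and the sizes are illustrative only.

```python
import torch
from litert_torch.generative.export_hf.core import cache as cache_lib

bk, s, h = 4, 128, 64  # illustrative: batch * kv_heads, cache length, head dim
layer = cache_lib.LiteRTLMCacheLayer(
    key_cache=torch.zeros(1, bk, h, s),    # sequence axis last -> k_ts_idx=3
    value_cache=torch.zeros(1, bk, s, h),  # sequence axis at 2 -> v_ts_idx=2
    batch_size=1,
    k_ts_idx=3,
    v_ts_idx=2,
)
# max_cache_len comes from the value cache's sequence axis, head_dim from the other axis.
assert layer.max_cache_len == s and layer.head_dim == h
```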
```diff
@@ -137,6 +144,12 @@ class LiteRTLMCacheLayer(cache_base_lib.LiteRTLMCacheLayerMixin):
   def get_batch_size(self) -> int:
     return self.batch_size
 
+  def get_k_ts_idx(self) -> int:
+    return self.k_ts_idx
+
+  def get_v_ts_idx(self) -> int:
+    return self.v_ts_idx
+
   def lazy_initialization(self, key_states: torch.Tensor):
     # Since we don't support real lazy initialization, this function could only
     # be called by Cache.early_initialization, where uses a standard cache
```
```diff
@@ -162,13 +175,24 @@ class LiteRTLMCacheLayer(cache_base_lib.LiteRTLMCacheLayerMixin):
     value_states = value_states.to(self.values.dtype)
 
     if not cache_kwargs.get("kv_slice_preprocessed", False):
-
-
-
-
-
-
-
+      if self.k_ts_idx == 3:
+        key_target_shape = (1, -1, self.head_dim, seq_len)
+        key_states = key_states.permute(0, 1, 3, 2).reshape(*key_target_shape)
+      elif self.k_ts_idx == 2:
+        key_target_shape = (1, -1, seq_len, self.head_dim)
+        key_states = key_states.reshape(*key_target_shape)
+      else:
+        raise ValueError(f"Unsupported k_ts_idx: {self.k_ts_idx}")
+      if self.v_ts_idx == 3:
+        value_target_shape = (1, -1, self.head_dim, seq_len)
+        value_states = value_states.permute(0, 1, 3, 2).reshape(
+            *value_target_shape
+        )
+      elif self.v_ts_idx == 2:
+        value_target_shape = (1, -1, seq_len, self.head_dim)
+        value_states = value_states.reshape(*value_target_shape)
+      else:
+        raise ValueError(f"Unsupported v_ts_idx: {self.v_ts_idx}")
 
     cache_position: jt.Int32[torch.Tensor, "T"] = cache_kwargs.get(
         "cache_position"
```
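The permute/reshape above is what folds the usual `[B, N, T, H]` key/value states into the flattened single-batch layouts. A toy sketch of the `k_ts_idx == 3` branch, with illustrative sizes:

```python
import torch

b, n, t, h = 1, 4, 3, 8        # batch, kv heads, new tokens, head dim
key_states = torch.randn(b, n, t, h)

# k_ts_idx == 3: target layout is "1 BK H T" (sequence axis last).
key_slice = key_states.permute(0, 1, 3, 2).reshape(1, -1, h, t)
print(key_slice.shape)  # torch.Size([1, 4, 8, 3]) -- B folded into the BK axis
```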
```diff
@@ -182,7 +206,8 @@ class LiteRTLMCacheLayer(cache_base_lib.LiteRTLMCacheLayerMixin):
         key_states,
         value_states,
         cache_position,
-        self.
+        self.k_ts_idx,
+        self.v_ts_idx,
     )
     return self.keys, self.values
 
```
```diff
@@ -203,32 +228,52 @@ class LiteRTLMCacheLayer(cache_base_lib.LiteRTLMCacheLayerMixin):
       cls,
       model_config,
       layer_index,
-
-      batch_size=1,
-      reverse_kv=False,
+      export_config: ExportableModuleConfig,
   ):
     """Infers the KV cache shape from the model config."""
     del layer_index  # Unused.
+    cache_length = export_config.cache_length
+    batch_size = export_config.batch_size
+    k_ts_idx = export_config.k_ts_idx
+    v_ts_idx = export_config.v_ts_idx
     num_kv_heads = model_config.num_key_value_heads
     embed_size_per_head = (
         getattr(model_config, "head_dim", None)
         or model_config.hidden_size // model_config.num_attention_heads
     )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if k_ts_idx == 2:
+      k_cache_shape = (
+          1,
+          batch_size * num_kv_heads,
+          cache_length,
+          embed_size_per_head,
+      )
+    elif k_ts_idx == 3:
+      k_cache_shape = (
+          1,
+          batch_size * num_kv_heads,
+          embed_size_per_head,
+          cache_length,
+      )
+    else:
+      raise ValueError(f"Unsupported k_ts_idx: {k_ts_idx}")
+    if v_ts_idx == 2:
+      v_cache_shape = (
+          1,
+          batch_size * num_kv_heads,
+          cache_length,
+          embed_size_per_head,
+      )
+    elif v_ts_idx == 3:
+      v_cache_shape = (
+          1,
+          batch_size * num_kv_heads,
+          embed_size_per_head,
+          cache_length,
+      )
+    else:
+      raise ValueError(f"Unsupported v_ts_idx: {v_ts_idx}")
     return k_cache_shape, v_cache_shape
 
   @classmethod
```
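For concreteness, the shape inference above works out as follows for made-up config values (illustrative only, not taken from any real model):

```python
# Illustrative numbers only.
batch_size, cache_length = 1, 1280
num_kv_heads, embed_size_per_head = 4, 256
k_ts_idx, v_ts_idx = 2, 3  # K laid out as "1 BK S H", V as "1 BK H S"

k_cache_shape = (1, batch_size * num_kv_heads, cache_length, embed_size_per_head)
v_cache_shape = (1, batch_size * num_kv_heads, embed_size_per_head, cache_length)
print(k_cache_shape)  # (1, 4, 1280, 256)
print(v_cache_shape)  # (1, 4, 256, 1280)
```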
```diff
@@ -236,18 +281,22 @@ class LiteRTLMCacheLayer(cache_base_lib.LiteRTLMCacheLayerMixin):
       cls,
       model_config,
       layer_index,
-
-      batch_size=1,
-      reverse_kv=False,
+      export_config: ExportableModuleConfig,
       **kwargs,
   ) -> "LiteRTLMCacheLayer":
     """Creates a KV cache from the model config."""
     k_cache_shape, v_cache_shape = cls._infer_cache_shape_from_config(
-        model_config, layer_index,
+        model_config, layer_index, export_config
     )
     keys = torch.zeros(k_cache_shape, dtype=torch.float32)
     values = torch.zeros(v_cache_shape, dtype=torch.float32)
-    return cls(
+    return cls(
+        keys,
+        values,
+        k_ts_idx=export_config.k_ts_idx,
+        v_ts_idx=export_config.v_ts_idx,
+        **kwargs,
+    )
 
 
 @cache_base_lib.register_cache_implementation
```
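The per-layer factory is now driven entirely by the new `ExportableModuleConfig` (defined in `exportable_module_config.py`, which this diff adds but does not show). A hedged sketch, assuming the config exposes `cache_length`, `batch_size`, `k_ts_idx` and `v_ts_idx` as constructor arguments, and using a stand-in model config with only the attributes the shape inference reads:

```python
import types
from litert_torch.generative.export_hf.core import cache as cache_lib
from litert_torch.generative.export_hf.core import exportable_module_config

# Stand-in model config; a real transformers config object would also work.
model_config = types.SimpleNamespace(
    num_key_value_heads=4, head_dim=256, hidden_size=2048, num_attention_heads=8
)
# Assumed constructor surface; only the four fields read in this diff are shown.
export_config = exportable_module_config.ExportableModuleConfig(
    cache_length=1280, batch_size=1, k_ts_idx=2, v_ts_idx=3
)
layer = cache_lib.LiteRTLMCacheLayer.create_from_config(
    model_config, layer_index=0, export_config=export_config
)
print(layer.k_cache_shape, layer.v_cache_shape)
```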
```diff
@@ -258,9 +307,7 @@ class LiteRTLMCache(cache_base_lib.LiteRTLMCacheMixin):
   def create_from_config(
       cls,
       model_config,
-
-      batch_size=1,
-      reverse_kv=False,
+      export_config: ExportableModuleConfig,
       **kwargs,
   ) -> "LiteRTLMCache":
     """Creates a KV cache from the model config."""
```
```diff
@@ -271,9 +318,8 @@ class LiteRTLMCache(cache_base_lib.LiteRTLMCacheMixin):
         LiteRTLMCacheLayer.create_from_config(
             model_config,
             layer_index,
-
-
-            reverse_kv=reverse_kv,
+            export_config,
+            **kwargs,
         )
     )
     return cls(layers)
```
```diff
@@ -281,7 +327,7 @@ class LiteRTLMCache(cache_base_lib.LiteRTLMCacheMixin):
 
 def _flatten_kvc_t(
     kvc: LiteRTLMCache,
-) -> Tuple[List[torch.Tensor], Tuple[List[str], Tuple[int, int,
+) -> Tuple[List[torch.Tensor], Tuple[List[str], Tuple[int, int, int, int]]]:
   """Flattens the cache into a list of tensors."""
   flattened = []
   flat_names = []
```
```diff
@@ -289,22 +335,23 @@ def _flatten_kvc_t(
   layer_0 = kvc.layers[0]
   assert isinstance(layer_0, cache_base_lib.LiteRTLMCacheLayerMixin)
   batch_size = layer_0.get_batch_size()
-
+  k_ts_idx = layer_0.get_k_ts_idx()
+  v_ts_idx = layer_0.get_v_ts_idx()
   for i, layer in enumerate(kvc.layers):
     flattened.append(layer.keys)
     flat_names.append(f"k_{i}")
     flattened.append(layer.values)
     flat_names.append(f"v_{i}")
-  return flattened, (flat_names, (batch_size, num_layers,
+  return flattened, (flat_names, (batch_size, num_layers, k_ts_idx, v_ts_idx))
 
 
 def _unflatten_kvc_t(
     values: List[torch.Tensor],
-    context: Tuple[List[str], Tuple[int, int,
+    context: Tuple[List[str], Tuple[int, int, int, int]],
 ) -> LiteRTLMCache:
   """Unflattens the cache from a list of tensors."""
   flat_names = context[0]
-  batch_size, num_layers,
+  batch_size, num_layers, k_ts_idx, v_ts_idx = context[1]
   layers = []
   for i in range(num_layers):
     k_cache_idx = flat_names.index(f"k_{i}")
```
```diff
@@ -314,7 +361,8 @@ def _unflatten_kvc_t(
             key_cache=values[k_cache_idx],
             value_cache=values[v_cache_idx],
             batch_size=batch_size,
-
+            k_ts_idx=k_ts_idx,
+            v_ts_idx=v_ts_idx,
         )
     )
   obj = LiteRTLMCache(layers)
```
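Because the pytree context now records the layout indices, a cache that is flattened for export and rebuilt on the other side keeps its K/V layouts. A hedged round-trip sketch using the two module-private helpers (their registration with `torch.utils._pytree` is outside this diff, and the import path is assumed):

```python
import torch
from litert_torch.generative.export_hf.core import cache as cache_lib

layer = cache_lib.LiteRTLMCacheLayer(
    key_cache=torch.zeros(1, 4, 128, 64),    # "1 BK S H" -> default k_ts_idx=2
    value_cache=torch.zeros(1, 4, 64, 128),  # "1 BK H S" -> default v_ts_idx=3
)
kv_cache = cache_lib.LiteRTLMCache([layer])

# Module-private helpers, used here only to show what the new
# (batch_size, num_layers, k_ts_idx, v_ts_idx) context carries.
flat, context = cache_lib._flatten_kvc_t(kv_cache)
_, (batch_size, num_layers, k_ts_idx, v_ts_idx) = context
rebuilt = cache_lib._unflatten_kvc_t(flat, context)
assert rebuilt.layers[0].get_k_ts_idx() == k_ts_idx == 2
assert rebuilt.layers[0].get_v_ts_idx() == v_ts_idx == 3
```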
litert_torch/generative/export_hf/core/cache_base.py

```diff
@@ -15,8 +15,11 @@
 """Base class for cache."""
 
 import abc
+from litert_torch.generative.export_hf.core import exportable_module_config
 from transformers import cache_utils
 
+ExportableModuleConfig = exportable_module_config.ExportableModuleConfig
+
 
 class LiteRTLMCacheLayerMixin(cache_utils.CacheLayerMixin, abc.ABC):
   """Optimized Cache layer class mixin for HuggingFace integration."""
```
```diff
@@ -26,10 +29,24 @@ class LiteRTLMCacheLayerMixin(cache_utils.CacheLayerMixin, abc.ABC):
     """Returns the batch size of the cache."""
     ...
 
+  @abc.abstractmethod
+  def get_k_ts_idx(self) -> int:
+    """Returns the index of the sequence dimension in K cache."""
+    ...
+
+  @abc.abstractmethod
+  def get_v_ts_idx(self) -> int:
+    """Returns the index of the sequence dimension in V cache."""
+    ...
+
   @classmethod
   @abc.abstractmethod
   def create_from_config(
-      cls,
+      cls,
+      model_config,
+      layer_index,
+      export_config: ExportableModuleConfig,
+      **kwargs
   ) -> "LiteRTLMCacheLayerMixin":
     ...
 
```
```diff
@@ -40,7 +57,7 @@ class LiteRTLMCacheMixin(cache_utils.Cache, abc.ABC):
   @classmethod
   @abc.abstractmethod
   def create_from_config(
-      cls, model_config,
+      cls, model_config, export_config: ExportableModuleConfig, **kwargs
   ) -> "LiteRTLMCacheMixin":
     """Creates a KV cache from the model config."""
     ...
```
litert_torch/generative/export_hf/core/export_lib.py

```diff
@@ -26,6 +26,7 @@ from litert_torch.generative.export_hf.core import exportable_module
 from litert_torch.generative.export_hf.core import patches as _
 from litert_torch.generative.export_hf.core import utils
 from litert_torch.generative.export_hf.core.external_emb import exportable_module as external_emb_module
+from litert_torch.generative.export_hf.core.external_rope import exportable_module as external_rope_module
 from litert_torch.generative.export_hf.core.external_rope import preprocess_model as external_rope_preprocess_model
 from litert_torch.generative.export_hf.core.mu import mu_pass_lib
 from litert_torch.generative.export_hf.core.split_cache import attention as _
```
```diff
@@ -34,6 +35,7 @@ from litert_torch.generative.tools import tokenizer_to_sentencepiece_lib as toke
 from litert_torch.odml_torch.experimental import torch_tfl
 import torch
 import transformers
+
 from ai_edge_quantizer import quantizer as quantizer_lib
 from ai_edge_quantizer import recipe as recipe_lib
 
```
```diff
@@ -174,12 +176,10 @@ def export_text_prefill_decode_model(
   prefill_module_cls, decode_module_cls = get_prefill_decode_exportable_cls(
       export_config
   )
-  prefill_module = prefill_module_cls(model)
-  decode_module = decode_module_cls(model)
+  prefill_module = prefill_module_cls(model, export_config)
+  decode_module = decode_module_cls(model, export_config)
   converter = converter_utils.Converter()
-  sample_prefill_inputs = prefill_module.get_sample_inputs(
-      text_model_config, export_config
-  )
+  sample_prefill_inputs = prefill_module.get_sample_inputs(text_model_config)
   for signature_name, (
       sample_prefill_inputs,
       prefill_dynamic_shapes,
```
```diff
@@ -213,7 +213,7 @@ def export_text_prefill_decode_model(
       sample_kwargs=sample_prefill_inputs,
   )
   sample_decode_inputs, decode_dynamic_shapes = decode_module.get_sample_inputs(
-      text_model_config
+      text_model_config
   )['decode']
   if has_dynamic_shape:
     print('Exporting decode_module...')
```
```diff
@@ -337,6 +337,55 @@ def export_embedder_model(
   return model_path
 
 
+def export_auxiliary_model(
+    model,
+    text_model_config,
+    export_config: exportable_module.ExportableModuleConfig,
+    work_dir: str,
+    quantization_recipe: str | None = None,
+):
+  """Exports auxiliary model."""
+  del quantization_recipe  # Unused.
+  converter = converter_utils.Converter()
+  # RoPE
+  rope_module = external_rope_module.RoPEEmbedder(model)
+  sample_inputs = rope_module.get_sample_inputs(
+      text_model_config, export_config
+  )
+  for signature_name, (sample_input, _) in sample_inputs.items():
+    converter.add_signature(
+        signature_name,
+        rope_module.eval(),
+        sample_kwargs=sample_input,
+    )
+  # Attention Mask
+  attention_mask_module = split_cache_module.SplitAttentionMaskBuilder(model)
+  sample_inputs = attention_mask_module.get_sample_inputs(
+      text_model_config, export_config
+  )
+  for signature_name, (sample_input, _) in sample_inputs.items():
+    converter.add_signature(
+        signature_name,
+        attention_mask_module.eval(),
+        sample_kwargs=sample_input,
+    )
+  # Cache Update
+  cache_update_module = split_cache_module.CacheUpdate(model)
+  sample_inputs = cache_update_module.get_sample_inputs(
+      text_model_config, export_config
+  )
+  for signature_name, (sample_input, _) in sample_inputs.items():
+    converter.add_signature(
+        signature_name,
+        cache_update_module.eval(),
+        sample_kwargs=sample_input,
+    )
+  lrt_model = converter.convert(strict_export=False)
+  model_path = os.path.join(work_dir, 'auxiliary.tflite')
+  lrt_model.export(model_path)
+  return model_path
+
+
 def export_tokenizer(
     tokenizer,
     work_dir: str,
```
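Finally, a hedged sketch of calling the new entry point. Here `model`, `text_model_config` and `export_config` stand for the same objects the existing text-model export path already constructs; they are not built in this snippet, and only the keyword names shown in the diff are relied on.

```python
from litert_torch.generative.export_hf.core import export_lib

# `model`, `text_model_config`, and `export_config` are placeholders for the
# objects passed to export_text_prefill_decode_model() elsewhere in export.py.
aux_path = export_lib.export_auxiliary_model(
    model=model,
    text_model_config=text_model_config,
    export_config=export_config,
    work_dir="/tmp/litert_export",
)
print(aux_path)  # .../auxiliary.tflite
```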
|