litert-torch-nightly 0.9.0.dev20260202__py3-none-any.whl → 0.9.0.dev20260203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- litert_torch/generative/export_hf/core/attention.py +86 -8
- litert_torch/generative/export_hf/core/attention_test.py +7 -2
- litert_torch/generative/export_hf/core/cache.py +112 -64
- litert_torch/generative/export_hf/core/cache_base.py +19 -2
- litert_torch/generative/export_hf/core/export_lib.py +55 -6
- litert_torch/generative/export_hf/core/exportable_module.py +30 -34
- litert_torch/generative/export_hf/core/exportable_module_config.py +39 -0
- litert_torch/generative/export_hf/core/split_cache/attention.py +28 -5
- litert_torch/generative/export_hf/core/split_cache/cache.py +113 -33
- litert_torch/generative/export_hf/core/split_cache/exportable_module.py +21 -14
- litert_torch/generative/export_hf/export.py +35 -2
- litert_torch/version.py +1 -1
- {litert_torch_nightly-0.9.0.dev20260202.dist-info → litert_torch_nightly-0.9.0.dev20260203.dist-info}/METADATA +1 -1
- {litert_torch_nightly-0.9.0.dev20260202.dist-info → litert_torch_nightly-0.9.0.dev20260203.dist-info}/RECORD +18 -17
- {litert_torch_nightly-0.9.0.dev20260202.dist-info → litert_torch_nightly-0.9.0.dev20260203.dist-info}/WHEEL +0 -0
- {litert_torch_nightly-0.9.0.dev20260202.dist-info → litert_torch_nightly-0.9.0.dev20260203.dist-info}/entry_points.txt +0 -0
- {litert_torch_nightly-0.9.0.dev20260202.dist-info → litert_torch_nightly-0.9.0.dev20260203.dist-info}/licenses/LICENSE +0 -0
- {litert_torch_nightly-0.9.0.dev20260202.dist-info → litert_torch_nightly-0.9.0.dev20260203.dist-info}/top_level.txt +0 -0

litert_torch/generative/export_hf/core/exportable_module.py CHANGED

@@ -15,38 +15,35 @@
 """Exportable modules."""
 
 import abc
-import dataclasses
 from litert_torch.generative.export_hf.core import cache as _
 from litert_torch.generative.export_hf.core import cache_base as kv_cache_lib
+from litert_torch.generative.export_hf.core import exportable_module_config
 from litert_torch.generative.export_hf.core import utils
 import torch
 
 
-
-class ExportableModuleConfig:
-  """Config for exportable modules."""
+ExportableModuleConfig = exportable_module_config.ExportableModuleConfig
 
-  batch_size: int = 1
-  cache_length: int = 1280
-  prefill_lengths: list[int] = dataclasses.field(default_factory=lambda: [128])
-  # For dynamic shape
-  cache_length_dim: torch.export.Dim | None = None
-  prefill_length_dim: torch.export.Dim | None = None
 
-
-
-  externalize_rope: bool = False
-  split_cache: bool = False
+class ExportableModuleBase(torch.nn.Module, abc.ABC):
+  """Base class for exportable modules."""
 
-
+  def __init__(self, export_config: ExportableModuleConfig):
+    super().__init__()
+    self._export_config = export_config
 
+  @property
+  def export_config(self) -> ExportableModuleConfig:
+    return self._export_config
 
-
-
+  def attention_kwargs(self):
+    k_ts_idx = self.export_config.k_ts_idx
+    v_ts_idx = self.export_config.v_ts_idx
+    return {"k_ts_idx": k_ts_idx, "v_ts_idx": v_ts_idx}
 
   @abc.abstractmethod
   def get_sample_inputs(
-      self, model_config
+      self, model_config
   ) -> dict[str, tuple[dict[str, torch.Tensor], dict[str, torch.export.Dim]]]:
     """Returns the sample inputs for the model."""
     ...
@@ -55,8 +52,10 @@ class ExportableModuleBase(torch.nn.Module, abc.ABC):
 class LiteRTExportableModuleForDecoderOnlyLM(ExportableModuleBase):
   """Base class for exportable modules for decoder-only LM."""
 
-  def __init__(
-
+  def __init__(
+      self, model: torch.nn.Module, export_config: ExportableModuleConfig
+  ):
+    super().__init__(export_config)
     self.model = model
 
   def adapt_inputs(
@@ -108,16 +107,13 @@ class LiteRTExportableModuleForDecoderOnlyLM(ExportableModuleBase):
     })
     return ret
 
-  def get_sample_kv_cache(
-      self, model_config, export_config: ExportableModuleConfig
-  ):
+  def get_sample_kv_cache(self, model_config):
     """Returns the input sample KV cache for the model."""
+    export_config = self.export_config
     num_layers = model_config.num_hidden_layers
-    batch_size = export_config.batch_size
-    cache_length = export_config.cache_length
     kv_cache = kv_cache_lib.CACHE_REGISTRY[
         export_config.cache_implementation
-    ].create_from_config(model_config,
+    ].create_from_config(model_config, export_config)
     inputs = {"kv_cache": kv_cache}
     if export_config.cache_length_dim is not None:
       all_k_shapes = tuple(
@@ -150,6 +146,7 @@ class LiteRTExportableModuleForDecoderOnlyLMPrefill(
       mask,
   ):
     inputs = self.adapt_inputs(tokens, None, input_pos, kv_cache, mask)
+    inputs |= self.attention_kwargs()
     output = self.model(**inputs)
     return {"kv_cache": output.past_key_values}
 
@@ -165,11 +162,10 @@ class LiteRTExportableModuleForDecoderOnlyLMPrefill(
     )
     return tokens, tokens_dynamic_shape
 
-  def get_sample_inputs(
-
-  ):
+  def get_sample_inputs(self, model_config):
+    export_config = self.export_config
     kv_cache_inputs, kv_cache_dynamic_shapes = self.get_sample_kv_cache(
-        model_config
+        model_config
     )
     batch_size = export_config.batch_size
     cache_length = export_config.cache_length
@@ -218,6 +214,7 @@ class LiteRTExportableModuleForDecoderOnlyLMGenerate(
       mask,
   ):
     inputs = self.adapt_inputs(tokens, None, input_pos, kv_cache, mask)
+    inputs |= self.attention_kwargs()
     output = self.model(**inputs)
     return {"kv_cache": output.past_key_values, "logits": output.logits}
 
@@ -231,11 +228,10 @@ class LiteRTExportableModuleForDecoderOnlyLMGenerate(
     tokens_dynamic_shape = {"tokens": None} if decode_length_dim else {}
     return tokens, tokens_dynamic_shape
 
-  def get_sample_inputs(
-
-  ):
+  def get_sample_inputs(self, model_config):
+    export_config = self.export_config
     kv_cache_inputs, kv_cache_dynamic_shapes = self.get_sample_kv_cache(
-        model_config
+        model_config
     )
     batch_size = export_config.batch_size
    cache_length = export_config.cache_length
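
The net effect of the exportable_module.py changes is that the export configuration now lives on the module itself, and the cache layout indices are merged into the keyword arguments forwarded to the wrapped HF model via `inputs |= self.attention_kwargs()`. Below is a minimal sketch of that pattern with stand-in classes; `DummyConfig` and `DummyModel` are hypothetical and not part of the package.

    import torch


    class DummyConfig:
      """Stand-in for ExportableModuleConfig; only the fields used here."""

      k_ts_idx = 2
      v_ts_idx = 3


    class DummyModel(torch.nn.Module):
      """Stand-in for a HF decoder whose attention consumes extra kwargs."""

      def forward(self, tokens, **kwargs):
        return {"tokens": tokens, "received": kwargs}


    class Wrapper(torch.nn.Module):
      """Mirrors the base-class pattern: hold the config, merge attention kwargs."""

      def __init__(self, model, export_config):
        super().__init__()
        self.model = model
        self._export_config = export_config

      def attention_kwargs(self):
        cfg = self._export_config
        return {"k_ts_idx": cfg.k_ts_idx, "v_ts_idx": cfg.v_ts_idx}

      def forward(self, tokens):
        inputs = {"tokens": tokens}
        inputs |= self.attention_kwargs()  # same merge as in the diff
        return self.model(**inputs)


    out = Wrapper(DummyModel(), DummyConfig())(torch.zeros(1, 4, dtype=torch.long))
    assert out["received"] == {"k_ts_idx": 2, "v_ts_idx": 3}
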

litert_torch/generative/export_hf/core/exportable_module_config.py ADDED

@@ -0,0 +1,39 @@
+# Copyright 2025 The LiteRT Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Exportable modules."""
+
+import dataclasses
+import torch
+
+
+@dataclasses.dataclass
+class ExportableModuleConfig:
+  """Config for exportable modules."""
+
+  batch_size: int = 1
+  cache_length: int = 1280
+  prefill_lengths: list[int] = dataclasses.field(default_factory=lambda: [128])
+  # For dynamic shape
+  cache_length_dim: torch.export.Dim | None = None
+  prefill_length_dim: torch.export.Dim | None = None
+
+  # Export configs
+  externalize_embedder: bool = False
+  externalize_rope: bool = False
+
+  split_cache: bool = False
+  cache_implementation: str = "LiteRTLMCache"
+  k_ts_idx: int = 2
+  v_ts_idx: int = 3
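
For reference, a split-cache export configuration built from this dataclass might look like the sketch below; the numeric values are illustrative, and only fields defined in the file above are used.

    from litert_torch.generative.export_hf.core import exportable_module_config

    config = exportable_module_config.ExportableModuleConfig(
        batch_size=1,
        cache_length=4096,           # illustrative; the default is 1280
        prefill_lengths=[128, 512],
        externalize_embedder=True,
        externalize_rope=True,
        split_cache=True,
        cache_implementation="LiteRTLMSplitCache",
        k_ts_idx=2,  # sequence axis of the key cache -> (1, BK, S, H)
        v_ts_idx=3,  # sequence axis of the value cache -> (1, BK, H, S)
    )
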

litert_torch/generative/export_hf/core/split_cache/attention.py CHANGED

@@ -17,6 +17,7 @@
 import math
 from typing import Optional
 
+from litert_torch.generative.custom_ops import bmm_4d as bmm_lib
 from litert_torch.generative.export_hf.core.split_cache import cache as kv_cache_lib
 import torch
 import torch.nn.functional as F
@@ -28,6 +29,8 @@ def _scaled_dot_product_attention(
     key_cache: kv_cache_lib.KeyCacheEntry,
     value_cache: kv_cache_lib.ValueCacheEntry,
     head_size: int,
+    k_ts_idx: int,
+    v_ts_idx: int,
     mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,
     softcap: Optional[float] = None,
@@ -40,6 +43,8 @@ def _scaled_dot_product_attention(
     value_cache: A tuple of Value tensor. 1(bk)sh
     head_size (int): head dimension.
     mask (torch.Tensor): the optional mask tensor.
+    k_ts_idx (int): the timestamp index of the key tensor.
+    v_ts_idx (int): the timestamp index of the value tensor.
     scale (float): the optional scale factor.
     softcap (float): the optional softcap for the logits.
 
@@ -60,8 +65,13 @@
   assert mask is not None, "Mask should not be None!"
   t = mask.shape[2]
 
-
-
+  if k_ts_idx == 2:
+    bmm_fn = bmm_lib.bmm_4d
+  else:
+    assert k_ts_idx == 3, "k_ts_idx must be 2 or 3."
+    bmm_fn = lambda x, y: torch.einsum("abth,abhs->abts", x, y)
+  logits0 = bmm_fn(query, key_past)
+  logits1 = bmm_fn(query, key)
   logits = torch.cat([logits0, logits1], dim=-1)
 
   _, bk, gt, s = logits.shape
@@ -76,8 +86,13 @@ def _scaled_dot_product_attention(
   probs = F.softmax(padded_logits, dim=-1).type_as(key)
   probs0, probs1 = probs[..., :-t], probs[..., -t:]
 
-
-
+  if v_ts_idx == 3:
+    bmm_fn = bmm_lib.bmm_4d
+  else:
+    assert v_ts_idx == 2, "v_ts_idx must be 2 or 3."
+    bmm_fn = lambda x, y: torch.einsum("abts,absh->abth", x, y)
+  encoded0 = bmm_fn(probs0, value_past)
+  encoded1 = bmm_fn(probs1, value)
   encoded = encoded0 + encoded1
 
   return encoded  # 1, bk, gt, h
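
The branches above choose a batched matmul that matches the stored cache layout: the custom `bmm_4d` op when the layout already fits it, otherwise an explicit einsum. Below is a small self-contained check of the two einsum contractions used here, with plain `torch.matmul` standing in for `bmm_4d` (treated as a 4-D batched matmul, which is an assumption about that op).

    import torch

    a, b = 1, 8          # leading dims: 1 and B*K in the diff's annotations
    t, h, s = 4, 64, 16  # query time steps, head dim, cached length

    query = torch.randn(a, b, t, h)

    # k_ts_idx == 3: keys stored as (1, BK, H, S); contract over H.
    key_hs = torch.randn(a, b, h, s)
    logits = torch.einsum("abth,abhs->abts", query, key_hs)
    assert torch.allclose(logits, query @ key_hs, atol=1e-5)

    # v_ts_idx == 2: values stored as (1, BK, S, H); contract over S.
    probs = torch.softmax(logits, dim=-1)
    value_sh = torch.randn(a, b, s, h)
    encoded = torch.einsum("abts,absh->abth", probs, value_sh)
    assert torch.allclose(encoded, probs @ value_sh, atol=1e-5)
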
@@ -94,7 +109,6 @@ def split_cache_attention(
     **kwargs,  # You need to accept **kwargs as models will pass other args
 ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
   """ODML transposed attention implementation for NPU."""
-  del kwargs
 
   b, n, seq_len, h = query.shape
   if hasattr(module, "num_key_value_groups"):
@@ -102,6 +116,13 @@ def split_cache_attention(
   else:
     g = 1
   num_query_groups = n // g
+  k_ts_idx: int | None = kwargs.get("k_ts_idx", None)
+  v_ts_idx: int | None = kwargs.get("v_ts_idx", None)
+  if k_ts_idx is None or v_ts_idx is None:
+    raise ValueError(
+        "Timestamp indices not passed to attention module. The model is not"
+        " passing the kwargs correctly."
+    )
   # bnth -> b(kg)th -> 1(bk)(gt)h
   query = query.reshape(1, b * num_query_groups, g * seq_len, h)
 
@@ -113,6 +134,8 @@ def split_cache_attention(
       mask=attention_mask,
       scale=scaling,
       softcap=softcap,
+      k_ts_idx=k_ts_idx,
+      v_ts_idx=v_ts_idx,
   )  # 1, bk, gt, h
   sdpa_out = sdpa_out.reshape(b, -1, seq_len, h).permute(0, 2, 1, 3)
   return sdpa_out, None

litert_torch/generative/export_hf/core/split_cache/cache.py CHANGED

@@ -25,16 +25,26 @@ Shape annotations used here:
 """
 
 from typing import Any, List, Optional, Self, Tuple
-import litert_torch.generative.export_hf.core.cache_base as cache_base_lib
 import jaxtyping as jt
+from litert_torch.generative.export_hf.core import exportable_module_config
+import litert_torch.generative.export_hf.core.cache_base as cache_base_lib
 import torch
 import torch.utils._pytree as pytree
 
+ExportableModuleConfig = exportable_module_config.ExportableModuleConfig
 
-KeyCache =
-
-
-
+KeyCache = (
+    jt.Shaped[torch.Tensor, "1 BK H S"] | jt.Shaped[torch.Tensor, "1 BK S H"]
+)
+KeySlice = (
+    jt.Shaped[torch.Tensor, "1 BK H T"] | jt.Shaped[torch.Tensor, "1 BK T H"]
+)
+ValueCache = (
+    jt.Shaped[torch.Tensor, "1 BK S H"] | jt.Shaped[torch.Tensor, "1 BK H S"]
+)
+ValueSlice = (
+    jt.Shaped[torch.Tensor, "1 BK T H"] | jt.Shaped[torch.Tensor, "1 BK H T"]
+)
 
 KeyCacheEntry = Tuple[KeyCache, KeySlice | None]
 ValueCacheEntry = Tuple[ValueCache, ValueSlice | None]
@@ -51,6 +61,8 @@ class LiteRTLMSplitCacheLayer(cache_base_lib.LiteRTLMCacheLayerMixin):
       key_cache: KeyCacheEntry,
       value_cache: ValueCacheEntry,
       batch_size: int = 1,
+      k_ts_idx: int = 2,
+      v_ts_idx: int = 3,
       **kwargs,
   ):
     super().__init__()
@@ -62,12 +74,16 @@ class LiteRTLMSplitCacheLayer(cache_base_lib.LiteRTLMCacheLayerMixin):
     self.values = value_cache
     self.is_initialized = True
 
+    self.k_ts_idx = k_ts_idx
+    self.v_ts_idx = v_ts_idx
+
     self.k_cache_shape = self.keys[0].shape
     self.v_cache_shape = self.values[0].shape
-    self.max_cache_len = self.k_cache_shape[
+    self.max_cache_len = self.k_cache_shape[self.k_ts_idx]
     self.batch_size = batch_size
-    self.
-
+    self.head_dim = (
+        self.k_cache_shape[2] if self.k_ts_idx == 3 else self.k_cache_shape[3]
+    )
     self.additional_states = kwargs.get("additional_states", None)
 
     self.cumulative_length = 0
@@ -75,6 +91,12 @@ class LiteRTLMSplitCacheLayer(cache_base_lib.LiteRTLMCacheLayerMixin):
   def get_batch_size(self) -> int:
     return self.batch_size
 
+  def get_k_ts_idx(self) -> int:
+    return self.k_ts_idx
+
+  def get_v_ts_idx(self) -> int:
+    return self.v_ts_idx
+
   def lazy_initialization(self, key_states: torch.Tensor):
     # Since we don't support real lazy initialization, this function could only
     # be called by Cache.early_initialization, where uses a standard cache
@@ -97,12 +119,25 @@ class LiteRTLMSplitCacheLayer(cache_base_lib.LiteRTLMCacheLayerMixin):
 
     value_states = value_states.to(self.values[0].dtype)
 
-
-
-
-
-
-
+    if self.k_ts_idx == 2:
+      key_states = key_states.reshape(
+          1, -1, seq_len, self.head_dim
+      )  # 1, bk, s, h
+    else:
+      assert self.k_ts_idx == 3, "k_ts_idx must be 2 or 3."
+      key_states = key_states.permute(0, 1, 3, 2).reshape(
+          1, -1, self.head_dim, seq_len
+      )  # 1, bk, h, s
+
+    if self.v_ts_idx == 2:
+      value_states = value_states.reshape(
+          1, -1, seq_len, self.head_dim
+      )  # 1, bk, s, h
+    else:
+      assert self.v_ts_idx == 3, "v_ts_idx must be 2 or 3."
+      value_states = value_states.permute(0, 1, 3, 2).reshape(
+          1, -1, self.head_dim, seq_len
+      )  # 1, bk, h, s
 
     self.keys = (self.keys[0], key_states)
     self.values = (self.values[0], value_states)
@@ -123,37 +158,68 @@ class LiteRTLMSplitCacheLayer(cache_base_lib.LiteRTLMCacheLayerMixin):
 
   @classmethod
   def _infer_cache_shape_from_config(
-      cls,
+      cls,
+      model_config,
+      layer_index,
+      export_config: ExportableModuleConfig,
+      **kwargs,
   ):
     """Infers the KV cache shape from the model config."""
     del layer_index  # Unused.
+    del kwargs  # Unused.
+    cache_length = export_config.cache_length
+    batch_size = export_config.batch_size
+    k_ts_idx = export_config.k_ts_idx
+    v_ts_idx = export_config.v_ts_idx
     num_kv_heads = model_config.num_key_value_heads
     embed_size_per_head = (
         getattr(model_config, "head_dim", None)
         or model_config.hidden_size // model_config.num_attention_heads
     )
 
-
-
-
-
-
-
-
-
-
-
-
-
+    if k_ts_idx == 2:
+      k_cache_shape = (
+          1,
+          batch_size * num_kv_heads,
+          cache_length,
+          embed_size_per_head,
+      )
+    else:
+      assert k_ts_idx == 3, "k_ts_idx must be 2 or 3."
+      k_cache_shape = (
+          1,
+          batch_size * num_kv_heads,
+          embed_size_per_head,
+          cache_length,
+      )
+    if v_ts_idx == 2:
+      v_cache_shape = (
+          1,
+          batch_size * num_kv_heads,
+          cache_length,
+          embed_size_per_head,
+      )
+    else:
+      assert v_ts_idx == 3, "v_ts_idx must be 2 or 3."
+      v_cache_shape = (
+          1,
+          batch_size * num_kv_heads,
+          embed_size_per_head,
+          cache_length,
+      )
     return k_cache_shape, v_cache_shape
 
   @classmethod
   def create_from_config(
-      cls,
+      cls,
+      model_config,
+      layer_index,
+      export_config: ExportableModuleConfig,
+      **kwargs,
   ) -> Self:
     """Creates a KV cache from the model config."""
     k_cache_shape, v_cache_shape = cls._infer_cache_shape_from_config(
-        model_config, layer_index,
+        model_config, layer_index, export_config, **kwargs
     )
     keys = torch.zeros(k_cache_shape, dtype=torch.float32)
     values = torch.zeros(v_cache_shape, dtype=torch.float32)
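
With the default indices (k_ts_idx=2, v_ts_idx=3), the shape inference above places the sequence axis of the key cache at position 2 and that of the value cache at position 3. Below is a short stand-alone sketch of the same computation; the model parameters are illustrative.

    # Illustrative model parameters: 8 KV heads, head_dim 128, batch 1, 1280 slots.
    batch_size, num_kv_heads, head_dim, cache_length = 1, 8, 128, 1280


    def cache_shape(ts_idx: int) -> tuple[int, int, int, int]:
      """ts_idx is the axis that holds the sequence (timestamp) dimension."""
      if ts_idx == 2:
        return (1, batch_size * num_kv_heads, cache_length, head_dim)
      assert ts_idx == 3, "ts_idx must be 2 or 3."
      return (1, batch_size * num_kv_heads, head_dim, cache_length)


    print(cache_shape(2))  # (1, 8, 1280, 128): key cache with k_ts_idx=2
    print(cache_shape(3))  # (1, 8, 128, 1280): value cache with v_ts_idx=3
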
@@ -165,14 +231,22 @@ class LiteRTLMSplitCache(cache_base_lib.LiteRTLMCacheMixin):
   """Optimized Cache class for HuggingFace integration."""
 
   @classmethod
-  def create_from_config(
+  def create_from_config(
+      cls,
+      model_config,
+      export_config: ExportableModuleConfig,
+      **kwargs,
+  ) -> Self:
     """Creates a KV cache from the model config."""
     num_layers = model_config.num_hidden_layers
     layers = []
     for layer_index in range(num_layers):
       layers.append(
           LiteRTLMSplitCacheLayer.create_from_config(
-              model_config,
+              model_config,
+              layer_index,
+              export_config,
+              **kwargs,
           )
       )
     return cls(layers)
@@ -188,6 +262,8 @@ def _flatten_kvc_t(
   layer_0 = kvc.layers[0]
   assert isinstance(layer_0, cache_base_lib.LiteRTLMCacheLayerMixin)
   batch_size = layer_0.get_batch_size()
+  k_ts_idx = layer_0.get_k_ts_idx()
+  v_ts_idx = layer_0.get_v_ts_idx()
   for i, cache_layer in enumerate(kvc.layers):
     flattened.append(cache_layer.keys[0])
     flat_names.append(f"k_{i}")
@@ -199,16 +275,18 @@ def _flatten_kvc_t(
     flat_names.append(f"k_{i}_slice")
     flattened.append(cache_layer.values[1])
     flat_names.append(f"v_{i}_slice")
-  return flattened, [flat_names, (batch_size, num_layers)]
+  return flattened, [flat_names, (batch_size, num_layers, k_ts_idx, v_ts_idx)]
 
 
 def _unflatten_kvc_t(
     values: List[torch.Tensor],
-    context: Tuple[List[str], Tuple[int, int]],
+    context: Tuple[List[str], Tuple[int, int, int, int]],
 ) -> LiteRTLMSplitCache:
   """Unflattens the KV cache from a list of tensors."""
   flat_names = context[0]
   batch_size = context[1][0]
+  k_ts_idx = context[1][2]
+  v_ts_idx = context[1][3]
   num_layers = context[1][1]
   kv_entries = []
   for i in range(num_layers):
@@ -231,6 +309,8 @@ def _unflatten_kvc_t(
             key_cache=(k_cache, k_cache_update),
             value_cache=(v_cache, v_cache_update),
             batch_size=batch_size,
+            k_ts_idx=k_ts_idx,
+            v_ts_idx=v_ts_idx,
         )
     )
   obj = LiteRTLMSplitCache(kv_entries)
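
The flatten/unflatten pair now carries the two layout indices in the pytree context next to batch size and layer count, so a cache rebuilt from flattened tensors keeps its layout. The round trip reduced to its essentials, using a toy container (hypothetical names, not the package's types):

    from typing import List, Tuple

    import torch


    class ToyCacheLayer:
      """Toy stand-in: one tensor plus the two layout indices."""

      def __init__(self, keys: torch.Tensor, k_ts_idx: int, v_ts_idx: int):
        self.keys = keys
        self.k_ts_idx = k_ts_idx
        self.v_ts_idx = v_ts_idx


    def flatten(layer: ToyCacheLayer) -> Tuple[List[torch.Tensor], Tuple[int, int]]:
      # Tensors go into the flat list; layout indices ride in the static context.
      return [layer.keys], (layer.k_ts_idx, layer.v_ts_idx)


    def unflatten(values: List[torch.Tensor], context: Tuple[int, int]) -> ToyCacheLayer:
      return ToyCacheLayer(values[0], context[0], context[1])


    leaves, ctx = flatten(ToyCacheLayer(torch.zeros(1, 8, 1280, 128), 2, 3))
    rebuilt = unflatten(leaves, ctx)
    assert (rebuilt.k_ts_idx, rebuilt.v_ts_idx) == (2, 3)
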

litert_torch/generative/export_hf/core/split_cache/exportable_module.py CHANGED

@@ -14,12 +14,12 @@
 # ==============================================================================
 """Exportable module for split cache attention models."""
 
+import copy
 from litert_torch.generative.export_hf.core import cache as base_cache_lib
 from litert_torch.generative.export_hf.core import exportable_module as base_exportable_module
 from litert_torch.generative.export_hf.core import utils
 from litert_torch.generative.export_hf.core.split_cache import attention_mask
 from litert_torch.generative.export_hf.core.split_cache import cache as kv_cache_lib
-import numpy as np
 import torch
 from torch import nn
 
@@ -64,9 +64,9 @@ class LiteRTSplitCacheExportableModuleForDecoderOnlyLM(
     ret['inputs_embeds'] = embeddings
 
     ret.update({
-        'position_ids':
+        'position_ids': torch.arange(embeddings.shape[1])[None, :],
         'past_key_values': kv_cache,
-        'cache_position':
+        'cache_position': torch.arange(embeddings.shape[1]),
         'attention_mask': masks,
         # Other common settings
         'use_cache': True,
@@ -164,6 +164,7 @@ class LiteRTSplitCacheExportableModuleForDecoderOnlyLMPrefill(
         mask,
         kv_cache,
     )
+    inputs |= self.attention_kwargs()
     output = self.model(**inputs)
     output_cache = output.past_key_values
     return self.post_process_kv_cache(output_cache)
@@ -171,9 +172,9 @@ class LiteRTSplitCacheExportableModuleForDecoderOnlyLMPrefill(
   def get_sample_inputs(
       self,
       model_config,
-      export_config: base_exportable_module.ExportableModuleConfig,
   ):
-
+    export_config = self.export_config
+    kv_cache_inputs, _ = self.get_sample_kv_cache(model_config)
 
     sample_inputs = {}
     for prefill_length in export_config.prefill_lengths:
@@ -207,6 +208,7 @@ class LiteRTSplitCacheExportableModuleForDecoderOnlyLMGenerate(
         mask,
         kv_cache,
     )
+    inputs |= self.attention_kwargs()
     output = self.model(**inputs)
     output_cache = output.past_key_values
     ret = self.post_process_kv_cache(output_cache)
@@ -216,9 +218,9 @@ class LiteRTSplitCacheExportableModuleForDecoderOnlyLMGenerate(
   def get_sample_inputs(
       self,
       model_config,
-      export_config: base_exportable_module.ExportableModuleConfig,
   ):
-
+    export_config = self.export_config
+    kv_cache_inputs, _ = self.get_sample_kv_cache(model_config)
     sample_inputs = {
         **kv_cache_inputs,
         **self._get_input(
@@ -322,13 +324,20 @@ class CacheUpdate(torch.nn.Module):
     return {'kv_cache': kv_cache}
 
   @classmethod
-  def _get_input(
+  def _get_input(
+      cls,
+      model_config,
+      input_length,
+      export_config: base_exportable_module.ExportableModuleConfig,
+  ):
     """Gets sample inputs for the model."""
     kv_cache = base_cache_lib.LiteRTLMCache.create_from_config(
-        model_config,
+        model_config, export_config
     )
+    slice_export_config = copy.deepcopy(export_config)
+    slice_export_config.cache_length = input_length
     kv_slice = base_cache_lib.LiteRTLMCache.create_from_config(
-        model_config,
+        model_config, slice_export_config
     )
     return {
         'kv_cache': kv_cache,
@@ -348,15 +357,13 @@ class CacheUpdate(torch.nn.Module):
       inputs = cls._get_input(
           model_config,
           prefill_length,
-          export_config
-          export_config.batch_size,
+          export_config,
       )
       sample_inputs[f'prefill_cache_update_{prefill_length}'] = (inputs, {})
     decode_inputs = cls._get_input(
         model_config,
         1,
-        export_config
-        export_config.batch_size,
+        export_config,
     )
     sample_inputs['decode_cache_update'] = (decode_inputs, {})
     return sample_inputs
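
The cache-update sample inputs above reuse the full export config for the persistent cache, and a deep copy with `cache_length` overridden to the slice length for the incoming update. The pattern in isolation, with a dataclass standing in for ExportableModuleConfig:

    import copy
    import dataclasses


    @dataclasses.dataclass
    class ConfigStub:
      """Stand-in carrying only the field that gets overridden."""

      cache_length: int = 1280


    export_config = ConfigStub(cache_length=4096)

    # The persistent cache keeps the full length; the slice spans only the update.
    slice_export_config = copy.deepcopy(export_config)
    slice_export_config.cache_length = 128  # e.g. one prefill chunk

    assert export_config.cache_length == 4096
    assert slice_export_config.cache_length == 128
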

litert_torch/generative/export_hf/export.py CHANGED

@@ -30,7 +30,10 @@ def export(
     cache_length=4096,
     quantization_recipe: str = 'dynamic_wi8_afp32',
     enable_dynamic_shape: bool = False,
-
+    externalize_embedder: bool = False,
+    key_ts_idx: int = 2,
+    value_ts_idx: int = 3,
+    split_cache: bool = False,
     auto_model_override: str | None = None,
     # target_accelerator: str | None = None,
     trust_remote_code: bool = False,
@@ -46,6 +49,8 @@ def export(
       auto_model_override=auto_model_override,
   )
   del config  # Unused.
+  if split_cache and not externalize_embedder:
+    raise ValueError('Split cache requires externalize embedder to be enabled.')
   export_config = exportable_module.ExportableModuleConfig(
       batch_size=1,
       prefill_lengths=prefill_lengths,
@@ -56,17 +61,45 @@ def export(
       cache_length_dim=torch.export.Dim('cache_length')
       if enable_dynamic_shape
       else None,
-      externalize_embedder=
+      externalize_embedder=externalize_embedder,
+      k_ts_idx=key_ts_idx,
+      v_ts_idx=value_ts_idx,
+      split_cache=split_cache,
+      externalize_rope=split_cache,
+      cache_implementation='LiteRTLMSplitCache'
+      if split_cache
+      else 'LiteRTLMCache',
   )
   export_lib.export_text_prefill_decode_model(
       pt_model, text_model_config, export_config, work_dir, quantization_recipe
   )
   gc.collect()
+  if externalize_embedder:
+    export_lib.export_embedder_model(
+        pt_model,
+        text_model_config,
+        export_config,
+        work_dir,
+        quantization_recipe,
+    )
+    gc.collect()
+  if split_cache:
+    export_lib.export_auxiliary_model(
+        pt_model,
+        text_model_config,
+        export_config,
+        work_dir,
+        quantization_recipe,
+    )
+    gc.collect()
   tokenizer_model_path = export_lib.export_tokenizer(tokenizer, work_dir)
   tflite_model_path = os.path.join(
       work_dir,
       'model_quantized.tflite' if quantization_recipe else 'model.tflite',
   )
+  if externalize_embedder or split_cache:
+    # TODO(weiyiw): Add support for packaging models.
+    return
   litert_lm_builder.package_model(
       pt_model,
       tokenizer,
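
Taken together, the new export() parameters gate which artifacts are produced: a split-cache export requires an externalized embedder, switches the cache implementation, also externalizes RoPE, and (per the TODO) skips packaging for now. A minimal sketch of that added control flow; the helper function below is illustrative and not part of the package's API.

    def resolve_export_options(split_cache: bool, externalize_embedder: bool) -> dict:
      """Mirrors the option wiring this release adds to export()."""
      if split_cache and not externalize_embedder:
        raise ValueError('Split cache requires externalize embedder to be enabled.')
      return {
          'cache_implementation': (
              'LiteRTLMSplitCache' if split_cache else 'LiteRTLMCache'
          ),
          'externalize_rope': split_cache,
          # Packaging is skipped when the embedder is external or the cache is split.
          'package_model': not (externalize_embedder or split_cache),
      }


    print(resolve_export_options(split_cache=True, externalize_embedder=True))
    # {'cache_implementation': 'LiteRTLMSplitCache', 'externalize_rope': True,
    #  'package_model': False}
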

litert_torch/version.py CHANGED