ai-edge-torch-nightly 0.6.0.dev20250601__py3-none-any.whl → 0.6.0.dev20250603__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (57)
  1. ai_edge_torch/generative/examples/amd_llama_135m/amd_llama_135m.py +7 -15
  2. ai_edge_torch/generative/examples/amd_llama_135m/convert_to_tflite.py +2 -1
  3. ai_edge_torch/generative/examples/deepseek/convert_to_tflite.py +3 -1
  4. ai_edge_torch/generative/examples/deepseek/deepseek.py +7 -15
  5. ai_edge_torch/generative/examples/gemma/convert_gemma1_to_tflite.py +2 -1
  6. ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +2 -1
  7. ai_edge_torch/generative/examples/gemma/gemma1.py +8 -16
  8. ai_edge_torch/generative/examples/gemma/gemma2.py +24 -24
  9. ai_edge_torch/generative/examples/gemma3/convert_gemma3_to_tflite.py +2 -1
  10. ai_edge_torch/generative/examples/gemma3/decoder.py +34 -35
  11. ai_edge_torch/generative/examples/gemma3/gemma3.py +10 -8
  12. ai_edge_torch/generative/examples/hammer/convert_to_tflite.py +2 -1
  13. ai_edge_torch/generative/examples/hammer/hammer.py +23 -16
  14. ai_edge_torch/generative/examples/llama/convert_to_tflite.py +2 -2
  15. ai_edge_torch/generative/examples/llama/llama.py +13 -26
  16. ai_edge_torch/generative/examples/openelm/convert_to_tflite.py +2 -1
  17. ai_edge_torch/generative/examples/openelm/openelm.py +8 -16
  18. ai_edge_torch/generative/examples/paligemma/convert_to_tflite.py +2 -1
  19. ai_edge_torch/generative/examples/paligemma/decoder.py +12 -17
  20. ai_edge_torch/generative/examples/paligemma/decoder2.py +12 -17
  21. ai_edge_torch/generative/examples/paligemma/paligemma.py +14 -9
  22. ai_edge_torch/generative/examples/phi/convert_phi3_to_tflite.py +2 -1
  23. ai_edge_torch/generative/examples/phi/convert_phi4_to_tflite.py +2 -1
  24. ai_edge_torch/generative/examples/phi/convert_to_tflite.py +2 -1
  25. ai_edge_torch/generative/examples/phi/phi2.py +8 -16
  26. ai_edge_torch/generative/examples/phi/phi3.py +8 -16
  27. ai_edge_torch/generative/examples/phi/phi4.py +8 -16
  28. ai_edge_torch/generative/examples/phi/verify_util.py +1 -3
  29. ai_edge_torch/generative/examples/qwen/convert_to_tflite.py +2 -1
  30. ai_edge_torch/generative/examples/qwen/convert_v3_to_tflite.py +2 -1
  31. ai_edge_torch/generative/examples/qwen/qwen.py +29 -34
  32. ai_edge_torch/generative/examples/qwen/qwen3.py +29 -35
  33. ai_edge_torch/generative/examples/qwen_vl/convert_to_tflite.py +2 -1
  34. ai_edge_torch/generative/examples/qwen_vl/decoder.py +11 -16
  35. ai_edge_torch/generative/examples/qwen_vl/qwen_vl.py +8 -12
  36. ai_edge_torch/generative/examples/smollm/convert_to_tflite.py +2 -2
  37. ai_edge_torch/generative/examples/smollm/convert_v2_to_tflite.py +2 -1
  38. ai_edge_torch/generative/examples/smollm/smollm.py +15 -30
  39. ai_edge_torch/generative/examples/t5/t5.py +23 -23
  40. ai_edge_torch/generative/examples/t5/t5_attention.py +2 -2
  41. ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +2 -1
  42. ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +7 -15
  43. ai_edge_torch/generative/layers/kv_cache.py +13 -1
  44. ai_edge_torch/generative/layers/model_config.py +0 -14
  45. ai_edge_torch/generative/test/test_kv_cache.py +14 -24
  46. ai_edge_torch/generative/test/test_lora.py +4 -21
  47. ai_edge_torch/generative/test/test_model_conversion.py +8 -4
  48. ai_edge_torch/generative/test/test_model_conversion_large.py +27 -19
  49. ai_edge_torch/generative/utilities/converter.py +15 -6
  50. ai_edge_torch/generative/utilities/model_builder.py +16 -6
  51. ai_edge_torch/generative/utilities/verifier.py +16 -6
  52. ai_edge_torch/version.py +1 -1
  53. {ai_edge_torch_nightly-0.6.0.dev20250601.dist-info → ai_edge_torch_nightly-0.6.0.dev20250603.dist-info}/METADATA +1 -1
  54. {ai_edge_torch_nightly-0.6.0.dev20250601.dist-info → ai_edge_torch_nightly-0.6.0.dev20250603.dist-info}/RECORD +57 -57
  55. {ai_edge_torch_nightly-0.6.0.dev20250601.dist-info → ai_edge_torch_nightly-0.6.0.dev20250603.dist-info}/LICENSE +0 -0
  56. {ai_edge_torch_nightly-0.6.0.dev20250601.dist-info → ai_edge_torch_nightly-0.6.0.dev20250603.dist-info}/WHEEL +0 -0
  57. {ai_edge_torch_nightly-0.6.0.dev20250601.dist-info → ai_edge_torch_nightly-0.6.0.dev20250603.dist-info}/top_level.txt +0 -0
@@ -128,7 +128,7 @@ class T5(nn.Module):

  self.enc_attn_mask_cache = (
  torch.zeros(
- (config.kv_cache_max, config.kv_cache_max),
+ (config.max_seq_len, config.max_seq_len),
  dtype=torch.float32,
  device=torch.device("cpu"),
  )
@@ -137,7 +137,7 @@ class T5(nn.Module):
  )

  self.dec_attn_mask_cache = attn_utils.build_causal_mask_cache(
- size=config.kv_cache_max,
+ size=config.max_seq_len,
  dtype=torch.float32,
  device=torch.device("cpu"),
  )
@@ -146,16 +146,16 @@ class T5(nn.Module):
  attn_config = config.block_config(0).attn_config
  self.enc_rel_pos_mask = attn_utils.build_relative_position_buckets(
  bidirectional=True,
- query_length=config.kv_cache_max,
- key_length=config.kv_cache_max,
+ query_length=config.max_seq_len,
+ key_length=config.max_seq_len,
  num_buckets=attn_config.relative_attention_num_buckets,
  max_distance=attn_config.relative_attention_max_distance,
  )

  self.dec_rel_pos_mask = attn_utils.build_relative_position_buckets(
  bidirectional=False,
- query_length=config.kv_cache_max,
- key_length=config.kv_cache_max,
+ query_length=config.max_seq_len,
+ key_length=config.max_seq_len,
  num_buckets=attn_config.relative_attention_num_buckets,
  max_distance=attn_config.relative_attention_max_distance,
  )
@@ -176,20 +176,20 @@ class T5(nn.Module):
  )

  enc_mask = self.enc_attn_mask_cache.index_select(2, input_pos)
- enc_mask = enc_mask[:, :, :, : self.config.kv_cache_max]
+ enc_mask = enc_mask[:, :, :, : self.config.max_seq_len]
  # Mask off any "pad" tokens that shouldn't contribute to self-attention
  enc_mask[:, :, :, :] += pad_mask
  dec_mask = self.dec_attn_mask_cache.index_select(2, decoder_input_pos)
- dec_mask = dec_mask[:, :, :, : self.config.kv_cache_max]
+ dec_mask = dec_mask[:, :, :, : self.config.max_seq_len]
  enc_relative_position = self.enc_rel_pos_mask.index_select(2, input_pos)
  enc_relative_position = enc_relative_position[
- :, :, :, : self.config.kv_cache_max
+ :, :, :, : self.config.max_seq_len
  ]
  dec_relative_position = self.enc_rel_pos_mask.index_select(
  2, decoder_input_pos
  )
  dec_relative_position = dec_relative_position[
- :, :, :, : self.config.kv_cache_max
+ :, :, :, : self.config.max_seq_len
  ]
  enc_attention_mask = self.enc_attn_mask_cache.index_select(
  2, decoder_input_pos
@@ -243,7 +243,7 @@ class T5Encoder(nn.Module):

  self.enc_attn_mask_cache = (
  torch.zeros(
- (config.kv_cache_max, config.kv_cache_max),
+ (config.max_seq_len, config.max_seq_len),
  dtype=torch.float32,
  device=torch.device("cpu"),
  )
@@ -255,8 +255,8 @@ class T5Encoder(nn.Module):
  attn_config = config.block_config(0).attn_config
  self.enc_rel_pos_mask = attn_utils.build_relative_position_buckets(
  bidirectional=True,
- query_length=config.kv_cache_max,
- key_length=config.kv_cache_max,
+ query_length=config.max_seq_len,
+ key_length=config.max_seq_len,
  num_buckets=attn_config.relative_attention_num_buckets,
  max_distance=attn_config.relative_attention_max_distance,
  )
@@ -275,12 +275,12 @@ class T5Encoder(nn.Module):
  )

  enc_mask = self.enc_attn_mask_cache.index_select(2, input_pos)
- enc_mask = enc_mask[:, :, :, : self.config.kv_cache_max]
+ enc_mask = enc_mask[:, :, :, : self.config.max_seq_len]
  # Mask off any "pad" tokens that shouldn't contribute to self-attention
  enc_mask[:, :, :, :] += pad_mask
  enc_relative_position = self.enc_rel_pos_mask.index_select(2, input_pos)
  enc_relative_position = enc_relative_position[
- :, :, :, : self.config.kv_cache_max
+ :, :, :, : self.config.max_seq_len
  ]

  # Convert encoder inputs in embeddings if needed
@@ -315,7 +315,7 @@ class T5Decoder(nn.Module):

  self.enc_attn_mask_cache = (
  torch.zeros(
- (config.kv_cache_max, config.kv_cache_max),
+ (config.max_seq_len, config.max_seq_len),
  dtype=torch.float32,
  device=torch.device("cpu"),
  )
@@ -327,14 +327,14 @@ class T5Decoder(nn.Module):
  attn_config = config.block_config(0).attn_config
  self.enc_rel_pos_mask = attn_utils.build_relative_position_buckets(
  bidirectional=True,
- query_length=config.kv_cache_max,
- key_length=config.kv_cache_max,
+ query_length=config.max_seq_len,
+ key_length=config.max_seq_len,
  num_buckets=attn_config.relative_attention_num_buckets,
  max_distance=attn_config.relative_attention_max_distance,
  )

  self.dec_attn_mask_cache = attn_utils.build_causal_mask_cache(
- size=config.kv_cache_max,
+ size=config.max_seq_len,
  )

  @torch.inference_mode
@@ -346,12 +346,12 @@ class T5Decoder(nn.Module):
  pad_mask: torch.Tensor,
  ) -> torch.Tensor:
  dec_mask = self.dec_attn_mask_cache.index_select(2, decoder_input_pos)
- dec_mask = dec_mask[:, :, :, : self.config.kv_cache_max]
+ dec_mask = dec_mask[:, :, :, : self.config.max_seq_len]
  dec_relative_position = self.enc_rel_pos_mask.index_select(
  2, decoder_input_pos
  )
  dec_relative_position = dec_relative_position[
- :, :, :, : self.config.kv_cache_max
+ :, :, :, : self.config.max_seq_len
  ]
  enc_attention_mask = self.enc_attn_mask_cache.index_select(
  2, decoder_input_pos
@@ -603,7 +603,7 @@ def define_and_run_t5(checkpoint_path: str) -> None:

  decode_d_token = torch.tensor([[0]], dtype=torch.int)
  decode_d_input_pos = torch.tensor([0], dtype=torch.int)
- pad_mask = torch.zeros([model.config.kv_cache_max], dtype=torch.float32)
+ pad_mask = torch.zeros([model.config.max_seq_len], dtype=torch.float32)
  pad_mask[77:] = float("-inf")
  lm_logits = model.forward(
  tokens, input_pos, decode_d_token, decode_d_input_pos, pad_mask
@@ -636,7 +636,7 @@ def define_and_run_t5_split(checkpoint_path: str) -> None:
  decode_d_token = torch.tensor([[0]], dtype=torch.int)
  decode_d_input_pos = torch.tensor([0], dtype=torch.int)
  pad_mask = torch.zeros(
- [t5_encoder_model.config.kv_cache_max], dtype=torch.float32
+ [t5_encoder_model.config.max_seq_len], dtype=torch.float32
  )
  pad_mask[77:] = float("-inf")
  hidden_states = t5_encoder_model.forward(tokens, input_pos, pad_mask)
@@ -53,7 +53,7 @@ class EncoderDecoderBlock(nn.Module):
  model_config.embedding_dim,
  config.attn_config,
  config.pre_attention_norm_config,
- model_config.kv_cache_max,
+ model_config.max_seq_len,
  model_config.enable_hlfb,
  has_relative_attention_bias=has_relative_attention_bias,
  )
@@ -64,7 +64,7 @@ class EncoderDecoderBlock(nn.Module):
  model_config.embedding_dim,
  config.attn_config,
  config.pre_attention_norm_config,
- model_config.kv_cache_max,
+ model_config.max_seq_len,
  model_config.enable_hlfb,
  # Cross Attention does not have relative attention bias.
  has_relative_attention_bias=False,
@@ -31,13 +31,14 @@ def main(_):
  custom_loader=loader.maybe_get_custom_loader(
  checkpoint_path, flags.FLAGS.custom_checkpoint_loader
  ),
- kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
+ mask_cache_size=converter.get_mask_cache_size_from_flags(),
  )
  converter.convert_to_tflite(
  pytorch_model,
  output_path=flags.FLAGS.output_path,
  output_name_prefix=flags.FLAGS.output_name_prefix,
  prefill_seq_len=flags.FLAGS.prefill_seq_lens,
+ kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
  quantize=flags.FLAGS.quantize,
  lora_ranks=flags.FLAGS.lora_ranks,
  export_config=export_config.get_from_flags(),
@@ -29,16 +29,8 @@ class TinyLlama(model_builder.DecoderOnlyModel):
  pass


- def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
- """Returns the model config for a TinyLlama model.
-
- Args:
- kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
- is 1024.
-
- Returns:
- The model config for a TinyLlama model.
- """
+ def get_model_config() -> cfg.ModelConfig:
+ """Returns the model config for a TinyLlama model."""
  attn_config = cfg.AttentionConfig(
  num_heads=32,
  head_dim=64,
@@ -63,7 +55,6 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
  num_layers=22,
  max_seq_len=2048,
  embedding_dim=2048,
- kv_cache_max_len=kv_cache_max_len,
  block_configs=block_config,
  final_norm_config=norm_config,
  lm_head_share_weight_with_embedding=False,
@@ -71,8 +62,8 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
  return config


- def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
- config = get_model_config(**kwargs)
+ def get_fake_model_config() -> cfg.ModelConfig:
+ config = get_model_config()
  config.vocab_size = 128
  config.num_layers = 2
  # TinyLlama has only one block config.
@@ -83,12 +74,13 @@ def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
  def build_model(
  checkpoint_path: str,
  custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
- **kwargs
+ mask_cache_size: int = 0,
  ) -> nn.Module:
  return model_builder.build_decoder_only_model(
  checkpoint_path=checkpoint_path,
- config=get_model_config(**kwargs),
+ config=get_model_config(),
  tensor_names=TENSOR_NAMES,
  model_class=TinyLlama,
  custom_loader=custom_loader,
+ mask_cache_size=mask_cache_size,
  )
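With kv_cache_max_len removed from get_model_config, the example hunks above pass a mask_cache_size when building the PyTorch model and hand kv_cache_max_len to the converter instead. A minimal sketch of the new calling convention using the TinyLlama example; the checkpoint path, output path, and sizes below are placeholders, and the remaining convert_to_tflite arguments are left at their defaults:

from ai_edge_torch.generative.examples.tiny_llama import tiny_llama
from ai_edge_torch.generative.utilities import converter

# Build the PyTorch model; the mask cache size is now a build-time argument
# rather than a field of the model config.
pytorch_model = tiny_llama.build_model(
    "/path/to/tiny_llama_checkpoint",  # placeholder checkpoint path
    mask_cache_size=1280,
)

# The KV cache length is now supplied at conversion time.
converter.convert_to_tflite(
    pytorch_model,
    output_path="/tmp",  # placeholder output directory
    output_name_prefix="tiny_llama",
    prefill_seq_len=256,
    kv_cache_max_len=1280,
)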
@@ -88,6 +88,12 @@ class KVCacheEntry:
  obj = cls(k_cache=k, v_cache=v, kv_layout=kv_layout)
  return obj

+ def get_max_seq_len(self) -> int:
+ """Get the maximum sequence length in the KV cache."""
+ return self.k_cache.size(
+ self.kv_layout[0].dimensions.index(types.TensorDims.SEQUENCE)
+ )
+

  @dataclasses.dataclass
  class KVCache:
@@ -98,6 +104,7 @@ class KVCache:
  @classmethod
  def from_model_config(
  cls,
+ kv_cache_max: int,
  config: model_config.ModelConfig,
  dtype: torch.dtype = torch.float32,
  device: torch.device | None = None,
@@ -107,6 +114,7 @@ class KVCache:
  """Build an instance of the class based on model config.

  Args:
+ kv_cache_max (int): The maximum sequence length in the KV cache.
  config (ModelConfig): Model config used for building the cache.
  dtype (torch.dtype, optional): The data type of the cache tensor.
  Defaults to torch.float32.
@@ -120,7 +128,7 @@ class KVCache:
  """
  caches = [
  KVCacheEntry.from_model_config(
- config.kv_cache_max
+ kv_cache_max
  if not config.block_config(idx).kv_cache_max_len
  else config.block_config(idx).kv_cache_max_len,
  config.block_config(idx).attn_config,
@@ -139,6 +147,10 @@ class KVCache:
  flattened, _ = _flatten_kvc(self)
  return flattened

+ def get_max_seq_len(self) -> int:
+ """Get the maximum sequence length in the KV cache."""
+ return self.caches[0].get_max_seq_len()
+

  def _flatten_kvc(kvc: KVCache) -> Tuple[List[str], List[str]]:
  flattened = []
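The kv_cache.py hunks above make the cache length an explicit kv_cache_max argument of KVCache.from_model_config (it is no longer read from ModelConfig) and add get_max_seq_len() helpers. A minimal usage sketch mirroring the updated tests; the import forms and the tiny config values are illustrative placeholders, not taken from this diff:

from ai_edge_torch.generative.layers import kv_cache as kv_utils
from ai_edge_torch.generative.layers import model_config as cfg

# A small test-style config, following _get_test_config in the test hunks below.
attn_config = cfg.AttentionConfig(num_heads=1, head_dim=8, num_query_groups=1)
block_config = cfg.TransformerBlockConfig(attn_config=attn_config, ff_config=None)
config = cfg.ModelConfig(
    vocab_size=128,
    num_layers=2,
    max_seq_len=16,
    embedding_dim=8,
    block_configs=block_config,
)

# The maximum sequence length is now passed explicitly.
kv = kv_utils.KVCache.from_model_config(kv_cache_max=16, config=config)

# The new helper reports the sequence capacity back from the cache tensors.
assert kv.get_max_seq_len() == 16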
@@ -251,9 +251,6 @@ class ModelConfig:
  # Whether to turn on high-level function boundary.
  enable_hlfb: bool = True

- # The maximum sequence length of the KV cache. Should not exceed max_seq_len.
- kv_cache_max_len: int = 0
-
  # Softcap on the model output logits.
  final_logit_softcap: Optional[float] = None

@@ -261,23 +258,12 @@ class ModelConfig:
  # forward pass. Defaults to a standard implementation.
  build_rope: Callable = rotary_position_embedding.build_rope

- # Whether or not to use a mask cache. Mask cache can speed up inference when
- # statically exporting models. However, it is not supported in the dynamic
- # export.
- use_mask_cache: bool = True
-
  # An interleaved sequence of the attention types used in the model.
  # E.g. [AttentionType.LOCAL_SLIDING, AttentionType.LOCAL_SLIDING,
  # AttentionType.GLOBAL] means that the model has an attention pattern of 2
  # local attentions followed by a global attention in a repeated pattern.
  attention_patterns: Optional[Sequence[AttentionType]] = None

- @property
- def kv_cache_max(self) -> int:
- if self.kv_cache_max_len > 0:
- return self.kv_cache_max_len
- return self.max_seq_len
-
  def block_config(self, idx: int) -> TransformerBlockConfig:
  if isinstance(self.block_configs, TransformerBlockConfig):
  return self.block_configs
@@ -25,9 +25,7 @@ from absl.testing import absltest as googletest

  class TestKVLayers(googletest.TestCase):

- def _get_test_config(
- self, num_layers, head_dim, num_query_groups, kv_cache_max_len
- ):
+ def _get_test_config(self, num_layers, head_dim, num_query_groups):
  attn_config = cfg.AttentionConfig(
  num_heads=1, head_dim=head_dim, num_query_groups=num_query_groups
  )
@@ -35,7 +33,6 @@ class TestKVLayers(googletest.TestCase):
  attn_config=attn_config, ff_config=None
  )
  config = cfg.ModelConfig(
- kv_cache_max_len=kv_cache_max_len,
  embedding_dim=head_dim,
  block_configs=block_config,
  num_layers=num_layers,
@@ -50,12 +47,9 @@ class TestKVLayers(googletest.TestCase):
  NUM_QG = 1
  KV_LEN = 4
  config = self._get_test_config(
- num_layers=N,
- head_dim=HEAD_DIM,
- num_query_groups=NUM_QG,
- kv_cache_max_len=KV_LEN,
+ num_layers=N, head_dim=HEAD_DIM, num_query_groups=NUM_QG
  )
- kv = kv_utils.KVCache.from_model_config(config)
+ kv = kv_utils.KVCache.from_model_config(KV_LEN, config)
  entry = kv.caches[0]
  # single-slice update
  input_pos = torch.tensor([1])
@@ -103,12 +97,9 @@ class TestKVLayers(googletest.TestCase):
  NUM_QG = 1
  KV_LEN = 4
  config = self._get_test_config(
- num_layers=N,
- head_dim=HEAD_DIM,
- num_query_groups=NUM_QG,
- kv_cache_max_len=KV_LEN,
+ num_layers=N, head_dim=HEAD_DIM, num_query_groups=NUM_QG
  )
- kv = kv_utils.KVCache.from_model_config(config)
+ kv = kv_utils.KVCache.from_model_config(KV_LEN, config)
  model = TestModel()
  exported_program = torch.export.export(model, (kv,))
  input_specs = exported_program.graph_signature.input_specs
@@ -119,12 +110,11 @@ class TestKVLayers(googletest.TestCase):
  def test_pytree_roundtrip_kv_cache(self):
  NUM_LAYERS = 4
  config = self._get_test_config(
- num_layers=NUM_LAYERS,
- head_dim=2,
- num_query_groups=1,
- kv_cache_max_len=4,
+ num_layers=NUM_LAYERS, head_dim=2, num_query_groups=1
+ )
+ kv = kv_utils.KVCache.from_model_config(
+ kv_cache_max=4, config=config, batch_size=1
  )
- kv = kv_utils.KVCache.from_model_config(config, batch_size=1)
  flat, treespec = pytree.tree_flatten(kv)
  self.assertLen(flat, NUM_LAYERS * 2)
  kv_unflat = pytree.tree_unflatten(flat, treespec)
@@ -133,13 +123,13 @@ class TestKVLayers(googletest.TestCase):
  def test_pytree_roundtrip_kv_cache_derived(self):
  NUM_LAYERS = 4
  config = self._get_test_config(
- num_layers=NUM_LAYERS,
- head_dim=2,
- num_query_groups=1,
- kv_cache_max_len=4,
+ num_layers=NUM_LAYERS, head_dim=2, num_query_groups=1
  )
  kv = kv_utils.KVCache.from_model_config(
- config, batch_size=1, kv_layout=kv_utils.KV_LAYOUT_TRANSPOSED
+ kv_cache_max=4,
+ config=config,
+ batch_size=1,
+ kv_layout=kv_utils.KV_LAYOUT_TRANSPOSED,
  )
  flat, treespec = pytree.tree_flatten(kv)
  self.assertLen(flat, NUM_LAYERS * 2)
@@ -58,12 +58,7 @@ class TestLora(googletest.TestCase):
  safetensors_file = resource_loader.get_path_to_datafile(
  "fixtures/test_lora_rank16.safetensors"
  )
- config = self._get_test_config(
- num_layers=1,
- head_dim=8,
- num_query_groups=1,
- kv_cache_max_len=16,
- )
+ config = self._get_test_config(num_layers=1, head_dim=8, num_query_groups=1)
  lora = lora_utils.LoRA.from_safetensors(
  safetensors_file,
  scale=1.0,
@@ -84,12 +79,8 @@ class TestLora(googletest.TestCase):
  n = 1
  head_dim = 2
  num_query_groups = 1
- key_length = 4
  config = self._get_test_config(
- num_layers=n,
- head_dim=head_dim,
- num_query_groups=num_query_groups,
- kv_cache_max_len=key_length,
+ num_layers=n, head_dim=head_dim, num_query_groups=num_query_groups
  )
  inputs = torch.zeros((n, 1, head_dim))
  lora = lora_utils.LoRA.zeros(rank=16, config=config)
@@ -111,20 +102,13 @@ class TestLora(googletest.TestCase):

  def test_lora_tflite_serialization(self):
  """Tests the serialization of the LoRA module."""
- config = self._get_test_config(
- num_layers=2,
- head_dim=8,
- num_query_groups=1,
- kv_cache_max_len=16,
- )
+ config = self._get_test_config(num_layers=2, head_dim=8, num_query_groups=1)
  lora = lora_utils.LoRA.random(rank=16, config=config)
  flatbuffer_model = lora.to_tflite()
  recovered_lora = lora_utils.LoRA.from_flatbuffers(flatbuffer_model)
  self.assertEqual(lora, recovered_lora)

- def _get_test_config(
- self, num_layers, head_dim, num_query_groups, kv_cache_max_len
- ):
+ def _get_test_config(self, num_layers, head_dim, num_query_groups):
  """Returns a test model config."""
  attn_config = cfg.AttentionConfig(
  num_heads=1, head_dim=head_dim, num_query_groups=num_query_groups
@@ -133,7 +117,6 @@ class TestLora(googletest.TestCase):
  attn_config=attn_config, ff_config=None
  )
  config = cfg.ModelConfig(
- kv_cache_max_len=kv_cache_max_len,
  embedding_dim=head_dim,
  block_configs=block_config,
  num_layers=num_layers,
@@ -47,7 +47,9 @@ class TestModelConversion(googletest.TestCase):
  tokens, input_pos = torch.tensor([[1]], dtype=torch.int), torch.tensor(
  [10], dtype=torch.int
  )
- kv = kv_cache.KVCache.from_model_config(config, kv_layout=kv_layout)
+ kv = kv_cache.KVCache.from_model_config(
+ kv_cache_max=config.max_seq_len, config=config, kv_layout=kv_layout
+ )
  kwargs = {
  "tokens": tokens,
  "input_pos": input_pos,
@@ -122,7 +124,9 @@ class TestModelConversion(googletest.TestCase):
  decode_token = torch.tensor([[1]], dtype=torch.int)
  decode_input_pos = torch.tensor([5], dtype=torch.int)

- kv = kv_cache.KVCache.from_model_config(config, kv_layout=kv_layout)
+ kv = kv_cache.KVCache.from_model_config(
+ kv_cache_max=128, config=config, kv_layout=kv_layout
+ )

  edge_model = (
  ai_edge_torch.signature(
@@ -177,12 +181,12 @@ class TestModelConversion(googletest.TestCase):

  def test_tiny_llama_multisig(self):
  config = tiny_llama.get_fake_model_config()
- pytorch_model = tiny_llama.TinyLlama(config).eval()
+ pytorch_model = tiny_llama.TinyLlama(config, mask_cache_size=128).eval()
  self._test_multisig_model(config, pytorch_model, atol=1e-5, rtol=1e-5)

  def test_tiny_llama_multisig_kv_layout_transposed(self):
  config = tiny_llama.get_fake_model_config()
- pytorch_model = tiny_llama.TinyLlama(config).eval()
+ pytorch_model = tiny_llama.TinyLlama(config, mask_cache_size=128).eval()
  self._test_multisig_model(
  config,
  pytorch_model,
@@ -55,6 +55,7 @@ class TestModelConversion(googletest.TestCase):
  experimental_default_delegate_latest_features=True,
  )
  )
+ self._kv_cache_max = 128
  # Default cache_size_limit, 8 is hit and aborts often when the tests are
  # running all together. Doubles it to avoid abortion.
  torch._dynamo.config.cache_size_limit = 16
@@ -64,7 +65,7 @@ class TestModelConversion(googletest.TestCase):
  seq_len = 10
  tokens = torch.zeros((1, seq_len), dtype=torch.int, device="cpu")
  input_pos = torch.arange(0, seq_len, dtype=torch.int)
- kv = kv_cache.KVCache.from_model_config(config)
+ kv = kv_cache.KVCache.from_model_config(self._kv_cache_max, config)

  edge_model = ai_edge_torch.signature(
  signature_name,
@@ -95,74 +96,77 @@ class TestModelConversion(googletest.TestCase):

  def test_gemma1(self):
  config = gemma1.get_fake_model_config()
- pytorch_model = gemma1.Gemma1(config).eval()
+ pytorch_model = gemma1.Gemma1(config, self._kv_cache_max).eval()
  self._test_model(config, pytorch_model, "prefill", atol=1e-3, rtol=1e-5)

  def test_gemma2(self):
  config = gemma2.get_fake_model_config()
- pytorch_model = gemma2.Gemma2(config).eval()
+ pytorch_model = gemma2.Gemma2(config, self._kv_cache_max).eval()
  self._test_model(config, pytorch_model, "prefill", atol=1e-4, rtol=1e-5)

  def test_llama(self):
  config = llama.get_fake_model_config()
- pytorch_model = llama.Llama(config).eval()
+ pytorch_model = llama.Llama(config, self._kv_cache_max).eval()
  self._test_model(config, pytorch_model, "prefill", atol=1e-3, rtol=1e-5)

  def test_phi2(self):
  config = phi2.get_fake_model_config()
- pytorch_model = phi2.Phi2(config).eval()
+ pytorch_model = phi2.Phi2(config, self._kv_cache_max).eval()
  # Phi-2 logits are very big, so we need a larger absolute tolerance.
  self._test_model(config, pytorch_model, "prefill", atol=1e-3, rtol=1e-5)

  def test_phi3(self):
  config = phi3.get_fake_model_config()
- pytorch_model = phi3.Phi3_5Mini(config).eval()
+ pytorch_model = phi3.Phi3_5Mini(config, self._kv_cache_max).eval()
  self._test_model(config, pytorch_model, "prefill", atol=1e-5, rtol=1e-5)

  def test_phi4(self):
  config = phi4.get_fake_model_config()
- pytorch_model = phi4.Phi4Mini(config).eval()
+ pytorch_model = phi4.Phi4Mini(config, self._kv_cache_max).eval()
  self._test_model(config, pytorch_model, "prefill", atol=1e-3, rtol=1e-5)

  def test_smollm(self):
  config = smollm.get_fake_model_config()
- pytorch_model = smollm.SmolLM(config).eval()
+ pytorch_model = smollm.SmolLM(config, self._kv_cache_max).eval()
  self._test_model(config, pytorch_model, "prefill", atol=1e-4, rtol=1e-5)

  def test_smollm2(self):
  config = smollm.get_fake_model_config_v2()
- pytorch_model = smollm.SmolLM2(config).eval()
+ pytorch_model = smollm.SmolLM2(config, self._kv_cache_max).eval()
  self._test_model(config, pytorch_model, "prefill", atol=1e-4, rtol=1e-5)

  def test_openelm(self):
  config = openelm.get_fake_model_config()
- pytorch_model = openelm.OpenELM(config).eval()
+ pytorch_model = openelm.OpenELM(config, self._kv_cache_max).eval()
  self._test_model(config, pytorch_model, "prefill", atol=1e-4, rtol=1e-5)

  def test_qwen(self):
  config = qwen.get_fake_model_config()
- pytorch_model = qwen.Qwen(config).eval()
+ pytorch_model = qwen.Qwen(config, self._kv_cache_max).eval()
  self._test_model(config, pytorch_model, "prefill", atol=1e-3, rtol=1e-5)

  def test_deepseek(self):
  config = deepseek.get_fake_model_config()
- pytorch_model = deepseek.DeepSeekDistillQwen(config).eval()
+ pytorch_model = deepseek.DeepSeekDistillQwen(
+ config, self._kv_cache_max
+ ).eval()
  self._test_model(config, pytorch_model, "prefill", atol=1e-5, rtol=1e-5)

  def test_hammer(self):
  config = hammer.get_fake_model_config()
- pytorch_model = hammer.Hammer(config).eval()
+ pytorch_model = hammer.Hammer(config, self._kv_cache_max).eval()
  self._test_model(config, pytorch_model, "prefill", atol=1e-5, rtol=1e-5)

-
  def test_amd_llama_135m(self):
  config = amd_llama_135m.get_fake_model_config()
- pytorch_model = amd_llama_135m.AmdLlama(config).eval()
+ pytorch_model = amd_llama_135m.AmdLlama(config, self._kv_cache_max).eval()
  self._test_model(config, pytorch_model, "prefill", atol=1e-5, rtol=1e-5)

  def _test_paligemma_model(self, decoder_class, decoder_config, atol, rtol):
  config = paligemma.get_fake_model_config(decoder_config)
- pytorch_model = paligemma.PaliGemma(config, decoder_class).eval()
+ pytorch_model = paligemma.PaliGemma(
+ config, decoder_class, mask_cache_size=self._kv_cache_max
+ ).eval()

  image_config = config.image_encoder_config.image_embedding
  num_patches = (image_config.image_size // image_config.patch_size) ** 2
@@ -171,7 +175,9 @@ class TestModelConversion(googletest.TestCase):
  seq_len = num_patches + 10
  tokens = torch.zeros((1, seq_len), dtype=torch.int)
  input_pos = torch.arange(0, seq_len, dtype=torch.int)
- kv = kv_cache.KVCache.from_model_config(config.decoder_config)
+ kv = kv_cache.KVCache.from_model_config(
+ self._kv_cache_max, config.decoder_config
+ )
  pixel_values = torch.zeros((1, 3, 8, 8), dtype=torch.float32)

  edge_model = ai_edge_torch.signature(
@@ -218,7 +224,7 @@ class TestModelConversion(googletest.TestCase):

  def test_qwen_vl_model(self):
  config = qwen_vl.get_fake_model_config()
- pytorch_model = qwen_vl.QwenVL(config).eval()
+ pytorch_model = qwen_vl.QwenVL(config, self._kv_cache_max).eval()

  grid_thw = pytorch_model.image_encoder.get_grid_thw()
  pixel_values_size = pytorch_model.image_encoder.get_pixel_values_size(
@@ -229,7 +235,9 @@ class TestModelConversion(googletest.TestCase):
  seq_len = pixel_values_size[0] + 10
  tokens = torch.zeros((1, seq_len), dtype=torch.int)
  input_pos = torch.arange(0, seq_len, dtype=torch.int)
- kv = kv_cache.KVCache.from_model_config(config.decoder_config)
+ kv = kv_cache.KVCache.from_model_config(
+ self._kv_cache_max, config.decoder_config
+ )
  pixel_values = torch.zeros(pixel_values_size, dtype=torch.float32)

  edge_model = ai_edge_torch.signature(