ai-edge-torch-nightly 0.6.0.dev20250602__py3-none-any.whl → 0.6.0.dev20250603__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (57)
  1. ai_edge_torch/generative/examples/amd_llama_135m/amd_llama_135m.py +7 -15
  2. ai_edge_torch/generative/examples/amd_llama_135m/convert_to_tflite.py +2 -1
  3. ai_edge_torch/generative/examples/deepseek/convert_to_tflite.py +3 -1
  4. ai_edge_torch/generative/examples/deepseek/deepseek.py +7 -15
  5. ai_edge_torch/generative/examples/gemma/convert_gemma1_to_tflite.py +2 -1
  6. ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +2 -1
  7. ai_edge_torch/generative/examples/gemma/gemma1.py +8 -16
  8. ai_edge_torch/generative/examples/gemma/gemma2.py +24 -24
  9. ai_edge_torch/generative/examples/gemma3/convert_gemma3_to_tflite.py +2 -1
  10. ai_edge_torch/generative/examples/gemma3/decoder.py +34 -35
  11. ai_edge_torch/generative/examples/gemma3/gemma3.py +10 -8
  12. ai_edge_torch/generative/examples/hammer/convert_to_tflite.py +2 -1
  13. ai_edge_torch/generative/examples/hammer/hammer.py +23 -16
  14. ai_edge_torch/generative/examples/llama/convert_to_tflite.py +2 -2
  15. ai_edge_torch/generative/examples/llama/llama.py +13 -26
  16. ai_edge_torch/generative/examples/openelm/convert_to_tflite.py +2 -1
  17. ai_edge_torch/generative/examples/openelm/openelm.py +8 -16
  18. ai_edge_torch/generative/examples/paligemma/convert_to_tflite.py +2 -1
  19. ai_edge_torch/generative/examples/paligemma/decoder.py +12 -17
  20. ai_edge_torch/generative/examples/paligemma/decoder2.py +12 -17
  21. ai_edge_torch/generative/examples/paligemma/paligemma.py +14 -9
  22. ai_edge_torch/generative/examples/phi/convert_phi3_to_tflite.py +2 -1
  23. ai_edge_torch/generative/examples/phi/convert_phi4_to_tflite.py +2 -1
  24. ai_edge_torch/generative/examples/phi/convert_to_tflite.py +2 -1
  25. ai_edge_torch/generative/examples/phi/phi2.py +8 -16
  26. ai_edge_torch/generative/examples/phi/phi3.py +8 -16
  27. ai_edge_torch/generative/examples/phi/phi4.py +8 -16
  28. ai_edge_torch/generative/examples/phi/verify_util.py +1 -3
  29. ai_edge_torch/generative/examples/qwen/convert_to_tflite.py +2 -1
  30. ai_edge_torch/generative/examples/qwen/convert_v3_to_tflite.py +2 -1
  31. ai_edge_torch/generative/examples/qwen/qwen.py +29 -34
  32. ai_edge_torch/generative/examples/qwen/qwen3.py +29 -35
  33. ai_edge_torch/generative/examples/qwen_vl/convert_to_tflite.py +2 -1
  34. ai_edge_torch/generative/examples/qwen_vl/decoder.py +11 -16
  35. ai_edge_torch/generative/examples/qwen_vl/qwen_vl.py +8 -12
  36. ai_edge_torch/generative/examples/smollm/convert_to_tflite.py +2 -2
  37. ai_edge_torch/generative/examples/smollm/convert_v2_to_tflite.py +2 -1
  38. ai_edge_torch/generative/examples/smollm/smollm.py +15 -30
  39. ai_edge_torch/generative/examples/t5/t5.py +23 -23
  40. ai_edge_torch/generative/examples/t5/t5_attention.py +2 -2
  41. ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +2 -1
  42. ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +7 -15
  43. ai_edge_torch/generative/layers/kv_cache.py +13 -1
  44. ai_edge_torch/generative/layers/model_config.py +0 -14
  45. ai_edge_torch/generative/test/test_kv_cache.py +14 -24
  46. ai_edge_torch/generative/test/test_lora.py +4 -21
  47. ai_edge_torch/generative/test/test_model_conversion.py +8 -4
  48. ai_edge_torch/generative/test/test_model_conversion_large.py +27 -19
  49. ai_edge_torch/generative/utilities/converter.py +15 -6
  50. ai_edge_torch/generative/utilities/model_builder.py +16 -6
  51. ai_edge_torch/generative/utilities/verifier.py +16 -6
  52. ai_edge_torch/version.py +1 -1
  53. {ai_edge_torch_nightly-0.6.0.dev20250602.dist-info → ai_edge_torch_nightly-0.6.0.dev20250603.dist-info}/METADATA +1 -1
  54. {ai_edge_torch_nightly-0.6.0.dev20250602.dist-info → ai_edge_torch_nightly-0.6.0.dev20250603.dist-info}/RECORD +57 -57
  55. {ai_edge_torch_nightly-0.6.0.dev20250602.dist-info → ai_edge_torch_nightly-0.6.0.dev20250603.dist-info}/LICENSE +0 -0
  56. {ai_edge_torch_nightly-0.6.0.dev20250602.dist-info → ai_edge_torch_nightly-0.6.0.dev20250603.dist-info}/WHEEL +0 -0
  57. {ai_edge_torch_nightly-0.6.0.dev20250602.dist-info → ai_edge_torch_nightly-0.6.0.dev20250603.dist-info}/top_level.txt +0 -0
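
The recurring change across these example files: kv_cache_max_len is no longer a model-config or build-time argument; build_model() now takes mask_cache_size (wired from converter.get_mask_cache_size_from_flags() in the conversion scripts), and the KV cache length is passed to converter.convert_to_tflite() at export time. Below is a minimal sketch of a direct, non-flag-driven call under the new API; the checkpoint and output paths and the 1280 sizes are placeholders, and the remaining keyword arguments (quantize, lora_ranks, export_config) are assumed to keep their defaults.

# Illustrative sketch only, not taken verbatim from the package; paths and
# sizes are placeholders.
from ai_edge_torch.generative.examples.amd_llama_135m import amd_llama_135m
from ai_edge_torch.generative.utilities import converter

# build_model() now takes mask_cache_size instead of kv_cache_max_len.
pytorch_model = amd_llama_135m.build_model(
    checkpoint_path="/tmp/amd_llama_135m",  # placeholder checkpoint directory
    mask_cache_size=1280,  # size of the prebuilt attention mask cache
)

# The KV cache length is now an export-time argument of convert_to_tflite().
converter.convert_to_tflite(
    pytorch_model,
    output_path="/tmp/",  # placeholder output directory
    output_name_prefix="amd_llama_135m",
    prefill_seq_len=[256, 1024],
    kv_cache_max_len=1280,
)
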
ai_edge_torch/generative/examples/amd_llama_135m/amd_llama_135m.py
@@ -29,16 +29,8 @@ class AmdLlama(model_builder.DecoderOnlyModel):
   pass
 
 
-def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
-  """Returns the model config for an AMD-Llama-135m model.
-
-  Args:
-    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
-      is 1024.
-
-  Returns:
-    The model config for an AMD-Llama-135m model.
-  """
+def get_model_config() -> cfg.ModelConfig:
+  """Returns the model config for an AMD-Llama-135m model."""
   attn_config = cfg.AttentionConfig(
       num_heads=12,
       head_dim=64,
@@ -63,7 +55,6 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       num_layers=12,
       max_seq_len=2048,
       embedding_dim=768,
-      kv_cache_max_len=kv_cache_max_len,
       block_configs=block_config,
       final_norm_config=norm_config,
       lm_head_share_weight_with_embedding=False,
@@ -71,8 +62,8 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
   return config
 
 
-def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
-  config = get_model_config(**kwargs)
+def get_fake_model_config() -> cfg.ModelConfig:
+  config = get_model_config()
   config.vocab_size = 128
   config.num_layers = 2
   config.block_config(0).ff_config.intermediate_size = 64
@@ -82,12 +73,13 @@ def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
 def build_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] | None = None,
-    **kwargs
+    mask_cache_size: int = 0,
 ) -> nn.Module:
   return model_builder.build_decoder_only_model(
       checkpoint_path=checkpoint_path,
-      config=get_model_config(**kwargs),
+      config=get_model_config(),
       tensor_names=TENSOR_NAMES,
       model_class=AmdLlama,
       custom_loader=custom_loader,
+      mask_cache_size=mask_cache_size,
   )
ai_edge_torch/generative/examples/amd_llama_135m/convert_to_tflite.py
@@ -31,13 +31,14 @@ def main(_):
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
+      mask_cache_size=converter.get_mask_cache_size_from_flags(),
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
ai_edge_torch/generative/examples/deepseek/convert_to_tflite.py
@@ -23,6 +23,7 @@ from ai_edge_torch.generative.utilities import loader
 
 flags = converter.define_conversion_flags('deepseek')
 
+
 def main(_):
   checkpoint_path = flags.FLAGS.checkpoint_path
   pytorch_model = deepseek.build_model(
@@ -30,13 +31,14 @@ def main(_):
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
+      mask_cache_size=converter.get_mask_cache_size_from_flags(),
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
ai_edge_torch/generative/examples/deepseek/deepseek.py
@@ -29,16 +29,8 @@ class DeepSeekDistillQwen(model_builder.DecoderOnlyModel):
   pass
 
 
-def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
-  """Returns the model config for a Qwen 2.5 3B model.
-
-  Args:
-    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
-      is 1024.
-
-  Returns:
-    The model config for a SmolLM model.
-  """
+def get_model_config() -> cfg.ModelConfig:
+  """Returns the model config for a Qwen 2.5 3B model."""
   attn_config = cfg.AttentionConfig(
       num_heads=12,
       head_dim=128,
@@ -66,7 +58,6 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       num_layers=28,
       max_seq_len=4096,
       embedding_dim=1536,
-      kv_cache_max_len=kv_cache_max_len,
       block_configs=block_config,
       final_norm_config=norm_config,
       lm_head_share_weight_with_embedding=False,
@@ -74,8 +65,8 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
   return config
 
 
-def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
-  config = get_model_config(**kwargs)
+def get_fake_model_config() -> cfg.ModelConfig:
+  config = get_model_config()
   config.vocab_size = 128
   config.num_layers = 2
   # DeepSeek-R1-Distill-Qwen has only one block config.
@@ -86,12 +77,13 @@ def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
 def build_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    **kwargs
+    mask_cache_size: int = 0,
 ) -> nn.Module:
   return model_builder.build_decoder_only_model(
       checkpoint_path=checkpoint_path,
-      config=get_model_config(**kwargs),
+      config=get_model_config(),
       tensor_names=TENSOR_NAMES,
       model_class=DeepSeekDistillQwen,
       custom_loader=custom_loader,
+      mask_cache_size=mask_cache_size,
   )
ai_edge_torch/generative/examples/gemma/convert_gemma1_to_tflite.py
@@ -31,13 +31,14 @@ def main(_):
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
+      mask_cache_size=converter.get_mask_cache_size_from_flags(),
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py
@@ -33,13 +33,14 @@ def main(_):
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
+      mask_cache_size=converter.get_mask_cache_size_from_flags(),
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
ai_edge_torch/generative/examples/gemma/gemma1.py
@@ -42,16 +42,8 @@ class Gemma1(model_builder.DecoderOnlyModel):
   pass
 
 
-def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
-  """Returns the model config for a Gemma 2B model.
-
-  Args:
-    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
-      is 1024.
-
-  Returns:
-    The model config for a Gemma 2B model.
-  """
+def get_model_config_2b() -> cfg.ModelConfig:
+  """Returns the model config for a Gemma 2B model."""
   attn_config = cfg.AttentionConfig(
       num_heads=8,
       head_dim=256,
@@ -80,7 +72,6 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       max_seq_len=8192,
       embedding_dim=embedding_dim,
       embedding_scale=embedding_dim**0.5,
-      kv_cache_max_len=kv_cache_max_len,
       block_configs=block_config,
       final_norm_config=norm_config,
       lm_head_use_bias=False,
@@ -88,25 +79,26 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
   return config
 
 
-def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
-  config = get_model_config_2b(kv_cache_max_len)
+def get_fake_model_config() -> cfg.ModelConfig:
+  config = get_model_config_2b()
   # Gemma has only one block config.
   config.block_config(0).ff_config.intermediate_size = 128
   config.vocab_size = 128
   config.num_layers = 2
-  config.max_seq_len = 2 * kv_cache_max_len
+  config.max_seq_len = 256
   return config
 
 
 def build_2b_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    **kwargs
+    mask_cache_size: int = 0,
 ) -> nn.Module:
   return model_builder.build_decoder_only_model(
       checkpoint_path=checkpoint_path,
-      config=get_model_config_2b(**kwargs),
+      config=get_model_config_2b(),
       tensor_names=TENSOR_NAMES,
       model_class=Gemma1,
       custom_loader=custom_loader,
+      mask_cache_size=mask_cache_size,
   )
ai_edge_torch/generative/examples/gemma/gemma2.py
@@ -104,7 +104,7 @@ class Gemma2Block(attention.TransformerBlock):
 class Gemma2(nn.Module):
   """A Gemma2 model built from the Edge Generative API layers."""
 
-  def __init__(self, config: cfg.ModelConfig):
+  def __init__(self, config: cfg.ModelConfig, mask_cache_size: int = 0):
     super().__init__()
 
     # Construct model layers.
@@ -126,17 +126,24 @@ class Gemma2(nn.Module):
         config.embedding_dim,
         config.final_norm_config,
     )
-    self.mask_cache = attn_utils.build_causal_mask_cache(
-        size=config.kv_cache_max,
-    )
+    self.config = config
+    self.build_mask_cache(mask_cache_size)
+
+  def build_mask_cache(self, mask_cache_size: int):
+    assert (
+        mask_cache_size <= self.config.max_seq_len
+    ), "Mask cache size must be less than or equal to the max seq length."
+    if mask_cache_size <= 0:
+      self.mask_cache = None
+      self.sliding_window_mask_cache = None
+      return
+    self.mask_cache = attn_utils.build_causal_mask_cache(mask_cache_size)
     # Gemma2 has same hyper parameters for each layer except for attention
     # types. Use the first layer.
-    attn_config = config.block_config(0).attn_config
     self.sliding_window_mask_cache = attn_utils.build_sliding_window_mask_cache(
-        size=config.kv_cache_max,
-        window_size=attn_config.sliding_window_size,
+        size=mask_cache_size,
+        window_size=self.config.block_config(0).attn_config.sliding_window_size,
     )
-    self.config = config
 
   def get_attention_mask(
       self, attn_type: cfg.AttentionType, input_pos: torch.Tensor
@@ -167,6 +174,7 @@ class Gemma2(nn.Module):
     n_elem = int(attn_config.rotary_percentage * attn_config.head_dim)
     rope = rotary_pos_emb.build_rope(input_pos, n_elem, attn_config.rotary_base)
     if mask is None:
+      assert self.mask_cache is not None, "Mask cache must be built."
       mask = [
           self.get_attention_mask(
               self.config.block_config(i).attn_config.attn_type, input_pos
@@ -222,16 +230,8 @@ class Gemma2(nn.Module):
     return {"logits": res, "kv_cache": updated_kv_cache}
 
 
-def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
-  """Returns the model config for a Gemma2 2B model.
-
-  Args:
-    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
-      is 1024.
-
-  Returns:
-    The model config for a Gemma 2B model.
-  """
+def get_model_config_2b() -> cfg.ModelConfig:
+  """Returns the model config for a Gemma2 2B model."""
   norm_config = cfg.NormalizationConfig(
       type=cfg.NormalizationType.RMS_NORM, epsilon=1e-6, zero_centered=True
   )
@@ -277,7 +277,6 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       max_seq_len=8192,
       embedding_dim=embedding_dim,
       embedding_scale=embedding_dim**0.5,
-      kv_cache_max_len=kv_cache_max_len,
       block_configs=[get_block_config(i) for i in range(num_layers)],
       final_norm_config=norm_config,
       lm_head_use_bias=False,
@@ -286,11 +285,11 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
   return config
 
 
-def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
-  config = get_model_config_2b(kv_cache_max_len)
+def get_fake_model_config() -> cfg.ModelConfig:
+  config = get_model_config_2b()
   config.vocab_size = 128
   config.num_layers = 2
-  config.max_seq_len = 2 * kv_cache_max_len
+  config.max_seq_len = 256
   config.embedding_dim = 128
   config.embedding_scale = config.embedding_dim**0.5
   config.block_configs = config.block_configs[: config.num_layers]
@@ -305,16 +304,17 @@ def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
 def build_2b_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    **kwargs,
+    mask_cache_size: int = 0,
 ) -> nn.Module:
   for tensor_names in TENSOR_NAMES_DICT.values():
     try:
       return model_builder.build_decoder_only_model(
           checkpoint_path=checkpoint_path,
-          config=get_model_config_2b(**kwargs),
+          config=get_model_config_2b(),
           tensor_names=tensor_names,
           model_class=Gemma2,
          custom_loader=custom_loader,
+          mask_cache_size=mask_cache_size,
      )
    except KeyError as _:
      continue
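
Since Gemma2 now builds its mask caches from an explicit mask_cache_size (and skips them entirely when the size is 0 or negative), the following standalone PyTorch sketch shows what a causal and a sliding-window mask cache contain and how the forward pass indexes into them. The helper names ending in _sketch are illustrative stand-ins, not the library's attn_utils implementation.

import torch

def build_causal_mask_cache_sketch(size: int, mask_value: float = float("-inf")):
  # 0.0 on and below the diagonal (visible), mask_value strictly above (future).
  mask = torch.triu(torch.full((size, size), mask_value), diagonal=1)
  return mask.unsqueeze(0).unsqueeze(0)  # shape (1, 1, size, size)

def build_sliding_window_mask_cache_sketch(
    size: int, window_size: int, mask_value: float = float("-inf")
):
  # Start from the causal mask, then also hide keys that sit more than
  # window_size positions behind the query.
  mask = build_causal_mask_cache_sketch(size, mask_value)
  too_old = torch.tril(
      torch.ones(size, size, dtype=torch.bool), diagonal=-window_size
  )
  return mask.masked_fill(too_old, mask_value)

# Usage mirroring the pattern in the diff: rows are selected with input_pos
# and columns are truncated to the KV cache length.
cache = build_causal_mask_cache_sketch(size=64)
input_pos = torch.arange(8)
mask = cache.index_select(2, input_pos)[:, :, :, :32]
print(mask.shape)  # torch.Size([1, 1, 8, 32])
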
ai_edge_torch/generative/examples/gemma3/convert_gemma3_to_tflite.py
@@ -40,7 +40,7 @@ def main(_):
         custom_loader=loader.maybe_get_custom_loader(
             checkpoint_path, flags.FLAGS.custom_checkpoint_loader
         ),
-        kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
+        mask_cache_size=converter.get_mask_cache_size_from_flags(),
     )
   else:
     raise ValueError(f'Unsupported model size: {_MODEL_SIZE.value}')
@@ -50,6 +50,7 @@ def main(_):
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
ai_edge_torch/generative/examples/gemma3/decoder.py
@@ -74,6 +74,7 @@ TENSOR_NAMES_DICT = {
 
 
 class DecoderBlock(attention.TransformerBlock):
+  """A Gemma3 decoder block built from the Edge Generative API layers."""
 
   def forward(
       self,
@@ -111,7 +112,7 @@ class DecoderBlock(attention.TransformerBlock):
 class Decoder(nn.Module):
   """A Gemma3 decoder model built from the Edge Generative API layers."""
 
-  def __init__(self, config: cfg.ModelConfig):
+  def __init__(self, config: cfg.ModelConfig, mask_cache_size: int = 0):
     super().__init__()
 
     # Construct model layers.
@@ -130,10 +131,17 @@ class Decoder(nn.Module):
     self.final_norm = builder.build_norm(
         config.embedding_dim, config.final_norm_config
     )
-    self.mask_cache = attn_utils.build_causal_mask_cache(
-        size=config.kv_cache_max,
-    )
     self.config = config
+    self.build_mask_cache(mask_cache_size)
+
+  def build_mask_cache(self, mask_cache_size: int):
+    assert (
+        mask_cache_size <= self.config.max_seq_len
+    ), "Mask cache size must be less than or equal to the max seq length."
+    if mask_cache_size <= 0:
+      self.mask_cache = None
+    else:
+      self.mask_cache = attn_utils.build_causal_mask_cache(mask_cache_size)
 
   def get_local_global_attention_mask(
       self,
@@ -205,9 +213,8 @@ class Decoder(nn.Module):
     mask = torch.where(mask, 0, self.config.causal_mask_value)
     return mask
 
-  def build_pixel_mask(self, image_indices: torch.Tensor):
+  def build_pixel_mask(self, image_indices: torch.Tensor, max_seq_len: int):
     pixel_mask = image_indices >= 0
-    max_seq_len = self.config.kv_cache_max
     if pixel_mask.size(1) < max_seq_len:
       pixel_mask = torch.cat(
           [
@@ -234,14 +241,12 @@ class Decoder(nn.Module):
       image_indices: Optional[torch.Tensor] = None,
       export_config: Optional[export_cfg.ExportConfig] = None,
   ) -> dict[torch.Tensor, kv_utils.KVCache]:
-    pixel_mask = None
     if input_embeds is None:
       # token embeddings of shape (b, t, n_embd)
       input_embeds = self.tok_embedding(tokens)
       if self.config.embedding_scale is not None:
         input_embeds = input_embeds * self.config.embedding_scale
-      if image_indices is not None:
-        pixel_mask = self.build_pixel_mask(image_indices)
+
     # RoPE parameters are the same for all blocks. Use the first layer.
     attn_config = self.config.block_config(0).attn_config
     # Different rotary base for global and local attention
@@ -254,9 +259,19 @@ class Decoder(nn.Module):
         )
         for i in range(self.config.num_layers)
     ]
+
     if mask is None:
+      assert self.mask_cache is not None, "Mask cache must be built."
+      assert kv_cache is not None, "KV cache must be provided."
+      kv_cache_max_len = kv_cache.get_max_seq_len()
       mask = self.mask_cache.index_select(2, input_pos)
-      mask = mask[:, :, :, : self.config.kv_cache_max]
+      mask = mask[:, :, :, :kv_cache_max_len]
+    else:
+      kv_cache_max_len = mask.size(3)
+
+    pixel_mask = None
+    if image_indices is not None:
+      pixel_mask = self.build_pixel_mask(image_indices, kv_cache_max_len)
 
     return self._forward_with_embeds(
         input_embeds, rope, mask, input_pos, kv_cache, pixel_mask, export_config
@@ -322,16 +337,8 @@ class Decoder(nn.Module):
     return {"logits": res, "kv_cache": updated_kv_cache}
 
 
-def get_decoder_config_1b(kv_cache_max_len: int = 2048) -> cfg.ModelConfig:
-  """Returns the model config for a Gemma3 1B model.
-
-  Args:
-    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
-      is 2048.
-
-  Returns:
-    The model config for a Gemma 1B model.
-  """
+def get_decoder_config_1b() -> cfg.ModelConfig:
+  """Returns the model config for a Gemma3 1B model."""
   norm_config = cfg.NormalizationConfig(
       type=cfg.NormalizationType.RMS_NORM, epsilon=1e-6, zero_centered=True,
   )
@@ -376,7 +383,6 @@ def get_decoder_config_1b(kv_cache_max_len: int = 2048) -> cfg.ModelConfig:
       max_seq_len=32_768,
      embedding_dim=embedding_dim,
      embedding_scale=embedding_dim**0.5,
-      kv_cache_max_len=kv_cache_max_len,
      block_configs=[get_block_config(i) for i in range(num_layers)],
      final_norm_config=norm_config,
      lm_head_use_bias=False,
@@ -385,20 +391,12 @@ def get_decoder_config_1b(kv_cache_max_len: int = 2048) -> cfg.ModelConfig:
   return config
 
 
-def get_fake_decoder_config_1b(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
-  """Returns a fake model config for a Gemma3 1B model.
-
-  Args:
-    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
-      is 128.
-
-  Returns:
-    A fake model config for a Gemma 1B model.
-  """
-  config = get_decoder_config_1b(kv_cache_max_len)
+def get_fake_decoder_config_1b() -> cfg.ModelConfig:
+  """Returns a fake model config for a Gemma3 1B model."""
+  config = get_decoder_config_1b()
   config.vocab_size = 128
   config.num_layers = 2
-  config.max_seq_len = 2 * kv_cache_max_len
+  config.max_seq_len = 256
   config.embedding_dim = 128
   config.embedding_scale = config.embedding_dim**0.5
   config.block_configs = config.block_configs[: config.num_layers]
@@ -413,7 +411,7 @@ def get_fake_decoder_config_1b(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
 def build_model_1b(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    **kwargs,
+    mask_cache_size: int = 0,
 ) -> nn.Module:
   # TODO(b/403644647): Better error handling for loading checkpoints with
   # different tensor names.
@@ -421,10 +419,11 @@ def build_model_1b(
     try:
       return model_builder.build_decoder_only_model(
           checkpoint_path=checkpoint_path,
-          config=get_decoder_config_1b(**kwargs),
+          config=get_decoder_config_1b(),
          tensor_names=tensor_names,
          model_class=Decoder,
          custom_loader=custom_loader,
+          mask_cache_size=mask_cache_size,
      )
    except KeyError as ke:
      continue
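
With kv_cache_max removed from the config, the Gemma3 decoder now derives the cache length from kv_cache.get_max_seq_len() (or from mask.size(3) when a mask is supplied) and passes it into build_pixel_mask. Below is a rough sketch of that padding step in plain PyTorch; it is illustrative only, and padding with False is an assumption, since the padded values are not visible in this diff.

import torch

def build_pixel_mask_sketch(image_indices: torch.Tensor, max_seq_len: int) -> torch.Tensor:
  # image_indices: (batch, tokens); entries >= 0 mark image-token positions.
  pixel_mask = image_indices >= 0
  if pixel_mask.size(1) < max_seq_len:
    # Right-pad so the mask spans the full KV cache length (assumed False padding).
    pad = torch.zeros(
        pixel_mask.size(0), max_seq_len - pixel_mask.size(1), dtype=torch.bool
    )
    pixel_mask = torch.cat([pixel_mask, pad], dim=1)
  return pixel_mask

image_indices = torch.tensor([[0, 1, -1, -1]])  # toy input: two image tokens
print(build_pixel_mask_sketch(image_indices, max_seq_len=8))
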
ai_edge_torch/generative/examples/gemma3/gemma3.py
@@ -48,13 +48,13 @@ class Gemma3MMConfig:
 class Gemma3MM(nn.Module):
   """A Gemma3 multimodal model built from the Edge Generative API layers."""
 
-  def __init__(self, config: Gemma3MMConfig):
+  def __init__(self, config: Gemma3MMConfig, mask_cache_size: int = 0):
     super().__init__()
 
     self.image_encoder = image_encoder.SiglipVisionEncoderWithExit(
         config.image_encoder_config
     )
-    self.decoder = decoder.Decoder(config.decoder_config)
+    self.decoder = decoder.Decoder(config.decoder_config, mask_cache_size)
     self.mm_norm = builder.build_norm(
         config.image_encoder_config.embedding_dim,
         config.mm_norm_config,
@@ -150,10 +150,10 @@ class Gemma3MM(nn.Module):
     )
 
 
-def get_fake_model_config(**kwargs) -> Gemma3MMConfig:
+def get_fake_model_config() -> Gemma3MMConfig:
   return Gemma3MMConfig(
       image_encoder_config=image_encoder.get_fake_image_encoder_config(),
-      decoder_config=decoder.get_fake_decoder_config_1b(**kwargs),
+      decoder_config=decoder.get_fake_decoder_config_1b(),
       image_token_id=127,
       image_projection_scale=128**0.5,
       image_projection_use_bias=False,
@@ -167,13 +167,15 @@ def get_fake_model_config(**kwargs) -> Gemma3MMConfig:
 def build_model_1b(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    **kwargs,
+    mask_cache_size: int = 0,
 ) -> decoder.Decoder:
   if checkpoint_path:
-    model = decoder.build_model_1b(checkpoint_path, custom_loader, **kwargs)
+    model = decoder.build_model_1b(
+        checkpoint_path, custom_loader, mask_cache_size
+    )
   else:
-    config = decoder.get_decoder_config_1b(**kwargs)
-    model = decoder.Decoder(config)
+    config = decoder.get_decoder_config_1b()
+    model = decoder.Decoder(config, mask_cache_size)
   # TODO: Load the parameters of decoder from checkpoint.
   model.eval()
   return model
ai_edge_torch/generative/examples/hammer/convert_to_tflite.py
@@ -43,13 +43,14 @@ def main(_):
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
+      mask_cache_size=converter.get_mask_cache_size_from_flags(),
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
ai_edge_torch/generative/examples/hammer/hammer.py
@@ -29,7 +29,7 @@ class Hammer(model_builder.DecoderOnlyModel):
   pass
 
 
-def get_1_5b_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+def get_1_5b_model_config() -> cfg.ModelConfig:
   """Returns the model config for a Hammer 2.1 1.5B model."""
   attn_config = cfg.AttentionConfig(
       num_heads=12,
@@ -58,16 +58,15 @@ def get_1_5b_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       num_layers=28,
       max_seq_len=32768,
       embedding_dim=1536,
-      kv_cache_max_len=kv_cache_max_len,
       block_configs=block_config,
       final_norm_config=norm_config,
   )
   return config
 
 
-def get_0_5b_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+def get_0_5b_model_config() -> cfg.ModelConfig:
   """Returns the model config for a Hammer 2.1 0.5B model."""
-  config = get_1_5b_model_config(kv_cache_max_len)
+  config = get_1_5b_model_config()
   # Hammer has only one block config.
   block_config = config.block_config(0)
   block_config.attn_config.num_heads = 14
@@ -78,8 +77,8 @@ def get_0_5b_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
   return config
 
 
-def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
-  config = get_1_5b_model_config(**kwargs)
+def get_fake_model_config() -> cfg.ModelConfig:
+  config = get_1_5b_model_config()
   config.vocab_size = 128
   config.num_layers = 2
   config.embedding_dim = 16
@@ -88,29 +87,37 @@ def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
   return config
 
 
-def build_1_5b_model(
+def _build_model(
     checkpoint_path: str,
+    config: cfg.ModelConfig,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    **kwargs
+    mask_cache_size: int = 0,
 ) -> nn.Module:
   return model_builder.build_decoder_only_model(
       checkpoint_path=checkpoint_path,
-      config=get_1_5b_model_config(**kwargs),
+      config=config,
       tensor_names=TENSOR_NAMES,
       model_class=Hammer,
       custom_loader=custom_loader,
+      mask_cache_size=mask_cache_size,
+  )
+
+
+def build_1_5b_model(
+    checkpoint_path: str,
+    custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
+    mask_cache_size: int = 0,
+) -> nn.Module:
+  return _build_model(
+      checkpoint_path, get_1_5b_model_config(), custom_loader, mask_cache_size
   )
 
 
 def build_0_5b_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    **kwargs
+    mask_cache_size: int = 0,
 ) -> nn.Module:
-  return model_builder.build_decoder_only_model(
-      checkpoint_path=checkpoint_path,
-      config=get_0_5b_model_config(**kwargs),
-      tensor_names=TENSOR_NAMES,
-      model_class=Hammer,
-      custom_loader=custom_loader,
+  return _build_model(
+      checkpoint_path, get_0_5b_model_config(), custom_loader, mask_cache_size
   )
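
The Hammer example now routes both public builders through a shared private _build_model() helper. A short usage sketch under the new signatures; the checkpoint paths are placeholders.

from ai_edge_torch.generative.examples.hammer import hammer

# Both builders accept mask_cache_size directly instead of forwarding
# **kwargs into the model config.
model_1_5b = hammer.build_1_5b_model(
    checkpoint_path="/tmp/hammer_2.1_1.5b",  # placeholder path
    mask_cache_size=1024,
)
model_0_5b = hammer.build_0_5b_model(
    checkpoint_path="/tmp/hammer_2.1_0.5b",  # placeholder path
    mask_cache_size=1024,
)
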