ai-edge-torch-nightly: 0.3.0.dev20250107-py3-none-any.whl → 0.3.0.dev20250109-py3-none-any.whl

Files changed (38)
  1. ai_edge_torch/generative/examples/gemma/convert_gemma1_to_tflite.py +16 -6
  2. ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +16 -6
  3. ai_edge_torch/generative/examples/gemma/gemma2.py +46 -25
  4. ai_edge_torch/generative/examples/llama/convert_to_tflite.py +16 -6
  5. ai_edge_torch/generative/examples/llama/llama.py +29 -25
  6. ai_edge_torch/generative/examples/openelm/convert_to_tflite.py +16 -9
  7. ai_edge_torch/generative/examples/paligemma/convert_to_tflite.py +11 -6
  8. ai_edge_torch/generative/examples/phi/convert_phi3_to_tflite.py +17 -7
  9. ai_edge_torch/generative/examples/phi/convert_to_tflite.py +16 -6
  10. ai_edge_torch/generative/examples/phi/phi3.py +26 -23
  11. ai_edge_torch/generative/examples/qwen/convert_to_tflite.py +17 -9
  12. ai_edge_torch/generative/examples/smollm/convert_to_tflite.py +16 -7
  13. ai_edge_torch/generative/examples/smollm/convert_v2_to_tflite.py +71 -0
  14. ai_edge_torch/generative/examples/smollm/smollm.py +38 -0
  15. ai_edge_torch/generative/examples/smollm/verify.py +18 -2
  16. ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +3 -3
  17. ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +16 -8
  18. ai_edge_torch/generative/layers/attention.py +45 -37
  19. ai_edge_torch/generative/layers/lora.py +557 -0
  20. ai_edge_torch/generative/layers/model_config.py +6 -2
  21. ai_edge_torch/generative/layers/rotary_position_embedding.py +34 -28
  22. ai_edge_torch/generative/test/test_lora.py +147 -0
  23. ai_edge_torch/generative/test/test_model_conversion_large.py +10 -0
  24. ai_edge_torch/generative/utilities/converter.py +100 -47
  25. ai_edge_torch/generative/utilities/model_builder.py +23 -14
  26. ai_edge_torch/hlfb/mark_pattern/__init__.py +19 -7
  27. ai_edge_torch/hlfb/mark_pattern/{passes.py → fx_utils.py} +9 -2
  28. ai_edge_torch/hlfb/mark_pattern/pattern.py +9 -8
  29. ai_edge_torch/hlfb/test/test_mark_pattern.py +26 -0
  30. ai_edge_torch/odml_torch/_torch_future.py +13 -0
  31. ai_edge_torch/odml_torch/export.py +6 -2
  32. ai_edge_torch/odml_torch/lowerings/decomp.py +4 -0
  33. ai_edge_torch/version.py +1 -1
  34. {ai_edge_torch_nightly-0.3.0.dev20250107.dist-info → ai_edge_torch_nightly-0.3.0.dev20250109.dist-info}/METADATA +1 -1
  35. {ai_edge_torch_nightly-0.3.0.dev20250107.dist-info → ai_edge_torch_nightly-0.3.0.dev20250109.dist-info}/RECORD +38 -35
  36. {ai_edge_torch_nightly-0.3.0.dev20250107.dist-info → ai_edge_torch_nightly-0.3.0.dev20250109.dist-info}/LICENSE +0 -0
  37. {ai_edge_torch_nightly-0.3.0.dev20250107.dist-info → ai_edge_torch_nightly-0.3.0.dev20250109.dist-info}/WHEEL +0 -0
  38. {ai_edge_torch_nightly-0.3.0.dev20250107.dist-info → ai_edge_torch_nightly-0.3.0.dev20250109.dist-info}/top_level.txt +0 -0
ai_edge_torch/generative/examples/qwen/convert_to_tflite.py

@@ -35,10 +35,15 @@ _CHECKPOINT_PATH = flags.DEFINE_string(
     os.path.join(pathlib.Path.home(), 'Downloads/llm_data/qwen'),
     'The path to the model checkpoint, or directory holding the checkpoint.',
 )
-_TFLITE_PATH = flags.DEFINE_string(
-    'tflite_path',
+_OUTPUT_PATH = flags.DEFINE_string(
+    'output_path',
     '/tmp/',
-    'The tflite file path to export.',
+    'The path to export the tflite model.',
+)
+_OUTPUT_NAME_PREFIX = flags.DEFINE_string(
+    'output_name_prefix',
+    'qwen',
+    'The prefix of the output tflite model name.',
 )
 _PREFILL_SEQ_LENS = flags.DEFINE_multi_integer(
     'prefill_seq_lens',
@@ -55,6 +60,12 @@ _QUANTIZE = flags.DEFINE_bool(
     True,
     'Whether the model should be quantized.',
 )
+_LORA_RANKS = flags.DEFINE_multi_integer(
+    'lora_ranks',
+    None,
+    'If set, the model will be converted with the provided list of LoRA ranks.',
+)
+
 
 _BUILDER = {
     '0.5b': qwen.build_0_5b_model,
@@ -67,16 +78,13 @@ def main(_):
   pytorch_model = _BUILDER[_MODEL_SIZE.value](
       _CHECKPOINT_PATH.value, kv_cache_max_len=_KV_CACHE_MAX_LEN.value
   )
-  quant_suffix = 'q8' if _QUANTIZE.value else 'f32'
-  model_size = _MODEL_SIZE.value.replace('.', '_')
-  output_filename = (
-      f'qwen_{model_size}_{quant_suffix}_ekv{_KV_CACHE_MAX_LEN.value}.tflite'
-  )
   converter.convert_to_tflite(
       pytorch_model,
-      tflite_path=os.path.join(_TFLITE_PATH.value, output_filename),
+      output_path=_OUTPUT_PATH.value,
+      output_name_prefix=_OUTPUT_NAME_PREFIX.value,
       prefill_seq_len=_PREFILL_SEQ_LENS.value,
       quantize=_QUANTIZE.value,
+      lora_ranks=_LORA_RANKS.value,
       export_config=ExportConfig(),
   )
 
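Note: the conversion scripts in this release move from a full tflite_path to an output_path plus output_name_prefix pair, and gain an optional lora_ranks flag. A minimal sketch of calling the updated converter.convert_to_tflite directly, using only the keyword arguments visible in the diff above; the checkpoint path, sequence lengths, and the LoRA ranks are illustrative values, not repo defaults:

# Sketch: invoking the updated conversion API directly (values illustrative).
import os
import pathlib

from ai_edge_torch.generative.examples.qwen import qwen
from ai_edge_torch.generative.utilities import converter
from ai_edge_torch.generative.utilities.model_builder import ExportConfig

checkpoint = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/qwen')
pytorch_model = qwen.build_0_5b_model(checkpoint, kv_cache_max_len=1280)
converter.convert_to_tflite(
    pytorch_model,
    output_path='/tmp/',              # replaces the old tflite_path flag
    output_name_prefix='qwen',        # output file name derived from this
    prefill_seq_len=[8, 64, 128, 256, 512, 1024],
    quantize=True,
    lora_ranks=[16],                  # assumption: any list of ranks, or None
    export_config=ExportConfig(),
)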
ai_edge_torch/generative/examples/smollm/convert_to_tflite.py

@@ -29,10 +29,15 @@ _CHECKPOINT_PATH = flags.DEFINE_string(
     os.path.join(pathlib.Path.home(), 'Downloads/llm_data/smollm'),
     'The path to the model checkpoint, or directory holding the checkpoint.',
 )
-_TFLITE_PATH = flags.DEFINE_string(
-    'tflite_path',
+_OUTPUT_PATH = flags.DEFINE_string(
+    'output_path',
     '/tmp/',
-    'The tflite file path to export.',
+    'The path to export the tflite model.',
+)
+_OUTPUT_NAME_PREFIX = flags.DEFINE_string(
+    'output_name_prefix',
+    'smollm',
+    'The prefix of the output tflite model name.',
 )
 _PREFILL_SEQ_LENS = flags.DEFINE_multi_integer(
     'prefill_seq_lens',
@@ -49,20 +54,24 @@ _QUANTIZE = flags.DEFINE_bool(
     True,
     'Whether the model should be quantized.',
 )
+_LORA_RANKS = flags.DEFINE_multi_integer(
+    'lora_ranks',
+    None,
+    'If set, the model will be converted with the provided list of LoRA ranks.',
+)
 
 
 def main(_):
   pytorch_model = smollm.build_model(
       _CHECKPOINT_PATH.value, kv_cache_max_len=_KV_CACHE_MAX_LEN.value
   )
-
-  quant_suffix = 'q8' if _QUANTIZE.value else 'f32'
-  output_filename = f'smollm_{quant_suffix}_ekv{_KV_CACHE_MAX_LEN.value}.tflite'
   converter.convert_to_tflite(
       pytorch_model,
-      tflite_path=os.path.join(_TFLITE_PATH.value, output_filename),
+      output_path=_OUTPUT_PATH.value,
+      output_name_prefix=_OUTPUT_NAME_PREFIX.value,
       prefill_seq_len=_PREFILL_SEQ_LENS.value,
       quantize=_QUANTIZE.value,
+      lora_ranks=_LORA_RANKS.value,
       export_config=ExportConfig(),
   )
 
ai_edge_torch/generative/examples/smollm/convert_v2_to_tflite.py (new file)

@@ -0,0 +1,71 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Example of converting SmolLM2 model to multi-signature tflite model."""
+
+import os
+import pathlib
+
+from absl import app
+from absl import flags
+from ai_edge_torch.generative.examples.smollm import smollm
+from ai_edge_torch.generative.utilities import converter
+from ai_edge_torch.generative.utilities.model_builder import ExportConfig
+
+_CHECKPOINT_PATH = flags.DEFINE_string(
+    'checkpoint_path',
+    os.path.join(pathlib.Path.home(), 'Downloads/llm_data/smollm2'),
+    'The path to the model checkpoint, or directory holding the checkpoint.',
+)
+_TFLITE_PATH = flags.DEFINE_string(
+    'tflite_path',
+    '/tmp/',
+    'The tflite file path to export.',
+)
+_PREFILL_SEQ_LENS = flags.DEFINE_multi_integer(
+    'prefill_seq_lens',
+    (8, 64, 128, 256, 512, 1024),
+    'List of the maximum sizes of prefill input tensors.',
+)
+_KV_CACHE_MAX_LEN = flags.DEFINE_integer(
+    'kv_cache_max_len',
+    1280,
+    'The maximum size of KV cache buffer, including both prefill and decode.',
+)
+_QUANTIZE = flags.DEFINE_bool(
+    'quantize',
+    True,
+    'Whether the model should be quantized.',
+)
+
+
+def main(_):
+  pytorch_model = smollm.build_model_v2(
+      _CHECKPOINT_PATH.value, kv_cache_max_len=_KV_CACHE_MAX_LEN.value
+  )
+
+  quant_suffix = 'q8' if _QUANTIZE.value else 'f32'
+  output_filename = f'smollm2_{quant_suffix}_ekv{_KV_CACHE_MAX_LEN.value}.tflite'
+  converter.convert_to_tflite(
+      pytorch_model,
+      tflite_path=os.path.join(_TFLITE_PATH.value, output_filename),
+      prefill_seq_len=_PREFILL_SEQ_LENS.value,
+      quantize=_QUANTIZE.value,
+      export_config=ExportConfig(),
+  )
+
+
+if __name__ == '__main__':
+  app.run(main)
ai_edge_torch/generative/examples/smollm/smollm.py

@@ -85,3 +85,41 @@ def build_model(checkpoint_path: str, **kwargs) -> nn.Module:
       tensor_names=TENSOR_NAMES,
       model_class=SmolLM,
   )
+
+
+class SmolLM2(model_builder.DecoderOnlyModel):
+  """A SmolLM2 model built from the Edge Generative API layers."""
+  pass
+
+
+def get_model_config_v2(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+  """Returns the model config for a SmolLM2 135M model.
+
+  Args:
+    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
+      is 1024.
+
+  Returns:
+    The model config for a SmolLM2 model.
+  """
+  config = get_model_config(kv_cache_max_len)
+  config.block_config(0).attn_config.rotary_base = 100000
+  return config
+
+
+def get_fake_model_config_v2(**kwargs) -> cfg.ModelConfig:
+  config = get_model_config_v2(**kwargs)
+  config.vocab_size = 128
+  config.num_layers = 2
+  # SmolLM2 has only one block config.
+  config.block_config(0).ff_config.intermediate_size = 64
+  return config
+
+
+def build_model_v2(checkpoint_path: str, **kwargs) -> nn.Module:
+  return model_builder.build_decoder_only_model(
+      checkpoint_path=checkpoint_path,
+      config=get_model_config_v2(**kwargs),
+      tensor_names=TENSOR_NAMES,
+      model_class=SmolLM2,
+  )
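Note: the v2 additions reuse the v1 SmolLM configuration and only raise the RoPE base frequency. A short sketch of how the new helpers compose, assuming a local SmolLM2 checkpoint path (illustrative):

# Sketch: building SmolLM2 via the new v2 helpers (path illustrative).
from ai_edge_torch.generative.examples.smollm import smollm

config = smollm.get_model_config_v2(kv_cache_max_len=1280)
# Per this diff, v2 differs from v1 only in the rotary embedding base.
assert config.block_config(0).attn_config.rotary_base == 100000

model = smollm.build_model_v2(
    '/path/to/smollm2_checkpoint', kv_cache_max_len=1280
)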
ai_edge_torch/generative/examples/smollm/verify.py

@@ -36,10 +36,26 @@ _MAX_NEW_TOKENS = flags.DEFINE_integer(
     30,
     "The maximum size of the generated tokens.",
 )
+_MODEL_VERSION = flags.DEFINE_enum(
+    "model_version",
+    "v1",
+    ["v1", "v2"],
+    "The version of SmolLM to verify.",
+)
+_CHECKPOINT = {
+    "v1": "HuggingFaceTB/SmolLM-135M",
+    "v2": "HuggingFaceTB/SmolLM2-135M",
+}
+
+_BUILDER = {
+    "v1": smollm.build_model,
+    "v2": smollm.build_model_v2,
+}
 
 
 def main(_):
-  checkpoint = "HuggingFaceTB/SmolLM-135M"
+  checkpoint = _CHECKPOINT[_MODEL_VERSION.value]
+  builder = _BUILDER[_MODEL_VERSION.value]
   logging.info("Loading the original model from: %s", checkpoint)
   original_model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint)
 
@@ -49,7 +65,7 @@ def main(_):
   )
   reauthored_checkpoint = pathlib.Path(cached_config_file).parent
   logging.info("Building the reauthored model from: %s", reauthored_checkpoint)
-  reauthored_model = smollm.build_model(reauthored_checkpoint)
+  reauthored_model = builder(reauthored_checkpoint)
 
   logging.info("Loading the tokenizer from: %s", checkpoint)
   tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
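Note: with the new model_version flag, the same script verifies either generation, e.g. "verify.py --model_version=v2". A condensed sketch of the v2 path it resolves to (model IDs come from the diff; the cached checkpoint path is illustrative):

# Sketch: what verify.py does for --model_version=v2 (path illustrative).
import transformers
from ai_edge_torch.generative.examples.smollm import smollm

checkpoint = "HuggingFaceTB/SmolLM2-135M"
original_model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint)
reauthored_model = smollm.build_model_v2("/path/to/cached/smollm2")
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)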
ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py

@@ -72,14 +72,14 @@ class ToyModelWithKVCache(torch.nn.Module):
     mask = self.mask_cache.index_select(2, input_pos)
     mask = mask[:, :, :, : self.config.max_seq_len]
 
-    updated_kv_entires = []
+    updated_kv_entries = []
     for i, block in enumerate(self.transformer_blocks):
       kv_entry = kv_cache.caches[i] if kv_cache else None
       x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
       if kv_entry:
-        updated_kv_entires.append(kv_entry)
+        updated_kv_entries.append(kv_entry)
 
-    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entries))
 
     if export_config is not None:
       if (
ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py

@@ -29,10 +29,15 @@ _CHECKPOINT_PATH = flags.DEFINE_string(
     os.path.join(pathlib.Path.home(), 'Downloads/llm_data/tiny_llama'),
     'The path to the model checkpoint, or directory holding the checkpoint.',
 )
-_TFLITE_PATH = flags.DEFINE_string(
-    'tflite_path',
+_OUTPUT_PATH = flags.DEFINE_string(
+    'output_path',
     '/tmp/',
-    'The tflite file path to export.',
+    'The path to export the tflite model.',
+)
+_OUTPUT_NAME_PREFIX = flags.DEFINE_string(
+    'output_name_prefix',
+    'tinyllama',
+    'The prefix of the output tflite model name.',
 )
 _PREFILL_SEQ_LENS = flags.DEFINE_multi_integer(
     'prefill_seq_lens',
@@ -49,21 +54,24 @@ _QUANTIZE = flags.DEFINE_bool(
     True,
     'Whether the model should be quantized.',
 )
+_LORA_RANKS = flags.DEFINE_multi_integer(
+    'lora_ranks',
+    None,
+    'If set, the model will be converted with the provided list of LoRA ranks.',
+)
 
 
 def main(_):
   pytorch_model = tiny_llama.build_model(
       _CHECKPOINT_PATH.value, kv_cache_max_len=_KV_CACHE_MAX_LEN.value
   )
-  quant_suffix = 'q8' if _QUANTIZE.value else 'f32'
-  output_filename = (
-      f'tinyllama_{quant_suffix}_ekv{_KV_CACHE_MAX_LEN.value}.tflite'
-  )
   converter.convert_to_tflite(
       pytorch_model,
-      tflite_path=os.path.join(_TFLITE_PATH.value, output_filename),
+      output_path=_OUTPUT_PATH.value,
+      output_name_prefix=_OUTPUT_NAME_PREFIX.value,
       prefill_seq_len=_PREFILL_SEQ_LENS.value,
       quantize=_QUANTIZE.value,
+      lora_ranks=_LORA_RANKS.value,
       export_config=ExportConfig(),
   )
 
ai_edge_torch/generative/layers/attention.py

@@ -19,6 +19,7 @@ from typing import Optional, Tuple, Union
 
 from ai_edge_torch.generative.layers import builder
 from ai_edge_torch.generative.layers import kv_cache as kv_utils
+from ai_edge_torch.generative.layers import lora as lora_utils
 from ai_edge_torch.generative.layers import scaled_dot_product_attention as sdpa
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
@@ -26,33 +27,6 @@ import torch
 from torch import nn
 
 
-def _embed_rope(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    n_elem: int,
-    rope: Tuple[torch.Tensor, torch.Tensor],
-) -> Tuple[torch.Tensor, torch.Tensor]:
-  """Embed rotary positional embedding for query and key.
-
-  Args:
-    q (torch.Tensor): query tensor.
-    k (torch.Tensor): key tensor.
-    n_elem (int): number of elements to embed rotary positional embedding.
-    rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
-  """
-  if n_elem > 0:
-    cos, sin = rope
-    q_roped = rotary_pos_emb.apply_rope(
-        q[..., :n_elem], cos.repeat(1, 2), sin.repeat(1, 2)
-    )
-    k_roped = rotary_pos_emb.apply_rope(
-        k[..., :n_elem], cos.repeat(1, 2), sin.repeat(1, 2)
-    )
-    q = torch.cat((q_roped, q[..., n_elem:]), dim=-1)
-    k = torch.cat((k_roped, k[..., n_elem:]), dim=-1)
-  return q, k
-
-
 class TransformerBlock(nn.Module):
 
   def __init__(
@@ -93,6 +67,7 @@
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
       kv_cache: kv_utils.KVCacheEntry = None,
+      lora: Optional[lora_utils.LoRAEntry] = None,
   ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the TransformerBlock.
 
@@ -102,6 +77,7 @@
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
       kv_cache (KVCacheEntry): the optional kv cache entry.
+      lora (LoRAEntry): the optional LoRA entry.
 
     Returns:
       output activation from this transformer block, and updated kv cache (if
@@ -110,7 +86,9 @@
     kv = None
     if self.config.parallel_residual:
       x_norm = self.pre_atten_norm(x)
-      atten_func_out = self.atten_func(x_norm, rope, mask, input_pos, kv_cache)
+      atten_func_out = self.atten_func(
+          x_norm, rope, mask, input_pos, kv_cache, lora
+      )
       if kv_cache is None:
         attn_out = atten_func_out
       else:
@@ -119,7 +97,9 @@
       output = x + attn_out + ff_out
     else:
       x_norm = self.pre_atten_norm(x)
-      atten_func_out = self.atten_func(x_norm, rope, mask, input_pos, kv_cache)
+      atten_func_out = self.atten_func(
+          x_norm, rope, mask, input_pos, kv_cache, lora
+      )
       if kv_cache is None:
         attn_out = atten_func_out
       else:
@@ -179,6 +159,7 @@
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
       kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+      lora: Optional[lora_utils.LoRAEntry] = None,
   ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the CausalSelfAttention layer, which can support
 
@@ -189,7 +170,8 @@
      rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
      mask (torch.Tensor): the optional mask tensor.
      input_pos (torch.Tensor): the optional input position tensor.
-      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.
+      kv_cache (KVCacheEntry): the KV cache entry corresponding to this module.
+      lora (LoRAEntry): the optional LoRA entry.
 
     Returns:
       output activation from this self attention layer, and the updated
@@ -228,6 +210,11 @@
           dim=-1,
       )
 
+    if lora is not None:
+      q += lora_utils.apply_lora(x, lora.attention.query, shape=q.shape)
+      k += lora_utils.apply_lora(x, lora.attention.key, shape=k.shape)
+      v += lora_utils.apply_lora(x, lora.attention.value, shape=v.shape)
+
     q = self.query_norm(q)
     k = self.key_norm(k)
 
@@ -238,13 +225,14 @@
     if rope is not None:
       # Compute rotary positional embedding for query and key.
       n_elem = int(self.config.rotary_percentage * self.config.head_dim)
-      q, k = _embed_rope(q, k, n_elem, rope)
+      cos, sin = rope
+      q, k = rotary_pos_emb.apply_rope_inline(q, k, cos, sin)
 
     if kv_cache is not None:
       kv_cache = kv_utils.update(kv_cache, input_pos, k, v)
       k, v = kv_cache.k_cache, kv_cache.v_cache
 
-    y = self.sdpa_func(
+    sdpa_out = self.sdpa_func(
         q,
         k,
         v,
@@ -252,10 +240,13 @@
         mask=mask,
         softcap=self.config.logit_softcap,
     )
-    y = y.reshape(B, T, -1)
+    sdpa_out = sdpa_out.reshape(B, T, -1)
 
     # Compute the output projection.
-    y = self.output_projection(y)
+    y = self.output_projection(sdpa_out)
+    if lora is not None:
+      y += lora_utils.apply_lora(sdpa_out, lora.attention.output)
+
     return y if kv_cache is None else (y, kv_cache)
 
 
@@ -268,6 +259,7 @@
       rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
      input_pos: Optional[torch.Tensor] = None,
      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+      lora: Optional[lora_utils.LoRAEntry] = None,
   ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the SelfAttention layer, which can support MQA, GQA and MHA.
 
@@ -275,18 +267,23 @@
       x (torch.Tensor): the input tensor.
      rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
      input_pos (torch.Tensor): the optional input position tensor.
-      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.
+      kv_cache (KVCacheEntry): the KV cache entry corresponding to this module.
+      lora (LoRAEntry): the optional LoRA entry.
 
     Returns:
      output activation from this self attention layer, and the updated
      KV Cache Entry (if passed in).
    """
    B, T, _ = x.size()
+    assert (
+        kv_cache is None
+    ), "KV cache is not supported in non-causal SelfAttention."
    return super().forward(
        x,
        rope=rope,
        mask=torch.zeros((B, 1, T, T), dtype=torch.float32),
        input_pos=input_pos,
+        lora=lora,
    )
 
 
@@ -343,6 +340,7 @@
      mask: Optional[torch.Tensor] = None,
      input_pos: Optional[torch.Tensor] = None,
      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+      lora: Optional[lora_utils.LoRAEntry] = None,
  ):
    """Forward function of the CrossAttention layer.
 
@@ -353,7 +351,8 @@
      mask (torch.Tensor): the optional mask tensor that can be broadcasted to
        shape [B, n_heads, target_seq_len, source_seq_len].
      input_pos (torch.Tensor): the optional input position tensor.
-      kv_cache (KVCacheEntry): The KV cache entry corresponding to this module.
+      kv_cache (KVCacheEntry): the KV cache entry corresponding to this module.
+      lora (LoRAEntry): the optional LoRA entry.
 
    Returns:
      output activation from this cross attention layer.
@@ -366,6 +365,11 @@
    k = self.k_projection(y)
    v = self.v_projection(y)
 
+    if lora is not None:
+      q += lora_utils.apply_lora(x, lora.attention.query, shape=q.shape)
+      k += lora_utils.apply_lora(x, lora.attention.key, shape=k.shape)
+      v += lora_utils.apply_lora(x, lora.attention.value, shape=v.shape)
+
    interim_shape = (batch_size, -1, self.n_heads, self.config.head_dim)
    q = q.view(interim_shape)
    k = k.view(interim_shape)
@@ -374,7 +378,8 @@
    if rope is not None:
      # Compute rotary positional embedding for query and key.
      n_elem = int(self.config.rotary_percentage * self.config.head_dim)
-      q, k = _embed_rope(q, k, n_elem, rope)
+      cos, sin = rope
+      q, k = rotary_pos_emb.apply_rope_inline(q, k, cos, sin)
 
    if kv_cache is not None:
      kv_cache = kv_utils.update(kv_cache, input_pos, k, v)
@@ -388,4 +393,7 @@
 
    # Compute the output projection.
    y = self.output_projection(y)
+    if lora is not None:
+      y += lora_utils.apply_lora(y, lora.attention.output)
+
    return y if kv_cache is None else (y, kv_cache)
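
Note: the new lora.py module (557 added lines, listed under "Files changed" but not displayed in this diff) supplies lora_utils.apply_lora and the LoRAEntry container used above. As a mental model only, here is a minimal sketch of the conventional low-rank update that matches these call sites; the field names and the absence of any scaling factor are assumptions about an implementation this diff does not show:

# Sketch of a conventional LoRA delta matching the apply_lora(x, w, shape=...)
# call sites above. This is NOT the shipped lora.py implementation.
import dataclasses
from typing import Optional, Tuple

import torch


@dataclasses.dataclass
class LoRAWeightSketch:
  """Hypothetical low-rank factor pair; field names are assumptions."""

  a: torch.Tensor  # [in_features, rank]
  b: torch.Tensor  # [rank, out_features]


def apply_lora_sketch(
    x: torch.Tensor,
    weight: LoRAWeightSketch,
    shape: Optional[Tuple[int, ...]] = None,
) -> torch.Tensor:
  """Computes the low-rank delta (x @ A) @ B, optionally reshaped.

  attention.py adds this delta onto the q/k/v projections and onto the
  output projection, which is why the result must match `shape`.
  """
  output = torch.matmul(torch.matmul(x, weight.a), weight.b)
  if shape is not None:
    output = output.reshape(shape)
  return output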