ai-edge-torch-nightly 0.3.0.dev20250105__py3-none-any.whl → 0.3.0.dev20250108__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_edge_torch/_config.py +26 -9
- ai_edge_torch/_convert/fx_passes/optimize_layout_transposes_pass/layout_check.py +13 -0
- ai_edge_torch/_convert/fx_passes/optimize_layout_transposes_pass/layout_rewrite.py +36 -0
- ai_edge_torch/generative/examples/gemma/convert_gemma1_to_tflite.py +16 -6
- ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +16 -6
- ai_edge_torch/generative/examples/gemma/gemma2.py +25 -43
- ai_edge_torch/generative/examples/llama/convert_to_tflite.py +16 -6
- ai_edge_torch/generative/examples/openelm/convert_to_tflite.py +16 -9
- ai_edge_torch/generative/examples/paligemma/convert_to_tflite.py +11 -6
- ai_edge_torch/generative/examples/phi/convert_phi3_to_tflite.py +17 -7
- ai_edge_torch/generative/examples/phi/convert_to_tflite.py +16 -6
- ai_edge_torch/generative/examples/qwen/convert_to_tflite.py +17 -9
- ai_edge_torch/generative/examples/smollm/convert_to_tflite.py +16 -7
- ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +3 -3
- ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +16 -8
- ai_edge_torch/generative/layers/attention.py +70 -12
- ai_edge_torch/generative/layers/lora.py +557 -0
- ai_edge_torch/generative/layers/normalization.py +2 -50
- ai_edge_torch/generative/layers/rotary_position_embedding.py +27 -34
- ai_edge_torch/generative/test/test_lora.py +147 -0
- ai_edge_torch/generative/utilities/converter.py +100 -47
- ai_edge_torch/generative/utilities/model_builder.py +21 -16
- ai_edge_torch/generative/utilities/verifier.py +4 -4
- ai_edge_torch/odml_torch/_torch_future.py +13 -0
- ai_edge_torch/odml_torch/export.py +6 -2
- ai_edge_torch/odml_torch/lowerings/decomp.py +4 -0
- ai_edge_torch/version.py +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20250105.dist-info → ai_edge_torch_nightly-0.3.0.dev20250108.dist-info}/METADATA +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20250105.dist-info → ai_edge_torch_nightly-0.3.0.dev20250108.dist-info}/RECORD +32 -30
- {ai_edge_torch_nightly-0.3.0.dev20250105.dist-info → ai_edge_torch_nightly-0.3.0.dev20250108.dist-info}/LICENSE +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20250105.dist-info → ai_edge_torch_nightly-0.3.0.dev20250108.dist-info}/WHEEL +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20250105.dist-info → ai_edge_torch_nightly-0.3.0.dev20250108.dist-info}/top_level.txt +0 -0
ai_edge_torch/generative/examples/smollm/convert_to_tflite.py:

```diff
@@ -29,10 +29,15 @@ _CHECKPOINT_PATH = flags.DEFINE_string(
     os.path.join(pathlib.Path.home(), 'Downloads/llm_data/smollm'),
     'The path to the model checkpoint, or directory holding the checkpoint.',
 )
-
-    '
+_OUTPUT_PATH = flags.DEFINE_string(
+    'output_path',
     '/tmp/',
-    'The
+    'The path to export the tflite model.',
+)
+_OUTPUT_NAME_PREFIX = flags.DEFINE_string(
+    'output_name_prefix',
+    'smollm',
+    'The prefix of the output tflite model name.',
 )
 _PREFILL_SEQ_LENS = flags.DEFINE_multi_integer(
     'prefill_seq_lens',
@@ -49,20 +54,24 @@ _QUANTIZE = flags.DEFINE_bool(
     True,
     'Whether the model should be quantized.',
 )
+_LORA_RANKS = flags.DEFINE_multi_integer(
+    'lora_ranks',
+    None,
+    'If set, the model will be converted with the provided list of LoRA ranks.',
+)


 def main(_):
   pytorch_model = smollm.build_model(
       _CHECKPOINT_PATH.value, kv_cache_max_len=_KV_CACHE_MAX_LEN.value
   )
-
-  quant_suffix = 'q8' if _QUANTIZE.value else 'f32'
-  output_filename = f'smollm_{quant_suffix}_ekv{_KV_CACHE_MAX_LEN.value}.tflite'
   converter.convert_to_tflite(
       pytorch_model,
-
+      output_path=_OUTPUT_PATH.value,
+      output_name_prefix=_OUTPUT_NAME_PREFIX.value,
       prefill_seq_len=_PREFILL_SEQ_LENS.value,
       quantize=_QUANTIZE.value,
+      lora_ranks=_LORA_RANKS.value,
       export_config=ExportConfig(),
   )

```
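The same restructuring appears in every example converter script in this release: the hard-coded output filename is replaced by `output_path` and `output_name_prefix` flags, and a new optional `lora_ranks` multi-flag is passed through to `converter.convert_to_tflite`. Below is a minimal sketch of the equivalent Python call; the checkpoint path, `kv_cache_max_len`, prefill lengths, and rank values are illustrative placeholders, and the `ExportConfig` import location is an assumption rather than something shown in this diff.

```python
# Hedged sketch of the new convert_to_tflite keyword arguments introduced above.
# Paths, sequence lengths, and LoRA ranks are placeholder values; the
# ExportConfig import path is assumed, not taken from this diff.
from ai_edge_torch.generative.examples.smollm import smollm
from ai_edge_torch.generative.utilities import converter
from ai_edge_torch.generative.utilities.model_builder import ExportConfig

pytorch_model = smollm.build_model(
    '/path/to/smollm_checkpoint', kv_cache_max_len=1280
)
converter.convert_to_tflite(
    pytorch_model,
    output_path='/tmp/',
    output_name_prefix='smollm',
    prefill_seq_len=[64, 128],
    quantize=True,
    lora_ranks=[16, 32],  # or None to convert without LoRA adapters
    export_config=ExportConfig(),
)
```

The toy-model and tiny_llama diffs below follow the same pattern.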
ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py:

```diff
@@ -72,14 +72,14 @@ class ToyModelWithKVCache(torch.nn.Module):
     mask = self.mask_cache.index_select(2, input_pos)
     mask = mask[:, :, :, : self.config.max_seq_len]

-
+    updated_kv_entires = []
     for i, block in enumerate(self.transformer_blocks):
       kv_entry = kv_cache.caches[i] if kv_cache else None
       x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
       if kv_entry:
-
+        updated_kv_entires.append(kv_entry)

-    updated_kv_cache = kv_utils.KVCache(tuple(
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))

     if export_config is not None:
       if (
```
ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py:

```diff
@@ -29,10 +29,15 @@ _CHECKPOINT_PATH = flags.DEFINE_string(
     os.path.join(pathlib.Path.home(), 'Downloads/llm_data/tiny_llama'),
     'The path to the model checkpoint, or directory holding the checkpoint.',
 )
-
-    '
+_OUTPUT_PATH = flags.DEFINE_string(
+    'output_path',
     '/tmp/',
-    'The
+    'The path to export the tflite model.',
+)
+_OUTPUT_NAME_PREFIX = flags.DEFINE_string(
+    'output_name_prefix',
+    'tinyllama',
+    'The prefix of the output tflite model name.',
 )
 _PREFILL_SEQ_LENS = flags.DEFINE_multi_integer(
     'prefill_seq_lens',
@@ -49,21 +54,24 @@ _QUANTIZE = flags.DEFINE_bool(
     True,
     'Whether the model should be quantized.',
 )
+_LORA_RANKS = flags.DEFINE_multi_integer(
+    'lora_ranks',
+    None,
+    'If set, the model will be converted with the provided list of LoRA ranks.',
+)


 def main(_):
   pytorch_model = tiny_llama.build_model(
       _CHECKPOINT_PATH.value, kv_cache_max_len=_KV_CACHE_MAX_LEN.value
   )
-  quant_suffix = 'q8' if _QUANTIZE.value else 'f32'
-  output_filename = (
-      f'tinyllama_{quant_suffix}_ekv{_KV_CACHE_MAX_LEN.value}.tflite'
-  )
   converter.convert_to_tflite(
       pytorch_model,
-
+      output_path=_OUTPUT_PATH.value,
+      output_name_prefix=_OUTPUT_NAME_PREFIX.value,
       prefill_seq_len=_PREFILL_SEQ_LENS.value,
       quantize=_QUANTIZE.value,
+      lora_ranks=_LORA_RANKS.value,
       export_config=ExportConfig(),
   )

```
ai_edge_torch/generative/layers/attention.py:

```diff
@@ -19,6 +19,7 @@ from typing import Optional, Tuple, Union

 from ai_edge_torch.generative.layers import builder
 from ai_edge_torch.generative.layers import kv_cache as kv_utils
+from ai_edge_torch.generative.layers import lora as lora_utils
 from ai_edge_torch.generative.layers import scaled_dot_product_attention as sdpa
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
@@ -26,6 +27,33 @@ import torch
 from torch import nn


+def _embed_rope(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    n_elem: int,
+    rope: Tuple[torch.Tensor, torch.Tensor],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+  """Embed rotary positional embedding for query and key.
+
+  Args:
+    q (torch.Tensor): query tensor.
+    k (torch.Tensor): key tensor.
+    n_elem (int): number of elements to embed rotarty positional embedding.
+    rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
+  """
+  if n_elem > 0:
+    cos, sin = rope
+    q_roped = rotary_pos_emb.apply_rope(
+        q[..., :n_elem], cos.repeat(1, 2), sin.repeat(1, 2)
+    )
+    k_roped = rotary_pos_emb.apply_rope(
+        k[..., :n_elem], cos.repeat(1, 2), sin.repeat(1, 2)
+    )
+    q = torch.cat((q_roped, q[..., n_elem:]), dim=-1)
+    k = torch.cat((k_roped, k[..., n_elem:]), dim=-1)
+  return q, k
+
+
 class TransformerBlock(nn.Module):

   def __init__(
```
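`_embed_rope` centralizes the partial-rotary logic that `CausalSelfAttention` and `CrossAttention` previously computed inline: only the first `n_elem = rotary_percentage * head_dim` channels of the query and key are rotated, and the remaining channels pass through unchanged. The self-contained sketch below reproduces that split-rotate-concatenate pattern; the rotation itself is a standard rotate-half stand-in, since the exact tensor layout expected by `rotary_pos_emb.apply_rope` is not shown in this section.

```python
# Stand-in for the partial-RoPE pattern used by _embed_rope: rotate the first
# n_elem channels, leave the rest untouched. The rotate-half math here is an
# assumption; the real code delegates to rotary_pos_emb.apply_rope.
import torch


def partial_rope(
    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, n_elem: int
) -> torch.Tensor:
  if n_elem <= 0:
    return x
  x_rot, x_pass = x[..., :n_elem], x[..., n_elem:]
  x1, x2 = x_rot[..., : n_elem // 2], x_rot[..., n_elem // 2 :]
  rotated = torch.cat((-x2, x1), dim=-1)
  x_rot = x_rot * cos + rotated * sin
  return torch.cat((x_rot, x_pass), dim=-1)


# Identity rotation (cos=1, sin=0) leaves the tensor unchanged.
q = torch.randn(1, 4, 2, 8)   # [batch, seq, heads, head_dim]
n_elem = 4                    # e.g. rotary_percentage=0.5, head_dim=8
cos, sin = torch.ones(n_elem), torch.zeros(n_elem)
assert torch.allclose(partial_rope(q, cos, sin, n_elem), q)
```

The remaining hunks in attention.py thread the new `lora` argument through the block and attention forward paths.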
```diff
@@ -66,6 +94,7 @@ class TransformerBlock(nn.Module):
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
       kv_cache: kv_utils.KVCacheEntry = None,
+      lora: Optional[lora_utils.LoRAEntry] = None,
   ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the TransformerBlock.

@@ -75,6 +104,7 @@ class TransformerBlock(nn.Module):
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
       kv_cache (KVCacheEntry): the optional kv cache entry.
+      lora (LoRAEntry): the optional lora entry.

     Returns:
       output activation from this transformer block, and updated kv cache (if
@@ -83,7 +113,9 @@ class TransformerBlock(nn.Module):
     kv = None
     if self.config.parallel_residual:
       x_norm = self.pre_atten_norm(x)
-      atten_func_out = self.atten_func(
+      atten_func_out = self.atten_func(
+          x_norm, rope, mask, input_pos, kv_cache, lora
+      )
       if kv_cache is None:
         attn_out = atten_func_out
       else:
@@ -92,7 +124,9 @@ class TransformerBlock(nn.Module):
       output = x + attn_out + ff_out
     else:
       x_norm = self.pre_atten_norm(x)
-      atten_func_out = self.atten_func(
+      atten_func_out = self.atten_func(
+          x_norm, rope, mask, input_pos, kv_cache, lora
+      )
       if kv_cache is None:
         attn_out = atten_func_out
       else:
@@ -152,6 +186,7 @@ class CausalSelfAttention(nn.Module):
       mask: Optional[torch.Tensor] = None,
       input_pos: Optional[torch.Tensor] = None,
       kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+      lora: Optional[lora_utils.LoRAEntry] = None,
   ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
     """Forward function of the CausalSelfAttention layer, which can support

@@ -162,7 +197,8 @@ class CausalSelfAttention(nn.Module):
       rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
       mask (torch.Tensor): the optional mask tensor.
       input_pos (torch.Tensor): the optional input position tensor.
-      kv_cache (KVCacheEntry):
+      kv_cache (KVCacheEntry): the KV cache entry corresponding to this module.
+      lora (LoRAEntry): the optional lora entry.

     Returns:
       output activation from this self attention layer, and the updated
@@ -201,6 +237,11 @@ class CausalSelfAttention(nn.Module):
          dim=-1,
      )

+    if lora is not None:
+      q += lora_utils.apply_lora(x, lora.attention.query, shape=q.shape)
+      k += lora_utils.apply_lora(x, lora.attention.key, shape=k.shape)
+      v += lora_utils.apply_lora(x, lora.attention.value, shape=v.shape)
+
     q = self.query_norm(q)
     k = self.key_norm(k)

```
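`ai_edge_torch/generative/layers/lora.py` (+557 lines) is not included in this section, so the body of `lora_utils.apply_lora` is not visible here. Its call sites above imply the standard LoRA formulation: a low-rank delta computed from the block input `x`, added onto the frozen q/k/v projections and reshaped to the projected tensor's shape. The sketch below is a hypothetical reconstruction under that assumption, not the library's actual implementation.

```python
# Hypothetical apply_lora under the standard x @ A @ B low-rank formulation.
# Names (lora_a, lora_b) and the absence of an alpha/rank scaling factor are
# assumptions; the real implementation lives in the unshown lora.py.
from typing import Optional, Tuple

import torch


def apply_lora_sketch(
    x: torch.Tensor,                       # [B, T, model_dim] block input
    lora_a: torch.Tensor,                  # [model_dim, rank] down-projection
    lora_b: torch.Tensor,                  # [rank, out_dim] up-projection
    shape: Optional[Tuple[int, ...]] = None,
) -> torch.Tensor:
  """Low-rank delta added onto a frozen projection's output."""
  delta = x @ lora_a @ lora_b              # [B, T, out_dim]
  return delta.reshape(shape) if shape is not None else delta


# Mirrors the call sites above, e.g. q += apply_lora_sketch(x, a_q, b_q, shape=q.shape)
```

The final hunks below switch both attention classes over to `_embed_rope`, rename the SDPA output, and add the LoRA delta on the output projection.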
```diff
@@ -211,14 +252,13 @@ class CausalSelfAttention(nn.Module):
     if rope is not None:
       # Compute rotary positional embedding for query and key.
       n_elem = int(self.config.rotary_percentage * self.config.head_dim)
-
-      q, k = rotary_pos_emb.apply_rope_inline(q, k, cos, sin)
+      q, k = _embed_rope(q, k, n_elem, rope)

     if kv_cache is not None:
       kv_cache = kv_utils.update(kv_cache, input_pos, k, v)
       k, v = kv_cache.k_cache, kv_cache.v_cache

-
+    sdpa_out = self.sdpa_func(
        q,
        k,
        v,
@@ -226,10 +266,13 @@ class CausalSelfAttention(nn.Module):
        mask=mask,
        softcap=self.config.logit_softcap,
    )
-
+    sdpa_out = sdpa_out.reshape(B, T, -1)

    # Compute the output projection.
-    y = self.output_projection(
+    y = self.output_projection(sdpa_out)
+    if lora is not None:
+      y += lora_utils.apply_lora(sdpa_out, lora.attention.output)
+
    return y if kv_cache is None else (y, kv_cache)


@@ -242,6 +285,7 @@ class SelfAttention(CausalSelfAttention):
      rope: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
      input_pos: Optional[torch.Tensor] = None,
      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+      lora: Optional[lora_utils.LoRAEntry] = None,
  ) -> Union[torch.Tensor, Tuple[torch.Tensor, kv_utils.KVCacheEntry]]:
    """Forward function of the SelfAttention layer, which can support MQA, GQA and MHA.

@@ -249,18 +293,23 @@ class SelfAttention(CausalSelfAttention):
      x (torch.Tensor): the input tensor.
      rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
      input_pos (torch.Tensor): the optional input position tensor.
-      kv_cache (KVCacheEntry):
+      kv_cache (KVCacheEntry): the KV cache entry corresponding to this module.
+      lora (LoRAEntry): the optional lora entry.

    Returns:
      output activation from this self attention layer, and the updated
      KV Cach Entry (if passed in).
    """
    B, T, _ = x.size()
+    assert (
+        kv_cache is None
+    ), "KV cache is not supported in non-causal SelfAttention."
    return super().forward(
        x,
        rope=rope,
        mask=torch.zeros((B, 1, T, T), dtype=torch.float32),
        input_pos=input_pos,
+        lora=lora,
    )


@@ -317,6 +366,7 @@ class CrossAttention(nn.Module):
      mask: Optional[torch.Tensor] = None,
      input_pos: Optional[torch.Tensor] = None,
      kv_cache: Optional[kv_utils.KVCacheEntry] = None,
+      lora: Optional[lora_utils.LoRAEntry] = None,
  ):
    """Forward function of the CrossAttention layer.

@@ -327,7 +377,8 @@ class CrossAttention(nn.Module):
      mask (torch.Tensor): the optional mask tensor can be broadcaseted to shape
        [B, n_heads, target_seq_len, source_seq_len].
      input_pos (torch.Tensor): the optional input position tensor.
-      kv_cache (KVCacheEntry):
+      kv_cache (KVCacheEntry): the KV cache entry corresponding to this module.
+      lora (LoRAEntry): the optional lora entry.

    Returns:
      output activation from this cross attention layer.
@@ -340,6 +391,11 @@ class CrossAttention(nn.Module):
    k = self.k_projection(y)
    v = self.v_projection(y)

+    if lora is not None:
+      q += lora_utils.apply_lora(x, lora.attention.query, shape=q.shape)
+      k += lora_utils.apply_lora(x, lora.attention.key, shape=k.shape)
+      v += lora_utils.apply_lora(x, lora.attention.value, shape=v.shape)
+
    interim_shape = (batch_size, -1, self.n_heads, self.config.head_dim)
    q = q.view(interim_shape)
    k = k.view(interim_shape)
@@ -348,8 +404,7 @@ class CrossAttention(nn.Module):
    if rope is not None:
      # Compute rotary positional embedding for query and key.
      n_elem = int(self.config.rotary_percentage * self.config.head_dim)
-
-      q, k = rotary_pos_emb.apply_rope_inline(q, k, cos, sin)
+      q, k = _embed_rope(q, k, n_elem, rope)

    if kv_cache is not None:
      kv_cache = kv_utils.update(kv_cache, input_pos, k, v)
@@ -363,4 +418,7 @@ class CrossAttention(nn.Module):

    # Compute the output projection.
    y = self.output_projection(y)
+    if lora is not None:
+      y += lora_utils.apply_lora(y, lora.attention.output)
+
    return y if kv_cache is None else (y, kv_cache)
```