ai-edge-torch-nightly 0.3.0.dev20241206__py3-none-any.whl → 0.3.0.dev20241213__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. ai_edge_torch/debug/test/test_culprit.py +8 -3
  2. ai_edge_torch/generative/examples/amd_llama_135m/amd_llama_135m.py +8 -3
  3. ai_edge_torch/generative/examples/amd_llama_135m/convert_to_tflite.py +2 -0
  4. ai_edge_torch/generative/examples/gemma/convert_gemma1_to_tflite.py +2 -0
  5. ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +2 -0
  6. ai_edge_torch/generative/examples/gemma/gemma1.py +8 -3
  7. ai_edge_torch/generative/examples/gemma/gemma2.py +15 -8
  8. ai_edge_torch/generative/examples/llama/convert_to_tflite.py +2 -0
  9. ai_edge_torch/generative/examples/llama/llama.py +11 -17
  10. ai_edge_torch/generative/examples/openelm/convert_to_tflite.py +2 -0
  11. ai_edge_torch/generative/examples/openelm/openelm.py +8 -3
  12. ai_edge_torch/generative/examples/paligemma/convert_to_tflite.py +2 -0
  13. ai_edge_torch/generative/examples/paligemma/decoder.py +10 -9
  14. ai_edge_torch/generative/examples/paligemma/paligemma.py +11 -1
  15. ai_edge_torch/generative/examples/phi/convert_phi3_to_tflite.py +2 -0
  16. ai_edge_torch/generative/examples/phi/convert_to_tflite.py +2 -0
  17. ai_edge_torch/generative/examples/phi/phi2.py +8 -3
  18. ai_edge_torch/generative/examples/phi/phi3.py +7 -9
  19. ai_edge_torch/generative/examples/qwen/convert_to_tflite.py +2 -0
  20. ai_edge_torch/generative/examples/qwen/qwen.py +12 -9
  21. ai_edge_torch/generative/examples/smollm/convert_to_tflite.py +3 -0
  22. ai_edge_torch/generative/examples/smollm/smollm.py +8 -3
  23. ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +12 -2
  24. ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +2 -0
  25. ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +8 -3
  26. ai_edge_torch/generative/layers/attention.py +2 -6
  27. ai_edge_torch/generative/layers/kv_cache.py +25 -18
  28. ai_edge_torch/generative/layers/normalization.py +1 -3
  29. ai_edge_torch/generative/test/test_kv_cache.py +3 -3
  30. ai_edge_torch/generative/test/test_model_conversion.py +4 -5
  31. ai_edge_torch/generative/test/test_model_conversion_large.py +37 -32
  32. ai_edge_torch/generative/test/utils.py +31 -6
  33. ai_edge_torch/generative/utilities/converter.py +25 -4
  34. ai_edge_torch/generative/utilities/model_builder.py +24 -4
  35. ai_edge_torch/generative/utilities/verifier.py +16 -2
  36. ai_edge_torch/odml_torch/lowerings/__init__.py +1 -1
  37. ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +28 -2
  38. ai_edge_torch/odml_torch/lowerings/decomp.py +65 -0
  39. ai_edge_torch/odml_torch/lowerings/registry.py +0 -32
  40. ai_edge_torch/version.py +1 -1
  41. {ai_edge_torch_nightly-0.3.0.dev20241206.dist-info → ai_edge_torch_nightly-0.3.0.dev20241213.dist-info}/METADATA +2 -2
  42. {ai_edge_torch_nightly-0.3.0.dev20241206.dist-info → ai_edge_torch_nightly-0.3.0.dev20241213.dist-info}/RECORD +45 -44
  43. {ai_edge_torch_nightly-0.3.0.dev20241206.dist-info → ai_edge_torch_nightly-0.3.0.dev20241213.dist-info}/LICENSE +0 -0
  44. {ai_edge_torch_nightly-0.3.0.dev20241206.dist-info → ai_edge_torch_nightly-0.3.0.dev20241213.dist-info}/WHEEL +0 -0
  45. {ai_edge_torch_nightly-0.3.0.dev20241206.dist-info → ai_edge_torch_nightly-0.3.0.dev20241213.dist-info}/top_level.txt +0 -0
ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py
@@ -17,10 +17,16 @@
 
 import ai_edge_torch.generative.layers.model_config as cfg
 from ai_edge_torch.generative.utilities import model_builder
+from torch import nn
 
 TENSOR_NAMES = model_builder.TENSOR_NAMES_WITH_SEPARATE_LM_HEAD
 
 
+class TinyLlama(model_builder.DecoderOnlyModel):
+  """A TinyLlama model built from the Edge Generative API layers."""
+  pass
+
+
 def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
   """Returns the model config for a TinyLlama model.
 
@@ -73,11 +79,10 @@ def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
   return config
 
 
-def build_model(
-    checkpoint_path: str, **kwargs
-) -> model_builder.DecoderOnlyModel:
+def build_model(checkpoint_path: str, **kwargs) -> nn.Module:
   return model_builder.build_decoder_only_model(
       checkpoint_path=checkpoint_path,
       config=get_model_config(**kwargs),
       tensor_names=TENSOR_NAMES,
+      model_class=TinyLlama,
   )
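
Note: the hunk above shows the pattern applied across the example models in this release: each example now defines a thin named subclass of model_builder.DecoderOnlyModel and passes it to build_decoder_only_model via the new model_class argument. A minimal usage sketch follows; the checkpoint path is a placeholder, not a real file.

from ai_edge_torch.generative.examples.tiny_llama import tiny_llama

# Hypothetical checkpoint path; build_model forwards **kwargs to get_model_config.
model = tiny_llama.build_model("/tmp/tiny_llama", kv_cache_max_len=1024)
# The returned object is now the named subclass rather than a bare DecoderOnlyModel.
assert isinstance(model, tiny_llama.TinyLlama)
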
ai_edge_torch/generative/layers/attention.py
@@ -241,9 +241,7 @@ class CausalSelfAttention(nn.Module):
       q, k = _embed_rope(q, k, n_elem, rope)
 
     if kv_cache is not None:
-      kv_cache = kv_utils.update(
-          kv_cache, input_pos, k, v, enable_hlfb=self.enable_hlfb
-      )
+      kv_cache = kv_utils.update(kv_cache, input_pos, k, v)
       k, v = kv_cache.k_cache, kv_cache.v_cache
 
     y = self.sdpa_func(
@@ -379,9 +377,7 @@ class CrossAttention(nn.Module):
       q, k = _embed_rope(q, k, n_elem, rope)
 
     if kv_cache is not None:
-      kv_cache = kv_utils.update(
-          kv_cache, input_pos, k, v, enable_hlfb=self.enable_hlfb
-      )
+      kv_cache = kv_utils.update(kv_cache, input_pos, k, v)
       k, v = kv_cache.k_cache, kv_cache.v_cache
     if mask is None:
       mask = torch.zeros(
ai_edge_torch/generative/layers/kv_cache.py
@@ -146,7 +146,7 @@ def update(
     input_pos: torch.Tensor,
     k_slice: torch.Tensor,
     v_slice: torch.Tensor,
-    enable_hlfb: bool = True,
+    use_dus: bool = True,
 ) -> KVCacheEntry:
   """Out of place update of Cache buffer.
 
@@ -155,17 +155,14 @@ def update(
     input_pos (torch.Tensor): The update slice positions.
     k_slice (torch.Tensor): The K slice to be updated in the new cache.
     v_slice (torch.Tensor): The V slice to be updated in the new cache.
-    enable_hlfb (bool, optional): Whether the op is annotated for export with
-      High Level Function Boundary. Defaults to True.
 
   Returns:
     KVCacheEntry: The updated KVCache entry based on the passed inputs.
   """
-  # Don't enable HLFB for kv cache op for now, since it won't work with LLM
-  # inference engine. Remove this part once we ship a new LLM inference engine.
-  enable_hlfb=False
-  update_func = _update_kv_hlfb_impl if enable_hlfb else _update_kv_base_impl
-  return update_func(cache, input_pos, k_slice, v_slice)
+  # Turn dynamic_update_slice updates off for now.
+  use_dus=False
+  update_kv_cache = _update_kv_impl if use_dus else _update_kv_base_impl
+  return update_kv_cache(cache, input_pos, k_slice, v_slice)
 
 
 def _update_kv_base_impl(
@@ -181,18 +178,28 @@ def _update_kv_base_impl(
   return updated_cache
 
 
-def _update_kv_hlfb_impl(
+def _get_slice_indices(positions: torch.Tensor) -> torch.Tensor:
+  """Dynamic Update Slice updates are a variadic sequence of 0-rank tensors."""
+
+  zero = torch.zeros([]).int()
+  positions = positions.int()[0].reshape([])
+  return [zero, positions, zero, zero]
+
+
+def _update_kv_impl(
     cache: KVCacheEntry,
     input_pos: torch.Tensor,
     k_slice: torch.Tensor,
     v_slice: torch.Tensor,
 ) -> KVCacheEntry:
-  """Update the cache buffer with High Level Function Boundary annotation."""
-  builder = hlfb.StableHLOCompositeBuilder(name="odml.update_external_kv_cache")
-  k_cache, v_cache, input_pos, k_slice, v_slice = builder.mark_inputs(
-      cache.k_cache, cache.v_cache, input_pos, k_slice, v_slice
-  )
-  k = k_cache.index_copy(1, input_pos.to(torch.long), k_slice)
-  v = v_cache.index_copy(1, input_pos.to(torch.long), v_slice)
-  k, v = builder.mark_outputs(k, v)
-  return KVCacheEntry(k, v)
+  """Update the cache buffer for K and V caches."""
+  # NB: Here assume that input_pos == range(input_pos[0], len(input_pos))
+
+  k_slice_indices = _get_slice_indices(input_pos)
+  v_slice_indices = _get_slice_indices(input_pos)
+
+  k = dynamic_update_slice(cache.k_cache, k_slice, k_slice_indices)
+  v = dynamic_update_slice(cache.v_cache, v_slice, v_slice_indices)
+
+  updated_cache = KVCacheEntry(k, v)
+  return updated_cache
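
Note: for readers unfamiliar with dynamic-update-slice semantics, the sketch below emulates what _update_kv_impl does using plain torch ops: an out-of-place write of a contiguous slice starting at input_pos[0] along the sequence axis of a [batch, seq, kv_heads, head_dim] cache. The helper name and shapes here are illustrative assumptions; the real dynamic_update_slice op comes from the converter lowering, not from this snippet.

import torch

def emulate_dynamic_update_slice(operand, update, start_indices):
  """Illustrative stand-in: copies `operand` and writes `update` at `start_indices`."""
  result = operand.clone()
  index = tuple(
      slice(int(start), int(start) + size)
      for start, size in zip(start_indices, update.shape)
  )
  result[index] = update
  return result

# Cache shaped [batch=1, seq=8, kv_heads=1, head_dim=4]; write two positions.
k_cache = torch.zeros(1, 8, 1, 4)
k_slice = torch.full((1, 2, 1, 4), 7.0)
input_pos = torch.tensor([0, 1])  # assumed contiguous, per the NB comment above

zero = torch.zeros([]).int()
start_indices = [zero, input_pos.int()[0].reshape([]), zero, zero]
updated = emulate_dynamic_update_slice(k_cache, k_slice, start_indices)
assert updated[0, :2].eq(7).all() and updated[0, 2:].eq(0).all()
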
ai_edge_torch/generative/layers/normalization.py
@@ -190,14 +190,12 @@ def group_norm_with_hlfb(
   """
   x = torch.permute(x, (0, 2, 3, 1))
 
-  # TODO: b/366544750 - Change "reduction_axes" field as an array, rather than
-  # int32 when the bug is fixed.
   builder = StableHLOCompositeBuilder(
       name="odml.group_norm",
       attr={
           "num_groups": num_groups,
           "epsilon": eps,
-          "reduction_axes": 3,
+          "reduction_axes": [3],
           "channel_axis": 3,
       },
   )
ai_edge_torch/generative/test/test_kv_cache.py
@@ -71,18 +71,18 @@ class TestKVLayers(googletest.TestCase):
         [0, 0, 5, 5, 0, 0, 0, 0],
     )
     # multi-slice update
-    input_pos = torch.tensor([0, 3])
+    input_pos = torch.tensor([0, 1])
     k_slice = v_slice = torch.full(
         (1, 2, NUM_QG, HEAD_DIM), 7, dtype=torch.float
     )
     updated_entry = kv_utils.update(entry, input_pos, k_slice, v_slice)
     self.assertEqual(
         updated_entry.k_cache.numpy().flatten().tolist(),
-        [7, 7, 0, 0, 0, 0, 7, 7],
+        [7, 7, 7, 7, 0, 0, 0, 0],
     )
     self.assertEqual(
         updated_entry.v_cache.numpy().flatten().tolist(),
-        [7, 7, 0, 0, 0, 0, 7, 7],
+        [7, 7, 7, 7, 0, 0, 0, 0],
     )
 
   def test_serialization(self):
ai_edge_torch/generative/test/test_model_conversion.py
@@ -21,7 +21,6 @@ from ai_edge_torch.generative.examples.test_models import toy_model_with_kv_cach
 from ai_edge_torch.generative.examples.tiny_llama import tiny_llama
 from ai_edge_torch.generative.layers import kv_cache
 from ai_edge_torch.generative.test import utils as test_utils
-from ai_edge_torch.generative.utilities import model_builder
 import numpy as np
 import torch
 
@@ -101,8 +100,8 @@ class TestModelConversion(googletest.TestCase):
       ai_edge_config.Config.use_torch_xla,
       reason="tests with custom ops are not supported on oss",
   )
-  def test_toy_model_has_ekv_op(self):
-    """Tests that the model has the external kv cache op."""
+  def test_toy_model_has_dus_op(self):
+    """Tests that the model has the dynamic update slice op."""
     _, edge_model, _ = self._get_params(enable_hlfb=True)
     interpreter_ = interpreter.InterpreterWithCustomOps(
         custom_op_registerers=["GenAIOpsRegisterer"],
@@ -112,7 +111,7 @@ class TestModelConversion(googletest.TestCase):
 
     # pylint: disable=protected-access
     op_names = [op["op_name"] for op in interpreter_._get_ops_details()]
-    self.assertIn("odml.update_external_kv_cache", op_names)
+    self.assertIn("DYNAMIC_UPDATE_SLICE", op_names)
 
   def _test_multisig_model(self, config, pytorch_model, atol, rtol):
     # prefill
@@ -185,7 +184,7 @@ class TestModelConversion(googletest.TestCase):
   )
   def test_tiny_llama_multisig(self):
     config = tiny_llama.get_fake_model_config()
-    pytorch_model = model_builder.DecoderOnlyModel(config).eval()
+    pytorch_model = tiny_llama.TinyLlama(config).eval()
     self._test_multisig_model(config, pytorch_model, atol=1e-5, rtol=1e-5)
 
 
ai_edge_torch/generative/test/test_model_conversion_large.py
@@ -32,7 +32,6 @@ from ai_edge_torch.generative.examples.stable_diffusion import decoder as sd_dec
 from ai_edge_torch.generative.examples.stable_diffusion import diffusion as sd_diffusion
 from ai_edge_torch.generative.layers import kv_cache
 from ai_edge_torch.generative.test import utils as test_utils
-from ai_edge_torch.generative.utilities import model_builder
 import numpy as np
 import torch
 
@@ -53,12 +52,15 @@ class TestModelConversion(googletest.TestCase):
             experimental_default_delegate_latest_features=True,
         )
     )
+    # Default cache_size_limit, 8 is hit and aborts often when the tests are
+    # running all together. Doubles it to avoid abortion.
+    torch._dynamo.config.cache_size_limit = 16
+    np.random.seed(1234)  # Make np.random deterministic.
 
   def _test_model(self, config, model, signature_name, atol, rtol):
-    idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-    tokens = torch.zeros((1, 10), dtype=torch.int, device="cpu")
-    tokens[0, :4] = idx
-    input_pos = torch.arange(0, 10, dtype=torch.int)
+    seq_len = 10
+    tokens = torch.zeros((1, seq_len), dtype=torch.int, device="cpu")
+    input_pos = torch.arange(0, seq_len, dtype=torch.int)
     kv = kv_cache.KVCache.from_model_config(config)
 
     edge_model = ai_edge_torch.signature(
@@ -74,6 +76,7 @@ class TestModelConversion(googletest.TestCase):
         self._interpreter_builder(edge_model.tflite_model())
     )
 
+    tokens = torch.arange(1, seq_len + 1, dtype=torch.int).unsqueeze(0)
     self.assertTrue(
         test_utils.compare_tflite_torch(
             edge_model,
@@ -93,10 +96,8 @@ class TestModelConversion(googletest.TestCase):
   )
   def test_gemma1(self):
     config = gemma1.get_fake_model_config()
-    pytorch_model = model_builder.DecoderOnlyModel(config).eval()
-    self._test_model(
-        config, pytorch_model, "serving_default", atol=1e-2, rtol=1e-5
-    )
+    pytorch_model = gemma1.Gemma1(config).eval()
+    self._test_model(config, pytorch_model, "prefill", atol=1e-3, rtol=1e-5)
 
   @googletest.skipIf(
       ai_edge_config.Config.use_torch_xla,
@@ -122,10 +123,9 @@ class TestModelConversion(googletest.TestCase):
   )
   def test_phi2(self):
     config = phi2.get_fake_model_config()
-    pytorch_model = model_builder.DecoderOnlyModel(config).eval()
-    self._test_model(
-        config, pytorch_model, "serving_default", atol=1e-3, rtol=1e-3
-    )
+    pytorch_model = phi2.Phi2(config).eval()
+    # Phi-2 logits are very big, so we need a larger absolute tolerance.
+    self._test_model(config, pytorch_model, "prefill", atol=1e-3, rtol=1e-5)
 
   @googletest.skipIf(
       ai_edge_config.Config.use_torch_xla,
@@ -142,7 +142,7 @@ class TestModelConversion(googletest.TestCase):
   )
   def test_smollm(self):
     config = smollm.get_fake_model_config()
-    pytorch_model = model_builder.DecoderOnlyModel(config).eval()
+    pytorch_model = smollm.SmolLM(config).eval()
     self._test_model(config, pytorch_model, "prefill", atol=1e-4, rtol=1e-5)
 
   @googletest.skipIf(
@@ -151,7 +151,7 @@ class TestModelConversion(googletest.TestCase):
   )
   def test_openelm(self):
     config = openelm.get_fake_model_config()
-    pytorch_model = model_builder.DecoderOnlyModel(config).eval()
+    pytorch_model = openelm.OpenELM(config).eval()
     self._test_model(config, pytorch_model, "prefill", atol=1e-4, rtol=1e-5)
 
   @googletest.skipIf(
@@ -160,7 +160,7 @@ class TestModelConversion(googletest.TestCase):
   )
   def test_qwen(self):
     config = qwen.get_fake_model_config()
-    pytorch_model = model_builder.DecoderOnlyModel(config).eval()
+    pytorch_model = qwen.Qwen(config).eval()
     self._test_model(config, pytorch_model, "prefill", atol=1e-3, rtol=1e-5)
 
   @googletest.skipIf(
@@ -169,26 +169,26 @@ class TestModelConversion(googletest.TestCase):
   )
   def test_amd_llama_135m(self):
     config = amd_llama_135m.get_fake_model_config()
-    pytorch_model = model_builder.DecoderOnlyModel(config).eval()
-    self._test_model(config, pytorch_model, "prefill", atol=1e-3, rtol=1e-5)
+    pytorch_model = amd_llama_135m.AmdLlama(config).eval()
+    self._test_model(config, pytorch_model, "prefill", atol=1e-5, rtol=1e-5)
 
   @googletest.skipIf(
       ai_edge_config.Config.use_torch_xla,
       reason="tests with custom ops are not supported on oss",
   )
-  def test_paligemma(self):
+  def disabled_test_paligemma(self):
     config = paligemma.get_fake_model_config()
     pytorch_model = paligemma.PaliGemma(config).eval()
-    idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
+
     image_embedding_config = config.image_encoder_config.image_embedding
     num_patches = (
         image_embedding_config.image_size // image_embedding_config.patch_size
     ) ** 2
+
     # Make sure the token size is longer than the number of image patches.
-    tokens_len = num_patches + 10
-    tokens = torch.zeros((1, tokens_len), dtype=torch.int, device="cpu")
-    tokens[0, :4] = idx
-    input_pos = torch.arange(0, tokens_len, dtype=torch.int)
+    seq_len = num_patches + 10
+    tokens = torch.zeros((1, seq_len), dtype=torch.int, device="cpu")
+    input_pos = torch.arange(0, seq_len, dtype=torch.int)
     kv = kv_cache.KVCache.from_model_config(config.decoder_config)
     pixel_values = torch.zeros((1, 3, 8, 8), dtype=torch.float32, device="cpu")
 
@@ -206,6 +206,7 @@ class TestModelConversion(googletest.TestCase):
         self._interpreter_builder(edge_model.tflite_model())
     )
 
+    tokens = torch.arange(1, seq_len + 1, dtype=torch.int).unsqueeze(0)
     self.assertTrue(
         test_utils.compare_tflite_torch(
             edge_model,
@@ -244,7 +245,7 @@ class TestModelConversion(googletest.TestCase):
         signature_name="encode",
     )
     self.assertTrue(
-        np.allclose(
+        test_utils.compare_logits(
            edge_output,
            torch_output.detach().numpy(),
            atol=1e-4,
@@ -258,14 +259,16 @@ class TestModelConversion(googletest.TestCase):
   )
   def test_stable_diffusion_diffusion(self):
     config = sd_diffusion.get_fake_model_config(2)
+    # Reduce stddev(scale) of input values to avoid too big output logits which
+    # fails comparisons with reasonable tolerances.
     latents = torch.from_numpy(
-        np.random.normal(size=(2, 4, 8, 8)).astype(np.float32)
+        np.random.normal(size=(2, 4, 8, 8), scale=0.1).astype(np.float32)
     )
     context = torch.from_numpy(
-        np.random.normal(size=(2, 4, 4)).astype(np.float32)
+        np.random.normal(size=(2, 4, 4), scale=0.1).astype(np.float32)
     )
     time_embedding = torch.from_numpy(
-        np.random.normal(size=(2, 2)).astype(np.float32)
+        np.random.normal(size=(2, 2), scale=0.1).astype(np.float32)
     )
 
     pytorch_model = sd_diffusion.Diffusion(config).eval()
@@ -284,7 +287,7 @@ class TestModelConversion(googletest.TestCase):
         signature_name="diffusion",
     )
     self.assertTrue(
-        np.allclose(
+        test_utils.compare_logits(
            edge_output,
            torch_output.detach().numpy(),
            atol=1e-4,
@@ -298,8 +301,10 @@ class TestModelConversion(googletest.TestCase):
   )
   def test_stable_diffusion_decoder(self):
     config = sd_decoder.get_fake_model_config()
+    # Reduce stddev(scale) of input values to avoid too big output logits which
+    # fails comparisons with reasonable tolerances.
     latents = torch.from_numpy(
-        np.random.normal(size=(1, 4, 64, 64)).astype(np.float32)
+        np.random.normal(size=(1, 4, 64, 64), scale=0.1).astype(np.float32)
    )
 
     pytorch_model = sd_decoder.Decoder(config).eval()
@@ -316,10 +321,10 @@ class TestModelConversion(googletest.TestCase):
         signature_name="decode",
     )
     self.assertTrue(
-        np.allclose(
+        test_utils.compare_logits(
            edge_output,
            torch_output.detach().numpy(),
-            atol=1e-4,
+            atol=1e-3,
            rtol=1e-5,
        )
    )
ai_edge_torch/generative/test/utils.py
@@ -15,6 +15,8 @@
 
 """Common utils for testing."""
 
+import logging
+
 from ai_edge_torch import model
 from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.lowertools import common_utils
@@ -33,7 +35,7 @@ def compare_tflite_torch(
     atol: float = 1e-5,
     rtol: float = 1e-5,
     **kwargs,
-):
+) -> bool:
   """Compares torch models and TFLite models."""
   values, spec = pytree.tree_flatten({"kv_cache": kv_cache})
   flat_names = common_utils.flat_dict_names(spec.children_specs, spec.context)
@@ -49,9 +51,32 @@ def compare_tflite_torch(
       **kwargs,
   )
 
-  return np.allclose(
-      edge_output["logits"],
-      torch_output["logits"].detach().numpy(),
-      atol=atol,
-      rtol=rtol,
+  return compare_logits(
+      edge_output["logits"], torch_output["logits"].detach().numpy(), atol, rtol
   )
+
+
+def compare_logits(
+    edge_logits: np.ndarray,
+    torch_logits: dict[str, torch.Tensor],
+    atol: float = 1e-5,
+    rtol: float = 1e-5,
+) -> bool:
+  """Compares logits from edge model and torch model."""
+  if np.allclose(edge_logits, torch_logits, rtol, atol, equal_nan=True):
+    return True
+
+  logging.info("edge_logits: %s", edge_logits)
+  logging.info("torch_logits: %s", torch_logits)
+
+  orig_atol = atol
+  while rtol < 1:
+    atol = orig_atol
+    while atol < 1:
+      if np.allclose(edge_logits, torch_logits, rtol, atol, equal_nan=True):
+        logging.info("Got allclose true with atol=%s, rtol=%s", atol, rtol)
+        return False
+      atol *= 10
+    rtol *= 10
+  logging.info("allclose failed with reasonable atol and rtol.")
+  return False
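
Note: a small usage sketch of the new helper; the arrays below are made-up values. compare_logits returns True only when the straight np.allclose check passes; on any failure it logs the logits, probes larger tolerances for diagnostic logging, and returns False.

import numpy as np
from ai_edge_torch.generative.test import utils as test_utils

edge_logits = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)
torch_logits = np.array([[1.0, 2.0, 3.000001]], dtype=np.float32)
assert test_utils.compare_logits(edge_logits, torch_logits, atol=1e-5, rtol=1e-5)

# A large mismatch returns False (and logs which looser tolerances would have passed).
assert not test_utils.compare_logits(edge_logits, torch_logits * 10.0)
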
ai_edge_torch/generative/utilities/converter.py
@@ -15,13 +15,28 @@
 
 """Common utility functions for model conversion."""
 
-from typing import Union
+from functools import partial
+from typing import Any, Union
 
 from ai_edge_torch._convert import converter as converter_utils
 import ai_edge_torch.generative.layers.kv_cache as kv_utils
 import ai_edge_torch.generative.layers.model_config as cfg
 from ai_edge_torch.generative.quantize import quant_recipes
+from ai_edge_torch.generative.utilities.model_builder import ExportConfig
 import torch
+import torch.nn as nn
+
+
+class ExportableModule(torch.nn.Module):
+
+  def __init__(self, module, **extra_kwargs):
+    super().__init__()
+    self.module = module
+    self.extra_kwargs = extra_kwargs
+
+  def forward(self, *export_args, **export_kwargs):
+    full_kwargs = {**export_kwargs, **self.extra_kwargs}
+    return self.module(*export_args, **full_kwargs)
 
 
 def convert_to_tflite(
@@ -31,6 +46,7 @@ def convert_to_tflite(
     pixel_values_size: torch.Size = None,
     quantize: bool = True,
     config: cfg.ModelConfig = None,
+    export_config: ExportConfig = None,
 ):
   """Converts a nn.Module model to multi-signature tflite model.
 
@@ -97,6 +113,11 @@ def convert_to_tflite(
   )
 
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
+
+  # For export, we create a module that captures any non-exportable,
+  # arugments, e.g. the generation config object.
+  mod = ExportableModule(pytorch_model, export_config=export_config)
+
   converter = converter_utils.Converter()
   for i in range(len(prefill_seq_lens)):
     prefill_seq_len = prefill_seq_lens[i]
@@ -108,7 +129,7 @@ def convert_to_tflite(
     prefill_signature_name = f'prefill_{prefill_seq_len}'
     converter.add_signature(
         prefill_signature_name,
-        pytorch_model,
+        mod,
        sample_kwargs={
            'tokens': prefill_tokens,
            'input_pos': prefill_input_pos,
@@ -118,7 +139,7 @@ def convert_to_tflite(
    if prefill_pixel_values is not None:
      converter.add_signature(
          prefill_signature_name + '_pixel',
-          pytorch_model,
+          mod,
          sample_kwargs={
              'tokens': prefill_tokens,
              'input_pos': prefill_input_pos,
@@ -129,7 +150,7 @@ def convert_to_tflite(
 
  converter.add_signature(
      'decode',
-      pytorch_model,
+      mod,
      sample_kwargs={
          'tokens': decode_token,
          'input_pos': decode_input_pos,
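
Note: ExportableModule exists so that non-tensor arguments, here the ExportConfig, are bound once at wrap time and re-injected on every call, which keeps each converter signature's sample_kwargs tensor-only. A minimal sketch of the pattern; ToyModel is a stand-in, not part of ai_edge_torch.

import torch
from ai_edge_torch.generative.utilities import converter

class ToyModel(torch.nn.Module):

  def forward(self, tokens, export_config=None):
    # A real decoder would consult export_config (e.g. to skip prefill logits).
    return tokens.float() * (1.0 if export_config is not None else 2.0)

mod = converter.ExportableModule(ToyModel(), export_config=object())
out = mod(torch.tensor([1, 2, 3]))  # export_config is injected automatically
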
ai_edge_torch/generative/utilities/model_builder.py
@@ -16,7 +16,8 @@
 """Utilities to be used for re-authoring transformer models."""
 
 import copy
-from typing import Tuple
+from dataclasses import dataclass
+from typing import Optional, Tuple
 
 from ai_edge_torch.generative.layers import attention
 from ai_edge_torch.generative.layers import builder
@@ -45,6 +46,15 @@ TENSOR_NAMES_WITH_SEPARATE_LM_HEAD = copy.copy(TENSOR_NAMES)
 TENSOR_NAMES_WITH_SEPARATE_LM_HEAD.lm_head = "lm_head"
 
 
+@dataclass
+class ExportConfig:
+  """Model generating configuration settings."""
+
+  # On prefill signatures, should the model produce logit output?
+  # When False, only decode signatures will produce output.
+  output_logits_on_prefill: bool = False
+
+
 class DecoderOnlyModel(nn.Module):
   """A simple decoder-only transformer model built from the Edge Generative API.
 
@@ -93,6 +103,7 @@ class DecoderOnlyModel(nn.Module):
       tokens: torch.Tensor,
       input_pos: torch.Tensor,
       kv_cache: kv_utils.KVCache,
+      export_config: Optional[ExportConfig] = None,
   ) -> dict[torch.Tensor, kv_utils.KVCache]:
     _, seq_len = tokens.size()
     assert self.config.max_seq_len >= seq_len, (
@@ -108,7 +119,7 @@ class DecoderOnlyModel(nn.Module):
       mask = mask[:, :, :, : self.config.kv_cache_max]
 
     return self.forward_with_embeds(
-        input_embeds, rope, mask, input_pos, kv_cache
+        input_embeds, rope, mask, input_pos, kv_cache, export_config
     )
 
   def forward_with_embeds(
@@ -118,6 +129,7 @@ class DecoderOnlyModel(nn.Module):
       mask: torch.Tensor,
      input_pos: torch.Tensor,
      kv_cache: kv_utils.KVCache,
+      export_config: Optional[ExportConfig] = None,
  ) -> dict[torch.Tensor, kv_utils.KVCache]:
    """Forwards the model with input embeddings."""
    assert len(self.transformer_blocks) == len(kv_cache.caches), (
@@ -137,6 +149,13 @@ class DecoderOnlyModel(nn.Module):
      updated_kv_entires.append(kv_entry)
    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
 
+    if export_config is not None:
+      if (
+          torch.numel(input_pos) > 1
+          and not export_config.output_logits_on_prefill
+      ):
+        return {"kv_cache": updated_kv_cache}
+
    x = self.final_norm(x)
    logits = self.lm_head(x)  # (b, t, vocab_size)
    return {"logits": logits, "kv_cache": updated_kv_cache}
@@ -146,8 +165,9 @@ def build_decoder_only_model(
    checkpoint_path: str,
    config: cfg.ModelConfig,
    tensor_names: loading_utils.ModelLoader.TensorNames,
-) -> DecoderOnlyModel:
-  transformer = DecoderOnlyModel(config)
+    model_class: type[nn.Module] = DecoderOnlyModel,
+) -> nn.Module:
+  transformer = model_class(config)
  loader = loading_utils.ModelLoader(checkpoint_path, tensor_names)
  loader.load(
      transformer, strict=not config.lm_head_share_weight_with_embedding
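
Note: a sketch of what the new ExportConfig changes about the forward signature, using the TinyLlama fake config purely as a small stand-in model (shapes are arbitrary). During prefill (more than one position in input_pos) with output_logits_on_prefill=False, the model returns only the updated KV cache; a single-position decode step always returns logits.

import torch
from ai_edge_torch.generative.examples.tiny_llama import tiny_llama
from ai_edge_torch.generative.layers import kv_cache as kv_utils
from ai_edge_torch.generative.utilities.model_builder import ExportConfig

config = tiny_llama.get_fake_model_config()
model = tiny_llama.TinyLlama(config).eval()
kv = kv_utils.KVCache.from_model_config(config)
export_config = ExportConfig(output_logits_on_prefill=False)

# Prefill: several positions, so only the updated KV cache is returned.
tokens = torch.zeros((1, 8), dtype=torch.int)
input_pos = torch.arange(0, 8, dtype=torch.int)
out = model(tokens, input_pos, kv, export_config=export_config)
assert "logits" not in out

# Decode: a single position still produces logits.
out = model(
    torch.zeros((1, 1), dtype=torch.int),
    torch.tensor([8], dtype=torch.int),
    out["kv_cache"],
    export_config=export_config,
)
assert "logits" in out
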
ai_edge_torch/generative/utilities/verifier.py
@@ -19,6 +19,7 @@ import logging
 from typing import List
 
 from ai_edge_torch.generative.layers import kv_cache as kv_utils
+from ai_edge_torch.generative.utilities.model_builder import ExportConfig
 import torch
 
 
@@ -40,6 +41,7 @@ class ModelWrapper(torch.nn.Module):
     """
     super().__init__()
     self.model = model
+    self.export_config = ExportConfig(output_logits_on_prefill=True)
 
   def forward(
       self, tokens: torch.Tensor, pixel_values: torch.Tensor = None
@@ -103,13 +105,25 @@ class ReauthoredModelWrapper(ModelWrapper):
     Returns:
       The output logits and the updated KV cache.
     """
+    # Verification requires logit outputs on prefill for comparison.
+    if (
+        self.export_config is not None
+        and not self.export_config.output_logits_on_prefill
+    ):
+      raise ValueError("Verifier requires logit output on prefill.")
    # Since the reauthored model doesn't include keyword arguments, pass
    # pixel_values only when it is not None. Otherwise, it may raise an error.
    if pixel_values is None:
-      output = self.model.forward(tokens, input_pos, kv_cache)
+      output = self.model.forward(
+          tokens, input_pos, kv_cache, export_config=self.export_config
+      )
    else:
      output = self.model.forward(
-          tokens, input_pos, kv_cache, pixel_values=pixel_values
+          tokens,
+          input_pos,
+          kv_cache,
+          pixel_values=pixel_values,
+          export_config=self.export_config,
      )
    return output["logits"], output["kv_cache"]
ai_edge_torch/odml_torch/lowerings/__init__.py
@@ -21,6 +21,6 @@ from . import _quantized_decomposed
 from . import context
 from . import registry
 from . import utils
-from .registry import decompositions
+from .decomp import decompositions
 from .registry import lookup
 from .registry import lower