ai-edge-torch-nightly 0.3.0.dev20241218__py3-none-any.whl → 0.3.0.dev20241224__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_edge_torch/generative/examples/gemma/gemma1.py +3 -2
- ai_edge_torch/generative/examples/gemma/gemma2.py +43 -25
- ai_edge_torch/generative/examples/paligemma/convert_to_tflite.py +11 -3
- ai_edge_torch/generative/examples/paligemma/decoder.py +14 -5
- ai_edge_torch/generative/examples/paligemma/decoder2.py +174 -0
- ai_edge_torch/generative/examples/paligemma/paligemma.py +30 -15
- ai_edge_torch/generative/examples/paligemma/verify.py +36 -9
- ai_edge_torch/generative/examples/paligemma/verify_decoder2.py +72 -0
- ai_edge_torch/generative/examples/paligemma/verify_image_encoder.py +24 -7
- ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +3 -3
- ai_edge_torch/generative/layers/attention.py +4 -29
- ai_edge_torch/generative/layers/rotary_position_embedding.py +34 -27
- ai_edge_torch/generative/test/test_model_conversion_large.py +28 -9
- ai_edge_torch/generative/utilities/model_builder.py +14 -14
- ai_edge_torch/generative/utilities/verifier.py +22 -22
- ai_edge_torch/odml_torch/export.py +6 -1
- ai_edge_torch/odml_torch/jax_bridge/__init__.py +4 -1
- ai_edge_torch/odml_torch/lowerings/__init__.py +1 -0
- ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +1 -2
- ai_edge_torch/odml_torch/lowerings/_rand.py +142 -0
- ai_edge_torch/version.py +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20241218.dist-info → ai_edge_torch_nightly-0.3.0.dev20241224.dist-info}/METADATA +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20241218.dist-info → ai_edge_torch_nightly-0.3.0.dev20241224.dist-info}/RECORD +26 -23
- {ai_edge_torch_nightly-0.3.0.dev20241218.dist-info → ai_edge_torch_nightly-0.3.0.dev20241224.dist-info}/LICENSE +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20241218.dist-info → ai_edge_torch_nightly-0.3.0.dev20241224.dist-info}/WHEEL +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20241218.dist-info → ai_edge_torch_nightly-0.3.0.dev20241224.dist-info}/top_level.txt +0 -0

ai_edge_torch/generative/examples/paligemma/verify_decoder2.py
@@ -0,0 +1,72 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Verifies the reauthored decoder of PaliGemma2 3B model."""
+
+import logging
+
+from absl import app
+from absl import flags
+from ai_edge_torch.generative.examples.paligemma import decoder2
+from ai_edge_torch.generative.utilities import transformers_verifier
+from ai_edge_torch.generative.utilities import verifier
+import kagglehub
+import transformers
+
+_PROMPTS = flags.DEFINE_multi_string(
+    "prompts",
+    "What is the meaning of life?",
+    "The input prompts to generate answers.",
+)
+_MAX_NEW_TOKENS = flags.DEFINE_integer(
+    "max_new_tokens",
+    30,
+    "The maximum size of the generated tokens.",
+)
+
+
+def main(_):
+  checkpoint = kagglehub.model_download(
+      "google/paligemma-2/transformers/paligemma2-3b-pt-224"
+  )
+  logging.info("Loading the original model from: %s", checkpoint)
+  original_full_model = (
+      transformers.PaliGemmaForConditionalGeneration.from_pretrained(checkpoint)
+  )
+  original_language_model = original_full_model.eval().language_model
+
+  logging.info("Building the reauthored model from: %s", checkpoint)
+  reauthored_model = decoder2.build_decoder2(checkpoint)
+
+  logging.info("Loading the tokenizer from: %s", checkpoint)
+  # It works only when GemmaTokenizerFast is available. In some environments,
+  # use_fast=False doeesn't work either if the tokenizer cannot load the
+  # sentencepiece model file properly.
+  processor = transformers.AutoProcessor.from_pretrained(checkpoint)
+
+  verifier.verify_reauthored_model(
+      original_model=transformers_verifier.TransformersModelWrapper(
+          original_language_model
+      ),
+      reauthored_model=verifier.ReauthoredModelWrapper(reauthored_model),
+      tokenizer=verifier.TokenizerWrapper(processor.tokenizer),
+      generate_prompts=_PROMPTS.value,
+      max_new_tokens=_MAX_NEW_TOKENS.value,
+      atol=1e-04,
+  )
+
+
+if __name__ == "__main__":
+  app.run(main)

ai_edge_torch/generative/examples/paligemma/verify_image_encoder.py
@@ -20,31 +20,48 @@ import pathlib
 from absl import app
 from absl import flags
 from ai_edge_torch.generative.examples.paligemma import image_encoder
+import kagglehub
 from PIL import Image
 import requests
 import torch
 import transformers

+_VERSION = flags.DEFINE_enum(
+    "version",
+    "2",
+    ["1", "2"],
+    "The version of PaliGemma vision model to verify.",
+)
 _IMAGE_URL = flags.DEFINE_string(
     "image_url",
     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true",
     "The image URI to encode.",
 )

+_CHECKPOINT = {
+    "1": "google/paligemma-3b-mix-224",
+    "2": "google/paligemma-2/transformers/paligemma2-3b-pt-224",
+}
+

 def main(_):
-  checkpoint = "google/paligemma-3b-mix-224"
+  if _VERSION.value == "1":
+    checkpoint = _CHECKPOINT[_VERSION.value]
+    # Locate the cached dir.
+    cached_config_file = transformers.utils.cached_file(
+        checkpoint, transformers.utils.CONFIG_NAME
+    )
+    reauthored_checkpoint = pathlib.Path(cached_config_file).parent
+  else:
+    checkpoint = kagglehub.model_download(_CHECKPOINT[_VERSION.value])
+    reauthored_checkpoint = checkpoint
+
   logging.info("Loading the original model from: %s", checkpoint)
   original_full_model = (
       transformers.PaliGemmaForConditionalGeneration.from_pretrained(checkpoint)
   )
   original_vision_model = original_full_model.eval().vision_tower

-  # Locate the cached dir.
-  cached_config_file = transformers.utils.cached_file(
-      checkpoint, transformers.utils.CONFIG_NAME
-  )
-  reauthored_checkpoint = pathlib.Path(cached_config_file).parent
   logging.info("Building the reauthored model from: %s", reauthored_checkpoint)
   reauthored_model = image_encoder.build_image_encoder(reauthored_checkpoint)


@@ -69,7 +86,7 @@ def main(_):

   try:
     assert torch.allclose(
-        outputs_original, outputs_reauthored, atol=1e-…
+        outputs_original, outputs_reauthored, atol=1e-03, rtol=1e-04
     )
   except AssertionError as e:
     logging.error("*** FAILED *** verify with an image")

ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py
@@ -72,14 +72,14 @@ class ToyModelWithKVCache(torch.nn.Module):
     mask = self.mask_cache.index_select(2, input_pos)
     mask = mask[:, :, :, : self.config.max_seq_len]

-    updated_kv_entires = []
+    updated_kv_entries = []
     for i, block in enumerate(self.transformer_blocks):
       kv_entry = kv_cache.caches[i] if kv_cache else None
       x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
       if kv_entry:
-        updated_kv_entires.append(kv_entry)
+        updated_kv_entries.append(kv_entry)

-    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entries))

     if export_config is not None:
       if (

ai_edge_torch/generative/layers/attention.py
@@ -26,33 +26,6 @@ import torch
 from torch import nn


-def _embed_rope(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    n_elem: int,
-    rope: Tuple[torch.Tensor, torch.Tensor],
-) -> Tuple[torch.Tensor, torch.Tensor]:
-  """Embed rotary positional embedding for query and key.
-
-  Args:
-    q (torch.Tensor): query tensor.
-    k (torch.Tensor): key tensor.
-    n_elem (int): number of elements to embed rotarty positional embedding.
-    rope (Tuple[torch.Tensor, torch.Tensor]): the input rope tensor.
-  """
-  if n_elem > 0:
-    cos, sin = rope
-    q_roped = rotary_pos_emb.apply_rope(
-        q[..., :n_elem], cos.repeat(1, 2), sin.repeat(1, 2)
-    )
-    k_roped = rotary_pos_emb.apply_rope(
-        k[..., :n_elem], cos.repeat(1, 2), sin.repeat(1, 2)
-    )
-    q = torch.cat((q_roped, q[..., n_elem:]), dim=-1)
-    k = torch.cat((k_roped, k[..., n_elem:]), dim=-1)
-  return q, k
-
-
 class TransformerBlock(nn.Module):

   def __init__(

@@ -238,7 +211,8 @@ class CausalSelfAttention(nn.Module):
     if rope is not None:
       # Compute rotary positional embedding for query and key.
       n_elem = int(self.config.rotary_percentage * self.config.head_dim)
-      q, k = _embed_rope(q, k, n_elem, rope)
+      cos, sin = rope
+      q, k = rotary_pos_emb.apply_rope_inline(q, k, cos, sin)

     if kv_cache is not None:
       kv_cache = kv_utils.update(kv_cache, input_pos, k, v)

@@ -374,7 +348,8 @@ class CrossAttention(nn.Module):
     if rope is not None:
       # Compute rotary positional embedding for query and key.
       n_elem = int(self.config.rotary_percentage * self.config.head_dim)
-      q, k = _embed_rope(q, k, n_elem, rope)
+      cos, sin = rope
+      q, k = rotary_pos_emb.apply_rope_inline(q, k, cos, sin)

     if kv_cache is not None:
       kv_cache = kv_utils.update(kv_cache, input_pos, k, v)

ai_edge_torch/generative/layers/rotary_position_embedding.py
@@ -32,57 +32,64 @@ def apply_rope(
   """
   x = x.transpose(1, 2)
   head_size = x.size(-1)
-  x1 = x[..., : head_size // 2]
-  x2 = x[..., head_size // 2 :]
-  rotated = torch.cat((-x2, x1), dim=-1)
-  roped = (x * cos) + (rotated * sin)
+  x1, x2 = torch.split(x, head_size // 2, dim=-1)
+  left = x1 * cos - x2 * sin
+  right = x2 * cos + x1 * sin
+  roped = torch.cat([left, right], dim=-1)
   return roped.transpose(1, 2).type_as(x)


-def apply_rope_inline(
-    q: torch.Tensor,
-    k: torch.Tensor,
+def build_rope(
     input_pos: torch.Tensor,
     n_elem: int,
+    head_dim: int,
     base: int = 10_000,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
-  """Computes rotary positional embedding inline for a query and key.
+  """Computes rotary positional embedding cosine and sine tensors.

   Args:
-    q: the query tensor.
-    k: the key tensor.
     input_pos: the sequence indices for the query and key
     n_elem: number of elements of the head dimension for RoPE computation
+    base: the base of the exponentiated value for RoPE.

   Returns:
-    output the RoPE'd query and key.
+    cos, sin tensors
   """

   if n_elem <= 0:
-    return q, k
+    return None, None

   theta = 1.0 / (base ** (torch.arange(0, n_elem, 2).float() / n_elem))
   freq_exponents = (2.0 / n_elem) * torch.arange(
-      q.shape[-1] // 2, dtype=torch.float32
+      head_dim // 2, dtype=torch.float32
   )
   timescale = float(base) ** freq_exponents
   radians = input_pos.clone().unsqueeze(0).unsqueeze(-1) / timescale.unsqueeze(
       0
   ).unsqueeze(0)
-  cos = torch.cos(radians)
-  sin = torch.sin(radians)
+  cos = torch.cos(radians)
+  sin = torch.sin(radians)
+  return cos, sin
+

-  def apply(x, sin, cos):
-    x = x.transpose(1, 2)
-    b, h, s, d = x.shape
-    ans = torch.split(x, d // 2, dim=-1)
-    x1, x2 = ans
-    left = x1 * cos - x2 * sin
-    right = x2 * cos + x1 * sin
-    res = torch.cat([left, right], dim=-1)
-    res = res.transpose(1, 2)
-    return res
+def apply_rope_inline(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+  """Computes rotary positional embedding inline for a query and key.
+
+  Args:
+    q: the query tensor.
+    k: the key tensor.
+    cos: the cosine tensor.
+    sin: the sine tensor.
+
+  Returns:
+    output the RoPE'd query and key.
+  """

-  q_roped = apply(q, sin, cos)
-  k_roped = apply(k, sin, cos)
+  q_roped = apply_rope(q, cos, sin)
+  k_roped = apply_rope(k, cos, sin)
   return q_roped, k_roped

ai_edge_torch/generative/test/test_model_conversion_large.py
@@ -21,6 +21,8 @@ from ai_edge_torch.generative.examples.gemma import gemma1
 from ai_edge_torch.generative.examples.gemma import gemma2
 from ai_edge_torch.generative.examples.llama import llama
 from ai_edge_torch.generative.examples.openelm import openelm
+from ai_edge_torch.generative.examples.paligemma import decoder
+from ai_edge_torch.generative.examples.paligemma import decoder2
 from ai_edge_torch.generative.examples.paligemma import paligemma
 from ai_edge_torch.generative.examples.phi import phi2
 from ai_edge_torch.generative.examples.phi import phi3

@@ -171,13 +173,9 @@ class TestModelConversion(googletest.TestCase):
     pytorch_model = amd_llama_135m.AmdLlama(config).eval()
     self._test_model(config, pytorch_model, "prefill", atol=1e-5, rtol=1e-5)

-  @googletest.skipIf(
-      ai_edge_torch.config.in_oss,
-      reason="tests with custom ops are not supported in oss",
-  )
-  def disabled_test_paligemma(self):
-    config = paligemma.get_fake_model_config()
-    pytorch_model = paligemma.PaliGemma(config).eval()
+  def _test_paligemma_model(self, decoder_class, decoder_config, atol, rtol):
+    config = paligemma.get_fake_model_config(decoder_config)
+    pytorch_model = paligemma.PaliGemma(config, decoder_class).eval()

     image_embedding_config = config.image_encoder_config.image_embedding
     num_patches = (

@@ -215,11 +213,32 @@ class TestModelConversion(googletest.TestCase):
             kv,
             pixel_values=pixel_values,
             signature_name="prefill_pixel",
-            atol=1e-3,
-            rtol=1e-5,
+            atol=atol,
+            rtol=rtol,
         )
     )

+  @googletest.skipIf(
+      ai_edge_torch.config.in_oss,
+      reason="tests with custom ops are not supported in oss",
+  )
+  def disabled_test_paligemma1(self):
+    self._test_paligemma_model(
+        decoder.Decoder, decoder.get_fake_decoder_config, atol=1e-3, rtol=1e-5
+    )
+
+  @googletest.skipIf(
+      ai_edge_torch.config.in_oss,
+      reason="tests with custom ops are not supported in oss",
+  )
+  def disabled_test_paligemma2(self):
+    self._test_paligemma_model(
+        decoder2.Decoder2,
+        decoder2.get_fake_decoder2_config,
+        atol=1e-3,
+        rtol=1e-5,
+    )
+
   @googletest.skipIf(
       ai_edge_torch.config.in_oss,
       reason="tests with custom ops are not supported in oss",

ai_edge_torch/generative/utilities/model_builder.py
@@ -24,6 +24,7 @@ from ai_edge_torch.generative.layers import builder
 from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
 import ai_edge_torch.generative.layers.model_config as cfg
+import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
 import ai_edge_torch.generative.utilities.loader as loading_utils
 import torch
 from torch import nn

@@ -85,13 +86,6 @@ class DecoderOnlyModel(nn.Module):
         config.embedding_dim,
         config.final_norm_config,
     )
-    # ROPE parameters for all attn_configs are the same. Take the first one.
-    attn_config = config.block_config(0).attn_config
-    self.rope_cache = attn_utils.build_rope_cache(
-        size=config.kv_cache_max,
-        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
-        base=attn_config.rotary_base,
-    )
     self.mask_cache = attn_utils.build_causal_mask_cache(
         size=config.kv_cache_max,
     )

@@ -113,16 +107,22 @@ class DecoderOnlyModel(nn.Module):

     # token embeddings of shape (b, t, n_embd)
     input_embeds = self.tok_embedding(tokens)
-    cos, sin = self.rope_cache
-    rope = (cos.index_select(0, input_pos), sin.index_select(0, input_pos))
+
+    # ROPE parameters for all attn_configs are the same. Take the first one.
+    attn_config = self.config.block_config(0).attn_config
+    n_elem = int(attn_config.rotary_percentage * attn_config.head_dim)
+    rope = rotary_pos_emb.build_rope(
+        input_pos, n_elem, attn_config.head_dim, attn_config.rotary_base
+    )
+
     mask = self.mask_cache.index_select(2, input_pos)
     mask = mask[:, :, :, : self.config.kv_cache_max]

-    return self.forward_with_embeds(
+    return self._forward_with_embeds(
         input_embeds, rope, mask, input_pos, kv_cache, export_config
     )

-  def forward_with_embeds(
+  def _forward_with_embeds(
       self,
       input_embeds: torch.Tensor,
       rope: Tuple[torch.Tensor, torch.Tensor],

@@ -141,13 +141,13 @@ class DecoderOnlyModel(nn.Module):
     if self.config.embedding_scale is not None:
       x = x * self.config.embedding_scale

-    updated_kv_entires = []
+    updated_kv_entries = []
     for i, block in enumerate(self.transformer_blocks):
       kv_entry = kv_cache.caches[i] if kv_cache else None
       x, kv_entry = block(x, rope, mask, input_pos, kv_entry)
       if kv_entry:
-        updated_kv_entires.append(kv_entry)
-    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
+        updated_kv_entries.append(kv_entry)
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entries))

     if export_config is not None:
       if (

ai_edge_torch/generative/utilities/verifier.py
@@ -16,7 +16,7 @@
 """Common utility functions to verify the reauthored models."""

 import logging
-from typing import List
+from typing import Any, List

 from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.utilities.model_builder import ExportConfig

@@ -87,6 +87,10 @@ class ReauthoredModelWrapper(ModelWrapper):
     """Returns an initialized KV cache."""
     return kv_utils.KVCache.from_model_config(self.model.config)

+  def _get_extra_args_for_forward(self) -> dict[str, Any]:
+    """Returns extra arguments for the forward() method."""
+    return {}
+
   def _forward_with_kv_cache(
       self,
       tokens: torch.Tensor,
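
The new _get_extra_args_for_forward hook lets model-specific wrapper subclasses inject extra keyword arguments into self.model.forward(); the base implementation returns an empty dict, and _forward_with_kv_cache (next hunk) merges export_config and pixel_values on top of it. A hypothetical override as a sketch only; the subclass name and the extra argument are invented for illustration and are not part of the package:

    from typing import Any

    from ai_edge_torch.generative.utilities import verifier


    class WrapperWithExtraArgs(verifier.ReauthoredModelWrapper):
      """Hypothetical wrapper that passes one extra keyword argument to forward()."""

      def _get_extra_args_for_forward(self) -> dict[str, Any]:
        # Everything returned here ends up as **kwargs on self.model.forward().
        return {"called_by_generate": True}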

@@ -105,26 +109,15 @@ class ReauthoredModelWrapper(ModelWrapper):
     Returns:
       The output logits and the updated KV cache.
     """
-    …
-    if …
-    …
-      output = self.model.forward(
-          tokens, input_pos, kv_cache, export_config=self.export_config
-      )
-    else:
-      output = self.model.forward(
-          tokens,
-          input_pos,
-          kv_cache,
-          pixel_values=pixel_values,
-          export_config=self.export_config,
-      )
+    extra_args = self._get_extra_args_for_forward()
+    if self.export_config is not None:
+      # Verification requires logit outputs on prefill for comparison.
+      if not self.export_config.output_logits_on_prefill:
+        raise ValueError("Verifier requires logit output on prefill.")
+      extra_args["export_config"] = self.export_config
+    if pixel_values is not None:
+      extra_args["pixel_values"] = pixel_values
+    output = self.model.forward(tokens, input_pos, kv_cache, **extra_args)
     return output["logits"], output["kv_cache"]

   def forward(

@@ -141,6 +134,7 @@ class ReauthoredModelWrapper(ModelWrapper):
       prompts: torch.Tensor,
       max_new_tokens: int,
       pixel_values: torch.Tensor = None,
+      eos_token_id: int = 1,
   ) -> torch.IntTensor:
     input_ids = prompts[0].int().tolist()
     tokens = torch.tensor([input_ids])

@@ -152,6 +146,8 @@ class ReauthoredModelWrapper(ModelWrapper):
       )
       generated_token = logits[0][-1].argmax().item()
       input_ids.append(generated_token)
+      if generated_token == eos_token_id:
+        break
       tokens = torch.tensor([[generated_token]])
       input_pos = torch.tensor([len(input_ids) - 1])
       pixel_values = None  # Pass only for the first time.

@@ -254,7 +250,11 @@ def verify_model_with_prompts(
   logging.info("outputs_from_original_model: [[%s]]", response_original)

   logging.info("Generating answer with the reauthored model...")
-  outputs_reauthored = reauthored_model.generate(prompt_tokens, max_new_tokens)
+  outputs_reauthored = reauthored_model.generate(
+      prompt_tokens,
+      max_new_tokens,
+      eos_token_id=tokenizer.tokenizer.eos_token_id,
+  )
   response_reauthored = tokenizer.decode(outputs_reauthored[0])
   logging.info("outputs from reauthored model: [[%s]]", response_reauthored)


ai_edge_torch/odml_torch/export.py
@@ -198,7 +198,12 @@ class MlirLowered:
     # build, which may not have the same StableHLO version as what used in
     # TFLite converter. Therefore we always serialize MLIR module in VHLO.
     # TODO(b/362798610) Build MLIR pybinding in ai-edge-torch release.
-    target_version = stablehlo.get_minimum_version()
+    if stablehlo.get_api_version() < 9:
+      target_version = stablehlo.get_minimum_version()
+    else:
+      target_version = stablehlo.get_version_from_compatibility_requirement(
+          stablehlo.StablehloCompatibilityRequirement.WEEK_4
+      )
     module_bytecode = xla_extension.mlir.serialize_portable_artifact(
         self.module_bytecode, target_version
     )

ai_edge_torch/odml_torch/jax_bridge/__init__.py
@@ -12,4 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-from ai_edge_torch.odml_torch.jax_bridge …
+from ai_edge_torch.odml_torch.jax_bridge import _wrap
+from ai_edge_torch.odml_torch.jax_bridge import utils
+
+wrap = _wrap.wrap

ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py
@@ -26,6 +26,7 @@ import torch_xla2.ops.ops_registry # Import to load torch_xla2 ops

 LoweringContext = context.LoweringContext

+
 @functools.cache
 def _log_usage(op):
   logging.warning("Use jax lowering: %s", str(op))

@@ -184,8 +185,6 @@ lower_by_torch_xla2(torch.ops.aten.permute_copy)
 lower_by_torch_xla2(torch.ops.aten.pixel_shuffle)
 lower_by_torch_xla2(torch.ops.aten.pow)
 lower_by_torch_xla2(torch.ops.aten.prod)
-lower_by_torch_xla2(torch.ops.aten.rand)
-lower_by_torch_xla2(torch.ops.aten.randn)
 lower_by_torch_xla2(torch.ops.aten.reciprocal)
 lower_by_torch_xla2(torch.ops.aten.reflection_pad1d)
 lower_by_torch_xla2(torch.ops.aten.relu)