ai-edge-torch-nightly 0.3.0.dev20240913__py3-none-any.whl → 0.3.0.dev20240914__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_edge_torch/_convert/conversion.py +2 -1
- ai_edge_torch/_convert/fx_passes/__init__.py +5 -41
- ai_edge_torch/_convert/fx_passes/build_aten_composite_pass.py +3 -4
- ai_edge_torch/_convert/fx_passes/build_interpolate_composite_pass.py +3 -4
- ai_edge_torch/_convert/fx_passes/inject_mlir_debuginfo_pass.py +3 -4
- ai_edge_torch/_convert/fx_passes/optimize_layout_transposes_pass/pass_body.py +4 -5
- ai_edge_torch/config.py +4 -1
- ai_edge_torch/fx_pass_base.py +101 -0
- ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +4 -4
- ai_edge_torch/generative/examples/gemma/convert_to_tflite.py +4 -4
- ai_edge_torch/generative/examples/gemma/gemma.py +2 -2
- ai_edge_torch/generative/examples/gemma/gemma2.py +2 -2
- ai_edge_torch/generative/examples/openelm/convert_to_tflite.py +86 -0
- ai_edge_torch/generative/examples/openelm/openelm.py +237 -0
- ai_edge_torch/generative/examples/phi/convert_to_tflite.py +4 -4
- ai_edge_torch/generative/examples/phi/phi2.py +2 -2
- ai_edge_torch/generative/examples/smollm/__init__.py +14 -0
- ai_edge_torch/generative/examples/{smallm → smollm}/convert_to_tflite.py +12 -12
- ai_edge_torch/generative/examples/{smallm/smallm.py → smollm/smollm.py} +24 -15
- ai_edge_torch/generative/examples/stable_diffusion/clip.py +1 -1
- ai_edge_torch/generative/examples/stable_diffusion/convert_to_tflite.py +1 -1
- ai_edge_torch/generative/examples/t5/convert_to_tflite.py +20 -20
- ai_edge_torch/generative/examples/t5/t5.py +8 -8
- ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +3 -3
- ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +4 -4
- ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +2 -2
- ai_edge_torch/generative/fx_passes/__init__.py +4 -4
- ai_edge_torch/generative/fx_passes/remove_sdpa_zero_mask_pass.py +3 -4
- ai_edge_torch/generative/layers/attention.py +7 -0
- ai_edge_torch/generative/layers/builder.py +33 -11
- ai_edge_torch/generative/layers/feed_forward.py +26 -8
- ai_edge_torch/generative/layers/kv_cache.py +4 -4
- ai_edge_torch/generative/layers/model_config.py +24 -15
- ai_edge_torch/generative/quantize/example.py +2 -2
- ai_edge_torch/generative/test/test_model_conversion.py +28 -51
- ai_edge_torch/generative/test/test_model_conversion_large.py +43 -78
- ai_edge_torch/generative/test/test_quantize.py +5 -5
- ai_edge_torch/generative/utilities/loader.py +13 -0
- ai_edge_torch/odml_torch/export.py +40 -0
- ai_edge_torch/odml_torch/lowerings/_basic.py +44 -0
- ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +0 -1
- ai_edge_torch/version.py +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20240913.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/METADATA +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20240913.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/RECORD +48 -46
- ai_edge_torch/_convert/fx_passes/_pass_base.py +0 -53
- ai_edge_torch/_convert/fx_passes/canonicalize_pass.py +0 -35
- /ai_edge_torch/generative/examples/{smallm → openelm}/__init__.py +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240913.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/LICENSE +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240913.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/WHEEL +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240913.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/top_level.txt +0 -0
ai_edge_torch/generative/layers/model_config.py
CHANGED
@@ -30,6 +30,7 @@ class ActivationType(enum.Enum):
   GELU_QUICK = enum.auto()
   GE_GLU = enum.auto()
   RELU = enum.auto()
+  SILU_GLU = enum.auto()


 @enum.unique
@@ -58,6 +59,18 @@ class AttentionType(enum.Enum):
   LOCAL_SLIDING = enum.auto()


+@dataclass
+class NormalizationConfig:
+  """Normalizater parameters."""
+
+  type: NormalizationType = NormalizationType.NONE
+  enable_hlfb: bool = False
+  epsilon: float = 1e-5
+  zero_centered: bool = False
+  # Number of groups used in group normalization.
+  group_num: Optional[float] = None
+
+
 @dataclass
 class AttentionConfig:
   """Attention model's parameters."""
@@ -81,6 +94,14 @@ class AttentionConfig:
   # Whether to use bias with attention output projection.
   output_proj_use_bias: bool = False
   enable_kv_cache: bool = True
+  # The normalization applied to query projection's output.
+  query_norm_config: NormalizationConfig = field(
+      default_factory=NormalizationConfig
+  )
+  # The normalization applied to key projection's output.
+  key_norm_config: NormalizationConfig = field(
+      default_factory=NormalizationConfig
+  )
   relative_attention_num_buckets: int = 0
   relative_attention_max_distance: int = 0
   # Softcap on the output logits.
@@ -94,21 +115,9 @@ class AttentionConfig:
 @dataclass
 class ActivationConfig:
   type: ActivationType = ActivationType.LINEAR
-  #
-
-
-
-
-@dataclass
-class NormalizationConfig:
-  """Normalizater parameters."""
-
-  type: NormalizationType = NormalizationType.NONE
-  enable_hlfb: bool = False
-  epsilon: float = 1e-5
-  zero_centered: bool = False
-  # Number of groups used in group normalization.
-  group_num: Optional[float] = None
+  # Whether to GLU gate is the front part instead of the back part of input
+  # when ActivationType is `GE_GLU` or `SILU_GLU`.
+  gate_is_front: bool = False


 @dataclass
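Net effect of these four hunks: NormalizationConfig moves above AttentionConfig so the new query_norm_config/key_norm_config fields can reference it, SILU_GLU joins the activation enum, and ActivationConfig gains a gate_is_front flag for the GLU variants. A minimal sketch of how the new fields compose; the RMS_NORM enum member and the other AttentionConfig fields are assumed from the existing module, not shown in this diff:

from ai_edge_torch.generative.layers import model_config as cfg

# Hypothetical wiring; only query_norm_config, key_norm_config, and
# gate_is_front come from this diff, the remaining fields are assumed.
attn_config = cfg.AttentionConfig(
    num_heads=8,
    head_dim=64,
    num_query_groups=1,
    # Apply RMSNorm to the query/key projection outputs.
    query_norm_config=cfg.NormalizationConfig(
        type=cfg.NormalizationType.RMS_NORM
    ),
    key_norm_config=cfg.NormalizationConfig(
        type=cfg.NormalizationType.RMS_NORM
    ),
)

ff_activation = cfg.ActivationConfig(
    type=cfg.ActivationType.SILU_GLU,
    gate_is_front=True,  # gate half precedes the value half of the input
)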
ai_edge_torch/generative/quantize/example.py
CHANGED
@@ -25,9 +25,9 @@ def main():
   config = gemma.get_fake_model_config()
   model = gemma.Gemma(config)
   idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-  tokens = torch.full((1, 10), 0, dtype=torch.long, device="cpu")
+  tokens = torch.full((1, 10), 0, dtype=torch.int, device="cpu")
   tokens[0, :4] = idx
-  input_pos = torch.arange(0, 10)
+  input_pos = torch.arange(0, 10, dtype=torch.int)

   # Create a quantization recipe to be applied to the model
   quant_config = quant_recipes.full_int8_dynamic_recipe()
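The two-line change above recurs throughout this release, including the test files below: PyTorch's integer factory functions default to int64, and the converted models now pin token and position inputs to int32. A quick check of the defaults (plain PyTorch, not part of the package):

import torch

# Without an explicit dtype, integer range/full tensors come out as int64.
assert torch.arange(0, 10).dtype == torch.int64
# torch.int is an alias for torch.int32, which the examples now request.
assert torch.arange(0, 10, dtype=torch.int).dtype == torch.int32
assert torch.full((1, 10), 0, dtype=torch.int).dtype == torch.int32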
ai_edge_torch/generative/test/test_model_conversion.py
CHANGED
@@ -42,15 +42,9 @@ class TestModelConversion(googletest.TestCase):
         )
     )

-
-
-
-  )
-  def test_toy_model_with_kv_cache(self):
-    config = toy_model_with_kv_cache.get_model_config()
-    pytorch_model = toy_model_with_kv_cache.ToyModelWithKVCache(config).eval()
-    tokens, input_pos = torch.tensor([[1]], dtype=torch.long), torch.tensor(
-        [10], dtype=torch.int64
+  def _test_model_with_kv_cache(self, config, pytorch_model):
+    tokens, input_pos = torch.tensor([[1]], dtype=torch.int), torch.tensor(
+        [10], dtype=torch.int
     )
     kv = kv_cache.KVCache.from_model_config(config)

@@ -83,58 +77,32 @@ class TestModelConversion(googletest.TestCase):
       ai_edge_config.Config.use_torch_xla,
       reason="tests with custom ops are not supported on oss",
   )
-  def
+  def test_toy_model_with_kv_cache(self):
     config = toy_model_with_kv_cache.get_model_config()
-    config.enable_hlfb = True
     pytorch_model = toy_model_with_kv_cache.ToyModelWithKVCache(config).eval()
-    tokens, input_pos = torch.tensor([[1]], dtype=torch.long), torch.tensor(
-        [10], dtype=torch.int64
-    )
-    kv = kv_cache.KVCache.from_model_config(config)
-
-    edge_model = ai_edge_torch.convert(
-        pytorch_model,
-        sample_kwargs={
-            "tokens": tokens,
-            "input_pos": input_pos,
-            "kv_cache": kv,
-        },
-    )
-    edge_model.set_interpreter_builder(
-        self._interpreter_builder(edge_model.tflite_model())
-    )
-
-    self.assertTrue(
-        test_utils.compare_tflite_torch(
-            edge_model,
-            pytorch_model,
-            tokens,
-            input_pos,
-            kv,
-            signature_name="serving_default",
-            atol=1e-5,
-            rtol=1e-5,
-        )
-    )
+    self._test_model_with_kv_cache(config, pytorch_model)

   @googletest.skipIf(
       ai_edge_config.Config.use_torch_xla,
       reason="tests with custom ops are not supported on oss",
   )
-  def
-    config =
-
+  def test_toy_model_with_kv_cache_with_hlfb(self):
+    config = toy_model_with_kv_cache.get_model_config()
+    config.enable_hlfb = True
+    pytorch_model = toy_model_with_kv_cache.ToyModelWithKVCache(config).eval()
+    self._test_model_with_kv_cache(config, pytorch_model)

+  def _test_multisig_model(self, config, pytorch_model, atol, rtol):
     # prefill
     seq_len = 10
-    prefill_tokens = torch.full((1, seq_len), 0, dtype=torch.long, device="cpu")
+    prefill_tokens = torch.full((1, seq_len), 0, dtype=torch.int, device="cpu")
     prompt_token = torch.from_numpy(np.array([1, 2, 3, 4]))
     prefill_tokens[0, : len(prompt_token)] = prompt_token
-    prefill_input_pos = torch.arange(0, seq_len)
+    prefill_input_pos = torch.arange(0, seq_len, dtype=torch.int)

     # decode
-    decode_token = torch.tensor([[1]], dtype=torch.
-    decode_input_pos = torch.tensor([5], dtype=torch.
+    decode_token = torch.tensor([[1]], dtype=torch.int)
+    decode_input_pos = torch.tensor([5], dtype=torch.int)

     kv = kv_cache.KVCache.from_model_config(config)

@@ -171,8 +139,8 @@ class TestModelConversion(googletest.TestCase):
             prefill_input_pos,
             kv,
             signature_name="prefill",
-            atol=
-            rtol=
+            atol=atol,
+            rtol=atol,
         )
     )

@@ -184,11 +152,20 @@ class TestModelConversion(googletest.TestCase):
             decode_input_pos,
             kv,
             signature_name="decode",
-            atol=
-            rtol=
+            atol=atol,
+            rtol=atol,
         )
     )

+  @googletest.skipIf(
+      ai_edge_config.Config.use_torch_xla,
+      reason="tests with custom ops are not supported on oss",
+  )
+  def test_tiny_llama_multisig(self):
+    config = tiny_llama.get_fake_model_config()
+    pytorch_model = tiny_llama.TinyLlama(config).eval()
+    self._test_multisig_model(config, pytorch_model, atol=1e-5, rtol=1e-5)
+

 if __name__ == "__main__":
   googletest.main()
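The refactor above folds the duplicated test bodies into _test_model_with_kv_cache and _test_multisig_model helpers, and test_tiny_llama_multisig reuses the latter. A sketch of the prefill/decode two-signature conversion that _test_multisig_model drives, built from calls that appear in this diff; the chained .signature(...) on the returned converter is assumed from the ai_edge_torch API rather than shown here:

import torch
import ai_edge_torch
from ai_edge_torch.generative.examples.test_models import toy_model_with_kv_cache
from ai_edge_torch.generative.layers import kv_cache

config = toy_model_with_kv_cache.get_model_config()
pytorch_model = toy_model_with_kv_cache.ToyModelWithKVCache(config).eval()

prefill_tokens = torch.full((1, 10), 0, dtype=torch.int)
prefill_input_pos = torch.arange(0, 10, dtype=torch.int)
decode_token = torch.tensor([[1]], dtype=torch.int)
decode_input_pos = torch.tensor([5], dtype=torch.int)
kv = kv_cache.KVCache.from_model_config(config)

# One TFLite model with a "prefill" and a "decode" signature.
edge_model = (
    ai_edge_torch.signature(
        "prefill",
        pytorch_model,
        sample_kwargs={
            "tokens": prefill_tokens,
            "input_pos": prefill_input_pos,
            "kv_cache": kv,
        },
    )
    .signature(
        "decode",
        pytorch_model,
        sample_kwargs={
            "tokens": decode_token,
            "input_pos": decode_input_pos,
            "kv_cache": kv,
        },
    )
    .convert()
)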
ai_edge_torch/generative/test/test_model_conversion_large.py
CHANGED
@@ -19,7 +19,9 @@ import ai_edge_torch
 from ai_edge_torch import config as ai_edge_config
 from ai_edge_torch.generative.examples.gemma import gemma
 from ai_edge_torch.generative.examples.gemma import gemma2
+from ai_edge_torch.generative.examples.openelm import openelm
 from ai_edge_torch.generative.examples.phi import phi2
+from ai_edge_torch.generative.examples.smollm import smollm
 from ai_edge_torch.generative.layers import kv_cache
 from ai_edge_torch.generative.test import utils as test_utils
 import numpy as np
@@ -43,28 +45,22 @@ class TestModelConversion(googletest.TestCase):
         )
     )

-
-      ai_edge_config.Config.use_torch_xla,
-      reason="tests with custom ops are not supported on oss",
-  )
-  def test_gemma(self):
-    config = gemma.get_fake_model_config()
-    model = gemma.Gemma(config)
-
+  def _test_model(self, config, model, signature_name, atol, rtol):
     idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-    tokens = torch.full((1, 10), 0, dtype=torch.long, device="cpu")
+    tokens = torch.full((1, 10), 0, dtype=torch.int, device="cpu")
     tokens[0, :4] = idx
-    input_pos = torch.arange(0, 10)
+    input_pos = torch.arange(0, 10, dtype=torch.int)
     kv = kv_cache.KVCache.from_model_config(config)

-    edge_model = ai_edge_torch.convert(
+    edge_model = ai_edge_torch.signature(
+        signature_name,
         model,
         sample_kwargs={
             "tokens": tokens,
             "input_pos": input_pos,
             "kv_cache": kv,
         },
-    )
+    ).convert()
     edge_model.set_interpreter_builder(
         self._interpreter_builder(edge_model.tflite_model())
     )
@@ -76,9 +72,9 @@ class TestModelConversion(googletest.TestCase):
             tokens,
             input_pos,
             kv,
-            signature_name=
-            atol=
-            rtol=
+            signature_name=signature_name,
+            atol=atol,
+            rtol=rtol,
         )
     )

@@ -86,42 +82,21 @@ class TestModelConversion(googletest.TestCase):
       ai_edge_config.Config.use_torch_xla,
       reason="tests with custom ops are not supported on oss",
   )
-  def
-    config =
-
-
-
-    idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-    prefill_tokens = torch.full((1, 10), 0, dtype=torch.long, device="cpu")
-    prefill_tokens[0, :4] = idx
-    prefill_input_pos = torch.arange(0, 10)
-    kv = kv_cache.KVCache.from_model_config(config)
-
-    edge_model = ai_edge_torch.signature(
-        "prefill",
-        model,
-        sample_kwargs={
-            "tokens": prefill_tokens,
-            "input_pos": prefill_input_pos,
-            "kv_cache": kv,
-        },
-    ).convert()
-    edge_model.set_interpreter_builder(
-        self._interpreter_builder(edge_model.tflite_model())
+  def test_gemma(self):
+    config = gemma.get_fake_model_config()
+    pytorch_model = gemma.Gemma(config).eval()
+    self._test_model(
+        config, pytorch_model, "serving_default", atol=1e-2, rtol=1e-5
     )

-
-
-
-
-
-
-
-
-    atol=1e-1,
-    rtol=1e-3,
-    )
-    )
+  @googletest.skipIf(
+      ai_edge_config.Config.use_torch_xla,
+      reason="tests with custom ops are not supported on oss",
+  )
+  def test_gemma2(self):
+    config = gemma2.get_fake_model_config()
+    pytorch_model = gemma2.Gemma2(config).eval()
+    self._test_model(config, pytorch_model, "prefill", atol=1e-1, rtol=1e-3)

   @googletest.skipIf(
       ai_edge_config.Config.use_torch_xla,
@@ -130,37 +105,27 @@ class TestModelConversion(googletest.TestCase):
   def test_phi2(self):
     config = phi2.get_fake_model_config()
     pytorch_model = phi2.Phi2(config).eval()
-
-
-    tokens = torch.full((1, 10), 0, dtype=torch.long, device="cpu")
-    tokens[0, :4] = idx
-    input_pos = torch.arange(0, 10)
-    kv = kv_cache.KVCache.from_model_config(config)
-
-    edge_model = ai_edge_torch.convert(
-        pytorch_model,
-        sample_kwargs={
-            "tokens": tokens,
-            "input_pos": input_pos,
-            "kv_cache": kv,
-        },
-    )
-    edge_model.set_interpreter_builder(
-        self._interpreter_builder(edge_model.tflite_model())
+    self._test_model(
+        config, pytorch_model, "serving_default", atol=1e-3, rtol=1e-3
     )

-
-
-
-
-
-
-
-
-
-
-
-
+  @googletest.skipIf(
+      ai_edge_config.Config.use_torch_xla,
+      reason="tests with custom ops are not supported on oss",
+  )
+  def test_smollm(self):
+    config = smollm.get_fake_model_config()
+    pytorch_model = smollm.SmolLM(config).eval()
+    self._test_model(config, pytorch_model, "prefill", atol=1e-4, rtol=1e-5)
+
+  @googletest.skipIf(
+      ai_edge_config.Config.use_torch_xla,
+      reason="tests with custom ops are not supported on oss",
+  )
+  def test_openelm(self):
+    config = openelm.get_fake_model_config()
+    pytorch_model = openelm.OpenELM(config).eval()
+    self._test_model(config, pytorch_model, "prefill", atol=1e-4, rtol=1e-5)


 if __name__ == "__main__":
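With _test_model in place, covering another example model is a three-line test body, exactly the shape of the added test_smollm and test_openelm. A sketch for a hypothetical future example inside TestModelConversion (the newmodel module and NewModel class are invented for illustration):

  @googletest.skipIf(
      ai_edge_config.Config.use_torch_xla,
      reason="tests with custom ops are not supported on oss",
  )
  def test_newmodel(self):
    # Hypothetical example module; not part of the package.
    config = newmodel.get_fake_model_config()
    pytorch_model = newmodel.NewModel(config).eval()
    self._test_model(config, pytorch_model, "prefill", atol=1e-4, rtol=1e-5)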
ai_edge_torch/generative/test/test_quantize.py
CHANGED
@@ -115,8 +115,8 @@ class TestQuantizeConvert(parameterized.TestCase):
   def test_quantize_convert_toy_sizes(self, quant_config):
     config = toy_model.get_model_config()
     pytorch_model = toy_model.ToySingleLayerModel(config)
-    idx = torch.unsqueeze(torch.arange(0, 100), 0)
-    input_pos = torch.arange(0, 100)
+    idx = torch.unsqueeze(torch.arange(0, 100, dtype=torch.int), 0)
+    input_pos = torch.arange(0, 100, dtype=torch.int)

     quantized_model = ai_edge_torch.convert(
         pytorch_model, (idx, input_pos), quant_config=quant_config
@@ -131,8 +131,8 @@ class TestQuantizeConvert(parameterized.TestCase):
   def test_quantize_convert_toy_weight_sharing(self):
     config = toy_model.get_model_config()
     pytorch_model = toy_model.ToySingleLayerModelWeightSharing(config)
-    idx = torch.unsqueeze(torch.arange(0, 100), 0)
-    input_pos = torch.arange(0, 100)
+    idx = torch.unsqueeze(torch.arange(0, 100, dtype=torch.int), 0)
+    input_pos = torch.arange(0, 100, dtype=torch.int)

     quant_config = quant_recipes.full_int8_dynamic_recipe()
     quantized_model = ai_edge_torch.convert(
@@ -149,7 +149,7 @@ class TestQuantizeConvert(parameterized.TestCase):
       self.skipTest("b/338288901")
     config = toy_model_with_kv_cache.get_model_config()
     pytorch_model = toy_model_with_kv_cache.ToyModelWithKV(config)
-    idx, input_pos = torch.tensor([[1]], dtype=torch.long), torch.tensor(
+    idx, input_pos = torch.tensor([[1]], dtype=torch.int), torch.tensor(
         [10], dtype=torch.int64
     )

ai_edge_torch/generative/utilities/loader.py
CHANGED
@@ -101,6 +101,8 @@ class ModelLoader:
     attn_value_proj: str = None
     attn_fused_qkv_proj: str = None
     attn_output_proj: str = None
+    attn_query_norm: str = None
+    attn_key_norm: str = None

     ff_up_proj: str = None
     ff_down_proj: str = None
@@ -323,6 +325,17 @@ class ModelLoader:
         )
     )

+    if self._names.attn_query_norm is not None:
+      attn_query_norm_name = self._names.attn_query_norm.format(idx)
+      converted_state[f"{prefix}.atten_func.query_norm.weight"] = state.pop(
+          f"{attn_query_norm_name}.weight"
+      )
+    if self._names.attn_key_norm is not None:
+      attn_key_norm_name = self._names.attn_key_norm.format(idx)
+      converted_state[f"{prefix}.atten_func.key_norm.weight"] = state.pop(
+          f"{attn_key_norm_name}.weight"
+      )
+
     o_name = self._names.attn_output_proj.format(idx)
     converted_state[f"{prefix}.atten_func.output_projection.weight"] = (
         state.pop(f"{o_name}.weight")
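The new attn_query_norm/attn_key_norm name patterns let a checkpoint's per-layer Q/K norm weights be mapped onto atten_func.query_norm and atten_func.key_norm, with "{}" replaced by the layer index as the hunk above shows. A hypothetical mapping, assuming the nested TensorNames dataclass and the projection fields from the existing class; the tensor paths are illustrative, not from a real checkpoint:

from ai_edge_torch.generative.utilities import loader

TENSOR_NAMES = loader.ModelLoader.TensorNames(
    attn_query_proj="transformer.layers.{}.attn.q_proj",
    attn_key_proj="transformer.layers.{}.attn.k_proj",
    attn_value_proj="transformer.layers.{}.attn.v_proj",
    attn_output_proj="transformer.layers.{}.attn.out_proj",
    # New in this release; "{}" is filled with the layer index.
    attn_query_norm="transformer.layers.{}.attn.q_norm",
    attn_key_norm="transformer.layers.{}.attn.k_norm",
)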
ai_edge_torch/odml_torch/export.py
CHANGED
@@ -223,6 +223,41 @@ class MlirLowered:
     return tf_integration.mlir_to_flatbuffer(self)


+# TODO(b/331481564) Make this a ai_edge_torch FX pass.
+def _convert_i64_to_i32(exported_program: torch.export.ExportedProgram):
+  """Convert internal constant aten ops' output from int64 to int32.
+
+  Int32 generally has better performance and compatibility than int64 in
+  runtime. This pass converts aten op where the output(s) are int64 constant
+  tensors to return int32 constant tensors.
+
+  Args:
+    exported_program: The exported program to apply the pass.
+  """
+
+  def in_i32(x: int):
+    return -2147483648 <= x <= 2147483647
+
+  def rewrite_arange(node: torch.fx.Node):
+    tensor_meta = node.meta.get("tensor_meta", None)
+    if not tensor_meta:
+      return
+
+    start, end = node.args[:2]
+    if tensor_meta.dtype != torch.int64:
+      return
+    if not (in_i32(start) and in_i32(end)):
+      return
+    op = node.target
+    node.target = lambda *args, **kwargs: op(*args, **kwargs).type(torch.int32)
+
+  graph_module = exported_program.graph_module
+  for node in graph_module.graph.nodes:
+
+    if node.target == torch.ops.aten.arange.start_step:
+      rewrite_arange(node)
+
+
 def exported_program_to_mlir(
     exported_program: torch.export.ExportedProgram,
 ) -> MlirLowered:
@@ -231,6 +266,11 @@ def exported_program_to_mlir(
       lowerings.decompositions()
   )

+  _convert_i64_to_i32(exported_program)
+  exported_program = exported_program.run_decompositions(
+      lowerings.decompositions()
+  )
+
   with export_utils.create_ir_context() as context, ir.Location.unknown():

     module = ir.Module.create()
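Why the pass above singles out aten.arange.start_step: without an explicit dtype, integer arange produces int64, and the rewrite swaps the node's target for a wrapper that casts to int32 in-graph, but only when start and end fit in 32 bits. A standalone illustration of both halves (plain PyTorch, not part of the package):

import torch

# Integer arange defaults to int64, the dtype the pass wants to avoid.
assert torch.ops.aten.arange.start_step(0, 10, 1).dtype == torch.int64

# The rewrite replaces node.target with a casting wrapper like this one.
op = torch.ops.aten.arange.start_step
wrapped = lambda *args, **kwargs: op(*args, **kwargs).type(torch.int32)
assert wrapped(0, 10, 1).dtype == torch.int32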
ai_edge_torch/odml_torch/lowerings/_basic.py
CHANGED
@@ -202,3 +202,47 @@ def _aten_div(mod, x, y, *, rounding_mode=None, out=None) -> ir.Value:
   x, y = utils.broadcast_args_if_needed(x, y)

   return stablehlo.divide(x, y)
+
+
+# Schema:
+#   - aten::slice_scatter(Tensor self, Tensor src, int dim=0, SymInt?
+#     start=None, SymInt? end=None, SymInt step=1) -> Tensor
+# Torch Reference:
+#   - https://pytorch.org/docs/stable/generated/torch.slice_scatter.html
+#   - https://github.com/pytorch/pytorch/blob/18f9331e5deb4c02ae5c206e133a9b4add49bd97/aten/src/ATen/native/TensorShape.cpp#L4002
+@lower(torch.ops.aten.slice_scatter)
+def _aten_slice_scatter(lctx, self, src, dim=0, start=None, end=None, step=1):
+  start = start or 0
+  end = end or self.type.shape[dim]
+  if start < 0:
+    start = self.type.shape[dim] + start
+  if end < 0:
+    end = self.type.shape[dim] + end
+
+  end = start + step * math.ceil((end - start) / step) - (step - 1)
+
+  padding_low = start
+  padding_high = self.type.shape[dim] - end
+
+  rank = len(self.type.shape)
+  src = stablehlo.pad(
+      src,
+      utils.splat(0, src.type.element_type, []),
+      edge_padding_low=[padding_low if i == dim else 0 for i in range(rank)],
+      edge_padding_high=[padding_high if i == dim else 0 for i in range(rank)],
+      interior_padding=[step - 1 if i == dim else 0 for i in range(rank)],
+  )
+  pred = np.ones(self.type.shape, dtype=np.bool_)
+  pred[*[
+      slice(start, end, step) if i == dim else slice(None, None, None)
+      for i in range(rank)
+  ]] = False
+  pred = stablehlo.constant(
+      ir.DenseElementsAttr.get(
+          np.packbits(pred, bitorder="little"),
+          type=ir.IntegerType.get_signless(1),
+          shape=pred.shape,
+      )
+  )
+  out = stablehlo.select(pred, self, src)
+  return out
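The lowering above implements slice_scatter as interior/edge padding of src followed by an element-wise select: padding aligns src with the scattered slice positions, and a boolean mask chooses between self and the padded src. A plain-PyTorch sanity check of that decomposition (illustrative, not part of the package):

import torch

self_t = torch.zeros(8)
src = torch.tensor([1.0, 2.0, 3.0])
dim, start, end, step = 0, 1, 7, 2

expected = torch.slice_scatter(self_t, src, dim, start, end, step)

# Emulate stablehlo.pad: `start` zeros in front, step-1 zeros between
# elements, and trailing zeros up to the full length.
padded = torch.zeros_like(self_t)
padded[start:start + step * len(src):step] = src

# Emulate the select: mask is False exactly at the scattered positions.
mask = torch.ones(8, dtype=torch.bool)
mask[start:start + step * len(src):step] = False
result = torch.where(mask, self_t, padded)

assert torch.equal(result, expected)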
ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py
CHANGED
@@ -203,7 +203,6 @@ lower_by_torch_xla2(torch.ops.aten.sin)
 lower_by_torch_xla2(torch.ops.aten.sinh)
 lower_by_torch_xla2(torch.ops.aten.slice)
 lower_by_torch_xla2(torch.ops.aten.slice_copy)
-lower_by_torch_xla2(torch.ops.aten.slice_scatter)
 lower_by_torch_xla2(torch.ops.aten.sort)
 lower_by_torch_xla2(torch.ops.aten.split)
 lower_by_torch_xla2(torch.ops.aten.split_copy)
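This removal pairs with the dedicated _aten_slice_scatter StableHLO lowering added in _basic.py above: aten.slice_scatter no longer routes through the generic torch-xla2 fallback.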
{ai_edge_torch_nightly-0.3.0.dev20240913.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ai-edge-torch-nightly
-Version: 0.3.0.dev20240913
+Version: 0.3.0.dev20240914
 Summary: Supporting PyTorch models with the Google AI Edge TFLite runtime.
 Home-page: https://github.com/google-ai-edge/ai-edge-torch
 Keywords: On-Device ML,AI,Google,TFLite,PyTorch,LLMs,GenAI