PyPI - ai-edge-torch-nightly - Versions diffs - 0.3.0.dev20240913__py3-none-any.whl → 0.3.0.dev20240914__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.3.0.dev20240913__py3-none-any.whl → 0.3.0.dev20240914__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

ai_edge_torch/_convert/conversion.py CHANGED Viewed

@@ -17,6 +17,7 @@ import logging
 import os
 from typing import Any, Optional
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch import lowertools
 from ai_edge_torch import model
 from ai_edge_torch._convert import fx_passes
@@ -34,7 +35,7 @@ def _run_convert_passes(
   exported_program = generative_fx_passes.run_generative_passes(
       exported_program
   )
-  return fx_passes.run_passes(
+  return fx_pass_base.run_passes(
       exported_program,
       [
           fx_passes.BuildInterpolateCompositePass(),

ai_edge_torch/_convert/fx_passes/__init__.py CHANGED Viewed

@@ -15,44 +15,8 @@
 from typing import Sequence, Union
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassBase
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassResult  # NOQA
-from ai_edge_torch._convert.fx_passes._pass_base import FxPassBase
-from ai_edge_torch._convert.fx_passes._pass_base import FxPassResult
-from ai_edge_torch._convert.fx_passes.build_aten_composite_pass import BuildAtenCompositePass  # NOQA
-from ai_edge_torch._convert.fx_passes.build_interpolate_composite_pass import BuildInterpolateCompositePass  # NOQA
-from ai_edge_torch._convert.fx_passes.canonicalize_pass import CanonicalizePass
-from ai_edge_torch._convert.fx_passes.inject_mlir_debuginfo_pass import InjectMlirDebuginfoPass  # NOQA
-from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import OptimizeLayoutTransposesPass  # NOQA
-from torch.export import ExportedProgram
-from torch.fx.passes.infra.pass_manager import pass_result_wrapper
-import torch.utils._pytree as pytree
-# TODO(cnchan): make a PassManager class.
-def run_passes(
-    exported_program: ExportedProgram,
-    passes: Sequence[Union[ExportedProgramPassBase, FxPassBase]],
-) -> ExportedProgram:
-  passes, _ = pytree.tree_flatten(passes)
-  for pass_ in passes:
-    if not isinstance(pass_, ExportedProgramPassBase):
-      pass_ = pass_result_wrapper(pass_)
-    if isinstance(pass_, ExportedProgramPassBase):
-      exported_program = pass_(exported_program).exported_program
-    else:
-      gm = exported_program.graph_module
-      gm, modified = pass_(gm)
-      if modified and gm is not exported_program.graph_module:
-        exported_program = ExportedProgram(
-            root=gm,
-            graph=gm.graph,
-            graph_signature=exported_program.graph_signature,
-            state_dict=exported_program.state_dict,
-            range_constraints=exported_program.range_constraints,
-            module_call_graph=exported_program.module_call_graph,
-            example_inputs=exported_program.example_inputs,
-            verifier=exported_program.verifier,
-            constants=exported_program.constants,
-        )
-  return exported_program
+from ai_edge_torch._convert.fx_passes.build_aten_composite_pass import BuildAtenCompositePass
+from ai_edge_torch._convert.fx_passes.build_interpolate_composite_pass import BuildInterpolateCompositePass
+from ai_edge_torch._convert.fx_passes.inject_mlir_debuginfo_pass import InjectMlirDebuginfoPass
+from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import OptimizeLayoutTransposesPass
+from ai_edge_torch.fx_pass_base import CanonicalizePass

ai_edge_torch/_convert/fx_passes/build_aten_composite_pass.py CHANGED Viewed

@@ -13,11 +13,10 @@
 # limitations under the License.
 # ==============================================================================
-from functools import reduce
 from typing import Any, Callable
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch import lowertools
 import torch
-from torch.fx.passes.infra import pass_base
 import torch.utils._pytree as pytree
 _composite_builders: dict[
@@ -277,7 +276,7 @@ def _aten_embedding(gm: torch.fx.GraphModule, node: torch.fx.Node):
   node.target = embedding
-class BuildAtenCompositePass(pass_base.PassBase):
+class BuildAtenCompositePass(fx_pass_base.PassBase):
   def call(self, graph_module: torch.fx.GraphModule):
     for node in graph_module.graph.nodes:
@@ -286,4 +285,4 @@ class BuildAtenCompositePass(pass_base.PassBase):
     graph_module.graph.lint()
     graph_module.recompile()
-    return pass_base.PassResult(graph_module, True)
+    return fx_pass_base.PassResult(graph_module, True)

ai_edge_torch/_convert/fx_passes/build_interpolate_composite_pass.py CHANGED Viewed

@@ -16,8 +16,7 @@
 import functools
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassBase
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassResult  # NOQA
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch.hlfb import mark_pattern
 from ai_edge_torch.hlfb.mark_pattern import pattern as pattern_module
 import torch
@@ -103,7 +102,7 @@ def _get_interpolate_nearest2d_pattern():
   return pattern
-class BuildInterpolateCompositePass(ExportedProgramPassBase):
+class BuildInterpolateCompositePass(fx_pass_base.ExportedProgramPassBase):
   def __init__(self):
     super().__init__()
@@ -124,4 +123,4 @@ class BuildInterpolateCompositePass(ExportedProgramPassBase):
     graph_module.graph.lint()
     graph_module.recompile()
-    return ExportedProgramPassResult(exported_program, True)
+    return fx_pass_base.ExportedProgramPassResult(exported_program, True)

ai_edge_torch/_convert/fx_passes/inject_mlir_debuginfo_pass.py CHANGED Viewed

@@ -13,10 +13,9 @@
 # limitations under the License.
 # ==============================================================================
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch import lowertools
 import torch
-from torch.fx.passes.infra.pass_base import PassBase
-from torch.fx.passes.infra.pass_base import PassResult
 import torch.utils._pytree as pytree
@@ -62,7 +61,7 @@ def _wrap_call_function_node_with_debuginfo_writer(node: torch.fx.GraphModule):
   node.target = debuginfo_writer
-class InjectMlirDebuginfoPass(PassBase):
+class InjectMlirDebuginfoPass(fx_pass_base.PassBase):
   def call(self, graph_module: torch.fx.GraphModule):
     for node in graph_module.graph.nodes:
@@ -70,4 +69,4 @@ class InjectMlirDebuginfoPass(PassBase):
     graph_module.graph.lint()
     graph_module.recompile()
-    return PassResult(graph_module, True)
+    return fx_pass_base.PassResult(graph_module, True)

ai_edge_torch/_convert/fx_passes/optimize_layout_transposes_pass/pass_body.py CHANGED Viewed

@@ -18,8 +18,7 @@ import operator
 import os
 from typing import Union
-from ai_edge_torch._convert.fx_passes import ExportedProgramPassBase
-from ai_edge_torch._convert.fx_passes import ExportedProgramPassResult
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import layout_check  # NOQA
 from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import layout_mark  # NOQA
 from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import layout_partitioners  # NOQA
@@ -31,7 +30,7 @@ import torch.ao.quantization.quantize_pt2e
 TransposeFunc = Union[utils.tensor_to_nchw, utils.tensor_to_nhwc]
-class OptimizeLayoutTransposesPass(ExportedProgramPassBase):
+class OptimizeLayoutTransposesPass(fx_pass_base.ExportedProgramPassBase):
   def get_source_meta(self, node: torch.fx.Node):
     keys = ["stack_trace", "nn_module_stack", "source_fn_stack", "from_node"]
@@ -94,7 +93,7 @@ class OptimizeLayoutTransposesPass(ExportedProgramPassBase):
     q_args = input_q.args[1:]
     q_kwargs = input_q.kwargs
-    q_op, dq_op = self.get_paired_q_dq_ops(input_q.target)
+    q_op, dq_op = utils.get_paired_q_dq_ops(input_q.target)
     with graph.inserting_before(target):
       # Q and DQ inserted here may required updating the `axis` arg when they
       # are per_channel ops. However, instead of updating here, the nodes would
@@ -301,4 +300,4 @@ class OptimizeLayoutTransposesPass(ExportedProgramPassBase):
     # Mark const node again for debugging
     self.mark_const_nodes(exported_program)
-    return ExportedProgramPassResult(exported_program, True)
+    return fx_pass_base.ExportedProgramPassResult(exported_program, True)

ai_edge_torch/config.py CHANGED Viewed

@@ -21,4 +21,7 @@ import os
 @dataclasses.dataclass
 class Config:
-  use_torch_xla: bool = os.environ.get("USE_TORCH_XLA", "True") == "True"
+  use_torch_xla: bool = os.environ.get("USE_TORCH_XLA", "true").lower() in (
+      "1",
+      "true",
+  )

ai_edge_torch/fx_pass_base.py ADDED Viewed

@@ -0,0 +1,101 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import abc
+import collections
+from typing import Sequence, Union
+import torch
+from torch.fx.passes.infra.pass_base import PassBase
+from torch.fx.passes.infra.pass_base import PassResult
+from torch.fx.passes.infra.pass_manager import pass_result_wrapper
+import torch.utils._pytree as pytree
+# Aliases that re-export torch.fx's PassBase / PassResult under Fx-prefixed
+# names from this module.
+FxPassBase = PassBase
+FxPassResult = PassResult
+# Two-field result tuple returned by ExportedProgram-level passes: the
+# resulting program and a `modified` flag.
+ExportedProgramPassResult = collections.namedtuple(
+    "ExportedProgramPassResult", ["exported_program", "modified"]
+)
+class ExportedProgramPassBase(abc.ABC):
+  """Abstract base for passes that take and return an ExportedProgram.
+  Subclasses implement `call`; `requires` and `ensures` are optional hooks
+  that `__call__` runs before and after `call`.
+  """
+  def __call__(
+      self, exported_program: torch.export.ExportedProgram
+  ) -> ExportedProgramPassResult:
+    """Runs `requires`, `call`, then `ensures`, and returns `call`'s result."""
+    self.requires(exported_program)
+    res = self.call(exported_program)
+    # NOTE: `ensures` receives the input program, not the one inside `res`.
+    self.ensures(exported_program)
+    return res
+  @abc.abstractmethod
+  def call(
+      self, exported_program: torch.export.ExportedProgram
+  ) -> ExportedProgramPassResult:
+    """Applies the pass; must be overridden by subclasses."""
+    pass
+  def requires(self, exported_program: torch.export.ExportedProgram) -> None:
+    """Hook invoked before `call`; does nothing by default."""
+    pass
+  def ensures(self, exported_program: torch.export.ExportedProgram) -> None:
+    """Hook invoked after `call`; does nothing by default."""
+    pass
+# TODO(cnchan): make a PassManager class.
+def run_passes(
+    exported_program: torch.export.ExportedProgram,
+    passes: Sequence[Union[ExportedProgramPassBase, FxPassBase]],
+) -> torch.export.ExportedProgram:
+  """Runs the given passes in order over an ExportedProgram.
+  Args:
+    exported_program: The program to transform.
+    passes: Passes to run. The collection is flattened with
+      pytree.tree_flatten first, so nested containers of passes are accepted.
+  Returns:
+    The ExportedProgram after all passes have run.
+  """
+  passes, _ = pytree.tree_flatten(passes)
+  for pass_ in passes:
+    # Anything that is not an ExportedProgram-level pass is wrapped with
+    # torch's pass_result_wrapper and then run on the graph module below.
+    if not isinstance(pass_, ExportedProgramPassBase):
+      pass_ = pass_result_wrapper(pass_)
+    if isinstance(pass_, ExportedProgramPassBase):
+      exported_program = pass_(exported_program).exported_program
+    else:
+      gm = exported_program.graph_module
+      gm, modified = pass_(gm)
+      # Rebuild the ExportedProgram only when the pass reported a change and
+      # handed back a different GraphModule object; every other field is
+      # carried over from the current program unchanged.
+      if modified and gm is not exported_program.graph_module:
+        exported_program = torch.export.ExportedProgram(
+            root=gm,
+            graph=gm.graph,
+            graph_signature=exported_program.graph_signature,
+            state_dict=exported_program.state_dict,
+            range_constraints=exported_program.range_constraints,
+            module_call_graph=exported_program.module_call_graph,
+            example_inputs=exported_program.example_inputs,
+            verifier=exported_program.verifier,
+            constants=exported_program.constants,
+        )
+  return exported_program
+class CanonicalizePass(ExportedProgramPassBase):
+  """Pass that re-runs the program through ExportedProgram.run_decompositions
+  with a placeholder decomposition table (see the comment below for why).
+  """
+  # A dummy decomp table for running ExportedProgram.run_decompositions without
+  # any op decompositions but just aot_export_module. Due to the check in
+  # run_decompositions, if None or an empty dict is passed as decomp_table,
+  # it will run the default aten-coreaten decompositions. Therefore a non-empty
+  # dummy decomp table is needed.
+  # Ref: https://github.com/pytorch/pytorch/blob/db895ace1d36726e64781774f53b3d3098206116/torch/export/exported_program.py#L543
+  _DUMMY_DECOMP_TABLE = {
+      torch._ops.OperatorBase(): lambda: None,
+  }
+  def call(self, exported_program: torch.export.ExportedProgram):
+    """Returns the result of run_decompositions, always flagged as modified."""
+    exported_program = exported_program.run_decompositions(
+        self._DUMMY_DECOMP_TABLE
+    )
+    return ExportedProgramPassResult(exported_program, True)

ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py CHANGED Viewed

@@ -47,10 +47,10 @@ def convert_gemma2_to_tflite(
       checkpoint_path, kv_cache_max_len=kv_cache_max_len
   )
   # Tensors used to trace the model graph during conversion.
-  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.long)
-  prefill_input_pos = torch.arange(0, prefill_seq_len)
-  decode_token = torch.tensor([[0]], dtype=torch.long)
-  decode_input_pos = torch.tensor([0], dtype=torch.int64)
+  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.int)
+  prefill_input_pos = torch.arange(0, prefill_seq_len, dtype=torch.int)
+  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
   kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None

ai_edge_torch/generative/examples/gemma/convert_to_tflite.py CHANGED Viewed

@@ -47,10 +47,10 @@ def convert_gemma_to_tflite(
       checkpoint_path, kv_cache_max_len=kv_cache_max_len
   )
   # Tensors used to trace the model graph during conversion.
-  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.long)
-  prefill_input_pos = torch.arange(0, prefill_seq_len)
-  decode_token = torch.tensor([[0]], dtype=torch.long)
-  decode_input_pos = torch.tensor([0], dtype=torch.int64)
+  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.int)
+  prefill_input_pos = torch.arange(0, prefill_seq_len, dtype=torch.int)
+  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
   kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None

ai_edge_torch/generative/examples/gemma/gemma.py CHANGED Viewed

@@ -203,9 +203,9 @@ def define_and_run_2b(checkpoint_path: str) -> None:
   kv_cache_max_len = 1024
   model = build_2b_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
   idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
+  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.int, device="cpu")
   tokens[0, :4] = idx
-  input_pos = torch.arange(0, kv_cache_max_len)
+  input_pos = torch.arange(0, kv_cache_max_len, dtype=torch.int)
   kv = kv_utils.KVCache.from_model_config(model.config)
   output = model.forward(tokens, input_pos, kv)
   print("comparing with goldens..")

ai_edge_torch/generative/examples/gemma/gemma2.py CHANGED Viewed

@@ -280,9 +280,9 @@ def define_and_run_2b(checkpoint_path: str) -> None:
   toks = torch.from_numpy(
       np.array([2, 651, 9456, 576, 573, 3520, 3858, 603, 235248])
   )
-  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
+  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.int, device="cpu")
   tokens[0, :9] = toks
-  input_pos = torch.arange(0, kv_cache_max_len)
+  input_pos = torch.arange(0, kv_cache_max_len, dtype=torch.int)
   kv = kv_utils.KVCache.from_model_config(model.config)
   out = model.forward(tokens, input_pos, kv)
   out_final = out["logits"][0, 8, :]

ai_edge_torch/generative/examples/openelm/convert_to_tflite.py ADDED Viewed

@@ -0,0 +1,86 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Example of converting OpenELM model to multi-signature tflite model."""
+import os
+import pathlib
+import ai_edge_torch
+from ai_edge_torch.generative.examples.openelm import openelm
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
+from ai_edge_torch.generative.quantize import quant_recipes
+import torch
+def convert_openelm_to_tflite(
+    checkpoint_path: str,
+    prefill_seq_len: int = 512,
+    kv_cache_max_len: int = 1024,
+    quantize: bool = True,
+):
+  """Converts OpenELM model to multi-signature tflite model.
+  The converted model is written to a file under /tmp whose name encodes the
+  quantization mode, prefill length and KV cache length.
+  Args:
+      checkpoint_path (str): The filepath to the model checkpoint, or directory
+        holding the checkpoint.
+      prefill_seq_len (int, optional): The maximum size of prefill input tensor.
+        Defaults to 512.
+      kv_cache_max_len (int, optional): The maximum size of KV cache buffer,
+        including both prefill and decode. Defaults to 1024.
+      quantize (bool, optional): Whether the model should be quantized. Defaults
+        to True.
+  """
+  pytorch_model = openelm.build_model(
+      checkpoint_path, kv_cache_max_len=kv_cache_max_len
+  )
+  # Tensors used to trace the model graph during conversion.
+  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.int)
+  prefill_input_pos = torch.arange(0, prefill_seq_len, dtype=torch.int)
+  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
+  kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
+  # A None quant_config is passed to convert() when quantize is False.
+  quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
+  # Register two signatures on the same model: 'prefill' traced with a
+  # prefill_seq_len-long token tensor and 'decode' traced with one token.
+  edge_model = (
+      ai_edge_torch.signature(
+          'prefill',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': prefill_tokens,
+              'input_pos': prefill_input_pos,
+              'kv_cache': kv,
+          },
+      )
+      .signature(
+          'decode',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': decode_token,
+              'input_pos': decode_input_pos,
+              'kv_cache': kv,
+          },
+      )
+      .convert(quant_config=quant_config)
+  )
+  quant_suffix = 'q8' if quantize else 'f32'
+  edge_model.export(
+      f'/tmp/openelm_{quant_suffix}_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
+  )
+# Script entry point: converts the checkpoint expected under
+# ~/Downloads/llm_data/openelm using the default conversion settings.
+if __name__ == '__main__':
+  path = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/openelm')
+  convert_openelm_to_tflite(path)

ai_edge_torch/generative/examples/openelm/openelm.py ADDED Viewed

@@ -0,0 +1,237 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Example of building an OpenELM model."""
+import os
+import pathlib
+from ai_edge_torch.generative.layers import attention
+from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
+import ai_edge_torch.generative.layers.attention_utils as attn_utils
+import ai_edge_torch.generative.layers.model_config as cfg
+import ai_edge_torch.generative.utilities.loader as loading_utils
+import numpy as np
+import torch
+from torch import nn
+# Checkpoint tensor-name templates handed to the ModelLoader in build_model.
+# NOTE(review): the "{}" placeholder is presumably filled with the layer
+# index by the loader -- confirm against ModelLoader.
+TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
+    ff_up_proj="transformer.layers.{}.ffn.proj_1",
+    ff_down_proj="transformer.layers.{}.ffn.proj_2",
+    attn_fused_qkv_proj="transformer.layers.{}.attn.qkv_proj",
+    attn_query_norm="transformer.layers.{}.attn.q_norm",
+    attn_key_norm="transformer.layers.{}.attn.k_norm",
+    attn_output_proj="transformer.layers.{}.attn.out_proj",
+    pre_attn_norm="transformer.layers.{}.attn_norm",
+    pre_ff_norm="transformer.layers.{}.ffn_norm",
+    embedding="transformer.token_embeddings",
+    final_norm="transformer.norm",
+    # No separate head tensor is named: the OpenELM module ties lm_head's
+    # weight to the token embedding.
+    lm_head=None,
+)
+class OpenELM(nn.Module):
+  """An OpenELM model built from the Edge Generative API layers."""
+  def __init__(self, config: cfg.ModelConfig):
+    """Builds the embedding, transformer blocks, final norm and caches.
+    Args:
+      config: Model configuration; per-layer settings are read through
+        `config.block_config(idx)`.
+    """
+    super().__init__()
+    # Construct model layers.
+    self.tok_embedding = nn.Embedding(
+        config.vocab_size, config.embedding_dim, padding_idx=0
+    )
+    self.lm_head = nn.Linear(
+        config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
+    )
+    # OpenELM re-uses the embedding as the head projection layer.
+    self.lm_head.weight.data = self.tok_embedding.weight.data
+    self.transformer_blocks = nn.ModuleList(
+        attention.TransformerBlock(config.block_config(idx), config)
+        for idx in range(config.num_layers)
+    )
+    self.final_norm = builder.build_norm(
+        config.embedding_dim,
+        config.final_norm_config,
+    )
+    # OpenELM has same hyper parameters for rotary_percentage and head_dim for
+    # each layer block. Use the first block.
+    attn_config = config.block_config(0).attn_config
+    self.rope_cache = attn_utils.build_rope_cache(
+        size=config.kv_cache_max,
+        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
+        base=10_000,
+        condense_ratio=1,
+        dtype=torch.float32,
+        device=torch.device("cpu"),
+    )
+    self.mask_cache = attn_utils.build_causal_mask_cache(
+        size=config.kv_cache_max,
+        dtype=torch.float32,
+        device=torch.device("cpu"),
+    )
+    self.config = config
+  # NOTE(review): the return annotation below reads dict[Tensor, KVCache], but
+  # the dict actually returned is keyed by the strings "logits" and "kv_cache".
+  @torch.inference_mode
+  def forward(
+      self,
+      tokens: torch.Tensor,
+      input_pos: torch.Tensor,
+      kv_cache: kv_utils.KVCache,
+  ) -> dict[torch.Tensor, kv_utils.KVCache]:
+    """Runs the model over `tokens` and returns logits plus the updated cache.
+    Args:
+      tokens: 2-D tensor of token ids; its second dimension is the sequence
+        length and must not exceed `config.max_seq_len`.
+      input_pos: Positions used to index the rope and mask caches.
+      kv_cache: KV cache holding one entry per transformer block.
+    Returns:
+      A dict with "logits" (output of the lm head) and "kv_cache" (a KVCache
+      built from the entries returned by the blocks).
+    """
+    _, seq_len = tokens.size()
+    assert self.config.max_seq_len >= seq_len, (
+        f"Cannot forward sequence of length {seq_len}, max seq length is only"
+        f" {self.config.max_seq_len}"
+    )
+    assert len(self.transformer_blocks) == len(kv_cache.caches), (
+        "The number of transformer blocks and the number of KV cache entries"
+        " must be the same."
+    )
+    # Select the rotary cos/sin rows and the mask slices for these positions.
+    cos, sin = self.rope_cache
+    cos = cos.index_select(0, input_pos)
+    sin = sin.index_select(0, input_pos)
+    mask = self.mask_cache.index_select(2, input_pos)
+    mask = mask[:, :, :, : self.config.kv_cache_max]
+    # token embeddings of shape (b, t, n_embd)
+    x = self.tok_embedding(tokens)
+    updated_kv_entires = []
+    for i, block in enumerate(self.transformer_blocks):
+      # NOTE(review): the assert above already dereferences kv_cache.caches,
+      # so the None fallback here looks unreachable -- confirm.
+      kv_entry = kv_cache.caches[i] if kv_cache else None
+      x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
+      if kv_entry:
+        updated_kv_entires.append(kv_entry)
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
+    x = self.final_norm(x)
+    logits = self.lm_head(x)  # (b, t, vocab_size)
+    return {"logits": logits, "kv_cache": updated_kv_cache}
+def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+  """Returns the model config for an OpenELM model.
+  Args:
+    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
+      is 1024.
+  Returns:
+    The model config for an OpenELM model.
+  """
+  norm_config = cfg.NormalizationConfig(
+      type=cfg.NormalizationType.RMS_NORM, epsilon=1e-6
+  )
+  # Per-layer attention sizes: 4 + 14 + 12 + 6 = 36 entries, one for each of
+  # the 36 layers configured below.
+  num_heads = [12] * 4 + [16] * 14 + [20] * 12 + [24] * 6
+  num_query_groups = [3] * 4 + [4] * 14 + [5] * 12 + [6] * 6
+  def make_divisible(v, d):
+    """Ensures that all layers have a channel number that is divisible by d."""
+    # Round v to the nearest multiple of d.
+    new_v = int(v + d / 2) // d * d
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+      new_v += d
+    return new_v
+  # The way to get intermediate size is from
+  # https://huggingface.co/apple/OpenELM-3B/blob/main/modeling_openelm.py
+  def get_intermediate_size(idx: int) -> int:
+    # The multiplier grows by 0.1 per layer starting at 0.5; the result is
+    # rounded to a multiple of 256.
+    return make_divisible((0.5 + 0.1 * idx) * 3072, 256)
+  def get_block_config(idx: int) -> cfg.TransformerBlockConfig:
+    # Only num_heads, num_query_groups and intermediate_size vary with idx;
+    # every other field is the same for all layers.
+    return cfg.TransformerBlockConfig(
+        attn_config=cfg.AttentionConfig(
+            num_heads=num_heads[idx],
+            head_dim=128,
+            num_query_groups=num_query_groups[idx],
+            rotary_percentage=1.0,
+            qkv_transpose_before_split=True,
+            query_norm_config=norm_config,
+            key_norm_config=norm_config,
+        ),
+        ff_config=cfg.FeedForwardConfig(
+            type=cfg.FeedForwardType.SEQUENTIAL,
+            activation=cfg.ActivationConfig(
+                cfg.ActivationType.SILU_GLU, gate_is_front=True
+            ),
+            intermediate_size=get_intermediate_size(idx),
+            pre_ff_norm_config=norm_config,
+        ),
+        pre_attention_norm_config=norm_config,
+    )
+  num_layers = 36
+  config = cfg.ModelConfig(
+      vocab_size=32000,
+      num_layers=num_layers,
+      max_seq_len=2048,
+      embedding_dim=3072,
+      kv_cache_max_len=kv_cache_max_len,
+      block_configs=[get_block_config(i) for i in range(num_layers)],
+      final_norm_config=norm_config,
+  )
+  return config
+def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
+  """Returns a shrunken OpenELM config derived from get_model_config.
+  The full config is built first and then overridden in place: vocab size and
+  embedding dim become 128, only the first 2 block configs are kept, and each
+  kept block gets 3 heads of dim 64 and an intermediate size of 128.
+  Args:
+    kv_cache_max_len (int): Forwarded to get_model_config; max_seq_len is set
+      to twice this value. Default is 128.
+  Returns:
+    The reduced model config.
+  """
+  config = get_model_config(kv_cache_max_len)
+  config.vocab_size = 128
+  config.num_layers = 2
+  config.max_seq_len = 2 * kv_cache_max_len
+  config.embedding_dim = 128
+  config.block_configs = config.block_configs[: config.num_layers]
+  for block_config in config.block_configs:
+    block_config.attn_config.num_heads = 3
+    block_config.attn_config.head_dim = 64
+    block_config.ff_config.intermediate_size = 128
+  return config
+def build_model(checkpoint_path: str, **kwargs) -> nn.Module:
+  """Builds an OpenELM model and loads its weights from a checkpoint.
+  Args:
+    checkpoint_path (str): Path passed to the ModelLoader.
+    **kwargs: Forwarded unchanged to get_model_config.
+  Returns:
+    The loaded model, switched to eval mode.
+  """
+  config = get_model_config(**kwargs)
+  model = OpenELM(config)
+  loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
+  # Since embedding and lm-head use the same weight, we need to set strict
+  # to False.
+  loader.load(model, strict=False)
+  model.eval()
+  return model
+def define_and_run(checkpoint_path: str) -> None:
+  """Instantiates and runs an OpenELM model.
+  Feeds the token ids [1, 2, 3, 4] (zero-padded to the KV cache length) through
+  the model and asserts that the logits at the last of those four positions
+  match the golden tensor stored in openelm_lm_logits.pt next to this file.
+  Args:
+    checkpoint_path (str): Path of the checkpoint to load.
+  """
+  current_dir = pathlib.Path(__file__).parent.resolve()
+  openelm_goldens = torch.load(current_dir / "openelm_lm_logits.pt")
+  kv_cache_max_len = 1024
+  model = build_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
+  idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
+  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.int, device="cpu")
+  tokens[0, :4] = idx
+  input_pos = torch.arange(0, kv_cache_max_len, dtype=torch.int)
+  kv = kv_utils.KVCache.from_model_config(model.config)
+  output = model.forward(tokens, input_pos, kv)
+  # Compare against the goldens with an absolute tolerance of 1e-05.
+  assert torch.allclose(
+      openelm_goldens, output["logits"][0, idx.shape[1] - 1, :], atol=1e-05
+  )
+# Script entry point: runs the golden comparison against the checkpoint
+# expected under ~/Downloads/llm_data/openelm.
+if __name__ == "__main__":
+  input_checkpoint_path = os.path.join(
+      pathlib.Path.home(), "Downloads/llm_data/openelm"
+  )
+  define_and_run(input_checkpoint_path)

ai-edge-torch-nightly 0.3.0.dev20240913__py3-none-any.whl → 0.3.0.dev20240914__py3-none-any.whl

ai-edge-torch-nightly 0.3.0.dev20240913__py3-none-any.whl → 0.3.0.dev20240914__py3-none-any.whl