ai-edge-torch-nightly 0.3.0.dev20240910__py3-none-any.whl → 0.3.0.dev20240914__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_edge_torch/_convert/conversion.py +2 -1
- ai_edge_torch/_convert/fx_passes/__init__.py +5 -41
- ai_edge_torch/_convert/fx_passes/build_aten_composite_pass.py +3 -4
- ai_edge_torch/_convert/fx_passes/build_interpolate_composite_pass.py +3 -4
- ai_edge_torch/_convert/fx_passes/inject_mlir_debuginfo_pass.py +3 -4
- ai_edge_torch/_convert/fx_passes/optimize_layout_transposes_pass/pass_body.py +4 -5
- ai_edge_torch/config.py +4 -1
- ai_edge_torch/fx_pass_base.py +101 -0
- ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +35 -16
- ai_edge_torch/generative/examples/gemma/convert_to_tflite.py +29 -10
- ai_edge_torch/generative/examples/gemma/gemma.py +52 -32
- ai_edge_torch/generative/examples/gemma/gemma2.py +87 -60
- ai_edge_torch/generative/examples/{experimental/gemma → openelm}/convert_to_tflite.py +16 -18
- ai_edge_torch/generative/examples/openelm/openelm.py +237 -0
- ai_edge_torch/generative/examples/{experimental/phi → phi}/convert_to_tflite.py +15 -16
- ai_edge_torch/generative/examples/{experimental/phi → phi}/phi2.py +48 -45
- ai_edge_torch/generative/examples/{experimental/tiny_llama → smollm}/convert_to_tflite.py +16 -17
- ai_edge_torch/generative/examples/smollm/smollm.py +131 -0
- ai_edge_torch/generative/examples/stable_diffusion/clip.py +12 -6
- ai_edge_torch/generative/examples/stable_diffusion/convert_to_tflite.py +1 -1
- ai_edge_torch/generative/examples/t5/convert_to_tflite.py +20 -20
- ai_edge_torch/generative/examples/t5/t5.py +43 -30
- ai_edge_torch/generative/examples/t5/t5_attention.py +18 -13
- ai_edge_torch/generative/examples/test_models/toy_model.py +15 -13
- ai_edge_torch/generative/examples/test_models/toy_model_with_kv_cache.py +75 -34
- ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +29 -10
- ai_edge_torch/generative/examples/tiny_llama/tiny_llama.py +57 -36
- ai_edge_torch/generative/fx_passes/__init__.py +4 -4
- ai_edge_torch/generative/fx_passes/remove_sdpa_zero_mask_pass.py +3 -4
- ai_edge_torch/generative/layers/attention.py +84 -73
- ai_edge_torch/generative/layers/builder.py +38 -14
- ai_edge_torch/generative/layers/feed_forward.py +26 -8
- ai_edge_torch/generative/layers/kv_cache.py +163 -51
- ai_edge_torch/generative/layers/model_config.py +61 -33
- ai_edge_torch/generative/layers/normalization.py +158 -0
- ai_edge_torch/generative/layers/unet/blocks_2d.py +0 -2
- ai_edge_torch/generative/quantize/example.py +2 -2
- ai_edge_torch/generative/test/{test_experimental_ekv.py → test_kv_cache.py} +12 -24
- ai_edge_torch/generative/test/test_loader.py +1 -1
- ai_edge_torch/generative/test/test_model_conversion.py +77 -62
- ai_edge_torch/generative/test/test_model_conversion_large.py +61 -68
- ai_edge_torch/generative/test/test_quantize.py +5 -5
- ai_edge_torch/generative/test/utils.py +54 -0
- ai_edge_torch/generative/utilities/loader.py +28 -15
- ai_edge_torch/generative/utilities/t5_loader.py +21 -20
- ai_edge_torch/odml_torch/export.py +40 -0
- ai_edge_torch/odml_torch/lowerings/__init__.py +1 -0
- ai_edge_torch/odml_torch/lowerings/_basic.py +44 -0
- ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +0 -2
- ai_edge_torch/odml_torch/lowerings/_layer_norm.py +78 -0
- ai_edge_torch/version.py +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/METADATA +1 -1
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/RECORD +59 -63
- ai_edge_torch/_convert/fx_passes/_pass_base.py +0 -53
- ai_edge_torch/_convert/fx_passes/canonicalize_pass.py +0 -35
- ai_edge_torch/generative/examples/experimental/gemma/gemma.py +0 -219
- ai_edge_torch/generative/examples/experimental/tiny_llama/__init__.py +0 -14
- ai_edge_torch/generative/examples/experimental/tiny_llama/tiny_llama.py +0 -205
- ai_edge_torch/generative/examples/phi2/__init__.py +0 -14
- ai_edge_torch/generative/examples/phi2/convert_to_tflite.py +0 -67
- ai_edge_torch/generative/examples/phi2/phi2.py +0 -189
- ai_edge_torch/generative/examples/test_models/toy_model_with_external_kv_cache.py +0 -176
- /ai_edge_torch/generative/examples/{experimental → openelm}/__init__.py +0 -0
- /ai_edge_torch/generative/examples/{experimental/gemma → phi}/__init__.py +0 -0
- /ai_edge_torch/generative/examples/{experimental/phi → smollm}/__init__.py +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/LICENSE +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/WHEEL +0 -0
- {ai_edge_torch_nightly-0.3.0.dev20240910.dist-info → ai_edge_torch_nightly-0.3.0.dev20240914.dist-info}/top_level.txt +0 -0
ai_edge_torch/_convert/conversion.py
CHANGED
@@ -17,6 +17,7 @@ import logging
 import os
 from typing import Any, Optional
 
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch import lowertools
 from ai_edge_torch import model
 from ai_edge_torch._convert import fx_passes
@@ -34,7 +35,7 @@ def _run_convert_passes(
   exported_program = generative_fx_passes.run_generative_passes(
       exported_program
   )
-  return fx_passes.run_passes(
+  return fx_pass_base.run_passes(
       exported_program,
       [
           fx_passes.BuildInterpolateCompositePass(),
ai_edge_torch/_convert/fx_passes/__init__.py
CHANGED
@@ -15,44 +15,8 @@
 
 from typing import Sequence, Union
 
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassBase
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassResult  # NOQA
-from ai_edge_torch._convert.fx_passes._pass_base import FxPassBase
-from ai_edge_torch._convert.fx_passes._pass_base import FxPassResult
-from ai_edge_torch._convert.fx_passes.build_aten_composite_pass import BuildAtenCompositePass  # NOQA
-from ai_edge_torch._convert.fx_passes.build_interpolate_composite_pass import BuildInterpolateCompositePass  # NOQA
-from ai_edge_torch._convert.fx_passes.canonicalize_pass import CanonicalizePass
-from ai_edge_torch._convert.fx_passes.inject_mlir_debuginfo_pass import InjectMlirDebuginfoPass  # NOQA
-from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import OptimizeLayoutTransposesPass  # NOQA
-from torch.export import ExportedProgram
-from torch.fx.passes.infra.pass_manager import pass_result_wrapper
-import torch.utils._pytree as pytree
-
-
-# TODO(cnchan): make a PassManager class.
-def run_passes(
-    exported_program: ExportedProgram,
-    passes: Sequence[Union[ExportedProgramPassBase, FxPassBase]],
-) -> ExportedProgram:
-  passes, _ = pytree.tree_flatten(passes)
-  for pass_ in passes:
-    if not isinstance(pass_, ExportedProgramPassBase):
-      pass_ = pass_result_wrapper(pass_)
-    if isinstance(pass_, ExportedProgramPassBase):
-      exported_program = pass_(exported_program).exported_program
-    else:
-      gm = exported_program.graph_module
-      gm, modified = pass_(gm)
-      if modified and gm is not exported_program.graph_module:
-        exported_program = ExportedProgram(
-            root=gm,
-            graph=gm.graph,
-            graph_signature=exported_program.graph_signature,
-            state_dict=exported_program.state_dict,
-            range_constraints=exported_program.range_constraints,
-            module_call_graph=exported_program.module_call_graph,
-            example_inputs=exported_program.example_inputs,
-            verifier=exported_program.verifier,
-            constants=exported_program.constants,
-        )
-  return exported_program
+from ai_edge_torch._convert.fx_passes.build_aten_composite_pass import BuildAtenCompositePass
+from ai_edge_torch._convert.fx_passes.build_interpolate_composite_pass import BuildInterpolateCompositePass
+from ai_edge_torch._convert.fx_passes.inject_mlir_debuginfo_pass import InjectMlirDebuginfoPass
+from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import OptimizeLayoutTransposesPass
+from ai_edge_torch.fx_pass_base import CanonicalizePass
ai_edge_torch/_convert/fx_passes/build_aten_composite_pass.py
CHANGED
@@ -13,11 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 
-from functools import reduce
 from typing import Any, Callable
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch import lowertools
 import torch
-from torch.fx.passes.infra import pass_base
 import torch.utils._pytree as pytree
 
 _composite_builders: dict[
@@ -277,7 +276,7 @@ def _aten_embedding(gm: torch.fx.GraphModule, node: torch.fx.Node):
   node.target = embedding
 
 
-class BuildAtenCompositePass(pass_base.PassBase):
+class BuildAtenCompositePass(fx_pass_base.PassBase):
 
   def call(self, graph_module: torch.fx.GraphModule):
     for node in graph_module.graph.nodes:
@@ -286,4 +285,4 @@ class BuildAtenCompositePass(pass_base.PassBase):
 
     graph_module.graph.lint()
     graph_module.recompile()
-    return pass_base.PassResult(graph_module, True)
+    return fx_pass_base.PassResult(graph_module, True)
ai_edge_torch/_convert/fx_passes/build_interpolate_composite_pass.py
CHANGED
@@ -16,8 +16,7 @@
 
 import functools
 
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassBase
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassResult  # NOQA
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch.hlfb import mark_pattern
 from ai_edge_torch.hlfb.mark_pattern import pattern as pattern_module
 import torch
@@ -103,7 +102,7 @@ def _get_interpolate_nearest2d_pattern():
   return pattern
 
 
-class BuildInterpolateCompositePass(ExportedProgramPassBase):
+class BuildInterpolateCompositePass(fx_pass_base.ExportedProgramPassBase):
 
   def __init__(self):
     super().__init__()
@@ -124,4 +123,4 @@ class BuildInterpolateCompositePass(ExportedProgramPassBase):
 
     graph_module.graph.lint()
     graph_module.recompile()
-    return ExportedProgramPassResult(exported_program, True)
+    return fx_pass_base.ExportedProgramPassResult(exported_program, True)
ai_edge_torch/_convert/fx_passes/inject_mlir_debuginfo_pass.py
CHANGED
@@ -13,10 +13,9 @@
 # limitations under the License.
 # ==============================================================================
 
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch import lowertools
 import torch
-from torch.fx.passes.infra.pass_base import PassBase
-from torch.fx.passes.infra.pass_base import PassResult
 import torch.utils._pytree as pytree
 
 
@@ -62,7 +61,7 @@ def _wrap_call_function_node_with_debuginfo_writer(node: torch.fx.GraphModule):
   node.target = debuginfo_writer
 
 
-class InjectMlirDebuginfoPass(PassBase):
+class InjectMlirDebuginfoPass(fx_pass_base.PassBase):
 
   def call(self, graph_module: torch.fx.GraphModule):
     for node in graph_module.graph.nodes:
@@ -70,4 +69,4 @@ class InjectMlirDebuginfoPass(PassBase):
 
     graph_module.graph.lint()
     graph_module.recompile()
-    return PassResult(graph_module, True)
+    return fx_pass_base.PassResult(graph_module, True)
ai_edge_torch/_convert/fx_passes/optimize_layout_transposes_pass/pass_body.py
CHANGED
@@ -18,8 +18,7 @@ import operator
 import os
 from typing import Union
 
-from ai_edge_torch._convert.fx_passes import ExportedProgramPassBase
-from ai_edge_torch._convert.fx_passes import ExportedProgramPassResult
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import layout_check  # NOQA
 from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import layout_mark  # NOQA
 from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import layout_partitioners  # NOQA
@@ -31,7 +30,7 @@ import torch.ao.quantization.quantize_pt2e
 TransposeFunc = Union[utils.tensor_to_nchw, utils.tensor_to_nhwc]
 
 
-class OptimizeLayoutTransposesPass(ExportedProgramPassBase):
+class OptimizeLayoutTransposesPass(fx_pass_base.ExportedProgramPassBase):
 
   def get_source_meta(self, node: torch.fx.Node):
     keys = ["stack_trace", "nn_module_stack", "source_fn_stack", "from_node"]
@@ -94,7 +93,7 @@ class OptimizeLayoutTransposesPass(ExportedProgramPassBase):
 
     q_args = input_q.args[1:]
     q_kwargs = input_q.kwargs
-    q_op, dq_op =
+    q_op, dq_op = utils.get_paired_q_dq_ops(input_q.target)
     with graph.inserting_before(target):
       # Q and DQ inserted here may required updating the `axis` arg when they
      # are per_channel ops. However, instead of updating here, the nodes would
@@ -301,4 +300,4 @@ class OptimizeLayoutTransposesPass(ExportedProgramPassBase):
     # Mark const node again for debugging
     self.mark_const_nodes(exported_program)
 
-    return ExportedProgramPassResult(exported_program, True)
+    return fx_pass_base.ExportedProgramPassResult(exported_program, True)
ai_edge_torch/config.py
CHANGED

ai_edge_torch/fx_pass_base.py
ADDED
@@ -0,0 +1,101 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import abc
+import collections
+from typing import Sequence, Union
+
+import torch
+from torch.fx.passes.infra.pass_base import PassBase
+from torch.fx.passes.infra.pass_base import PassResult
+from torch.fx.passes.infra.pass_manager import pass_result_wrapper
+import torch.utils._pytree as pytree
+
+FxPassBase = PassBase
+FxPassResult = PassResult
+ExportedProgramPassResult = collections.namedtuple(
+    "ExportedProgramPassResult", ["exported_program", "modified"]
+)
+
+
+class ExportedProgramPassBase(abc.ABC):
+
+  def __call__(
+      self, exported_program: torch.export.ExportedProgram
+  ) -> ExportedProgramPassResult:
+    self.requires(exported_program)
+    res = self.call(exported_program)
+    self.ensures(exported_program)
+    return res
+
+  @abc.abstractmethod
+  def call(
+      self, exported_program: torch.export.ExportedProgram
+  ) -> ExportedProgramPassResult:
+    pass
+
+  def requires(self, exported_program: torch.export.ExportedProgram) -> None:
+    pass
+
+  def ensures(self, exported_program: torch.export.ExportedProgram) -> None:
+    pass
+
+
+# TODO(cnchan): make a PassManager class.
+def run_passes(
+    exported_program: torch.export.ExportedProgram,
+    passes: Sequence[Union[ExportedProgramPassBase, FxPassBase]],
+) -> torch.export.ExportedProgram:
+  passes, _ = pytree.tree_flatten(passes)
+  for pass_ in passes:
+    if not isinstance(pass_, ExportedProgramPassBase):
+      pass_ = pass_result_wrapper(pass_)
+    if isinstance(pass_, ExportedProgramPassBase):
+      exported_program = pass_(exported_program).exported_program
+    else:
+      gm = exported_program.graph_module
+      gm, modified = pass_(gm)
+      if modified and gm is not exported_program.graph_module:
+        exported_program = torch.export.ExportedProgram(
+            root=gm,
+            graph=gm.graph,
+            graph_signature=exported_program.graph_signature,
+            state_dict=exported_program.state_dict,
+            range_constraints=exported_program.range_constraints,
+            module_call_graph=exported_program.module_call_graph,
+            example_inputs=exported_program.example_inputs,
+            verifier=exported_program.verifier,
+            constants=exported_program.constants,
+        )
+  return exported_program
+
+
+class CanonicalizePass(ExportedProgramPassBase):
+
+  # A dummy decomp table for running ExportedProgram.run_decompositions without
+  # any op decompositions but just aot_export_module. Due to the check in
+  # run_decompositions, if None or an empty dict is passed as decomp_table,
+  # it will run the default aten-coreaten decompositions. Therefore a non-empty
+  # dummy decomp table is needed.
+  # Ref: https://github.com/pytorch/pytorch/blob/db895ace1d36726e64781774f53b3d3098206116/torch/export/exported_program.py#L543
+  _DUMMY_DECOMP_TABLE = {
+      torch._ops.OperatorBase(): lambda: None,
+  }
+
+  def call(self, exported_program: torch.export.ExportedProgram):
+    exported_program = exported_program.run_decompositions(
+        self._DUMMY_DECOMP_TABLE
+    )
+    return ExportedProgramPassResult(exported_program, True)
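The new fx_pass_base module consolidates what previously lived in _convert/fx_passes/_pass_base.py and canonicalize_pass.py: an FX-level PassBase, an ExportedProgramPassBase, and the run_passes driver that accepts a mix of both. A minimal sketch of how a downstream pass plugs into this API; the no-op pass and toy module below are hypothetical, not part of the package:

import torch
from ai_edge_torch import fx_pass_base


class NoopProgramPass(fx_pass_base.ExportedProgramPassBase):
  """Hypothetical pass that inspects the program and changes nothing."""

  def call(self, exported_program: torch.export.ExportedProgram):
    # A real pass would rewrite exported_program.graph_module here.
    return fx_pass_base.ExportedProgramPassResult(exported_program, False)


class AddOne(torch.nn.Module):

  def forward(self, x):
    return x + 1


ep = torch.export.export(AddOne(), (torch.randn(2),))
# run_passes wraps plain FX passes with pass_result_wrapper and threads the
# ExportedProgram through ExportedProgramPassBase instances in order.
ep = fx_pass_base.run_passes(
    ep, [fx_pass_base.CanonicalizePass(), NoopProgramPass()]
)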
ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py
CHANGED
@@ -13,55 +13,74 @@
 # limitations under the License.
 # ==============================================================================
 
+"""Example of converting a Gemma2 model to multi-signature tflite model."""
+
 import os
-
+import pathlib
 
 import ai_edge_torch
 from ai_edge_torch.generative.examples.gemma import gemma2
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.quantize import quant_recipes
 import torch
 
 
-def
+def convert_gemma2_to_tflite(
     checkpoint_path: str,
     prefill_seq_len: int = 512,
     kv_cache_max_len: int = 1024,
     quantize: bool = True,
 ):
-  """
-  tflite model.
+  """Converts a Gemma2 2B model to multi-signature tflite model.
 
   Args:
-      checkpoint_path (str): The filepath to the model checkpoint, or directory
+    checkpoint_path (str): The filepath to the model checkpoint, or directory
+      holding the checkpoint.
     prefill_seq_len (int, optional): The maximum size of prefill input tensor.
       Defaults to 512.
     kv_cache_max_len (int, optional): The maximum size of KV cache buffer,
       including both prefill and decode. Defaults to 1024.
-    quantize (bool, optional): Whether the model should be quanized.
-
+    quantize (bool, optional): Whether the model should be quanized. Defaults
+      to True.
   """
   pytorch_model = gemma2.build_2b_model(
      checkpoint_path, kv_cache_max_len=kv_cache_max_len
   )
   # Tensors used to trace the model graph during conversion.
-  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.
-  prefill_input_pos = torch.arange(0, prefill_seq_len)
-  decode_token = torch.tensor([[0]], dtype=torch.
-  decode_input_pos = torch.tensor([0], dtype=torch.
+  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.int)
+  prefill_input_pos = torch.arange(0, prefill_seq_len, dtype=torch.int)
+  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
+  kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
 
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
   edge_model = (
      ai_edge_torch.signature(
-          'prefill', pytorch_model, (prefill_tokens, prefill_input_pos)
+          'prefill',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': prefill_tokens,
+              'input_pos': prefill_input_pos,
+              'kv_cache': kv,
+          },
+      )
+      .signature(
+          'decode',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': decode_token,
+              'input_pos': decode_input_pos,
+              'kv_cache': kv,
+          },
      )
-      .signature('decode', pytorch_model, (decode_token, decode_input_pos))
      .convert(quant_config=quant_config)
  )
+  quant_suffix = 'q8' if quantize else 'f32'
  edge_model.export(
-      f'/tmp/
+      f'/tmp/gemma2_{quant_suffix}_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
  )


if __name__ == '__main__':
-
-
+  path = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/gemma2-2b')
+  convert_gemma2_to_tflite(path)
ai_edge_torch/generative/examples/gemma/convert_to_tflite.py
CHANGED
@@ -13,11 +13,14 @@
 # limitations under the License.
 # ==============================================================================
 
+"""Example of converting a Gemma model to multi-signature tflite model."""
+
 import os
-
+import pathlib
 
 import ai_edge_torch
 from ai_edge_torch.generative.examples.gemma import gemma
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.quantize import quant_recipes
 import torch
 
@@ -44,24 +47,40 @@ def convert_gemma_to_tflite(
      checkpoint_path, kv_cache_max_len=kv_cache_max_len
  )
  # Tensors used to trace the model graph during conversion.
-  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.
-  prefill_input_pos = torch.arange(0, prefill_seq_len)
-  decode_token = torch.tensor([[0]], dtype=torch.
-  decode_input_pos = torch.tensor([0], dtype=torch.
+  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.int)
+  prefill_input_pos = torch.arange(0, prefill_seq_len, dtype=torch.int)
+  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
+  kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
 
  quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
  edge_model = (
      ai_edge_torch.signature(
-          'prefill', pytorch_model, (prefill_tokens, prefill_input_pos)
+          'prefill',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': prefill_tokens,
+              'input_pos': prefill_input_pos,
+              'kv_cache': kv,
+          },
+      )
+      .signature(
+          'decode',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': decode_token,
+              'input_pos': decode_input_pos,
+              'kv_cache': kv,
+          },
      )
-      .signature('decode', pytorch_model, (decode_token, decode_input_pos))
      .convert(quant_config=quant_config)
  )
+  quant_suffix = 'q8' if quantize else 'f32'
  edge_model.export(
-      f'/tmp/
+      f'/tmp/gemma_{quant_suffix}_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
  )


if __name__ == '__main__':
-
-  convert_gemma_to_tflite(
+  path = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/gemma-2b')
+  convert_gemma_to_tflite(path)
ai_edge_torch/generative/examples/gemma/gemma.py
CHANGED
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
+
+"""Example of building a Gemma model."""
 
 import os
-
+import pathlib
 
 from ai_edge_torch.generative.layers import attention
 from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.utilities.loader as loading_utils
@@ -48,7 +50,6 @@ class Gemma(nn.Module):
   def __init__(self, config: cfg.ModelConfig):
     super().__init__()
 
-    self.config = config
     # Construct model layers.
     self.tok_embedding = nn.Embedding(
         config.vocab_size, config.embedding_dim, padding_idx=0
@@ -60,18 +61,20 @@ class Gemma(nn.Module):
     )
     # Gemma re-uses the embedding as the head projection layer.
     self.lm_head.weight.data = self.tok_embedding.weight.data
+    # Gemma has only one block config.
+    block_config = config.block_config(0)
     self.transformer_blocks = nn.ModuleList(
-        attention.TransformerBlock(config) for _ in range(config.num_layers)
+        attention.TransformerBlock(block_config, config)
+        for _ in range(config.num_layers)
     )
     self.final_norm = builder.build_norm(
         config.embedding_dim,
         config.final_norm_config,
     )
+    attn_config = block_config.attn_config
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.kv_cache_max,
-        dim=int(
-            config.attn_config.rotary_percentage * config.attn_config.head_dim
-        ),
+        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
         base=10_000,
         condense_ratio=1,
         dtype=torch.float32,
@@ -84,16 +87,22 @@ class Gemma(nn.Module):
     )
     self.config = config
 
-  # The model's forward function takes in additional k/v cache tensors
-  # and returns the updated k/v cache tensors to the caller.
-  # This can be eliminated if we handle k/v cache updates inside the model itself.
   @torch.inference_mode
-  def forward(
-
+  def forward(
+      self,
+      tokens: torch.Tensor,
+      input_pos: torch.Tensor,
+      kv_cache: kv_utils.KVCache,
+  ) -> dict[torch.Tensor, kv_utils.KVCache]:
+    _, seq_len = tokens.size()
     assert self.config.max_seq_len >= seq_len, (
         f"Cannot forward sequence of length {seq_len}, max seq length is only"
         f" {self.config.max_seq_len}"
     )
+    assert len(self.transformer_blocks) == len(kv_cache.caches), (
+        "The number of transformer blocks and the number of KV cache entries"
+        " must be the same."
+    )
 
     cos, sin = self.rope_cache
     cos = cos.index_select(0, input_pos)
@@ -102,15 +111,20 @@ class Gemma(nn.Module):
     mask = mask[:, :, :, : self.config.kv_cache_max]
 
     # token embeddings of shape (b, t, n_embd)
-    x = self.tok_embedding(
+    x = self.tok_embedding(tokens)
     x = x * (self.config.embedding_dim**0.5)
 
-
-
+    updated_kv_entires = []
+    for i, block in enumerate(self.transformer_blocks):
+      kv_entry = kv_cache.caches[i] if kv_cache else None
+      x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
+      if kv_entry:
+        updated_kv_entires.append(kv_entry)
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
 
     x = self.final_norm(x)
-
-    return
+    logits = self.lm_head(x)  # (b, t, vocab_size)
+    return {"logits": logits, "kv_cache": updated_kv_cache}
 
 
 def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
@@ -139,18 +153,20 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       epsilon=1e-6,
       zero_centered=True,
   )
+  block_config = cfg.TransformerBlockConfig(
+      attn_config=attn_config,
+      ff_config=ff_config,
+      pre_attention_norm_config=norm_config,
+      post_attention_norm_config=norm_config,
+  )
   config = cfg.ModelConfig(
       vocab_size=256000,
      num_layers=18,
      max_seq_len=8192,
      embedding_dim=2048,
      kv_cache_max_len=kv_cache_max_len,
-      attn_config=attn_config,
-      ff_config=ff_config,
-      pre_attention_norm_config=norm_config,
-      post_attention_norm_config=norm_config,
+      block_configs=block_config,
      final_norm_config=norm_config,
-      parallel_residual=False,
      lm_head_use_bias=False,
      enable_hlfb=True,
  )
@@ -159,7 +175,8 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
 
 def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
   config = get_model_config_2b(kv_cache_max_len)
-  config.ff_config.intermediate_size = 128
+  # Gemma has only one block config.
+  config.block_config(0).ff_config.intermediate_size = 128
   config.vocab_size = 128
   config.num_layers = 2
   config.max_seq_len = 2 * kv_cache_max_len
@@ -170,32 +187,35 @@ def build_2b_model(checkpoint_path: str, **kwargs) -> nn.Module:
   config = get_model_config_2b(**kwargs)
   model = Gemma(config)
   loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
-  #
+  # Since embedding and lm-head use the same weight, we need to set strict
   # to False.
   loader.load(model, strict=False)
   model.eval()
   return model
 
 
-def define_and_run_2b() -> None:
+def define_and_run_2b(checkpoint_path: str) -> None:
   """Instantiates and runs a Gemma 2B model."""
 
-  current_dir = Path(__file__).parent.resolve()
+  current_dir = pathlib.Path(__file__).parent.resolve()
   gemma_goldens = torch.load(current_dir / "gemma_lm_logits.pt")
 
   kv_cache_max_len = 1024
-  checkpoint_path = os.path.join(Path.home(), "Downloads/llm_data/gemma-2b")
   model = build_2b_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
   idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.
+  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.int, device="cpu")
   tokens[0, :4] = idx
-  input_pos = torch.arange(0, kv_cache_max_len)
-
+  input_pos = torch.arange(0, kv_cache_max_len, dtype=torch.int)
+  kv = kv_utils.KVCache.from_model_config(model.config)
+  output = model.forward(tokens, input_pos, kv)
   print("comparing with goldens..")
   assert torch.allclose(
-      gemma_goldens,
+      gemma_goldens, output["logits"][0, idx.shape[1] - 1, :], atol=1e-02
   )
 
 
 if __name__ == "__main__":
-
+  input_checkpoint_path = os.path.join(
+      pathlib.Path.home(), "Downloads/llm_data/gemma-2b"
+  )
+  define_and_run_2b(input_checkpoint_path)
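Taken together, the gemma.py changes replace the old in-model k/v cache handling with a functional kv_utils.KVCache that is passed into forward and returned, updated, alongside the logits. A minimal decode-loop sketch against this interface; the checkpoint path and start-token id are placeholders, and greedy argmax decoding is used only for illustration:

import torch
from ai_edge_torch.generative.examples.gemma import gemma
from ai_edge_torch.generative.layers import kv_cache as kv_utils

model = gemma.build_2b_model("/path/to/gemma-2b", kv_cache_max_len=1024)
kv = kv_utils.KVCache.from_model_config(model.config)

token = torch.tensor([[2]], dtype=torch.int)  # placeholder start-token id
for pos in range(8):
  input_pos = torch.tensor([pos], dtype=torch.int)
  output = model.forward(token, input_pos, kv)
  kv = output["kv_cache"]  # thread the updated cache into the next step
  token = output["logits"][:, -1:].argmax(dim=-1).to(torch.int)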