PyPI - ai-edge-torch-nightly - Versions diffs - 0.3.0.dev20240910__py3-none-any.whl → 0.3.0.dev20240914__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.3.0.dev20240910__py3-none-any.whl → 0.3.0.dev20240914__py3-none-any.whl

Files changed (68) hide show

ai_edge_torch/_convert/conversion.py CHANGED Viewed

@@ -17,6 +17,7 @@ import logging
 import os
 from typing import Any, Optional
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch import lowertools
 from ai_edge_torch import model
 from ai_edge_torch._convert import fx_passes
@@ -34,7 +35,7 @@ def _run_convert_passes(
   exported_program = generative_fx_passes.run_generative_passes(
       exported_program
   )
-  return fx_passes.run_passes(
+  return fx_pass_base.run_passes(
       exported_program,
       [
           fx_passes.BuildInterpolateCompositePass(),

ai_edge_torch/_convert/fx_passes/__init__.py CHANGED Viewed

@@ -15,44 +15,8 @@
 from typing import Sequence, Union
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassBase
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassResult  # NOQA
-from ai_edge_torch._convert.fx_passes._pass_base import FxPassBase
-from ai_edge_torch._convert.fx_passes._pass_base import FxPassResult
-from ai_edge_torch._convert.fx_passes.build_aten_composite_pass import BuildAtenCompositePass  # NOQA
-from ai_edge_torch._convert.fx_passes.build_interpolate_composite_pass import BuildInterpolateCompositePass  # NOQA
-from ai_edge_torch._convert.fx_passes.canonicalize_pass import CanonicalizePass
-from ai_edge_torch._convert.fx_passes.inject_mlir_debuginfo_pass import InjectMlirDebuginfoPass  # NOQA
-from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import OptimizeLayoutTransposesPass  # NOQA
-from torch.export import ExportedProgram
-from torch.fx.passes.infra.pass_manager import pass_result_wrapper
-import torch.utils._pytree as pytree
-# TODO(cnchan): make a PassManager class.
-def run_passes(
-    exported_program: ExportedProgram,
-    passes: Sequence[Union[ExportedProgramPassBase, FxPassBase]],
-) -> ExportedProgram:
-  passes, _ = pytree.tree_flatten(passes)
-  for pass_ in passes:
-    if not isinstance(pass_, ExportedProgramPassBase):
-      pass_ = pass_result_wrapper(pass_)
-    if isinstance(pass_, ExportedProgramPassBase):
-      exported_program = pass_(exported_program).exported_program
-    else:
-      gm = exported_program.graph_module
-      gm, modified = pass_(gm)
-      if modified and gm is not exported_program.graph_module:
-        exported_program = ExportedProgram(
-            root=gm,
-            graph=gm.graph,
-            graph_signature=exported_program.graph_signature,
-            state_dict=exported_program.state_dict,
-            range_constraints=exported_program.range_constraints,
-            module_call_graph=exported_program.module_call_graph,
-            example_inputs=exported_program.example_inputs,
-            verifier=exported_program.verifier,
-            constants=exported_program.constants,
-        )
-  return exported_program
+from ai_edge_torch._convert.fx_passes.build_aten_composite_pass import BuildAtenCompositePass
+from ai_edge_torch._convert.fx_passes.build_interpolate_composite_pass import BuildInterpolateCompositePass
+from ai_edge_torch._convert.fx_passes.inject_mlir_debuginfo_pass import InjectMlirDebuginfoPass
+from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import OptimizeLayoutTransposesPass
+from ai_edge_torch.fx_pass_base import CanonicalizePass

ai_edge_torch/_convert/fx_passes/build_aten_composite_pass.py CHANGED Viewed

@@ -13,11 +13,10 @@
 # limitations under the License.
 # ==============================================================================
-from functools import reduce
 from typing import Any, Callable
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch import lowertools
 import torch
-from torch.fx.passes.infra import pass_base
 import torch.utils._pytree as pytree
 _composite_builders: dict[
@@ -277,7 +276,7 @@ def _aten_embedding(gm: torch.fx.GraphModule, node: torch.fx.Node):
   node.target = embedding
-class BuildAtenCompositePass(pass_base.PassBase):
+class BuildAtenCompositePass(fx_pass_base.PassBase):
   def call(self, graph_module: torch.fx.GraphModule):
     for node in graph_module.graph.nodes:
@@ -286,4 +285,4 @@ class BuildAtenCompositePass(pass_base.PassBase):
     graph_module.graph.lint()
     graph_module.recompile()
-    return pass_base.PassResult(graph_module, True)
+    return fx_pass_base.PassResult(graph_module, True)

ai_edge_torch/_convert/fx_passes/build_interpolate_composite_pass.py CHANGED Viewed

@@ -16,8 +16,7 @@
 import functools
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassBase
-from ai_edge_torch._convert.fx_passes._pass_base import ExportedProgramPassResult  # NOQA
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch.hlfb import mark_pattern
 from ai_edge_torch.hlfb.mark_pattern import pattern as pattern_module
 import torch
@@ -103,7 +102,7 @@ def _get_interpolate_nearest2d_pattern():
   return pattern
-class BuildInterpolateCompositePass(ExportedProgramPassBase):
+class BuildInterpolateCompositePass(fx_pass_base.ExportedProgramPassBase):
   def __init__(self):
     super().__init__()
@@ -124,4 +123,4 @@ class BuildInterpolateCompositePass(ExportedProgramPassBase):
     graph_module.graph.lint()
     graph_module.recompile()
-    return ExportedProgramPassResult(exported_program, True)
+    return fx_pass_base.ExportedProgramPassResult(exported_program, True)

ai_edge_torch/_convert/fx_passes/inject_mlir_debuginfo_pass.py CHANGED Viewed

@@ -13,10 +13,9 @@
 # limitations under the License.
 # ==============================================================================
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch import lowertools
 import torch
-from torch.fx.passes.infra.pass_base import PassBase
-from torch.fx.passes.infra.pass_base import PassResult
 import torch.utils._pytree as pytree
@@ -62,7 +61,7 @@ def _wrap_call_function_node_with_debuginfo_writer(node: torch.fx.GraphModule):
   node.target = debuginfo_writer
-class InjectMlirDebuginfoPass(PassBase):
+class InjectMlirDebuginfoPass(fx_pass_base.PassBase):
   def call(self, graph_module: torch.fx.GraphModule):
     for node in graph_module.graph.nodes:
@@ -70,4 +69,4 @@ class InjectMlirDebuginfoPass(PassBase):
     graph_module.graph.lint()
     graph_module.recompile()
-    return PassResult(graph_module, True)
+    return fx_pass_base.PassResult(graph_module, True)

ai_edge_torch/_convert/fx_passes/optimize_layout_transposes_pass/pass_body.py CHANGED Viewed

@@ -18,8 +18,7 @@ import operator
 import os
 from typing import Union
-from ai_edge_torch._convert.fx_passes import ExportedProgramPassBase
-from ai_edge_torch._convert.fx_passes import ExportedProgramPassResult
+from ai_edge_torch import fx_pass_base
 from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import layout_check  # NOQA
 from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import layout_mark  # NOQA
 from ai_edge_torch._convert.fx_passes.optimize_layout_transposes_pass import layout_partitioners  # NOQA
@@ -31,7 +30,7 @@ import torch.ao.quantization.quantize_pt2e
 TransposeFunc = Union[utils.tensor_to_nchw, utils.tensor_to_nhwc]
-class OptimizeLayoutTransposesPass(ExportedProgramPassBase):
+class OptimizeLayoutTransposesPass(fx_pass_base.ExportedProgramPassBase):
   def get_source_meta(self, node: torch.fx.Node):
     keys = ["stack_trace", "nn_module_stack", "source_fn_stack", "from_node"]
@@ -94,7 +93,7 @@ class OptimizeLayoutTransposesPass(ExportedProgramPassBase):
     q_args = input_q.args[1:]
     q_kwargs = input_q.kwargs
-    q_op, dq_op = self.get_paired_q_dq_ops(input_q.target)
+    q_op, dq_op = utils.get_paired_q_dq_ops(input_q.target)
     with graph.inserting_before(target):
       # Q and DQ inserted here may required updating the `axis` arg when they
       # are per_channel ops. However, instead of updating here, the nodes would
@@ -301,4 +300,4 @@ class OptimizeLayoutTransposesPass(ExportedProgramPassBase):
     # Mark const node again for debugging
     self.mark_const_nodes(exported_program)
-    return ExportedProgramPassResult(exported_program, True)
+    return fx_pass_base.ExportedProgramPassResult(exported_program, True)

ai_edge_torch/config.py CHANGED Viewed

@@ -21,4 +21,7 @@ import os
 @dataclasses.dataclass
 class Config:
-  use_torch_xla: bool = os.environ.get("USE_TORCH_XLA", "True") == "True"
+  use_torch_xla: bool = os.environ.get("USE_TORCH_XLA", "true").lower() in (
+      "1",
+      "true",
+  )

ai_edge_torch/fx_pass_base.py ADDED Viewed

@@ -0,0 +1,101 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import abc
+import collections
+from typing import Sequence, Union
+import torch
+from torch.fx.passes.infra.pass_base import PassBase
+from torch.fx.passes.infra.pass_base import PassResult
+from torch.fx.passes.infra.pass_manager import pass_result_wrapper
+import torch.utils._pytree as pytree
+FxPassBase = PassBase
+FxPassResult = PassResult
+ExportedProgramPassResult = collections.namedtuple(
+    "ExportedProgramPassResult", ["exported_program", "modified"]
+)
+class ExportedProgramPassBase(abc.ABC):
+  def __call__(
+      self, exported_program: torch.export.ExportedProgram
+  ) -> ExportedProgramPassResult:
+    self.requires(exported_program)
+    res = self.call(exported_program)
+    self.ensures(exported_program)
+    return res
+  @abc.abstractmethod
+  def call(
+      self, exported_program: torch.export.ExportedProgram
+  ) -> ExportedProgramPassResult:
+    pass
+  def requires(self, exported_program: torch.export.ExportedProgram) -> None:
+    pass
+  def ensures(self, exported_program: torch.export.ExportedProgram) -> None:
+    pass
+# TODO(cnchan): make a PassManager class.
+def run_passes(
+    exported_program: torch.export.ExportedProgram,
+    passes: Sequence[Union[ExportedProgramPassBase, FxPassBase]],
+) -> torch.export.ExportedProgram:
+  passes, _ = pytree.tree_flatten(passes)
+  for pass_ in passes:
+    if not isinstance(pass_, ExportedProgramPassBase):
+      pass_ = pass_result_wrapper(pass_)
+    if isinstance(pass_, ExportedProgramPassBase):
+      exported_program = pass_(exported_program).exported_program
+    else:
+      gm = exported_program.graph_module
+      gm, modified = pass_(gm)
+      if modified and gm is not exported_program.graph_module:
+        exported_program = torch.export.ExportedProgram(
+            root=gm,
+            graph=gm.graph,
+            graph_signature=exported_program.graph_signature,
+            state_dict=exported_program.state_dict,
+            range_constraints=exported_program.range_constraints,
+            module_call_graph=exported_program.module_call_graph,
+            example_inputs=exported_program.example_inputs,
+            verifier=exported_program.verifier,
+            constants=exported_program.constants,
+        )
+  return exported_program
+class CanonicalizePass(ExportedProgramPassBase):
+  # A dummy decomp table for running ExportedProgram.run_decompositions without
+  # any op decompositions but just aot_export_module. Due to the check in
+  # run_decompositions, if None or an empty dict is passed as decomp_table,
+  # it will run the default aten-coreaten decompositions. Therefore a non-empty
+  # dummy decomp table is needed.
+  # Ref: https://github.com/pytorch/pytorch/blob/db895ace1d36726e64781774f53b3d3098206116/torch/export/exported_program.py#L543
+  _DUMMY_DECOMP_TABLE = {
+      torch._ops.OperatorBase(): lambda: None,
+  }
+  def call(self, exported_program: torch.export.ExportedProgram):
+    exported_program = exported_program.run_decompositions(
+        self._DUMMY_DECOMP_TABLE
+    )
+    return ExportedProgramPassResult(exported_program, True)

ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py CHANGED Viewed

@@ -13,55 +13,74 @@
 # limitations under the License.
 # ==============================================================================
+"""Example of converting a Gemma2 model to multi-signature tflite model."""
 import os
-from pathlib import Path
+import pathlib
 import ai_edge_torch
 from ai_edge_torch.generative.examples.gemma import gemma2
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.quantize import quant_recipes
 import torch
-def convert_gemma_to_tflite(
+def convert_gemma2_to_tflite(
     checkpoint_path: str,
     prefill_seq_len: int = 512,
     kv_cache_max_len: int = 1024,
     quantize: bool = True,
 ):
-  """Converting a Gemma 2 2B model to multi-signature
-  tflite model.
+  """Converts a Gemma2 2B model to multi-signature tflite model.
   Args:
-      checkpoint_path (str): The filepath to the model checkpoint, or directory holding the checkpoint.
+      checkpoint_path (str): The filepath to the model checkpoint, or directory
+        holding the checkpoint.
       prefill_seq_len (int, optional): The maximum size of prefill input tensor.
         Defaults to 512.
       kv_cache_max_len (int, optional): The maximum size of KV cache buffer,
         including both prefill and decode. Defaults to 1024.
-      quantize (bool, optional): Whether the model should be quanized.
-        Defaults to True.
+      quantize (bool, optional): Whether the model should be quanized. Defaults
+        to True.
   """
   pytorch_model = gemma2.build_2b_model(
       checkpoint_path, kv_cache_max_len=kv_cache_max_len
   )
   # Tensors used to trace the model graph during conversion.
-  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.long)
-  prefill_input_pos = torch.arange(0, prefill_seq_len)
-  decode_token = torch.tensor([[0]], dtype=torch.long)
-  decode_input_pos = torch.tensor([0], dtype=torch.int64)
+  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.int)
+  prefill_input_pos = torch.arange(0, prefill_seq_len, dtype=torch.int)
+  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
+  kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
   edge_model = (
       ai_edge_torch.signature(
-          'prefill', pytorch_model, (prefill_tokens, prefill_input_pos)
+          'prefill',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': prefill_tokens,
+              'input_pos': prefill_input_pos,
+              'kv_cache': kv,
+          },
+      )
+      .signature(
+          'decode',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': decode_token,
+              'input_pos': decode_input_pos,
+              'kv_cache': kv,
+          },
       )
-      .signature('decode', pytorch_model, (decode_token, decode_input_pos))
       .convert(quant_config=quant_config)
   )
+  quant_suffix = 'q8' if quantize else 'f32'
   edge_model.export(
-      f'/tmp/gemma2_seq{prefill_seq_len}_kv{kv_cache_max_len}.tflite'
+      f'/tmp/gemma2_{quant_suffix}_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
   )
 if __name__ == '__main__':
-  checkpoint_path = os.path.join(Path.home(), 'Downloads/llm_data/gemma2-2b')
-  convert_gemma_to_tflite(checkpoint_path)
+  path = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/gemma2-2b')
+  convert_gemma2_to_tflite(path)

ai_edge_torch/generative/examples/gemma/convert_to_tflite.py CHANGED Viewed

@@ -13,11 +13,14 @@
 # limitations under the License.
 # ==============================================================================
+"""Example of converting a Gemma model to multi-signature tflite model."""
 import os
-from pathlib import Path
+import pathlib
 import ai_edge_torch
 from ai_edge_torch.generative.examples.gemma import gemma
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.quantize import quant_recipes
 import torch
@@ -44,24 +47,40 @@ def convert_gemma_to_tflite(
       checkpoint_path, kv_cache_max_len=kv_cache_max_len
   )
   # Tensors used to trace the model graph during conversion.
-  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.long)
-  prefill_input_pos = torch.arange(0, prefill_seq_len)
-  decode_token = torch.tensor([[0]], dtype=torch.long)
-  decode_input_pos = torch.tensor([0], dtype=torch.int64)
+  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.int)
+  prefill_input_pos = torch.arange(0, prefill_seq_len, dtype=torch.int)
+  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_input_pos = torch.tensor([0], dtype=torch.int)
+  kv = kv_utils.KVCache.from_model_config(pytorch_model.config)
   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
   edge_model = (
       ai_edge_torch.signature(
-          'prefill', pytorch_model, (prefill_tokens, prefill_input_pos)
+          'prefill',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': prefill_tokens,
+              'input_pos': prefill_input_pos,
+              'kv_cache': kv,
+          },
+      )
+      .signature(
+          'decode',
+          pytorch_model,
+          sample_kwargs={
+              'tokens': decode_token,
+              'input_pos': decode_input_pos,
+              'kv_cache': kv,
+          },
       )
-      .signature('decode', pytorch_model, (decode_token, decode_input_pos))
       .convert(quant_config=quant_config)
   )
+  quant_suffix = 'q8' if quantize else 'f32'
   edge_model.export(
-      f'/tmp/gemma_seq{prefill_seq_len}_kv{kv_cache_max_len}.tflite'
+      f'/tmp/gemma_{quant_suffix}_seq{prefill_seq_len}_ekv{kv_cache_max_len}.tflite'
   )
 if __name__ == '__main__':
-  checkpoint_path = os.path.join(Path.home(), 'Downloads/llm_data/gemma-2b')
-  convert_gemma_to_tflite(checkpoint_path)
+  path = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/gemma-2b')
+  convert_gemma_to_tflite(path)

ai_edge_torch/generative/examples/gemma/gemma.py CHANGED Viewed

@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# Example of building a Gemma model.
+"""Example of building a Gemma model."""
 import os
-from pathlib import Path
+import pathlib
 from ai_edge_torch.generative.layers import attention
 from ai_edge_torch.generative.layers import builder
+from ai_edge_torch.generative.layers import kv_cache as kv_utils
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
 import ai_edge_torch.generative.layers.model_config as cfg
 import ai_edge_torch.generative.utilities.loader as loading_utils
@@ -48,7 +50,6 @@ class Gemma(nn.Module):
   def __init__(self, config: cfg.ModelConfig):
     super().__init__()
-    self.config = config
     # Construct model layers.
     self.tok_embedding = nn.Embedding(
         config.vocab_size, config.embedding_dim, padding_idx=0
@@ -60,18 +61,20 @@ class Gemma(nn.Module):
     )
     # Gemma re-uses the embedding as the head projection layer.
     self.lm_head.weight.data = self.tok_embedding.weight.data
+    # Gemma has only one block config.
+    block_config = config.block_config(0)
     self.transformer_blocks = nn.ModuleList(
-        attention.TransformerBlock(config) for _ in range(config.num_layers)
+        attention.TransformerBlock(block_config, config)
+        for _ in range(config.num_layers)
     )
     self.final_norm = builder.build_norm(
         config.embedding_dim,
         config.final_norm_config,
     )
+    attn_config = block_config.attn_config
     self.rope_cache = attn_utils.build_rope_cache(
         size=config.kv_cache_max,
-        dim=int(
-            config.attn_config.rotary_percentage * config.attn_config.head_dim
-        ),
+        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
         base=10_000,
         condense_ratio=1,
         dtype=torch.float32,
@@ -84,16 +87,22 @@ class Gemma(nn.Module):
     )
     self.config = config
-  # The model's forward function takes in additional k/v cache tensors
-  # and returns the updated k/v cache tensors to the caller.
-  # This can be eliminated if we handle k/v cache updates inside the model itself.
   @torch.inference_mode
-  def forward(self, idx: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor:
-    _, seq_len = idx.size()
+  def forward(
+      self,
+      tokens: torch.Tensor,
+      input_pos: torch.Tensor,
+      kv_cache: kv_utils.KVCache,
+  ) -> dict[torch.Tensor, kv_utils.KVCache]:
+    _, seq_len = tokens.size()
     assert self.config.max_seq_len >= seq_len, (
         f"Cannot forward sequence of length {seq_len}, max seq length is only"
         f" {self.config.max_seq_len}"
     )
+    assert len(self.transformer_blocks) == len(kv_cache.caches), (
+        "The number of transformer blocks and the number of KV cache entries"
+        " must be the same."
+    )
     cos, sin = self.rope_cache
     cos = cos.index_select(0, input_pos)
@@ -102,15 +111,20 @@ class Gemma(nn.Module):
     mask = mask[:, :, :, : self.config.kv_cache_max]
     # token embeddings of shape (b, t, n_embd)
-    x = self.tok_embedding(idx)
+    x = self.tok_embedding(tokens)
     x = x * (self.config.embedding_dim**0.5)
-    for _, block in enumerate(self.transformer_blocks):
-      x = block(x, (cos, sin), mask, input_pos)
+    updated_kv_entires = []
+    for i, block in enumerate(self.transformer_blocks):
+      kv_entry = kv_cache.caches[i] if kv_cache else None
+      x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
+      if kv_entry:
+        updated_kv_entires.append(kv_entry)
+    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entires))
     x = self.final_norm(x)
-    res = self.lm_head(x)  # (b, t, vocab_size)
-    return res
+    logits = self.lm_head(x)  # (b, t, vocab_size)
+    return {"logits": logits, "kv_cache": updated_kv_cache}
 def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
@@ -139,18 +153,20 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       epsilon=1e-6,
       zero_centered=True,
   )
+  block_config = cfg.TransformerBlockConfig(
+      attn_config=attn_config,
+      ff_config=ff_config,
+      pre_attention_norm_config=norm_config,
+      post_attention_norm_config=norm_config,
+  )
   config = cfg.ModelConfig(
       vocab_size=256000,
       num_layers=18,
       max_seq_len=8192,
       embedding_dim=2048,
       kv_cache_max_len=kv_cache_max_len,
-      attn_config=attn_config,
-      ff_config=ff_config,
-      pre_attention_norm_config=norm_config,
-      post_attention_norm_config=norm_config,
+      block_configs=block_config,
       final_norm_config=norm_config,
-      parallel_residual=False,
       lm_head_use_bias=False,
       enable_hlfb=True,
   )
@@ -159,7 +175,8 @@ def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
 def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
   config = get_model_config_2b(kv_cache_max_len)
-  config.ff_config.intermediate_size = 128
+  # Gemma has only one block config.
+  config.block_config(0).ff_config.intermediate_size = 128
   config.vocab_size = 128
   config.num_layers = 2
   config.max_seq_len = 2 * kv_cache_max_len
@@ -170,32 +187,35 @@ def build_2b_model(checkpoint_path: str, **kwargs) -> nn.Module:
   config = get_model_config_2b(**kwargs)
   model = Gemma(config)
   loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
-  # since embedding and lm-head use the same weight, we need to set strict
+  # Since embedding and lm-head use the same weight, we need to set strict
   # to False.
   loader.load(model, strict=False)
   model.eval()
   return model
-def define_and_run_2b() -> None:
+def define_and_run_2b(checkpoint_path: str) -> None:
   """Instantiates and runs a Gemma 2B model."""
-  current_dir = Path(__file__).parent.resolve()
+  current_dir = pathlib.Path(__file__).parent.resolve()
   gemma_goldens = torch.load(current_dir / "gemma_lm_logits.pt")
   kv_cache_max_len = 1024
-  checkpoint_path = os.path.join(Path.home(), "Downloads/llm_data/gemma-2b")
   model = build_2b_model(checkpoint_path, kv_cache_max_len=kv_cache_max_len)
   idx = torch.from_numpy(np.array([[1, 2, 3, 4]]))
-  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.long, device="cpu")
+  tokens = torch.full((1, kv_cache_max_len), 0, dtype=torch.int, device="cpu")
   tokens[0, :4] = idx
-  input_pos = torch.arange(0, kv_cache_max_len)
-  lm_logits = model.forward(tokens, input_pos)
+  input_pos = torch.arange(0, kv_cache_max_len, dtype=torch.int)
+  kv = kv_utils.KVCache.from_model_config(model.config)
+  output = model.forward(tokens, input_pos, kv)
   print("comparing with goldens..")
   assert torch.allclose(
-      gemma_goldens, lm_logits[0, idx.shape[1] - 1, :], atol=1e-05
+      gemma_goldens, output["logits"][0, idx.shape[1] - 1, :], atol=1e-02
   )
 if __name__ == "__main__":
-  define_and_run_2b()
+  input_checkpoint_path = os.path.join(
+      pathlib.Path.home(), "Downloads/llm_data/gemma-2b"
+  )
+  define_and_run_2b(input_checkpoint_path)

ai-edge-torch-nightly 0.3.0.dev20240910__py3-none-any.whl → 0.3.0.dev20240914__py3-none-any.whl

ai-edge-torch-nightly 0.3.0.dev20240910__py3-none-any.whl → 0.3.0.dev20240914__py3-none-any.whl