PyPI - mediapipe-nightly - Versions diffs - 0.10.21.post20241223__cp39-cp39-manylinux_2_28_x86_64.whl - Mend

mediapipe-nightly 0.10.21.post20241223__cp39-cp39-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (593) hide show

mediapipe/tasks/python/genai/converter/converter_base.py ADDED Viewed

@@ -0,0 +1,179 @@
+# Copyright 2024 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Defines a couple base classes for the conversion/quantization process."""
+from typing import Iterator
+import os
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+class QuantizationAction:
+  """Container of the tensor values and its corresponding quantization settings.
+  The container is responsible for hosting all of the information that is
+  required to execute the weight-only quantization.
+  Attributes:
+    tensor_name: A string that represents the input tensor name.
+    tensor_value: A numpy array that contains the unquantized tensor values.
+    target_name: A string that represents the updated tensor name.
+    quantize_axis: A list of integers representing the dimensions to be
+      quantized along. For example, if an input tensor has shape [128, 256] and
+      the quantize_axis==[0], it means the quantization happens along the 0-th
+      dimension, resulting in [256] scaling factors.
+    quantize_bits: An integer that specifies the target quantization bits. It
+      currently only supports either 8 or 4 bits.
+    pack_dim: An integer specifying which dimension to pack the quantized bits.
+      This is only applicable when the quantize_bits == 4.
+  """
+  def __init__(
+      self,
+      tensor_name: str,
+      tensor_value: Optional[np.ndarray] = None,
+      target_name: Optional[str] = None,
+      quantize_axis: Optional[List[int]] = None,
+      quantize_bits: Optional[int] = None,
+      pack_dim: Optional[int] = 0,
+  ):
+    """Initializes the model attributes."""
+    self.tensor_name = tensor_name
+    self.tensor_value = tensor_value
+    self.target_name = target_name
+    self.quantize_axis = quantize_axis
+    self.quantize_bits = quantize_bits
+    self.pack_dim = pack_dim
+  def __str__(self) -> str:
+    output_string = "QuantizationAction(\n"
+    output_string += f"  tensor_name: {self.tensor_name}\n"
+    output_string += f"  target_name: {self.target_name}\n"
+    output_string += f"  quantize_axis: {self.quantize_axis}\n"
+    output_string += f"  quantize_bits: {self.quantize_bits}\n"
+    output_string += f"  pack_dim: {self.pack_dim}\n"
+    if self.tensor_value is not None:
+      output_string += f"  tensor_value: {self.tensor_value.shape}\n"
+    output_string += ")\n"
+    return output_string
+class CkptLoaderBase:
+  """Base class for loading the checkpoint.
+  This class is responsible for loading the checkpoint files into the layer
+  weight tensors (as numpy arrays) + quantization setting information (8/4
+  bits). The returned data should be a list of QuantizationAction that describes
+  how to quantize each layer weights.
+  """
+  def __init__(
+      self,
+      ckpt_path: str,
+      is_symmetric: bool,
+      attention_quant_bits: int,
+      feedforward_quant_bits: int,
+      embedding_quant_bits: int,
+  ):
+    """Initializes the loader.
+    Args:
+      ckpt_path: The filepath to the checkpoint.
+      is_symmetric: Whether to apply symmetric or asymmetric quantization.
+      attention_quant_bits: An integer that specify the target quantization bits
+        (support 8 or 4) for the attention layers.
+      feedforward_quant_bits: An integer that specify the target quantization
+        bits (support 8 or 4) for the feedforward layers in each Transformer
+        blocks.
+      embedding_quant_bits: An integer that specify the target quantization bits
+        (support 8 or 4) for the embedding (and the final projection) layers.
+    """
+    self._ckpt_path = ckpt_path
+    self._is_symmetric = is_symmetric
+    self._attention_quant_bits = attention_quant_bits
+    self._feedforward_quant_bits = feedforward_quant_bits
+    self._embedding_quant_bits = embedding_quant_bits
+  def load_to_actions(
+      self,
+  ) -> Iterator[Optional[List[QuantizationAction]]]:
+    """Loads the checkpoint and returns the quantization actions."""
+    raise NotImplementedError("The load_to_actions method is not implemented.")
+class LayerActionMapperBase:
+  """Base class for mapping the layer weights to quantization actions.
+  This class is responsible for mapping from each layer to its corresponding
+  quantization information (e.g. target quantization bits / updated tensor
+  name...).
+  """
+  def __init__(
+      self,
+      is_symmetric: bool,
+      attention_quant_bits: int,
+      feedforward_quant_bits: int,
+      embedding_quant_bits: int,
+      backend: str,
+  ):
+    self._is_symmetric = is_symmetric
+    self._attention_quant_bits = attention_quant_bits
+    self._feedforward_quant_bits = feedforward_quant_bits
+    self._embedding_quant_bits = embedding_quant_bits
+    self._backend = backend
+  def map_to_actions(
+      self, layer_name: str
+  ) -> Optional[List[QuantizationAction]]:
+    """Maps the layer weights to quantization actions.
+    Args:
+      layer_name: A string representing the name of the layer weight. Note that
+        it is expected the layer information is contained in the name which is
+        enough to determine the target quantization information. Any child class
+        is expected to implement this function.
+    """
+    raise NotImplementedError("The map_to_actions method is not implemented.")
+class ModelWriterBase:
+  """Base class for writing the quantized model.
+  This class is responsible for taking a dictionary of the quantized
+  tensors/names and writing them into the format that can be loaded by the
+  on-device inference engine.
+  """
+  def __init__(self, output_dir: str, backend: str):
+    """Initializes the class.
+    Args:
+      output_dir: A string that represents the output directory to write the
+        resulting file(s).
+      backend: A string that represents the target backend to run the output
+        file(s).
+    """
+    self._output_dir = output_dir
+    if not os.path.exists(self._output_dir):
+      os.mkdir(self._output_dir)
+    self._backend = backend
+  def write_variables(
+      self,
+      variables: Dict[str, Tuple[np.ndarray, bool]],
+      use_fake_values: bool = False,
+  ):
+    raise NotImplementedError("The write_variables method is not implemented.")

mediapipe/tasks/python/genai/converter/converter_factory.py ADDED Viewed

@@ -0,0 +1,79 @@
+# Copyright 2024 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utility library that helps create the converter instances."""
+from mediapipe.tasks.python.genai.converter import converter_base
+from mediapipe.tasks.python.genai.converter import pytorch_converter
+from mediapipe.tasks.python.genai.converter import safetensors_converter
+from mediapipe.tasks.python.genai.converter import weight_bins_writer
+def create_ckpt_loader(
+    ckpt_format: str, *args, **kwargs
+) -> converter_base.CkptLoaderBase:
+  """Creates the checkpoint loader.
+  Args:
+    ckpt_format: A string that indicates which input checkpoint format is.
+    *args: Additional arguments to be passed into the loader.
+    **kwargs: Additional arguments to be passed into the loader.
+  Returns:
+    A created CkptLoader instance.
+  """
+  del args
+  if ckpt_format == "pytorch":
+    return pytorch_converter.PytorchCkptLoader(
+        ckpt_path=kwargs["ckpt_path"],
+        is_symmetric=kwargs["is_symmetric"],
+        attention_quant_bits=kwargs["attention_quant_bits"],
+        feedforward_quant_bits=kwargs["feedforward_quant_bits"],
+        embedding_quant_bits=kwargs["embedding_quant_bits"],
+        special_model=kwargs["special_model"],
+        backend=kwargs["backend"],
+    )
+  elif ckpt_format == "safetensors":
+    return safetensors_converter.SafetensorsCkptLoader(
+        ckpt_path=kwargs["ckpt_path"],
+        is_symmetric=kwargs["is_symmetric"],
+        attention_quant_bits=kwargs["attention_quant_bits"],
+        feedforward_quant_bits=kwargs["feedforward_quant_bits"],
+        embedding_quant_bits=kwargs["embedding_quant_bits"],
+        special_model=kwargs["special_model"],
+        backend=kwargs["backend"],
+    )
+  else:
+    raise ValueError(f"Unknown checkpoint format: {ckpt_format}")
+def create_writer(
+    writer_type: str, *args, **kwargs
+) -> converter_base.ModelWriterBase:
+  """Creates the model writer.
+  Args:
+    writer_type: A string the indicates which model writer to create.
+    *args: Additional arguments to be passed into the loader.
+    **kwargs: Additional arguments to be passed into the loader.
+  Returns:
+    A created ModelWriter instance.
+  """
+  del args
+  if writer_type == "weight_bins":
+    return weight_bins_writer.WeightBinsWriter(
+        output_dir=kwargs["output_dir"], backend=kwargs["backend"]
+    )
+  else:
+    raise ValueError(f"Unknown writer type: {writer_type}")

mediapipe/tasks/python/genai/converter/llm_converter.py ADDED Viewed

@@ -0,0 +1,374 @@
+"""Functions to perform the checkpoint conversion."""
+import contextlib
+import os
+from typing import List, Optional
+from absl import logging
+import numpy as np
+from mediapipe.python._framework_bindings import model_ckpt_util
+from mediapipe.tasks.python.genai.converter import converter_base
+from mediapipe.tasks.python.genai.converter import converter_factory
+from mediapipe.tasks.python.genai.converter import quantization_util
+class ConversionConfig(object):
+  """Config for checkpoint conversion.
+  Attributes:
+    input_ckpt: Directory or path for the input checkpoint.
+    ckpt_format: Checkpoint format, e.g. 'safetensors', 'pytorch'.
+    model_type: Name of the model, e.g. GEMMA_2B.
+    backend: Target backend to run the model. Can be either "cpu" or "gpu".
+    output_dir: Where the output file(s) to be stored.
+    is_symmetric: Whether to quantize symmetrically.
+    attention_quant_bits: Target quantization bits for the attention layers.
+    feedforward_quant_bits: Target quantization bits for the feedforward layers.
+    embedding_quant_bits: Target quantization bits for the embedding layers.
+    combine_file_only: Whether to combine the weight files only (assuming the
+      weight files are already existed).
+    vocab_model_file: The file path to the 1) SentencePiece vocab model; 2)
+      Hugging Face BPE tokenizer files; 1) is applicable for the Gemma model and
+      2) is applicable for other models. When 2) is used, the provided path is
+      expected to point to a directory that contains both tokenizer.json and
+      tokenizer_config.json files.
+    obfuscate: Whether to obfuscate the model.
+    output_tflite_file: (optional) the output tflite filename. If not provided,
+      the output will be `model.tflite` stored in the output_dir.
+    fp16_scale: A scalar value between [0, 1]. Some models can run into
+      activation overflow issue when running in 16-bit floating point mode. To
+      solve this, we need to scale down the weights of certain layers. See
+      go/llm-on-device-fp16 for more detailed explanation.
+    lora_ckpt: The directory or path for the lora checkpoint. Required in order
+      to convert the lora weights.
+    lora_rank: An integer representing the rank of LoRA. Required in order to
+      convert the lora weights.If not provided, then the converter assumes there
+      is no LoRA weights. Note that only the GPU backend supports LoRA.
+    lora_output_tflite_file: A string indicating the name of the generated
+      tflite file for the LoRA weight. Only applicable when the lora_rank is not
+      zero.
+    image_encoder_file: A string with the name of the image encoder tflite file.
+    image_adapter_file: A string with the name of the image adapter tflite file.
+    submodel_type: Name of submodel, e.g. GEMMA_2B.
+    use_fake_weights: Whether to use fake weights. If set to True, the weights
+      will be filled with zeros.
+  """
+  def __init__(
+      self,
+      input_ckpt: str,
+      ckpt_format: str,
+      model_type: str,
+      backend: str,
+      output_dir: str,
+      is_symmetric: bool = True,
+      attention_quant_bits: int = 8,
+      feedforward_quant_bits: int = 8,
+      embedding_quant_bits: int = 8,
+      combine_file_only: bool = False,
+      vocab_model_file: str = '',
+      obfuscate: bool = False,
+      output_tflite_file: Optional[str] = None,
+      fp16_scale: Optional[float] = None,
+      lora_ckpt: Optional[str] = None,
+      lora_rank: Optional[int] = None,
+      lora_output_tflite_file: Optional[str] = None,
+      image_encoder_file: Optional[str] = None,
+      image_adapter_file: Optional[str] = None,
+      submodel_type: Optional[str] = None,
+      use_fake_weights: bool = False,
+  ):
+    self.input_ckpt = input_ckpt
+    self.ckpt_format = ckpt_format
+    self.model_type = model_type
+    self.backend = backend
+    if os.path.isfile(output_dir):
+      raise ValueError('Output directory mush not point to an existing file.')
+    if not os.path.isdir(output_dir):
+      logging.info('Creating output directory: %s', output_dir)
+      os.makedirs(output_dir, exist_ok=True)
+    self.output_dir = output_dir
+    self.is_symmetric = is_symmetric
+    self.attention_quant_bits = attention_quant_bits
+    self.feedforward_quant_bits = feedforward_quant_bits
+    self.embedding_quant_bits = embedding_quant_bits
+    self.combine_file_only = combine_file_only
+    self.vocab_model_file = vocab_model_file
+    self.obfuscate = obfuscate
+    self.image_encoder_file = image_encoder_file
+    self.image_adapter_file = image_adapter_file
+    self.submodel_type = submodel_type
+    self.use_fake_weights = use_fake_weights
+    if output_tflite_file:
+      parent_dir = os.path.dirname(output_tflite_file)
+      if not os.path.isdir(parent_dir):
+        logging.info('Creating tflite parent directory: %s', parent_dir)
+        os.makedirs(parent_dir, exist_ok=True)
+      self.output_tflite_file = output_tflite_file
+    else:
+      self.output_tflite_file = os.path.join(output_dir, 'model.tflite')
+    self.fp16_scale = None
+    self.lora_ckpt = lora_ckpt
+    self.lora_rank = lora_rank
+    self.lora_output_tflite_file = lora_output_tflite_file
+    if (self.lora_ckpt is None) ^ (self.lora_rank is None):
+      raise ValueError(
+          'lora_ckpt and lora_rank must be either both provided or both not'
+          ' provided.'
+      )
+    if self.lora_rank is not None:
+      if backend == 'cpu':
+        raise ValueError('LoRA is not supported for CPU backend.')
+      lora_applicable_models = ['GEMMA_2B', 'GEMMA2_2B', 'PHI_2']
+      if model_type not in lora_applicable_models:
+        raise ValueError(
+            'LoRA is only applicable for the model_type:'
+            f' {", ".join(lora_applicable_models)}, but get model_type:'
+            f' {model_type}.'
+        )
+def quantize_by_actions(
+    actions: List[converter_base.QuantizationAction],
+    backend: str,
+    is_symmetric: bool,
+):
+  """Quantizes the weights by actions.
+  Args:
+    actions: A list of QuantizationAction that contains the information and
+      tensor values to be quantized.
+    backend: Target backend to run the model. Can be either "cpu" or "gpu".
+    is_symmetric: Whether to quantize symmetrically.
+  Returns:
+    A dictionary that maps from the updated tensor names to the quantized
+    tensor values + a boolean that indicates whether the tensor values need to
+    be packed (only applicable for the 4-bit quantized weights).
+  """
+  output_tensors = {}
+  for action in actions:
+    if action.tensor_value is None:
+      continue
+    # The dtype needs to be compared in string as it is a custom numpy dtype.
+    # Explicitly cast the bfloat16 and float16 dtype to float32 to make sure its
+    # value is converted and serialized correctly.
+    if (
+        str(action.tensor_value.dtype) == 'bfloat16'
+        or action.tensor_value.dtype == np.float16
+    ):
+      action.tensor_value = action.tensor_value.astype(np.float32)
+    if (
+        action.tensor_value.dtype != np.float32
+        and action.tensor_value.dtype != np.int8
+    ):
+      raise ValueError(
+          'All tensors should be casted to either float32 or int8, but got: %s'
+          % action.tensor_value.dtype
+      )
+    if action.quantize_axis:
+      pack = action.quantize_bits == 4
+      if action.tensor_value.dtype == np.int8:
+        if backend == 'cpu' and pack:
+          raise ValueError(
+              'Converting pre-quantized checkpoint into 4-bit is not supported'
+              ' for CPU backend.'
+          )
+        output_tensors[action.target_name] = (action.tensor_value, pack)
+      else:
+        if is_symmetric:
+          target_var, scale = quantization_util.quantize_tensor(
+              var=action.tensor_value,
+              axis=action.quantize_axis,
+              sym=is_symmetric,
+              number_bits=action.quantize_bits,
+          )
+          output_tensors[action.target_name] = (target_var, pack)
+          output_tensors[action.target_name + '_quantized_scale'] = (
+              scale,
+              False,
+          )
+          zp = None
+        else:
+          target_var, scale, zp = quantization_util.quantize_tensor(
+              var=action.tensor_value,
+              axis=action.quantize_axis,
+              sym=is_symmetric,
+              number_bits=action.quantize_bits,
+          )
+        if backend == 'cpu' and pack:
+          target_var, scale, zp = quantization_util.update_to_uint4(
+              target_var, scale, zp
+          )
+        output_tensors[action.target_name] = (target_var, pack)
+        output_tensors[action.target_name + '_quantized_scale'] = (scale, False)
+        if zp is not None:
+          output_tensors[action.target_name + '_quantized_zp'] = (zp, False)
+    else:
+      output_tensors[action.target_name] = (action.tensor_value, False)
+  return output_tensors
+def combined_weight_bins_to_tflite(
+    model_type: str,
+    backend: str,
+    weight_path: str,
+    output_tflite_file: str,
+    obfuscate: bool,
+    vocab_model_file: str,
+    lora_rank: Optional[int] = None,
+    lora_weight_path: Optional[str] = None,
+    lora_output_tflite_file: Optional[str] = None,
+    image_encoder_file: Optional[str] = None,
+    image_adapter_file: Optional[str] = None,
+    submodel_type: Optional[str] = None,
+):
+  """Combines weight files to tflite file."""
+  if backend == 'cpu':
+    if lora_rank is not None:
+      logging.fatal('LoRA is not supported for CPU backend.')
+    model_ckpt_util.GenerateCpuTfLite(
+        model_type,
+        weight_path,
+        vocab_model_file,
+        True,
+        output_tflite_file,
+    )
+  elif backend == 'gpu':
+    model_ckpt_util.GenerateGpuTfLite(
+        model_type,
+        weight_path,
+        vocab_model_file,
+        True,
+        obfuscate,
+        output_tflite_file,
+        0 if lora_rank is None else lora_rank,
+        '' if lora_weight_path is None else lora_weight_path,
+        '' if lora_output_tflite_file is None else lora_output_tflite_file,
+        '' if image_encoder_file is None else image_encoder_file,
+        '' if image_adapter_file is None else image_adapter_file,
+        '' if submodel_type is None else submodel_type,
+    )
+  else:
+    raise ValueError('Unsupported backend: %s' % backend)
+def convert_bpe_vocab(vocab_model_file: str, output_dir: str) -> str:
+  if not os.path.isdir(vocab_model_file):
+    raise ValueError(
+        'The input BPE vocab model file path is expected to be a directory that'
+        ' contains both tokenizer.json and tokenizer_config.json files.'
+    )
+  output_vocab_file = os.path.join(output_dir, 'spm.model')
+  model_ckpt_util.ConvertHfTokenizer(vocab_model_file, output_vocab_file)
+  return output_vocab_file
+@contextlib.contextmanager
+def filemanager(filename: str, mode: str):
+  try:
+    with open(filename, mode) as f:
+      yield f
+  finally:
+    pass
+def sort_layer_info(layer_info_file: str) -> None:
+  """Loads and sorts the layer info file."""
+  layer_info = []
+  with filemanager(layer_info_file, 'r') as finfo:
+    for line in finfo:
+      line = line.strip()
+      if line:
+        layer_info.append(line)
+  layer_info = list(set(layer_info))
+  layer_info.sort()
+  with filemanager(layer_info_file, 'w') as finfo:
+    for line in layer_info:
+      finfo.write(line + '\n')
+      finfo.write('\n')
+def maybe_quantize_and_write_tensors_to_bins(
+    ckpt_loader: converter_base.CkptLoaderBase,
+    config: ConversionConfig,
+) -> None:
+  """Quantizes the weight tensors according to the loader and writes them to bins."""
+  actions = ckpt_loader.load_to_actions()
+  for action in actions:
+    # Quantize the weight
+    quantized_tensors = quantize_by_actions(
+        action, config.backend, config.is_symmetric
+    )
+    del action
+    # Write the tensors into file(s).
+    writer = converter_factory.create_writer(
+        writer_type='weight_bins',
+        output_dir=config.output_dir,
+        backend=config.backend,
+    )
+    writer.write_variables(quantized_tensors, config.use_fake_weights)
+    del quantized_tensors
+    del writer
+def convert_checkpoint(config: ConversionConfig) -> None:
+  """Converts the checkpoint to tflite file."""
+  logging.info('input folder: %s', config.input_ckpt)
+  if os.path.isdir(config.vocab_model_file):
+    vocab_model_path = convert_bpe_vocab(
+        config.vocab_model_file, config.output_dir
+    )
+  else:
+    vocab_model_path = config.vocab_model_file
+  if not config.combine_file_only:
+    # Load the layer weights and prepare the quantization configurations.
+    loader = converter_factory.create_ckpt_loader(
+        config.ckpt_format,
+        ckpt_path=config.input_ckpt,
+        is_symmetric=config.is_symmetric,
+        backend=config.backend,
+        attention_quant_bits=config.attention_quant_bits,
+        feedforward_quant_bits=config.feedforward_quant_bits,
+        embedding_quant_bits=config.embedding_quant_bits,
+        special_model=config.model_type,
+        fp16_scale=config.fp16_scale,
+    )
+    maybe_quantize_and_write_tensors_to_bins(loader, config)
+    if config.lora_ckpt is not None and config.lora_ckpt != config.input_ckpt:
+      # If lora ckpt and the input ckpt is the same. The lora conversion is
+      # handled in the previous loader.
+      lora_loader = converter_factory.create_ckpt_loader(
+          config.ckpt_format,
+          ckpt_path=config.lora_ckpt,
+          is_symmetric=config.is_symmetric,
+          backend=config.backend,
+          attention_quant_bits=config.attention_quant_bits,
+          feedforward_quant_bits=config.feedforward_quant_bits,
+          embedding_quant_bits=config.embedding_quant_bits,
+          special_model=config.model_type,
+      )
+      maybe_quantize_and_write_tensors_to_bins(lora_loader, config)
+    sort_layer_info(os.path.join(config.output_dir, 'layer_info.txt'))
+  combined_weight_bins_to_tflite(
+      config.model_type,
+      config.backend,
+      weight_path=config.output_dir,
+      output_tflite_file=config.output_tflite_file,
+      obfuscate=config.obfuscate,
+      vocab_model_file=vocab_model_path,
+      lora_rank=config.lora_rank,
+      lora_weight_path=config.output_dir,
+      lora_output_tflite_file=config.lora_output_tflite_file,
+      image_encoder_file=config.image_encoder_file,
+      image_adapter_file=config.image_adapter_file,
+      submodel_type=config.submodel_type,
+  )

mediapipe/tasks/python/genai/converter/llm_converter_test.py ADDED Viewed

@@ -0,0 +1,63 @@
+"""Tests for llm_converter."""
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+import unittest
+from mediapipe.tasks.python.genai.converter import converter_base
+from mediapipe.tasks.python.genai.converter import llm_converter
+class LlmConverterTest(googletest.TestCase, parameterized.TestCase):
+  def get_fake_action(self, input_dtype):
+    if input_dtype == 'bfloat16':
+      # Create a TensorFlow bfloat16 tensor
+      bfloat16_tensor = tf.constant([1.0, -1.0, 2.0, -2.0], dtype=tf.bfloat16)
+      # Convert the TensorFlow tensor to a NumPy array
+      tensor_value = bfloat16_tensor.numpy()
+    else:
+      tensor_value = np.array(
+          [1.0, -1.0, 2.0, -2.0], dtype=np.dtype(input_dtype)
+      )
+    return converter_base.QuantizationAction(
+        tensor_name='mdl_vars.params.lm.softmax.logits_ffn.w',
+        target_name='params.lm.softmax.logits_ffn.w',
+        quantize_axis=[0],
+        quantize_bits=8,
+        pack_dim=0,
+        tensor_value=tensor_value,
+    )
+  @parameterized.parameters(
+      {'input_dtype': 'float32'},
+      {'input_dtype': 'float16'},
+      {'input_dtype': 'bfloat16'},
+      {'input_dtype': 'int8'},
+  )
+  def test_quantize_by_actions(self, input_dtype):
+    out = llm_converter.quantize_by_actions(
+        [self.get_fake_action(input_dtype)], backend='gpu', is_symmetric=True
+    )
+    if input_dtype == 'int8':
+      # The values are pre-quantized and should be the same.
+      np.testing.assert_allclose(
+          out['params.lm.softmax.logits_ffn.w'][0],
+          np.array([1, -1, 2, -2], dtype=np.int8),
+      )
+    else:
+      np.testing.assert_allclose(
+          out['params.lm.softmax.logits_ffn.w'][0],
+          np.array([64, -64, 127, -127], dtype=np.int8),
+      )
+      np.testing.assert_allclose(
+          out['params.lm.softmax.logits_ffn.w_quantized_scale'][0],
+          np.array(0.015748, dtype=np.float32),
+          rtol=1e-03,
+      )
+if __name__ == '__main__':
+  googletest.main()