ai-edge-quantizer-nightly 0.0.1.dev20250302__py3-none-any.whl → 0.5.0.dev20260103__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_edge_quantizer/algorithm_manager.py +224 -0
- ai_edge_quantizer/algorithm_manager_api_test.py +7 -0
- ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting_test.py +2 -2
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py +643 -20
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py +29 -2
- ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py +29 -35
- ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery_test.py +35 -12
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation.py +414 -0
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation_test.py +440 -0
- ai_edge_quantizer/algorithms/uniform_quantize/mse.py +127 -0
- ai_edge_quantizer/algorithms/uniform_quantize/mse_test.py +195 -0
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py +54 -168
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py +54 -17
- ai_edge_quantizer/algorithms/uniform_quantize/octav.py +188 -0
- ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py +240 -0
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py +260 -13
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py +152 -5
- ai_edge_quantizer/algorithms/utils/common_utils.py +142 -54
- ai_edge_quantizer/calibrator.py +58 -94
- ai_edge_quantizer/calibrator_test.py +5 -74
- ai_edge_quantizer/default_policy.py +108 -16
- ai_edge_quantizer/model_modifier.py +132 -8
- ai_edge_quantizer/model_modifier_test.py +81 -1
- ai_edge_quantizer/model_validator.py +38 -10
- ai_edge_quantizer/model_validator_test.py +2 -1
- ai_edge_quantizer/params_generator.py +230 -47
- ai_edge_quantizer/params_generator_test.py +366 -261
- ai_edge_quantizer/qtyping.py +92 -6
- ai_edge_quantizer/quantizer.py +167 -23
- ai_edge_quantizer/quantizer_test.py +288 -26
- ai_edge_quantizer/recipe.py +156 -21
- ai_edge_quantizer/recipe_manager.py +158 -1
- ai_edge_quantizer/recipe_manager_test.py +146 -32
- ai_edge_quantizer/recipe_test.py +93 -17
- ai_edge_quantizer/transformation_instruction_generator.py +313 -46
- ai_edge_quantizer/transformation_instruction_generator_test.py +449 -27
- ai_edge_quantizer/transformation_performer.py +112 -58
- ai_edge_quantizer/transformation_performer_test.py +176 -4
- ai_edge_quantizer/transformations/duplicate_buffer.py +46 -0
- ai_edge_quantizer/transformations/duplicate_buffer_test.py +106 -0
- ai_edge_quantizer/transformations/duplicate_tensor.py +62 -0
- ai_edge_quantizer/transformations/duplicate_tensor_test.py +131 -0
- ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation.py +299 -0
- ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation_test.py +244 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation.py +186 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation_test.py +200 -0
- ai_edge_quantizer/transformations/quantize_tensor.py +24 -44
- ai_edge_quantizer/transformations/quantize_tensor_test.py +3 -2
- ai_edge_quantizer/transformations/transformation_utils.py +157 -11
- ai_edge_quantizer/transformations/transformation_utils_test.py +96 -2
- ai_edge_quantizer/utils/calibration_utils.py +263 -1
- ai_edge_quantizer/utils/calibration_utils_test.py +173 -3
- ai_edge_quantizer/utils/constrained_ops_utils.py +111 -0
- ai_edge_quantizer/utils/constrained_ops_utils_test.py +50 -0
- ai_edge_quantizer/utils/test_utils.py +191 -58
- ai_edge_quantizer/utils/tfl_flatbuffer_utils.py +96 -50
- ai_edge_quantizer/utils/tfl_flatbuffer_utils_test.py +20 -0
- ai_edge_quantizer/utils/tfl_interpreter_utils.py +138 -5
- ai_edge_quantizer/utils/tfl_interpreter_utils_test.py +29 -2
- ai_edge_quantizer/utils/validation_utils.py +114 -4
- ai_edge_quantizer/utils/validation_utils_test.py +80 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/METADATA +13 -3
- ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info/RECORD +81 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/WHEEL +1 -1
- ai_edge_quantizer/transformations/emulated_subchannel.py +0 -363
- ai_edge_quantizer/transformations/emulated_subchannel_test.py +0 -212
- ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info/RECORD +0 -67
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info/licenses}/LICENSE +0 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/top_level.txt +0 -0
ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py:

```diff
@@ -16,8 +16,11 @@
 """Uniform quantize in tensor level."""
 
 import dataclasses
+from typing import Optional, Sequence
+import ml_dtypes
 import numpy as np
 from ai_edge_quantizer import qtyping
+from ai_edge_quantizer.utils import tfl_flatbuffer_utils
 
 
 @dataclasses.dataclass(frozen=True)
@@ -26,6 +29,11 @@ class IntType:
   signed: bool
 
 
+def is_blockwise(granularity: qtyping.QuantGranularity) -> bool:
+  """Checks if the quantization granularity is blockwise."""
+  return "BLOCKWISE" in str(granularity)
+
+
 def get_quantized_range(qtype: IntType) -> tuple[float, float]:
   """Calculates range of the quantized type."""
   if qtype.signed:
@@ -37,6 +45,22 @@ def get_quantized_range(qtype: IntType) -> tuple[float, float]:
   return float(qmin), float(qmax)
 
 
+def extract_block_size_from_granularity(
+    granularity: qtyping.QuantGranularity,
+) -> int:
+  """Get the block size for blockwise quantization."""
+  if granularity == qtyping.QuantGranularity.BLOCKWISE_32:
+    return 32
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE_64:
+    return 64
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE_128:
+    return 128
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE_256:
+    return 256
+  else:
+    return 0
+
+
 def _round_and_clip(
     tensor: np.ndarray, qtype: IntType, narrow: bool
 ) -> np.ndarray:
@@ -116,22 +140,141 @@ def fix_quantization_params_rank(
       symmetric=quantization_params.symmetric,
       quantized_dimension=quantization_params.quantized_dimension,
       quantized_data=quantization_params.quantized_data,
+      block_size=quantization_params.block_size,
+  )
+
+
+def _get_tensor_shape_for_blockwise(
+    tensor_shape: Sequence[int], quantized_dim: int, block_size: int
+) -> list[int]:
+  """Get the tensor shape for blockwise quantization.
+
+  This function splits the quantize dimension of the tensor into blocks and the
+  dim/blocks. Hence, min/max of the tensor can be calculated for each block
+  using existing functions.
+
+  Args:
+    tensor_shape: The original shape of the tensor.
+    quantized_dim: The dimension to be quantized blockwise.
+    block_size: The size of the block.
+
+  Returns:
+    The new tensor shape for calculating scale and zp for blockwise
+    quantization.
+  """
+  new_shape = []
+  for index, val in enumerate(tensor_shape):
+    if index == quantized_dim:
+      if val % block_size != 0:
+        raise ValueError(
+            f"Quantized dimension {val} in tensor shape {tensor_shape} is not"
+            f" divisible by block size {block_size}."
+        )
+      new_shape.append(int(val / block_size))
+      new_shape.append(block_size)
+    else:
+      new_shape.append(val)
+  return new_shape
+
+
+def reshape_data_for_blockwise(
+    tensor_data: np.ndarray,
+    op_name: qtyping.TFLOperationName,
+    granularity: qtyping.QuantGranularity,
+) -> tuple[np.ndarray, int]:
+  """Reshapes data for blockwise quantization.
+
+  Args:
+    tensor_data: The original tensor data.
+    op_name: The name of the TFL op.
+    granularity: The quantization granularity for the tensor.
+
+  Returns:
+    A tuple containing the reshaped tensor data and the new reduce dimension.
+  """
+  quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_BLOCKWISE_WEIGHT_QUANTIZED_DIM[
+      op_name
+  ]
+  block_size = extract_block_size_from_granularity(granularity)
+  new_shape = _get_tensor_shape_for_blockwise(
+      tensor_data.shape, quantized_dim, block_size
+  )
+  return tensor_data.reshape(new_shape), quantized_dim + 1
+
+
+def _broadcast_scale_zp_for_blockwise(
+    tensor_content: np.ndarray,
+    quant_params: qtyping.UniformQuantParams,
+) -> qtyping.UniformQuantParams:
+  """Broadcasts scale and zp for blockwise quantization.
+
+  Args:
+    tensor_content: The original tensor data.
+    quant_params: The quantization parameters.
+      `quant_params.quantized_dimension` must be specified.
+      `quant_params.block_size` must be specified and positive.
+
+  Returns:
+    The updated quantization parameters with broadcasted scale and zp for
+    correct constant quantization.
+  """
+  if quant_params.quantized_dimension is None:
+    raise ValueError("Quantized dimension must be specified.")
+  if quant_params.block_size is None or quant_params.block_size <= 0:
+    raise ValueError("Block size must be specified and positive.")
+  quantized_dim = quant_params.quantized_dimension
+  expanded_tensor_shape = _get_tensor_shape_for_blockwise(
+      tensor_content.shape, quantized_dim, quant_params.block_size
+  )
+  expanded_scale = np.reshape(
+      np.broadcast_to(
+          np.expand_dims(quant_params.scale, quantized_dim + 1),
+          expanded_tensor_shape,
+      ),
+      tensor_content.shape,
+  )
+  if quant_params.zero_point is None or quant_params.zero_point.size == 0:
+    expanded_zp = np.zeros_like(tensor_content, dtype=np.int32)
+  else:
+    expanded_zp = np.reshape(
+        np.broadcast_to(
+            np.expand_dims(quant_params.zero_point, quantized_dim + 1),
+            expanded_tensor_shape,
+        ),
+        tensor_content.shape,
+    )
+  return qtyping.UniformQuantParams(
+      scale=expanded_scale,
+      zero_point=expanded_zp,
+      num_bits=quant_params.num_bits,
+      symmetric=quant_params.symmetric,
+      quantized_dimension=quantized_dim,
+      block_size=quant_params.block_size,
   )
 
 
 def uniform_quantize(
     tensor_data: np.ndarray,
     quantization_params: qtyping.UniformQuantParams,
+    is_blockwise_quant: bool = False,
 ):
   """Uniform quantize a tensor.
 
   Args:
     tensor_data: The tensor to be quantized.
     quantization_params: The quantization parameters.
+    is_blockwise_quant: Whether the tensor is blockwise quantized.
 
   Returns:
     The quantized tensor.
   """
+  # The reshaping for blockwise quantization is unique hence we do this here
+  # to avoid unexpected broadcast behavior downstream.
+  if is_blockwise_quant:
+    quantization_params = _broadcast_scale_zp_for_blockwise(
+        tensor_data, quantization_params
+    )
+
   # quant params in flatbuffer is flattened, expand the rank to be the same
   # as the tensor rank to avoid ambiguous broadcasting.
   quantization_params = fix_quantization_params_rank(
```
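To make the new blockwise helpers concrete, here is a minimal numpy sketch (my illustration, not the package API) of what `_get_tensor_shape_for_blockwise` and `_broadcast_scale_zp_for_blockwise` compute for a hypothetical int4 symmetric weight:

```python
import numpy as np

# Hypothetical (4, 8) weight, blockwise along dimension 1 with block_size=2.
weights = np.arange(32, dtype=np.float32).reshape(4, 8)
block_size = 2

# Split the quantized dim into (dim // block_size, block_size) so the
# min/max reduction yields one value per block.
blocked = weights.reshape(4, 8 // block_size, block_size)      # (4, 4, 2)
scale = np.maximum(np.abs(blocked).max(axis=-1), 1e-9) / 7.0   # int4 symmetric

# Broadcast the per-block scale back to the full tensor shape before the
# element-wise quantization step.
scale_full = np.broadcast_to(
    scale[..., None], blocked.shape
).reshape(weights.shape)
quantized = np.clip(np.rint(weights / scale_full), -8, 7).astype(np.int8)
assert quantized.shape == weights.shape
```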
```diff
@@ -145,8 +288,15 @@ def uniform_quantize(
   inverse_scales = 1.0 / scales
   # TODO: b/332574603 - support unsigned data type.
   qtype = IntType(quantization_params.num_bits, signed=True)
-  #
-  narrow_range = quantization_params.symmetric
+  # For quantization with more than 8 bits, symmetric narrow-range quantization
+  # is required due to assumptions made by legacy TFLite kernels. However, this
+  # method is not ideal for low-bit quantization (e.g., 2-bit quantization,
+  # which only has 4 bins), as it wastes a bin and there are no kernel
+  # requirements for a narrow range when < 8 bits because the data is unpacked
+  # to int8 before being used in the kernel.
+  narrow_range = (
+      quantization_params.symmetric and quantization_params.num_bits >= 8
+  )
   required_dtype = np.signedinteger if qtype.signed else np.unsignedinteger
   if not np.issubdtype(zero_points.dtype, required_dtype):
     raise ValueError(
```
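The new condition is easy to sanity-check; a small sketch of the rule described in the comment (the helper name is mine):

```python
# Symmetric quantization uses the narrow range [qmin + 1, qmax] only at 8 or
# more bits; below 8 bits the full range is kept, so no bin is wasted.
def quantized_range(num_bits: int, symmetric: bool) -> tuple[int, int]:
  qmin, qmax = -(2 ** (num_bits - 1)), 2 ** (num_bits - 1) - 1
  narrow = symmetric and num_bits >= 8
  return (qmin + 1 if narrow else qmin, qmax)

assert quantized_range(8, symmetric=True) == (-127, 127)
assert quantized_range(4, symmetric=True) == (-8, 7)  # full int4 range
```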
```diff
@@ -172,6 +322,26 @@ def uniform_dequantize(
   Returns:
     The dequantized tensor.
   """
+  if quantization_params.block_size != 0:
+    # b/443830202: The quantized dimension is currently increased by 1 because
+    # AEQ expects 1 and XNNPack expects 0.
+    quantization_params = dataclasses.replace(
+        quantization_params,
+        quantized_dimension=quantization_params.quantized_dimension + 1,
+    )
+    scale_shape = list(tensor_data.shape)
+    scale_shape[quantization_params.quantized_dimension] = (
+        scale_shape[quantization_params.quantized_dimension]
+        // quantization_params.block_size
+    )
+    quantization_params = dataclasses.replace(
+        quantization_params,
+        scale=quantization_params.scale.reshape(scale_shape),
+    )
+    quantization_params = _broadcast_scale_zp_for_blockwise(
+        tensor_data, quantization_params
+    )
+
   # quant params in flatbuffer is flattened, expand the rank to be the same
   # as the tensor rank to avoid ambiguous broadcasting.
   quantization_params = fix_quantization_params_rank(
@@ -187,6 +357,7 @@ def symmetric_quantize_bias_tensor(
     bias_content: np.ndarray,
     input_tensor_quant_params: qtyping.UniformQuantParams,
     weight_tensor_quant_params: qtyping.UniformQuantParams,
+    check_error: bool = True,
 ) -> qtyping.UniformQuantParams:
   """Quantize bias tensor (symmetrically, i.e., zero_point = 0).
 
@@ -198,6 +369,12 @@ def symmetric_quantize_bias_tensor(
     bias_content: The bias content.
     input_tensor_quant_params: The quantization parameters of input tensor.
     weight_tensor_quant_params: The quantization parameters of weight tensor.
+    check_error: Whether to check if the quantization error (the difference
+      between the original and dequantized bias) is larger than the quantization
+      scale. This check is important because bias quantization parameters are
+      fixed (bias_scale = input_scale * weight_scale), which can lead to large
+      quantization errors. Raising an error when the quantization error is
+      larger than the scale helps to identify unexpected numerical issues.
 
   Returns:
     The quantized bias tensor.
@@ -212,7 +389,8 @@ def symmetric_quantize_bias_tensor(
 
   # symmetric
   bias_zp = np.zeros_like(effective_output_scale, dtype=np.int32)
-  bias_number_bits = 64 if input_tensor_quant_params.num_bits == 16 else 32
+  # Fixed to 32 bits since most of the accelerators use int32 accumulator.
+  bias_number_bits = 32
   symmetric = True
   quantized_dimension = None if len(effective_output_scale) == 1 else 0
   bias_quant_params = qtyping.UniformQuantParams(
@@ -224,6 +402,24 @@ def symmetric_quantize_bias_tensor(
   )
 
   quantized_vars = uniform_quantize(bias_content, bias_quant_params)
+  if check_error:
+    dequantized_bias = uniform_dequantize(quantized_vars, bias_quant_params)
+    max_quant_error = np.max(np.abs(dequantized_bias - bias_content))
+    error_tolerance = np.maximum(1e-6, np.max(effective_output_scale))
+    if max_quant_error > error_tolerance:
+      raise ValueError(
+          "Quantization error is too large for bias tensor quantization. Max"
+          f" quantization error is {max_quant_error}, which exceed"
+          f" the threshold {error_tolerance}"
+      )
+
+  # Save the int32 quantized bias as int64 if the input tensor is quantized to
+  # 16 bits. This is to assume the matmul is using int64 accumulator (safe from
+  # overflow). For accelerators with int32 accumulator, it is safe to cast int64
+  # back to int32.
+  if input_tensor_quant_params.num_bits == 16:
+    quantized_vars = quantized_vars.astype(np.int64)
+    bias_number_bits = 64
 
   # UniformQuantParams is frozen dataclass, need to recreate.
   return qtyping.UniformQuantParams(
```
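A sketch of the failure mode `check_error` guards against, using the same numbers as the new test further down (the helper logic here is mine, not the package's):

```python
import numpy as np

# bias_scale is pinned to input_scale * weight_scale, so a bias that is huge
# relative to that scale saturates int32 and dequantizes far from the input.
input_scale, weight_scale = 0.1, 0.1
bias = np.array([3e7])
bias_scale = input_scale * weight_scale                        # 0.01
quantized = np.clip(np.rint(bias / bias_scale), -(2**31), 2**31 - 1)
max_quant_error = np.abs(quantized * bias_scale - bias).max()  # ~8.5e6
assert max_quant_error > max(1e-6, bias_scale)                 # check_error raises here
```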
```diff
@@ -237,30 +433,70 @@ def symmetric_quantize_bias_tensor(
 
 
 def tensor_zp_scale_from_min_max(
-    min_value, max_value, num_bits: int, symmetric: bool
+    min_value,
+    max_value,
+    num_bits: int,
+    symmetric: bool,
+    granularity: qtyping.QuantGranularity,
+    clipping_values: Optional[np.ndarray] = None,
 ):
   """Get zero point and scale from min and max value.
 
   Args:
-    min_value: The minimum value of the tensor (
-
+    min_value: The minimum value of the tensor (channelwise and blockwise
+      supported).
+    max_value: The maximum value of the tensor (channelwise and blockwise
+      supported).
     num_bits: The number of bits of the tensor.
     symmetric: Whether the tensor is symmetric.
+    granularity: The granularity of the tensor.
+    clipping_values: Absolute clipping values to apply to the tensor. This will
+      clip the tensors to the range [-clipping_values, clipping_values]. This
+      should be the same shape as min_value and max_value. If None, no clipping
+      will be applied.
 
   Returns:
     The zero point and scale of the tensor.
   """
+
   # TODO: b/332574603 - support unsigned data type.
   qtype = IntType(
       num_bits,
       signed=True,
   )
   qmin, qmax = get_quantized_range(qtype)
-  min_bound = 1e-
+  min_bound = 1e-9  # Avoid zero scale.
+  pos_clipping_values = None if clipping_values is None else clipping_values
+  neg_clipping_values = None if clipping_values is None else -clipping_values
+
+  if is_blockwise(granularity):
+    # Blockwise quantization uses float16 scale,
+    # with 7 bit mantissa, so the maximum scale value is 65280 and maximum
+    # representable range is [-65280 * (2 ** num_bits),
+    # 65280 * (2 ** num_bits - 1)].
+    # Note that we have one extra value on the negative side.
+    float16_max = np.broadcast_to(
+        np.array(65280) * (2**num_bits - 1), max_value.shape
+    )
+    float16_min = np.broadcast_to(
+        np.array(-65280) * (2**num_bits), min_value.shape
+    )
+    pos_clipping_values = (
+        float16_max
+        if pos_clipping_values is None
+        else np.minimum(pos_clipping_values, float16_max)
+    )
+    neg_clipping_values = (
+        float16_min
+        if neg_clipping_values is None
+        else np.maximum(neg_clipping_values, float16_min)
+    )
 
   if symmetric:
     bound = np.maximum(np.abs(min_value), np.abs(max_value))
     bound = np.maximum(bound, min_bound)
+    if clipping_values is not None:
+      bound = np.clip(bound, neg_clipping_values, pos_clipping_values)
     if not qtype.signed:
       half_q = (qmax - 1) / 2
       scale = bound / half_q
```
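The 65280 bound comes from how the scales are stored; a quick sketch (assuming only numpy and ml_dtypes) of the same cast chain the code applies:

```python
import ml_dtypes
import numpy as np

def round_scale(x: float) -> np.float32:
  # Same chain as the blockwise scale path: round to a 7-bit mantissa via
  # bfloat16, then store as float16.
  return (
      np.float32(x).astype(ml_dtypes.bfloat16).astype(np.float16).astype(np.float32)
  )

print(round_scale(65280.0))   # 65280.0 = (2 - 2**-7) * 2**15, the largest
                              # value that survives the round trip
print(round_scale(0.123456))  # ~0.1235: only ~7 bits of mantissa survive
```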
```diff
@@ -268,7 +504,6 @@ def tensor_zp_scale_from_min_max(
     else:
       scale = bound / qmax
       zp = np.zeros_like(scale, dtype=np.int32)
-
   else:
     # Include 0 to the range to support zero-padding.
     # See: https://arxiv.org/pdf/1712.05877.pdf
@@ -276,10 +511,18 @@ def tensor_zp_scale_from_min_max(
     bound_max = np.maximum(max_value, np.zeros_like(max_value))
     bound_min = np.minimum(min_value, np.zeros_like(min_value))
     bound = np.maximum(bound_max - bound_min, min_bound)
+    if clipping_values is not None:
+      bound = np.clip(bound, -clipping_values, clipping_values)
     scale = bound / (qmax - qmin)
     zp = qmin - bound_min / scale
     zp = np.rint(zp)
 
+  if is_blockwise(granularity):
+    # Round the scale values to 7 bit mantissa.
+    scale = (
+        scale.astype(ml_dtypes.bfloat16).astype(np.float16).astype(np.float32)
+    )
+
   # It's safe to cast zp to qtype without clipping because we can infer
   # qmin <= zp <= qmax from bound_min <= 0 <= bound_max.
   zp = assign_quantized_type(zp, qtype)
```
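A worked instance of the asymmetric branch above, with made-up numbers:

```python
import numpy as np

# 8-bit asymmetric example: the range is first stretched to include 0
# (zero-padding support), then scale = bound / (qmax - qmin) and
# zp = rint(qmin - bound_min / scale).
min_value, max_value, qmin, qmax = 1.0, 5.0, -128.0, 127.0
bound_min, bound_max = min(min_value, 0.0), max(max_value, 0.0)  # [0.0, 5.0]
scale = (bound_max - bound_min) / (qmax - qmin)                  # 5 / 255
zp = np.rint(qmin - bound_min / scale)                           # -128.0

# min_value round-trips exactly here: q = rint(1.0 / scale) + zp = -77.
q = np.rint(min_value / scale) + zp
assert np.isclose((q - zp) * scale, min_value)
```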
```diff
@@ -293,7 +536,8 @@ def _is_valid_quantization_params(
   """Checks if the quantization parameters are valid.
 
   A valid quantization params requires:
-    1. scale and zero point have the same shape
+    1. scale and zero point either have the same shape or the zero point is a
+      scalar.
     2. scale and zero point have the same rank as the tensor content (avoid
       ambiguous broadcasting).
 
@@ -304,17 +548,20 @@ def _is_valid_quantization_params(
   Returns:
     True if the quantization parameters are valid.
   """
-  if quantization_params.scale.shape != quantization_params.zero_point.shape:
+  if (
+      quantization_params.scale.shape != quantization_params.zero_point.shape
+      and quantization_params.zero_point.size != 1
+  ):
     raise ValueError(
-        "scale and zero_point must have the same shape"
-        f" {quantization_params.scale.shape} and"
+        "scale and zero_point must have the same shape or zero_point must have"
+        f" only one element. Got {quantization_params.scale.shape} and"
         f" {quantization_params.zero_point.shape}"
     )
 
   tensor_rank = tensor_data.ndim
   scale_rank = quantization_params.scale.ndim
   zero_point_rank = quantization_params.zero_point.ndim
-  if (tensor_rank != scale_rank) or (tensor_rank != zero_point_rank):
+  if tensor_rank != scale_rank or (tensor_rank != zero_point_rank):
     raise ValueError(
         f"Ranks of scales ({scale_rank}) and zps"
         f" ({zero_point_rank}) must be the same as the tensor rank"
```
ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py:

```diff
@@ -15,8 +15,11 @@
 
 """Tests for tensor_utils."""
 
+import dataclasses
+
 from absl.testing import parameterized
 import numpy as np
+
 from tensorflow.python.platform import googletest
 from ai_edge_quantizer import qtyping
 from ai_edge_quantizer.algorithms.uniform_quantize import uniform_quantize_tensor
@@ -123,6 +126,14 @@ class TensorUtilsTest(parameterized.TestCase):
           False,
           [-24, 10, 19, 127],
       ),
+      (
+          [-16.0, 1.3, 2.4, 16.0],
+          [0.12598425],
+          [0],
+          8,
+          True,
+          [-127, 10, 19, 127],  # int8 symmetric is narrow range, -127 to 127
+      ),
       (
           [-3.0, 1.3, 2.4, 16.0],
           [1.2666667],
@@ -137,7 +148,7 @@ class TensorUtilsTest(parameterized.TestCase):
           [-6],
           4,
           True,
-          [-7, -5, -4, 7],
+          [-8, -5, -4, 7],  # int4 symmetric is not narrow range, -8 to 7
       ),
   )
   def test_uniform_quantize(
@@ -160,7 +171,9 @@ class TensorUtilsTest(parameterized.TestCase):
   def test_uniform_quantize_wrong_shape(self):
     tensor = [-3.0, 1.3, 2.4, 16.0]
 
-    error_message =
+    error_message = (
+        "Ranks of scales (3) and zps (2) must be the same as the tensor rank"
+    )
     with self.assertRaisesWithPredicateMatch(
         ValueError, lambda err: error_message in str(err)
     ):
@@ -190,6 +203,28 @@ class TensorUtilsTest(parameterized.TestCase):
         ),
     )
 
+  def test_uniform_quantize_quant_dim_not_divisible_by_block_size_raise(self):
+    tensor = np.random.rand(34, 2)
+    error_message = (
+        "Quantized dimension 34 in tensor shape (34, 2) is not divisible by"
+        " block size 32."
+    )
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, lambda err: error_message in str(err)
+    ):
+      uniform_quantize_tensor.uniform_quantize(
+          np.array(tensor),
+          qtyping.UniformQuantParams(
+              quantized_dimension=0,
+              block_size=32,
+              num_bits=4,
+              scale=np.array([1.2666667]),
+              zero_point=np.array([-6]),
+              symmetric=True,
+          ),
+          is_blockwise_quant=True,
+      )
+
   @parameterized.parameters(
       (
           8,
@@ -233,7 +268,9 @@ class TensorUtilsTest(parameterized.TestCase):
   def test_uniform_dequantize_wrong_shape(self):
     tensor = [-3.0, 1.3, 2.4, 16.0]
 
-    error_message =
+    error_message = (
+        "Ranks of scales (3) and zps (2) must be the same as the tensor rank"
+    )
     with self.assertRaisesWithPredicateMatch(
         ValueError, lambda err: error_message in str(err)
     ):
@@ -263,8 +300,35 @@ class TensorUtilsTest(parameterized.TestCase):
         ),
     )
 
+  def test_uniform_dequantize_blockwise(self):
+    quantized_tensor = np.array([[-8, -5, -4, 7], [-4, 7, -8, -5]])
+    expected_output_tensor = np.array([
+        [-10.1333336, -6.3333335, -5.0666668, 8.8666669],
+        [-5.0666668, 8.8666669, -10.1333336, -6.3333335],
+    ])
+    quant_params = qtyping.UniformQuantParams(
+        # b/443830202:
+        quantized_dimension=0,
+        num_bits=4,
+        scale=np.array([[[1.2666667, 1.2666667], [1.2666667, 1.2666667]]]),
+        zero_point=np.array([[0]]),
+        symmetric=True,
+        block_size=2,
+    )
+
+    dequantized_tensor = uniform_quantize_tensor.uniform_dequantize(
+        np.array(quantized_tensor), quant_params
+    )
+
+    self.assertSequenceAlmostEqual(
+        expected_output_tensor.flatten(), dequantized_tensor.flatten(), places=4
+    )
+
   @parameterized.parameters(
-      (8, 8, True, True),
+      (8, 8, True, True),
+      (8, 4, False, True),
+      (16, 8, True, False),
+      (16, 8, True, True),
   )
   def test_quantize_bias_tensor(
       self,
```
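For reference, the expected values in `test_uniform_dequantize_blockwise` are just `q * scale` with a zero point of 0 and the per-block scale broadcast over `block_size=2`:

```python
import numpy as np

q = np.array([[-8, -5, -4, 7], [-4, 7, -8, -5]])
print(q * 1.2666667)
# [[-10.1333336  -6.3333335  -5.0666668   8.8666669]
#  [ -5.0666668   8.8666669 -10.1333336  -6.3333335]]
```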
```diff
@@ -322,6 +386,26 @@ class TensorUtilsTest(parameterized.TestCase):
     self.assertSequenceAlmostEqual(
         list(dequantized_bias.flatten()), list(bias_tensor_data), places=5
     )
+
+    if activation_num_bits == 16:
+      # Check if it is safe to cast int64 bias to int32. We save the int32
+      # quantized bias as int64 if the input tensor is quantized to 16 bits.
+      # This is to assume the matmul is using int64 accumulator (safe from
+      # overflow). For accelerators with int32 accumulator, it is safe to cast
+      # int64 back to int32.
+      quantized_bias = bias_quant_config.quantized_data
+      self.assertIsNotNone(quantized_bias)
+      self.assertEqual(quantized_bias.dtype, np.int64)
+      self.assertSequenceEqual(
+          list(quantized_bias.flatten()),
+          list(quantized_bias.astype(np.int32).flatten()),
+      )
+
+      bias_quant_config = dataclasses.replace(
+          bias_quant_config,
+          num_bits=32,
+      )
+
     expected_quantized_data = uniform_quantize_tensor.uniform_quantize(
         bias_tensor_data, bias_quant_config
     )
@@ -330,13 +414,44 @@ class TensorUtilsTest(parameterized.TestCase):
         list(bias_quant_config.quantized_data.flatten()),  # pytype: disable=attribute-error
     )
 
+  def test_quantize_bias_tensor_raises_error_for_large_quantization_error(self):
+    input_quant_config = qtyping.UniformQuantParams(
+        scale=np.array([0.1]),
+        zero_point=np.array([10]),
+        num_bits=8,
+        symmetric=False,
+        quantized_dimension=None,
+    )
+    weight_quant_config = qtyping.UniformQuantParams(
+        scale=np.array([0.1]),
+        zero_point=np.array([-1]),
+        num_bits=8,
+        symmetric=True,
+        quantized_dimension=None,
+    )
+    # This will result in quantized bias of 3e9, which is larger than int32 max.
+    bias_tensor_data = np.array([3e7])
+    with self.assertRaisesRegex(
+        ValueError,
+        "Quantization error is too large for bias tensor quantization.",
+    ):
+      uniform_quantize_tensor.symmetric_quantize_bias_tensor(
+          bias_tensor_data,
+          input_quant_config,
+          weight_quant_config,
+      )
+
   @parameterized.parameters((8, True), (16, False))
   def test_tensor_zp_scale_from_min_max(self, num_bits, symmetric):
     min_val = np.min(self._test_data, keepdims=True)
     max_val = np.max(self._test_data, keepdims=True)
 
     zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
-        min_val, max_val, num_bits, symmetric
+        min_val,
+        max_val,
+        num_bits,
+        symmetric,
+        qtyping.QuantGranularity.TENSORWISE,
     )
     self.assertEqual(zp.shape, scale.shape)
     max_q = 2**num_bits / 2 - 1
@@ -352,6 +467,38 @@ class TensorUtilsTest(parameterized.TestCase):
     # Range has to be extended to include zero.
     self.assertEqual(calculated_min, 0)
 
+  @parameterized.parameters(
+      # number of bits, is_symmetric, max bound of the quantized range.
+      (4, True, 7),
+      (8, False, 255),
+  )
+  def test_tensor_zp_scale_from_min_max_with_clipping(
+      self, num_bits, symmetric, quantized_bound
+  ):
+    min_val = np.array([[1.0]])
+    max_val = np.array([[5.0]])
+    clipping_values = np.array([4.0])
+    zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
+        min_val,
+        max_val,
+        num_bits,
+        symmetric,
+        qtyping.QuantGranularity.TENSORWISE,
+        clipping_values,
+    )
+    expected_scale = clipping_values / quantized_bound
+
+    with self.subTest(name="CheckShapes"):
+      self.assertEqual(zp.shape, scale.shape)
+      self.assertEqual(zp.shape, (1, 1))
+
+    if symmetric:
+      with self.subTest(name="CheckSymmetricZpValue"):
+        self.assertEqual(zp[0], 0)
+
+    with self.subTest(name="CheckScaleValue"):
+      self.assertEqual(scale[0], expected_scale)
+
 
 if __name__ == "__main__":
   googletest.main()
```