ai-edge-quantizer-nightly 0.1.0.dev20250415__py3-none-any.whl → 0.5.0.dev20260103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. ai_edge_quantizer/algorithm_manager.py +158 -0
  2. ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting_test.py +2 -2
  3. ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py +489 -53
  4. ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py +29 -2
  5. ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py +4 -6
  6. ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation.py +414 -0
  7. ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation_test.py +440 -0
  8. ai_edge_quantizer/algorithms/uniform_quantize/mse.py +127 -0
  9. ai_edge_quantizer/algorithms/uniform_quantize/mse_test.py +195 -0
  10. ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py +48 -42
  11. ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py +53 -14
  12. ai_edge_quantizer/algorithms/uniform_quantize/octav.py +32 -18
  13. ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py +92 -38
  14. ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py +248 -13
  15. ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py +126 -6
  16. ai_edge_quantizer/algorithms/utils/common_utils.py +142 -53
  17. ai_edge_quantizer/calibrator.py +11 -60
  18. ai_edge_quantizer/calibrator_test.py +4 -73
  19. ai_edge_quantizer/default_policy.py +61 -26
  20. ai_edge_quantizer/model_modifier.py +97 -7
  21. ai_edge_quantizer/model_modifier_test.py +81 -1
  22. ai_edge_quantizer/model_validator.py +31 -8
  23. ai_edge_quantizer/params_generator.py +17 -10
  24. ai_edge_quantizer/params_generator_test.py +2 -7
  25. ai_edge_quantizer/qtyping.py +86 -6
  26. ai_edge_quantizer/quantizer.py +166 -21
  27. ai_edge_quantizer/quantizer_test.py +284 -16
  28. ai_edge_quantizer/recipe.py +154 -42
  29. ai_edge_quantizer/recipe_manager.py +158 -1
  30. ai_edge_quantizer/recipe_manager_test.py +146 -32
  31. ai_edge_quantizer/recipe_test.py +93 -17
  32. ai_edge_quantizer/transformation_instruction_generator.py +118 -13
  33. ai_edge_quantizer/transformation_instruction_generator_test.py +163 -27
  34. ai_edge_quantizer/transformation_performer.py +55 -25
  35. ai_edge_quantizer/transformation_performer_test.py +127 -5
  36. ai_edge_quantizer/transformations/duplicate_buffer.py +2 -1
  37. ai_edge_quantizer/transformations/duplicate_tensor.py +1 -0
  38. ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation.py +299 -0
  39. ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation_test.py +244 -0
  40. ai_edge_quantizer/transformations/insert_hadamard_rotation.py +186 -0
  41. ai_edge_quantizer/transformations/insert_hadamard_rotation_test.py +200 -0
  42. ai_edge_quantizer/transformations/quantize_tensor.py +17 -32
  43. ai_edge_quantizer/transformations/quantize_tensor_test.py +1 -1
  44. ai_edge_quantizer/transformations/transformation_utils.py +129 -6
  45. ai_edge_quantizer/transformations/transformation_utils_test.py +65 -3
  46. ai_edge_quantizer/utils/calibration_utils.py +263 -1
  47. ai_edge_quantizer/utils/calibration_utils_test.py +173 -3
  48. ai_edge_quantizer/utils/constrained_ops_utils.py +111 -0
  49. ai_edge_quantizer/utils/constrained_ops_utils_test.py +50 -0
  50. ai_edge_quantizer/utils/test_utils.py +75 -2
  51. ai_edge_quantizer/utils/tfl_flatbuffer_utils.py +39 -6
  52. ai_edge_quantizer/utils/tfl_interpreter_utils.py +87 -15
  53. ai_edge_quantizer/utils/tfl_interpreter_utils_test.py +29 -2
  54. ai_edge_quantizer/utils/validation_utils.py +114 -4
  55. ai_edge_quantizer/utils/validation_utils_test.py +80 -0
  56. {ai_edge_quantizer_nightly-0.1.0.dev20250415.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/METADATA +14 -4
  57. ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info/RECORD +81 -0
  58. {ai_edge_quantizer_nightly-0.1.0.dev20250415.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/WHEEL +1 -1
  59. ai_edge_quantizer/transformations/emulated_subchannel.py +0 -363
  60. ai_edge_quantizer/transformations/emulated_subchannel_test.py +0 -212
  61. ai_edge_quantizer_nightly-0.1.0.dev20250415.dist-info/RECORD +0 -73
  62. {ai_edge_quantizer_nightly-0.1.0.dev20250415.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info/licenses}/LICENSE +0 -0
  63. {ai_edge_quantizer_nightly-0.1.0.dev20250415.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/top_level.txt +0 -0
ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py
@@ -44,33 +44,17 @@ class OctavQuantizeTest(parameterized.TestCase):
     )
     self._tensor_name_to_qsv = {}
     subgraph0 = self._test_model.subgraphs[0]
-    subgraph_op_index = 3
-    fc_op = subgraph0.operators[subgraph_op_index]
+    self._subgraph_op_index = 3
+    self._fc_op = subgraph0.operators[self._subgraph_op_index]
     self._fc_op_info = qtyping.OpInfo(
-        op=fc_op,
+        op=self._fc_op,
         op_name=qtyping.TFLOperationName.FULLY_CONNECTED,
-        subgraph_op_index=subgraph_op_index,
+        subgraph_op_index=self._subgraph_op_index,
         op_quant_config=qtyping.OpQuantizationConfig(
             weight_tensor_config=None,
         ),
     )

-  def test_get_tensor_quant_params_unsupported_granularity_assert(self):
-    err_msg = "Unsupported granularity"
-    test_data = np.array([[-7, 7], [4, -4], [4, -4], [7, 7]])
-    with self.assertRaisesWithPredicateMatch(
-        ValueError, lambda err: err_msg in str(err)
-    ):
-      _ = octav.get_tensor_quant_params(
-          op_info=self._fc_op_info,
-          tensor_quant_config=qtyping.TensorQuantizationConfig(
-              num_bits=4,
-              symmetric=True,
-              granularity=qtyping.QuantGranularity.BLOCKWISE,
-          ),
-          tensor_content=test_data,
-      )
-
   def test_get_tensor_quant_params_unsupported_symmetry(self):
     err_msg = "Unsupported symmetry"
     test_data = np.array([[-7, 7], [4, -4], [4, -4], [7, 7]])
@@ -117,13 +101,22 @@ class OctavQuantizeTest(parameterized.TestCase):
         [25, -30, 50, -75, 1e5, -125],
         [50, -60, 70, -80, 90, -100],
     ])
-    quant_params = octav.get_tensor_quant_params(
-        op_info=self._fc_op_info,
-        tensor_quant_config=qtyping.TensorQuantizationConfig(
-            num_bits=4,
-            symmetric=True,
-            granularity=qtyping.QuantGranularity.TENSORWISE,
+    tensor_config = qtyping.TensorQuantizationConfig(
+        num_bits=4,
+        symmetric=True,
+        granularity=qtyping.QuantGranularity.TENSORWISE,
+    )
+    fc_op_info = qtyping.OpInfo(
+        op=self._fc_op,
+        op_name=qtyping.TFLOperationName.FULLY_CONNECTED,
+        subgraph_op_index=self._subgraph_op_index,
+        op_quant_config=qtyping.OpQuantizationConfig(
+            weight_tensor_config=tensor_config,
         ),
+    )
+    quant_params = octav.get_tensor_quant_params(
+        op_info=fc_op_info,
+        tensor_quant_config=tensor_config,
         tensor_content=test_data,
     )
     adjusted_test_data = quant_params.quantized_data * quant_params.scale
@@ -131,10 +124,10 @@ class OctavQuantizeTest(parameterized.TestCase):
     adjusted_max = np.max(np.abs(adjusted_test_data))

     # Check that some clipping occurred.
-    with self.subTest(name="SanityCheckClipping"):
+    with self.subTest(name="CheckClipping"):
       self.assertLess(adjusted_max, real_max)

-    with self.subTest(name="SanityCheckQuantParamsShapes"):
+    with self.subTest(name="CheckQuantParamsShapes"):
       self.assertEqual(quant_params.zero_point.shape, (1, 1))
       self.assertEqual(quant_params.scale.shape, (1, 1))
       self.assertIsNone(quant_params.quantized_dimension)
@@ -143,33 +136,47 @@ class OctavQuantizeTest(parameterized.TestCase):
           cast(np.ndarray, quant_params.quantized_data).shape, test_data.shape
       )

-    with self.subTest(name="SanityCheckQuantParamsValues"):
+    with self.subTest(name="CheckQuantParamsValues"):
       self.assertTrue(np.all(quant_params.zero_point == 0))

   def test_get_tensor_quant_params_sanity_channelwise(self):
+    # Test that the call generates quant params that are appropriately shaped,
+    # have some clipping, and correct config values without checking the
+    # actual values numerically.
     test_data = np.array([
         [-1e5, 25, -50, 75, -100, 125],
         [25, -30, 50, -75, 1e5, -125],
         [50, -60, 70, -80, 90, -100],
     ])
-    quant_params = octav.get_tensor_quant_params(
-        op_info=self._fc_op_info,
-        tensor_quant_config=qtyping.TensorQuantizationConfig(
-            num_bits=4,
-            symmetric=True,
-            granularity=qtyping.QuantGranularity.CHANNELWISE,
+    tensor_config = qtyping.TensorQuantizationConfig(
+        num_bits=4,
+        symmetric=True,
+        granularity=qtyping.QuantGranularity.CHANNELWISE,
+    )
+    fc_op_info = qtyping.OpInfo(
+        op=self._fc_op,
+        op_name=qtyping.TFLOperationName.FULLY_CONNECTED,
+        subgraph_op_index=self._subgraph_op_index,
+        op_quant_config=qtyping.OpQuantizationConfig(
+            weight_tensor_config=tensor_config,
         ),
+    )
+    quant_params = octav.get_tensor_quant_params(
+        op_info=fc_op_info,
+        tensor_quant_config=tensor_config,
         tensor_content=test_data,
     )
+    # Dequantize output to compare with the original test data.
     adjusted_test_data = quant_params.quantized_data * quant_params.scale
+
     for i, row in enumerate(test_data):
       real_max = np.max(np.abs(row))
       adjusted_max = np.max(np.abs(adjusted_test_data[i]))
       # Check that some clipping occurred.
-      with self.subTest(name="SanityCheckClipping"):
+      with self.subTest(name="CheckClipping"):
         self.assertLess(adjusted_max, real_max)

-      with self.subTest(name="SanityCheckQuantParamsShapes"):
+      with self.subTest(name="CheckQuantParamsShapes"):
        self.assertEqual(quant_params.zero_point.shape, (test_data.shape[0], 1))
        self.assertEqual(quant_params.scale.shape, (test_data.shape[0], 1))
        self.assertIsNotNone(quant_params.quantized_data)
@@ -177,10 +184,57 @@ class OctavQuantizeTest(parameterized.TestCase):
           cast(np.ndarray, quant_params.quantized_data).shape, test_data.shape
       )

-      with self.subTest(name="SanityCheckQuantParamsValues"):
+      with self.subTest(name="CheckQuantParamsValues"):
        self.assertTrue(np.all(quant_params.zero_point == 0))
        self.assertEqual(quant_params.quantized_dimension, 0)

+  def test_get_tensor_quant_params_sanity_blockwise(self):
+    # Test that the call generates quant params that are appropriately shaped,
+    # have some clipping, and correct config values without checking the
+    # actual values numerically.
+    test_data = np.random.randint(0, 1024, size=(32, 128))
+    tensor_config = qtyping.TensorQuantizationConfig(
+        num_bits=4,
+        symmetric=True,
+        granularity=qtyping.QuantGranularity.BLOCKWISE_32,
+    )
+    fc_op_info = qtyping.OpInfo(
+        op=self._fc_op,
+        op_name=qtyping.TFLOperationName.FULLY_CONNECTED,
+        subgraph_op_index=self._subgraph_op_index,
+        op_quant_config=qtyping.OpQuantizationConfig(
+            weight_tensor_config=tensor_config,
+        ),
+    )
+    quant_params = octav.get_tensor_quant_params(
+        op_info=fc_op_info,
+        tensor_quant_config=tensor_config,
+        tensor_content=test_data,
+    )
+
+    with self.subTest(name="CheckQuantParamsShapes"):
+      # Check that quant params have appropriate shapes.
+      self.assertEqual(quant_params.zero_point.shape, (32, 4))
+      self.assertEqual(quant_params.scale.shape, (32, 4))
+      self.assertIsNotNone(quant_params.quantized_data)
+      self.assertTupleEqual(
+          cast(np.ndarray, quant_params.quantized_data).shape, test_data.shape
+      )
+
+    scales = np.repeat(quant_params.scale, 32, axis=1)
+    adjusted_test_data = quant_params.quantized_data * scales
+    for i, row in enumerate(test_data):
+      real_max = np.max(np.abs(row))
+      adjusted_max = np.max(np.abs(adjusted_test_data[i]))
+      # Check that some clipping occurred.
+      with self.subTest(name="CheckClipping"):
+        self.assertLess(adjusted_max, real_max)
+
+    with self.subTest(name="CheckQuantParamsValues"):
+      self.assertTrue(np.all(quant_params.zero_point == 0))
+      # See TFL_OP_TO_BLOCKWISE_WEIGHT_QUANTIZED_DIM.
+      self.assertEqual(quant_params.quantized_dimension, 1)
+

 if __name__ == "__main__":
   googletest.main()
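
The shape assertions in the new blockwise test follow directly from the block layout: a (32, 128) fully-connected weight quantized along dimension 1 with block size 32 holds 128 / 32 = 4 blocks per row, so one scale (and one zero point) is produced per block. A minimal NumPy sketch of that bookkeeping, independent of the quantizer and for illustration only:

    import numpy as np

    weights = np.random.randint(0, 1024, size=(32, 128)).astype(np.float32)
    block_size, quantized_dim = 32, 1

    # Split the quantized dimension into (num_blocks, block_size).
    num_blocks = weights.shape[quantized_dim] // block_size  # 128 // 32 == 4
    blocked = weights.reshape(32, num_blocks, block_size)

    # One symmetric scale per block: max|x| / qmax for 4-bit signed ([-8, 7]).
    scales = np.abs(blocked).max(axis=-1) / 7.0
    print(scales.shape)  # (32, 4), matching the test's expected scale shape.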
ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py
@@ -16,9 +16,11 @@
 """Uniform quantize in tensor level."""

 import dataclasses
-from typing import Optional
+from typing import Optional, Sequence
+import ml_dtypes
 import numpy as np
 from ai_edge_quantizer import qtyping
+from ai_edge_quantizer.utils import tfl_flatbuffer_utils


 @dataclasses.dataclass(frozen=True)
@@ -27,6 +29,11 @@ class IntType:
   signed: bool


+def is_blockwise(granularity: qtyping.QuantGranularity) -> bool:
+  """Checks if the quantization granularity is blockwise."""
+  return "BLOCKWISE" in str(granularity)
+
+
 def get_quantized_range(qtype: IntType) -> tuple[float, float]:
   """Calculates range of the quantized type."""
   if qtype.signed:
@@ -38,6 +45,22 @@ def get_quantized_range(qtype: IntType) -> tuple[float, float]:
   return float(qmin), float(qmax)


+def extract_block_size_from_granularity(
+    granularity: qtyping.QuantGranularity,
+) -> int:
+  """Get the block size for blockwise quantization."""
+  if granularity == qtyping.QuantGranularity.BLOCKWISE_32:
+    return 32
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE_64:
+    return 64
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE_128:
+    return 128
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE_256:
+    return 256
+  else:
+    return 0
+
+
 def _round_and_clip(
     tensor: np.ndarray, qtype: IntType, narrow: bool
 ) -> np.ndarray:
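
Usage of the helper above is a straightforward lookup over the granularity enum; non-blockwise granularities fall through to 0, which callers can treat as a "not blockwise" sentinel. A hedged usage sketch, assuming the enum values shown in this diff:

    from ai_edge_quantizer import qtyping

    assert extract_block_size_from_granularity(
        qtyping.QuantGranularity.BLOCKWISE_64
    ) == 64
    assert extract_block_size_from_granularity(
        qtyping.QuantGranularity.CHANNELWISE
    ) == 0  # not blockwise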
@@ -117,22 +140,141 @@ def fix_quantization_params_rank(
       symmetric=quantization_params.symmetric,
       quantized_dimension=quantization_params.quantized_dimension,
       quantized_data=quantization_params.quantized_data,
+      block_size=quantization_params.block_size,
+  )
+
+
+def _get_tensor_shape_for_blockwise(
+    tensor_shape: Sequence[int], quantized_dim: int, block_size: int
+) -> list[int]:
+  """Get the tensor shape for blockwise quantization.
+
+  This function splits the quantize dimension of the tensor into blocks and the
+  dim/blocks. Hence, min/max of the tensor can be calculated for each block
+  using existing functions.
+
+  Args:
+    tensor_shape: The original shape of the tensor.
+    quantized_dim: The dimension to be quantized blockwise.
+    block_size: The size of the block.
+
+  Returns:
+    The new tensor shape for calculating scale and zp for blockwise
+    quantization.
+  """
+  new_shape = []
+  for index, val in enumerate(tensor_shape):
+    if index == quantized_dim:
+      if val % block_size != 0:
+        raise ValueError(
+            f"Quantized dimension {val} in tensor shape {tensor_shape} is not"
+            f" divisible by block size {block_size}."
+        )
+      new_shape.append(int(val / block_size))
+      new_shape.append(block_size)
+    else:
+      new_shape.append(val)
+  return new_shape
+
+
+def reshape_data_for_blockwise(
+    tensor_data: np.ndarray,
+    op_name: qtyping.TFLOperationName,
+    granularity: qtyping.QuantGranularity,
+) -> tuple[np.ndarray, int]:
+  """Reshapes data for blockwise quantization.
+
+  Args:
+    tensor_data: The original tensor data.
+    op_name: The name of the TFL op.
+    granularity: The quantization granularity for the tensor.
+
+  Returns:
+    A tuple containing the reshaped tensor data and the new reduce dimension.
+  """
+  quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_BLOCKWISE_WEIGHT_QUANTIZED_DIM[
+      op_name
+  ]
+  block_size = extract_block_size_from_granularity(granularity)
+  new_shape = _get_tensor_shape_for_blockwise(
+      tensor_data.shape, quantized_dim, block_size
+  )
+  return tensor_data.reshape(new_shape), quantized_dim + 1
+
+
+def _broadcast_scale_zp_for_blockwise(
+    tensor_content: np.ndarray,
+    quant_params: qtyping.UniformQuantParams,
+) -> qtyping.UniformQuantParams:
+  """Broadcasts scale and zp for blockwise quantization.
+
+  Args:
+    tensor_content: The original tensor data.
+    quant_params: The quantization parameters.
+      `quant_params.quantized_dimension` must be specified.
+      `quant_params.block_size` must be specified and positive.
+
+  Returns:
+    The updated quantization parameters with broadcasted scale and zp for
+    correct constant quantization.
+  """
+  if quant_params.quantized_dimension is None:
+    raise ValueError("Quantized dimension must be specified.")
+  if quant_params.block_size is None or quant_params.block_size <= 0:
+    raise ValueError("Block size must be specified and positive.")
+  quantized_dim = quant_params.quantized_dimension
+  expanded_tensor_shape = _get_tensor_shape_for_blockwise(
+      tensor_content.shape, quantized_dim, quant_params.block_size
+  )
+  expanded_scale = np.reshape(
+      np.broadcast_to(
+          np.expand_dims(quant_params.scale, quantized_dim + 1),
+          expanded_tensor_shape,
+      ),
+      tensor_content.shape,
+  )
+  if quant_params.zero_point is None or quant_params.zero_point.size == 0:
+    expanded_zp = np.zeros_like(tensor_content, dtype=np.int32)
+  else:
+    expanded_zp = np.reshape(
+        np.broadcast_to(
+            np.expand_dims(quant_params.zero_point, quantized_dim + 1),
+            expanded_tensor_shape,
+        ),
+        tensor_content.shape,
+    )
+  return qtyping.UniformQuantParams(
+      scale=expanded_scale,
+      zero_point=expanded_zp,
+      num_bits=quant_params.num_bits,
+      symmetric=quant_params.symmetric,
+      quantized_dimension=quantized_dim,
+      block_size=quant_params.block_size,
   )


 def uniform_quantize(
     tensor_data: np.ndarray,
     quantization_params: qtyping.UniformQuantParams,
+    is_blockwise_quant: bool = False,
 ):
   """Uniform quantize a tensor.

   Args:
     tensor_data: The tensor to be quantized.
     quantization_params: The quantization parameters.
+    is_blockwise_quant: Whether the tensor is blockwise quantized.

   Returns:
     The quantized tensor.
   """
+  # The reshaping for blockwise quantization is unique hence we do this here
+  # to avoid unexpected broadcast behavior downstream.
+  if is_blockwise_quant:
+    quantization_params = _broadcast_scale_zp_for_blockwise(
+        tensor_data, quantization_params
+    )
+
   # quant params in flatbuffer is flattened, expand the rank to be the same
   # as the tensor rank to avoid ambiguous broadcasting.
   quantization_params = fix_quantization_params_rank(
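
The reshape-then-broadcast pair above is the core of the blockwise path: `_get_tensor_shape_for_blockwise` splits the quantized dimension into (num_blocks, block_size) so per-block statistics can reuse the existing per-axis reductions, and `_broadcast_scale_zp_for_blockwise` expands the resulting per-block scales back to the full tensor shape. A standalone NumPy sketch of the same round trip, illustrative only and not the library API:

    import numpy as np

    tensor = np.arange(32 * 128, dtype=np.float32).reshape(32, 128)
    quantized_dim, block_size = 1, 32

    # Split dim 1 into (num_blocks, block_size): (32, 128) -> (32, 4, 32).
    blocked = tensor.reshape(32, 128 // block_size, block_size)

    # Per-block reduction happens over the new axis at quantized_dim + 1.
    per_block_scale = np.abs(blocked).max(axis=quantized_dim + 1)  # (32, 4)

    # Broadcast per-block scales over each block, then flatten back to the
    # original tensor shape so elementwise quantization broadcasts cleanly.
    expanded = np.broadcast_to(
        np.expand_dims(per_block_scale, quantized_dim + 1), blocked.shape
    ).reshape(tensor.shape)
    assert expanded.shape == (32, 128)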
@@ -146,8 +288,15 @@ def uniform_quantize(
   inverse_scales = 1.0 / scales
   # TODO: b/332574603 - support unsigned data type.
   qtype = IntType(quantization_params.num_bits, signed=True)
-  # Symmetric means narrow range (e.g., -127 to 127)
-  narrow_range = quantization_params.symmetric
+  # For quantization with more than 8 bits, symmetric narrow-range quantization
+  # is required due to assumptions made by legacy TFLite kernels. However, this
+  # method is not ideal for low-bit quantization (e.g., 2-bit quantization,
+  # which only has 4 bins), as it wastes a bin and there are no kernel
+  # requirements for a narrow range when < 8 bits because the data is unpacked
+  # to int8 before being used in the kernel.
+  narrow_range = (
+      quantization_params.symmetric and quantization_params.num_bits >= 8
+  )
   required_dtype = np.signedinteger if qtype.signed else np.unsignedinteger
   if not np.issubdtype(zero_points.dtype, required_dtype):
     raise ValueError(
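
The effect of the new condition: symmetric int8 and int16 tensors still use the narrow range expected by legacy TFLite kernels, while symmetric sub-8-bit tensors keep the full range and stop wasting a bin. A small sketch of the range selection implied by the diff, with qmin/qmax computed as in `get_quantized_range`:

    def effective_range(num_bits: int, symmetric: bool) -> tuple[int, int]:
      """Sketch of the range selection implied by the diff above."""
      qmin, qmax = -(2 ** (num_bits - 1)), 2 ** (num_bits - 1) - 1
      narrow = symmetric and num_bits >= 8
      return (qmin + 1 if narrow else qmin, qmax)

    assert effective_range(8, True) == (-127, 127)  # narrow range kept for int8
    assert effective_range(4, True) == (-8, 7)      # full range for 4-bit
    assert effective_range(2, True) == (-2, 1)      # 2-bit keeps all 4 bins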
@@ -173,6 +322,26 @@ def uniform_dequantize(
   Returns:
     The dequantized tensor.
   """
+  if quantization_params.block_size != 0:
+    # b/443830202: The quantized dimension is currently increased by 1 because
+    # AEQ expects 1 and XNNPack expects 0.
+    quantization_params = dataclasses.replace(
+        quantization_params,
+        quantized_dimension=quantization_params.quantized_dimension + 1,
+    )
+    scale_shape = list(tensor_data.shape)
+    scale_shape[quantization_params.quantized_dimension] = (
+        scale_shape[quantization_params.quantized_dimension]
+        // quantization_params.block_size
+    )
+    quantization_params = dataclasses.replace(
+        quantization_params,
+        scale=quantization_params.scale.reshape(scale_shape),
+    )
+    quantization_params = _broadcast_scale_zp_for_blockwise(
+        tensor_data, quantization_params
+    )
+
   # quant params in flatbuffer is flattened, expand the rank to be the same
   # as the tensor rank to avoid ambiguous broadcasting.
   quantization_params = fix_quantization_params_rank(
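
On the dequantize side the flatbuffer stores scales flattened, so for a blockwise tensor the code first rebuilds the per-block scale shape by dividing the quantized dimension by `block_size`, then reuses the same broadcast helper. A sketch of the arithmetic (not the library code) for a (32, 128) tensor with block size 32 and a stored quantized dimension of 0, the XNNPack convention noted in the bug reference above:

    import numpy as np

    tensor_shape = (32, 128)
    block_size = 32
    stored_quantized_dim = 0                   # XNNPack convention.
    quantized_dim = stored_quantized_dim + 1   # AEQ-internal convention.

    flat_scales = np.ones(32 * 4, dtype=np.float32)  # flattened in flatbuffer
    scale_shape = list(tensor_shape)
    scale_shape[quantized_dim] //= block_size        # [32, 4]
    scales = flat_scales.reshape(scale_shape)
    print(scales.shape)  # (32, 4), ready for the blockwise broadcast helper.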
@@ -188,6 +357,7 @@ def symmetric_quantize_bias_tensor(
     bias_content: np.ndarray,
     input_tensor_quant_params: qtyping.UniformQuantParams,
     weight_tensor_quant_params: qtyping.UniformQuantParams,
+    check_error: bool = True,
 ) -> qtyping.UniformQuantParams:
   """Quantize bias tensor (symmetrically, i.e., zero_point = 0).

@@ -199,6 +369,12 @@ def symmetric_quantize_bias_tensor(
     bias_content: The bias content.
     input_tensor_quant_params: The quantization parameters of input tensor.
     weight_tensor_quant_params: The quantization parameters of weight tensor.
+    check_error: Whether to check if the quantization error (the difference
+      between the original and dequantized bias) is larger than the
+      quantization scale. This check is important because bias quantization
+      parameters are fixed (bias_scale = input_scale * weight_scale), which can
+      lead to large quantization errors. Raising an error when the quantization
+      error is larger than the scale helps to identify unexpected numerical
+      issues.

   Returns:
     The quantized bias tensor.
@@ -213,7 +389,8 @@ def symmetric_quantize_bias_tensor(

   # symmetric
   bias_zp = np.zeros_like(effective_output_scale, dtype=np.int32)
-  bias_number_bits = 64 if input_tensor_quant_params.num_bits == 16 else 32
+  # Fixed to 32 bits since most of the accelerators use int32 accumulator.
+  bias_number_bits = 32
   symmetric = True
   quantized_dimension = None if len(effective_output_scale) == 1 else 0
   bias_quant_params = qtyping.UniformQuantParams(
@@ -225,6 +402,24 @@ def symmetric_quantize_bias_tensor(
   )

   quantized_vars = uniform_quantize(bias_content, bias_quant_params)
+  if check_error:
+    dequantized_bias = uniform_dequantize(quantized_vars, bias_quant_params)
+    max_quant_error = np.max(np.abs(dequantized_bias - bias_content))
+    error_tolerance = np.maximum(1e-6, np.max(effective_output_scale))
+    if max_quant_error > error_tolerance:
+      raise ValueError(
+          "Quantization error is too large for bias tensor quantization. Max"
+          f" quantization error is {max_quant_error}, which exceed"
+          f" the threshold {error_tolerance}"
+      )
+
+  # Save the int32 quantized bias as int64 if the input tensor is quantized to
+  # 16 bits. This is to assume the matmul is using int64 accumulator (safe from
+  # overflow). For accelerators with int32 accumulator, it is safe to cast
+  # int64 back to int32.
+  if input_tensor_quant_params.num_bits == 16:
+    quantized_vars = quantized_vars.astype(np.int64)
+    bias_number_bits = 64

   # UniformQuantParams is frozen dataclass, need to recreate.
   return qtyping.UniformQuantParams(
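
Bias scales are not free parameters: bias_scale = input_scale * weight_scale, so the only defense against a badly scaled bias is the new error check. A small NumPy sketch of what the check computes, with illustrative values and variable names:

    import numpy as np

    input_scale = np.array([0.02], dtype=np.float32)
    weight_scale = np.array([0.5, 0.25], dtype=np.float32)  # per-channel
    bias = np.array([1.234, -0.05], dtype=np.float32)

    effective_scale = input_scale * weight_scale            # [0.01, 0.005]
    q_bias = np.clip(np.rint(bias / effective_scale), -2**31, 2**31 - 1)
    dequant = q_bias * effective_scale

    max_error = np.max(np.abs(dequant - bias))              # <= scale / 2 here
    tolerance = max(1e-6, float(np.max(effective_scale)))   # as in the diff
    assert max_error <= tolerance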
@@ -242,15 +437,19 @@ def tensor_zp_scale_from_min_max(
     max_value,
     num_bits: int,
     symmetric: bool,
+    granularity: qtyping.QuantGranularity,
     clipping_values: Optional[np.ndarray] = None,
 ):
   """Get zero point and scale from min and max value.

   Args:
-    min_value: The minimum value of the tensor (channel-wise supported).
-    max_value: The maximum value of the tensor (channel-wise supported).
+    min_value: The minimum value of the tensor (channelwise and blockwise
+      supported).
+    max_value: The maximum value of the tensor (channelwise and blockwise
+      supported).
     num_bits: The number of bits of the tensor.
     symmetric: Whether the tensor is symmetric.
+    granularity: The granularity of the tensor.
     clipping_values: Absolute clipping values to apply to the tensor. This will
       clip the tensors to the range [-clipping_values, clipping_values]. This
       should be the same shape as min_value and max_value. If None, no clipping
@@ -259,19 +458,45 @@ def tensor_zp_scale_from_min_max(
   Returns:
     The zero point and scale of the tensor.
   """
+
   # TODO: b/332574603 - support unsigned data type.
   qtype = IntType(
       num_bits,
       signed=True,
   )
   qmin, qmax = get_quantized_range(qtype)
-  min_bound = 1e-4  # 1e-6 precision for int8 and 1e-8 for int16.
+  min_bound = 1e-9  # Avoid zero scale.
+  pos_clipping_values = None if clipping_values is None else clipping_values
+  neg_clipping_values = None if clipping_values is None else -clipping_values
+
+  if is_blockwise(granularity):
+    # Blockwise quantization uses float16 scale,
+    # with 7 bit mantissa, so the maximum scale value is 65280 and maximum
+    # representable range is [-65280 * (2 ** num_bits),
+    # 65280 * (2 ** num_bits - 1)].
+    # Note that we have one extra value on the negative side.
+    float16_max = np.broadcast_to(
+        np.array(65280) * (2**num_bits - 1), max_value.shape
+    )
+    float16_min = np.broadcast_to(
+        np.array(-65280) * (2**num_bits), min_value.shape
+    )
+    pos_clipping_values = (
+        float16_max
+        if pos_clipping_values is None
+        else np.minimum(pos_clipping_values, float16_max)
+    )
+    neg_clipping_values = (
+        float16_min
+        if neg_clipping_values is None
+        else np.maximum(neg_clipping_values, float16_min)
+    )

   if symmetric:
     bound = np.maximum(np.abs(min_value), np.abs(max_value))
     bound = np.maximum(bound, min_bound)
     if clipping_values is not None:
-      bound = np.clip(bound, -clipping_values, clipping_values)
+      bound = np.clip(bound, neg_clipping_values, pos_clipping_values)
     if not qtype.signed:
       half_q = (qmax - 1) / 2
       scale = bound / half_q
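
The 65280 constant is the largest value representable with a 7-bit mantissa inside float16's exponent range: 1.1111111 (binary) x 2^15 = (255/128) x 32768 = 65280. Float16 proper tops out at 65504, but that mantissa does not survive the bfloat16 rounding applied further down. A quick check of the constant and of the rounding chain, assuming the `ml_dtypes` package from the imports above:

    import ml_dtypes
    import numpy as np

    # Largest 7-bit-mantissa value in float16 range: 1.1111111b * 2**15.
    assert (255 / 128) * 2**15 == 65280

    # The rounding chain from the diff: bfloat16 (7-bit mantissa) -> float16.
    scale = np.array([65281.0, 0.1234567], dtype=np.float32)
    rounded = (
        scale.astype(ml_dtypes.bfloat16).astype(np.float16).astype(np.float32)
    )
    print(rounded)  # 65281 snaps to a multiple of 256 near 2**16; 0.1234567
                    # keeps only about 2-3 significant decimal digits.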
@@ -292,6 +517,12 @@ def tensor_zp_scale_from_min_max(
     zp = qmin - bound_min / scale
     zp = np.rint(zp)

+  if is_blockwise(granularity):
+    # Round the scale values to 7 bit mantissa.
+    scale = (
+        scale.astype(ml_dtypes.bfloat16).astype(np.float16).astype(np.float32)
+    )
+
   # It's safe to cast zp to qtype without clipping because we can infer
   # qmin <= zp <= qmax from bound_min <= 0 <= bound_max.
   zp = assign_quantized_type(zp, qtype)
@@ -305,7 +536,8 @@ def _is_valid_quantization_params(
   """Checks if the quantization parameters are valid.

   A valid quantization params requires:
-  1. scale and zero point have the same shape (TFL Runtime requirement).
+  1. scale and zero point either have the same shape or the zero point is a
+     scalar.
   2. scale and zero point have the same rank as the tensor content (avoid
      ambiguous broadcasting).
@@ -316,17 +548,20 @@ def _is_valid_quantization_params(
   Returns:
     True if the quantization parameters are valid.
   """
-  if quantization_params.scale.shape != quantization_params.zero_point.shape:
+  if (
+      quantization_params.scale.shape != quantization_params.zero_point.shape
+      and quantization_params.zero_point.size != 1
+  ):
     raise ValueError(
-        "scale and zero_point must have the same shape. Got"
-        f" {quantization_params.scale.shape} and"
+        "scale and zero_point must have the same shape or zero_point must have"
+        f" only one element. Got {quantization_params.scale.shape} and"
         f" {quantization_params.zero_point.shape}"
     )

   tensor_rank = tensor_data.ndim
   scale_rank = quantization_params.scale.ndim
   zero_point_rank = quantization_params.zero_point.ndim
-  if (tensor_rank != scale_rank) or (tensor_rank != zero_point_rank):
+  if tensor_rank != scale_rank or (tensor_rank != zero_point_rank):
     raise ValueError(
         f"Ranks of scales ({scale_rank}) and zps"
         f" ({zero_point_rank}) must be the same as the tensor rank"