PyPI - ai-edge-quantizer-nightly - Versions diffs - 0.1.0.dev20250513__py3-none-any.whl → 0.1.0.dev20250515__py3-none-any.whl - Mend

ai-edge-quantizer-nightly 0.1.0.dev20250513-py3-none-any.whl → 0.1.0.dev20250515-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py CHANGED Viewed

@@ -726,18 +726,29 @@ def _get_tensor_shape_for_blockwise(
 def _reshape_data_for_blockwise(
-    tensor_data: np.ndarray, quantized_dim: int, block_size: int
+    tensor_data: np.ndarray,
+    quantized_dim: int,
+    block_size: int,
 ) -> tuple[np.ndarray, int]:
   """Reshapes data for blockwise quantization.
   Args:
     tensor_data: The original tensor data.
     quantized_dim: The dimension to be quantized blockwise.
-    block_size: The size of the block.
+    block_size: The size of the block. `block_size` must be a multiple of 32.
+      The tensor quantized dimension shape must be divisible by `block_size`.
   Returns:
     A tuple containing the reshaped tensor data and the new reduce dimension.
   """
+  # TODO: b/417508018 - create AEQ specific error class instead of
+  # using generic ValueError.
+  if tensor_data.shape[quantized_dim] % block_size != 0:
+    raise ValueError(
+        "Tensor quantization dimension must be divisible by block size for"
+        " blockwise quantization."
+    )
   new_shape = _get_tensor_shape_for_blockwise(
       tensor_data.shape, quantized_dim, block_size
   )
@@ -818,22 +829,19 @@ def init_tensor_min_max(
         weight_tensor_config.granularity == qtyping.QuantGranularity.CHANNELWISE
     ):
       quantized_dim = common_utils.get_weight_quantized_dim(
-          op_info, tensor_data
+          op_info, tensor_data, weight_tensor_config.granularity
       )
     if (
         weight_tensor_config is not None
         and weight_tensor_config.granularity
         == qtyping.QuantGranularity.BLOCKWISE
     ):
-      quantized_dim = (
-          tfl_flatbuffer_utils.TFL_OP_TO_BLOCKWISE_WEIGHT_QUANTIZED_DIM[
-              op_info.op_name
-          ]
-      )
-      reshaped_data, reduce_dims = _reshape_data_for_blockwise(
-          tensor_data,
-          quantized_dim,
-          weight_tensor_config.block_size,
+      reshaped_data, reduce_dims = (
+          uniform_quantize_tensor.reshape_data_for_blockwise(
+              tensor_data,
+              op_info.op_name,
+              weight_tensor_config.block_size,
+          )
       )
       return {
           "min": np.min(reshaped_data, axis=reduce_dims, keepdims=False),

ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py CHANGED Viewed

@@ -31,8 +31,7 @@ _TensorQuantConfig = qtyping.TensorQuantizationConfig
 class CommonQuantizeTest(parameterized.TestCase):
-  """Tests for general quantize functions.
-  """
+  """Tests for general quantize functions."""
   def setUp(self):
     super().setUp()
@@ -69,6 +68,34 @@ class CommonQuantizeTest(parameterized.TestCase):
           default_policy.DEFAULT_CONFIG_CHECK_POLICY,
       )
+  def test_reshape_data_for_blockwise_raises_error_when_quantized_dim_not_divisible_by_block_size(
+      self,
+  ):
+    tensor_data = np.ones((24, 128), dtype=np.float32)
+    block_size = 256
+    quantized_dim = 1
+    with self.assertRaisesWithPredicateMatch(
+        ValueError,
+        lambda err: (
+            "Tensor quantization dimension must be divisible by block"
+            " size for blockwise quantization."
+        )
+        in str(err),
+    ):
+      common_quantize._reshape_data_for_blockwise(
+          tensor_data, quantized_dim, block_size
+      )
+  def test_reshape_data_for_blockwise_returns_correct_values(self):
+    tensor_data = np.ones((24, 128), dtype=np.float32)
+    block_size = 32
+    quantized_dim = 1
+    new_tensor_data, reduce_dim = common_quantize._reshape_data_for_blockwise(
+        tensor_data, quantized_dim, block_size
+    )
+    self.assertEqual(new_tensor_data.shape, (24, 4, 32))
+    self.assertEqual(reduce_dim, 2)
 if __name__ == "__main__":
   googletest.main()

ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py CHANGED Viewed

@@ -168,11 +168,9 @@ def get_tensor_quant_params(
         "Only symmetric weights are supported for dequantized weight recovery."
     )
-  quantized_dim = None
-  if tensor_quant_config.granularity == qtyping.QuantGranularity.CHANNELWISE:
-    quantized_dim = common_utils.get_weight_quantized_dim(
-        op_info, tensor_content
-    )
+  quantized_dim = common_utils.get_weight_quantized_dim(
+      op_info, tensor_content, tensor_quant_config.granularity
+  )
   zp, scale = get_zp_scale_from_dequantized_symmetric_weights(
       dequant_vals=tensor_content,

ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation.py CHANGED Viewed

@@ -78,13 +78,16 @@ def _rotate_with_diagonal_hadamard(
   # of 2 to calculate this factor.
   hadamard_size = np.gcd(tensor_content.shape[axis], 2 ** 30)
   diagonal_size = tensor_content.shape[axis] // hadamard_size
+  output_size = tensor_content.shape[1 - axis]
   random_vector = np.ones(hadamard_size, dtype=np.int8)
   # Use a canonical Hadamard matrix.
   hadamard = _make_hadamard_matrix(hadamard_size)
-  hadamard_diagonal = np.kron(np.eye(diagonal_size), hadamard)
-  w_rotated = np.einsum("ij,aj->ai", hadamard_diagonal, tensor_content)
-  return w_rotated, hadamard_size, random_vector
+  reshaped_tensor = tensor_content.reshape(
+      diagonal_size, output_size, hadamard_size
+  )
+  w_rotated = np.einsum("jk,ilk->ilj", hadamard, reshaped_tensor)
+  return w_rotated.reshape(tensor_content.shape), hadamard_size, random_vector
 def get_tensor_quant_params(
@@ -128,7 +131,9 @@ def get_tensor_quant_params(
         f" {tensor_quant_config.granularity} granularity."
     )
-  quantized_dim = common_utils.get_weight_quantized_dim(op_info, tensor_content)
+  quantized_dim = common_utils.get_weight_quantized_dim(
+      op_info, tensor_content, tensor_quant_config.granularity
+  )
   if quantized_dim != 0:
     raise ValueError(
         f"Unsupported quantized dimension: {quantized_dim}. Only 0 is"

ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation_test.py CHANGED Viewed

@@ -119,6 +119,55 @@ class HadamardRotationFullyConnectedTest(parameterized.TestCase):
     if qparams.hadamard is not None:
       self.assertEqual(qparams.hadamard.hadamard_size, 32)
+  def test_get_tensor_quant_params_golden_1(self):
+    test_data = np.ones((6, 6))
+    # expected:
+    #   [[127   0 127   0 127   0]
+    #    [127   0 127   0 127   0]
+    #    [127   0 127   0 127   0]
+    #    [127   0 127   0 127   0]
+    #    [127   0 127   0 127   0]
+    #    [127   0 127   0 127   0]]
+    expected = np.tile([127, 0], [6, 3])
+    qparams = hadamard_rotation.get_tensor_quant_params(
+        self._op_info,
+        self._op_info.op_quant_config.weight_tensor_config,
+        test_data,
+        self._tensor_name_to_qsv,
+    )
+    self.assertIsNotNone(qparams.quantized_data)
+    np.testing.assert_array_equal(
+        np.array(qparams.quantized_data), expected
+    )
+  def test_get_tensor_quant_params_golden_2(self):
+    # test_data:
+    #   [[1 2 1 2 1 2]
+    #    [3 4 3 4 3 4]
+    #    [1 2 1 2 1 2]
+    #    [3 4 3 4 3 4]
+    #    [1 2 1 2 1 2]
+    #    [3 4 3 4 3 4]]
+    test_data = np.tile([[1, 2], [3, 4]], [3, 3])
+    # expected:
+    #   [[127 -42 127 -42 127 -42]
+    #    [127 -18 127 -18 127 -18]
+    #    [127 -42 127 -42 127 -42]
+    #    [127 -18 127 -18 127 -18]
+    #    [127 -42 127 -42 127 -42]
+    #    [127 -18 127 -18 127 -18]]
+    expected = np.tile([[127, -42], [127, -18]], [3, 3])
+    qparams = hadamard_rotation.get_tensor_quant_params(
+        self._op_info,
+        self._op_info.op_quant_config.weight_tensor_config,
+        test_data,
+        self._tensor_name_to_qsv,
+    )
+    self.assertIsNotNone(qparams.quantized_data)
+    np.testing.assert_array_equal(
+        np.array(qparams.quantized_data), expected
+    )
   def test_raise_missing_tensor_content(self):
     with self.assertRaisesWithPredicateMatch(
         ValueError, lambda err: "weight tensor" in str(err)

ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py CHANGED Viewed

@@ -16,7 +16,6 @@
 """Performs naive min/max uniform quantization."""
 from typing import Any, Optional
-import ml_dtypes
 import numpy as np
 from ai_edge_quantizer import qtyping
 from ai_edge_quantizer.algorithms.uniform_quantize import common_quantize
@@ -75,35 +74,17 @@ def get_tensor_quant_params(
         " the ParamsGenerator."
     )
   clipping_values = None
-  # Blockwise quantization uses float16 scale, with 7 bit mantissa,
-  # so the maximum representable value is 65280.
-  if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
-    clipping_values = np.broadcast_to(
-        np.array(65280), tensor_min_max["min"].shape
-    )
   zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
       tensor_min_max["min"],
       tensor_min_max["max"],
       tensor_quant_config.num_bits,
       tensor_quant_config.symmetric,
+      tensor_quant_config.granularity,
       clipping_values,
   )
-  # Round the scale values to 7 bit mantissa.
-  if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
-    scale = (
-        scale.astype(ml_dtypes.bfloat16).astype(np.float16).astype(np.float32)
-    )
-  quantized_dim = None
-  if tensor_quant_config.granularity == qtyping.QuantGranularity.CHANNELWISE:
-    quantized_dim = common_utils.get_weight_quantized_dim(
-        op_info, tensor_content
-    )
-  elif tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
-    quantized_dim = (
-        tfl_flatbuffer_utils.TFL_OP_TO_BLOCKWISE_WEIGHT_QUANTIZED_DIM[
-            op_info.op_name
-        ]
-    )
+  quantized_dim = common_utils.get_weight_quantized_dim(
+      op_info, tensor_content, tensor_quant_config.granularity
+  )
   quant_params = qtyping.UniformQuantParams(
       scale=scale,
       zero_point=zp,
@@ -115,15 +96,10 @@ def get_tensor_quant_params(
   if tensor_content is None:
     return quant_params
-  # The reshaping for blockwise quantization is unique hence we do this here
-  # to avoid unexpected broadcast behavior downstream.
-  if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
-    quant_params = common_quantize.broadcast_scale_zp_for_blockwise(
-        tensor_content, quant_params
-    )
   quantized_vars = uniform_quantize_tensor.uniform_quantize(
-      tensor_content, quant_params
+      tensor_content,
+      quant_params,
+      tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE,
   )
   # Update with quantized values.
   return qtyping.UniformQuantParams(

ai_edge_quantizer/algorithms/uniform_quantize/octav.py CHANGED Viewed

@@ -102,21 +102,13 @@ def get_tensor_quant_params(
         op_info, tensor_quant_config, tensor_content, tensor_qsv
     )
-  if (
-      tensor_quant_config.granularity != qtyping.QuantGranularity.CHANNELWISE
-      and tensor_quant_config.granularity != qtyping.QuantGranularity.TENSORWISE
-  ):
-    raise ValueError(
-        f"Unsupported granularity: {tensor_quant_config.granularity}."
-    )
   if not tensor_quant_config.symmetric:
     raise ValueError(
         f"Unsupported symmetry: {tensor_quant_config.symmetric}. OCTAV"
         " supports symmetric quantization only for now."
     )
-  if tensor_qsv is None:
+  if not tensor_qsv:
     # We need min/max to calculate quantization parameters, which
     # should be collected during the calibration process. However,
     # weight-only and DRQ do not require calibration, thus it is
@@ -136,25 +128,41 @@ def get_tensor_quant_params(
         " the ParamsGenerator."
     )
-  quantized_dim = None
-  if tensor_quant_config.granularity == qtyping.QuantGranularity.CHANNELWISE:
-    quantized_dim = common_utils.get_weight_quantized_dim(
-        op_info, tensor_content
+  quantized_dim = common_utils.get_weight_quantized_dim(
+      op_info, tensor_content, tensor_quant_config.granularity
+  )
+  if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
+    reshaped_data, reduce_dims = (
+        uniform_quantize_tensor.reshape_data_for_blockwise(
+            tensor_content,
+            op_info.op_name,
+            tensor_quant_config.block_size,
+        )
+    )
+  else:
+    reshaped_data = tensor_content
+    reduce_dims = common_utils.get_reduce_dims(
+        quantized_dim, tensor_content.shape
     )
   clipping_constants = _guess_clipping_with_octav(
-      tensor_content,
+      reshaped_data,
       tensor_quant_config.num_bits,
-      common_utils.get_reduce_dims(quantized_dim, tensor_content.shape),
+      reduce_dims,
       max_iterations=10,
       exponent_divisor=3.0 if tensor_quant_config.symmetric else 12.0,
   )
+  # We created a new dimension in order to reduce properly for blockwise
+  # quantization, so we need to reshape the clipping constants back to the
+  # min/max shape for the next step.
+  if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
+    clipping_constants = clipping_constants.reshape(tensor_min_max["min"].shape)
   zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
       tensor_min_max["min"],
       tensor_min_max["max"],
       tensor_quant_config.num_bits,
       tensor_quant_config.symmetric,
+      tensor_quant_config.granularity,
       clipping_constants,
   )
@@ -168,7 +176,9 @@ def get_tensor_quant_params(
   )
   quantized_vars = uniform_quantize_tensor.uniform_quantize(
-      tensor_content, quant_params
+      tensor_content,
+      quant_params,
+      tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE,
   )
   return dataclasses.replace(quant_params, quantized_data=quantized_vars)

ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py CHANGED Viewed

@@ -44,33 +44,17 @@ class OctavQuantizeTest(parameterized.TestCase):
     )
     self._tensor_name_to_qsv = {}
     subgraph0 = self._test_model.subgraphs[0]
-    subgraph_op_index = 3
-    fc_op = subgraph0.operators[subgraph_op_index]
+    self._subgraph_op_index = 3
+    self._fc_op = subgraph0.operators[self._subgraph_op_index]
     self._fc_op_info = qtyping.OpInfo(
-        op=fc_op,
+        op=self._fc_op,
         op_name=qtyping.TFLOperationName.FULLY_CONNECTED,
-        subgraph_op_index=subgraph_op_index,
+        subgraph_op_index=self._subgraph_op_index,
         op_quant_config=qtyping.OpQuantizationConfig(
             weight_tensor_config=None,
         ),
     )
-  def test_get_tensor_quant_params_unsupported_granularity_assert(self):
-    err_msg = "Unsupported granularity"
-    test_data = np.array([[-7, 7], [4, -4], [4, -4], [7, 7]])
-    with self.assertRaisesWithPredicateMatch(
-        ValueError, lambda err: err_msg in str(err)
-    ):
-      _ = octav.get_tensor_quant_params(
-          op_info=self._fc_op_info,
-          tensor_quant_config=qtyping.TensorQuantizationConfig(
-              num_bits=4,
-              symmetric=True,
-              granularity=qtyping.QuantGranularity.BLOCKWISE,
-          ),
-          tensor_content=test_data,
-      )
   def test_get_tensor_quant_params_unsupported_symmetry(self):
     err_msg = "Unsupported symmetry"
     test_data = np.array([[-7, 7], [4, -4], [4, -4], [7, 7]])
@@ -117,13 +101,22 @@ class OctavQuantizeTest(parameterized.TestCase):
         [25, -30, 50, -75, 1e5, -125],
         [50, -60, 70, -80, 90, -100],
     ])
-    quant_params = octav.get_tensor_quant_params(
-        op_info=self._fc_op_info,
-        tensor_quant_config=qtyping.TensorQuantizationConfig(
-            num_bits=4,
-            symmetric=True,
-            granularity=qtyping.QuantGranularity.TENSORWISE,
+    tensor_config = qtyping.TensorQuantizationConfig(
+        num_bits=4,
+        symmetric=True,
+        granularity=qtyping.QuantGranularity.TENSORWISE,
+    )
+    fc_op_info = qtyping.OpInfo(
+        op=self._fc_op,
+        op_name=qtyping.TFLOperationName.FULLY_CONNECTED,
+        subgraph_op_index=self._subgraph_op_index,
+        op_quant_config=qtyping.OpQuantizationConfig(
+            weight_tensor_config=tensor_config,
         ),
+    )
+    quant_params = octav.get_tensor_quant_params(
+        op_info=fc_op_info,
+        tensor_quant_config=tensor_config,
         tensor_content=test_data,
     )
     adjusted_test_data = quant_params.quantized_data * quant_params.scale
@@ -131,10 +124,10 @@ class OctavQuantizeTest(parameterized.TestCase):
     adjusted_max = np.max(np.abs(adjusted_test_data))
     # Check that some clipping occurred.
-    with self.subTest(name="SanityCheckClipping"):
+    with self.subTest(name="CheckClipping"):
       self.assertLess(adjusted_max, real_max)
-    with self.subTest(name="SanityCheckQuantParamsShapes"):
+    with self.subTest(name="CheckQuantParamsShapes"):
       self.assertEqual(quant_params.zero_point.shape, (1, 1))
       self.assertEqual(quant_params.scale.shape, (1, 1))
       self.assertIsNone(quant_params.quantized_dimension)
@@ -143,33 +136,47 @@ class OctavQuantizeTest(parameterized.TestCase):
           cast(np.ndarray, quant_params.quantized_data).shape, test_data.shape
       )
-    with self.subTest(name="SanityCheckQuantParamsValues"):
+    with self.subTest(name="CheckQuantParamsValues"):
       self.assertTrue(np.all(quant_params.zero_point == 0))
   def test_get_tensor_quant_params_sanity_channelwise(self):
+    # Test that the call generates quant params that are appropriately shaped,
+    # have some clipping, and correct config values without checking the
+    # actual values numerically.
     test_data = np.array([
         [-1e5, 25, -50, 75, -100, 125],
         [25, -30, 50, -75, 1e5, -125],
         [50, -60, 70, -80, 90, -100],
     ])
-    quant_params = octav.get_tensor_quant_params(
-        op_info=self._fc_op_info,
-        tensor_quant_config=qtyping.TensorQuantizationConfig(
-            num_bits=4,
-            symmetric=True,
-            granularity=qtyping.QuantGranularity.CHANNELWISE,
+    tensor_config = qtyping.TensorQuantizationConfig(
+        num_bits=4,
+        symmetric=True,
+        granularity=qtyping.QuantGranularity.CHANNELWISE,
+    )
+    fc_op_info = qtyping.OpInfo(
+        op=self._fc_op,
+        op_name=qtyping.TFLOperationName.FULLY_CONNECTED,
+        subgraph_op_index=self._subgraph_op_index,
+        op_quant_config=qtyping.OpQuantizationConfig(
+            weight_tensor_config=tensor_config,
         ),
+    )
+    quant_params = octav.get_tensor_quant_params(
+        op_info=fc_op_info,
+        tensor_quant_config=tensor_config,
         tensor_content=test_data,
     )
+    # Dequantize output to compare with the original test data.
     adjusted_test_data = quant_params.quantized_data * quant_params.scale
     for i, row in enumerate(test_data):
       real_max = np.max(np.abs(row))
       adjusted_max = np.max(np.abs(adjusted_test_data[i]))
       # Check that some clipping occurred.
-      with self.subTest(name="SanityCheckClipping"):
+      with self.subTest(name="CheckClipping"):
         self.assertLess(adjusted_max, real_max)
-    with self.subTest(name="SanityCheckQuantParamsShapes"):
+    with self.subTest(name="CheckQuantParamsShapes"):
       self.assertEqual(quant_params.zero_point.shape, (test_data.shape[0], 1))
       self.assertEqual(quant_params.scale.shape, (test_data.shape[0], 1))
       self.assertIsNotNone(quant_params.quantized_data)
@@ -177,10 +184,58 @@ class OctavQuantizeTest(parameterized.TestCase):
           cast(np.ndarray, quant_params.quantized_data).shape, test_data.shape
       )
-    with self.subTest(name="SanityCheckQuantParamsValues"):
+    with self.subTest(name="CheckQuantParamsValues"):
       self.assertTrue(np.all(quant_params.zero_point == 0))
       self.assertEqual(quant_params.quantized_dimension, 0)
+  def test_get_tensor_quant_params_sanity_blockwise(self):
+    # Test that the call generates quant params that are appropriately shaped,
+    # have some clipping, and correct config values without checking the
+    # actual values numerically.
+    test_data = np.random.randint(0, 1024, size=(32, 128))
+    tensor_config = qtyping.TensorQuantizationConfig(
+        num_bits=4,
+        symmetric=True,
+        granularity=qtyping.QuantGranularity.BLOCKWISE,
+        block_size=32,
+    )
+    fc_op_info = qtyping.OpInfo(
+        op=self._fc_op,
+        op_name=qtyping.TFLOperationName.FULLY_CONNECTED,
+        subgraph_op_index=self._subgraph_op_index,
+        op_quant_config=qtyping.OpQuantizationConfig(
+            weight_tensor_config=tensor_config,
+        ),
+    )
+    quant_params = octav.get_tensor_quant_params(
+        op_info=fc_op_info,
+        tensor_quant_config=tensor_config,
+        tensor_content=test_data,
+    )
+    with self.subTest(name="CheckQuantParamsShapes"):
+      # Check that quant params have appropriate shapes.
+      self.assertEqual(quant_params.zero_point.shape, (32, 4))
+      self.assertEqual(quant_params.scale.shape, (32, 4))
+      self.assertIsNotNone(quant_params.quantized_data)
+      self.assertTupleEqual(
+          cast(np.ndarray, quant_params.quantized_data).shape, test_data.shape
+      )
+    scales = np.repeat(quant_params.scale, 32, axis=1)
+    adjusted_test_data = quant_params.quantized_data * scales
+    for i, row in enumerate(test_data):
+      real_max = np.max(np.abs(row))
+      adjusted_max = np.max(np.abs(adjusted_test_data[i]))
+      # Check that some clipping occurred.
+      with self.subTest(name="CheckClipping"):
+        self.assertLess(adjusted_max, real_max)
+    with self.subTest(name="CheckQuantParamsValues"):
+      self.assertTrue(np.all(quant_params.zero_point == 0))
+      # See TFL_OP_TO_BLOCKWISE_WEIGHT_QUANTIZED_DIM.
+      self.assertEqual(quant_params.quantized_dimension, 1)
 if __name__ == "__main__":
   googletest.main()

ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py CHANGED Viewed

@@ -16,9 +16,11 @@
 """Uniform quantize in tensor level."""
 import dataclasses
-from typing import Optional
+from typing import Optional, Sequence
+import ml_dtypes
 import numpy as np
 from ai_edge_quantizer import qtyping
+from ai_edge_quantizer.utils import tfl_flatbuffer_utils
 @dataclasses.dataclass(frozen=True)
@@ -120,19 +122,127 @@ def fix_quantization_params_rank(
   )
+def _get_tensor_shape_for_blockwise(
+    tensor_shape: Sequence[int], quantized_dim: int, block_size: int
+) -> list[int]:
+  """Get the tensor shape for blockwise quantization.
+  This function splits the quantize dimension of the tensor into blocks and the
+  dim/blocks. Hence, min/max of the tensor can be calculated for each block
+  using existing functions.
+  Args:
+    tensor_shape: The original shape of the tensor.
+    quantized_dim: The dimension to be quantized blockwise.
+    block_size: The size of the block.
+  Returns:
+    The new tensor shape for calculating scale and zp for blockwise
+    quantization.
+  """
+  new_shape = []
+  for index, val in enumerate(tensor_shape):
+    if index == quantized_dim:
+      new_shape.append(int(val / block_size))
+      new_shape.append(block_size)
+    else:
+      new_shape.append(val)
+  return new_shape
+def reshape_data_for_blockwise(
+    tensor_data: np.ndarray, op_name: qtyping.TFLOperationName, block_size: int
+) -> tuple[np.ndarray, int]:
+  """Reshapes data for blockwise quantization.
+  Args:
+    tensor_data: The original tensor data.
+    op_name: The name of the TFL op.
+    block_size: The size of the block.
+  Returns:
+    A tuple containing the reshaped tensor data and the new reduce dimension.
+  """
+  quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_BLOCKWISE_WEIGHT_QUANTIZED_DIM[
+      op_name
+  ]
+  new_shape = _get_tensor_shape_for_blockwise(
+      tensor_data.shape, quantized_dim, block_size
+  )
+  reshaped_data = tensor_data.reshape(new_shape)
+  return reshaped_data, quantized_dim + 1
+def _broadcast_scale_zp_for_blockwise(
+    tensor_content: np.ndarray,
+    quant_params: qtyping.UniformQuantParams,
+) -> qtyping.UniformQuantParams:
+  """Broadcasts scale and zp for blockwise quantization.
+  Args:
+    tensor_content: The original tensor data.
+    quant_params: The quantization parameters.
+      `quant_params.quantized_dimension` must be specified.
+      `quant_params.block_size` must be specified and positive.
+  Returns:
+    The updated quantization parameters with broadcasted scale and zp for
+    correct constant quantization.
+  """
+  if quant_params.quantized_dimension is None:
+    raise ValueError("Quantized dimension must be specified.")
+  if quant_params.block_size is None or quant_params.block_size <= 0:
+    raise ValueError("Block size must be specified and positive.")
+  quantized_dim = quant_params.quantized_dimension
+  expanded_tensor_shape = _get_tensor_shape_for_blockwise(
+      tensor_content.shape, quantized_dim, quant_params.block_size
+  )
+  expanded_scale = np.reshape(
+      np.broadcast_to(
+          np.expand_dims(quant_params.scale, quantized_dim + 1),
+          expanded_tensor_shape,
+      ),
+      tensor_content.shape,
+  )
+  expanded_zp = np.reshape(
+      np.broadcast_to(
+          np.expand_dims(quant_params.zero_point, quantized_dim + 1),
+          expanded_tensor_shape,
+      ),
+      tensor_content.shape,
+  )
+  return qtyping.UniformQuantParams(
+      scale=expanded_scale,
+      zero_point=expanded_zp,
+      num_bits=quant_params.num_bits,
+      symmetric=quant_params.symmetric,
+      quantized_dimension=quantized_dim,
+      block_size=quant_params.block_size,
+  )
 def uniform_quantize(
     tensor_data: np.ndarray,
     quantization_params: qtyping.UniformQuantParams,
+    is_blockwise: bool = False,
 ):
   """Uniform quantize a tensor.
   Args:
     tensor_data: The tensor to be quantized.
     quantization_params: The quantization parameters.
+    is_blockwise: Whether the tensor is blockwise quantized.
   Returns:
     The quantized tensor.
   """
+  # The reshaping for blockwise quantization is unique hence we do this here
+  # to avoid unexpected broadcast behavior downstream.
+  if is_blockwise:
+    quantization_params = _broadcast_scale_zp_for_blockwise(
+        tensor_data, quantization_params
+    )
   # quant params in flatbuffer is flattened, expand the rank to be the same
   # as the tensor rank to avoid ambiguous broadcasting.
   quantization_params = fix_quantization_params_rank(
@@ -242,15 +352,19 @@ def tensor_zp_scale_from_min_max(
     max_value,
     num_bits: int,
     symmetric: bool,
+    granularity: qtyping.QuantGranularity,
     clipping_values: Optional[np.ndarray] = None,
 ):
   """Get zero point and scale from min and max value.
   Args:
-    min_value: The minimum value of the tensor (channel-wise supported).
-    max_value: The maximum value of the tensor (channel-wise supported).
+    min_value: The minimum value of the tensor (channelwise and blockwise
+      supported).
+    max_value: The maximum value of the tensor (channelwise and blockwise
+      supported).
     num_bits: The number of bits of the tensor.
     symmetric: Whether the tensor is symmetric.
+    granularity: The granularity of the tensor.
     clipping_values: Absolute clipping values to apply to the tensor. This will
       clip the tensors to the range [-clipping_values, clipping_values]. This
       should be the same shape as min_value and max_value. If None, no clipping
@@ -267,6 +381,16 @@ def tensor_zp_scale_from_min_max(
   qmin, qmax = get_quantized_range(qtype)
   min_bound = 1e-4  # 1e-6 precision for int8 and 1e-8 for int16.
+  if granularity == qtyping.QuantGranularity.BLOCKWISE:
+    # Blockwise quantization uses float16 scale, with 7 bit mantissa,
+    # so the maximum representable value is 65280.
+    float16_max = np.broadcast_to(np.array(65280), min_value.shape)
+    clipping_values = (
+        float16_max
+        if clipping_values is None
+        else np.minimum(clipping_values, float16_max)
+    )
   if symmetric:
     bound = np.maximum(np.abs(min_value), np.abs(max_value))
     bound = np.maximum(bound, min_bound)
@@ -292,6 +416,12 @@ def tensor_zp_scale_from_min_max(
     zp = qmin - bound_min / scale
     zp = np.rint(zp)
+  if granularity == qtyping.QuantGranularity.BLOCKWISE:
+    # Round the scale values to 7 bit mantissa.
+    scale = (
+        scale.astype(ml_dtypes.bfloat16).astype(np.float16).astype(np.float32)
+    )
   # It's safe to cast zp to qtype without clipping because we can infer
   # qmin <= zp <= qmax from bound_min <= 0 <= bound_max.
   zp = assign_quantized_type(zp, qtype)

ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py CHANGED Viewed

@@ -336,7 +336,11 @@ class TensorUtilsTest(parameterized.TestCase):
     max_val = np.max(self._test_data, keepdims=True)
     zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
-        min_val, max_val, num_bits, symmetric
+        min_val,
+        max_val,
+        num_bits,
+        symmetric,
+        qtyping.QuantGranularity.TENSORWISE,
     )
     self.assertEqual(zp.shape, scale.shape)
     max_q = 2**num_bits / 2 - 1
@@ -364,7 +368,12 @@ class TensorUtilsTest(parameterized.TestCase):
     max_val = np.array([[5.0]])
     clipping_values = np.array([4.0])
     zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
-        min_val, max_val, num_bits, symmetric, clipping_values
+        min_val,
+        max_val,
+        num_bits,
+        symmetric,
+        qtyping.QuantGranularity.TENSORWISE,
+        clipping_values,
     )
     expected_scale = clipping_values / quantized_bound

ai_edge_quantizer/algorithms/utils/common_utils.py CHANGED Viewed

@@ -905,23 +905,36 @@ def get_tensor_transformation_params(
   )
-def get_weight_quantized_dim(op_info: qtyping.OpInfo, tensor_data: np.ndarray):
+def get_weight_quantized_dim(
+    op_info: qtyping.OpInfo,
+    tensor_data: np.ndarray,
+    granularity: qtyping.QuantGranularity,
+):
   """Get the quantized dimension for the weight tensor.
   Args:
     op_info: Aggregated information about the op (e.g., quantization config).
     tensor_data: The weight tensor data.
+    granularity: The granularity of the weight tensor.
   Returns:
     The quantized dimension for the weight tensor.
   """
-  if op_info.op_name == _TFLOpName.BATCH_MATMUL:
-    quantized_dim = get_bmm_weight_quantized_dim(
-        tensor_data, adj_y=op_info.op.builtinOptions.adjY
-    )
-  else:
-    quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_WEIGHT_QUANTIZED_DIM.get(
-        op_info.op_name, None
+  quantized_dim = None
+  if granularity == qtyping.QuantGranularity.CHANNELWISE:
+    if op_info.op_name == _TFLOpName.BATCH_MATMUL:
+      quantized_dim = get_bmm_weight_quantized_dim(
+          tensor_data, adj_y=op_info.op.builtinOptions.adjY
+      )
+    else:
+      quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_WEIGHT_QUANTIZED_DIM.get(
+          op_info.op_name, None
+      )
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE:
+    quantized_dim = (
+        tfl_flatbuffer_utils.TFL_OP_TO_BLOCKWISE_WEIGHT_QUANTIZED_DIM[
+            op_info.op_name
+        ]
     )
   return quantized_dim

{ai_edge_quantizer_nightly-0.1.0.dev20250513.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250515.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ai-edge-quantizer-nightly
-Version: 0.1.0.dev20250513
+Version: 0.1.0.dev20250515
 Summary: A quantizer for advanced developers to quantize converted AI Edge models.
 Home-page: https://github.com/google-ai-edge/ai-edge-quantizer
 Keywords: On-Device ML,AI,Google,TFLite,Quantization,LLMs,GenAI

{ai_edge_quantizer_nightly-0.1.0.dev20250513.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250515.dist-info}/RECORD RENAMED Viewed

@@ -28,20 +28,20 @@ ai_edge_quantizer/algorithms/nonlinear_quantize/__init__.py,sha256=lpq1g2ayg3lCP
 ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting.py,sha256=Bs9CK7wZAw6jNaZ8xEtbwO2vM34VYXNZSMVWvxJo9nw,9297
 ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting_test.py,sha256=EqIHGEZ1LgUrTN7zf880RuAzEv3Qy7kgh5ivObJGHSo,22646
 ai_edge_quantizer/algorithms/uniform_quantize/__init__.py,sha256=lpq1g2ayg3lCPLy79t2VicYcnGKw64FfYIj1V7J-4m8,676
-ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py,sha256=-ugXQ4cZoVMrgOVs4m73ozI-49CRyT0YuKrLS5begW8,28297
-ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py,sha256=qMmKbWqxrCoVKbLKHn9WuCrGKPfHkEyU0Nmhokh8Qeo,2597
-ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py,sha256=Fk3s9Qy2A_hjUepFOUmTwIZ_wKYVPbdDX4eoP-eoAQU,8726
+ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py,sha256=NpZ-JvZt2OhpTqH7Z81YYVjzOX_pHoDCt8rr3VIXJUY,28665
+ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py,sha256=GGf_n3wIeg3GB_eGsmyNJ0fTcxgpeMMbugTMRONK6TQ,3553
+ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py,sha256=BDdn_uBZakfHyzdMJPKadsOqxqyC-s6W2ZzFH99L4fE,8652
 ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery_test.py,sha256=sT5eX5TLZEHTtPfnSkCPDlS0sQxlTFWbCsbvOuj--yY,8889
-ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation.py,sha256=f9HhFCAavbrdYkQQH37ivbKRuRXC1g1TO2FmILMApN8,12389
-ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation_test.py,sha256=kN9aCPt1yTleiDBiH4g2RZ1vMBm7WAf5pmVFjmYCH-0,7617
-ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py,sha256=Divlsn3NjNGtH0vlvE91wxL-VHb4q1nUE0JTDGiEtYc,8572
+ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation.py,sha256=pN4hwggrdI4eBdqvsdwnFagFxpd4D8LkWK0o4HG_xxk,12536
+ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation_test.py,sha256=MajG6DqpP4HvVzcZwgiKojWL3RBxCpkU3u2mKyeB0hA,9191
+ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py,sha256=8_tNLTbOWTKId4DfHBjkOR9RvELUyIpxlGxKu7tv5Ko,7556
 ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py,sha256=zoF_EHjYqsKkuev8wfuutIITEmp_maa70IpJI_Df3ck,7431
-ai_edge_quantizer/algorithms/uniform_quantize/octav.py,sha256=e5wYtki-vl739gSVAZHAKcs2hA87GvFUjVoSUPlnkyM,6433
-ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py,sha256=IcTOaJ1pxtqsitqxOEP9LROVEP_19VFutHalqNied4I,6940
-ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py,sha256=WmZzKQlzfu9gFr9SbUDoPY3rFqTl363om8-0rTLwotw,11629
-ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py,sha256=G2PFpHhF-6OOuAwQ1lei63QEIm7uzIZJ62qpgA02qTM,12288
+ai_edge_quantizer/algorithms/uniform_quantize/octav.py,sha256=Umxh4kJyeHddZf-Wd4aXE5MTI1XWFa5KRuM17uYU714,6922
+ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py,sha256=sha1d99Xk87bI87tgz0g5LeDC-EeE4WMfM5rRC98-m4,9140
+ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py,sha256=W2QbXP96xeleAmA7qFwco1iq_bOtArGDK6Qj_g6kNl8,15986
+ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py,sha256=MgG7Qh2_z4I6InBqEEDSVlaR0q48aMz4xqAlxeG2EMk,12436
 ai_edge_quantizer/algorithms/utils/__init__.py,sha256=lpq1g2ayg3lCPLy79t2VicYcnGKw64FfYIj1V7J-4m8,676
-ai_edge_quantizer/algorithms/utils/common_utils.py,sha256=4qSlVNx3-91kJufnnJV1RdVRXBPapylZkrAp2nywoao,34581
+ai_edge_quantizer/algorithms/utils/common_utils.py,sha256=UoZxeAQmZk3b3hK51KFwq6XfdbeduXVjdYIxAxlAzB8,34982
 ai_edge_quantizer/algorithms/utils/common_utils_test.py,sha256=zqapGEfYhjQWe9cNGPLmdbwtEUUYQRhlO_kNe0cXX6E,18104
 ai_edge_quantizer/transformations/__init__.py,sha256=lpq1g2ayg3lCPLy79t2VicYcnGKw64FfYIj1V7J-4m8,676
 ai_edge_quantizer/transformations/dequant_insert.py,sha256=sL1LHFVzBDSd9jgrzlHz38LWU0bwmVX7iBkaNcui0ts,3566
@@ -70,8 +70,8 @@ ai_edge_quantizer/utils/tfl_interpreter_utils.py,sha256=WoewyiZpaua80oP0tpgyrw5W
 ai_edge_quantizer/utils/tfl_interpreter_utils_test.py,sha256=6fjkM-rycZ95L4yfvlr0TN6RlrhfPzxNUYrZaYO_F0A,12013
 ai_edge_quantizer/utils/validation_utils.py,sha256=oYw33Sg547AqtGw-choPUJmp9SAKkV46J_ddqSsum2Q,3950
 ai_edge_quantizer/utils/validation_utils_test.py,sha256=V_qNDikPD4OPB-siOLQCWNVWTAu87h2IgNYt7teFd-o,2934
-ai_edge_quantizer_nightly-0.1.0.dev20250513.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-ai_edge_quantizer_nightly-0.1.0.dev20250513.dist-info/METADATA,sha256=zL_JxmjzCHEwIUmLkDGzI6B7IACt6YnVQSpaxaNUujY,1528
-ai_edge_quantizer_nightly-0.1.0.dev20250513.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-ai_edge_quantizer_nightly-0.1.0.dev20250513.dist-info/top_level.txt,sha256=8QTfPnFXNVUhScFLaa-NWZMFWMn72M50DVPubpwWB1g,18
-ai_edge_quantizer_nightly-0.1.0.dev20250513.dist-info/RECORD,,
+ai_edge_quantizer_nightly-0.1.0.dev20250515.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ai_edge_quantizer_nightly-0.1.0.dev20250515.dist-info/METADATA,sha256=Rwa9ls9ryiTwntWB8-SCfO_uYjWMj3bqPTjEhIiQMyo,1528
+ai_edge_quantizer_nightly-0.1.0.dev20250515.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ai_edge_quantizer_nightly-0.1.0.dev20250515.dist-info/top_level.txt,sha256=8QTfPnFXNVUhScFLaa-NWZMFWMn72M50DVPubpwWB1g,18
+ai_edge_quantizer_nightly-0.1.0.dev20250515.dist-info/RECORD,,

{ai_edge_quantizer_nightly-0.1.0.dev20250513.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250515.dist-info}/LICENSE RENAMED Viewed

File without changes

{ai_edge_quantizer_nightly-0.1.0.dev20250513.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250515.dist-info}/WHEEL RENAMED Viewed

File without changes

{ai_edge_quantizer_nightly-0.1.0.dev20250513.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250515.dist-info}/top_level.txt RENAMED Viewed

File without changes

ai-edge-quantizer-nightly 0.1.0.dev20250513__py3-none-any.whl → 0.1.0.dev20250515__py3-none-any.whl

ai-edge-quantizer-nightly 0.1.0.dev20250513__py3-none-any.whl → 0.1.0.dev20250515__py3-none-any.whl