PyPI - ai-edge-quantizer-nightly - Versions diffs - 0.0.1.dev20250220__py3-none-any.whl → 0.0.1.dev20250222__py3-none-any.whl - Mend

ai-edge-quantizer-nightly 0.0.1.dev20250220__py3-none-any.whl → 0.0.1.dev20250222__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py ADDED Viewed

@@ -0,0 +1,250 @@
+# Copyright 2024 The AI Edge Quantizer Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recovers quantized weights from dequantized weights (often from QAT)."""
+import dataclasses
+from typing import Any, Optional
+import numpy as np
+from ai_edge_quantizer import qtyping
+from ai_edge_quantizer.algorithms.uniform_quantize import naive_min_max_quantize
+from ai_edge_quantizer.algorithms.uniform_quantize import uniform_quantize_tensor
+from ai_edge_quantizer.algorithms.utils import common_utils
+ALGORITHM_KEY = "dequantized_weight_recovery"
+_TFLOpName = qtyping.TFLOperationName
+_QuantTransformation = qtyping.QuantTransformation
+_IntType = uniform_quantize_tensor.IntType
+def _validate_recovered_weights(
+    original_vals: np.ndarray,
+    quant_vals: np.ndarray,
+    scale: np.ndarray,
+    tol: float = 1e-4,
+):
+  """Validates if recovered weights (from the quantized values) are close enough to the original ones.
+  Args:
+    original_vals: Original values before quantization.
+    quant_vals: Quantized values.
+    scale: Scale used for quantization.
+    tol: Tolerance for the difference between original and recovered values.
+  Raises:
+    RuntimeError: If the maximum difference between original and recovered
+    values exceeds the tolerance.
+  """
+  recovered_vals = quant_vals * scale
+  diff = np.abs(recovered_vals - original_vals).flatten()
+  max_diff = diff.max()
+  if max_diff > tol:
+    raise RuntimeError(
+        "Failed to recover the original quantized values from dequantized"
+        f" values. Max diff between recovered and original values: {max_diff}"
+    )
+def _get_scale(arr: np.ndarray, min_scale: float) -> float:
+  """Helper function to calculate scale from a 1D array."""
+  # Make sure the array includes zero (symmetric quantization).
+  arr = np.append(arr, 0)
+  unique_vals = np.unique(arr)
+  if unique_vals.size > 1:
+    diffs = np.diff(unique_vals)
+    return float(
+        np.maximum(np.min(diffs), min_scale)
+    )  # Cast to float to ensure return type consistency
+  return min_scale
+def get_zp_scale_from_2d_dequantized_symmetric_weights(
+    dequant_vals: np.ndarray,
+    quantized_dimension: Optional[int] = None,
+    min_scale: float = 1e-9,
+) -> tuple[np.ndarray, np.ndarray]:
+  """Calculates scale and zero point from 2D dequantized, symmetric weights.
+  Handles both per-tensor and per-channel (axis) quantization.
+  Args:
+      dequant_vals: The 2D dequantized weight values (numpy array).
+      quantized_dimension:  The dimension along which quantization was performed
+        (0 or 1), or None for per-tensor quantization.
+      min_scale: The minimum allowed scale value.
+  Returns:
+      A tuple containing:
+          - zero_points: Zero points (all zeros for symmetric quantization).
+          - scales: Scales (scalar for per-tensor, array for per-channel).
+  Raises:
+      ValueError: If `dequant_vals` is not 2D, or if
+          `quantized_dimension` is not 0, 1, or None.
+  """
+  if dequant_vals.ndim != 2:
+    raise ValueError(
+        f"Only 2D weights are supported. Got {dequant_vals.ndim} dimensions."
+    )
+  if quantized_dimension not in (0, 1, None):
+    raise ValueError(
+        f"quantized_dimension must be 0, 1, or None. Got {quantized_dimension}"
+    )
+  # Use absolute values for symmetric quantization.
+  dequant_vals = np.abs(dequant_vals)
+  if quantized_dimension is None:
+    # Per-tensor quantization: One scale for the entire tensor.
+    scales = _get_scale(dequant_vals.flatten(), min_scale)
+    scales = np.array([[scales]])
+  else:
+    # Per-channel quantization: A scale for each slice along the dimension.
+    scales = []
+    for i in range(dequant_vals.shape[quantized_dimension]):
+      if quantized_dimension == 0:
+        vec = dequant_vals[i, :]
+      else:  # quantized_dimension == 1
+        vec = dequant_vals[:, i]
+      scales.append(_get_scale(vec, min_scale))
+    # Reshape for correct broadcasting.
+    scales = (
+        np.array(scales).reshape(-1, 1)
+        if quantized_dimension == 0
+        else np.array(scales).reshape(1, -1)
+    )
+  zero_points = np.zeros_like(scales, dtype=np.int32)
+  return zero_points, scales
+def get_tensor_quant_params(
+    op_info: qtyping.OpInfo,
+    tensor_quant_config: qtyping.TensorQuantizationConfig,
+    tensor_content: Optional[np.ndarray] = None,
+    tensor_qsv: Optional[dict[str, Any]] = None,
+) -> qtyping.UniformQuantParams:
+  """Get the quantization parameters for a tensor.
+  Args:
+    op_info: Aggregated information about the op (e.g., quantization config).
+    tensor_quant_config: The quantization config for the tensor.
+    tensor_content: The content of the tensor.
+    tensor_qsv: A dictionary containing the min/max of the tensor.
+  Returns:
+    The quantization parameters for the tensor.
+  Raises:
+    ValueError: If the quantization granularity is blockwise, or if the tensor
+    is not a 2D symmetric weight tensor.
+  """
+  # Fallback to naive_min_max_quantize.py for non-weight tensors.
+  if tensor_content is None:
+    return naive_min_max_quantize.get_tensor_quant_params(
+        op_info, tensor_quant_config, tensor_content, tensor_qsv
+    )
+  if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
+    raise ValueError(
+        "Blockwise quantization is not supported for dequantized weight"
+        " recovery."
+    )
+  if tensor_content.ndim != 2 or not tensor_quant_config.symmetric:
+    raise ValueError(
+        "Only 2D symmetric weights are supported for dequantized weight"
+        " recovery."
+    )
+  quantized_dim = None
+  if tensor_quant_config.granularity == qtyping.QuantGranularity.CHANNELWISE:
+    quantized_dim = common_utils.get_weight_quantized_dim(
+        op_info, tensor_content
+    )
+  zp, scale = get_zp_scale_from_2d_dequantized_symmetric_weights(
+      dequant_vals=tensor_content,
+      quantized_dimension=quantized_dim,
+  )
+  quant_params = qtyping.UniformQuantParams(
+      scale=scale,
+      zero_point=zp,
+      num_bits=tensor_quant_config.num_bits,
+      symmetric=tensor_quant_config.symmetric,
+      quantized_dimension=quantized_dim,
+  )
+  quantized_vars = uniform_quantize_tensor.uniform_quantize(
+      tensor_content, quant_params
+  )
+  _validate_recovered_weights(tensor_content, quantized_vars, scale)
+  return dataclasses.replace(quant_params, quantized_data=quantized_vars)
+def calibrate(
+    tfl_op: Any,
+    graph_info: qtyping.GraphInfo,
+    tensor_content_map: dict[str, np.ndarray],
+    inputs_to_ignore: Optional[list[int]] = None,
+    outputs_to_ignore: Optional[list[int]] = None,
+) -> dict[str, qtyping.QSV]:
+  """Collect quantization statistics variable (QSV, e.g., min/max) for the op.
+  Args:
+    tfl_op: The tfl operation.
+    graph_info: Graph information needed to perform quantization for the op.
+    tensor_content_map: A map of tensor name to tensor content.
+    inputs_to_ignore: Input tensor indices to ignore.
+    outputs_to_ignore: Output tensor indices to ignore.
+  Returns:
+    A dictionary with key as tensor name and value as the collected QSV.
+  """
+  # Reuse the min/max calibration algorithm from naive_min_max_quantize.py since
+  # only weights need to be handled differently.
+  return naive_min_max_quantize.min_max_calibrate(
+      tfl_op,
+      graph_info,
+      tensor_content_map,
+      inputs_to_ignore,
+      outputs_to_ignore,
+  )
+def init_qsvs(
+    op_info: qtyping.OpInfo,
+    graph_info: qtyping.GraphInfo,
+    inputs_to_ignore: Optional[list[int]] = None,
+    outputs_to_ignore: Optional[list[int]] = None,
+) -> qtyping.QSV:
+  """Initialize the QSVs.
+  Args:
+    op_info: Aggregated information about the op (e.g., quantization config).
+    graph_info: Graph information needed to perform quantization for the op.
+    inputs_to_ignore: Input tensor indices to ignore.
+    outputs_to_ignore: Output tensor indices to ignore.
+  Returns:
+    QSVs.
+  """
+  # Reuse the min/max calibration algorithm from naive_min_max_quantize.py since
+  # only weights need to be handled differently.
+  return naive_min_max_quantize.init_qsvs(
+      op_info, graph_info, inputs_to_ignore, outputs_to_ignore
+  )

ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery_test.py ADDED Viewed

@@ -0,0 +1,215 @@
+# Copyright 2024 The AI Edge Quantizer Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from absl.testing import parameterized
+import numpy as np
+from tensorflow.python.platform import googletest
+from ai_edge_quantizer import qtyping
+from ai_edge_quantizer.algorithms.uniform_quantize import dequantized_weight_recovery
+_TFLOpName = qtyping.TFLOperationName
+_TensorQuantConfig = qtyping.TensorQuantizationConfig
+class DequantizedWeightRecoveryTest(parameterized.TestCase):
+  def setUp(self):
+    super().setUp()
+    self._dummy_quantized_weights = np.array([
+        [1, -2, 3, 4],
+        [6, 7, -6, 5],
+        [2, -6, -7, -4],
+    ])
+    self._dummy_op_info = qtyping.OpInfo(
+        op=None,
+        op_name=_TFLOpName.FULLY_CONNECTED,
+        subgraph_op_index=0,
+        op_quant_config=qtyping.OpQuantizationConfig(),
+    )
+  @parameterized.named_parameters(
+      dict(
+          testcase_name="per-tensor-recovery",
+          quantized_dimension=None,
+          scale=np.array([0.1875]).reshape(1, 1),
+      ),
+      dict(
+          testcase_name="channel0-recovery",
+          quantized_dimension=0,
+          scale=np.array([0.1875, 1e-4, 12.3]).reshape(3, 1),
+      ),
+      dict(
+          testcase_name="channel1-recovery",
+          quantized_dimension=1,
+          scale=np.array([0.003, 1.234, 12.65, 2.24e-4]).reshape(1, 4),
+      ),
+  )
+  def test_tensor_zp_scale_from_2d_dequantized_symmetric_weights_success(
+      self, quantized_dimension, scale
+  ):
+    dequant_vals = scale * self._dummy_quantized_weights
+    zp, recovered_scale = (
+        dequantized_weight_recovery.get_zp_scale_from_2d_dequantized_symmetric_weights(
+            dequant_vals, quantized_dimension
+        )
+    )
+    self.assertEqual(recovered_scale.shape, scale.shape)
+    self.assertSequenceAlmostEqual(recovered_scale.flatten(), scale.flatten())
+    # Zero point should be zero for symmetric quantization.
+    self.assertEqual(np.sum(zp), 0)
+    self.assertEqual(zp.shape, scale.shape)
+  def test_tensor_zp_scale_from_2d_dequantized_symmetric_weights_raises_error_for_non_2d_weights(
+      self,
+  ):
+    weights_3d = self._dummy_quantized_weights.reshape(1, 3, 4)
+    weights_3d = weights_3d * 1.02
+    with self.assertRaisesRegex(
+        ValueError, "Only 2D weights are supported. Got 3 dimensions."
+    ):
+      dequantized_weight_recovery.get_zp_scale_from_2d_dequantized_symmetric_weights(
+          weights_3d, quantized_dimension=None
+      )
+  @parameterized.named_parameters(
+      dict(testcase_name="negative_dimension", quantized_dimension=-1),
+      dict(testcase_name="too_large_dimension", quantized_dimension=2),
+  )
+  def test_tensor_zp_scale_from_2d_dequantized_symmetric_weights_raises_error_for_invalid_quantized_dimension(
+      self, quantized_dimension
+  ):
+    dequant_vals = self._dummy_quantized_weights * 1.02
+    with self.assertRaisesRegex(
+        ValueError, "quantized_dimension must be 0, 1, or None. Got"
+    ):
+      dequantized_weight_recovery.get_zp_scale_from_2d_dequantized_symmetric_weights(
+          dequant_vals, quantized_dimension
+      )
+  @parameterized.named_parameters(
+      dict(
+          testcase_name="tensor-recovery-tensor-quant",
+          tensor_quant_config=qtyping.TensorQuantizationConfig(
+              num_bits=4,
+              granularity=qtyping.QuantGranularity.TENSORWISE,
+          ),
+          scale=np.array([0.1875]).reshape(1, 1),
+      ),
+      dict(
+          testcase_name="channel-recovery-channel-quant",
+          tensor_quant_config=qtyping.TensorQuantizationConfig(
+              num_bits=4,
+              granularity=qtyping.QuantGranularity.CHANNELWISE,
+          ),
+          scale=np.array([0.1875, 1e-4, 12.3]).reshape(3, 1),
+      ),
+      dict(
+          testcase_name="channel-recovery-excessive-bits",
+          tensor_quant_config=qtyping.TensorQuantizationConfig(
+              num_bits=8,  # int4 is enough for the sample weights.
+              granularity=qtyping.QuantGranularity.CHANNELWISE,
+          ),
+          scale=np.array([0.1875, 1e-4, 12.3]).reshape(3, 1),
+      ),
+  )
+  def test_get_tensor_quant_params_success_with_dequantized_weights(
+      self, tensor_quant_config, scale
+  ):
+    dequant_vals = scale * self._dummy_quantized_weights
+    tensor_quant_params = dequantized_weight_recovery.get_tensor_quant_params(
+        self._dummy_op_info, tensor_quant_config, dequant_vals
+    )
+    if tensor_quant_config.granularity is qtyping.QuantGranularity.TENSORWISE:
+      self.assertIsNone(tensor_quant_params.quantized_dimension)
+    else:
+      self.assertEqual(tensor_quant_params.quantized_dimension, 0)
+    recovered_scale = tensor_quant_params.scale
+    self.assertEqual(recovered_scale.shape, scale.shape)
+    self.assertSequenceAlmostEqual(recovered_scale.flatten(), scale.flatten())
+    # Zero point should be zero for symmetric quantization.
+    recovered_zp = tensor_quant_params.zero_point
+    self.assertEqual(np.sum(recovered_zp), 0)
+    self.assertEqual(recovered_zp.shape, scale.shape)
+  def test_get_tensor_quant_params_success_with_qsv(self):
+    # Fall back to naive_min_max_quantize.py for non-weight tensors.
+    tensor_quant_params = dequantized_weight_recovery.get_tensor_quant_params(
+        self._dummy_op_info,
+        tensor_quant_config=qtyping.TensorQuantizationConfig(
+            num_bits=8,
+            granularity=qtyping.QuantGranularity.TENSORWISE,
+        ),
+        tensor_qsv={
+            "min": np.array([-1]),
+            "max": np.array([1]),
+        },
+    )
+    self.assertIsNone(tensor_quant_params.quantized_dimension)
+    recovered_scale = tensor_quant_params.scale
+    self.assertEqual(recovered_scale.shape, (1,))
+    self.assertSequenceAlmostEqual(recovered_scale.flatten(), [1 / 127])
+    # Zero point should be zero for symmetric quantization.
+    recovered_zp = tensor_quant_params.zero_point
+    self.assertEqual(np.sum(recovered_zp), 0)
+    self.assertEqual(recovered_zp.shape, (1,))
+  @parameterized.named_parameters(
+      dict(
+          testcase_name="recovery_on_wrong_dimension",
+          tensor_quant_config=qtyping.TensorQuantizationConfig(
+              num_bits=4,
+              granularity=qtyping.QuantGranularity.CHANNELWISE,
+          ),
+          scale=np.array([0.003, 1.234, 12.65, 2.24e-4]).reshape(1, 4),
+      ),
+      dict(
+          testcase_name="tensor_recovery_for_channel_quantization",
+          tensor_quant_config=qtyping.TensorQuantizationConfig(
+              num_bits=4,
+              granularity=qtyping.QuantGranularity.TENSORWISE,
+          ),
+          scale=np.array([0.1875, 1e-2, 12.3]).reshape(3, 1),
+      ),
+      dict(
+          testcase_name="insufficient_bits",
+          tensor_quant_config=qtyping.TensorQuantizationConfig(
+              num_bits=2,
+              granularity=qtyping.QuantGranularity.CHANNELWISE,
+          ),
+          scale=np.array([0.1875, 1e-2, 12.3]).reshape(3, 1),
+      ),
+  )
+  def test_get_tensor_quant_params_raises_error_big_recovery_error(
+      self, tensor_quant_config, scale
+  ):
+    dequant_vals = scale * self._dummy_quantized_weights
+    with self.assertRaisesRegex(
+        RuntimeError,
+        "Failed to recover the original quantized values from dequantized"
+        " values. Max diff between recovered and original values: ",
+    ):
+      dequantized_weight_recovery.get_tensor_quant_params(
+          self._dummy_op_info, tensor_quant_config, dequant_vals
+      )
+if __name__ == "__main__":
+  googletest.main()

ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py CHANGED Viewed

@@ -15,6 +15,7 @@
 """Performs naive min/max uniform quantization."""
+from collections.abc import Sequence
 from typing import Any, Optional
 import numpy as np
 from ai_edge_quantizer import qtyping
@@ -36,55 +37,133 @@ def _init_tensor_min_max(
   if tensor_data is None:
     return {}
   else:
+    weight_tensor_config = op_info.op_quant_config.weight_tensor_config
     quantized_dim = None
+    if weight_tensor_config is not None and (
+        weight_tensor_config.granularity == qtyping.QuantGranularity.CHANNELWISE
+        or weight_tensor_config.granularity
+        == qtyping.QuantGranularity.BLOCKWISE
+    ):
+      quantized_dim = common_utils.get_weight_quantized_dim(
+          op_info, tensor_data
+      )
     if (
-        op_info.op_quant_config.weight_tensor_config is not None
-        and op_info.op_quant_config.weight_tensor_config.granularity
+        weight_tensor_config is not None
+        and weight_tensor_config.granularity
         == qtyping.QuantGranularity.BLOCKWISE
     ):
-      # TODO(b/346612503): emulate subchannel only supports fully connected,
-      # will skip special handling. Once we have a spec, we can change this.
-      block_size = op_info.op_quant_config.weight_tensor_config.block_size
-      # assuming tensor is 2D, which is correct for FULLY_CONNECTED
-      transposed_tensor_data = np.transpose(tensor_data, (1, 0))
-      if transposed_tensor_data.shape[0] % block_size:
-        raise ValueError(
-            f"Block size {block_size} does not divide channel dimension"
-            f" {transposed_tensor_data.shape[0]}."
-        )
-      reshaped_tensor_data = np.reshape(
-          transposed_tensor_data,
-          (
-              1,
-              int(transposed_tensor_data.shape[0] / block_size),
-              block_size,
-              transposed_tensor_data.shape[1],
-          ),
+      reshaped_data, reduce_dims = _reshape_data_for_blockwise(
+          tensor_data,
+          quantized_dim,
+          weight_tensor_config.block_size,
       )
       return {
-          "min": np.min(reshaped_tensor_data, axis=(0, 1, 2), keepdims=True),
-          "max": np.max(reshaped_tensor_data, axis=(0, 1, 2), keepdims=True),
+          "min": np.min(reshaped_data, axis=reduce_dims, keepdims=False),
+          "max": np.max(reshaped_data, axis=reduce_dims, keepdims=False),
       }
-    if (
-        op_info.op_quant_config.weight_tensor_config is not None
-        and op_info.op_quant_config.weight_tensor_config.granularity
-        == qtyping.QuantGranularity.CHANNELWISE
-    ):
-      if op_info.op_name == _TFLOpName.BATCH_MATMUL:
-        quantized_dim = common_utils.get_bmm_weight_quantized_dim(
-            tensor_data, adj_y=op_info.op.builtinOptions.adjY
-        )
-      else:
-        quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_WEIGHT_QUANTIZED_DIM.get(
-            op_info.op_name, None
-        )
-    reduce_dims = common_utils.get_reduce_dims(
-        quantized_dim, list(tensor_data.shape)
-    )
-    return {
-        "min": np.min(tensor_data, axis=reduce_dims, keepdims=True),
-        "max": np.max(tensor_data, axis=reduce_dims, keepdims=True),
-    }
+    else:
+      reduce_dims = common_utils.get_reduce_dims(
+          quantized_dim, tensor_data.shape
+      )
+      return {
+          "min": np.min(tensor_data, axis=reduce_dims, keepdims=True),
+          "max": np.max(tensor_data, axis=reduce_dims, keepdims=True),
+      }
+def _get_tensor_shape_for_blockwise(
+    tensor_shape: Sequence[int], quantized_dim: int, block_size: int
+) -> list[int]:
+  """Get the tensor shape for blockwise quantization.
+  This function splits the quantized dimension of the tensor into two
+  dimensions: the number of blocks (dim / block_size) followed by the block
+  size. Hence, min/max of the tensor can be calculated for each block using
+  existing functions.
+  Args:
+    tensor_shape: The original shape of the tensor.
+    quantized_dim: The dimension to be quantized blockwise.
+    block_size: The size of the block.
+  Returns:
+    The new tensor shape for calculating scale and zp for blockwise
+    quantization.
+  """
+  new_shape = []
+  for index, val in enumerate(tensor_shape):
+    if index == quantized_dim:
+      new_shape.append(int(val / block_size))
+      new_shape.append(block_size)
+    else:
+      new_shape.append(val)
+  return new_shape
+def _reshape_data_for_blockwise(
+    tensor_data: np.ndarray, quantized_dim: int, block_size: int
+) -> tuple[np.ndarray, int]:
+  """Reshapes data for blockwise quantization.
+  Args:
+    tensor_data: The original tensor data.
+    quantized_dim: The dimension to be quantized blockwise.
+    block_size: The size of the block.
+  Returns:
+    A tuple containing the reshaped tensor data and the new reduce dimension.
+  """
+  new_shape = _get_tensor_shape_for_blockwise(
+      tensor_data.shape, quantized_dim, block_size
+  )
+  reshaped_data = tensor_data.reshape(new_shape)
+  return reshaped_data, quantized_dim + 1
+def _broadcast_scale_zp_for_blockwise(
+    tensor_content: np.ndarray,
+    quant_params: qtyping.UniformQuantParams,
+) -> qtyping.UniformQuantParams:
+  """Broadcasts scale and zp for blockwise quantization.
+  Args:
+    tensor_content: The original tensor data.
+    quant_params: The quantization parameters.
+  Returns:
+    The updated quantization parameters with broadcasted scale and zp for
+    correct constant quantization.
+  """
+  if quant_params.quantized_dimension is None:
+    raise ValueError("Quantized dimension must be specified.")
+  if quant_params.block_size is None or quant_params.block_size <= 0:
+    raise ValueError("Block size must be specified and positive.")
+  quantized_dim = quant_params.quantized_dimension
+  expanded_tensor_shape = _get_tensor_shape_for_blockwise(
+      tensor_content.shape, quantized_dim, quant_params.block_size
+  )
+  expanded_scale = np.reshape(
+      np.broadcast_to(
+          np.expand_dims(quant_params.scale, quantized_dim + 1),
+          expanded_tensor_shape,
+      ),
+      tensor_content.shape,
+  )
+  expanded_zp = np.reshape(
+      np.broadcast_to(
+          np.expand_dims(quant_params.zero_point, quantized_dim + 1),
+          expanded_tensor_shape,
+      ),
+      tensor_content.shape,
+  )
+  return qtyping.UniformQuantParams(
+      scale=expanded_scale,
+      zero_point=expanded_zp,
+      num_bits=quant_params.num_bits,
+      symmetric=quant_params.symmetric,
+      quantized_dimension=quantized_dim,
+      block_size=quant_params.block_size,
+  )
 def get_tensor_quant_params(
@@ -138,34 +217,34 @@ def get_tensor_quant_params(
       tensor_quant_config.symmetric,
   )
   quantized_dim = None
-  if tensor_quant_config.granularity == qtyping.QuantGranularity.CHANNELWISE:
-    if op_info.op_name == _TFLOpName.BATCH_MATMUL:
-      quantized_dim = common_utils.get_bmm_weight_quantized_dim(
-          tensor_content, adj_y=op_info.op.builtinOptions.adjY
-      )
-    else:
-      quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_WEIGHT_QUANTIZED_DIM[
-          op_info.op_name
-      ]
+  if (
+      tensor_quant_config.granularity == qtyping.QuantGranularity.CHANNELWISE
+      or tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE
+  ):
+    quantized_dim = common_utils.get_weight_quantized_dim(
+        op_info, tensor_content
+    )
   quant_params = qtyping.UniformQuantParams(
       scale=scale,
       zero_point=zp,
       num_bits=tensor_quant_config.num_bits,
       symmetric=tensor_quant_config.symmetric,
       quantized_dimension=quantized_dim,
+      block_size=tensor_quant_config.block_size,
   )
   if tensor_content is None:
     return quant_params
+  # The reshaping for blockwise quantization is unique hence we do this here
+  # to avoid unexpected broadcast behavior downstream.
   if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
-    quantized_vars = (
-        uniform_quantize_tensor.uniform_quantize_for_emulated_subchannel(
-            tensor_content, quant_params, tensor_quant_config.block_size
-        )
-    )
-  else:
-    quantized_vars = uniform_quantize_tensor.uniform_quantize(
+    quant_params = _broadcast_scale_zp_for_blockwise(
         tensor_content, quant_params
     )
+  quantized_vars = uniform_quantize_tensor.uniform_quantize(
+      tensor_content, quant_params
+  )
   # Update with quantized values.
   return qtyping.UniformQuantParams(
       scale=scale,
@@ -174,6 +253,7 @@ def get_tensor_quant_params(
       symmetric=tensor_quant_config.symmetric,
       quantized_dimension=quantized_dim,
       quantized_data=quantized_vars,
+      block_size=tensor_quant_config.block_size,
   )

ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py CHANGED Viewed

@@ -14,6 +14,7 @@
 # ==============================================================================
 import os
+from typing import cast
 from absl.testing import parameterized
 import numpy as np
@@ -21,6 +22,7 @@ import numpy as np
 from tensorflow.python.platform import googletest
 from ai_edge_quantizer import qtyping
 from ai_edge_quantizer.algorithms.uniform_quantize import naive_min_max_quantize
+from ai_edge_quantizer.algorithms.uniform_quantize import uniform_quantize_tensor
 from ai_edge_quantizer.utils import test_utils
 from ai_edge_quantizer.utils import tfl_flatbuffer_utils
@@ -157,6 +159,49 @@ class NaiveMinMaxQuantizeTest(parameterized.TestCase):
     self.assertNotIn("arith.constant1", op_qsvs)
     self.assertNotIn("arith.constant2", op_qsvs)
+  def test_get_tensor_quant_params_for_blockwise_weight(self):
+    subgraph0 = self._test_model.subgraphs[0]
+    subgraph_op_index = 3
+    fc_op = subgraph0.operators[subgraph_op_index]
+    weight_tensor_config = _TensorQuantConfig(
+        num_bits=4,
+        symmetric=True,
+        granularity=qtyping.QuantGranularity.BLOCKWISE,
+        block_size=2,
+    )
+    op_info = qtyping.OpInfo(
+        op=fc_op,
+        op_name=_TFLOpName.FULLY_CONNECTED,
+        subgraph_op_index=subgraph_op_index,
+        op_quant_config=qtyping.OpQuantizationConfig(
+            weight_tensor_config=weight_tensor_config,
+        ),
+    )
+    test_data = np.array([[-7, 7], [4, -4], [4, -4], [7, 7]])
+    quant_params = naive_min_max_quantize.get_tensor_quant_params(
+        op_info=op_info,
+        tensor_quant_config=weight_tensor_config,
+        tensor_content=test_data,
+    )
+    scale = quant_params.scale
+    zp = quant_params.zero_point
+    expected_zp, expected_scale = (
+        uniform_quantize_tensor.tensor_zp_scale_from_min_max(
+            min_value=np.array([[-7, 4], [-4, -4]]),
+            max_value=np.array([[4, 7], [7, 7]]),
+            num_bits=4,
+            symmetric=True,
+        )
+    )
+    self.assertTrue(np.array_equal(zp, expected_zp))
+    self.assertTrue(np.array_equal(scale, expected_scale))
+    self.assertIsNotNone(quant_params.quantized_data)
+    self.assertTupleEqual(
+        cast(np.ndarray, quant_params.quantized_data).shape, test_data.shape
+    )
+    self.assertEqual(quant_params.block_size, 2)
+    self.assertEqual(quant_params.quantized_dimension, 0)
 if __name__ == "__main__":
   googletest.main()

ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py CHANGED Viewed

@@ -119,55 +119,6 @@ def fix_quantization_params_rank(
   )
-def uniform_quantize_for_emulated_subchannel(
-    tensor_data: np.ndarray,
-    quantization_params: qtyping.UniformQuantParams,
-    block_size: int,
-) -> np.ndarray:
-  """Uniform quantize a tensor for emulated subchannel.
-  emulation involves reshaping the tensor and quantizing value on a different
-  axes. Hence, we use a different quantization function.
-  Args:
-    tensor_data: The tensor to be quantized.
-    quantization_params: The quantization parameters.
-    block_size: The block size of the emulated subchannel.
-  Returns:
-    The quantized tensor.
-  """
-  scales, zero_points = (
-      quantization_params.scale,
-      quantization_params.zero_point,
-  )
-  transposed_and_reshaped_tensor = np.reshape(
-      np.transpose(tensor_data, (1, 0)),
-      (
-          1,
-          int(tensor_data.shape[1] / block_size),
-          block_size,
-          tensor_data.shape[0],
-      ),
-  )
-  inverse_scales = 1.0 / scales
-  qtype = IntType(quantization_params.num_bits, signed=True)
-  # Symmetric means narrow range (e.g., -127 to 127)
-  narrow_range = quantization_params.symmetric
-  required_dtype = np.signedinteger if qtype.signed else np.unsignedinteger
-  if not np.issubdtype(zero_points.dtype, required_dtype):
-    raise ValueError(
-        f"zero_points need to be {required_dtype}."
-        f" But the actual type is {zero_points.dtype}."
-    )
-  ret = (
-      np.multiply(transposed_and_reshaped_tensor, inverse_scales) + zero_points
-  )
-  ret = _round_and_clip(ret, qtype, narrow_range)
-  ret = assign_quantized_type(ret, qtype)
-  return ret
 def uniform_quantize(
     tensor_data: np.ndarray,
     quantization_params: qtyping.UniformQuantParams,
@@ -369,3 +320,14 @@ def _is_valid_quantization_params(
         f" ({zero_point_rank}) must be the same as the tensor rank"
         f" ({tensor_rank})."
     )
+  if (
+      quantization_params.block_size != 0
+      and tensor_data.shape[quantization_params.quantized_dimension]
+      % quantization_params.block_size
+      != 0
+  ):
+    raise ValueError(
+        "Tensor dimension must be divisible by block size. Got dimension:"
+        f" {tensor_data.shape[quantization_params.quantized_dimension]} and"
+        f" block size: {quantization_params.block_size}"
+    )

ai_edge_quantizer/algorithms/utils/common_utils.py CHANGED Viewed

@@ -906,9 +906,30 @@ def get_tensor_transformation_params(
   )
+def get_weight_quantized_dim(op_info: qtyping.OpInfo, tensor_data: np.ndarray):
+  """Get the quantized dimension for the weight tensor.
+  For BATCH_MATMUL the dimension is computed by `get_bmm_weight_quantized_dim`
+  from the tensor data and the op's `adjY` builtin option. For every other op
+  it is looked up by op name in
+  `tfl_flatbuffer_utils.TFL_OP_TO_WEIGHT_QUANTIZED_DIM`.
+  Args:
+    op_info: Aggregated information about the op (e.g., quantization config).
+      `op_info.op_name` selects the code path; `op_info.op.builtinOptions` is
+      read only on the BATCH_MATMUL path.
+    tensor_data: The weight tensor data. Only used on the BATCH_MATMUL path.
+  Returns:
+    The quantized dimension for the weight tensor. For non-BATCH_MATMUL ops
+    this is None when the op name has no entry in the lookup table.
+  """
+  if op_info.op_name == _TFLOpName.BATCH_MATMUL:
+    # BATCH_MATMUL does not use the static table: the helper derives the
+    # dimension from the tensor itself together with the adjY option.
+    quantized_dim = get_bmm_weight_quantized_dim(
+        tensor_data, adj_y=op_info.op.builtinOptions.adjY
+    )
+  else:
+    # Static per-op lookup; ops missing from the table yield None.
+    quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_WEIGHT_QUANTIZED_DIM.get(
+        op_info.op_name, None
+    )
+  return quantized_dim
 def get_reduce_dims(
     quantized_dim: Optional[int],
-    tensor_shape: list[int],
+    tensor_shape: Sequence[int],
 ) -> Optional[tuple[int, ...]]:
   """Get the reduce dims of a tensor for the given quantized dimension."""
   if quantized_dim is None:

{ai_edge_quantizer_nightly-0.0.1.dev20250220.dist-info → ai_edge_quantizer_nightly-0.0.1.dev20250222.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ai-edge-quantizer-nightly
-Version: 0.0.1.dev20250220
+Version: 0.0.1.dev20250222
 Summary: A quantizer for advanced developers to quantize converted AI Edge models.
 Home-page: https://github.com/google-ai-edge/ai-edge-quantizer
 Keywords: On-Device ML,AI,Google,TFLite,Quantization,LLMs,GenAI

{ai_edge_quantizer_nightly-0.0.1.dev20250220.dist-info → ai_edge_quantizer_nightly-0.0.1.dev20250222.dist-info}/RECORD RENAMED Viewed

@@ -30,12 +30,14 @@ ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting_test.py,sha256=s64
 ai_edge_quantizer/algorithms/uniform_quantize/__init__.py,sha256=lpq1g2ayg3lCPLy79t2VicYcnGKw64FfYIj1V7J-4m8,676
 ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py,sha256=wPZevOuowJczG9t4Gynzv7tIeH6zhOnaKPsfr2K_fsk,21259
 ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py,sha256=qMmKbWqxrCoVKbLKHn9WuCrGKPfHkEyU0Nmhokh8Qeo,2597
-ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py,sha256=WaN6w-DqQkSwNl8xsbsSPPY97oKohHpo-5Ng_5yAerw,9958
-ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py,sha256=3UV1I-to_u6NE_yKoXOVUOQgil-tMY6VQ_L273lMfqQ,5949
-ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py,sha256=B-s1KMfb9tqvaDhHJV-M2zRR078z5Mwv-P9h77S3Mis,12229
+ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py,sha256=OTXjEZ3Ctq3ffYzisX-6HwgK_DuA7uos_aap5PiIUPE,8686
+ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery_test.py,sha256=y7BK11fkF63Ex_Jzg3fbIdy0D_Ca6HuvChVZR7Uwggc,8073
+ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py,sha256=fBqSidFVKZmdO-xIFfwZPdIN1eLJjOik8mUZxZj2ljk,12149
+ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py,sha256=Hok09dloSyBfD0oDM5VABdSZjM9JWSQhm_hDHNbFujA,7640
+ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py,sha256=Q_vx7YN7KMpjubsngxRdJ4bfdSIV-gmXjtVuxIkZuX4,11078
 ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py,sha256=WZ4_bvbG999nOtCIqn7mrMnpRdoJOdiyzxhsL_QiPHA,11395
 ai_edge_quantizer/algorithms/utils/__init__.py,sha256=lpq1g2ayg3lCPLy79t2VicYcnGKw64FfYIj1V7J-4m8,676
-ai_edge_quantizer/algorithms/utils/common_utils.py,sha256=Z2ziBeADwov8rRN4pRX6Qr2L_agu8RRAbOKw0_yLG7E,33936
+ai_edge_quantizer/algorithms/utils/common_utils.py,sha256=nlLpUY1LTO9ZC3X0FjQ0EArCZekGUnv2-IF0AUu5zNM,34582
 ai_edge_quantizer/algorithms/utils/common_utils_test.py,sha256=zqapGEfYhjQWe9cNGPLmdbwtEUUYQRhlO_kNe0cXX6E,18104
 ai_edge_quantizer/transformations/__init__.py,sha256=lpq1g2ayg3lCPLy79t2VicYcnGKw64FfYIj1V7J-4m8,676
 ai_edge_quantizer/transformations/dequant_insert.py,sha256=sL1LHFVzBDSd9jgrzlHz38LWU0bwmVX7iBkaNcui0ts,3566
@@ -58,8 +60,8 @@ ai_edge_quantizer/utils/tfl_interpreter_utils.py,sha256=SM8H4i7Jq_nfdsJpImopHndN
 ai_edge_quantizer/utils/tfl_interpreter_utils_test.py,sha256=Op3JxtOqlrjzmYF18jnnstL1k9xiY9kKJ8S2vklKGkc,11327
 ai_edge_quantizer/utils/validation_utils.py,sha256=oYw33Sg547AqtGw-choPUJmp9SAKkV46J_ddqSsum2Q,3950
 ai_edge_quantizer/utils/validation_utils_test.py,sha256=V_qNDikPD4OPB-siOLQCWNVWTAu87h2IgNYt7teFd-o,2934
-ai_edge_quantizer_nightly-0.0.1.dev20250220.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-ai_edge_quantizer_nightly-0.0.1.dev20250220.dist-info/METADATA,sha256=wc1t3VKLcToSVZ6MOwmrcNhWcy967d9mAaQlFF6w50s,1484
-ai_edge_quantizer_nightly-0.0.1.dev20250220.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-ai_edge_quantizer_nightly-0.0.1.dev20250220.dist-info/top_level.txt,sha256=8QTfPnFXNVUhScFLaa-NWZMFWMn72M50DVPubpwWB1g,18
-ai_edge_quantizer_nightly-0.0.1.dev20250220.dist-info/RECORD,,
+ai_edge_quantizer_nightly-0.0.1.dev20250222.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ai_edge_quantizer_nightly-0.0.1.dev20250222.dist-info/METADATA,sha256=e9r1p0vAQtBGj4RIEtBbjmiyDyUVUmdNYNU8LqfDVGk,1484
+ai_edge_quantizer_nightly-0.0.1.dev20250222.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ai_edge_quantizer_nightly-0.0.1.dev20250222.dist-info/top_level.txt,sha256=8QTfPnFXNVUhScFLaa-NWZMFWMn72M50DVPubpwWB1g,18
+ai_edge_quantizer_nightly-0.0.1.dev20250222.dist-info/RECORD,,

{ai_edge_quantizer_nightly-0.0.1.dev20250220.dist-info → ai_edge_quantizer_nightly-0.0.1.dev20250222.dist-info}/LICENSE RENAMED Viewed

File without changes

{ai_edge_quantizer_nightly-0.0.1.dev20250220.dist-info → ai_edge_quantizer_nightly-0.0.1.dev20250222.dist-info}/WHEEL RENAMED Viewed

File without changes

{ai_edge_quantizer_nightly-0.0.1.dev20250220.dist-info → ai_edge_quantizer_nightly-0.0.1.dev20250222.dist-info}/top_level.txt RENAMED Viewed

File without changes

ai-edge-quantizer-nightly 0.0.1.dev20250220__py3-none-any.whl → 0.0.1.dev20250222__py3-none-any.whl

ai-edge-quantizer-nightly 0.0.1.dev20250220py3-none-any.whl → 0.0.1.dev20250222py3-none-any.whl