ai-edge-quantizer-nightly 0.1.0.dev20250512__py3-none-any.whl → 0.1.0.dev20250514__py3-none-any.whl
This diff shows the content changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- ai_edge_quantizer/algorithm_manager.py +34 -0
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py +37 -12
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py +29 -2
- ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py +3 -5
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation.py +357 -0
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation_test.py +265 -0
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py +7 -31
- ai_edge_quantizer/algorithms/uniform_quantize/octav.py +27 -17
- ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py +93 -38
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py +133 -3
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py +11 -2
- ai_edge_quantizer/algorithms/utils/common_utils.py +21 -8
- ai_edge_quantizer/default_policy.py +4 -2
- ai_edge_quantizer/params_generator.py +1 -0
- ai_edge_quantizer/qtyping.py +34 -1
- ai_edge_quantizer/transformation_performer.py +5 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation.py +209 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation_test.py +200 -0
- ai_edge_quantizer/utils/test_utils.py +33 -0
- ai_edge_quantizer/utils/tfl_flatbuffer_utils.py +1 -0
- {ai_edge_quantizer_nightly-0.1.0.dev20250512.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250514.dist-info}/METADATA +1 -1
- {ai_edge_quantizer_nightly-0.1.0.dev20250512.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250514.dist-info}/RECORD +25 -21
- {ai_edge_quantizer_nightly-0.1.0.dev20250512.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250514.dist-info}/LICENSE +0 -0
- {ai_edge_quantizer_nightly-0.1.0.dev20250512.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250514.dist-info}/WHEEL +0 -0
- {ai_edge_quantizer_nightly-0.1.0.dev20250512.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250514.dist-info}/top_level.txt +0 -0
ai_edge_quantizer/algorithm_manager.py

@@ -24,6 +24,7 @@ from ai_edge_quantizer import qtyping
 from ai_edge_quantizer.algorithms.nonlinear_quantize import float_casting
 from ai_edge_quantizer.algorithms.uniform_quantize import common_quantize
 from ai_edge_quantizer.algorithms.uniform_quantize import dequantized_weight_recovery
+from ai_edge_quantizer.algorithms.uniform_quantize import hadamard_rotation
 from ai_edge_quantizer.algorithms.uniform_quantize import naive_min_max_quantize
 from ai_edge_quantizer.algorithms.uniform_quantize import octav
 
@@ -58,6 +59,8 @@ class AlgorithmName(str, enum.Enum):
   FLOAT_CASTING = float_casting.ALGORITHM_KEY
   DEQUANTIZED_WEIGHT_RECOVERY = dequantized_weight_recovery.ALGORITHM_KEY
   OCTAV = octav.ALGORITHM_KEY
+  HADAMARD_ROTATION = hadamard_rotation.ALGORITHM_KEY
+
 
 ### MIN/MAX_UNIFORM_QUANT ###
 
@@ -104,6 +107,7 @@ MIN_MAX_OP_NAME_MATERIALIZE_FUNC_DICT = {
         common_quantize.materialize_dynamic_update_slice
     ),
     _TFLOpName.STABLEHLO_COMPOSITE: common_quantize.materialize_composite,
+    _TFLOpName.PAD: common_quantize.materialize_pad,
 }
 for op_name, materialize_func in MIN_MAX_OP_NAME_MATERIALIZE_FUNC_DICT.items():
   register_quantized_op(
@@ -237,6 +241,7 @@ _OCTAV_OP_NAME_MATERIALIZE_FUNC_DICT = immutabledict({
         common_quantize.materialize_dynamic_update_slice
     ),
     _TFLOpName.STABLEHLO_COMPOSITE: common_quantize.materialize_composite,
+    _TFLOpName.PAD: common_quantize.materialize_pad,
 })
 
 for op_name, materialize_func in _OCTAV_OP_NAME_MATERIALIZE_FUNC_DICT.items():
@@ -250,3 +255,32 @@ for op_name, materialize_func in _OCTAV_OP_NAME_MATERIALIZE_FUNC_DICT.items():
           octav.get_tensor_quant_params,
       ),
   )
+
+# Register the Hadamard Rotation algorithm.
+register_op_quant_config_validation_func(
+    AlgorithmName.HADAMARD_ROTATION,
+    common_quantize.check_op_quantization_config,
+)
+
+# Register a config check policy for the Hadamard Rotation algorithm.
+register_config_check_policy_func(
+    AlgorithmName.HADAMARD_ROTATION,
+    default_policy.DEFAULT_CONFIG_CHECK_POLICY,
+)
+
+# Register specialized hadamard rotation materialize functions.
+_HADAMARD_ROTATION_OP_NAME_MATERIALIZE_FUNC_DICT = immutabledict({
+    _TFLOpName.FULLY_CONNECTED: hadamard_rotation.materialize_fully_connected,
+    _TFLOpName.EMBEDDING_LOOKUP: hadamard_rotation.materialize_embedding_lookup,
+})
+for (
+    op_name,
+    materialize_func,
+) in _HADAMARD_ROTATION_OP_NAME_MATERIALIZE_FUNC_DICT.items():
+  register_quantized_op(
+      AlgorithmName.HADAMARD_ROTATION,
+      op_name,
+      naive_min_max_quantize.init_qsvs,
+      calibration_func=naive_min_max_quantize.min_max_calibrate,
+      materialize_func=materialize_func,
+  )
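Taken together, these algorithm_manager.py changes register HADAMARD_ROTATION end to end: config validation, a config-check policy, and materialize functions for the two supported ops, reusing the existing min/max init and calibration hooks. As a rough sketch of how a recipe might then select the new algorithm (hypothetical entry; the recipe schema is an assumption for illustration and is not shown in this diff):

```python
# Hypothetical recipe entry (sketch, not from this diff): selects the new
# algorithm for every fully_connected op. "HADAMARD_ROTATION" is the
# ALGORITHM_KEY registered above; the "regex"/"operation" keys are assumed.
recipe_entry = {
    "regex": ".*",
    "operation": "FULLY_CONNECTED",
    "algorithm_key": "HADAMARD_ROTATION",
}
```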
ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py

@@ -680,6 +680,23 @@ def materialize_split(
 )
 
 
+def materialize_pad(
+    get_tensor_quant_params_fn: qtyping.GetTensorQuantParamsFuncSignature,
+    op_info: qtyping.OpInfo,
+    graph_info: qtyping.GraphInfo,
+    tensor_name_to_qsv: dict[str, Any],
+) -> list[qtyping.TensorTransformationParams]:
+  """Materialize tensors in tfl.pad."""
+  return common_utils.materialize_standard_op(
+      op_info,
+      graph_info,
+      tensor_name_to_qsv,
+      get_tensor_quant_params_fn,
+      constraint=_OpQuantConstraint.SAME_AS_INPUT_SCALE,
+      inputs_to_ignore=[1],  # Padding value does not need to be quantized.
+  )
+
+
 def _get_tensor_shape_for_blockwise(
     tensor_shape: Sequence[int], quantized_dim: int, block_size: int
 ) -> list[int]:
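The `SAME_AS_INPUT_SCALE` constraint is sound for tfl.pad because padding only copies existing values and inserts a constant, so the output can reuse the input's quantization parameters, and the non-data input (index 1) is skipped per the comment above. A standalone numpy sketch of that reasoning, assuming symmetric int8 with zero point 0:

```python
import numpy as np

scale = 0.02  # shared by input and output under SAME_AS_INPUT_SCALE
x = np.array([-0.5, 0.1, 0.4], dtype=np.float32)
x_q = np.clip(np.round(x / scale), -128, 127).astype(np.int8)

# Pad in the quantized domain with the quantized zero; dequantizing with the
# *input* scale recovers exactly what a float pad would have produced.
x_q_padded = np.pad(x_q, (1, 1), constant_values=0)
print(x_q_padded.astype(np.float32) * scale)  # [ 0.  -0.5  0.1  0.4  0. ]
```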
@@ -709,18 +726,29 @@ def _get_tensor_shape_for_blockwise(
 
 
 def _reshape_data_for_blockwise(
-    tensor_data: np.ndarray, quantized_dim: int, block_size: int
+    tensor_data: np.ndarray,
+    quantized_dim: int,
+    block_size: int,
 ) -> tuple[np.ndarray, int]:
   """Reshapes data for blockwise quantization.
 
   Args:
     tensor_data: The original tensor data.
     quantized_dim: The dimension to be quantized blockwise.
-    block_size: The size of the block.
+    block_size: The size of the block. `block_size` must be a multiple of 32.
+      The tensor quantized dimension shape must be divisible by `block_size`.
 
   Returns:
     A tuple containing the reshaped tensor data and the new reduce dimension.
   """
+
+  # TODO: b/417508018 - create AEQ specific error class instead of
+  # using generic ValueError.
+  if tensor_data.shape[quantized_dim] % block_size != 0:
+    raise ValueError(
+        "Tensor quantization dimension must be divisible by block size for"
+        " blockwise quantization."
+    )
   new_shape = _get_tensor_shape_for_blockwise(
       tensor_data.shape, quantized_dim, block_size
   )
@@ -801,22 +829,19 @@ def init_tensor_min_max(
       weight_tensor_config.granularity == qtyping.QuantGranularity.CHANNELWISE
   ):
     quantized_dim = common_utils.get_weight_quantized_dim(
-        op_info, tensor_data
+        op_info, tensor_data, weight_tensor_config.granularity
     )
   if (
       weight_tensor_config is not None
       and weight_tensor_config.granularity
       == qtyping.QuantGranularity.BLOCKWISE
   ):
-    reshaped_data, reduce_dims = _reshape_data_for_blockwise(
-        tensor_data,
-        quantized_dim,
-        weight_tensor_config.block_size,
+    reshaped_data, reduce_dims = (
+        uniform_quantize_tensor.reshape_data_for_blockwise(
+            tensor_data,
+            op_info.op_name,
+            weight_tensor_config.block_size,
+        )
     )
     return {
         "min": np.min(reshaped_data, axis=reduce_dims, keepdims=False),
ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py

@@ -31,8 +31,7 @@ _TensorQuantConfig = qtyping.TensorQuantizationConfig
 
 
 class CommonQuantizeTest(parameterized.TestCase):
-  """Tests for general quantize functions.
-  """
+  """Tests for general quantize functions."""
 
   def setUp(self):
     super().setUp()
@@ -69,6 +68,34 @@ class CommonQuantizeTest(parameterized.TestCase):
         default_policy.DEFAULT_CONFIG_CHECK_POLICY,
     )
 
+  def test_reshape_data_for_blockwise_raises_error_when_quantized_dim_not_divisible_by_block_size(
+      self,
+  ):
+    tensor_data = np.ones((24, 128), dtype=np.float32)
+    block_size = 256
+    quantized_dim = 1
+    with self.assertRaisesWithPredicateMatch(
+        ValueError,
+        lambda err: (
+            "Tensor quantization dimension must be divisible by block"
+            " size for blockwise quantization."
+        )
+        in str(err),
+    ):
+      common_quantize._reshape_data_for_blockwise(
+          tensor_data, quantized_dim, block_size
+      )
+
+  def test_reshape_data_for_blockwise_returns_correct_values(self):
+    tensor_data = np.ones((24, 128), dtype=np.float32)
+    block_size = 32
+    quantized_dim = 1
+    new_tensor_data, reduce_dim = common_quantize._reshape_data_for_blockwise(
+        tensor_data, quantized_dim, block_size
+    )
+    self.assertEqual(new_tensor_data.shape, (24, 4, 32))
+    self.assertEqual(reduce_dim, 2)
+
 
 if __name__ == "__main__":
   googletest.main()
ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py

@@ -168,11 +168,9 @@ def get_tensor_quant_params(
         "Only symmetric weights are supported for dequantized weight recovery."
     )
 
-  quantized_dim = (
-      common_utils.get_weight_quantized_dim(
-          op_info, tensor_content
-      )
-  )
+  quantized_dim = common_utils.get_weight_quantized_dim(
+      op_info, tensor_content, tensor_quant_config.granularity
+  )
 
   zp, scale = get_zp_scale_from_dequantized_symmetric_weights(
       dequant_vals=tensor_content,
ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation.py (new file)

@@ -0,0 +1,357 @@
+# Copyright 2024 The AI Edge Quantizer Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Implements the Hadamard Rotation quantization."""
+
+from typing import Any, Optional
+import numpy as np
+from ai_edge_quantizer import qtyping
+from ai_edge_quantizer.algorithms.uniform_quantize import octav
+from ai_edge_quantizer.algorithms.utils import common_utils
+from ai_edge_quantizer.utils import tfl_flatbuffer_utils
+
+
+ALGORITHM_KEY = "HADAMARD_ROTATION"
+
+
+def _make_hadamard_matrix(size: int) -> np.ndarray:
+  """Generates a Hadamard matrix of the given size.
+
+  Args:
+    size: The size of the Hadamard matrix. Must be a power of 2. This
+      represents a single dimension. E.g. if size is 4, then the Hadamard matrix
+      is a 4x4 matrix.
+
+  Returns:
+    The Hadamard matrix.
+
+  Raises:
+    ValueError: If the size is not a power of 2.
+  """
+  if size <= 0 or (size & (size - 1)) != 0:
+    raise ValueError("Hadamard matrix size must be a power of 2.")
+  h = h2 = np.array([[1, 1], [1, -1]])
+  current_size = 2
+  while current_size < size:
+    h = np.kron(h, h2)
+    current_size *= 2
+  return h / np.sqrt(size)
+
+
+def _rotate_with_diagonal_hadamard(
+    tensor_content: np.ndarray,
+    axis: int,
+):
+  """Quantizes the given float array using the diagonal Hadamard algorithm.
+
+  Args:
+    tensor_content: The float array to quantize.
+    axis: The axis of the tensor to quantize.
+
+  Returns:
+    A tuple containing the quantized array and the recovered array.
+
+  Raises:
+    ValueError: If the axis is not 1. To support other axes, please add
+      support to the matrix multiplication.
+  """
+  if axis != 1:
+    raise ValueError(
+        "Hadamard rotation is only supported for 2D tensors with quantized"
+        " dimension 0."
+    )
+
+  # Use the largest power of 2 that is a factor of the dimension and then
+  # tile this Hadamard matrix along the diagonal. 2**30 is just a large power
+  # of 2 to calculate this factor.
+  hadamard_size = np.gcd(tensor_content.shape[axis], 2**30)
+  diagonal_size = tensor_content.shape[axis] // hadamard_size
+  output_size = tensor_content.shape[1 - axis]
+  random_vector = np.ones(hadamard_size, dtype=np.int8)
+
+  # Use a canonical Hadamard matrix.
+  hadamard = _make_hadamard_matrix(hadamard_size)
+  reshaped_tensor = tensor_content.reshape(
+      diagonal_size, output_size, hadamard_size
+  )
+  w_rotated = np.einsum("jk,ilk->ilj", hadamard, reshaped_tensor)
+  return w_rotated.reshape(tensor_content.shape), hadamard_size, random_vector
+
+
+def get_tensor_quant_params(
+    op_info: qtyping.OpInfo,
+    tensor_quant_config: qtyping.TensorQuantizationConfig,
+    tensor_content: Optional[np.ndarray] = None,
+    tensor_qsv: Optional[dict[str, Any]] = None,
+) -> qtyping.UniformQuantParams:
+  """Returns the quantization parameters for a tensor.
+
+  This function will rotate the tensor with a Hadamard matrix and then
+  quantize it with OCTAV.
+
+  Args:
+    op_info: Aggregated information about the op (e.g., quantization config).
+    tensor_quant_config: The quantization config for the tensor.
+    tensor_content: The content of the tensor. When None, it means the tensor is
+      not a weight tensor (e.g. static quantization).
+    tensor_qsv: A dictionary containing the min/max of the tensor.
+
+  Raises:
+    ValueError: If the blockwise quantization is requested.
+    ValueError: If the asymmetric quantization is requested.
+    ValueError: `tensor_qsv` must contain min/max values, or `tensor_content`
+      must be provided so that they can be inferred.
+  """
+  if tensor_content is None:
+    raise ValueError("Hadamard rotation is only supported for weight tensors.")
+
+  if tensor_qsv is not None:
+    raise ValueError(
+        "Hadamard rotation is not supported for static quantization."
+    )
+
+  if tensor_content.ndim != 2:
+    raise ValueError("Hadamard rotation is only supported for 2D tensors.")
+
+  if tensor_quant_config.granularity != qtyping.QuantGranularity.CHANNELWISE:
+    raise ValueError(
+        "Hadamard rotation is not supported for"
+        f" {tensor_quant_config.granularity} granularity."
+    )
+
+  quantized_dim = common_utils.get_weight_quantized_dim(
+      op_info, tensor_content, tensor_quant_config.granularity
+  )
+  if quantized_dim != 0:
+    raise ValueError(
+        f"Unsupported quantized dimension: {quantized_dim}. Only 0 is"
+        " supported."
+    )
+
+  # Reduction axis is the non-quantized dimension. Since we only support 2D
+  # tensors and quantized_dim of 0, the reduction axis is 1.
+  reduce_axis = 1
+
+  # Rotate the tensor with a Hadamard matrix.
+  w_rotated, hadamard_size, random_vector = _rotate_with_diagonal_hadamard(
+      tensor_content, axis=reduce_axis
+  )
+
+  # Get the quantized values of the rotated tensor.
+  qparams = octav.get_tensor_quant_params(
+      op_info, tensor_quant_config, w_rotated, tensor_qsv
+  )
+
+  return qtyping.UniformQuantParams(
+      quantized_dimension=qparams.quantized_dimension,
+      num_bits=qparams.num_bits,
+      scale=qparams.scale,
+      zero_point=qparams.zero_point,
+      symmetric=qparams.symmetric,
+      quantized_data=qparams.quantized_data,
+      block_size=qparams.block_size,
+      hadamard=qtyping.UniformQuantParams.HadamardRotationParams(
+          random_binary_vector=random_vector,
+          hadamard_size=hadamard_size,
+      ),
+  )
+
+
+def materialize_fully_connected(
+    op_info: qtyping.OpInfo,
+    graph_info: qtyping.GraphInfo,
+    tensor_name_to_qsv: Optional[dict[str, Any]] = None,  # pylint: disable=unused-argument
+) -> list[qtyping.TensorTransformationParams]:
+  """Materialize the fully_connected op.
+
+  Args:
+    op_info: Aggregated information about the op (e.g., quantization config).
+    graph_info: Graph information needed to perform quantization for the op.
+    tensor_name_to_qsv: A map of tensor name to quantization parameters.
+
+  Returns:
+    Quantization configuration for the tensors associated with the op (e.g.,
+    weights, bias).
+  """
+  op_tensor_params = []
+
+  # Materialize weight.
+  weight_tensor_index = 1
+  weight_tensor = graph_info.subgraph_tensors[
+      op_info.op.inputs[weight_tensor_index]
+  ]
+  tensor_data = tfl_flatbuffer_utils.get_tensor_data(
+      weight_tensor, graph_info.buffers
+  )
+  # quant_params contains the rotated and quantized weights done by
+  # get_tensor_quant_params().
+  quant_params = get_tensor_quant_params(
+      op_info,
+      op_info.op_quant_config.weight_tensor_config,
+      tensor_data,
+      None,
+  )
+  transformations = [qtyping.QuantTransformation.QUANTIZE_TENSOR]
+  op2tensor_params = qtyping.OpToTensorParams(
+      subgraph_op_id=op_info.subgraph_op_index,
+      parameters=quant_params,
+      transformations=transformations,
+  )
+  weight_transformation_params = qtyping.TensorTransformationParams(
+      tensor_name=tfl_flatbuffer_utils.get_tensor_name(weight_tensor),
+      consumers=[op2tensor_params],
+  )
+
+  # Materialize input. A hadamard rotation op should be inserted on the input
+  # tensor to do the inverse of the weight's transformation.
+  input_tensor_index = 0
+  input_tensor = graph_info.subgraph_tensors[
+      op_info.op.inputs[input_tensor_index]
+  ]
+  transformations = [
+      qtyping.QuantTransformation.INSERT_HADAMARD_ROTATION,
+  ]
+  op2tensor_params = qtyping.OpToTensorParams(
+      subgraph_op_id=op_info.subgraph_op_index,
+      parameters=quant_params,
+      transformations=transformations,
+  )
+  input_transformation_params = qtyping.TensorTransformationParams(
+      tensor_name=tfl_flatbuffer_utils.get_tensor_name(input_tensor),
+      consumers=[op2tensor_params],
+  )
+  op_tensor_params.append(input_transformation_params)
+  op_tensor_params.append(weight_transformation_params)
+
+  # Materialize bias. Since static quantization is not supported, we do not
+  # quantize the bias tensor.
+  bias_tensor_index = 2
+  bias_tensor = graph_info.subgraph_tensors[
+      op_info.op.inputs[bias_tensor_index]
+  ]
+  no_quant_tensor_params = qtyping.OpToTensorParams(
+      subgraph_op_id=op_info.subgraph_op_index,
+      transformations=[qtyping.QuantTransformation.NO_QUANTIZE],
+  )
+  bias_transformation_params = qtyping.TensorTransformationParams(
+      tensor_name=tfl_flatbuffer_utils.get_tensor_name(bias_tensor),
+      consumers=[no_quant_tensor_params],
+  )
+  op_tensor_params.append(bias_transformation_params)
+
+  # Materialize output. Since static quantization is not supported, we do not
+  # quantize the output tensor.
+  output_tensor_index = 0
+  output_tensor = graph_info.subgraph_tensors[
+      op_info.op.outputs[output_tensor_index]
+  ]
+  no_quant_tensor_params = qtyping.OpToTensorParams(
+      subgraph_op_id=op_info.subgraph_op_index,
+      transformations=[qtyping.QuantTransformation.NO_QUANTIZE],
+  )
+  output_transformation_params = qtyping.TensorTransformationParams(
+      tensor_name=tfl_flatbuffer_utils.get_tensor_name(output_tensor),
+      producer=no_quant_tensor_params,
+  )
+  op_tensor_params.append(output_transformation_params)
+
+  return op_tensor_params
+
+
+def materialize_embedding_lookup(
+    op_info: qtyping.OpInfo,
+    graph_info: qtyping.GraphInfo,
+    tensor_name_to_qsv: Optional[dict[str, Any]] = None,  # pylint: disable=unused-argument
+) -> list[qtyping.TensorTransformationParams]:
+  """Materialize the embedding_lookup op.
+
+  Args:
+    op_info: Aggregated information about the op (e.g., quantization config).
+    graph_info: Graph information needed to perform quantization for the op.
+    tensor_name_to_qsv: A map of tensor name to quantization parameters.
+
+  Returns:
+    Quantization configuration for the tensors associated with the op (e.g.,
+    weights, bias).
+  """
+  op_tensor_params = []
+
+  # Materialize lookup.
+  lookup_tensor_index = 0
+  lookup_tensor = graph_info.subgraph_tensors[
+      op_info.op.inputs[lookup_tensor_index]
+  ]
+  transformations = [
+      qtyping.QuantTransformation.NO_QUANTIZE,
+  ]
+  op2tensor_params = qtyping.OpToTensorParams(
+      subgraph_op_id=op_info.subgraph_op_index,
+      parameters=None,
+      transformations=transformations,
+  )
+  lookup_transformation_params = qtyping.TensorTransformationParams(
+      tensor_name=tfl_flatbuffer_utils.get_tensor_name(lookup_tensor),
+      consumers=[op2tensor_params],
+  )
+  op_tensor_params.append(lookup_transformation_params)
+
+  # Materialize embedding. The embedding table should be rotated and then
+  # quantized.
+  embedding_tensor_index = 1
+  embedding_tensor = graph_info.subgraph_tensors[
+      op_info.op.inputs[embedding_tensor_index]
+  ]
+  tensor_data = tfl_flatbuffer_utils.get_tensor_data(
+      embedding_tensor, graph_info.buffers
+  )
+  quant_params = get_tensor_quant_params(
+      op_info,
+      op_info.op_quant_config.weight_tensor_config,
+      tensor_data,
+      None,
+  )
+  transformations = [qtyping.QuantTransformation.QUANTIZE_TENSOR]
+  op2tensor_params = qtyping.OpToTensorParams(
+      subgraph_op_id=op_info.subgraph_op_index,
+      parameters=quant_params,
+      transformations=transformations,
+  )
+  weight_transformation_params = qtyping.TensorTransformationParams(
+      tensor_name=tfl_flatbuffer_utils.get_tensor_name(embedding_tensor),
+      consumers=[op2tensor_params],
+  )
+  op_tensor_params.append(weight_transformation_params)
+
+  # Materialize output. A hadamard rotation op should be inserted on the output
+  # tensor to do the inverse of the embedding's transformation.
+  output_tensor_index = 0
+  output_tensor = graph_info.subgraph_tensors[
+      op_info.op.outputs[output_tensor_index]
+  ]
+  transformations = [
+      qtyping.QuantTransformation.INSERT_HADAMARD_ROTATION,
+  ]
+  op2tensor_params = qtyping.OpToTensorParams(
+      subgraph_op_id=op_info.subgraph_op_index,
+      parameters=quant_params,
+      transformations=transformations,
+  )
+  output_transformation_params = qtyping.TensorTransformationParams(
+      tensor_name=tfl_flatbuffer_utils.get_tensor_name(output_tensor),
+      producer=op2tensor_params,
+  )
+  op_tensor_params.append(output_transformation_params)
+
+  return op_tensor_params