PyPI - ai-edge-quantizer-nightly - Versions diffs - 0.0.1.dev20250317__py3-none-any.whl → 0.1.0.dev20250319__py3-none-any.whl - Mend

ai-edge-quantizer-nightly 0.0.1.dev20250317__py3-none-any.whl → 0.1.0.dev20250319__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

ai_edge_quantizer/algorithm_manager.py CHANGED Viewed

@@ -17,6 +17,7 @@
 import enum
 import functools
+from immutabledict import immutabledict
 from ai_edge_quantizer import algorithm_manager_api
 from ai_edge_quantizer import default_policy
 from ai_edge_quantizer import qtyping
@@ -24,6 +25,7 @@ from ai_edge_quantizer.algorithms.nonlinear_quantize import float_casting
 from ai_edge_quantizer.algorithms.uniform_quantize import common_quantize
 from ai_edge_quantizer.algorithms.uniform_quantize import dequantized_weight_recovery
 from ai_edge_quantizer.algorithms.uniform_quantize import naive_min_max_quantize
+from ai_edge_quantizer.algorithms.uniform_quantize import octav
 # TODO: b/399775701 - Clean up this file.
@@ -55,6 +57,7 @@ class AlgorithmName(str, enum.Enum):
   MIN_MAX_UNIFORM_QUANT = naive_min_max_quantize.ALGORITHM_KEY
   FLOAT_CASTING = float_casting.ALGORITHM_KEY
   DEQUANTIZED_WEIGHT_RECOVERY = dequantized_weight_recovery.ALGORITHM_KEY
+  OCTAV = octav.ALGORITHM_KEY
 ### MIN/MAX_UNIFORM_QUANT ###
@@ -188,3 +191,61 @@ for (
           dequantized_weight_recovery.get_tensor_quant_params,
       ),
   )
+# Register OCTAV algorithm.
+register_op_quant_config_validation_func(
+    AlgorithmName.OCTAV,
+    common_quantize.check_op_quantization_config,
+)
+# Register a config check policy for OCTAV algorithm.
+register_config_check_policy_func(
+    AlgorithmName.OCTAV,
+    default_policy.DEFAULT_CONFIG_CHECK_POLICY,
+)
+_OCTAV_OP_NAME_MATERIALIZE_FUNC_DICT = immutabledict({
+    _TFLOpName.INPUT: common_quantize.materialize_input,
+    _TFLOpName.OUTPUT: common_quantize.materialize_output,
+    _TFLOpName.FULLY_CONNECTED: common_quantize.materialize_fc_conv,
+    _TFLOpName.BATCH_MATMUL: common_quantize.materialize_batch_matmul,
+    _TFLOpName.CONV_2D: common_quantize.materialize_fc_conv,
+    _TFLOpName.DEPTHWISE_CONV_2D: common_quantize.materialize_fc_conv,
+    _TFLOpName.CONV_2D_TRANSPOSE: common_quantize.materialize_conv2d_transpose,
+    _TFLOpName.RESHAPE: common_quantize.materialize_reshape,
+    _TFLOpName.AVERAGE_POOL_2D: common_quantize.materialize_average_pool_2d,
+    _TFLOpName.EMBEDDING_LOOKUP: common_quantize.materialize_embedding_lookup,
+    _TFLOpName.SOFTMAX: common_quantize.materialize_softmax_and_logistic,
+    _TFLOpName.TANH: common_quantize.materialize_tanh,
+    _TFLOpName.TRANSPOSE: common_quantize.materialize_transpose,
+    _TFLOpName.GELU: common_quantize.materialize_gelu,
+    _TFLOpName.ADD: common_quantize.materialize_add,
+    _TFLOpName.SUB: common_quantize.materialize_sub,
+    _TFLOpName.MUL: common_quantize.materialize_mul,
+    _TFLOpName.MEAN: common_quantize.materialize_mean,
+    _TFLOpName.RSQRT: common_quantize.materialize_rsqrt,
+    _TFLOpName.CONCATENATION: common_quantize.materialize_concatenation,
+    _TFLOpName.STRIDED_SLICE: common_quantize.materialize_strided_slice,
+    _TFLOpName.SPLIT: common_quantize.materialize_split,
+    _TFLOpName.LOGISTIC: common_quantize.materialize_softmax_and_logistic,
+    _TFLOpName.SLICE: common_quantize.materialize_slice,
+    _TFLOpName.SUM: common_quantize.materialize_sum,
+    _TFLOpName.SELECT_V2: common_quantize.materialize_select_v2,
+    _TFLOpName.DYNAMIC_UPDATE_SLICE: (
+        common_quantize.materialize_dynamic_update_slice
+    ),
+    _TFLOpName.STABLEHLO_COMPOSITE: common_quantize.materialize_composite,
+})
+for op_name, materialize_func in _OCTAV_OP_NAME_MATERIALIZE_FUNC_DICT.items():
+  register_quantized_op(
+      AlgorithmName.OCTAV,
+      op_name,
+      naive_min_max_quantize.init_qsvs,
+      calibration_func=naive_min_max_quantize.min_max_calibrate,
+      materialize_func=functools.partial(
+          materialize_func,
+          octav.get_tensor_quant_params,
+      ),
+  )

ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py CHANGED Viewed

@@ -23,7 +23,7 @@ to implement the get_tensor_quant_params_fn with the
 qtyping.GetTensorQuantParamsFuncSignature signature.
 """
-from typing import Any
+from typing import Any, Optional, Sequence
 import numpy as np
 from ai_edge_quantizer import qtyping
 from ai_edge_quantizer.algorithms.uniform_quantize import uniform_quantize_tensor
@@ -669,3 +669,153 @@ def materialize_split(
       constraint=_OpQuantConstraint.SAME_AS_INPUT_SCALE,
       inputs_to_ignore=[0],  # Split dimension does not need to be quantized.
   )
+def _get_tensor_shape_for_blockwise(
+    tensor_shape: Sequence[int], quantized_dim: int, block_size: int
+) -> list[int]:
+  """Get the tensor shape for blockwise quantization.
+  This function splits the quantize dimension of the tensor into blocks and the
+  dim/blocks. Hence, min/max of the tensor can be calculated for each block
+  using existing functions.
+  Args:
+    tensor_shape: The original shape of the tensor.
+    quantized_dim: The dimension to be quantized blockwise.
+    block_size: The size of the block.
+  Returns:
+    The new tensor shape for calculating scale and zp for blockwise
+    quantization.
+  """
+  new_shape = []
+  for index, val in enumerate(tensor_shape):
+    if index == quantized_dim:
+      new_shape.append(int(val / block_size))
+      new_shape.append(block_size)
+    else:
+      new_shape.append(val)
+  return new_shape
+def _reshape_data_for_blockwise(
+    tensor_data: np.ndarray, quantized_dim: int, block_size: int
+) -> tuple[np.ndarray, int]:
+  """Reshapes data for blockwise quantization.
+  Args:
+    tensor_data: The original tensor data.
+    quantized_dim: The dimension to be quantized blockwise.
+    block_size: The size of the block.
+  Returns:
+    A tuple containing the reshaped tensor data and the new reduce dimension.
+  """
+  new_shape = _get_tensor_shape_for_blockwise(
+      tensor_data.shape, quantized_dim, block_size
+  )
+  reshaped_data = tensor_data.reshape(new_shape)
+  return reshaped_data, quantized_dim + 1
+def broadcast_scale_zp_for_blockwise(
+    tensor_content: np.ndarray,
+    quant_params: qtyping.UniformQuantParams,
+) -> qtyping.UniformQuantParams:
+  """Broadcasts scale and zp for blockwise quantization.
+  Args:
+    tensor_content: The original tensor data.
+    quant_params: The quantization parameters.
+      `quant_params.quantized_dimension` must be specified.
+      `quant_params.block_size` must be specified and positive.
+  Returns:
+    The updated quantization parameters with broadcasted scale and zp for
+    correct constant quantization.
+  """
+  if quant_params.quantized_dimension is None:
+    raise ValueError("Quantized dimension must be specified.")
+  if quant_params.block_size is None or quant_params.block_size <= 0:
+    raise ValueError("Block size must be specified and positive.")
+  quantized_dim = quant_params.quantized_dimension
+  expanded_tensor_shape = _get_tensor_shape_for_blockwise(
+      tensor_content.shape, quantized_dim, quant_params.block_size
+  )
+  expanded_scale = np.reshape(
+      np.broadcast_to(
+          np.expand_dims(quant_params.scale, quantized_dim + 1),
+          expanded_tensor_shape,
+      ),
+      tensor_content.shape,
+  )
+  expanded_zp = np.reshape(
+      np.broadcast_to(
+          np.expand_dims(quant_params.zero_point, quantized_dim + 1),
+          expanded_tensor_shape,
+      ),
+      tensor_content.shape,
+  )
+  return qtyping.UniformQuantParams(
+      scale=expanded_scale,
+      zero_point=expanded_zp,
+      num_bits=quant_params.num_bits,
+      symmetric=quant_params.symmetric,
+      quantized_dimension=quantized_dim,
+      block_size=quant_params.block_size,
+  )
+def init_tensor_min_max(
+    tensor_data: Optional[np.ndarray],
+    op_info: qtyping.OpInfo,
+) -> qtyping.QSV:
+  """Initialize the min/max for a tensor.
+  This function initializes the min/max values for a tensor.
+  Args:
+    tensor_data: The tensor data.
+    op_info: Aggregated information about the op (e.g., quantization config).
+  Returns:
+    A dictionary containing the min/max values for the tensor, or an empty
+    dictionary if the tensor data is None.
+  """
+  if tensor_data is None:
+    return {}
+  else:
+    weight_tensor_config = op_info.op_quant_config.weight_tensor_config
+    quantized_dim = None
+    if weight_tensor_config is not None and (
+        weight_tensor_config.granularity == qtyping.QuantGranularity.CHANNELWISE
+        or weight_tensor_config.granularity
+        == qtyping.QuantGranularity.BLOCKWISE
+    ):
+      quantized_dim = common_utils.get_weight_quantized_dim(
+          op_info, tensor_data
+      )
+    if (
+        weight_tensor_config is not None
+        and weight_tensor_config.granularity
+        == qtyping.QuantGranularity.BLOCKWISE
+    ):
+      reshaped_data, reduce_dims = _reshape_data_for_blockwise(
+          tensor_data,
+          quantized_dim,
+          weight_tensor_config.block_size,
+      )
+      return {
+          "min": np.min(reshaped_data, axis=reduce_dims, keepdims=False),
+          "max": np.max(reshaped_data, axis=reduce_dims, keepdims=False),
+      }
+    else:
+      reduce_dims = common_utils.get_reduce_dims(
+          quantized_dim, tensor_data.shape
+      )
+      return {
+          "min": np.min(tensor_data, axis=reduce_dims, keepdims=True),
+          "max": np.max(tensor_data, axis=reduce_dims, keepdims=True),
+      }

ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py CHANGED Viewed

@@ -15,10 +15,10 @@
 """Performs naive min/max uniform quantization."""
-from collections.abc import Sequence
 from typing import Any, Optional
 import numpy as np
 from ai_edge_quantizer import qtyping
+from ai_edge_quantizer.algorithms.uniform_quantize import common_quantize
 from ai_edge_quantizer.algorithms.uniform_quantize import uniform_quantize_tensor
 from ai_edge_quantizer.algorithms.utils import common_utils
 from ai_edge_quantizer.utils import tfl_flatbuffer_utils
@@ -29,143 +29,6 @@ _QuantTransformation = qtyping.QuantTransformation
 _IntType = uniform_quantize_tensor.IntType
-def _init_tensor_min_max(
-    tensor_data: Optional[np.ndarray],
-    op_info: qtyping.OpInfo,
-) -> qtyping.QSV:
-  """Initialize the min/max for a tensor."""
-  if tensor_data is None:
-    return {}
-  else:
-    weight_tensor_config = op_info.op_quant_config.weight_tensor_config
-    quantized_dim = None
-    if weight_tensor_config is not None and (
-        weight_tensor_config.granularity == qtyping.QuantGranularity.CHANNELWISE
-        or weight_tensor_config.granularity
-        == qtyping.QuantGranularity.BLOCKWISE
-    ):
-      quantized_dim = common_utils.get_weight_quantized_dim(
-          op_info, tensor_data
-      )
-    if (
-        weight_tensor_config is not None
-        and weight_tensor_config.granularity
-        == qtyping.QuantGranularity.BLOCKWISE
-    ):
-      reshaped_data, reduce_dims = _reshape_data_for_blockwise(
-          tensor_data,
-          quantized_dim,
-          weight_tensor_config.block_size,
-      )
-      return {
-          "min": np.min(reshaped_data, axis=reduce_dims, keepdims=False),
-          "max": np.max(reshaped_data, axis=reduce_dims, keepdims=False),
-      }
-    else:
-      reduce_dims = common_utils.get_reduce_dims(
-          quantized_dim, tensor_data.shape
-      )
-      return {
-          "min": np.min(tensor_data, axis=reduce_dims, keepdims=True),
-          "max": np.max(tensor_data, axis=reduce_dims, keepdims=True),
-      }
-def _get_tensor_shape_for_blockwise(
-    tensor_shape: Sequence[int], quantized_dim: int, block_size: int
-) -> list[int]:
-  """Get the tensor shape for blockwise quantization.
-  This function splits the quantize dimension of the tensor into blocks and the
-  dim/blocks. Hence, min/max of the tensor can be calculated for each block
-  using existing functions.
-  Args:
-    tensor_shape: The original shape of the tensor.
-    quantized_dim: The dimension to be quantized blockwise.
-    block_size: The size of the block.
-  Returns:
-    The new tensor shape for calculating scale and zp for blockwise
-    quantization.
-  """
-  new_shape = []
-  for index, val in enumerate(tensor_shape):
-    if index == quantized_dim:
-      new_shape.append(int(val / block_size))
-      new_shape.append(block_size)
-    else:
-      new_shape.append(val)
-  return new_shape
-def _reshape_data_for_blockwise(
-    tensor_data: np.ndarray, quantized_dim: int, block_size: int
-) -> tuple[np.ndarray, int]:
-  """Reshapes data for blockwise quantization.
-  Args:
-    tensor_data: The original tensor data.
-    quantized_dim: The dimension to be quantized blockwise.
-    block_size: The size of the block.
-  Returns:
-    A tuple containing the reshaped tensor data and the new reduce dimension.
-  """
-  new_shape = _get_tensor_shape_for_blockwise(
-      tensor_data.shape, quantized_dim, block_size
-  )
-  reshaped_data = tensor_data.reshape(new_shape)
-  return reshaped_data, quantized_dim + 1
-def _broadcast_scale_zp_for_blockwise(
-    tensor_content: np.ndarray,
-    quant_params: qtyping.UniformQuantParams,
-) -> qtyping.UniformQuantParams:
-  """Broadcasts scale and zp for blockwise quantization.
-  Args:
-    tensor_content: The original tensor data.
-    quant_params: The quantization parameters.
-  Returns:
-    The updated quantization parameters with broadcasted scale and zp for
-    correct constant quantization.
-  """
-  if quant_params.quantized_dimension is None:
-    raise ValueError("Quantized dimension must be specified.")
-  if quant_params.block_size is None or quant_params.block_size <= 0:
-    raise ValueError("Block size must be specified and positive.")
-  quantized_dim = quant_params.quantized_dimension
-  expanded_tensor_shape = _get_tensor_shape_for_blockwise(
-      tensor_content.shape, quantized_dim, quant_params.block_size
-  )
-  expanded_scale = np.reshape(
-      np.broadcast_to(
-          np.expand_dims(quant_params.scale, quantized_dim + 1),
-          expanded_tensor_shape,
-      ),
-      tensor_content.shape,
-  )
-  expanded_zp = np.reshape(
-      np.broadcast_to(
-          np.expand_dims(quant_params.zero_point, quantized_dim + 1),
-          expanded_tensor_shape,
-      ),
-      tensor_content.shape,
-  )
-  return qtyping.UniformQuantParams(
-      scale=expanded_scale,
-      zero_point=expanded_zp,
-      num_bits=quant_params.num_bits,
-      symmetric=quant_params.symmetric,
-      quantized_dimension=quantized_dim,
-      block_size=quant_params.block_size,
-  )
 def get_tensor_quant_params(
     op_info: qtyping.OpInfo,
     tensor_quant_config: qtyping.TensorQuantizationConfig,
@@ -191,7 +54,7 @@ def get_tensor_quant_params(
       # weight-only and DRQ do not require calibration, thus it is
       # possible that this information is missing here. In that case we
       # collect min/max on the spot.
-      tensor_min_max = _init_tensor_min_max(
+      tensor_min_max = common_quantize.init_tensor_min_max(
           tensor_content,
           op_info,
       )
@@ -238,7 +101,7 @@ def get_tensor_quant_params(
   # The reshaping for blockwise quantization is unique hence we do this here
   # to avoid unexpected broadcast behavior downstream.
   if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
-    quant_params = _broadcast_scale_zp_for_blockwise(
+    quant_params = common_quantize.broadcast_scale_zp_for_blockwise(
         tensor_content, quant_params
     )
@@ -286,7 +149,7 @@ def init_qsvs(
       tensor_data = tfl_flatbuffer_utils.get_tensor_data(
           tensor, graph_info.buffers
       )
-      op_qsvs[tensor_name] = _init_tensor_min_max(
+      op_qsvs[tensor_name] = common_quantize.init_tensor_min_max(
           tensor_data,
           op_info,
       )
@@ -297,7 +160,7 @@ def init_qsvs(
       tensor_data = tfl_flatbuffer_utils.get_tensor_data(
           tensor, graph_info.buffers
       )
-      op_qsvs[tensor_name] = _init_tensor_min_max(
+      op_qsvs[tensor_name] = common_quantize.init_tensor_min_max(
           tensor_data,
           op_info,
       )

ai_edge_quantizer/algorithms/uniform_quantize/octav.py ADDED Viewed

@@ -0,0 +1,174 @@
+# Copyright 2024 The AI Edge Quantizer Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements the OCTAV quantization."""
+import dataclasses
+from typing import Any, Optional, Sequence, Union
+import numpy as np
+from ai_edge_quantizer import qtyping
+from ai_edge_quantizer.algorithms.uniform_quantize import common_quantize
+from ai_edge_quantizer.algorithms.uniform_quantize import naive_min_max_quantize
+from ai_edge_quantizer.algorithms.uniform_quantize import uniform_quantize_tensor
+from ai_edge_quantizer.algorithms.utils import common_utils
+ALGORITHM_KEY = "OCTAV"
+def _guess_clipping_with_octav(
+    x: np.ndarray,
+    bits: int,
+    axis: Union[int, Sequence[int]],
+    max_iterations: int,
+    exponent_divisor: float,
+    early_stop: bool = True,
+) -> np.ndarray:
+  """Returns a tensor of absolute clipping constants for a tensor using OCTAV.
+  This method implements equation (6) from the OCTAV paper:
+  https://arxiv.org/abs/2206.06501
+  Args:
+    x: Tensor data to return guesses for.
+    bits: Number of bits used during quantization.
+    axis: Axis to reduce the tensor along to get the guesses.
+    max_iterations: Number of Newton-Raphson iterations to use.
+    exponent_divisor: What factor to divide the 4^-bits term by. In the paper,
+      3.0 is optimal for signed ints and 12.0 for unsigned ints.
+    early_stop: If True, stop the iteration if the guess doesn't change.
+  Returns:
+    A tensor of shape [num_channels] with clipping constant guesses.
+  """
+  magnitude = np.abs(x)
+  x_reduced = np.mean(x, axis=axis, keepdims=True)
+  old_guess = np.zeros(x_reduced.shape)
+  guess = np.ones(x_reduced.shape)
+  for _ in range(max_iterations):
+    if early_stop and np.allclose(guess, old_guess):
+      break
+    guess_broadcasted = np.broadcast_to(guess, magnitude.shape)
+    guess_mask = np.asarray(magnitude < guess_broadcasted, dtype=x.dtype)
+    numerator = np.sum(
+        magnitude * np.asarray(1.0 - guess_mask), axis=axis, keepdims=True
+    )
+    denominator1 = (4.0 ** (-bits) / exponent_divisor) * np.sum(
+        guess_mask, axis=axis, keepdims=True
+    )
+    denominator2 = np.sum(1.0 - guess_mask, axis=axis, keepdims=True)
+    old_guess = guess
+    guess = numerator / (denominator1 + denominator2)
+  return guess
+def get_tensor_quant_params(
+    op_info: qtyping.OpInfo,
+    tensor_quant_config: qtyping.TensorQuantizationConfig,
+    tensor_content: Optional[np.ndarray] = None,
+    tensor_qsv: Optional[dict[str, Any]] = None,
+) -> qtyping.UniformQuantParams:
+  """Returns the quantization parameters for a tensor.
+  Args:
+    op_info: Aggregated information about the op (e.g., quantization config).
+    tensor_quant_config: The quantization config for the tensor.
+    tensor_content: The content of the tensor. When None, it means the tensor is
+      not a weight tensor (e.g. static quantization) so we fallback to using
+      naive_min_max_quantize.
+    tensor_qsv: A dictionary containing the min/max of the tensor.
+  Raises:
+    ValueError: If the blockwise quantization is requested.
+    ValueError: If the asymmetric quantization is requested.
+    ValueError: `tensor_qsv` must contain min/max values, or `tensor_content`
+      must be provided so that they can be inferred.
+  """
+  # Fallback to naive_min_max_quantize.py for non-weight tensors.
+  if tensor_content is None:
+    return naive_min_max_quantize.get_tensor_quant_params(
+        op_info, tensor_quant_config, tensor_content, tensor_qsv
+    )
+  if (
+      tensor_quant_config.granularity != qtyping.QuantGranularity.CHANNELWISE
+      and tensor_quant_config.granularity != qtyping.QuantGranularity.TENSORWISE
+  ):
+    raise ValueError(
+        f"Unsupported granularity: {tensor_quant_config.granularity}."
+    )
+  if not tensor_quant_config.symmetric:
+    raise ValueError(
+        f"Unsupported symmetry: {tensor_quant_config.symmetric}. OCTAV"
+        " supports symmetric quantization only for now."
+    )
+  if tensor_qsv is None:
+    # We need min/max to calculate quantization parameters, which
+    # should be collected during the calibration process. However,
+    # weight-only and DRQ do not require calibration, thus it is
+    # possible that this information is missing here. In that case we
+    # collect min/max on the spot.
+    tensor_min_max = common_quantize.init_tensor_min_max(
+        tensor_content,
+        op_info,
+    )
+  else:
+    tensor_min_max = tensor_qsv
+  if "min" not in tensor_min_max or "max" not in tensor_min_max:
+    raise ValueError(
+        "min and max must be provided to produce tensor quantization"
+        " parameters. Check if the correct calibration results are passed into"
+        " the ParamsGenerator."
+    )
+  quantized_dim = None
+  if tensor_quant_config.granularity == qtyping.QuantGranularity.CHANNELWISE:
+    quantized_dim = common_utils.get_weight_quantized_dim(
+        op_info, tensor_content
+    )
+  clipping_constants = _guess_clipping_with_octav(
+      tensor_content,
+      tensor_quant_config.num_bits,
+      common_utils.get_reduce_dims(quantized_dim, tensor_content.shape),
+      max_iterations=10,
+      exponent_divisor=3.0 if tensor_quant_config.symmetric else 12.0,
+  )
+  zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
+      tensor_min_max["min"],
+      tensor_min_max["max"],
+      tensor_quant_config.num_bits,
+      tensor_quant_config.symmetric,
+      clipping_constants,
+  )
+  quant_params = qtyping.UniformQuantParams(
+      scale=scale,
+      zero_point=zp,
+      num_bits=tensor_quant_config.num_bits,
+      symmetric=tensor_quant_config.symmetric,
+      quantized_dimension=quantized_dim,
+      block_size=tensor_quant_config.block_size,
+  )
+  quantized_vars = uniform_quantize_tensor.uniform_quantize(
+      tensor_content, quant_params
+  )
+  return dataclasses.replace(quant_params, quantized_data=quantized_vars)

ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py ADDED Viewed

@@ -0,0 +1,186 @@
+# Copyright 2024 The AI Edge Quantizer Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import os
+from typing import cast
+from absl.testing import parameterized
+import numpy as np
+from tensorflow.python.platform import googletest
+from ai_edge_quantizer import qtyping
+from ai_edge_quantizer.algorithms.uniform_quantize import octav
+from ai_edge_quantizer.utils import test_utils
+from ai_edge_quantizer.utils import tfl_flatbuffer_utils
+class OctavQuantizeTest(parameterized.TestCase):
+  """Tests for general functions for OCTAV."""
+  def setUp(self):
+    super().setUp()
+    np.random.seed(666)
+    self._test_model_path = os.path.join(
+        test_utils.get_path_to_datafile("../../tests/models"),
+        "conv_fc_mnist.tflite",
+    )
+    self._test_model = tfl_flatbuffer_utils.read_model(self._test_model_path)
+    # The test model has one subgraph for now.
+    self._graph_info = qtyping.GraphInfo(
+        subgraph_tensors=self._test_model.subgraphs[0].tensors,
+        buffers=self._test_model.buffers,
+    )
+    self._tensor_name_to_qsv = {}
+    subgraph0 = self._test_model.subgraphs[0]
+    subgraph_op_index = 3
+    fc_op = subgraph0.operators[subgraph_op_index]
+    self._fc_op_info = qtyping.OpInfo(
+        op=fc_op,
+        op_name=qtyping.TFLOperationName.FULLY_CONNECTED,
+        subgraph_op_index=subgraph_op_index,
+        op_quant_config=qtyping.OpQuantizationConfig(
+            weight_tensor_config=None,
+        ),
+    )
+  def test_get_tensor_quant_params_unsupported_granularity_assert(self):
+    err_msg = "Unsupported granularity"
+    test_data = np.array([[-7, 7], [4, -4], [4, -4], [7, 7]])
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, lambda err: err_msg in str(err)
+    ):
+      _ = octav.get_tensor_quant_params(
+          op_info=self._fc_op_info,
+          tensor_quant_config=qtyping.TensorQuantizationConfig(
+              num_bits=4,
+              symmetric=True,
+              granularity=qtyping.QuantGranularity.BLOCKWISE,
+          ),
+          tensor_content=test_data,
+      )
+  def test_get_tensor_quant_params_unsupported_symmetry(self):
+    err_msg = "Unsupported symmetry"
+    test_data = np.array([[-7, 7], [4, -4], [4, -4], [7, 7]])
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, lambda err: err_msg in str(err)
+    ):
+      _ = octav.get_tensor_quant_params(
+          op_info=self._fc_op_info,
+          tensor_quant_config=qtyping.TensorQuantizationConfig(
+              num_bits=4,
+              symmetric=False,
+              granularity=qtyping.QuantGranularity.CHANNELWISE,
+          ),
+          tensor_content=test_data,
+      )
+  def test_get_tensor_quant_params_success_with_qsv(self):
+    # Fall back to naive_min_max_quantize.py for non-weight tensors.
+    tensor_quant_params = octav.get_tensor_quant_params(
+        op_info=self._fc_op_info,
+        tensor_quant_config=qtyping.TensorQuantizationConfig(
+            num_bits=8,
+            granularity=qtyping.QuantGranularity.TENSORWISE,
+        ),
+        tensor_qsv={
+            "min": np.array([-1]),
+            "max": np.array([1]),
+        },
+    )
+    self.assertIsNone(tensor_quant_params.quantized_dimension)
+    scale = tensor_quant_params.scale
+    self.assertEqual(scale.shape, (1,))
+    self.assertSequenceAlmostEqual(scale.flatten(), [1 / 127])
+    # Zero point should be zero for symmetric quantization.
+    zp = tensor_quant_params.zero_point
+    self.assertEqual(np.sum(zp), 0)
+    self.assertEqual(zp.shape, (1,))
+  def test_get_tensor_quant_params_sanity_tensorwise(self):
+    test_data = np.array([
+        [-1e5, 25, -50, 75, -100, 125],
+        [25, -30, 50, -75, 1e5, -125],
+        [50, -60, 70, -80, 90, -100],
+    ])
+    quant_params = octav.get_tensor_quant_params(
+        op_info=self._fc_op_info,
+        tensor_quant_config=qtyping.TensorQuantizationConfig(
+            num_bits=4,
+            symmetric=True,
+            granularity=qtyping.QuantGranularity.TENSORWISE,
+        ),
+        tensor_content=test_data,
+    )
+    adjusted_test_data = quant_params.quantized_data * quant_params.scale
+    real_max = np.max(np.abs(test_data))
+    adjusted_max = np.max(np.abs(adjusted_test_data))
+    # Check that some clipping occurred.
+    with self.subTest(name="SanityCheckClipping"):
+      self.assertLess(adjusted_max, real_max)
+    with self.subTest(name="SanityCheckQuantParamsShapes"):
+      self.assertEqual(quant_params.zero_point.shape, (1, 1))
+      self.assertEqual(quant_params.scale.shape, (1, 1))
+      self.assertIsNone(quant_params.quantized_dimension)
+      self.assertIsNotNone(quant_params.quantized_data)
+      self.assertTupleEqual(
+          cast(np.ndarray, quant_params.quantized_data).shape, test_data.shape
+      )
+    with self.subTest(name="SanityCheckQuantParamsValues"):
+      self.assertTrue(np.all(quant_params.zero_point == 0))
+  def test_get_tensor_quant_params_sanity_channelwise(self):
+    test_data = np.array([
+        [-1e5, 25, -50, 75, -100, 125],
+        [25, -30, 50, -75, 1e5, -125],
+        [50, -60, 70, -80, 90, -100],
+    ])
+    quant_params = octav.get_tensor_quant_params(
+        op_info=self._fc_op_info,
+        tensor_quant_config=qtyping.TensorQuantizationConfig(
+            num_bits=4,
+            symmetric=True,
+            granularity=qtyping.QuantGranularity.CHANNELWISE,
+        ),
+        tensor_content=test_data,
+    )
+    adjusted_test_data = quant_params.quantized_data * quant_params.scale
+    for i, row in enumerate(test_data):
+      real_max = np.max(np.abs(row))
+      adjusted_max = np.max(np.abs(adjusted_test_data[i]))
+      # Check that some clipping occurred.
+      with self.subTest(name="SanityCheckClipping"):
+        self.assertLess(adjusted_max, real_max)
+    with self.subTest(name="SanityCheckQuantParamsShapes"):
+      self.assertEqual(quant_params.zero_point.shape, (test_data.shape[0], 1))
+      self.assertEqual(quant_params.scale.shape, (test_data.shape[0], 1))
+      self.assertIsNotNone(quant_params.quantized_data)
+      self.assertTupleEqual(
+          cast(np.ndarray, quant_params.quantized_data).shape, test_data.shape
+      )
+    with self.subTest(name="SanityCheckQuantParamsValues"):
+      self.assertTrue(np.all(quant_params.zero_point == 0))
+      self.assertEqual(quant_params.quantized_dimension, 0)
+if __name__ == "__main__":
+  googletest.main()

ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py CHANGED Viewed

@@ -16,6 +16,7 @@
 """Uniform quantize in tensor level."""
 import dataclasses
+from typing import Optional
 import numpy as np
 from ai_edge_quantizer import qtyping
@@ -237,7 +238,11 @@ def symmetric_quantize_bias_tensor(
 def tensor_zp_scale_from_min_max(
-    min_value, max_value, num_bits: int, symmetric: bool
+    min_value,
+    max_value,
+    num_bits: int,
+    symmetric: bool,
+    clipping_values: Optional[np.ndarray] = None,
 ):
   """Get zero point and scale from min and max value.
@@ -246,6 +251,10 @@ def tensor_zp_scale_from_min_max(
     max_value: The maximum value of the tensor (channel-wise supported).
     num_bits: The number of bits of the tensor.
     symmetric: Whether the tensor is symmetric.
+    clipping_values: Absolute clipping values to apply to the tensor. This will
+      clip the tensors to the range [-clipping_values, clipping_values]. This
+      should be the same shape as min_value and max_value. If None, no clipping
+      will be applied.
   Returns:
     The zero point and scale of the tensor.
@@ -261,6 +270,8 @@ def tensor_zp_scale_from_min_max(
   if symmetric:
     bound = np.maximum(np.abs(min_value), np.abs(max_value))
     bound = np.maximum(bound, min_bound)
+    if clipping_values is not None:
+      bound = np.clip(bound, -clipping_values, clipping_values)
     if not qtype.signed:
       half_q = (qmax - 1) / 2
       scale = bound / half_q
@@ -268,7 +279,6 @@ def tensor_zp_scale_from_min_max(
     else:
       scale = bound / qmax
       zp = np.zeros_like(scale, dtype=np.int32)
   else:
     # Include 0 to the range to support zero-padding.
     # See: https://arxiv.org/pdf/1712.05877.pdf
@@ -276,6 +286,8 @@ def tensor_zp_scale_from_min_max(
     bound_max = np.maximum(max_value, np.zeros_like(max_value))
     bound_min = np.minimum(min_value, np.zeros_like(min_value))
     bound = np.maximum(bound_max - bound_min, min_bound)
+    if clipping_values is not None:
+      bound = np.clip(bound, -clipping_values, clipping_values)
     scale = bound / (qmax - qmin)
     zp = qmin - bound_min / scale
     zp = np.rint(zp)

ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py CHANGED Viewed

@@ -352,6 +352,33 @@ class TensorUtilsTest(parameterized.TestCase):
       # Range has to be extended to include zero.
       self.assertEqual(calculated_min, 0)
+  @parameterized.parameters(
+      # number of bits, is_symmetric, max bound of the quantized range.
+      (4, True, 7),
+      (8, False, 255),
+    )
+  def test_tensor_zp_scale_from_min_max_with_clipping(
+      self, num_bits, symmetric, quantized_bound
+  ):
+    min_val = np.array([[1.0]])
+    max_val = np.array([[5.0]])
+    clipping_values = np.array([4.0])
+    zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
+        min_val, max_val, num_bits, symmetric, clipping_values
+    )
+    expected_scale = clipping_values / quantized_bound
+    with self.subTest(name="CheckShapes"):
+      self.assertEqual(zp.shape, scale.shape)
+      self.assertEqual(zp.shape, (1, 1))
+    if symmetric:
+      with self.subTest(name="CheckSymmetricZpValue"):
+        self.assertEqual(zp[0], 0)
+    with self.subTest(name="CheckScaleValue"):
+      self.assertEqual(scale[0], expected_scale)
 if __name__ == "__main__":
   googletest.main()

{ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ai-edge-quantizer-nightly
-Version: 0.0.1.dev20250317
+Version: 0.1.0.dev20250319
 Summary: A quantizer for advanced developers to quantize converted AI Edge models.
 Home-page: https://github.com/google-ai-edge/ai-edge-quantizer
 Keywords: On-Device ML,AI,Google,TFLite,Quantization,LLMs,GenAI
@@ -27,7 +27,7 @@ License-File: LICENSE
 Requires-Dist: immutabledict
 Requires-Dist: numpy
 Requires-Dist: tf-nightly>=2.17.0.dev20240509
-Requires-Dist: ai-edge-litert-nightly
+Requires-Dist: ai-edge-litert>=1.2.0
 It aims to facilitate advanced users to strive for optimal performance on
 resource demanding models (e.g., GenAI models).

{ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 ai_edge_quantizer/__init__.py,sha256=4pFSkukSwahYyzwqia0yPRyz8TnFQfGRthVJhYpMWas,793
-ai_edge_quantizer/algorithm_manager.py,sha256=VZx4HvGEgt6XAS-b0breFPioLfKkAFNG71VLSG4aKS8,7712
+ai_edge_quantizer/algorithm_manager.py,sha256=sOZ1T8n0YYi_ijDDuzryNJi2HUPggeo9uWNJri3elv0,10431
 ai_edge_quantizer/algorithm_manager_api.py,sha256=u903TG0s1uIDhJqfeJne3CFl8A93phZrwgV2-hwdcXU,9247
 ai_edge_quantizer/algorithm_manager_api_test.py,sha256=tL_ozYFTsOPX8qGcti0KTz37nVsCxf0SSG5C45SyT-g,7319
 ai_edge_quantizer/calibrator.py,sha256=n7AD9j7UScR-CieoI6DQRMeiG_fhLBfSLRiM4460xaM,11895
@@ -28,14 +28,16 @@ ai_edge_quantizer/algorithms/nonlinear_quantize/__init__.py,sha256=lpq1g2ayg3lCP
 ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting.py,sha256=Bs9CK7wZAw6jNaZ8xEtbwO2vM34VYXNZSMVWvxJo9nw,9297
 ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting_test.py,sha256=s64eDDH9bmRWy6Bl1peHnhGewLnFJjvnhYOdjo1zYOA,22625
 ai_edge_quantizer/algorithms/uniform_quantize/__init__.py,sha256=lpq1g2ayg3lCPLy79t2VicYcnGKw64FfYIj1V7J-4m8,676
-ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py,sha256=osvXIwVVEi5DRiT_MpJpAXGZCVMEoR0tcc6EwuAtcp0,22330
+ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py,sha256=LnItMEsR47qe8T5pg9UI5NGfhi4cOxt0vAU35IkWnaY,27163
 ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py,sha256=qMmKbWqxrCoVKbLKHn9WuCrGKPfHkEyU0Nmhokh8Qeo,2597
 ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py,sha256=OTXjEZ3Ctq3ffYzisX-6HwgK_DuA7uos_aap5PiIUPE,8686
 ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery_test.py,sha256=y7BK11fkF63Ex_Jzg3fbIdy0D_Ca6HuvChVZR7Uwggc,8073
-ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py,sha256=fBqSidFVKZmdO-xIFfwZPdIN1eLJjOik8mUZxZj2ljk,12149
+ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py,sha256=aWHU4rneBv7ErufEWKQGAWTK-pgfn-rG9mAkC0d9V6Q,7871
 ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py,sha256=Hok09dloSyBfD0oDM5VABdSZjM9JWSQhm_hDHNbFujA,7640
-ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py,sha256=Q_vx7YN7KMpjubsngxRdJ4bfdSIV-gmXjtVuxIkZuX4,11078
-ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py,sha256=WZ4_bvbG999nOtCIqn7mrMnpRdoJOdiyzxhsL_QiPHA,11395
+ai_edge_quantizer/algorithms/uniform_quantize/octav.py,sha256=e5wYtki-vl739gSVAZHAKcs2hA87GvFUjVoSUPlnkyM,6433
+ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py,sha256=IcTOaJ1pxtqsitqxOEP9LROVEP_19VFutHalqNied4I,6940
+ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py,sha256=WmZzKQlzfu9gFr9SbUDoPY3rFqTl363om8-0rTLwotw,11629
+ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py,sha256=G2PFpHhF-6OOuAwQ1lei63QEIm7uzIZJ62qpgA02qTM,12288
 ai_edge_quantizer/algorithms/utils/__init__.py,sha256=lpq1g2ayg3lCPLy79t2VicYcnGKw64FfYIj1V7J-4m8,676
 ai_edge_quantizer/algorithms/utils/common_utils.py,sha256=4qSlVNx3-91kJufnnJV1RdVRXBPapylZkrAp2nywoao,34581
 ai_edge_quantizer/algorithms/utils/common_utils_test.py,sha256=zqapGEfYhjQWe9cNGPLmdbwtEUUYQRhlO_kNe0cXX6E,18104
@@ -62,8 +64,8 @@ ai_edge_quantizer/utils/tfl_interpreter_utils.py,sha256=x2xA2CFPpe_2trcV8v5xGaBE
 ai_edge_quantizer/utils/tfl_interpreter_utils_test.py,sha256=Op3JxtOqlrjzmYF18jnnstL1k9xiY9kKJ8S2vklKGkc,11327
 ai_edge_quantizer/utils/validation_utils.py,sha256=oYw33Sg547AqtGw-choPUJmp9SAKkV46J_ddqSsum2Q,3950
 ai_edge_quantizer/utils/validation_utils_test.py,sha256=V_qNDikPD4OPB-siOLQCWNVWTAu87h2IgNYt7teFd-o,2934
-ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info/METADATA,sha256=Y7H57IqB-YuZ_cSLtY16TQmBsudtOQPoAmFU0MfbyvU,1528
-ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info/top_level.txt,sha256=8QTfPnFXNVUhScFLaa-NWZMFWMn72M50DVPubpwWB1g,18
-ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info/RECORD,,
+ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info/METADATA,sha256=WTz-_FHdUgNLhVPcpu4VW9rw2drBw92tUqa35_OsDWg,1527
+ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info/top_level.txt,sha256=8QTfPnFXNVUhScFLaa-NWZMFWMn72M50DVPubpwWB1g,18
+ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info/RECORD,,

{ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info}/LICENSE RENAMED Viewed

File without changes

{ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info}/WHEEL RENAMED Viewed

File without changes

{ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info}/top_level.txt RENAMED Viewed

File without changes

ai-edge-quantizer-nightly 0.0.1.dev20250317__py3-none-any.whl → 0.1.0.dev20250319__py3-none-any.whl

ai-edge-quantizer-nightly 0.0.1.dev20250317__py3-none-any.whl → 0.1.0.dev20250319__py3-none-any.whl