ai-edge-quantizer-nightly 0.1.0.dev20250512__py3-none-any.whl → 0.1.0.dev20250514__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_edge_quantizer/algorithm_manager.py +34 -0
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py +37 -12
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py +29 -2
- ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py +3 -5
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation.py +357 -0
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation_test.py +265 -0
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py +7 -31
- ai_edge_quantizer/algorithms/uniform_quantize/octav.py +27 -17
- ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py +93 -38
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py +133 -3
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py +11 -2
- ai_edge_quantizer/algorithms/utils/common_utils.py +21 -8
- ai_edge_quantizer/default_policy.py +4 -2
- ai_edge_quantizer/params_generator.py +1 -0
- ai_edge_quantizer/qtyping.py +34 -1
- ai_edge_quantizer/transformation_performer.py +5 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation.py +209 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation_test.py +200 -0
- ai_edge_quantizer/utils/test_utils.py +33 -0
- ai_edge_quantizer/utils/tfl_flatbuffer_utils.py +1 -0
- {ai_edge_quantizer_nightly-0.1.0.dev20250512.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250514.dist-info}/METADATA +1 -1
- {ai_edge_quantizer_nightly-0.1.0.dev20250512.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250514.dist-info}/RECORD +25 -21
- {ai_edge_quantizer_nightly-0.1.0.dev20250512.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250514.dist-info}/LICENSE +0 -0
- {ai_edge_quantizer_nightly-0.1.0.dev20250512.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250514.dist-info}/WHEEL +0 -0
- {ai_edge_quantizer_nightly-0.1.0.dev20250512.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250514.dist-info}/top_level.txt +0 -0
ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py CHANGED

```diff
@@ -16,9 +16,11 @@
 """Uniform quantize in tensor level."""
 
 import dataclasses
-from typing import Optional
+from typing import Optional, Sequence
+import ml_dtypes
 import numpy as np
 from ai_edge_quantizer import qtyping
+from ai_edge_quantizer.utils import tfl_flatbuffer_utils
 
 
 @dataclasses.dataclass(frozen=True)
```
```diff
@@ -120,19 +122,127 @@ def fix_quantization_params_rank(
 )
 
 
+def _get_tensor_shape_for_blockwise(
+    tensor_shape: Sequence[int], quantized_dim: int, block_size: int
+) -> list[int]:
+  """Get the tensor shape for blockwise quantization.
+
+  This function splits the quantize dimension of the tensor into blocks and the
+  dim/blocks. Hence, min/max of the tensor can be calculated for each block
+  using existing functions.
+
+  Args:
+    tensor_shape: The original shape of the tensor.
+    quantized_dim: The dimension to be quantized blockwise.
+    block_size: The size of the block.
+
+  Returns:
+    The new tensor shape for calculating scale and zp for blockwise
+    quantization.
+  """
+  new_shape = []
+  for index, val in enumerate(tensor_shape):
+    if index == quantized_dim:
+      new_shape.append(int(val / block_size))
+      new_shape.append(block_size)
+    else:
+      new_shape.append(val)
+  return new_shape
+
+
+def reshape_data_for_blockwise(
+    tensor_data: np.ndarray, op_name: qtyping.TFLOperationName, block_size: int
+) -> tuple[np.ndarray, int]:
+  """Reshapes data for blockwise quantization.
+
+  Args:
+    tensor_data: The original tensor data.
+    op_name: The name of the TFL op.
+    block_size: The size of the block.
+
+  Returns:
+    A tuple containing the reshaped tensor data and the new reduce dimension.
+  """
+  quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_BLOCKWISE_WEIGHT_QUANTIZED_DIM[
+      op_name
+  ]
+  new_shape = _get_tensor_shape_for_blockwise(
+      tensor_data.shape, quantized_dim, block_size
+  )
+  reshaped_data = tensor_data.reshape(new_shape)
+  return reshaped_data, quantized_dim + 1
+
+
+def _broadcast_scale_zp_for_blockwise(
+    tensor_content: np.ndarray,
+    quant_params: qtyping.UniformQuantParams,
+) -> qtyping.UniformQuantParams:
+  """Broadcasts scale and zp for blockwise quantization.
+
+  Args:
+    tensor_content: The original tensor data.
+    quant_params: The quantization parameters.
+      `quant_params.quantized_dimension` must be specified.
+      `quant_params.block_size` must be specified and positive.
+
+  Returns:
+    The updated quantization parameters with broadcasted scale and zp for
+    correct constant quantization.
+  """
+  if quant_params.quantized_dimension is None:
+    raise ValueError("Quantized dimension must be specified.")
+  if quant_params.block_size is None or quant_params.block_size <= 0:
+    raise ValueError("Block size must be specified and positive.")
+  quantized_dim = quant_params.quantized_dimension
+  expanded_tensor_shape = _get_tensor_shape_for_blockwise(
+      tensor_content.shape, quantized_dim, quant_params.block_size
+  )
+  expanded_scale = np.reshape(
+      np.broadcast_to(
+          np.expand_dims(quant_params.scale, quantized_dim + 1),
+          expanded_tensor_shape,
+      ),
+      tensor_content.shape,
+  )
+  expanded_zp = np.reshape(
+      np.broadcast_to(
+          np.expand_dims(quant_params.zero_point, quantized_dim + 1),
+          expanded_tensor_shape,
+      ),
+      tensor_content.shape,
+  )
+  return qtyping.UniformQuantParams(
+      scale=expanded_scale,
+      zero_point=expanded_zp,
+      num_bits=quant_params.num_bits,
+      symmetric=quant_params.symmetric,
+      quantized_dimension=quantized_dim,
+      block_size=quant_params.block_size,
+  )
+
+
 def uniform_quantize(
     tensor_data: np.ndarray,
     quantization_params: qtyping.UniformQuantParams,
+    is_blockwise: bool = False,
 ):
   """Uniform quantize a tensor.
 
   Args:
     tensor_data: The tensor to be quantized.
     quantization_params: The quantization parameters.
+    is_blockwise: Whether the tensor is blockwise quantized.
 
   Returns:
     The quantized tensor.
   """
+  # The reshaping for blockwise quantization is unique hence we do this here
+  # to avoid unexpected broadcast behavior downstream.
+  if is_blockwise:
+    quantization_params = _broadcast_scale_zp_for_blockwise(
+        tensor_data, quantization_params
+    )
+
   # quant params in flatbuffer is flattened, expand the rank to be the same
   # as the tensor rank to avoid ambiguous broadcasting.
   quantization_params = fix_quantization_params_rank(
```
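To make the reshape concrete: splitting the quantized dimension into `(dim / block_size, block_size)` turns a per-block min/max into an ordinary reduction over the trailing axis, exactly the `quantized_dim + 1` reduce dimension returned above. A minimal standalone sketch in plain NumPy (toy shapes assumed, not part of the package):

```python
import numpy as np

# A (4, 8) weight, blockwise-quantized along dim 1 with block_size 4,
# mirroring _get_tensor_shape_for_blockwise above.
tensor = np.arange(32, dtype=np.float32).reshape(4, 8)
quantized_dim, block_size = 1, 4

new_shape = []
for index, val in enumerate(tensor.shape):
  if index == quantized_dim:
    new_shape.append(val // block_size)  # number of blocks
    new_shape.append(block_size)         # elements per block
  else:
    new_shape.append(val)

blocks = tensor.reshape(new_shape)        # shape (4, 2, 4)
reduce_dim = quantized_dim + 1            # the new per-block axis
block_min = blocks.min(axis=reduce_dim)   # (4, 2): one min per block
block_max = blocks.max(axis=reduce_dim)   # (4, 2): one max per block
```

`_broadcast_scale_zp_for_blockwise` then runs the same reshape in reverse: the per-block scale/zero-point of shape `(4, 2)` is expanded back to the full `(4, 8)` tensor shape so constant quantization broadcasts unambiguously.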
```diff
@@ -242,15 +352,19 @@ def tensor_zp_scale_from_min_max(
     max_value,
     num_bits: int,
     symmetric: bool,
+    granularity: qtyping.QuantGranularity,
     clipping_values: Optional[np.ndarray] = None,
 ):
   """Get zero point and scale from min and max value.
 
   Args:
-    min_value: The minimum value of the tensor (
-
+    min_value: The minimum value of the tensor (channelwise and blockwise
+      supported).
+    max_value: The maximum value of the tensor (channelwise and blockwise
+      supported).
     num_bits: The number of bits of the tensor.
     symmetric: Whether the tensor is symmetric.
+    granularity: The granularity of the tensor.
     clipping_values: Absolute clipping values to apply to the tensor. This will
       clip the tensors to the range [-clipping_values, clipping_values]. This
       should be the same shape as min_value and max_value. If None, no clipping
```
```diff
@@ -267,6 +381,16 @@ def tensor_zp_scale_from_min_max(
   qmin, qmax = get_quantized_range(qtype)
   min_bound = 1e-4  # 1e-6 precision for int8 and 1e-8 for int16.
 
+  if granularity == qtyping.QuantGranularity.BLOCKWISE:
+    # Blockwise quantization uses float16 scale, with 7 bit mantissa,
+    # so the maximum representable value is 65280.
+    float16_max = np.broadcast_to(np.array(65280), min_value.shape)
+    clipping_values = (
+        float16_max
+        if clipping_values is None
+        else np.minimum(clipping_values, float16_max)
+    )
+
   if symmetric:
     bound = np.maximum(np.abs(min_value), np.abs(max_value))
     bound = np.maximum(bound, min_bound)
```
```diff
@@ -292,6 +416,12 @@ def tensor_zp_scale_from_min_max(
     zp = qmin - bound_min / scale
     zp = np.rint(zp)
 
+  if granularity == qtyping.QuantGranularity.BLOCKWISE:
+    # Round the scale values to 7 bit mantissa.
+    scale = (
+        scale.astype(ml_dtypes.bfloat16).astype(np.float16).astype(np.float32)
+    )
+
   # It's safe to cast zp to qtype without clipping because we can infer
   # qmin <= zp <= qmax from bound_min <= 0 <= bound_max.
   zp = assign_quantized_type(zp, qtype)
```
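The two blockwise branches work together: min/max are pre-clipped to 65280 so that the later round-trip through bfloat16 (7-bit mantissa) and float16 cannot overflow to infinity. A small demonstration of that round-trip, assuming only the `ml_dtypes` package that the diff itself imports:

```python
import numpy as np
import ml_dtypes

scale = np.array([0.12345678, 65504.0], dtype=np.float32)

# 65504 (float16 max) rounds up to 65536 in bfloat16, which then overflows
# float16 to inf -- hence the 65280 cap applied before the rounding.
unclipped = scale.astype(ml_dtypes.bfloat16).astype(np.float16)
# -> [~0.1235, inf]

clipped = np.minimum(scale, 65280.0)
rounded = clipped.astype(ml_dtypes.bfloat16).astype(np.float16).astype(np.float32)
# -> [~0.1235, 65280.0]; every value now fits in a 7-bit mantissa.
```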
ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py CHANGED

```diff
@@ -336,7 +336,11 @@ class TensorUtilsTest(parameterized.TestCase):
     max_val = np.max(self._test_data, keepdims=True)
 
     zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
-        min_val, max_val, num_bits, symmetric
+        min_val,
+        max_val,
+        num_bits,
+        symmetric,
+        qtyping.QuantGranularity.TENSORWISE,
     )
     self.assertEqual(zp.shape, scale.shape)
     max_q = 2**num_bits / 2 - 1
```
```diff
@@ -364,7 +368,12 @@ class TensorUtilsTest(parameterized.TestCase):
     max_val = np.array([[5.0]])
     clipping_values = np.array([4.0])
     zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
-        min_val, max_val, num_bits, symmetric, clipping_values
+        min_val,
+        max_val,
+        num_bits,
+        symmetric,
+        qtyping.QuantGranularity.TENSORWISE,
+        clipping_values,
     )
     expected_scale = clipping_values / quantized_bound
 
```
ai_edge_quantizer/algorithms/utils/common_utils.py CHANGED

```diff
@@ -905,23 +905,36 @@ def get_tensor_transformation_params(
   )
 
 
-def get_weight_quantized_dim(op_info: qtyping.OpInfo, tensor_data: np.ndarray):
+def get_weight_quantized_dim(
+    op_info: qtyping.OpInfo,
+    tensor_data: np.ndarray,
+    granularity: qtyping.QuantGranularity,
+):
   """Get the quantized dimension for the weight tensor.
 
   Args:
     op_info: Aggregated information about the op (e.g., quantization config).
     tensor_data: The weight tensor data.
+    granularity: The granularity of the weight tensor.
 
   Returns:
     The quantized dimension for the weight tensor.
   """
-  if op_info.op_name == _TFLOpName.BATCH_MATMUL:
-    quantized_dim = get_bmm_weight_quantized_dim(
-        tensor_data, adj_y=op_info.op.builtinOptions.adjY
-    )
-  else:
-    quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_WEIGHT_QUANTIZED_DIM.get(
-        op_info.op_name, None
+  quantized_dim = None
+  if granularity == qtyping.QuantGranularity.CHANNELWISE:
+    if op_info.op_name == _TFLOpName.BATCH_MATMUL:
+      quantized_dim = get_bmm_weight_quantized_dim(
+          tensor_data, adj_y=op_info.op.builtinOptions.adjY
+      )
+    else:
+      quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_WEIGHT_QUANTIZED_DIM.get(
+          op_info.op_name, None
+      )
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE:
+    quantized_dim = (
+        tfl_flatbuffer_utils.TFL_OP_TO_BLOCKWISE_WEIGHT_QUANTIZED_DIM[
+            op_info.op_name
+        ]
     )
   return quantized_dim
 
```
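The reason the lookup table differs per granularity: channelwise quantization needs one scale per output channel, while blockwise tiles fixed-size blocks along the reduction axis. A hedged sketch of the distinction (the axis choices below mirror common TFLite fully-connected conventions and are illustrative, not a verbatim copy of the package's tables):

```python
import numpy as np

# A fully-connected weight of shape (out_features, in_features).
w = np.random.randn(16, 64).astype(np.float32)

# Channelwise: quantized dim is the output axis -> one scale per row.
channel_scales = np.abs(w).max(axis=1, keepdims=True) / 127.0     # (16, 1)

# Blockwise: blocks of size 32 along the input (reduction) axis ->
# one scale per 32-element block; /7.0 assumes symmetric int4.
block_scales = np.abs(w.reshape(16, 2, 32)).max(axis=2) / 7.0     # (16, 2)
```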
ai_edge_quantizer/default_policy.py CHANGED

```diff
@@ -183,7 +183,8 @@ DEFAULT_JSON_POLICY = """
       "SELECT_V2",
       "DYNAMIC_UPDATE_SLICE",
       "SELECT_V2",
-      "STABLEHLO_COMPOSITE"
+      "STABLEHLO_COMPOSITE",
+      "PAD"
     ],
     "static_wi8_ai8": [
       "ADD",
```
```diff
@@ -214,7 +215,8 @@ DEFAULT_JSON_POLICY = """
       "SELECT_V2",
       "DYNAMIC_UPDATE_SLICE",
       "SELECT_V2",
-      "STABLEHLO_COMPOSITE"
+      "STABLEHLO_COMPOSITE",
+      "PAD"
     ],
     "static_wi4_ai8": ["FULLY_CONNECTED", "CONV_2D", "INPUT", "OUTPUT", "EMBEDDING_LOOKUP"],
     "static_wi4_ai16": ["FULLY_CONNECTED", "CONV_2D", "INPUT", "OUTPUT", "EMBEDDING_LOOKUP"],
```
ai_edge_quantizer/qtyping.py CHANGED

```diff
@@ -20,7 +20,7 @@ from collections.abc import MutableMapping
 import copy
 import dataclasses
 import enum
-from typing import Any, Optional, Union
+from typing import Any, Callable, Optional, Union
 
 import numpy as np
 from typing_extensions import TypeAlias
```
```diff
@@ -62,6 +62,7 @@ class TFLOperationName(str, enum.Enum):
   SELECT_V2 = 'SELECT_V2'
   DYNAMIC_UPDATE_SLICE = 'DYNAMIC_UPDATE_SLICE'
   STABLEHLO_COMPOSITE = 'STABLEHLO_COMPOSITE'
+  PAD = 'PAD'
 
 
 class QuantizeMode(enum.Enum):
```
```diff
@@ -113,6 +114,8 @@ class QuantTransformation(enum.Enum):
   DUPLICATE_BUFFER = 5
   # Duplicate the tensor.
   DUPLICATE_TENSOR = 6
+  # Insert the aeq.hadamard_rotation op.
+  INSERT_HADAMARD_ROTATION = 7
 
 
 @dataclasses.dataclass(frozen=True)
```
```diff
@@ -128,8 +131,35 @@ class UniformQuantParams:
     quantized_data: The quantized data.
     block_size: The block size for blockwise quantization, block_size=0 meaning
       no blockwise quantization.
+    hadamard: The Hadamard rotation parameters, if set.
   """
 
+  class HadamardRotationParams:
+    """Parameters for the Hadamard rotation.
+
+    Attributes:
+      random_binary_vector: The random binary vector for the Hadamard rotation.
+        TODO(b/415392354): Randomization is an experimental feature that's
+        currently not implemented yet hence this is always 1. We will add
+        support or remove in the future.
+      hadamard_size: The size of the Hadamard matrix.
+    """
+
+    random_binary_vector: np.ndarray
+    hadamard_size: int
+
+    def __init__(self, random_binary_vector: np.ndarray, hadamard_size: int):
+      self.random_binary_vector = random_binary_vector
+      self.hadamard_size = hadamard_size
+
+    def __eq__(self, other):
+      if other.__class__ is not self.__class__:
+        return NotImplemented
+      return (
+          np.array_equal(self.random_binary_vector, other.random_binary_vector)
+          and self.hadamard_size == other.hadamard_size
+      )
+
   num_bits: int
   quantized_dimension: Optional[int]
   scale: np.ndarray
```
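For context on what these parameters drive: a Hadamard rotation multiplies weights (and, via the inserted custom op, the matching activations) by an orthonormal Hadamard matrix, which leaves the matmul result unchanged while spreading outliers across elements, so the rotated weights quantize with less error. A standalone sketch of the underlying identity (uses `scipy` purely for the demo matrix; the package itself does not depend on it):

```python
import numpy as np
from scipy.linalg import hadamard  # demo-only dependency

size = 4                                # plays the role of hadamard_size
h = hadamard(size) / np.sqrt(size)      # orthonormal: h @ h.T == identity

x = np.random.randn(2, size).astype(np.float32)   # activations
w = np.random.randn(size, 3).astype(np.float32)   # weights

# Rotating both sides cancels out: (x @ h) @ (h.T @ w) == x @ w.
np.testing.assert_allclose((x @ h) @ (h.T @ w), x @ w, rtol=1e-4, atol=1e-5)
```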
```diff
@@ -137,6 +167,7 @@ class UniformQuantParams:
   symmetric: bool = True
   quantized_data: Optional[np.ndarray] = None
   block_size: int = 0
+  hadamard: Optional[HadamardRotationParams] = None
 
   @classmethod
   def from_tfl_tensor_details(cls, tensor_detail) -> 'UniformQuantParams':
```
```diff
@@ -180,6 +211,7 @@ class UniformQuantParams:
         and self.symmetric == other.symmetric
         and _compare_array_or_none(self.quantized_data, other.quantized_data)
         and self.block_size == other.block_size
+        and self.hadamard == other.hadamard
     )
 
 
```
```diff
@@ -492,6 +524,7 @@ class IOOperator:
   outputs: list[int]
   op_key: TFLOperationName
 
+
 # The function signature for `get_tensor_quant_params_fn`.
 GetTensorQuantParamsFuncSignature = Callable[
     [
```
ai_edge_quantizer/transformation_performer.py CHANGED

```diff
@@ -25,6 +25,7 @@ from ai_edge_quantizer.transformations import dequant_insert
 from ai_edge_quantizer.transformations import duplicate_buffer
 from ai_edge_quantizer.transformations import duplicate_tensor
 from ai_edge_quantizer.transformations import emulated_subchannel
+from ai_edge_quantizer.transformations import insert_hadamard_rotation
 from ai_edge_quantizer.transformations import quant_insert
 from ai_edge_quantizer.transformations import quantize_tensor
 from ai_edge_quantizer.transformations import transformation_utils
```
```diff
@@ -80,6 +81,9 @@ class TransformationPerformer:
         qtyping.QuantTransformation.DUPLICATE_TENSOR: (
             duplicate_tensor.duplicate_tensor
         ),
+        qtyping.QuantTransformation.INSERT_HADAMARD_ROTATION: (
+            insert_hadamard_rotation.insert_hadamard_rotation
+        ),
     }
     # transformations are seprated in two categories:
     # op_insertion_transformations are transformations that only insert ops
```
```diff
@@ -91,6 +95,7 @@ class TransformationPerformer:
         qtyping.QuantTransformation.ADD_QUANTIZE,
         qtyping.QuantTransformation.DUPLICATE_BUFFER,
         qtyping.QuantTransformation.DUPLICATE_TENSOR,
+        qtyping.QuantTransformation.INSERT_HADAMARD_ROTATION,
     ])
     self._op_replacement_transformations = set(
         [qtyping.QuantTransformation.EMULATED_SUBCHANNEL]
```
ai_edge_quantizer/transformations/insert_hadamard_rotation.py ADDED

```diff
@@ -0,0 +1,209 @@
+# Copyright 2024 The AI Edge Quantizer Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Hadamard rotation pattern transformation."""
+
+from flatbuffers import flexbuffers
+import numpy as np
+from ai_edge_quantizer import qtyping
+from ai_edge_quantizer.transformations import transformation_utils
+from ai_edge_litert import schema_py_generated  # pylint: disable=g-direct-tensorflow-import
+
+
+def _to_flexbuffer(
+    hadamard_size: int,
+    random_binary_vector: list[np.int8],
+) -> bytes:
+  """Converts hadamard_size to flexbuffer."""
+  fbb = flexbuffers.Builder()
+  with fbb.Map():
+    fbb.Int('hadamard_size', hadamard_size)
+    fbb.VectorFromElements('random_binary_vector', random_binary_vector)
+  return fbb.Finish()
+
+
+def _is_producer_embedding_lookup(
+    transformation: transformation_utils.TransformationInput,
+) -> bool:
+  """Checks if the tensor's producer is an embedding lookup op."""
+  if transformation.producer == -1:
+    return False
+  else:
+    return (
+        transformation.op_codes[
+            transformation.subgraph.operators[
+                transformation.producer
+            ].opcodeIndex
+        ].builtinCode
+        == schema_py_generated.BuiltinOperator.EMBEDDING_LOOKUP
+    )
+
+
+def _is_fully_connected(
+    transformation: transformation_utils.TransformationInput, op_id: int
+) -> bool:
+  """Checks if the any of the tensor's consumers is a fully connected op."""
+  return (
+      transformation.op_codes[
+          transformation.subgraph.operators[op_id].opcodeIndex
+      ].builtinCode
+      == schema_py_generated.BuiltinOperator.FULLY_CONNECTED
+  )
+
+
+def _update_embedding_lookup_consumers(
+    transformation: transformation_utils.TransformationInput,
+    new_tensor_id: int,
+) -> bool:
+  """Updates the consumers of the embedding lookup op to use the new tensor.
+
+  Args:
+    transformation: The transformation input to update the consumers of.
+    new_tensor_id: The new tensor id to use as the input to the embedding lookup
+      consumers.
+  """
+  for consumer in transformation.consumers:
+    # If the consumer is a graph output and not an op, we can ignore it here
+    # since the graph output will be updated later.
+    if consumer == -1:
+      continue
+    consumer_op = transformation.subgraph.operators[consumer]
+    # Find the input that was attached to the insertion point, and replace it
+    # with the new tensor.
+    for i in range(len(consumer_op.inputs)):
+      if consumer_op.inputs[i] == transformation.tensor_id:
+        consumer_op.inputs[i] = new_tensor_id
+
+
+def _update_fully_connected_consumers(
+    transformation: transformation_utils.TransformationInput,
+    new_tensor_id: int,
+) -> bool:
+  """Updates the fully connected op(s) to use the new tensor.
+
+  Since the new tensor is inserted to the fully_connected's input, we need to
+  scan each consumer (in case of multiple fully_connected ops), and update
+  the input tensor to the new tensor.
+
+  Args:
+    transformation: The transformation input to update the consumers of.
+    new_tensor_id: The new tensor id to use as the input to the fully connected
+      consumers.
+
+  Returns:
+    True if the fully connected op(s) were updated to use the new tensor.
+  """
+  updated = False
+  for consumer in transformation.consumers:
+    if _is_fully_connected(transformation, consumer):
+      transformation.subgraph.operators[consumer].inputs[0] = new_tensor_id
+      updated = True
+  return updated
+
+
+def insert_hadamard_rotation(
+    transformation_input: transformation_utils.TransformationInput,
+) -> qtyping.TransformationInfo:
+  """Inserts a custom aeq.hadamard_rotation op on this tensor.
+
+  This function works for float32 tensors only.
+
+  Args:
+    transformation_input: The transformation input to insert the custom op on.
+
+  Returns:
+    The transformation info of the inserted custom op.
+
+  Raises:
+    ValueError: If the transformation input is not a uniform quantization
+      transformation.
+    ValueError: If the Hadamard quantization params are not set.
+    ValueError: If the tensor is not a float32 tensor.
+    ValueError: If no supported ops were found as the tensor's producer or
+      consumers.
+  """
+  if not isinstance(
+      transformation_input.quant_params, qtyping.UniformQuantParams
+  ):
+    raise ValueError('Hadamard rotation supports uniform quantization only')
+
+  if transformation_input.quant_params.hadamard is None:
+    raise ValueError(
+        'Hadamard rotation quantization params are not set but op insertion is'
+        ' requested.'
+    )
+
+  tensor = transformation_input.subgraph.tensors[transformation_input.tensor_id]
+  if tensor.type != schema_py_generated.TensorType.FLOAT32:
+    raise ValueError(
+        'The Hadamard rotation op supports float32 tensors only. Got'
+        f' {tensor.type} tensor.'
+    )
+
+  # Create new custom op with the current tensor as input and a new activation
+  # tensor as output.
+  custom_op_code_idx = transformation_utils.add_op_code(
+      schema_py_generated.BuiltinOperator.CUSTOM,
+      transformation_input.op_codes,
+      'aeq.hadamard_rotation',
+  )
+  custom_op = schema_py_generated.OperatorT()
+  custom_op.opcodeIndex = custom_op_code_idx
+  custom_op.inputs = [transformation_input.tensor_id]
+  custom_op.customOptions = _to_flexbuffer(
+      transformation_input.quant_params.hadamard.hadamard_size,
+      transformation_input.quant_params.hadamard.random_binary_vector.tolist(),
+  )
+  new_tensor_id = transformation_utils.add_new_activation_tensor(
+      tensor.name + b'_rotated',
+      tensor.shapeSignature
+      if tensor.shapeSignature is not None
+      else tensor.shape,
+      schema_py_generated.TensorType.FLOAT32,
+      transformation_input.subgraph,
+  )
+  custom_op.outputs = [new_tensor_id]
+
+  # Update the users of this tensor to use the new tensor.
+  if _is_producer_embedding_lookup(transformation_input):
+    _update_embedding_lookup_consumers(transformation_input, new_tensor_id)
+  elif not _update_fully_connected_consumers(
+      transformation_input, new_tensor_id
+  ):
+    raise ValueError(
+        'The Hadamard rotation op supports embedding lookup and fully connected'
+        ' ops only, but no such ops were found.'
+    )
+
+  # If the tensor is a graph output, we need to replace the tensor with the
+  # new tensor.
+  for i, output in enumerate(transformation_input.subgraph.outputs):
+    if output == transformation_input.tensor_id:
+      transformation_input.subgraph.outputs[i] = new_tensor_id
+
+  # Find the actual insertion point. The insertion point should be after the
+  # producer op and before the first consumer op. The max() operation ensures
+  # that we're not using -1 as the insertion point.
+  first_consumer_id = min(transformation_input.consumers)
+  op_id = max(transformation_input.producer + 1, first_consumer_id)
+
+  # Insert the custom op.
+  transformation_input.subgraph.operators.insert(op_id, custom_op)
+
+  return qtyping.TransformationInfo(
+      op_id=op_id,
+      num_ops_added=1,
+      output_tensor_id=new_tensor_id,
+  )
```