ai-edge-quantizer-nightly 0.0.1.dev20250302__py3-none-any.whl → 0.5.0.dev20260103__py3-none-any.whl
This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- ai_edge_quantizer/algorithm_manager.py +224 -0
- ai_edge_quantizer/algorithm_manager_api_test.py +7 -0
- ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting_test.py +2 -2
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py +643 -20
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py +29 -2
- ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py +29 -35
- ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery_test.py +35 -12
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation.py +414 -0
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation_test.py +440 -0
- ai_edge_quantizer/algorithms/uniform_quantize/mse.py +127 -0
- ai_edge_quantizer/algorithms/uniform_quantize/mse_test.py +195 -0
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py +54 -168
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py +54 -17
- ai_edge_quantizer/algorithms/uniform_quantize/octav.py +188 -0
- ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py +240 -0
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py +260 -13
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py +152 -5
- ai_edge_quantizer/algorithms/utils/common_utils.py +142 -54
- ai_edge_quantizer/calibrator.py +58 -94
- ai_edge_quantizer/calibrator_test.py +5 -74
- ai_edge_quantizer/default_policy.py +108 -16
- ai_edge_quantizer/model_modifier.py +132 -8
- ai_edge_quantizer/model_modifier_test.py +81 -1
- ai_edge_quantizer/model_validator.py +38 -10
- ai_edge_quantizer/model_validator_test.py +2 -1
- ai_edge_quantizer/params_generator.py +230 -47
- ai_edge_quantizer/params_generator_test.py +366 -261
- ai_edge_quantizer/qtyping.py +92 -6
- ai_edge_quantizer/quantizer.py +167 -23
- ai_edge_quantizer/quantizer_test.py +288 -26
- ai_edge_quantizer/recipe.py +156 -21
- ai_edge_quantizer/recipe_manager.py +158 -1
- ai_edge_quantizer/recipe_manager_test.py +146 -32
- ai_edge_quantizer/recipe_test.py +93 -17
- ai_edge_quantizer/transformation_instruction_generator.py +313 -46
- ai_edge_quantizer/transformation_instruction_generator_test.py +449 -27
- ai_edge_quantizer/transformation_performer.py +112 -58
- ai_edge_quantizer/transformation_performer_test.py +176 -4
- ai_edge_quantizer/transformations/duplicate_buffer.py +46 -0
- ai_edge_quantizer/transformations/duplicate_buffer_test.py +106 -0
- ai_edge_quantizer/transformations/duplicate_tensor.py +62 -0
- ai_edge_quantizer/transformations/duplicate_tensor_test.py +131 -0
- ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation.py +299 -0
- ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation_test.py +244 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation.py +186 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation_test.py +200 -0
- ai_edge_quantizer/transformations/quantize_tensor.py +24 -44
- ai_edge_quantizer/transformations/quantize_tensor_test.py +3 -2
- ai_edge_quantizer/transformations/transformation_utils.py +157 -11
- ai_edge_quantizer/transformations/transformation_utils_test.py +96 -2
- ai_edge_quantizer/utils/calibration_utils.py +263 -1
- ai_edge_quantizer/utils/calibration_utils_test.py +173 -3
- ai_edge_quantizer/utils/constrained_ops_utils.py +111 -0
- ai_edge_quantizer/utils/constrained_ops_utils_test.py +50 -0
- ai_edge_quantizer/utils/test_utils.py +191 -58
- ai_edge_quantizer/utils/tfl_flatbuffer_utils.py +96 -50
- ai_edge_quantizer/utils/tfl_flatbuffer_utils_test.py +20 -0
- ai_edge_quantizer/utils/tfl_interpreter_utils.py +138 -5
- ai_edge_quantizer/utils/tfl_interpreter_utils_test.py +29 -2
- ai_edge_quantizer/utils/validation_utils.py +114 -4
- ai_edge_quantizer/utils/validation_utils_test.py +80 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/METADATA +13 -3
- ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info/RECORD +81 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/WHEEL +1 -1
- ai_edge_quantizer/transformations/emulated_subchannel.py +0 -363
- ai_edge_quantizer/transformations/emulated_subchannel_test.py +0 -212
- ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info/RECORD +0 -67
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info/licenses}/LICENSE +0 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/top_level.txt +0 -0
ai_edge_quantizer/qtyping.py
CHANGED
@@ -20,7 +20,7 @@ from collections.abc import MutableMapping
 import copy
 import dataclasses
 import enum
-from typing import Any, Optional, Union
+from typing import Any, Callable, Optional, Union
 
 import numpy as np
 from typing_extensions import TypeAlias
@@ -59,7 +59,31 @@ class TFLOperationName(str, enum.Enum):
   LOGISTIC = 'LOGISTIC'
   SLICE = 'SLICE'
   SUM = 'SUM'
+  SELECT = 'SELECT'
   SELECT_V2 = 'SELECT_V2'
+  DYNAMIC_UPDATE_SLICE = 'DYNAMIC_UPDATE_SLICE'
+  STABLEHLO_COMPOSITE = 'STABLEHLO_COMPOSITE'
+  PAD = 'PAD'
+  SQUARED_DIFFERENCE = 'SQUARED_DIFFERENCE'
+  MAX_POOL_2D = 'MAX_POOL_2D'
+  RESIZE_BILINEAR = 'RESIZE_BILINEAR'
+  RESIZE_NEAREST_NEIGHBOR = 'RESIZE_NEAREST_NEIGHBOR'
+  GATHER_ND = 'GATHER_ND'
+  PACK = 'PACK'
+  UNPACK = 'UNPACK'
+  DIV = 'DIV'
+  BROADCAST_TO = 'BROADCAST_TO'
+  SQRT = 'SQRT'
+  GATHER = 'GATHER'
+  HARD_SWISH = 'HARD_SWISH'
+  MAXIMUM = 'MAXIMUM'
+  PADV2 = 'PADV2'
+  REDUCE_MIN = 'REDUCE_MIN'
+  EQUAL = 'EQUAL'
+  NOT_EQUAL = 'NOT_EQUAL'
+  MIRROR_PAD = 'MIRROR_PAD'
+  SPACE_TO_DEPTH = 'SPACE_TO_DEPTH'
+  RELU = 'RELU'
 
 
 class QuantizeMode(enum.Enum):
@@ -90,7 +114,11 @@ class TensorDataType(str, enum.Enum):
 class QuantGranularity(str, enum.Enum):
   TENSORWISE = 'TENSORWISE'
   CHANNELWISE = 'CHANNELWISE'
-
+  # Blockwise quantization with various block sizes.
+  BLOCKWISE_32 = 'BLOCKWISE_32'
+  BLOCKWISE_64 = 'BLOCKWISE_64'
+  BLOCKWISE_128 = 'BLOCKWISE_128'
+  BLOCKWISE_256 = 'BLOCKWISE_256'
 
 
 class QuantTransformation(enum.Enum):
@@ -104,9 +132,18 @@ class QuantTransformation(enum.Enum):
   ADD_DEQUANTIZE = 2
   # Quantize the float tensor: float_tensor -> quantized_tensor.
   QUANTIZE_TENSOR = 3
-  # Create pattern for emulated subchannel quantization,
-  # connected op.
+  # (Deprecated) Create pattern for emulated subchannel quantization,
+  # only support fully connected op.
   EMULATED_SUBCHANNEL = 4
+  # Duplicate the buffer.
+  DUPLICATE_BUFFER = 5
+  # Duplicate the tensor.
+  DUPLICATE_TENSOR = 6
+  # Insert the aeq.hadamard_rotation op.
+  INSERT_HADAMARD_ROTATION = 7
+  # Insert decomposed Hadamard rotation ops. This expresses the Hadamard
+  # rotation as matrix multiplication with Hadamard matrices.
+  INSERT_DECOMPOSED_HADAMARD_ROTATION = 8
 
 
 @dataclasses.dataclass(frozen=True)
@@ -122,8 +159,35 @@ class UniformQuantParams:
     quantized_data: The quantized data.
     block_size: The block size for blockwise quantization, block_size=0 meaning
       no blockwise quantization.
+    hadamard: The Hadamard rotation parameters, if set.
   """
 
+  class HadamardRotationParams:
+    """Parameters for the Hadamard rotation.
+
+    Attributes:
+      random_binary_vector: The random binary vector for the Hadamard rotation.
+        TODO(b/415392354): Randomization is an experimental feature that's
+        currently not implemented yet hence this is always 1. We will add
+        support or remove in the future.
+      hadamard_size: The size of the Hadamard matrix.
+    """
+
+    random_binary_vector: np.ndarray
+    hadamard_size: int
+
+    def __init__(self, random_binary_vector: np.ndarray, hadamard_size: int):
+      self.random_binary_vector = random_binary_vector
+      self.hadamard_size = hadamard_size
+
+    def __eq__(self, other):
+      if other.__class__ is not self.__class__:
+        return NotImplemented
+      return (
+          np.array_equal(self.random_binary_vector, other.random_binary_vector)
+          and self.hadamard_size == other.hadamard_size
+      )
+
   num_bits: int
   quantized_dimension: Optional[int]
   scale: np.ndarray
@@ -131,6 +195,7 @@ class UniformQuantParams:
   symmetric: bool = True
   quantized_data: Optional[np.ndarray] = None
   block_size: int = 0
+  hadamard: Optional[HadamardRotationParams] = None
 
   @classmethod
   def from_tfl_tensor_details(cls, tensor_detail) -> 'UniformQuantParams':
@@ -161,6 +226,7 @@ class UniformQuantParams:
         scale=quant_params['scales'],
         zero_point=quant_params['zero_points'],
         symmetric=symmetric,
+        block_size=quant_params['block_size'],
     )
 
   def __eq__(self, other):
@@ -174,6 +240,7 @@ class UniformQuantParams:
         and self.symmetric == other.symmetric
         and _compare_array_or_none(self.quantized_data, other.quantized_data)
        and self.block_size == other.block_size
+        and self.hadamard == other.hadamard
     )
 
 
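For orientation, here is a minimal sketch, not part of the diff, of how the new nested HadamardRotationParams class and the `hadamard` field compose; all values are illustrative:

    import numpy as np
    from ai_edge_quantizer import qtyping

    # Randomization is not implemented yet, so the binary vector is all ones.
    hadamard = qtyping.UniformQuantParams.HadamardRotationParams(
        random_binary_vector=np.ones(1), hadamard_size=4
    )
    params = qtyping.UniformQuantParams(
        num_bits=8,
        quantized_dimension=None,
        scale=np.array([0.5]),
        zero_point=np.array([0]),
        hadamard=hadamard,
    )
    assert params.hadamard == hadamard  # uses the __eq__ added in the diff above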
@@ -249,14 +316,13 @@ class TensorQuantizationConfig:
     granularity: Whether to perform per-tensor, per-channel or per-block
       quantization.
     dtype: The data type of the tensor.
-
+    algorithm_key: The algorithm key to use for quantization.
   """
 
   num_bits: int
   symmetric: bool = True
   granularity: QuantGranularity = QuantGranularity.TENSORWISE
   dtype: TensorDataType = TensorDataType.INT
-  block_size: int = 0
 
   def to_dict(self) -> dict[str, Any]:
     """Converts ActivationQuantizationConfig to dict."""
@@ -274,9 +340,28 @@ class TensorQuantizationConfig:
   def from_dict(cls, params: dict[str, Any]) -> 'TensorQuantizationConfig':
     """Converts a given dict to TensorQuantizationConfig."""
     params_copy = copy.deepcopy(params)
+    # Process block_size config from legacy recipe.
+    params_copy = _process_block_size(params_copy)
     return cls(**params_copy)
 
 
+def _process_block_size(params: dict[str, Any]) -> dict[str, Any]:
+  """Processes block size in the params."""
+  block_size = params.pop('block_size', 0)
+  if block_size > 0:
+    if block_size == 32:
+      params['granularity'] = QuantGranularity.BLOCKWISE_32
+    elif block_size == 64:
+      params['granularity'] = QuantGranularity.BLOCKWISE_64
+    elif block_size == 128:
+      params['granularity'] = QuantGranularity.BLOCKWISE_128
+    elif block_size == 256:
+      params['granularity'] = QuantGranularity.BLOCKWISE_256
+    else:
+      raise ValueError(f'Unsupported block size: {block_size}')
+  return params
+
+
 @dataclasses.dataclass(frozen=True)
 class OpQuantizationConfig:
   """Configuration class to control the quantization process behavior.
@@ -486,6 +571,7 @@ class IOOperator:
   outputs: list[int]
   op_key: TFLOperationName
 
+
 # The function signature for `get_tensor_quant_params_fn`.
 GetTensorQuantParamsFuncSignature = Callable[
     [
ai_edge_quantizer/quantizer.py
CHANGED
@@ -18,8 +18,10 @@
 from collections.abc import Iterable
 import dataclasses
 import json
+import logging
 import os
 from typing import Any, Optional, Union
+
 from ai_edge_quantizer import algorithm_manager
 from ai_edge_quantizer import calibrator
 from ai_edge_quantizer import default_policy
@@ -28,11 +30,11 @@ from ai_edge_quantizer import model_validator
 from ai_edge_quantizer import params_generator
 from ai_edge_quantizer import qtyping
 from ai_edge_quantizer import recipe_manager
-from ai_edge_quantizer.utils import test_utils
 from ai_edge_quantizer.utils import tfl_flatbuffer_utils
 from ai_edge_quantizer.utils import tfl_interpreter_utils
 from ai_edge_quantizer.utils import validation_utils
-
+import os  # tensorflow.python.platform.gfile  # pylint: disable=g-direct-tensorflow-import
+
 
 # Expose algorithm names to users.
 AlgorithmName = algorithm_manager.AlgorithmName
@@ -58,50 +60,62 @@ class QuantizationResult:
   recipe: _QuantRecipe
   quantized_model: Optional[bytearray]
 
-  def save(
+  def save(
+      self, save_folder: str, model_name: str, overwrite: bool = False
+  ) -> None:
     """Saves the quantized model and the quantization recipe.
 
     Args:
       save_folder: Path to the folder to save the quantized model and the
         quantization recipe.
       model_name: Name of the model.
+      overwrite: Whether to overwrite the model if it already exists.
 
     Raises:
       RuntimeError: If no quantized model is available.
-      FileExistsError: If the model already exists in the folder.
     """
-    if
-
-
-    )
+    if not os.path.exists(save_folder):
+      os.makedirs(save_folder)
+
     model_save_path = os.path.join(save_folder, f'{model_name}.tflite')
-
-      raise FileExistsError(
-          f'The model {model_save_path} already exists in the folder.'
-      )
-    with gfile.GFile(model_save_path, 'wb') as output_file_handle:
-      output_file_handle.write(self.quantized_model)
+    self.export_model(model_save_path, overwrite)
 
-    recipe = json.dumps(self.recipe)
     recipe_save_path = os.path.join(save_folder, model_name + '_recipe.json')
-
+    recipe = json.dumps(self.recipe)
+    with open(recipe_save_path, 'w') as output_file_handle:
       output_file_handle.write(recipe)
 
-  def export_model(self, filepath: str) -> None:
+  def export_model(self, filepath: str, overwrite: bool = False) -> None:
     """Exports the quantized model to a .tflite flatbuffer.
 
     Args:
       filepath: Path (including file name) that the exported model should be
         serialized to.
+      overwrite: Whether to overwrite the model if it already exists.
 
     Raises:
       RuntimeError: If no quantized model is available.
+      ValueError: If the model already exists in the folder and overwrite is
+        False.
     """
     if self.quantized_model is None:
       raise RuntimeError(
          'No quantized model to save. Make sure .quantize() is called.'
      )
-
+    if os.path.exists(filepath):
+      if overwrite:
+        logging.warning(
+            'The model %s already exists in the folder. Overwriting the model'
+            ' since overwrite=True.',
+            filepath,
+        )
+      else:
+        raise ValueError(
+            f'The model {filepath} already exists in the folder. Please'
+            ' consider change the model name or specify overwrite=True to'
+            ' overwrite the model if needed.'
+        )
+    with open(filepath, 'wb') as output_file_handle:
       output_file_handle.write(self.quantized_model)
 
 
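Usage sketch, not part of the diff, of the new save/export behavior shown above; file paths are illustrative and the recipe is assumed to need no calibration. `save` now creates the output folder if needed and delegates to `export_model`, which raises ValueError (previously FileExistsError) unless `overwrite=True`:

    from ai_edge_quantizer import quantizer

    qt = quantizer.Quantizer('model.tflite', 'recipe.json')
    result = qt.quantize()
    result.save('/tmp/out', 'my_model')  # creates /tmp/out if it does not exist
    # Re-exporting to an existing path requires overwrite=True.
    result.export_model('/tmp/out/my_model.tflite', overwrite=True)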
@@ -112,12 +126,16 @@ class Quantizer:
     float_model: TFLite model file path or bytearray.
     quantization_recipe: Quantization recipe .json filepath or in loaded json
       format.
+    previous_quantized_model: Optional previously quantized TFLite model file
+      path or bytearray. This is useful for validating a quantized model
+      without quantizing it again.
   """
 
   def __init__(
       self,
       float_model: Union[str, bytearray],
       quantization_recipe: Optional[Union[str, _QuantRecipe]] = None,
+      previous_quantized_model: Optional[Union[str, bytearray]] = None,
   ):
     """Initializes the quantizer.
 
@@ -125,6 +143,9 @@ class Quantizer:
       float_model: Path to the float tflite model.
       quantization_recipe: Quantization recipe in .json filepath or loaded json
         format.
+      previous_quantized_model: Path to an optional previously quantized tflite
+        model. This is useful for validating a quantized model without
+        quantizing it again.
     """
     # Use `float model` as bytes for memory efficiency.
     self.float_model: bytes = (
@@ -132,6 +153,14 @@ class Quantizer:
         if isinstance(float_model, str)
         else float_model
     )
+    if previous_quantized_model is not None:
+      self.previous_quantized_model: bytes = (
+          tfl_flatbuffer_utils.get_model_content(previous_quantized_model)
+          if isinstance(previous_quantized_model, str)
+          else previous_quantized_model
+      )
+    else:
+      self.previous_quantized_model = None
 
     self._recipe_manager: recipe_manager.RecipeManager = (
         recipe_manager.RecipeManager()
@@ -139,6 +168,7 @@ class Quantizer:
     if quantization_recipe is not None:
       self.load_quantization_recipe(quantization_recipe)
     self._result: QuantizationResult = QuantizationResult([{}], None)
+    self._quantize_called = False
 
   def load_quantization_recipe(self, recipe: Union[str, _QuantRecipe]) -> None:
     """Loads a quantization recipe.
@@ -149,7 +179,7 @@ class Quantizer:
       recipe: Quantization recipe in json format.
     """
     if isinstance(recipe, str):
-      with
+      with open(recipe) as json_file:
         recipe = json.load(json_file)
     self._recipe_manager.load_quantization_recipe(recipe)
 
@@ -161,7 +191,7 @@ class Quantizer:
     Args:
       filename: Config policy filename.
     """
-    with
+    with open(filename, 'r') as f:
       policy = default_policy.update_default_config_policy(f.read())
 
     # Register the policy for MIN_MAX_UNIFORM_QUANT algorithm.
@@ -207,6 +237,109 @@ class Quantizer:
         regex, operation_name, op_config, algorithm_key
     )
 
+  def add_dynamic_config(
+      self,
+      regex: str,
+      operation_name: _TFLOpName,
+      num_bits: int,
+      granularity: qtyping.QuantGranularity = qtyping.QuantGranularity.CHANNELWISE,
+      algorithm_key: str = algorithm_manager.AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+  ):
+    """Adds a dynamic quantization configuration to the recipe.
+
+    During dynamic quantization, activations are not processed by AEQ and
+    remain in float format. The runtime kernel is expected to quantize these
+    activations on-the-fly, as indicated by compute_precision=Integer and
+    explicit_dequantize=False.
+
+    The model quality may suffer due to the on-the-fly quantization. If quality
+    is a concern, consider using weight-only
+    quantization.
+
+    Args:
+      regex: Regular expression for layer name (op's output tensor name)
+        matching.
+      operation_name: Target TFLite operation.
+      num_bits: Number of bits for quantization.
+      granularity: Granularity of quantization.
+      algorithm_key: Algorithm key to be applied.
+    """
+    self._recipe_manager.add_dynamic_config(
+        regex, operation_name, num_bits, granularity, algorithm_key
+    )
+
+  def add_weight_only_config(
+      self,
+      regex: str,
+      operation_name: _TFLOpName,
+      num_bits: int,
+      granularity: qtyping.QuantGranularity = qtyping.QuantGranularity.CHANNELWISE,
+      algorithm_key: str = algorithm_manager.AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+  ):
+    """Adds a weight only quantization configuration to the recipe.
+
+    In weight-only quantization, weights are quantized, but the actual operation
+    (op) computation remains in float. The quantized weight is explicitly
+    dequantized before being fed into the op. This is achieved by inserting a
+    dequantize op between the quantized weight and the consuming op. To enable
+    this, both compute_precision will be set to Float and explicit_dequantize to
+    True.
+
+    Weight-only quantization is useful for reducing model size but may
+    not decrease latency due to float computation. However, quantized model
+    generally has better quality than other quantization options (e.g., dynamic
+    range quantization) due to no loss of precision on activations. If latency
+    is a concern, consider using dynamic quantization.
+
+    Args:
+      regex: Regular expression for layer name matching.
+      operation_name: Target TFLite operation.
+      num_bits: Number of bits for quantization.
+      granularity: Granularity of quantization.
+      algorithm_key: Algorithm key to be applied.
+    """
+    self._recipe_manager.add_weight_only_config(
+        regex, operation_name, num_bits, granularity, algorithm_key
+    )
+
+  def add_static_config(
+      self,
+      regex: str,
+      operation_name: _TFLOpName,
+      activation_num_bits: int,
+      weight_num_bits: int,
+      weight_granularity: qtyping.QuantGranularity = qtyping.QuantGranularity.CHANNELWISE,
+      algorithm_key: str = algorithm_manager.AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+  ):
+    """Adds a static quantization configuration to the recipe.
+
+    In static quantization, both weights and activations are quantized. This
+    requires a calibration step to determine the quantization parameters (e.g.,
+    min/max ranges) for activations. The quantized model uses integer arithmetic
+    for computations, which can lead to significant latency reductions.
+
+    However, calibration is needed to determine the quantization parameters for
+    activations, which requires sample data and may lead to quality loss. If
+    there is no hardware requirement for full integer quantization, consider
+    using dynamic quantization for simplicity.
+
+    Args:
+      regex: Regular expression for layer name matching.
+      operation_name: Target TFLite operation.
+      activation_num_bits: Number of bits for activation quantization.
+      weight_num_bits: Number of bits for weight quantization.
+      weight_granularity: Granularity of weight quantization.
+      algorithm_key: Algorithm key to be applied.
+    """
+    self._recipe_manager.add_static_config(
+        regex,
+        operation_name,
+        activation_num_bits,
+        weight_num_bits,
+        weight_granularity,
+        algorithm_key,
+    )
+
   @property
   def need_calibration(self) -> bool:
     """Checks if the current recipe needs calibration."""
@@ -282,7 +415,7 @@
     Raises:
       RuntimeError: If quantization recipe is empty.
     """
-
+    self._quantize_called = True
     if calibration_result is not None:
       self._ensure_model_qsv_sufficient(calibration_result)
 
@@ -301,6 +434,7 @@
       error_metrics: str = 'mse',
       use_xnnpack: bool = True,
       num_threads: int = 16,
+      validate_output_tensors_only: bool = False,
   ) -> model_validator.ComparisonResult:
     """Numerical validation of the quantized model for a model signature.
 
@@ -319,23 +453,33 @@
       error_metrics: Error metrics to be used for comparison.
       use_xnnpack: Whether to use the xnnpack library for validation.
       num_threads: Number of threads to use for validation.
+      validate_output_tensors_only: If True, only compare output tensors.
+        Otherwise, compare all tensors.
 
     Returns:
       The comparison result.
     """
     if test_data is None:
       # Create test data for all signatures in the model.
-      test_data =
+      test_data = tfl_interpreter_utils.create_random_normal_input_data(
           self.float_model, num_samples=1
       )
+    if self._quantize_called:
+      quantized_model = self._result.quantized_model
+    else:
+      quantized_model = self.previous_quantized_model
+
+    if quantized_model is None:
+      raise ValueError('No quantized model available to validate.')
     return model_validator.compare_model(
         self.float_model,
-
+        quantized_model,
         test_data,
         error_metrics,
         validation_utils.get_validation_func(error_metrics),
         use_xnnpack=use_xnnpack,
         num_threads=num_threads,
+        validate_output_tensors_only=validate_output_tensors_only,
     )
 
   def _get_quantization_params(