ai-edge-quantizer-nightly 0.0.1.dev20250302__py3-none-any.whl → 0.5.0.dev20260103__py3-none-any.whl
This diff shows the content of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- ai_edge_quantizer/algorithm_manager.py +224 -0
- ai_edge_quantizer/algorithm_manager_api_test.py +7 -0
- ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting_test.py +2 -2
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py +643 -20
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py +29 -2
- ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py +29 -35
- ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery_test.py +35 -12
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation.py +414 -0
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation_test.py +440 -0
- ai_edge_quantizer/algorithms/uniform_quantize/mse.py +127 -0
- ai_edge_quantizer/algorithms/uniform_quantize/mse_test.py +195 -0
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py +54 -168
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py +54 -17
- ai_edge_quantizer/algorithms/uniform_quantize/octav.py +188 -0
- ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py +240 -0
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py +260 -13
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py +152 -5
- ai_edge_quantizer/algorithms/utils/common_utils.py +142 -54
- ai_edge_quantizer/calibrator.py +58 -94
- ai_edge_quantizer/calibrator_test.py +5 -74
- ai_edge_quantizer/default_policy.py +108 -16
- ai_edge_quantizer/model_modifier.py +132 -8
- ai_edge_quantizer/model_modifier_test.py +81 -1
- ai_edge_quantizer/model_validator.py +38 -10
- ai_edge_quantizer/model_validator_test.py +2 -1
- ai_edge_quantizer/params_generator.py +230 -47
- ai_edge_quantizer/params_generator_test.py +366 -261
- ai_edge_quantizer/qtyping.py +92 -6
- ai_edge_quantizer/quantizer.py +167 -23
- ai_edge_quantizer/quantizer_test.py +288 -26
- ai_edge_quantizer/recipe.py +156 -21
- ai_edge_quantizer/recipe_manager.py +158 -1
- ai_edge_quantizer/recipe_manager_test.py +146 -32
- ai_edge_quantizer/recipe_test.py +93 -17
- ai_edge_quantizer/transformation_instruction_generator.py +313 -46
- ai_edge_quantizer/transformation_instruction_generator_test.py +449 -27
- ai_edge_quantizer/transformation_performer.py +112 -58
- ai_edge_quantizer/transformation_performer_test.py +176 -4
- ai_edge_quantizer/transformations/duplicate_buffer.py +46 -0
- ai_edge_quantizer/transformations/duplicate_buffer_test.py +106 -0
- ai_edge_quantizer/transformations/duplicate_tensor.py +62 -0
- ai_edge_quantizer/transformations/duplicate_tensor_test.py +131 -0
- ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation.py +299 -0
- ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation_test.py +244 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation.py +186 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation_test.py +200 -0
- ai_edge_quantizer/transformations/quantize_tensor.py +24 -44
- ai_edge_quantizer/transformations/quantize_tensor_test.py +3 -2
- ai_edge_quantizer/transformations/transformation_utils.py +157 -11
- ai_edge_quantizer/transformations/transformation_utils_test.py +96 -2
- ai_edge_quantizer/utils/calibration_utils.py +263 -1
- ai_edge_quantizer/utils/calibration_utils_test.py +173 -3
- ai_edge_quantizer/utils/constrained_ops_utils.py +111 -0
- ai_edge_quantizer/utils/constrained_ops_utils_test.py +50 -0
- ai_edge_quantizer/utils/test_utils.py +191 -58
- ai_edge_quantizer/utils/tfl_flatbuffer_utils.py +96 -50
- ai_edge_quantizer/utils/tfl_flatbuffer_utils_test.py +20 -0
- ai_edge_quantizer/utils/tfl_interpreter_utils.py +138 -5
- ai_edge_quantizer/utils/tfl_interpreter_utils_test.py +29 -2
- ai_edge_quantizer/utils/validation_utils.py +114 -4
- ai_edge_quantizer/utils/validation_utils_test.py +80 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/METADATA +13 -3
- ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info/RECORD +81 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/WHEEL +1 -1
- ai_edge_quantizer/transformations/emulated_subchannel.py +0 -363
- ai_edge_quantizer/transformations/emulated_subchannel_test.py +0 -212
- ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info/RECORD +0 -67
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info/licenses}/LICENSE +0 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/top_level.txt +0 -0
```diff
--- a/ai_edge_quantizer/default_policy.py
+++ b/ai_edge_quantizer/default_policy.py
@@ -18,8 +18,10 @@
 import collections
 import copy
 import json
-from typing import Any
+from typing import Any, Union
+from ai_edge_litert.tools import flatbuffer_utils
 from ai_edge_quantizer import qtyping
+from ai_edge_litert import schema_py_generated as schema  # pylint:disable=g-direct-tensorflow-import
 
 _TFLOpName = qtyping.TFLOperationName
 _OpQuantizationConfig = qtyping.OpQuantizationConfig
```
```diff
@@ -55,6 +57,16 @@ DEFAULT_JSON_POLICY = """
       "explicit_dequantize": false,
       "compute_precision": "INTEGER"
     },
+    "dynamic_wi4_afp32_blockwise": {
+      "weight_tensor_config": {
+        "num_bits": 4,
+        "symmetric": [true],
+        "granularity": ["BLOCKWISE_32", "BLOCKWISE_64", "BLOCKWISE_128", "BLOCKWISE_256"],
+        "dtype": "INT"
+      },
+      "explicit_dequantize": false,
+      "compute_precision": "INTEGER"
+    },
     "static_wi8_ai16": {
       "activation_tensor_config": {
         "num_bits": 16,
```
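The new `dynamic_wi4_afp32_blockwise` preset stores one scale per fixed-size block of weights instead of one per tensor or channel. A minimal NumPy sketch of what symmetric int4 quantization at `BLOCKWISE_32` granularity computes (names and structure are mine for illustration, not the quantizer's internals):

```python
import numpy as np

def quantize_blockwise_int4(w: np.ndarray, block_size: int = 32):
  """Illustrative symmetric 4-bit blockwise quantization of a 2-D matrix."""
  rows, cols = w.shape
  assert cols % block_size == 0, "quantized dim must be a multiple of block"
  blocks = w.reshape(rows, cols // block_size, block_size)
  # One scale per block; symmetric int4 keeps zero at 0 and maps the block's
  # max magnitude to 7 (the positive end of the [-8, 7] int4 range).
  scales = np.abs(blocks).max(axis=-1, keepdims=True) / 7.0
  scales = np.where(scales == 0.0, 1.0, scales)  # guard all-zero blocks
  q = np.clip(np.round(blocks / scales), -8, 7).astype(np.int8)
  return q.reshape(rows, cols), scales.squeeze(-1)

w = np.random.randn(4, 64).astype(np.float32)
q, scales = quantize_blockwise_int4(w, block_size=32)
assert q.shape == (4, 64) and scales.shape == (4, 2)  # 2 blocks per row
```

Smaller blocks track local weight statistics more closely at the cost of more scale metadata, which is presumably why the policy admits block sizes from 32 up to 256.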
```diff
@@ -165,9 +177,30 @@ DEFAULT_JSON_POLICY = """
       "INPUT",
       "OUTPUT",
       "SLICE",
-      "EMBEDDING_LOOKUP",
       "SUM",
-      "
+      "SELECT",
+      "SELECT_V2",
+      "DYNAMIC_UPDATE_SLICE",
+      "SELECT_V2",
+      "STABLEHLO_COMPOSITE",
+      "PAD",
+      "MAX_POOL_2D",
+      "RESIZE_BILINEAR",
+      "RESIZE_NEAREST_NEIGHBOR",
+      "GATHER_ND",
+      "PACK",
+      "UNPACK",
+      "DIV",
+      "BROADCAST_TO",
+      "SQRT",
+      "GATHER",
+      "MAXIMUM",
+      "PADV2",
+      "REDUCE_MIN",
+      "EQUAL",
+      "NOT_EQUAL",
+      "MIRROR_PAD",
+      "RELU"
     ],
     "static_wi8_ai8": [
       "ADD",
```
```diff
@@ -193,12 +226,36 @@ DEFAULT_JSON_POLICY = """
       "INPUT",
       "OUTPUT",
       "SLICE",
-      "EMBEDDING_LOOKUP",
       "SUM",
-      "
+      "SELECT",
+      "SELECT_V2",
+      "DYNAMIC_UPDATE_SLICE",
+      "SELECT_V2",
+      "STABLEHLO_COMPOSITE",
+      "PAD",
+      "SQUARED_DIFFERENCE",
+      "MAX_POOL_2D",
+      "RESIZE_BILINEAR",
+      "RESIZE_NEAREST_NEIGHBOR",
+      "GATHER_ND",
+      "PACK",
+      "UNPACK",
+      "DIV",
+      "BROADCAST_TO",
+      "SQRT",
+      "GATHER",
+      "HARD_SWISH",
+      "MAXIMUM",
+      "PADV2",
+      "REDUCE_MIN",
+      "EQUAL",
+      "NOT_EQUAL",
+      "MIRROR_PAD",
+      "SPACE_TO_DEPTH",
+      "RELU"
     ],
-    "static_wi4_ai8": ["FULLY_CONNECTED", "CONV_2D", "INPUT", "OUTPUT"
-    "static_wi4_ai16": ["FULLY_CONNECTED", "CONV_2D", "INPUT", "OUTPUT"
+    "static_wi4_ai8": ["FULLY_CONNECTED", "CONV_2D", "INPUT", "OUTPUT"],
+    "static_wi4_ai16": ["FULLY_CONNECTED", "CONV_2D", "INPUT", "OUTPUT"],
     "dynamic_wi8_afp32": [
       "BATCH_MATMUL",
       "CONV_2D",
```
```diff
@@ -208,6 +265,7 @@ DEFAULT_JSON_POLICY = """
       "FULLY_CONNECTED"
     ],
     "dynamic_wi4_afp32": ["FULLY_CONNECTED", "EMBEDDING_LOOKUP", "CONV_2D"],
+    "dynamic_wi4_afp32_blockwise": ["EMBEDDING_LOOKUP", "FULLY_CONNECTED"],
     "weightonly_wi8_afp32": [
       "BATCH_MATMUL",
       "CONV_2D",
```
```diff
@@ -220,6 +278,11 @@ DEFAULT_JSON_POLICY = """
   }
 }
 """
+QUANTIZABLE_COMPOSITES = [
+    "od" + "ml.npu_call",
+    "od" + "ml.rms_norm",
+    "od" + "ml.l2_norm",
+]
 
 
 def _unroll_json_config(
```
```diff
@@ -251,6 +314,7 @@ def _unroll_json_config(
 
   # Then unroll weight configs and turn them into quantization configs.
   quant_configs = []
+  weight_configs = []
   for symmetric in json_config["weight_tensor_config"]["symmetric"]:
     for granularity in json_config["weight_tensor_config"]["granularity"]:
       tensor_config = {
```
```diff
@@ -259,6 +323,9 @@ def _unroll_json_config(
           "granularity": granularity,
           "dtype": json_config["weight_tensor_config"]["dtype"],
       }
+      weight_configs.append(
+          qtyping.TensorQuantizationConfig.from_dict(tensor_config)
+      )
 
   if activation_configs:
     for activation_config in activation_configs:
```
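The two hunks above collect one `TensorQuantizationConfig` per (symmetric, granularity) pair declared in a policy entry. A hedged plain-dict sketch of that expansion, using the blockwise entry added earlier (dict literals stand in for the real config classes):

```python
import itertools

# Plain-dict mirror of the unrolling: the policy entry expands into
# 1 (symmetric option) x 4 (granularities) = 4 weight configs.
entry = {
    "num_bits": 4,
    "symmetric": [True],
    "granularity": [
        "BLOCKWISE_32", "BLOCKWISE_64", "BLOCKWISE_128", "BLOCKWISE_256"
    ],
    "dtype": "INT",
}
weight_configs = [
    {
        "num_bits": entry["num_bits"],
        "symmetric": symmetric,
        "granularity": granularity,
        "dtype": entry["dtype"],
    }
    for symmetric, granularity in itertools.product(
        entry["symmetric"], entry["granularity"]
    )
]
assert len(weight_configs) == 4
```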
```diff
@@ -273,19 +340,44 @@ def _unroll_json_config(
           )
       )
   else:
-
-
-
-
-
-
-
-
-    )
+    for weight_config in weight_configs:
+      quant_configs.append(
+          qtyping.OpQuantizationConfig(
+              weight_tensor_config=weight_config,
+              compute_precision=json_config["compute_precision"],
+              explicit_dequantize=json_config["explicit_dequantize"],
+          )
+      )
 
   return quant_configs
 
 
+# TODO: b/401024954 - Have a better way to specify recipes based on op options.
+def is_non_quantizable_composite_op(
+    op: Union[schema.Operator, schema.OperatorT],
+) -> bool:
+  """Checks if the operator is a non-quantizable composite op.
+
+  We may want to quantize an op only when it has certain options.
+  Policies/recipes are not aware of op options, so the options are
+  checked here.
+
+  Args:
+    op: The operator to check.
+
+  Returns:
+    True if the operator is conditionally unquantized, False otherwise.
+  """
+  if opts := flatbuffer_utils.get_options_as(
+      op, schema.StableHLOCompositeOptionsT
+  ):
+    name = opts.name.decode("utf-8")
+    if name not in QUANTIZABLE_COMPOSITES:
+      return True
+
+  return False
+
+
 def update_default_config_policy(raw_json_policy: str):
   """Updates the default config check policy."""
   json_policy_content = json.loads(raw_json_policy)
```
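Note that `"od" + "ml.npu_call"` simply evaluates to `"odml.npu_call"`, so the allowlist names three `odml.*` composites. A self-contained sketch of the gate `is_non_quantizable_composite_op` implements, with a stand-in class replacing the real `schema.StableHLOCompositeOptionsT` table (illustrative only):

```python
from typing import Optional

QUANTIZABLE = ["odml.npu_call", "odml.rms_norm", "odml.l2_norm"]

class FakeCompositeOptions:
  """Stand-in for schema.StableHLOCompositeOptionsT (demo only)."""

  def __init__(self, name: bytes):
    self.name = name

def is_non_quantizable(opts: Optional[FakeCompositeOptions]) -> bool:
  if opts is not None:  # op is a composite; gate on its name
    return opts.name.decode("utf-8") not in QUANTIZABLE
  return False  # not a composite op: no extra restriction here

assert not is_non_quantizable(FakeCompositeOptions(b"odml.rms_norm"))
assert is_non_quantizable(FakeCompositeOptions(b"odml.something_else"))
assert not is_non_quantizable(None)
```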
```diff
--- a/ai_edge_quantizer/model_modifier.py
+++ b/ai_edge_quantizer/model_modifier.py
@@ -15,15 +15,23 @@
 
 """Model Modifier class that produces the final quantized TFlite model."""
 
+from collections.abc import Sequence
 import copy
+import logging
 
 import numpy as np
 
+from ai_edge_litert.tools import flatbuffer_utils
 from ai_edge_quantizer import qtyping
 from ai_edge_quantizer import transformation_instruction_generator
 from ai_edge_quantizer import transformation_performer
+from ai_edge_quantizer.utils import tfl_flatbuffer_utils
+from ai_edge_quantizer.utils import tfl_interpreter_utils
+from ai_edge_litert import interpreter as tfl  # pylint: disable=g-direct-tensorflow-import
 from ai_edge_litert import schema_py_generated  # pylint: disable=g-direct-tensorflow-import
-
+
+
+_DEQUANT_SUFFIX = "_dequant"
 
 
 class ModelModifier:
```
```diff
@@ -46,6 +54,35 @@ class ModelModifier:
         transformation_performer.TransformationPerformer()
     )
 
+  def _get_tensor_processing_order(
+      self,
+      tensor_names: Sequence[str],
+      flatbuffer_model: schema_py_generated.ModelT,
+  ) -> list[str]:
+    """Get the tensor processing order obtained from `buffer_to_tensors`.
+
+    The processing order is used to ensure that the last tensor in a
+    buffer is processed last. This is required for the correctness of
+    buffer duplication, as the last tensor in a buffer won't be duplicated.
+
+    Args:
+      tensor_names: Names of the tensors that need to be processed.
+      flatbuffer_model: TFlite model.
+
+    Returns:
+      A list of tensor names in the processing order.
+    """
+    buffer_to_tensors = tfl_flatbuffer_utils.buffer_to_tensors(flatbuffer_model)
+
+    processing_order = []
+    for buffer_tensors in buffer_to_tensors.values():
+      for tensor in buffer_tensors:
+        tensor_name = tfl_flatbuffer_utils.get_tensor_name(tensor)
+        if tensor_name in tensor_names:
+          processing_order.append(tensor_name)
+
+    return processing_order
+
   def modify_model(
       self, params: dict[str, qtyping.TensorTransformationParams]
   ) -> bytearray:
```
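A stand-in sketch of the ordering guarantee `_get_tensor_processing_order` provides: tensors are emitted buffer by buffer, so within each shared buffer the original relative order, and in particular the final tensor, is preserved (plain dicts and strings replace the flatbuffer types):

```python
# Plain-dict stand-in for the flatbuffer walk: emit tensors buffer by
# buffer so the last tensor of each shared buffer comes out last.
buffer_to_tensors = {
    0: ["embedding", "lm_head"],  # two tensors sharing one buffer
    1: ["conv_filter"],
}
to_process = {"lm_head", "conv_filter", "embedding"}

processing_order = [
    name
    for tensors in buffer_to_tensors.values()
    for name in tensors
    if name in to_process
]
# "embedding" is handled before "lm_head": only the final tensor of a
# shared buffer may keep the original (un-duplicated) buffer.
assert processing_order == ["embedding", "lm_head", "conv_filter"]
```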
```diff
@@ -66,15 +103,102 @@ class ModelModifier:
         params, quantized_model
     )
 
+    tensor_processing_order = self._get_tensor_processing_order(
+        list(instructions.keys()), quantized_model
+    )
     self._transformation_performer.transform_graph(
-        instructions, quantized_model
+        instructions, quantized_model, tensor_processing_order
     )
     constant_buffer_size = self._process_constant_map(quantized_model)
-    # we leave
-
-
-
-
+    # we leave 256MB for the model architecture.
+    serialize_fun = (
+        self._serialize_large_model
+        if constant_buffer_size > 2**31 - 2**28
+        else self._serialize_small_model
+    )
+    serialized_quantized_model = serialize_fun(quantized_model)
+
+    # Update signature defs if dequant is inserted before output.
+    if self._has_dequant_before_output(instructions):
+      quantized_model = self._update_signature_defs_for_dequant_output(
+          quantized_model, serialized_quantized_model
+      )
+      serialized_quantized_model = serialize_fun(quantized_model)
+
+    return serialized_quantized_model
+
+  def _update_signature_defs_for_dequant_output(
+      self, model: schema_py_generated.ModelT, serialized_model: bytearray
+  ):
+    """Updates the signature definitions in the model.
+
+    This function is called when a dequantize operation is inserted before
+    an output tensor. It updates the tensor index in the signature
+    definitions to point to the newly inserted dequantize output tensor.
+
+    Args:
+      model: The TFlite ModelT object.
+      serialized_model: The serialized bytearray of the TFlite model.
+
+    Returns:
+      The updated TFlite ModelT object.
+    """
+    interpreter = tfl.Interpreter(model_content=bytes(serialized_model))
+
+    for signature_def in model.signatureDefs:
+      signature_key = signature_def.signatureKey.decode("utf-8")
+      logging.info("Signature = %s", signature_key)
+      subgraph_idx = tfl_interpreter_utils.get_signature_main_subgraph_index(
+          interpreter, signature_key
+      )
+      output_details = interpreter.get_signature_runner(
+          signature_key
+      ).get_output_details()
+      subgraph = model.subgraphs[subgraph_idx]
+      graph_info = qtyping.GraphInfo(subgraph.tensors, model.buffers)
+
+      for output in subgraph.outputs:
+        tensor_name = tfl_flatbuffer_utils.get_tensor_name(
+            graph_info.subgraph_tensors[output]
+        )
+        logging.info("\tOutput tensor = `%s`", tensor_name)
+
+        for signature_name, tensor_details in output_details.items():
+          if tensor_details["name"] + _DEQUANT_SUFFIX == tensor_name:
+            logging.info(
+                "\t\tfound tensor mapping: `%s`->`%s` for signature name: `%s`",
+                tensor_details["name"],
+                tensor_name,
+                signature_name,
+            )
+            for signature_item in signature_def.outputs:
+              if signature_item.name.decode("utf-8") == signature_name:
+                signature_item.tensorIndex = output
+                logging.info(
+                    "\t\t\tswapped tensor index: %s->%s",
+                    tensor_details["index"],
+                    output,
+                )
+                break
+            break
+
+    return model
+
+  def _has_dequant_before_output(
+      self, instructions: dict[str, qtyping.TensorTransformationInsts]
+  ) -> bool:
+    """Checks if the model has a dequant inserted before an output."""
+    for tensor_name, tensor_trans_insts in instructions.items():
+      for instr in tensor_trans_insts.instructions:
+        if (
+            qtyping.QuantTransformation.ADD_DEQUANTIZE == instr.transformation
+            and instr.consumers == [-1]
+        ):
+          logging.info(
+              "Found dequant insert to output for tensor: %s", tensor_name
+          )
+          return True
+    return False
 
   def _process_constant_map(
       self, quantized_model: schema_py_generated.ModelT
```
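The `2**31 - 2**28` cutoff in `modify_model` reads as: flatbuffers cap a serialized model at 2 GiB, and the comment reserves 256 MiB of that budget for the graph structure, so only the remainder is available for constant buffers. The arithmetic, spelled out:

```python
FLATBUFFER_LIMIT = 2**31       # 2 GiB hard cap on a serialized flatbuffer
ARCHITECTURE_HEADROOM = 2**28  # 256 MiB reserved for the graph structure
CUTOFF = FLATBUFFER_LIMIT - ARCHITECTURE_HEADROOM

assert CUTOFF == 1_879_048_192  # ~1.75 GiB of constant buffers
# Above the cutoff the large-model serializer is chosen; otherwise the
# small-model path suffices.
```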
```diff
@@ -108,7 +232,7 @@ class ModelModifier:
     remainder = len(bytearr) % 16
     if remainder != 0:
       padding_size = 16 - remainder
-      bytearr.extend(b
+      bytearr.extend(b"\0" * padding_size)
 
   # TODO: b/333797307 - support > 2GB output model
   def _serialize_large_model(
```
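A standalone sketch of the padding rule in this hunk, assuming the goal is 16-byte alignment of serialized buffers (the hunk itself only shows the zero-fill):

```python
def pad_to_16(bytearr: bytearray) -> None:
  """Zero-pads a bytearray in place to the next multiple of 16 bytes."""
  remainder = len(bytearr) % 16
  if remainder != 0:
    bytearr.extend(b"\0" * (16 - remainder))

buf = bytearray(b"\x01\x02\x03")
pad_to_16(buf)
assert len(buf) == 16 and buf[3:] == b"\0" * 13
pad_to_16(buf)  # already aligned: length is unchanged
assert len(buf) == 16
```

The `test_pad_bytearray` case added below exercises exactly these three situations: short, already aligned, and one byte over.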
```diff
--- a/ai_edge_quantizer/model_modifier_test.py
+++ b/ai_edge_quantizer/model_modifier_test.py
@@ -19,13 +19,13 @@ import os
 import tracemalloc
 from tensorflow.python.platform import googletest
 from absl.testing import parameterized
+from ai_edge_litert.tools import flatbuffer_utils
 from ai_edge_quantizer import model_modifier
 from ai_edge_quantizer import params_generator
 from ai_edge_quantizer import qtyping
 from ai_edge_quantizer import recipe_manager
 from ai_edge_quantizer.utils import test_utils
 from ai_edge_quantizer.utils import tfl_flatbuffer_utils
-from tensorflow.lite.tools import flatbuffer_utils  # pylint: disable=g-direct-tensorflow-import
 
 TEST_DATA_PREFIX_PATH = test_utils.get_path_to_datafile('.')
 
```
```diff
@@ -125,6 +125,86 @@ class ModelModifierTest(parameterized.TestCase):
     loosen_mem_use_factor = 4.5
     self.assertLess(mem_peak / len(self._model_content), loosen_mem_use_factor)
 
+  def test_has_dequant_before_output_true(self):
+    instructions = {
+        'tensor1': qtyping.TensorTransformationInsts(
+            'tensor1',
+            0,
+            instructions=[
+                qtyping.TransformationInst(
+                    transformation=qtyping.QuantTransformation.ADD_DEQUANTIZE,
+                    tensor_id=0,
+                    producer=0,
+                    consumers=[-1],
+                )
+            ],
+        )
+    }
+    self.assertTrue(
+        self._model_modifier._has_dequant_before_output(instructions)
+    )
+
+  def test_has_dequant_before_output_false(self):
+    instructions = {
+        'tensor1': qtyping.TensorTransformationInsts(
+            'tensor1',
+            0,
+            instructions=[
+                qtyping.TransformationInst(
+                    transformation=qtyping.QuantTransformation.ADD_DEQUANTIZE,
+                    tensor_id=0,
+                    producer=0,
+                    consumers=[1],
+                )
+            ],
+        )
+    }
+    self.assertFalse(
+        self._model_modifier._has_dequant_before_output(instructions)
+    )
+
+  def test_pad_bytearray(self):
+    arr = bytearray(b'\x01\x02\x03')
+    self._model_modifier._pad_bytearray(arr)
+    self.assertLen(arr, 16)
+    self.assertEqual(arr, b'\x01\x02\x03' + b'\0' * 13)
+
+    arr = bytearray(b'\x01' * 16)
+    self._model_modifier._pad_bytearray(arr)
+    self.assertLen(arr, 16)
+
+    arr = bytearray(b'\x01' * 17)
+    self._model_modifier._pad_bytearray(arr)
+    self.assertLen(arr, 32)
+
+
+class ModelModifierTestWithSignature(parameterized.TestCase):
+
+  def setUp(self):
+    super().setUp()
+    self._model_path = os.path.join(
+        TEST_DATA_PREFIX_PATH,
+        'tests/models/single_fc.tflite',
+    )
+    self._model_content: bytes = tfl_flatbuffer_utils.get_model_content(
+        self._model_path
+    )
+    self._model_modifier = model_modifier.ModelModifier(self._model_content)
+
+  def test_update_signature_defs_for_dequant_output_succeeds(self):
+    # This is a simplified test that only checks if the function runs without
+    # crashing and returns a model. A more thorough test with a model
+    # with a known signature was added in `quantizer_test`.
+    model_bytearray = flatbuffer_utils.read_model_from_bytearray(
+        self._model_content
+    )
+    updated_model = (
+        self._model_modifier._update_signature_defs_for_dequant_output(
+            model_bytearray, bytearray(self._model_content)
+        )
+    )
+    self.assertIsNotNone(updated_model)
+
 
 if __name__ == '__main__':
   googletest.main()
```
```diff
--- a/ai_edge_quantizer/model_validator.py
+++ b/ai_edge_quantizer/model_validator.py
@@ -25,7 +25,7 @@ from typing import Any, Optional, Union
 import numpy as np
 
 from ai_edge_quantizer.utils import tfl_interpreter_utils as utils
-
+import os  # tensorflow.python.platform.gfile  # pylint: disable=g-direct-tensorflow-import
 
 
 _DEFAULT_SIGNATURE_KEY = utils.DEFAULT_SIGNATURE_KEY
```
```diff
@@ -118,7 +118,8 @@ class ComparisonResult:
     for name in utils.get_input_tensor_names(
         self._reference_model, signature_key
     ):
-
+      if name in result:
+        input_tensor_results[name] = result.pop(name)
 
     output_tensor_results = {}
     for name in utils.get_output_tensor_names(
```
```diff
@@ -136,7 +137,8 @@ class ComparisonResult:
         self._reference_model,
         subgraph_index,
     ):
-
+      if name in result:
+        constant_tensor_results[name] = result.pop(name)
 
     self._comparison_results[signature_key] = SingleSignatureComparisonResult(
         error_metric=error_metric,
```
```diff
@@ -160,6 +162,12 @@ class ComparisonResult:
       result.update(signature_comparison_result.intermediate_tensors)
     return result
 
+  def get_model_size_reduction(self) -> tuple[int, float]:
+    """Get the model size reduction in bytes and percentage."""
+    reduced_model_size = len(self._reference_model) - len(self._target_model)
+    reduction_perc = reduced_model_size / len(self._reference_model) * 100
+    return reduced_model_size, reduction_perc
+
   def save(self, save_folder: str, model_name: str) -> None:
     """Saves the model comparison result.
 
```
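Usage of the new `get_model_size_reduction` helper reduces to two `len` calls; a sketch of the same arithmetic on stand-in byte strings:

```python
reference_model = b"\0" * 1_000_000  # stand-in for the float model bytes
target_model = b"\0" * 250_000       # stand-in for the quantized model bytes

reduced_bytes = len(reference_model) - len(target_model)
reduction_pct = reduced_bytes / len(reference_model) * 100
assert (reduced_bytes, reduction_pct) == (750_000, 75.0)
```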
```diff
@@ -170,8 +178,7 @@ class ComparisonResult:
     Raises:
       RuntimeError: If no quantized model is available.
     """
-    reduced_model_size =
-    reduction_ratio = reduced_model_size / len(self._reference_model) * 100
+    reduced_model_size, reduction_ratio = self.get_model_size_reduction()
     result = {
         'reduced_size_bytes': reduced_model_size,
         'reduced_size_percentage': reduction_ratio,
```
```diff
@@ -187,7 +194,7 @@ class ComparisonResult:
     result_save_path = os.path.join(
         save_folder, model_name + '_comparison_result.json'
     )
-    with
+    with open(result_save_path, 'w') as output_file_handle:
       output_file_handle.write(json.dumps(result))
 
     # TODO: b/365578554 - Remove after ME is updated to use the new json format.
```
```diff
@@ -199,7 +206,7 @@ class ComparisonResult:
     json_save_path = os.path.join(
         save_folder, model_name + '_comparison_result_me_input.json'
     )
-    with
+    with open(json_save_path, 'w') as output_file_handle:
       output_file_handle.write(json_object)
 
 
```
```diff
@@ -209,6 +216,7 @@ def _setup_validation_interpreter(
     signature_key: Optional[str],
     use_xnnpack: bool,
     num_threads: int,
+    preserve_all_tensors: bool = True,
 ) -> tuple[Any, int, dict[str, Any]]:
   """Setup the interpreter for validation given a signature key.
 
```
```diff
@@ -219,13 +227,17 @@ def _setup_validation_interpreter(
       model only has one signature, this can be set to None.
     use_xnnpack: Whether to use xnnpack for the interpreter.
     num_threads: The number of threads to use for the interpreter.
+    preserve_all_tensors: Whether to preserve all tensors.
 
   Returns:
     A tuple of interpreter, subgraph_index and tensor_name_to_details.
   """
 
   interpreter = utils.create_tfl_interpreter(
-      tflite_model=model,
+      tflite_model=model,
+      use_xnnpack=use_xnnpack,
+      num_threads=num_threads,
+      preserve_all_tensors=preserve_all_tensors,
   )
   utils.invoke_interpreter_signature(
       interpreter, signature_input, signature_key
```
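For context, reading intermediate tensors requires the interpreter to keep them alive, which is an interpreter-level switch. A hedged sketch of what `utils.create_tfl_interpreter` presumably forwards to, assuming the LiteRT Python interpreter mirrors `tf.lite.Interpreter`'s constructor flags (`model.tflite` is a placeholder path):

```python
from ai_edge_litert import interpreter as tfl

# Hypothetical direct construction; the diff suggests the wrapper now
# threads preserve_all_tensors through to the interpreter.
interp = tfl.Interpreter(
    model_path="model.tflite",               # placeholder path
    num_threads=16,
    experimental_preserve_all_tensors=True,  # keep intermediates readable
)
interp.allocate_tensors()
```

Skipping preservation when only outputs are validated (the `validate_output_tensors_only` path below) avoids the memory overhead of retaining every intermediate buffer.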
```diff
@@ -250,6 +262,7 @@ def compare_model(
     compare_fn: Callable[[Any, Any], float],
     use_xnnpack: bool = True,
     num_threads: int = 16,
+    validate_output_tensors_only: bool = False,
 ) -> ComparisonResult:
   """Compares model tensors over a model signature using the compare_fn.
 
```
```diff
@@ -270,10 +283,13 @@ def compare_model(
       single float value.
     use_xnnpack: Whether to use xnnpack for the interpreter.
     num_threads: The number of threads to use for the interpreter.
+    validate_output_tensors_only: If True, only compare output tensors.
+      Otherwise, compare all tensors.
 
   Returns:
     A ComparisonResult object.
   """
+  preserve_all_tensors = not validate_output_tensors_only
   model_comparion_result = ComparisonResult(reference_model, target_model)
   for signature_key, signature_inputs in test_data.items():
     comparison_results = {}
```
```diff
@@ -286,6 +302,7 @@ def compare_model(
             signature_key,
             use_xnnpack,
             num_threads,
+            preserve_all_tensors=preserve_all_tensors,
         )
     )
     targ_interpreter, targ_subgraph_index, targ_tensor_name_to_details = (
```
```diff
@@ -295,12 +312,23 @@ def compare_model(
             signature_key,
             use_xnnpack,
             num_threads,
+            preserve_all_tensors=preserve_all_tensors,
         )
     )
-    # Compare the cached tensor
-
+      # Compare the cached tensor value
+      tensor_names_to_compare = (
+          utils.get_output_tensor_names(reference_model, signature_key)
+          if validate_output_tensors_only
+          else list(ref_tensor_name_to_details.keys())
+      )
+
+      for tensor_name in tensor_names_to_compare:
+        detail = ref_tensor_name_to_details[tensor_name]
         if detail['dtype'] == np.object_:
           continue
+        # Ignore tensors where any dimension of the shape is 0.
+        if not np.all(detail['shape']):
+          continue
         if tensor_name in targ_tensor_name_to_details:
           if tensor_name not in comparison_results:
             comparison_results[tensor_name] = []
```
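The shape guard added above relies on NumPy truthiness: `np.all` over a shape array is falsy as soon as any dimension is 0, so zero-sized tensors are skipped during comparison:

```python
import numpy as np

assert np.all(np.array([2, 3, 4]))      # all dims nonzero -> compared
assert not np.all(np.array([2, 0, 4]))  # a zero-sized dim -> skipped
```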
```diff
--- a/ai_edge_quantizer/model_validator_test.py
+++ b/ai_edge_quantizer/model_validator_test.py
@@ -21,6 +21,7 @@ from tensorflow.python.platform import googletest
 from ai_edge_quantizer import model_validator
 from ai_edge_quantizer.utils import test_utils
 from ai_edge_quantizer.utils import tfl_flatbuffer_utils
+from ai_edge_quantizer.utils import tfl_interpreter_utils
 from ai_edge_quantizer.utils import validation_utils
 
 TEST_DATA_PREFIX_PATH = test_utils.get_path_to_datafile('.')
```
```diff
@@ -194,7 +195,7 @@ class ModelValidatorCompareTest(googletest.TestCase):
         self.target_model_path
     )
     self.signature_key = 'serving_default'  # single signature.
-    self.test_data =
+    self.test_data = tfl_interpreter_utils.create_random_normal_input_data(
         self.reference_model_path
     )
     self.test_dir = self.create_tempdir()
```