ai-edge-quantizer-nightly 0.0.1.dev20250302__py3-none-any.whl → 0.5.0.dev20260103__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
- ai_edge_quantizer/algorithm_manager.py +224 -0
- ai_edge_quantizer/algorithm_manager_api_test.py +7 -0
- ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting_test.py +2 -2
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py +643 -20
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py +29 -2
- ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py +29 -35
- ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery_test.py +35 -12
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation.py +414 -0
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation_test.py +440 -0
- ai_edge_quantizer/algorithms/uniform_quantize/mse.py +127 -0
- ai_edge_quantizer/algorithms/uniform_quantize/mse_test.py +195 -0
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py +54 -168
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py +54 -17
- ai_edge_quantizer/algorithms/uniform_quantize/octav.py +188 -0
- ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py +240 -0
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py +260 -13
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py +152 -5
- ai_edge_quantizer/algorithms/utils/common_utils.py +142 -54
- ai_edge_quantizer/calibrator.py +58 -94
- ai_edge_quantizer/calibrator_test.py +5 -74
- ai_edge_quantizer/default_policy.py +108 -16
- ai_edge_quantizer/model_modifier.py +132 -8
- ai_edge_quantizer/model_modifier_test.py +81 -1
- ai_edge_quantizer/model_validator.py +38 -10
- ai_edge_quantizer/model_validator_test.py +2 -1
- ai_edge_quantizer/params_generator.py +230 -47
- ai_edge_quantizer/params_generator_test.py +366 -261
- ai_edge_quantizer/qtyping.py +92 -6
- ai_edge_quantizer/quantizer.py +167 -23
- ai_edge_quantizer/quantizer_test.py +288 -26
- ai_edge_quantizer/recipe.py +156 -21
- ai_edge_quantizer/recipe_manager.py +158 -1
- ai_edge_quantizer/recipe_manager_test.py +146 -32
- ai_edge_quantizer/recipe_test.py +93 -17
- ai_edge_quantizer/transformation_instruction_generator.py +313 -46
- ai_edge_quantizer/transformation_instruction_generator_test.py +449 -27
- ai_edge_quantizer/transformation_performer.py +112 -58
- ai_edge_quantizer/transformation_performer_test.py +176 -4
- ai_edge_quantizer/transformations/duplicate_buffer.py +46 -0
- ai_edge_quantizer/transformations/duplicate_buffer_test.py +106 -0
- ai_edge_quantizer/transformations/duplicate_tensor.py +62 -0
- ai_edge_quantizer/transformations/duplicate_tensor_test.py +131 -0
- ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation.py +299 -0
- ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation_test.py +244 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation.py +186 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation_test.py +200 -0
- ai_edge_quantizer/transformations/quantize_tensor.py +24 -44
- ai_edge_quantizer/transformations/quantize_tensor_test.py +3 -2
- ai_edge_quantizer/transformations/transformation_utils.py +157 -11
- ai_edge_quantizer/transformations/transformation_utils_test.py +96 -2
- ai_edge_quantizer/utils/calibration_utils.py +263 -1
- ai_edge_quantizer/utils/calibration_utils_test.py +173 -3
- ai_edge_quantizer/utils/constrained_ops_utils.py +111 -0
- ai_edge_quantizer/utils/constrained_ops_utils_test.py +50 -0
- ai_edge_quantizer/utils/test_utils.py +191 -58
- ai_edge_quantizer/utils/tfl_flatbuffer_utils.py +96 -50
- ai_edge_quantizer/utils/tfl_flatbuffer_utils_test.py +20 -0
- ai_edge_quantizer/utils/tfl_interpreter_utils.py +138 -5
- ai_edge_quantizer/utils/tfl_interpreter_utils_test.py +29 -2
- ai_edge_quantizer/utils/validation_utils.py +114 -4
- ai_edge_quantizer/utils/validation_utils_test.py +80 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/METADATA +13 -3
- ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info/RECORD +81 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/WHEEL +1 -1
- ai_edge_quantizer/transformations/emulated_subchannel.py +0 -363
- ai_edge_quantizer/transformations/emulated_subchannel_test.py +0 -212
- ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info/RECORD +0 -67
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info/licenses}/LICENSE +0 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/top_level.txt +0 -0
ai_edge_quantizer/quantizer_test.py
CHANGED

@@ -51,6 +51,30 @@ def _get_calibration_data(num_samples: int = 16):
   return calibration_data


+def _is_all_signature_defs_inputs_float(model_content: bytes):
+  tfl_interpreter = tfl_interpreter_utils.create_tfl_interpreter(model_content)
+  for signature_key in tfl_interpreter.get_signature_list():
+    input_details = tfl_interpreter.get_signature_runner(
+        signature_key
+    ).get_input_details()
+    for tensor_details in input_details.values():
+      if tensor_details['dtype'] != np.float32:
+        return False
+  return True
+
+
+def _is_all_signature_defs_outputs_float(model_content: bytes):
+  tfl_interpreter = tfl_interpreter_utils.create_tfl_interpreter(model_content)
+  for signature_key in tfl_interpreter.get_signature_list():
+    output_details = tfl_interpreter.get_signature_runner(
+        signature_key
+    ).get_output_details()
+    for tensor_details in output_details.values():
+      if tensor_details['dtype'] != np.float32:
+        return False
+  return True
+
+
 class QuantizerTest(parameterized.TestCase):

   def setUp(self):
@@ -92,6 +116,76 @@ class QuantizerTest(parameterized.TestCase):
         new_op_config.compute_precision,
     )

+  def test_add_dynamic_config_succeeds(self):
+    self._quantizer.load_quantization_recipe(self._test_recipe_path)
+    scope_regex = '.*/Dense/.*'
+    self._quantizer.add_dynamic_config(
+        regex=scope_regex,
+        operation_name=qtyping.TFLOperationName.FULLY_CONNECTED,
+        num_bits=8,
+    )
+    updated_recipe = self._quantizer.get_quantization_recipe()
+    self.assertLen(updated_recipe, 2)
+
+    added_config = updated_recipe[-1]
+    self.assertEqual(added_config['regex'], scope_regex)
+    self.assertEqual(
+        added_config['op_config']['compute_precision'],
+        qtyping.ComputePrecision.INTEGER,
+    )
+    self.assertFalse(added_config['op_config']['explicit_dequantize'])
+    self.assertEqual(
+        added_config['op_config']['weight_tensor_config']['num_bits'], 8
+    )
+
+  def test_add_weight_only_config_succeeds(self):
+    self._quantizer.load_quantization_recipe(self._test_recipe_path)
+    scope_regex = '.*/Dense/.*'
+    self._quantizer.add_weight_only_config(
+        regex=scope_regex,
+        operation_name=qtyping.TFLOperationName.FULLY_CONNECTED,
+        num_bits=4,
+    )
+    updated_recipe = self._quantizer.get_quantization_recipe()
+    self.assertLen(updated_recipe, 2)
+
+    added_config = updated_recipe[-1]
+    self.assertEqual(added_config['regex'], scope_regex)
+    self.assertEqual(
+        added_config['op_config']['compute_precision'],
+        qtyping.ComputePrecision.FLOAT,
+    )
+    self.assertTrue(added_config['op_config']['explicit_dequantize'])
+    self.assertEqual(
+        added_config['op_config']['weight_tensor_config']['num_bits'], 4
+    )
+
+  def test_add_static_config_succeeds(self):
+    self._quantizer.load_quantization_recipe(self._test_recipe_path)
+    scope_regex = '.*/Dense/.*'
+    self._quantizer.add_static_config(
+        regex=scope_regex,
+        operation_name=qtyping.TFLOperationName.FULLY_CONNECTED,
+        activation_num_bits=8,
+        weight_num_bits=4,
+    )
+    updated_recipe = self._quantizer.get_quantization_recipe()
+    self.assertLen(updated_recipe, 2)
+
+    added_config = updated_recipe[-1]
+    self.assertEqual(added_config['regex'], scope_regex)
+    self.assertEqual(
+        added_config['op_config']['compute_precision'],
+        qtyping.ComputePrecision.INTEGER,
+    )
+    self.assertFalse(added_config['op_config']['explicit_dequantize'])
+    self.assertEqual(
+        added_config['op_config']['activation_tensor_config']['num_bits'], 8
+    )
+    self.assertEqual(
+        added_config['op_config']['weight_tensor_config']['num_bits'], 4
+    )
+
   def test_load_quantization_recipe_succeeds(self):
     qt = quantizer.Quantizer(self._test_model_path, None)
     qt.load_quantization_recipe(self._test_recipe_path)
@@ -118,7 +212,7 @@ class QuantizerTest(parameterized.TestCase):
     # Calibrate with empty state.
     calib_data = _get_calibration_data()
     calibration_result = self._quantizer.calibrate(calib_data)
-    self.assertLen(calibration_result,
+    self.assertLen(calibration_result, 7)

   @parameterized.parameters(
       'recipes/default_a8w8_recipe.json',
@@ -133,7 +227,7 @@ class QuantizerTest(parameterized.TestCase):
     updated_calibration_result = self._quantizer.calibrate(
         calib_data, previous_calibration_result=calibration_result
     )
-    self.assertLen(updated_calibration_result,
+    self.assertLen(updated_calibration_result, 7)
     self.assertNotEqual(
         calibration_result['StatefulPartitionedCall:0'],
         updated_calibration_result['StatefulPartitionedCall:0'],
@@ -215,6 +309,44 @@ class QuantizerTest(parameterized.TestCase):
       saved_recipe = json.load(json_file)
     self.assertEqual(saved_recipe, self._test_recipe)

+  def test_saved_legacy_recipe_lacks_block_size(self):
+    model_name = 'test_model'
+    legacy_recipe_path = os.path.join(
+        TEST_DATA_PREFIX_PATH,
+        'recipes/dynamic_legacy_wi8_afp32_recipe.json',
+    )
+    self._quantizer.load_quantization_recipe(legacy_recipe_path)
+    result = self._quantizer.quantize()
+    result.save(self._tmp_save_path, model_name)
+    saved_recipe_path = os.path.join(
+        self._tmp_save_path, model_name + '_recipe.json'
+    )
+    with open(saved_recipe_path) as json_file:
+      saved_recipe = json.load(json_file)
+    with open(legacy_recipe_path) as json_file:
+      legacy_recipe = json.load(json_file)
+
+    self.assertNotEqual(saved_recipe, legacy_recipe)
+
+    # Verify that the default test recipe contains 'block_size'.
+    has_block_size = False
+    for config in legacy_recipe:
+      op_config = config.get('op_config')
+      if op_config:
+        weight_config = op_config.get('weight_tensor_config')
+        if weight_config and 'block_size' in weight_config:
+          has_block_size = True
+          break
+    self.assertTrue(has_block_size)
+
+    # Verify that the saved recipe does not have 'block_size'.
+    for config in saved_recipe:
+      op_config = config.get('op_config')
+      if op_config:
+        weight_config = op_config.get('weight_tensor_config')
+        if weight_config:
+          self.assertNotIn('block_size', weight_config)
+
   def test_save_no_quantize_raise_error(self):
     error_message = 'No quantized model to save.'
     with self.assertRaisesWithPredicateMatch(
@@ -243,6 +375,34 @@ class QuantizerTest(parameterized.TestCase):
         'sequential/dense_1/MatMul', validation_result.intermediate_tensors
     )

+  def test_validate_output_tensors_only_succeeds(self):
+    self._quantizer.quantize()
+    validation_result = self._quantizer.validate(
+        validate_output_tensors_only=True
+    )
+    validation_result = validation_result.get_signature_comparison_result()
+    self.assertIsNotNone(validation_result)
+    self.assertEmpty(validation_result.input_tensors)
+    self.assertEmpty(validation_result.constant_tensors)
+    self.assertEmpty(validation_result.intermediate_tensors)
+    self.assertNotEmpty(validation_result.output_tensors)
+    self.assertIn('StatefulPartitionedCall:0', validation_result.output_tensors)
+
+  def test_validate_with_quantized_model_arg_succeeds(self):
+    self._quantizer.quantize()
+    quantized_model = self._quantizer._result.quantized_model
+    self.assertIsNotNone(quantized_model)
+
+    new_quantizer = quantizer.Quantizer(
+        self._test_model_path, previous_quantized_model=quantized_model
+    )
+    validation_result = new_quantizer.validate()
+    validation_result = validation_result.get_signature_comparison_result()
+    self.assertIsNotNone(validation_result)
+    self.assertIn(
+        'sequential/dense_1/MatMul', validation_result.intermediate_tensors
+    )
+
   def test_load_custom_policies_succeeds(self):

     test_op_config = qtyping.OpQuantizationConfig(
@@ -284,6 +444,33 @@ class QuantizerTest(parameterized.TestCase):
         op_config=test_op_config,
     )

+  def test_two_pass_quantization_with_conv_and_fc_succeeds(self):
+    float_model_path = self._test_model_path
+
+    drq_recipe_path = os.path.join(
+        TEST_DATA_PREFIX_PATH, 'recipes/dynamic_wi8_afp32_hadamard_recipe.json'
+    )
+    drq_quantizer = quantizer.Quantizer(float_model_path)
+    drq_quantizer.load_quantization_recipe(drq_recipe_path)
+    drq_result = drq_quantizer.quantize()
+    drq_model_path = os.path.join(self._tmp_save_path, 'drq_model.tflite')
+    drq_result.export_model(drq_model_path)
+
+    srq_recipe_path = os.path.join(
+        TEST_DATA_PREFIX_PATH, 'recipes/default_a8w8_recipe.json'
+    )
+    srq_quantizer = quantizer.Quantizer(drq_model_path)
+    srq_quantizer.load_quantization_recipe(srq_recipe_path)
+    representative_dataset = (
+        tfl_interpreter_utils.create_random_normal_input_data(
+            drq_model_path, num_samples=1
+        )
+    )
+    calibration_result = srq_quantizer.calibrate(representative_dataset)
+    srq_result = srq_quantizer.quantize(calibration_result)
+    srq_model_path = os.path.join(self._tmp_save_path, 'srq_model.tflite')
+    srq_result.export_model(srq_model_path)
+

 class QuantizerBytearrayInputs(googletest.TestCase):

@@ -412,7 +599,9 @@ class QuantizerMultiSignatureModelTest(parameterized.TestCase):
     available_signatures = validation_result.available_signature_keys()
     self.assertLen(available_signatures, 2)

-  def
+  def test_constant_buffer_shared_by_tensors_with_different_quantization_params_succeeds(
+      self,
+  ):
     recipe = [
         dict({
             'regex': '.*',
@@ -424,14 +613,12 @@ class QuantizerMultiSignatureModelTest(parameterized.TestCase):
                 'symmetric': False,
                 'granularity': 'TENSORWISE',
                 'dtype': 'INT',
-                'block_size': 0,
             },
             'weight_tensor_config': {
                 'num_bits': 8,
                 'symmetric': True,
                 'granularity': 'CHANNELWISE',
                 'dtype': 'INT',
-                'block_size': 0,
             },
             'compute_precision': 'INTEGER',
             'explicit_dequantize': False,
@@ -439,17 +626,9 @@ class QuantizerMultiSignatureModelTest(parameterized.TestCase):
             },
         })
     ]
-
     qt = quantizer.Quantizer(self._test_model_path, recipe)
     calib_result = qt.calibrate(_MULTI_SIGNATURE_CALIBRATION_DATASET)
-
-    error_message = (
-        "The tensors b'Add/y' and b'Mul/y' do not have the same quantization"
-    )
-    with self.assertRaisesWithPredicateMatch(
-        RuntimeError, lambda err: error_message in str(err)
-    ):
-      qt.quantize(calib_result)
+    self.assertIsNotNone(qt.quantize(calib_result).quantized_model)

   def test_quantization_with_insufficient_calibration(self):
     # Run calibration for one signature only.
@@ -460,8 +639,7 @@ class QuantizerMultiSignatureModelTest(parameterized.TestCase):

     # Quantize and expect an error about missing signature in calibration data.
     error_message = (
-        '
-        " 'multiply'."
+        'MUL(index: 0) not found in tensor_name_to_qsv'
     )
     with self.assertRaisesWithPredicateMatch(
         ValueError, lambda err: error_message in str(err)
@@ -483,21 +661,21 @@ class QuantizerToyGemma2Test(parameterized.TestCase):
         'signature_1': [{
             'cache_0': _RNG.random(size=(1, 100, 4, 4), dtype=np.float32),
             'cache_1': _RNG.random(size=(1, 100, 4, 4), dtype=np.float32),
-            'positions':
-                np.int32
+            'positions': (
+                _RNG.integers(low=0, high=10, size=(1, 100)).astype(np.int32)
             ),
-            'tokens':
-                np.int32
+            'tokens': (
+                _RNG.integers(low=0, high=10, size=(1, 100)).astype(np.int32)
             ),
         }],
         'signature_2': [{
             'cache_0': _RNG.random(size=(1, 100, 4, 4), dtype=np.float32),
             'cache_1': _RNG.random(size=(1, 100, 4, 4), dtype=np.float32),
-            'positions':
-                np.int32
+            'positions': (
+                _RNG.integers(low=0, high=10, size=(1, 100)).astype(np.int32)
             ),
-            'tokens':
-                np.int32
+            'tokens': (
+                _RNG.integers(low=0, high=10, size=(1, 100)).astype(np.int32)
             ),
         }],
     }
@@ -514,8 +692,8 @@ class QuantizerToyGemma2Test(parameterized.TestCase):
     )

     self._quantizer.update_quantization_recipe(
-        regex='
-        operation_name=qtyping.TFLOperationName.
+        regex='.*',
+        operation_name=qtyping.TFLOperationName.OUTPUT,
         algorithm_key=_AlgorithmName.NO_QUANTIZE,
     )

@@ -527,6 +705,90 @@ class QuantizerToyGemma2Test(parameterized.TestCase):
     self._quantizer.quantize(calib_result)
     self.assertIsNotNone(self._quantizer._result.quantized_model)

+  def test_toy_gemma2_update_signature_defs_succeeds(self):
+
+    self.assertTrue(
+        _is_all_signature_defs_outputs_float(
+            open(self._test_model_path, 'rb').read()
+        )
+    )
+    calib_result = self._quantizer.calibrate(
+        self._toy_gemma2_calibration_dataset
+    )
+    self.assertIsNotNone(calib_result)
+    self._quantizer.quantize(calib_result)
+    self.assertIsNotNone(self._quantizer._result.quantized_model)
+    self.assertTrue(
+        _is_all_signature_defs_outputs_float(
+            self._quantizer._result.quantized_model
+        )
+    )
+
+
+class QuantizerFullyConnectedTest(parameterized.TestCase):
+
+  def setUp(self):
+    super().setUp()
+    self._tmp_save_path = self.create_tempdir().full_path
+    self._test_model_path = os.path.join(
+        TEST_DATA_PREFIX_PATH,
+        'tests/models/single_fc.tflite',
+    )
+
+    self._test_recipe_path = os.path.join(
+        TEST_DATA_PREFIX_PATH,
+        'recipes/default_a8w8_recipe.json',
+    )
+    with open(self._test_recipe_path) as json_file:
+      self._test_recipe = json.load(json_file)
+
+    self._quantizer = quantizer.Quantizer(
+        self._test_model_path, self._test_recipe_path
+    )
+
+    self._quantizer.update_quantization_recipe(
+        regex='.*',
+        operation_name=qtyping.TFLOperationName.INPUT,
+        algorithm_key=_AlgorithmName.NO_QUANTIZE,
+    )
+    self._quantizer.update_quantization_recipe(
+        regex='.*',
+        operation_name=qtyping.TFLOperationName.OUTPUT,
+        algorithm_key=_AlgorithmName.NO_QUANTIZE,
+    )
+
+  def test_fully_connected_quantization_succeeds(self):
+    calib_result = self._quantizer.calibrate(
+        tfl_interpreter_utils.create_random_normal_input_data(
+            self._test_model_path, num_samples=4
+        )
+    )
+    self.assertIsNotNone(calib_result)
+    self._quantizer.quantize(calib_result)
+    self.assertIsNotNone(self._quantizer._result.quantized_model)
+
+  def test_fully_connected_quantization_update_signature_defs_succeeds(self):
+
+    model_content = open(self._test_model_path, 'rb').read()
+    self.assertTrue(_is_all_signature_defs_inputs_float(model_content))
+    self.assertTrue(_is_all_signature_defs_outputs_float(model_content))
+
+    calib_result = self._quantizer.calibrate(
+        tfl_interpreter_utils.create_random_normal_input_data(
+            self._test_model_path, num_samples=4
+        )
+    )
+    self.assertIsNotNone(calib_result)
+    quant_result = self._quantizer.quantize(calib_result)
+    self.assertIsNotNone(quant_result.quantized_model)
+
+    self.assertTrue(
+        _is_all_signature_defs_inputs_float(quant_result.quantized_model)
+    )
+    self.assertTrue(
+        _is_all_signature_defs_outputs_float(quant_result.quantized_model)
+    )
+

 if __name__ == '__main__':
   googletest.main()
ai_edge_quantizer/recipe.py
CHANGED
@@ -15,28 +15,163 @@

 """Quantization recipe module."""

+from ai_edge_quantizer import algorithm_manager
+from ai_edge_quantizer import qtyping
+from ai_edge_quantizer import recipe_manager

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+AlgorithmName = algorithm_manager.AlgorithmName
+
+
+def dynamic_wi8_afp32(
+    algorithm_key: AlgorithmName = AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+):
+  """Returns a dynamic quantization recipe with int8 weights and float32 activation.
+
+  All supported ops will be quantized with int8 weights and float32 activations,
+  which will be dynamically quantized to int8 during inference to enable int8
+  compute. The model quality may suffer due to the on-the-fly quantization. If
+  quality is a concern, consider using weight-only quantization.
+
+  Args:
+    algorithm_key: The algorithm to use for quantization.
+
+  Returns:
+    A dynamic quantization recipe.
+  """
+  rp_manager = recipe_manager.RecipeManager()
+  rp_manager.add_dynamic_config(
+      regex='.*',
+      operation_name=qtyping.TFLOperationName.ALL_SUPPORTED,
+      num_bits=8,
+      algorithm_key=algorithm_key,
+  )
+  return rp_manager.get_quantization_recipe()
+
+
+def dynamic_wi4_afp32(
+    algorithm_key: AlgorithmName = AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+):
+  """Returns a dynamic quantization recipe with int4 weights and float32 activation.
+
+  All supported ops will be quantized with int4 weights and float32 activations,
+  which will be dynamically quantized to int4 during inference to enable int4
+  compute.
+
+  Args:
+    algorithm_key: The algorithm to use for quantization.
+
+  Returns:
+    A dynamic quantization recipe.
+  """
+  rp_manager = recipe_manager.RecipeManager()
+  rp_manager.add_dynamic_config(
+      regex='.*',
+      operation_name=qtyping.TFLOperationName.ALL_SUPPORTED,
+      num_bits=4,
+      algorithm_key=algorithm_key,
+  )
+  return rp_manager.get_quantization_recipe()
+
+
+def weight_only_wi8_afp32(
+    algorithm_key: AlgorithmName = AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+):
+  """Returns a weight-only quantization recipe with int8 weights and float32 activation.
+
+  All supported ops will be quantized with int8 weights and float32 activations.
+  The weights will be explicitly dequantized before being fed into the op to
+  enable float compute thus retain model quality. If latency is a concern,
+  consider using dynamic range quantization.
+
+  Args:
+    algorithm_key: The algorithm to use for quantization.
+
+  Returns:
+    A weight-only quantization recipe.
+  """
+  rp_manager = recipe_manager.RecipeManager()
+  rp_manager.add_weight_only_config(
+      regex='.*',
+      operation_name=qtyping.TFLOperationName.ALL_SUPPORTED,
+      num_bits=8,
+      algorithm_key=algorithm_key,
+  )
+  return rp_manager.get_quantization_recipe()
+
+
+def weight_only_wi4_afp32(
+    algorithm_key: AlgorithmName = AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+):
+  """Returns a weight-only quantization recipe with int4 weights and float32 activation.
+
+  All supported ops will be quantized with int4 weights and float32 activations.
+  The weights will be explicitly dequantized before being fed into the op to
+  enable float compute thus retain model quality.
+
+  Args:
+    algorithm_key: The algorithm to use for quantization.
+
+  Returns:
+    A weight-only quantization recipe.
+  """
+  rp_manager = recipe_manager.RecipeManager()
+  rp_manager.add_weight_only_config(
+      regex='.*',
+      operation_name=qtyping.TFLOperationName.ALL_SUPPORTED,
+      num_bits=4,
+      algorithm_key=algorithm_key,
+  )
+  return rp_manager.get_quantization_recipe()
+
+
+def static_wi8_ai8(
+    algorithm_key: AlgorithmName = AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+):
+  """Returns a static quantization recipe with int8 weights and int8 activations.
+
+  All supported ops will be quantized with int8 weights and int8 activations.
+  Calibration is needed to use this recipe.
+
+  Args:
+    algorithm_key: The algorithm to use for quantization.
+
+  Returns:
+    A static quantization recipe.
+  """
+  rp_manager = recipe_manager.RecipeManager()
+  rp_manager.add_static_config(
+      regex='.*',
+      operation_name=qtyping.TFLOperationName.ALL_SUPPORTED,
+      activation_num_bits=8,
+      weight_num_bits=8,
+      algorithm_key=algorithm_key,
+  )
+  return rp_manager.get_quantization_recipe()
+
+
+def static_wi8_ai16(
+    algorithm_key: AlgorithmName = AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+):
+  """Returns a static quantization recipe with int8 weights and int16 activations.
+
+  All supported ops will be quantized with int8 weights and int16 activations.
+  Calibration is needed to use this recipe.
+
+  Args:
+    algorithm_key: The algorithm to use for quantization.
+
+  Returns:
+    A static quantization recipe.
+  """
+  rp_manager = recipe_manager.RecipeManager()
+  rp_manager.add_static_config(
+      regex='.*',
+      operation_name=qtyping.TFLOperationName.ALL_SUPPORTED,
+      activation_num_bits=16,
+      weight_num_bits=8,
+      algorithm_key=algorithm_key,
+  )
+  return rp_manager.get_quantization_recipe()


 def dynamic_legacy_wi8_afp32():
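The helpers above build their recipes through recipe_manager.RecipeManager and return the result of get_quantization_recipe(), which is the same list-of-dicts form that the tests pass directly to Quantizer. A minimal sketch of how they might be combined follows; the model path is a placeholder and the import paths are inferred from the package layout, so this is an illustration under those assumptions rather than documented usage.

    # Sketch only: 'model.tflite' is a placeholder, not a file from the package.
    from ai_edge_quantizer import quantizer
    from ai_edge_quantizer import recipe
    from ai_edge_quantizer.utils import tfl_interpreter_utils

    # Dynamic-range preset: int8 weights, float32 activations, no calibration.
    qt = quantizer.Quantizer('model.tflite', recipe.dynamic_wi8_afp32())
    qt.quantize().export_model('model_drq.tflite')

    # Static preset: int8 weights and activations, calibration required.
    qt = quantizer.Quantizer('model.tflite', recipe.static_wi8_ai8())
    calib = qt.calibrate(
        tfl_interpreter_utils.create_random_normal_input_data(
            'model.tflite', num_samples=16
        )
    )
    qt.quantize(calib).export_model('model_srq.tflite')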