ai-edge-quantizer-nightly 0.0.1.dev20250302__py3-none-any.whl → 0.5.0.dev20260103__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_edge_quantizer/algorithm_manager.py +224 -0
- ai_edge_quantizer/algorithm_manager_api_test.py +7 -0
- ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting_test.py +2 -2
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py +643 -20
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py +29 -2
- ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py +29 -35
- ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery_test.py +35 -12
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation.py +414 -0
- ai_edge_quantizer/algorithms/uniform_quantize/hadamard_rotation_test.py +440 -0
- ai_edge_quantizer/algorithms/uniform_quantize/mse.py +127 -0
- ai_edge_quantizer/algorithms/uniform_quantize/mse_test.py +195 -0
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py +54 -168
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py +54 -17
- ai_edge_quantizer/algorithms/uniform_quantize/octav.py +188 -0
- ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py +240 -0
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py +260 -13
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py +152 -5
- ai_edge_quantizer/algorithms/utils/common_utils.py +142 -54
- ai_edge_quantizer/calibrator.py +58 -94
- ai_edge_quantizer/calibrator_test.py +5 -74
- ai_edge_quantizer/default_policy.py +108 -16
- ai_edge_quantizer/model_modifier.py +132 -8
- ai_edge_quantizer/model_modifier_test.py +81 -1
- ai_edge_quantizer/model_validator.py +38 -10
- ai_edge_quantizer/model_validator_test.py +2 -1
- ai_edge_quantizer/params_generator.py +230 -47
- ai_edge_quantizer/params_generator_test.py +366 -261
- ai_edge_quantizer/qtyping.py +92 -6
- ai_edge_quantizer/quantizer.py +167 -23
- ai_edge_quantizer/quantizer_test.py +288 -26
- ai_edge_quantizer/recipe.py +156 -21
- ai_edge_quantizer/recipe_manager.py +158 -1
- ai_edge_quantizer/recipe_manager_test.py +146 -32
- ai_edge_quantizer/recipe_test.py +93 -17
- ai_edge_quantizer/transformation_instruction_generator.py +313 -46
- ai_edge_quantizer/transformation_instruction_generator_test.py +449 -27
- ai_edge_quantizer/transformation_performer.py +112 -58
- ai_edge_quantizer/transformation_performer_test.py +176 -4
- ai_edge_quantizer/transformations/duplicate_buffer.py +46 -0
- ai_edge_quantizer/transformations/duplicate_buffer_test.py +106 -0
- ai_edge_quantizer/transformations/duplicate_tensor.py +62 -0
- ai_edge_quantizer/transformations/duplicate_tensor_test.py +131 -0
- ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation.py +299 -0
- ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation_test.py +244 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation.py +186 -0
- ai_edge_quantizer/transformations/insert_hadamard_rotation_test.py +200 -0
- ai_edge_quantizer/transformations/quantize_tensor.py +24 -44
- ai_edge_quantizer/transformations/quantize_tensor_test.py +3 -2
- ai_edge_quantizer/transformations/transformation_utils.py +157 -11
- ai_edge_quantizer/transformations/transformation_utils_test.py +96 -2
- ai_edge_quantizer/utils/calibration_utils.py +263 -1
- ai_edge_quantizer/utils/calibration_utils_test.py +173 -3
- ai_edge_quantizer/utils/constrained_ops_utils.py +111 -0
- ai_edge_quantizer/utils/constrained_ops_utils_test.py +50 -0
- ai_edge_quantizer/utils/test_utils.py +191 -58
- ai_edge_quantizer/utils/tfl_flatbuffer_utils.py +96 -50
- ai_edge_quantizer/utils/tfl_flatbuffer_utils_test.py +20 -0
- ai_edge_quantizer/utils/tfl_interpreter_utils.py +138 -5
- ai_edge_quantizer/utils/tfl_interpreter_utils_test.py +29 -2
- ai_edge_quantizer/utils/validation_utils.py +114 -4
- ai_edge_quantizer/utils/validation_utils_test.py +80 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/METADATA +13 -3
- ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info/RECORD +81 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/WHEEL +1 -1
- ai_edge_quantizer/transformations/emulated_subchannel.py +0 -363
- ai_edge_quantizer/transformations/emulated_subchannel_test.py +0 -212
- ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info/RECORD +0 -67
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info/licenses}/LICENSE +0 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250302.dist-info → ai_edge_quantizer_nightly-0.5.0.dev20260103.dist-info}/top_level.txt +0 -0
ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py:

```diff
@@ -16,8 +16,11 @@
 """Uniform quantize in tensor level."""
 
 import dataclasses
+from typing import Optional, Sequence
+import ml_dtypes
 import numpy as np
 from ai_edge_quantizer import qtyping
+from ai_edge_quantizer.utils import tfl_flatbuffer_utils
 
 
 @dataclasses.dataclass(frozen=True)
@@ -26,6 +29,11 @@ class IntType:
   signed: bool
 
 
+def is_blockwise(granularity: qtyping.QuantGranularity) -> bool:
+  """Checks if the quantization granularity is blockwise."""
+  return "BLOCKWISE" in str(granularity)
+
+
 def get_quantized_range(qtype: IntType) -> tuple[float, float]:
   """Calculates range of the quantized type."""
   if qtype.signed:
@@ -37,6 +45,22 @@ def get_quantized_range(qtype: IntType) -> tuple[float, float]:
   return float(qmin), float(qmax)
 
 
+def extract_block_size_from_granularity(
+    granularity: qtyping.QuantGranularity,
+) -> int:
+  """Get the block size for blockwise quantization."""
+  if granularity == qtyping.QuantGranularity.BLOCKWISE_32:
+    return 32
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE_64:
+    return 64
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE_128:
+    return 128
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE_256:
+    return 256
+  else:
+    return 0
+
+
 def _round_and_clip(
     tensor: np.ndarray, qtype: IntType, narrow: bool
 ) -> np.ndarray:
@@ -116,22 +140,141 @@ def fix_quantization_params_rank(
       symmetric=quantization_params.symmetric,
       quantized_dimension=quantization_params.quantized_dimension,
       quantized_data=quantization_params.quantized_data,
+      block_size=quantization_params.block_size,
+  )
+
+
+def _get_tensor_shape_for_blockwise(
+    tensor_shape: Sequence[int], quantized_dim: int, block_size: int
+) -> list[int]:
+  """Get the tensor shape for blockwise quantization.
+
+  This function splits the quantize dimension of the tensor into blocks and the
+  dim/blocks. Hence, min/max of the tensor can be calculated for each block
+  using existing functions.
+
+  Args:
+    tensor_shape: The original shape of the tensor.
+    quantized_dim: The dimension to be quantized blockwise.
+    block_size: The size of the block.
+
+  Returns:
+    The new tensor shape for calculating scale and zp for blockwise
+    quantization.
+  """
+  new_shape = []
+  for index, val in enumerate(tensor_shape):
+    if index == quantized_dim:
+      if val % block_size != 0:
+        raise ValueError(
+            f"Quantized dimension {val} in tensor shape {tensor_shape} is not"
+            f" divisible by block size {block_size}."
+        )
+      new_shape.append(int(val / block_size))
+      new_shape.append(block_size)
+    else:
+      new_shape.append(val)
+  return new_shape
+
+
+def reshape_data_for_blockwise(
+    tensor_data: np.ndarray,
+    op_name: qtyping.TFLOperationName,
+    granularity: qtyping.QuantGranularity,
+) -> tuple[np.ndarray, int]:
+  """Reshapes data for blockwise quantization.
+
+  Args:
+    tensor_data: The original tensor data.
+    op_name: The name of the TFL op.
+    granularity: The quantization granularity for the tensor.
+
+  Returns:
+    A tuple containing the reshaped tensor data and the new reduce dimension.
+  """
+  quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_BLOCKWISE_WEIGHT_QUANTIZED_DIM[
+      op_name
+  ]
+  block_size = extract_block_size_from_granularity(granularity)
+  new_shape = _get_tensor_shape_for_blockwise(
+      tensor_data.shape, quantized_dim, block_size
+  )
+  return tensor_data.reshape(new_shape), quantized_dim + 1
+
+
+def _broadcast_scale_zp_for_blockwise(
+    tensor_content: np.ndarray,
+    quant_params: qtyping.UniformQuantParams,
+) -> qtyping.UniformQuantParams:
+  """Broadcasts scale and zp for blockwise quantization.
+
+  Args:
+    tensor_content: The original tensor data.
+    quant_params: The quantization parameters.
+      `quant_params.quantized_dimension` must be specified.
+      `quant_params.block_size` must be specified and positive.
+
+  Returns:
+    The updated quantization parameters with broadcasted scale and zp for
+    correct constant quantization.
+  """
+  if quant_params.quantized_dimension is None:
+    raise ValueError("Quantized dimension must be specified.")
+  if quant_params.block_size is None or quant_params.block_size <= 0:
+    raise ValueError("Block size must be specified and positive.")
+  quantized_dim = quant_params.quantized_dimension
+  expanded_tensor_shape = _get_tensor_shape_for_blockwise(
+      tensor_content.shape, quantized_dim, quant_params.block_size
+  )
+  expanded_scale = np.reshape(
+      np.broadcast_to(
+          np.expand_dims(quant_params.scale, quantized_dim + 1),
+          expanded_tensor_shape,
+      ),
+      tensor_content.shape,
+  )
+  if quant_params.zero_point is None or quant_params.zero_point.size == 0:
+    expanded_zp = np.zeros_like(tensor_content, dtype=np.int32)
+  else:
+    expanded_zp = np.reshape(
+        np.broadcast_to(
+            np.expand_dims(quant_params.zero_point, quantized_dim + 1),
+            expanded_tensor_shape,
+        ),
+        tensor_content.shape,
+    )
+  return qtyping.UniformQuantParams(
+      scale=expanded_scale,
+      zero_point=expanded_zp,
+      num_bits=quant_params.num_bits,
+      symmetric=quant_params.symmetric,
+      quantized_dimension=quantized_dim,
+      block_size=quant_params.block_size,
   )
 
 
 def uniform_quantize(
     tensor_data: np.ndarray,
     quantization_params: qtyping.UniformQuantParams,
+    is_blockwise_quant: bool = False,
 ):
   """Uniform quantize a tensor.
 
   Args:
     tensor_data: The tensor to be quantized.
     quantization_params: The quantization parameters.
+    is_blockwise_quant: Whether the tensor is blockwise quantized.
 
   Returns:
     The quantized tensor.
   """
+  # The reshaping for blockwise quantization is unique hence we do this here
+  # to avoid unexpected broadcast behavior downstream.
+  if is_blockwise_quant:
+    quantization_params = _broadcast_scale_zp_for_blockwise(
+        tensor_data, quantization_params
+    )
+
   # quant params in flatbuffer is flattened, expand the rank to be the same
   # as the tensor rank to avoid ambiguous broadcasting.
   quantization_params = fix_quantization_params_rank(
```
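To make the new blockwise helpers concrete, here is a minimal numpy sketch (my illustration, not the package API) of what `_get_tensor_shape_for_blockwise` and `_broadcast_scale_zp_for_blockwise` compute for a hypothetical int4 symmetric weight:

```python
import numpy as np

# Hypothetical (4, 8) weight, blockwise along dimension 1 with block_size=2.
weights = np.arange(32, dtype=np.float32).reshape(4, 8)
block_size = 2

# Split the quantized dim into (dim // block_size, block_size) so the
# min/max reduction yields one value per block.
blocked = weights.reshape(4, 8 // block_size, block_size)      # (4, 4, 2)
scale = np.maximum(np.abs(blocked).max(axis=-1), 1e-9) / 7.0   # int4 symmetric

# Broadcast the per-block scale back to the full tensor shape before the
# element-wise quantization step.
scale_full = np.broadcast_to(
    scale[..., None], blocked.shape
).reshape(weights.shape)
quantized = np.clip(np.rint(weights / scale_full), -8, 7).astype(np.int8)
assert quantized.shape == weights.shape
```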
```diff
@@ -145,8 +288,15 @@ def uniform_quantize(
   inverse_scales = 1.0 / scales
   # TODO: b/332574603 - support unsigned data type.
   qtype = IntType(quantization_params.num_bits, signed=True)
-  #
-  narrow_range = quantization_params.symmetric
+  # For quantization with more than 8 bits, symmetric narrow-range quantization
+  # is required due to assumptions made by legacy TFLite kernels. However, this
+  # method is not ideal for low-bit quantization (e.g., 2-bit quantization,
+  # which only has 4 bins), as it wastes a bin and there are no kernel
+  # requirements for a narrow range when < 8 bits because the data is unpacked
+  # to int8 before being used in the kernel.
+  narrow_range = (
+      quantization_params.symmetric and quantization_params.num_bits >= 8
+  )
   required_dtype = np.signedinteger if qtype.signed else np.unsignedinteger
   if not np.issubdtype(zero_points.dtype, required_dtype):
     raise ValueError(
```
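The new condition is easy to sanity-check; a small sketch of the rule described in the comment (the helper name is mine):

```python
# Symmetric quantization uses the narrow range [qmin + 1, qmax] only at 8 or
# more bits; below 8 bits the full range is kept, so no bin is wasted.
def quantized_range(num_bits: int, symmetric: bool) -> tuple[int, int]:
  qmin, qmax = -(2 ** (num_bits - 1)), 2 ** (num_bits - 1) - 1
  narrow = symmetric and num_bits >= 8
  return (qmin + 1 if narrow else qmin, qmax)

assert quantized_range(8, symmetric=True) == (-127, 127)
assert quantized_range(4, symmetric=True) == (-8, 7)  # full int4 range
```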
```diff
@@ -172,6 +322,26 @@ def uniform_dequantize(
   Returns:
     The dequantized tensor.
   """
+  if quantization_params.block_size != 0:
+    # b/443830202: The quantized dimension is currently increased by 1 because
+    # AEQ expects 1 and XNNPack expects 0.
+    quantization_params = dataclasses.replace(
+        quantization_params,
+        quantized_dimension=quantization_params.quantized_dimension + 1,
+    )
+    scale_shape = list(tensor_data.shape)
+    scale_shape[quantization_params.quantized_dimension] = (
+        scale_shape[quantization_params.quantized_dimension]
+        // quantization_params.block_size
+    )
+    quantization_params = dataclasses.replace(
+        quantization_params,
+        scale=quantization_params.scale.reshape(scale_shape),
+    )
+    quantization_params = _broadcast_scale_zp_for_blockwise(
+        tensor_data, quantization_params
+    )
+
   # quant params in flatbuffer is flattened, expand the rank to be the same
   # as the tensor rank to avoid ambiguous broadcasting.
   quantization_params = fix_quantization_params_rank(
@@ -187,6 +357,7 @@ def symmetric_quantize_bias_tensor(
     bias_content: np.ndarray,
     input_tensor_quant_params: qtyping.UniformQuantParams,
     weight_tensor_quant_params: qtyping.UniformQuantParams,
+    check_error: bool = True,
 ) -> qtyping.UniformQuantParams:
   """Quantize bias tensor (symmetrically, i.e., zero_point = 0).
 
@@ -198,6 +369,12 @@ def symmetric_quantize_bias_tensor(
     bias_content: The bias content.
     input_tensor_quant_params: The quantization parameters of input tensor.
     weight_tensor_quant_params: The quantization parameters of weight tensor.
+    check_error: Whether to check if the quantization error (the difference
+      between the original and dequantized bias) is larger than the quantization
+      scale. This check is important because bias quantization parameters are
+      fixed (bias_scale = input_scale * weight_scale), which can lead to large
+      quantization errors. Raising an error when the quantization error is
+      larger than the scale helps to identify unexpected numerical issues.
 
   Returns:
     The quantized bias tensor.
@@ -212,7 +389,8 @@ def symmetric_quantize_bias_tensor(
 
   # symmetric
   bias_zp = np.zeros_like(effective_output_scale, dtype=np.int32)
-  bias_number_bits = 64 if input_tensor_quant_params.num_bits == 16 else 32
+  # Fixed to 32 bits since most of the accelerators use int32 accumulator.
+  bias_number_bits = 32
   symmetric = True
   quantized_dimension = None if len(effective_output_scale) == 1 else 0
   bias_quant_params = qtyping.UniformQuantParams(
@@ -224,6 +402,24 @@ def symmetric_quantize_bias_tensor(
   )
 
   quantized_vars = uniform_quantize(bias_content, bias_quant_params)
+  if check_error:
+    dequantized_bias = uniform_dequantize(quantized_vars, bias_quant_params)
+    max_quant_error = np.max(np.abs(dequantized_bias - bias_content))
+    error_tolerance = np.maximum(1e-6, np.max(effective_output_scale))
+    if max_quant_error > error_tolerance:
+      raise ValueError(
+          "Quantization error is too large for bias tensor quantization. Max"
+          f" quantization error is {max_quant_error}, which exceed"
+          f" the threshold {error_tolerance}"
+      )
+
+  # Save the int32 quantized bias as int64 if the input tensor is quantized to
+  # 16 bits. This is to assume the matmul is using int64 accumulator (safe from
+  # overflow). For accelerators with int32 accumulator, it is safe to cast int64
+  # back to int32.
+  if input_tensor_quant_params.num_bits == 16:
+    quantized_vars = quantized_vars.astype(np.int64)
+    bias_number_bits = 64
 
   # UniformQuantParams is frozen dataclass, need to recreate.
   return qtyping.UniformQuantParams(
```
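A sketch of the failure mode `check_error` guards against, using the same numbers as the new test further down (the helper logic here is mine, not the package's):

```python
import numpy as np

# bias_scale is pinned to input_scale * weight_scale, so a bias that is huge
# relative to that scale saturates int32 and dequantizes far from the input.
input_scale, weight_scale = 0.1, 0.1
bias = np.array([3e7])
bias_scale = input_scale * weight_scale                        # 0.01
quantized = np.clip(np.rint(bias / bias_scale), -(2**31), 2**31 - 1)
max_quant_error = np.abs(quantized * bias_scale - bias).max()  # ~8.5e6
assert max_quant_error > max(1e-6, bias_scale)                 # check_error raises here
```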
```diff
@@ -237,30 +433,70 @@ def symmetric_quantize_bias_tensor(
 
 
 def tensor_zp_scale_from_min_max(
-    min_value, max_value, num_bits: int, symmetric: bool
+    min_value,
+    max_value,
+    num_bits: int,
+    symmetric: bool,
+    granularity: qtyping.QuantGranularity,
+    clipping_values: Optional[np.ndarray] = None,
 ):
   """Get zero point and scale from min and max value.
 
   Args:
-    min_value: The minimum value of the tensor (
-
+    min_value: The minimum value of the tensor (channelwise and blockwise
+      supported).
+    max_value: The maximum value of the tensor (channelwise and blockwise
+      supported).
     num_bits: The number of bits of the tensor.
     symmetric: Whether the tensor is symmetric.
+    granularity: The granularity of the tensor.
+    clipping_values: Absolute clipping values to apply to the tensor. This will
+      clip the tensors to the range [-clipping_values, clipping_values]. This
+      should be the same shape as min_value and max_value. If None, no clipping
+      will be applied.
 
   Returns:
     The zero point and scale of the tensor.
   """
+
   # TODO: b/332574603 - support unsigned data type.
   qtype = IntType(
       num_bits,
       signed=True,
   )
   qmin, qmax = get_quantized_range(qtype)
-  min_bound = 1e-
+  min_bound = 1e-9  # Avoid zero scale.
+  pos_clipping_values = None if clipping_values is None else clipping_values
+  neg_clipping_values = None if clipping_values is None else -clipping_values
+
+  if is_blockwise(granularity):
+    # Blockwise quantization uses float16 scale,
+    # with 7 bit mantissa, so the maximum scale value is 65280 and maximum
+    # representable range is [-65280 * (2 ** num_bits),
+    # 65280 * (2 ** num_bits - 1)].
+    # Note that we have one extra value on the negative side.
+    float16_max = np.broadcast_to(
+        np.array(65280) * (2**num_bits - 1), max_value.shape
+    )
+    float16_min = np.broadcast_to(
+        np.array(-65280) * (2**num_bits), min_value.shape
+    )
+    pos_clipping_values = (
+        float16_max
+        if pos_clipping_values is None
+        else np.minimum(pos_clipping_values, float16_max)
+    )
+    neg_clipping_values = (
+        float16_min
+        if neg_clipping_values is None
+        else np.maximum(neg_clipping_values, float16_min)
+    )
 
   if symmetric:
     bound = np.maximum(np.abs(min_value), np.abs(max_value))
     bound = np.maximum(bound, min_bound)
+    if clipping_values is not None:
+      bound = np.clip(bound, neg_clipping_values, pos_clipping_values)
     if not qtype.signed:
       half_q = (qmax - 1) / 2
       scale = bound / half_q
```
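The 65280 bound comes from how the scales are stored; a quick sketch (assuming only numpy and ml_dtypes) of the same cast chain the code applies:

```python
import ml_dtypes
import numpy as np

def round_scale(x: float) -> np.float32:
  # Same chain as the blockwise scale path: round to a 7-bit mantissa via
  # bfloat16, then store as float16.
  return (
      np.float32(x).astype(ml_dtypes.bfloat16).astype(np.float16).astype(np.float32)
  )

print(round_scale(65280.0))   # 65280.0 = (2 - 2**-7) * 2**15, the largest
                              # value that survives the round trip
print(round_scale(0.123456))  # ~0.1235: only ~7 bits of mantissa survive
```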
```diff
@@ -268,7 +504,6 @@ def tensor_zp_scale_from_min_max(
     else:
       scale = bound / qmax
       zp = np.zeros_like(scale, dtype=np.int32)
-
   else:
     # Include 0 to the range to support zero-padding.
     # See: https://arxiv.org/pdf/1712.05877.pdf
@@ -276,10 +511,18 @@ def tensor_zp_scale_from_min_max(
     bound_max = np.maximum(max_value, np.zeros_like(max_value))
     bound_min = np.minimum(min_value, np.zeros_like(min_value))
     bound = np.maximum(bound_max - bound_min, min_bound)
+    if clipping_values is not None:
+      bound = np.clip(bound, -clipping_values, clipping_values)
     scale = bound / (qmax - qmin)
     zp = qmin - bound_min / scale
     zp = np.rint(zp)
 
+  if is_blockwise(granularity):
+    # Round the scale values to 7 bit mantissa.
+    scale = (
+        scale.astype(ml_dtypes.bfloat16).astype(np.float16).astype(np.float32)
+    )
+
   # It's safe to cast zp to qtype without clipping because we can infer
   # qmin <= zp <= qmax from bound_min <= 0 <= bound_max.
   zp = assign_quantized_type(zp, qtype)
```
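A worked instance of the asymmetric branch above, with made-up numbers:

```python
import numpy as np

# 8-bit asymmetric example: the range is first stretched to include 0
# (zero-padding support), then scale = bound / (qmax - qmin) and
# zp = rint(qmin - bound_min / scale).
min_value, max_value, qmin, qmax = 1.0, 5.0, -128.0, 127.0
bound_min, bound_max = min(min_value, 0.0), max(max_value, 0.0)  # [0.0, 5.0]
scale = (bound_max - bound_min) / (qmax - qmin)                  # 5 / 255
zp = np.rint(qmin - bound_min / scale)                           # -128.0

# min_value round-trips exactly here: q = rint(1.0 / scale) + zp = -77.
q = np.rint(min_value / scale) + zp
assert np.isclose((q - zp) * scale, min_value)
```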
```diff
@@ -293,7 +536,8 @@ def _is_valid_quantization_params(
   """Checks if the quantization parameters are valid.
 
   A valid quantization params requires:
-    1. scale and zero point have the same shape
+    1. scale and zero point either have the same shape or the zero point is a
+      scalar.
     2. scale and zero point have the same rank as the tensor content (avoid
       ambiguous broadcasting).
 
@@ -304,17 +548,20 @@ def _is_valid_quantization_params(
   Returns:
     True if the quantization parameters are valid.
   """
-  if quantization_params.scale.shape != quantization_params.zero_point.shape:
+  if (
+      quantization_params.scale.shape != quantization_params.zero_point.shape
+      and quantization_params.zero_point.size != 1
+  ):
     raise ValueError(
-        "scale and zero_point must have the same shape"
-        f" {quantization_params.scale.shape} and"
+        "scale and zero_point must have the same shape or zero_point must have"
+        f" only one element. Got {quantization_params.scale.shape} and"
         f" {quantization_params.zero_point.shape}"
     )
 
   tensor_rank = tensor_data.ndim
   scale_rank = quantization_params.scale.ndim
   zero_point_rank = quantization_params.zero_point.ndim
-  if (tensor_rank != scale_rank) or (tensor_rank != zero_point_rank):
+  if tensor_rank != scale_rank or (tensor_rank != zero_point_rank):
     raise ValueError(
         f"Ranks of scales ({scale_rank}) and zps"
         f" ({zero_point_rank}) must be the same as the tensor rank"
```
ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py:

```diff
@@ -15,8 +15,11 @@
 
 """Tests for tensor_utils."""
 
+import dataclasses
+
 from absl.testing import parameterized
 import numpy as np
+
 from tensorflow.python.platform import googletest
 from ai_edge_quantizer import qtyping
 from ai_edge_quantizer.algorithms.uniform_quantize import uniform_quantize_tensor
@@ -123,6 +126,14 @@ class TensorUtilsTest(parameterized.TestCase):
           False,
           [-24, 10, 19, 127],
       ),
+      (
+          [-16.0, 1.3, 2.4, 16.0],
+          [0.12598425],
+          [0],
+          8,
+          True,
+          [-127, 10, 19, 127],  # int8 symmetric is narrow range, -127 to 127
+      ),
       (
           [-3.0, 1.3, 2.4, 16.0],
           [1.2666667],
@@ -137,7 +148,7 @@ class TensorUtilsTest(parameterized.TestCase):
           [-6],
           4,
           True,
-          [-7, -5, -4, 7],
+          [-8, -5, -4, 7],  # int4 symmetric is not narrow range, -8 to 7
       ),
   )
   def test_uniform_quantize(
@@ -160,7 +171,9 @@ class TensorUtilsTest(parameterized.TestCase):
   def test_uniform_quantize_wrong_shape(self):
     tensor = [-3.0, 1.3, 2.4, 16.0]
 
-    error_message =
+    error_message = (
+        "Ranks of scales (3) and zps (2) must be the same as the tensor rank"
+    )
     with self.assertRaisesWithPredicateMatch(
         ValueError, lambda err: error_message in str(err)
     ):
@@ -190,6 +203,28 @@ class TensorUtilsTest(parameterized.TestCase):
         ),
     )
 
+  def test_uniform_quantize_quant_dim_not_divisible_by_block_size_raise(self):
+    tensor = np.random.rand(34, 2)
+    error_message = (
+        "Quantized dimension 34 in tensor shape (34, 2) is not divisible by"
+        " block size 32."
+    )
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, lambda err: error_message in str(err)
+    ):
+      uniform_quantize_tensor.uniform_quantize(
+          np.array(tensor),
+          qtyping.UniformQuantParams(
+              quantized_dimension=0,
+              block_size=32,
+              num_bits=4,
+              scale=np.array([1.2666667]),
+              zero_point=np.array([-6]),
+              symmetric=True,
+          ),
+          is_blockwise_quant=True,
+      )
+
   @parameterized.parameters(
       (
           8,
@@ -233,7 +268,9 @@ class TensorUtilsTest(parameterized.TestCase):
   def test_uniform_dequantize_wrong_shape(self):
     tensor = [-3.0, 1.3, 2.4, 16.0]
 
-    error_message =
+    error_message = (
+        "Ranks of scales (3) and zps (2) must be the same as the tensor rank"
+    )
     with self.assertRaisesWithPredicateMatch(
         ValueError, lambda err: error_message in str(err)
     ):
@@ -263,8 +300,35 @@ class TensorUtilsTest(parameterized.TestCase):
         ),
     )
 
+  def test_uniform_dequantize_blockwise(self):
+    quantized_tensor = np.array([[-8, -5, -4, 7], [-4, 7, -8, -5]])
+    expected_output_tensor = np.array([
+        [-10.1333336, -6.3333335, -5.0666668, 8.8666669],
+        [-5.0666668, 8.8666669, -10.1333336, -6.3333335],
+    ])
+    quant_params = qtyping.UniformQuantParams(
+        # b/443830202:
+        quantized_dimension=0,
+        num_bits=4,
+        scale=np.array([[[1.2666667, 1.2666667], [1.2666667, 1.2666667]]]),
+        zero_point=np.array([[0]]),
+        symmetric=True,
+        block_size=2,
+    )
+
+    dequantized_tensor = uniform_quantize_tensor.uniform_dequantize(
+        np.array(quantized_tensor), quant_params
+    )
+
+    self.assertSequenceAlmostEqual(
+        expected_output_tensor.flatten(), dequantized_tensor.flatten(), places=4
+    )
+
   @parameterized.parameters(
-      (8, 8, True, True),
+      (8, 8, True, True),
+      (8, 4, False, True),
+      (16, 8, True, False),
+      (16, 8, True, True),
   )
   def test_quantize_bias_tensor(
       self,
```
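For reference, the expected values in `test_uniform_dequantize_blockwise` are just `q * scale` with a zero point of 0 and the per-block scale broadcast over `block_size=2`:

```python
import numpy as np

q = np.array([[-8, -5, -4, 7], [-4, 7, -8, -5]])
print(q * 1.2666667)
# [[-10.1333336  -6.3333335  -5.0666668   8.8666669]
#  [ -5.0666668   8.8666669 -10.1333336  -6.3333335]]
```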
```diff
@@ -322,6 +386,26 @@ class TensorUtilsTest(parameterized.TestCase):
     self.assertSequenceAlmostEqual(
         list(dequantized_bias.flatten()), list(bias_tensor_data), places=5
     )
+
+    if activation_num_bits == 16:
+      # Check if it is safe to cast int64 bias to int32. We save the int32
+      # quantized bias as int64 if the input tensor is quantized to 16 bits.
+      # This is to assume the matmul is using int64 accumulator (safe from
+      # overflow). For accelerators with int32 accumulator, it is safe to cast
+      # int64 back to int32.
+      quantized_bias = bias_quant_config.quantized_data
+      self.assertIsNotNone(quantized_bias)
+      self.assertEqual(quantized_bias.dtype, np.int64)
+      self.assertSequenceEqual(
+          list(quantized_bias.flatten()),
+          list(quantized_bias.astype(np.int32).flatten()),
+      )
+
+      bias_quant_config = dataclasses.replace(
+          bias_quant_config,
+          num_bits=32,
+      )
+
     expected_quantized_data = uniform_quantize_tensor.uniform_quantize(
         bias_tensor_data, bias_quant_config
     )
@@ -330,13 +414,44 @@ class TensorUtilsTest(parameterized.TestCase):
         list(bias_quant_config.quantized_data.flatten()),  # pytype: disable=attribute-error
     )
 
+  def test_quantize_bias_tensor_raises_error_for_large_quantization_error(self):
+    input_quant_config = qtyping.UniformQuantParams(
+        scale=np.array([0.1]),
+        zero_point=np.array([10]),
+        num_bits=8,
+        symmetric=False,
+        quantized_dimension=None,
+    )
+    weight_quant_config = qtyping.UniformQuantParams(
+        scale=np.array([0.1]),
+        zero_point=np.array([-1]),
+        num_bits=8,
+        symmetric=True,
+        quantized_dimension=None,
+    )
+    # This will result in quantized bias of 3e9, which is larger than int32 max.
+    bias_tensor_data = np.array([3e7])
+    with self.assertRaisesRegex(
+        ValueError,
+        "Quantization error is too large for bias tensor quantization.",
+    ):
+      uniform_quantize_tensor.symmetric_quantize_bias_tensor(
+          bias_tensor_data,
+          input_quant_config,
+          weight_quant_config,
+      )
+
   @parameterized.parameters((8, True), (16, False))
   def test_tensor_zp_scale_from_min_max(self, num_bits, symmetric):
     min_val = np.min(self._test_data, keepdims=True)
     max_val = np.max(self._test_data, keepdims=True)
 
     zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
-        min_val, max_val, num_bits, symmetric
+        min_val,
+        max_val,
+        num_bits,
+        symmetric,
+        qtyping.QuantGranularity.TENSORWISE,
     )
     self.assertEqual(zp.shape, scale.shape)
     max_q = 2**num_bits / 2 - 1
@@ -352,6 +467,38 @@ class TensorUtilsTest(parameterized.TestCase):
     # Range has to be extended to include zero.
     self.assertEqual(calculated_min, 0)
 
+  @parameterized.parameters(
+      # number of bits, is_symmetric, max bound of the quantized range.
+      (4, True, 7),
+      (8, False, 255),
+  )
+  def test_tensor_zp_scale_from_min_max_with_clipping(
+      self, num_bits, symmetric, quantized_bound
+  ):
+    min_val = np.array([[1.0]])
+    max_val = np.array([[5.0]])
+    clipping_values = np.array([4.0])
+    zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
+        min_val,
+        max_val,
+        num_bits,
+        symmetric,
+        qtyping.QuantGranularity.TENSORWISE,
+        clipping_values,
+    )
+    expected_scale = clipping_values / quantized_bound
+
+    with self.subTest(name="CheckShapes"):
+      self.assertEqual(zp.shape, scale.shape)
+      self.assertEqual(zp.shape, (1, 1))
+
+    if symmetric:
+      with self.subTest(name="CheckSymmetricZpValue"):
+        self.assertEqual(zp[0], 0)
+
+    with self.subTest(name="CheckScaleValue"):
+      self.assertEqual(scale[0], expected_scale)
+
 
 if __name__ == "__main__":
   googletest.main()
```