ai-edge-quantizer-nightly 0.0.1.dev20250317__py3-none-any.whl → 0.1.0.dev20250319__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_edge_quantizer/algorithm_manager.py +61 -0
- ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py +151 -1
- ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py +5 -142
- ai_edge_quantizer/algorithms/uniform_quantize/octav.py +174 -0
- ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py +186 -0
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py +14 -2
- ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py +27 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info}/METADATA +2 -2
- {ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info}/RECORD +12 -10
- {ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info}/LICENSE +0 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info}/WHEEL +0 -0
- {ai_edge_quantizer_nightly-0.0.1.dev20250317.dist-info → ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info}/top_level.txt +0 -0
@@ -17,6 +17,7 @@
|
|
17
17
|
|
18
18
|
import enum
|
19
19
|
import functools
|
20
|
+
from immutabledict import immutabledict
|
20
21
|
from ai_edge_quantizer import algorithm_manager_api
|
21
22
|
from ai_edge_quantizer import default_policy
|
22
23
|
from ai_edge_quantizer import qtyping
|
@@ -24,6 +25,7 @@ from ai_edge_quantizer.algorithms.nonlinear_quantize import float_casting
|
|
24
25
|
from ai_edge_quantizer.algorithms.uniform_quantize import common_quantize
|
25
26
|
from ai_edge_quantizer.algorithms.uniform_quantize import dequantized_weight_recovery
|
26
27
|
from ai_edge_quantizer.algorithms.uniform_quantize import naive_min_max_quantize
|
28
|
+
from ai_edge_quantizer.algorithms.uniform_quantize import octav
|
27
29
|
|
28
30
|
# TODO: b/399775701 - Clean up this file.
|
29
31
|
|
@@ -55,6 +57,7 @@ class AlgorithmName(str, enum.Enum):
|
|
55
57
|
MIN_MAX_UNIFORM_QUANT = naive_min_max_quantize.ALGORITHM_KEY
|
56
58
|
FLOAT_CASTING = float_casting.ALGORITHM_KEY
|
57
59
|
DEQUANTIZED_WEIGHT_RECOVERY = dequantized_weight_recovery.ALGORITHM_KEY
|
60
|
+
OCTAV = octav.ALGORITHM_KEY
|
58
61
|
|
59
62
|
### MIN/MAX_UNIFORM_QUANT ###
|
60
63
|
|
@@ -188,3 +191,61 @@ for (
|
|
188
191
|
dequantized_weight_recovery.get_tensor_quant_params,
|
189
192
|
),
|
190
193
|
)
|
194
|
+
|
195
|
+
|
196
|
+
# Register OCTAV algorithm.
# Op-level quantization configs are validated with the checker shared by the
# uniform quantization algorithms.
register_op_quant_config_validation_func(
    AlgorithmName.OCTAV,
    common_quantize.check_op_quantization_config,
)

# Register a config check policy for OCTAV algorithm (the default policy).
register_config_check_policy_func(
    AlgorithmName.OCTAV,
    default_policy.DEFAULT_CONFIG_CHECK_POLICY,
)

# Maps each op supported by OCTAV to the shared materialize function that is
# registered for it below.
_OCTAV_OP_NAME_MATERIALIZE_FUNC_DICT = immutabledict({
    _TFLOpName.INPUT: common_quantize.materialize_input,
    _TFLOpName.OUTPUT: common_quantize.materialize_output,
    _TFLOpName.FULLY_CONNECTED: common_quantize.materialize_fc_conv,
    _TFLOpName.BATCH_MATMUL: common_quantize.materialize_batch_matmul,
    _TFLOpName.CONV_2D: common_quantize.materialize_fc_conv,
    _TFLOpName.DEPTHWISE_CONV_2D: common_quantize.materialize_fc_conv,
    _TFLOpName.CONV_2D_TRANSPOSE: common_quantize.materialize_conv2d_transpose,
    _TFLOpName.RESHAPE: common_quantize.materialize_reshape,
    _TFLOpName.AVERAGE_POOL_2D: common_quantize.materialize_average_pool_2d,
    _TFLOpName.EMBEDDING_LOOKUP: common_quantize.materialize_embedding_lookup,
    _TFLOpName.SOFTMAX: common_quantize.materialize_softmax_and_logistic,
    _TFLOpName.TANH: common_quantize.materialize_tanh,
    _TFLOpName.TRANSPOSE: common_quantize.materialize_transpose,
    _TFLOpName.GELU: common_quantize.materialize_gelu,
    _TFLOpName.ADD: common_quantize.materialize_add,
    _TFLOpName.SUB: common_quantize.materialize_sub,
    _TFLOpName.MUL: common_quantize.materialize_mul,
    _TFLOpName.MEAN: common_quantize.materialize_mean,
    _TFLOpName.RSQRT: common_quantize.materialize_rsqrt,
    _TFLOpName.CONCATENATION: common_quantize.materialize_concatenation,
    _TFLOpName.STRIDED_SLICE: common_quantize.materialize_strided_slice,
    _TFLOpName.SPLIT: common_quantize.materialize_split,
    _TFLOpName.LOGISTIC: common_quantize.materialize_softmax_and_logistic,
    _TFLOpName.SLICE: common_quantize.materialize_slice,
    _TFLOpName.SUM: common_quantize.materialize_sum,
    _TFLOpName.SELECT_V2: common_quantize.materialize_select_v2,
    _TFLOpName.DYNAMIC_UPDATE_SLICE: (
        common_quantize.materialize_dynamic_update_slice
    ),
    _TFLOpName.STABLEHLO_COMPOSITE: common_quantize.materialize_composite,
})

# QSV initialization and calibration are reused from the naive min/max
# algorithm; only the tensor quant-params function is OCTAV-specific. It is
# bound as the first positional argument of every materialize function.
for op_name, materialize_func in _OCTAV_OP_NAME_MATERIALIZE_FUNC_DICT.items():
  register_quantized_op(
      AlgorithmName.OCTAV,
      op_name,
      naive_min_max_quantize.init_qsvs,
      calibration_func=naive_min_max_quantize.min_max_calibrate,
      materialize_func=functools.partial(
          materialize_func, octav.get_tensor_quant_params
      ),
  )
|
@@ -23,7 +23,7 @@ to implement the get_tensor_quant_params_fn with the
|
|
23
23
|
qtyping.GetTensorQuantParamsFuncSignature signature.
|
24
24
|
"""
|
25
25
|
|
26
|
-
from typing import Any
|
26
|
+
from typing import Any, Optional, Sequence
|
27
27
|
import numpy as np
|
28
28
|
from ai_edge_quantizer import qtyping
|
29
29
|
from ai_edge_quantizer.algorithms.uniform_quantize import uniform_quantize_tensor
|
@@ -669,3 +669,153 @@ def materialize_split(
|
|
669
669
|
constraint=_OpQuantConstraint.SAME_AS_INPUT_SCALE,
|
670
670
|
inputs_to_ignore=[0], # Split dimension does not need to be quantized.
|
671
671
|
)
|
672
|
+
|
673
|
+
|
674
|
+
def _get_tensor_shape_for_blockwise(
|
675
|
+
tensor_shape: Sequence[int], quantized_dim: int, block_size: int
|
676
|
+
) -> list[int]:
|
677
|
+
"""Get the tensor shape for blockwise quantization.
|
678
|
+
|
679
|
+
This function splits the quantize dimension of the tensor into blocks and the
|
680
|
+
dim/blocks. Hence, min/max of the tensor can be calculated for each block
|
681
|
+
using existing functions.
|
682
|
+
|
683
|
+
Args:
|
684
|
+
tensor_shape: The original shape of the tensor.
|
685
|
+
quantized_dim: The dimension to be quantized blockwise.
|
686
|
+
block_size: The size of the block.
|
687
|
+
|
688
|
+
Returns:
|
689
|
+
The new tensor shape for calculating scale and zp for blockwise
|
690
|
+
quantization.
|
691
|
+
"""
|
692
|
+
new_shape = []
|
693
|
+
for index, val in enumerate(tensor_shape):
|
694
|
+
if index == quantized_dim:
|
695
|
+
new_shape.append(int(val / block_size))
|
696
|
+
new_shape.append(block_size)
|
697
|
+
else:
|
698
|
+
new_shape.append(val)
|
699
|
+
return new_shape
|
700
|
+
|
701
|
+
|
702
|
+
def _reshape_data_for_blockwise(
    tensor_data: np.ndarray, quantized_dim: int, block_size: int
) -> tuple[np.ndarray, int]:
  """Splits the quantized dimension of `tensor_data` into blocks.

  Args:
    tensor_data: The original tensor data.
    quantized_dim: The dimension to be quantized blockwise.
    block_size: The size of the block.

  Returns:
    A `(reshaped_data, reduce_dim)` tuple: the tensor reshaped to the blockwise
    shape, and the axis to reduce over (`quantized_dim + 1`).
  """
  blockwise_shape = _get_tensor_shape_for_blockwise(
      tensor_data.shape, quantized_dim, block_size
  )
  # The helper inserts the block axis right after the quantized dimension.
  block_axis = quantized_dim + 1
  return tensor_data.reshape(blockwise_shape), block_axis
|
720
|
+
|
721
|
+
|
722
|
+
def broadcast_scale_zp_for_blockwise(
    tensor_content: np.ndarray,
    quant_params: qtyping.UniformQuantParams,
) -> qtyping.UniformQuantParams:
  """Expands blockwise scale and zero point to the shape of the tensor.

  Args:
    tensor_content: The original tensor data.
    quant_params: The quantization parameters.
      `quant_params.quantized_dimension` must be specified.
      `quant_params.block_size` must be specified and positive.

  Returns:
    New quantization parameters whose scale and zero point have the same shape
    as `tensor_content`; `num_bits`, `symmetric`, `quantized_dimension` and
    `block_size` are copied from `quant_params`.

  Raises:
    ValueError: If the quantized dimension is missing, or the block size is
      missing or not positive.
  """
  quantized_dim = quant_params.quantized_dimension
  if quantized_dim is None:
    raise ValueError("Quantized dimension must be specified.")
  block_size = quant_params.block_size
  if block_size is None or block_size <= 0:
    raise ValueError("Block size must be specified and positive.")

  blockwise_shape = _get_tensor_shape_for_blockwise(
      tensor_content.shape, quantized_dim, block_size
  )

  def _expand_to_tensor_shape(per_block_values):
    # Add a block axis after the quantized dimension, repeat each value
    # across its block, then collapse back to the original tensor shape.
    with_block_axis = np.expand_dims(per_block_values, quantized_dim + 1)
    repeated = np.broadcast_to(with_block_axis, blockwise_shape)
    return np.reshape(repeated, tensor_content.shape)

  return qtyping.UniformQuantParams(
      scale=_expand_to_tensor_shape(quant_params.scale),
      zero_point=_expand_to_tensor_shape(quant_params.zero_point),
      num_bits=quant_params.num_bits,
      symmetric=quant_params.symmetric,
      quantized_dimension=quantized_dim,
      block_size=block_size,
  )
|
768
|
+
|
769
|
+
|
770
|
+
def init_tensor_min_max(
    tensor_data: Optional[np.ndarray],
    op_info: qtyping.OpInfo,
) -> qtyping.QSV:
  """Computes the initial min/max statistics of a tensor.

  The reduction depends on the granularity of the op's weight tensor config:
  blockwise reduces over the block axis of the blockwise-reshaped data;
  otherwise the data is reduced (keeping dims) over the axes returned by
  `common_utils.get_reduce_dims` for the quantized dimension, which is only
  set for channelwise granularity.

  Args:
    tensor_data: The tensor data.
    op_info: Aggregated information about the op (e.g., quantization config).

  Returns:
    A dictionary with "min" and "max" entries for the tensor, or an empty
    dictionary if the tensor data is None.
  """
  if tensor_data is None:
    return {}

  weight_tensor_config = op_info.op_quant_config.weight_tensor_config
  granularity = (
      None if weight_tensor_config is None else weight_tensor_config.granularity
  )
  is_channelwise = granularity == qtyping.QuantGranularity.CHANNELWISE
  is_blockwise = granularity == qtyping.QuantGranularity.BLOCKWISE

  quantized_dim = None
  if is_channelwise or is_blockwise:
    quantized_dim = common_utils.get_weight_quantized_dim(op_info, tensor_data)

  if is_blockwise:
    blocked_data, block_axis = _reshape_data_for_blockwise(
        tensor_data,
        quantized_dim,
        weight_tensor_config.block_size,
    )
    # The block axis is dropped (keepdims=False), leaving one value per block.
    return {
        "min": np.min(blocked_data, axis=block_axis, keepdims=False),
        "max": np.max(blocked_data, axis=block_axis, keepdims=False),
    }

  reduce_dims = common_utils.get_reduce_dims(quantized_dim, tensor_data.shape)
  return {
      "min": np.min(tensor_data, axis=reduce_dims, keepdims=True),
      "max": np.max(tensor_data, axis=reduce_dims, keepdims=True),
  }
|
@@ -15,10 +15,10 @@
|
|
15
15
|
|
16
16
|
"""Performs naive min/max uniform quantization."""
|
17
17
|
|
18
|
-
from collections.abc import Sequence
|
19
18
|
from typing import Any, Optional
|
20
19
|
import numpy as np
|
21
20
|
from ai_edge_quantizer import qtyping
|
21
|
+
from ai_edge_quantizer.algorithms.uniform_quantize import common_quantize
|
22
22
|
from ai_edge_quantizer.algorithms.uniform_quantize import uniform_quantize_tensor
|
23
23
|
from ai_edge_quantizer.algorithms.utils import common_utils
|
24
24
|
from ai_edge_quantizer.utils import tfl_flatbuffer_utils
|
@@ -29,143 +29,6 @@ _QuantTransformation = qtyping.QuantTransformation
|
|
29
29
|
_IntType = uniform_quantize_tensor.IntType
|
30
30
|
|
31
31
|
|
32
|
-
def _init_tensor_min_max(
|
33
|
-
tensor_data: Optional[np.ndarray],
|
34
|
-
op_info: qtyping.OpInfo,
|
35
|
-
) -> qtyping.QSV:
|
36
|
-
"""Initialize the min/max for a tensor."""
|
37
|
-
if tensor_data is None:
|
38
|
-
return {}
|
39
|
-
else:
|
40
|
-
weight_tensor_config = op_info.op_quant_config.weight_tensor_config
|
41
|
-
quantized_dim = None
|
42
|
-
if weight_tensor_config is not None and (
|
43
|
-
weight_tensor_config.granularity == qtyping.QuantGranularity.CHANNELWISE
|
44
|
-
or weight_tensor_config.granularity
|
45
|
-
== qtyping.QuantGranularity.BLOCKWISE
|
46
|
-
):
|
47
|
-
quantized_dim = common_utils.get_weight_quantized_dim(
|
48
|
-
op_info, tensor_data
|
49
|
-
)
|
50
|
-
if (
|
51
|
-
weight_tensor_config is not None
|
52
|
-
and weight_tensor_config.granularity
|
53
|
-
== qtyping.QuantGranularity.BLOCKWISE
|
54
|
-
):
|
55
|
-
reshaped_data, reduce_dims = _reshape_data_for_blockwise(
|
56
|
-
tensor_data,
|
57
|
-
quantized_dim,
|
58
|
-
weight_tensor_config.block_size,
|
59
|
-
)
|
60
|
-
return {
|
61
|
-
"min": np.min(reshaped_data, axis=reduce_dims, keepdims=False),
|
62
|
-
"max": np.max(reshaped_data, axis=reduce_dims, keepdims=False),
|
63
|
-
}
|
64
|
-
|
65
|
-
else:
|
66
|
-
reduce_dims = common_utils.get_reduce_dims(
|
67
|
-
quantized_dim, tensor_data.shape
|
68
|
-
)
|
69
|
-
return {
|
70
|
-
"min": np.min(tensor_data, axis=reduce_dims, keepdims=True),
|
71
|
-
"max": np.max(tensor_data, axis=reduce_dims, keepdims=True),
|
72
|
-
}
|
73
|
-
|
74
|
-
|
75
|
-
def _get_tensor_shape_for_blockwise(
|
76
|
-
tensor_shape: Sequence[int], quantized_dim: int, block_size: int
|
77
|
-
) -> list[int]:
|
78
|
-
"""Get the tensor shape for blockwise quantization.
|
79
|
-
|
80
|
-
This function splits the quantize dimension of the tensor into blocks and the
|
81
|
-
dim/blocks. Hence, min/max of the tensor can be calculated for each block
|
82
|
-
using existing functions.
|
83
|
-
|
84
|
-
Args:
|
85
|
-
tensor_shape: The original shape of the tensor.
|
86
|
-
quantized_dim: The dimension to be quantized blockwise.
|
87
|
-
block_size: The size of the block.
|
88
|
-
|
89
|
-
Returns:
|
90
|
-
The new tensor shape for calculating scale and zp for blockwise
|
91
|
-
quantization.
|
92
|
-
"""
|
93
|
-
new_shape = []
|
94
|
-
for index, val in enumerate(tensor_shape):
|
95
|
-
if index == quantized_dim:
|
96
|
-
new_shape.append(int(val / block_size))
|
97
|
-
new_shape.append(block_size)
|
98
|
-
else:
|
99
|
-
new_shape.append(val)
|
100
|
-
return new_shape
|
101
|
-
|
102
|
-
|
103
|
-
def _reshape_data_for_blockwise(
|
104
|
-
tensor_data: np.ndarray, quantized_dim: int, block_size: int
|
105
|
-
) -> tuple[np.ndarray, int]:
|
106
|
-
"""Reshapes data for blockwise quantization.
|
107
|
-
|
108
|
-
Args:
|
109
|
-
tensor_data: The original tensor data.
|
110
|
-
quantized_dim: The dimension to be quantized blockwise.
|
111
|
-
block_size: The size of the block.
|
112
|
-
|
113
|
-
Returns:
|
114
|
-
A tuple containing the reshaped tensor data and the new reduce dimension.
|
115
|
-
"""
|
116
|
-
new_shape = _get_tensor_shape_for_blockwise(
|
117
|
-
tensor_data.shape, quantized_dim, block_size
|
118
|
-
)
|
119
|
-
reshaped_data = tensor_data.reshape(new_shape)
|
120
|
-
return reshaped_data, quantized_dim + 1
|
121
|
-
|
122
|
-
|
123
|
-
def _broadcast_scale_zp_for_blockwise(
|
124
|
-
tensor_content: np.ndarray,
|
125
|
-
quant_params: qtyping.UniformQuantParams,
|
126
|
-
) -> qtyping.UniformQuantParams:
|
127
|
-
"""Broadcasts scale and zp for blockwise quantization.
|
128
|
-
|
129
|
-
Args:
|
130
|
-
tensor_content: The original tensor data.
|
131
|
-
quant_params: The quantization parameters.
|
132
|
-
|
133
|
-
Returns:
|
134
|
-
The updated quantization parameters with broadcasted scale and zp for
|
135
|
-
correct constant quantization.
|
136
|
-
"""
|
137
|
-
if quant_params.quantized_dimension is None:
|
138
|
-
raise ValueError("Quantized dimension must be specified.")
|
139
|
-
if quant_params.block_size is None or quant_params.block_size <= 0:
|
140
|
-
raise ValueError("Block size must be specified and positive.")
|
141
|
-
quantized_dim = quant_params.quantized_dimension
|
142
|
-
expanded_tensor_shape = _get_tensor_shape_for_blockwise(
|
143
|
-
tensor_content.shape, quantized_dim, quant_params.block_size
|
144
|
-
)
|
145
|
-
expanded_scale = np.reshape(
|
146
|
-
np.broadcast_to(
|
147
|
-
np.expand_dims(quant_params.scale, quantized_dim + 1),
|
148
|
-
expanded_tensor_shape,
|
149
|
-
),
|
150
|
-
tensor_content.shape,
|
151
|
-
)
|
152
|
-
expanded_zp = np.reshape(
|
153
|
-
np.broadcast_to(
|
154
|
-
np.expand_dims(quant_params.zero_point, quantized_dim + 1),
|
155
|
-
expanded_tensor_shape,
|
156
|
-
),
|
157
|
-
tensor_content.shape,
|
158
|
-
)
|
159
|
-
return qtyping.UniformQuantParams(
|
160
|
-
scale=expanded_scale,
|
161
|
-
zero_point=expanded_zp,
|
162
|
-
num_bits=quant_params.num_bits,
|
163
|
-
symmetric=quant_params.symmetric,
|
164
|
-
quantized_dimension=quantized_dim,
|
165
|
-
block_size=quant_params.block_size,
|
166
|
-
)
|
167
|
-
|
168
|
-
|
169
32
|
def get_tensor_quant_params(
|
170
33
|
op_info: qtyping.OpInfo,
|
171
34
|
tensor_quant_config: qtyping.TensorQuantizationConfig,
|
@@ -191,7 +54,7 @@ def get_tensor_quant_params(
|
|
191
54
|
# weight-only and DRQ do not require calibration, thus it is
|
192
55
|
# possible that this information is missing here. In that case we
|
193
56
|
# collect min/max on the spot.
|
194
|
-
tensor_min_max = _init_tensor_min_max(
|
57
|
+
tensor_min_max = common_quantize.init_tensor_min_max(
|
195
58
|
tensor_content,
|
196
59
|
op_info,
|
197
60
|
)
|
@@ -238,7 +101,7 @@ def get_tensor_quant_params(
|
|
238
101
|
# The reshaping for blockwise quantization is unique hence we do this here
|
239
102
|
# to avoid unexpected broadcast behavior downstream.
|
240
103
|
if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
|
241
|
-
quant_params = _broadcast_scale_zp_for_blockwise(
|
104
|
+
quant_params = common_quantize.broadcast_scale_zp_for_blockwise(
|
242
105
|
tensor_content, quant_params
|
243
106
|
)
|
244
107
|
|
@@ -286,7 +149,7 @@ def init_qsvs(
|
|
286
149
|
tensor_data = tfl_flatbuffer_utils.get_tensor_data(
|
287
150
|
tensor, graph_info.buffers
|
288
151
|
)
|
289
|
-
op_qsvs[tensor_name] = _init_tensor_min_max(
|
152
|
+
op_qsvs[tensor_name] = common_quantize.init_tensor_min_max(
|
290
153
|
tensor_data,
|
291
154
|
op_info,
|
292
155
|
)
|
@@ -297,7 +160,7 @@ def init_qsvs(
|
|
297
160
|
tensor_data = tfl_flatbuffer_utils.get_tensor_data(
|
298
161
|
tensor, graph_info.buffers
|
299
162
|
)
|
300
|
-
op_qsvs[tensor_name] = _init_tensor_min_max(
|
163
|
+
op_qsvs[tensor_name] = common_quantize.init_tensor_min_max(
|
301
164
|
tensor_data,
|
302
165
|
op_info,
|
303
166
|
)
|
@@ -0,0 +1,174 @@
|
|
1
|
+
# Copyright 2024 The AI Edge Quantizer Authors.
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
# ==============================================================================
|
15
|
+
|
16
|
+
"""Implements the OCTAV quantization."""
|
17
|
+
|
18
|
+
import dataclasses
|
19
|
+
from typing import Any, Optional, Sequence, Union
|
20
|
+
import numpy as np
|
21
|
+
from ai_edge_quantizer import qtyping
|
22
|
+
from ai_edge_quantizer.algorithms.uniform_quantize import common_quantize
|
23
|
+
from ai_edge_quantizer.algorithms.uniform_quantize import naive_min_max_quantize
|
24
|
+
from ai_edge_quantizer.algorithms.uniform_quantize import uniform_quantize_tensor
|
25
|
+
from ai_edge_quantizer.algorithms.utils import common_utils
|
26
|
+
|
27
|
+
ALGORITHM_KEY = "OCTAV"
|
28
|
+
|
29
|
+
|
30
|
+
def _guess_clipping_with_octav(
|
31
|
+
x: np.ndarray,
|
32
|
+
bits: int,
|
33
|
+
axis: Union[int, Sequence[int]],
|
34
|
+
max_iterations: int,
|
35
|
+
exponent_divisor: float,
|
36
|
+
early_stop: bool = True,
|
37
|
+
) -> np.ndarray:
|
38
|
+
"""Returns a tensor of absolute clipping constants for a tensor using OCTAV.
|
39
|
+
|
40
|
+
This method implements equation (6) from the OCTAV paper:
|
41
|
+
https://arxiv.org/abs/2206.06501
|
42
|
+
|
43
|
+
Args:
|
44
|
+
x: Tensor data to return guesses for.
|
45
|
+
bits: Number of bits used during quantization.
|
46
|
+
axis: Axis to reduce the tensor along to get the guesses.
|
47
|
+
max_iterations: Number of Newton-Raphson iterations to use.
|
48
|
+
exponent_divisor: What factor to divide the 4^-bits term by. In the paper,
|
49
|
+
3.0 is optimal for signed ints and 12.0 for unsigned ints.
|
50
|
+
early_stop: If True, stop the iteration if the guess doesn't change.
|
51
|
+
|
52
|
+
Returns:
|
53
|
+
A tensor of shape [num_channels] with clipping constant guesses.
|
54
|
+
"""
|
55
|
+
magnitude = np.abs(x)
|
56
|
+
x_reduced = np.mean(x, axis=axis, keepdims=True)
|
57
|
+
old_guess = np.zeros(x_reduced.shape)
|
58
|
+
guess = np.ones(x_reduced.shape)
|
59
|
+
for _ in range(max_iterations):
|
60
|
+
if early_stop and np.allclose(guess, old_guess):
|
61
|
+
break
|
62
|
+
guess_broadcasted = np.broadcast_to(guess, magnitude.shape)
|
63
|
+
guess_mask = np.asarray(magnitude < guess_broadcasted, dtype=x.dtype)
|
64
|
+
numerator = np.sum(
|
65
|
+
magnitude * np.asarray(1.0 - guess_mask), axis=axis, keepdims=True
|
66
|
+
)
|
67
|
+
denominator1 = (4.0 ** (-bits) / exponent_divisor) * np.sum(
|
68
|
+
guess_mask, axis=axis, keepdims=True
|
69
|
+
)
|
70
|
+
denominator2 = np.sum(1.0 - guess_mask, axis=axis, keepdims=True)
|
71
|
+
old_guess = guess
|
72
|
+
guess = numerator / (denominator1 + denominator2)
|
73
|
+
|
74
|
+
return guess
|
75
|
+
|
76
|
+
|
77
|
+
def get_tensor_quant_params(
    op_info: qtyping.OpInfo,
    tensor_quant_config: qtyping.TensorQuantizationConfig,
    tensor_content: Optional[np.ndarray] = None,
    tensor_qsv: Optional[dict[str, Any]] = None,
) -> qtyping.UniformQuantParams:
  """Returns the quantization parameters for a tensor.

  Args:
    op_info: Aggregated information about the op (e.g., quantization config).
    tensor_quant_config: The quantization config for the tensor.
    tensor_content: The content of the tensor. When None, it means the tensor is
      not a weight tensor (e.g. static quantization) so we fallback to using
      naive_min_max_quantize.
    tensor_qsv: A dictionary containing the min/max of the tensor. When None,
      min/max are computed from `tensor_content`.

  Returns:
    When `tensor_content` is None, the result of
    `naive_min_max_quantize.get_tensor_quant_params`. Otherwise the quantization
    parameters derived from the min/max and the OCTAV clipping constants, with
    `quantized_data` set to the quantized `tensor_content`.

  Raises:
    ValueError: If the granularity is neither channelwise nor tensorwise (e.g.
      blockwise quantization is requested).
    ValueError: If the asymmetric quantization is requested.
    ValueError: `tensor_qsv` must contain min/max values, or `tensor_content`
      must be provided so that they can be inferred.
  """
  # Fallback to naive_min_max_quantize.py for non-weight tensors.
  if tensor_content is None:
    return naive_min_max_quantize.get_tensor_quant_params(
        op_info, tensor_quant_config, tensor_content, tensor_qsv
    )

  if (
      tensor_quant_config.granularity != qtyping.QuantGranularity.CHANNELWISE
      and tensor_quant_config.granularity != qtyping.QuantGranularity.TENSORWISE
  ):
    raise ValueError(
        f"Unsupported granularity: {tensor_quant_config.granularity}."
    )

  if not tensor_quant_config.symmetric:
    raise ValueError(
        f"Unsupported symmetry: {tensor_quant_config.symmetric}. OCTAV"
        " supports symmetric quantization only for now."
    )

  if tensor_qsv is None:
    # We need min/max to calculate quantization parameters, which
    # should be collected during the calibration process. However,
    # weight-only and DRQ do not require calibration, thus it is
    # possible that this information is missing here. In that case we
    # collect min/max on the spot.
    tensor_min_max = common_quantize.init_tensor_min_max(
        tensor_content,
        op_info,
    )
  else:
    tensor_min_max = tensor_qsv

  if "min" not in tensor_min_max or "max" not in tensor_min_max:
    raise ValueError(
        "min and max must be provided to produce tensor quantization"
        " parameters. Check if the correct calibration results are passed into"
        " the ParamsGenerator."
    )

  quantized_dim = None
  if tensor_quant_config.granularity == qtyping.QuantGranularity.CHANNELWISE:
    quantized_dim = common_utils.get_weight_quantized_dim(
        op_info, tensor_content
    )

  clipping_constants = _guess_clipping_with_octav(
      tensor_content,
      tensor_quant_config.num_bits,
      common_utils.get_reduce_dims(quantized_dim, tensor_content.shape),
      max_iterations=10,
      # Asymmetric configs were rejected above, so only the symmetric divisor
      # applies; an asymmetric branch here would be unreachable.
      exponent_divisor=3.0,
  )

  zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
      tensor_min_max["min"],
      tensor_min_max["max"],
      tensor_quant_config.num_bits,
      tensor_quant_config.symmetric,
      clipping_constants,
  )

  quant_params = qtyping.UniformQuantParams(
      scale=scale,
      zero_point=zp,
      num_bits=tensor_quant_config.num_bits,
      symmetric=tensor_quant_config.symmetric,
      quantized_dimension=quantized_dim,
      block_size=tensor_quant_config.block_size,
  )

  quantized_vars = uniform_quantize_tensor.uniform_quantize(
      tensor_content, quant_params
  )

  # Return a copy carrying the quantized data rather than mutating in place.
  return dataclasses.replace(quant_params, quantized_data=quantized_vars)
|
@@ -0,0 +1,186 @@
|
|
1
|
+
# Copyright 2024 The AI Edge Quantizer Authors.
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
# ==============================================================================
|
15
|
+
|
16
|
+
import os
|
17
|
+
from typing import cast
|
18
|
+
|
19
|
+
from absl.testing import parameterized
|
20
|
+
import numpy as np
|
21
|
+
|
22
|
+
from tensorflow.python.platform import googletest
|
23
|
+
from ai_edge_quantizer import qtyping
|
24
|
+
from ai_edge_quantizer.algorithms.uniform_quantize import octav
|
25
|
+
from ai_edge_quantizer.utils import test_utils
|
26
|
+
from ai_edge_quantizer.utils import tfl_flatbuffer_utils
|
27
|
+
|
28
|
+
|
29
|
+
class OctavQuantizeTest(parameterized.TestCase):
|
30
|
+
"""Tests for general functions for OCTAV."""
|
31
|
+
|
32
|
+
def setUp(self):
|
33
|
+
super().setUp()
|
34
|
+
np.random.seed(666)
|
35
|
+
self._test_model_path = os.path.join(
|
36
|
+
test_utils.get_path_to_datafile("../../tests/models"),
|
37
|
+
"conv_fc_mnist.tflite",
|
38
|
+
)
|
39
|
+
self._test_model = tfl_flatbuffer_utils.read_model(self._test_model_path)
|
40
|
+
# The test model has one subgraph for now.
|
41
|
+
self._graph_info = qtyping.GraphInfo(
|
42
|
+
subgraph_tensors=self._test_model.subgraphs[0].tensors,
|
43
|
+
buffers=self._test_model.buffers,
|
44
|
+
)
|
45
|
+
self._tensor_name_to_qsv = {}
|
46
|
+
subgraph0 = self._test_model.subgraphs[0]
|
47
|
+
subgraph_op_index = 3
|
48
|
+
fc_op = subgraph0.operators[subgraph_op_index]
|
49
|
+
self._fc_op_info = qtyping.OpInfo(
|
50
|
+
op=fc_op,
|
51
|
+
op_name=qtyping.TFLOperationName.FULLY_CONNECTED,
|
52
|
+
subgraph_op_index=subgraph_op_index,
|
53
|
+
op_quant_config=qtyping.OpQuantizationConfig(
|
54
|
+
weight_tensor_config=None,
|
55
|
+
),
|
56
|
+
)
|
57
|
+
|
58
|
+
def test_get_tensor_quant_params_unsupported_granularity_assert(self):
|
59
|
+
err_msg = "Unsupported granularity"
|
60
|
+
test_data = np.array([[-7, 7], [4, -4], [4, -4], [7, 7]])
|
61
|
+
with self.assertRaisesWithPredicateMatch(
|
62
|
+
ValueError, lambda err: err_msg in str(err)
|
63
|
+
):
|
64
|
+
_ = octav.get_tensor_quant_params(
|
65
|
+
op_info=self._fc_op_info,
|
66
|
+
tensor_quant_config=qtyping.TensorQuantizationConfig(
|
67
|
+
num_bits=4,
|
68
|
+
symmetric=True,
|
69
|
+
granularity=qtyping.QuantGranularity.BLOCKWISE,
|
70
|
+
),
|
71
|
+
tensor_content=test_data,
|
72
|
+
)
|
73
|
+
|
74
|
+
def test_get_tensor_quant_params_unsupported_symmetry(self):
|
75
|
+
err_msg = "Unsupported symmetry"
|
76
|
+
test_data = np.array([[-7, 7], [4, -4], [4, -4], [7, 7]])
|
77
|
+
with self.assertRaisesWithPredicateMatch(
|
78
|
+
ValueError, lambda err: err_msg in str(err)
|
79
|
+
):
|
80
|
+
_ = octav.get_tensor_quant_params(
|
81
|
+
op_info=self._fc_op_info,
|
82
|
+
tensor_quant_config=qtyping.TensorQuantizationConfig(
|
83
|
+
num_bits=4,
|
84
|
+
symmetric=False,
|
85
|
+
granularity=qtyping.QuantGranularity.CHANNELWISE,
|
86
|
+
),
|
87
|
+
tensor_content=test_data,
|
88
|
+
)
|
89
|
+
|
90
|
+
def test_get_tensor_quant_params_success_with_qsv(self):
|
91
|
+
# Fall back to naive_min_max_quantize.py for non-weight tensors.
|
92
|
+
tensor_quant_params = octav.get_tensor_quant_params(
|
93
|
+
op_info=self._fc_op_info,
|
94
|
+
tensor_quant_config=qtyping.TensorQuantizationConfig(
|
95
|
+
num_bits=8,
|
96
|
+
granularity=qtyping.QuantGranularity.TENSORWISE,
|
97
|
+
),
|
98
|
+
tensor_qsv={
|
99
|
+
"min": np.array([-1]),
|
100
|
+
"max": np.array([1]),
|
101
|
+
},
|
102
|
+
)
|
103
|
+
|
104
|
+
self.assertIsNone(tensor_quant_params.quantized_dimension)
|
105
|
+
scale = tensor_quant_params.scale
|
106
|
+
self.assertEqual(scale.shape, (1,))
|
107
|
+
self.assertSequenceAlmostEqual(scale.flatten(), [1 / 127])
|
108
|
+
|
109
|
+
# Zero point should be zero for symmetric quantization.
|
110
|
+
zp = tensor_quant_params.zero_point
|
111
|
+
self.assertEqual(np.sum(zp), 0)
|
112
|
+
self.assertEqual(zp.shape, (1,))
|
113
|
+
|
114
|
+
def test_get_tensor_quant_params_sanity_tensorwise(self):
  """Sanity-checks tensorwise 4-bit symmetric params on data with outliers."""
  # The +/-1e5 entries dwarf every other value in the tensor.
  test_data = np.array([
      [-1e5, 25, -50, 75, -100, 125],
      [25, -30, 50, -75, 1e5, -125],
      [50, -60, 70, -80, 90, -100],
  ])
  tensorwise_config = qtyping.TensorQuantizationConfig(
      num_bits=4,
      symmetric=True,
      granularity=qtyping.QuantGranularity.TENSORWISE,
  )
  quant_params = octav.get_tensor_quant_params(
      op_info=self._fc_op_info,
      tensor_quant_config=tensorwise_config,
      tensor_content=test_data,
  )
  # Map the quantized values back to the float domain via the scale.
  dequantized = quant_params.quantized_data * quant_params.scale
  largest_original = np.abs(test_data).max()
  largest_dequantized = np.abs(dequantized).max()

  # Check that some clipping occurred.
  with self.subTest(name="SanityCheckClipping"):
    self.assertLess(largest_dequantized, largest_original)

  with self.subTest(name="SanityCheckQuantParamsShapes"):
    self.assertEqual(quant_params.zero_point.shape, (1, 1))
    self.assertEqual(quant_params.scale.shape, (1, 1))
    self.assertIsNone(quant_params.quantized_dimension)
    self.assertIsNotNone(quant_params.quantized_data)
    quantized = cast(np.ndarray, quant_params.quantized_data)
    self.assertTupleEqual(quantized.shape, test_data.shape)

  with self.subTest(name="SanityCheckQuantParamsValues"):
    self.assertTrue(np.all(quant_params.zero_point == 0))
|
148
|
+
|
149
|
+
def test_get_tensor_quant_params_sanity_channelwise(self):
  """Sanity-checks channelwise 4-bit symmetric params, one check per row."""
  test_data = np.array([
      [-1e5, 25, -50, 75, -100, 125],
      [25, -30, 50, -75, 1e5, -125],
      [50, -60, 70, -80, 90, -100],
  ])
  channelwise_config = qtyping.TensorQuantizationConfig(
      num_bits=4,
      symmetric=True,
      granularity=qtyping.QuantGranularity.CHANNELWISE,
  )
  quant_params = octav.get_tensor_quant_params(
      op_info=self._fc_op_info,
      tensor_quant_config=channelwise_config,
      tensor_content=test_data,
  )
  # Map the quantized values back to the float domain via the scale.
  dequantized = quant_params.quantized_data * quant_params.scale
  for original_row, dequantized_row in zip(test_data, dequantized):
    largest_original = np.abs(original_row).max()
    largest_dequantized = np.abs(dequantized_row).max()
    # Check that some clipping occurred.
    with self.subTest(name="SanityCheckClipping"):
      self.assertLess(largest_dequantized, largest_original)

  num_rows = test_data.shape[0]
  with self.subTest(name="SanityCheckQuantParamsShapes"):
    self.assertEqual(quant_params.zero_point.shape, (num_rows, 1))
    self.assertEqual(quant_params.scale.shape, (num_rows, 1))
    self.assertIsNotNone(quant_params.quantized_data)
    quantized = cast(np.ndarray, quant_params.quantized_data)
    self.assertTupleEqual(quantized.shape, test_data.shape)

  with self.subTest(name="SanityCheckQuantParamsValues"):
    self.assertTrue(np.all(quant_params.zero_point == 0))
    self.assertEqual(quant_params.quantized_dimension, 0)
|
183
|
+
|
184
|
+
|
185
|
+
if __name__ == "__main__":
|
186
|
+
googletest.main()
|
@@ -16,6 +16,7 @@
|
|
16
16
|
"""Uniform quantize in tensor level."""
|
17
17
|
|
18
18
|
import dataclasses
|
19
|
+
from typing import Optional
|
19
20
|
import numpy as np
|
20
21
|
from ai_edge_quantizer import qtyping
|
21
22
|
|
@@ -237,7 +238,11 @@ def symmetric_quantize_bias_tensor(
|
|
237
238
|
|
238
239
|
|
239
240
|
def tensor_zp_scale_from_min_max(
|
240
|
-
min_value,
|
241
|
+
min_value,
|
242
|
+
max_value,
|
243
|
+
num_bits: int,
|
244
|
+
symmetric: bool,
|
245
|
+
clipping_values: Optional[np.ndarray] = None,
|
241
246
|
):
|
242
247
|
"""Get zero point and scale from min and max value.
|
243
248
|
|
@@ -246,6 +251,10 @@ def tensor_zp_scale_from_min_max(
|
|
246
251
|
max_value: The maximum value of the tensor (channel-wise supported).
|
247
252
|
num_bits: The number of bits of the tensor.
|
248
253
|
symmetric: Whether the tensor is symmetric.
|
254
|
+
clipping_values: Absolute clipping values to apply to the tensor. This will
|
255
|
+
clip the tensors to the range [-clipping_values, clipping_values]. This
|
256
|
+
should be the same shape as min_value and max_value. If None, no clipping
|
257
|
+
will be applied.
|
249
258
|
|
250
259
|
Returns:
|
251
260
|
The zero point and scale of the tensor.
|
@@ -261,6 +270,8 @@ def tensor_zp_scale_from_min_max(
|
|
261
270
|
if symmetric:
|
262
271
|
bound = np.maximum(np.abs(min_value), np.abs(max_value))
|
263
272
|
bound = np.maximum(bound, min_bound)
|
273
|
+
if clipping_values is not None:
|
274
|
+
bound = np.clip(bound, -clipping_values, clipping_values)
|
264
275
|
if not qtype.signed:
|
265
276
|
half_q = (qmax - 1) / 2
|
266
277
|
scale = bound / half_q
|
@@ -268,7 +279,6 @@ def tensor_zp_scale_from_min_max(
|
|
268
279
|
else:
|
269
280
|
scale = bound / qmax
|
270
281
|
zp = np.zeros_like(scale, dtype=np.int32)
|
271
|
-
|
272
282
|
else:
|
273
283
|
# Include 0 to the range to support zero-padding.
|
274
284
|
# See: https://arxiv.org/pdf/1712.05877.pdf
|
@@ -276,6 +286,8 @@ def tensor_zp_scale_from_min_max(
|
|
276
286
|
bound_max = np.maximum(max_value, np.zeros_like(max_value))
|
277
287
|
bound_min = np.minimum(min_value, np.zeros_like(min_value))
|
278
288
|
bound = np.maximum(bound_max - bound_min, min_bound)
|
289
|
+
if clipping_values is not None:
|
290
|
+
bound = np.clip(bound, -clipping_values, clipping_values)
|
279
291
|
scale = bound / (qmax - qmin)
|
280
292
|
zp = qmin - bound_min / scale
|
281
293
|
zp = np.rint(zp)
|
@@ -352,6 +352,33 @@ class TensorUtilsTest(parameterized.TestCase):
|
|
352
352
|
# Range has to be extended to include zero.
|
353
353
|
self.assertEqual(calculated_min, 0)
|
354
354
|
|
355
|
+
@parameterized.parameters(
    # (num_bits, symmetric, quantized bound the expected scale divides by).
    (4, True, 7),
    (8, False, 255),
)
def test_tensor_zp_scale_from_min_max_with_clipping(
    self, num_bits, symmetric, quantized_bound
):
  """Checks that the scale is derived from the clipping value when given."""
  # The clipping value (4.0) is below the observed max (5.0).
  clip = np.array([4.0])
  zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
      min_value=np.array([[1.0]]),
      max_value=np.array([[5.0]]),
      num_bits=num_bits,
      symmetric=symmetric,
      clipping_values=clip,
  )
  expected_scale = clip / quantized_bound

  with self.subTest(name="CheckShapes"):
    self.assertEqual(zp.shape, scale.shape)
    self.assertEqual(zp.shape, (1, 1))

  if symmetric:
    with self.subTest(name="CheckSymmetricZpValue"):
      self.assertEqual(zp[0], 0)

  with self.subTest(name="CheckScaleValue"):
    self.assertEqual(scale[0], expected_scale)
|
381
|
+
|
355
382
|
|
356
383
|
if __name__ == "__main__":
|
357
384
|
googletest.main()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: ai-edge-quantizer-nightly
|
3
|
-
Version: 0.0.1.dev20250317
|
3
|
+
Version: 0.1.0.dev20250319
|
4
4
|
Summary: A quantizer for advanced developers to quantize converted AI Edge models.
|
5
5
|
Home-page: https://github.com/google-ai-edge/ai-edge-quantizer
|
6
6
|
Keywords: On-Device ML,AI,Google,TFLite,Quantization,LLMs,GenAI
|
@@ -27,7 +27,7 @@ License-File: LICENSE
|
|
27
27
|
Requires-Dist: immutabledict
|
28
28
|
Requires-Dist: numpy
|
29
29
|
Requires-Dist: tf-nightly>=2.17.0.dev20240509
|
30
|
-
Requires-Dist: ai-edge-litert
|
30
|
+
Requires-Dist: ai-edge-litert>=1.2.0
|
31
31
|
|
32
32
|
It aims to facilitate advanced users to strive for optimal performance on
|
33
33
|
resource demanding models (e.g., GenAI models).
|
@@ -1,5 +1,5 @@
|
|
1
1
|
ai_edge_quantizer/__init__.py,sha256=4pFSkukSwahYyzwqia0yPRyz8TnFQfGRthVJhYpMWas,793
|
2
|
-
ai_edge_quantizer/algorithm_manager.py,sha256=
|
2
|
+
ai_edge_quantizer/algorithm_manager.py,sha256=sOZ1T8n0YYi_ijDDuzryNJi2HUPggeo9uWNJri3elv0,10431
|
3
3
|
ai_edge_quantizer/algorithm_manager_api.py,sha256=u903TG0s1uIDhJqfeJne3CFl8A93phZrwgV2-hwdcXU,9247
|
4
4
|
ai_edge_quantizer/algorithm_manager_api_test.py,sha256=tL_ozYFTsOPX8qGcti0KTz37nVsCxf0SSG5C45SyT-g,7319
|
5
5
|
ai_edge_quantizer/calibrator.py,sha256=n7AD9j7UScR-CieoI6DQRMeiG_fhLBfSLRiM4460xaM,11895
|
@@ -28,14 +28,16 @@ ai_edge_quantizer/algorithms/nonlinear_quantize/__init__.py,sha256=lpq1g2ayg3lCP
|
|
28
28
|
ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting.py,sha256=Bs9CK7wZAw6jNaZ8xEtbwO2vM34VYXNZSMVWvxJo9nw,9297
|
29
29
|
ai_edge_quantizer/algorithms/nonlinear_quantize/float_casting_test.py,sha256=s64eDDH9bmRWy6Bl1peHnhGewLnFJjvnhYOdjo1zYOA,22625
|
30
30
|
ai_edge_quantizer/algorithms/uniform_quantize/__init__.py,sha256=lpq1g2ayg3lCPLy79t2VicYcnGKw64FfYIj1V7J-4m8,676
|
31
|
-
ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py,sha256=
|
31
|
+
ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py,sha256=LnItMEsR47qe8T5pg9UI5NGfhi4cOxt0vAU35IkWnaY,27163
|
32
32
|
ai_edge_quantizer/algorithms/uniform_quantize/common_quantize_test.py,sha256=qMmKbWqxrCoVKbLKHn9WuCrGKPfHkEyU0Nmhokh8Qeo,2597
|
33
33
|
ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery.py,sha256=OTXjEZ3Ctq3ffYzisX-6HwgK_DuA7uos_aap5PiIUPE,8686
|
34
34
|
ai_edge_quantizer/algorithms/uniform_quantize/dequantized_weight_recovery_test.py,sha256=y7BK11fkF63Ex_Jzg3fbIdy0D_Ca6HuvChVZR7Uwggc,8073
|
35
|
-
ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py,sha256=
|
35
|
+
ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py,sha256=aWHU4rneBv7ErufEWKQGAWTK-pgfn-rG9mAkC0d9V6Q,7871
|
36
36
|
ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py,sha256=Hok09dloSyBfD0oDM5VABdSZjM9JWSQhm_hDHNbFujA,7640
|
37
|
-
ai_edge_quantizer/algorithms/uniform_quantize/
|
38
|
-
ai_edge_quantizer/algorithms/uniform_quantize/
|
37
|
+
ai_edge_quantizer/algorithms/uniform_quantize/octav.py,sha256=e5wYtki-vl739gSVAZHAKcs2hA87GvFUjVoSUPlnkyM,6433
|
38
|
+
ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py,sha256=IcTOaJ1pxtqsitqxOEP9LROVEP_19VFutHalqNied4I,6940
|
39
|
+
ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py,sha256=WmZzKQlzfu9gFr9SbUDoPY3rFqTl363om8-0rTLwotw,11629
|
40
|
+
ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor_test.py,sha256=G2PFpHhF-6OOuAwQ1lei63QEIm7uzIZJ62qpgA02qTM,12288
|
39
41
|
ai_edge_quantizer/algorithms/utils/__init__.py,sha256=lpq1g2ayg3lCPLy79t2VicYcnGKw64FfYIj1V7J-4m8,676
|
40
42
|
ai_edge_quantizer/algorithms/utils/common_utils.py,sha256=4qSlVNx3-91kJufnnJV1RdVRXBPapylZkrAp2nywoao,34581
|
41
43
|
ai_edge_quantizer/algorithms/utils/common_utils_test.py,sha256=zqapGEfYhjQWe9cNGPLmdbwtEUUYQRhlO_kNe0cXX6E,18104
|
@@ -62,8 +64,8 @@ ai_edge_quantizer/utils/tfl_interpreter_utils.py,sha256=x2xA2CFPpe_2trcV8v5xGaBE
|
|
62
64
|
ai_edge_quantizer/utils/tfl_interpreter_utils_test.py,sha256=Op3JxtOqlrjzmYF18jnnstL1k9xiY9kKJ8S2vklKGkc,11327
|
63
65
|
ai_edge_quantizer/utils/validation_utils.py,sha256=oYw33Sg547AqtGw-choPUJmp9SAKkV46J_ddqSsum2Q,3950
|
64
66
|
ai_edge_quantizer/utils/validation_utils_test.py,sha256=V_qNDikPD4OPB-siOLQCWNVWTAu87h2IgNYt7teFd-o,2934
|
65
|
-
ai_edge_quantizer_nightly-0.0.
|
66
|
-
ai_edge_quantizer_nightly-0.0.
|
67
|
-
ai_edge_quantizer_nightly-0.0.
|
68
|
-
ai_edge_quantizer_nightly-0.0.
|
69
|
-
ai_edge_quantizer_nightly-0.0.
|
67
|
+
ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
68
|
+
ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info/METADATA,sha256=WTz-_FHdUgNLhVPcpu4VW9rw2drBw92tUqa35_OsDWg,1527
|
69
|
+
ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
70
|
+
ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info/top_level.txt,sha256=8QTfPnFXNVUhScFLaa-NWZMFWMn72M50DVPubpwWB1g,18
|
71
|
+
ai_edge_quantizer_nightly-0.1.0.dev20250319.dist-info/RECORD,,
|
File without changes
|
File without changes
|