onnxruntime-directml 1.18.1-cp38-cp38-win_amd64.whl → 1.19.0-cp38-cp38-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnxruntime/ThirdPartyNotices.txt +1 -1
- onnxruntime/__init__.py +1 -1
- onnxruntime/capi/DirectML.dll +0 -0
- onnxruntime/capi/onnxruntime.dll +0 -0
- onnxruntime/capi/onnxruntime_inference_collection.py +13 -5
- onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
- onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
- onnxruntime/capi/onnxruntime_validation.py +8 -9
- onnxruntime/quantization/base_quantizer.py +41 -8
- onnxruntime/quantization/calibrate.py +68 -13
- onnxruntime/quantization/execution_providers/qnn/quant_config.py +14 -3
- onnxruntime/quantization/matmul_4bits_quantizer.py +212 -87
- onnxruntime/quantization/operators/activation.py +4 -2
- onnxruntime/quantization/operators/direct_q8.py +1 -1
- onnxruntime/quantization/operators/norm.py +1 -1
- onnxruntime/quantization/qdq_quantizer.py +21 -8
- onnxruntime/quantization/quant_utils.py +64 -2
- onnxruntime/quantization/quantize.py +28 -1
- onnxruntime/quantization/registry.py +1 -0
- onnxruntime/tools/check_onnx_model_mobile_usability.py +6 -26
- onnxruntime/tools/mobile_helpers/check_model_can_use_ort_mobile_pkg.py +2 -1
- onnxruntime/tools/mobile_helpers/coreml_supported_mlprogram_ops.md +31 -0
- onnxruntime/tools/mobile_helpers/{coreml_supported_ops.md → coreml_supported_neuralnetwork_ops.md} +1 -1
- onnxruntime/tools/mobile_helpers/usability_checker.py +292 -144
- onnxruntime/tools/pytorch_export_contrib_ops.py +1 -1
- onnxruntime/tools/reduced_build_config_parser.py +1 -1
- onnxruntime/tools/symbolic_shape_infer.py +44 -24
- onnxruntime/transformers/benchmark.py +2 -2
- onnxruntime/transformers/bert_test_data.py +3 -3
- onnxruntime/transformers/fusion_attention.py +4 -1
- onnxruntime/transformers/fusion_attention_clip.py +60 -21
- onnxruntime/transformers/fusion_layernorm.py +23 -10
- onnxruntime/transformers/fusion_quickgelu.py +74 -0
- onnxruntime/transformers/fusion_simplified_layernorm.py +18 -0
- onnxruntime/transformers/fusion_utils.py +1 -1
- onnxruntime/transformers/large_model_exporter.py +1 -1
- onnxruntime/transformers/models/gpt2/benchmark_gpt2.py +1 -1
- onnxruntime/transformers/models/gpt2/convert_to_onnx.py +1 -1
- onnxruntime/transformers/models/gpt2/gpt2_tester.py +1 -1
- onnxruntime/transformers/models/llama/benchmark.py +8 -2
- onnxruntime/transformers/models/llama/benchmark_e2e.py +40 -15
- onnxruntime/transformers/models/llama/convert_to_onnx.py +27 -21
- onnxruntime/transformers/models/llama/llama_inputs.py +16 -26
- onnxruntime/transformers/models/llama/llama_parity.py +2 -2
- onnxruntime/transformers/models/llama/llama_torch.py +6 -3
- onnxruntime/transformers/models/stable_diffusion/engine_builder.py +1 -1
- onnxruntime/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +2 -2
- onnxruntime/transformers/onnx_model_bert.py +3 -0
- onnxruntime/transformers/onnx_model_clip.py +1 -0
- onnxruntime/transformers/onnx_model_phi.py +1 -1
- onnxruntime/transformers/onnx_model_t5.py +1 -1
- onnxruntime/transformers/onnx_model_tnlr.py +1 -1
- onnxruntime/transformers/optimizer.py +1 -1
- onnxruntime/transformers/shape_optimizer.py +1 -1
- {onnxruntime_directml-1.18.1.dist-info → onnxruntime_directml-1.19.0.dist-info}/METADATA +4 -4
- {onnxruntime_directml-1.18.1.dist-info → onnxruntime_directml-1.19.0.dist-info}/RECORD +59 -56
- {onnxruntime_directml-1.18.1.dist-info → onnxruntime_directml-1.19.0.dist-info}/WHEEL +1 -1
- {onnxruntime_directml-1.18.1.dist-info → onnxruntime_directml-1.19.0.dist-info}/entry_points.txt +0 -0
- {onnxruntime_directml-1.18.1.dist-info → onnxruntime_directml-1.19.0.dist-info}/top_level.txt +0 -0
|
@@ -4820,7 +4820,7 @@ SOFTWARE.
|
|
|
4820
4820
|
|
|
4821
4821
|
----------------------------------------------------------------------------
|
|
4822
4822
|
|
|
4823
|
-
This is the MIT/Expat
|
|
4823
|
+
This is the MIT/Expat License. For more information see:
|
|
4824
4824
|
|
|
4825
4825
|
1. http://www.opensource.org/licenses/mit-license.php
|
|
4826
4826
|
|
onnxruntime/__init__.py CHANGED
@@ -7,7 +7,7 @@ ONNX Runtime is a performance-focused scoring engine for Open Neural Network Exchange (ONNX) models.
 For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
 or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
 """
-__version__ = "1.18.1"
+__version__ = "1.19.0"
 __author__ = "Microsoft"
 
 # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
onnxruntime/capi/DirectML.dll CHANGED
Binary file
onnxruntime/capi/onnxruntime.dll CHANGED
Binary file
onnxruntime/capi/onnxruntime_inference_collection.py CHANGED
@@ -438,10 +438,18 @@ class InferenceSession(Session):
 
         # Tensorrt can fall back to CUDA if it's explicitly assigned. All others fall back to CPU.
         if "TensorrtExecutionProvider" in available_providers:
-            if providers and any(
-                provider == "CUDAExecutionProvider"
-                or (isinstance(provider, tuple) and provider[0] == "CUDAExecutionProvider")
-                for provider in providers
+            if (
+                providers
+                and any(
+                    provider == "CUDAExecutionProvider"
+                    or (isinstance(provider, tuple) and provider[0] == "CUDAExecutionProvider")
+                    for provider in providers
+                )
+                and any(
+                    provider == "TensorrtExecutionProvider"
+                    or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
+                    for provider in providers
+                )
             ):
                 self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
             else:
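Note: the rewritten fallback check handles both ways a provider can be specified. InferenceSession accepts providers either as bare names or as (name, options) tuples, so the condition must recognize "CUDAExecutionProvider" in either form. A hedged sketch (model path and option values are placeholders, not from this diff):

    import onnxruntime as ort

    # Both lists request TensorRT with CUDA fallback; the options shown are illustrative.
    providers_as_names = ["TensorrtExecutionProvider", "CUDAExecutionProvider"]
    providers_as_tuples = [
        ("TensorrtExecutionProvider", {"trt_fp16_enable": True}),
        ("CUDAExecutionProvider", {"device_id": 0}),
    ]

    # With either form, the 1.19.0 check above selects the fallback list
    # ["CUDAExecutionProvider", "CPUExecutionProvider"].
    sess = ort.InferenceSession("model.onnx", providers=providers_as_tuples)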
@@ -646,7 +654,7 @@ class IOBinding:
         return self._iobinding.get_outputs()
 
     def copy_outputs_to_cpu(self):
-        """Copy output contents to CPU
+        """Copy output contents to CPU."""
         return self._iobinding.copy_outputs_to_cpu()
 
     def clear_binding_inputs(self):
onnxruntime/capi/onnxruntime_providers_shared.dll CHANGED
Binary file
onnxruntime/capi/onnxruntime_pybind11_state.pyd CHANGED
Binary file
onnxruntime/capi/onnxruntime_validation.py CHANGED
@@ -24,8 +24,7 @@ def check_distro_info():
 
         if __my_distro_ver__ not in ["10", "11"]:
             warnings.warn(
-                "Unsupported Windows version (%s). ONNX Runtime supports Windows 10 and above, only."
-                % __my_distro_ver__
+                f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only."
             )
     elif __my_system__ == "linux":
         """Although the 'platform' python module for getting Distro information works well on standard OS images
@@ -54,11 +53,11 @@ def check_distro_info():
 
         if int(__my_distro_ver__.split(".")[0]) < 11:
             warnings.warn(
-                "Unsupported macOS version (%s). ONNX Runtime supports macOS 11.0 or later." % __my_distro_ver__
+                f"Unsupported macOS version ({__my_distro_ver__}). ONNX Runtime supports macOS 11.0 or later."
             )
     else:
         warnings.warn(
-            "Unsupported platform (%s). ONNX Runtime supports Linux, macOS and Windows platforms, only." % __my_system__
+            f"Unsupported platform ({__my_system__}). ONNX Runtime supports Linux, macOS and Windows platforms, only."
         )
 
 
@@ -115,10 +114,10 @@ def validate_build_package_info():
         cudart_version = None
 
         def print_build_package_info():
-            warnings.warn("onnxruntime training package info: package_name: %s" % package_name)
-            warnings.warn("onnxruntime training package info: __version__: %s" % version)
-            warnings.warn("onnxruntime training package info: cuda_version: %s" % cuda_version)
-            warnings.warn("onnxruntime build info: cudart_version: %s" % cudart_version)
+            warnings.warn(f"onnxruntime training package info: package_name: {package_name}")
+            warnings.warn(f"onnxruntime training package info: __version__: {version}")
+            warnings.warn(f"onnxruntime training package info: cuda_version: {cuda_version}")
+            warnings.warn(f"onnxruntime build info: cudart_version: {cudart_version}")
 
         # collection cuda library info from current environment.
         from onnxruntime.capi.onnxruntime_collect_build_info import find_cudart_versions
@@ -127,7 +126,7 @@ def validate_build_package_info():
         if cudart_version and local_cudart_versions and cudart_version not in local_cudart_versions:
             print_build_package_info()
             warnings.warn("WARNING: failed to find cudart version that matches onnxruntime build info")
-            warnings.warn("WARNING: found cudart versions: %s" % local_cudart_versions)
+            warnings.warn(f"WARNING: found cudart versions: {local_cudart_versions}")
         else:
             # TODO: rcom
             pass
onnxruntime/quantization/base_quantizer.py CHANGED
@@ -25,6 +25,7 @@ from .quant_utils import (
     find_by_name,
     model_has_infer_metadata,
     normalize_axis,
+    pack_bytes_to_4bit,
     quantize_data,
     quantize_nparray,
     save_and_reload_model_with_shape_infer,
@@ -339,6 +340,18 @@ class BaseQuantizer:
                 f"{q_weight_data.tobytes()[:10]}, got {check.tobytes()[:10]} and shape={weight.shape}"
                 f"\nraw={str(q_weight_initializer)[:200]}."
             )
+        elif qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
+            if q_weight_data.dtype not in (np.int8, np.uint8):
+                raise RuntimeError(
+                    f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
+                )
+
+            # We do not use onnx.helper.pack_float32_to_4bit() due to performance.
+            # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
+            packed_data = bytes(pack_bytes_to_4bit(q_weight_data.tobytes()))
+
+            # We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161
+            q_weight_initializer = onnx.helper.make_tensor(q_weight_name, qType, weight.dims, packed_data, raw=True)
         else:
             q_weight_data = np.asarray(q_weight_data, dtype=onnx.helper.tensor_dtype_to_np_dtype(qType)).reshape(
                 weight.dims
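Note: pack_bytes_to_4bit itself is not shown in this diff. As a rough sketch of what the packing step does (assumptions: two values per byte, first value in the low nibble, every input value already restricted to 4-bit range):

    import numpy as np

    def pack_to_4bit_sketch(data: bytes) -> bytearray:
        """Pack each source byte's low nibble into half as many bytes, low nibble first.

        Minimal sketch of 4-bit packing, not the quant_utils implementation; signed
        values are assumed to be stored in two's complement within the nibble.
        """
        out = bytearray((len(data) + 1) // 2)
        for i, b in enumerate(data):
            nibble = b & 0x0F
            out[i // 2] |= nibble << (4 * (i % 2))  # even index -> low nibble, odd -> high
        return out

    # 4-bit unsigned values 1, 2, 3, 4 pack into two bytes: 0x21, 0x43.
    packed = pack_to_4bit_sketch(np.array([1, 2, 3, 4], dtype=np.uint8).tobytes())
    assert packed == bytearray([0x21, 0x43])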
@@ -396,7 +409,10 @@
 
         symmetric = quant_overrides_for_channels[0].get(
             "symmetric",
-            (
+            (
+                self.is_weight_symmetric
+                or weight_qType in (onnx.TensorProto.INT8, onnx.TensorProto.FLOAT8E4M3FN, onnx.TensorProto.INT4)
+            ),
         )
         reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range)
         zero_point_list = []
@@ -447,7 +463,8 @@
             quantized_per_channel_data_list.append(quantized_per_channel_data)
 
         # combine per_channel_data into one
-        reshape_dims = list(weights.shape)  # deep copy
+        weights_shape = list(weights.shape)
+        reshape_dims = list(weights_shape)  # deep copy
         reshape_dims[channel_axis] = 1  # only one per channel for reshape
         quantized_weights = np.asarray(quantized_per_channel_data_list[0]).reshape(reshape_dims)
         for i in range(1, len(quantized_per_channel_data_list)):
@@ -470,12 +487,28 @@
         self.model.initializer_extend([scale_initializer, zero_initializer])
 
         if not keep_float_weight:
-            quantized_weights = np.asarray(
-                quantized_weights,
-                dtype=onnx.helper.tensor_dtype_to_np_dtype(weight_qType),
-            ).reshape(initializer.dims)
-            q_weight_initializer = onnx.numpy_helper.from_array(quantized_weights, q_weight_name)
-            self.model.initializer_extend([q_weight_initializer])
+            if weight_qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
+                if quantized_weights.dtype not in (np.int8, np.uint8):
+                    raise RuntimeError(
+                        f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
+                    )
+
+                # We do not use onnx.helper.pack_float32_to_4bit() due to performance.
+                # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
+                packed_data = bytes(pack_bytes_to_4bit(quantized_weights.tobytes()))
+
+                # We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161
+                q_weight_initializer = onnx.helper.make_tensor(
+                    q_weight_name, weight_qType, weights_shape, packed_data, raw=True
+                )
+                self.model.initializer_extend([q_weight_initializer])
+            else:
+                quantized_weights = np.asarray(
+                    quantized_weights,
+                    dtype=onnx.helper.tensor_dtype_to_np_dtype(weight_qType),
+                ).reshape(initializer.dims)
+                q_weight_initializer = onnx.numpy_helper.from_array(quantized_weights, q_weight_name)
+                self.model.initializer_extend([q_weight_initializer])
 
         return q_weight_name, zp_name, scale_name
 
onnxruntime/quantization/calibrate.py CHANGED
@@ -128,6 +128,9 @@ class TensorsData:
     def values(self):
         return self.data.values()
 
+    def items(self):
+        return self.data.items()
+
 
 class CalibrationMethod(Enum):
     MinMax = 0
@@ -155,6 +158,12 @@ class CalibrationDataReader(metaclass=abc.ABCMeta):
             raise StopIteration
         return result
 
+    def __len__(self):
+        raise NotImplementedError
+
+    def set_range(self, start_index: int, end_index: int):
+        raise NotImplementedError
+
 
 class CalibraterBase:
     def __init__(
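Note: the base class only defines these hooks as NotImplementedError stubs. A minimal sketch of a reader that supports them (the class and field names below are illustrative, not part of the diff): __len__ reports how many samples exist, and set_range restricts iteration to a window, which is what a strided calibration loop needs.

    import numpy as np
    from onnxruntime.quantization.calibrate import CalibrationDataReader

    class ListDataReader(CalibrationDataReader):
        """Illustrative reader over a list of pre-built input dicts."""

        def __init__(self, inputs_list):
            self._inputs = inputs_list          # e.g. [{"input": np.ndarray}, ...]
            self._start, self._end = 0, len(inputs_list)
            self._pos = 0

        def get_next(self):
            if self._pos >= self._end:
                return None                     # None signals end of data to the calibrator
            item = self._inputs[self._pos]
            self._pos += 1
            return item

        def __len__(self):
            return len(self._inputs)

        def set_range(self, start_index: int, end_index: int):
            self._start, self._end = start_index, end_index
            self._pos = start_index

    reader = ListDataReader([{"input": np.zeros((1, 3), dtype=np.float32)} for _ in range(8)])
    reader.set_range(0, 4)                      # calibrate on the first 4 samples only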
@@ -164,13 +173,15 @@ class CalibraterBase:
         augmented_model_path="augmented_model.onnx",
         symmetric=False,
         use_external_data_format=False,
+        per_channel=False,
     ):
         """
         :param model_path: ONNX model to calibrate. It should be a model file path
         :param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
         :param augmented_model_path: save augmented model to this path.
         :param symmetric: make range of tensor symmetric (central point is 0).
-        :param use_external_data_format: use external data format to store model which size is >= 2Gb
+        :param use_external_data_format: use external data format to store model which size is >= 2Gb.
+        :param per_channel: whether to compute ranges per each channel.
         """
         if isinstance(model_path, str):
             self.model = load_model_with_shape_infer(Path(model_path))
@@ -183,6 +194,7 @@ class CalibraterBase:
         self.augmented_model_path = augmented_model_path
         self.symmetric = symmetric
         self.use_external_data_format = use_external_data_format
+        self.per_channel = per_channel
 
         self.augment_model = None
         self.infer_session = None
@@ -274,6 +286,7 @@ class MinMaxCalibrater(CalibraterBase):
         moving_average=False,
         averaging_constant=0.01,
         max_intermediate_outputs=None,
+        per_channel=False,
     ):
         """
         :param model_path: ONNX model to calibrate. It is a model path
@@ -284,6 +297,7 @@
         :param moving_average: compute the moving average of the minimum and maximum values instead of the global minimum and maximum.
         :param averaging_constant: constant smoothing factor to use when computing the moving average.
         :param max_intermediate_outputs: maximum number of intermediate outputs before an intermediate range is computed.
+        :param per_channel: whether to compute ranges per each channel.
         """
         super().__init__(
             model_path,
@@ -291,6 +305,7 @@
             augmented_model_path=augmented_model_path,
             symmetric=symmetric,
             use_external_data_format=use_external_data_format,
+            per_channel=per_channel,
         )
         self.intermediate_outputs = []
         self.calibrate_tensors_range = None
@@ -310,9 +325,15 @@
         """
         tensors, _ = self.select_tensors_to_calibrate(self.model)
         reshape_shape_name = str(uuid.uuid4())
-        reshape_shape = numpy_helper.from_array(np.array([1], dtype=np.int64), reshape_shape_name)
+        reshape_shape = numpy_helper.from_array(np.array([-1], dtype=np.int64), reshape_shape_name)
         self.model.graph.initializer.append(reshape_shape)
 
+        def get_op_version(op_type, model):
+            for opset_import in model.opset_import:
+                if onnx.defs.has(op_type, opset_import.domain):
+                    return opset_import.version
+            raise RuntimeError(f"Model does not contain a version for '{op_type}'.")
+
         def add_reduce_min_max(tensor_name, reduce_op_name):
             # When doing ReduceMax/ReduceMin, ORT can't reduce on dim with value of 0 if 'keepdims' is false.
             # To make the code simple, we always let keepdims to be 1.
@@ -332,7 +353,6 @@
                 name=intermediate_output,
             )
 
-            self.model.graph.node.extend([reduce_node, reshape_node])
             value_infos = {vi.name: vi for vi in self.model.graph.value_info}
             value_infos.update({o.name: o for o in self.model.graph.output})
             value_infos.update({i.name: i for i in self.model.graph.input})
@@ -343,7 +363,22 @@
                     f"Unable to guess tensor type for tensor {tensor_name!r}, "
                     f"running shape inference before quantization may resolve this issue."
                 )
-
+
+            # Include axes in reduce_op when per_channel, always keeping axis=1
+            if self.per_channel:
+                tensor_rank = len(value_infos[tensor_name].type.tensor_type.shape.dim)
+                reduced_axes = [0, *range(2, tensor_rank)]
+                # Depending on opset version, axes in ReduceMin/ReduceMax are in attribute or inputs
+                if get_op_version(reduce_op_name, self.model) < 18:
+                    reduce_node.attribute.append(helper.make_attribute("axes", reduced_axes))
+                else:
+                    reduce_axes_name = str(uuid.uuid4())
+                    reduce_axes = numpy_helper.from_array(np.array(reduced_axes, dtype=np.int64), reduce_axes_name)
+                    reduce_node.input.append(reduce_axes_name)
+                    self.model.graph.initializer.append(reduce_axes)
+
+            self.model.graph.node.extend([reduce_node, reshape_node])
+
             self.model.graph.output.append(helper.make_tensor_value_info(reduce_output, onnx_type, [None]))
 
         for tensor in tensors:
             add_reduce_min_max(tensor, "ReduceMin")
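Note: for a rank-4 activation, the per-channel path reduces over axes [0, 2, 3] and keeps axis 1, producing one min/max per channel. The same computation in plain numpy, as a sketch:

    import numpy as np

    x = np.random.randn(2, 8, 4, 4).astype(np.float32)  # NCHW-style activation

    rank = x.ndim
    reduced_axes = tuple([0, *range(2, rank)])  # reduce everything except axis 1

    per_channel_min = np.min(x, axis=reduced_axes)  # shape (8,): one min per channel
    per_channel_max = np.max(x, axis=reduced_axes)  # shape (8,): one max per channel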
@@ -383,13 +418,31 @@
             return new_range
 
         for key, value in old_range.items():
+            # Handling for structured data types with TensorData
+            if isinstance(value, TensorData):
+                old_min = value.range_value[0]
+                old_max = value.range_value[1]
+            else:
+                old_min, old_max = value
+
+            if isinstance(new_range[key], TensorData):
+                new_min = new_range[key].range_value[0]
+                new_max = new_range[key].range_value[1]
+            else:
+                new_min, new_max = new_range[key]
+
             if self.moving_average:
-                min_value = value[0] + self.averaging_constant * (new_range[key][0] - value[0])
-                max_value = value[1] + self.averaging_constant * (new_range[key][1] - value[1])
+                min_value = old_min + self.averaging_constant * (new_min - old_min)
+                max_value = old_max + self.averaging_constant * (new_max - old_max)
+            else:
+                min_value = min(old_min, new_min)
+                max_value = max(old_max, new_max)
+
+            # If structured as TensorData, wrap the result accordingly
+            if isinstance(value, TensorData) or isinstance(new_range[key], TensorData):
+                new_range[key] = TensorData(lowest=min_value, highest=max_value)
             else:
-                min_value = min(value[0], new_range[key][0])
-                max_value = max(value[1], new_range[key][1])
-            new_range[key] = (min_value, max_value)
+                new_range[key] = (min_value, max_value)
 
         return new_range
 
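Note: a quick numeric check of the merge logic (values made up). With averaging_constant = 0.01 the moving average nudges the old range toward the new batch, while the plain merge takes the widest envelope:

    averaging_constant = 0.01
    old_min, old_max = -1.0, 1.0
    new_min, new_max = -3.0, 2.0

    # Moving average: shift only slightly toward the new range.
    ma_min = old_min + averaging_constant * (new_min - old_min)   # -1.02
    ma_max = old_max + averaging_constant * (new_max - old_max)   #  1.01

    # Plain min/max merge: take the widest envelope.
    env_min = min(old_min, new_min)                               # -3.0
    env_max = max(old_max, new_max)                               #  2.0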
@@ -430,7 +483,7 @@
                 max_value_array = np.max(merged_added_output_dict[added_output_names[i + 1]], axis=0)
 
                 if self.symmetric:
-                    max_absolute_value = max(np.abs(min_value_array), np.abs(max_value_array))
+                    max_absolute_value = np.max([np.abs(min_value_array), np.abs(max_value_array)], axis=0)
                     pairs.append(tuple([-max_absolute_value, max_absolute_value]))
                 else:
                     pairs.append(tuple([min_value_array, max_value_array]))
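Note: this fix matters once min/max are per-channel arrays. Python's builtin max() compares the arrays as wholes (raising a ValueError for more than one element), while np.max over the stacked pair reduces elementwise. Illustrative values:

    import numpy as np

    min_value_array = np.array([-2.0, -0.5], dtype=np.float32)
    max_value_array = np.array([1.0, 3.0], dtype=np.float32)

    # Elementwise absolute maximum per channel -> [2.0, 3.0]
    max_absolute_value = np.max([np.abs(min_value_array), np.abs(max_value_array)], axis=0)

    # builtin max(np.abs(...), np.abs(...)) would raise
    # "The truth value of an array with more than one element is ambiguous".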
@@ -759,7 +812,7 @@ class HistogramCollector(CalibrationDataCollector):
             hist_edges = hist_edges.astype(data_arr_np.dtype)
             assert (
                 data_arr_np.dtype != np.float64
-            ), "only float32 or float16 is supported, every constant must be
+            ), "only float32 or float16 is supported, every constant must be explicitly typed"
             self.histogram_dict[tensor] = (hist, hist_edges, min_value, max_value)
         else:
             old_histogram = self.histogram_dict[tensor]
@@ -781,7 +834,7 @@
             hist[: len(old_hist)] += old_hist
             assert (
                 data_arr_np.dtype != np.float64
-            ), "only float32 or float16 is supported, every constant must be
+            ), "only float32 or float16 is supported, every constant must be explicitly typed"
             self.histogram_dict[tensor] = (hist, hist_edges, min(old_min, min_value), max(old_max, max_value))
 
     def collect_value(self, name_to_arr):
@@ -1023,7 +1076,7 @@
 
         for i in range(num_half_quantized_bin, zero_bin_index + 1, 1):
             start_index = zero_bin_index - i
-            end_index =
+            end_index = min(zero_bin_index + i + 1, num_bins)
 
             thresholds[i - num_half_quantized_bin] = (hist_edges[start_index], hist_edges[end_index])
 
@@ -1097,6 +1150,7 @@ def create_calibrator(
         moving_average = extra_options.get("moving_average", False)
         averaging_constant = extra_options.get("averaging_constant", 0.01)
         max_intermediate_outputs = extra_options.get("max_intermediate_outputs", None)
+        per_channel = extra_options.get("per_channel", False)
         calibrator = MinMaxCalibrater(
             model,
             op_types_to_calibrate,
@@ -1106,6 +1160,7 @@ def create_calibrator(
             moving_average=moving_average,
             averaging_constant=averaging_constant,
             max_intermediate_outputs=max_intermediate_outputs,
+            per_channel=per_channel,
         )
     elif calibrate_method == CalibrationMethod.Entropy:
         # default settings for entropy algorithm
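Note: a hedged usage sketch of the new per_channel option threading through create_calibrator (model path is a placeholder):

    from onnxruntime.quantization.calibrate import CalibrationMethod, create_calibrator

    calibrator = create_calibrator(
        "model.onnx",                      # placeholder model path
        op_types_to_calibrate=None,        # None: calibrate all float32/float16 tensors
        calibrate_method=CalibrationMethod.MinMax,
        extra_options={"per_channel": True, "symmetric": True},
    )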
onnxruntime/quantization/execution_providers/qnn/quant_config.py CHANGED
@@ -21,6 +21,7 @@ from .mixed_precision_overrides_utils import MixedPrecisionTensorQuantOverridesFixer
 
 Q16_TYPES = {QuantType.QInt16, QuantType.QUInt16}
 Q8_TYPES = {QuantType.QInt8, QuantType.QUInt8}
+Q4_TYPES = {QuantType.QInt4, QuantType.QUInt4}
 OP_TYPES_TO_EXCLUDE = {"Cast"}
 MODEL_SIZE_THRESHOLD = 2147483648  # Quant model should use external data if >= 2GB
 
@@ -50,6 +51,8 @@ def get_qnn_qdq_config(
     add_qtype_converts: bool = True,
     activation_symmetric: bool = False,
     weight_symmetric: bool | None = None,
+    keep_removable_activations: bool = False,
+    stride: int | None = None,
 ) -> StaticQuantConfig:
     """
     Returns a static quantization configuration suitable for running QDQ models on QNN EP.
@@ -109,6 +112,11 @@
             the zero-point values are 128 and 32,768, respectively.
         weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
             Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
+        keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
+            be removed, and will be explicitly represented in the QDQ model. If false, these activations
+            are automatically removed if activations are asymmetrically quantized. Keeping these activations
+            is necessary if optimizations or EP transformations will later remove
+            QuantizeLinear/DequantizeLinear operators from the model.
 
     Returns:
         A StaticQuantConfig object
@@ -160,17 +168,20 @@
     extra_options = {
         "MinimumRealRange": 0.0001,
         "DedicatedQDQPair": False,  # Let ORT optimizer duplicate DQ nodes
+        "QDQKeepRemovableActivations": keep_removable_activations,
         "TensorQuantOverrides": overrides_helper.get_dict(),
         "ActivationSymmetric": activation_symmetric,
         "WeightSymmetric": weight_symmetric,
+        "CalibStridedMinMax": stride,
     }
 
     # ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
-    # on Q/DQ operators if using 16-bit quantization.
+    # on Q/DQ operators if using 16-bit or 4-bit quantization.
     onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
     if onnx_opset.version < 21:
-
-
+        opset21_types = Q16_TYPES.union(Q4_TYPES)
+        overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
+        if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
             extra_options["UseQDQContribOps"] = True
 
     return StaticQuantConfig(