onnxruntime_directml-1.17.3-cp39-cp39-win_amd64.whl → onnxruntime_directml-1.18.0-cp39-cp39-win_amd64.whl

Files changed (141)
  1. onnxruntime/ThirdPartyNotices.txt +1 -1
  2. onnxruntime/__init__.py +1 -1
  3. onnxruntime/capi/DirectML.dll +0 -0
  4. onnxruntime/capi/onnxruntime_inference_collection.py +2 -2
  5. onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
  6. onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
  7. onnxruntime/quantization/base_quantizer.py +503 -0
  8. onnxruntime/quantization/calibrate.py +18 -27
  9. onnxruntime/quantization/execution_providers/qnn/fusion_lpnorm.py +6 -1
  10. onnxruntime/quantization/execution_providers/qnn/mixed_precision_overrides_utils.py +413 -0
  11. onnxruntime/quantization/execution_providers/qnn/preprocess.py +259 -3
  12. onnxruntime/quantization/execution_providers/qnn/quant_config.py +333 -50
  13. onnxruntime/quantization/fusions/fusion.py +16 -3
  14. onnxruntime/quantization/fusions/fusion_gelu.py +14 -11
  15. onnxruntime/quantization/fusions/fusion_layernorm.py +1 -0
  16. onnxruntime/quantization/matmul_4bits_quantizer.py +353 -43
  17. onnxruntime/quantization/matmul_bnb4_quantizer.py +2 -2
  18. onnxruntime/quantization/onnx_model.py +28 -5
  19. onnxruntime/quantization/onnx_quantizer.py +64 -498
  20. onnxruntime/quantization/operators/concat.py +2 -2
  21. onnxruntime/quantization/operators/conv.py +6 -3
  22. onnxruntime/quantization/operators/direct_q8.py +2 -2
  23. onnxruntime/quantization/operators/gather.py +2 -2
  24. onnxruntime/quantization/operators/gemm.py +9 -6
  25. onnxruntime/quantization/operators/lstm.py +2 -0
  26. onnxruntime/quantization/operators/matmul.py +4 -3
  27. onnxruntime/quantization/operators/norm.py +7 -4
  28. onnxruntime/quantization/operators/pad.py +1 -0
  29. onnxruntime/quantization/operators/softmax.py +1 -37
  30. onnxruntime/quantization/operators/split.py +1 -1
  31. onnxruntime/quantization/qdq_quantizer.py +736 -91
  32. onnxruntime/quantization/quant_utils.py +32 -3
  33. onnxruntime/quantization/quantize.py +34 -15
  34. onnxruntime/quantization/registry.py +1 -2
  35. onnxruntime/quantization/shape_inference.py +48 -15
  36. onnxruntime/quantization/tensor_quant_overrides.py +516 -0
  37. onnxruntime/tools/convert_onnx_models_to_ort.py +5 -9
  38. onnxruntime/tools/mobile_helpers/check_model_can_use_ort_mobile_pkg.py +2 -2
  39. onnxruntime/tools/ort_format_model/operator_type_usage_processors.py +2 -10
  40. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ArgType.py +0 -1
  41. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ArgTypeAndIndex.py +28 -5
  42. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Attribute.py +124 -21
  43. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/AttributeType.py +0 -1
  44. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Checkpoint.py +46 -8
  45. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedKernelCreateInfos.py +40 -7
  46. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedNodeIndexAndKernelDefHash.py +28 -5
  47. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedSessionState.py +34 -6
  48. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedSubGraphSessionState.py +28 -5
  49. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Dimension.py +28 -5
  50. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DimensionValue.py +34 -6
  51. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DimensionValueType.py +0 -1
  52. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/EdgeEnd.py +4 -0
  53. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/FloatProperty.py +28 -5
  54. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Graph.py +112 -19
  55. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/InferenceSession.py +34 -6
  56. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/IntProperty.py +28 -5
  57. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/KernelTypeStrArgsEntry.py +34 -6
  58. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/KernelTypeStrResolver.py +28 -5
  59. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/MapType.py +28 -5
  60. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Model.py +88 -15
  61. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ModuleState.py +66 -7
  62. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Node.py +124 -21
  63. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodeEdge.py +46 -8
  64. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodeType.py +0 -1
  65. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodesToOptimizeIndices.py +64 -11
  66. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OpIdKernelTypeStrArgsEntry.py +34 -6
  67. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OperatorSetId.py +28 -5
  68. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OptimizerGroup.py +46 -8
  69. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ParameterOptimizerState.py +34 -6
  70. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/PropertyBag.py +52 -9
  71. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizationRecord.py +40 -7
  72. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizationRecordContainerEntry.py +34 -6
  73. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizations.py +28 -5
  74. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/SequenceType.py +22 -4
  75. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Shape.py +28 -5
  76. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/SparseTensor.py +40 -7
  77. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/StringProperty.py +28 -5
  78. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/StringStringEntry.py +28 -5
  79. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Tensor.py +83 -12
  80. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TensorDataType.py +0 -1
  81. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TensorTypeAndShape.py +28 -5
  82. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TypeInfo.py +34 -6
  83. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TypeInfoValue.py +0 -1
  84. onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ValueInfo.py +34 -6
  85. onnxruntime/tools/ort_format_model/ort_model_processor.py +4 -4
  86. onnxruntime/tools/symbolic_shape_infer.py +109 -56
  87. onnxruntime/transformers/benchmark.py +17 -8
  88. onnxruntime/transformers/benchmark_helper.py +8 -8
  89. onnxruntime/transformers/bert_perf_test.py +3 -3
  90. onnxruntime/transformers/bert_test_data.py +1 -3
  91. onnxruntime/transformers/compare_bert_results.py +3 -13
  92. onnxruntime/transformers/convert_generation.py +9 -9
  93. onnxruntime/transformers/dynamo_onnx_helper.py +104 -0
  94. onnxruntime/transformers/float16.py +6 -6
  95. onnxruntime/transformers/fusion_attention_unet.py +2 -6
  96. onnxruntime/transformers/fusion_embedlayer.py +2 -7
  97. onnxruntime/transformers/fusion_options.py +23 -0
  98. onnxruntime/transformers/fusion_qordered_gelu.py +5 -3
  99. onnxruntime/transformers/fusion_qordered_layernorm.py +5 -3
  100. onnxruntime/transformers/fusion_rotary_attention.py +210 -10
  101. onnxruntime/transformers/fusion_skip_group_norm.py +1 -1
  102. onnxruntime/transformers/io_binding_helper.py +157 -35
  103. onnxruntime/transformers/large_model_exporter.py +0 -2
  104. onnxruntime/transformers/models/bert/eval_squad.py +1 -1
  105. onnxruntime/transformers/models/gpt2/benchmark_gpt2.py +1 -1
  106. onnxruntime/transformers/models/gpt2/gpt2_helper.py +3 -3
  107. onnxruntime/transformers/models/gpt2/gpt2_parity.py +2 -4
  108. onnxruntime/transformers/models/gpt2/gpt2_tester.py +2 -2
  109. onnxruntime/transformers/models/llama/benchmark.py +28 -34
  110. onnxruntime/transformers/models/llama/benchmark_all.py +1 -1
  111. onnxruntime/transformers/models/llama/convert_to_onnx.py +19 -10
  112. onnxruntime/transformers/models/llama/llama_inputs.py +14 -9
  113. onnxruntime/transformers/models/llama/llama_parity.py +60 -31
  114. onnxruntime/transformers/models/longformer/benchmark_longformer.py +3 -5
  115. onnxruntime/transformers/models/phi2/__init__.py +12 -0
  116. onnxruntime/transformers/models/phi2/convert_to_onnx.py +576 -0
  117. onnxruntime/transformers/models/phi2/inference_example.py +414 -0
  118. onnxruntime/transformers/models/stable_diffusion/benchmark.py +4 -4
  119. onnxruntime/transformers/models/stable_diffusion/demo_txt2img.py +19 -6
  120. onnxruntime/transformers/models/stable_diffusion/demo_txt2img_xl.py +22 -10
  121. onnxruntime/transformers/models/stable_diffusion/demo_utils.py +18 -8
  122. onnxruntime/transformers/models/stable_diffusion/diffusion_models.py +0 -1
  123. onnxruntime/transformers/models/stable_diffusion/engine_builder.py +7 -3
  124. onnxruntime/transformers/models/stable_diffusion/engine_builder_ort_cuda.py +33 -8
  125. onnxruntime/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +11 -1
  126. onnxruntime/transformers/models/whisper/benchmark.py +3 -2
  127. onnxruntime/transformers/models/whisper/convert_to_onnx.py +3 -3
  128. onnxruntime/transformers/models/whisper/whisper_helper.py +2 -2
  129. onnxruntime/transformers/onnx_exporter.py +2 -7
  130. onnxruntime/transformers/onnx_model.py +7 -4
  131. onnxruntime/transformers/onnx_model_phi.py +930 -0
  132. onnxruntime/transformers/onnx_model_unet.py +1 -1
  133. onnxruntime/transformers/onnx_utils.py +55 -0
  134. onnxruntime/transformers/optimizer.py +53 -21
  135. onnxruntime/transformers/profiler.py +4 -4
  136. onnxruntime/transformers/shape_optimizer.py +1 -3
  137. {onnxruntime_directml-1.17.3.dist-info → onnxruntime_directml-1.18.0.dist-info}/METADATA +3 -13
  138. {onnxruntime_directml-1.17.3.dist-info → onnxruntime_directml-1.18.0.dist-info}/RECORD +141 -132
  139. {onnxruntime_directml-1.17.3.dist-info → onnxruntime_directml-1.18.0.dist-info}/WHEEL +0 -0
  140. {onnxruntime_directml-1.17.3.dist-info → onnxruntime_directml-1.18.0.dist-info}/entry_points.txt +0 -0
  141. {onnxruntime_directml-1.17.3.dist-info → onnxruntime_directml-1.18.0.dist-info}/top_level.txt +0 -0
onnxruntime/ThirdPartyNotices.txt CHANGED
@@ -1829,7 +1829,7 @@ Zbigniew Skowron <zbychs@gmail.com>
 
 _____
 
-HalidelR
+HalideIR
 
 Copyright (c) 2016 HalideIR contributors
 Copyright (c) 2012-2014 MIT CSAIL, Google Inc., and other contributors
onnxruntime/__init__.py CHANGED
@@ -7,7 +7,7 @@ ONNX Runtime is a performance-focused scoring engine for Open Neural Network Exc
 For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
 or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
 """
-__version__ = "1.17.3"
+__version__ = "1.18.0"
 __author__ = "Microsoft"
 
 # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
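As a quick sanity check after upgrading, the installed version and available providers can be queried directly; a minimal sketch (the provider name assumes the DirectML build):

import onnxruntime as ort

print(ort.__version__)                # "1.18.0" for this wheel
print(ort.get_available_providers())  # expect "DmlExecutionProvider" in onnxruntime-directml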
Binary files changed (DirectML.dll, onnxruntime_providers_shared.dll, onnxruntime_pybind11_state.pyd); no text diff shown.
onnxruntime/capi/onnxruntime_inference_collection.py CHANGED
@@ -358,7 +358,7 @@ class InferenceSession(Session):
     def __init__(
         self,
         path_or_bytes: str | bytes | os.PathLike,
-        sess_options: Sequence[onnxruntime.SessionOptions] | None = None,
+        sess_options: onnxruntime.SessionOptions | None = None,
         providers: Sequence[str | tuple[str, dict[Any, Any]]] | None = None,
         provider_options: Sequence[dict[Any, Any]] | None = None,
         **kwargs,
@@ -413,7 +413,7 @@ class InferenceSession(Session):
         self._read_config_from_model = os.environ.get("ORT_LOAD_CONFIG_FROM_MODEL") == "1"
 
         # internal parameters that we don't expect to be used in general so aren't documented
-        disabled_optimizers = kwargs["disabled_optimizers"] if "disabled_optimizers" in kwargs else None
+        disabled_optimizers = kwargs.get("disabled_optimizers")
 
         try:
             self._create_inference_session(providers, provider_options, disabled_optimizers)
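The annotation fix above matches how the constructor has always been used: sess_options takes a single SessionOptions object, not a sequence of them. A minimal usage sketch ("model.onnx" is a placeholder path):

import onnxruntime as ort

opts = ort.SessionOptions()
opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# Falls back to CPU if the DirectML provider is unavailable.
sess = ort.InferenceSession(
    "model.onnx",
    sess_options=opts,
    providers=["DmlExecutionProvider", "CPUExecutionProvider"],
)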
onnxruntime/quantization/base_quantizer.py ADDED
@@ -0,0 +1,503 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+import logging
+from typing import Any, Dict
+
+import numpy as np
+import onnx
+import onnx.numpy_helper
+
+try:
+    from onnx.reference.op_run import to_array_extended
+except ImportError:
+    # old version of onnx.
+    to_array_extended = None
+
+from .calibrate import TensorData
+from .onnx_model import ONNXModel
+from .quant_utils import (
+    ONNX_TYPE_TO_NP_TYPE,
+    TENSOR_NAME_QUANT_SUFFIX,
+    QuantType,
+    find_by_name,
+    model_has_infer_metadata,
+    normalize_axis,
+    quantize_data,
+    quantize_nparray,
+    save_and_reload_model_with_shape_infer,
+    tensor_proto_to_array,
+)
+from .tensor_quant_overrides import TensorQuantOverridesHelper
+
+
+class QuantizationParams:
+    def __init__(self, **data: Dict[str, Any]):
+        self.data = {}
+        for k, v in data.items():
+            if not isinstance(k, str):
+                raise TypeError(f"Keys must be strings not {type(k)} for k={k!r}.")
+            if not isinstance(v, (int, str, np.ndarray)):
+                raise TypeError(f"Values must be numpy arrays, int, float, str not {type(v)} for k={k!r}.")
+            if k == "scale" and v.dtype not in (np.float32, np.float16):
+                raise ValueError(f"scale must a float32 or float16 numpy element but is {v.dtype} for k={k!r}")
+            self.data[k] = v
+
+    def __iter__(self):
+        yield from self.data
+
+    def __getitem__(self, key):
+        return self.data[key]
+
+    def __len__(self):
+        return len(self.data)
+
+
+class BaseQuantizer:
+    def __init__(
+        self,
+        model,
+        per_channel,
+        reduce_range,
+        weight_qType,
+        activation_qType,
+        tensors_range,
+        nodes_to_quantize,
+        nodes_to_exclude,
+        op_types_to_quantize,
+        extra_options=None,
+    ):
+        if not model_has_infer_metadata(model):
+            model = save_and_reload_model_with_shape_infer(model)
+        self.value_infos = {vi.name: vi for vi in model.graph.value_info}
+        self.value_infos.update({ot.name: ot for ot in model.graph.output})
+        self.value_infos.update({it.name: it for it in model.graph.input})
+
+        self.model = ONNXModel(model)
+        self.per_channel = per_channel  # weight-pack per channel
+        self.reduce_range = reduce_range
+
+        self.extra_options = extra_options if extra_options else {}
+        self.enable_subgraph_quantization = (
+            "EnableSubgraph" in self.extra_options and self.extra_options["EnableSubgraph"]
+        )
+        self.parent = None
+        self.force_quantize_no_input_check = (
+            "ForceQuantizeNoInputCheck" in self.extra_options and self.extra_options["ForceQuantizeNoInputCheck"]
+        )
+        self.is_weight_symmetric = self.extra_options.get(
+            "WeightSymmetric", weight_qType in (QuantType.QInt8, QuantType.QInt16, QuantType.QFLOAT8E4M3FN)
+        )
+        self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False)
+        self.min_real_range = self.extra_options.get("MinimumRealRange")
+
+        self.activation_qType = getattr(activation_qType, "tensor_type", activation_qType)
+        self.weight_qType = getattr(weight_qType, "tensor_type", weight_qType)
+
+        """
+        Dictionary specifying the min and max values for tensors. It has following format:
+            {
+                "param_name": [min, max]
+            }
+        example:
+            {
+                'Conv_3:0': [np.float32(0), np.float32(0.5)],
+                'Conv_4:0': [np.float32(1), np.float32(3.5)]
+            }
+        """
+        if tensors_range is not None and any(map(lambda t: not isinstance(t, TensorData), tensors_range.values())):
+            raise TypeError(
+                f"tensors_range contains unexpected types {set(type(v) for v in tensors_range.values())}, not TensorData."
+            )
+        self.tensors_range = tensors_range
+        self.nodes_to_quantize = nodes_to_quantize  # specific nodes to quantize
+        self.nodes_to_exclude = nodes_to_exclude  # specific nodes to exclude
+        self.op_types_to_quantize = op_types_to_quantize
+
+        self.opset_version = self.check_opset_version()
+
+        # Get tensor-level quantization overrides and ensure they are valid.
+        self.tensor_quant_overrides = TensorQuantOverridesHelper(self.extra_options.get("TensorQuantOverrides", {}))
+
+        self.initializers = {initzer.name: initzer for initzer in self.model.initializer()}
+        overrides_valid, overrides_err = self.tensor_quant_overrides.is_valid(
+            self.initializers, self.value_infos.keys(), activation_qType
+        )
+        if not overrides_valid:
+            raise ValueError(overrides_err)
+
+        self.tensor_quant_override_qtypes = self.tensor_quant_overrides.get_quant_types()
+
+    def quantize_model(self):
+        raise NotImplementedError
+
+    def is_input_a_initializer(self, input_name):
+        initializer = find_by_name(input_name, self.model.initializer())
+        return initializer is not None
+
+    def is_per_channel(self):
+        return self.per_channel
+
+    def is_valid_quantize_weight(self, weight_name):
+        weight = find_by_name(weight_name, self.model.initializer())
+        if weight is not None:
+            return weight.data_type in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16)
+        if (not self.enable_subgraph_quantization) or (self.parent is None):
+            return False
+        return self.parent.is_valid_quantize_weight(weight_name)
+
+    def should_quantize_node(self, node):
+        if (
+            self.nodes_to_quantize is not None
+            and len(self.nodes_to_quantize) != 0
+            and node.name not in self.nodes_to_quantize
+        ):
+            return False
+
+        if node.op_type not in self.op_types_to_quantize:
+            return False
+
+        if self.nodes_to_exclude is not None and node.name in self.nodes_to_exclude:
+            return False
+
+        return True
+
+    def check_opset_version(self):
+        ai_onnx_domain = [
+            opset for opset in self.model.model.opset_import if not opset.domain or opset.domain == "ai.onnx"
+        ]
+        if len(ai_onnx_domain) != 1:
+            raise ValueError("Failed to find proper ai.onnx domain")
+        opset_version = ai_onnx_domain[0].version
+
+        if opset_version == 10:
+            logging.warning(
+                f"The original model opset version is {opset_version}, which does not support node fusions. Please update the model to opset >= 11 for better performance."
+            )
+            return 10
+
+        if opset_version < 10:
+            logging.warning(
+                f"The original model opset version is {opset_version}, which does not support quantization. Please update the model to opset >= 11. Updating the model automatically to opset 11. Please verify the quantized model."
+            )
+            self.model.model.opset_import.remove(ai_onnx_domain[0])
+            self.model.model.opset_import.extend([onnx.helper.make_opsetid("", 11)])
+            opset_version = 11
+
+        if opset_version < 19 and self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
+            logging.warning(
+                f"The original model opset version is {opset_version}, which does not support quantization to float 8. "
+                "Please update the model to opset >= 19. Updating the model automatically to opset 19. "
+                "Please verify the quantized model."
+            )
+            self.model.model.opset_import.remove(ai_onnx_domain[0])
+            self.model.model.opset_import.extend([onnx.helper.make_opsetid("", 19)])
+            self.model.model.ir_version = 9
+            opset_version = 19
+
+        return opset_version
+
+    def quantize_bias_static_impl(self, bias_name, input_scale, weight_scale, beta=1.0):
+        """
+        Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
+        """
+
+        # get bias
+        bias_initializer = find_by_name(bias_name, self.model.initializer())
+        bias_data = tensor_proto_to_array(bias_initializer)
+        quantized_bias_name = bias_name + TENSOR_NAME_QUANT_SUFFIX
+
+        # quantize bias
+        if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
+            data = np.asarray(bias_data)
+            if data.dtype == np.float16:
+                node_qtype = onnx.TensorProto.FLOAT16
+            elif data.dtype == np.float32:
+                node_qtype = onnx.TensorProto.FLOAT
+            else:
+                raise TypeError(f"Only float16 or float32 are supported with float 8 but bias dtype is {data.dtype}.")
+            quantized_data = data.astype(np.float32)
+            bias_scale = np.array([1], dtype=quantized_data.dtype)
+            bias_scale_data = bias_scale.reshape(-1)
+            packed_bias_initializer = onnx.numpy_helper.from_array(quantized_data, quantized_bias_name)
+            self.model.initializer_extend([packed_bias_initializer])
+            node_type = "Cast"
+        else:
+            # calculate scale for bias
+            # TODO: This formula should be explained including why the scale is not estimated for the bias as well.
+            bias_scale = input_scale * weight_scale * beta
+
+            quantized_data = (np.asarray(bias_data) / bias_scale).round().astype(np.int32)
+
+            # update bias initializer
+            bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
+            packed_bias_initializer = onnx.numpy_helper.from_array(bias_np_data, quantized_bias_name)
+            self.model.initializer_extend([packed_bias_initializer])
+
+            # Bias's scale dtype should match the original bias data's unquantized type (float32 or float16).
+            bias_scale_data = np.asarray(bias_scale, dtype=bias_data.dtype).reshape(-1)
+            node_type = "DequantizeLinear"
+            node_qtype = self.weight_qType
+
+        # update scale initializer
+        quantized_bias_scale_name = quantized_bias_name + "_scale"
+        packed_bias_scale_initializer = onnx.numpy_helper.from_array(bias_scale_data, quantized_bias_scale_name)
+        self.model.initializer_extend([packed_bias_scale_initializer])
+
+        # update zero initializer
+        if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
+            tensor_type = self.weight_qType
+        else:
+            tensor_type = onnx.TensorProto.INT32
+
+        quantized_bias_zp_name = quantized_bias_name + "_zero_point"
+        if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
+            packed_bias_zp_initializer = onnx.helper.make_tensor(quantized_bias_zp_name, self.weight_qType, [1], [0.0])
+        elif bias_scale.size > 1:
+            bias_zp_data = np.zeros(bias_scale.shape, dtype=np.int32).reshape(-1)
+            packed_bias_zp_initializer = onnx.numpy_helper.from_array(bias_zp_data, quantized_bias_zp_name)
+        else:
+            packed_bias_zp_initializer = onnx.helper.make_tensor(quantized_bias_zp_name, tensor_type, [], [0])
+        self.model.initializer_extend([packed_bias_zp_initializer])
+
+        return (
+            quantized_bias_name,
+            quantized_bias_scale_name,
+            quantized_bias_zp_name,
+            bias_scale_data,
+            node_type,
+            node_qtype,
+        )
+
+    def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_float_weight=False):
+        """
+        :param weight: TensorProto initializer
+        :param qType: type to quantize to
+        :param keep_float_weight: Whether to quantize the weight. In some cases, we only want to qunatize scale and zero point.
+                                  If keep_float_weight is False, quantize the weight, or don't quantize the weight.
+        :return: quantized weight name, zero point name, scale name
+        """
+        q_weight_name = weight.name + TENSOR_NAME_QUANT_SUFFIX
+        zp_name = weight.name + "_zero_point"
+        scale_name = weight.name + "_scale"
+
+        # Quantize weight data. Use quantization overrides if provided by the user.
+        weight_data = tensor_proto_to_array(weight)
+        quant_overrides = self.tensor_quant_overrides.get_per_tensor_overrides(weight.name, default_val={})
+        if "quant_type" in quant_overrides:
+            qType = quant_overrides["quant_type"].tensor_type  # noqa: N806
+
+        if "scale" in quant_overrides and "zero_point" in quant_overrides:
+            zero_point = np.array(quant_overrides["zero_point"], dtype=ONNX_TYPE_TO_NP_TYPE[qType])
+            scale = np.array(quant_overrides["scale"])
+            q_weight_data = quantize_nparray(qType, weight_data.flatten(), scale, zero_point)
+            assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
+            assert (
+                zero_point.dtype != np.float32 and zero_point.dtype != np.float16
+            ), f"Unexpected dtype {zero_point.dtype}"
+            assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
+
+        else:
+            _, _, zero_point, scale, q_weight_data = quantize_data(
+                weight_data.flatten(),
+                qType,
+                quant_overrides.get("symmetric", self.is_weight_symmetric),
+                reduce_range=quant_overrides.get("reduce_range", self.reduce_range and reduce_range),
+                min_real_range=self.min_real_range,
+                rmin_override=quant_overrides.get("rmin"),
+                rmax_override=quant_overrides.get("rmax"),
+            )
+
+            assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
+            assert (
+                zero_point.dtype != np.float32 and zero_point.dtype != np.float16
+            ), f"Unexpected dtype {zero_point.dtype}"
+            assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
+
+        scale_dtype = weight.data_type
+        scale_initializer = onnx.helper.make_tensor(scale_name, scale_dtype, [], scale.reshape((-1,)).tolist())
+        zero_initializer = onnx.helper.make_tensor(zp_name, qType, [], zero_point.reshape((-1,)).tolist())
+        self.model.initializer_extend([scale_initializer, zero_initializer])
+
+        if not keep_float_weight:
+            if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
+                q_weight_initializer = onnx.TensorProto()
+                q_weight_initializer.data_type = self.weight_qType
+                q_weight_initializer.dims.extend(weight.dims)
+                q_weight_initializer.name = q_weight_name
+                # Do not remove .flatten().copy() numpy is not clear about data persistence.
+                q_weight_initializer.raw_data = q_weight_data.flatten().copy().tobytes()
+                if to_array_extended is not None:
+                    # This test should not be needed but it helped catch some issues
+                    # with data persistence and tobytes.
+                    check = to_array_extended(q_weight_initializer)
+                    if check.shape != weight_data.shape or check.tobytes() != q_weight_data.tobytes():
+                        raise RuntimeError(
+                            f"The initializer of shape {weight_data.shape} could not be created, expecting "
+                            f"{q_weight_data.tobytes()[:10]}, got {check.tobytes()[:10]} and shape={weight.shape}"
+                            f"\nraw={str(q_weight_initializer)[:200]}."
+                        )
+            else:
+                q_weight_data = np.asarray(q_weight_data, dtype=onnx.helper.tensor_dtype_to_np_dtype(qType)).reshape(
+                    weight.dims
+                )
+                q_weight_initializer = onnx.numpy_helper.from_array(q_weight_data, q_weight_name)
+            self.model.initializer_extend([q_weight_initializer])
+
+        return q_weight_name, zp_name, scale_name
+
+    def quantize_weight_per_channel_impl(
+        self,
+        weight_name,
+        weight_qType,
+        channel_axis,
+        reduce_range=True,
+        keep_float_weight=False,
+    ):
+        initializer = find_by_name(weight_name, self.model.initializer())
+        if initializer is None:
+            raise ValueError("{} is not an initializer", weight_name)
+
+        weights = tensor_proto_to_array(initializer)
+        weights_rank = len(weights.shape)
+        is_axis_valid, axis_norm = normalize_axis(channel_axis, weights_rank)
+        if not is_axis_valid:
+            raise ValueError(
+                f"Weight {weight_name} has a per-channel axis with value {channel_axis} that is "
+                f"out-of-bounds for rank {weights_rank}"
+            )
+
+        channel_axis = axis_norm
+        channel_count = weights.shape[channel_axis]
+        quant_overrides_for_channels = self.tensor_quant_overrides.get_per_channel_overrides(
+            weight_name, default_val=[{"axis": channel_axis}]
+        )
+
+        num_channel_overrides = len(quant_overrides_for_channels)
+        if num_channel_overrides != 1 and num_channel_overrides != channel_count:
+            raise ValueError(
+                f"Per-channel tensor quantization overrides for {weight_name} must have "
+                f"either 1 or {channel_count} elements in the list of dictionaries."
+            )
+
+        is_axis_override_valid, axis_override = normalize_axis(quant_overrides_for_channels[0]["axis"], weights_rank)
+        if not is_axis_override_valid or axis_override != channel_axis:
+            raise ValueError(
+                f"Tensor quantization overrides for {weight_name} specify an unexpected axis. "
+                f"Expected {channel_axis}, but got {quant_overrides_for_channels[0]['axis']}."
+            )
+
+        # If user provides per-channel quantization overrides, all channels must use the same quant_type,
+        # axis, symmetric, and reduce_range values. So, just use the first channel's values.
+        if "quant_type" in quant_overrides_for_channels[0]:
+            weight_qType = quant_overrides_for_channels[0]["quant_type"].tensor_type  # noqa: N806
+
+        symmetric = quant_overrides_for_channels[0].get(
+            "symmetric",
+            (self.is_weight_symmetric or weight_qType in (onnx.TensorProto.INT8, onnx.TensorProto.FLOAT8E4M3FN)),
+        )
+        reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range)
+        zero_point_list = []
+        scale_list = []
+        quantized_per_channel_data_list = []
+        for i in range(channel_count):
+            per_channel_data = weights.take(i, channel_axis)
+            channel_override_index = i if i < num_channel_overrides else 0
+            channel_quant_overrides = quant_overrides_for_channels[channel_override_index]
+
+            if "scale" in channel_quant_overrides and "zero_point" in channel_quant_overrides:
+                zero_point = np.array(channel_quant_overrides["zero_point"], dtype=ONNX_TYPE_TO_NP_TYPE[weight_qType])
+                scale = np.array(channel_quant_overrides["scale"])
+                quantized_per_channel_data = quantize_nparray(
+                    weight_qType, per_channel_data.flatten(), scale, zero_point
+                )
+                assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
+                assert (
+                    zero_point.dtype != np.float32 and zero_point.dtype != np.float16
+                ), f"Unexpected dtype {zero_point.dtype}"
+                assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
+                assert isinstance(
+                    quantized_per_channel_data, np.ndarray
+                ), f"Unexpected type {type(quantized_per_channel_data)}"
+
+            else:
+                _, _, zero_point, scale, quantized_per_channel_data = quantize_data(
+                    per_channel_data.flatten(),
+                    weight_qType,
+                    symmetric,
+                    reduce_range=reduce_range,
+                    min_real_range=self.min_real_range,
+                    rmin_override=channel_quant_overrides.get("rmin"),
+                    rmax_override=channel_quant_overrides.get("rmax"),
+                )
+
+                assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
+                assert (
+                    zero_point.dtype != np.float32 and zero_point.dtype != np.float16
+                ), f"Unexpected dtype {zero_point.dtype}"
+                assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
+                assert isinstance(
+                    quantized_per_channel_data, np.ndarray
+                ), f"Unexpected type {type(quantized_per_channel_data)}"
+
+            zero_point_list.append(zero_point)
+            scale_list.append(scale)
+            quantized_per_channel_data_list.append(quantized_per_channel_data)
+
+        # combine per_channel_data into one
+        reshape_dims = list(weights.shape)  # deep copy
+        reshape_dims[channel_axis] = 1  # only one per channel for reshape
+        quantized_weights = np.asarray(quantized_per_channel_data_list[0]).reshape(reshape_dims)
+        for i in range(1, len(quantized_per_channel_data_list)):
+            channel_weights = np.asarray(quantized_per_channel_data_list[i]).reshape(reshape_dims)
+            quantized_weights = np.concatenate((quantized_weights, channel_weights), channel_axis)
+
+        q_weight_name = weight_name + TENSOR_NAME_QUANT_SUFFIX
+        zp_name = weight_name + "_zero_point"
+        scale_name = weight_name + "_scale"
+
+        # Update packed weight, zero point, and scale initializers
+        zero_scale_shape = [initializer.dims[channel_axis]]
+        scale_initializer = onnx.helper.make_tensor(
+            scale_name, initializer.data_type, zero_scale_shape, np.hstack(scale_list).tolist()
+        )
+        zero_initializer = onnx.helper.make_tensor(
+            zp_name, weight_qType, zero_scale_shape, np.hstack(zero_point_list).tolist()
+        )
+
+        self.model.initializer_extend([scale_initializer, zero_initializer])
+
+        if not keep_float_weight:
+            quantized_weights = np.asarray(
+                quantized_weights,
+                dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[weight_qType],
+            ).reshape(initializer.dims)
+            q_weight_initializer = onnx.numpy_helper.from_array(quantized_weights, q_weight_name)
+            self.model.initializer_extend([q_weight_initializer])
+
+        return q_weight_name, zp_name, scale_name
+
+    def adjust_tensor_ranges(self):
+        if self.tensors_range is None:
+            return
+
+        for node in self.model.nodes():
+            # adjust tensor_ranges for input of Clip and Relu node
+            if node.op_type in ["Clip", "Relu"]:
+                if self.is_activation_symmetric:
+                    continue
+                if not self.should_quantize_node(node):
+                    continue
+                if len(self.model.input_name_to_nodes()[node.input[0]]) != 1:
+                    continue
+                if node.input[0] not in self.tensors_range or node.output[0] not in self.tensors_range:
+                    continue
+                td = self.tensors_range[node.output[0]]
+                if not isinstance(td, TensorData):
+                    raise TypeError(f"Unexpected type {type(td)} for {node.output[0]!r}.")
+                self.tensors_range[node.input[0]] = td
+            # Adjust Softmax to range from 0.0 to 1.0
+            elif node.op_type == "Softmax":
+                self.tensors_range[node.output[0]] = TensorData(lowest=np.float32(0.0), highest=np.float32(1.0))
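The "TensorQuantOverrides" entry read from extra_options above is a dict keyed by tensor name; each value is a list of override dicts (one element for per-tensor overrides, or one per channel for per-channel overrides). A minimal sketch with a hypothetical tensor name and values, using only keys the code above consumes (quant_type, symmetric, rmin, rmax):

from onnxruntime.quantization import QuantType

# Hypothetical override map: force one weight to symmetric INT8 with a fixed range.
tensor_quant_overrides = {
    "conv1.weight": [
        {"quant_type": QuantType.QInt8, "symmetric": True, "rmin": -0.5, "rmax": 0.5},
    ],
}
# Passed through e.g. quantize_static(..., extra_options={"TensorQuantOverrides": tensor_quant_overrides}).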
onnxruntime/quantization/calibrate.py CHANGED
@@ -368,7 +368,6 @@ class MinMaxCalibrater(CalibraterBase):
             self.max_intermediate_outputs is not None
             and len(self.intermediate_outputs) == self.max_intermediate_outputs
         ):
-            self.compute_range()
             self.clear_collected_data()
 
         if len(self.intermediate_outputs) == 0 and self.calibrate_tensors_range is None:
@@ -734,13 +733,11 @@ class HistogramCollector(CalibrationDataCollector):
         for tensor, data_arr in name_to_arr.items():
             if isinstance(data_arr, list):
                 for arr in data_arr:
-                    if not isinstance(arr, np.ndarray):
-                        raise ValueError(f"Unexpected type {type(arr)} for tensor={tensor!r}")
-                dtypes = set(a.dtype for a in arr)
-                if len(dtypes) != 1:
-                    raise ValueError(
-                        f"The calibration expects only one element type but got {dtypes} for tensor={tensor!r}"
-                    )
+                    assert isinstance(arr, np.ndarray), f"Unexpected type {type(arr)} for tensor={tensor!r}"
+                dtypes = set(a.dtype for a in data_arr)
+                assert (
+                    len(dtypes) == 1
+                ), f"The calibration expects only one element type but got {dtypes} for tensor={tensor!r}"
                 data_arr_np = np.asarray(data_arr)
             elif not isinstance(data_arr, np.ndarray):
                 raise ValueError(f"Unexpected type {type(data_arr)} for tensor={tensor!r}")
@@ -918,11 +915,7 @@ class HistogramCollector(CalibrationDataCollector):
         thresholds_dict = {}  # per tensor thresholds
 
         print(f"Number of tensors : {len(histogram_dict)}")
-        print(
-            "Number of histogram bins : {} (The number may increase depends on the data it collects)".format(
-                self.num_bins
-            )
-        )
+        print(f"Number of histogram bins : {self.num_bins} (The number may increase depends on the data it collects)")
         print(f"Number of quantized bins : {self.num_quantized_bins}")
 
         for tensor, histogram in histogram_dict.items():
@@ -1100,12 +1093,10 @@ def create_calibrator(
     calibrator = None
     if calibrate_method == CalibrationMethod.MinMax:
         # default settings for min-max algorithm
-        symmetric = False if "symmetric" not in extra_options else extra_options["symmetric"]
-        moving_average = False if "moving_average" not in extra_options else extra_options["moving_average"]
-        averaging_constant = 0.01 if "averaging_constant" not in extra_options else extra_options["averaging_constant"]
-        max_intermediate_outputs = (
-            None if "max_intermediate_outputs" not in extra_options else extra_options["max_intermediate_outputs"]
-        )
+        symmetric = extra_options.get("symmetric", False)
+        moving_average = extra_options.get("moving_average", False)
+        averaging_constant = extra_options.get("averaging_constant", 0.01)
+        max_intermediate_outputs = extra_options.get("max_intermediate_outputs", None)
         calibrator = MinMaxCalibrater(
             model,
             op_types_to_calibrate,
@@ -1118,9 +1109,9 @@
         )
     elif calibrate_method == CalibrationMethod.Entropy:
         # default settings for entropy algorithm
-        num_bins = 128 if "num_bins" not in extra_options else extra_options["num_bins"]
-        num_quantized_bins = 128 if "num_quantized_bins" not in extra_options else extra_options["num_quantized_bins"]
-        symmetric = False if "symmetric" not in extra_options else extra_options["symmetric"]
+        num_bins = extra_options.get("num_bins", 128)
+        num_quantized_bins = extra_options.get("num_quantized_bins", 128)
+        symmetric = extra_options.get("symmetric", False)
         calibrator = EntropyCalibrater(
             model,
             op_types_to_calibrate,
@@ -1132,9 +1123,9 @@
         )
     elif calibrate_method == CalibrationMethod.Percentile:
         # default settings for percentile algorithm
-        num_bins = 2048 if "num_bins" not in extra_options else extra_options["num_bins"]
-        percentile = 99.999 if "percentile" not in extra_options else extra_options["percentile"]
-        symmetric = True if "symmetric" not in extra_options else extra_options["symmetric"]
+        num_bins = extra_options.get("num_bins", 2048)
+        percentile = extra_options.get("percentile", 99.999)
+        symmetric = extra_options.get("symmetric", True)
         calibrator = PercentileCalibrater(
             model,
             op_types_to_calibrate,
@@ -1147,8 +1138,8 @@
 
     elif calibrate_method == CalibrationMethod.Distribution:
         # default settings for percentile algorithm
-        num_bins = 2048 if "num_bins" not in extra_options else extra_options["num_bins"]
-        scenario = "same" if "scenario" not in extra_options else extra_options["scenario"]
+        num_bins = extra_options.get("num_bins", 2048)
+        scenario = extra_options.get("scenario", "same")
 
         calibrator = DistributionCalibrater(
             model,
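In each branch above, the rewrite to extra_options.get() makes the defaults explicit, and callers override them through the extra_options dict. A minimal sketch, assuming the public create_calibrator entry point and a placeholder model path:

from onnxruntime.quantization.calibrate import CalibrationMethod, create_calibrator

calibrator = create_calibrator(
    "model.onnx",  # placeholder path
    op_types_to_calibrate=["Conv", "MatMul"],
    calibrate_method=CalibrationMethod.Percentile,
    extra_options={"num_bins": 2048, "percentile": 99.999, "symmetric": True},
)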
onnxruntime/quantization/execution_providers/qnn/fusion_lpnorm.py CHANGED
@@ -122,6 +122,11 @@ class FusionLpNormalization(Fusion):
 
         self.nodes_to_remove.extend(subgraph_nodes)
         fused_node = onnx.helper.make_node(
-            self.fused_op_type, inputs=[subgraph_input], outputs=[subgraph_output], p=2, axis=-1
+            self.fused_op_type,
+            name=self.create_unique_node_name(),
+            inputs=[subgraph_input],
+            outputs=[subgraph_output],
+            p=2,
+            axis=-1,
        )
         self.nodes_to_add.append(fused_node)
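The only functional change here is that the fused node now gets an explicit, unique name. A standalone sketch of the resulting node construction (the input/output names and the concrete node name are hypothetical):

import onnx

fused = onnx.helper.make_node(
    "LpNormalization",
    name="LpNormalization_0",  # hypothetical unique name
    inputs=["subgraph_input"],
    outputs=["subgraph_output"],
    p=2,
    axis=-1,
)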