onnxruntime_directml-1.20.0-cp313-cp313-win_amd64.whl → onnxruntime_directml-1.20.1-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnxruntime/__init__.py +1 -1
- onnxruntime/capi/DirectML.dll +0 -0
- onnxruntime/capi/onnxruntime.dll +0 -0
- onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
- onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
- onnxruntime/quantization/__init__.py +1 -0
- onnxruntime/quantization/base_quantizer.py +45 -18
- onnxruntime/quantization/onnx_model.py +20 -0
- onnxruntime/quantization/operators/pad.py +72 -0
- onnxruntime/quantization/qdq_quantizer.py +375 -133
- onnxruntime/quantization/quant_utils.py +166 -31
- onnxruntime/quantization/quantize.py +180 -7
- onnxruntime/quantization/registry.py +3 -1
- onnxruntime/quantization/tensor_quant_overrides.py +4 -0
- {onnxruntime_directml-1.20.0.dist-info → onnxruntime_directml-1.20.1.dist-info}/METADATA +6 -1
- {onnxruntime_directml-1.20.0.dist-info → onnxruntime_directml-1.20.1.dist-info}/RECORD +19 -19
- {onnxruntime_directml-1.20.0.dist-info → onnxruntime_directml-1.20.1.dist-info}/WHEEL +1 -1
- {onnxruntime_directml-1.20.0.dist-info → onnxruntime_directml-1.20.1.dist-info}/entry_points.txt +0 -0
- {onnxruntime_directml-1.20.0.dist-info → onnxruntime_directml-1.20.1.dist-info}/top_level.txt +0 -0
onnxruntime/__init__.py
CHANGED
@@ -7,7 +7,7 @@ ONNX Runtime is a performance-focused scoring engine for Open Neural Network Exchange (ONNX) models.
 For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
 or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
 """
-__version__ = "1.20.0"
+__version__ = "1.20.1"
 __author__ = "Microsoft"

 # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
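The only source change in onnxruntime/__init__.py is the version bump above. A minimal post-upgrade check (a sketch, not part of the package; it uses the standard onnxruntime Python API):

import onnxruntime

print(onnxruntime.__version__)                # "1.20.1" after upgrading
print(onnxruntime.get_available_providers())  # should include "DmlExecutionProvider" for this DirectML wheel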
onnxruntime/capi/DirectML.dll
CHANGED
Binary file

onnxruntime/capi/onnxruntime.dll
CHANGED
Binary file

onnxruntime/capi/onnxruntime_providers_shared.dll
CHANGED
Binary file

onnxruntime/capi/onnxruntime_pybind11_state.pyd
CHANGED
Binary file
onnxruntime/quantization/__init__.py
CHANGED
@@ -10,6 +10,7 @@ from .quant_utils import QuantFormat, QuantType, write_calibration_table  # noqa: F401
 from .quantize import DynamicQuantConfig  # noqa: F401
 from .quantize import QuantizationMode  # noqa: F401
 from .quantize import StaticQuantConfig  # noqa: F401
+from .quantize import get_qdq_config  # noqa: F401
 from .quantize import quantize  # noqa: F401
 from .quantize import quantize_dynamic  # noqa: F401
 from .quantize import quantize_static  # noqa: F401
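The newly exported get_qdq_config helper builds a static quantization configuration for QDQ-format models. A hedged sketch of its use (the model paths, the data reader, and the keyword names are illustrative assumptions, not confirmed by this diff):

from onnxruntime.quantization import QuantType, get_qdq_config, quantize

# data_reader is assumed to be a CalibrationDataReader yielding sample model inputs.
qdq_config = get_qdq_config(
    "model_fp32.onnx",  # hypothetical input model path
    data_reader,
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QInt8,
)
quantize("model_fp32.onnx", "model_qdq.onnx", qdq_config)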
onnxruntime/quantization/base_quantizer.py
CHANGED
@@ -21,7 +21,6 @@ from .onnx_model import ONNXModel
 from .quant_utils import (
     ONNX_TYPE_TO_NP_TYPE,
     TENSOR_NAME_QUANT_SUFFIX,
-    QuantType,
     find_by_name,
     model_has_infer_metadata,
     normalize_axis,
@@ -40,18 +39,26 @@ class QuantizationParams:
         for k, v in data.items():
             if not isinstance(k, str):
                 raise TypeError(f"Keys must be strings not {type(k)} for k={k!r}.")
-            if not isinstance(v, (int, str, np.ndarray)):
+            if k != "axis" and not isinstance(v, (int, str, np.ndarray)):
                 raise TypeError(f"Values must be numpy arrays, int, float, str not {type(v)} for k={k!r}.")
+            if k == "axis" and not isinstance(v, int) and v is not None:
+                raise TypeError(f"Axis value must be an int or None, not {type(v)}.")
             if k == "scale" and v.dtype not in (np.float32, np.float16):
                 raise ValueError(f"scale must a float32 or float16 numpy element but is {v.dtype} for k={k!r}")
             self.data[k] = v

+    def get(self, key, default_value=None):
+        return self.data.get(key, default_value)
+
     def __iter__(self):
         yield from self.data

     def __getitem__(self, key):
         return self.data[key]

+    def __setitem__(self, key, value):
+        self.data[key] = value
+
     def __len__(self):
         return len(self.data)

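The hunk above gives QuantizationParams an "axis" key that must be an int or None, plus dict-style get and __setitem__. A small illustration of that mapping API (QuantizationParams is an internal class; the keyword-argument constructor shown is an assumption consistent with the validation loop above):

import numpy as np
from onnxruntime.quantization.base_quantizer import QuantizationParams

params = QuantizationParams(scale=np.array(0.1, dtype=np.float32), zero_point=0, axis=None)
params["axis"] = 1                    # new __setitem__
print(params.get("axis"))             # new get(); prints 1
print(params.get("rmin", "missing"))  # default value for absent keys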
@@ -88,9 +95,10 @@ class BaseQuantizer:
         self.force_quantize_no_input_check = (
             "ForceQuantizeNoInputCheck" in self.extra_options and self.extra_options["ForceQuantizeNoInputCheck"]
         )
-        self.is_weight_symmetric = self.extra_options.get(
-            "WeightSymmetric", self.weight_qType in (QuantType.QInt8, QuantType.QInt16, QuantType.QFLOAT8E4M3FN)
-        )
+
+        # If user does not explicitly set "WeightSymmetric", then the weight's quantization type determines
+        # the symmetry (i.e., signed integer types will use symmetric quantization). See `def is_weight_symmetric()`
+        self._is_weight_symmetric: bool | None = self.extra_options.get("WeightSymmetric", None)
         self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False)
         self.min_real_range = self.extra_options.get("MinimumRealRange")
@@ -131,6 +139,16 @@ class BaseQuantizer:

         self.tensor_quant_override_qtypes = self.tensor_quant_overrides.get_quant_types()

+    def is_weight_symmetric(self, weight_quant_type: onnx.TensorProto.DataType) -> bool:
+        if self._is_weight_symmetric is not None:
+            return self._is_weight_symmetric  # Return value explicitly set by user.
+        return weight_quant_type in (
+            onnx.TensorProto.INT4,
+            onnx.TensorProto.INT8,
+            onnx.TensorProto.INT16,
+            onnx.TensorProto.FLOAT8E4M3FN,
+        )
+
     def quantize_model(self):
         raise NotImplementedError

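Restating the new default as standalone code (a sketch mirroring is_weight_symmetric above): an explicit "WeightSymmetric" setting always wins; otherwise signed weight types default to symmetric quantization.

from __future__ import annotations

import onnx

def default_weight_symmetry(weight_quant_type: int, user_setting: bool | None = None) -> bool:
    if user_setting is not None:
        return user_setting  # explicit user choice overrides the type-based default
    return weight_quant_type in (
        onnx.TensorProto.INT4,
        onnx.TensorProto.INT8,
        onnx.TensorProto.INT16,
        onnx.TensorProto.FLOAT8E4M3FN,
    )

assert default_weight_symmetry(onnx.TensorProto.INT8) is True
assert default_weight_symmetry(onnx.TensorProto.UINT8) is False
assert default_weight_symmetry(onnx.TensorProto.UINT8, user_setting=True) is True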
@@ -230,9 +248,19 @@ class BaseQuantizer:
         # TODO: This formula should be explained including why the scale is not estimated for the bias as well.
         bias_scale = input_scale * weight_scale * beta

-
-        quantized_data = np.asarray(bias_data) / bias_scale
-        quantized_data = quantized_data.round().astype(np.int32)
+        # Quantize by dividing by bias_scale
+        quantized_data = np.asarray(bias_data, dtype=np.float64) / np.asarray(bias_scale, dtype=np.float64)
+        quantized_data = quantized_data.round()
+
+        # Clip quantized data to the range of a int32
+        int32_min = np.float64(np.iinfo(np.int32).min)
+        int32_max = np.float64(np.iinfo(np.int32).max)
+        if np.any(quantized_data < int32_min) or np.any(quantized_data > int32_max):
+            logging.warning(
+                f"Quantized bias `{bias_name}` exceeds the range of a int32. The bias scale is too small."
+            )
+
+        quantized_data = np.clip(quantized_data, int32_min, int32_max).astype(np.int32)

         # update bias initializer
         bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
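A numeric illustration of the new int32 clamp (values are made up): a very small bias scale produces quantized values outside the int32 range, which now saturate instead of overflowing during the cast.

import numpy as np

bias_data = np.array([0.5, -3.0e6], dtype=np.float32)
bias_scale = np.float64(1e-4)  # deliberately tiny scale

q = (np.asarray(bias_data, dtype=np.float64) / bias_scale).round()
int32_min = np.float64(np.iinfo(np.int32).min)
int32_max = np.float64(np.iinfo(np.int32).max)
q = np.clip(q, int32_min, int32_max).astype(np.int32)
print(q)  # [5000, -2147483648]: the second entry saturates rather than wrapping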
@@ -282,6 +310,7 @@ class BaseQuantizer:
         If keep_float_weight is False, quantize the weight, or don't quantize the weight.
         :return: quantized weight name, zero point name, scale name
         """
+        # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there.
         q_weight_name = weight.name + TENSOR_NAME_QUANT_SUFFIX
         zp_name = weight.name + "_zero_point"
         scale_name = weight.name + "_scale"
@@ -303,10 +332,11 @@ class BaseQuantizer:
             assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"

         else:
-            _, _, zero_point, scale, q_weight_data = quantize_data(
+            symmetric = self.is_weight_symmetric(qType) if qType == self.weight_qType else self.is_activation_symmetric
+            zero_point, scale, q_weight_data = quantize_data(
                 weight_data.flatten(),
                 qType,
-                quant_overrides.get("symmetric", self.is_weight_symmetric),
+                quant_overrides.get("symmetric", symmetric),
                 reduce_range=quant_overrides.get("reduce_range", self.reduce_range and reduce_range),
                 min_real_range=self.min_real_range,
                 rmin_override=quant_overrides.get("rmin"),
@@ -371,6 +401,7 @@ class BaseQuantizer:
         reduce_range=True,
         keep_float_weight=False,
     ):
+        # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there.
         initializer = find_by_name(weight_name, self.model.initializer())
         if initializer is None:
             raise ValueError("{} is not an initializer", weight_name)
@@ -409,13 +440,7 @@ class BaseQuantizer:
         if "quant_type" in quant_overrides_for_channels[0]:
             weight_qType = quant_overrides_for_channels[0]["quant_type"].tensor_type  # noqa: N806

-        symmetric = quant_overrides_for_channels[0].get(
-            "symmetric",
-            (
-                self.is_weight_symmetric
-                or weight_qType in (onnx.TensorProto.INT8, onnx.TensorProto.FLOAT8E4M3FN, onnx.TensorProto.INT4)
-            ),
-        )
+        symmetric = quant_overrides_for_channels[0].get("symmetric", self.is_weight_symmetric(weight_qType))
         reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range)
         zero_point_list = []
         scale_list = []
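The per-channel path now reads "quant_type" and "symmetric" from the first channel's override dict and defers to is_weight_symmetric for the default. A hedged sketch of what such an override entry looks like (the tensor name and the outer dict layout are illustrative assumptions):

from onnxruntime.quantization import QuantType

tensor_quant_overrides = {
    "conv1.weight": [  # hypothetical initializer name; one dict per output channel
        {"quant_type": QuantType.QInt8, "symmetric": True},
    ]
}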
@@ -444,7 +469,7 @@ class BaseQuantizer:
                 ), f"Unexpected type {type(quantized_per_channel_data)}"

             else:
-                _, _, zero_point, scale, quantized_per_channel_data = quantize_data(
+                zero_point, scale, quantized_per_channel_data = quantize_data(
                     per_channel_data.flatten(),
                     weight_qType,
                     symmetric,
@@ -529,4 +554,6 @@ class BaseQuantizer:
             self.tensors_range[node.input[0]] = td
         # Adjust Softmax to range from 0.0 to 1.0
         elif node.op_type == "Softmax":
+            if not self.should_quantize_node(node):
+                continue
             self.tensors_range[node.output[0]] = TensorData(lowest=np.float32(0.0), highest=np.float32(1.0))
onnxruntime/quantization/onnx_model.py
CHANGED
@@ -296,6 +296,26 @@ class ONNXModel:

         return suffix

+    def get_largest_initializer_name_suffix(self, initializer_name_prefix):
+        """
+        Gets the largest initializer name integer suffix for all initializer names that begin
+        with `initializer_name_prefix`. This can be used to create unique initializer names.
+
+        Example: for initializer names 'my_weight_0' and 'my_weight_3', this method returns 3 if
+        `initializer_name_prefix` is 'my_weight_'.
+        """
+        suffix = -1
+
+        for initializer in self.model.graph.initializer:
+            if initializer.name.startswith(initializer_name_prefix):
+                try:
+                    index = int(initializer.name[len(initializer_name_prefix) :])
+                    suffix = max(index, suffix)
+                except ValueError:
+                    continue
+
+        return suffix
+
     def find_nodes_by_initializer(self, graph, initializer):
         """
         Find all nodes with given initializer as an input.
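A standalone sketch of the suffix scan this helper performs (names are made up; non-integer suffixes are skipped, and -1 means no match was found):

def largest_suffix(names: list[str], prefix: str) -> int:
    suffix = -1
    for name in names:
        if name.startswith(prefix):
            try:
                suffix = max(suffix, int(name[len(prefix):]))
            except ValueError:
                continue  # e.g. "my_weight_x" has no integer suffix
    return suffix

assert largest_suffix(["my_weight_0", "my_weight_3", "my_weight_x"], "my_weight_") == 3
assert largest_suffix([], "my_weight_") == -1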
onnxruntime/quantization/operators/pad.py
CHANGED
@@ -1,3 +1,12 @@
+# --------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
 import onnx

 from ..quant_utils import (
@@ -8,6 +17,7 @@ from ..quant_utils import (
     quantize_nparray,
 )
 from .base_operator import QuantOperatorBase
+from .qdq_base_operator import QDQOperatorBase


 class QPad(QuantOperatorBase):
@@ -98,3 +108,65 @@ class QPad(QuantOperatorBase):
         node.input[0] = quantized_input_value.q_name
         node.output[0] = quantized_output_value.q_name
         self.quantizer.new_nodes += [node]
+
+
+class QDQPad(QDQOperatorBase):
+    def __init__(self, onnx_quantizer, onnx_node):
+        super().__init__(onnx_quantizer, onnx_node)
+
+    def _get_pad_const_val(self, attrs_dict: dict[str, Any]) -> np.ndarray | None:
+        """
+        Returns the Pad's constant padding value. Returns `None` if the padding value is
+        not constant (i.e., comes from a dynamic input).
+        """
+        const_val = None
+        onnx_tensor_type = self.quantizer.model.get_tensor_type(self.node.input[0])
+        if onnx_tensor_type is None:
+            return None
+
+        np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx_tensor_type.elem_type)
+        if self.quantizer.opset_version < 11:
+            const_val = np.array(attrs_dict.get("value", 0), dtype=np_dtype)
+        elif len(self.node.input) >= 3 and self.node.input[2]:
+            const_val = self.quantizer.model.get_constant_value(self.node.input[2])
+        else:
+            const_val = np.array(0, dtype=np_dtype)
+
+        return const_val
+
+    def _should_quantize_output_same_as_input(self) -> bool:
+        """
+        Returns true if Pad's output should use the same quantization parameters as input[0].
+        """
+        attrs_dict = {}
+        for attribute in self.node.attribute:
+            kv = attribute_to_kwarg(attribute)
+            attrs_dict.update(kv)
+
+        pad_mode = attrs_dict.get("mode", b"constant")
+        if pad_mode in (b"reflect", b"edge", b"wrap"):
+            # These modes pad the output with a value that already exists in the input.
+            # So, we can quantize the output the same as the input.
+            return True
+
+        # For 'constant' mode, if padding with 0, we can also quantize the output the same as the input
+        # because our quantization floating-point range always includes 0.
+        if pad_mode == b"constant":
+            pad_val = self._get_pad_const_val(attrs_dict)
+            if pad_val is not None and pad_val.dtype in (np.float32, np.float16):
+                return float(pad_val.item()) == 0
+
+        return False
+
+    def quantize(self):
+        assert self.node.op_type == "Pad"
+
+        for input_name in self.node.input:
+            if input_name:
+                self.quantizer.quantize_activation_tensor(input_name)
+
+        if not self.disable_qdq_for_node_output:
+            if self._should_quantize_output_same_as_input():
+                self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
+            else:
+                self.quantizer.quantize_activation_tensor(self.node.output[0])