onnxruntime_directml-1.20.0-cp313-cp313-win_amd64.whl → onnxruntime_directml-1.20.1-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
onnxruntime/__init__.py CHANGED
@@ -7,7 +7,7 @@ ONNX Runtime is a performance-focused scoring engine for Open Neural Network Exc
  For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
  or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
  """
- __version__ = "1.20.0"
+ __version__ = "1.20.1"
  __author__ = "Microsoft"
 
  # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
Binary files changed (contents not rendered in this diff).
onnxruntime/quantization/__init__.py CHANGED
@@ -10,6 +10,7 @@ from .quant_utils import QuantFormat, QuantType, write_calibration_table # noqa
  from .quantize import DynamicQuantConfig # noqa: F401
  from .quantize import QuantizationMode # noqa: F401
  from .quantize import StaticQuantConfig # noqa: F401
+ from .quantize import get_qdq_config # noqa: F401
  from .quantize import quantize # noqa: F401
  from .quantize import quantize_dynamic # noqa: F401
  from .quantize import quantize_static # noqa: F401
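Note: `get_qdq_config` is newly exported from `onnxruntime.quantization`; it builds a `StaticQuantConfig` preconfigured for QDQ-format quantization from a model plus a calibration data reader. A minimal usage sketch follows; the model path, input name/shape, and the exact keyword arguments are illustrative assumptions rather than a verbatim API reference:

```python
import numpy as np

from onnxruntime.quantization import CalibrationDataReader, QuantType, get_qdq_config, quantize


class RandomDataReader(CalibrationDataReader):
    """Feeds a few random samples to the calibrator; stands in for real calibration data."""

    def __init__(self, input_name, shape, num_samples=8):
        self._samples = iter(
            [{input_name: np.random.rand(*shape).astype(np.float32)} for _ in range(num_samples)]
        )

    def get_next(self):
        return next(self._samples, None)  # None signals end of calibration data


qdq_config = get_qdq_config(
    "model.onnx",  # placeholder path
    RandomDataReader("input", (1, 3, 224, 224)),  # placeholder input name and shape
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QInt8,
)
quantize("model.onnx", "model.qdq.onnx", qdq_config)
```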
onnxruntime/quantization/base_quantizer.py CHANGED
@@ -21,7 +21,6 @@ from .onnx_model import ONNXModel
  from .quant_utils import (
      ONNX_TYPE_TO_NP_TYPE,
      TENSOR_NAME_QUANT_SUFFIX,
-     QuantType,
      find_by_name,
      model_has_infer_metadata,
      normalize_axis,
@@ -40,18 +39,26 @@ class QuantizationParams:
          for k, v in data.items():
              if not isinstance(k, str):
                  raise TypeError(f"Keys must be strings not {type(k)} for k={k!r}.")
-             if not isinstance(v, (int, str, np.ndarray)):
+             if k != "axis" and not isinstance(v, (int, str, np.ndarray)):
                  raise TypeError(f"Values must be numpy arrays, int, float, str not {type(v)} for k={k!r}.")
+             if k == "axis" and not isinstance(v, int) and v is not None:
+                 raise TypeError(f"Axis value must be an int or None, not {type(v)}.")
              if k == "scale" and v.dtype not in (np.float32, np.float16):
                  raise ValueError(f"scale must a float32 or float16 numpy element but is {v.dtype} for k={k!r}")
              self.data[k] = v
 
+     def get(self, key, default_value=None):
+         return self.data.get(key, default_value)
+
      def __iter__(self):
          yield from self.data
 
      def __getitem__(self, key):
          return self.data[key]
 
+     def __setitem__(self, key, value):
+         self.data[key] = value
+
      def __len__(self):
          return len(self.data)
 
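With `get` and `__setitem__` added, `QuantizationParams` now behaves like a small mutable mapping, and the loosened validation lets `axis` be an `int` or `None`. A quick sketch of what the new surface allows (the import path is an assumption):

```python
import numpy as np

from onnxruntime.quantization.base_quantizer import QuantizationParams  # import path assumed

params = QuantizationParams(
    zero_point=np.array(0, dtype=np.int8),
    scale=np.array(0.02, dtype=np.float32),
    axis=None,  # now explicitly allowed: "axis" must be an int or None
)
params["axis"] = 1  # new __setitem__
print(params.get("axis", default_value=0))  # new get() with a default -> 1
```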
@@ -88,9 +95,10 @@ class BaseQuantizer:
          self.force_quantize_no_input_check = (
              "ForceQuantizeNoInputCheck" in self.extra_options and self.extra_options["ForceQuantizeNoInputCheck"]
          )
-         self.is_weight_symmetric = self.extra_options.get(
-             "WeightSymmetric", weight_qType in (QuantType.QInt8, QuantType.QInt16, QuantType.QFLOAT8E4M3FN)
-         )
+
+         # If user does not explicitly set "WeightSymmetric", then the weight's quantization type determines
+         # the symmetry (i.e., signed integer types will use symmetric quantization). See `def is_weight_symmetric()`
+         self._is_weight_symmetric: bool | None = self.extra_options.get("WeightSymmetric", None)
          self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False)
          self.min_real_range = self.extra_options.get("MinimumRealRange")
@@ -131,6 +139,16 @@ class BaseQuantizer:
 
          self.tensor_quant_override_qtypes = self.tensor_quant_overrides.get_quant_types()
 
+     def is_weight_symmetric(self, weight_quant_type: onnx.TensorProto.DataType) -> bool:
+         if self._is_weight_symmetric is not None:
+             return self._is_weight_symmetric  # Return value explicitly set by user.
+         return weight_quant_type in (
+             onnx.TensorProto.INT4,
+             onnx.TensorProto.INT8,
+             onnx.TensorProto.INT16,
+             onnx.TensorProto.FLOAT8E4M3FN,
+         )
+
      def quantize_model(self):
          raise NotImplementedError
 
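The behavioral change here: weight symmetry is no longer a boolean fixed at construction from the `QuantType` passed to `__init__`; it is resolved on demand from the actual tensor type, which also makes INT4 and INT16 weights symmetric by default while unsigned types stay asymmetric. A standalone restatement of the new default (not the library's own code; requires onnx >= 1.16 for `INT4`):

```python
import onnx


def default_weight_symmetry(weight_quant_type: int, user_setting: bool | None = None) -> bool:
    """Mirrors BaseQuantizer.is_weight_symmetric: an explicit user setting wins,
    otherwise signed 4/8/16-bit ints and FLOAT8E4M3FN default to symmetric."""
    if user_setting is not None:
        return user_setting
    return weight_quant_type in (
        onnx.TensorProto.INT4,
        onnx.TensorProto.INT8,
        onnx.TensorProto.INT16,
        onnx.TensorProto.FLOAT8E4M3FN,
    )


assert default_weight_symmetry(onnx.TensorProto.INT4)              # symmetric by default now
assert not default_weight_symmetry(onnx.TensorProto.UINT8)         # unsigned stays asymmetric
assert not default_weight_symmetry(onnx.TensorProto.INT8, False)   # explicit setting wins
```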
@@ -230,9 +248,19 @@ class BaseQuantizer:
          # TODO: This formula should be explained including why the scale is not estimated for the bias as well.
          bias_scale = input_scale * weight_scale * beta
 
-         quantized_data = (np.asarray(bias_data) / bias_scale).round()
-         quantized_data = np.clip(quantized_data, np.iinfo(np.int32).min, np.iinfo(np.int32).max)
-         quantized_data = quantized_data.astype(np.int32)
+         # Quantize by dividing by bias_scale
+         quantized_data = np.asarray(bias_data, dtype=np.float64) / np.asarray(bias_scale, dtype=np.float64)
+         quantized_data = quantized_data.round()
+
+         # Clip quantized data to the range of a int32
+         int32_min = np.float64(np.iinfo(np.int32).min)
+         int32_max = np.float64(np.iinfo(np.int32).max)
+         if np.any(quantized_data < int32_min) or np.any(quantized_data > int32_max):
+             logging.warning(
+                 f"Quantized bias `{bias_name}` exceeds the range of a int32. The bias scale is too small."
+             )
+
+         quantized_data = np.clip(quantized_data, int32_min, int32_max).astype(np.int32)
 
          # update bias initializer
          bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
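The bias path previously rounded in the input precision and clipped to int32 silently; it now computes in float64 and warns when the quantized bias saturates, which happens when `bias_scale = input_scale * weight_scale * beta` is very small relative to the bias magnitudes. A tiny numeric illustration with made-up values:

```python
import numpy as np

bias = np.array([3.0e5, -1.2e4], dtype=np.float32)  # made-up bias values
input_scale, weight_scale, beta = np.float32(1e-5), np.float32(1e-5), 1.0
bias_scale = input_scale * weight_scale * beta  # 1e-10: deliberately tiny

q = (np.asarray(bias, dtype=np.float64) / np.float64(bias_scale)).round()
i32_min, i32_max = np.iinfo(np.int32).min, np.iinfo(np.int32).max
if np.any(q < i32_min) or np.any(q > i32_max):  # True here: 3e15 >> 2**31 - 1
    print("bias saturates int32; 1.20.1 now logs a warning instead of clipping silently")
q = np.clip(q, i32_min, i32_max).astype(np.int32)
```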
@@ -282,6 +310,7 @@ class BaseQuantizer:
          If keep_float_weight is False, quantize the weight, or don't quantize the weight.
          :return: quantized weight name, zero point name, scale name
          """
+         # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there.
          q_weight_name = weight.name + TENSOR_NAME_QUANT_SUFFIX
          zp_name = weight.name + "_zero_point"
          scale_name = weight.name + "_scale"
@@ -303,10 +332,11 @@ class BaseQuantizer:
              assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
 
          else:
-             _, _, zero_point, scale, q_weight_data = quantize_data(
+             symmetric = self.is_weight_symmetric(qType) if qType == self.weight_qType else self.is_activation_symmetric
+             zero_point, scale, q_weight_data = quantize_data(
                  weight_data.flatten(),
                  qType,
-                 quant_overrides.get("symmetric", self.is_weight_symmetric),
+                 quant_overrides.get("symmetric", symmetric),
                  reduce_range=quant_overrides.get("reduce_range", self.reduce_range and reduce_range),
                  min_real_range=self.min_real_range,
                  rmin_override=quant_overrides.get("rmin"),
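This call-site change tracks a companion change in `quant_utils.quantize_data` (not shown in this excerpt): it now returns a 3-tuple `(zero_point, scale, quantized_data)` instead of the old 5-tuple whose two leading values (the computed rmin/rmax, as I recall) were discarded here anyway. A hedged sketch of the new call shape, with the import path assumed:

```python
import numpy as np
import onnx

from onnxruntime.quantization.quant_utils import quantize_data  # import path assumed

data = np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32)
# 1.20.0: rmin, rmax, zero_point, scale, q_data = quantize_data(...)
zero_point, scale, q_data = quantize_data(data, onnx.TensorProto.INT8, True)  # symmetric=True
```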
@@ -371,6 +401,7 @@ class BaseQuantizer:
          reduce_range=True,
          keep_float_weight=False,
      ):
+         # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there.
          initializer = find_by_name(weight_name, self.model.initializer())
          if initializer is None:
              raise ValueError("{} is not an initializer", weight_name)
@@ -409,13 +440,7 @@ class BaseQuantizer:
          if "quant_type" in quant_overrides_for_channels[0]:
              weight_qType = quant_overrides_for_channels[0]["quant_type"].tensor_type  # noqa: N806
 
-         symmetric = quant_overrides_for_channels[0].get(
-             "symmetric",
-             (
-                 self.is_weight_symmetric
-                 or weight_qType in (onnx.TensorProto.INT8, onnx.TensorProto.FLOAT8E4M3FN, onnx.TensorProto.INT4)
-             ),
-         )
+         symmetric = quant_overrides_for_channels[0].get("symmetric", self.is_weight_symmetric(weight_qType))
          reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range)
          zero_point_list = []
          scale_list = []
@@ -444,7 +469,7 @@ class BaseQuantizer:
                  ), f"Unexpected type {type(quantized_per_channel_data)}"
 
              else:
-                 _, _, zero_point, scale, quantized_per_channel_data = quantize_data(
+                 zero_point, scale, quantized_per_channel_data = quantize_data(
                      per_channel_data.flatten(),
                      weight_qType,
                      symmetric,
@@ -529,4 +554,6 @@ class BaseQuantizer:
                  self.tensors_range[node.input[0]] = td
              # Adjust Softmax to range from 0.0 to 1.0
              elif node.op_type == "Softmax":
+                 if not self.should_quantize_node(node):
+                     continue
                  self.tensors_range[node.output[0]] = TensorData(lowest=np.float32(0.0), highest=np.float32(1.0))
onnxruntime/quantization/onnx_model.py CHANGED
@@ -296,6 +296,26 @@ class ONNXModel:
 
          return suffix
 
+     def get_largest_initializer_name_suffix(self, initializer_name_prefix):
+         """
+         Gets the largest initializer name integer suffix for all initializer names that begin
+         with `initializer_name_prefix`. This can be used to create unique initializer names.
+
+         Example: for initializer names 'my_weight_0' and 'my_weight_3', this method returns 3 if
+         `initializer_name_prefix` is 'my_weight_'.
+         """
+         suffix = -1
+
+         for initializer in self.model.graph.initializer:
+             if initializer.name.startswith(initializer_name_prefix):
+                 try:
+                     index = int(initializer.name[len(initializer_name_prefix) :])
+                     suffix = max(index, suffix)
+                 except ValueError:
+                     continue
+
+         return suffix
+
      def find_nodes_by_initializer(self, graph, initializer):
          """
          Find all nodes with given initializer as an input.
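A short usage sketch for the new helper; the graph construction and import path are illustrative:

```python
import numpy as np
from onnx import helper, numpy_helper

from onnxruntime.quantization.onnx_model import ONNXModel  # import path assumed

graph = helper.make_graph(
    nodes=[],
    name="g",
    inputs=[],
    outputs=[],
    initializer=[
        numpy_helper.from_array(np.zeros(1, dtype=np.float32), "my_weight_0"),
        numpy_helper.from_array(np.zeros(1, dtype=np.float32), "my_weight_3"),
    ],
)
model = ONNXModel(helper.make_model(graph))

# Largest existing suffix is 3, so the next unique name would be "my_weight_4".
assert model.get_largest_initializer_name_suffix("my_weight_") == 3
```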
onnxruntime/quantization/operators/pad.py CHANGED
@@ -1,3 +1,12 @@
+ # --------------------------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # Licensed under the MIT License.
+ # --------------------------------------------------------------------------
+ from __future__ import annotations
+
+ from typing import Any
+
+ import numpy as np
  import onnx
 
  from ..quant_utils import (
@@ -8,6 +17,7 @@ from ..quant_utils import (
      quantize_nparray,
  )
  from .base_operator import QuantOperatorBase
+ from .qdq_base_operator import QDQOperatorBase
 
 
  class QPad(QuantOperatorBase):
@@ -98,3 +108,65 @@ class QPad(QuantOperatorBase):
          node.input[0] = quantized_input_value.q_name
          node.output[0] = quantized_output_value.q_name
          self.quantizer.new_nodes += [node]
+
+
+ class QDQPad(QDQOperatorBase):
+     def __init__(self, onnx_quantizer, onnx_node):
+         super().__init__(onnx_quantizer, onnx_node)
+
+     def _get_pad_const_val(self, attrs_dict: dict[str, Any]) -> np.ndarray | None:
+         """
+         Returns the Pad's constant padding value. Returns `None` if the padding value is
+         not constant (i.e., comes from a dynamic input).
+         """
+         const_val = None
+         onnx_tensor_type = self.quantizer.model.get_tensor_type(self.node.input[0])
+         if onnx_tensor_type is None:
+             return None
+
+         np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx_tensor_type.elem_type)
+         if self.quantizer.opset_version < 11:
+             const_val = np.array(attrs_dict.get("value", 0), dtype=np_dtype)
+         elif len(self.node.input) >= 3 and self.node.input[2]:
+             const_val = self.quantizer.model.get_constant_value(self.node.input[2])
+         else:
+             const_val = np.array(0, dtype=np_dtype)
+
+         return const_val
+
+     def _should_quantize_output_same_as_input(self) -> bool:
+         """
+         Returns true if Pad's output should use the same quantization parameters as input[0]
+         """
+         attrs_dict = {}
+         for attribute in self.node.attribute:
+             kv = attribute_to_kwarg(attribute)
+             attrs_dict.update(kv)
+
+         pad_mode = attrs_dict.get("mode", b"constant")
+         if pad_mode in (b"reflect", b"edge", b"wrap"):
+             # These modes pad the output with a value that already exists in the input.
+             # So, we can quantize the output the same as the input.
+             return True
+
+         # For 'constant' mode, if padding with 0, we can also quantize the output the same as the input
+         # because our quantization floating-point range always includes 0.
+         if pad_mode == b"constant":
+             pad_val = self._get_pad_const_val(attrs_dict)
+             if pad_val is not None and pad_val.dtype in (np.float32, np.float16):
+                 return float(pad_val.item()) == 0
+
+         return False
+
+     def quantize(self):
+         assert self.node.op_type == "Pad"
+
+         for input_name in self.node.input:
+             if input_name:
+                 self.quantizer.quantize_activation_tensor(input_name)
+
+         if not self.disable_qdq_for_node_output:
+             if self._should_quantize_output_same_as_input():
+                 self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
+             else:
+                 self.quantizer.quantize_activation_tensor(self.node.output[0])
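The design point worth calling out in `QDQPad`: the output can reuse input[0]'s quantization parameters whenever padding introduces no value outside the input's calibrated range. The `reflect`/`edge`/`wrap` modes only replicate values already present in the input, and a constant pad of exactly 0 is safe because the calibrated floating-point range always includes 0. A standalone paraphrase of that decision (not the library's code):

```python
def output_shares_input_qparams(pad_mode: bytes, const_val: float | None) -> bool:
    """Paraphrase of QDQPad._should_quantize_output_same_as_input."""
    if pad_mode in (b"reflect", b"edge", b"wrap"):
        return True  # padding only copies values that already exist in the input
    if pad_mode == b"constant" and const_val is not None:
        return const_val == 0.0  # 0 is always inside the calibrated float range
    return False


assert output_shares_input_qparams(b"edge", None)
assert output_shares_input_qparams(b"constant", 0.0)
assert not output_shares_input_qparams(b"constant", 1.5)  # needs its own qparams
```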