onnxruntime-directml 1.18.0-cp39-cp39-win_amd64.whl → 1.19.0-cp39-cp39-win_amd64.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (59)
  1. onnxruntime/ThirdPartyNotices.txt +1 -1
  2. onnxruntime/__init__.py +1 -1
  3. onnxruntime/capi/DirectML.dll +0 -0
  4. onnxruntime/capi/onnxruntime.dll +0 -0
  5. onnxruntime/capi/onnxruntime_inference_collection.py +13 -5
  6. onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
  7. onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
  8. onnxruntime/capi/onnxruntime_validation.py +8 -9
  9. onnxruntime/quantization/base_quantizer.py +41 -8
  10. onnxruntime/quantization/calibrate.py +68 -13
  11. onnxruntime/quantization/execution_providers/qnn/quant_config.py +14 -3
  12. onnxruntime/quantization/matmul_4bits_quantizer.py +212 -87
  13. onnxruntime/quantization/operators/activation.py +4 -2
  14. onnxruntime/quantization/operators/direct_q8.py +1 -1
  15. onnxruntime/quantization/operators/norm.py +1 -1
  16. onnxruntime/quantization/qdq_quantizer.py +21 -8
  17. onnxruntime/quantization/quant_utils.py +64 -2
  18. onnxruntime/quantization/quantize.py +28 -1
  19. onnxruntime/quantization/registry.py +1 -0
  20. onnxruntime/tools/check_onnx_model_mobile_usability.py +6 -26
  21. onnxruntime/tools/mobile_helpers/check_model_can_use_ort_mobile_pkg.py +2 -1
  22. onnxruntime/tools/mobile_helpers/coreml_supported_mlprogram_ops.md +31 -0
  23. onnxruntime/tools/mobile_helpers/{coreml_supported_ops.md → coreml_supported_neuralnetwork_ops.md} +1 -1
  24. onnxruntime/tools/mobile_helpers/usability_checker.py +292 -144
  25. onnxruntime/tools/pytorch_export_contrib_ops.py +1 -1
  26. onnxruntime/tools/reduced_build_config_parser.py +1 -1
  27. onnxruntime/tools/symbolic_shape_infer.py +44 -24
  28. onnxruntime/transformers/benchmark.py +2 -2
  29. onnxruntime/transformers/bert_test_data.py +3 -3
  30. onnxruntime/transformers/fusion_attention.py +4 -1
  31. onnxruntime/transformers/fusion_attention_clip.py +60 -21
  32. onnxruntime/transformers/fusion_layernorm.py +23 -10
  33. onnxruntime/transformers/fusion_quickgelu.py +74 -0
  34. onnxruntime/transformers/fusion_simplified_layernorm.py +18 -0
  35. onnxruntime/transformers/fusion_utils.py +1 -1
  36. onnxruntime/transformers/large_model_exporter.py +1 -1
  37. onnxruntime/transformers/models/gpt2/benchmark_gpt2.py +1 -1
  38. onnxruntime/transformers/models/gpt2/convert_to_onnx.py +1 -1
  39. onnxruntime/transformers/models/gpt2/gpt2_tester.py +1 -1
  40. onnxruntime/transformers/models/llama/benchmark.py +8 -2
  41. onnxruntime/transformers/models/llama/benchmark_e2e.py +40 -15
  42. onnxruntime/transformers/models/llama/convert_to_onnx.py +27 -21
  43. onnxruntime/transformers/models/llama/llama_inputs.py +16 -26
  44. onnxruntime/transformers/models/llama/llama_parity.py +2 -2
  45. onnxruntime/transformers/models/llama/llama_torch.py +6 -3
  46. onnxruntime/transformers/models/stable_diffusion/engine_builder.py +1 -1
  47. onnxruntime/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +2 -2
  48. onnxruntime/transformers/onnx_model_bert.py +3 -0
  49. onnxruntime/transformers/onnx_model_clip.py +1 -0
  50. onnxruntime/transformers/onnx_model_phi.py +1 -1
  51. onnxruntime/transformers/onnx_model_t5.py +1 -1
  52. onnxruntime/transformers/onnx_model_tnlr.py +1 -1
  53. onnxruntime/transformers/optimizer.py +1 -1
  54. onnxruntime/transformers/shape_optimizer.py +1 -1
  55. {onnxruntime_directml-1.18.0.dist-info → onnxruntime_directml-1.19.0.dist-info}/METADATA +7 -2
  56. {onnxruntime_directml-1.18.0.dist-info → onnxruntime_directml-1.19.0.dist-info}/RECORD +59 -56
  57. {onnxruntime_directml-1.18.0.dist-info → onnxruntime_directml-1.19.0.dist-info}/WHEEL +1 -1
  58. {onnxruntime_directml-1.18.0.dist-info → onnxruntime_directml-1.19.0.dist-info}/entry_points.txt +0 -0
  59. {onnxruntime_directml-1.18.0.dist-info → onnxruntime_directml-1.19.0.dist-info}/top_level.txt +0 -0
onnxruntime/ThirdPartyNotices.txt CHANGED
@@ -4820,7 +4820,7 @@ SOFTWARE.
 
 ----------------------------------------------------------------------------
 
- This is the MIT/Expat Licence. For more information see:
+ This is the MIT/Expat License. For more information see:
 
 1. http://www.opensource.org/licenses/mit-license.php
 
onnxruntime/__init__.py CHANGED
@@ -7,7 +7,7 @@ ONNX Runtime is a performance-focused scoring engine for Open Neural Network Exc
 For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
 or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
 """
-__version__ = "1.18.0"
+__version__ = "1.19.0"
 __author__ = "Microsoft"
 
 # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
onnxruntime/capi/DirectML.dll CHANGED (binary file)
onnxruntime/capi/onnxruntime.dll CHANGED (binary file)
onnxruntime/capi/onnxruntime_inference_collection.py CHANGED
@@ -438,10 +438,18 @@ class InferenceSession(Session):
 
         # Tensorrt can fall back to CUDA if it's explicitly assigned. All others fall back to CPU.
         if "TensorrtExecutionProvider" in available_providers:
-            if providers and any(
-                provider == "CUDAExecutionProvider"
-                or (isinstance(provider, tuple) and provider[0] == "CUDAExecutionProvider")
-                for provider in providers
+            if (
+                providers
+                and any(
+                    provider == "CUDAExecutionProvider"
+                    or (isinstance(provider, tuple) and provider[0] == "CUDAExecutionProvider")
+                    for provider in providers
+                )
+                and any(
+                    provider == "TensorrtExecutionProvider"
+                    or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
+                    for provider in providers
+                )
             ):
                 self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
             else:
@@ -646,7 +654,7 @@ class IOBinding:
         return self._iobinding.get_outputs()
 
     def copy_outputs_to_cpu(self):
-        """Copy output contents to CPU (if on another device). No-op if already on the CPU."""
+        """Copy output contents to CPU."""
         return self._iobinding.copy_outputs_to_cpu()
 
     def clear_binding_inputs(self):
onnxruntime/capi/onnxruntime_validation.py CHANGED
@@ -24,8 +24,7 @@ def check_distro_info():
 
         if __my_distro_ver__ not in ["10", "11"]:
             warnings.warn(
-                "Unsupported Windows version (%s). ONNX Runtime supports Windows 10 and above, only."
-                % __my_distro_ver__
+                f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only."
             )
     elif __my_system__ == "linux":
         """Although the 'platform' python module for getting Distro information works well on standard OS images
@@ -54,11 +53,11 @@ def check_distro_info():
 
         if int(__my_distro_ver__.split(".")[0]) < 11:
             warnings.warn(
-                "Unsupported macOS version (%s). ONNX Runtime supports macOS 11.0 or later." % (__my_distro_ver__)
+                f"Unsupported macOS version ({__my_distro_ver__}). ONNX Runtime supports macOS 11.0 or later."
             )
     else:
         warnings.warn(
-            "Unsupported platform (%s). ONNX Runtime supports Linux, macOS and Windows platforms, only." % __my_system__
+            f"Unsupported platform ({__my_system__}). ONNX Runtime supports Linux, macOS and Windows platforms, only."
         )
 
 
@@ -115,10 +114,10 @@ def validate_build_package_info():
                 cudart_version = None
 
                 def print_build_package_info():
-                    warnings.warn("onnxruntime training package info: package_name: %s" % package_name)
-                    warnings.warn("onnxruntime training package info: __version__: %s" % version)
-                    warnings.warn("onnxruntime training package info: cuda_version: %s" % cuda_version)
-                    warnings.warn("onnxruntime build info: cudart_version: %s" % cudart_version)
+                    warnings.warn(f"onnxruntime training package info: package_name: {package_name}")
+                    warnings.warn(f"onnxruntime training package info: __version__: {version}")
+                    warnings.warn(f"onnxruntime training package info: cuda_version: {cuda_version}")
+                    warnings.warn(f"onnxruntime build info: cudart_version: {cudart_version}")
 
                 # collection cuda library info from current environment.
                 from onnxruntime.capi.onnxruntime_collect_build_info import find_cudart_versions
@@ -127,7 +126,7 @@ def validate_build_package_info():
                 if cudart_version and local_cudart_versions and cudart_version not in local_cudart_versions:
                     print_build_package_info()
                     warnings.warn("WARNING: failed to find cudart version that matches onnxruntime build info")
-                    warnings.warn("WARNING: found cudart versions: %s" % local_cudart_versions)
+                    warnings.warn(f"WARNING: found cudart versions: {local_cudart_versions}")
             else:
                 # TODO: rcom
                 pass
onnxruntime/quantization/base_quantizer.py CHANGED
@@ -25,6 +25,7 @@ from .quant_utils import (
     find_by_name,
     model_has_infer_metadata,
     normalize_axis,
+    pack_bytes_to_4bit,
     quantize_data,
     quantize_nparray,
     save_and_reload_model_with_shape_infer,
@@ -339,6 +340,18 @@ class BaseQuantizer:
                     f"{q_weight_data.tobytes()[:10]}, got {check.tobytes()[:10]} and shape={weight.shape}"
                     f"\nraw={str(q_weight_initializer)[:200]}."
                 )
+        elif qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
+            if q_weight_data.dtype not in (np.int8, np.uint8):
+                raise RuntimeError(
+                    f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
+                )
+
+            # We do not use onnx.helper.pack_float32_to_4bit() due to performance.
+            # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
+            packed_data = bytes(pack_bytes_to_4bit(q_weight_data.tobytes()))
+
+            # We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161
+            q_weight_initializer = onnx.helper.make_tensor(q_weight_name, qType, weight.dims, packed_data, raw=True)
         else:
             q_weight_data = np.asarray(q_weight_data, dtype=onnx.helper.tensor_dtype_to_np_dtype(qType)).reshape(
                 weight.dims
@@ -396,7 +409,10 @@ class BaseQuantizer:
 
         symmetric = quant_overrides_for_channels[0].get(
             "symmetric",
-            (self.is_weight_symmetric or weight_qType in (onnx.TensorProto.INT8, onnx.TensorProto.FLOAT8E4M3FN)),
+            (
+                self.is_weight_symmetric
+                or weight_qType in (onnx.TensorProto.INT8, onnx.TensorProto.FLOAT8E4M3FN, onnx.TensorProto.INT4)
+            ),
         )
         reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range)
         zero_point_list = []
@@ -447,7 +463,8 @@ class BaseQuantizer:
             quantized_per_channel_data_list.append(quantized_per_channel_data)
 
         # combine per_channel_data into one
-        reshape_dims = list(weights.shape)  # deep copy
+        weights_shape = list(weights.shape)
+        reshape_dims = list(weights_shape)  # deep copy
         reshape_dims[channel_axis] = 1  # only one per channel for reshape
         quantized_weights = np.asarray(quantized_per_channel_data_list[0]).reshape(reshape_dims)
         for i in range(1, len(quantized_per_channel_data_list)):
@@ -470,12 +487,28 @@ class BaseQuantizer:
         self.model.initializer_extend([scale_initializer, zero_initializer])
 
         if not keep_float_weight:
-            quantized_weights = np.asarray(
-                quantized_weights,
-                dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[weight_qType],
-            ).reshape(initializer.dims)
-            q_weight_initializer = onnx.numpy_helper.from_array(quantized_weights, q_weight_name)
-            self.model.initializer_extend([q_weight_initializer])
+            if weight_qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
+                if quantized_weights.dtype not in (np.int8, np.uint8):
+                    raise RuntimeError(
+                        f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
+                    )
+
+                # We do not use onnx.helper.pack_float32_to_4bit() due to performance.
+                # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
+                packed_data = bytes(pack_bytes_to_4bit(quantized_weights.tobytes()))
+
+                # We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161
+                q_weight_initializer = onnx.helper.make_tensor(
+                    q_weight_name, weight_qType, weights_shape, packed_data, raw=True
+                )
+                self.model.initializer_extend([q_weight_initializer])
+            else:
+                quantized_weights = np.asarray(
+                    quantized_weights,
+                    dtype=onnx.helper.tensor_dtype_to_np_dtype(weight_qType),
+                ).reshape(initializer.dims)
+                q_weight_initializer = onnx.numpy_helper.from_array(quantized_weights, q_weight_name)
+                self.model.initializer_extend([q_weight_initializer])
 
         return q_weight_name, zp_name, scale_name
 
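`pack_bytes_to_4bit`, newly imported from `quant_utils`, packs two 8-bit elements into one byte so INT4/UINT4 initializers can be stored as raw data. A minimal sketch of the packing scheme, assuming the low-nibble-first ordering ONNX uses for 4-bit tensors (the real helper's signature and internals may differ):

```python
import numpy as np

def pack_bytes_to_4bit_sketch(src: bytes) -> bytearray:
    """Pack the low nibble of each input byte, two elements per output byte.

    Element 2k goes in the low nibble, element 2k+1 in the high nibble;
    an odd trailing element is padded with a zero high nibble.
    """
    arr = np.frombuffer(src, dtype=np.uint8)
    if arr.size % 2:  # pad to an even element count
        arr = np.concatenate([arr, np.zeros(1, dtype=np.uint8)])
    low = arr[0::2] & 0x0F
    high = arr[1::2] & 0x0F
    return bytearray((low | (high << 4)).tobytes())

# Two 8-bit elements collapse into one byte: 0x01, 0x02 -> 0x21.
assert pack_bytes_to_4bit_sketch(b"\x01\x02") == bytearray(b"\x21")
```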
onnxruntime/quantization/calibrate.py CHANGED
@@ -128,6 +128,9 @@ class TensorsData:
     def values(self):
         return self.data.values()
 
+    def items(self):
+        return self.data.items()
+
 
 class CalibrationMethod(Enum):
     MinMax = 0
@@ -155,6 +158,12 @@ class CalibrationDataReader(metaclass=abc.ABCMeta):
             raise StopIteration
         return result
 
+    def __len__(self):
+        raise NotImplementedError
+
+    def set_range(self, start_index: int, end_index: int):
+        raise NotImplementedError
+
 
 class CalibraterBase:
     def __init__(
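`__len__` and `set_range` are new optional hooks: the base class raises `NotImplementedError`, and a reader only needs them to support strided calibration (the `CalibStridedMinMax` option that appears in the QNN config further down). A hypothetical reader, purely as a sketch (the class, field names, and the exact `set_range` contract are illustrative, not from the package):

```python
import numpy as np
from onnxruntime.quantization import CalibrationDataReader

class WindowedDataReader(CalibrationDataReader):
    """Feeds `count` random samples and exposes a sliding window."""

    def __init__(self, input_name: str, count: int = 32):
        self.samples = [
            {input_name: np.random.rand(1, 3, 224, 224).astype(np.float32)}
            for _ in range(count)
        ]
        self.start, self.end = 0, count  # active window
        self.index = 0

    def get_next(self):
        if self.index >= self.end:
            return None  # exhausted: the calibrator stops iterating
        sample = self.samples[self.index]
        self.index += 1
        return sample

    def __len__(self):
        return len(self.samples)

    def set_range(self, start_index: int, end_index: int):
        # Restrict iteration to [start_index, end_index).
        self.start, self.end = start_index, end_index
        self.index = start_index
```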
@@ -164,13 +173,15 @@ class CalibraterBase:
         augmented_model_path="augmented_model.onnx",
         symmetric=False,
         use_external_data_format=False,
+        per_channel=False,
     ):
         """
         :param model_path: ONNX model to calibrate. It should be a model file path
         :param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
         :param augmented_model_path: save augmented model to this path.
         :param symmetric: make range of tensor symmetric (central point is 0).
-        :param use_external_data_format: use external data format to store model which size is >= 2Gb
+        :param use_external_data_format: use external data format to store model which size is >= 2Gb.
+        :param per_channel: whether to compute ranges per each channel.
         """
         if isinstance(model_path, str):
             self.model = load_model_with_shape_infer(Path(model_path))
@@ -183,6 +194,7 @@ class CalibraterBase:
         self.augmented_model_path = augmented_model_path
         self.symmetric = symmetric
         self.use_external_data_format = use_external_data_format
+        self.per_channel = per_channel
 
         self.augment_model = None
         self.infer_session = None
@@ -274,6 +286,7 @@ class MinMaxCalibrater(CalibraterBase):
         moving_average=False,
         averaging_constant=0.01,
         max_intermediate_outputs=None,
+        per_channel=False,
     ):
         """
         :param model_path: ONNX model to calibrate. It is a model path
@@ -284,6 +297,7 @@ class MinMaxCalibrater(CalibraterBase):
         :param moving_average: compute the moving average of the minimum and maximum values instead of the global minimum and maximum.
         :param averaging_constant: constant smoothing factor to use when computing the moving average.
         :param max_intermediate_outputs: maximum number of intermediate outputs before an intermediate range is computed.
+        :param per_channel: whether to compute ranges per each channel.
         """
         super().__init__(
             model_path,
@@ -291,6 +305,7 @@ class MinMaxCalibrater(CalibraterBase):
             augmented_model_path=augmented_model_path,
             symmetric=symmetric,
             use_external_data_format=use_external_data_format,
+            per_channel=per_channel,
         )
         self.intermediate_outputs = []
         self.calibrate_tensors_range = None
@@ -310,9 +325,15 @@ class MinMaxCalibrater(CalibraterBase):
         """
         tensors, _ = self.select_tensors_to_calibrate(self.model)
         reshape_shape_name = str(uuid.uuid4())
-        reshape_shape = numpy_helper.from_array(np.array([1], dtype=np.int64), reshape_shape_name)
+        reshape_shape = numpy_helper.from_array(np.array([-1], dtype=np.int64), reshape_shape_name)
         self.model.graph.initializer.append(reshape_shape)
 
+        def get_op_version(op_type, model):
+            for opset_import in model.opset_import:
+                if onnx.defs.has(op_type, opset_import.domain):
+                    return opset_import.version
+            raise RuntimeError(f"Model does not contain a version for '{op_type}'.")
+
         def add_reduce_min_max(tensor_name, reduce_op_name):
             # When doing ReduceMax/ReduceMin, ORT can't reduce on dim with value of 0 if 'keepdims' is false.
             # To make the code simple, we always let keepdims to be 1.
@@ -332,7 +353,6 @@ class MinMaxCalibrater(CalibraterBase):
                 name=intermediate_output,
             )
 
-            self.model.graph.node.extend([reduce_node, reshape_node])
             value_infos = {vi.name: vi for vi in self.model.graph.value_info}
             value_infos.update({o.name: o for o in self.model.graph.output})
             value_infos.update({i.name: i for i in self.model.graph.input})
@@ -343,7 +363,22 @@ class MinMaxCalibrater(CalibraterBase):
                     f"Unable to guess tensor type for tensor {tensor_name!r}, "
                     f"running shape inference before quantization may resolve this issue."
                 )
-            self.model.graph.output.append(helper.make_tensor_value_info(reduce_output, onnx_type, [1]))
+
+            # Include axes in reduce_op when per_channel, always keeping axis=1
+            if self.per_channel:
+                tensor_rank = len(value_infos[tensor_name].type.tensor_type.shape.dim)
+                reduced_axes = [0, *range(2, tensor_rank)]
+                # Depending on opset version, axes in ReduceMin/ReduceMax are in attribute or inputs
+                if get_op_version(reduce_op_name, self.model) < 18:
+                    reduce_node.attribute.append(helper.make_attribute("axes", reduced_axes))
+                else:
+                    reduce_axes_name = str(uuid.uuid4())
+                    reduce_axes = numpy_helper.from_array(np.array(reduced_axes, dtype=np.int64), reduce_axes_name)
+                    reduce_node.input.append(reduce_axes_name)
+                    self.model.graph.initializer.append(reduce_axes)
+
+            self.model.graph.node.extend([reduce_node, reshape_node])
+            self.model.graph.output.append(helper.make_tensor_value_info(reduce_output, onnx_type, [None]))
 
         for tensor in tensors:
             add_reduce_min_max(tensor, "ReduceMin")
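The opset check above exists because ReduceMin/ReduceMax moved `axes` from a node attribute to an optional second input in opset 18. A small sketch of building the node both ways with `onnx.helper` (tensor names here are illustrative):

```python
import numpy as np
import onnx
from onnx import helper, numpy_helper

axes = [0, 2, 3]  # reduce everything except the channel axis (axis=1)

# Opset < 18: axes live in an attribute.
node_pre18 = helper.make_node(
    "ReduceMax", inputs=["activation"], outputs=["act_max"],
    keepdims=1, axes=axes,
)

# Opset >= 18: axes arrive as a second input, typically an initializer
# that must also be appended to graph.initializer.
axes_init = numpy_helper.from_array(np.array(axes, dtype=np.int64), "act_axes")
node_18plus = helper.make_node(
    "ReduceMax", inputs=["activation", "act_axes"], outputs=["act_max"],
    keepdims=1,
)
```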
@@ -383,13 +418,31 @@
             return new_range
 
         for key, value in old_range.items():
+            # Handling for structured data types with TensorData
+            if isinstance(value, TensorData):
+                old_min = value.range_value[0]
+                old_max = value.range_value[1]
+            else:
+                old_min, old_max = value
+
+            if isinstance(new_range[key], TensorData):
+                new_min = new_range[key].range_value[0]
+                new_max = new_range[key].range_value[1]
+            else:
+                new_min, new_max = new_range[key]
+
             if self.moving_average:
-                min_value = value[0] + self.averaging_constant * (new_range[key][0] - value[0])
-                max_value = value[1] + self.averaging_constant * (new_range[key][1] - value[1])
+                min_value = old_min + self.averaging_constant * (new_min - old_min)
+                max_value = old_max + self.averaging_constant * (new_max - old_max)
+            else:
+                min_value = min(old_min, new_min)
+                max_value = max(old_max, new_max)
+
+            # If structured as TensorData, wrap the result accordingly
+            if isinstance(value, TensorData) or isinstance(new_range[key], TensorData):
+                new_range[key] = TensorData(lowest=min_value, highest=max_value)
             else:
-                min_value = min(value[0], new_range[key][0])
-                max_value = max(value[1], new_range[key][1])
-            new_range[key] = (min_value, max_value)
+                new_range[key] = (min_value, max_value)
 
         return new_range
 
@@ -430,7 +483,7 @@ class MinMaxCalibrater(CalibraterBase):
             max_value_array = np.max(merged_added_output_dict[added_output_names[i + 1]], axis=0)
 
             if self.symmetric:
-                max_absolute_value = max(np.abs(min_value_array), np.abs(max_value_array))
+                max_absolute_value = np.max([np.abs(min_value_array), np.abs(max_value_array)], axis=0)
                 pairs.append(tuple([-max_absolute_value, max_absolute_value]))
             else:
                 pairs.append(tuple([min_value_array, max_value_array]))
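The builtin `max` worked while `min_value_array` and `max_value_array` were scalars, but per-channel calibration makes them arrays, and comparing multi-element arrays with the builtin raises `ValueError` (ambiguous truth value); `np.max` over the stacked pair reduces elementwise instead. A quick illustration:

```python
import numpy as np

mins = np.array([-3.0, -0.5])  # per-channel minima
maxs = np.array([1.0, 2.5])    # per-channel maxima

# max(np.abs(mins), np.abs(maxs)) would raise ValueError here.
abs_max = np.max([np.abs(mins), np.abs(maxs)], axis=0)
print(abs_max)  # [3.  2.5] -- elementwise, one value per channel
```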
@@ -759,7 +812,7 @@ class HistogramCollector(CalibrationDataCollector):
                 hist_edges = hist_edges.astype(data_arr_np.dtype)
                 assert (
                     data_arr_np.dtype != np.float64
-                ), "only float32 or float16 is supported, every constant must be explicetly typed"
+                ), "only float32 or float16 is supported, every constant must be explicitly typed"
                 self.histogram_dict[tensor] = (hist, hist_edges, min_value, max_value)
             else:
                 old_histogram = self.histogram_dict[tensor]
@@ -781,7 +834,7 @@ class HistogramCollector(CalibrationDataCollector):
                 hist[: len(old_hist)] += old_hist
                 assert (
                     data_arr_np.dtype != np.float64
-                ), "only float32 or float16 is supported, every constant must be explicetly typed"
+                ), "only float32 or float16 is supported, every constant must be explicitly typed"
                 self.histogram_dict[tensor] = (hist, hist_edges, min(old_min, min_value), max(old_max, max_value))
 
     def collect_value(self, name_to_arr):
@@ -1023,7 +1076,7 @@ class HistogramCollector(CalibrationDataCollector):
 
         for i in range(num_half_quantized_bin, zero_bin_index + 1, 1):
             start_index = zero_bin_index - i
-            end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins
+            end_index = min(zero_bin_index + i + 1, num_bins)
 
             thresholds[i - num_half_quantized_bin] = (hist_edges[start_index], hist_edges[end_index])
 
@@ -1097,6 +1150,7 @@
         moving_average = extra_options.get("moving_average", False)
         averaging_constant = extra_options.get("averaging_constant", 0.01)
         max_intermediate_outputs = extra_options.get("max_intermediate_outputs", None)
+        per_channel = extra_options.get("per_channel", False)
         calibrator = MinMaxCalibrater(
             model,
             op_types_to_calibrate,
@@ -1106,6 +1160,7 @@
             moving_average=moving_average,
             averaging_constant=averaging_constant,
             max_intermediate_outputs=max_intermediate_outputs,
+            per_channel=per_channel,
         )
     elif calibrate_method == CalibrationMethod.Entropy:
         # default settings for entropy algorithm
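Taken together, the calibration changes are driven through `extra_options`. A hedged usage sketch (the model path is a placeholder, and `reader` stands for any `CalibrationDataReader`, e.g. the sketch earlier):

```python
from onnxruntime.quantization.calibrate import CalibrationMethod, create_calibrator

calibrator = create_calibrator(
    "model.onnx",
    calibrate_method=CalibrationMethod.MinMax,
    extra_options={
        "symmetric": False,
        "per_channel": True,  # new in 1.19: per-channel min/max ranges
    },
)
calibrator.collect_data(reader)            # reader: any CalibrationDataReader
tensors_range = calibrator.compute_data()  # TensorsData; .items() is new here
```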
onnxruntime/quantization/execution_providers/qnn/quant_config.py CHANGED
@@ -21,6 +21,7 @@ from .mixed_precision_overrides_utils import MixedPrecisionTensorQuantOverridesF
 
 Q16_TYPES = {QuantType.QInt16, QuantType.QUInt16}
 Q8_TYPES = {QuantType.QInt8, QuantType.QUInt8}
+Q4_TYPES = {QuantType.QInt4, QuantType.QUInt4}
 OP_TYPES_TO_EXCLUDE = {"Cast"}
 MODEL_SIZE_THRESHOLD = 2147483648  # Quant model should use external data if >= 2GB
 
@@ -50,6 +51,8 @@ def get_qnn_qdq_config(
     add_qtype_converts: bool = True,
     activation_symmetric: bool = False,
     weight_symmetric: bool | None = None,
+    keep_removable_activations: bool = False,
+    stride: int | None = None,
 ) -> StaticQuantConfig:
     """
     Returns a static quantization configuration suitable for running QDQ models on QNN EP.
@@ -109,6 +112,11 @@ def get_qnn_qdq_config(
             the zero-point values are 128 and 32,768, respectively.
         weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
             Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
+        keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
+            be removed, and will be explicitly represented in the QDQ model. If false, these activations
+            are automatically removed if activations are asymmetrically quantized. Keeping these activations
+            is necessary if optimizations or EP transformations will later remove
+            QuantizeLinear/DequantizeLinear operators from the model.
 
     Returns:
         A StaticQuantConfig object
@@ -160,17 +168,20 @@ def get_qnn_qdq_config(
     extra_options = {
         "MinimumRealRange": 0.0001,
         "DedicatedQDQPair": False,  # Let ORT optimizer duplicate DQ nodes
+        "QDQKeepRemovableActivations": keep_removable_activations,
         "TensorQuantOverrides": overrides_helper.get_dict(),
         "ActivationSymmetric": activation_symmetric,
        "WeightSymmetric": weight_symmetric,
+        "CalibStridedMinMax": stride,
     }
 
     # ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
-    # on Q/DQ operators if using 16-bit quantization.
+    # on Q/DQ operators if using 16-bit or 4-bit quantization.
     onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
     if onnx_opset.version < 21:
-        overrides_have_int16 = any(t in Q16_TYPES for t in overrides_helper.get_quant_types())
-        if activation_type in Q16_TYPES or weight_type in Q16_TYPES or overrides_have_int16:
+        opset21_types = Q16_TYPES.union(Q4_TYPES)
+        overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
+        if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
             extra_options["UseQDQContribOps"] = True
 
     return StaticQuantConfig(
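The new parameters surface directly on `get_qnn_qdq_config`. A hedged usage sketch (paths and `reader` are placeholders; per this hunk, 16-bit or 4-bit types below opset 21 flip the config to 'com.microsoft' Q/DQ ops):

```python
from onnxruntime.quantization import QuantType, quantize
from onnxruntime.quantization.execution_providers.qnn import get_qnn_qdq_config

qnn_config = get_qnn_qdq_config(
    "model.onnx",
    reader,                           # any CalibrationDataReader
    activation_type=QuantType.QUInt16,
    weight_type=QuantType.QInt4,      # Q4_TYPES trigger UseQDQContribOps below opset 21
    keep_removable_activations=True,  # new: keep Clip/Relu explicit in the QDQ model
    stride=64,                        # new: forwarded as CalibStridedMinMax
)
quantize("model.onnx", "model.qdq.onnx", qnn_config)
```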