onnxruntime-directml 1.19.2-cp310-cp310-win_amd64.whl → 1.20.0-cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. onnxruntime/__init__.py +3 -1
  2. onnxruntime/capi/DirectML.dll +0 -0
  3. onnxruntime/capi/convert_npz_to_onnx_adapter.py +48 -0
  4. onnxruntime/capi/onnxruntime.dll +0 -0
  5. onnxruntime/capi/onnxruntime_collect_build_info.py +0 -56
  6. onnxruntime/capi/onnxruntime_inference_collection.py +78 -6
  7. onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
  8. onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
  9. onnxruntime/capi/onnxruntime_validation.py +7 -1
  10. onnxruntime/quantization/base_quantizer.py +8 -12
  11. onnxruntime/quantization/calibrate.py +34 -2
  12. onnxruntime/quantization/matmul_4bits_quantizer.py +662 -39
  13. onnxruntime/quantization/operators/gather.py +1 -1
  14. onnxruntime/quantization/operators/matmul.py +8 -5
  15. onnxruntime/quantization/qdq_quantizer.py +1 -2
  16. onnxruntime/quantization/quant_utils.py +30 -5
  17. onnxruntime/quantization/quantize.py +14 -3
  18. onnxruntime/quantization/registry.py +1 -0
  19. onnxruntime/quantization/tensor_quant_overrides.py +2 -2
  20. onnxruntime/tools/mobile_helpers/coreml_supported_mlprogram_ops.md +7 -5
  21. onnxruntime/tools/mobile_helpers/coreml_supported_neuralnetwork_ops.md +1 -1
  22. onnxruntime/transformers/benchmark.py +1 -1
  23. onnxruntime/transformers/constants.py +2 -2
  24. onnxruntime/transformers/convert_generation.py +1 -1
  25. onnxruntime/transformers/convert_to_packing_mode.py +10 -10
  26. onnxruntime/transformers/fusion_attention_sam2.py +534 -0
  27. onnxruntime/transformers/fusion_gelu.py +12 -3
  28. onnxruntime/transformers/fusion_layernorm.py +158 -2
  29. onnxruntime/transformers/fusion_rotary_attention.py +1 -1
  30. onnxruntime/transformers/fusion_skiplayernorm.py +26 -17
  31. onnxruntime/transformers/io_binding_helper.py +7 -4
  32. onnxruntime/transformers/machine_info.py +0 -2
  33. onnxruntime/transformers/models/gpt2/convert_to_onnx.py +5 -1
  34. onnxruntime/transformers/models/llama/benchmark_all.py +1 -5
  35. onnxruntime/transformers/models/llama/convert_to_onnx.py +25 -58
  36. onnxruntime/transformers/models/sam2/__init__.py +12 -0
  37. onnxruntime/transformers/models/sam2/benchmark_sam2.py +625 -0
  38. onnxruntime/transformers/models/sam2/convert_to_onnx.py +260 -0
  39. onnxruntime/transformers/models/sam2/image_decoder.py +273 -0
  40. onnxruntime/transformers/models/sam2/image_encoder.py +186 -0
  41. onnxruntime/transformers/models/sam2/mask_decoder.py +208 -0
  42. onnxruntime/transformers/models/sam2/nvtx_helper.py +33 -0
  43. onnxruntime/transformers/models/sam2/prompt_encoder.py +189 -0
  44. onnxruntime/transformers/models/sam2/sam2_demo.py +322 -0
  45. onnxruntime/transformers/models/sam2/sam2_image_onnx_predictor.py +280 -0
  46. onnxruntime/transformers/models/sam2/sam2_utils.py +147 -0
  47. onnxruntime/transformers/models/stable_diffusion/engine_builder.py +1 -1
  48. onnxruntime/transformers/models/whisper/benchmark_all.py +1 -5
  49. onnxruntime/transformers/models/whisper/whisper_chain.py +4 -1
  50. onnxruntime/transformers/onnx_model.py +62 -17
  51. onnxruntime/transformers/onnx_model_bert.py +3 -3
  52. onnxruntime/transformers/onnx_model_clip.py +1 -0
  53. onnxruntime/transformers/onnx_model_sam2.py +138 -0
  54. onnxruntime/transformers/optimizer.py +5 -3
  55. {onnxruntime_directml-1.19.2.dist-info → onnxruntime_directml-1.20.0.dist-info}/METADATA +3 -8
  56. {onnxruntime_directml-1.19.2.dist-info → onnxruntime_directml-1.20.0.dist-info}/RECORD +59 -47
  57. onnxruntime/tools/mobile_helpers/check_model_can_use_ort_mobile_pkg.py +0 -301
  58. onnxruntime/tools/mobile_helpers/mobile_package.required_operators.config +0 -46
  59. {onnxruntime_directml-1.19.2.dist-info → onnxruntime_directml-1.20.0.dist-info}/WHEEL +0 -0
  60. {onnxruntime_directml-1.19.2.dist-info → onnxruntime_directml-1.20.0.dist-info}/entry_points.txt +0 -0
  61. {onnxruntime_directml-1.19.2.dist-info → onnxruntime_directml-1.20.0.dist-info}/top_level.txt +0 -0
onnxruntime/__init__.py CHANGED
@@ -7,7 +7,7 @@ ONNX Runtime is a performance-focused scoring engine for Open Neural Network Exc
 For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
 or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
 """
-__version__ = "1.19.2"
+__version__ = "1.20.0"
 __author__ = "Microsoft"
 
 # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
@@ -23,6 +23,7 @@ try:
     from onnxruntime.capi._pybind_state import ExecutionMode  # noqa: F401
     from onnxruntime.capi._pybind_state import ExecutionOrder  # noqa: F401
     from onnxruntime.capi._pybind_state import GraphOptimizationLevel  # noqa: F401
+    from onnxruntime.capi._pybind_state import LoraAdapter  # noqa: F401
     from onnxruntime.capi._pybind_state import ModelMetadata  # noqa: F401
     from onnxruntime.capi._pybind_state import NodeArg  # noqa: F401
     from onnxruntime.capi._pybind_state import OrtAllocatorType  # noqa: F401
@@ -56,6 +57,7 @@ from onnxruntime.capi import onnxruntime_validation
 if import_capi_exception:
     raise import_capi_exception
 
+from onnxruntime.capi.onnxruntime_inference_collection import AdapterFormat  # noqa: F401
 from onnxruntime.capi.onnxruntime_inference_collection import InferenceSession  # noqa: F401
 from onnxruntime.capi.onnxruntime_inference_collection import IOBinding  # noqa: F401
 from onnxruntime.capi.onnxruntime_inference_collection import OrtDevice  # noqa: F401
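The two added imports surface the new LoRA adapter support at package level. A minimal check of the re-exported names (nothing assumed here beyond what the imports above imply):

import onnxruntime as ort

# Both symbols are re-exported from the top-level package in 1.20.0.
print(ort.AdapterFormat)  # helper for building .onnx_adapter files from Python structures
print(ort.LoraAdapter)    # adapter type exposed from the pybind bindings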
onnxruntime/capi/DirectML.dll CHANGED
Binary file
onnxruntime/capi/convert_npz_to_onnx_adapter.py ADDED
@@ -0,0 +1,48 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+# This script helps converting .npz files to .onnx_adapter files
+
+import argparse
+import os
+import sys
+
+import numpy as np
+
+import onnxruntime as ort
+
+
+def get_args() -> argparse:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--npz_file_path", type=str, required=True)
+    parser.add_argument("--output_file_path", type=str, required=True)
+    parser.add_argument("--adapter_version", type=int, required=True)
+    parser.add_argument("--model_version", type=int, required=True)
+    return parser.parse_args()
+
+
+def export_lora_parameters(
+    npz_file_path: os.PathLike, adapter_version: int, model_version: int, output_file_path: os.PathLike
+):
+    """The function converts lora parameters in npz to onnx_adapter format"""
+    adapter_format = ort.AdapterFormat()
+    adapter_format.set_adapter_version(adapter_version)
+    adapter_format.set_model_version(model_version)
+    name_to_ort_value = {}
+    with np.load(npz_file_path) as data:
+        for name, np_arr in data.items():
+            ort_value = ort.OrtValue.ortvalue_from_numpy(np_arr)
+            name_to_ort_value[name] = ort_value
+
+    adapter_format.set_parameters(name_to_ort_value)
+    adapter_format.export_adapter(output_file_path)
+
+
+def main() -> int:
+    args = get_args()
+    export_lora_parameters(args.npz_file_path, args.adapter_version, args.model_version, args.output_file_path)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
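As a usage illustration (the file names and tensor shapes below are invented, not part of the package), the new helper accepts any .npz archive of LoRA tensors and is driven through its argparse interface:

import numpy as np

# Fabricate a tiny LoRA checkpoint the way an exporter might save it.
np.savez(
    "my_lora.npz",
    **{
        "layers.0.attn.lora_A": np.zeros((16, 768), dtype=np.float32),
        "layers.0.attn.lora_B": np.zeros((768, 16), dtype=np.float32),
    },
)

# The script guards main() with __name__ == "__main__", so it should be runnable as a module, e.g.:
#   python -m onnxruntime.capi.convert_npz_to_onnx_adapter \
#       --npz_file_path my_lora.npz \
#       --output_file_path my_lora.onnx_adapter \
#       --adapter_version 1 --model_version 1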
onnxruntime/capi/onnxruntime.dll CHANGED
Binary file
onnxruntime/capi/onnxruntime_collect_build_info.py CHANGED
@@ -45,59 +45,3 @@ def find_cudart_versions(build_env=False, build_cuda_version=None):
 
     # convert to list and remove None
     return [ver for ver in cudart_found_versions if ver]
-
-
-def find_cudnn_supported_cuda_versions(build_env=False):
-    # comments in get_cudart_version apply here
-    if not sys.platform.startswith("linux"):
-        warnings.warn("find_cudnn_versions only works on Linux")
-
-    cudnn_possible_versions = {None}
-    if not build_env:
-        # if not in a build environment, there may be more than one installed cudnn.
-        # https://developer.nvidia.com/rdp/cudnn-archive to include all that may support Cuda 10+.
-        cudnn_possible_versions.update(
-            {
-                "8.2",
-                "8.1.1",
-                "8.1.0",
-                "8.0.5",
-                "8.0.4",
-                "8.0.3",
-                "8.0.2",
-                "8.0.1",
-                "7.6.5",
-                "7.6.4",
-                "7.6.3",
-                "7.6.2",
-                "7.6.1",
-                "7.6.0",
-                "7.5.1",
-                "7.5.0",
-                "7.4.2",
-                "7.4.1",
-                "7.3.1",
-                "7.3.0",
-            }
-        )
-
-    def get_cudnn_supported_cuda_version(find_cudnn_version=None):
-        cudnn_lib_filename = "libcudnn.so"
-        if find_cudnn_version:
-            cudnn_lib_filename = cudnn_lib_filename + "." + find_cudnn_version
-
-        # in cudnn.h cudnn version are calculated as:
-        # #define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
-        try:
-            cudnn = ctypes.CDLL(cudnn_lib_filename)
-            # cudnn_ver = cudnn.cudnnGetVersion()
-            cuda_ver = cudnn.cudnnGetCudartVersion()
-            return cuda_ver
-        except Exception:
-            return None
-
-    # use set to avoid duplications
-    cuda_found_versions = {get_cudnn_supported_cuda_version(cudnn_version) for cudnn_version in cudnn_possible_versions}
-
-    # convert to list and remove None
-    return [ver for ver in cuda_found_versions if ver]
onnxruntime/capi/onnxruntime_inference_collection.py CHANGED
@@ -32,6 +32,52 @@ def get_ort_device_type(device_type: str, device_index) -> C.OrtDevice:
         raise Exception("Unsupported device type: " + device_type)
 
 
+class AdapterFormat:
+    """
+    This class is used to create adapter files from python structures
+    """
+
+    def __init__(self, adapter=None) -> None:
+        if adapter is None:
+            self._adapter = C.AdapterFormat()
+        else:
+            self._adapter = adapter
+
+    @staticmethod
+    def read_adapter(file_path: os.PathLike) -> AdapterFormat:
+        return AdapterFormat(C.AdapterFormat.read_adapter(file_path))
+
+    def export_adapter(self, file_path: os.PathLike):
+        """
+        This function writes a file at the specified location
+        in onnxrunitme adapter format containing Lora parameters.
+
+        :param file_path: absolute path for the adapter
+        """
+        self._adapter.export_adapter(file_path)
+
+    def get_format_version(self):
+        return self._adapter.format_version
+
+    def set_adapter_version(self, adapter_version: int):
+        self._adapter.adapter_version = adapter_version
+
+    def get_adapter_version(self):
+        return self._adapter.adapter_version
+
+    def set_model_version(self, model_version: int):
+        self._adapter.model_version = model_version
+
+    def get_model_version(self):
+        return self._adapter.model_version
+
+    def set_parameters(self, params: dict[str, OrtValue]):
+        self._adapter.parameters = {k: v._ortvalue for k, v in params.items()}
+
+    def get_parameters(self) -> dict[str, OrtValue]:
+        return {k: OrtValue(v) for k, v in self._adapter.parameters.items()}
+
+
 def check_and_normalize_provider_args(
     providers: Sequence[str | tuple[str, dict[Any, Any]]] | None,
     provider_options: Sequence[dict[Any, Any]] | None,
@@ -556,7 +602,7 @@ class IOBinding:
         :param name: input name
         :param device_type: e.g. cpu, cuda, cann
         :param device_id: device id, e.g. 0
-        :param element_type: input element type
+        :param element_type: input element type. It can be either numpy type (like numpy.float32) or an integer for onnx type (like onnx.TensorProto.BFLOAT16)
         :param shape: input shape
         :param buffer_ptr: memory pointer to input data
         """
@@ -595,7 +641,7 @@ class IOBinding:
         :param name: output name
         :param device_type: e.g. cpu, cuda, cann, cpu by default
         :param device_id: device id, e.g. 0
-        :param element_type: output element type
+        :param element_type: output element type. It can be either numpy type (like numpy.float32) or an integer for onnx type (like onnx.TensorProto.BFLOAT16)
         :param shape: output shape
         :param buffer_ptr: memory pointer to output data
         """
@@ -712,17 +758,43 @@ class OrtValue:
         )
 
     @staticmethod
-    def ortvalue_from_shape_and_type(shape=None, element_type=None, device_type="cpu", device_id=0):
+    def ortvalue_from_numpy_with_onnx_type(data, onnx_element_type: int):
+        """
+        This method creates an instance of OrtValue on top of the numpy array.
+        No data copy is made and the lifespan of the resulting OrtValue should never
+        exceed the lifespan of bytes object. The API attempts to reinterpret
+        the data type which is expected to be the same size. This is useful
+        when we want to use an ONNX data type that is not supported by numpy.
+
+        :param data: numpy.ndarray.
+        :param onnx_elemenet_type: a valid onnx TensorProto::DataType enum value
+        """
+        return OrtValue(C.OrtValue.ortvalue_from_numpy_with_onnx_type(data, onnx_element_type), data)
+
+    @staticmethod
+    def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu", device_id: int = 0):
         """
         Factory method to construct an OrtValue (which holds a Tensor) from given shape and element_type
 
         :param shape: List of integers indicating the shape of the OrtValue
-        :param element_type: The data type of the elements in the OrtValue (numpy type)
+        :param element_type: The data type of the elements. It can be either numpy type (like numpy.float32) or an integer for onnx type (like onnx.TensorProto.BFLOAT16).
         :param device_type: e.g. cpu, cuda, cann, cpu by default
         :param device_id: device id, e.g. 0
         """
-        if shape is None or element_type is None:
-            raise ValueError("`element_type` and `shape` are to be provided if pre-allocated memory is provided")
+        # Integer for onnx element type (see https://onnx.ai/onnx/api/mapping.html).
+        # This is helpful for some data type (like TensorProto.BFLOAT16) that is not available in numpy.
+        if isinstance(element_type, int):
+            return OrtValue(
+                C.OrtValue.ortvalue_from_shape_and_onnx_type(
+                    shape,
+                    element_type,
+                    C.OrtDevice(
+                        get_ort_device_type(device_type, device_id),
+                        C.OrtDevice.default_memory(),
+                        device_id,
+                    ),
+                )
+            )
 
         return OrtValue(
             C.OrtValue.ortvalue_from_shape_and_type(
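Taken together, this file now offers a small round-trip surface for adapters plus a way to allocate tensors with ONNX-only element types. A sketch using only the methods shown in the hunks above (the file name and parameter name are invented for illustration):

import numpy as np
import onnx
import onnxruntime as ort

# Build an adapter from plain numpy arrays and write it out.
adapter = ort.AdapterFormat()
adapter.set_adapter_version(1)
adapter.set_model_version(1)
adapter.set_parameters(
    {"layers.0.lora_A": ort.OrtValue.ortvalue_from_numpy(np.zeros((4, 4), dtype=np.float32))}
)
adapter.export_adapter("demo.onnx_adapter")

# Read it back and recover the parameters as OrtValue objects.
params = ort.AdapterFormat.read_adapter("demo.onnx_adapter").get_parameters()

# Allocate a BFLOAT16 tensor by passing the integer ONNX element type
# (numpy has no bfloat16, which is the case the new code path targets).
bf16 = ort.OrtValue.ortvalue_from_shape_and_type([2, 3], onnx.TensorProto.BFLOAT16, "cpu", 0)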
onnxruntime/capi/onnxruntime_validation.py CHANGED
@@ -55,9 +55,15 @@ def check_distro_info():
             warnings.warn(
                 f"Unsupported macOS version ({__my_distro_ver__}). ONNX Runtime supports macOS 11.0 or later."
             )
+    elif __my_system__ == "aix":
+        import subprocess
+
+        returned_output = subprocess.check_output("oslevel")
+        __my_distro_ver__str = returned_output.decode("utf-8")
+        __my_distro_ver = __my_distro_ver__str[:3]
     else:
         warnings.warn(
-            f"Unsupported platform ({__my_system__}). ONNX Runtime supports Linux, macOS and Windows platforms, only."
+            f"Unsupported platform ({__my_system__}). ONNX Runtime supports Linux, macOS, AIX and Windows platforms, only."
         )
 
 
onnxruntime/quantization/base_quantizer.py CHANGED
@@ -230,7 +230,9 @@ class BaseQuantizer:
         # TODO: This formula should be explained including why the scale is not estimated for the bias as well.
         bias_scale = input_scale * weight_scale * beta
 
-        quantized_data = (np.asarray(bias_data) / bias_scale).round().astype(np.int32)
+        quantized_data = (np.asarray(bias_data) / bias_scale).round()
+        quantized_data = np.clip(quantized_data, np.iinfo(np.int32).min, np.iinfo(np.int32).max)
+        quantized_data = quantized_data.astype(np.int32)
 
         # update bias initializer
         bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
@@ -418,6 +420,9 @@ class BaseQuantizer:
         zero_point_list = []
         scale_list = []
         quantized_per_channel_data_list = []
+        weights_shape = list(weights.shape)
+        reshape_dims = list(weights_shape)  # deep copy
+        reshape_dims[channel_axis] = 1  # only one per channel for reshape
         for i in range(channel_count):
             per_channel_data = weights.take(i, channel_axis)
             channel_override_index = i if i < num_channel_overrides else 0
@@ -460,17 +465,10 @@
 
             zero_point_list.append(zero_point)
             scale_list.append(scale)
-            quantized_per_channel_data_list.append(quantized_per_channel_data)
+            quantized_per_channel_data_list.append(np.asarray(quantized_per_channel_data).reshape(reshape_dims))
 
         # combine per_channel_data into one
-        weights_shape = list(weights.shape)
-        reshape_dims = list(weights_shape)  # deep copy
-        reshape_dims[channel_axis] = 1  # only one per channel for reshape
-        quantized_weights = np.asarray(quantized_per_channel_data_list[0]).reshape(reshape_dims)
-        for i in range(1, len(quantized_per_channel_data_list)):
-            channel_weights = np.asarray(quantized_per_channel_data_list[i]).reshape(reshape_dims)
-            quantized_weights = np.concatenate((quantized_weights, channel_weights), channel_axis)
-
+        quantized_weights = np.concatenate(quantized_per_channel_data_list, channel_axis)
         q_weight_name = weight_name + TENSOR_NAME_QUANT_SUFFIX
         zp_name = weight_name + "_zero_point"
         scale_name = weight_name + "_scale"
@@ -519,8 +517,6 @@
         for node in self.model.nodes():
            # adjust tensor_ranges for input of Clip and Relu node
            if node.op_type in ["Clip", "Relu"]:
-                if self.is_activation_symmetric:
-                    continue
                if not self.should_quantize_node(node):
                    continue
                if len(self.model.input_name_to_nodes()[node.input[0]]) != 1:
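The per-channel change is a refactor plus a saturation fix: each quantized channel is reshaped as it is produced and joined with a single np.concatenate, and bias quantization now clips to the int32 range before casting. A small numpy check of both behaviours (toy shapes, not taken from the quantizer):

import numpy as np

# Reshape-then-concatenate along the channel axis reproduces the original layout.
weights = np.arange(12).reshape(4, 3)
channel_axis = 0
reshape_dims = list(weights.shape)
reshape_dims[channel_axis] = 1
slices = [weights.take(i, channel_axis).reshape(reshape_dims) for i in range(weights.shape[channel_axis])]
assert np.array_equal(np.concatenate(slices, channel_axis), weights)

# Clipping before the cast saturates out-of-range bias values instead of wrapping.
big = np.array([3.0e9, -3.0e9]).round()
clipped = np.clip(big, np.iinfo(np.int32).min, np.iinfo(np.int32).max).astype(np.int32)
# clipped -> [ 2147483647, -2147483648]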
onnxruntime/quantization/calibrate.py CHANGED
@@ -69,6 +69,7 @@ class TensorData:
     _floats = frozenset(["avg", "std", "lowest", "highest", "hist_edges"])
 
     def __init__(self, **kwargs):
+        self._attrs = list(kwargs.keys())
         for k, v in kwargs.items():
             if k not in TensorData._allowed:
                 raise ValueError(f"Unexpected value {k!r} not in {TensorData._allowed}.")
@@ -91,6 +92,12 @@
             raise AttributeError(f"Attributes 'avg' and/or 'std' missing in {dir(self)}.")
         return (self.avg, self.std)
 
+    def to_dict(self):
+        # This is needed to serialize the data into JSON.
+        data = {k: getattr(self, k) for k in self._attrs}
+        data["CLS"] = self.__class__.__name__
+        return data
+
 
 class TensorsData:
     def __init__(self, calibration_method, data: Dict[str, Union[TensorData, Tuple]]):
@@ -125,12 +132,24 @@
             raise RuntimeError(f"Only an existing tensor can be modified, {key!r} is not.")
         self.data[key] = value
 
+    def keys(self):
+        return self.data.keys()
+
     def values(self):
         return self.data.values()
 
     def items(self):
         return self.data.items()
 
+    def to_dict(self):
+        # This is needed to serialize the data into JSON.
+        data = {
+            "CLS": self.__class__.__name__,
+            "data": self.data,
+            "calibration_method": self.calibration_method,
+        }
+        return data
+
 
 class CalibrationMethod(Enum):
     MinMax = 0
@@ -565,16 +584,29 @@ class HistogramCalibrater(CalibraterBase):
         """
         Entropy Calibrator collects operators' tensors as well as generates tensor histogram for each operator.
         """
+        input_names_set = {node_arg.name for node_arg in self.infer_session.get_inputs()}
+        output_names = [node_arg.name for node_arg in self.infer_session.get_outputs()]
+
         while True:
             inputs = data_reader.get_next()
             if not inputs:
                 break
-            self.intermediate_outputs.append(self.infer_session.run(None, inputs))
+            outputs = self.infer_session.run(None, inputs)
+
+            # Copy np.ndarray only for graph outputs that are also graph inputs to workaround bug:
+            # https://github.com/microsoft/onnxruntime/issues/21922
+            fixed_outputs = []
+            for output_index, output in enumerate(outputs):
+                if output_names[output_index] in input_names_set:
+                    fixed_outputs.append(copy.copy(output))
+                else:
+                    fixed_outputs.append(output)
+
+            self.intermediate_outputs.append(fixed_outputs)
 
         if len(self.intermediate_outputs) == 0:
             raise ValueError("No data is collected.")
 
-        output_names = [self.infer_session.get_outputs()[i].name for i in range(len(self.intermediate_outputs[0]))]
         output_dicts_list = [
             dict(zip(output_names, intermediate_output)) for intermediate_output in self.intermediate_outputs
         ]
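The last hunk works around a buffer-reuse issue: only outputs whose name is also a graph input are copied before being stored, so the session cannot overwrite them on the next run (issue #21922 linked above). A self-contained sketch of that selection logic with stand-in names and arrays (nothing here comes from an actual session):

import copy

import numpy as np

# Stand-ins for session metadata and outputs; "state_in" plays the role of an
# output that feeds back into the graph as an input.
input_names_set = {"state_in"}
output_names = ["logits", "state_in"]
outputs = [np.zeros((1, 4)), np.ones((1, 8))]

fixed_outputs = [
    copy.copy(out) if output_names[i] in input_names_set else out
    for i, out in enumerate(outputs)
]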