onnxruntime-directml 1.20.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnxruntime/LICENSE +21 -0
- onnxruntime/Privacy.md +21 -0
- onnxruntime/ThirdPartyNotices.txt +6508 -0
- onnxruntime/__init__.py +78 -0
- onnxruntime/backend/__init__.py +6 -0
- onnxruntime/backend/backend.py +174 -0
- onnxruntime/backend/backend_rep.py +53 -0
- onnxruntime/capi/DirectML.dll +0 -0
- onnxruntime/capi/__init__.py +4 -0
- onnxruntime/capi/_ld_preload.py +7 -0
- onnxruntime/capi/_pybind_state.py +33 -0
- onnxruntime/capi/convert_npz_to_onnx_adapter.py +48 -0
- onnxruntime/capi/onnxruntime.dll +0 -0
- onnxruntime/capi/onnxruntime_collect_build_info.py +47 -0
- onnxruntime/capi/onnxruntime_inference_collection.py +1108 -0
- onnxruntime/capi/onnxruntime_providers_shared.dll +0 -0
- onnxruntime/capi/onnxruntime_pybind11_state.pyd +0 -0
- onnxruntime/capi/onnxruntime_validation.py +150 -0
- onnxruntime/capi/version_info.py +2 -0
- onnxruntime/datasets/__init__.py +17 -0
- onnxruntime/datasets/logreg_iris.onnx +0 -0
- onnxruntime/datasets/mul_1.onnx +0 -0
- onnxruntime/datasets/sigmoid.onnx +13 -0
- onnxruntime/quantization/CalTableFlatBuffers/KeyValue.py +78 -0
- onnxruntime/quantization/CalTableFlatBuffers/TrtTable.py +90 -0
- onnxruntime/quantization/CalTableFlatBuffers/__init__.py +0 -0
- onnxruntime/quantization/__init__.py +16 -0
- onnxruntime/quantization/base_quantizer.py +532 -0
- onnxruntime/quantization/calibrate.py +1245 -0
- onnxruntime/quantization/execution_providers/qnn/__init__.py +2 -0
- onnxruntime/quantization/execution_providers/qnn/fusion_lpnorm.py +132 -0
- onnxruntime/quantization/execution_providers/qnn/mixed_precision_overrides_utils.py +413 -0
- onnxruntime/quantization/execution_providers/qnn/preprocess.py +307 -0
- onnxruntime/quantization/execution_providers/qnn/quant_config.py +387 -0
- onnxruntime/quantization/fusions/__init__.py +3 -0
- onnxruntime/quantization/fusions/fusion.py +311 -0
- onnxruntime/quantization/fusions/fusion_gelu.py +272 -0
- onnxruntime/quantization/fusions/fusion_layernorm.py +135 -0
- onnxruntime/quantization/matmul_4bits_quantizer.py +1480 -0
- onnxruntime/quantization/matmul_bnb4_quantizer.py +240 -0
- onnxruntime/quantization/onnx_model.py +580 -0
- onnxruntime/quantization/onnx_quantizer.py +1008 -0
- onnxruntime/quantization/operators/__init__.py +2 -0
- onnxruntime/quantization/operators/activation.py +119 -0
- onnxruntime/quantization/operators/argmax.py +18 -0
- onnxruntime/quantization/operators/attention.py +73 -0
- onnxruntime/quantization/operators/base_operator.py +26 -0
- onnxruntime/quantization/operators/binary_op.py +72 -0
- onnxruntime/quantization/operators/concat.py +62 -0
- onnxruntime/quantization/operators/conv.py +258 -0
- onnxruntime/quantization/operators/direct_q8.py +78 -0
- onnxruntime/quantization/operators/embed_layernorm.py +121 -0
- onnxruntime/quantization/operators/gather.py +64 -0
- onnxruntime/quantization/operators/gavgpool.py +62 -0
- onnxruntime/quantization/operators/gemm.py +166 -0
- onnxruntime/quantization/operators/lstm.py +117 -0
- onnxruntime/quantization/operators/matmul.py +231 -0
- onnxruntime/quantization/operators/maxpool.py +34 -0
- onnxruntime/quantization/operators/norm.py +40 -0
- onnxruntime/quantization/operators/pad.py +100 -0
- onnxruntime/quantization/operators/pooling.py +67 -0
- onnxruntime/quantization/operators/qdq_base_operator.py +22 -0
- onnxruntime/quantization/operators/resize.py +34 -0
- onnxruntime/quantization/operators/softmax.py +74 -0
- onnxruntime/quantization/operators/split.py +63 -0
- onnxruntime/quantization/operators/where.py +87 -0
- onnxruntime/quantization/preprocess.py +141 -0
- onnxruntime/quantization/qdq_loss_debug.py +389 -0
- onnxruntime/quantization/qdq_quantizer.py +1187 -0
- onnxruntime/quantization/quant_utils.py +891 -0
- onnxruntime/quantization/quantize.py +748 -0
- onnxruntime/quantization/registry.py +106 -0
- onnxruntime/quantization/shape_inference.py +187 -0
- onnxruntime/quantization/tensor_quant_overrides.py +516 -0
- onnxruntime/tools/__init__.py +10 -0
- onnxruntime/tools/check_onnx_model_mobile_usability.py +47 -0
- onnxruntime/tools/convert_onnx_models_to_ort.py +377 -0
- onnxruntime/tools/file_utils.py +46 -0
- onnxruntime/tools/logger.py +11 -0
- onnxruntime/tools/make_dynamic_shape_fixed.py +72 -0
- onnxruntime/tools/mobile_helpers/__init__.py +0 -0
- onnxruntime/tools/mobile_helpers/coreml_supported_mlprogram_ops.md +33 -0
- onnxruntime/tools/mobile_helpers/coreml_supported_neuralnetwork_ops.md +43 -0
- onnxruntime/tools/mobile_helpers/nnapi_supported_ops.md +58 -0
- onnxruntime/tools/mobile_helpers/usability_checker.py +739 -0
- onnxruntime/tools/offline_tuning.py +169 -0
- onnxruntime/tools/onnx_model_utils.py +413 -0
- onnxruntime/tools/onnx_randomizer.py +85 -0
- onnxruntime/tools/onnxruntime_test.py +164 -0
- onnxruntime/tools/optimize_onnx_model.py +55 -0
- onnxruntime/tools/ort_format_model/__init__.py +25 -0
- onnxruntime/tools/ort_format_model/operator_type_usage_processors.py +663 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/__init__.py +0 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ArgType.py +7 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ArgTypeAndIndex.py +67 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Attribute.py +337 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/AttributeType.py +18 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Checkpoint.py +125 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedKernelCreateInfos.py +120 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedNodeIndexAndKernelDefHash.py +68 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedSessionState.py +96 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DeprecatedSubGraphSessionState.py +72 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Dimension.py +71 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DimensionValue.py +80 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/DimensionValueType.py +8 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/EdgeEnd.py +32 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/FloatProperty.py +67 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Graph.py +320 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/InferenceSession.py +88 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/IntProperty.py +67 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/KernelTypeStrArgsEntry.py +91 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/KernelTypeStrResolver.py +78 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/MapType.py +71 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Model.py +223 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ModuleState.py +141 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Node.py +317 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodeEdge.py +126 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodeType.py +7 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/NodesToOptimizeIndices.py +160 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OpIdKernelTypeStrArgsEntry.py +91 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OperatorSetId.py +67 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/OptimizerGroup.py +117 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ParameterOptimizerState.py +91 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/PropertyBag.py +152 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizationRecord.py +105 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizationRecordContainerEntry.py +91 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/RuntimeOptimizations.py +79 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/SequenceType.py +58 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Shape.py +78 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/SparseTensor.py +114 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/StringProperty.py +67 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/StringStringEntry.py +67 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/Tensor.py +203 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TensorDataType.py +26 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TensorTypeAndShape.py +71 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TypeInfo.py +83 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/TypeInfoValue.py +9 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/ValueInfo.py +84 -0
- onnxruntime/tools/ort_format_model/ort_flatbuffers_py/fbs/__init__.py +6 -0
- onnxruntime/tools/ort_format_model/ort_model_processor.py +86 -0
- onnxruntime/tools/ort_format_model/types.py +84 -0
- onnxruntime/tools/ort_format_model/utils.py +62 -0
- onnxruntime/tools/pytorch_export_contrib_ops.py +108 -0
- onnxruntime/tools/pytorch_export_helpers.py +131 -0
- onnxruntime/tools/qdq_helpers/__init__.py +0 -0
- onnxruntime/tools/qdq_helpers/optimize_qdq_model.py +37 -0
- onnxruntime/tools/reduced_build_config_parser.py +202 -0
- onnxruntime/tools/symbolic_shape_infer.py +3016 -0
- onnxruntime/tools/update_onnx_opset.py +31 -0
- onnxruntime/transformers/__init__.py +8 -0
- onnxruntime/transformers/affinity_helper.py +40 -0
- onnxruntime/transformers/benchmark.py +944 -0
- onnxruntime/transformers/benchmark_helper.py +646 -0
- onnxruntime/transformers/bert_perf_test.py +634 -0
- onnxruntime/transformers/bert_test_data.py +642 -0
- onnxruntime/transformers/compare_bert_results.py +246 -0
- onnxruntime/transformers/constants.py +47 -0
- onnxruntime/transformers/convert_generation.py +3124 -0
- onnxruntime/transformers/convert_tf_models_to_pytorch.py +205 -0
- onnxruntime/transformers/convert_to_packing_mode.py +387 -0
- onnxruntime/transformers/dynamo_onnx_helper.py +104 -0
- onnxruntime/transformers/float16.py +501 -0
- onnxruntime/transformers/fusion_attention.py +1235 -0
- onnxruntime/transformers/fusion_attention_clip.py +257 -0
- onnxruntime/transformers/fusion_attention_sam2.py +534 -0
- onnxruntime/transformers/fusion_attention_unet.py +1304 -0
- onnxruntime/transformers/fusion_attention_vae.py +301 -0
- onnxruntime/transformers/fusion_bart_attention.py +640 -0
- onnxruntime/transformers/fusion_base.py +137 -0
- onnxruntime/transformers/fusion_bias_add.py +58 -0
- onnxruntime/transformers/fusion_biasgelu.py +66 -0
- onnxruntime/transformers/fusion_biassplitgelu.py +111 -0
- onnxruntime/transformers/fusion_conformer_attention.py +143 -0
- onnxruntime/transformers/fusion_embedlayer.py +811 -0
- onnxruntime/transformers/fusion_fastgelu.py +360 -0
- onnxruntime/transformers/fusion_gelu.py +259 -0
- onnxruntime/transformers/fusion_gelu_approximation.py +25 -0
- onnxruntime/transformers/fusion_gemmfastgelu.py +122 -0
- onnxruntime/transformers/fusion_gpt_attention.py +546 -0
- onnxruntime/transformers/fusion_gpt_attention_megatron.py +355 -0
- onnxruntime/transformers/fusion_gpt_attention_no_past.py +260 -0
- onnxruntime/transformers/fusion_group_norm.py +179 -0
- onnxruntime/transformers/fusion_layernorm.py +465 -0
- onnxruntime/transformers/fusion_nhwc_conv.py +100 -0
- onnxruntime/transformers/fusion_options.py +340 -0
- onnxruntime/transformers/fusion_qordered_attention.py +421 -0
- onnxruntime/transformers/fusion_qordered_gelu.py +119 -0
- onnxruntime/transformers/fusion_qordered_layernorm.py +123 -0
- onnxruntime/transformers/fusion_qordered_matmul.py +217 -0
- onnxruntime/transformers/fusion_quickgelu.py +74 -0
- onnxruntime/transformers/fusion_reshape.py +173 -0
- onnxruntime/transformers/fusion_rotary_attention.py +1592 -0
- onnxruntime/transformers/fusion_shape.py +110 -0
- onnxruntime/transformers/fusion_simplified_layernorm.py +159 -0
- onnxruntime/transformers/fusion_skip_group_norm.py +255 -0
- onnxruntime/transformers/fusion_skiplayernorm.py +209 -0
- onnxruntime/transformers/fusion_transpose.py +168 -0
- onnxruntime/transformers/fusion_utils.py +307 -0
- onnxruntime/transformers/huggingface_models.py +167 -0
- onnxruntime/transformers/import_utils.py +20 -0
- onnxruntime/transformers/io_binding_helper.py +442 -0
- onnxruntime/transformers/large_model_exporter.py +395 -0
- onnxruntime/transformers/machine_info.py +221 -0
- onnxruntime/transformers/metrics.py +164 -0
- onnxruntime/transformers/models/bart/__init__.py +12 -0
- onnxruntime/transformers/models/bart/export.py +98 -0
- onnxruntime/transformers/models/bert/__init__.py +12 -0
- onnxruntime/transformers/models/bert/eval_squad.py +329 -0
- onnxruntime/transformers/models/gpt2/__init__.py +12 -0
- onnxruntime/transformers/models/gpt2/benchmark_gpt2.py +413 -0
- onnxruntime/transformers/models/gpt2/convert_to_onnx.py +561 -0
- onnxruntime/transformers/models/gpt2/gpt2_helper.py +1032 -0
- onnxruntime/transformers/models/gpt2/gpt2_parity.py +513 -0
- onnxruntime/transformers/models/gpt2/gpt2_tester.py +501 -0
- onnxruntime/transformers/models/gpt2/parity_check_helper.py +146 -0
- onnxruntime/transformers/models/llama/__init__.py +12 -0
- onnxruntime/transformers/models/llama/benchmark.py +703 -0
- onnxruntime/transformers/models/llama/benchmark_all.py +488 -0
- onnxruntime/transformers/models/llama/benchmark_e2e.py +606 -0
- onnxruntime/transformers/models/llama/convert_to_onnx.py +1027 -0
- onnxruntime/transformers/models/llama/dist_settings.py +57 -0
- onnxruntime/transformers/models/llama/llama_inputs.py +503 -0
- onnxruntime/transformers/models/llama/llama_parity.py +309 -0
- onnxruntime/transformers/models/llama/llama_torch.py +47 -0
- onnxruntime/transformers/models/llama/quant_kv_dataloader.py +108 -0
- onnxruntime/transformers/models/longformer/__init__.py +12 -0
- onnxruntime/transformers/models/longformer/benchmark_longformer.py +821 -0
- onnxruntime/transformers/models/longformer/convert_to_onnx.py +413 -0
- onnxruntime/transformers/models/longformer/generate_test_data.py +347 -0
- onnxruntime/transformers/models/longformer/longformer_helper.py +77 -0
- onnxruntime/transformers/models/phi2/__init__.py +12 -0
- onnxruntime/transformers/models/phi2/convert_to_onnx.py +576 -0
- onnxruntime/transformers/models/phi2/inference_example.py +414 -0
- onnxruntime/transformers/models/sam2/__init__.py +12 -0
- onnxruntime/transformers/models/sam2/benchmark_sam2.py +625 -0
- onnxruntime/transformers/models/sam2/convert_to_onnx.py +260 -0
- onnxruntime/transformers/models/sam2/image_decoder.py +273 -0
- onnxruntime/transformers/models/sam2/image_encoder.py +186 -0
- onnxruntime/transformers/models/sam2/mask_decoder.py +208 -0
- onnxruntime/transformers/models/sam2/nvtx_helper.py +33 -0
- onnxruntime/transformers/models/sam2/prompt_encoder.py +189 -0
- onnxruntime/transformers/models/sam2/sam2_demo.py +322 -0
- onnxruntime/transformers/models/sam2/sam2_image_onnx_predictor.py +280 -0
- onnxruntime/transformers/models/sam2/sam2_utils.py +147 -0
- onnxruntime/transformers/models/stable_diffusion/__init__.py +12 -0
- onnxruntime/transformers/models/stable_diffusion/benchmark.py +1429 -0
- onnxruntime/transformers/models/stable_diffusion/benchmark_controlnet.py +426 -0
- onnxruntime/transformers/models/stable_diffusion/demo_txt2img.py +102 -0
- onnxruntime/transformers/models/stable_diffusion/demo_txt2img_xl.py +268 -0
- onnxruntime/transformers/models/stable_diffusion/demo_utils.py +778 -0
- onnxruntime/transformers/models/stable_diffusion/diffusion_models.py +1319 -0
- onnxruntime/transformers/models/stable_diffusion/diffusion_schedulers.py +1181 -0
- onnxruntime/transformers/models/stable_diffusion/engine_builder.py +296 -0
- onnxruntime/transformers/models/stable_diffusion/engine_builder_ort_cuda.py +388 -0
- onnxruntime/transformers/models/stable_diffusion/engine_builder_ort_trt.py +288 -0
- onnxruntime/transformers/models/stable_diffusion/engine_builder_tensorrt.py +395 -0
- onnxruntime/transformers/models/stable_diffusion/engine_builder_torch.py +108 -0
- onnxruntime/transformers/models/stable_diffusion/optimize_pipeline.py +350 -0
- onnxruntime/transformers/models/stable_diffusion/ort_optimizer.py +136 -0
- onnxruntime/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +831 -0
- onnxruntime/transformers/models/stable_diffusion/trt_utilities.py +12 -0
- onnxruntime/transformers/models/t5/__init__.py +12 -0
- onnxruntime/transformers/models/t5/convert_to_onnx.py +278 -0
- onnxruntime/transformers/models/t5/past_helper.py +150 -0
- onnxruntime/transformers/models/t5/t5_decoder.py +438 -0
- onnxruntime/transformers/models/t5/t5_encoder.py +171 -0
- onnxruntime/transformers/models/t5/t5_encoder_decoder_init.py +299 -0
- onnxruntime/transformers/models/t5/t5_helper.py +272 -0
- onnxruntime/transformers/models/whisper/__init__.py +12 -0
- onnxruntime/transformers/models/whisper/benchmark.py +610 -0
- onnxruntime/transformers/models/whisper/benchmark_all.py +528 -0
- onnxruntime/transformers/models/whisper/convert_to_onnx.py +536 -0
- onnxruntime/transformers/models/whisper/whisper_chain.py +329 -0
- onnxruntime/transformers/models/whisper/whisper_decoder.py +402 -0
- onnxruntime/transformers/models/whisper/whisper_encoder.py +164 -0
- onnxruntime/transformers/models/whisper/whisper_encoder_decoder_init.py +306 -0
- onnxruntime/transformers/models/whisper/whisper_helper.py +524 -0
- onnxruntime/transformers/models/whisper/whisper_openai_helper.py +84 -0
- onnxruntime/transformers/onnx_exporter.py +717 -0
- onnxruntime/transformers/onnx_model.py +1569 -0
- onnxruntime/transformers/onnx_model_bart.py +142 -0
- onnxruntime/transformers/onnx_model_bert.py +481 -0
- onnxruntime/transformers/onnx_model_bert_keras.py +475 -0
- onnxruntime/transformers/onnx_model_bert_tf.py +589 -0
- onnxruntime/transformers/onnx_model_clip.py +40 -0
- onnxruntime/transformers/onnx_model_conformer.py +33 -0
- onnxruntime/transformers/onnx_model_gpt2.py +101 -0
- onnxruntime/transformers/onnx_model_phi.py +930 -0
- onnxruntime/transformers/onnx_model_sam2.py +138 -0
- onnxruntime/transformers/onnx_model_t5.py +791 -0
- onnxruntime/transformers/onnx_model_tnlr.py +227 -0
- onnxruntime/transformers/onnx_model_unet.py +259 -0
- onnxruntime/transformers/onnx_model_vae.py +43 -0
- onnxruntime/transformers/onnx_utils.py +55 -0
- onnxruntime/transformers/optimizer.py +612 -0
- onnxruntime/transformers/profiler.py +725 -0
- onnxruntime/transformers/quantize_helper.py +76 -0
- onnxruntime/transformers/shape_infer_helper.py +122 -0
- onnxruntime/transformers/shape_optimizer.py +401 -0
- onnxruntime/transformers/torch_onnx_export_helper.py +74 -0
- onnxruntime_directml-1.20.0.dist-info/METADATA +187 -0
- onnxruntime_directml-1.20.0.dist-info/RECORD +305 -0
- onnxruntime_directml-1.20.0.dist-info/WHEEL +5 -0
- onnxruntime_directml-1.20.0.dist-info/entry_points.txt +2 -0
- onnxruntime_directml-1.20.0.dist-info/top_level.txt +1 -0
onnxruntime/quantization/quantize.py
@@ -0,0 +1,748 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
import tempfile
from pathlib import Path
from typing import Union

import onnx

from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator
from .onnx_quantizer import ONNXQuantizer
from .qdq_quantizer import QDQQuantizer
from .quant_utils import (
    QuantFormat,
    QuantizationMode,
    QuantType,
    load_model_with_shape_infer,
    model_has_pre_process_metadata,
    save_and_reload_model_with_shape_infer,
)
from .registry import IntegerOpsRegistry, QDQRegistry, QLinearOpsRegistry


class QuantConfig:
    def __init__(
        self,
        activation_type=QuantType.QUInt8,
        weight_type=QuantType.QInt8,
        op_types_to_quantize=None,
        nodes_to_quantize=None,
        nodes_to_exclude=None,
        per_channel=False,
        reduce_range=False,
        use_external_data_format=False,
    ):
        """
        This is the base class for both the static and dynamic quantization configurations.

        Args:
            activation_type:
                quantization data type of activations. Please refer to
                https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
            weight_type:
                quantization data type of weights. Please refer to
                https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
            op_types_to_quantize:
                specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
                It quantizes all supported operators by default.
            nodes_to_quantize:
                List of node names to quantize. When this list is not None, only the nodes in this list
                are quantized.
                example:
                [
                    'Conv__224',
                    'Conv__252'
                ]
            nodes_to_exclude:
                List of node names to exclude. When this list is not None, the nodes in it are excluded
                from quantization.
            per_channel: quantize weights per channel
            reduce_range:
                quantize weights with 7 bits. It may improve accuracy for some models running on non-VNNI
                machines, especially in per-channel mode
            use_external_data_format: option used for large (>2GB) models. Set to False by default.
        """

        nodes_to_exclude = nodes_to_exclude or []
        nodes_to_quantize = nodes_to_quantize or []
        op_types_to_quantize = op_types_to_quantize or []
        self.op_types_to_quantize = op_types_to_quantize
        self.per_channel = per_channel
        self.reduce_range = reduce_range
        self.weight_type = weight_type
        self.activation_type = activation_type
        self.nodes_to_quantize = nodes_to_quantize
        self.nodes_to_exclude = nodes_to_exclude
        self.use_external_data_format = use_external_data_format


class StaticQuantConfig(QuantConfig):
    def __init__(
        self,
        calibration_data_reader: CalibrationDataReader,
        calibrate_method=CalibrationMethod.MinMax,
        quant_format=QuantFormat.QDQ,
        activation_type=QuantType.QInt8,
        weight_type=QuantType.QInt8,
        op_types_to_quantize=None,
        nodes_to_quantize=None,
        nodes_to_exclude=None,
        per_channel=False,
        reduce_range=False,
        use_external_data_format=False,
        extra_options=None,
    ):
        """
        This is the derived class for the static quantization configuration.

        Args:
            calibration_data_reader:
                a calibration data reader. It enumerates calibration data and generates inputs for the original model.
            calibrate_method:
                Currently supported calibration methods are MinMax, Entropy and Percentile.
            quant_format: QuantFormat{QOperator, QDQ}.
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
            extra_options:
                key-value pair dictionary for various options in different cases. Currently used:
                    extra.Sigmoid.nnapi = True/False  (Default is False)
                    ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
                    WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
                    EnableSubgraph = True/False : Default is False. If enabled, subgraphs will be quantized.
                        Dynamic mode is currently supported. More will be supported in the future.
                    ForceQuantizeNoInputCheck = True/False :
                        By default, some latent operators like MaxPool and Transpose do not quantize if their input
                        is not already quantized. Set to True to force such operators to always quantize their input
                        and thus generate quantized output. This behavior can still be disabled per node via
                        nodes_to_exclude.
                    MatMulConstBOnly = True/False:
                        Default is False for static mode. If enabled, only MatMul nodes with a const B input will be
                        quantized.
                    AddQDQPairToWeight = True/False :
                        Default is False, which quantizes the floating-point weight and feeds it to a solely inserted
                        DeQuantizeLinear node. If True, the floating-point weight is kept and both
                        QuantizeLinear/DeQuantizeLinear nodes are inserted on the weight.
                    OpTypesToExcludeOutputQuantization = list of op types :
                        Default is []. The outputs of ops with these specific op types will not be quantized.
                    DedicatedQDQPair = True/False :
                        Default is False. When inserting a QDQ pair, multiple nodes can share a single QDQ pair as
                        their input. If True, an identical, dedicated QDQ pair is created for each node.
                    QDQOpTypePerChannelSupportToAxis = dictionary :
                        Default is {}. Sets the channel axis for specific op types, for example: {'MatMul': 1}. It is
                        effective only when per-channel quantization is supported and per_channel is True. If a
                        specific op type supports per-channel quantization but no channel axis is explicitly
                        specified, the default channel axis will be used.
                    CalibTensorRangeSymmetric = True/False :
                        Default is False. If enabled, the final range of a tensor during calibration is explicitly
                        made symmetric around the central point "0".
                    CalibMovingAverage = True/False :
                        Default is False. If enabled, the moving average of the minimum and maximum values is
                        computed when the calibration method is MinMax.
                    CalibMovingAverageConstant = float :
                        Default is 0.01. Constant smoothing factor to use when computing the moving average of the
                        minimum and maximum values. Effective only when the calibration method is MinMax and
                        CalibMovingAverage is set to True.
                    QuantizeBias = True/False :
                        Default is True, which quantizes floating-point biases and solely inserts a DeQuantizeLinear
                        node. If False, biases are kept in floating point and no quantization nodes are inserted for
                        them. This extra option is only effective when quant_format is QuantFormat.QDQ.
                    SmoothQuant = True/False :
                        Default is False. If enabled, the SmoothQuant algorithm is applied before quantization to do
                        fake input-channel quantization.
                    SmoothQuantAlpha = float :
                        Default is 0.5. Only works if SmoothQuant is True. It controls the difficulty of weight
                        and activation quantization. A larger alpha value can be used on models with more significant
                        activation outliers to migrate more quantization difficulty to the weights.
                    SmoothQuantFolding = True/False :
                        Default is True. Only works if SmoothQuant is True. If enabled, Mul ops inserted during
                        SmoothQuant are folded into the previous op if the previous op is foldable.
                    UseQDQContribOps = True/False :
                        Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have
                        the `com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and
                        DequantizeLinear contrib op implementations. The contrib op implementations may support
                        features not standardized into the ONNX specification (e.g., 16-bit quantization types).
                    MinimumRealRange = float|None :
                        Default is None. If set to a floating-point value, the calculation of the quantization
                        parameters (i.e., scale and zero point) will enforce a minimum range between rmin and rmax.
                        If (rmax - rmin) is less than the specified minimum range, rmax will be set to
                        rmin + MinimumRealRange. This is necessary for EPs like QNN that require a minimum
                        floating-point range when determining quantization parameters.
                    TensorQuantOverrides = dictionary :
                        Default is {}. Sets tensor quantization overrides. The key is a tensor name and the value is
                        a list of dictionaries. For per-tensor quantization, the list contains a single dictionary.
                        For per-channel quantization, the list contains a dictionary for each channel in the tensor.
                        Each dictionary contains optional overrides with the following keys and values.
                            'quant_type' = QuantType : The tensor's quantization data type.
                            'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
                            'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
                            'symmetric' = Bool : Whether the tensor should use symmetric quantization. Invalid if
                                `scale` or `zero_point` is also set.
                            'reduce_range' = Bool : Whether the quantization range should be reduced. Invalid if
                                `scale` or `zero_point` is also set.
                            'rmax' = Float : Override the maximum real tensor value in calibration data.
                                Invalid if `scale` or `zero_point` is also set.
                            'rmin' = Float : Override the minimum real tensor value in calibration data.
                                Invalid if `scale` or `zero_point` is also set.
                    QDQKeepRemovableActivations = True/False:
                        Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed
                        and will be explicitly represented in the QDQ model. If false, these activations are
                        automatically removed if activations are asymmetrically quantized. Keeping these activations
                        is necessary if optimizations or EP transformations will later remove
                        QuantizeLinear/DequantizeLinear operators from the model.
            execution_provider : An enum indicating the Execution Provider, such as: CPU, TRT, NNAPI, SNE, etc.
        Raises:
            ValueError: Raised if the execution provider is unknown.
        """

        super().__init__(
            activation_type=activation_type,
            weight_type=weight_type,
            op_types_to_quantize=op_types_to_quantize,
            nodes_to_quantize=nodes_to_quantize,
            nodes_to_exclude=nodes_to_exclude,
            per_channel=per_channel,
            reduce_range=reduce_range,
            use_external_data_format=use_external_data_format,
        )
        self.calibration_data_reader = calibration_data_reader
        self.calibrate_method = calibrate_method
        self.quant_format = quant_format
        self.extra_options = extra_options or {}
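A usage sketch (not part of the packaged file): a StaticQuantConfig pairs the options above with a CalibrationDataReader that yields feed dicts for the float model. The input name "input", the shapes, and the sample count below are hypothetical.

    import numpy as np

    from onnxruntime.quantization.calibrate import CalibrationDataReader
    from onnxruntime.quantization.quant_utils import QuantFormat, QuantType
    from onnxruntime.quantization.quantize import StaticQuantConfig

    class ToyDataReader(CalibrationDataReader):
        """Feeds a fixed list of arrays to the calibrator, one feed dict at a time."""

        def __init__(self, samples):
            self._iter = iter([{"input": sample} for sample in samples])

        def get_next(self):
            # Returning None signals the calibrator that the data is exhausted.
            return next(self._iter, None)

    samples = [np.random.rand(1, 3, 224, 224).astype(np.float32) for _ in range(8)]
    static_config = StaticQuantConfig(
        ToyDataReader(samples),
        quant_format=QuantFormat.QDQ,
        activation_type=QuantType.QInt8,
        weight_type=QuantType.QInt8,
        per_channel=True,
    )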
class DynamicQuantConfig(QuantConfig):
    def __init__(
        self,
        weight_type=QuantType.QInt8,
        op_types_to_quantize=None,
        nodes_to_quantize=None,
        nodes_to_exclude=None,
        per_channel=False,
        reduce_range=False,
        use_external_data_format=False,
        extra_options=None,
    ):
        """
        This is a class for the dynamic quantization configuration.

        Args:
            extra_options: key-value pair dictionary for various options in different cases. Currently used:
                extra.Sigmoid.nnapi = True/False  (Default is False)
                ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
                WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
                EnableSubgraph = True/False :
                    Default is False. If enabled, subgraphs will be quantized. Dynamic mode is currently supported.
                    More will be supported in the future.
                ForceQuantizeNoInputCheck = True/False :
                    By default, some latent operators like MaxPool and Transpose do not quantize if their input is
                    not already quantized. Set to True to force such operators to always quantize their input and
                    thus generate quantized output. This behavior can still be disabled per node via
                    nodes_to_exclude.
                MatMulConstBOnly = True/False:
                    Default is True for dynamic mode. If enabled, only MatMul nodes with a const B input will be
                    quantized.
            execution_provider : An enum indicating the Execution Provider, such as: CPU, TRT, NNAPI, SNE, etc.

        Raises:
            ValueError: Raised if the execution provider is unknown.
        """
        super().__init__(
            op_types_to_quantize=op_types_to_quantize,
            per_channel=per_channel,
            reduce_range=reduce_range,
            weight_type=weight_type,
            nodes_to_quantize=nodes_to_quantize,
            nodes_to_exclude=nodes_to_exclude,
            use_external_data_format=use_external_data_format,
        )
        self.extra_options = extra_options or {}
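The dynamic counterpart needs no calibration reader, since activation ranges are computed at run time; a minimal sketch:

    from onnxruntime.quantization.quant_utils import QuantType
    from onnxruntime.quantization.quantize import DynamicQuantConfig

    # Weights are quantized offline; activations are quantized dynamically at inference.
    dynamic_config = DynamicQuantConfig(weight_type=QuantType.QInt8)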
def check_static_quant_arguments(quant_format: QuantFormat, activation_type: QuantType, weight_type: QuantType):
    if activation_type == QuantType.QInt8 and weight_type == QuantType.QUInt8:
        raise ValueError(
            "ONNXRuntime quantization doesn't support data format: "
            "activation_type=QuantType.QInt8, weight_type=QuantType.QUInt8"
        )
    if activation_type != QuantType.QFLOAT8E4M3FN and weight_type == QuantType.QFLOAT8E4M3FN:
        raise ValueError(
            f"ONNXRuntime quantization doesn't support data format: activation_type={activation_type} "
            f"!=QuantType.QFLOAT8E4M3FN, weight_type=QuantType.QFLOAT8E4M3FN."
        )

    if activation_type == QuantType.QFLOAT8E4M3FN and weight_type != QuantType.QFLOAT8E4M3FN:
        raise ValueError(
            "ONNXRuntime quantization doesn't support data format: activation_type=QuantType.QFLOAT8E4M3FN, "
            f"weight_type={weight_type}!=QuantType.QFLOAT8E4M3FN"
        )

    q16_types = [QuantType.QInt16, QuantType.QUInt16]

    if (activation_type in q16_types or weight_type in q16_types) and quant_format != QuantFormat.QDQ:
        raise ValueError("Only QuantFormat.QDQ supports 16-bit quantization types.")

    if activation_type == QuantType.QInt8 and weight_type == QuantType.QInt8 and quant_format != QuantFormat.QDQ:
        logging.warning(
            "Please use QuantFormat.QDQ for activation type QInt8 and weight type QInt8. "
            "Otherwise, it will lead to bad performance on x64."
        )
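A sketch of combinations the checker rejects, per the rules above (each call shown independently raises):

    from onnxruntime.quantization.quant_utils import QuantFormat, QuantType
    from onnxruntime.quantization.quantize import check_static_quant_arguments

    # Raises ValueError: QInt8 activations with QUInt8 weights are not supported.
    check_static_quant_arguments(QuantFormat.QDQ, QuantType.QInt8, QuantType.QUInt8)

    # Raises ValueError: 16-bit quantization types require QuantFormat.QDQ.
    check_static_quant_arguments(QuantFormat.QOperator, QuantType.QUInt16, QuantType.QUInt8)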
def quantize_static(
    model_input: Union[str, Path, onnx.ModelProto],
    model_output: Union[str, Path],
    calibration_data_reader: CalibrationDataReader,
    quant_format=QuantFormat.QDQ,
    op_types_to_quantize=None,
    per_channel=False,
    reduce_range=False,
    activation_type=QuantType.QInt8,
    weight_type=QuantType.QInt8,
    nodes_to_quantize=None,
    nodes_to_exclude=None,
    use_external_data_format=False,
    calibrate_method=CalibrationMethod.MinMax,
    extra_options=None,
):
    """
    Given an onnx model and a calibration data reader, create a quantized onnx model and save it to a file.

    Since 1.11, it is recommended to use QuantFormat.QDQ with activation_type = QuantType.QInt8 and
    weight_type = QuantType.QInt8. If the model is targeted at GPU/TRT, symmetric activations and weights
    are required. If the model is targeted at CPU, asymmetric activations and symmetric weights are
    recommended for a balance of performance and accuracy.

    Args:

        model_input: file path of the model or ModelProto to quantize
        model_output: file path of the quantized model
        calibration_data_reader: a calibration data reader. It
            enumerates calibration data and generates inputs for the
            original model.
        quant_format: QuantFormat{QOperator, QDQ}.
            QOperator format quantizes the model with quantized operators directly.
            QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
        activation_type:
            quantization data type of activations. Please refer to
            https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
        calibrate_method:
            Currently supported calibration methods are MinMax and Entropy.
            Please use CalibrationMethod.MinMax or CalibrationMethod.Entropy as options.
        op_types_to_quantize:
            specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
            It quantizes all supported operators by default.
        per_channel: quantize weights per channel
        reduce_range:
            quantize weights with 7 bits. It may improve accuracy for some models running on non-VNNI
            machines, especially in per-channel mode
        weight_type:
            quantization data type of weights. Please refer to
            https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
        nodes_to_quantize:
            List of node names to quantize. When this list is not None, only the nodes in this list
            are quantized.
            example:
            [
                'Conv__224',
                'Conv__252'
            ]
        nodes_to_exclude:
            List of node names to exclude. When this list is not None, the nodes in it are excluded
            from quantization.
        use_external_data_format: option used for large (>2GB) models. Set to False by default.
        extra_options:
            key-value pair dictionary for various options in different cases. Currently used:
                extra.Sigmoid.nnapi = True/False  (Default is False)
                ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
                WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
                EnableSubgraph = True/False : Default is False. If enabled, subgraphs will be quantized.
                    Dynamic mode is currently supported. More will be supported in the future.
                ForceQuantizeNoInputCheck = True/False :
                    By default, some latent operators like MaxPool and Transpose do not quantize if their input is
                    not already quantized. Set to True to force such operators to always quantize their input and
                    thus generate quantized output. This behavior can still be disabled per node via
                    nodes_to_exclude.
                MatMulConstBOnly = True/False:
                    Default is False for static mode. If enabled, only MatMul nodes with a const B input will be
                    quantized.
                AddQDQPairToWeight = True/False :
                    Default is False, which quantizes the floating-point weight and feeds it to a solely inserted
                    DeQuantizeLinear node. If True, the floating-point weight is kept and both
                    QuantizeLinear/DeQuantizeLinear nodes are inserted on the weight.
                OpTypesToExcludeOutputQuantization = list of op types :
                    Default is []. The outputs of ops with these specific op types will not be quantized.
                DedicatedQDQPair = True/False :
                    Default is False. When inserting a QDQ pair, multiple nodes can share a single QDQ pair as
                    their input. If True, an identical, dedicated QDQ pair is created for each node.
                QDQOpTypePerChannelSupportToAxis = dictionary :
                    Default is {}. Sets the channel axis for specific op types, for example: {'MatMul': 1}. It is
                    effective only when per-channel quantization is supported and per_channel is True. If a
                    specific op type supports per-channel quantization but no channel axis is explicitly
                    specified, the default channel axis will be used.
                CalibTensorRangeSymmetric = True/False :
                    Default is False. If enabled, the final range of a tensor during calibration is explicitly
                    made symmetric around the central point "0".
                CalibStridedMinMax = Optional[int] :
                    Default is None. If set to an integer, only a stride's worth of data is used at a time during
                    calculation of the min-max, and all results are merged at the end.
                CalibMovingAverage = True/False :
                    Default is False. If enabled, the moving average of the minimum and maximum values is
                    computed when the calibration method is MinMax.
                CalibMovingAverageConstant = float :
                    Default is 0.01. Constant smoothing factor to use when computing the moving average of the
                    minimum and maximum values. Effective only when the calibration method is MinMax and
                    CalibMovingAverage is set to True.
                CalibMaxIntermediateOutputs = Optional[int] :
                    Default is None. If set to an integer, at most that number of outputs is loaded during
                    calculation of the min-max range of the tensors before computing and merging the range. This
                    produces the same result as computing with None, but is more memory efficient.
                SmoothQuant = True/False :
                    Default is False. If enabled, the SmoothQuant algorithm is applied before quantization to do
                    fake input-channel quantization.
                SmoothQuantAlpha = float :
                    Default is 0.5. Only works if SmoothQuant is True. It controls the difficulty of weight
                    and activation quantization. A larger alpha value can be used on models with more significant
                    activation outliers to migrate more quantization difficulty to the weights.
                SmoothQuantFolding = True/False :
                    Default is True. Only works if SmoothQuant is True. If enabled, Mul ops inserted during
                    SmoothQuant are folded into the previous op if the previous op is foldable.
                UseQDQContribOps = True/False :
                    Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have
                    the `com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and
                    DequantizeLinear contrib op implementations. The contrib op implementations may support
                    features not standardized into the ONNX specification (e.g., 16-bit quantization types).
                MinimumRealRange = float|None :
                    Default is None. If set to a floating-point value, the calculation of the quantization
                    parameters (i.e., scale and zero point) will enforce a minimum range between rmin and rmax.
                    If (rmax - rmin) is less than the specified minimum range, rmax will be set to
                    rmin + MinimumRealRange. This is necessary for EPs like QNN that require a minimum
                    floating-point range when determining quantization parameters.
                TensorQuantOverrides = dictionary :
                    Default is {}. Sets tensor quantization overrides. The key is a tensor name and the value is
                    a list of dictionaries. For per-tensor quantization, the list contains a single dictionary.
                    For per-channel quantization, the list contains a dictionary for each channel in the tensor.
                    Each dictionary contains optional overrides with the following keys and values.
                        'quant_type' = QuantType : The tensor's quantization data type.
                        'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
                        'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
                        'symmetric' = Bool : Whether the tensor should use symmetric quantization. Invalid if
                            `scale` or `zero_point` is also set.
                        'reduce_range' = Bool : Whether the quantization range should be reduced. Invalid if
                            `scale` or `zero_point` is also set.
                        'rmax' = Float : Override the maximum real tensor value in calibration data.
                            Invalid if `scale` or `zero_point` is also set.
                        'rmin' = Float : Override the minimum real tensor value in calibration data.
                            Invalid if `scale` or `zero_point` is also set.
                QDQKeepRemovableActivations = True/False:
                    Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed
                    and will be explicitly represented in the QDQ model. If false, these activations are
                    automatically removed if activations are asymmetrically quantized. Keeping these activations
                    is necessary if optimizations or EP transformations will later remove
                    QuantizeLinear/DequantizeLinear operators from the model.
    """
    if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN:
        if calibrate_method != CalibrationMethod.Distribution:
            raise ValueError("Only the Distribution calibration method is supported for float quantization.")

    extra_options = extra_options or {}
    nodes_to_exclude = nodes_to_exclude or []
    nodes_to_quantize = nodes_to_quantize or []
    op_types_to_quantize = op_types_to_quantize or []
    mode = QuantizationMode.QLinearOps

    if not op_types_to_quantize or len(op_types_to_quantize) == 0:
        q_linear_ops = list(QLinearOpsRegistry.keys())
        qdq_ops = list(QDQRegistry.keys())
        op_types_to_quantize = list(set(q_linear_ops + qdq_ops))

    model = (
        save_and_reload_model_with_shape_infer(model_input)
        if isinstance(model_input, onnx.ModelProto)
        else load_model_with_shape_infer(Path(model_input))
    )

    pre_processed: bool = model_has_pre_process_metadata(model)
    if not pre_processed:
        logging.warning(
            "Please consider running pre-processing before quantization. Refer to example: "
            "https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
            "/cpu/ReadMe.md "
        )

    calib_extra_options_keys = [
        ("CalibTensorRangeSymmetric", "symmetric"),
        ("CalibMovingAverage", "moving_average"),
        ("CalibMovingAverageConstant", "averaging_constant"),
        ("CalibMaxIntermediateOutputs", "max_intermediate_outputs"),
    ]
    calib_extra_options = {
        key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options
    }

    if extra_options.get("SmoothQuant", False):
        import importlib

        try:
            importlib.import_module("neural_compressor.adaptor.ox_utils.smooth_quant")
        except Exception as e:
            logging.error(f"{e}.")
            raise RuntimeError("neural-compressor is not correctly installed. Please check your environment.") from e

        import copy

        from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant

        def inc_dataloader():
            data_reader = copy.deepcopy(calibration_data_reader)
            for data in data_reader:
                yield data, None

        orig_nodes = [i.name for i in model.graph.node]
        dataloader = inc_dataloader()
        sq = ORTSmoothQuant(model_input, dataloader, reduce_range)
        del dataloader
        model = sq.transform(extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True))
        sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.")
        model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix()
        model.save(model_input)
        nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes])
        model = load_model_with_shape_infer(Path(model_input))  # use smooth quant model for calibration

    with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
        if isinstance(model_input, onnx.ModelProto):
            output_path = str(Path(quant_tmp_dir) / "model_input.onnx")
            onnx.save_model(
                model_input,
                output_path,
                save_as_external_data=True,
            )
            model_input = output_path

        calibrator = create_calibrator(
            Path(model_input),
            op_types_to_quantize,
            augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
            calibrate_method=calibrate_method,
            use_external_data_format=use_external_data_format,
            extra_options=calib_extra_options,
        )

        stride = extra_options.get("CalibStridedMinMax", None)
        if stride:
            total_data_size = len(calibration_data_reader)
            if total_data_size % stride != 0:
                raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).")

            for start in range(0, total_data_size, stride):
                end_index = start + stride
                calibration_data_reader.set_range(start_index=start, end_index=end_index)
                calibrator.collect_data(calibration_data_reader)
        else:
            calibrator.collect_data(calibration_data_reader)
        tensors_range = calibrator.compute_data()
        if not isinstance(tensors_range, TensorsData):
            raise TypeError(
                f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
            )
        del calibrator

    check_static_quant_arguments(quant_format, activation_type, weight_type)

    if quant_format is QuantFormat.QOperator:
        quantizer = ONNXQuantizer(
            model,
            per_channel,
            reduce_range,
            mode,
            True,  # static
            weight_type,
            activation_type,
            tensors_range,
            nodes_to_quantize,
            nodes_to_exclude,
            op_types_to_quantize,
            extra_options,
        )
    else:
        quantizer = QDQQuantizer(
            model,
            per_channel,
            reduce_range,
            weight_type,
            activation_type,
            tensors_range,
            nodes_to_quantize,
            nodes_to_exclude,
            op_types_to_quantize,
            extra_options,
        )

    quantizer.quantize_model()
    quantizer.model.save_model_to_file(model_output, use_external_data_format)
    if not pre_processed:
        logging.warning(
            "Please consider pre-processing before quantization. See "
            "https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
            "/cpu/ReadMe.md "
        )

    if extra_options.get("SmoothQuant", False):
        sq_path.cleanup()
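Putting quantize_static to work, a sketch with hypothetical paths; ToyDataReader and samples are from the earlier sketch. Running the package's pre-processing step first (see preprocess.py in the manifest above) avoids the warning emitted by this function:

    from onnxruntime.quantization.quant_utils import QuantFormat, QuantType
    from onnxruntime.quantization.quantize import quantize_static

    # Recommended first: python -m onnxruntime.quantization.preprocess \
    #     --input model.onnx --output model_infer.onnx
    quantize_static(
        "model_infer.onnx",
        "model_int8_qdq.onnx",
        ToyDataReader(samples),
        quant_format=QuantFormat.QDQ,
        activation_type=QuantType.QInt8,
        weight_type=QuantType.QInt8,
        per_channel=True,
        extra_options={"ActivationSymmetric": False, "WeightSymmetric": True},
    )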
def quantize_dynamic(
    model_input: Union[str, Path, onnx.ModelProto],
    model_output: Union[str, Path],
    op_types_to_quantize=None,
    per_channel=False,
    reduce_range=False,
    weight_type=QuantType.QInt8,
    nodes_to_quantize=None,
    nodes_to_exclude=None,
    use_external_data_format=False,
    extra_options=None,
):
    """Given an onnx model, create a quantized onnx model and save it to a file.

    Args:
        model_input: file path of the model or ModelProto to quantize
        model_output: file path of the quantized model
        op_types_to_quantize:
            specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
            It quantizes all supported operators by default.
        per_channel: quantize weights per channel
        reduce_range:
            quantize weights with 7 bits. It may improve accuracy for some models running on non-VNNI
            machines, especially in per-channel mode
        weight_type:
            quantization data type of weights. Please refer to
            https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
        nodes_to_quantize:
            List of node names to quantize. When this list is not None, only the nodes in this list
            are quantized.
            example:
            [
                'Conv__224',
                'Conv__252'
            ]
        nodes_to_exclude:
            List of node names to exclude. When this list is not None, the nodes in it are excluded
            from quantization.
        use_external_data_format: option used for large (>2GB) models. Set to False by default.
        extra_options:
            key-value pair dictionary for various options in different cases. Currently used:
                extra.Sigmoid.nnapi = True/False  (Default is False)
                ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
                WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
                EnableSubgraph = True/False :
                    Default is False. If enabled, subgraphs will be quantized. Dynamic mode is currently supported.
                    More will be supported in the future.
                ForceQuantizeNoInputCheck = True/False :
                    By default, some latent operators like MaxPool and Transpose do not quantize if their input is
                    not already quantized. Set to True to force such operators to always quantize their input and
                    thus generate quantized output. This behavior can still be disabled per node via
                    nodes_to_exclude.
                MatMulConstBOnly = True/False:
                    Default is True for dynamic mode. If enabled, only MatMul nodes with a const B input will be
                    quantized.
    """
    extra_options = extra_options or {}
    nodes_to_exclude = nodes_to_exclude or []
    nodes_to_quantize = nodes_to_quantize or []
    op_types_to_quantize = op_types_to_quantize or []

    mode = QuantizationMode.IntegerOps

    if not op_types_to_quantize or len(op_types_to_quantize) == 0:
        op_types_to_quantize = list(IntegerOpsRegistry.keys())

    model = (
        save_and_reload_model_with_shape_infer(model_input)
        if isinstance(model_input, onnx.ModelProto)
        else load_model_with_shape_infer(Path(model_input))
    )

    pre_processed: bool = model_has_pre_process_metadata(model)
    if not pre_processed:
        logging.warning(
            "Please consider running pre-processing before quantization. Refer to example: "
            "https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
            "/cpu/ReadMe.md "
        )

    if "MatMulConstBOnly" not in extra_options:
        extra_options["MatMulConstBOnly"] = True

    quantizer = ONNXQuantizer(
        model,
        per_channel,
        reduce_range,
        mode,
        False,  # static
        weight_type,
        QuantType.QUInt8,  # dynamic activation only supports uint8
        None,
        nodes_to_quantize,
        nodes_to_exclude,
        op_types_to_quantize,
        extra_options,
    )

    quantizer.quantize_model()
    quantizer.model.save_model_to_file(model_output, use_external_data_format)
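The dynamic entry point is a one-liner by comparison (hypothetical paths):

    from onnxruntime.quantization.quant_utils import QuantType
    from onnxruntime.quantization.quantize import quantize_dynamic

    # Only the weights are quantized offline; MatMulConstBOnly defaults to True here.
    quantize_dynamic("model.onnx", "model_int8_dynamic.onnx", weight_type=QuantType.QInt8)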
def quantize(
    model_input: Union[str, Path, onnx.ModelProto],
    model_output: Union[str, Path],
    quant_config: QuantConfig,
):
    """Quantize a model with a QuantConfig.

    Args:
        model_input (str | Path | ModelProto): Path to the model or ModelProto to quantize.
        model_output (str | Path): Path to save the quantized model.
        quant_config (QuantConfig | WeightOnlyQuantConfig): Quantization configuration.
    """
    if isinstance(quant_config, StaticQuantConfig):
        quantize_static(
            model_input,
            model_output,
            quant_config.calibration_data_reader,
            calibrate_method=quant_config.calibrate_method,
            quant_format=quant_config.quant_format,
            activation_type=quant_config.activation_type,
            weight_type=quant_config.weight_type,
            op_types_to_quantize=quant_config.op_types_to_quantize,
            nodes_to_quantize=quant_config.nodes_to_quantize,
            nodes_to_exclude=quant_config.nodes_to_exclude,
            per_channel=quant_config.per_channel,
            reduce_range=quant_config.reduce_range,
            use_external_data_format=quant_config.use_external_data_format,
            extra_options=quant_config.extra_options,
        )

    elif isinstance(quant_config, DynamicQuantConfig):
        quantize_dynamic(
            model_input,
            model_output,
            weight_type=quant_config.weight_type,
            op_types_to_quantize=quant_config.op_types_to_quantize,
            nodes_to_quantize=quant_config.nodes_to_quantize,
            nodes_to_exclude=quant_config.nodes_to_exclude,
            per_channel=quant_config.per_channel,
            reduce_range=quant_config.reduce_range,
            use_external_data_format=quant_config.use_external_data_format,
            extra_options=quant_config.extra_options,
        )
    else:
        # The training package doesn't have quantize_matmul_4bits; avoid a global import.
        from .matmul_4bits_quantizer import MatMul4BitsQuantizer, WeightOnlyQuantConfig

        if isinstance(quant_config, WeightOnlyQuantConfig):
            model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load(model_input)
            quant = MatMul4BitsQuantizer(model, algo_config=quant_config)
            quant.process()
            quant.model.save_model_to_file(model_output, True)
        else:
            raise TypeError(
                "Invalid quantization config type, it must be either StaticQuantConfig, "
                "DynamicQuantConfig, or WeightOnlyQuantConfig."
            )
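Finally, quantize() dispatches on the config type, so the configs sketched earlier can drive either path (hypothetical paths; static_config and dynamic_config come from the sketches above):

    from onnxruntime.quantization.quantize import quantize

    quantize("model_infer.onnx", "model_static_int8.onnx", static_config)
    quantize("model.onnx", "model_dynamic_int8.onnx", dynamic_config)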