mindstudio-probe 8.3.2__py3-none-any.whl → 26.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/METADATA +26 -14
- mindstudio_probe-26.0.0a1.dist-info/RECORD +498 -0
- {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/WHEEL +1 -1
- mindstudio_probe-26.0.0a1.dist-info/entry_points.txt +5 -0
- mindstudio_probe-26.0.0a1.dist-info/licenses/LICENSE +124 -0
- mindstudio_probe-26.0.0a1.dist-info/top_level.txt +2 -0
- msprobe/__init__.py +12 -13
- msprobe/config.json +9 -31
- msprobe/core/__init__.py +12 -11
- msprobe/core/acc_check/acc_check_cli.py +145 -0
- msprobe/core/common/const.py +97 -38
- msprobe/core/common/db_manager.py +133 -12
- msprobe/core/common/decorator.py +12 -11
- msprobe/core/common/exceptions.py +12 -11
- msprobe/core/common/file_utils.py +101 -25
- msprobe/core/common/framework_adapter.py +36 -25
- msprobe/core/common/global_lock.py +12 -11
- msprobe/core/common/inplace_op_checker.py +12 -11
- msprobe/core/common/log.py +22 -11
- msprobe/core/common/megatron_utils.py +566 -11
- msprobe/core/common/parallel_state.py +12 -11
- msprobe/core/common/runtime.py +12 -11
- msprobe/core/common/utils.py +41 -41
- msprobe/core/compare/acc_compare.py +361 -104
- msprobe/core/compare/atb_data_compare.py +422 -0
- msprobe/core/compare/auto_compare.py +134 -0
- msprobe/core/compare/check.py +14 -17
- msprobe/core/compare/compare_cli.py +72 -149
- msprobe/core/compare/config.py +12 -13
- msprobe/core/compare/diff_analyze/first_diff_analyze.py +28 -15
- msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
- msprobe/core/compare/find_first/analyzer.py +18 -18
- msprobe/core/compare/find_first/graph.py +12 -11
- msprobe/core/compare/find_first/utils.py +13 -12
- msprobe/core/compare/indicator_analysis/__init__.py +15 -0
- msprobe/core/compare/indicator_analysis/algorithm.py +363 -0
- msprobe/core/compare/indicator_analysis/api_data.py +141 -0
- msprobe/core/compare/indicator_analysis/calculator.py +181 -0
- msprobe/core/compare/indicator_analysis/utils.py +116 -0
- msprobe/core/compare/layer_mapping/__init__.py +12 -11
- msprobe/core/compare/layer_mapping/data_scope_parser.py +20 -11
- msprobe/core/compare/layer_mapping/layer_mapping.py +14 -13
- msprobe/core/compare/layer_mapping/postprocess_pass.py +13 -11
- msprobe/core/compare/merge_result/merge_result.py +12 -11
- msprobe/core/compare/merge_result/merge_result_cli.py +12 -11
- msprobe/core/compare/merge_result/utils.py +12 -11
- msprobe/core/compare/multiprocessing_compute.py +13 -14
- msprobe/core/compare/npy_compare.py +13 -11
- msprobe/core/compare/offline_data_compare.py +160 -0
- msprobe/core/compare/stats_diff_calc.py +39 -0
- msprobe/core/compare/torchair_acc_cmp.py +764 -0
- msprobe/core/compare/torchair_cmp_utils.py +338 -0
- msprobe/core/compare/utils.py +140 -49
- msprobe/core/config_check/__init__.py +12 -11
- msprobe/core/config_check/checkers/__init__.py +12 -11
- msprobe/core/config_check/checkers/base_checker.py +15 -14
- msprobe/core/config_check/checkers/dataset_checker.py +13 -12
- msprobe/core/config_check/checkers/env_args_checker.py +13 -12
- msprobe/core/config_check/checkers/hyperparameter_checker.py +16 -15
- msprobe/core/config_check/checkers/pip_checker.py +15 -15
- msprobe/core/config_check/checkers/random_checker.py +13 -12
- msprobe/core/config_check/checkers/weights_checker.py +14 -12
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +13 -17
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +13 -12
- msprobe/core/config_check/ckpt_compare/metrics.py +12 -11
- msprobe/core/config_check/config_check_cli.py +18 -17
- msprobe/core/config_check/config_checker.py +16 -14
- msprobe/core/config_check/resource/dependency.yaml +15 -12
- msprobe/core/config_check/resource/env.yaml +12 -11
- msprobe/core/config_check/utils/hyperparameter_parser.py +12 -11
- msprobe/core/config_check/utils/utils.py +12 -11
- msprobe/core/{data_dump → dump/api_dump}/api_registry.py +12 -11
- msprobe/core/{common_config.py → dump/common_config.py} +13 -24
- msprobe/core/dump/data_dump/data_collector.py +257 -0
- msprobe/core/{data_dump → dump/data_dump}/data_processor/base.py +45 -36
- msprobe/core/{data_dump → dump/data_dump}/data_processor/factory.py +33 -25
- msprobe/core/{data_dump → dump/data_dump}/data_processor/mindspore_processor.py +37 -113
- msprobe/core/{data_dump → dump/data_dump}/data_processor/pytorch_processor.py +364 -131
- msprobe/core/{data_dump → dump/data_dump}/json_writer.py +24 -31
- msprobe/core/{data_dump → dump/data_dump}/scope.py +12 -13
- msprobe/core/{debugger → dump/debugger}/precision_debugger.py +15 -23
- msprobe/core/dump/dump2db/db_utils.py +215 -0
- msprobe/core/dump/dump2db/dump2db.py +409 -0
- msprobe/core/{hook_manager.py → dump/hook_manager.py} +38 -87
- msprobe/core/dump/kernel_dump/kernel_config.py +34 -0
- msprobe/core/{service.py → dump/service.py} +43 -27
- msprobe/core/install_deps/install_deps.py +51 -0
- msprobe/core/monitor/anomaly_processor.py +13 -11
- msprobe/core/monitor/csv2db.py +73 -93
- msprobe/core/monitor/db_utils.py +140 -205
- msprobe/core/monitor/utils.py +18 -17
- msprobe/core/monitor_v2/__init__.py +20 -0
- msprobe/core/monitor_v2/base.py +83 -0
- msprobe/core/monitor_v2/cc.py +287 -0
- msprobe/core/monitor_v2/factory.py +81 -0
- msprobe/core/monitor_v2/module.py +201 -0
- msprobe/core/monitor_v2/optimizer.py +245 -0
- msprobe/core/monitor_v2/param.py +154 -0
- msprobe/core/monitor_v2/trainer.py +326 -0
- msprobe/core/monitor_v2/utils.py +122 -0
- msprobe/core/monitor_v2/weight_grad.py +419 -0
- msprobe/core/monitor_v2/writer.py +162 -0
- msprobe/core/overflow_check/abnormal_scene.py +12 -11
- msprobe/core/overflow_check/api_info.py +12 -11
- msprobe/core/overflow_check/checker.py +12 -11
- msprobe/core/overflow_check/filter.py +13 -11
- msprobe/core/overflow_check/level.py +12 -11
- msprobe/core/overflow_check/utils.py +12 -11
- msprobe/core/single_save/single_comparator.py +12 -11
- msprobe/core/single_save/single_saver.py +12 -11
- msprobe/infer/__init__.py +16 -0
- msprobe/infer/offline/__init__.py +16 -0
- msprobe/infer/offline/compare/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/adapter_cli/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/adapter_cli/args_adapter.py +46 -0
- msprobe/infer/offline/compare/msquickcmp/atc/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/atc/atc_utils.py +98 -0
- msprobe/infer/offline/compare/msquickcmp/cmp_process.py +328 -0
- msprobe/infer/offline/compare/msquickcmp/common/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/common/args_check.py +112 -0
- msprobe/infer/offline/compare/msquickcmp/common/convert.py +74 -0
- msprobe/infer/offline/compare/msquickcmp/common/dump_data.py +121 -0
- msprobe/infer/offline/compare/msquickcmp/common/dynamic_argument_bean.py +39 -0
- msprobe/infer/offline/compare/msquickcmp/common/utils.py +669 -0
- msprobe/infer/offline/compare/msquickcmp/config.ini +6 -0
- msprobe/infer/offline/compare/msquickcmp/dump/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/dump/args_adapter.py +50 -0
- msprobe/infer/offline/compare/msquickcmp/dump/dump_process.py +91 -0
- msprobe/infer/offline/compare/msquickcmp/install_aclruntime_aisbench.sh +180 -0
- msprobe/infer/offline/compare/msquickcmp/main.py +199 -0
- msprobe/infer/offline/compare/msquickcmp/net_compare/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/net_compare/net_compare.py +277 -0
- msprobe/infer/offline/compare/msquickcmp/npu/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/npu/npu_dump_data.py +558 -0
- msprobe/infer/offline/compare/msquickcmp/npu/om_parser.py +416 -0
- msprobe/infer/offline/compare/msquickcmp/onnx_model/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/onnx_model/onnx_dump_data.py +374 -0
- msprobe/infer/utils/__init__.py +15 -0
- msprobe/infer/utils/acc_cmp.py +94 -0
- msprobe/infer/utils/check/__init__.py +37 -0
- msprobe/infer/utils/check/args_checker.py +35 -0
- msprobe/infer/utils/check/checker.py +227 -0
- msprobe/infer/utils/check/dict_checker.py +78 -0
- msprobe/infer/utils/check/func_wrapper.py +96 -0
- msprobe/infer/utils/check/list_checker.py +56 -0
- msprobe/infer/utils/check/number_checker.py +64 -0
- msprobe/infer/utils/check/obj_checker.py +41 -0
- msprobe/infer/utils/check/path_checker.py +249 -0
- msprobe/infer/utils/check/rule.py +126 -0
- msprobe/infer/utils/check/string_checker.py +66 -0
- msprobe/infer/utils/cmp_algorithm.py +261 -0
- msprobe/infer/utils/constants.py +112 -0
- msprobe/infer/utils/file_open_check.py +337 -0
- msprobe/infer/utils/util.py +177 -0
- msprobe/mindspore/__init__.py +14 -13
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +14 -13
- msprobe/mindspore/api_accuracy_checker/api_info.py +12 -11
- msprobe/mindspore/api_accuracy_checker/api_runner.py +12 -11
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +12 -11
- msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +12 -11
- msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +12 -11
- msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +12 -11
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +15 -14
- msprobe/mindspore/api_accuracy_checker/compute_element.py +12 -11
- msprobe/mindspore/api_accuracy_checker/data_manager.py +13 -11
- msprobe/mindspore/api_accuracy_checker/main.py +12 -11
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +14 -12
- msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +13 -11
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +12 -11
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +12 -11
- msprobe/mindspore/api_accuracy_checker/utils.py +12 -11
- msprobe/mindspore/common/const.py +15 -74
- msprobe/mindspore/common/log.py +12 -11
- msprobe/mindspore/common/utils.py +30 -15
- msprobe/mindspore/compare/common_dir_compare.py +21 -23
- msprobe/mindspore/compare/distributed_compare.py +18 -16
- msprobe/mindspore/compare/ms_compare.py +14 -14
- msprobe/mindspore/compare/ms_graph_compare.py +26 -20
- msprobe/mindspore/compare/utils.py +14 -12
- msprobe/mindspore/{cell_processor.py → dump/cell_processor.py} +15 -14
- msprobe/mindspore/{debugger → dump/debugger}/debugger_config.py +12 -30
- msprobe/mindspore/{debugger → dump/debugger}/precision_debugger.py +43 -45
- msprobe/mindspore/dump/{cell_dump_process.py → dump_processor/cell_dump_process.py} +31 -17
- msprobe/mindspore/dump/{cell_dump_with_insert_gradient.py → dump_processor/cell_dump_with_insert_gradient.py} +18 -14
- msprobe/mindspore/dump/{dump_tool_factory.py → dump_processor/dump_tool_factory.py} +16 -15
- msprobe/mindspore/dump/{graph_mode_cell_dump.py → dump_processor/graph_mode_cell_dump.py} +16 -15
- msprobe/mindspore/dump/{graph_tensor_dump.py → dump_processor/graph_tensor_dump.py} +134 -133
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/api_register.py +15 -14
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/hook_cell.py +12 -11
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/ms_hook_manager.py +47 -20
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/primitive_hooks.py +14 -13
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/support_wrap_ops.yaml +13 -11
- msprobe/mindspore/dump/{jit_dump.py → dump_processor/jit_dump.py} +14 -13
- msprobe/mindspore/dump/{kernel_graph_dump.py → dump_processor/kernel_graph_dump.py} +13 -12
- msprobe/mindspore/dump/{kernel_kbyk_dump.py → dump_processor/kernel_kbyk_dump.py} +13 -12
- msprobe/mindspore/{exception_dump → dump/exception_dump}/exception_dump_tool_factory.py +14 -13
- msprobe/mindspore/{exception_dump → dump/exception_dump}/kernel_graph_exception_dump.py +13 -12
- msprobe/mindspore/{mindspore_service.py → dump/mindspore_service.py} +18 -17
- msprobe/mindspore/dump/mindtorch/__init__.py +19 -0
- msprobe/mindspore/dump/ms_config.py +105 -0
- msprobe/mindspore/{overflow_check → dump/overflow_check}/kernel_graph_overflow_check.py +13 -12
- msprobe/mindspore/{overflow_check → dump/overflow_check}/overflow_check_tool_factory.py +14 -13
- msprobe/mindspore/dump/task_handler_factory.py +43 -0
- msprobe/mindspore/monitor/common_func.py +12 -11
- msprobe/mindspore/monitor/data_writers.py +12 -11
- msprobe/mindspore/monitor/distributed/wrap_distributed.py +93 -39
- msprobe/mindspore/monitor/features.py +12 -11
- msprobe/mindspore/monitor/module_hook.py +19 -22
- msprobe/mindspore/monitor/optimizer_collect.py +29 -25
- msprobe/mindspore/monitor/utils.py +13 -11
- msprobe/msaccucmp/advisor/__init__.py +16 -0
- msprobe/msaccucmp/advisor/advisor_const.py +65 -0
- msprobe/msaccucmp/advisor/advisor_result.py +73 -0
- msprobe/msaccucmp/advisor/compare_advisor.py +99 -0
- msprobe/msaccucmp/advisor/input_advisor.py +66 -0
- msprobe/msaccucmp/advisor/node_advisor.py +68 -0
- msprobe/msaccucmp/advisor/overflow_advisor.py +58 -0
- msprobe/msaccucmp/algorithm_manager/__init__.py +16 -0
- msprobe/msaccucmp/algorithm_manager/algorithm_manager.py +464 -0
- msprobe/msaccucmp/algorithm_manager/algorithm_parameter.py +42 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_AccumulatedRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_CosineSimilarity.py +58 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_KullbackLeiblerDivergence.py +84 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxAbsoluteError.py +41 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanAbsoluteError.py +41 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RelativeEuclideanDistance.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RootMeanSquareError.py +40 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_StandardDeviation.py +47 -0
- msprobe/msaccucmp/cmp_utils/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/common.py +113 -0
- msprobe/msaccucmp/cmp_utils/constant/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/constant/compare_error.py +81 -0
- msprobe/msaccucmp/cmp_utils/constant/const_manager.py +530 -0
- msprobe/msaccucmp/cmp_utils/file_utils.py +497 -0
- msprobe/msaccucmp/cmp_utils/log.py +257 -0
- msprobe/msaccucmp/cmp_utils/multi_process/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/multi_process/multi_convert_process.py +140 -0
- msprobe/msaccucmp/cmp_utils/multi_process/progress.py +78 -0
- msprobe/msaccucmp/cmp_utils/path_check.py +274 -0
- msprobe/msaccucmp/cmp_utils/reg_manager.py +98 -0
- msprobe/msaccucmp/cmp_utils/tlv_parse.py +279 -0
- msprobe/msaccucmp/cmp_utils/utils.py +356 -0
- msprobe/msaccucmp/cmp_utils/utils_type.py +63 -0
- msprobe/msaccucmp/compare_vector.py +48 -0
- msprobe/msaccucmp/conversion/__init__.py +16 -0
- msprobe/msaccucmp/conversion/data_conversion.py +277 -0
- msprobe/msaccucmp/conversion/dtype_conversion.py +99 -0
- msprobe/msaccucmp/conversion/shape_format_conversion.py +477 -0
- msprobe/msaccucmp/conversion/tensor_conversion.py +369 -0
- msprobe/msaccucmp/dump_data_conversion.py +46 -0
- msprobe/msaccucmp/dump_parse/__init__.py +16 -0
- msprobe/msaccucmp/dump_parse/big_dump_data.py +317 -0
- msprobe/msaccucmp/dump_parse/dump.py +423 -0
- msprobe/msaccucmp/dump_parse/dump_data_object.py +322 -0
- msprobe/msaccucmp/dump_parse/dump_data_parser.py +436 -0
- msprobe/msaccucmp/dump_parse/dump_utils.py +246 -0
- msprobe/msaccucmp/dump_parse/ffts_parser.py +137 -0
- msprobe/msaccucmp/dump_parse/mapping.py +62 -0
- msprobe/msaccucmp/dump_parse/nano_dump_data.py +392 -0
- msprobe/msaccucmp/dump_parse/proto_dump_data.py +308 -0
- msprobe/msaccucmp/dump_parser.py +90 -0
- msprobe/msaccucmp/format_manager/__init__.py +16 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NCHW.py +53 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_ND.py +52 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NHWC.py +53 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_HWCN.py +47 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_NCHW.py +47 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_FRACTAL_Z.py +89 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NCHW.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NHWC.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_HWCN.py +43 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NCHW.py +48 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NHWC.py +43 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_FRACTAL_Z.py +87 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_NHWC.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_NCDHW.py +48 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_ND.py +44 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_FRACTAL_Z.py +87 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_HWCN.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_NCHW.py +37 -0
- msprobe/msaccucmp/format_manager/format_manager.py +307 -0
- msprobe/msaccucmp/inplace_layer_process.py +186 -0
- msprobe/msaccucmp/msaccucmp.py +532 -0
- msprobe/msaccucmp/mscmp_advisor.py +128 -0
- msprobe/msaccucmp/overflow/__init__.py +16 -0
- msprobe/msaccucmp/overflow/overflow_analyse.py +305 -0
- msprobe/msaccucmp/overflow/overflow_detection.py +143 -0
- msprobe/msaccucmp/pytorch_cmp/__init__.py +16 -0
- msprobe/msaccucmp/pytorch_cmp/compare_pytorch.py +389 -0
- msprobe/msaccucmp/pytorch_cmp/hdf5_parser.py +377 -0
- msprobe/msaccucmp/pytorch_cmp/pytorch_dump_data.py +461 -0
- msprobe/msaccucmp/shape_conversion.py +41 -0
- msprobe/msaccucmp/vector_cmp/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/batch_compare.py +197 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/compare_detail.py +245 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/detail.py +182 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/detail_writer.py +580 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_fusion_op.py +588 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_npu_vs_npu.py +339 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_result.py +326 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_rule.py +156 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_op.py +204 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_rule_parser.py +635 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/quant_filter.py +187 -0
- msprobe/msaccucmp/vector_cmp/range_manager/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/range_manager/range_manager.py +100 -0
- msprobe/msaccucmp/vector_cmp/range_manager/range_mode.py +94 -0
- msprobe/msaccucmp/vector_cmp/range_manager/select_mode.py +86 -0
- msprobe/msaccucmp/vector_cmp/vector_comparison.py +535 -0
- msprobe/msprobe.py +101 -130
- msprobe/overflow_check/__init__.py +15 -0
- msprobe/{nan_analyze → overflow_check}/analyzer.py +38 -27
- msprobe/{nan_analyze → overflow_check}/graph.py +30 -27
- msprobe/{nan_analyze → overflow_check}/utils.py +15 -14
- msprobe/pytorch/__init__.py +20 -14
- msprobe/pytorch/aclgraph_dump/__init__.py +45 -0
- msprobe/pytorch/aclgraph_dump/_meta.py +26 -0
- msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut.py → acc_check/acc_check.py} +50 -45
- msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut_utils.py → acc_check/acc_check_utils.py} +201 -30
- msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/data_generate.py +56 -16
- msprobe/pytorch/api_accuracy_checker/{run_ut/multi_run_ut.py → acc_check/multi_acc_check.py} +32 -47
- msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/run_overflow_check.py +19 -18
- msprobe/pytorch/api_accuracy_checker/common/config.py +22 -20
- msprobe/pytorch/api_accuracy_checker/common/utils.py +72 -13
- msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -11
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +23 -14
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +45 -32
- msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +12 -11
- msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +14 -12
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +14 -12
- msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +21 -19
- msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +14 -13
- msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +60 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +27 -16
- msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +13 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +39 -18
- msprobe/pytorch/bench_functions/__init__.py +12 -11
- msprobe/pytorch/bench_functions/apply_adam.py +12 -11
- msprobe/pytorch/bench_functions/apply_adam_w.py +12 -11
- msprobe/pytorch/bench_functions/confusion_transpose.py +12 -11
- msprobe/pytorch/bench_functions/fast_gelu.py +12 -11
- msprobe/pytorch/bench_functions/group_norm_silu.py +12 -11
- msprobe/pytorch/bench_functions/layer_norm_eval.py +12 -11
- msprobe/pytorch/bench_functions/linear.py +12 -11
- msprobe/pytorch/bench_functions/matmul_backward.py +12 -11
- msprobe/pytorch/bench_functions/mish.py +12 -11
- msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +12 -11
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +12 -11
- msprobe/pytorch/bench_functions/rms_norm.py +12 -11
- msprobe/pytorch/bench_functions/rotary_mul.py +12 -11
- msprobe/pytorch/bench_functions/scaled_mask_softmax.py +12 -11
- msprobe/pytorch/bench_functions/sort_v2.py +12 -11
- msprobe/pytorch/bench_functions/swiglu.py +12 -11
- msprobe/pytorch/common/__init__.py +12 -11
- msprobe/pytorch/common/log.py +12 -11
- msprobe/pytorch/common/parse_json.py +12 -11
- msprobe/pytorch/common/utils.py +52 -19
- msprobe/pytorch/compare/distributed_compare.py +13 -13
- msprobe/pytorch/compare/match.py +12 -11
- msprobe/pytorch/compare/pt_compare.py +14 -20
- msprobe/pytorch/compare/pt_diff_analyze.py +12 -11
- msprobe/pytorch/compare/utils.py +12 -11
- msprobe/pytorch/{hook_module → dump/api_dump}/api_register.py +18 -16
- msprobe/pytorch/{hook_module → dump/api_dump}/hook_module.py +14 -13
- msprobe/pytorch/{hook_module → dump/api_dump}/pt_hook_manager.py +68 -23
- msprobe/pytorch/{hook_module → dump/api_dump}/register_optimizer_hook.py +13 -11
- msprobe/pytorch/{hook_module → dump/api_dump}/script_wrapper.py +17 -14
- msprobe/pytorch/{hook_module → dump/api_dump}/utils.py +12 -11
- msprobe/pytorch/{debugger → dump/debugger}/debugger_config.py +23 -38
- msprobe/pytorch/dump/debugger/precision_debugger.py +130 -0
- msprobe/pytorch/{function_factory.py → dump/function_factory.py} +12 -11
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +17 -13
- msprobe/pytorch/dump/module_dump/module_dump.py +16 -15
- msprobe/pytorch/dump/module_dump/{module_processer.py → module_processor.py} +54 -42
- msprobe/pytorch/dump/pt_config.py +128 -0
- msprobe/pytorch/{pytorch_service.py → dump/pytorch_service.py} +22 -21
- msprobe/pytorch/monitor/csv2tb.py +13 -11
- msprobe/pytorch/monitor/data_writers.py +13 -11
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +13 -11
- msprobe/pytorch/monitor/features.py +12 -11
- msprobe/pytorch/monitor/module_hook.py +67 -59
- msprobe/pytorch/monitor/module_metric.py +13 -11
- msprobe/pytorch/monitor/optimizer_collect.py +37 -35
- msprobe/pytorch/monitor/utils.py +13 -11
- msprobe/pytorch/monitor/visualizer.py +12 -11
- msprobe/pytorch/torchair_dump/__init__.py +17 -0
- msprobe/pytorch/torchair_dump/torchair_dump.py +114 -0
- msprobe/scripts/atb/config_example.json +10 -0
- msprobe/scripts/atb/load_atb_probe.sh +101 -0
- msprobe/scripts/atb/unload_atb_probe.sh +27 -0
- msprobe/scripts/build_msaccucmp.sh +186 -0
- msprobe/scripts/conf/help.info +6 -0
- msprobe/scripts/conf/version.info +3 -0
- msprobe/scripts/run_script/common.sh +538 -0
- msprobe/scripts/run_script/main_msaccucmp.sh +232 -0
- msprobe/visualization/__init__.py +12 -11
- msprobe/visualization/builder/__init__.py +12 -11
- msprobe/visualization/builder/graph_builder.py +45 -30
- msprobe/visualization/builder/graph_merger.py +53 -32
- msprobe/visualization/builder/msprobe_adapter.py +34 -44
- msprobe/visualization/compare/__init__.py +12 -11
- msprobe/visualization/compare/graph_comparator.py +63 -51
- msprobe/visualization/compare/mode_adapter.py +28 -113
- msprobe/visualization/db_utils.py +133 -22
- msprobe/visualization/graph/__init__.py +12 -11
- msprobe/visualization/graph/base_node.py +15 -27
- msprobe/visualization/graph/distributed_analyzer.py +97 -40
- msprobe/visualization/graph/graph.py +14 -16
- msprobe/visualization/graph/node_colors.py +34 -31
- msprobe/visualization/graph/node_op.py +12 -11
- msprobe/visualization/graph_service.py +580 -205
- msprobe/visualization/utils.py +278 -31
- tb_graph_ascend/secure_build.py +175 -0
- tb_graph_ascend/server/__init__.py +15 -0
- tb_graph_ascend/server/app/__init__.py +15 -0
- tb_graph_ascend/server/app/model/__init__.py +15 -0
- tb_graph_ascend/server/app/model/hierarchy.py +348 -0
- tb_graph_ascend/server/app/model/layout_hierarchy_model.py +69 -0
- tb_graph_ascend/server/app/model/match_nodes_model.py +573 -0
- tb_graph_ascend/server/app/repositories/__init__.py +15 -0
- tb_graph_ascend/server/app/repositories/graph_repo_base.py +32 -0
- tb_graph_ascend/server/app/repositories/graph_repo_db.py +879 -0
- tb_graph_ascend/server/app/repositories/graph_repo_vis.py +83 -0
- tb_graph_ascend/server/app/service/__init__.py +18 -0
- tb_graph_ascend/server/app/service/graph_service_base.py +158 -0
- tb_graph_ascend/server/app/service/graph_service_db.py +438 -0
- tb_graph_ascend/server/app/service/graph_service_factory.py +54 -0
- tb_graph_ascend/server/app/service/graph_service_vis.py +480 -0
- tb_graph_ascend/server/app/utils/__init__.py +15 -0
- tb_graph_ascend/server/app/utils/constant.py +80 -0
- tb_graph_ascend/server/app/utils/file_check_wrapper.py +46 -0
- tb_graph_ascend/server/app/utils/global_state.py +95 -0
- tb_graph_ascend/server/app/utils/graph_utils.py +661 -0
- tb_graph_ascend/server/app/utils/i18n.py +153 -0
- tb_graph_ascend/server/app/utils/request_method.py +46 -0
- tb_graph_ascend/server/app/views/__init__.py +15 -0
- tb_graph_ascend/server/app/views/graph_views.py +304 -0
- tb_graph_ascend/server/plugin.py +108 -0
- tb_graph_ascend/server/static/index.html +9250 -0
- tb_graph_ascend/server/static/index.js +21 -0
- tb_graph_ascend/setup.py +57 -0
- mindstudio_probe-8.3.2.dist-info/LICENSE +0 -201
- mindstudio_probe-8.3.2.dist-info/RECORD +0 -491
- mindstudio_probe-8.3.2.dist-info/entry_points.txt +0 -2
- mindstudio_probe-8.3.2.dist-info/top_level.txt +0 -1
- msprobe/CMakeLists.txt +0 -5
- msprobe/README.md +0 -203
- msprobe/core/advisor/advisor.py +0 -129
- msprobe/core/advisor/advisor_const.py +0 -58
- msprobe/core/advisor/advisor_result.py +0 -58
- msprobe/core/compare/find_first/data_processor.py +0 -35
- msprobe/core/compare/highlight.py +0 -390
- msprobe/core/data_dump/data_collector.py +0 -356
- msprobe/core/grad_probe/constant.py +0 -90
- msprobe/core/grad_probe/grad_compare.py +0 -187
- msprobe/core/grad_probe/utils.py +0 -105
- msprobe/core/kernel_dump/kernel_config.py +0 -33
- msprobe/docs/01.installation.md +0 -250
- msprobe/docs/02.config_introduction.md +0 -221
- msprobe/docs/03.config_examples.md +0 -281
- msprobe/docs/04.kernel_dump_PyTorch.md +0 -73
- msprobe/docs/05.data_dump_PyTorch.md +0 -518
- msprobe/docs/06.data_dump_MindSpore.md +0 -618
- msprobe/docs/07.accuracy_checker_PyTorch.md +0 -310
- msprobe/docs/09.accuracy_checker_MindSpore.md +0 -120
- msprobe/docs/10.accuracy_compare_PyTorch.md +0 -637
- msprobe/docs/11.accuracy_compare_MindSpore.md +0 -769
- msprobe/docs/12.overflow_check_PyTorch.md +0 -82
- msprobe/docs/13.overflow_check_MindSpore.md +0 -33
- msprobe/docs/14.data_parse_PyTorch.md +0 -282
- msprobe/docs/15.free_benchmarking_PyTorch.md +0 -169
- msprobe/docs/16.free_benchmarking_MindSpore.md +0 -159
- msprobe/docs/17.grad_probe.md +0 -205
- msprobe/docs/18.online_dispatch.md +0 -89
- msprobe/docs/19.monitor.md +0 -753
- msprobe/docs/20.monitor_performance_baseline.md +0 -52
- msprobe/docs/21.visualization_PyTorch.md +0 -519
- msprobe/docs/22.visualization_MindSpore.md +0 -515
- msprobe/docs/23.generate_operator_PyTorch.md +0 -107
- msprobe/docs/24.code_mapping_Mindspore.md +0 -29
- msprobe/docs/25.tool_function_introduction.md +0 -29
- msprobe/docs/26.data_dump_PyTorch_baseline.md +0 -48
- msprobe/docs/27.dump_json_instruction.md +0 -795
- msprobe/docs/28.debugger_save_instruction.md +0 -288
- msprobe/docs/28.kernel_dump_MindSpore.md +0 -69
- msprobe/docs/29.data_dump_MSAdapter.md +0 -235
- msprobe/docs/30.overflow_check_MSAdapter.md +0 -31
- msprobe/docs/31.config_check.md +0 -107
- msprobe/docs/32.ckpt_compare.md +0 -69
- msprobe/docs/33.generate_operator_MindSpore.md +0 -181
- msprobe/docs/34.RL_collect.md +0 -101
- msprobe/docs/35.nan_analyze.md +0 -73
- msprobe/docs/36.calculation_result_change.md +0 -75
- msprobe/docs/FAQ.md +0 -232
- msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +0 -146
- msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +0 -14
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +0 -33
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +0 -217
- msprobe/docs/img/BLOOM-7B_1.png +0 -0
- msprobe/docs/img/BLOOM-7B_2.png +0 -0
- msprobe/docs/img/BLOOM-7B_3.png +0 -0
- msprobe/docs/img/BLOOM-7B_4.png +0 -0
- msprobe/docs/img/GPT-3_1.png +0 -0
- msprobe/docs/img/GPT-3_2.png +0 -0
- msprobe/docs/img/GPT-3_3.png +0 -0
- msprobe/docs/img/GPT-3_4.png +0 -0
- msprobe/docs/img/GPT-3_5.png +0 -0
- msprobe/docs/img/GPT-3_6.png +0 -0
- msprobe/docs/img/GPT-3_7.png +0 -0
- msprobe/docs/img/GPT-3_8.png +0 -0
- msprobe/docs/img/YOLOV5S_1.png +0 -0
- msprobe/docs/img/YOLOV5S_2.png +0 -0
- msprobe/docs/img/accuracy_checking_details.png +0 -0
- msprobe/docs/img/accuracy_checking_result.png +0 -0
- msprobe/docs/img/api_precision_compare_details.png +0 -0
- msprobe/docs/img/api_precision_compare_result.png +0 -0
- msprobe/docs/img/auto_analyze_log.png +0 -0
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/compare_result_pkl.png +0 -0
- msprobe/docs/img/compare_result_pkl_md5.png.png +0 -0
- msprobe/docs/img/cpu_info.png +0 -0
- msprobe/docs/img/free_benchmark.png +0 -0
- msprobe/docs/img/free_benchmark_framework.png +0 -0
- msprobe/docs/img/grad_probe_image-1.png +0 -0
- msprobe/docs/img/grad_probe_image-2.png +0 -0
- msprobe/docs/img/grad_probe_image-3.png +0 -0
- msprobe/docs/img/grad_probe_image-4.png +0 -0
- msprobe/docs/img/grad_probe_image.png +0 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/module_compare.png +0 -0
- msprobe/docs/img/monitor/cpu_info.png +0 -0
- msprobe/docs/img/monitor/step_count_per_record.png +0 -0
- msprobe/docs/img/ms_dump.png +0 -0
- msprobe/docs/img/ms_layer.png +0 -0
- msprobe/docs/img/pt_dump.png +0 -0
- msprobe/docs/img/save_compare_result_sample.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
- msprobe/docs/img/visualization/proxy.png +0 -0
- msprobe/docs/img/visualization/tensorboard_1.png +0 -0
- msprobe/docs/img/visualization/tensorboard_2.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_browser_2.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/docs/visualization/GPTModel.png +0 -0
- msprobe/docs/visualization/ParallelMLP.png +0 -0
- msprobe/docs/visualization/layer_mapping_example.md +0 -132
- msprobe/docs/visualization/mapping.png +0 -0
- msprobe/docs/visualization/mapping1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +0 -59
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +0 -80
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +0 -330
- msprobe/docs/visualization/module_name.png +0 -0
- msprobe/docs/visualization/module_name1.png +0 -0
- msprobe/docs/visualization/no_mapping.png +0 -0
- msprobe/docs/visualization/no_mapping1.png +0 -0
- msprobe/docs/visualization/no_mapping_analyze.png +0 -0
- msprobe/docs/visualization/top_layer.png +0 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +0 -460
- msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +0 -2081
- msprobe/mindspore/code_mapping/bind.py +0 -283
- msprobe/mindspore/code_mapping/cmd_parser.py +0 -40
- msprobe/mindspore/code_mapping/graph.py +0 -49
- msprobe/mindspore/code_mapping/graph_parser.py +0 -211
- msprobe/mindspore/code_mapping/main.py +0 -24
- msprobe/mindspore/code_mapping/processor.py +0 -34
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +0 -111
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -52
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +0 -257
- msprobe/mindspore/free_benchmark/common/config.py +0 -27
- msprobe/mindspore/free_benchmark/common/handler_params.py +0 -31
- msprobe/mindspore/free_benchmark/common/utils.py +0 -100
- msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -638
- msprobe/mindspore/free_benchmark/handler/base_handler.py +0 -105
- msprobe/mindspore/free_benchmark/handler/check_handler.py +0 -55
- msprobe/mindspore/free_benchmark/handler/fix_handler.py +0 -51
- msprobe/mindspore/free_benchmark/handler/handler_factory.py +0 -36
- msprobe/mindspore/free_benchmark/perturbation/add_noise.py +0 -82
- msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +0 -45
- msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +0 -78
- msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +0 -77
- msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +0 -56
- msprobe/mindspore/free_benchmark/perturbation/no_change.py +0 -27
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +0 -46
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +0 -51
- msprobe/mindspore/grad_probe/global_context.py +0 -127
- msprobe/mindspore/grad_probe/grad_analyzer.py +0 -260
- msprobe/mindspore/grad_probe/grad_monitor.py +0 -42
- msprobe/mindspore/grad_probe/grad_stat_csv.py +0 -161
- msprobe/mindspore/grad_probe/hook.py +0 -115
- msprobe/mindspore/grad_probe/utils.py +0 -43
- msprobe/mindspore/mindtorch/__init__.py +0 -18
- msprobe/mindspore/ms_config.py +0 -153
- msprobe/mindspore/task_handler_factory.py +0 -44
- msprobe/nan_analyze/__init__.py +0 -14
- msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +0 -9
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +0 -480
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +0 -567
- msprobe/pytorch/debugger/precision_debugger.py +0 -181
- msprobe/pytorch/free_benchmark/__init__.py +0 -23
- msprobe/pytorch/free_benchmark/common/constant.py +0 -85
- msprobe/pytorch/free_benchmark/common/counter.py +0 -87
- msprobe/pytorch/free_benchmark/common/enums.py +0 -80
- msprobe/pytorch/free_benchmark/common/params.py +0 -152
- msprobe/pytorch/free_benchmark/common/utils.py +0 -143
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -215
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +0 -121
- msprobe/pytorch/free_benchmark/main.py +0 -123
- msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +0 -28
- msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +0 -56
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +0 -107
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +0 -121
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +0 -89
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +0 -87
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +0 -43
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +0 -60
- msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +0 -34
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +0 -252
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +0 -54
- msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +0 -40
- msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -45
- msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -181
- msprobe/pytorch/grad_probe/__init__.py +0 -0
- msprobe/pytorch/grad_probe/grad_monitor.py +0 -108
- msprobe/pytorch/grad_probe/grad_stat_csv.py +0 -160
- msprobe/pytorch/hook_module/__init__.py +0 -16
- msprobe/pytorch/hook_module/wrap_aten.py +0 -111
- msprobe/pytorch/online_dispatch/__init__.py +0 -19
- msprobe/pytorch/online_dispatch/compare.py +0 -224
- msprobe/pytorch/online_dispatch/dispatch.py +0 -332
- msprobe/pytorch/online_dispatch/dump_compare.py +0 -179
- msprobe/pytorch/online_dispatch/single_compare.py +0 -412
- msprobe/pytorch/online_dispatch/torch_ops_config.yaml +0 -58
- msprobe/pytorch/online_dispatch/utils.py +0 -158
- msprobe/pytorch/parse_tool/__init__.py +0 -0
- msprobe/pytorch/parse_tool/cli.py +0 -31
- msprobe/pytorch/parse_tool/lib/__init__.py +0 -0
- msprobe/pytorch/parse_tool/lib/compare.py +0 -253
- msprobe/pytorch/parse_tool/lib/config.py +0 -50
- msprobe/pytorch/parse_tool/lib/file_desc.py +0 -45
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +0 -97
- msprobe/pytorch/parse_tool/lib/parse_exception.py +0 -54
- msprobe/pytorch/parse_tool/lib/parse_tool.py +0 -161
- msprobe/pytorch/parse_tool/lib/utils.py +0 -299
- msprobe/pytorch/parse_tool/lib/visualization.py +0 -85
- msprobe/pytorch/pt_config.py +0 -299
- /msprobe/core/{grad_probe → dump}/__init__.py +0 -0
- /msprobe/{mindspore/code_mapping → core/dump/api_dump}/__init__.py +0 -0
- /msprobe/{mindspore/debugger → core/dump/data_dump}/__init__.py +0 -0
- /msprobe/{mindspore/exception_dump → core/dump/data_dump/data_processor}/__init__.py +0 -0
- /msprobe/{mindspore/free_benchmark → core/dump/debugger}/__init__.py +0 -0
- /msprobe/{mindspore/free_benchmark/common → core/dump/kernel_dump}/__init__.py +0 -0
- /msprobe/mindspore/{free_benchmark/handler → dump/debugger}/__init__.py +0 -0
- /msprobe/mindspore/{grad_probe → dump/dump_processor}/__init__.py +0 -0
- /msprobe/mindspore/{overflow_check → dump/exception_dump}/__init__.py +0 -0
- /msprobe/mindspore/{mindtorch → dump/mindtorch}/mindtorch_adaptor.py +0 -0
- /msprobe/{pytorch/api_accuracy_checker/run_ut → mindspore/dump/overflow_check}/__init__.py +0 -0
- /msprobe/{pytorch/debugger → mindspore/monitor}/__init__.py +0 -0
- /msprobe/{pytorch/free_benchmark/common → msaccucmp}/__init__.py +0 -0
- /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/.keep +0 -0
- /msprobe/pytorch/{free_benchmark/perturbed_layers → api_accuracy_checker/acc_check}/__init__.py +0 -0
- /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/torch_ut_setting.json +0 -0
- /msprobe/pytorch/{free_benchmark/perturbed_layers/npu → dump/api_dump}/__init__.py +0 -0
- /msprobe/pytorch/{hook_module → dump/api_dump}/support_wrap_ops.yaml +0 -0
- /msprobe/pytorch/{free_benchmark/result_handlers → dump/debugger}/__init__.py +0 -0
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
4
|
+
#
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
8
|
+
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
10
|
+
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
from abc import ABC, abstractmethod
|
|
19
|
+
from msprobe.core.compare.indicator_analysis.api_data import ApiData
|
|
20
|
+
from msprobe.core.compare.indicator_analysis.utils import is_inf_or_nan, str2float, ResultLevel, IgnoreInfo, \
|
|
21
|
+
get_data_list_by_ignore_info
|
|
22
|
+
from msprobe.core.common.const import CompareConst
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class BaseAlgorithm(ABC):
    """Abstract base class of the comparison checkers."""

    @abstractmethod
    def run(self, api_data: ApiData, ignore_info: IgnoreInfo):
        """
        Entry point every checker has to implement.

        :param api_data: structured API data
        :param ignore_info: indicator information that must be ignored for the current API data
        """
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class InfNanErrChecker(BaseAlgorithm):
    """
    Applies to real-data mode and statistics mode.
    A row is marked as error when the NPU maximum or minimum value is nan/inf/-inf,
    unless the bench side shows the same phenomenon, in which case the row is skipped.
    """

    def __init__(self):
        level = ResultLevel.ERROR
        self.result_level = level
        self.err_msg = f'{level.value}: There is nan/inf/-inf in the maximum or minimum value of NPU.'

    def run(self, api_data: ApiData, ignore_info: IgnoreInfo):
        """
        Flag every selected row whose NPU extremes are inf/nan while the bench extremes are not.

        :param api_data: structured API data
        :param ignore_info: indicator information that must be ignored for the current API data
        """
        rows = get_data_list_by_ignore_info(api_data, ignore_info)

        # A falsy result means there is nothing to check.
        for row in rows or ():
            bench_extremes = (
                api_data.get_data_by_header(CompareConst.BENCH_MAX, row),
                api_data.get_data_by_header(CompareConst.BENCH_MIN, row),
            )
            # The bench side is abnormal as well: not treated as an NPU problem.
            if any(map(is_inf_or_nan, bench_extremes)):
                continue

            npu_extremes = (
                api_data.get_data_by_header(CompareConst.NPU_MAX, row),
                api_data.get_data_by_header(CompareConst.NPU_MIN, row),
            )
            if any(map(is_inf_or_nan, npu_extremes)):
                api_data.set_result(row, self.result_level)
                api_data.set_err_msg(row, self.err_msg)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class RelativeErrChecker(BaseAlgorithm):
    """
    Applies to statistics mode.
    The indicator is computed from the inputs and the outputs together.
    An output row is marked as error when the relative error of the inputs is < 0.1 and the
    relative error of that output is > 0.5; the norm relative error is the observed metric.
    """

    def __init__(self):
        self.in_threshold = 0.1
        self.out_threshold = 0.5
        self.result_level = ResultLevel.ERROR
        self.err_msg = (f'{self.result_level.value}: The {CompareConst.NORM_RELATIVE_ERR} of output '
                        f'is greater than {self.out_threshold}.')

    def run(self, api_data: ApiData, ignore_info: IgnoreInfo):
        """
        Flag output rows whose norm relative error exceeds the output threshold while the inputs are accurate.

        :param api_data: structured API data
        :param ignore_info: indicator information that must be ignored for the current API data
        """
        if ignore_info in (IgnoreInfo.ALL_IGNORE, IgnoreInfo.INPUT_IGNORE):
            return

        # NOTE(review): get_min_or_max_value(is_min=False) starts from 0.0, so if this metric can be
        # negative the abs() here is applied after negative values were already discarded - confirm.
        worst_input_err = abs(api_data.get_min_or_max_value(CompareConst.NORM_RELATIVE_ERR, is_min=False))
        if not worst_input_err < self.in_threshold:
            return

        for row in api_data.output_data:
            output_err = str2float(api_data.get_data_by_header(CompareConst.NORM_RELATIVE_ERR, row))
            if abs(output_err) > self.out_threshold:
                api_data.set_result(row, self.result_level)
                api_data.set_err_msg(row, self.err_msg)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class OneThousandthErrChecker(BaseAlgorithm):
    """
    Applies to real-data mode.
    The indicator is computed from the inputs and the outputs together.
    An API or module is marked as error when the One Thousandth Err Ratio of its
    input/parameters is > 0.9 while that of its output is < 0.6.
    """

    def __init__(self):
        self.input_threshold = 0.9
        # The documented rule and the emitted message both use 0.6 as the output bound;
        # the previous value of 0.1 contradicted them.
        self.output_threshold = 0.6
        self.result_level = ResultLevel.ERROR
        # Built from the thresholds so the message can never drift from the values actually applied.
        self.err_msg = (f'{self.result_level.value}: The input/parameters of '
                        f'One Thousandth Err Ratio > {self.input_threshold} '
                        f'while the output < {self.output_threshold}.')

    def run(self, api_data: ApiData, ignore_info: IgnoreInfo):
        """
        Flag the first output row when the inputs are accurate but the outputs are not.

        :param api_data: structured API data
        :param ignore_info: indicator information that must be ignored for the current API data
        """
        if ignore_info in [IgnoreInfo.ALL_IGNORE, IgnoreInfo.INPUT_IGNORE]:
            return
        # The verdict is written onto output_data[0], so at least one output row is required.
        if not api_data.output_data:
            return

        # Smallest ratio over all input rows and over all output rows respectively.
        min_input_ratio = api_data.get_min_or_max_value(CompareConst.ONE_THOUSANDTH_ERR_RATIO)
        min_output_ratio = api_data.get_min_or_max_value(CompareConst.ONE_THOUSANDTH_ERR_RATIO, is_input=False)

        if min_input_ratio > self.input_threshold and min_output_ratio < self.output_threshold:
            api_data.set_result(api_data.output_data[0], self.result_level)
            api_data.set_err_msg(api_data.output_data[0], self.err_msg)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class RequiresGradErrChecker(BaseAlgorithm):
    """
    Applies to real-data mode and statistics mode.
    A row is marked as error when the requires_grad values of NPU and bench are inconsistent.
    """

    def __init__(self):
        self.result_level = ResultLevel.ERROR
        self.err_msg = f'{self.result_level.value}: The Required_Grad of NPU and Bench are inconsistent'

    def run(self, api_data: ApiData, ignore_info: IgnoreInfo):
        """
        Flag every selected, matched row whose NPU and bench requires_grad values differ.

        :param api_data: structured API data
        :param ignore_info: indicator information that must be ignored for the current API data
        """
        data_lists = get_data_list_by_ignore_info(api_data, ignore_info)
        if not data_lists:
            return

        # Values that mean "no requires_grad information recorded". A plain truthiness test must
        # not be used here: a boolean False is a legitimate value, and skipping it would hide
        # every True/False mismatch, which is exactly what this checker has to report.
        missing = (None, '')

        for data_list in data_lists:
            # Rows without a matched bench entry cannot be compared.
            bench_name = api_data.get_data_by_header(CompareConst.BENCH_NAME, data_list)
            if not bench_name or bench_name == CompareConst.N_A:
                continue

            npu_req_grad = api_data.get_data_by_header(CompareConst.NPU_REQ_GRAD, data_list)
            bench_req_grad = api_data.get_data_by_header(CompareConst.BENCH_REQ_GRAD, data_list)

            if npu_req_grad in missing or bench_req_grad in missing:
                continue

            if npu_req_grad != bench_req_grad:
                api_data.set_result(data_list, self.result_level)
                api_data.set_err_msg(data_list, self.err_msg)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class ParametersErrChecker(BaseAlgorithm):
    """
    Applies to real-data mode and statistics mode.
    An input row is marked as error when a non-tensor scalar parameter differs between NPU and bench.
    """

    def __init__(self):
        level = ResultLevel.ERROR
        self.result_level = level
        self.err_msg = f'{level.value}: The scalar parameters of NPU and Bench are inconsistent.'

    def run(self, api_data: ApiData, ignore_info: IgnoreInfo):
        """
        Flag matched scalar input rows whose NPU and bench values differ.

        :param api_data: structured API data
        :param ignore_info: indicator information that must be ignored for the current API data
        """
        if ignore_info in (IgnoreInfo.ALL_IGNORE, IgnoreInfo.INPUT_IGNORE):
            return

        fetch = api_data.get_data_by_header
        for row in api_data.input_data:
            # Rows without a matched bench entry cannot be compared.
            matched_name = fetch(CompareConst.BENCH_NAME, row)
            if not matched_name or matched_name == CompareConst.N_A:
                continue

            npu_dtype = fetch(CompareConst.NPU_DTYPE, row)
            bench_dtype = fetch(CompareConst.BENCH_DTYPE, row)
            if not (npu_dtype and bench_dtype):
                continue
            # The dtype of a non-tensor scalar always contains 'class', e.g. <class int> for an int.
            if not ('class' in npu_dtype and 'class' in bench_dtype):
                continue

            # A shape of [] is what identifies the value as a scalar.
            if str(fetch(CompareConst.NPU_SHAPE, row)) != '[]':
                continue

            # The max columns are what gets compared for such scalar rows.
            npu_value = fetch(CompareConst.NPU_MAX, row)
            bench_value = fetch(CompareConst.BENCH_MAX, row)
            if npu_value != bench_value:
                api_data.set_result(row, self.result_level)
                api_data.set_err_msg(row, self.err_msg)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class CRC32ErrChecker(BaseAlgorithm):
    """
    Applies to MD5 mode.
    A row is marked as error when the CRC-32 values of NPU and bench differ,
    and as warning when the NPU parameter was not matched with a bench parameter.
    """

    def __init__(self):
        self.err_level = ResultLevel.ERROR
        self.err_msg = f'{self.err_level.value}: The CRC-32 value of NPU differs from that of the bench.'
        self.warn_level = ResultLevel.WARNING
        self.warn_msg = f'{self.warn_level.value}: The parameter of NPU does not match the bench.'

    def run(self, api_data: ApiData, ignore_info: IgnoreInfo):
        """
        Compare the NPU and bench checksum columns of every selected row.

        :param api_data: structured API data
        :param ignore_info: indicator information that must be ignored for the current API data
        """
        rows = get_data_list_by_ignore_info(api_data, ignore_info)
        if not rows:
            return

        placeholders = (CompareConst.N_A, CompareConst.NAN)
        for row in rows:
            npu_md5 = api_data.get_data_by_header(CompareConst.NPU_MD5, row)
            bench_md5 = api_data.get_data_by_header(CompareConst.BENCH_MD5, row)
            if npu_md5 != bench_md5:
                # A placeholder on either side means the parameter was not matched: warn only.
                unmatched = npu_md5 in placeholders or bench_md5 in placeholders
                if unmatched:
                    level, message = self.warn_level, self.warn_msg
                else:
                    level, message = self.err_level, self.err_msg
                api_data.set_result(row, level)
                api_data.set_err_msg(row, message)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
class DTypeErrChecker(BaseAlgorithm):
    """
    Applies to real-data mode and statistics mode.
    A row is marked as error when the dtype of NPU and bench are inconsistent.
    """

    def __init__(self):
        level = ResultLevel.ERROR
        self.result_level = level
        self.err_msg = f'{level.value}: The dtype of NPU and Bench are inconsistent.'

    def run(self, api_data: ApiData, ignore_info: IgnoreInfo):
        """
        Flag every selected, matched row whose NPU and bench dtype differ.

        :param api_data: structured API data
        :param ignore_info: indicator information that must be ignored for the current API data
        """
        rows = get_data_list_by_ignore_info(api_data, ignore_info)

        # A falsy result means there is nothing to check.
        for row in rows or ():
            # Rows without a matched bench entry cannot be compared.
            matched_name = api_data.get_data_by_header(CompareConst.BENCH_NAME, row)
            if not matched_name or matched_name == CompareConst.N_A:
                continue

            npu_dtype = api_data.get_data_by_header(CompareConst.NPU_DTYPE, row)
            bench_dtype = api_data.get_data_by_header(CompareConst.BENCH_DTYPE, row)

            # Both dtypes must be present before they can be compared.
            if not (npu_dtype and bench_dtype):
                continue

            if npu_dtype != bench_dtype:
                api_data.set_result(row, self.result_level)
                api_data.set_err_msg(row, self.err_msg)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
class ShapeErrChecker(BaseAlgorithm):
    """
    Applies to real-data mode and statistics mode.
    A row is marked as error when the shape of NPU and bench are inconsistent.
    """

    def __init__(self):
        level = ResultLevel.ERROR
        self.result_level = level
        self.err_msg = f'{level.value}: The shape of NPU and Bench are inconsistent.'

    def run(self, api_data: ApiData, ignore_info: IgnoreInfo):
        """
        Flag every selected, matched row whose NPU and bench shape differ.

        :param api_data: structured API data
        :param ignore_info: indicator information that must be ignored for the current API data
        """
        rows = get_data_list_by_ignore_info(api_data, ignore_info)

        # A falsy result means there is nothing to check.
        for row in rows or ():
            # Rows without a matched bench entry cannot be compared.
            matched_name = api_data.get_data_by_header(CompareConst.BENCH_NAME, row)
            if not matched_name or matched_name == CompareConst.N_A:
                continue

            npu_shape = api_data.get_data_by_header(CompareConst.NPU_SHAPE, row)
            bench_shape = api_data.get_data_by_header(CompareConst.BENCH_SHAPE, row)

            # Only None counts as missing; falsy values such as an empty shape are still compared.
            if None in (npu_shape, bench_shape):
                continue

            if npu_shape != bench_shape:
                api_data.set_result(row, self.result_level)
                api_data.set_err_msg(row, self.err_msg)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
class RelativeWarnChecker(BaseAlgorithm):
    """
    Applies to statistics mode.
    The indicator is computed from the inputs and the outputs together.
    An API or module is marked as warning when the relative error of its output is 10 times
    that of its input; the norm relative error is the observed metric.
    """

    def __init__(self):
        self.threshold = 10
        self.result_level = ResultLevel.WARNING
        self.err_msg = (f'{self.result_level.value}: The norm relative error of output '
                        f'is {self.threshold} times that of input.')

    def run(self, api_data: ApiData, ignore_info: IgnoreInfo):
        """
        Flag the first output row when the output error is disproportionate to the input error.

        :param api_data: structured API data
        :param ignore_info: indicator information that must be ignored for the current API data
        """
        if ignore_info in (IgnoreInfo.ALL_IGNORE, IgnoreInfo.INPUT_IGNORE):
            return

        # The verdict is written onto output_data[0], so at least one output row is required.
        outputs = api_data.output_data
        if not outputs:
            return

        worst_in = abs(api_data.get_min_or_max_value(CompareConst.NORM_RELATIVE_ERR, is_min=False))
        worst_out = abs(
            api_data.get_min_or_max_value(CompareConst.NORM_RELATIVE_ERR, is_input=False, is_min=False))

        if worst_in == 0:
            # No ratio can be formed against a zero input error: use an absolute bound of 0.1 instead.
            flagged = worst_out > 0.1
        else:
            flagged = worst_out / worst_in > self.threshold

        if not flagged:
            return

        api_data.set_result(outputs[0], self.result_level)
        api_data.set_err_msg(outputs[0], self.err_msg)
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
class CosineWarnChecker(BaseAlgorithm):
    """
    Applies to real-data mode.
    The indicator is computed from the inputs and the outputs together.
    An API or module is marked as warning when the Cosine of its input/parameters is > 0.9
    and input/parameters - output > 0.1.
    """

    def __init__(self):
        self.input_threshold = 0.9
        self.output_threshold = 0.1
        self.result_level = ResultLevel.WARNING
        self.err_msg = (f'{self.result_level.value}: The input/parameters of Cosine > {self.input_threshold}, '
                        f'and input/parameters - output > {self.output_threshold}')

    def run(self, api_data: ApiData, ignore_info: IgnoreInfo):
        """
        Flag the first output row when the cosine drops noticeably from inputs to outputs.

        :param api_data: structured API data
        :param ignore_info: indicator information that must be ignored for the current API data
        """
        if ignore_info in (IgnoreInfo.ALL_IGNORE, IgnoreInfo.INPUT_IGNORE):
            return

        # The verdict is written onto output_data[0], so at least one output row is required.
        outputs = api_data.output_data
        if not outputs:
            return

        # Smallest cosine over all input rows and over all output rows respectively.
        lowest_in = api_data.get_min_or_max_value(CompareConst.COSINE)
        lowest_out = api_data.get_min_or_max_value(CompareConst.COSINE, is_input=False)

        if not (lowest_in > self.input_threshold and lowest_in - lowest_out > self.output_threshold):
            return

        api_data.set_result(outputs[0], self.result_level)
        api_data.set_err_msg(outputs[0], self.err_msg)
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
# Checker classes grouped per comparison mode; the order of each list is preserved from the original.

# Real-data (tensor) mode.
TENSOR_CHECKERS = [
    InfNanErrChecker,
    OneThousandthErrChecker,
    RequiresGradErrChecker,
    ParametersErrChecker,
    DTypeErrChecker,
    ShapeErrChecker,
    CosineWarnChecker,
]

# Statistics mode.
STATISTICS_CHECKERS = [
    InfNanErrChecker,
    RelativeErrChecker,
    RequiresGradErrChecker,
    ParametersErrChecker,
    DTypeErrChecker,
    ShapeErrChecker,
    RelativeWarnChecker,
]

# MD5 mode.
MD5_CHECKERS = [
    CRC32ErrChecker,
]

# Same as STATISTICS_CHECKERS except that ShapeErrChecker is left out.
STATISTICS_CHECKERS_PARALLEL_MERGE = [
    InfNanErrChecker,
    RelativeErrChecker,
    RequiresGradErrChecker,
    ParametersErrChecker,
    DTypeErrChecker,
    RelativeWarnChecker,
]
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
4
|
+
#
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
8
|
+
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
10
|
+
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
from typing import List
|
|
18
|
+
|
|
19
|
+
from msprobe.core.compare.indicator_analysis.utils import CompareMode, ResultLevel, str2float
|
|
20
|
+
from msprobe.core.common.const import Const, CompareConst
|
|
21
|
+
from msprobe.core.common.log import logger
|
|
22
|
+
from msprobe.core.common.exceptions import MsprobeException
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ApiData:
|
|
26
|
+
header_index_mapping_cache = {}
|
|
27
|
+
|
|
28
|
+
def __init__(self, mode, data_lists: List[List]):
|
|
29
|
+
self.input_data = []
|
|
30
|
+
self.output_data = []
|
|
31
|
+
self.mode = mode
|
|
32
|
+
self.data_lists = data_lists
|
|
33
|
+
self.header = self._get_header()
|
|
34
|
+
# 表头与索引映射,可以基于表头拿到索引,从input_data和output_data中获取对应数据
|
|
35
|
+
self.header_index_mapping = self.get_header_index_mapping()
|
|
36
|
+
self._init_data()
|
|
37
|
+
|
|
38
|
+
def get_header_index_mapping(self):
|
|
39
|
+
if self.mode in ApiData.header_index_mapping_cache:
|
|
40
|
+
return ApiData.header_index_mapping_cache[self.mode]
|
|
41
|
+
|
|
42
|
+
mapping = {item: index for index, item in enumerate(self.header)}
|
|
43
|
+
ApiData.header_index_mapping_cache[self.mode] = mapping
|
|
44
|
+
return mapping
|
|
45
|
+
|
|
46
|
+
def get_data_by_header(self, header: str, data_list: List):
|
|
47
|
+
"""
|
|
48
|
+
基于表头从data list获取数据
|
|
49
|
+
"""
|
|
50
|
+
index = self.header_index_mapping.get(header)
|
|
51
|
+
try:
|
|
52
|
+
data = data_list[index]
|
|
53
|
+
except Exception as e:
|
|
54
|
+
logger.error(f'Unable to get data from the data list based on the header: {e}')
|
|
55
|
+
raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR) from e
|
|
56
|
+
return data
|
|
57
|
+
|
|
58
|
+
def set_data_by_header(self, header: str, data_list: List, new_data):
|
|
59
|
+
index = self.header_index_mapping.get(header)
|
|
60
|
+
try:
|
|
61
|
+
data_list[index] = new_data
|
|
62
|
+
except Exception as e:
|
|
63
|
+
logger.error(f'Unable to set data from the data list based on the header: {e}')
|
|
64
|
+
raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR) from e
|
|
65
|
+
|
|
66
|
+
def set_result(self, data_list: List, status: ResultLevel = ResultLevel.PASS):
|
|
67
|
+
index = self.header_index_mapping.get(CompareConst.RESULT)
|
|
68
|
+
try:
|
|
69
|
+
current_status = data_list[index]
|
|
70
|
+
if not isinstance(current_status, ResultLevel) or status > current_status:
|
|
71
|
+
data_list[index] = status
|
|
72
|
+
except Exception as e:
|
|
73
|
+
logger.error(f'Unable to set status from the data list based on the header: {e}')
|
|
74
|
+
raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR) from e
|
|
75
|
+
|
|
76
|
+
def set_err_msg(self, data_list: List, msg: str = '', init_msg=False):
|
|
77
|
+
index = self.header_index_mapping.get(CompareConst.ERROR_MESSAGE)
|
|
78
|
+
try:
|
|
79
|
+
if init_msg:
|
|
80
|
+
data_list[index] = []
|
|
81
|
+
else:
|
|
82
|
+
current_msg = data_list[index]
|
|
83
|
+
if not isinstance(current_msg, list):
|
|
84
|
+
current_msg = [current_msg] if current_msg else []
|
|
85
|
+
data_list[index] = current_msg
|
|
86
|
+
if msg:
|
|
87
|
+
current_msg.append(msg)
|
|
88
|
+
except Exception as e:
|
|
89
|
+
logger.error(f'Unable to set err msg from the data list based on the header: {e}')
|
|
90
|
+
raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR) from e
|
|
91
|
+
|
|
92
|
+
def get_min_or_max_value(self, header, is_input=True, is_min=True):
|
|
93
|
+
"""
|
|
94
|
+
获取多个输入或输出参数中的最小或最大指标
|
|
95
|
+
"""
|
|
96
|
+
default_value = 1.0 if is_min else 0.0
|
|
97
|
+
data_lists = self.input_data if is_input else self.output_data
|
|
98
|
+
for data_list in data_lists:
|
|
99
|
+
value = self.get_data_by_header(header, data_list)
|
|
100
|
+
|
|
101
|
+
if value is None or value in [CompareConst.NAN, CompareConst.N_A]:
|
|
102
|
+
continue
|
|
103
|
+
|
|
104
|
+
if isinstance(value, str) and value.endswith('%'):
|
|
105
|
+
value = str2float(value)
|
|
106
|
+
|
|
107
|
+
if isinstance(value, str):
|
|
108
|
+
continue
|
|
109
|
+
|
|
110
|
+
default_value = min(default_value, value) if is_min else max(default_value, value)
|
|
111
|
+
return default_value
|
|
112
|
+
|
|
113
|
+
def _get_header(self):
|
|
114
|
+
if self.mode == CompareMode.STATISTICS.value:
|
|
115
|
+
return CompareConst.SUMMARY_COMPARE_RESULT_HEADER
|
|
116
|
+
elif self.mode == CompareMode.TENSOR.value:
|
|
117
|
+
return CompareConst.COMPARE_RESULT_HEADER
|
|
118
|
+
elif self.mode == CompareMode.MD5.value:
|
|
119
|
+
return CompareConst.MD5_COMPARE_RESULT_HEADER
|
|
120
|
+
else:
|
|
121
|
+
logger.error(f'The parameter "mode" error, '
|
|
122
|
+
f'expected {CompareMode.STATISTICS.value}/{CompareMode.TENSOR.value}/{CompareMode.MD5.value}, '
|
|
123
|
+
f'actually {self.mode}.')
|
|
124
|
+
raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR)
|
|
125
|
+
|
|
126
|
+
def _init_data(self):
    """
    Prepare every raw row in ``self.data_lists`` and sort it into input or output rows.

    For each non-empty row this method:
      1. pads the row with empty strings up to ``len(self.header)``,
      2. calls ``set_result`` and ``set_err_msg(..., init_msg=True)`` to initialise the
         result and error-message cells (result to pass, error message to a list, per
         the original author's note),
      3. appends the row to ``self.output_data`` when its first cell contains the
         ``<SEP><OUTPUT><SEP>`` marker, otherwise to ``self.input_data``.

    Empty rows are skipped.
    """
    output_marker = f'{Const.SEP}{Const.OUTPUT}{Const.SEP}'
    header_len = len(self.header)

    for data_list in self.data_lists:
        if not data_list:
            continue

        # Pad IN PLACE. Building a new list here (``data_list + [...]``) would detach the
        # row stored in input_data/output_data from the caller's row, so results written
        # into it later would never reach callers that read their own rows back.
        missing = header_len - len(data_list)
        if missing > 0:
            data_list.extend([''] * missing)

        self.set_result(data_list)
        self.set_err_msg(data_list, init_msg=True)

        if output_marker in data_list[0]:
            self.output_data.append(data_list)
        else:
            self.input_data.append(data_list)
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
4
|
+
#
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
8
|
+
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
10
|
+
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
import json
|
|
19
|
+
from typing import List
|
|
20
|
+
|
|
21
|
+
from msprobe.core.common.const import Const, CompareConst
|
|
22
|
+
from msprobe.core.common.log import logger
|
|
23
|
+
from msprobe.core.compare.indicator_analysis.utils import CompareMode, ResultLevel, divide_result_df, IgnoreInfo
|
|
24
|
+
from msprobe.core.compare.indicator_analysis.algorithm import BaseAlgorithm, TENSOR_CHECKERS, STATISTICS_CHECKERS, \
|
|
25
|
+
MD5_CHECKERS, STATISTICS_CHECKERS_PARALLEL_MERGE
|
|
26
|
+
from msprobe.core.compare.indicator_analysis.api_data import ApiData
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ApiIndicatorCalculator:
    """
    Run the checker algorithms registered for a compare mode over one API/module and
    derive its overall accuracy indicator.
    """

    # Matches a trailing "_rank<digits>" suffix; removed from the API-name segment when
    # parallel_merge is enabled (see get_api_ignore_info).
    RANK_SUFFIX_PATTERN = re.compile(r'_rank\d+$')

    def __init__(self, mode, parallel_merge=False):
        """
        Args:
            mode: Compare mode value; selects which checker set is registered.
            parallel_merge: Whether the comparison merges graphs produced with different
                parallel split strategies. Default False.
        """
        self.mode = mode
        self.parallel_merge = parallel_merge
        # API names reported as IgnoreInfo.ALL_IGNORE.
        self.all_ignore_set = {'empty', 'empty_like', 'numpy', 'to', '__setitem__', 'empty_with_format',
                               'new_empty_strided', 'new_empty', 'empty_strided'}
        # API names reported as IgnoreInfo.INPUT_IGNORE.
        self.input_ignore_set = {'_reduce_scatter_base', '_all_gather_base', 'all_to_all_single', 'batch_isend_irecv'}
        self.algorithms: List[BaseAlgorithm] = []
        self._add_algorithm()

    @staticmethod
    def get_api_indicator_and_msg(api_data: ApiData):
        """
        Derive the indicator of one API/module from all of its parameter rows.

        The indicator is the worst level found among all rows (as ordered by
        ``ResultLevel`` comparison). As a side effect each row is finalised in place:
        a ``ResultLevel`` result cell is replaced by its ``.value`` and a list-typed
        error-message cell is replaced by its JSON string.

        Returns:
            The ``.value`` of the worst ``ResultLevel`` (pass/warning/error);
            ``ResultLevel.PASS.value`` when no row carries a ``ResultLevel``.
        """
        all_data_lists = api_data.input_data + api_data.output_data
        final_indicator = ResultLevel.PASS
        for data_list in all_data_lists:
            indicator = api_data.get_data_by_header(CompareConst.RESULT, data_list)
            if isinstance(indicator, ResultLevel):
                api_data.set_data_by_header(CompareConst.RESULT, data_list, indicator.value)
                if indicator > final_indicator:
                    final_indicator = indicator

            err_msg = api_data.get_data_by_header(CompareConst.ERROR_MESSAGE, data_list)
            if isinstance(err_msg, list):
                api_data.set_data_by_header(CompareConst.ERROR_MESSAGE, data_list, json.dumps(err_msg))

        return final_indicator.value

    def get_api_ignore_info(self, api_data: ApiData):
        """
        Decide whether the checking rules should be ignored for this API.

        The API name is the second ``Const.SEP``-separated segment of the NPU name of
        the first input row; with ``parallel_merge`` a trailing ``_rank<N>`` is removed
        before the lookup.

        Returns:
            ``IgnoreInfo.ALL_IGNORE`` or ``IgnoreInfo.INPUT_IGNORE`` when the API name is
            in the corresponding set, otherwise ``IgnoreInfo.NO_IGNORE`` (also when there
            are no input rows or the name is missing / not a string / has < 2 segments).
        """
        if not api_data.input_data:
            return IgnoreInfo.NO_IGNORE

        npu_param_name = api_data.get_data_by_header(CompareConst.NPU_NAME, api_data.input_data[0])
        # The lookup may yield None (or another non-string placeholder); without a name
        # there is nothing to match, so fall back to the default instead of crashing.
        if not isinstance(npu_param_name, str):
            return IgnoreInfo.NO_IGNORE

        name_split = npu_param_name.split(Const.SEP)
        if len(name_split) < 2:
            return IgnoreInfo.NO_IGNORE

        api_name = name_split[1]
        if self.parallel_merge:
            api_name = self.RANK_SUFFIX_PATTERN.sub('', api_name)

        if api_name in self.all_ignore_set:
            return IgnoreInfo.ALL_IGNORE
        if api_name in self.input_ignore_set:
            return IgnoreInfo.INPUT_IGNORE
        return IgnoreInfo.NO_IGNORE

    def calculate(self, raw_data_list: List[List]):
        """
        Calculation entry point: wrap the rows in ``ApiData``, run every registered
        algorithm and return the overall indicator value of the API/module.
        """
        api_data = ApiData(self.mode, raw_data_list)
        ignore_info = self.get_api_ignore_info(api_data)
        self.execute_all(api_data, ignore_info)
        return self.get_api_indicator_and_msg(api_data)

    def add_algorithm(self, algorithm: BaseAlgorithm):
        """
        Register one checker instance.

        Raises:
            TypeError: If ``algorithm`` is not an instance of ``BaseAlgorithm``.
        """
        if not isinstance(algorithm, BaseAlgorithm):
            msg = 'It must be an instance of a subclass of BaseAlgorithm.'
            logger.error(msg)
            raise TypeError(msg)
        self.algorithms.append(algorithm)

    def execute_all(self, api_data: ApiData, ignore_info: IgnoreInfo):
        """
        Run every registered algorithm on ``api_data`` in registration order.

        Raises:
            RuntimeError: If any algorithm raises; the original exception is chained.
        """
        for algorithm in self.algorithms:
            try:
                algorithm.run(api_data, ignore_info)
            except Exception as e:
                msg = 'Run algorithm failed.'
                # Log which checker failed and why; the raised message stays unchanged.
                logger.error(f'{msg} Algorithm: {type(algorithm).__name__}, error: {e}')
                raise RuntimeError(msg) from e

    def _add_algorithm(self):
        """Register the checker set that belongs to ``self.mode`` (none for other modes)."""
        if self.mode == CompareMode.STATISTICS.value:
            checkers = STATISTICS_CHECKERS_PARALLEL_MERGE if self.parallel_merge else STATISTICS_CHECKERS
        elif self.mode == CompareMode.TENSOR.value:
            checkers = TENSOR_CHECKERS
        elif self.mode == CompareMode.MD5.value:
            checkers = MD5_CHECKERS
        else:
            checkers = ()

        for checker in checkers:
            self.add_algorithm(checker())
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def calculate_excel_result_df(result_df, mode, chunk_size=1000):
    """
    Excel comparison only: compute the accuracy indicator (pass/warning/error) of every
    table row and write it, together with the error message, back into ``result_df``.

    Args:
        result_df: DataFrame holding the sheet as it is before conversion to Excel.
            It is modified in place.
        mode: Compare mode (tensor mode, statistics mode or md5 mode).
        chunk_size: Number of rows written back per assignment. Default 1000. The
            results are assigned chunk by chunk so that only a small slice is
            materialised at a time, keeping the memory peak low. Must be positive.

    Raises:
        RuntimeError: If no header is registered for ``mode``.
        ValueError: If the header lacks the result / error-message column, or if
            ``chunk_size`` is not positive.
    """
    result_dict = divide_result_df(result_df)
    calculator = ApiIndicatorCalculator(mode)
    calculated_result_lists = []
    for data_lists in result_dict.values():
        # calculate() finalises the rows of data_lists in place; its return value
        # (the per-API indicator) is not needed here.
        calculator.calculate(data_lists)
        calculated_result_lists.extend(data_lists)

    head = CompareConst.HEAD_OF_COMPARE_MODE.get(mode)
    if not head:
        msg = f'Unable to obtain header based on compare mode: {mode}'
        logger.error(msg)
        raise RuntimeError(msg)

    # Column mapping: [(target column name in result_df, column index in each result row)]
    try:
        cols_mapping = [
            (CompareConst.RESULT, head.index(CompareConst.RESULT)),
            (CompareConst.ERROR_MESSAGE, head.index(CompareConst.ERROR_MESSAGE))
        ]
    except ValueError as e:
        logger.error(f'The {CompareConst.RESULT} or {CompareConst.ERROR_MESSAGE} does not exist in the header: {e}')
        raise

    # A negative step would make range() yield nothing and silently skip the write-back.
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size}.')

    total_rows = len(calculated_result_lists)
    if not total_rows:
        return

    # Resolve the DataFrame column positions once instead of once per chunk.
    targets = [(result_df.columns.get_loc(df_col_name), result_col_idx)
               for df_col_name, result_col_idx in cols_mapping]

    # Assign chunk by chunk to keep the transient memory peak low.
    for start in range(0, total_rows, chunk_size):
        end_idx = min(start + chunk_size, total_rows)
        current_result_chunk = calculated_result_lists[start:end_idx]

        for df_col_idx, result_col_idx in targets:
            col_data = [sublist[result_col_idx] for sublist in current_result_chunk]
            result_df.iloc[start:end_idx, df_col_idx] = col_data
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def calculate_result(result, mode, parallel_merge=False):
    """
    Compute the indicator and error messages of one API or module.

    Args:
        result: ``List[List]``; each inner list holds the details of one parameter of
            the API or module.
        mode: Compare mode (tensor mode, statistics mode or md5 mode).
        parallel_merge: Whether this is the merged comparison of graphs built with
            different parallel split strategies. Default False.

    Returns:
        The accuracy comparison indicator (pass/warning/error).
    """
    return ApiIndicatorCalculator(mode, parallel_merge).calculate(result)
|