mindstudio-probe 8.3.2__py3-none-any.whl → 26.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/METADATA +26 -14
- mindstudio_probe-26.0.0a1.dist-info/RECORD +498 -0
- {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/WHEEL +1 -1
- mindstudio_probe-26.0.0a1.dist-info/entry_points.txt +5 -0
- mindstudio_probe-26.0.0a1.dist-info/licenses/LICENSE +124 -0
- mindstudio_probe-26.0.0a1.dist-info/top_level.txt +2 -0
- msprobe/__init__.py +12 -13
- msprobe/config.json +9 -31
- msprobe/core/__init__.py +12 -11
- msprobe/core/acc_check/acc_check_cli.py +145 -0
- msprobe/core/common/const.py +97 -38
- msprobe/core/common/db_manager.py +133 -12
- msprobe/core/common/decorator.py +12 -11
- msprobe/core/common/exceptions.py +12 -11
- msprobe/core/common/file_utils.py +101 -25
- msprobe/core/common/framework_adapter.py +36 -25
- msprobe/core/common/global_lock.py +12 -11
- msprobe/core/common/inplace_op_checker.py +12 -11
- msprobe/core/common/log.py +22 -11
- msprobe/core/common/megatron_utils.py +566 -11
- msprobe/core/common/parallel_state.py +12 -11
- msprobe/core/common/runtime.py +12 -11
- msprobe/core/common/utils.py +41 -41
- msprobe/core/compare/acc_compare.py +361 -104
- msprobe/core/compare/atb_data_compare.py +422 -0
- msprobe/core/compare/auto_compare.py +134 -0
- msprobe/core/compare/check.py +14 -17
- msprobe/core/compare/compare_cli.py +72 -149
- msprobe/core/compare/config.py +12 -13
- msprobe/core/compare/diff_analyze/first_diff_analyze.py +28 -15
- msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
- msprobe/core/compare/find_first/analyzer.py +18 -18
- msprobe/core/compare/find_first/graph.py +12 -11
- msprobe/core/compare/find_first/utils.py +13 -12
- msprobe/core/compare/indicator_analysis/__init__.py +15 -0
- msprobe/core/compare/indicator_analysis/algorithm.py +363 -0
- msprobe/core/compare/indicator_analysis/api_data.py +141 -0
- msprobe/core/compare/indicator_analysis/calculator.py +181 -0
- msprobe/core/compare/indicator_analysis/utils.py +116 -0
- msprobe/core/compare/layer_mapping/__init__.py +12 -11
- msprobe/core/compare/layer_mapping/data_scope_parser.py +20 -11
- msprobe/core/compare/layer_mapping/layer_mapping.py +14 -13
- msprobe/core/compare/layer_mapping/postprocess_pass.py +13 -11
- msprobe/core/compare/merge_result/merge_result.py +12 -11
- msprobe/core/compare/merge_result/merge_result_cli.py +12 -11
- msprobe/core/compare/merge_result/utils.py +12 -11
- msprobe/core/compare/multiprocessing_compute.py +13 -14
- msprobe/core/compare/npy_compare.py +13 -11
- msprobe/core/compare/offline_data_compare.py +160 -0
- msprobe/core/compare/stats_diff_calc.py +39 -0
- msprobe/core/compare/torchair_acc_cmp.py +764 -0
- msprobe/core/compare/torchair_cmp_utils.py +338 -0
- msprobe/core/compare/utils.py +140 -49
- msprobe/core/config_check/__init__.py +12 -11
- msprobe/core/config_check/checkers/__init__.py +12 -11
- msprobe/core/config_check/checkers/base_checker.py +15 -14
- msprobe/core/config_check/checkers/dataset_checker.py +13 -12
- msprobe/core/config_check/checkers/env_args_checker.py +13 -12
- msprobe/core/config_check/checkers/hyperparameter_checker.py +16 -15
- msprobe/core/config_check/checkers/pip_checker.py +15 -15
- msprobe/core/config_check/checkers/random_checker.py +13 -12
- msprobe/core/config_check/checkers/weights_checker.py +14 -12
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +13 -17
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +13 -12
- msprobe/core/config_check/ckpt_compare/metrics.py +12 -11
- msprobe/core/config_check/config_check_cli.py +18 -17
- msprobe/core/config_check/config_checker.py +16 -14
- msprobe/core/config_check/resource/dependency.yaml +15 -12
- msprobe/core/config_check/resource/env.yaml +12 -11
- msprobe/core/config_check/utils/hyperparameter_parser.py +12 -11
- msprobe/core/config_check/utils/utils.py +12 -11
- msprobe/core/{data_dump → dump/api_dump}/api_registry.py +12 -11
- msprobe/core/{common_config.py → dump/common_config.py} +13 -24
- msprobe/core/dump/data_dump/data_collector.py +257 -0
- msprobe/core/{data_dump → dump/data_dump}/data_processor/base.py +45 -36
- msprobe/core/{data_dump → dump/data_dump}/data_processor/factory.py +33 -25
- msprobe/core/{data_dump → dump/data_dump}/data_processor/mindspore_processor.py +37 -113
- msprobe/core/{data_dump → dump/data_dump}/data_processor/pytorch_processor.py +364 -131
- msprobe/core/{data_dump → dump/data_dump}/json_writer.py +24 -31
- msprobe/core/{data_dump → dump/data_dump}/scope.py +12 -13
- msprobe/core/{debugger → dump/debugger}/precision_debugger.py +15 -23
- msprobe/core/dump/dump2db/db_utils.py +215 -0
- msprobe/core/dump/dump2db/dump2db.py +409 -0
- msprobe/core/{hook_manager.py → dump/hook_manager.py} +38 -87
- msprobe/core/dump/kernel_dump/kernel_config.py +34 -0
- msprobe/core/{service.py → dump/service.py} +43 -27
- msprobe/core/install_deps/install_deps.py +51 -0
- msprobe/core/monitor/anomaly_processor.py +13 -11
- msprobe/core/monitor/csv2db.py +73 -93
- msprobe/core/monitor/db_utils.py +140 -205
- msprobe/core/monitor/utils.py +18 -17
- msprobe/core/monitor_v2/__init__.py +20 -0
- msprobe/core/monitor_v2/base.py +83 -0
- msprobe/core/monitor_v2/cc.py +287 -0
- msprobe/core/monitor_v2/factory.py +81 -0
- msprobe/core/monitor_v2/module.py +201 -0
- msprobe/core/monitor_v2/optimizer.py +245 -0
- msprobe/core/monitor_v2/param.py +154 -0
- msprobe/core/monitor_v2/trainer.py +326 -0
- msprobe/core/monitor_v2/utils.py +122 -0
- msprobe/core/monitor_v2/weight_grad.py +419 -0
- msprobe/core/monitor_v2/writer.py +162 -0
- msprobe/core/overflow_check/abnormal_scene.py +12 -11
- msprobe/core/overflow_check/api_info.py +12 -11
- msprobe/core/overflow_check/checker.py +12 -11
- msprobe/core/overflow_check/filter.py +13 -11
- msprobe/core/overflow_check/level.py +12 -11
- msprobe/core/overflow_check/utils.py +12 -11
- msprobe/core/single_save/single_comparator.py +12 -11
- msprobe/core/single_save/single_saver.py +12 -11
- msprobe/infer/__init__.py +16 -0
- msprobe/infer/offline/__init__.py +16 -0
- msprobe/infer/offline/compare/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/adapter_cli/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/adapter_cli/args_adapter.py +46 -0
- msprobe/infer/offline/compare/msquickcmp/atc/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/atc/atc_utils.py +98 -0
- msprobe/infer/offline/compare/msquickcmp/cmp_process.py +328 -0
- msprobe/infer/offline/compare/msquickcmp/common/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/common/args_check.py +112 -0
- msprobe/infer/offline/compare/msquickcmp/common/convert.py +74 -0
- msprobe/infer/offline/compare/msquickcmp/common/dump_data.py +121 -0
- msprobe/infer/offline/compare/msquickcmp/common/dynamic_argument_bean.py +39 -0
- msprobe/infer/offline/compare/msquickcmp/common/utils.py +669 -0
- msprobe/infer/offline/compare/msquickcmp/config.ini +6 -0
- msprobe/infer/offline/compare/msquickcmp/dump/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/dump/args_adapter.py +50 -0
- msprobe/infer/offline/compare/msquickcmp/dump/dump_process.py +91 -0
- msprobe/infer/offline/compare/msquickcmp/install_aclruntime_aisbench.sh +180 -0
- msprobe/infer/offline/compare/msquickcmp/main.py +199 -0
- msprobe/infer/offline/compare/msquickcmp/net_compare/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/net_compare/net_compare.py +277 -0
- msprobe/infer/offline/compare/msquickcmp/npu/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/npu/npu_dump_data.py +558 -0
- msprobe/infer/offline/compare/msquickcmp/npu/om_parser.py +416 -0
- msprobe/infer/offline/compare/msquickcmp/onnx_model/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/onnx_model/onnx_dump_data.py +374 -0
- msprobe/infer/utils/__init__.py +15 -0
- msprobe/infer/utils/acc_cmp.py +94 -0
- msprobe/infer/utils/check/__init__.py +37 -0
- msprobe/infer/utils/check/args_checker.py +35 -0
- msprobe/infer/utils/check/checker.py +227 -0
- msprobe/infer/utils/check/dict_checker.py +78 -0
- msprobe/infer/utils/check/func_wrapper.py +96 -0
- msprobe/infer/utils/check/list_checker.py +56 -0
- msprobe/infer/utils/check/number_checker.py +64 -0
- msprobe/infer/utils/check/obj_checker.py +41 -0
- msprobe/infer/utils/check/path_checker.py +249 -0
- msprobe/infer/utils/check/rule.py +126 -0
- msprobe/infer/utils/check/string_checker.py +66 -0
- msprobe/infer/utils/cmp_algorithm.py +261 -0
- msprobe/infer/utils/constants.py +112 -0
- msprobe/infer/utils/file_open_check.py +337 -0
- msprobe/infer/utils/util.py +177 -0
- msprobe/mindspore/__init__.py +14 -13
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +14 -13
- msprobe/mindspore/api_accuracy_checker/api_info.py +12 -11
- msprobe/mindspore/api_accuracy_checker/api_runner.py +12 -11
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +12 -11
- msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +12 -11
- msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +12 -11
- msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +12 -11
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +15 -14
- msprobe/mindspore/api_accuracy_checker/compute_element.py +12 -11
- msprobe/mindspore/api_accuracy_checker/data_manager.py +13 -11
- msprobe/mindspore/api_accuracy_checker/main.py +12 -11
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +14 -12
- msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +13 -11
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +12 -11
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +12 -11
- msprobe/mindspore/api_accuracy_checker/utils.py +12 -11
- msprobe/mindspore/common/const.py +15 -74
- msprobe/mindspore/common/log.py +12 -11
- msprobe/mindspore/common/utils.py +30 -15
- msprobe/mindspore/compare/common_dir_compare.py +21 -23
- msprobe/mindspore/compare/distributed_compare.py +18 -16
- msprobe/mindspore/compare/ms_compare.py +14 -14
- msprobe/mindspore/compare/ms_graph_compare.py +26 -20
- msprobe/mindspore/compare/utils.py +14 -12
- msprobe/mindspore/{cell_processor.py → dump/cell_processor.py} +15 -14
- msprobe/mindspore/{debugger → dump/debugger}/debugger_config.py +12 -30
- msprobe/mindspore/{debugger → dump/debugger}/precision_debugger.py +43 -45
- msprobe/mindspore/dump/{cell_dump_process.py → dump_processor/cell_dump_process.py} +31 -17
- msprobe/mindspore/dump/{cell_dump_with_insert_gradient.py → dump_processor/cell_dump_with_insert_gradient.py} +18 -14
- msprobe/mindspore/dump/{dump_tool_factory.py → dump_processor/dump_tool_factory.py} +16 -15
- msprobe/mindspore/dump/{graph_mode_cell_dump.py → dump_processor/graph_mode_cell_dump.py} +16 -15
- msprobe/mindspore/dump/{graph_tensor_dump.py → dump_processor/graph_tensor_dump.py} +134 -133
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/api_register.py +15 -14
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/hook_cell.py +12 -11
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/ms_hook_manager.py +47 -20
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/primitive_hooks.py +14 -13
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/support_wrap_ops.yaml +13 -11
- msprobe/mindspore/dump/{jit_dump.py → dump_processor/jit_dump.py} +14 -13
- msprobe/mindspore/dump/{kernel_graph_dump.py → dump_processor/kernel_graph_dump.py} +13 -12
- msprobe/mindspore/dump/{kernel_kbyk_dump.py → dump_processor/kernel_kbyk_dump.py} +13 -12
- msprobe/mindspore/{exception_dump → dump/exception_dump}/exception_dump_tool_factory.py +14 -13
- msprobe/mindspore/{exception_dump → dump/exception_dump}/kernel_graph_exception_dump.py +13 -12
- msprobe/mindspore/{mindspore_service.py → dump/mindspore_service.py} +18 -17
- msprobe/mindspore/dump/mindtorch/__init__.py +19 -0
- msprobe/mindspore/dump/ms_config.py +105 -0
- msprobe/mindspore/{overflow_check → dump/overflow_check}/kernel_graph_overflow_check.py +13 -12
- msprobe/mindspore/{overflow_check → dump/overflow_check}/overflow_check_tool_factory.py +14 -13
- msprobe/mindspore/dump/task_handler_factory.py +43 -0
- msprobe/mindspore/monitor/common_func.py +12 -11
- msprobe/mindspore/monitor/data_writers.py +12 -11
- msprobe/mindspore/monitor/distributed/wrap_distributed.py +93 -39
- msprobe/mindspore/monitor/features.py +12 -11
- msprobe/mindspore/monitor/module_hook.py +19 -22
- msprobe/mindspore/monitor/optimizer_collect.py +29 -25
- msprobe/mindspore/monitor/utils.py +13 -11
- msprobe/msaccucmp/advisor/__init__.py +16 -0
- msprobe/msaccucmp/advisor/advisor_const.py +65 -0
- msprobe/msaccucmp/advisor/advisor_result.py +73 -0
- msprobe/msaccucmp/advisor/compare_advisor.py +99 -0
- msprobe/msaccucmp/advisor/input_advisor.py +66 -0
- msprobe/msaccucmp/advisor/node_advisor.py +68 -0
- msprobe/msaccucmp/advisor/overflow_advisor.py +58 -0
- msprobe/msaccucmp/algorithm_manager/__init__.py +16 -0
- msprobe/msaccucmp/algorithm_manager/algorithm_manager.py +464 -0
- msprobe/msaccucmp/algorithm_manager/algorithm_parameter.py +42 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_AccumulatedRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_CosineSimilarity.py +58 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_KullbackLeiblerDivergence.py +84 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxAbsoluteError.py +41 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanAbsoluteError.py +41 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RelativeEuclideanDistance.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RootMeanSquareError.py +40 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_StandardDeviation.py +47 -0
- msprobe/msaccucmp/cmp_utils/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/common.py +113 -0
- msprobe/msaccucmp/cmp_utils/constant/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/constant/compare_error.py +81 -0
- msprobe/msaccucmp/cmp_utils/constant/const_manager.py +530 -0
- msprobe/msaccucmp/cmp_utils/file_utils.py +497 -0
- msprobe/msaccucmp/cmp_utils/log.py +257 -0
- msprobe/msaccucmp/cmp_utils/multi_process/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/multi_process/multi_convert_process.py +140 -0
- msprobe/msaccucmp/cmp_utils/multi_process/progress.py +78 -0
- msprobe/msaccucmp/cmp_utils/path_check.py +274 -0
- msprobe/msaccucmp/cmp_utils/reg_manager.py +98 -0
- msprobe/msaccucmp/cmp_utils/tlv_parse.py +279 -0
- msprobe/msaccucmp/cmp_utils/utils.py +356 -0
- msprobe/msaccucmp/cmp_utils/utils_type.py +63 -0
- msprobe/msaccucmp/compare_vector.py +48 -0
- msprobe/msaccucmp/conversion/__init__.py +16 -0
- msprobe/msaccucmp/conversion/data_conversion.py +277 -0
- msprobe/msaccucmp/conversion/dtype_conversion.py +99 -0
- msprobe/msaccucmp/conversion/shape_format_conversion.py +477 -0
- msprobe/msaccucmp/conversion/tensor_conversion.py +369 -0
- msprobe/msaccucmp/dump_data_conversion.py +46 -0
- msprobe/msaccucmp/dump_parse/__init__.py +16 -0
- msprobe/msaccucmp/dump_parse/big_dump_data.py +317 -0
- msprobe/msaccucmp/dump_parse/dump.py +423 -0
- msprobe/msaccucmp/dump_parse/dump_data_object.py +322 -0
- msprobe/msaccucmp/dump_parse/dump_data_parser.py +436 -0
- msprobe/msaccucmp/dump_parse/dump_utils.py +246 -0
- msprobe/msaccucmp/dump_parse/ffts_parser.py +137 -0
- msprobe/msaccucmp/dump_parse/mapping.py +62 -0
- msprobe/msaccucmp/dump_parse/nano_dump_data.py +392 -0
- msprobe/msaccucmp/dump_parse/proto_dump_data.py +308 -0
- msprobe/msaccucmp/dump_parser.py +90 -0
- msprobe/msaccucmp/format_manager/__init__.py +16 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NCHW.py +53 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_ND.py +52 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NHWC.py +53 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_HWCN.py +47 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_NCHW.py +47 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_FRACTAL_Z.py +89 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NCHW.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NHWC.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_HWCN.py +43 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NCHW.py +48 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NHWC.py +43 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_FRACTAL_Z.py +87 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_NHWC.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_NCDHW.py +48 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_ND.py +44 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_FRACTAL_Z.py +87 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_HWCN.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_NCHW.py +37 -0
- msprobe/msaccucmp/format_manager/format_manager.py +307 -0
- msprobe/msaccucmp/inplace_layer_process.py +186 -0
- msprobe/msaccucmp/msaccucmp.py +532 -0
- msprobe/msaccucmp/mscmp_advisor.py +128 -0
- msprobe/msaccucmp/overflow/__init__.py +16 -0
- msprobe/msaccucmp/overflow/overflow_analyse.py +305 -0
- msprobe/msaccucmp/overflow/overflow_detection.py +143 -0
- msprobe/msaccucmp/pytorch_cmp/__init__.py +16 -0
- msprobe/msaccucmp/pytorch_cmp/compare_pytorch.py +389 -0
- msprobe/msaccucmp/pytorch_cmp/hdf5_parser.py +377 -0
- msprobe/msaccucmp/pytorch_cmp/pytorch_dump_data.py +461 -0
- msprobe/msaccucmp/shape_conversion.py +41 -0
- msprobe/msaccucmp/vector_cmp/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/batch_compare.py +197 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/compare_detail.py +245 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/detail.py +182 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/detail_writer.py +580 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_fusion_op.py +588 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_npu_vs_npu.py +339 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_result.py +326 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_rule.py +156 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_op.py +204 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_rule_parser.py +635 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/quant_filter.py +187 -0
- msprobe/msaccucmp/vector_cmp/range_manager/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/range_manager/range_manager.py +100 -0
- msprobe/msaccucmp/vector_cmp/range_manager/range_mode.py +94 -0
- msprobe/msaccucmp/vector_cmp/range_manager/select_mode.py +86 -0
- msprobe/msaccucmp/vector_cmp/vector_comparison.py +535 -0
- msprobe/msprobe.py +101 -130
- msprobe/overflow_check/__init__.py +15 -0
- msprobe/{nan_analyze → overflow_check}/analyzer.py +38 -27
- msprobe/{nan_analyze → overflow_check}/graph.py +30 -27
- msprobe/{nan_analyze → overflow_check}/utils.py +15 -14
- msprobe/pytorch/__init__.py +20 -14
- msprobe/pytorch/aclgraph_dump/__init__.py +45 -0
- msprobe/pytorch/aclgraph_dump/_meta.py +26 -0
- msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut.py → acc_check/acc_check.py} +50 -45
- msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut_utils.py → acc_check/acc_check_utils.py} +201 -30
- msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/data_generate.py +56 -16
- msprobe/pytorch/api_accuracy_checker/{run_ut/multi_run_ut.py → acc_check/multi_acc_check.py} +32 -47
- msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/run_overflow_check.py +19 -18
- msprobe/pytorch/api_accuracy_checker/common/config.py +22 -20
- msprobe/pytorch/api_accuracy_checker/common/utils.py +72 -13
- msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -11
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +23 -14
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +45 -32
- msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +12 -11
- msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +14 -12
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +14 -12
- msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +21 -19
- msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +14 -13
- msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +60 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +27 -16
- msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +13 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +39 -18
- msprobe/pytorch/bench_functions/__init__.py +12 -11
- msprobe/pytorch/bench_functions/apply_adam.py +12 -11
- msprobe/pytorch/bench_functions/apply_adam_w.py +12 -11
- msprobe/pytorch/bench_functions/confusion_transpose.py +12 -11
- msprobe/pytorch/bench_functions/fast_gelu.py +12 -11
- msprobe/pytorch/bench_functions/group_norm_silu.py +12 -11
- msprobe/pytorch/bench_functions/layer_norm_eval.py +12 -11
- msprobe/pytorch/bench_functions/linear.py +12 -11
- msprobe/pytorch/bench_functions/matmul_backward.py +12 -11
- msprobe/pytorch/bench_functions/mish.py +12 -11
- msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +12 -11
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +12 -11
- msprobe/pytorch/bench_functions/rms_norm.py +12 -11
- msprobe/pytorch/bench_functions/rotary_mul.py +12 -11
- msprobe/pytorch/bench_functions/scaled_mask_softmax.py +12 -11
- msprobe/pytorch/bench_functions/sort_v2.py +12 -11
- msprobe/pytorch/bench_functions/swiglu.py +12 -11
- msprobe/pytorch/common/__init__.py +12 -11
- msprobe/pytorch/common/log.py +12 -11
- msprobe/pytorch/common/parse_json.py +12 -11
- msprobe/pytorch/common/utils.py +52 -19
- msprobe/pytorch/compare/distributed_compare.py +13 -13
- msprobe/pytorch/compare/match.py +12 -11
- msprobe/pytorch/compare/pt_compare.py +14 -20
- msprobe/pytorch/compare/pt_diff_analyze.py +12 -11
- msprobe/pytorch/compare/utils.py +12 -11
- msprobe/pytorch/{hook_module → dump/api_dump}/api_register.py +18 -16
- msprobe/pytorch/{hook_module → dump/api_dump}/hook_module.py +14 -13
- msprobe/pytorch/{hook_module → dump/api_dump}/pt_hook_manager.py +68 -23
- msprobe/pytorch/{hook_module → dump/api_dump}/register_optimizer_hook.py +13 -11
- msprobe/pytorch/{hook_module → dump/api_dump}/script_wrapper.py +17 -14
- msprobe/pytorch/{hook_module → dump/api_dump}/utils.py +12 -11
- msprobe/pytorch/{debugger → dump/debugger}/debugger_config.py +23 -38
- msprobe/pytorch/dump/debugger/precision_debugger.py +130 -0
- msprobe/pytorch/{function_factory.py → dump/function_factory.py} +12 -11
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +17 -13
- msprobe/pytorch/dump/module_dump/module_dump.py +16 -15
- msprobe/pytorch/dump/module_dump/{module_processer.py → module_processor.py} +54 -42
- msprobe/pytorch/dump/pt_config.py +128 -0
- msprobe/pytorch/{pytorch_service.py → dump/pytorch_service.py} +22 -21
- msprobe/pytorch/monitor/csv2tb.py +13 -11
- msprobe/pytorch/monitor/data_writers.py +13 -11
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +13 -11
- msprobe/pytorch/monitor/features.py +12 -11
- msprobe/pytorch/monitor/module_hook.py +67 -59
- msprobe/pytorch/monitor/module_metric.py +13 -11
- msprobe/pytorch/monitor/optimizer_collect.py +37 -35
- msprobe/pytorch/monitor/utils.py +13 -11
- msprobe/pytorch/monitor/visualizer.py +12 -11
- msprobe/pytorch/torchair_dump/__init__.py +17 -0
- msprobe/pytorch/torchair_dump/torchair_dump.py +114 -0
- msprobe/scripts/atb/config_example.json +10 -0
- msprobe/scripts/atb/load_atb_probe.sh +101 -0
- msprobe/scripts/atb/unload_atb_probe.sh +27 -0
- msprobe/scripts/build_msaccucmp.sh +186 -0
- msprobe/scripts/conf/help.info +6 -0
- msprobe/scripts/conf/version.info +3 -0
- msprobe/scripts/run_script/common.sh +538 -0
- msprobe/scripts/run_script/main_msaccucmp.sh +232 -0
- msprobe/visualization/__init__.py +12 -11
- msprobe/visualization/builder/__init__.py +12 -11
- msprobe/visualization/builder/graph_builder.py +45 -30
- msprobe/visualization/builder/graph_merger.py +53 -32
- msprobe/visualization/builder/msprobe_adapter.py +34 -44
- msprobe/visualization/compare/__init__.py +12 -11
- msprobe/visualization/compare/graph_comparator.py +63 -51
- msprobe/visualization/compare/mode_adapter.py +28 -113
- msprobe/visualization/db_utils.py +133 -22
- msprobe/visualization/graph/__init__.py +12 -11
- msprobe/visualization/graph/base_node.py +15 -27
- msprobe/visualization/graph/distributed_analyzer.py +97 -40
- msprobe/visualization/graph/graph.py +14 -16
- msprobe/visualization/graph/node_colors.py +34 -31
- msprobe/visualization/graph/node_op.py +12 -11
- msprobe/visualization/graph_service.py +580 -205
- msprobe/visualization/utils.py +278 -31
- tb_graph_ascend/secure_build.py +175 -0
- tb_graph_ascend/server/__init__.py +15 -0
- tb_graph_ascend/server/app/__init__.py +15 -0
- tb_graph_ascend/server/app/model/__init__.py +15 -0
- tb_graph_ascend/server/app/model/hierarchy.py +348 -0
- tb_graph_ascend/server/app/model/layout_hierarchy_model.py +69 -0
- tb_graph_ascend/server/app/model/match_nodes_model.py +573 -0
- tb_graph_ascend/server/app/repositories/__init__.py +15 -0
- tb_graph_ascend/server/app/repositories/graph_repo_base.py +32 -0
- tb_graph_ascend/server/app/repositories/graph_repo_db.py +879 -0
- tb_graph_ascend/server/app/repositories/graph_repo_vis.py +83 -0
- tb_graph_ascend/server/app/service/__init__.py +18 -0
- tb_graph_ascend/server/app/service/graph_service_base.py +158 -0
- tb_graph_ascend/server/app/service/graph_service_db.py +438 -0
- tb_graph_ascend/server/app/service/graph_service_factory.py +54 -0
- tb_graph_ascend/server/app/service/graph_service_vis.py +480 -0
- tb_graph_ascend/server/app/utils/__init__.py +15 -0
- tb_graph_ascend/server/app/utils/constant.py +80 -0
- tb_graph_ascend/server/app/utils/file_check_wrapper.py +46 -0
- tb_graph_ascend/server/app/utils/global_state.py +95 -0
- tb_graph_ascend/server/app/utils/graph_utils.py +661 -0
- tb_graph_ascend/server/app/utils/i18n.py +153 -0
- tb_graph_ascend/server/app/utils/request_method.py +46 -0
- tb_graph_ascend/server/app/views/__init__.py +15 -0
- tb_graph_ascend/server/app/views/graph_views.py +304 -0
- tb_graph_ascend/server/plugin.py +108 -0
- tb_graph_ascend/server/static/index.html +9250 -0
- tb_graph_ascend/server/static/index.js +21 -0
- tb_graph_ascend/setup.py +57 -0
- mindstudio_probe-8.3.2.dist-info/LICENSE +0 -201
- mindstudio_probe-8.3.2.dist-info/RECORD +0 -491
- mindstudio_probe-8.3.2.dist-info/entry_points.txt +0 -2
- mindstudio_probe-8.3.2.dist-info/top_level.txt +0 -1
- msprobe/CMakeLists.txt +0 -5
- msprobe/README.md +0 -203
- msprobe/core/advisor/advisor.py +0 -129
- msprobe/core/advisor/advisor_const.py +0 -58
- msprobe/core/advisor/advisor_result.py +0 -58
- msprobe/core/compare/find_first/data_processor.py +0 -35
- msprobe/core/compare/highlight.py +0 -390
- msprobe/core/data_dump/data_collector.py +0 -356
- msprobe/core/grad_probe/constant.py +0 -90
- msprobe/core/grad_probe/grad_compare.py +0 -187
- msprobe/core/grad_probe/utils.py +0 -105
- msprobe/core/kernel_dump/kernel_config.py +0 -33
- msprobe/docs/01.installation.md +0 -250
- msprobe/docs/02.config_introduction.md +0 -221
- msprobe/docs/03.config_examples.md +0 -281
- msprobe/docs/04.kernel_dump_PyTorch.md +0 -73
- msprobe/docs/05.data_dump_PyTorch.md +0 -518
- msprobe/docs/06.data_dump_MindSpore.md +0 -618
- msprobe/docs/07.accuracy_checker_PyTorch.md +0 -310
- msprobe/docs/09.accuracy_checker_MindSpore.md +0 -120
- msprobe/docs/10.accuracy_compare_PyTorch.md +0 -637
- msprobe/docs/11.accuracy_compare_MindSpore.md +0 -769
- msprobe/docs/12.overflow_check_PyTorch.md +0 -82
- msprobe/docs/13.overflow_check_MindSpore.md +0 -33
- msprobe/docs/14.data_parse_PyTorch.md +0 -282
- msprobe/docs/15.free_benchmarking_PyTorch.md +0 -169
- msprobe/docs/16.free_benchmarking_MindSpore.md +0 -159
- msprobe/docs/17.grad_probe.md +0 -205
- msprobe/docs/18.online_dispatch.md +0 -89
- msprobe/docs/19.monitor.md +0 -753
- msprobe/docs/20.monitor_performance_baseline.md +0 -52
- msprobe/docs/21.visualization_PyTorch.md +0 -519
- msprobe/docs/22.visualization_MindSpore.md +0 -515
- msprobe/docs/23.generate_operator_PyTorch.md +0 -107
- msprobe/docs/24.code_mapping_Mindspore.md +0 -29
- msprobe/docs/25.tool_function_introduction.md +0 -29
- msprobe/docs/26.data_dump_PyTorch_baseline.md +0 -48
- msprobe/docs/27.dump_json_instruction.md +0 -795
- msprobe/docs/28.debugger_save_instruction.md +0 -288
- msprobe/docs/28.kernel_dump_MindSpore.md +0 -69
- msprobe/docs/29.data_dump_MSAdapter.md +0 -235
- msprobe/docs/30.overflow_check_MSAdapter.md +0 -31
- msprobe/docs/31.config_check.md +0 -107
- msprobe/docs/32.ckpt_compare.md +0 -69
- msprobe/docs/33.generate_operator_MindSpore.md +0 -181
- msprobe/docs/34.RL_collect.md +0 -101
- msprobe/docs/35.nan_analyze.md +0 -73
- msprobe/docs/36.calculation_result_change.md +0 -75
- msprobe/docs/FAQ.md +0 -232
- msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +0 -146
- msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +0 -14
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +0 -33
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +0 -217
- msprobe/docs/img/BLOOM-7B_1.png +0 -0
- msprobe/docs/img/BLOOM-7B_2.png +0 -0
- msprobe/docs/img/BLOOM-7B_3.png +0 -0
- msprobe/docs/img/BLOOM-7B_4.png +0 -0
- msprobe/docs/img/GPT-3_1.png +0 -0
- msprobe/docs/img/GPT-3_2.png +0 -0
- msprobe/docs/img/GPT-3_3.png +0 -0
- msprobe/docs/img/GPT-3_4.png +0 -0
- msprobe/docs/img/GPT-3_5.png +0 -0
- msprobe/docs/img/GPT-3_6.png +0 -0
- msprobe/docs/img/GPT-3_7.png +0 -0
- msprobe/docs/img/GPT-3_8.png +0 -0
- msprobe/docs/img/YOLOV5S_1.png +0 -0
- msprobe/docs/img/YOLOV5S_2.png +0 -0
- msprobe/docs/img/accuracy_checking_details.png +0 -0
- msprobe/docs/img/accuracy_checking_result.png +0 -0
- msprobe/docs/img/api_precision_compare_details.png +0 -0
- msprobe/docs/img/api_precision_compare_result.png +0 -0
- msprobe/docs/img/auto_analyze_log.png +0 -0
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/compare_result_pkl.png +0 -0
- msprobe/docs/img/compare_result_pkl_md5.png.png +0 -0
- msprobe/docs/img/cpu_info.png +0 -0
- msprobe/docs/img/free_benchmark.png +0 -0
- msprobe/docs/img/free_benchmark_framework.png +0 -0
- msprobe/docs/img/grad_probe_image-1.png +0 -0
- msprobe/docs/img/grad_probe_image-2.png +0 -0
- msprobe/docs/img/grad_probe_image-3.png +0 -0
- msprobe/docs/img/grad_probe_image-4.png +0 -0
- msprobe/docs/img/grad_probe_image.png +0 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/module_compare.png +0 -0
- msprobe/docs/img/monitor/cpu_info.png +0 -0
- msprobe/docs/img/monitor/step_count_per_record.png +0 -0
- msprobe/docs/img/ms_dump.png +0 -0
- msprobe/docs/img/ms_layer.png +0 -0
- msprobe/docs/img/pt_dump.png +0 -0
- msprobe/docs/img/save_compare_result_sample.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
- msprobe/docs/img/visualization/proxy.png +0 -0
- msprobe/docs/img/visualization/tensorboard_1.png +0 -0
- msprobe/docs/img/visualization/tensorboard_2.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_browser_2.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/docs/visualization/GPTModel.png +0 -0
- msprobe/docs/visualization/ParallelMLP.png +0 -0
- msprobe/docs/visualization/layer_mapping_example.md +0 -132
- msprobe/docs/visualization/mapping.png +0 -0
- msprobe/docs/visualization/mapping1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +0 -59
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +0 -80
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +0 -330
- msprobe/docs/visualization/module_name.png +0 -0
- msprobe/docs/visualization/module_name1.png +0 -0
- msprobe/docs/visualization/no_mapping.png +0 -0
- msprobe/docs/visualization/no_mapping1.png +0 -0
- msprobe/docs/visualization/no_mapping_analyze.png +0 -0
- msprobe/docs/visualization/top_layer.png +0 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +0 -460
- msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +0 -2081
- msprobe/mindspore/code_mapping/bind.py +0 -283
- msprobe/mindspore/code_mapping/cmd_parser.py +0 -40
- msprobe/mindspore/code_mapping/graph.py +0 -49
- msprobe/mindspore/code_mapping/graph_parser.py +0 -211
- msprobe/mindspore/code_mapping/main.py +0 -24
- msprobe/mindspore/code_mapping/processor.py +0 -34
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +0 -111
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -52
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +0 -257
- msprobe/mindspore/free_benchmark/common/config.py +0 -27
- msprobe/mindspore/free_benchmark/common/handler_params.py +0 -31
- msprobe/mindspore/free_benchmark/common/utils.py +0 -100
- msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -638
- msprobe/mindspore/free_benchmark/handler/base_handler.py +0 -105
- msprobe/mindspore/free_benchmark/handler/check_handler.py +0 -55
- msprobe/mindspore/free_benchmark/handler/fix_handler.py +0 -51
- msprobe/mindspore/free_benchmark/handler/handler_factory.py +0 -36
- msprobe/mindspore/free_benchmark/perturbation/add_noise.py +0 -82
- msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +0 -45
- msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +0 -78
- msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +0 -77
- msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +0 -56
- msprobe/mindspore/free_benchmark/perturbation/no_change.py +0 -27
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +0 -46
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +0 -51
- msprobe/mindspore/grad_probe/global_context.py +0 -127
- msprobe/mindspore/grad_probe/grad_analyzer.py +0 -260
- msprobe/mindspore/grad_probe/grad_monitor.py +0 -42
- msprobe/mindspore/grad_probe/grad_stat_csv.py +0 -161
- msprobe/mindspore/grad_probe/hook.py +0 -115
- msprobe/mindspore/grad_probe/utils.py +0 -43
- msprobe/mindspore/mindtorch/__init__.py +0 -18
- msprobe/mindspore/ms_config.py +0 -153
- msprobe/mindspore/task_handler_factory.py +0 -44
- msprobe/nan_analyze/__init__.py +0 -14
- msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +0 -9
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +0 -480
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +0 -567
- msprobe/pytorch/debugger/precision_debugger.py +0 -181
- msprobe/pytorch/free_benchmark/__init__.py +0 -23
- msprobe/pytorch/free_benchmark/common/constant.py +0 -85
- msprobe/pytorch/free_benchmark/common/counter.py +0 -87
- msprobe/pytorch/free_benchmark/common/enums.py +0 -80
- msprobe/pytorch/free_benchmark/common/params.py +0 -152
- msprobe/pytorch/free_benchmark/common/utils.py +0 -143
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -215
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +0 -121
- msprobe/pytorch/free_benchmark/main.py +0 -123
- msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +0 -28
- msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +0 -56
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +0 -107
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +0 -121
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +0 -89
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +0 -87
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +0 -43
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +0 -60
- msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +0 -34
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +0 -252
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +0 -54
- msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +0 -40
- msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -45
- msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -181
- msprobe/pytorch/grad_probe/__init__.py +0 -0
- msprobe/pytorch/grad_probe/grad_monitor.py +0 -108
- msprobe/pytorch/grad_probe/grad_stat_csv.py +0 -160
- msprobe/pytorch/hook_module/__init__.py +0 -16
- msprobe/pytorch/hook_module/wrap_aten.py +0 -111
- msprobe/pytorch/online_dispatch/__init__.py +0 -19
- msprobe/pytorch/online_dispatch/compare.py +0 -224
- msprobe/pytorch/online_dispatch/dispatch.py +0 -332
- msprobe/pytorch/online_dispatch/dump_compare.py +0 -179
- msprobe/pytorch/online_dispatch/single_compare.py +0 -412
- msprobe/pytorch/online_dispatch/torch_ops_config.yaml +0 -58
- msprobe/pytorch/online_dispatch/utils.py +0 -158
- msprobe/pytorch/parse_tool/__init__.py +0 -0
- msprobe/pytorch/parse_tool/cli.py +0 -31
- msprobe/pytorch/parse_tool/lib/__init__.py +0 -0
- msprobe/pytorch/parse_tool/lib/compare.py +0 -253
- msprobe/pytorch/parse_tool/lib/config.py +0 -50
- msprobe/pytorch/parse_tool/lib/file_desc.py +0 -45
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +0 -97
- msprobe/pytorch/parse_tool/lib/parse_exception.py +0 -54
- msprobe/pytorch/parse_tool/lib/parse_tool.py +0 -161
- msprobe/pytorch/parse_tool/lib/utils.py +0 -299
- msprobe/pytorch/parse_tool/lib/visualization.py +0 -85
- msprobe/pytorch/pt_config.py +0 -299
- /msprobe/core/{grad_probe → dump}/__init__.py +0 -0
- /msprobe/{mindspore/code_mapping → core/dump/api_dump}/__init__.py +0 -0
- /msprobe/{mindspore/debugger → core/dump/data_dump}/__init__.py +0 -0
- /msprobe/{mindspore/exception_dump → core/dump/data_dump/data_processor}/__init__.py +0 -0
- /msprobe/{mindspore/free_benchmark → core/dump/debugger}/__init__.py +0 -0
- /msprobe/{mindspore/free_benchmark/common → core/dump/kernel_dump}/__init__.py +0 -0
- /msprobe/mindspore/{free_benchmark/handler → dump/debugger}/__init__.py +0 -0
- /msprobe/mindspore/{grad_probe → dump/dump_processor}/__init__.py +0 -0
- /msprobe/mindspore/{overflow_check → dump/exception_dump}/__init__.py +0 -0
- /msprobe/mindspore/{mindtorch → dump/mindtorch}/mindtorch_adaptor.py +0 -0
- /msprobe/{pytorch/api_accuracy_checker/run_ut → mindspore/dump/overflow_check}/__init__.py +0 -0
- /msprobe/{pytorch/debugger → mindspore/monitor}/__init__.py +0 -0
- /msprobe/{pytorch/free_benchmark/common → msaccucmp}/__init__.py +0 -0
- /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/.keep +0 -0
- /msprobe/pytorch/{free_benchmark/perturbed_layers → api_accuracy_checker/acc_check}/__init__.py +0 -0
- /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/torch_ut_setting.json +0 -0
- /msprobe/pytorch/{free_benchmark/perturbed_layers/npu → dump/api_dump}/__init__.py +0 -0
- /msprobe/pytorch/{hook_module → dump/api_dump}/support_wrap_ops.yaml +0 -0
- /msprobe/pytorch/{free_benchmark/result_handlers → dump/debugger}/__init__.py +0 -0
msprobe/msprobe.py
CHANGED
|
@@ -1,163 +1,134 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
import argparse
|
|
17
|
-
import importlib.util
|
|
18
18
|
import sys
|
|
19
19
|
|
|
20
|
-
from msprobe.core.common.const import Const
|
|
21
|
-
from msprobe.core.common.file_utils import root_privilege_warning
|
|
22
20
|
from msprobe.core.common.log import logger
|
|
21
|
+
from msprobe.core.compare.utils import _compare_parser
|
|
23
22
|
from msprobe.core.compare.compare_cli import compare_cli
|
|
24
23
|
from msprobe.core.compare.merge_result.merge_result_cli import _merge_result_parser, merge_result_cli
|
|
25
|
-
from msprobe.core.compare.utils import _compare_parser
|
|
26
24
|
from msprobe.core.config_check.config_check_cli import _config_checking_parser, _run_config_checking_command
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
25
|
+
from msprobe.overflow_check.analyzer import _overflow_check_parser, _run_overflow_check
|
|
26
|
+
from msprobe.core.acc_check.acc_check_cli import acc_check_cli, multi_acc_check_cli
|
|
27
|
+
from msprobe.visualization.graph_service import _graph_service_parser, _graph_service_command
|
|
28
|
+
from msprobe.core.dump.dump2db.dump2db import _data2db_service_parser, _data2db_command
|
|
29
|
+
from msprobe.infer.offline.compare.msquickcmp.main import _offline_dump_parser, offline_dump_cli
|
|
30
|
+
from msprobe.core.install_deps.install_deps import _install_deps_parser, install_deps_cli
|
|
32
31
|
|
|
33
32
|
|
|
34
33
|
def main():
|
|
35
34
|
parser = argparse.ArgumentParser(
|
|
36
35
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
37
36
|
description="msprobe(mindstudio probe), [Powered by MindStudio].\n"
|
|
38
|
-
"
|
|
37
|
+
"A full-process, all-scenario precision tool base on Ascend products.\n"
|
|
39
38
|
f"For any issue, refer README.md first",
|
|
40
39
|
)
|
|
41
40
|
|
|
42
41
|
parser.set_defaults(print_help=parser.print_help)
|
|
43
|
-
parser.add_argument('-f', '--framework', required=True, choices=[Const.PT_FRAMEWORK, Const.MS_FRAMEWORK],
|
|
44
|
-
help='Deep learning framework.')
|
|
45
42
|
subparsers = parser.add_subparsers()
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
run_overflow_check_cmd_parser = subparsers.add_parser('run_overflow_check')
|
|
52
|
-
code_mapping_cmd_parser = subparsers.add_parser('code_mapping')
|
|
53
|
-
graph_service_cmd_parser = subparsers.add_parser('graph')
|
|
54
|
-
op_generate_cmd_parser = subparsers.add_parser('op_generate')
|
|
43
|
+
|
|
44
|
+
compare_parser = subparsers.add_parser('compare')
|
|
45
|
+
_compare_parser(compare_parser)
|
|
46
|
+
acc_check_cmd_parser = subparsers.add_parser('acc_check')
|
|
47
|
+
multi_acc_check_cmd_parser = subparsers.add_parser('multi_acc_check')
|
|
55
48
|
merge_result_parser = subparsers.add_parser('merge_result')
|
|
49
|
+
_merge_result_parser(merge_result_parser)
|
|
50
|
+
overflow_check_parse = subparsers.add_parser('overflow_check')
|
|
51
|
+
_overflow_check_parser(overflow_check_parse)
|
|
56
52
|
config_checking_parser = subparsers.add_parser('config_check')
|
|
57
|
-
nan_analyze_parser = subparsers.add_parser('nan_analyze')
|
|
58
53
|
_config_checking_parser(config_checking_parser)
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
54
|
+
# api_precision_compare 是 PyTorch 相关能力,延迟加载
|
|
55
|
+
api_precision_compare_cmd_parser = subparsers.add_parser('api_precision_compare')
|
|
56
|
+
try:
|
|
57
|
+
from msprobe.pytorch.api_accuracy_checker.compare.api_precision_compare import (
|
|
58
|
+
_api_precision_compare_parser
|
|
59
|
+
)
|
|
60
|
+
_api_precision_compare_parser(api_precision_compare_cmd_parser)
|
|
61
|
+
except ImportError as e:
|
|
62
|
+
# torch 不存在时,parser 仍然存在,但给提示
|
|
63
|
+
def _no_torch_parser(_):
|
|
64
|
+
api_precision_compare_cmd_parser.set_defaults(
|
|
65
|
+
func=lambda *_: (
|
|
66
|
+
logger.error(
|
|
67
|
+
"api_precision_compare requires PyTorch environment. "
|
|
68
|
+
"Please install torch / torch_npu first."
|
|
69
|
+
),
|
|
70
|
+
sys.exit(1)
|
|
71
|
+
)
|
|
72
|
+
)
|
|
73
|
+
_no_torch_parser(api_precision_compare_cmd_parser)
|
|
74
|
+
graph_service_cmd_parser = subparsers.add_parser('graph_visualize')
|
|
75
|
+
_graph_service_parser(graph_service_cmd_parser)
|
|
76
|
+
graph_service_cmd_parser_deprecated = subparsers.add_parser('graph')
|
|
77
|
+
_graph_service_parser(graph_service_cmd_parser_deprecated)
|
|
78
|
+
data2db_parser = subparsers.add_parser('data2db')
|
|
79
|
+
_data2db_service_parser(data2db_parser)
|
|
80
|
+
|
|
81
|
+
offline_dump_parser = subparsers.add_parser('offline_dump')
|
|
82
|
+
_offline_dump_parser(offline_dump_parser)
|
|
83
|
+
|
|
84
|
+
install_deps_cmd_parser = subparsers.add_parser('install_deps')
|
|
85
|
+
_install_deps_parser(install_deps_cmd_parser)
|
|
86
|
+
|
|
87
|
+
if len(sys.argv) >= 2 and sys.argv[1] == "acc_check":
|
|
88
|
+
acc_check_cli(sys.argv[2:])
|
|
89
|
+
return
|
|
90
|
+
elif len(sys.argv) >= 2 and sys.argv[1] == "multi_acc_check":
|
|
91
|
+
multi_acc_check_cli(sys.argv[2:])
|
|
92
|
+
return
|
|
93
|
+
elif len(sys.argv) < 2:
|
|
65
94
|
parser.print_help()
|
|
66
95
|
sys.exit(0)
|
|
67
96
|
|
|
68
|
-
root_privilege_warning()
|
|
69
|
-
framework_args = parser.parse_args(sys.argv[1:3])
|
|
70
|
-
if framework_args.framework == Const.PT_FRAMEWORK:
|
|
71
|
-
from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut import _run_ut_parser, run_ut_command
|
|
72
|
-
from msprobe.pytorch.parse_tool.cli import parse as cli_parse
|
|
73
|
-
from msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut import prepare_config, run_parallel_ut
|
|
74
|
-
from msprobe.pytorch.api_accuracy_checker.compare.api_precision_compare import _api_precision_compare_parser, \
|
|
75
|
-
_api_precision_compare_command
|
|
76
|
-
from msprobe.pytorch.api_accuracy_checker.run_ut.run_overflow_check import _run_overflow_check_parser, \
|
|
77
|
-
_run_overflow_check_command
|
|
78
|
-
from msprobe.visualization.graph_service import _pt_graph_service_parser, _pt_graph_service_command
|
|
79
|
-
from msprobe.pytorch.api_accuracy_checker.generate_op_script.op_generator import _op_generator_parser, \
|
|
80
|
-
_run_operator_generate_commond
|
|
81
|
-
from msprobe.nan_analyze.analyzer import _nan_analyze_parser, _run_nan_analyze
|
|
82
|
-
|
|
83
|
-
_run_ut_parser(run_ut_cmd_parser)
|
|
84
|
-
_run_ut_parser(multi_run_ut_cmd_parser)
|
|
85
|
-
multi_run_ut_cmd_parser.add_argument('-n', '--num_splits', type=int, choices=range(1, 65), default=8,
|
|
86
|
-
help='Number of splits for parallel processing. Range: 1-64')
|
|
87
|
-
_api_precision_compare_parser(api_precision_compare_cmd_parser)
|
|
88
|
-
_run_overflow_check_parser(run_overflow_check_cmd_parser)
|
|
89
|
-
_pt_graph_service_parser(graph_service_cmd_parser)
|
|
90
|
-
_op_generator_parser(op_generate_cmd_parser)
|
|
91
|
-
_nan_analyze_parser(nan_analyze_parser)
|
|
92
|
-
elif framework_args.framework == Const.MS_FRAMEWORK:
|
|
93
|
-
from msprobe.mindspore.api_accuracy_checker.cmd_parser import add_api_accuracy_checker_argument
|
|
94
|
-
from msprobe.visualization.graph_service import _ms_graph_service_parser, _ms_graph_service_command
|
|
95
|
-
add_api_accuracy_checker_argument(run_ut_cmd_parser)
|
|
96
|
-
from msprobe.mindspore.api_accuracy_checker.cmd_parser import multi_add_api_accuracy_checker_argument
|
|
97
|
-
multi_add_api_accuracy_checker_argument(multi_run_ut_cmd_parser)
|
|
98
|
-
from msprobe.mindspore.code_mapping.cmd_parser import add_ir_parser_arguments
|
|
99
|
-
add_ir_parser_arguments(code_mapping_cmd_parser)
|
|
100
|
-
|
|
101
|
-
_ms_graph_service_parser(graph_service_cmd_parser)
|
|
102
|
-
|
|
103
|
-
from msprobe.mindspore.api_accuracy_checker.generate_op_script.op_generator import _op_generator_parser, \
|
|
104
|
-
_run_operator_generate_commond
|
|
105
|
-
_op_generator_parser(op_generate_cmd_parser)
|
|
106
|
-
|
|
107
97
|
args = parser.parse_args(sys.argv[1:])
|
|
108
|
-
if sys.argv[
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
98
|
+
if sys.argv[1] == "compare":
|
|
99
|
+
compare_cli(args, sys.argv[1:])
|
|
100
|
+
elif sys.argv[1] == "merge_result":
|
|
101
|
+
merge_result_cli(args)
|
|
102
|
+
elif sys.argv[1] == "overflow_check":
|
|
103
|
+
_run_overflow_check(args)
|
|
104
|
+
elif sys.argv[1] == "graph_visualize":
|
|
105
|
+
_graph_service_command(args)
|
|
106
|
+
elif sys.argv[1] == "api_precision_compare":
|
|
107
|
+
try:
|
|
108
|
+
from msprobe.pytorch.api_accuracy_checker.compare.api_precision_compare import (
|
|
109
|
+
_api_precision_compare_command
|
|
110
|
+
)
|
|
120
111
|
_api_precision_compare_command(args)
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
raise Exception("MindSpore does not exist, please install MindSpore library")
|
|
142
|
-
if sys.argv[3] == "compare":
|
|
143
|
-
compare_cli(args)
|
|
144
|
-
elif sys.argv[3] == "merge_result":
|
|
145
|
-
merge_result_cli(args)
|
|
146
|
-
elif sys.argv[3] == "run_ut":
|
|
147
|
-
from msprobe.mindspore.api_accuracy_checker.main import api_checker_main
|
|
148
|
-
api_checker_main(args)
|
|
149
|
-
elif sys.argv[3] == "multi_run_ut":
|
|
150
|
-
from msprobe.mindspore.api_accuracy_checker.main import mul_api_checker_main
|
|
151
|
-
mul_api_checker_main(args)
|
|
152
|
-
elif sys.argv[3] == "graph":
|
|
153
|
-
_ms_graph_service_command(args)
|
|
154
|
-
elif sys.argv[3] == 'op_generate':
|
|
155
|
-
_run_operator_generate_commond(args)
|
|
156
|
-
elif sys.argv[3] == "code_mapping":
|
|
157
|
-
from msprobe.mindspore.code_mapping.main import code_mapping_main
|
|
158
|
-
code_mapping_main(args)
|
|
159
|
-
elif sys.argv[3] == "config_check":
|
|
160
|
-
_run_config_checking_command(args)
|
|
112
|
+
except ImportError:
|
|
113
|
+
logger.error(
|
|
114
|
+
"api_precision_compare requires PyTorch environment. "
|
|
115
|
+
"Please install torch / torch_npu first."
|
|
116
|
+
)
|
|
117
|
+
sys.exit(1)
|
|
118
|
+
|
|
119
|
+
elif sys.argv[1] == "graph":
|
|
120
|
+
logger.warning('The "graph" parameter has been deprecated and will be removed in future versions. '
|
|
121
|
+
'Please use the "graph_visualize" parameter instead.')
|
|
122
|
+
_graph_service_command(args)
|
|
123
|
+
elif sys.argv[1] == "config_check":
|
|
124
|
+
_run_config_checking_command(args)
|
|
125
|
+
elif sys.argv[1] == "data2db":
|
|
126
|
+
_data2db_command(args)
|
|
127
|
+
|
|
128
|
+
elif sys.argv[1] == "offline_dump":
|
|
129
|
+
offline_dump_cli(args)
|
|
130
|
+
elif sys.argv[1] == "install_deps":
|
|
131
|
+
install_deps_cli(args)
|
|
161
132
|
|
|
162
133
|
|
|
163
134
|
if __name__ == "__main__":
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
4
|
+
#
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
8
|
+
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
10
|
+
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
@@ -1,37 +1,39 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
import time
|
|
17
18
|
from collections import defaultdict
|
|
18
19
|
import os
|
|
19
20
|
from itertools import dropwhile, chain
|
|
20
21
|
|
|
21
|
-
from msprobe.core.common.file_utils import check_file_or_directory_path, save_json, make_dir
|
|
22
|
+
from msprobe.core.common.file_utils import check_file_or_directory_path, save_json, make_dir, load_json
|
|
22
23
|
from msprobe.core.common.log import logger
|
|
23
24
|
from msprobe.core.common.const import Const
|
|
24
|
-
from msprobe.
|
|
25
|
-
|
|
26
|
-
from msprobe.
|
|
25
|
+
from msprobe.overflow_check.utils import (RankPath, FileCache, is_communication_op, is_ignore_op, OverFlowCheckConst,
|
|
26
|
+
analyze_anomaly_in_group)
|
|
27
|
+
from msprobe.overflow_check.graph import DataNode, CommunicationNode
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
class
|
|
30
|
+
class OverFlowCheck:
|
|
30
31
|
def __init__(self, input_path, output_path):
|
|
31
32
|
self._input_path = input_path
|
|
32
33
|
self._output_path = output_path
|
|
33
34
|
self._paths = {}
|
|
34
35
|
self._resolve_input_path()
|
|
36
|
+
self._check_framework()
|
|
35
37
|
self._anomaly_nodes = [] # 记录所有异常节点
|
|
36
38
|
self._cache = FileCache()
|
|
37
39
|
self._first_comm_nodes = {} # 记录各rank下首个通信节点的node_id
|
|
@@ -51,18 +53,27 @@ class NanAnalyzer:
|
|
|
51
53
|
for path in contents:
|
|
52
54
|
if not path.startswith('rank'):
|
|
53
55
|
continue
|
|
54
|
-
rank_str = path
|
|
56
|
+
rank_str = path[len('rank'):]
|
|
55
57
|
if not rank_str:
|
|
56
58
|
rank = 0
|
|
57
59
|
elif not rank_str.isdigit():
|
|
58
60
|
continue
|
|
59
61
|
else:
|
|
60
62
|
rank = int(rank_str)
|
|
61
|
-
dump_path = os.path.join(self._input_path, path,
|
|
62
|
-
construct_path = os.path.join(self._input_path, path,
|
|
63
|
-
stack_path = os.path.join(self._input_path, path,
|
|
63
|
+
dump_path = os.path.join(self._input_path, path, OverFlowCheckConst.DUMP_FILE)
|
|
64
|
+
construct_path = os.path.join(self._input_path, path, OverFlowCheckConst.CONSTRUCT_FILE)
|
|
65
|
+
stack_path = os.path.join(self._input_path, path, OverFlowCheckConst.STACK_FILE)
|
|
64
66
|
self._paths[rank] = RankPath(rank, dump_path, construct_path, stack_path)
|
|
65
67
|
|
|
68
|
+
def _check_framework(self):
|
|
69
|
+
for _, rankpath in self._paths.items():
|
|
70
|
+
data = load_json(rankpath.dump_path)
|
|
71
|
+
if data.get('framework', '') != "pytorch":
|
|
72
|
+
logger.error('overflow check is only available for PyTorch, though the data is stored in '
|
|
73
|
+
'a different framework.')
|
|
74
|
+
raise RuntimeError(f'invalid dump_path {rankpath.dump_path}')
|
|
75
|
+
break
|
|
76
|
+
|
|
66
77
|
def _pre_analyze(self):
|
|
67
78
|
logger.info('Start searching anomaly node before communication.')
|
|
68
79
|
for path in self._paths.values():
|
|
@@ -148,9 +159,9 @@ class NanAnalyzer:
|
|
|
148
159
|
def _find_connection(self, conn_info, cur_node, searched_ranks, seen_nodes):
|
|
149
160
|
def connect():
|
|
150
161
|
seen_nodes.add(search_node.node_id)
|
|
151
|
-
if search_node.type ==
|
|
162
|
+
if search_node.type == OverFlowCheckConst.DST:
|
|
152
163
|
cur_node.add_dst(search_node)
|
|
153
|
-
elif search_node.type ==
|
|
164
|
+
elif search_node.type == OverFlowCheckConst.SRC:
|
|
154
165
|
search_node.layer = cur_node.layer
|
|
155
166
|
search_node.add_dst(cur_node)
|
|
156
167
|
else:
|
|
@@ -167,7 +178,7 @@ class NanAnalyzer:
|
|
|
167
178
|
if not (search_id.startswith(tar_id_prefix) and search_node.type == conn_info.get('type')):
|
|
168
179
|
continue
|
|
169
180
|
search_conn_ranks = search_node.find_connected_nodes().get('ranks')
|
|
170
|
-
if ((not search_conn_ranks and search_node.api not in
|
|
181
|
+
if ((not search_conn_ranks and search_node.api not in OverFlowCheckConst.DIRECTED_API) or
|
|
171
182
|
cur_node.rank in search_conn_ranks): # 有些无向通信算子没有填ProcessGroup,默认连接所有rank
|
|
172
183
|
connect()
|
|
173
184
|
found = True
|
|
@@ -241,15 +252,15 @@ class NanAnalyzer:
|
|
|
241
252
|
return self._rank_comm_nodes_dict.get(rank, {}).get(node_id)
|
|
242
253
|
|
|
243
254
|
|
|
244
|
-
def
|
|
255
|
+
def _overflow_check_parser(parser):
|
|
245
256
|
parser.add_argument("-i", "--input_path", dest="input_path", default="", type=str,
|
|
246
257
|
help="<Required> The dump file path, over step level. eg: \"xxx/step_0/\".",
|
|
247
258
|
required=True)
|
|
248
259
|
parser.add_argument("-o", "--output_path", dest="output_path", default="./output", type=str,
|
|
249
|
-
help="<optional> The
|
|
260
|
+
help="<optional> The overflow check result output file path.",
|
|
250
261
|
required=False)
|
|
251
262
|
|
|
252
263
|
|
|
253
|
-
def
|
|
264
|
+
def _run_overflow_check(args):
|
|
254
265
|
check_file_or_directory_path(args.input_path, True)
|
|
255
|
-
|
|
266
|
+
OverFlowCheck(args.input_path, args.output_path).analyze()
|
|
@@ -1,22 +1,23 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
from dataclasses import dataclass
|
|
17
18
|
from msprobe.core.common.const import Const
|
|
18
19
|
from msprobe.core.common.log import logger
|
|
19
|
-
from msprobe.
|
|
20
|
+
from msprobe.overflow_check.utils import FileCache, RankPath, is_ignore_op, check_item_anomaly, OverFlowCheckConst
|
|
20
21
|
from msprobe.core.common.exceptions import MsprobeException
|
|
21
22
|
|
|
22
23
|
|
|
@@ -46,6 +47,8 @@ class DataNode:
|
|
|
46
47
|
seen = set(op_name)
|
|
47
48
|
while True:
|
|
48
49
|
op_name = construct_info.get(op_name)
|
|
50
|
+
if isinstance(op_name, list):
|
|
51
|
+
op_name = op_name[0]
|
|
49
52
|
if not op_name or op_name in seen:
|
|
50
53
|
return construct
|
|
51
54
|
construct.insert(0, op_name)
|
|
@@ -140,7 +143,7 @@ class CommunicationNode:
|
|
|
140
143
|
|
|
141
144
|
def has_nan_inf(self):
|
|
142
145
|
return self.input_has_nan_inf() or check_item_anomaly(self.data.outputs)
|
|
143
|
-
|
|
146
|
+
|
|
144
147
|
def input_has_nan_inf(self):
|
|
145
148
|
return check_item_anomaly(self.data.input_args) or check_item_anomaly(self.data.input_kwargs)
|
|
146
149
|
|
|
@@ -148,15 +151,15 @@ class CommunicationNode:
|
|
|
148
151
|
"""
|
|
149
152
|
根据 api/类型/入参/调用次数 确定相连接的node的op_name
|
|
150
153
|
"""
|
|
151
|
-
tar_api =
|
|
154
|
+
tar_api = OverFlowCheckConst.P2P_API_MAPPING.get(self.api, self.api)
|
|
152
155
|
ranks = set()
|
|
153
|
-
for dst in [
|
|
156
|
+
for dst in [OverFlowCheckConst.DST, OverFlowCheckConst.DST_GROUP]:
|
|
154
157
|
if dst in self.data.input_kwargs:
|
|
155
158
|
dst_value = self.data.input_kwargs.get(dst)
|
|
156
159
|
if dst_value:
|
|
157
160
|
ranks.add(dst_value.get('value'))
|
|
158
161
|
break
|
|
159
|
-
for src in [
|
|
162
|
+
for src in [OverFlowCheckConst.SRC, OverFlowCheckConst.SRC_GROUP]:
|
|
160
163
|
if src in self.data.input_kwargs:
|
|
161
164
|
src_value = self.data.input_kwargs.get(src)
|
|
162
165
|
if src_value:
|
|
@@ -170,24 +173,24 @@ class CommunicationNode:
|
|
|
170
173
|
if group:
|
|
171
174
|
ranks.update(group.get('group_ranks'))
|
|
172
175
|
return {'ranks': ranks, 'api': f'Distributed.{tar_api}',
|
|
173
|
-
'type':
|
|
176
|
+
'type': OverFlowCheckConst.OPPOSITE_DIR.get(self.type, OverFlowCheckConst.LINK)}
|
|
174
177
|
|
|
175
178
|
def _resolve_type(self):
|
|
176
|
-
for src in [
|
|
179
|
+
for src in [OverFlowCheckConst.SRC, OverFlowCheckConst.SRC_GROUP]:
|
|
177
180
|
if src in self.data.input_kwargs and self.data.input_kwargs[src]:
|
|
178
181
|
if self.data.input_kwargs[src].get('value') == self.rank:
|
|
179
|
-
return
|
|
182
|
+
return OverFlowCheckConst.SRC
|
|
180
183
|
else:
|
|
181
|
-
return
|
|
182
|
-
for dst in [
|
|
184
|
+
return OverFlowCheckConst.DST
|
|
185
|
+
for dst in [OverFlowCheckConst.DST, OverFlowCheckConst.DST_GROUP]:
|
|
183
186
|
if dst in self.data.input_kwargs and self.data.input_kwargs[dst]:
|
|
184
187
|
if self.data.input_kwargs[dst].get('value') == self.rank:
|
|
185
|
-
return
|
|
188
|
+
return OverFlowCheckConst.DST
|
|
186
189
|
else:
|
|
187
|
-
return
|
|
188
|
-
if self.api in
|
|
190
|
+
return OverFlowCheckConst.SRC
|
|
191
|
+
if self.api in OverFlowCheckConst.DIRECTED_API:
|
|
189
192
|
for item in self.data.input_args:
|
|
190
193
|
if item.get(Const.TYPE) == 'int':
|
|
191
|
-
node_type =
|
|
192
|
-
return node_type if item.get('value') == self.rank else
|
|
193
|
-
return
|
|
194
|
+
node_type = OverFlowCheckConst.DIRECTED_API[self.api]
|
|
195
|
+
return node_type if item.get('value') == self.rank else OverFlowCheckConst.OPPOSITE_DIR[node_type]
|
|
196
|
+
return OverFlowCheckConst.LINK
|
|
@@ -1,17 +1,18 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
from collections import OrderedDict
|
|
17
18
|
from dataclasses import dataclass
|
|
@@ -120,7 +121,7 @@ def is_communication_op(op_name):
|
|
|
120
121
|
# 定义通信算子的关键字,覆盖各种通信操作,如all_reduce, send, broadcast等
|
|
121
122
|
# 从wrap文件中读取,先硬编码在文件中
|
|
122
123
|
return (op_name.startswith('Distributed.') and
|
|
123
|
-
any(keyword in op_name for keyword in
|
|
124
|
+
any(keyword in op_name for keyword in OverFlowCheckConst.COMMUNICATION_KEYWORDS))
|
|
124
125
|
|
|
125
126
|
|
|
126
127
|
def is_ignore_op(op_name):
|
|
@@ -163,7 +164,7 @@ def analyze_anomaly_in_group(nodes_group):
|
|
|
163
164
|
anomaly_nodes.append(node.data)
|
|
164
165
|
|
|
165
166
|
# 先看src或link中input是否有异常
|
|
166
|
-
src_list = list(filter(lambda node: node.type in [
|
|
167
|
+
src_list = list(filter(lambda node: node.type in [OverFlowCheckConst.SRC, OverFlowCheckConst.LINK], nodes_group))
|
|
167
168
|
input_anomaly_nodes = list(filter(lambda node: node.input_has_nan_inf(), src_list))
|
|
168
169
|
# 如果有异常回溯计算节点找到异常来源
|
|
169
170
|
# 使用cpu模拟节点进行计算,查看结果是否有问题。需要对所有计算节点录入/映射,暂不实现。
|
|
@@ -174,7 +175,7 @@ def analyze_anomaly_in_group(nodes_group):
|
|
|
174
175
|
return anomaly_nodes
|
|
175
176
|
|
|
176
177
|
|
|
177
|
-
class
|
|
178
|
+
class OverFlowCheckConst:
|
|
178
179
|
COMMUNICATION_KEYWORDS = {
|
|
179
180
|
'send', # send 算子
|
|
180
181
|
'recv', # recv 算子
|
msprobe/pytorch/__init__.py
CHANGED
|
@@ -1,24 +1,30 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
import torch
|
|
17
|
-
|
|
18
|
-
from .
|
|
18
|
+
|
|
19
|
+
from msprobe.pytorch.dump.debugger.precision_debugger import PrecisionDebugger, module_dump, module_dump_end
|
|
19
20
|
from .common.utils import seed_all
|
|
20
|
-
from .
|
|
21
|
+
from .torchair_dump import set_fx_dump_config, set_ge_dump_config
|
|
21
22
|
|
|
22
23
|
torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
|
|
23
24
|
if torch_version_above_or_equal_2:
|
|
24
25
|
from msprobe.pytorch.monitor.module_hook import TrainerMon
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
from msprobe.pytorch.aclgraph_dump import acl_save # noqa: F401
|
|
29
|
+
except Exception:
|
|
30
|
+
acl_save = None
|