mindstudio-probe 8.3.2__py3-none-any.whl → 26.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/METADATA +26 -14
- mindstudio_probe-26.0.0a1.dist-info/RECORD +498 -0
- {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/WHEEL +1 -1
- mindstudio_probe-26.0.0a1.dist-info/entry_points.txt +5 -0
- mindstudio_probe-26.0.0a1.dist-info/licenses/LICENSE +124 -0
- mindstudio_probe-26.0.0a1.dist-info/top_level.txt +2 -0
- msprobe/__init__.py +12 -13
- msprobe/config.json +9 -31
- msprobe/core/__init__.py +12 -11
- msprobe/core/acc_check/acc_check_cli.py +145 -0
- msprobe/core/common/const.py +97 -38
- msprobe/core/common/db_manager.py +133 -12
- msprobe/core/common/decorator.py +12 -11
- msprobe/core/common/exceptions.py +12 -11
- msprobe/core/common/file_utils.py +101 -25
- msprobe/core/common/framework_adapter.py +36 -25
- msprobe/core/common/global_lock.py +12 -11
- msprobe/core/common/inplace_op_checker.py +12 -11
- msprobe/core/common/log.py +22 -11
- msprobe/core/common/megatron_utils.py +566 -11
- msprobe/core/common/parallel_state.py +12 -11
- msprobe/core/common/runtime.py +12 -11
- msprobe/core/common/utils.py +41 -41
- msprobe/core/compare/acc_compare.py +361 -104
- msprobe/core/compare/atb_data_compare.py +422 -0
- msprobe/core/compare/auto_compare.py +134 -0
- msprobe/core/compare/check.py +14 -17
- msprobe/core/compare/compare_cli.py +72 -149
- msprobe/core/compare/config.py +12 -13
- msprobe/core/compare/diff_analyze/first_diff_analyze.py +28 -15
- msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
- msprobe/core/compare/find_first/analyzer.py +18 -18
- msprobe/core/compare/find_first/graph.py +12 -11
- msprobe/core/compare/find_first/utils.py +13 -12
- msprobe/core/compare/indicator_analysis/__init__.py +15 -0
- msprobe/core/compare/indicator_analysis/algorithm.py +363 -0
- msprobe/core/compare/indicator_analysis/api_data.py +141 -0
- msprobe/core/compare/indicator_analysis/calculator.py +181 -0
- msprobe/core/compare/indicator_analysis/utils.py +116 -0
- msprobe/core/compare/layer_mapping/__init__.py +12 -11
- msprobe/core/compare/layer_mapping/data_scope_parser.py +20 -11
- msprobe/core/compare/layer_mapping/layer_mapping.py +14 -13
- msprobe/core/compare/layer_mapping/postprocess_pass.py +13 -11
- msprobe/core/compare/merge_result/merge_result.py +12 -11
- msprobe/core/compare/merge_result/merge_result_cli.py +12 -11
- msprobe/core/compare/merge_result/utils.py +12 -11
- msprobe/core/compare/multiprocessing_compute.py +13 -14
- msprobe/core/compare/npy_compare.py +13 -11
- msprobe/core/compare/offline_data_compare.py +160 -0
- msprobe/core/compare/stats_diff_calc.py +39 -0
- msprobe/core/compare/torchair_acc_cmp.py +764 -0
- msprobe/core/compare/torchair_cmp_utils.py +338 -0
- msprobe/core/compare/utils.py +140 -49
- msprobe/core/config_check/__init__.py +12 -11
- msprobe/core/config_check/checkers/__init__.py +12 -11
- msprobe/core/config_check/checkers/base_checker.py +15 -14
- msprobe/core/config_check/checkers/dataset_checker.py +13 -12
- msprobe/core/config_check/checkers/env_args_checker.py +13 -12
- msprobe/core/config_check/checkers/hyperparameter_checker.py +16 -15
- msprobe/core/config_check/checkers/pip_checker.py +15 -15
- msprobe/core/config_check/checkers/random_checker.py +13 -12
- msprobe/core/config_check/checkers/weights_checker.py +14 -12
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +13 -17
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +13 -12
- msprobe/core/config_check/ckpt_compare/metrics.py +12 -11
- msprobe/core/config_check/config_check_cli.py +18 -17
- msprobe/core/config_check/config_checker.py +16 -14
- msprobe/core/config_check/resource/dependency.yaml +15 -12
- msprobe/core/config_check/resource/env.yaml +12 -11
- msprobe/core/config_check/utils/hyperparameter_parser.py +12 -11
- msprobe/core/config_check/utils/utils.py +12 -11
- msprobe/core/{data_dump → dump/api_dump}/api_registry.py +12 -11
- msprobe/core/{common_config.py → dump/common_config.py} +13 -24
- msprobe/core/dump/data_dump/data_collector.py +257 -0
- msprobe/core/{data_dump → dump/data_dump}/data_processor/base.py +45 -36
- msprobe/core/{data_dump → dump/data_dump}/data_processor/factory.py +33 -25
- msprobe/core/{data_dump → dump/data_dump}/data_processor/mindspore_processor.py +37 -113
- msprobe/core/{data_dump → dump/data_dump}/data_processor/pytorch_processor.py +364 -131
- msprobe/core/{data_dump → dump/data_dump}/json_writer.py +24 -31
- msprobe/core/{data_dump → dump/data_dump}/scope.py +12 -13
- msprobe/core/{debugger → dump/debugger}/precision_debugger.py +15 -23
- msprobe/core/dump/dump2db/db_utils.py +215 -0
- msprobe/core/dump/dump2db/dump2db.py +409 -0
- msprobe/core/{hook_manager.py → dump/hook_manager.py} +38 -87
- msprobe/core/dump/kernel_dump/kernel_config.py +34 -0
- msprobe/core/{service.py → dump/service.py} +43 -27
- msprobe/core/install_deps/install_deps.py +51 -0
- msprobe/core/monitor/anomaly_processor.py +13 -11
- msprobe/core/monitor/csv2db.py +73 -93
- msprobe/core/monitor/db_utils.py +140 -205
- msprobe/core/monitor/utils.py +18 -17
- msprobe/core/monitor_v2/__init__.py +20 -0
- msprobe/core/monitor_v2/base.py +83 -0
- msprobe/core/monitor_v2/cc.py +287 -0
- msprobe/core/monitor_v2/factory.py +81 -0
- msprobe/core/monitor_v2/module.py +201 -0
- msprobe/core/monitor_v2/optimizer.py +245 -0
- msprobe/core/monitor_v2/param.py +154 -0
- msprobe/core/monitor_v2/trainer.py +326 -0
- msprobe/core/monitor_v2/utils.py +122 -0
- msprobe/core/monitor_v2/weight_grad.py +419 -0
- msprobe/core/monitor_v2/writer.py +162 -0
- msprobe/core/overflow_check/abnormal_scene.py +12 -11
- msprobe/core/overflow_check/api_info.py +12 -11
- msprobe/core/overflow_check/checker.py +12 -11
- msprobe/core/overflow_check/filter.py +13 -11
- msprobe/core/overflow_check/level.py +12 -11
- msprobe/core/overflow_check/utils.py +12 -11
- msprobe/core/single_save/single_comparator.py +12 -11
- msprobe/core/single_save/single_saver.py +12 -11
- msprobe/infer/__init__.py +16 -0
- msprobe/infer/offline/__init__.py +16 -0
- msprobe/infer/offline/compare/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/adapter_cli/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/adapter_cli/args_adapter.py +46 -0
- msprobe/infer/offline/compare/msquickcmp/atc/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/atc/atc_utils.py +98 -0
- msprobe/infer/offline/compare/msquickcmp/cmp_process.py +328 -0
- msprobe/infer/offline/compare/msquickcmp/common/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/common/args_check.py +112 -0
- msprobe/infer/offline/compare/msquickcmp/common/convert.py +74 -0
- msprobe/infer/offline/compare/msquickcmp/common/dump_data.py +121 -0
- msprobe/infer/offline/compare/msquickcmp/common/dynamic_argument_bean.py +39 -0
- msprobe/infer/offline/compare/msquickcmp/common/utils.py +669 -0
- msprobe/infer/offline/compare/msquickcmp/config.ini +6 -0
- msprobe/infer/offline/compare/msquickcmp/dump/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/dump/args_adapter.py +50 -0
- msprobe/infer/offline/compare/msquickcmp/dump/dump_process.py +91 -0
- msprobe/infer/offline/compare/msquickcmp/install_aclruntime_aisbench.sh +180 -0
- msprobe/infer/offline/compare/msquickcmp/main.py +199 -0
- msprobe/infer/offline/compare/msquickcmp/net_compare/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/net_compare/net_compare.py +277 -0
- msprobe/infer/offline/compare/msquickcmp/npu/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/npu/npu_dump_data.py +558 -0
- msprobe/infer/offline/compare/msquickcmp/npu/om_parser.py +416 -0
- msprobe/infer/offline/compare/msquickcmp/onnx_model/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/onnx_model/onnx_dump_data.py +374 -0
- msprobe/infer/utils/__init__.py +15 -0
- msprobe/infer/utils/acc_cmp.py +94 -0
- msprobe/infer/utils/check/__init__.py +37 -0
- msprobe/infer/utils/check/args_checker.py +35 -0
- msprobe/infer/utils/check/checker.py +227 -0
- msprobe/infer/utils/check/dict_checker.py +78 -0
- msprobe/infer/utils/check/func_wrapper.py +96 -0
- msprobe/infer/utils/check/list_checker.py +56 -0
- msprobe/infer/utils/check/number_checker.py +64 -0
- msprobe/infer/utils/check/obj_checker.py +41 -0
- msprobe/infer/utils/check/path_checker.py +249 -0
- msprobe/infer/utils/check/rule.py +126 -0
- msprobe/infer/utils/check/string_checker.py +66 -0
- msprobe/infer/utils/cmp_algorithm.py +261 -0
- msprobe/infer/utils/constants.py +112 -0
- msprobe/infer/utils/file_open_check.py +337 -0
- msprobe/infer/utils/util.py +177 -0
- msprobe/mindspore/__init__.py +14 -13
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +14 -13
- msprobe/mindspore/api_accuracy_checker/api_info.py +12 -11
- msprobe/mindspore/api_accuracy_checker/api_runner.py +12 -11
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +12 -11
- msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +12 -11
- msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +12 -11
- msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +12 -11
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +15 -14
- msprobe/mindspore/api_accuracy_checker/compute_element.py +12 -11
- msprobe/mindspore/api_accuracy_checker/data_manager.py +13 -11
- msprobe/mindspore/api_accuracy_checker/main.py +12 -11
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +14 -12
- msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +13 -11
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +12 -11
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +12 -11
- msprobe/mindspore/api_accuracy_checker/utils.py +12 -11
- msprobe/mindspore/common/const.py +15 -74
- msprobe/mindspore/common/log.py +12 -11
- msprobe/mindspore/common/utils.py +30 -15
- msprobe/mindspore/compare/common_dir_compare.py +21 -23
- msprobe/mindspore/compare/distributed_compare.py +18 -16
- msprobe/mindspore/compare/ms_compare.py +14 -14
- msprobe/mindspore/compare/ms_graph_compare.py +26 -20
- msprobe/mindspore/compare/utils.py +14 -12
- msprobe/mindspore/{cell_processor.py → dump/cell_processor.py} +15 -14
- msprobe/mindspore/{debugger → dump/debugger}/debugger_config.py +12 -30
- msprobe/mindspore/{debugger → dump/debugger}/precision_debugger.py +43 -45
- msprobe/mindspore/dump/{cell_dump_process.py → dump_processor/cell_dump_process.py} +31 -17
- msprobe/mindspore/dump/{cell_dump_with_insert_gradient.py → dump_processor/cell_dump_with_insert_gradient.py} +18 -14
- msprobe/mindspore/dump/{dump_tool_factory.py → dump_processor/dump_tool_factory.py} +16 -15
- msprobe/mindspore/dump/{graph_mode_cell_dump.py → dump_processor/graph_mode_cell_dump.py} +16 -15
- msprobe/mindspore/dump/{graph_tensor_dump.py → dump_processor/graph_tensor_dump.py} +134 -133
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/api_register.py +15 -14
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/hook_cell.py +12 -11
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/ms_hook_manager.py +47 -20
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/primitive_hooks.py +14 -13
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/support_wrap_ops.yaml +13 -11
- msprobe/mindspore/dump/{jit_dump.py → dump_processor/jit_dump.py} +14 -13
- msprobe/mindspore/dump/{kernel_graph_dump.py → dump_processor/kernel_graph_dump.py} +13 -12
- msprobe/mindspore/dump/{kernel_kbyk_dump.py → dump_processor/kernel_kbyk_dump.py} +13 -12
- msprobe/mindspore/{exception_dump → dump/exception_dump}/exception_dump_tool_factory.py +14 -13
- msprobe/mindspore/{exception_dump → dump/exception_dump}/kernel_graph_exception_dump.py +13 -12
- msprobe/mindspore/{mindspore_service.py → dump/mindspore_service.py} +18 -17
- msprobe/mindspore/dump/mindtorch/__init__.py +19 -0
- msprobe/mindspore/dump/ms_config.py +105 -0
- msprobe/mindspore/{overflow_check → dump/overflow_check}/kernel_graph_overflow_check.py +13 -12
- msprobe/mindspore/{overflow_check → dump/overflow_check}/overflow_check_tool_factory.py +14 -13
- msprobe/mindspore/dump/task_handler_factory.py +43 -0
- msprobe/mindspore/monitor/common_func.py +12 -11
- msprobe/mindspore/monitor/data_writers.py +12 -11
- msprobe/mindspore/monitor/distributed/wrap_distributed.py +93 -39
- msprobe/mindspore/monitor/features.py +12 -11
- msprobe/mindspore/monitor/module_hook.py +19 -22
- msprobe/mindspore/monitor/optimizer_collect.py +29 -25
- msprobe/mindspore/monitor/utils.py +13 -11
- msprobe/msaccucmp/advisor/__init__.py +16 -0
- msprobe/msaccucmp/advisor/advisor_const.py +65 -0
- msprobe/msaccucmp/advisor/advisor_result.py +73 -0
- msprobe/msaccucmp/advisor/compare_advisor.py +99 -0
- msprobe/msaccucmp/advisor/input_advisor.py +66 -0
- msprobe/msaccucmp/advisor/node_advisor.py +68 -0
- msprobe/msaccucmp/advisor/overflow_advisor.py +58 -0
- msprobe/msaccucmp/algorithm_manager/__init__.py +16 -0
- msprobe/msaccucmp/algorithm_manager/algorithm_manager.py +464 -0
- msprobe/msaccucmp/algorithm_manager/algorithm_parameter.py +42 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_AccumulatedRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_CosineSimilarity.py +58 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_KullbackLeiblerDivergence.py +84 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxAbsoluteError.py +41 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanAbsoluteError.py +41 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RelativeEuclideanDistance.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RootMeanSquareError.py +40 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_StandardDeviation.py +47 -0
- msprobe/msaccucmp/cmp_utils/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/common.py +113 -0
- msprobe/msaccucmp/cmp_utils/constant/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/constant/compare_error.py +81 -0
- msprobe/msaccucmp/cmp_utils/constant/const_manager.py +530 -0
- msprobe/msaccucmp/cmp_utils/file_utils.py +497 -0
- msprobe/msaccucmp/cmp_utils/log.py +257 -0
- msprobe/msaccucmp/cmp_utils/multi_process/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/multi_process/multi_convert_process.py +140 -0
- msprobe/msaccucmp/cmp_utils/multi_process/progress.py +78 -0
- msprobe/msaccucmp/cmp_utils/path_check.py +274 -0
- msprobe/msaccucmp/cmp_utils/reg_manager.py +98 -0
- msprobe/msaccucmp/cmp_utils/tlv_parse.py +279 -0
- msprobe/msaccucmp/cmp_utils/utils.py +356 -0
- msprobe/msaccucmp/cmp_utils/utils_type.py +63 -0
- msprobe/msaccucmp/compare_vector.py +48 -0
- msprobe/msaccucmp/conversion/__init__.py +16 -0
- msprobe/msaccucmp/conversion/data_conversion.py +277 -0
- msprobe/msaccucmp/conversion/dtype_conversion.py +99 -0
- msprobe/msaccucmp/conversion/shape_format_conversion.py +477 -0
- msprobe/msaccucmp/conversion/tensor_conversion.py +369 -0
- msprobe/msaccucmp/dump_data_conversion.py +46 -0
- msprobe/msaccucmp/dump_parse/__init__.py +16 -0
- msprobe/msaccucmp/dump_parse/big_dump_data.py +317 -0
- msprobe/msaccucmp/dump_parse/dump.py +423 -0
- msprobe/msaccucmp/dump_parse/dump_data_object.py +322 -0
- msprobe/msaccucmp/dump_parse/dump_data_parser.py +436 -0
- msprobe/msaccucmp/dump_parse/dump_utils.py +246 -0
- msprobe/msaccucmp/dump_parse/ffts_parser.py +137 -0
- msprobe/msaccucmp/dump_parse/mapping.py +62 -0
- msprobe/msaccucmp/dump_parse/nano_dump_data.py +392 -0
- msprobe/msaccucmp/dump_parse/proto_dump_data.py +308 -0
- msprobe/msaccucmp/dump_parser.py +90 -0
- msprobe/msaccucmp/format_manager/__init__.py +16 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NCHW.py +53 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_ND.py +52 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NHWC.py +53 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_HWCN.py +47 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_NCHW.py +47 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_FRACTAL_Z.py +89 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NCHW.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NHWC.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_HWCN.py +43 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NCHW.py +48 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NHWC.py +43 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_FRACTAL_Z.py +87 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_NHWC.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_NCDHW.py +48 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_ND.py +44 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_FRACTAL_Z.py +87 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_HWCN.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_NCHW.py +37 -0
- msprobe/msaccucmp/format_manager/format_manager.py +307 -0
- msprobe/msaccucmp/inplace_layer_process.py +186 -0
- msprobe/msaccucmp/msaccucmp.py +532 -0
- msprobe/msaccucmp/mscmp_advisor.py +128 -0
- msprobe/msaccucmp/overflow/__init__.py +16 -0
- msprobe/msaccucmp/overflow/overflow_analyse.py +305 -0
- msprobe/msaccucmp/overflow/overflow_detection.py +143 -0
- msprobe/msaccucmp/pytorch_cmp/__init__.py +16 -0
- msprobe/msaccucmp/pytorch_cmp/compare_pytorch.py +389 -0
- msprobe/msaccucmp/pytorch_cmp/hdf5_parser.py +377 -0
- msprobe/msaccucmp/pytorch_cmp/pytorch_dump_data.py +461 -0
- msprobe/msaccucmp/shape_conversion.py +41 -0
- msprobe/msaccucmp/vector_cmp/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/batch_compare.py +197 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/compare_detail.py +245 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/detail.py +182 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/detail_writer.py +580 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_fusion_op.py +588 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_npu_vs_npu.py +339 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_result.py +326 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_rule.py +156 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_op.py +204 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_rule_parser.py +635 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/quant_filter.py +187 -0
- msprobe/msaccucmp/vector_cmp/range_manager/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/range_manager/range_manager.py +100 -0
- msprobe/msaccucmp/vector_cmp/range_manager/range_mode.py +94 -0
- msprobe/msaccucmp/vector_cmp/range_manager/select_mode.py +86 -0
- msprobe/msaccucmp/vector_cmp/vector_comparison.py +535 -0
- msprobe/msprobe.py +101 -130
- msprobe/overflow_check/__init__.py +15 -0
- msprobe/{nan_analyze → overflow_check}/analyzer.py +38 -27
- msprobe/{nan_analyze → overflow_check}/graph.py +30 -27
- msprobe/{nan_analyze → overflow_check}/utils.py +15 -14
- msprobe/pytorch/__init__.py +20 -14
- msprobe/pytorch/aclgraph_dump/__init__.py +45 -0
- msprobe/pytorch/aclgraph_dump/_meta.py +26 -0
- msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut.py → acc_check/acc_check.py} +50 -45
- msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut_utils.py → acc_check/acc_check_utils.py} +201 -30
- msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/data_generate.py +56 -16
- msprobe/pytorch/api_accuracy_checker/{run_ut/multi_run_ut.py → acc_check/multi_acc_check.py} +32 -47
- msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/run_overflow_check.py +19 -18
- msprobe/pytorch/api_accuracy_checker/common/config.py +22 -20
- msprobe/pytorch/api_accuracy_checker/common/utils.py +72 -13
- msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -11
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +23 -14
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +45 -32
- msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +12 -11
- msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +14 -12
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +14 -12
- msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +21 -19
- msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +14 -13
- msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +60 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +27 -16
- msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +13 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +39 -18
- msprobe/pytorch/bench_functions/__init__.py +12 -11
- msprobe/pytorch/bench_functions/apply_adam.py +12 -11
- msprobe/pytorch/bench_functions/apply_adam_w.py +12 -11
- msprobe/pytorch/bench_functions/confusion_transpose.py +12 -11
- msprobe/pytorch/bench_functions/fast_gelu.py +12 -11
- msprobe/pytorch/bench_functions/group_norm_silu.py +12 -11
- msprobe/pytorch/bench_functions/layer_norm_eval.py +12 -11
- msprobe/pytorch/bench_functions/linear.py +12 -11
- msprobe/pytorch/bench_functions/matmul_backward.py +12 -11
- msprobe/pytorch/bench_functions/mish.py +12 -11
- msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +12 -11
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +12 -11
- msprobe/pytorch/bench_functions/rms_norm.py +12 -11
- msprobe/pytorch/bench_functions/rotary_mul.py +12 -11
- msprobe/pytorch/bench_functions/scaled_mask_softmax.py +12 -11
- msprobe/pytorch/bench_functions/sort_v2.py +12 -11
- msprobe/pytorch/bench_functions/swiglu.py +12 -11
- msprobe/pytorch/common/__init__.py +12 -11
- msprobe/pytorch/common/log.py +12 -11
- msprobe/pytorch/common/parse_json.py +12 -11
- msprobe/pytorch/common/utils.py +52 -19
- msprobe/pytorch/compare/distributed_compare.py +13 -13
- msprobe/pytorch/compare/match.py +12 -11
- msprobe/pytorch/compare/pt_compare.py +14 -20
- msprobe/pytorch/compare/pt_diff_analyze.py +12 -11
- msprobe/pytorch/compare/utils.py +12 -11
- msprobe/pytorch/{hook_module → dump/api_dump}/api_register.py +18 -16
- msprobe/pytorch/{hook_module → dump/api_dump}/hook_module.py +14 -13
- msprobe/pytorch/{hook_module → dump/api_dump}/pt_hook_manager.py +68 -23
- msprobe/pytorch/{hook_module → dump/api_dump}/register_optimizer_hook.py +13 -11
- msprobe/pytorch/{hook_module → dump/api_dump}/script_wrapper.py +17 -14
- msprobe/pytorch/{hook_module → dump/api_dump}/utils.py +12 -11
- msprobe/pytorch/{debugger → dump/debugger}/debugger_config.py +23 -38
- msprobe/pytorch/dump/debugger/precision_debugger.py +130 -0
- msprobe/pytorch/{function_factory.py → dump/function_factory.py} +12 -11
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +17 -13
- msprobe/pytorch/dump/module_dump/module_dump.py +16 -15
- msprobe/pytorch/dump/module_dump/{module_processer.py → module_processor.py} +54 -42
- msprobe/pytorch/dump/pt_config.py +128 -0
- msprobe/pytorch/{pytorch_service.py → dump/pytorch_service.py} +22 -21
- msprobe/pytorch/monitor/csv2tb.py +13 -11
- msprobe/pytorch/monitor/data_writers.py +13 -11
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +13 -11
- msprobe/pytorch/monitor/features.py +12 -11
- msprobe/pytorch/monitor/module_hook.py +67 -59
- msprobe/pytorch/monitor/module_metric.py +13 -11
- msprobe/pytorch/monitor/optimizer_collect.py +37 -35
- msprobe/pytorch/monitor/utils.py +13 -11
- msprobe/pytorch/monitor/visualizer.py +12 -11
- msprobe/pytorch/torchair_dump/__init__.py +17 -0
- msprobe/pytorch/torchair_dump/torchair_dump.py +114 -0
- msprobe/scripts/atb/config_example.json +10 -0
- msprobe/scripts/atb/load_atb_probe.sh +101 -0
- msprobe/scripts/atb/unload_atb_probe.sh +27 -0
- msprobe/scripts/build_msaccucmp.sh +186 -0
- msprobe/scripts/conf/help.info +6 -0
- msprobe/scripts/conf/version.info +3 -0
- msprobe/scripts/run_script/common.sh +538 -0
- msprobe/scripts/run_script/main_msaccucmp.sh +232 -0
- msprobe/visualization/__init__.py +12 -11
- msprobe/visualization/builder/__init__.py +12 -11
- msprobe/visualization/builder/graph_builder.py +45 -30
- msprobe/visualization/builder/graph_merger.py +53 -32
- msprobe/visualization/builder/msprobe_adapter.py +34 -44
- msprobe/visualization/compare/__init__.py +12 -11
- msprobe/visualization/compare/graph_comparator.py +63 -51
- msprobe/visualization/compare/mode_adapter.py +28 -113
- msprobe/visualization/db_utils.py +133 -22
- msprobe/visualization/graph/__init__.py +12 -11
- msprobe/visualization/graph/base_node.py +15 -27
- msprobe/visualization/graph/distributed_analyzer.py +97 -40
- msprobe/visualization/graph/graph.py +14 -16
- msprobe/visualization/graph/node_colors.py +34 -31
- msprobe/visualization/graph/node_op.py +12 -11
- msprobe/visualization/graph_service.py +580 -205
- msprobe/visualization/utils.py +278 -31
- tb_graph_ascend/secure_build.py +175 -0
- tb_graph_ascend/server/__init__.py +15 -0
- tb_graph_ascend/server/app/__init__.py +15 -0
- tb_graph_ascend/server/app/model/__init__.py +15 -0
- tb_graph_ascend/server/app/model/hierarchy.py +348 -0
- tb_graph_ascend/server/app/model/layout_hierarchy_model.py +69 -0
- tb_graph_ascend/server/app/model/match_nodes_model.py +573 -0
- tb_graph_ascend/server/app/repositories/__init__.py +15 -0
- tb_graph_ascend/server/app/repositories/graph_repo_base.py +32 -0
- tb_graph_ascend/server/app/repositories/graph_repo_db.py +879 -0
- tb_graph_ascend/server/app/repositories/graph_repo_vis.py +83 -0
- tb_graph_ascend/server/app/service/__init__.py +18 -0
- tb_graph_ascend/server/app/service/graph_service_base.py +158 -0
- tb_graph_ascend/server/app/service/graph_service_db.py +438 -0
- tb_graph_ascend/server/app/service/graph_service_factory.py +54 -0
- tb_graph_ascend/server/app/service/graph_service_vis.py +480 -0
- tb_graph_ascend/server/app/utils/__init__.py +15 -0
- tb_graph_ascend/server/app/utils/constant.py +80 -0
- tb_graph_ascend/server/app/utils/file_check_wrapper.py +46 -0
- tb_graph_ascend/server/app/utils/global_state.py +95 -0
- tb_graph_ascend/server/app/utils/graph_utils.py +661 -0
- tb_graph_ascend/server/app/utils/i18n.py +153 -0
- tb_graph_ascend/server/app/utils/request_method.py +46 -0
- tb_graph_ascend/server/app/views/__init__.py +15 -0
- tb_graph_ascend/server/app/views/graph_views.py +304 -0
- tb_graph_ascend/server/plugin.py +108 -0
- tb_graph_ascend/server/static/index.html +9250 -0
- tb_graph_ascend/server/static/index.js +21 -0
- tb_graph_ascend/setup.py +57 -0
- mindstudio_probe-8.3.2.dist-info/LICENSE +0 -201
- mindstudio_probe-8.3.2.dist-info/RECORD +0 -491
- mindstudio_probe-8.3.2.dist-info/entry_points.txt +0 -2
- mindstudio_probe-8.3.2.dist-info/top_level.txt +0 -1
- msprobe/CMakeLists.txt +0 -5
- msprobe/README.md +0 -203
- msprobe/core/advisor/advisor.py +0 -129
- msprobe/core/advisor/advisor_const.py +0 -58
- msprobe/core/advisor/advisor_result.py +0 -58
- msprobe/core/compare/find_first/data_processor.py +0 -35
- msprobe/core/compare/highlight.py +0 -390
- msprobe/core/data_dump/data_collector.py +0 -356
- msprobe/core/grad_probe/constant.py +0 -90
- msprobe/core/grad_probe/grad_compare.py +0 -187
- msprobe/core/grad_probe/utils.py +0 -105
- msprobe/core/kernel_dump/kernel_config.py +0 -33
- msprobe/docs/01.installation.md +0 -250
- msprobe/docs/02.config_introduction.md +0 -221
- msprobe/docs/03.config_examples.md +0 -281
- msprobe/docs/04.kernel_dump_PyTorch.md +0 -73
- msprobe/docs/05.data_dump_PyTorch.md +0 -518
- msprobe/docs/06.data_dump_MindSpore.md +0 -618
- msprobe/docs/07.accuracy_checker_PyTorch.md +0 -310
- msprobe/docs/09.accuracy_checker_MindSpore.md +0 -120
- msprobe/docs/10.accuracy_compare_PyTorch.md +0 -637
- msprobe/docs/11.accuracy_compare_MindSpore.md +0 -769
- msprobe/docs/12.overflow_check_PyTorch.md +0 -82
- msprobe/docs/13.overflow_check_MindSpore.md +0 -33
- msprobe/docs/14.data_parse_PyTorch.md +0 -282
- msprobe/docs/15.free_benchmarking_PyTorch.md +0 -169
- msprobe/docs/16.free_benchmarking_MindSpore.md +0 -159
- msprobe/docs/17.grad_probe.md +0 -205
- msprobe/docs/18.online_dispatch.md +0 -89
- msprobe/docs/19.monitor.md +0 -753
- msprobe/docs/20.monitor_performance_baseline.md +0 -52
- msprobe/docs/21.visualization_PyTorch.md +0 -519
- msprobe/docs/22.visualization_MindSpore.md +0 -515
- msprobe/docs/23.generate_operator_PyTorch.md +0 -107
- msprobe/docs/24.code_mapping_Mindspore.md +0 -29
- msprobe/docs/25.tool_function_introduction.md +0 -29
- msprobe/docs/26.data_dump_PyTorch_baseline.md +0 -48
- msprobe/docs/27.dump_json_instruction.md +0 -795
- msprobe/docs/28.debugger_save_instruction.md +0 -288
- msprobe/docs/28.kernel_dump_MindSpore.md +0 -69
- msprobe/docs/29.data_dump_MSAdapter.md +0 -235
- msprobe/docs/30.overflow_check_MSAdapter.md +0 -31
- msprobe/docs/31.config_check.md +0 -107
- msprobe/docs/32.ckpt_compare.md +0 -69
- msprobe/docs/33.generate_operator_MindSpore.md +0 -181
- msprobe/docs/34.RL_collect.md +0 -101
- msprobe/docs/35.nan_analyze.md +0 -73
- msprobe/docs/36.calculation_result_change.md +0 -75
- msprobe/docs/FAQ.md +0 -232
- msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +0 -146
- msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +0 -14
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +0 -33
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +0 -217
- msprobe/docs/img/BLOOM-7B_1.png +0 -0
- msprobe/docs/img/BLOOM-7B_2.png +0 -0
- msprobe/docs/img/BLOOM-7B_3.png +0 -0
- msprobe/docs/img/BLOOM-7B_4.png +0 -0
- msprobe/docs/img/GPT-3_1.png +0 -0
- msprobe/docs/img/GPT-3_2.png +0 -0
- msprobe/docs/img/GPT-3_3.png +0 -0
- msprobe/docs/img/GPT-3_4.png +0 -0
- msprobe/docs/img/GPT-3_5.png +0 -0
- msprobe/docs/img/GPT-3_6.png +0 -0
- msprobe/docs/img/GPT-3_7.png +0 -0
- msprobe/docs/img/GPT-3_8.png +0 -0
- msprobe/docs/img/YOLOV5S_1.png +0 -0
- msprobe/docs/img/YOLOV5S_2.png +0 -0
- msprobe/docs/img/accuracy_checking_details.png +0 -0
- msprobe/docs/img/accuracy_checking_result.png +0 -0
- msprobe/docs/img/api_precision_compare_details.png +0 -0
- msprobe/docs/img/api_precision_compare_result.png +0 -0
- msprobe/docs/img/auto_analyze_log.png +0 -0
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/compare_result_pkl.png +0 -0
- msprobe/docs/img/compare_result_pkl_md5.png.png +0 -0
- msprobe/docs/img/cpu_info.png +0 -0
- msprobe/docs/img/free_benchmark.png +0 -0
- msprobe/docs/img/free_benchmark_framework.png +0 -0
- msprobe/docs/img/grad_probe_image-1.png +0 -0
- msprobe/docs/img/grad_probe_image-2.png +0 -0
- msprobe/docs/img/grad_probe_image-3.png +0 -0
- msprobe/docs/img/grad_probe_image-4.png +0 -0
- msprobe/docs/img/grad_probe_image.png +0 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/module_compare.png +0 -0
- msprobe/docs/img/monitor/cpu_info.png +0 -0
- msprobe/docs/img/monitor/step_count_per_record.png +0 -0
- msprobe/docs/img/ms_dump.png +0 -0
- msprobe/docs/img/ms_layer.png +0 -0
- msprobe/docs/img/pt_dump.png +0 -0
- msprobe/docs/img/save_compare_result_sample.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
- msprobe/docs/img/visualization/proxy.png +0 -0
- msprobe/docs/img/visualization/tensorboard_1.png +0 -0
- msprobe/docs/img/visualization/tensorboard_2.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_browser_2.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/docs/visualization/GPTModel.png +0 -0
- msprobe/docs/visualization/ParallelMLP.png +0 -0
- msprobe/docs/visualization/layer_mapping_example.md +0 -132
- msprobe/docs/visualization/mapping.png +0 -0
- msprobe/docs/visualization/mapping1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +0 -59
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +0 -80
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +0 -330
- msprobe/docs/visualization/module_name.png +0 -0
- msprobe/docs/visualization/module_name1.png +0 -0
- msprobe/docs/visualization/no_mapping.png +0 -0
- msprobe/docs/visualization/no_mapping1.png +0 -0
- msprobe/docs/visualization/no_mapping_analyze.png +0 -0
- msprobe/docs/visualization/top_layer.png +0 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +0 -460
- msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +0 -2081
- msprobe/mindspore/code_mapping/bind.py +0 -283
- msprobe/mindspore/code_mapping/cmd_parser.py +0 -40
- msprobe/mindspore/code_mapping/graph.py +0 -49
- msprobe/mindspore/code_mapping/graph_parser.py +0 -211
- msprobe/mindspore/code_mapping/main.py +0 -24
- msprobe/mindspore/code_mapping/processor.py +0 -34
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +0 -111
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -52
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +0 -257
- msprobe/mindspore/free_benchmark/common/config.py +0 -27
- msprobe/mindspore/free_benchmark/common/handler_params.py +0 -31
- msprobe/mindspore/free_benchmark/common/utils.py +0 -100
- msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -638
- msprobe/mindspore/free_benchmark/handler/base_handler.py +0 -105
- msprobe/mindspore/free_benchmark/handler/check_handler.py +0 -55
- msprobe/mindspore/free_benchmark/handler/fix_handler.py +0 -51
- msprobe/mindspore/free_benchmark/handler/handler_factory.py +0 -36
- msprobe/mindspore/free_benchmark/perturbation/add_noise.py +0 -82
- msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +0 -45
- msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +0 -78
- msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +0 -77
- msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +0 -56
- msprobe/mindspore/free_benchmark/perturbation/no_change.py +0 -27
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +0 -46
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +0 -51
- msprobe/mindspore/grad_probe/global_context.py +0 -127
- msprobe/mindspore/grad_probe/grad_analyzer.py +0 -260
- msprobe/mindspore/grad_probe/grad_monitor.py +0 -42
- msprobe/mindspore/grad_probe/grad_stat_csv.py +0 -161
- msprobe/mindspore/grad_probe/hook.py +0 -115
- msprobe/mindspore/grad_probe/utils.py +0 -43
- msprobe/mindspore/mindtorch/__init__.py +0 -18
- msprobe/mindspore/ms_config.py +0 -153
- msprobe/mindspore/task_handler_factory.py +0 -44
- msprobe/nan_analyze/__init__.py +0 -14
- msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +0 -9
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +0 -480
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +0 -567
- msprobe/pytorch/debugger/precision_debugger.py +0 -181
- msprobe/pytorch/free_benchmark/__init__.py +0 -23
- msprobe/pytorch/free_benchmark/common/constant.py +0 -85
- msprobe/pytorch/free_benchmark/common/counter.py +0 -87
- msprobe/pytorch/free_benchmark/common/enums.py +0 -80
- msprobe/pytorch/free_benchmark/common/params.py +0 -152
- msprobe/pytorch/free_benchmark/common/utils.py +0 -143
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -215
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +0 -121
- msprobe/pytorch/free_benchmark/main.py +0 -123
- msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +0 -28
- msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +0 -56
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +0 -107
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +0 -121
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +0 -89
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +0 -87
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +0 -43
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +0 -60
- msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +0 -34
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +0 -252
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +0 -54
- msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +0 -40
- msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -45
- msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -181
- msprobe/pytorch/grad_probe/__init__.py +0 -0
- msprobe/pytorch/grad_probe/grad_monitor.py +0 -108
- msprobe/pytorch/grad_probe/grad_stat_csv.py +0 -160
- msprobe/pytorch/hook_module/__init__.py +0 -16
- msprobe/pytorch/hook_module/wrap_aten.py +0 -111
- msprobe/pytorch/online_dispatch/__init__.py +0 -19
- msprobe/pytorch/online_dispatch/compare.py +0 -224
- msprobe/pytorch/online_dispatch/dispatch.py +0 -332
- msprobe/pytorch/online_dispatch/dump_compare.py +0 -179
- msprobe/pytorch/online_dispatch/single_compare.py +0 -412
- msprobe/pytorch/online_dispatch/torch_ops_config.yaml +0 -58
- msprobe/pytorch/online_dispatch/utils.py +0 -158
- msprobe/pytorch/parse_tool/__init__.py +0 -0
- msprobe/pytorch/parse_tool/cli.py +0 -31
- msprobe/pytorch/parse_tool/lib/__init__.py +0 -0
- msprobe/pytorch/parse_tool/lib/compare.py +0 -253
- msprobe/pytorch/parse_tool/lib/config.py +0 -50
- msprobe/pytorch/parse_tool/lib/file_desc.py +0 -45
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +0 -97
- msprobe/pytorch/parse_tool/lib/parse_exception.py +0 -54
- msprobe/pytorch/parse_tool/lib/parse_tool.py +0 -161
- msprobe/pytorch/parse_tool/lib/utils.py +0 -299
- msprobe/pytorch/parse_tool/lib/visualization.py +0 -85
- msprobe/pytorch/pt_config.py +0 -299
- /msprobe/core/{grad_probe → dump}/__init__.py +0 -0
- /msprobe/{mindspore/code_mapping → core/dump/api_dump}/__init__.py +0 -0
- /msprobe/{mindspore/debugger → core/dump/data_dump}/__init__.py +0 -0
- /msprobe/{mindspore/exception_dump → core/dump/data_dump/data_processor}/__init__.py +0 -0
- /msprobe/{mindspore/free_benchmark → core/dump/debugger}/__init__.py +0 -0
- /msprobe/{mindspore/free_benchmark/common → core/dump/kernel_dump}/__init__.py +0 -0
- /msprobe/mindspore/{free_benchmark/handler → dump/debugger}/__init__.py +0 -0
- /msprobe/mindspore/{grad_probe → dump/dump_processor}/__init__.py +0 -0
- /msprobe/mindspore/{overflow_check → dump/exception_dump}/__init__.py +0 -0
- /msprobe/mindspore/{mindtorch → dump/mindtorch}/mindtorch_adaptor.py +0 -0
- /msprobe/{pytorch/api_accuracy_checker/run_ut → mindspore/dump/overflow_check}/__init__.py +0 -0
- /msprobe/{pytorch/debugger → mindspore/monitor}/__init__.py +0 -0
- /msprobe/{pytorch/free_benchmark/common → msaccucmp}/__init__.py +0 -0
- /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/.keep +0 -0
- /msprobe/pytorch/{free_benchmark/perturbed_layers → api_accuracy_checker/acc_check}/__init__.py +0 -0
- /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/torch_ut_setting.json +0 -0
- /msprobe/pytorch/{free_benchmark/perturbed_layers/npu → dump/api_dump}/__init__.py +0 -0
- /msprobe/pytorch/{hook_module → dump/api_dump}/support_wrap_ops.yaml +0 -0
- /msprobe/pytorch/{free_benchmark/result_handlers → dump/debugger}/__init__.py +0 -0
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
4
|
+
#
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
8
|
+
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
10
|
+
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
from msprobe.core.dump.debugger.precision_debugger import BasePrecisionDebugger
|
|
18
|
+
from msprobe.pytorch.dump.debugger.debugger_config import DebuggerConfig
|
|
19
|
+
from msprobe.pytorch.dump.pt_config import parse_task_config
|
|
20
|
+
from msprobe.pytorch.dump.pytorch_service import PytorchService
|
|
21
|
+
|
|
22
|
+
from msprobe.core.common.const import Const, MsgConst
|
|
23
|
+
from msprobe.core.common.exceptions import MsprobeException
|
|
24
|
+
from msprobe.core.common.utils import check_token_range, ThreadSafe, check_rank_id
|
|
25
|
+
from msprobe.pytorch.common.utils import check_save_param, is_torch_nn_module
|
|
26
|
+
from msprobe.pytorch.dump.module_dump.module_dump import ModuleDumper
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class PrecisionDebugger(BasePrecisionDebugger):
|
|
30
|
+
|
|
31
|
+
def __init__(
|
|
32
|
+
self,
|
|
33
|
+
config_path=None,
|
|
34
|
+
task=None,
|
|
35
|
+
dump_path=None,
|
|
36
|
+
level=None,
|
|
37
|
+
step=None
|
|
38
|
+
):
|
|
39
|
+
if self.initialized:
|
|
40
|
+
return
|
|
41
|
+
super().__init__(config_path, task, dump_path, level, step)
|
|
42
|
+
self.config = DebuggerConfig(
|
|
43
|
+
self.common_config,
|
|
44
|
+
self.task_config,
|
|
45
|
+
task,
|
|
46
|
+
dump_path,
|
|
47
|
+
level
|
|
48
|
+
)
|
|
49
|
+
self.service = PytorchService(self.config)
|
|
50
|
+
self.module_dumper = ModuleDumper(self.service)
|
|
51
|
+
self.ori_customer_func = {}
|
|
52
|
+
|
|
53
|
+
@staticmethod
|
|
54
|
+
def _get_task_config(task, json_config):
|
|
55
|
+
return parse_task_config(task, json_config)
|
|
56
|
+
|
|
57
|
+
@classmethod
|
|
58
|
+
@ThreadSafe.synchronized
|
|
59
|
+
def start(cls, model=None, token_range=None, rank_id=None):
|
|
60
|
+
instance = cls._get_instance()
|
|
61
|
+
if instance is None:
|
|
62
|
+
return
|
|
63
|
+
|
|
64
|
+
check_token_range(token_range)
|
|
65
|
+
check_rank_id(rank_id)
|
|
66
|
+
instance.config.check_model(model, token_range)
|
|
67
|
+
instance.service.start(model, token_range, rank_id)
|
|
68
|
+
|
|
69
|
+
@classmethod
|
|
70
|
+
@ThreadSafe.synchronized
|
|
71
|
+
def stop(cls):
|
|
72
|
+
instance = cls._get_instance()
|
|
73
|
+
if instance is None:
|
|
74
|
+
return
|
|
75
|
+
instance.service.stop()
|
|
76
|
+
|
|
77
|
+
@classmethod
|
|
78
|
+
@ThreadSafe.synchronized
|
|
79
|
+
def step(cls):
|
|
80
|
+
instance = cls._get_instance()
|
|
81
|
+
if instance is None:
|
|
82
|
+
return
|
|
83
|
+
cls._instance.service.step()
|
|
84
|
+
|
|
85
|
+
@classmethod
|
|
86
|
+
@ThreadSafe.synchronized
|
|
87
|
+
def save(cls, variable, name, save_backward=True):
|
|
88
|
+
instance = cls._instance
|
|
89
|
+
if not instance:
|
|
90
|
+
raise Exception(MsgConst.NOT_CREATED_INSTANCE)
|
|
91
|
+
if instance.task not in [Const.TENSOR, Const.STATISTICS] or instance.config.level != Const.LEVEL_DEBUG:
|
|
92
|
+
return
|
|
93
|
+
try:
|
|
94
|
+
check_save_param(variable, name, save_backward)
|
|
95
|
+
except ValueError:
|
|
96
|
+
return
|
|
97
|
+
instance.service.save(variable, name, save_backward)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@ThreadSafe.synchronized
|
|
101
|
+
def module_dump(module, dump_name):
|
|
102
|
+
if not is_torch_nn_module(module):
|
|
103
|
+
raise MsprobeException(
|
|
104
|
+
MsprobeException.INVALID_PARAM_ERROR,
|
|
105
|
+
f"the module argument in module_dump must be a torch.nn.Module type, "
|
|
106
|
+
f"but currently there is an unsupported {type(module)} type."
|
|
107
|
+
)
|
|
108
|
+
if not isinstance(dump_name, str):
|
|
109
|
+
raise MsprobeException(
|
|
110
|
+
MsprobeException.INVALID_PARAM_ERROR,
|
|
111
|
+
f"the dump_name argument in module_dump must be a str type"
|
|
112
|
+
)
|
|
113
|
+
instance = PrecisionDebugger._instance
|
|
114
|
+
if not instance:
|
|
115
|
+
raise MsprobeException(
|
|
116
|
+
MsprobeException.INTERFACE_USAGE_ERROR,
|
|
117
|
+
f"PrecisionDebugger must be instantiated before using module_dump interface"
|
|
118
|
+
)
|
|
119
|
+
instance.module_dumper.start_module_dump(module, dump_name)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@ThreadSafe.synchronized
|
|
123
|
+
def module_dump_end():
|
|
124
|
+
instance = PrecisionDebugger._instance
|
|
125
|
+
if not instance:
|
|
126
|
+
raise MsprobeException(
|
|
127
|
+
MsprobeException.INTERFACE_USAGE_ERROR,
|
|
128
|
+
f"PrecisionDebugger must be instantiated before using module_dump_end interface"
|
|
129
|
+
)
|
|
130
|
+
instance.module_dumper.stop_module_dump()
|
|
@@ -1,17 +1,18 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
from msprobe.pytorch.bench_functions.apply_adam_w import npu_apply_adam_w
|
|
17
18
|
from msprobe.pytorch.bench_functions.confusion_transpose import npu_confusion_transpose, \
|
|
@@ -1,17 +1,18 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
from functools import wraps
|
|
17
18
|
from typing import Any, Callable
|
|
@@ -22,14 +23,17 @@ from torch.utils.hooks import BackwardHook
|
|
|
22
23
|
from msprobe.core.common.const import Const
|
|
23
24
|
from msprobe.core.common.decorator import recursion_depth_decorator
|
|
24
25
|
from msprobe.pytorch.common.log import logger
|
|
25
|
-
from msprobe.pytorch.
|
|
26
|
+
from msprobe.pytorch.common.utils import is_float8_tensor
|
|
27
|
+
from msprobe.pytorch.dump.api_dump.api_register import get_api_register
|
|
26
28
|
|
|
27
29
|
torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
|
|
28
30
|
|
|
29
31
|
|
|
30
32
|
def wrap_setup_backward_hook(func):
|
|
31
33
|
def requires_clone(tensor, need_check_leaf=False):
|
|
32
|
-
|
|
34
|
+
if not isinstance(tensor, torch.Tensor):
|
|
35
|
+
return False
|
|
36
|
+
need_clone = torch.is_grad_enabled() and tensor.requires_grad and not is_float8_tensor(tensor)
|
|
33
37
|
if need_check_leaf:
|
|
34
38
|
need_clone &= tensor.grad_fn is not None
|
|
35
39
|
return need_clone
|
|
@@ -1,21 +1,22 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
from msprobe.pytorch.common.log import logger
|
|
17
|
-
from msprobe.pytorch.dump.module_dump.
|
|
18
|
-
from msprobe.pytorch.
|
|
18
|
+
from msprobe.pytorch.dump.module_dump.module_processor import ModuleProcessor
|
|
19
|
+
from msprobe.pytorch.dump.api_dump.api_register import get_api_register
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class ModuleDumper:
|
|
@@ -28,7 +29,7 @@ class ModuleDumper:
|
|
|
28
29
|
logger.info_on_rank_0("The init dump is enabled, and the module dump function will not be available.")
|
|
29
30
|
return
|
|
30
31
|
|
|
31
|
-
|
|
32
|
+
ModuleProcessor.enable_module_dump = True
|
|
32
33
|
self.api_register.restore_all_api()
|
|
33
34
|
if not hasattr(module, 'msprobe_module_dump'):
|
|
34
35
|
self.service.module_processor.register_module_hook(module, self.service.build_hook,
|
|
@@ -36,5 +37,5 @@ class ModuleDumper:
|
|
|
36
37
|
setattr(module, 'msprobe_module_dump', True)
|
|
37
38
|
|
|
38
39
|
def stop_module_dump(self):
|
|
39
|
-
|
|
40
|
+
ModuleProcessor.enable_module_dump = False
|
|
40
41
|
self.api_register.register_all_api()
|
|
@@ -1,17 +1,18 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
import sys
|
|
17
18
|
import threading
|
|
@@ -24,7 +25,7 @@ from msprobe.core.common.const import Const
|
|
|
24
25
|
from msprobe.core.common.megatron_utils import wrap_megatron_step, get_micro_step, is_megatron
|
|
25
26
|
from msprobe.core.common.runtime import Runtime
|
|
26
27
|
from msprobe.core.common.utils import ModuleQueue, ThreadSafe
|
|
27
|
-
from msprobe.core.data_dump.scope import BaseScope, ModuleRangeScope, MixRangeScope
|
|
28
|
+
from msprobe.core.dump.data_dump.scope import BaseScope, ModuleRangeScope, MixRangeScope
|
|
28
29
|
from msprobe.pytorch.common.log import logger
|
|
29
30
|
from msprobe.pytorch.common.utils import is_torch_nn_module, register_forward_pre_hook
|
|
30
31
|
from msprobe.pytorch.dump.module_dump.hook_wrapper import (
|
|
@@ -63,9 +64,9 @@ def wrap_forward_with_hook_safety(module):
|
|
|
63
64
|
except _StopRecomputationError as e:
|
|
64
65
|
exception_output = None
|
|
65
66
|
if len(module._forward_hooks.values()) > 0:
|
|
66
|
-
# 仅执行msprobe的forward_hook, hook名称必然包含'
|
|
67
|
+
# 仅执行msprobe的forward_hook, hook名称必然包含'ModuleProcessor.'
|
|
67
68
|
for hook_fn in module._forward_hooks.values():
|
|
68
|
-
if '
|
|
69
|
+
if 'ModuleProcessor' in str(hook_fn):
|
|
69
70
|
hook_fn(module, args, kwargs, exception_output)
|
|
70
71
|
break
|
|
71
72
|
raise e
|
|
@@ -74,7 +75,7 @@ def wrap_forward_with_hook_safety(module):
|
|
|
74
75
|
module.forward = wrapped_forward
|
|
75
76
|
|
|
76
77
|
|
|
77
|
-
class
|
|
78
|
+
class ModuleProcessor:
|
|
78
79
|
module_queue = ModuleQueue()
|
|
79
80
|
module_count = {}
|
|
80
81
|
module_stack = {}
|
|
@@ -83,6 +84,7 @@ class ModuleProcesser:
|
|
|
83
84
|
module_bw_hook_kernels = {}
|
|
84
85
|
module_with_backward_hook = {}
|
|
85
86
|
enable_module_dump = False
|
|
87
|
+
is_megatron_module = False
|
|
86
88
|
|
|
87
89
|
def __init__(self, scope):
|
|
88
90
|
self.scope = scope if isinstance(scope, (ModuleRangeScope, MixRangeScope)) else None
|
|
@@ -109,11 +111,11 @@ class ModuleProcesser:
|
|
|
109
111
|
|
|
110
112
|
@staticmethod
|
|
111
113
|
def set_and_get_calls_number(module_name):
|
|
112
|
-
if module_name not in
|
|
113
|
-
|
|
114
|
+
if module_name not in ModuleProcessor.module_count:
|
|
115
|
+
ModuleProcessor.module_count[module_name] = 0
|
|
114
116
|
else:
|
|
115
|
-
|
|
116
|
-
return
|
|
117
|
+
ModuleProcessor.module_count[module_name] += 1
|
|
118
|
+
return ModuleProcessor.module_count[module_name]
|
|
117
119
|
|
|
118
120
|
@staticmethod
|
|
119
121
|
def has_register_backward_hook(module):
|
|
@@ -137,6 +139,15 @@ class ModuleProcesser:
|
|
|
137
139
|
[(module_names[0], models)]
|
|
138
140
|
return modules_and_names_with_index
|
|
139
141
|
|
|
142
|
+
@staticmethod
|
|
143
|
+
def check_is_megatron_module(models):
|
|
144
|
+
if isinstance(models, (list, tuple)):
|
|
145
|
+
for model in models:
|
|
146
|
+
if "megatron" in type(model).__module__:
|
|
147
|
+
return True
|
|
148
|
+
else:
|
|
149
|
+
return "megatron" in type(models).__module__
|
|
150
|
+
|
|
140
151
|
@classmethod
|
|
141
152
|
def reset_module_stats(cls):
|
|
142
153
|
cls.module_queue = ModuleQueue()
|
|
@@ -151,16 +162,17 @@ class ModuleProcesser:
|
|
|
151
162
|
if module_names is None:
|
|
152
163
|
module_names = []
|
|
153
164
|
|
|
165
|
+
ModuleProcessor.is_megatron_module = self.check_is_megatron_module(models)
|
|
154
166
|
modules_and_names_with_index = self.get_modules_and_names(models, recursive, module_names)
|
|
155
167
|
for index, modules_and_names in modules_and_names_with_index.items():
|
|
156
168
|
model = models if index == "-1" else models[int(index)]
|
|
157
|
-
|
|
169
|
+
|
|
158
170
|
model_list = []
|
|
159
171
|
for name, module in modules_and_names:
|
|
160
172
|
model_list.append((name, module))
|
|
161
|
-
|
|
173
|
+
|
|
162
174
|
is_verl = "verl" in sys.modules
|
|
163
|
-
for idx, (name, module) in enumerate(model_list):
|
|
175
|
+
for idx, (name, module) in enumerate(model_list):
|
|
164
176
|
if recursive and module == model:
|
|
165
177
|
continue
|
|
166
178
|
if not is_torch_nn_module(module):
|
|
@@ -177,7 +189,7 @@ class ModuleProcesser:
|
|
|
177
189
|
logger.warning(f"The module {name} is the first or last layer in verl scenario, "
|
|
178
190
|
f"the data dump for this module will be skipped.")
|
|
179
191
|
continue
|
|
180
|
-
|
|
192
|
+
|
|
181
193
|
setattr(module, 'msprobe_hook', True)
|
|
182
194
|
module_index = (index + Const.SEP) if index != "-1" else ""
|
|
183
195
|
prefix_name = f'{BaseScope.Module_Type_Module}{Const.SEP}{module_index}{name}{Const.SEP}' + \
|
|
@@ -190,7 +202,7 @@ class ModuleProcesser:
|
|
|
190
202
|
f"The {prefix_name[:-1]} has registered deprecated register_backward_hook,"
|
|
191
203
|
f"which may cause abnormal data dump. The backward data dump for this module will be skipped."
|
|
192
204
|
)
|
|
193
|
-
|
|
205
|
+
ModuleProcessor.module_with_backward_hook[prefix_name] = True
|
|
194
206
|
wrap_forward_with_hook_safety(module)
|
|
195
207
|
register_forward_pre_hook(module, forward_pre_hook)
|
|
196
208
|
|
|
@@ -206,7 +218,7 @@ class ModuleProcesser:
|
|
|
206
218
|
if hasattr(module, 'msprobe_module_dump') and not self.enable_module_dump:
|
|
207
219
|
return (args, kwargs) if torch_version_above_or_equal_2 else args
|
|
208
220
|
|
|
209
|
-
index =
|
|
221
|
+
index = ModuleProcessor.set_and_get_calls_number(module_name)
|
|
210
222
|
full_forward_name = f'{module_name}{Const.FORWARD}{Const.SEP}{index}'
|
|
211
223
|
full_backward_name = f'{module_name}{Const.BACKWARD}{Const.SEP}{index}'
|
|
212
224
|
|
|
@@ -241,14 +253,14 @@ class ModuleProcesser:
|
|
|
241
253
|
|
|
242
254
|
return backward_hook_fn
|
|
243
255
|
|
|
244
|
-
if not
|
|
256
|
+
if not ModuleProcessor.module_with_backward_hook.get(module_name):
|
|
245
257
|
backward_pre_hook = get_backward_pre_hook(full_backward_name)
|
|
246
258
|
backward_hook = get_backward_hook(hook_set.backward_hook, full_backward_name)
|
|
247
259
|
if torch_version_above_or_equal_2:
|
|
248
260
|
bw_hook = BackwardHook(module, [backward_hook], [backward_pre_hook])
|
|
249
261
|
else:
|
|
250
262
|
bw_hook = BackwardHook(module, [backward_hook])
|
|
251
|
-
|
|
263
|
+
ModuleProcessor.module_bw_hook_kernels[full_forward_name] = bw_hook
|
|
252
264
|
args = bw_hook.setup_input_hook(args)
|
|
253
265
|
return (args, kwargs) if torch_version_above_or_equal_2 else args
|
|
254
266
|
|
|
@@ -257,7 +269,7 @@ class ModuleProcesser:
|
|
|
257
269
|
if hasattr(module, 'msprobe_module_dump') and not self.enable_module_dump:
|
|
258
270
|
return output_or_kwargs if torch_version_above_or_equal_2 else kwargs_or_output
|
|
259
271
|
|
|
260
|
-
index =
|
|
272
|
+
index = ModuleProcessor.module_count.get(module_name)
|
|
261
273
|
full_name = f'{module_name}{Const.FORWARD}{Const.SEP}{index}'
|
|
262
274
|
|
|
263
275
|
hook_set = build_data_hook(BaseScope.Module_Type_Module, full_name)
|
|
@@ -269,7 +281,7 @@ class ModuleProcesser:
|
|
|
269
281
|
else:
|
|
270
282
|
result = output_or_kwargs if torch_version_above_or_equal_2 else kwargs_or_output
|
|
271
283
|
|
|
272
|
-
bw_hook =
|
|
284
|
+
bw_hook = ModuleProcessor.module_bw_hook_kernels.get(full_name)
|
|
273
285
|
if bw_hook:
|
|
274
286
|
result = bw_hook.setup_output_hook(result)
|
|
275
287
|
|
|
@@ -280,35 +292,35 @@ class ModuleProcesser:
|
|
|
280
292
|
def set_construct_info_in_pre_hook(self, full_name):
|
|
281
293
|
tid = threading.get_ident()
|
|
282
294
|
if tid not in self.module_stack:
|
|
283
|
-
|
|
295
|
+
ModuleProcessor.module_stack[tid] = []
|
|
284
296
|
|
|
285
297
|
if self.module_stack[tid]:
|
|
286
|
-
|
|
298
|
+
ModuleProcessor.module_node[full_name] = self.module_stack[tid][-1] if not is_megatron() \
|
|
287
299
|
else [self.module_stack[tid][-1], get_micro_step()]
|
|
288
300
|
else:
|
|
289
|
-
parent_name =
|
|
290
|
-
|
|
301
|
+
parent_name = ModuleProcessor.module_queue.find_last(full_name)
|
|
302
|
+
ModuleProcessor.module_node[full_name] = parent_name if not is_megatron() \
|
|
291
303
|
else [parent_name, get_micro_step()]
|
|
292
304
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
305
|
+
ModuleProcessor.module_queue.add_name(full_name)
|
|
306
|
+
ModuleProcessor.module_stack[tid].append(full_name)
|
|
307
|
+
ModuleProcessor.api_parent_node[tid] = full_name if not is_megatron() else [full_name, get_micro_step()]
|
|
296
308
|
if self.scope:
|
|
297
309
|
self.scope.begin_module(full_name)
|
|
298
310
|
|
|
299
311
|
def set_construct_info_in_hook(self, full_name, is_forward=True):
|
|
300
312
|
tid = threading.get_ident()
|
|
301
313
|
if torch_version_above_or_equal_2 or is_forward:
|
|
302
|
-
|
|
303
|
-
|
|
314
|
+
ModuleProcessor.module_queue.remove_name(full_name)
|
|
315
|
+
ModuleProcessor.api_parent_node[tid] = None if not is_megatron() else [None, get_micro_step()]
|
|
304
316
|
if self.module_stack.get(tid):
|
|
305
|
-
|
|
317
|
+
ModuleProcessor.module_stack[tid].pop()
|
|
306
318
|
if self.module_stack.get(tid):
|
|
307
|
-
|
|
308
|
-
else [
|
|
319
|
+
ModuleProcessor.api_parent_node[tid] = ModuleProcessor.module_stack[tid][-1] if not is_megatron() \
|
|
320
|
+
else [ModuleProcessor.module_stack[tid][-1], get_micro_step()]
|
|
309
321
|
if self.scope:
|
|
310
322
|
self.scope.end_module(full_name)
|
|
311
323
|
else:
|
|
312
324
|
if self.scope:
|
|
313
325
|
self.scope.begin_module(full_name)
|
|
314
|
-
|
|
326
|
+
ModuleProcessor.api_parent_node[tid] = full_name if not is_megatron() else [full_name, get_micro_step()]
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
4
|
+
#
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
8
|
+
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
10
|
+
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
from msprobe.core.common.const import Const
|
|
20
|
+
from msprobe.core.common.file_utils import load_json
|
|
21
|
+
from msprobe.core.dump.common_config import BaseConfig, CommonConfig
|
|
22
|
+
from msprobe.pytorch.dump.api_dump.utils import get_ops
|
|
23
|
+
from msprobe.core.common.utils import is_int
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TensorConfig(BaseConfig):
|
|
27
|
+
def __init__(self, json_config):
|
|
28
|
+
super().__init__(json_config)
|
|
29
|
+
self.check_config()
|
|
30
|
+
self._check_summary_mode()
|
|
31
|
+
self._check_file_format()
|
|
32
|
+
self.bench_path = json_config.get("bench_path")
|
|
33
|
+
|
|
34
|
+
def _check_file_format(self):
|
|
35
|
+
if self.file_format is not None and self.file_format not in ["npy", "bin"]:
|
|
36
|
+
raise Exception("file_format is invalid")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class StatisticsConfig(BaseConfig):
|
|
40
|
+
def __init__(self, json_config):
|
|
41
|
+
super().__init__(json_config)
|
|
42
|
+
self.check_config()
|
|
43
|
+
self._check_summary_mode()
|
|
44
|
+
|
|
45
|
+
self.tensor_list = json_config.get("tensor_list", [])
|
|
46
|
+
self._check_str_list_config(self.tensor_list, "tensor_list")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class ACCCheckConfig(BaseConfig):
|
|
50
|
+
WrapApi = get_ops()
|
|
51
|
+
|
|
52
|
+
def __init__(self, json_config):
|
|
53
|
+
super().__init__(json_config)
|
|
54
|
+
self.white_list = json_config.get("white_list", Const.DEFAULT_LIST)
|
|
55
|
+
self.black_list = json_config.get("black_list", Const.DEFAULT_LIST)
|
|
56
|
+
self.error_data_path = json_config.get("error_data_path", Const.DEFAULT_PATH)
|
|
57
|
+
|
|
58
|
+
self.check_acc_check_config()
|
|
59
|
+
|
|
60
|
+
@classmethod
|
|
61
|
+
def check_filter_list_config(cls, key, filter_list):
|
|
62
|
+
if not isinstance(filter_list, list):
|
|
63
|
+
raise Exception("%s must be a list type" % key)
|
|
64
|
+
if not all(isinstance(item, str) for item in filter_list):
|
|
65
|
+
raise Exception("All elements in %s must be string type" % key)
|
|
66
|
+
invalid_api = [item for item in filter_list if item not in cls.WrapApi]
|
|
67
|
+
if invalid_api:
|
|
68
|
+
raise Exception("Invalid api in %s: %s" % (key, invalid_api))
|
|
69
|
+
|
|
70
|
+
@classmethod
|
|
71
|
+
def check_error_data_path_config(cls, error_data_path):
|
|
72
|
+
if not os.path.exists(error_data_path):
|
|
73
|
+
raise Exception("error_data_path: %s does not exist" % error_data_path)
|
|
74
|
+
|
|
75
|
+
def check_acc_check_config(self):
|
|
76
|
+
ACCCheckConfig.check_filter_list_config(Const.WHITE_LIST, self.white_list)
|
|
77
|
+
ACCCheckConfig.check_filter_list_config(Const.BLACK_LIST, self.black_list)
|
|
78
|
+
ACCCheckConfig.check_error_data_path_config(self.error_data_path)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class DiffCheckConfig(BaseConfig):
|
|
82
|
+
def __init__(self, json_config):
|
|
83
|
+
super().__init__(json_config)
|
|
84
|
+
self.diff_nums = json_config.get("diff_nums")
|
|
85
|
+
self.check_mode = json_config.get("check_mode")
|
|
86
|
+
self.check_diff_config()
|
|
87
|
+
|
|
88
|
+
def check_diff_config(self):
|
|
89
|
+
if self.diff_nums is not None and not is_int(self.diff_nums):
|
|
90
|
+
raise Exception("diff_num is invalid")
|
|
91
|
+
if self.diff_nums is not None and self.diff_nums != -1 and self.diff_nums <= 0:
|
|
92
|
+
raise Exception("diff_nums should be -1 or positive integer")
|
|
93
|
+
if self.check_mode is not None and self.check_mode not in ["all", "aicore", "atomic"]:
|
|
94
|
+
raise Exception("check_mode is invalid")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class StructureConfig(BaseConfig):
|
|
98
|
+
def __init__(self, json_config):
|
|
99
|
+
super().__init__(json_config)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
TaskDict = {
|
|
103
|
+
Const.TENSOR: TensorConfig,
|
|
104
|
+
Const.STATISTICS: StatisticsConfig,
|
|
105
|
+
Const.DIFF_CHECK: DiffCheckConfig,
|
|
106
|
+
Const.STRUCTURE: StructureConfig,
|
|
107
|
+
Const.ACC_CHECK: ACCCheckConfig
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def parse_task_config(task, json_config):
|
|
112
|
+
if task not in Const.TORCH_TASK_LIST:
|
|
113
|
+
raise Exception(f"task <{task}> is invalid, it is not in the {Const.TORCH_TASK_LIST}.")
|
|
114
|
+
task_map = json_config.get(task, dict())
|
|
115
|
+
return TaskDict.get(task)(task_map)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def parse_json_config(json_file_path, task):
|
|
119
|
+
if not json_file_path:
|
|
120
|
+
config_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
|
|
121
|
+
json_file_path = os.path.join(config_dir, "../../config.json")
|
|
122
|
+
json_config = load_json(json_file_path)
|
|
123
|
+
common_config = CommonConfig(json_config)
|
|
124
|
+
if task:
|
|
125
|
+
task_config = parse_task_config(task, json_config)
|
|
126
|
+
else:
|
|
127
|
+
task_config = parse_task_config(common_config.task, json_config)
|
|
128
|
+
return common_config, task_config
|