mindstudio-probe 8.3.2__py3-none-any.whl → 26.0.0a1__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/METADATA +26 -14
- mindstudio_probe-26.0.0a1.dist-info/RECORD +498 -0
- {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/WHEEL +1 -1
- mindstudio_probe-26.0.0a1.dist-info/entry_points.txt +5 -0
- mindstudio_probe-26.0.0a1.dist-info/licenses/LICENSE +124 -0
- mindstudio_probe-26.0.0a1.dist-info/top_level.txt +2 -0
- msprobe/__init__.py +12 -13
- msprobe/config.json +9 -31
- msprobe/core/__init__.py +12 -11
- msprobe/core/acc_check/acc_check_cli.py +145 -0
- msprobe/core/common/const.py +97 -38
- msprobe/core/common/db_manager.py +133 -12
- msprobe/core/common/decorator.py +12 -11
- msprobe/core/common/exceptions.py +12 -11
- msprobe/core/common/file_utils.py +101 -25
- msprobe/core/common/framework_adapter.py +36 -25
- msprobe/core/common/global_lock.py +12 -11
- msprobe/core/common/inplace_op_checker.py +12 -11
- msprobe/core/common/log.py +22 -11
- msprobe/core/common/megatron_utils.py +566 -11
- msprobe/core/common/parallel_state.py +12 -11
- msprobe/core/common/runtime.py +12 -11
- msprobe/core/common/utils.py +41 -41
- msprobe/core/compare/acc_compare.py +361 -104
- msprobe/core/compare/atb_data_compare.py +422 -0
- msprobe/core/compare/auto_compare.py +134 -0
- msprobe/core/compare/check.py +14 -17
- msprobe/core/compare/compare_cli.py +72 -149
- msprobe/core/compare/config.py +12 -13
- msprobe/core/compare/diff_analyze/first_diff_analyze.py +28 -15
- msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
- msprobe/core/compare/find_first/analyzer.py +18 -18
- msprobe/core/compare/find_first/graph.py +12 -11
- msprobe/core/compare/find_first/utils.py +13 -12
- msprobe/core/compare/indicator_analysis/__init__.py +15 -0
- msprobe/core/compare/indicator_analysis/algorithm.py +363 -0
- msprobe/core/compare/indicator_analysis/api_data.py +141 -0
- msprobe/core/compare/indicator_analysis/calculator.py +181 -0
- msprobe/core/compare/indicator_analysis/utils.py +116 -0
- msprobe/core/compare/layer_mapping/__init__.py +12 -11
- msprobe/core/compare/layer_mapping/data_scope_parser.py +20 -11
- msprobe/core/compare/layer_mapping/layer_mapping.py +14 -13
- msprobe/core/compare/layer_mapping/postprocess_pass.py +13 -11
- msprobe/core/compare/merge_result/merge_result.py +12 -11
- msprobe/core/compare/merge_result/merge_result_cli.py +12 -11
- msprobe/core/compare/merge_result/utils.py +12 -11
- msprobe/core/compare/multiprocessing_compute.py +13 -14
- msprobe/core/compare/npy_compare.py +13 -11
- msprobe/core/compare/offline_data_compare.py +160 -0
- msprobe/core/compare/stats_diff_calc.py +39 -0
- msprobe/core/compare/torchair_acc_cmp.py +764 -0
- msprobe/core/compare/torchair_cmp_utils.py +338 -0
- msprobe/core/compare/utils.py +140 -49
- msprobe/core/config_check/__init__.py +12 -11
- msprobe/core/config_check/checkers/__init__.py +12 -11
- msprobe/core/config_check/checkers/base_checker.py +15 -14
- msprobe/core/config_check/checkers/dataset_checker.py +13 -12
- msprobe/core/config_check/checkers/env_args_checker.py +13 -12
- msprobe/core/config_check/checkers/hyperparameter_checker.py +16 -15
- msprobe/core/config_check/checkers/pip_checker.py +15 -15
- msprobe/core/config_check/checkers/random_checker.py +13 -12
- msprobe/core/config_check/checkers/weights_checker.py +14 -12
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +13 -17
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +13 -12
- msprobe/core/config_check/ckpt_compare/metrics.py +12 -11
- msprobe/core/config_check/config_check_cli.py +18 -17
- msprobe/core/config_check/config_checker.py +16 -14
- msprobe/core/config_check/resource/dependency.yaml +15 -12
- msprobe/core/config_check/resource/env.yaml +12 -11
- msprobe/core/config_check/utils/hyperparameter_parser.py +12 -11
- msprobe/core/config_check/utils/utils.py +12 -11
- msprobe/core/{data_dump → dump/api_dump}/api_registry.py +12 -11
- msprobe/core/{common_config.py → dump/common_config.py} +13 -24
- msprobe/core/dump/data_dump/data_collector.py +257 -0
- msprobe/core/{data_dump → dump/data_dump}/data_processor/base.py +45 -36
- msprobe/core/{data_dump → dump/data_dump}/data_processor/factory.py +33 -25
- msprobe/core/{data_dump → dump/data_dump}/data_processor/mindspore_processor.py +37 -113
- msprobe/core/{data_dump → dump/data_dump}/data_processor/pytorch_processor.py +364 -131
- msprobe/core/{data_dump → dump/data_dump}/json_writer.py +24 -31
- msprobe/core/{data_dump → dump/data_dump}/scope.py +12 -13
- msprobe/core/{debugger → dump/debugger}/precision_debugger.py +15 -23
- msprobe/core/dump/dump2db/db_utils.py +215 -0
- msprobe/core/dump/dump2db/dump2db.py +409 -0
- msprobe/core/{hook_manager.py → dump/hook_manager.py} +38 -87
- msprobe/core/dump/kernel_dump/kernel_config.py +34 -0
- msprobe/core/{service.py → dump/service.py} +43 -27
- msprobe/core/install_deps/install_deps.py +51 -0
- msprobe/core/monitor/anomaly_processor.py +13 -11
- msprobe/core/monitor/csv2db.py +73 -93
- msprobe/core/monitor/db_utils.py +140 -205
- msprobe/core/monitor/utils.py +18 -17
- msprobe/core/monitor_v2/__init__.py +20 -0
- msprobe/core/monitor_v2/base.py +83 -0
- msprobe/core/monitor_v2/cc.py +287 -0
- msprobe/core/monitor_v2/factory.py +81 -0
- msprobe/core/monitor_v2/module.py +201 -0
- msprobe/core/monitor_v2/optimizer.py +245 -0
- msprobe/core/monitor_v2/param.py +154 -0
- msprobe/core/monitor_v2/trainer.py +326 -0
- msprobe/core/monitor_v2/utils.py +122 -0
- msprobe/core/monitor_v2/weight_grad.py +419 -0
- msprobe/core/monitor_v2/writer.py +162 -0
- msprobe/core/overflow_check/abnormal_scene.py +12 -11
- msprobe/core/overflow_check/api_info.py +12 -11
- msprobe/core/overflow_check/checker.py +12 -11
- msprobe/core/overflow_check/filter.py +13 -11
- msprobe/core/overflow_check/level.py +12 -11
- msprobe/core/overflow_check/utils.py +12 -11
- msprobe/core/single_save/single_comparator.py +12 -11
- msprobe/core/single_save/single_saver.py +12 -11
- msprobe/infer/__init__.py +16 -0
- msprobe/infer/offline/__init__.py +16 -0
- msprobe/infer/offline/compare/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/adapter_cli/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/adapter_cli/args_adapter.py +46 -0
- msprobe/infer/offline/compare/msquickcmp/atc/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/atc/atc_utils.py +98 -0
- msprobe/infer/offline/compare/msquickcmp/cmp_process.py +328 -0
- msprobe/infer/offline/compare/msquickcmp/common/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/common/args_check.py +112 -0
- msprobe/infer/offline/compare/msquickcmp/common/convert.py +74 -0
- msprobe/infer/offline/compare/msquickcmp/common/dump_data.py +121 -0
- msprobe/infer/offline/compare/msquickcmp/common/dynamic_argument_bean.py +39 -0
- msprobe/infer/offline/compare/msquickcmp/common/utils.py +669 -0
- msprobe/infer/offline/compare/msquickcmp/config.ini +6 -0
- msprobe/infer/offline/compare/msquickcmp/dump/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/dump/args_adapter.py +50 -0
- msprobe/infer/offline/compare/msquickcmp/dump/dump_process.py +91 -0
- msprobe/infer/offline/compare/msquickcmp/install_aclruntime_aisbench.sh +180 -0
- msprobe/infer/offline/compare/msquickcmp/main.py +199 -0
- msprobe/infer/offline/compare/msquickcmp/net_compare/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/net_compare/net_compare.py +277 -0
- msprobe/infer/offline/compare/msquickcmp/npu/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/npu/npu_dump_data.py +558 -0
- msprobe/infer/offline/compare/msquickcmp/npu/om_parser.py +416 -0
- msprobe/infer/offline/compare/msquickcmp/onnx_model/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/onnx_model/onnx_dump_data.py +374 -0
- msprobe/infer/utils/__init__.py +15 -0
- msprobe/infer/utils/acc_cmp.py +94 -0
- msprobe/infer/utils/check/__init__.py +37 -0
- msprobe/infer/utils/check/args_checker.py +35 -0
- msprobe/infer/utils/check/checker.py +227 -0
- msprobe/infer/utils/check/dict_checker.py +78 -0
- msprobe/infer/utils/check/func_wrapper.py +96 -0
- msprobe/infer/utils/check/list_checker.py +56 -0
- msprobe/infer/utils/check/number_checker.py +64 -0
- msprobe/infer/utils/check/obj_checker.py +41 -0
- msprobe/infer/utils/check/path_checker.py +249 -0
- msprobe/infer/utils/check/rule.py +126 -0
- msprobe/infer/utils/check/string_checker.py +66 -0
- msprobe/infer/utils/cmp_algorithm.py +261 -0
- msprobe/infer/utils/constants.py +112 -0
- msprobe/infer/utils/file_open_check.py +337 -0
- msprobe/infer/utils/util.py +177 -0
- msprobe/mindspore/__init__.py +14 -13
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +14 -13
- msprobe/mindspore/api_accuracy_checker/api_info.py +12 -11
- msprobe/mindspore/api_accuracy_checker/api_runner.py +12 -11
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +12 -11
- msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +12 -11
- msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +12 -11
- msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +12 -11
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +15 -14
- msprobe/mindspore/api_accuracy_checker/compute_element.py +12 -11
- msprobe/mindspore/api_accuracy_checker/data_manager.py +13 -11
- msprobe/mindspore/api_accuracy_checker/main.py +12 -11
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +14 -12
- msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +13 -11
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +12 -11
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +12 -11
- msprobe/mindspore/api_accuracy_checker/utils.py +12 -11
- msprobe/mindspore/common/const.py +15 -74
- msprobe/mindspore/common/log.py +12 -11
- msprobe/mindspore/common/utils.py +30 -15
- msprobe/mindspore/compare/common_dir_compare.py +21 -23
- msprobe/mindspore/compare/distributed_compare.py +18 -16
- msprobe/mindspore/compare/ms_compare.py +14 -14
- msprobe/mindspore/compare/ms_graph_compare.py +26 -20
- msprobe/mindspore/compare/utils.py +14 -12
- msprobe/mindspore/{cell_processor.py → dump/cell_processor.py} +15 -14
- msprobe/mindspore/{debugger → dump/debugger}/debugger_config.py +12 -30
- msprobe/mindspore/{debugger → dump/debugger}/precision_debugger.py +43 -45
- msprobe/mindspore/dump/{cell_dump_process.py → dump_processor/cell_dump_process.py} +31 -17
- msprobe/mindspore/dump/{cell_dump_with_insert_gradient.py → dump_processor/cell_dump_with_insert_gradient.py} +18 -14
- msprobe/mindspore/dump/{dump_tool_factory.py → dump_processor/dump_tool_factory.py} +16 -15
- msprobe/mindspore/dump/{graph_mode_cell_dump.py → dump_processor/graph_mode_cell_dump.py} +16 -15
- msprobe/mindspore/dump/{graph_tensor_dump.py → dump_processor/graph_tensor_dump.py} +134 -133
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/api_register.py +15 -14
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/hook_cell.py +12 -11
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/ms_hook_manager.py +47 -20
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/primitive_hooks.py +14 -13
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/support_wrap_ops.yaml +13 -11
- msprobe/mindspore/dump/{jit_dump.py → dump_processor/jit_dump.py} +14 -13
- msprobe/mindspore/dump/{kernel_graph_dump.py → dump_processor/kernel_graph_dump.py} +13 -12
- msprobe/mindspore/dump/{kernel_kbyk_dump.py → dump_processor/kernel_kbyk_dump.py} +13 -12
- msprobe/mindspore/{exception_dump → dump/exception_dump}/exception_dump_tool_factory.py +14 -13
- msprobe/mindspore/{exception_dump → dump/exception_dump}/kernel_graph_exception_dump.py +13 -12
- msprobe/mindspore/{mindspore_service.py → dump/mindspore_service.py} +18 -17
- msprobe/mindspore/dump/mindtorch/__init__.py +19 -0
- msprobe/mindspore/dump/ms_config.py +105 -0
- msprobe/mindspore/{overflow_check → dump/overflow_check}/kernel_graph_overflow_check.py +13 -12
- msprobe/mindspore/{overflow_check → dump/overflow_check}/overflow_check_tool_factory.py +14 -13
- msprobe/mindspore/dump/task_handler_factory.py +43 -0
- msprobe/mindspore/monitor/common_func.py +12 -11
- msprobe/mindspore/monitor/data_writers.py +12 -11
- msprobe/mindspore/monitor/distributed/wrap_distributed.py +93 -39
- msprobe/mindspore/monitor/features.py +12 -11
- msprobe/mindspore/monitor/module_hook.py +19 -22
- msprobe/mindspore/monitor/optimizer_collect.py +29 -25
- msprobe/mindspore/monitor/utils.py +13 -11
- msprobe/msaccucmp/advisor/__init__.py +16 -0
- msprobe/msaccucmp/advisor/advisor_const.py +65 -0
- msprobe/msaccucmp/advisor/advisor_result.py +73 -0
- msprobe/msaccucmp/advisor/compare_advisor.py +99 -0
- msprobe/msaccucmp/advisor/input_advisor.py +66 -0
- msprobe/msaccucmp/advisor/node_advisor.py +68 -0
- msprobe/msaccucmp/advisor/overflow_advisor.py +58 -0
- msprobe/msaccucmp/algorithm_manager/__init__.py +16 -0
- msprobe/msaccucmp/algorithm_manager/algorithm_manager.py +464 -0
- msprobe/msaccucmp/algorithm_manager/algorithm_parameter.py +42 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_AccumulatedRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_CosineSimilarity.py +58 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_KullbackLeiblerDivergence.py +84 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxAbsoluteError.py +41 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanAbsoluteError.py +41 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RelativeEuclideanDistance.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RootMeanSquareError.py +40 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_StandardDeviation.py +47 -0
- msprobe/msaccucmp/cmp_utils/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/common.py +113 -0
- msprobe/msaccucmp/cmp_utils/constant/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/constant/compare_error.py +81 -0
- msprobe/msaccucmp/cmp_utils/constant/const_manager.py +530 -0
- msprobe/msaccucmp/cmp_utils/file_utils.py +497 -0
- msprobe/msaccucmp/cmp_utils/log.py +257 -0
- msprobe/msaccucmp/cmp_utils/multi_process/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/multi_process/multi_convert_process.py +140 -0
- msprobe/msaccucmp/cmp_utils/multi_process/progress.py +78 -0
- msprobe/msaccucmp/cmp_utils/path_check.py +274 -0
- msprobe/msaccucmp/cmp_utils/reg_manager.py +98 -0
- msprobe/msaccucmp/cmp_utils/tlv_parse.py +279 -0
- msprobe/msaccucmp/cmp_utils/utils.py +356 -0
- msprobe/msaccucmp/cmp_utils/utils_type.py +63 -0
- msprobe/msaccucmp/compare_vector.py +48 -0
- msprobe/msaccucmp/conversion/__init__.py +16 -0
- msprobe/msaccucmp/conversion/data_conversion.py +277 -0
- msprobe/msaccucmp/conversion/dtype_conversion.py +99 -0
- msprobe/msaccucmp/conversion/shape_format_conversion.py +477 -0
- msprobe/msaccucmp/conversion/tensor_conversion.py +369 -0
- msprobe/msaccucmp/dump_data_conversion.py +46 -0
- msprobe/msaccucmp/dump_parse/__init__.py +16 -0
- msprobe/msaccucmp/dump_parse/big_dump_data.py +317 -0
- msprobe/msaccucmp/dump_parse/dump.py +423 -0
- msprobe/msaccucmp/dump_parse/dump_data_object.py +322 -0
- msprobe/msaccucmp/dump_parse/dump_data_parser.py +436 -0
- msprobe/msaccucmp/dump_parse/dump_utils.py +246 -0
- msprobe/msaccucmp/dump_parse/ffts_parser.py +137 -0
- msprobe/msaccucmp/dump_parse/mapping.py +62 -0
- msprobe/msaccucmp/dump_parse/nano_dump_data.py +392 -0
- msprobe/msaccucmp/dump_parse/proto_dump_data.py +308 -0
- msprobe/msaccucmp/dump_parser.py +90 -0
- msprobe/msaccucmp/format_manager/__init__.py +16 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NCHW.py +53 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_ND.py +52 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NHWC.py +53 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_HWCN.py +47 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_NCHW.py +47 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_FRACTAL_Z.py +89 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NCHW.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NHWC.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_HWCN.py +43 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NCHW.py +48 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NHWC.py +43 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_FRACTAL_Z.py +87 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_NHWC.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_NCDHW.py +48 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_ND.py +44 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_FRACTAL_Z.py +87 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_HWCN.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_NCHW.py +37 -0
- msprobe/msaccucmp/format_manager/format_manager.py +307 -0
- msprobe/msaccucmp/inplace_layer_process.py +186 -0
- msprobe/msaccucmp/msaccucmp.py +532 -0
- msprobe/msaccucmp/mscmp_advisor.py +128 -0
- msprobe/msaccucmp/overflow/__init__.py +16 -0
- msprobe/msaccucmp/overflow/overflow_analyse.py +305 -0
- msprobe/msaccucmp/overflow/overflow_detection.py +143 -0
- msprobe/msaccucmp/pytorch_cmp/__init__.py +16 -0
- msprobe/msaccucmp/pytorch_cmp/compare_pytorch.py +389 -0
- msprobe/msaccucmp/pytorch_cmp/hdf5_parser.py +377 -0
- msprobe/msaccucmp/pytorch_cmp/pytorch_dump_data.py +461 -0
- msprobe/msaccucmp/shape_conversion.py +41 -0
- msprobe/msaccucmp/vector_cmp/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/batch_compare.py +197 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/compare_detail.py +245 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/detail.py +182 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/detail_writer.py +580 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_fusion_op.py +588 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_npu_vs_npu.py +339 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_result.py +326 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_rule.py +156 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_op.py +204 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_rule_parser.py +635 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/quant_filter.py +187 -0
- msprobe/msaccucmp/vector_cmp/range_manager/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/range_manager/range_manager.py +100 -0
- msprobe/msaccucmp/vector_cmp/range_manager/range_mode.py +94 -0
- msprobe/msaccucmp/vector_cmp/range_manager/select_mode.py +86 -0
- msprobe/msaccucmp/vector_cmp/vector_comparison.py +535 -0
- msprobe/msprobe.py +101 -130
- msprobe/overflow_check/__init__.py +15 -0
- msprobe/{nan_analyze → overflow_check}/analyzer.py +38 -27
- msprobe/{nan_analyze → overflow_check}/graph.py +30 -27
- msprobe/{nan_analyze → overflow_check}/utils.py +15 -14
- msprobe/pytorch/__init__.py +20 -14
- msprobe/pytorch/aclgraph_dump/__init__.py +45 -0
- msprobe/pytorch/aclgraph_dump/_meta.py +26 -0
- msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut.py → acc_check/acc_check.py} +50 -45
- msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut_utils.py → acc_check/acc_check_utils.py} +201 -30
- msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/data_generate.py +56 -16
- msprobe/pytorch/api_accuracy_checker/{run_ut/multi_run_ut.py → acc_check/multi_acc_check.py} +32 -47
- msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/run_overflow_check.py +19 -18
- msprobe/pytorch/api_accuracy_checker/common/config.py +22 -20
- msprobe/pytorch/api_accuracy_checker/common/utils.py +72 -13
- msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -11
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +23 -14
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +45 -32
- msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +12 -11
- msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +14 -12
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +14 -12
- msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +21 -19
- msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +14 -13
- msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +60 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +27 -16
- msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +13 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +39 -18
- msprobe/pytorch/bench_functions/__init__.py +12 -11
- msprobe/pytorch/bench_functions/apply_adam.py +12 -11
- msprobe/pytorch/bench_functions/apply_adam_w.py +12 -11
- msprobe/pytorch/bench_functions/confusion_transpose.py +12 -11
- msprobe/pytorch/bench_functions/fast_gelu.py +12 -11
- msprobe/pytorch/bench_functions/group_norm_silu.py +12 -11
- msprobe/pytorch/bench_functions/layer_norm_eval.py +12 -11
- msprobe/pytorch/bench_functions/linear.py +12 -11
- msprobe/pytorch/bench_functions/matmul_backward.py +12 -11
- msprobe/pytorch/bench_functions/mish.py +12 -11
- msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +12 -11
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +12 -11
- msprobe/pytorch/bench_functions/rms_norm.py +12 -11
- msprobe/pytorch/bench_functions/rotary_mul.py +12 -11
- msprobe/pytorch/bench_functions/scaled_mask_softmax.py +12 -11
- msprobe/pytorch/bench_functions/sort_v2.py +12 -11
- msprobe/pytorch/bench_functions/swiglu.py +12 -11
- msprobe/pytorch/common/__init__.py +12 -11
- msprobe/pytorch/common/log.py +12 -11
- msprobe/pytorch/common/parse_json.py +12 -11
- msprobe/pytorch/common/utils.py +52 -19
- msprobe/pytorch/compare/distributed_compare.py +13 -13
- msprobe/pytorch/compare/match.py +12 -11
- msprobe/pytorch/compare/pt_compare.py +14 -20
- msprobe/pytorch/compare/pt_diff_analyze.py +12 -11
- msprobe/pytorch/compare/utils.py +12 -11
- msprobe/pytorch/{hook_module → dump/api_dump}/api_register.py +18 -16
- msprobe/pytorch/{hook_module → dump/api_dump}/hook_module.py +14 -13
- msprobe/pytorch/{hook_module → dump/api_dump}/pt_hook_manager.py +68 -23
- msprobe/pytorch/{hook_module → dump/api_dump}/register_optimizer_hook.py +13 -11
- msprobe/pytorch/{hook_module → dump/api_dump}/script_wrapper.py +17 -14
- msprobe/pytorch/{hook_module → dump/api_dump}/utils.py +12 -11
- msprobe/pytorch/{debugger → dump/debugger}/debugger_config.py +23 -38
- msprobe/pytorch/dump/debugger/precision_debugger.py +130 -0
- msprobe/pytorch/{function_factory.py → dump/function_factory.py} +12 -11
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +17 -13
- msprobe/pytorch/dump/module_dump/module_dump.py +16 -15
- msprobe/pytorch/dump/module_dump/{module_processer.py → module_processor.py} +54 -42
- msprobe/pytorch/dump/pt_config.py +128 -0
- msprobe/pytorch/{pytorch_service.py → dump/pytorch_service.py} +22 -21
- msprobe/pytorch/monitor/csv2tb.py +13 -11
- msprobe/pytorch/monitor/data_writers.py +13 -11
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +13 -11
- msprobe/pytorch/monitor/features.py +12 -11
- msprobe/pytorch/monitor/module_hook.py +67 -59
- msprobe/pytorch/monitor/module_metric.py +13 -11
- msprobe/pytorch/monitor/optimizer_collect.py +37 -35
- msprobe/pytorch/monitor/utils.py +13 -11
- msprobe/pytorch/monitor/visualizer.py +12 -11
- msprobe/pytorch/torchair_dump/__init__.py +17 -0
- msprobe/pytorch/torchair_dump/torchair_dump.py +114 -0
- msprobe/scripts/atb/config_example.json +10 -0
- msprobe/scripts/atb/load_atb_probe.sh +101 -0
- msprobe/scripts/atb/unload_atb_probe.sh +27 -0
- msprobe/scripts/build_msaccucmp.sh +186 -0
- msprobe/scripts/conf/help.info +6 -0
- msprobe/scripts/conf/version.info +3 -0
- msprobe/scripts/run_script/common.sh +538 -0
- msprobe/scripts/run_script/main_msaccucmp.sh +232 -0
- msprobe/visualization/__init__.py +12 -11
- msprobe/visualization/builder/__init__.py +12 -11
- msprobe/visualization/builder/graph_builder.py +45 -30
- msprobe/visualization/builder/graph_merger.py +53 -32
- msprobe/visualization/builder/msprobe_adapter.py +34 -44
- msprobe/visualization/compare/__init__.py +12 -11
- msprobe/visualization/compare/graph_comparator.py +63 -51
- msprobe/visualization/compare/mode_adapter.py +28 -113
- msprobe/visualization/db_utils.py +133 -22
- msprobe/visualization/graph/__init__.py +12 -11
- msprobe/visualization/graph/base_node.py +15 -27
- msprobe/visualization/graph/distributed_analyzer.py +97 -40
- msprobe/visualization/graph/graph.py +14 -16
- msprobe/visualization/graph/node_colors.py +34 -31
- msprobe/visualization/graph/node_op.py +12 -11
- msprobe/visualization/graph_service.py +580 -205
- msprobe/visualization/utils.py +278 -31
- tb_graph_ascend/secure_build.py +175 -0
- tb_graph_ascend/server/__init__.py +15 -0
- tb_graph_ascend/server/app/__init__.py +15 -0
- tb_graph_ascend/server/app/model/__init__.py +15 -0
- tb_graph_ascend/server/app/model/hierarchy.py +348 -0
- tb_graph_ascend/server/app/model/layout_hierarchy_model.py +69 -0
- tb_graph_ascend/server/app/model/match_nodes_model.py +573 -0
- tb_graph_ascend/server/app/repositories/__init__.py +15 -0
- tb_graph_ascend/server/app/repositories/graph_repo_base.py +32 -0
- tb_graph_ascend/server/app/repositories/graph_repo_db.py +879 -0
- tb_graph_ascend/server/app/repositories/graph_repo_vis.py +83 -0
- tb_graph_ascend/server/app/service/__init__.py +18 -0
- tb_graph_ascend/server/app/service/graph_service_base.py +158 -0
- tb_graph_ascend/server/app/service/graph_service_db.py +438 -0
- tb_graph_ascend/server/app/service/graph_service_factory.py +54 -0
- tb_graph_ascend/server/app/service/graph_service_vis.py +480 -0
- tb_graph_ascend/server/app/utils/__init__.py +15 -0
- tb_graph_ascend/server/app/utils/constant.py +80 -0
- tb_graph_ascend/server/app/utils/file_check_wrapper.py +46 -0
- tb_graph_ascend/server/app/utils/global_state.py +95 -0
- tb_graph_ascend/server/app/utils/graph_utils.py +661 -0
- tb_graph_ascend/server/app/utils/i18n.py +153 -0
- tb_graph_ascend/server/app/utils/request_method.py +46 -0
- tb_graph_ascend/server/app/views/__init__.py +15 -0
- tb_graph_ascend/server/app/views/graph_views.py +304 -0
- tb_graph_ascend/server/plugin.py +108 -0
- tb_graph_ascend/server/static/index.html +9250 -0
- tb_graph_ascend/server/static/index.js +21 -0
- tb_graph_ascend/setup.py +57 -0
- mindstudio_probe-8.3.2.dist-info/LICENSE +0 -201
- mindstudio_probe-8.3.2.dist-info/RECORD +0 -491
- mindstudio_probe-8.3.2.dist-info/entry_points.txt +0 -2
- mindstudio_probe-8.3.2.dist-info/top_level.txt +0 -1
- msprobe/CMakeLists.txt +0 -5
- msprobe/README.md +0 -203
- msprobe/core/advisor/advisor.py +0 -129
- msprobe/core/advisor/advisor_const.py +0 -58
- msprobe/core/advisor/advisor_result.py +0 -58
- msprobe/core/compare/find_first/data_processor.py +0 -35
- msprobe/core/compare/highlight.py +0 -390
- msprobe/core/data_dump/data_collector.py +0 -356
- msprobe/core/grad_probe/constant.py +0 -90
- msprobe/core/grad_probe/grad_compare.py +0 -187
- msprobe/core/grad_probe/utils.py +0 -105
- msprobe/core/kernel_dump/kernel_config.py +0 -33
- msprobe/docs/01.installation.md +0 -250
- msprobe/docs/02.config_introduction.md +0 -221
- msprobe/docs/03.config_examples.md +0 -281
- msprobe/docs/04.kernel_dump_PyTorch.md +0 -73
- msprobe/docs/05.data_dump_PyTorch.md +0 -518
- msprobe/docs/06.data_dump_MindSpore.md +0 -618
- msprobe/docs/07.accuracy_checker_PyTorch.md +0 -310
- msprobe/docs/09.accuracy_checker_MindSpore.md +0 -120
- msprobe/docs/10.accuracy_compare_PyTorch.md +0 -637
- msprobe/docs/11.accuracy_compare_MindSpore.md +0 -769
- msprobe/docs/12.overflow_check_PyTorch.md +0 -82
- msprobe/docs/13.overflow_check_MindSpore.md +0 -33
- msprobe/docs/14.data_parse_PyTorch.md +0 -282
- msprobe/docs/15.free_benchmarking_PyTorch.md +0 -169
- msprobe/docs/16.free_benchmarking_MindSpore.md +0 -159
- msprobe/docs/17.grad_probe.md +0 -205
- msprobe/docs/18.online_dispatch.md +0 -89
- msprobe/docs/19.monitor.md +0 -753
- msprobe/docs/20.monitor_performance_baseline.md +0 -52
- msprobe/docs/21.visualization_PyTorch.md +0 -519
- msprobe/docs/22.visualization_MindSpore.md +0 -515
- msprobe/docs/23.generate_operator_PyTorch.md +0 -107
- msprobe/docs/24.code_mapping_Mindspore.md +0 -29
- msprobe/docs/25.tool_function_introduction.md +0 -29
- msprobe/docs/26.data_dump_PyTorch_baseline.md +0 -48
- msprobe/docs/27.dump_json_instruction.md +0 -795
- msprobe/docs/28.debugger_save_instruction.md +0 -288
- msprobe/docs/28.kernel_dump_MindSpore.md +0 -69
- msprobe/docs/29.data_dump_MSAdapter.md +0 -235
- msprobe/docs/30.overflow_check_MSAdapter.md +0 -31
- msprobe/docs/31.config_check.md +0 -107
- msprobe/docs/32.ckpt_compare.md +0 -69
- msprobe/docs/33.generate_operator_MindSpore.md +0 -181
- msprobe/docs/34.RL_collect.md +0 -101
- msprobe/docs/35.nan_analyze.md +0 -73
- msprobe/docs/36.calculation_result_change.md +0 -75
- msprobe/docs/FAQ.md +0 -232
- msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +0 -146
- msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +0 -14
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +0 -33
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +0 -217
- msprobe/docs/img/BLOOM-7B_1.png +0 -0
- msprobe/docs/img/BLOOM-7B_2.png +0 -0
- msprobe/docs/img/BLOOM-7B_3.png +0 -0
- msprobe/docs/img/BLOOM-7B_4.png +0 -0
- msprobe/docs/img/GPT-3_1.png +0 -0
- msprobe/docs/img/GPT-3_2.png +0 -0
- msprobe/docs/img/GPT-3_3.png +0 -0
- msprobe/docs/img/GPT-3_4.png +0 -0
- msprobe/docs/img/GPT-3_5.png +0 -0
- msprobe/docs/img/GPT-3_6.png +0 -0
- msprobe/docs/img/GPT-3_7.png +0 -0
- msprobe/docs/img/GPT-3_8.png +0 -0
- msprobe/docs/img/YOLOV5S_1.png +0 -0
- msprobe/docs/img/YOLOV5S_2.png +0 -0
- msprobe/docs/img/accuracy_checking_details.png +0 -0
- msprobe/docs/img/accuracy_checking_result.png +0 -0
- msprobe/docs/img/api_precision_compare_details.png +0 -0
- msprobe/docs/img/api_precision_compare_result.png +0 -0
- msprobe/docs/img/auto_analyze_log.png +0 -0
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/compare_result_pkl.png +0 -0
- msprobe/docs/img/compare_result_pkl_md5.png.png +0 -0
- msprobe/docs/img/cpu_info.png +0 -0
- msprobe/docs/img/free_benchmark.png +0 -0
- msprobe/docs/img/free_benchmark_framework.png +0 -0
- msprobe/docs/img/grad_probe_image-1.png +0 -0
- msprobe/docs/img/grad_probe_image-2.png +0 -0
- msprobe/docs/img/grad_probe_image-3.png +0 -0
- msprobe/docs/img/grad_probe_image-4.png +0 -0
- msprobe/docs/img/grad_probe_image.png +0 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/module_compare.png +0 -0
- msprobe/docs/img/monitor/cpu_info.png +0 -0
- msprobe/docs/img/monitor/step_count_per_record.png +0 -0
- msprobe/docs/img/ms_dump.png +0 -0
- msprobe/docs/img/ms_layer.png +0 -0
- msprobe/docs/img/pt_dump.png +0 -0
- msprobe/docs/img/save_compare_result_sample.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
- msprobe/docs/img/visualization/proxy.png +0 -0
- msprobe/docs/img/visualization/tensorboard_1.png +0 -0
- msprobe/docs/img/visualization/tensorboard_2.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_browser_2.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/docs/visualization/GPTModel.png +0 -0
- msprobe/docs/visualization/ParallelMLP.png +0 -0
- msprobe/docs/visualization/layer_mapping_example.md +0 -132
- msprobe/docs/visualization/mapping.png +0 -0
- msprobe/docs/visualization/mapping1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +0 -59
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +0 -80
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +0 -330
- msprobe/docs/visualization/module_name.png +0 -0
- msprobe/docs/visualization/module_name1.png +0 -0
- msprobe/docs/visualization/no_mapping.png +0 -0
- msprobe/docs/visualization/no_mapping1.png +0 -0
- msprobe/docs/visualization/no_mapping_analyze.png +0 -0
- msprobe/docs/visualization/top_layer.png +0 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +0 -460
- msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +0 -2081
- msprobe/mindspore/code_mapping/bind.py +0 -283
- msprobe/mindspore/code_mapping/cmd_parser.py +0 -40
- msprobe/mindspore/code_mapping/graph.py +0 -49
- msprobe/mindspore/code_mapping/graph_parser.py +0 -211
- msprobe/mindspore/code_mapping/main.py +0 -24
- msprobe/mindspore/code_mapping/processor.py +0 -34
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +0 -111
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -52
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +0 -257
- msprobe/mindspore/free_benchmark/common/config.py +0 -27
- msprobe/mindspore/free_benchmark/common/handler_params.py +0 -31
- msprobe/mindspore/free_benchmark/common/utils.py +0 -100
- msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -638
- msprobe/mindspore/free_benchmark/handler/base_handler.py +0 -105
- msprobe/mindspore/free_benchmark/handler/check_handler.py +0 -55
- msprobe/mindspore/free_benchmark/handler/fix_handler.py +0 -51
- msprobe/mindspore/free_benchmark/handler/handler_factory.py +0 -36
- msprobe/mindspore/free_benchmark/perturbation/add_noise.py +0 -82
- msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +0 -45
- msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +0 -78
- msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +0 -77
- msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +0 -56
- msprobe/mindspore/free_benchmark/perturbation/no_change.py +0 -27
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +0 -46
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +0 -51
- msprobe/mindspore/grad_probe/global_context.py +0 -127
- msprobe/mindspore/grad_probe/grad_analyzer.py +0 -260
- msprobe/mindspore/grad_probe/grad_monitor.py +0 -42
- msprobe/mindspore/grad_probe/grad_stat_csv.py +0 -161
- msprobe/mindspore/grad_probe/hook.py +0 -115
- msprobe/mindspore/grad_probe/utils.py +0 -43
- msprobe/mindspore/mindtorch/__init__.py +0 -18
- msprobe/mindspore/ms_config.py +0 -153
- msprobe/mindspore/task_handler_factory.py +0 -44
- msprobe/nan_analyze/__init__.py +0 -14
- msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +0 -9
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +0 -480
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +0 -567
- msprobe/pytorch/debugger/precision_debugger.py +0 -181
- msprobe/pytorch/free_benchmark/__init__.py +0 -23
- msprobe/pytorch/free_benchmark/common/constant.py +0 -85
- msprobe/pytorch/free_benchmark/common/counter.py +0 -87
- msprobe/pytorch/free_benchmark/common/enums.py +0 -80
- msprobe/pytorch/free_benchmark/common/params.py +0 -152
- msprobe/pytorch/free_benchmark/common/utils.py +0 -143
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -215
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +0 -121
- msprobe/pytorch/free_benchmark/main.py +0 -123
- msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +0 -28
- msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +0 -56
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +0 -107
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +0 -121
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +0 -89
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +0 -87
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +0 -43
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +0 -60
- msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +0 -34
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +0 -252
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +0 -54
- msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +0 -40
- msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -45
- msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -181
- msprobe/pytorch/grad_probe/__init__.py +0 -0
- msprobe/pytorch/grad_probe/grad_monitor.py +0 -108
- msprobe/pytorch/grad_probe/grad_stat_csv.py +0 -160
- msprobe/pytorch/hook_module/__init__.py +0 -16
- msprobe/pytorch/hook_module/wrap_aten.py +0 -111
- msprobe/pytorch/online_dispatch/__init__.py +0 -19
- msprobe/pytorch/online_dispatch/compare.py +0 -224
- msprobe/pytorch/online_dispatch/dispatch.py +0 -332
- msprobe/pytorch/online_dispatch/dump_compare.py +0 -179
- msprobe/pytorch/online_dispatch/single_compare.py +0 -412
- msprobe/pytorch/online_dispatch/torch_ops_config.yaml +0 -58
- msprobe/pytorch/online_dispatch/utils.py +0 -158
- msprobe/pytorch/parse_tool/__init__.py +0 -0
- msprobe/pytorch/parse_tool/cli.py +0 -31
- msprobe/pytorch/parse_tool/lib/__init__.py +0 -0
- msprobe/pytorch/parse_tool/lib/compare.py +0 -253
- msprobe/pytorch/parse_tool/lib/config.py +0 -50
- msprobe/pytorch/parse_tool/lib/file_desc.py +0 -45
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +0 -97
- msprobe/pytorch/parse_tool/lib/parse_exception.py +0 -54
- msprobe/pytorch/parse_tool/lib/parse_tool.py +0 -161
- msprobe/pytorch/parse_tool/lib/utils.py +0 -299
- msprobe/pytorch/parse_tool/lib/visualization.py +0 -85
- msprobe/pytorch/pt_config.py +0 -299
- /msprobe/core/{grad_probe → dump}/__init__.py +0 -0
- /msprobe/{mindspore/code_mapping → core/dump/api_dump}/__init__.py +0 -0
- /msprobe/{mindspore/debugger → core/dump/data_dump}/__init__.py +0 -0
- /msprobe/{mindspore/exception_dump → core/dump/data_dump/data_processor}/__init__.py +0 -0
- /msprobe/{mindspore/free_benchmark → core/dump/debugger}/__init__.py +0 -0
- /msprobe/{mindspore/free_benchmark/common → core/dump/kernel_dump}/__init__.py +0 -0
- /msprobe/mindspore/{free_benchmark/handler → dump/debugger}/__init__.py +0 -0
- /msprobe/mindspore/{grad_probe → dump/dump_processor}/__init__.py +0 -0
- /msprobe/mindspore/{overflow_check → dump/exception_dump}/__init__.py +0 -0
- /msprobe/mindspore/{mindtorch → dump/mindtorch}/mindtorch_adaptor.py +0 -0
- /msprobe/{pytorch/api_accuracy_checker/run_ut → mindspore/dump/overflow_check}/__init__.py +0 -0
- /msprobe/{pytorch/debugger → mindspore/monitor}/__init__.py +0 -0
- /msprobe/{pytorch/free_benchmark/common → msaccucmp}/__init__.py +0 -0
- /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/.keep +0 -0
- /msprobe/pytorch/{free_benchmark/perturbed_layers → api_accuracy_checker/acc_check}/__init__.py +0 -0
- /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/torch_ut_setting.json +0 -0
- /msprobe/pytorch/{free_benchmark/perturbed_layers/npu → dump/api_dump}/__init__.py +0 -0
- /msprobe/pytorch/{hook_module → dump/api_dump}/support_wrap_ops.yaml +0 -0
- /msprobe/pytorch/{free_benchmark/result_handlers → dump/debugger}/__init__.py +0 -0
|
@@ -1,27 +1,29 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
import os
|
|
17
18
|
import sqlite3
|
|
18
19
|
import json
|
|
19
20
|
import re
|
|
20
21
|
import time
|
|
22
|
+
from typing import Dict
|
|
21
23
|
from msprobe.core.common.log import logger
|
|
22
24
|
from msprobe.core.common.file_utils import change_mode, check_path_before_create, FileChecker
|
|
23
|
-
from msprobe.core.common.const import FileCheckConst
|
|
24
|
-
from msprobe.visualization.utils import GraphConst
|
|
25
|
+
from msprobe.core.common.const import FileCheckConst, Const
|
|
26
|
+
from msprobe.visualization.utils import GraphConst, update_shared_dict, update_pbar_info, post_process_db_pbar
|
|
25
27
|
from msprobe.visualization.builder.msprobe_adapter import format_node_data
|
|
26
28
|
|
|
27
29
|
TEXT_PRIMARY_KEY = 'TEXT PRIMARY KEY'
|
|
@@ -29,6 +31,7 @@ TEXT_NOT_NULL = 'TEXT NOT NULL'
|
|
|
29
31
|
INTEGER_NOT_NULL = 'INTEGER NOT NULL'
|
|
30
32
|
TEXT = 'TEXT'
|
|
31
33
|
INTEGER = 'INTEGER'
|
|
34
|
+
DB_INSERT_SIZE = 1000
|
|
32
35
|
|
|
33
36
|
node_columns = {
|
|
34
37
|
'id': TEXT_PRIMARY_KEY,
|
|
@@ -51,7 +54,8 @@ node_columns = {
|
|
|
51
54
|
'data_source': TEXT,
|
|
52
55
|
'dump_data_dir': TEXT,
|
|
53
56
|
'step': INTEGER_NOT_NULL,
|
|
54
|
-
'rank': INTEGER_NOT_NULL
|
|
57
|
+
'rank': INTEGER_NOT_NULL,
|
|
58
|
+
'is_distributed': INTEGER
|
|
55
59
|
}
|
|
56
60
|
|
|
57
61
|
config_columns = {
|
|
@@ -77,7 +81,8 @@ indexes = {
|
|
|
77
81
|
"index3": ["step", "rank", "data_source", "node_order"],
|
|
78
82
|
"index4": ["step", "rank", "node_order"],
|
|
79
83
|
"index5": ["step", "rank", "micro_step_id", "node_order"],
|
|
80
|
-
"index6": ["step", "rank", "modified", "matched_node_link"]
|
|
84
|
+
"index6": ["step", "rank", "modified", "matched_node_link"],
|
|
85
|
+
"index7": ["is_distributed"]
|
|
81
86
|
}
|
|
82
87
|
|
|
83
88
|
SAFE_NAME_PATTERN = re.compile(r'^[a-zA-Z0-9_]+$')
|
|
@@ -133,7 +138,7 @@ def create_insert_sql_from_dict(table_name, columns_dict, ignore_insert=False):
|
|
|
133
138
|
return sql
|
|
134
139
|
|
|
135
140
|
|
|
136
|
-
def to_db(db_path, create_table_sql, insert_sql, data,
|
|
141
|
+
def to_db(db_path, create_table_sql, insert_sql, data, pbar_info=None):
|
|
137
142
|
max_retries = 10
|
|
138
143
|
initial_delay = 0.1
|
|
139
144
|
if not os.path.exists(db_path):
|
|
@@ -144,6 +149,11 @@ def to_db(db_path, create_table_sql, insert_sql, data, db_insert_size=1000):
|
|
|
144
149
|
|
|
145
150
|
retry_count = 0
|
|
146
151
|
current_delay = initial_delay
|
|
152
|
+
total_data = len(data)
|
|
153
|
+
commited_data = 0
|
|
154
|
+
|
|
155
|
+
if pbar_info:
|
|
156
|
+
update_shared_dict(pbar_info.current_stage_dict, pbar_info.task_id, 1)
|
|
147
157
|
|
|
148
158
|
while retry_count <= max_retries:
|
|
149
159
|
conn = None
|
|
@@ -154,9 +164,13 @@ def to_db(db_path, create_table_sql, insert_sql, data, db_insert_size=1000):
|
|
|
154
164
|
cursor.execute("PRAGMA journal_mode=WAL")
|
|
155
165
|
cursor.execute("PRAGMA synchronous=NORMAL")
|
|
156
166
|
cursor.execute(create_table_sql)
|
|
157
|
-
for i in range(0,
|
|
158
|
-
batch = data[i:i +
|
|
167
|
+
for i in range(0, total_data, DB_INSERT_SIZE):
|
|
168
|
+
batch = data[i:i + DB_INSERT_SIZE]
|
|
159
169
|
cursor.executemany(insert_sql, batch)
|
|
170
|
+
|
|
171
|
+
if pbar_info:
|
|
172
|
+
commited_data += len(batch)
|
|
173
|
+
update_pbar_info(pbar_info, commited_data, total_data)
|
|
160
174
|
conn.commit()
|
|
161
175
|
return
|
|
162
176
|
except sqlite3.OperationalError as e:
|
|
@@ -217,12 +231,16 @@ def add_table_index(db_path):
|
|
|
217
231
|
conn.close()
|
|
218
232
|
|
|
219
233
|
|
|
220
|
-
def post_process_db(db_path):
|
|
234
|
+
def post_process_db(db_path, pbar_info=None, is_parallel_merge=False):
|
|
235
|
+
logger.info('Start adding index to db file, please wait...')
|
|
221
236
|
add_table_index(db_path)
|
|
222
237
|
change_mode(db_path, FileCheckConst.DATA_FILE_AUTHORITY)
|
|
238
|
+
if pbar_info:
|
|
239
|
+
post_process_db_pbar(pbar_info, is_parallel_merge)
|
|
240
|
+
logger.info('Adding index to db file completed.')
|
|
223
241
|
|
|
224
242
|
|
|
225
|
-
def node_to_db(graph, db_name):
|
|
243
|
+
def node_to_db(graph, db_name, pbar_info=None):
|
|
226
244
|
create_table_sql = create_table_sql_from_dict('tb_nodes', node_columns)
|
|
227
245
|
insert_sql = create_insert_sql_from_dict('tb_nodes', node_columns)
|
|
228
246
|
data = []
|
|
@@ -241,8 +259,9 @@ def node_to_db(graph, db_name):
|
|
|
241
259
|
json.dumps(node.matched_distributed), 0,
|
|
242
260
|
json.dumps(format_node_data(node.input_data, node.id, graph.compare_mode)),
|
|
243
261
|
json.dumps(format_node_data(node.output_data, node.id, graph.compare_mode)),
|
|
244
|
-
graph.data_source, graph.data_path, graph.step, graph.rank
|
|
245
|
-
|
|
262
|
+
graph.data_source, graph.data_path, graph.step, graph.rank,
|
|
263
|
+
1 if node.id.startswith(Const.DISTRIBUTED) else 0))
|
|
264
|
+
to_db(db_name, create_table_sql, insert_sql, data, pbar_info=pbar_info)
|
|
246
265
|
stack_to_db(stack_dict, db_name)
|
|
247
266
|
|
|
248
267
|
|
|
@@ -274,3 +293,95 @@ def get_node_unique_id(graph, node):
|
|
|
274
293
|
|
|
275
294
|
def get_stack_unique_id(graph, stack_dict):
|
|
276
295
|
return f'{get_graph_unique_id(graph)}_{len(stack_dict)}'
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
class DBActuator:
|
|
299
|
+
def __init__(self, db_path):
|
|
300
|
+
self.db_path = db_path
|
|
301
|
+
self._init_connection()
|
|
302
|
+
|
|
303
|
+
def __enter__(self):
|
|
304
|
+
return self
|
|
305
|
+
|
|
306
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
307
|
+
if self.conn:
|
|
308
|
+
try:
|
|
309
|
+
self.conn.close()
|
|
310
|
+
logger.info(f"Database connection to {self.db_path} closed successfully")
|
|
311
|
+
except sqlite3.Error as e:
|
|
312
|
+
logger.error(f"Failed to close database connection: {e}")
|
|
313
|
+
return False
|
|
314
|
+
|
|
315
|
+
def query_distributed_nodes_info(self):
|
|
316
|
+
query = """
|
|
317
|
+
SELECT
|
|
318
|
+
node_name,
|
|
319
|
+
input_data,
|
|
320
|
+
rank,
|
|
321
|
+
step,
|
|
322
|
+
data_source,
|
|
323
|
+
precision_index,
|
|
324
|
+
overflow_level
|
|
325
|
+
FROM
|
|
326
|
+
tb_nodes
|
|
327
|
+
WHERE
|
|
328
|
+
is_distributed = 1
|
|
329
|
+
"""
|
|
330
|
+
if not self.conn:
|
|
331
|
+
logger.warning("Database connection is not initialized.")
|
|
332
|
+
return []
|
|
333
|
+
try:
|
|
334
|
+
with self.conn as conn:
|
|
335
|
+
cursor = conn.execute(query)
|
|
336
|
+
rows = cursor.fetchall()
|
|
337
|
+
cursor.close()
|
|
338
|
+
|
|
339
|
+
return rows
|
|
340
|
+
except Exception as e:
|
|
341
|
+
logger.error(f'Failed to query distributed nodes: {e}')
|
|
342
|
+
return []
|
|
343
|
+
|
|
344
|
+
def update_matched_distributed(self, distributed_info: Dict[int, dict], step=0,
|
|
345
|
+
data_source=GraphConst.JSON_NPU_KEY, batch_size=1000):
|
|
346
|
+
update_data_list = []
|
|
347
|
+
update_sql = """
|
|
348
|
+
UPDATE tb_nodes
|
|
349
|
+
SET matched_distributed = ?
|
|
350
|
+
WHERE step = ? AND rank = ? AND data_source = ? AND node_name = ?;
|
|
351
|
+
|
|
352
|
+
"""
|
|
353
|
+
if not self.conn:
|
|
354
|
+
logger.warning("Database connection is not initialized.")
|
|
355
|
+
return
|
|
356
|
+
for rank, nodes_dict in distributed_info.items():
|
|
357
|
+
for node in nodes_dict.values():
|
|
358
|
+
if not node.matched_distributed:
|
|
359
|
+
continue
|
|
360
|
+
try:
|
|
361
|
+
|
|
362
|
+
matched_distributed = json.dumps(node.matched_distributed)
|
|
363
|
+
except Exception as e:
|
|
364
|
+
logger.warning(f"Failed to serialize matched_distributed for node {node.id}: {e}")
|
|
365
|
+
continue
|
|
366
|
+
update_data_list.append((matched_distributed, step, rank, data_source, node.id))
|
|
367
|
+
|
|
368
|
+
try:
|
|
369
|
+
with self.conn as conn:
|
|
370
|
+
cursor = conn.cursor()
|
|
371
|
+
for i in range(0, len(update_data_list), batch_size):
|
|
372
|
+
batch_data = update_data_list[i:i + batch_size]
|
|
373
|
+
cursor.executemany(update_sql, batch_data)
|
|
374
|
+
cursor.close()
|
|
375
|
+
|
|
376
|
+
except Exception as e:
|
|
377
|
+
logger.error(f"Failed to update matched_distributed: {e}")
|
|
378
|
+
raise
|
|
379
|
+
|
|
380
|
+
def _init_connection(self):
|
|
381
|
+
FileChecker(self.db_path, FileCheckConst.FILE, FileCheckConst.READ_WRITE_ABLE,
|
|
382
|
+
FileCheckConst.DB_SUFFIX).common_check()
|
|
383
|
+
try:
|
|
384
|
+
self.conn = sqlite3.connect(self.db_path)
|
|
385
|
+
except sqlite3.Error as e:
|
|
386
|
+
logger.error(f"Failed to connect to database: {e}")
|
|
387
|
+
raise e
|
|
@@ -1,14 +1,15 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
from msprobe.core.overflow_check.level import OverflowLevel
|
|
17
18
|
from msprobe.visualization.utils import GraphConst
|
|
18
|
-
from msprobe.visualization.builder.msprobe_adapter import format_node_data, compare_data, compare_data_fuzzy
|
|
19
19
|
from msprobe.core.common.log import logger
|
|
20
20
|
|
|
21
21
|
|
|
@@ -44,21 +44,9 @@ class BaseNode:
|
|
|
44
44
|
return info
|
|
45
45
|
|
|
46
46
|
def __eq__(self, other):
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
if not compare_data(self.input_data, other.input_data):
|
|
51
|
-
return False
|
|
52
|
-
if not compare_data(self.output_data, other.output_data):
|
|
53
|
-
return False
|
|
54
|
-
return True
|
|
55
|
-
|
|
56
|
-
def fuzzy_eq(self, other):
|
|
57
|
-
if not compare_data_fuzzy(self.input_data, other.input_data):
|
|
58
|
-
return False
|
|
59
|
-
if not compare_data_fuzzy(self.output_data, other.output_data):
|
|
60
|
-
return False
|
|
61
|
-
return True
|
|
47
|
+
if self.id == other.id:
|
|
48
|
+
return True
|
|
49
|
+
return False
|
|
62
50
|
|
|
63
51
|
def set_input_output(self, input_data, output_data):
|
|
64
52
|
self.input_data = input_data
|
|
@@ -1,21 +1,27 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
import json
|
|
15
18
|
from enum import Enum
|
|
16
|
-
from msprobe.visualization.utils import GraphConst
|
|
19
|
+
from msprobe.visualization.utils import GraphConst, post_process_db_pbar
|
|
17
20
|
from msprobe.core.common.const import Const, CompareConst
|
|
18
21
|
from msprobe.core.common.log import logger
|
|
22
|
+
from msprobe.visualization.db_utils import DBActuator
|
|
23
|
+
from msprobe.visualization.graph.base_node import BaseNode
|
|
24
|
+
from msprobe.visualization.graph.node_op import NodeOp
|
|
19
25
|
|
|
20
26
|
|
|
21
27
|
class CommunicationType(Enum):
|
|
@@ -40,8 +46,7 @@ CANNOT_MATCH = 'cannot match distributed node in rank'
|
|
|
40
46
|
|
|
41
47
|
class DistributedAnalyzer:
|
|
42
48
|
|
|
43
|
-
def __init__(self,
|
|
44
|
-
self.graphs = graphs
|
|
49
|
+
def __init__(self, distributed_info: dict, overflow_check: bool):
|
|
45
50
|
self.overflow_check = overflow_check
|
|
46
51
|
self.config = {
|
|
47
52
|
# 当前通信api名称: 匹配目标通信api名称, 获取rank信息的位置参数或关键字参数, 通信类型, 分布式类型
|
|
@@ -55,6 +60,7 @@ class DistributedAnalyzer:
|
|
|
55
60
|
'reduce': ['reduce', '1', CommunicationType.RECEIVE.value, DistributedType.COLLECTIVE]
|
|
56
61
|
}
|
|
57
62
|
self.group_node_mapping = {}
|
|
63
|
+
self.distributed_info = distributed_info
|
|
58
64
|
self._make_group_node_mapping()
|
|
59
65
|
|
|
60
66
|
@staticmethod
|
|
@@ -101,18 +107,30 @@ class DistributedAnalyzer:
|
|
|
101
107
|
if not group_ranks:
|
|
102
108
|
logger.debug(f'The group_ranks of node {node.id} does not exist, {CANNOT_MATCH}{rank}')
|
|
103
109
|
return None, None
|
|
110
|
+
group_ranks = DistributedAnalyzer._check_and_convert_group_ranks(group_ranks)
|
|
111
|
+
if not group_ranks:
|
|
112
|
+
return None, None
|
|
104
113
|
group_id = group.get('group_id')
|
|
105
114
|
if not group_id:
|
|
106
115
|
logger.debug(f'The group_id of node {node.id} does not exist, {CANNOT_MATCH}{rank}')
|
|
107
116
|
return None, None
|
|
108
117
|
return group_ranks, group_id
|
|
109
118
|
|
|
119
|
+
@staticmethod
|
|
120
|
+
def _check_and_convert_group_ranks(group_ranks):
|
|
121
|
+
if not isinstance(group_ranks, list):
|
|
122
|
+
try:
|
|
123
|
+
group_ranks = json.loads(group_ranks)
|
|
124
|
+
except json.decoder.JSONDecodeError as e:
|
|
125
|
+
logger.debug(f'Error occurred, group_rank cannot be converted to list: {e}')
|
|
126
|
+
return []
|
|
127
|
+
return group_ranks
|
|
128
|
+
|
|
110
129
|
def distributed_match(self):
|
|
111
|
-
for rank,
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
if not node_id.startswith(Const.DISTRIBUTED) or node.matched_distributed:
|
|
130
|
+
for rank, nodes_dict in self.distributed_info.items():
|
|
131
|
+
for node_id, node in nodes_dict.items():
|
|
132
|
+
# 已经匹配过了
|
|
133
|
+
if node.matched_distributed:
|
|
116
134
|
continue
|
|
117
135
|
api_name, distributed_type = self._get_distributed_name_and_type(node_id)
|
|
118
136
|
if api_name == GraphConst.BATCH_P2P:
|
|
@@ -137,14 +155,11 @@ class DistributedAnalyzer:
|
|
|
137
155
|
"2": {}
|
|
138
156
|
}
|
|
139
157
|
"""
|
|
140
|
-
for rank,
|
|
158
|
+
for rank, nodes_dict in self.distributed_info.items():
|
|
141
159
|
group_count = {}
|
|
142
160
|
group_info = {}
|
|
143
161
|
batch_p2p_count = {}
|
|
144
|
-
|
|
145
|
-
for node_id, node in nodes.items():
|
|
146
|
-
if not node_id.startswith(Const.DISTRIBUTED):
|
|
147
|
-
continue
|
|
162
|
+
for node_id, node in nodes_dict.items():
|
|
148
163
|
api_name, distributed_type = self._get_distributed_name_and_type(node_id)
|
|
149
164
|
if api_name == GraphConst.BATCH_P2P:
|
|
150
165
|
self._make_batch_p2p_mapping(node, rank, batch_p2p_count)
|
|
@@ -213,18 +228,18 @@ class DistributedAnalyzer:
|
|
|
213
228
|
:param target_api_name: 与当前节点产生通信的节点api名称, 仅p2p通信需要配置
|
|
214
229
|
:return: 目标节点
|
|
215
230
|
"""
|
|
216
|
-
|
|
217
|
-
if not
|
|
218
|
-
logger.debug(f'
|
|
231
|
+
target_nodes_dict = self.distributed_info.get(str(target_rank))
|
|
232
|
+
if not target_nodes_dict:
|
|
233
|
+
logger.debug(f'Node data does not exist, {CANNOT_MATCH}{target_rank}')
|
|
219
234
|
return None
|
|
220
|
-
target_group_mapping = self.group_node_mapping.get(target_rank)
|
|
235
|
+
target_group_mapping = self.group_node_mapping.get(str(target_rank))
|
|
221
236
|
# p2p通信,想要获取目标节点,需要替换unique_group_id中的rank和api name,
|
|
222
237
|
# 例如isend发送到rank1,对应的irecv接收自rank0, isend_rank1与irecv_rank0对应
|
|
223
238
|
target_unique_group_id = (unique_group_id
|
|
224
239
|
.replace(Const.RANK + str(target_rank), Const.RANK + str(rank))
|
|
225
240
|
.replace(api_name, target_api_name)) if target_api_name else unique_group_id
|
|
226
241
|
target_node_id = target_group_mapping.get(target_unique_group_id, '')
|
|
227
|
-
target_node =
|
|
242
|
+
target_node = target_nodes_dict.get(target_node_id)
|
|
228
243
|
if not target_node:
|
|
229
244
|
logger.debug(f'Node {target_node_id} does not exist, {CANNOT_MATCH}{target_rank}')
|
|
230
245
|
return None
|
|
@@ -244,6 +259,7 @@ class DistributedAnalyzer:
|
|
|
244
259
|
else communications_type
|
|
245
260
|
index = target_node.data.get(GraphConst.OVERFLOW_LEVEL, CompareConst.NAN) if self.overflow_check \
|
|
246
261
|
else target_node.data.get(GraphConst.JSON_INDEX_KEY, CompareConst.NAN)
|
|
262
|
+
index = CompareConst.NAN if index is None else index
|
|
247
263
|
matched_distributed = {
|
|
248
264
|
'communications_type': communications_type,
|
|
249
265
|
'nodes_info': {target_rank: [str(index), target_node.id]}
|
|
@@ -264,11 +280,10 @@ class DistributedAnalyzer:
|
|
|
264
280
|
"""
|
|
265
281
|
config_info = self.config.get(api_name)
|
|
266
282
|
target_api_name = config_info[0]
|
|
267
|
-
#
|
|
268
283
|
target_rank = self._get_target_rank(node, rank, config_info[1])
|
|
269
284
|
if target_rank is None:
|
|
270
285
|
return
|
|
271
|
-
unique_group_id = self.group_node_mapping.get(rank, {}).get(node.id, '')
|
|
286
|
+
unique_group_id = self.group_node_mapping.get(str(rank), {}).get(node.id, '')
|
|
272
287
|
target_node = self._get_target_node(rank, unique_group_id, api_name, target_rank, target_api_name)
|
|
273
288
|
if not target_node:
|
|
274
289
|
return
|
|
@@ -288,13 +303,6 @@ class DistributedAnalyzer:
|
|
|
288
303
|
f'The rank is inconsistent, cannot match distributed node')
|
|
289
304
|
return
|
|
290
305
|
|
|
291
|
-
# 点对点通信,两个匹配节点的输出数据要一致
|
|
292
|
-
if not DistributedAnalyzer._node_output_all_equal(node.output_data.get(node.id + '.output.0'),
|
|
293
|
-
target_node.output_data.get(target_node.id + '.output.0')):
|
|
294
|
-
logger.debug(f'{node.id} output of rank{rank} is different from the {target_node.id} '
|
|
295
|
-
f'output of rank{target_rank}, cannot match distributed node')
|
|
296
|
-
return
|
|
297
|
-
|
|
298
306
|
self._add_node_matched_distributed(node, target_node, api_name, target_rank)
|
|
299
307
|
self._add_node_matched_distributed(target_node, node, target_api_name, rank)
|
|
300
308
|
|
|
@@ -319,7 +327,10 @@ class DistributedAnalyzer:
|
|
|
319
327
|
group_ranks, group_id = self._get_group_info(node, rank)
|
|
320
328
|
if not group_ranks or not group_id:
|
|
321
329
|
return
|
|
322
|
-
|
|
330
|
+
group_ranks = DistributedAnalyzer._check_and_convert_group_ranks(group_ranks)
|
|
331
|
+
if not group_ranks:
|
|
332
|
+
return
|
|
333
|
+
unique_group_id = self.group_node_mapping.get(str(rank), {}).get(node.id, '')
|
|
323
334
|
matched_distributed = {'communications_type': communications_type}
|
|
324
335
|
nodes_info = {}
|
|
325
336
|
for target_rank in group_ranks:
|
|
@@ -340,6 +351,7 @@ class DistributedAnalyzer:
|
|
|
340
351
|
# 给当前通信节点添加matched_distributed字段信息
|
|
341
352
|
index = target_node.data.get(GraphConst.OVERFLOW_LEVEL, CompareConst.NAN) if self.overflow_check \
|
|
342
353
|
else target_node.data.get(GraphConst.JSON_INDEX_KEY, CompareConst.NAN)
|
|
354
|
+
index = CompareConst.NAN if index is None else index
|
|
343
355
|
nodes_info[target_rank] = [str(index), target_node.id]
|
|
344
356
|
if config_info:
|
|
345
357
|
# 给匹配上的目标节点也添加matched_distributed字段信息
|
|
@@ -357,7 +369,7 @@ class DistributedAnalyzer:
|
|
|
357
369
|
:param rank: 当前节点所属rank
|
|
358
370
|
:return:
|
|
359
371
|
"""
|
|
360
|
-
unique_group_ids = self.group_node_mapping.get(rank, {}).get(node.id)
|
|
372
|
+
unique_group_ids = self.group_node_mapping.get(str(rank), {}).get(node.id)
|
|
361
373
|
if not unique_group_ids:
|
|
362
374
|
return
|
|
363
375
|
matched_distributed = [] if len(unique_group_ids) > 1 else {}
|
|
@@ -376,6 +388,7 @@ class DistributedAnalyzer:
|
|
|
376
388
|
communications_type = self.config.get(api_name)[2]
|
|
377
389
|
index = target_node.data.get(GraphConst.OVERFLOW_LEVEL, CompareConst.NAN) if self.overflow_check \
|
|
378
390
|
else target_node.data.get(GraphConst.JSON_INDEX_KEY, CompareConst.NAN)
|
|
391
|
+
index = CompareConst.NAN if index is None else index
|
|
379
392
|
matched_info = {
|
|
380
393
|
'communications_type': communications_type,
|
|
381
394
|
'nodes_info': {target_rank: [str(index), target_node.id]}
|
|
@@ -384,3 +397,47 @@ class DistributedAnalyzer:
|
|
|
384
397
|
else matched_distributed.update(matched_info)
|
|
385
398
|
if matched_distributed:
|
|
386
399
|
node.matched_distributed = matched_distributed
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def distributed_analyse(db_path: str, overflow_check: bool, pbar_info=None):
    """Match distributed communication nodes stored in a database and write the result back.

    Rows are read through ``DBActuator.query_distributed_nodes_info``. Each usable
    row is turned into a ``BaseNode`` and grouped by data source, step and rank.
    For every (data source, step) group a ``DistributedAnalyzer`` is run and its
    ``distributed_info`` is persisted with ``DBActuator.update_matched_distributed``.

    :param db_path: path handed to ``DBActuator``
    :param overflow_check: forwarded unchanged to ``DistributedAnalyzer``
    :param pbar_info: optional object; when truthy it is passed to
        ``post_process_db_pbar(pbar_info, True)`` once the function is done
        (presumably progress-bar state -- confirm against the helper)
    :return: None
    """
    distributed_info_data_source = {}
    db_actuator = DBActuator(db_path)
    rows = db_actuator.query_distributed_nodes_info()
    if not rows:
        # Nothing to analyse: still notify the progress helper before leaving.
        if pbar_info:
            post_process_db_pbar(pbar_info, True)
        return

    logger.info('Start analyzing distributed nodes...')
    for row in rows:
        # Rows with fewer than the 7 columns read below are skipped.
        if len(row) < 7:
            continue
        node_id = row[0]
        input_data = row[1]
        rank = row[2]
        step = row[3]
        data_source = row[4]
        precision_index = row[5]
        overflow_level = row[6]

        node = BaseNode(NodeOp.function_api, node_id)
        node.data[GraphConst.JSON_INDEX_KEY] = precision_index
        node.data[GraphConst.OVERFLOW_LEVEL] = overflow_level
        try:
            node.input_data = json.loads(input_data)
        except (TypeError, json.JSONDecodeError) as e:
            # TypeError covers a NULL / non-string column value, which json.loads
            # rejects before any parsing; such a row is skipped like any other
            # unparsable row instead of aborting the whole analysis.
            logger.debug(f"JSON parsing of input data for node {node_id} failed: {e}, data: {input_data}")
            continue

        # Group as data_source -> step -> rank -> {node_id: node}.
        # The rank key is stored as a string.
        distributed_info_data_source.setdefault(data_source, {}) \
            .setdefault(step, {}) \
            .setdefault(str(rank), {})[node_id] = node

    for data_source, distributed_info_steps in distributed_info_data_source.items():
        for step, distributed_info in distributed_info_steps.items():
            analyzer = DistributedAnalyzer(distributed_info, overflow_check)
            analyzer.distributed_match()
            db_actuator.update_matched_distributed(analyzer.distributed_info, step, data_source)

    if pbar_info:
        post_process_db_pbar(pbar_info, True)
    logger.info('Successfully analyzed distributed nodes.')
|
|
@@ -1,17 +1,19 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
16
|
+
|
|
15
17
|
from msprobe.core.overflow_check.checker import AnomalyDetector
|
|
16
18
|
from msprobe.visualization.graph.base_node import BaseNode
|
|
17
19
|
from msprobe.visualization.graph.node_op import NodeOp
|
|
@@ -50,8 +52,6 @@ class Graph:
|
|
|
50
52
|
if not node_n or node_n.id not in graph_b.node_map:
|
|
51
53
|
return None, []
|
|
52
54
|
node_b = graph_b.node_map.get(node_n.id)
|
|
53
|
-
if node_n != node_b:
|
|
54
|
-
return None, []
|
|
55
55
|
ancestors_n = node_n.get_ancestors()
|
|
56
56
|
ancestors_b = node_b.get_ancestors()
|
|
57
57
|
if ancestors_n != ancestors_b:
|
|
@@ -71,11 +71,9 @@ class Graph:
|
|
|
71
71
|
return node_b, ancestors_n, ancestors_b
|
|
72
72
|
|
|
73
73
|
@staticmethod
|
|
74
|
-
def fuzzy_match(node_n, node_b
|
|
74
|
+
def fuzzy_match(node_n, node_b):
|
|
75
75
|
if not node_n or not node_b:
|
|
76
76
|
return None, [], []
|
|
77
|
-
if check_shape and not node_n.fuzzy_eq(node_b):
|
|
78
|
-
return None, [], []
|
|
79
77
|
ancestors_n = node_n.get_ancestors()
|
|
80
78
|
ancestors_b = node_b.get_ancestors()
|
|
81
79
|
return node_b, ancestors_n, ancestors_b
|