mindstudio-probe 8.3.2__py3-none-any.whl → 26.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/METADATA +26 -14
- mindstudio_probe-26.0.0a1.dist-info/RECORD +498 -0
- {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/WHEEL +1 -1
- mindstudio_probe-26.0.0a1.dist-info/entry_points.txt +5 -0
- mindstudio_probe-26.0.0a1.dist-info/licenses/LICENSE +124 -0
- mindstudio_probe-26.0.0a1.dist-info/top_level.txt +2 -0
- msprobe/__init__.py +12 -13
- msprobe/config.json +9 -31
- msprobe/core/__init__.py +12 -11
- msprobe/core/acc_check/acc_check_cli.py +145 -0
- msprobe/core/common/const.py +97 -38
- msprobe/core/common/db_manager.py +133 -12
- msprobe/core/common/decorator.py +12 -11
- msprobe/core/common/exceptions.py +12 -11
- msprobe/core/common/file_utils.py +101 -25
- msprobe/core/common/framework_adapter.py +36 -25
- msprobe/core/common/global_lock.py +12 -11
- msprobe/core/common/inplace_op_checker.py +12 -11
- msprobe/core/common/log.py +22 -11
- msprobe/core/common/megatron_utils.py +566 -11
- msprobe/core/common/parallel_state.py +12 -11
- msprobe/core/common/runtime.py +12 -11
- msprobe/core/common/utils.py +41 -41
- msprobe/core/compare/acc_compare.py +361 -104
- msprobe/core/compare/atb_data_compare.py +422 -0
- msprobe/core/compare/auto_compare.py +134 -0
- msprobe/core/compare/check.py +14 -17
- msprobe/core/compare/compare_cli.py +72 -149
- msprobe/core/compare/config.py +12 -13
- msprobe/core/compare/diff_analyze/first_diff_analyze.py +28 -15
- msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
- msprobe/core/compare/find_first/analyzer.py +18 -18
- msprobe/core/compare/find_first/graph.py +12 -11
- msprobe/core/compare/find_first/utils.py +13 -12
- msprobe/core/compare/indicator_analysis/__init__.py +15 -0
- msprobe/core/compare/indicator_analysis/algorithm.py +363 -0
- msprobe/core/compare/indicator_analysis/api_data.py +141 -0
- msprobe/core/compare/indicator_analysis/calculator.py +181 -0
- msprobe/core/compare/indicator_analysis/utils.py +116 -0
- msprobe/core/compare/layer_mapping/__init__.py +12 -11
- msprobe/core/compare/layer_mapping/data_scope_parser.py +20 -11
- msprobe/core/compare/layer_mapping/layer_mapping.py +14 -13
- msprobe/core/compare/layer_mapping/postprocess_pass.py +13 -11
- msprobe/core/compare/merge_result/merge_result.py +12 -11
- msprobe/core/compare/merge_result/merge_result_cli.py +12 -11
- msprobe/core/compare/merge_result/utils.py +12 -11
- msprobe/core/compare/multiprocessing_compute.py +13 -14
- msprobe/core/compare/npy_compare.py +13 -11
- msprobe/core/compare/offline_data_compare.py +160 -0
- msprobe/core/compare/stats_diff_calc.py +39 -0
- msprobe/core/compare/torchair_acc_cmp.py +764 -0
- msprobe/core/compare/torchair_cmp_utils.py +338 -0
- msprobe/core/compare/utils.py +140 -49
- msprobe/core/config_check/__init__.py +12 -11
- msprobe/core/config_check/checkers/__init__.py +12 -11
- msprobe/core/config_check/checkers/base_checker.py +15 -14
- msprobe/core/config_check/checkers/dataset_checker.py +13 -12
- msprobe/core/config_check/checkers/env_args_checker.py +13 -12
- msprobe/core/config_check/checkers/hyperparameter_checker.py +16 -15
- msprobe/core/config_check/checkers/pip_checker.py +15 -15
- msprobe/core/config_check/checkers/random_checker.py +13 -12
- msprobe/core/config_check/checkers/weights_checker.py +14 -12
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +13 -17
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +13 -12
- msprobe/core/config_check/ckpt_compare/metrics.py +12 -11
- msprobe/core/config_check/config_check_cli.py +18 -17
- msprobe/core/config_check/config_checker.py +16 -14
- msprobe/core/config_check/resource/dependency.yaml +15 -12
- msprobe/core/config_check/resource/env.yaml +12 -11
- msprobe/core/config_check/utils/hyperparameter_parser.py +12 -11
- msprobe/core/config_check/utils/utils.py +12 -11
- msprobe/core/{data_dump → dump/api_dump}/api_registry.py +12 -11
- msprobe/core/{common_config.py → dump/common_config.py} +13 -24
- msprobe/core/dump/data_dump/data_collector.py +257 -0
- msprobe/core/{data_dump → dump/data_dump}/data_processor/base.py +45 -36
- msprobe/core/{data_dump → dump/data_dump}/data_processor/factory.py +33 -25
- msprobe/core/{data_dump → dump/data_dump}/data_processor/mindspore_processor.py +37 -113
- msprobe/core/{data_dump → dump/data_dump}/data_processor/pytorch_processor.py +364 -131
- msprobe/core/{data_dump → dump/data_dump}/json_writer.py +24 -31
- msprobe/core/{data_dump → dump/data_dump}/scope.py +12 -13
- msprobe/core/{debugger → dump/debugger}/precision_debugger.py +15 -23
- msprobe/core/dump/dump2db/db_utils.py +215 -0
- msprobe/core/dump/dump2db/dump2db.py +409 -0
- msprobe/core/{hook_manager.py → dump/hook_manager.py} +38 -87
- msprobe/core/dump/kernel_dump/kernel_config.py +34 -0
- msprobe/core/{service.py → dump/service.py} +43 -27
- msprobe/core/install_deps/install_deps.py +51 -0
- msprobe/core/monitor/anomaly_processor.py +13 -11
- msprobe/core/monitor/csv2db.py +73 -93
- msprobe/core/monitor/db_utils.py +140 -205
- msprobe/core/monitor/utils.py +18 -17
- msprobe/core/monitor_v2/__init__.py +20 -0
- msprobe/core/monitor_v2/base.py +83 -0
- msprobe/core/monitor_v2/cc.py +287 -0
- msprobe/core/monitor_v2/factory.py +81 -0
- msprobe/core/monitor_v2/module.py +201 -0
- msprobe/core/monitor_v2/optimizer.py +245 -0
- msprobe/core/monitor_v2/param.py +154 -0
- msprobe/core/monitor_v2/trainer.py +326 -0
- msprobe/core/monitor_v2/utils.py +122 -0
- msprobe/core/monitor_v2/weight_grad.py +419 -0
- msprobe/core/monitor_v2/writer.py +162 -0
- msprobe/core/overflow_check/abnormal_scene.py +12 -11
- msprobe/core/overflow_check/api_info.py +12 -11
- msprobe/core/overflow_check/checker.py +12 -11
- msprobe/core/overflow_check/filter.py +13 -11
- msprobe/core/overflow_check/level.py +12 -11
- msprobe/core/overflow_check/utils.py +12 -11
- msprobe/core/single_save/single_comparator.py +12 -11
- msprobe/core/single_save/single_saver.py +12 -11
- msprobe/infer/__init__.py +16 -0
- msprobe/infer/offline/__init__.py +16 -0
- msprobe/infer/offline/compare/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/adapter_cli/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/adapter_cli/args_adapter.py +46 -0
- msprobe/infer/offline/compare/msquickcmp/atc/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/atc/atc_utils.py +98 -0
- msprobe/infer/offline/compare/msquickcmp/cmp_process.py +328 -0
- msprobe/infer/offline/compare/msquickcmp/common/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/common/args_check.py +112 -0
- msprobe/infer/offline/compare/msquickcmp/common/convert.py +74 -0
- msprobe/infer/offline/compare/msquickcmp/common/dump_data.py +121 -0
- msprobe/infer/offline/compare/msquickcmp/common/dynamic_argument_bean.py +39 -0
- msprobe/infer/offline/compare/msquickcmp/common/utils.py +669 -0
- msprobe/infer/offline/compare/msquickcmp/config.ini +6 -0
- msprobe/infer/offline/compare/msquickcmp/dump/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/dump/args_adapter.py +50 -0
- msprobe/infer/offline/compare/msquickcmp/dump/dump_process.py +91 -0
- msprobe/infer/offline/compare/msquickcmp/install_aclruntime_aisbench.sh +180 -0
- msprobe/infer/offline/compare/msquickcmp/main.py +199 -0
- msprobe/infer/offline/compare/msquickcmp/net_compare/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/net_compare/net_compare.py +277 -0
- msprobe/infer/offline/compare/msquickcmp/npu/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/npu/npu_dump_data.py +558 -0
- msprobe/infer/offline/compare/msquickcmp/npu/om_parser.py +416 -0
- msprobe/infer/offline/compare/msquickcmp/onnx_model/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/onnx_model/onnx_dump_data.py +374 -0
- msprobe/infer/utils/__init__.py +15 -0
- msprobe/infer/utils/acc_cmp.py +94 -0
- msprobe/infer/utils/check/__init__.py +37 -0
- msprobe/infer/utils/check/args_checker.py +35 -0
- msprobe/infer/utils/check/checker.py +227 -0
- msprobe/infer/utils/check/dict_checker.py +78 -0
- msprobe/infer/utils/check/func_wrapper.py +96 -0
- msprobe/infer/utils/check/list_checker.py +56 -0
- msprobe/infer/utils/check/number_checker.py +64 -0
- msprobe/infer/utils/check/obj_checker.py +41 -0
- msprobe/infer/utils/check/path_checker.py +249 -0
- msprobe/infer/utils/check/rule.py +126 -0
- msprobe/infer/utils/check/string_checker.py +66 -0
- msprobe/infer/utils/cmp_algorithm.py +261 -0
- msprobe/infer/utils/constants.py +112 -0
- msprobe/infer/utils/file_open_check.py +337 -0
- msprobe/infer/utils/util.py +177 -0
- msprobe/mindspore/__init__.py +14 -13
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +14 -13
- msprobe/mindspore/api_accuracy_checker/api_info.py +12 -11
- msprobe/mindspore/api_accuracy_checker/api_runner.py +12 -11
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +12 -11
- msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +12 -11
- msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +12 -11
- msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +12 -11
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +15 -14
- msprobe/mindspore/api_accuracy_checker/compute_element.py +12 -11
- msprobe/mindspore/api_accuracy_checker/data_manager.py +13 -11
- msprobe/mindspore/api_accuracy_checker/main.py +12 -11
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +14 -12
- msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +13 -11
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +12 -11
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +12 -11
- msprobe/mindspore/api_accuracy_checker/utils.py +12 -11
- msprobe/mindspore/common/const.py +15 -74
- msprobe/mindspore/common/log.py +12 -11
- msprobe/mindspore/common/utils.py +30 -15
- msprobe/mindspore/compare/common_dir_compare.py +21 -23
- msprobe/mindspore/compare/distributed_compare.py +18 -16
- msprobe/mindspore/compare/ms_compare.py +14 -14
- msprobe/mindspore/compare/ms_graph_compare.py +26 -20
- msprobe/mindspore/compare/utils.py +14 -12
- msprobe/mindspore/{cell_processor.py → dump/cell_processor.py} +15 -14
- msprobe/mindspore/{debugger → dump/debugger}/debugger_config.py +12 -30
- msprobe/mindspore/{debugger → dump/debugger}/precision_debugger.py +43 -45
- msprobe/mindspore/dump/{cell_dump_process.py → dump_processor/cell_dump_process.py} +31 -17
- msprobe/mindspore/dump/{cell_dump_with_insert_gradient.py → dump_processor/cell_dump_with_insert_gradient.py} +18 -14
- msprobe/mindspore/dump/{dump_tool_factory.py → dump_processor/dump_tool_factory.py} +16 -15
- msprobe/mindspore/dump/{graph_mode_cell_dump.py → dump_processor/graph_mode_cell_dump.py} +16 -15
- msprobe/mindspore/dump/{graph_tensor_dump.py → dump_processor/graph_tensor_dump.py} +134 -133
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/api_register.py +15 -14
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/hook_cell.py +12 -11
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/ms_hook_manager.py +47 -20
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/primitive_hooks.py +14 -13
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/support_wrap_ops.yaml +13 -11
- msprobe/mindspore/dump/{jit_dump.py → dump_processor/jit_dump.py} +14 -13
- msprobe/mindspore/dump/{kernel_graph_dump.py → dump_processor/kernel_graph_dump.py} +13 -12
- msprobe/mindspore/dump/{kernel_kbyk_dump.py → dump_processor/kernel_kbyk_dump.py} +13 -12
- msprobe/mindspore/{exception_dump → dump/exception_dump}/exception_dump_tool_factory.py +14 -13
- msprobe/mindspore/{exception_dump → dump/exception_dump}/kernel_graph_exception_dump.py +13 -12
- msprobe/mindspore/{mindspore_service.py → dump/mindspore_service.py} +18 -17
- msprobe/mindspore/dump/mindtorch/__init__.py +19 -0
- msprobe/mindspore/dump/ms_config.py +105 -0
- msprobe/mindspore/{overflow_check → dump/overflow_check}/kernel_graph_overflow_check.py +13 -12
- msprobe/mindspore/{overflow_check → dump/overflow_check}/overflow_check_tool_factory.py +14 -13
- msprobe/mindspore/dump/task_handler_factory.py +43 -0
- msprobe/mindspore/monitor/common_func.py +12 -11
- msprobe/mindspore/monitor/data_writers.py +12 -11
- msprobe/mindspore/monitor/distributed/wrap_distributed.py +93 -39
- msprobe/mindspore/monitor/features.py +12 -11
- msprobe/mindspore/monitor/module_hook.py +19 -22
- msprobe/mindspore/monitor/optimizer_collect.py +29 -25
- msprobe/mindspore/monitor/utils.py +13 -11
- msprobe/msaccucmp/advisor/__init__.py +16 -0
- msprobe/msaccucmp/advisor/advisor_const.py +65 -0
- msprobe/msaccucmp/advisor/advisor_result.py +73 -0
- msprobe/msaccucmp/advisor/compare_advisor.py +99 -0
- msprobe/msaccucmp/advisor/input_advisor.py +66 -0
- msprobe/msaccucmp/advisor/node_advisor.py +68 -0
- msprobe/msaccucmp/advisor/overflow_advisor.py +58 -0
- msprobe/msaccucmp/algorithm_manager/__init__.py +16 -0
- msprobe/msaccucmp/algorithm_manager/algorithm_manager.py +464 -0
- msprobe/msaccucmp/algorithm_manager/algorithm_parameter.py +42 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_AccumulatedRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_CosineSimilarity.py +58 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_KullbackLeiblerDivergence.py +84 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxAbsoluteError.py +41 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanAbsoluteError.py +41 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RelativeEuclideanDistance.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RootMeanSquareError.py +40 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_StandardDeviation.py +47 -0
- msprobe/msaccucmp/cmp_utils/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/common.py +113 -0
- msprobe/msaccucmp/cmp_utils/constant/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/constant/compare_error.py +81 -0
- msprobe/msaccucmp/cmp_utils/constant/const_manager.py +530 -0
- msprobe/msaccucmp/cmp_utils/file_utils.py +497 -0
- msprobe/msaccucmp/cmp_utils/log.py +257 -0
- msprobe/msaccucmp/cmp_utils/multi_process/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/multi_process/multi_convert_process.py +140 -0
- msprobe/msaccucmp/cmp_utils/multi_process/progress.py +78 -0
- msprobe/msaccucmp/cmp_utils/path_check.py +274 -0
- msprobe/msaccucmp/cmp_utils/reg_manager.py +98 -0
- msprobe/msaccucmp/cmp_utils/tlv_parse.py +279 -0
- msprobe/msaccucmp/cmp_utils/utils.py +356 -0
- msprobe/msaccucmp/cmp_utils/utils_type.py +63 -0
- msprobe/msaccucmp/compare_vector.py +48 -0
- msprobe/msaccucmp/conversion/__init__.py +16 -0
- msprobe/msaccucmp/conversion/data_conversion.py +277 -0
- msprobe/msaccucmp/conversion/dtype_conversion.py +99 -0
- msprobe/msaccucmp/conversion/shape_format_conversion.py +477 -0
- msprobe/msaccucmp/conversion/tensor_conversion.py +369 -0
- msprobe/msaccucmp/dump_data_conversion.py +46 -0
- msprobe/msaccucmp/dump_parse/__init__.py +16 -0
- msprobe/msaccucmp/dump_parse/big_dump_data.py +317 -0
- msprobe/msaccucmp/dump_parse/dump.py +423 -0
- msprobe/msaccucmp/dump_parse/dump_data_object.py +322 -0
- msprobe/msaccucmp/dump_parse/dump_data_parser.py +436 -0
- msprobe/msaccucmp/dump_parse/dump_utils.py +246 -0
- msprobe/msaccucmp/dump_parse/ffts_parser.py +137 -0
- msprobe/msaccucmp/dump_parse/mapping.py +62 -0
- msprobe/msaccucmp/dump_parse/nano_dump_data.py +392 -0
- msprobe/msaccucmp/dump_parse/proto_dump_data.py +308 -0
- msprobe/msaccucmp/dump_parser.py +90 -0
- msprobe/msaccucmp/format_manager/__init__.py +16 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NCHW.py +53 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_ND.py +52 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NHWC.py +53 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_HWCN.py +47 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_NCHW.py +47 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_FRACTAL_Z.py +89 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NCHW.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NHWC.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_HWCN.py +43 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NCHW.py +48 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NHWC.py +43 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_FRACTAL_Z.py +87 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_NHWC.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_NCDHW.py +48 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_ND.py +44 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_FRACTAL_Z.py +87 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_HWCN.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_NCHW.py +37 -0
- msprobe/msaccucmp/format_manager/format_manager.py +307 -0
- msprobe/msaccucmp/inplace_layer_process.py +186 -0
- msprobe/msaccucmp/msaccucmp.py +532 -0
- msprobe/msaccucmp/mscmp_advisor.py +128 -0
- msprobe/msaccucmp/overflow/__init__.py +16 -0
- msprobe/msaccucmp/overflow/overflow_analyse.py +305 -0
- msprobe/msaccucmp/overflow/overflow_detection.py +143 -0
- msprobe/msaccucmp/pytorch_cmp/__init__.py +16 -0
- msprobe/msaccucmp/pytorch_cmp/compare_pytorch.py +389 -0
- msprobe/msaccucmp/pytorch_cmp/hdf5_parser.py +377 -0
- msprobe/msaccucmp/pytorch_cmp/pytorch_dump_data.py +461 -0
- msprobe/msaccucmp/shape_conversion.py +41 -0
- msprobe/msaccucmp/vector_cmp/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/batch_compare.py +197 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/compare_detail.py +245 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/detail.py +182 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/detail_writer.py +580 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_fusion_op.py +588 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_npu_vs_npu.py +339 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_result.py +326 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_rule.py +156 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_op.py +204 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_rule_parser.py +635 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/quant_filter.py +187 -0
- msprobe/msaccucmp/vector_cmp/range_manager/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/range_manager/range_manager.py +100 -0
- msprobe/msaccucmp/vector_cmp/range_manager/range_mode.py +94 -0
- msprobe/msaccucmp/vector_cmp/range_manager/select_mode.py +86 -0
- msprobe/msaccucmp/vector_cmp/vector_comparison.py +535 -0
- msprobe/msprobe.py +101 -130
- msprobe/overflow_check/__init__.py +15 -0
- msprobe/{nan_analyze → overflow_check}/analyzer.py +38 -27
- msprobe/{nan_analyze → overflow_check}/graph.py +30 -27
- msprobe/{nan_analyze → overflow_check}/utils.py +15 -14
- msprobe/pytorch/__init__.py +20 -14
- msprobe/pytorch/aclgraph_dump/__init__.py +45 -0
- msprobe/pytorch/aclgraph_dump/_meta.py +26 -0
- msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut.py → acc_check/acc_check.py} +50 -45
- msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut_utils.py → acc_check/acc_check_utils.py} +201 -30
- msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/data_generate.py +56 -16
- msprobe/pytorch/api_accuracy_checker/{run_ut/multi_run_ut.py → acc_check/multi_acc_check.py} +32 -47
- msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/run_overflow_check.py +19 -18
- msprobe/pytorch/api_accuracy_checker/common/config.py +22 -20
- msprobe/pytorch/api_accuracy_checker/common/utils.py +72 -13
- msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -11
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +23 -14
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +45 -32
- msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +12 -11
- msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +14 -12
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +14 -12
- msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +21 -19
- msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +14 -13
- msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +60 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +27 -16
- msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +13 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +39 -18
- msprobe/pytorch/bench_functions/__init__.py +12 -11
- msprobe/pytorch/bench_functions/apply_adam.py +12 -11
- msprobe/pytorch/bench_functions/apply_adam_w.py +12 -11
- msprobe/pytorch/bench_functions/confusion_transpose.py +12 -11
- msprobe/pytorch/bench_functions/fast_gelu.py +12 -11
- msprobe/pytorch/bench_functions/group_norm_silu.py +12 -11
- msprobe/pytorch/bench_functions/layer_norm_eval.py +12 -11
- msprobe/pytorch/bench_functions/linear.py +12 -11
- msprobe/pytorch/bench_functions/matmul_backward.py +12 -11
- msprobe/pytorch/bench_functions/mish.py +12 -11
- msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +12 -11
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +12 -11
- msprobe/pytorch/bench_functions/rms_norm.py +12 -11
- msprobe/pytorch/bench_functions/rotary_mul.py +12 -11
- msprobe/pytorch/bench_functions/scaled_mask_softmax.py +12 -11
- msprobe/pytorch/bench_functions/sort_v2.py +12 -11
- msprobe/pytorch/bench_functions/swiglu.py +12 -11
- msprobe/pytorch/common/__init__.py +12 -11
- msprobe/pytorch/common/log.py +12 -11
- msprobe/pytorch/common/parse_json.py +12 -11
- msprobe/pytorch/common/utils.py +52 -19
- msprobe/pytorch/compare/distributed_compare.py +13 -13
- msprobe/pytorch/compare/match.py +12 -11
- msprobe/pytorch/compare/pt_compare.py +14 -20
- msprobe/pytorch/compare/pt_diff_analyze.py +12 -11
- msprobe/pytorch/compare/utils.py +12 -11
- msprobe/pytorch/{hook_module → dump/api_dump}/api_register.py +18 -16
- msprobe/pytorch/{hook_module → dump/api_dump}/hook_module.py +14 -13
- msprobe/pytorch/{hook_module → dump/api_dump}/pt_hook_manager.py +68 -23
- msprobe/pytorch/{hook_module → dump/api_dump}/register_optimizer_hook.py +13 -11
- msprobe/pytorch/{hook_module → dump/api_dump}/script_wrapper.py +17 -14
- msprobe/pytorch/{hook_module → dump/api_dump}/utils.py +12 -11
- msprobe/pytorch/{debugger → dump/debugger}/debugger_config.py +23 -38
- msprobe/pytorch/dump/debugger/precision_debugger.py +130 -0
- msprobe/pytorch/{function_factory.py → dump/function_factory.py} +12 -11
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +17 -13
- msprobe/pytorch/dump/module_dump/module_dump.py +16 -15
- msprobe/pytorch/dump/module_dump/{module_processer.py → module_processor.py} +54 -42
- msprobe/pytorch/dump/pt_config.py +128 -0
- msprobe/pytorch/{pytorch_service.py → dump/pytorch_service.py} +22 -21
- msprobe/pytorch/monitor/csv2tb.py +13 -11
- msprobe/pytorch/monitor/data_writers.py +13 -11
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +13 -11
- msprobe/pytorch/monitor/features.py +12 -11
- msprobe/pytorch/monitor/module_hook.py +67 -59
- msprobe/pytorch/monitor/module_metric.py +13 -11
- msprobe/pytorch/monitor/optimizer_collect.py +37 -35
- msprobe/pytorch/monitor/utils.py +13 -11
- msprobe/pytorch/monitor/visualizer.py +12 -11
- msprobe/pytorch/torchair_dump/__init__.py +17 -0
- msprobe/pytorch/torchair_dump/torchair_dump.py +114 -0
- msprobe/scripts/atb/config_example.json +10 -0
- msprobe/scripts/atb/load_atb_probe.sh +101 -0
- msprobe/scripts/atb/unload_atb_probe.sh +27 -0
- msprobe/scripts/build_msaccucmp.sh +186 -0
- msprobe/scripts/conf/help.info +6 -0
- msprobe/scripts/conf/version.info +3 -0
- msprobe/scripts/run_script/common.sh +538 -0
- msprobe/scripts/run_script/main_msaccucmp.sh +232 -0
- msprobe/visualization/__init__.py +12 -11
- msprobe/visualization/builder/__init__.py +12 -11
- msprobe/visualization/builder/graph_builder.py +45 -30
- msprobe/visualization/builder/graph_merger.py +53 -32
- msprobe/visualization/builder/msprobe_adapter.py +34 -44
- msprobe/visualization/compare/__init__.py +12 -11
- msprobe/visualization/compare/graph_comparator.py +63 -51
- msprobe/visualization/compare/mode_adapter.py +28 -113
- msprobe/visualization/db_utils.py +133 -22
- msprobe/visualization/graph/__init__.py +12 -11
- msprobe/visualization/graph/base_node.py +15 -27
- msprobe/visualization/graph/distributed_analyzer.py +97 -40
- msprobe/visualization/graph/graph.py +14 -16
- msprobe/visualization/graph/node_colors.py +34 -31
- msprobe/visualization/graph/node_op.py +12 -11
- msprobe/visualization/graph_service.py +580 -205
- msprobe/visualization/utils.py +278 -31
- tb_graph_ascend/secure_build.py +175 -0
- tb_graph_ascend/server/__init__.py +15 -0
- tb_graph_ascend/server/app/__init__.py +15 -0
- tb_graph_ascend/server/app/model/__init__.py +15 -0
- tb_graph_ascend/server/app/model/hierarchy.py +348 -0
- tb_graph_ascend/server/app/model/layout_hierarchy_model.py +69 -0
- tb_graph_ascend/server/app/model/match_nodes_model.py +573 -0
- tb_graph_ascend/server/app/repositories/__init__.py +15 -0
- tb_graph_ascend/server/app/repositories/graph_repo_base.py +32 -0
- tb_graph_ascend/server/app/repositories/graph_repo_db.py +879 -0
- tb_graph_ascend/server/app/repositories/graph_repo_vis.py +83 -0
- tb_graph_ascend/server/app/service/__init__.py +18 -0
- tb_graph_ascend/server/app/service/graph_service_base.py +158 -0
- tb_graph_ascend/server/app/service/graph_service_db.py +438 -0
- tb_graph_ascend/server/app/service/graph_service_factory.py +54 -0
- tb_graph_ascend/server/app/service/graph_service_vis.py +480 -0
- tb_graph_ascend/server/app/utils/__init__.py +15 -0
- tb_graph_ascend/server/app/utils/constant.py +80 -0
- tb_graph_ascend/server/app/utils/file_check_wrapper.py +46 -0
- tb_graph_ascend/server/app/utils/global_state.py +95 -0
- tb_graph_ascend/server/app/utils/graph_utils.py +661 -0
- tb_graph_ascend/server/app/utils/i18n.py +153 -0
- tb_graph_ascend/server/app/utils/request_method.py +46 -0
- tb_graph_ascend/server/app/views/__init__.py +15 -0
- tb_graph_ascend/server/app/views/graph_views.py +304 -0
- tb_graph_ascend/server/plugin.py +108 -0
- tb_graph_ascend/server/static/index.html +9250 -0
- tb_graph_ascend/server/static/index.js +21 -0
- tb_graph_ascend/setup.py +57 -0
- mindstudio_probe-8.3.2.dist-info/LICENSE +0 -201
- mindstudio_probe-8.3.2.dist-info/RECORD +0 -491
- mindstudio_probe-8.3.2.dist-info/entry_points.txt +0 -2
- mindstudio_probe-8.3.2.dist-info/top_level.txt +0 -1
- msprobe/CMakeLists.txt +0 -5
- msprobe/README.md +0 -203
- msprobe/core/advisor/advisor.py +0 -129
- msprobe/core/advisor/advisor_const.py +0 -58
- msprobe/core/advisor/advisor_result.py +0 -58
- msprobe/core/compare/find_first/data_processor.py +0 -35
- msprobe/core/compare/highlight.py +0 -390
- msprobe/core/data_dump/data_collector.py +0 -356
- msprobe/core/grad_probe/constant.py +0 -90
- msprobe/core/grad_probe/grad_compare.py +0 -187
- msprobe/core/grad_probe/utils.py +0 -105
- msprobe/core/kernel_dump/kernel_config.py +0 -33
- msprobe/docs/01.installation.md +0 -250
- msprobe/docs/02.config_introduction.md +0 -221
- msprobe/docs/03.config_examples.md +0 -281
- msprobe/docs/04.kernel_dump_PyTorch.md +0 -73
- msprobe/docs/05.data_dump_PyTorch.md +0 -518
- msprobe/docs/06.data_dump_MindSpore.md +0 -618
- msprobe/docs/07.accuracy_checker_PyTorch.md +0 -310
- msprobe/docs/09.accuracy_checker_MindSpore.md +0 -120
- msprobe/docs/10.accuracy_compare_PyTorch.md +0 -637
- msprobe/docs/11.accuracy_compare_MindSpore.md +0 -769
- msprobe/docs/12.overflow_check_PyTorch.md +0 -82
- msprobe/docs/13.overflow_check_MindSpore.md +0 -33
- msprobe/docs/14.data_parse_PyTorch.md +0 -282
- msprobe/docs/15.free_benchmarking_PyTorch.md +0 -169
- msprobe/docs/16.free_benchmarking_MindSpore.md +0 -159
- msprobe/docs/17.grad_probe.md +0 -205
- msprobe/docs/18.online_dispatch.md +0 -89
- msprobe/docs/19.monitor.md +0 -753
- msprobe/docs/20.monitor_performance_baseline.md +0 -52
- msprobe/docs/21.visualization_PyTorch.md +0 -519
- msprobe/docs/22.visualization_MindSpore.md +0 -515
- msprobe/docs/23.generate_operator_PyTorch.md +0 -107
- msprobe/docs/24.code_mapping_Mindspore.md +0 -29
- msprobe/docs/25.tool_function_introduction.md +0 -29
- msprobe/docs/26.data_dump_PyTorch_baseline.md +0 -48
- msprobe/docs/27.dump_json_instruction.md +0 -795
- msprobe/docs/28.debugger_save_instruction.md +0 -288
- msprobe/docs/28.kernel_dump_MindSpore.md +0 -69
- msprobe/docs/29.data_dump_MSAdapter.md +0 -235
- msprobe/docs/30.overflow_check_MSAdapter.md +0 -31
- msprobe/docs/31.config_check.md +0 -107
- msprobe/docs/32.ckpt_compare.md +0 -69
- msprobe/docs/33.generate_operator_MindSpore.md +0 -181
- msprobe/docs/34.RL_collect.md +0 -101
- msprobe/docs/35.nan_analyze.md +0 -73
- msprobe/docs/36.calculation_result_change.md +0 -75
- msprobe/docs/FAQ.md +0 -232
- msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +0 -146
- msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +0 -14
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +0 -33
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +0 -217
- msprobe/docs/img/BLOOM-7B_1.png +0 -0
- msprobe/docs/img/BLOOM-7B_2.png +0 -0
- msprobe/docs/img/BLOOM-7B_3.png +0 -0
- msprobe/docs/img/BLOOM-7B_4.png +0 -0
- msprobe/docs/img/GPT-3_1.png +0 -0
- msprobe/docs/img/GPT-3_2.png +0 -0
- msprobe/docs/img/GPT-3_3.png +0 -0
- msprobe/docs/img/GPT-3_4.png +0 -0
- msprobe/docs/img/GPT-3_5.png +0 -0
- msprobe/docs/img/GPT-3_6.png +0 -0
- msprobe/docs/img/GPT-3_7.png +0 -0
- msprobe/docs/img/GPT-3_8.png +0 -0
- msprobe/docs/img/YOLOV5S_1.png +0 -0
- msprobe/docs/img/YOLOV5S_2.png +0 -0
- msprobe/docs/img/accuracy_checking_details.png +0 -0
- msprobe/docs/img/accuracy_checking_result.png +0 -0
- msprobe/docs/img/api_precision_compare_details.png +0 -0
- msprobe/docs/img/api_precision_compare_result.png +0 -0
- msprobe/docs/img/auto_analyze_log.png +0 -0
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/compare_result_pkl.png +0 -0
- msprobe/docs/img/compare_result_pkl_md5.png.png +0 -0
- msprobe/docs/img/cpu_info.png +0 -0
- msprobe/docs/img/free_benchmark.png +0 -0
- msprobe/docs/img/free_benchmark_framework.png +0 -0
- msprobe/docs/img/grad_probe_image-1.png +0 -0
- msprobe/docs/img/grad_probe_image-2.png +0 -0
- msprobe/docs/img/grad_probe_image-3.png +0 -0
- msprobe/docs/img/grad_probe_image-4.png +0 -0
- msprobe/docs/img/grad_probe_image.png +0 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/module_compare.png +0 -0
- msprobe/docs/img/monitor/cpu_info.png +0 -0
- msprobe/docs/img/monitor/step_count_per_record.png +0 -0
- msprobe/docs/img/ms_dump.png +0 -0
- msprobe/docs/img/ms_layer.png +0 -0
- msprobe/docs/img/pt_dump.png +0 -0
- msprobe/docs/img/save_compare_result_sample.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
- msprobe/docs/img/visualization/proxy.png +0 -0
- msprobe/docs/img/visualization/tensorboard_1.png +0 -0
- msprobe/docs/img/visualization/tensorboard_2.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_browser_2.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/docs/visualization/GPTModel.png +0 -0
- msprobe/docs/visualization/ParallelMLP.png +0 -0
- msprobe/docs/visualization/layer_mapping_example.md +0 -132
- msprobe/docs/visualization/mapping.png +0 -0
- msprobe/docs/visualization/mapping1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +0 -59
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +0 -80
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +0 -330
- msprobe/docs/visualization/module_name.png +0 -0
- msprobe/docs/visualization/module_name1.png +0 -0
- msprobe/docs/visualization/no_mapping.png +0 -0
- msprobe/docs/visualization/no_mapping1.png +0 -0
- msprobe/docs/visualization/no_mapping_analyze.png +0 -0
- msprobe/docs/visualization/top_layer.png +0 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +0 -460
- msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +0 -2081
- msprobe/mindspore/code_mapping/bind.py +0 -283
- msprobe/mindspore/code_mapping/cmd_parser.py +0 -40
- msprobe/mindspore/code_mapping/graph.py +0 -49
- msprobe/mindspore/code_mapping/graph_parser.py +0 -211
- msprobe/mindspore/code_mapping/main.py +0 -24
- msprobe/mindspore/code_mapping/processor.py +0 -34
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +0 -111
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -52
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +0 -257
- msprobe/mindspore/free_benchmark/common/config.py +0 -27
- msprobe/mindspore/free_benchmark/common/handler_params.py +0 -31
- msprobe/mindspore/free_benchmark/common/utils.py +0 -100
- msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -638
- msprobe/mindspore/free_benchmark/handler/base_handler.py +0 -105
- msprobe/mindspore/free_benchmark/handler/check_handler.py +0 -55
- msprobe/mindspore/free_benchmark/handler/fix_handler.py +0 -51
- msprobe/mindspore/free_benchmark/handler/handler_factory.py +0 -36
- msprobe/mindspore/free_benchmark/perturbation/add_noise.py +0 -82
- msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +0 -45
- msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +0 -78
- msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +0 -77
- msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +0 -56
- msprobe/mindspore/free_benchmark/perturbation/no_change.py +0 -27
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +0 -46
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +0 -51
- msprobe/mindspore/grad_probe/global_context.py +0 -127
- msprobe/mindspore/grad_probe/grad_analyzer.py +0 -260
- msprobe/mindspore/grad_probe/grad_monitor.py +0 -42
- msprobe/mindspore/grad_probe/grad_stat_csv.py +0 -161
- msprobe/mindspore/grad_probe/hook.py +0 -115
- msprobe/mindspore/grad_probe/utils.py +0 -43
- msprobe/mindspore/mindtorch/__init__.py +0 -18
- msprobe/mindspore/ms_config.py +0 -153
- msprobe/mindspore/task_handler_factory.py +0 -44
- msprobe/nan_analyze/__init__.py +0 -14
- msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +0 -9
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +0 -480
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +0 -567
- msprobe/pytorch/debugger/precision_debugger.py +0 -181
- msprobe/pytorch/free_benchmark/__init__.py +0 -23
- msprobe/pytorch/free_benchmark/common/constant.py +0 -85
- msprobe/pytorch/free_benchmark/common/counter.py +0 -87
- msprobe/pytorch/free_benchmark/common/enums.py +0 -80
- msprobe/pytorch/free_benchmark/common/params.py +0 -152
- msprobe/pytorch/free_benchmark/common/utils.py +0 -143
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -215
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +0 -121
- msprobe/pytorch/free_benchmark/main.py +0 -123
- msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +0 -28
- msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +0 -56
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +0 -107
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +0 -121
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +0 -89
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +0 -87
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +0 -43
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +0 -60
- msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +0 -34
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +0 -252
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +0 -54
- msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +0 -40
- msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -45
- msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -181
- msprobe/pytorch/grad_probe/__init__.py +0 -0
- msprobe/pytorch/grad_probe/grad_monitor.py +0 -108
- msprobe/pytorch/grad_probe/grad_stat_csv.py +0 -160
- msprobe/pytorch/hook_module/__init__.py +0 -16
- msprobe/pytorch/hook_module/wrap_aten.py +0 -111
- msprobe/pytorch/online_dispatch/__init__.py +0 -19
- msprobe/pytorch/online_dispatch/compare.py +0 -224
- msprobe/pytorch/online_dispatch/dispatch.py +0 -332
- msprobe/pytorch/online_dispatch/dump_compare.py +0 -179
- msprobe/pytorch/online_dispatch/single_compare.py +0 -412
- msprobe/pytorch/online_dispatch/torch_ops_config.yaml +0 -58
- msprobe/pytorch/online_dispatch/utils.py +0 -158
- msprobe/pytorch/parse_tool/__init__.py +0 -0
- msprobe/pytorch/parse_tool/cli.py +0 -31
- msprobe/pytorch/parse_tool/lib/__init__.py +0 -0
- msprobe/pytorch/parse_tool/lib/compare.py +0 -253
- msprobe/pytorch/parse_tool/lib/config.py +0 -50
- msprobe/pytorch/parse_tool/lib/file_desc.py +0 -45
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +0 -97
- msprobe/pytorch/parse_tool/lib/parse_exception.py +0 -54
- msprobe/pytorch/parse_tool/lib/parse_tool.py +0 -161
- msprobe/pytorch/parse_tool/lib/utils.py +0 -299
- msprobe/pytorch/parse_tool/lib/visualization.py +0 -85
- msprobe/pytorch/pt_config.py +0 -299
- /msprobe/core/{grad_probe → dump}/__init__.py +0 -0
- /msprobe/{mindspore/code_mapping → core/dump/api_dump}/__init__.py +0 -0
- /msprobe/{mindspore/debugger → core/dump/data_dump}/__init__.py +0 -0
- /msprobe/{mindspore/exception_dump → core/dump/data_dump/data_processor}/__init__.py +0 -0
- /msprobe/{mindspore/free_benchmark → core/dump/debugger}/__init__.py +0 -0
- /msprobe/{mindspore/free_benchmark/common → core/dump/kernel_dump}/__init__.py +0 -0
- /msprobe/mindspore/{free_benchmark/handler → dump/debugger}/__init__.py +0 -0
- /msprobe/mindspore/{grad_probe → dump/dump_processor}/__init__.py +0 -0
- /msprobe/mindspore/{overflow_check → dump/exception_dump}/__init__.py +0 -0
- /msprobe/mindspore/{mindtorch → dump/mindtorch}/mindtorch_adaptor.py +0 -0
- /msprobe/{pytorch/api_accuracy_checker/run_ut → mindspore/dump/overflow_check}/__init__.py +0 -0
- /msprobe/{pytorch/debugger → mindspore/monitor}/__init__.py +0 -0
- /msprobe/{pytorch/free_benchmark/common → msaccucmp}/__init__.py +0 -0
- /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/.keep +0 -0
- /msprobe/pytorch/{free_benchmark/perturbed_layers → api_accuracy_checker/acc_check}/__init__.py +0 -0
- /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/torch_ut_setting.json +0 -0
- /msprobe/pytorch/{free_benchmark/perturbed_layers/npu → dump/api_dump}/__init__.py +0 -0
- /msprobe/pytorch/{hook_module → dump/api_dump}/support_wrap_ops.yaml +0 -0
- /msprobe/pytorch/{free_benchmark/result_handlers → dump/debugger}/__init__.py +0 -0
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
4
|
+
#
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
8
|
+
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
10
|
+
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from abc import ABC, abstractmethod
|
|
20
|
+
from typing import Any, Dict, List, Optional
|
|
21
|
+
|
|
22
|
+
from msprobe.core.common.const import Const, MonitorConst
|
|
23
|
+
from msprobe.core.common.framework_adapter import FmkAdp
|
|
24
|
+
from msprobe.core.common.log import logger
|
|
25
|
+
from msprobe.core.monitor_v2.base import BaseMonitorV2
|
|
26
|
+
from msprobe.core.monitor_v2.utils import build_param2name, get_vpp_stage_from_tag, iter_model_chunks
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class WeightGradMonitorV2(BaseMonitorV2, ABC):
    """
    Lightweight weight-gradient monitor that patches optimizer.step to grab
    pre/post gradients, reusing per-framework metric computation.

    Two kinds of rows are appended to ``self._rows``:

    - ``"unreduced"``: recorded by the hooks registered through each
      parameter's ``register_hook``.
    - ``"reduced"``: recorded from the parameters' gradients immediately
      before the original ``optimizer.step`` is invoked.

    Subclasses must implement ``_compute_metrics`` for their framework.
    """

    def __init__(self) -> None:
        super().__init__()
        self._optimizer: Any = None
        self._model: Any = None
        # Maps each monitored parameter object to its reporting name.
        self._param2name: Dict[Any, str] = {}
        # Original ``optimizer.step`` while the optimizer is patched.
        self._orig_step = None
        self._patched = False
        self._eps: float = 1e-8
        # Handles returned by hook registration; each must support ``remove()``.
        self._grad_hooks: List[Any] = []
        # When True every micro-batch gradient is recorded, not only the last.
        self.monitor_mbs_grad: bool = False
        self._micro_batch_number: int = 1
        # Per-parameter count of hook invocations in the current window.
        self._param_micro_steps: Dict[Any, int] = {}

    @abstractmethod
    def _compute_metrics(self, tag2tensor: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        """Return ``{tag: {op_name: value}}`` statistics for the given tensors."""
        ...

    def set_config(self, config: Dict[str, Any]) -> None:
        """Apply ``config`` and read the ``eps`` and ``monitor_mbs_grad`` options."""
        super().set_config(config)
        self._eps = self._parse_eps(self.config.get("eps", 1e-8))
        self.monitor_mbs_grad = self._parse_bool(self.config.get("monitor_mbs_grad", False))

    def start(self, model: Any = None, optimizer: Any = None, **context: Any) -> None:
        """
        Begin monitoring: register gradient hooks and patch ``optimizer.step``.

        ``model`` / ``optimizer`` fall back to the values stored in the
        context. Calling ``start`` while already started only updates the
        context.

        Raises:
            ValueError: if no optimizer is given and none is in the context.
        """
        self.set_context(**context)
        if self._optimizer is not None:
            return
        optimizer = optimizer or self._context.get("optimizer")
        if optimizer is None:
            raise ValueError("Optimizer must be provided to start weight grad function")
        self._optimizer = optimizer
        self._model = model or self._context.get("model")
        self._micro_batch_number = self._resolve_micro_batch_number()
        self._register_params(self._model, optimizer)
        self._patch_optimizer()

    def stop(self) -> None:
        """Undo the optimizer patch, remove hooks and drop all collected state."""
        if self._optimizer is None:
            return
        self._restore_optimizer()
        self._optimizer = None
        self._model = None
        self._rows = []
        self._param2name.clear()
        self._clear_grad_hooks()

    # ---------------------------------------------------------------------
    # Hook registration helpers
    # ---------------------------------------------------------------------

    def _register_params(self, model: Any, optimizer: Any) -> None:
        """Build the parameter-to-name map and hook every monitored parameter."""
        self._param2name = build_param2name(model=model, optimizer=optimizer)
        self._register_pre_grad_hooks()

    def _register_pre_grad_hooks(self) -> None:
        """Register one gradient hook per parameter that requires grad."""
        self._param_micro_steps.clear()
        for param, name in self._param2name.items():
            # Objects without a ``requires_grad`` attribute are treated as trainable.
            if not getattr(param, "requires_grad", True):
                continue
            handle = self._create_param_hook(param, name)
            if handle is not None:
                self._grad_hooks.append(handle)

    def _create_param_hook(self, param: Any, name: str):
        """
        Register a gradient hook on ``param`` and return its handle.

        Returns ``None`` when ``param`` has no callable ``register_hook``.
        """
        hook_fn = getattr(param, "register_hook", None)
        if not callable(hook_fn):
            return None
        self._param_micro_steps[param] = 0
        # Defaults bind param/name at definition time; the lambda returns None.
        return hook_fn(lambda grad, *, _p=param, _n=name: self._on_param_grad(_p, _n, grad))

    def _on_param_grad(self, param: Any, name: str, grad: Any) -> None:
        """
        Hook callback: count the micro-step and record the gradient if due.

        With ``monitor_mbs_grad`` every call is recorded and tagged with its
        own index. Otherwise only the call that completes the window of
        ``_micro_batch_number`` steps is recorded.
        """
        if grad is None or not FmkAdp.is_tensor(grad):
            return
        current_idx = self._param_micro_steps.get(param, 0) + 1
        self._param_micro_steps[param] = current_idx

        should_record = self.monitor_mbs_grad or (
            self._micro_batch_number <= 0 or current_idx >= self._micro_batch_number
        )
        if should_record:
            micro_step_val = current_idx if self.monitor_mbs_grad else self._micro_batch_number
            self._record_grad("unreduced", name, grad, micro_step=micro_step_val)

        # The window is complete: restart counting for the next iteration.
        if self._micro_batch_number > 0 and current_idx >= self._micro_batch_number:
            self._param_micro_steps[param] = 0

    def _clear_grad_hooks(self) -> None:
        """Remove every registered hook (best effort) and reset the counters."""
        for handle in self._grad_hooks:
            try:
                handle.remove()
            except Exception as exc:
                # One bad handle must not prevent removal of the others.
                logger.warning(f"[monitor_v2] Failed to remove grad hook: {exc}")
        self._grad_hooks.clear()
        self._param_micro_steps.clear()

    # ---------------------------------------------------------------------
    # Optimizer patching
    # ---------------------------------------------------------------------

    def _patch_optimizer(self) -> None:
        """Wrap ``optimizer.step`` so gradients are captured right before it."""
        if self._patched or not hasattr(self._optimizer, "step"):
            return
        orig_step = self._optimizer.step
        self._orig_step = orig_step

        def patched_step(*args: Any, **kwargs: Any):
            self._capture_grads("reduced")
            # Call the closure-captured original rather than self._orig_step:
            # the attribute is reset to None on restore, which would break
            # any reference to this wrapper that outlives stop().
            return orig_step(*args, **kwargs)

        self._optimizer.step = patched_step  # type: ignore[assignment]
        self._patched = True

    def _restore_optimizer(self) -> None:
        """Put the original ``step`` back and remove gradient hooks."""
        if self._patched and self._orig_step is not None:
            self._optimizer.step = self._orig_step  # type: ignore[assignment]
        self._patched = False
        self._orig_step = None
        self._clear_grad_hooks()

    # ---------------------------------------------------------------------
    # Recording helpers
    # ---------------------------------------------------------------------

    def _capture_grads(self, kind: str) -> None:
        """Record the current gradient of every monitored parameter as ``kind``."""
        tag2tensor: Dict[str, Any] = {}
        tag2micro: Dict[str, int] = {}
        micro_step_val = self._micro_batch_number
        for param, name in self._param2name.items():
            grad = self._fetch_param_grad(param)
            if grad is None or not FmkAdp.is_tensor(grad):
                continue
            tag2tensor[name] = grad
            tag2micro[name] = micro_step_val
        self._record_grad_dict(kind, tag2tensor, tag2micro)

    def _resolve_micro_batch_number(self) -> int:
        """
        Return the first positive integer found among the
        ``micro_batch_number`` / ``grad_acc_steps`` entries of the config and
        then of the context; default to 1.
        """
        candidates = [
            self.config.get("micro_batch_number"),
            self.config.get("grad_acc_steps"),
            self._context.get("micro_batch_number"),
            self._context.get("grad_acc_steps"),
        ]
        for candidate in candidates:
            if candidate is None:
                continue
            try:
                value = int(candidate)
            except (TypeError, ValueError, OverflowError):
                # OverflowError: int(float("inf")); treat like any bad value.
                continue
            if value > 0:
                return value
        return 1

    def _record_grad(self, kind: str, name: str, grad: Any, micro_step: Optional[int] = None) -> None:
        """Record a single gradient tensor under ``name``."""
        if grad is None or not FmkAdp.is_tensor(grad):
            return
        micro_map = {name: micro_step} if micro_step is not None else None
        self._record_grad_dict(kind, {name: grad}, micro_map)

    def _record_grad_dict(
        self,
        kind: str,
        tag2tensor: Dict[str, Any],
        tag2micro: Optional[Dict[str, int]] = None,
    ) -> None:
        """
        Compute metrics for ``tag2tensor`` and append one row per tag.

        Each row carries ``module_name``, ``scope`` (= ``kind``), ``stats``,
        optionally ``micro_step``, and ``vpp_stage``.
        """
        if not tag2tensor:
            return
        stats = self._compute_metrics(tag2tensor)
        for tag, op_dict in stats.items():
            vpp_stage = get_vpp_stage_from_tag(tag)
            row = {
                "module_name": tag,
                "scope": kind,
                "stats": {op: self._detach_stat(val) for op, val in op_dict.items()},
            }
            if tag2micro and tag in tag2micro:
                row["micro_step"] = tag2micro[tag]
            row["vpp_stage"] = vpp_stage if vpp_stage is not None else MonitorConst.DEFAULT_STAGE
            self._rows.append(row)

    def _fetch_param_grad(self, param: Any) -> Any:
        """Return ``param.grad`` or ``None`` when the attribute is absent."""
        return getattr(param, "grad", None)

    def _detach_stat(self, value: Any) -> Any:
        """Detach tensor statistics under the PyTorch framework; pass others through."""
        if value is None or not FmkAdp.is_tensor(value):
            return value
        if FmkAdp.fmk == Const.PT_FRAMEWORK:
            return value.detach()
        return value

    def _parse_eps(self, value: Any) -> float:
        """
        Convert ``value`` to a finite, strictly positive float.

        Anything else (unparsable, NaN, infinite, zero or negative) logs a
        warning and yields the default ``1e-8``.
        """
        import math

        try:
            eps = float(value)
        except (TypeError, ValueError, OverflowError):
            logger.warning(f"[monitor_v2] Invalid eps value: {value}, fallback to 1e-8.")
            return 1e-8
        # NaN compares False against everything, so ``eps <= 0`` alone would
        # let NaN (and +inf) through and poison every metric using eps.
        if not math.isfinite(eps):
            logger.warning(f"[monitor_v2] Invalid eps value: {value}, fallback to 1e-8.")
            return 1e-8
        if eps <= 0:
            logger.warning(f"[monitor_v2] eps must be positive, got {eps}, fallback to 1e-8.")
            return 1e-8
        return eps

    def _parse_bool(self, value: Any) -> bool:
        """Return ``value`` if it is a real ``bool``; otherwise warn and return False."""
        if isinstance(value, bool):
            return value
        logger.warning(f"[monitor_v2] Invalid boolean value: {value}, fallback to False.")
        return False
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
class PyTorchWeightGradMonitorV2(WeightGradMonitorV2):
    """
    PyTorch implementation of the weight-gradient monitor.

    When the model is detected as FSDP-wrapped (see ``_detect_fsdp``), no
    per-parameter hooks are registered; instead ``torch.autograd.backward``
    is wrapped and "unreduced" gradients are collected after each call.
    """

    def __init__(self) -> None:
        super().__init__()
        self._fsdp_mode: bool = False
        # Original torch.autograd.backward while it is patched.
        self._orig_autograd_backward = None
        self._fsdp_backward_patched: bool = False
        # Number of wrapped backward calls seen in the current window.
        self._fsdp_micro_step: int = 0

    def start(self, model: Any = None, optimizer: Any = None, **context: Any) -> None:
        """Detect FSDP, run the base start-up, then patch backward if needed."""
        self.set_context(**context)
        if self._optimizer is not None:
            return
        resolved_model = model or self._context.get("model")
        # Must be decided before the base class registers hooks, because
        # _register_pre_grad_hooks consults _fsdp_mode.
        self._fsdp_mode = self._detect_fsdp(resolved_model)
        self._fsdp_micro_step = 0
        super().start(model=resolved_model, optimizer=optimizer, **context)
        if self._fsdp_mode:
            self._patch_autograd_backward()

    def stop(self) -> None:
        """Restore ``torch.autograd.backward`` and then run the base tear-down."""
        self._restore_autograd_backward()
        self._fsdp_mode = False
        self._fsdp_micro_step = 0
        super().stop()

    def _register_pre_grad_hooks(self) -> None:
        """Skip per-parameter hooks in FSDP mode; otherwise defer to the base."""
        if not self._fsdp_mode:
            super()._register_pre_grad_hooks()
            return
        self._param_micro_steps.clear()

    def _create_param_hook(self, param: Any, name: str):
        """
        Hook the node reached via ``param.expand_as(param).grad_fn`` (the
        original names it ``grad_acc``; presumably the gradient accumulator).

        Any failure to obtain that node falls back to the base class hook.
        """
        try:
            import torch

            expanded = param.expand_as(param)
            accumulator = expanded.grad_fn.next_functions[0][0]
            if accumulator is None:
                raise RuntimeError
        except Exception:
            return super()._create_param_hook(param, name)

        @torch.no_grad()
        def acc_hook(*_unused):
            # The hook arguments are ignored; the gradient is read back from
            # the parameter itself.
            self._on_param_grad(param, name, self._fetch_param_grad(param))

        self._param_micro_steps[param] = 0
        return accumulator.register_hook(acc_hook)

    def _fetch_param_grad(self, param: Any) -> Any:
        """
        Return a copy of the parameter's gradient, or ``None`` if it has none.

        ``main_grad`` takes precedence over ``grad``. DTensor gradients are
        converted with ``to_local()`` and float8 gradients with ``float()``;
        both conversions are best effort and only log on failure.
        """
        tensor = getattr(param, "main_grad", None)
        if tensor is None:
            tensor = getattr(param, "grad", None)
            if tensor is None:
                return None

        if tensor.__class__.__name__ == "DTensor":
            try:
                tensor = tensor.to_local()
            except Exception as exc:
                logger.warning(f"[monitor_v2] Failed to convert DTensor grad to local: {exc}")

        try:
            from msprobe.pytorch.common.utils import is_float8_tensor

            if is_float8_tensor(tensor):
                tensor = tensor.float()
        except Exception as exc:
            logger.warning(f"[monitor_v2] Failed to convert grad tensor from float8: {exc}")

        try:
            snapshot = tensor.clone()
        except Exception:
            # Cloning failed: hand back the tensor itself.
            return tensor
        return snapshot

    def _compute_metrics(self, tag2tensor: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        """Delegate to the PyTorch monitor's ``get_metrics``."""
        from msprobe.pytorch.monitor.module_metric import get_metrics

        return get_metrics(self._ops, tag2tensor, self._eps, {})

    def _detect_fsdp(self, model: Any) -> bool:
        """
        Return True if any model chunk is an FSDP instance with
        ``_use_orig_params`` set, or exposes a parameter whose class is named
        ``DTensor``.
        """
        if model is None:
            return False
        try:
            from torch.distributed.fsdp import FullyShardedDataParallel as FSDP  # type: ignore[import]
        except Exception:
            FSDP = None

        for chunk, _prefix in iter_model_chunks(model):
            is_fsdp_wrapper = FSDP is not None and isinstance(chunk, FSDP)
            if is_fsdp_wrapper and getattr(chunk, "_use_orig_params", False):
                return True
            try:
                for _param_name, candidate in chunk.named_parameters():
                    if candidate.__class__.__name__ == "DTensor":
                        return True
            except Exception:
                # A chunk that cannot enumerate parameters is simply skipped.
                continue
        return False

    def _patch_autograd_backward(self) -> None:
        """Wrap ``torch.autograd.backward`` to collect gradients after it runs."""
        if self._fsdp_backward_patched:
            return
        try:
            import torch.autograd as _autograd
        except Exception:
            logger.warning("[monitor_v2] Failed to import torch.autograd, skipping FSDP grad patch.")
            return

        original = getattr(_autograd, "backward", None)
        if original is None:
            return

        def wrapped_backward(*args: Any, **kwargs: Any):
            result = original(*args, **kwargs)
            self._capture_fsdp_pre_grads()
            return result

        self._orig_autograd_backward = original
        _autograd.backward = wrapped_backward
        self._fsdp_backward_patched = True

    def _restore_autograd_backward(self) -> None:
        """Reinstate the saved ``torch.autograd.backward`` and clear patch state."""
        saved = self._orig_autograd_backward
        if self._fsdp_backward_patched and saved is not None:
            try:
                import torch.autograd as _autograd

                _autograd.backward = saved
            except Exception as exc:
                logger.warning(f"[monitor_v2] Failed to restore torch.autograd.backward: {exc}")
        # State is cleared whether or not anything had to be restored.
        self._fsdp_backward_patched = False
        self._orig_autograd_backward = None

    def _capture_fsdp_pre_grads(self) -> None:
        """
        Record "unreduced" gradients for all monitored parameters (FSDP path).

        Mirrors the micro-step windowing of the per-parameter hook: with
        ``monitor_mbs_grad`` every call records; otherwise only the call that
        completes the window does.
        """
        if self._model is None:
            return
        self._fsdp_micro_step += 1
        step_idx = self._fsdp_micro_step
        window = self._micro_batch_number

        if self.monitor_mbs_grad:
            recording = True
            micro_label = step_idx
        else:
            recording = not (window > 0 and step_idx < window)
            micro_label = window

        if recording:
            tensors: Dict[str, Any] = {}
            micro_steps: Dict[str, int] = {}
            for param, name in self._param2name.items():
                if not getattr(param, "requires_grad", True):
                    continue
                grad = self._fetch_param_grad(param)
                if grad is None or not FmkAdp.is_tensor(grad):
                    continue
                # Drop the FSDP wrapper segment from the reported name.
                tag = name.replace("_fsdp_wrapped_module.", "")
                tensors[tag] = grad
                micro_steps[tag] = micro_label
            self._record_grad_dict("unreduced", tensors, micro_steps)

        if window > 0 and step_idx >= window:
            self._fsdp_micro_step = 0
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
class MindSporeWeightGradMonitorV2(WeightGradMonitorV2):
    """MindSpore implementation: only the metric computation is framework specific."""

    def _compute_metrics(self, tag2tensor: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        """Delegate to the MindSpore monitor's ``get_metrics``."""
        from msprobe.mindspore.monitor.utils import get_metrics as ms_get_metrics

        # The final argument is an empty dict, exactly as in the PyTorch variant.
        result = ms_get_metrics(self._ops, tag2tensor, self._eps, {})
        return result
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
4
|
+
#
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
8
|
+
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
10
|
+
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
import csv
|
|
19
|
+
import os
|
|
20
|
+
import re
|
|
21
|
+
from datetime import datetime, timezone
|
|
22
|
+
from typing import Any, Dict, List
|
|
23
|
+
|
|
24
|
+
from msprobe.core.common.const import Const
|
|
25
|
+
from msprobe.core.common.framework_adapter import FmkAdp
|
|
26
|
+
from msprobe.core.common.file_utils import make_dir
|
|
27
|
+
from msprobe.core.common.log import logger
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class CSVWriterV2:
|
|
31
|
+
"""
|
|
32
|
+
Minimal CSV writer for monitor_v2.
|
|
33
|
+
|
|
34
|
+
- Each rank gets its own directory: <out_dir>/rank_<rank>/
|
|
35
|
+
- Each monitor writes to <slug>_step<start>-<end>.csv
|
|
36
|
+
- Row structure: step plus dynamic fields from rows (no ts/monitor/rank).
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
def __init__(self, out_dir: str = "monitor_v2_output", rank: int = 0, async_write: bool = False):
    """
    Create the per-rank output directory ``<out_dir>/rank_<rank>``.

    ``async_write`` is stored but has no effect yet; passing a truthy value
    only triggers a warning.
    """
    self.out_dir = out_dir
    self.rank = rank
    self.async_write = async_write  # reserved, currently synchronous
    if async_write:
        logger.warning("[monitor_v2] async_write is not supported yet; falling back to sync writes.")

    rank_folder = os.path.join(out_dir, f"rank_{rank}")
    self.rank_dir = rank_folder
    make_dir(rank_folder)
    # Column order already chosen for each slug.
    self._known_fields: Dict[str, List[str]] = {}
|
|
49
|
+
|
|
50
|
+
@staticmethod
def close():
    """No-op: the writer keeps no open handles between writes. Returns None."""
    result = None
    return result
|
|
53
|
+
|
|
54
|
+
@staticmethod
def _stack_tensors(tensors: List[Any]) -> Any:
    """Stack ``tensors`` along a new leading axis with the active framework."""
    if FmkAdp.fmk != Const.PT_FRAMEWORK:
        # Any framework other than PyTorch is handled with MindSpore ops.
        from mindspore import ops

        return ops.stack(tensors, axis=0)

    import torch

    return torch.stack(tensors, dim=0)
|
|
63
|
+
|
|
64
|
+
def write_monitor_data(self, monitor_data: Dict[str, Any]):
    """
    Append the rows of one monitor record to its CSV file.

    ``monitor_data`` keys read here: ``rows``, ``monitor``, ``slug``,
    ``step``, ``step_count_per_record`` and ``start_step``.

    Column order is ``vpp_stage``, ``step``, then the remaining row keys in
    first-seen order. If the target file already has a header, that header
    defines the columns and keys not present in it are dropped, so appended
    rows always line up with the existing file.

    Returns:
        The path of the CSV file written to, or ``None`` when there were no
        rows to write.
    """
    rows = monitor_data.get("rows") or []
    if not rows:
        return None
    rows = self._flatten_rows(rows)
    if not rows:
        return None

    monitor_name = monitor_data.get("monitor", "monitor")
    slug = self._safe_slug(monitor_data.get("slug", monitor_name))
    step = monitor_data.get("step", "unknown")
    interval = self._safe_int(monitor_data.get("step_count_per_record", 1), default=1)
    start_step = self._safe_int(monitor_data.get("start_step", 0), default=0)
    csv_path = self._resolve_csv_path(slug, step, interval, start_step)

    base_fields = ["vpp_stage", "step"]
    row_keys = list(dict.fromkeys(k for r in rows for k in r if k not in base_fields))
    fields = base_fields + row_keys

    # Recover the header of a file that is already on disk. Without this, a
    # file left by an earlier writer instance (e.g. a restarted run using the
    # same output directory) would receive rows in a possibly different
    # column order and with no header check. An empty file counts as having
    # no header, so one gets written.
    disk_header: List[str] = []
    if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
        with open(csv_path, "r", newline="") as existing_file:
            disk_header = next(csv.reader(existing_file), [])
    needs_header = not disk_header

    known = self._known_fields.get(slug)
    if known:
        if needs_header:
            # A new file is being started: widen the remembered schema with
            # any keys that appeared since the previous file.
            for k in fields:
                if k not in known:
                    known.append(k)
        fields = known
    else:
        self._known_fields[slug] = fields
    if disk_header:
        fields = disk_header

    # Encoding is left at the platform default, as before, so that appends to
    # existing files stay byte-compatible.
    with open(csv_path, "a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
        if needs_header:
            writer.writeheader()
        for r in rows:
            out = dict(r)
            out.setdefault("step", step)
            writer.writerow(out)

    return csv_path
|
|
104
|
+
|
|
105
|
+
@staticmethod
|
|
106
|
+
def _safe_int(value: Any, default: int = 0) -> int:
|
|
107
|
+
try:
|
|
108
|
+
return int(value)
|
|
109
|
+
except (TypeError, ValueError):
|
|
110
|
+
return default
|
|
111
|
+
|
|
112
|
+
@staticmethod
|
|
113
|
+
def _safe_slug(slug: Any) -> str:
|
|
114
|
+
text = str(slug) if slug is not None else "monitor"
|
|
115
|
+
text = text.replace(os.sep, "_")
|
|
116
|
+
if os.altsep:
|
|
117
|
+
text = text.replace(os.altsep, "_")
|
|
118
|
+
text = text.replace("..", "_")
|
|
119
|
+
text = re.sub(r"[^A-Za-z0-9_.-]+", "_", text).strip("._")
|
|
120
|
+
return text or "monitor"
|
|
121
|
+
|
|
122
|
+
def _resolve_csv_path(self, slug: str, step: Any, interval: int, start_step: int) -> str:
|
|
123
|
+
try:
|
|
124
|
+
step_val = int(step)
|
|
125
|
+
except (TypeError, ValueError):
|
|
126
|
+
return os.path.join(self.rank_dir, f"{slug}.csv")
|
|
127
|
+
interval = max(interval, 1)
|
|
128
|
+
relative = step_val - start_step
|
|
129
|
+
if relative < 0:
|
|
130
|
+
start = step_val
|
|
131
|
+
else:
|
|
132
|
+
start = start_step + (relative // interval) * interval
|
|
133
|
+
end = start + interval - 1
|
|
134
|
+
return os.path.join(self.rank_dir, f"{slug}_step{start}-{end}.csv")
|
|
135
|
+
|
|
136
|
+
def _flatten_rows(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
137
|
+
stat_ops = list(dict.fromkeys(op for row in rows for op in (row.get("stats") or {})))
|
|
138
|
+
stacked_values = self._stack_stats(rows, stat_ops)
|
|
139
|
+
default_values = [float("nan")] * len(rows)
|
|
140
|
+
flattened = []
|
|
141
|
+
for idx, row in enumerate(rows):
|
|
142
|
+
base = {k: v for k, v in row.items() if k != "stats"}
|
|
143
|
+
for op in stat_ops:
|
|
144
|
+
values = stacked_values.get(op, default_values)
|
|
145
|
+
base[op] = values[idx] if idx < len(values) else float("nan")
|
|
146
|
+
flattened.append(base)
|
|
147
|
+
return flattened
|
|
148
|
+
|
|
149
|
+
def _stack_stats(self, rows: List[Dict[str, Any]], stat_ops: List[str]) -> Dict[str, List[float]]:
|
|
150
|
+
stacked: Dict[str, List[float]] = {}
|
|
151
|
+
total = len(rows)
|
|
152
|
+
for op in stat_ops:
|
|
153
|
+
tensors = [(row.get("stats") or {}).get(op) for row in rows]
|
|
154
|
+
if not tensors or any(tensor is None for tensor in tensors):
|
|
155
|
+
stacked[op] = [float("nan")] * total
|
|
156
|
+
continue
|
|
157
|
+
stacked_tensor = self._stack_tensors(tensors)
|
|
158
|
+
if FmkAdp.fmk == Const.PT_FRAMEWORK:
|
|
159
|
+
stacked_tensor = stacked_tensor.cpu()
|
|
160
|
+
numpy_vals = FmkAdp.asnumpy(stacked_tensor).reshape(-1)
|
|
161
|
+
stacked[op] = [float(value) for value in numpy_vals.tolist()]
|
|
162
|
+
return stacked
|
|
@@ -1,17 +1,18 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
from typing import List, Dict, Union, Any
|
|
17
18
|
|
|
@@ -1,17 +1,18 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
from dataclasses import dataclass
|
|
17
18
|
|
|
@@ -1,17 +1,18 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
15
16
|
|
|
16
17
|
from typing import Dict, List, Optional, Any
|
|
17
18
|
|