mindstudio-probe 8.3.3__py3-none-any.whl → 26.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.3.3.dist-info → mindstudio_probe-26.0.0a1.dist-info}/METADATA +26 -14
- mindstudio_probe-26.0.0a1.dist-info/RECORD +498 -0
- {mindstudio_probe-8.3.3.dist-info → mindstudio_probe-26.0.0a1.dist-info}/WHEEL +1 -1
- mindstudio_probe-26.0.0a1.dist-info/entry_points.txt +5 -0
- mindstudio_probe-26.0.0a1.dist-info/licenses/LICENSE +124 -0
- mindstudio_probe-26.0.0a1.dist-info/top_level.txt +2 -0
- msprobe/__init__.py +12 -13
- msprobe/config.json +9 -31
- msprobe/core/__init__.py +12 -11
- msprobe/core/acc_check/acc_check_cli.py +145 -0
- msprobe/core/common/const.py +97 -38
- msprobe/core/common/db_manager.py +133 -12
- msprobe/core/common/decorator.py +12 -11
- msprobe/core/common/exceptions.py +12 -11
- msprobe/core/common/file_utils.py +101 -25
- msprobe/core/common/framework_adapter.py +36 -25
- msprobe/core/common/global_lock.py +12 -11
- msprobe/core/common/inplace_op_checker.py +12 -11
- msprobe/core/common/log.py +22 -11
- msprobe/core/common/megatron_utils.py +566 -11
- msprobe/core/common/parallel_state.py +12 -11
- msprobe/core/common/runtime.py +12 -11
- msprobe/core/common/utils.py +41 -41
- msprobe/core/compare/acc_compare.py +361 -104
- msprobe/core/compare/atb_data_compare.py +422 -0
- msprobe/core/compare/auto_compare.py +134 -0
- msprobe/core/compare/check.py +14 -17
- msprobe/core/compare/compare_cli.py +72 -149
- msprobe/core/compare/config.py +12 -13
- msprobe/core/compare/diff_analyze/first_diff_analyze.py +28 -15
- msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
- msprobe/core/compare/find_first/analyzer.py +18 -18
- msprobe/core/compare/find_first/graph.py +12 -11
- msprobe/core/compare/find_first/utils.py +13 -12
- msprobe/core/compare/indicator_analysis/__init__.py +15 -0
- msprobe/core/compare/indicator_analysis/algorithm.py +363 -0
- msprobe/core/compare/indicator_analysis/api_data.py +141 -0
- msprobe/core/compare/indicator_analysis/calculator.py +181 -0
- msprobe/core/compare/indicator_analysis/utils.py +116 -0
- msprobe/core/compare/layer_mapping/__init__.py +12 -11
- msprobe/core/compare/layer_mapping/data_scope_parser.py +20 -11
- msprobe/core/compare/layer_mapping/layer_mapping.py +14 -13
- msprobe/core/compare/layer_mapping/postprocess_pass.py +13 -11
- msprobe/core/compare/merge_result/merge_result.py +12 -11
- msprobe/core/compare/merge_result/merge_result_cli.py +12 -11
- msprobe/core/compare/merge_result/utils.py +12 -11
- msprobe/core/compare/multiprocessing_compute.py +13 -14
- msprobe/core/compare/npy_compare.py +13 -11
- msprobe/core/compare/offline_data_compare.py +160 -0
- msprobe/core/compare/stats_diff_calc.py +39 -0
- msprobe/core/compare/torchair_acc_cmp.py +764 -0
- msprobe/core/compare/torchair_cmp_utils.py +338 -0
- msprobe/core/compare/utils.py +140 -49
- msprobe/core/config_check/__init__.py +12 -11
- msprobe/core/config_check/checkers/__init__.py +12 -11
- msprobe/core/config_check/checkers/base_checker.py +15 -14
- msprobe/core/config_check/checkers/dataset_checker.py +13 -12
- msprobe/core/config_check/checkers/env_args_checker.py +13 -12
- msprobe/core/config_check/checkers/hyperparameter_checker.py +16 -15
- msprobe/core/config_check/checkers/pip_checker.py +15 -15
- msprobe/core/config_check/checkers/random_checker.py +13 -12
- msprobe/core/config_check/checkers/weights_checker.py +14 -12
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +13 -17
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +13 -12
- msprobe/core/config_check/ckpt_compare/metrics.py +12 -11
- msprobe/core/config_check/config_check_cli.py +18 -17
- msprobe/core/config_check/config_checker.py +16 -14
- msprobe/core/config_check/resource/dependency.yaml +15 -12
- msprobe/core/config_check/resource/env.yaml +12 -11
- msprobe/core/config_check/utils/hyperparameter_parser.py +12 -11
- msprobe/core/config_check/utils/utils.py +12 -11
- msprobe/core/{data_dump → dump/api_dump}/api_registry.py +12 -11
- msprobe/core/{common_config.py → dump/common_config.py} +13 -24
- msprobe/core/dump/data_dump/data_collector.py +257 -0
- msprobe/core/{data_dump → dump/data_dump}/data_processor/base.py +45 -36
- msprobe/core/{data_dump → dump/data_dump}/data_processor/factory.py +33 -25
- msprobe/core/{data_dump → dump/data_dump}/data_processor/mindspore_processor.py +37 -113
- msprobe/core/{data_dump → dump/data_dump}/data_processor/pytorch_processor.py +364 -131
- msprobe/core/{data_dump → dump/data_dump}/json_writer.py +24 -31
- msprobe/core/{data_dump → dump/data_dump}/scope.py +12 -13
- msprobe/core/{debugger → dump/debugger}/precision_debugger.py +15 -23
- msprobe/core/dump/dump2db/db_utils.py +215 -0
- msprobe/core/dump/dump2db/dump2db.py +409 -0
- msprobe/core/{hook_manager.py → dump/hook_manager.py} +38 -87
- msprobe/core/dump/kernel_dump/kernel_config.py +34 -0
- msprobe/core/{service.py → dump/service.py} +43 -27
- msprobe/core/install_deps/install_deps.py +51 -0
- msprobe/core/monitor/anomaly_processor.py +13 -11
- msprobe/core/monitor/csv2db.py +73 -93
- msprobe/core/monitor/db_utils.py +140 -205
- msprobe/core/monitor/utils.py +18 -17
- msprobe/core/monitor_v2/__init__.py +20 -0
- msprobe/core/monitor_v2/base.py +83 -0
- msprobe/core/monitor_v2/cc.py +287 -0
- msprobe/core/monitor_v2/factory.py +81 -0
- msprobe/core/monitor_v2/module.py +201 -0
- msprobe/core/monitor_v2/optimizer.py +245 -0
- msprobe/core/monitor_v2/param.py +154 -0
- msprobe/core/monitor_v2/trainer.py +326 -0
- msprobe/core/monitor_v2/utils.py +122 -0
- msprobe/core/monitor_v2/weight_grad.py +419 -0
- msprobe/core/monitor_v2/writer.py +162 -0
- msprobe/core/overflow_check/abnormal_scene.py +12 -11
- msprobe/core/overflow_check/api_info.py +12 -11
- msprobe/core/overflow_check/checker.py +12 -11
- msprobe/core/overflow_check/filter.py +13 -11
- msprobe/core/overflow_check/level.py +12 -11
- msprobe/core/overflow_check/utils.py +12 -11
- msprobe/core/single_save/single_comparator.py +12 -11
- msprobe/core/single_save/single_saver.py +12 -11
- msprobe/infer/__init__.py +16 -0
- msprobe/infer/offline/__init__.py +16 -0
- msprobe/infer/offline/compare/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/adapter_cli/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/adapter_cli/args_adapter.py +46 -0
- msprobe/infer/offline/compare/msquickcmp/atc/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/atc/atc_utils.py +98 -0
- msprobe/infer/offline/compare/msquickcmp/cmp_process.py +328 -0
- msprobe/infer/offline/compare/msquickcmp/common/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/common/args_check.py +112 -0
- msprobe/infer/offline/compare/msquickcmp/common/convert.py +74 -0
- msprobe/infer/offline/compare/msquickcmp/common/dump_data.py +121 -0
- msprobe/infer/offline/compare/msquickcmp/common/dynamic_argument_bean.py +39 -0
- msprobe/infer/offline/compare/msquickcmp/common/utils.py +669 -0
- msprobe/infer/offline/compare/msquickcmp/config.ini +6 -0
- msprobe/infer/offline/compare/msquickcmp/dump/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/dump/args_adapter.py +50 -0
- msprobe/infer/offline/compare/msquickcmp/dump/dump_process.py +91 -0
- msprobe/infer/offline/compare/msquickcmp/install_aclruntime_aisbench.sh +180 -0
- msprobe/infer/offline/compare/msquickcmp/main.py +199 -0
- msprobe/infer/offline/compare/msquickcmp/net_compare/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/net_compare/net_compare.py +277 -0
- msprobe/infer/offline/compare/msquickcmp/npu/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/npu/npu_dump_data.py +558 -0
- msprobe/infer/offline/compare/msquickcmp/npu/om_parser.py +416 -0
- msprobe/infer/offline/compare/msquickcmp/onnx_model/__init__.py +16 -0
- msprobe/infer/offline/compare/msquickcmp/onnx_model/onnx_dump_data.py +374 -0
- msprobe/infer/utils/__init__.py +15 -0
- msprobe/infer/utils/acc_cmp.py +94 -0
- msprobe/infer/utils/check/__init__.py +37 -0
- msprobe/infer/utils/check/args_checker.py +35 -0
- msprobe/infer/utils/check/checker.py +227 -0
- msprobe/infer/utils/check/dict_checker.py +78 -0
- msprobe/infer/utils/check/func_wrapper.py +96 -0
- msprobe/infer/utils/check/list_checker.py +56 -0
- msprobe/infer/utils/check/number_checker.py +64 -0
- msprobe/infer/utils/check/obj_checker.py +41 -0
- msprobe/infer/utils/check/path_checker.py +249 -0
- msprobe/infer/utils/check/rule.py +126 -0
- msprobe/infer/utils/check/string_checker.py +66 -0
- msprobe/infer/utils/cmp_algorithm.py +261 -0
- msprobe/infer/utils/constants.py +112 -0
- msprobe/infer/utils/file_open_check.py +337 -0
- msprobe/infer/utils/util.py +177 -0
- msprobe/mindspore/__init__.py +14 -13
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +14 -13
- msprobe/mindspore/api_accuracy_checker/api_info.py +12 -11
- msprobe/mindspore/api_accuracy_checker/api_runner.py +12 -11
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +12 -11
- msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +12 -11
- msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +12 -11
- msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +12 -11
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +15 -14
- msprobe/mindspore/api_accuracy_checker/compute_element.py +12 -11
- msprobe/mindspore/api_accuracy_checker/data_manager.py +13 -11
- msprobe/mindspore/api_accuracy_checker/main.py +12 -11
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +14 -12
- msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +13 -11
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +12 -11
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +12 -11
- msprobe/mindspore/api_accuracy_checker/utils.py +12 -11
- msprobe/mindspore/common/const.py +15 -74
- msprobe/mindspore/common/log.py +12 -11
- msprobe/mindspore/common/utils.py +30 -15
- msprobe/mindspore/compare/common_dir_compare.py +21 -23
- msprobe/mindspore/compare/distributed_compare.py +18 -16
- msprobe/mindspore/compare/ms_compare.py +14 -14
- msprobe/mindspore/compare/ms_graph_compare.py +26 -20
- msprobe/mindspore/compare/utils.py +14 -12
- msprobe/mindspore/{cell_processor.py → dump/cell_processor.py} +15 -14
- msprobe/mindspore/{debugger → dump/debugger}/debugger_config.py +12 -30
- msprobe/mindspore/{debugger → dump/debugger}/precision_debugger.py +43 -45
- msprobe/mindspore/dump/{cell_dump_process.py → dump_processor/cell_dump_process.py} +31 -17
- msprobe/mindspore/dump/{cell_dump_with_insert_gradient.py → dump_processor/cell_dump_with_insert_gradient.py} +18 -14
- msprobe/mindspore/dump/{dump_tool_factory.py → dump_processor/dump_tool_factory.py} +16 -15
- msprobe/mindspore/dump/{graph_mode_cell_dump.py → dump_processor/graph_mode_cell_dump.py} +16 -15
- msprobe/mindspore/dump/{graph_tensor_dump.py → dump_processor/graph_tensor_dump.py} +134 -133
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/api_register.py +15 -14
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/hook_cell.py +12 -11
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/ms_hook_manager.py +47 -20
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/primitive_hooks.py +14 -13
- msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/support_wrap_ops.yaml +13 -11
- msprobe/mindspore/dump/{jit_dump.py → dump_processor/jit_dump.py} +14 -13
- msprobe/mindspore/dump/{kernel_graph_dump.py → dump_processor/kernel_graph_dump.py} +13 -12
- msprobe/mindspore/dump/{kernel_kbyk_dump.py → dump_processor/kernel_kbyk_dump.py} +13 -12
- msprobe/mindspore/{exception_dump → dump/exception_dump}/exception_dump_tool_factory.py +14 -13
- msprobe/mindspore/{exception_dump → dump/exception_dump}/kernel_graph_exception_dump.py +13 -12
- msprobe/mindspore/{mindspore_service.py → dump/mindspore_service.py} +18 -17
- msprobe/mindspore/dump/mindtorch/__init__.py +19 -0
- msprobe/mindspore/dump/ms_config.py +105 -0
- msprobe/mindspore/{overflow_check → dump/overflow_check}/kernel_graph_overflow_check.py +13 -12
- msprobe/mindspore/{overflow_check → dump/overflow_check}/overflow_check_tool_factory.py +14 -13
- msprobe/mindspore/dump/task_handler_factory.py +43 -0
- msprobe/mindspore/monitor/common_func.py +12 -11
- msprobe/mindspore/monitor/data_writers.py +12 -11
- msprobe/mindspore/monitor/distributed/wrap_distributed.py +93 -39
- msprobe/mindspore/monitor/features.py +12 -11
- msprobe/mindspore/monitor/module_hook.py +19 -22
- msprobe/mindspore/monitor/optimizer_collect.py +29 -25
- msprobe/mindspore/monitor/utils.py +13 -11
- msprobe/msaccucmp/advisor/__init__.py +16 -0
- msprobe/msaccucmp/advisor/advisor_const.py +65 -0
- msprobe/msaccucmp/advisor/advisor_result.py +73 -0
- msprobe/msaccucmp/advisor/compare_advisor.py +99 -0
- msprobe/msaccucmp/advisor/input_advisor.py +66 -0
- msprobe/msaccucmp/advisor/node_advisor.py +68 -0
- msprobe/msaccucmp/advisor/overflow_advisor.py +58 -0
- msprobe/msaccucmp/algorithm_manager/__init__.py +16 -0
- msprobe/msaccucmp/algorithm_manager/algorithm_manager.py +464 -0
- msprobe/msaccucmp/algorithm_manager/algorithm_parameter.py +42 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_AccumulatedRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_CosineSimilarity.py +58 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_KullbackLeiblerDivergence.py +84 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxAbsoluteError.py +41 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanAbsoluteError.py +41 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanRelativeError.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RelativeEuclideanDistance.py +46 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RootMeanSquareError.py +40 -0
- msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_StandardDeviation.py +47 -0
- msprobe/msaccucmp/cmp_utils/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/common.py +113 -0
- msprobe/msaccucmp/cmp_utils/constant/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/constant/compare_error.py +81 -0
- msprobe/msaccucmp/cmp_utils/constant/const_manager.py +530 -0
- msprobe/msaccucmp/cmp_utils/file_utils.py +497 -0
- msprobe/msaccucmp/cmp_utils/log.py +257 -0
- msprobe/msaccucmp/cmp_utils/multi_process/__init__.py +16 -0
- msprobe/msaccucmp/cmp_utils/multi_process/multi_convert_process.py +140 -0
- msprobe/msaccucmp/cmp_utils/multi_process/progress.py +78 -0
- msprobe/msaccucmp/cmp_utils/path_check.py +274 -0
- msprobe/msaccucmp/cmp_utils/reg_manager.py +98 -0
- msprobe/msaccucmp/cmp_utils/tlv_parse.py +279 -0
- msprobe/msaccucmp/cmp_utils/utils.py +356 -0
- msprobe/msaccucmp/cmp_utils/utils_type.py +63 -0
- msprobe/msaccucmp/compare_vector.py +48 -0
- msprobe/msaccucmp/conversion/__init__.py +16 -0
- msprobe/msaccucmp/conversion/data_conversion.py +277 -0
- msprobe/msaccucmp/conversion/dtype_conversion.py +99 -0
- msprobe/msaccucmp/conversion/shape_format_conversion.py +477 -0
- msprobe/msaccucmp/conversion/tensor_conversion.py +369 -0
- msprobe/msaccucmp/dump_data_conversion.py +46 -0
- msprobe/msaccucmp/dump_parse/__init__.py +16 -0
- msprobe/msaccucmp/dump_parse/big_dump_data.py +317 -0
- msprobe/msaccucmp/dump_parse/dump.py +423 -0
- msprobe/msaccucmp/dump_parse/dump_data_object.py +322 -0
- msprobe/msaccucmp/dump_parse/dump_data_parser.py +436 -0
- msprobe/msaccucmp/dump_parse/dump_utils.py +246 -0
- msprobe/msaccucmp/dump_parse/ffts_parser.py +137 -0
- msprobe/msaccucmp/dump_parse/mapping.py +62 -0
- msprobe/msaccucmp/dump_parse/nano_dump_data.py +392 -0
- msprobe/msaccucmp/dump_parse/proto_dump_data.py +308 -0
- msprobe/msaccucmp/dump_parser.py +90 -0
- msprobe/msaccucmp/format_manager/__init__.py +16 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NCHW.py +53 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_ND.py +52 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NHWC.py +53 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_HWCN.py +47 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_NCHW.py +47 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_FRACTAL_Z.py +89 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NCHW.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NHWC.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_HWCN.py +43 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NCHW.py +48 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NHWC.py +43 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_FRACTAL_Z.py +87 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_NHWC.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_NCDHW.py +48 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_ND.py +44 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_FRACTAL_Z.py +87 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_HWCN.py +37 -0
- msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_NCHW.py +37 -0
- msprobe/msaccucmp/format_manager/format_manager.py +307 -0
- msprobe/msaccucmp/inplace_layer_process.py +186 -0
- msprobe/msaccucmp/msaccucmp.py +532 -0
- msprobe/msaccucmp/mscmp_advisor.py +128 -0
- msprobe/msaccucmp/overflow/__init__.py +16 -0
- msprobe/msaccucmp/overflow/overflow_analyse.py +305 -0
- msprobe/msaccucmp/overflow/overflow_detection.py +143 -0
- msprobe/msaccucmp/pytorch_cmp/__init__.py +16 -0
- msprobe/msaccucmp/pytorch_cmp/compare_pytorch.py +389 -0
- msprobe/msaccucmp/pytorch_cmp/hdf5_parser.py +377 -0
- msprobe/msaccucmp/pytorch_cmp/pytorch_dump_data.py +461 -0
- msprobe/msaccucmp/shape_conversion.py +41 -0
- msprobe/msaccucmp/vector_cmp/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/batch_compare.py +197 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/compare_detail.py +245 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/detail.py +182 -0
- msprobe/msaccucmp/vector_cmp/compare_detail/detail_writer.py +580 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_fusion_op.py +588 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_npu_vs_npu.py +339 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_result.py +326 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/compare_rule.py +156 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_op.py +204 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_rule_parser.py +635 -0
- msprobe/msaccucmp/vector_cmp/fusion_manager/quant_filter.py +187 -0
- msprobe/msaccucmp/vector_cmp/range_manager/__init__.py +16 -0
- msprobe/msaccucmp/vector_cmp/range_manager/range_manager.py +100 -0
- msprobe/msaccucmp/vector_cmp/range_manager/range_mode.py +94 -0
- msprobe/msaccucmp/vector_cmp/range_manager/select_mode.py +86 -0
- msprobe/msaccucmp/vector_cmp/vector_comparison.py +535 -0
- msprobe/msprobe.py +101 -130
- msprobe/overflow_check/__init__.py +15 -0
- msprobe/{nan_analyze → overflow_check}/analyzer.py +38 -27
- msprobe/{nan_analyze → overflow_check}/graph.py +28 -27
- msprobe/{nan_analyze → overflow_check}/utils.py +15 -14
- msprobe/pytorch/__init__.py +20 -14
- msprobe/pytorch/aclgraph_dump/__init__.py +45 -0
- msprobe/pytorch/aclgraph_dump/_meta.py +26 -0
- msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut.py → acc_check/acc_check.py} +50 -45
- msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut_utils.py → acc_check/acc_check_utils.py} +201 -30
- msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/data_generate.py +56 -16
- msprobe/pytorch/api_accuracy_checker/{run_ut/multi_run_ut.py → acc_check/multi_acc_check.py} +32 -47
- msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/run_overflow_check.py +19 -18
- msprobe/pytorch/api_accuracy_checker/common/config.py +22 -20
- msprobe/pytorch/api_accuracy_checker/common/utils.py +72 -13
- msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -11
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +23 -14
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +45 -32
- msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +12 -11
- msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +14 -12
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +14 -12
- msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +21 -19
- msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +14 -13
- msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +12 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +60 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +27 -16
- msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +13 -11
- msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +39 -18
- msprobe/pytorch/bench_functions/__init__.py +12 -11
- msprobe/pytorch/bench_functions/apply_adam.py +12 -11
- msprobe/pytorch/bench_functions/apply_adam_w.py +12 -11
- msprobe/pytorch/bench_functions/confusion_transpose.py +12 -11
- msprobe/pytorch/bench_functions/fast_gelu.py +12 -11
- msprobe/pytorch/bench_functions/group_norm_silu.py +12 -11
- msprobe/pytorch/bench_functions/layer_norm_eval.py +12 -11
- msprobe/pytorch/bench_functions/linear.py +12 -11
- msprobe/pytorch/bench_functions/matmul_backward.py +12 -11
- msprobe/pytorch/bench_functions/mish.py +12 -11
- msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +12 -11
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +12 -11
- msprobe/pytorch/bench_functions/rms_norm.py +12 -11
- msprobe/pytorch/bench_functions/rotary_mul.py +12 -11
- msprobe/pytorch/bench_functions/scaled_mask_softmax.py +12 -11
- msprobe/pytorch/bench_functions/sort_v2.py +12 -11
- msprobe/pytorch/bench_functions/swiglu.py +12 -11
- msprobe/pytorch/common/__init__.py +12 -11
- msprobe/pytorch/common/log.py +12 -11
- msprobe/pytorch/common/parse_json.py +12 -11
- msprobe/pytorch/common/utils.py +52 -19
- msprobe/pytorch/compare/distributed_compare.py +13 -13
- msprobe/pytorch/compare/match.py +12 -11
- msprobe/pytorch/compare/pt_compare.py +14 -20
- msprobe/pytorch/compare/pt_diff_analyze.py +12 -11
- msprobe/pytorch/compare/utils.py +12 -11
- msprobe/pytorch/{hook_module → dump/api_dump}/api_register.py +18 -16
- msprobe/pytorch/{hook_module → dump/api_dump}/hook_module.py +14 -13
- msprobe/pytorch/{hook_module → dump/api_dump}/pt_hook_manager.py +68 -23
- msprobe/pytorch/{hook_module → dump/api_dump}/register_optimizer_hook.py +13 -11
- msprobe/pytorch/{hook_module → dump/api_dump}/script_wrapper.py +17 -14
- msprobe/pytorch/{hook_module → dump/api_dump}/utils.py +12 -11
- msprobe/pytorch/{debugger → dump/debugger}/debugger_config.py +23 -38
- msprobe/pytorch/dump/debugger/precision_debugger.py +130 -0
- msprobe/pytorch/{function_factory.py → dump/function_factory.py} +12 -11
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +17 -13
- msprobe/pytorch/dump/module_dump/module_dump.py +16 -15
- msprobe/pytorch/dump/module_dump/{module_processer.py → module_processor.py} +54 -42
- msprobe/pytorch/dump/pt_config.py +128 -0
- msprobe/pytorch/{pytorch_service.py → dump/pytorch_service.py} +22 -21
- msprobe/pytorch/monitor/csv2tb.py +13 -11
- msprobe/pytorch/monitor/data_writers.py +13 -11
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +13 -11
- msprobe/pytorch/monitor/features.py +12 -11
- msprobe/pytorch/monitor/module_hook.py +67 -59
- msprobe/pytorch/monitor/module_metric.py +13 -11
- msprobe/pytorch/monitor/optimizer_collect.py +37 -35
- msprobe/pytorch/monitor/utils.py +13 -11
- msprobe/pytorch/monitor/visualizer.py +12 -11
- msprobe/pytorch/torchair_dump/__init__.py +17 -0
- msprobe/pytorch/torchair_dump/torchair_dump.py +114 -0
- msprobe/scripts/atb/config_example.json +10 -0
- msprobe/scripts/atb/load_atb_probe.sh +101 -0
- msprobe/scripts/atb/unload_atb_probe.sh +27 -0
- msprobe/scripts/build_msaccucmp.sh +186 -0
- msprobe/scripts/conf/help.info +6 -0
- msprobe/scripts/conf/version.info +3 -0
- msprobe/scripts/run_script/common.sh +538 -0
- msprobe/scripts/run_script/main_msaccucmp.sh +232 -0
- msprobe/visualization/__init__.py +12 -11
- msprobe/visualization/builder/__init__.py +12 -11
- msprobe/visualization/builder/graph_builder.py +45 -30
- msprobe/visualization/builder/graph_merger.py +53 -32
- msprobe/visualization/builder/msprobe_adapter.py +34 -44
- msprobe/visualization/compare/__init__.py +12 -11
- msprobe/visualization/compare/graph_comparator.py +63 -51
- msprobe/visualization/compare/mode_adapter.py +28 -113
- msprobe/visualization/db_utils.py +133 -22
- msprobe/visualization/graph/__init__.py +12 -11
- msprobe/visualization/graph/base_node.py +15 -27
- msprobe/visualization/graph/distributed_analyzer.py +97 -40
- msprobe/visualization/graph/graph.py +14 -16
- msprobe/visualization/graph/node_colors.py +34 -31
- msprobe/visualization/graph/node_op.py +12 -11
- msprobe/visualization/graph_service.py +580 -205
- msprobe/visualization/utils.py +278 -31
- tb_graph_ascend/secure_build.py +175 -0
- tb_graph_ascend/server/__init__.py +15 -0
- tb_graph_ascend/server/app/__init__.py +15 -0
- tb_graph_ascend/server/app/model/__init__.py +15 -0
- tb_graph_ascend/server/app/model/hierarchy.py +348 -0
- tb_graph_ascend/server/app/model/layout_hierarchy_model.py +69 -0
- tb_graph_ascend/server/app/model/match_nodes_model.py +573 -0
- tb_graph_ascend/server/app/repositories/__init__.py +15 -0
- tb_graph_ascend/server/app/repositories/graph_repo_base.py +32 -0
- tb_graph_ascend/server/app/repositories/graph_repo_db.py +879 -0
- tb_graph_ascend/server/app/repositories/graph_repo_vis.py +83 -0
- tb_graph_ascend/server/app/service/__init__.py +18 -0
- tb_graph_ascend/server/app/service/graph_service_base.py +158 -0
- tb_graph_ascend/server/app/service/graph_service_db.py +438 -0
- tb_graph_ascend/server/app/service/graph_service_factory.py +54 -0
- tb_graph_ascend/server/app/service/graph_service_vis.py +480 -0
- tb_graph_ascend/server/app/utils/__init__.py +15 -0
- tb_graph_ascend/server/app/utils/constant.py +80 -0
- tb_graph_ascend/server/app/utils/file_check_wrapper.py +46 -0
- tb_graph_ascend/server/app/utils/global_state.py +95 -0
- tb_graph_ascend/server/app/utils/graph_utils.py +661 -0
- tb_graph_ascend/server/app/utils/i18n.py +153 -0
- tb_graph_ascend/server/app/utils/request_method.py +46 -0
- tb_graph_ascend/server/app/views/__init__.py +15 -0
- tb_graph_ascend/server/app/views/graph_views.py +304 -0
- tb_graph_ascend/server/plugin.py +108 -0
- tb_graph_ascend/server/static/index.html +9250 -0
- tb_graph_ascend/server/static/index.js +21 -0
- tb_graph_ascend/setup.py +57 -0
- mindstudio_probe-8.3.3.dist-info/LICENSE +0 -201
- mindstudio_probe-8.3.3.dist-info/RECORD +0 -491
- mindstudio_probe-8.3.3.dist-info/entry_points.txt +0 -2
- mindstudio_probe-8.3.3.dist-info/top_level.txt +0 -1
- msprobe/CMakeLists.txt +0 -5
- msprobe/README.md +0 -203
- msprobe/core/advisor/advisor.py +0 -129
- msprobe/core/advisor/advisor_const.py +0 -58
- msprobe/core/advisor/advisor_result.py +0 -58
- msprobe/core/compare/find_first/data_processor.py +0 -35
- msprobe/core/compare/highlight.py +0 -390
- msprobe/core/data_dump/data_collector.py +0 -356
- msprobe/core/grad_probe/constant.py +0 -90
- msprobe/core/grad_probe/grad_compare.py +0 -187
- msprobe/core/grad_probe/utils.py +0 -105
- msprobe/core/kernel_dump/kernel_config.py +0 -33
- msprobe/docs/01.installation.md +0 -250
- msprobe/docs/02.config_introduction.md +0 -221
- msprobe/docs/03.config_examples.md +0 -281
- msprobe/docs/04.kernel_dump_PyTorch.md +0 -73
- msprobe/docs/05.data_dump_PyTorch.md +0 -518
- msprobe/docs/06.data_dump_MindSpore.md +0 -618
- msprobe/docs/07.accuracy_checker_PyTorch.md +0 -310
- msprobe/docs/09.accuracy_checker_MindSpore.md +0 -120
- msprobe/docs/10.accuracy_compare_PyTorch.md +0 -637
- msprobe/docs/11.accuracy_compare_MindSpore.md +0 -769
- msprobe/docs/12.overflow_check_PyTorch.md +0 -82
- msprobe/docs/13.overflow_check_MindSpore.md +0 -33
- msprobe/docs/14.data_parse_PyTorch.md +0 -282
- msprobe/docs/15.free_benchmarking_PyTorch.md +0 -169
- msprobe/docs/16.free_benchmarking_MindSpore.md +0 -159
- msprobe/docs/17.grad_probe.md +0 -205
- msprobe/docs/18.online_dispatch.md +0 -89
- msprobe/docs/19.monitor.md +0 -753
- msprobe/docs/20.monitor_performance_baseline.md +0 -52
- msprobe/docs/21.visualization_PyTorch.md +0 -519
- msprobe/docs/22.visualization_MindSpore.md +0 -515
- msprobe/docs/23.generate_operator_PyTorch.md +0 -107
- msprobe/docs/24.code_mapping_Mindspore.md +0 -29
- msprobe/docs/25.tool_function_introduction.md +0 -29
- msprobe/docs/26.data_dump_PyTorch_baseline.md +0 -48
- msprobe/docs/27.dump_json_instruction.md +0 -795
- msprobe/docs/28.debugger_save_instruction.md +0 -288
- msprobe/docs/28.kernel_dump_MindSpore.md +0 -69
- msprobe/docs/29.data_dump_MSAdapter.md +0 -235
- msprobe/docs/30.overflow_check_MSAdapter.md +0 -31
- msprobe/docs/31.config_check.md +0 -107
- msprobe/docs/32.ckpt_compare.md +0 -69
- msprobe/docs/33.generate_operator_MindSpore.md +0 -181
- msprobe/docs/34.RL_collect.md +0 -101
- msprobe/docs/35.nan_analyze.md +0 -73
- msprobe/docs/36.calculation_result_change.md +0 -75
- msprobe/docs/FAQ.md +0 -232
- msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +0 -146
- msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +0 -14
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +0 -33
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +0 -217
- msprobe/docs/img/BLOOM-7B_1.png +0 -0
- msprobe/docs/img/BLOOM-7B_2.png +0 -0
- msprobe/docs/img/BLOOM-7B_3.png +0 -0
- msprobe/docs/img/BLOOM-7B_4.png +0 -0
- msprobe/docs/img/GPT-3_1.png +0 -0
- msprobe/docs/img/GPT-3_2.png +0 -0
- msprobe/docs/img/GPT-3_3.png +0 -0
- msprobe/docs/img/GPT-3_4.png +0 -0
- msprobe/docs/img/GPT-3_5.png +0 -0
- msprobe/docs/img/GPT-3_6.png +0 -0
- msprobe/docs/img/GPT-3_7.png +0 -0
- msprobe/docs/img/GPT-3_8.png +0 -0
- msprobe/docs/img/YOLOV5S_1.png +0 -0
- msprobe/docs/img/YOLOV5S_2.png +0 -0
- msprobe/docs/img/accuracy_checking_details.png +0 -0
- msprobe/docs/img/accuracy_checking_result.png +0 -0
- msprobe/docs/img/api_precision_compare_details.png +0 -0
- msprobe/docs/img/api_precision_compare_result.png +0 -0
- msprobe/docs/img/auto_analyze_log.png +0 -0
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/compare_result_pkl.png +0 -0
- msprobe/docs/img/compare_result_pkl_md5.png.png +0 -0
- msprobe/docs/img/cpu_info.png +0 -0
- msprobe/docs/img/free_benchmark.png +0 -0
- msprobe/docs/img/free_benchmark_framework.png +0 -0
- msprobe/docs/img/grad_probe_image-1.png +0 -0
- msprobe/docs/img/grad_probe_image-2.png +0 -0
- msprobe/docs/img/grad_probe_image-3.png +0 -0
- msprobe/docs/img/grad_probe_image-4.png +0 -0
- msprobe/docs/img/grad_probe_image.png +0 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/module_compare.png +0 -0
- msprobe/docs/img/monitor/cpu_info.png +0 -0
- msprobe/docs/img/monitor/step_count_per_record.png +0 -0
- msprobe/docs/img/ms_dump.png +0 -0
- msprobe/docs/img/ms_layer.png +0 -0
- msprobe/docs/img/pt_dump.png +0 -0
- msprobe/docs/img/save_compare_result_sample.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
- msprobe/docs/img/visualization/proxy.png +0 -0
- msprobe/docs/img/visualization/tensorboard_1.png +0 -0
- msprobe/docs/img/visualization/tensorboard_2.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_browser_2.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/docs/visualization/GPTModel.png +0 -0
- msprobe/docs/visualization/ParallelMLP.png +0 -0
- msprobe/docs/visualization/layer_mapping_example.md +0 -132
- msprobe/docs/visualization/mapping.png +0 -0
- msprobe/docs/visualization/mapping1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +0 -59
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +0 -80
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +0 -330
- msprobe/docs/visualization/module_name.png +0 -0
- msprobe/docs/visualization/module_name1.png +0 -0
- msprobe/docs/visualization/no_mapping.png +0 -0
- msprobe/docs/visualization/no_mapping1.png +0 -0
- msprobe/docs/visualization/no_mapping_analyze.png +0 -0
- msprobe/docs/visualization/top_layer.png +0 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +0 -460
- msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +0 -2081
- msprobe/mindspore/code_mapping/bind.py +0 -283
- msprobe/mindspore/code_mapping/cmd_parser.py +0 -40
- msprobe/mindspore/code_mapping/graph.py +0 -49
- msprobe/mindspore/code_mapping/graph_parser.py +0 -211
- msprobe/mindspore/code_mapping/main.py +0 -24
- msprobe/mindspore/code_mapping/processor.py +0 -34
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +0 -111
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -52
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +0 -257
- msprobe/mindspore/free_benchmark/common/config.py +0 -27
- msprobe/mindspore/free_benchmark/common/handler_params.py +0 -31
- msprobe/mindspore/free_benchmark/common/utils.py +0 -100
- msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -638
- msprobe/mindspore/free_benchmark/handler/base_handler.py +0 -105
- msprobe/mindspore/free_benchmark/handler/check_handler.py +0 -55
- msprobe/mindspore/free_benchmark/handler/fix_handler.py +0 -51
- msprobe/mindspore/free_benchmark/handler/handler_factory.py +0 -36
- msprobe/mindspore/free_benchmark/perturbation/add_noise.py +0 -82
- msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +0 -45
- msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +0 -78
- msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +0 -77
- msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +0 -56
- msprobe/mindspore/free_benchmark/perturbation/no_change.py +0 -27
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +0 -46
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +0 -51
- msprobe/mindspore/grad_probe/global_context.py +0 -127
- msprobe/mindspore/grad_probe/grad_analyzer.py +0 -260
- msprobe/mindspore/grad_probe/grad_monitor.py +0 -42
- msprobe/mindspore/grad_probe/grad_stat_csv.py +0 -161
- msprobe/mindspore/grad_probe/hook.py +0 -115
- msprobe/mindspore/grad_probe/utils.py +0 -43
- msprobe/mindspore/mindtorch/__init__.py +0 -18
- msprobe/mindspore/ms_config.py +0 -153
- msprobe/mindspore/task_handler_factory.py +0 -44
- msprobe/nan_analyze/__init__.py +0 -14
- msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +0 -9
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +0 -480
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +0 -567
- msprobe/pytorch/debugger/precision_debugger.py +0 -181
- msprobe/pytorch/free_benchmark/__init__.py +0 -23
- msprobe/pytorch/free_benchmark/common/constant.py +0 -85
- msprobe/pytorch/free_benchmark/common/counter.py +0 -87
- msprobe/pytorch/free_benchmark/common/enums.py +0 -80
- msprobe/pytorch/free_benchmark/common/params.py +0 -152
- msprobe/pytorch/free_benchmark/common/utils.py +0 -143
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -215
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +0 -121
- msprobe/pytorch/free_benchmark/main.py +0 -123
- msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +0 -28
- msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +0 -56
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +0 -107
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +0 -121
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +0 -89
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +0 -87
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +0 -43
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +0 -60
- msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +0 -34
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +0 -252
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +0 -54
- msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +0 -40
- msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -45
- msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -181
- msprobe/pytorch/grad_probe/__init__.py +0 -0
- msprobe/pytorch/grad_probe/grad_monitor.py +0 -108
- msprobe/pytorch/grad_probe/grad_stat_csv.py +0 -160
- msprobe/pytorch/hook_module/__init__.py +0 -16
- msprobe/pytorch/hook_module/wrap_aten.py +0 -111
- msprobe/pytorch/online_dispatch/__init__.py +0 -19
- msprobe/pytorch/online_dispatch/compare.py +0 -224
- msprobe/pytorch/online_dispatch/dispatch.py +0 -332
- msprobe/pytorch/online_dispatch/dump_compare.py +0 -179
- msprobe/pytorch/online_dispatch/single_compare.py +0 -412
- msprobe/pytorch/online_dispatch/torch_ops_config.yaml +0 -58
- msprobe/pytorch/online_dispatch/utils.py +0 -158
- msprobe/pytorch/parse_tool/__init__.py +0 -0
- msprobe/pytorch/parse_tool/cli.py +0 -31
- msprobe/pytorch/parse_tool/lib/__init__.py +0 -0
- msprobe/pytorch/parse_tool/lib/compare.py +0 -253
- msprobe/pytorch/parse_tool/lib/config.py +0 -50
- msprobe/pytorch/parse_tool/lib/file_desc.py +0 -45
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +0 -97
- msprobe/pytorch/parse_tool/lib/parse_exception.py +0 -54
- msprobe/pytorch/parse_tool/lib/parse_tool.py +0 -161
- msprobe/pytorch/parse_tool/lib/utils.py +0 -299
- msprobe/pytorch/parse_tool/lib/visualization.py +0 -85
- msprobe/pytorch/pt_config.py +0 -299
- /msprobe/core/{grad_probe → dump}/__init__.py +0 -0
- /msprobe/{mindspore/code_mapping → core/dump/api_dump}/__init__.py +0 -0
- /msprobe/{mindspore/debugger → core/dump/data_dump}/__init__.py +0 -0
- /msprobe/{mindspore/exception_dump → core/dump/data_dump/data_processor}/__init__.py +0 -0
- /msprobe/{mindspore/free_benchmark → core/dump/debugger}/__init__.py +0 -0
- /msprobe/{mindspore/free_benchmark/common → core/dump/kernel_dump}/__init__.py +0 -0
- /msprobe/mindspore/{free_benchmark/handler → dump/debugger}/__init__.py +0 -0
- /msprobe/mindspore/{grad_probe → dump/dump_processor}/__init__.py +0 -0
- /msprobe/mindspore/{overflow_check → dump/exception_dump}/__init__.py +0 -0
- /msprobe/mindspore/{mindtorch → dump/mindtorch}/mindtorch_adaptor.py +0 -0
- /msprobe/{pytorch/api_accuracy_checker/run_ut → mindspore/dump/overflow_check}/__init__.py +0 -0
- /msprobe/{pytorch/debugger → mindspore/monitor}/__init__.py +0 -0
- /msprobe/{pytorch/free_benchmark/common → msaccucmp}/__init__.py +0 -0
- /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/.keep +0 -0
- /msprobe/pytorch/{free_benchmark/perturbed_layers → api_accuracy_checker/acc_check}/__init__.py +0 -0
- /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/torch_ut_setting.json +0 -0
- /msprobe/pytorch/{free_benchmark/perturbed_layers/npu → dump/api_dump}/__init__.py +0 -0
- /msprobe/pytorch/{hook_module → dump/api_dump}/support_wrap_ops.yaml +0 -0
- /msprobe/pytorch/{free_benchmark/result_handlers → dump/debugger}/__init__.py +0 -0
msprobe/core/monitor/db_utils.py
CHANGED
|
@@ -1,23 +1,25 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
16
|
+
|
|
15
17
|
from collections import OrderedDict
|
|
16
18
|
from collections.abc import Iterable
|
|
17
19
|
from typing import Dict, List, Optional, Set, Tuple
|
|
18
20
|
|
|
19
|
-
from msprobe.core.common.const import MonitorConst
|
|
20
|
-
from msprobe.core.common.db_manager import DBManager
|
|
21
|
+
from msprobe.core.common.const import MonitorConst, Const, Data2DBConst
|
|
22
|
+
from msprobe.core.common.db_manager import DBManager, TrendSql
|
|
21
23
|
|
|
22
24
|
|
|
23
25
|
def update_ordered_dict(main_dict: OrderedDict, new_list: List) -> OrderedDict:
|
|
@@ -28,151 +30,57 @@ def update_ordered_dict(main_dict: OrderedDict, new_list: List) -> OrderedDict:
|
|
|
28
30
|
return main_dict
|
|
29
31
|
|
|
30
32
|
|
|
31
|
-
def
|
|
32
|
-
|
|
33
|
-
if not isinstance(stats, Iterable):
|
|
33
|
+
def get_ordered_list(data: Iterable, order: Iterable) -> List[str]:
|
|
34
|
+
if not isinstance(data, Iterable) or not isinstance(order, Iterable):
|
|
34
35
|
return []
|
|
35
|
-
return [
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
class MonitorSql:
|
|
39
|
-
"""数据库表参数类"""
|
|
36
|
+
return [value for value in order if value in data]
|
|
40
37
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
)"""
|
|
38
|
+
def get_tags_from_target(target_name):
|
|
39
|
+
"""
|
|
40
|
+
full target name to tags
|
|
41
|
+
|
|
42
|
+
:param target_name: e.g. layers.22.self_attention.linear_qkv.layer_norm_weight
|
|
43
|
+
:return tags: set of tags
|
|
44
|
+
"""
|
|
45
|
+
tags = set()
|
|
46
|
+
parts = target_name.split(Const.SEP)[::-1]
|
|
47
|
+
max_idx = len(parts) - 1
|
|
48
|
+
for i, part in enumerate(parts):
|
|
49
|
+
if not part.isdigit():
|
|
50
|
+
tags.add((part, Data2DBConst.TAG_MODULE))
|
|
51
|
+
continue
|
|
52
|
+
if i == max_idx or parts[i+1].isdigit(): # 确保数字前面有非数字部分, 组成layer标签
|
|
53
|
+
continue
|
|
54
|
+
layer_tag = (f"{parts[i+1]}.{parts[i]}", Data2DBConst.TAG_LAYER)
|
|
55
|
+
tags.add(layer_tag)
|
|
56
|
+
return tags
|
|
61
57
|
|
|
62
|
-
@staticmethod
|
|
63
|
-
def get_metric_mapping_sql():
|
|
64
|
-
return """
|
|
65
|
-
SELECT m.metric_id, m.metric_name, GROUP_CONCAT(ms.stat_name) as stats
|
|
66
|
-
FROM monitoring_metrics m
|
|
67
|
-
LEFT JOIN metric_stats ms ON m.metric_id = ms.metric_id
|
|
68
|
-
GROUP BY m.metric_id
|
|
69
|
-
"""
|
|
70
|
-
|
|
71
|
-
@staticmethod
|
|
72
|
-
def create_metric_stats_table():
|
|
73
|
-
"""指标统计表"""
|
|
74
|
-
return """
|
|
75
|
-
CREATE TABLE IF NOT EXISTS metric_stats (
|
|
76
|
-
metric_id INTEGER NOT NULL,
|
|
77
|
-
stat_name TEXT NOT NULL,
|
|
78
|
-
PRIMARY KEY (metric_id, stat_name),
|
|
79
|
-
FOREIGN KEY (metric_id) REFERENCES monitoring_metrics(metric_id)
|
|
80
|
-
) WITHOUT ROWID"""
|
|
81
|
-
|
|
82
|
-
@staticmethod
|
|
83
|
-
def create_global_stat_table():
|
|
84
|
-
return """
|
|
85
|
-
CREATE TABLE IF NOT EXISTS global_stats (
|
|
86
|
-
stat_name TEXT PRIMARY KEY,
|
|
87
|
-
stat_value INTEGER NOT NULL
|
|
88
|
-
) WITHOUT ROWID"""
|
|
89
|
-
|
|
90
|
-
@classmethod
|
|
91
|
-
def get_table_definition(cls, table_name=""):
|
|
92
|
-
"""
|
|
93
|
-
获取表定义SQL
|
|
94
|
-
:param table_name: 表名
|
|
95
|
-
:return: 建表SQL语句
|
|
96
|
-
:raises ValueError: 当表名不存在时
|
|
97
|
-
"""
|
|
98
|
-
table_creators = {
|
|
99
|
-
"monitoring_targets": cls.create_monitoring_targets_table,
|
|
100
|
-
"monitoring_metrics": cls.create_monitoring_metrics_table,
|
|
101
|
-
"metric_stats": cls.create_metric_stats_table,
|
|
102
|
-
"global_stats": cls.create_global_stat_table,
|
|
103
|
-
}
|
|
104
|
-
if not table_name:
|
|
105
|
-
return [table_creators.get(table, lambda x: "")() for table in table_creators]
|
|
106
|
-
if table_name not in table_creators:
|
|
107
|
-
raise ValueError(f"Unsupported table name: {table_name}")
|
|
108
|
-
return table_creators[table_name]()
|
|
109
|
-
|
|
110
|
-
@classmethod
|
|
111
|
-
def get_metric_table_definition(cls, table_name, stats, patition=None):
|
|
112
|
-
stat_columns = [f"{stat} REAL DEFAULT NULL" for stat in stats]
|
|
113
|
-
if patition and len(patition) == 2:
|
|
114
|
-
partition_start_step, partition_end_step = patition
|
|
115
|
-
step_column = f"""step INTEGER NOT NULL CHECK(step BETWEEN {partition_start_step}
|
|
116
|
-
AND {partition_end_step}),"""
|
|
117
|
-
else:
|
|
118
|
-
step_column = "step INTEGER NOT NULL"
|
|
119
|
-
create_sql = f"""
|
|
120
|
-
CREATE TABLE {table_name} (
|
|
121
|
-
rank INTEGER NOT NULL,
|
|
122
|
-
{step_column}
|
|
123
|
-
target_id INTEGER NOT NULL,
|
|
124
|
-
{', '.join(stat_columns)},
|
|
125
|
-
PRIMARY KEY (rank, step, target_id),
|
|
126
|
-
FOREIGN KEY (target_id) REFERENCES monitoring_targets(target_id)
|
|
127
|
-
) WITHOUT ROWID
|
|
128
|
-
"""
|
|
129
|
-
return create_sql
|
|
130
|
-
|
|
131
58
|
|
|
132
59
|
class MonitorDB:
|
|
133
60
|
"""Main class for monitoring database operations"""
|
|
134
61
|
|
|
135
|
-
def __init__(self, db_path: str
|
|
62
|
+
def __init__(self, db_path: str):
|
|
136
63
|
self.db_path = db_path
|
|
137
64
|
self.db_manager = DBManager(db_path)
|
|
138
|
-
self.step_partition_size = step_partition_size
|
|
139
|
-
|
|
140
|
-
def get_metric_table_name(self, metric_id: int, step: int) -> str:
|
|
141
|
-
"""Generate metric table name"""
|
|
142
|
-
step_start = (
|
|
143
|
-
step // self.step_partition_size) * self.step_partition_size
|
|
144
|
-
step_end = step_start + self.step_partition_size - 1
|
|
145
|
-
return f"metric_{metric_id}_step_{step_start}_{step_end}", step_start, step_end
|
|
146
65
|
|
|
147
66
|
def init_schema(self) -> None:
|
|
148
67
|
"""Initialize database schema"""
|
|
149
|
-
self.db_manager.execute_multi_sql(
|
|
150
|
-
|
|
151
|
-
# Insert initial global stats
|
|
152
|
-
global_stats = [
|
|
153
|
-
('max_rank', 0),
|
|
154
|
-
('min_step', 0),
|
|
155
|
-
('max_step', 0),
|
|
156
|
-
('step_partition_size', self.step_partition_size)
|
|
157
|
-
]
|
|
158
|
-
self.db_manager.insert_data("global_stats", global_stats)
|
|
68
|
+
self.db_manager.execute_multi_sql(TrendSql.get_table_definition())
|
|
159
69
|
|
|
160
70
|
def insert_dimensions(
|
|
161
71
|
self,
|
|
162
|
-
targets: OrderedDict,
|
|
163
|
-
metrics: Set[str],
|
|
164
|
-
metric_stats: Dict[str, Set[str]],
|
|
165
|
-
min_step: Optional[int] = None,
|
|
166
|
-
max_step: int = None,
|
|
72
|
+
targets: Dict[str,OrderedDict],
|
|
167
73
|
) -> None:
|
|
168
74
|
"""Insert dimension data into database"""
|
|
169
75
|
# Insert targets
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
76
|
+
metrics = get_ordered_list(set(targets.keys()), MonitorConst.METRICS_TRENDVIS_SUPPORTED)
|
|
77
|
+
for metric_name in metrics:
|
|
78
|
+
self.db_manager.insert_data(
|
|
79
|
+
"monitoring_targets",
|
|
80
|
+
[(name, vpp_stage, micro_step)
|
|
81
|
+
for (name, vpp_stage, micro_step) in targets[metric_name]],
|
|
82
|
+
key_list=["target_name", "vpp_stage", "micro_step"]
|
|
83
|
+
)
|
|
176
84
|
|
|
177
85
|
# Insert metrics
|
|
178
86
|
self.db_manager.insert_data(
|
|
@@ -181,79 +89,53 @@ class MonitorDB:
|
|
|
181
89
|
key_list=["metric_name"]
|
|
182
90
|
)
|
|
183
91
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
metric_id = self._get_metric_id(metric)
|
|
187
|
-
ordered_stats = get_ordered_stats(stats)
|
|
188
|
-
|
|
189
|
-
self.db_manager.insert_data(
|
|
190
|
-
"metric_stats",
|
|
191
|
-
[(metric_id, stat) for stat in ordered_stats],
|
|
192
|
-
key_list=["metric_id", "stat_name"]
|
|
193
|
-
)
|
|
194
|
-
|
|
195
|
-
# Create metric tables for each partition
|
|
196
|
-
if min_step is not None and max_step is not None:
|
|
197
|
-
first_partition = min_step // self.step_partition_size
|
|
198
|
-
last_partition = max_step // self.step_partition_size
|
|
199
|
-
|
|
200
|
-
for partition in range(first_partition, last_partition + 1):
|
|
201
|
-
step_start = partition * self.step_partition_size
|
|
202
|
-
self.create_metric_table(
|
|
203
|
-
metric_id, step_start, ordered_stats)
|
|
204
|
-
|
|
205
|
-
def insert_rows(self, table_name, rows):
|
|
206
|
-
if not self.db_manager.table_exists(table_name):
|
|
207
|
-
raise RuntimeError(f"{table_name} not existed in {self.db_path}")
|
|
92
|
+
def insert_rows(self, rows):
|
|
93
|
+
table_name = "trend_data"
|
|
208
94
|
inserted = self.db_manager.insert_data(table_name, rows)
|
|
209
95
|
inserted = 0 if inserted is None else inserted
|
|
210
96
|
return inserted
|
|
211
97
|
|
|
212
|
-
def
|
|
213
|
-
"""
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
)
|
|
218
|
-
if self.db_manager.table_exists(table_name):
|
|
219
|
-
return table_name
|
|
220
|
-
|
|
221
|
-
create_sql = MonitorSql.get_metric_table_definition(
|
|
222
|
-
table_name, stats, patition=(
|
|
223
|
-
partition_start_step, partition_end_step)
|
|
98
|
+
def init_global_stats_data(self, config: Dict):
|
|
99
|
+
"""初始化全局统计表数据"""
|
|
100
|
+
# 准备插入数据
|
|
101
|
+
self.db_manager.execute_sql(
|
|
102
|
+
TrendSql.create_global_stats_table(config)
|
|
224
103
|
)
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
("max_step", max_step)
|
|
234
|
-
]
|
|
235
|
-
for stat_name, value in updates:
|
|
236
|
-
if not value:
|
|
237
|
-
continue
|
|
238
|
-
self.db_manager.update_data(
|
|
239
|
-
table_name="global_stats",
|
|
240
|
-
updates={"stat_value": value},
|
|
241
|
-
where={"stat_name": stat_name}
|
|
242
|
-
)
|
|
104
|
+
|
|
105
|
+
column_values = []
|
|
106
|
+
for _, column_value in config.items():
|
|
107
|
+
if isinstance(column_value, (list, set)):
|
|
108
|
+
column_values.append(",".join(column_value))
|
|
109
|
+
elif isinstance(column_value, (int, float)):
|
|
110
|
+
column_values.append(column_value)
|
|
111
|
+
self.db_manager.insert_data("global_stats", [column_values,])
|
|
243
112
|
|
|
244
113
|
def get_metric_mapping(self) -> Dict[str, Tuple[int, List[str]]]:
|
|
245
|
-
"""
|
|
246
|
-
|
|
247
|
-
|
|
114
|
+
"""获取metric名称到ID的映射及对应的统计信息"""
|
|
115
|
+
metric_results = self.db_manager.execute_sql(
|
|
116
|
+
TrendSql.get_metric_mapping_sql()
|
|
117
|
+
)
|
|
118
|
+
global_stats_result = self.db_manager.execute_sql(
|
|
119
|
+
TrendSql.get_global_stats_sql()
|
|
248
120
|
)
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
121
|
+
if not global_stats_result:
|
|
122
|
+
return {}
|
|
123
|
+
|
|
124
|
+
global_stats_config = global_stats_result[0]
|
|
125
|
+
# 构建映射字典
|
|
126
|
+
metric_mapping = {}
|
|
127
|
+
for row in metric_results:
|
|
128
|
+
metric_name = row["metric_name"]
|
|
129
|
+
metric_id = row["metric_id"]
|
|
130
|
+
|
|
131
|
+
# 从global_stats中获取该metric对应的统计信息
|
|
132
|
+
stats_value = global_stats_config.get(metric_name)
|
|
133
|
+
ordered_stats = []
|
|
134
|
+
if stats_value:
|
|
135
|
+
stats_list = stats_value.split(",") if isinstance(stats_value, str) else []
|
|
136
|
+
ordered_stats = get_ordered_list(stats_list, MonitorConst.OP_MONVIS_SUPPORTED)
|
|
137
|
+
metric_mapping[metric_name] = (metric_id, ordered_stats)
|
|
138
|
+
return metric_mapping
|
|
257
139
|
|
|
258
140
|
def get_target_mapping(self) -> Dict[Tuple[str, int, int], int]:
|
|
259
141
|
"""Get target mapping dictionary"""
|
|
@@ -268,6 +150,9 @@ class MonitorDB:
|
|
|
268
150
|
for row in results
|
|
269
151
|
}
|
|
270
152
|
|
|
153
|
+
def create_trend_data(self, stats):
|
|
154
|
+
self.db_manager.execute_sql(TrendSql.create_trend_table(stats))
|
|
155
|
+
|
|
271
156
|
def _get_metric_id(self, metric_name: str) -> Optional[int]:
|
|
272
157
|
"""Get metric ID by name"""
|
|
273
158
|
result = self.db_manager.select_data(
|
|
@@ -276,3 +161,53 @@ class MonitorDB:
|
|
|
276
161
|
where={"metric_name": metric_name}
|
|
277
162
|
)
|
|
278
163
|
return result[0]["metric_id"] if result else None
|
|
164
|
+
|
|
165
|
+
def extract_tags_from_processed_targets(self, targets:dict , metric_id_dict: dict, target_dict: dict):
|
|
166
|
+
"""从已处理的targets中提取标签并建立映射关系"""
|
|
167
|
+
|
|
168
|
+
for metric_name in targets:
|
|
169
|
+
tag_info_to_id = {}
|
|
170
|
+
mapping_data = []
|
|
171
|
+
new_tags_data = set()
|
|
172
|
+
target_to_tags = {}
|
|
173
|
+
metric_id = metric_id_dict.get(metric_name, [None,])[0]
|
|
174
|
+
for target in targets[metric_name]:
|
|
175
|
+
target_id = target_dict[target]
|
|
176
|
+
name, _, _ = target
|
|
177
|
+
tags = get_tags_from_target(name)
|
|
178
|
+
new_tags_data = new_tags_data.union(tags)
|
|
179
|
+
target_to_tags[target_id] = tags
|
|
180
|
+
|
|
181
|
+
# 插入新标签
|
|
182
|
+
if not new_tags_data:
|
|
183
|
+
continue
|
|
184
|
+
self.db_manager.insert_data(
|
|
185
|
+
"monitoring_tags",
|
|
186
|
+
[(tag_name, category, metric_id)
|
|
187
|
+
for tag_name, category in new_tags_data],
|
|
188
|
+
key_list=["tag_name", "category", "metric_id"]
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
# 重新获取tag_id用于映射
|
|
192
|
+
tag_records = self.db_manager.select_data(
|
|
193
|
+
"monitoring_tags",
|
|
194
|
+
columns=["tag_id", "tag_name", "category", "metric_id"]
|
|
195
|
+
)
|
|
196
|
+
for row in tag_records:
|
|
197
|
+
tag_info_to_id[(row["tag_name"], row["category"],
|
|
198
|
+
row["metric_id"])] = row["tag_id"]
|
|
199
|
+
|
|
200
|
+
for target_id, tags_list in target_to_tags.items():
|
|
201
|
+
for tag_name, category in tags_list:
|
|
202
|
+
tag_id = tag_info_to_id.get(
|
|
203
|
+
(tag_name, category, metric_id))
|
|
204
|
+
if tag_id:
|
|
205
|
+
mapping_data.append((target_id, tag_id))
|
|
206
|
+
|
|
207
|
+
# 插入映射关系
|
|
208
|
+
if mapping_data:
|
|
209
|
+
self.db_manager.insert_data(
|
|
210
|
+
"tag_target_mapping",
|
|
211
|
+
mapping_data,
|
|
212
|
+
key_list=["target_id", "tag_id"]
|
|
213
|
+
)
|
msprobe/core/monitor/utils.py
CHANGED
|
@@ -1,17 +1,19 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
3
4
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# You may obtain a copy of
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
7
8
|
#
|
|
8
|
-
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
9
10
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# See the
|
|
14
|
-
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
16
|
+
|
|
15
17
|
from collections import namedtuple
|
|
16
18
|
from datetime import timezone, timedelta
|
|
17
19
|
from functools import wraps
|
|
@@ -24,7 +26,6 @@ from msprobe.core.common.log import logger
|
|
|
24
26
|
from msprobe.core.common.utils import is_int
|
|
25
27
|
from msprobe.core.common.file_utils import check_file_or_directory_path, recursive_chmod
|
|
26
28
|
|
|
27
|
-
|
|
28
29
|
beijing_tz = timezone(timedelta(hours=8))
|
|
29
30
|
MVResult = namedtuple('MVResult', ("exp_avg", "exp_avg_sq", "update", "ratio"))
|
|
30
31
|
|
|
@@ -96,7 +97,7 @@ def validate_targets(targets):
|
|
|
96
97
|
raise TypeError('key of targets should be module_name[str] in config.json')
|
|
97
98
|
if not isinstance(field, dict):
|
|
98
99
|
raise TypeError('values of targets should be cared filed e.g. {"input": "tensor"} in config.json')
|
|
99
|
-
|
|
100
|
+
|
|
100
101
|
|
|
101
102
|
def validate_l2_targets(targets):
|
|
102
103
|
if not isinstance(targets, dict):
|
|
@@ -114,15 +115,15 @@ def validate_l2_targets(targets):
|
|
|
114
115
|
def validate_recording_l2_features(recording_l2_features):
|
|
115
116
|
if not isinstance(recording_l2_features, bool):
|
|
116
117
|
raise TypeError("recording_l2_features should be a bool")
|
|
117
|
-
|
|
118
|
+
|
|
118
119
|
|
|
119
120
|
def validate_sa_order(sa_order):
|
|
120
121
|
if isinstance(sa_order, str):
|
|
121
122
|
sa_order = sa_order.replace(' ', '')
|
|
122
123
|
if sa_order not in MonitorConst.SA_ORDERS:
|
|
123
124
|
raise TypeError(f'sa_order must be in {MonitorConst.SA_ORDERS}, got {sa_order}')
|
|
124
|
-
|
|
125
|
-
|
|
125
|
+
|
|
126
|
+
|
|
126
127
|
def validate_print_struct(print_struct):
|
|
127
128
|
if not isinstance(print_struct, bool):
|
|
128
129
|
raise TypeError("print_struct should be a bool")
|
|
@@ -246,7 +247,7 @@ def validate_config(config):
|
|
|
246
247
|
|
|
247
248
|
recording_l2_features = config.get("recording_l2_features", False)
|
|
248
249
|
validate_recording_l2_features(recording_l2_features)
|
|
249
|
-
|
|
250
|
+
|
|
250
251
|
sa_order = config.get("sa_order", "s,b,h,d")
|
|
251
252
|
validate_sa_order(sa_order)
|
|
252
253
|
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
|
|
2
|
+
monitor_v2: next-generation monitoring core for unifying
|
|
3
|
+
PyTorch / MindSpore monitor logic behind a single, clean abstraction.
|
|
4
|
+
|
|
5
|
+
Goal:
|
|
6
|
+
- keep production behavior stable while iterating;
|
|
7
|
+
- migrate mature logic from existing monitors in small steps;
|
|
8
|
+
- enforce clearer layering and performance constraints.
|
|
9
|
+
|
|
10
|
+
Current structure:
|
|
11
|
+
base.py
|
|
12
|
+
- BaseMonitorV2: minimal shared monitor scaffold.
|
|
13
|
+
trainer.py
|
|
14
|
+
- TrainerMonitorV2: orchestrator for step gating and writer integration.
|
|
15
|
+
factory.py
|
|
16
|
+
- MonitorFactory: registry of framework-specific monitor classes.
|
|
17
|
+
writer.py
|
|
18
|
+
- CSVWriterV2: CSV output for per-step monitor data.
|
|
19
|
+
"""
|
|
20
|
+
from .trainer import TrainerMonitorV2
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# -------------------------------------------------------------------------
|
|
2
|
+
# This file is part of the MindStudio project.
|
|
3
|
+
# Copyright (c) 2025 Huawei Technologies Co.,Ltd.
|
|
4
|
+
#
|
|
5
|
+
# MindStudio is licensed under Mulan PSL v2.
|
|
6
|
+
# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
7
|
+
# You may obtain a copy of Mulan PSL v2 at:
|
|
8
|
+
#
|
|
9
|
+
# http://license.coscl.org.cn/MulanPSL2
|
|
10
|
+
#
|
|
11
|
+
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
12
|
+
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
13
|
+
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
14
|
+
# See the Mulan PSL v2 for more details.
|
|
15
|
+
# -------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from abc import ABC, abstractmethod
|
|
20
|
+
from typing import Any, Dict, Iterable, List, Optional
|
|
21
|
+
|
|
22
|
+
from msprobe.core.common.log import logger
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class BaseMonitorV2(ABC):
|
|
26
|
+
"""
|
|
27
|
+
Minimal base class for v2 monitors.
|
|
28
|
+
|
|
29
|
+
Compared to core.monitor.BaseMonitor, this class is intentionally kept
|
|
30
|
+
small so that higher-level policies live in dedicated components
|
|
31
|
+
(framework, trainer, backends) instead of being mixed into the base.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
DEFAULT_OPS = ["min", "max", "mean", "norm", "nans"]
|
|
35
|
+
|
|
36
|
+
def __init__(self) -> None:
|
|
37
|
+
self.config: Dict[str, Any] = {}
|
|
38
|
+
self._context: Dict[str, Any] = {}
|
|
39
|
+
self._ops: List[str] = []
|
|
40
|
+
# Common per-step rows buffer; subclasses can populate this and
|
|
41
|
+
# rely on the default collect() implementation.
|
|
42
|
+
self._rows: List[Dict[str, Any]] = []
|
|
43
|
+
|
|
44
|
+
def set_config(self, config: Dict[str, Any]) -> None:
|
|
45
|
+
self.config = config
|
|
46
|
+
self._configure_ops(config.get("ops"))
|
|
47
|
+
|
|
48
|
+
def set_context(self, **context: Any) -> None:
|
|
49
|
+
self._context.update(context)
|
|
50
|
+
|
|
51
|
+
def collect(self) -> Optional[Dict[str, Any]]:
|
|
52
|
+
"""
|
|
53
|
+
Default collect implementation used by most monitors:
|
|
54
|
+
return per-step rows and reset the internal buffer.
|
|
55
|
+
"""
|
|
56
|
+
if not self._rows:
|
|
57
|
+
return None
|
|
58
|
+
data = {"rows": self._rows}
|
|
59
|
+
self._rows = []
|
|
60
|
+
return data
|
|
61
|
+
|
|
62
|
+
@abstractmethod
|
|
63
|
+
def start(self, *args: Any, **kwargs: Any) -> None:
|
|
64
|
+
...
|
|
65
|
+
|
|
66
|
+
@abstractmethod
|
|
67
|
+
def stop(self) -> None:
|
|
68
|
+
...
|
|
69
|
+
|
|
70
|
+
def _configure_ops(self, ops: Optional[Iterable[str]] = None) -> None:
|
|
71
|
+
if ops is None or not isinstance(ops, Iterable):
|
|
72
|
+
self._ops = list(self.DEFAULT_OPS)
|
|
73
|
+
return
|
|
74
|
+
requested = [ops] if isinstance(ops, str) else list(ops)
|
|
75
|
+
invalid_ops = [op for op in requested if op not in self.DEFAULT_OPS]
|
|
76
|
+
if invalid_ops:
|
|
77
|
+
logger.warning(
|
|
78
|
+
f"[monitor_v2] Unsupported ops {invalid_ops}; supported: {self.DEFAULT_OPS}."
|
|
79
|
+
)
|
|
80
|
+
filtered = [op for op in requested if op in self.DEFAULT_OPS]
|
|
81
|
+
if not filtered:
|
|
82
|
+
logger.warning("[monitor_v2] No valid ops provided; fallback to default ops.")
|
|
83
|
+
self._ops = filtered if filtered else list(self.DEFAULT_OPS)
|