mindstudio-probe 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/METADATA +7 -6
- mindstudio_probe-1.2.1.dist-info/RECORD +396 -0
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/WHEEL +1 -1
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/entry_points.txt +0 -1
- msprobe/CMakeLists.txt +5 -0
- msprobe/README.md +51 -20
- msprobe/config.json +2 -3
- msprobe/core/advisor/advisor.py +8 -3
- msprobe/core/common/const.py +264 -15
- msprobe/core/common/exceptions.py +27 -3
- msprobe/core/common/file_utils.py +176 -26
- msprobe/core/common/inplace_op_checker.py +15 -0
- msprobe/core/common/inplace_ops.yaml +3 -0
- msprobe/core/common/log.py +27 -9
- msprobe/core/common/utils.py +204 -77
- msprobe/core/common_config.py +49 -14
- msprobe/core/compare/acc_compare.py +274 -198
- msprobe/core/compare/check.py +32 -33
- msprobe/core/compare/compare_cli.py +32 -14
- msprobe/core/compare/highlight.py +283 -127
- msprobe/core/compare/layer_mapping/__init__.py +19 -0
- msprobe/core/compare/layer_mapping/data_scope_parser.py +246 -0
- msprobe/core/compare/layer_mapping/layer_mapping.py +249 -0
- msprobe/core/compare/layer_mapping/postprocess_pass.py +95 -0
- msprobe/core/compare/merge_result/merge_result.py +380 -0
- msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
- msprobe/core/compare/multiprocessing_compute.py +2 -2
- msprobe/core/compare/npy_compare.py +135 -144
- msprobe/core/compare/utils.py +419 -274
- msprobe/core/data_dump/data_collector.py +60 -28
- msprobe/core/data_dump/data_processor/base.py +84 -36
- msprobe/core/data_dump/data_processor/factory.py +5 -3
- msprobe/core/data_dump/data_processor/mindspore_processor.py +152 -18
- msprobe/core/data_dump/data_processor/pytorch_processor.py +267 -110
- msprobe/core/data_dump/json_writer.py +29 -1
- msprobe/core/data_dump/scope.py +119 -39
- msprobe/core/grad_probe/constant.py +27 -13
- msprobe/core/grad_probe/grad_compare.py +18 -1
- msprobe/core/grad_probe/utils.py +30 -2
- msprobe/core/overflow_check/abnormal_scene.py +189 -0
- msprobe/core/overflow_check/api_info.py +55 -0
- msprobe/core/overflow_check/checker.py +138 -0
- msprobe/core/overflow_check/filter.py +157 -0
- msprobe/core/overflow_check/ignore_rules.yaml +55 -0
- msprobe/core/overflow_check/level.py +22 -0
- msprobe/core/overflow_check/utils.py +28 -0
- msprobe/docs/01.installation.md +96 -7
- msprobe/docs/02.config_introduction.md +50 -23
- msprobe/docs/03.config_examples.md +2 -9
- msprobe/docs/04.kernel_dump_PyTorch.md +73 -0
- msprobe/docs/05.data_dump_PyTorch.md +93 -61
- msprobe/docs/06.data_dump_MindSpore.md +200 -95
- msprobe/docs/07.accuracy_checker_PyTorch.md +28 -28
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +1 -6
- msprobe/docs/09.accuracy_checker_MindSpore.md +44 -8
- msprobe/docs/10.accuracy_compare_PyTorch.md +114 -50
- msprobe/docs/11.accuracy_compare_MindSpore.md +340 -48
- msprobe/docs/12.overflow_check_PyTorch.md +2 -2
- msprobe/docs/13.overflow_check_MindSpore.md +6 -6
- msprobe/docs/15.free_benchmarking_PyTorch.md +4 -5
- msprobe/docs/16.free_benchmarking_MindSpore.md +56 -37
- msprobe/docs/17.grad_probe.md +5 -6
- msprobe/docs/19.monitor.md +561 -0
- msprobe/docs/20.monitor_performance_baseline.md +52 -0
- msprobe/docs/21.visualization_PyTorch.md +466 -0
- msprobe/docs/22.visualization_MindSpore.md +481 -0
- msprobe/docs/23.generate_operator_PyTorch.md +107 -0
- msprobe/docs/24.code_mapping_Mindspore.md +28 -0
- msprobe/docs/25.tool_function_introduction.md +29 -0
- msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
- msprobe/docs/27.dump_json_instruction.md +521 -0
- msprobe/docs/FAQ.md +29 -2
- msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +211 -0
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/monitor/cpu_info.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
- msprobe/docs/img/visualization/tensorboard_1.png +0 -0
- msprobe/docs/img/visualization/tensorboard_2.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_browser_2.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/docs/visualization/GPTModel.png +0 -0
- msprobe/docs/visualization/ParallelMLP.png +0 -0
- msprobe/docs/visualization/layer_mapping_example.md +132 -0
- msprobe/docs/visualization/mapping.png +0 -0
- msprobe/docs/visualization/mapping1.png +0 -0
- msprobe/docs/visualization/module_name.png +0 -0
- msprobe/docs/visualization/module_name1.png +0 -0
- msprobe/docs/visualization/no_mapping.png +0 -0
- msprobe/docs/visualization/no_mapping1.png +0 -0
- msprobe/docs/visualization/no_mapping_analyze.png +0 -0
- msprobe/docs/visualization/top_layer.png +0 -0
- msprobe/mindspore/__init__.py +25 -0
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -151
- msprobe/mindspore/api_accuracy_checker/api_info.py +21 -6
- msprobe/mindspore/api_accuracy_checker/api_runner.py +43 -18
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +21 -7
- msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +77 -0
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +64 -1
- msprobe/mindspore/api_accuracy_checker/compute_element.py +64 -31
- msprobe/mindspore/api_accuracy_checker/data_manager.py +301 -0
- msprobe/mindspore/api_accuracy_checker/main.py +28 -3
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +212 -0
- msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +60 -0
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +22 -5
- msprobe/mindspore/api_accuracy_checker/utils.py +34 -17
- msprobe/mindspore/cell_processor.py +33 -12
- msprobe/mindspore/code_mapping/bind.py +264 -0
- msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
- msprobe/mindspore/code_mapping/graph.py +49 -0
- msprobe/mindspore/code_mapping/graph_parser.py +226 -0
- msprobe/mindspore/code_mapping/main.py +24 -0
- msprobe/mindspore/code_mapping/processor.py +34 -0
- msprobe/mindspore/common/const.py +35 -13
- msprobe/mindspore/common/log.py +5 -9
- msprobe/mindspore/common/utils.py +88 -4
- msprobe/mindspore/compare/distributed_compare.py +22 -24
- msprobe/mindspore/compare/ms_compare.py +333 -268
- msprobe/mindspore/compare/ms_graph_compare.py +95 -52
- msprobe/mindspore/debugger/debugger_config.py +7 -1
- msprobe/mindspore/debugger/precision_debugger.py +87 -12
- msprobe/mindspore/dump/dump_tool_factory.py +3 -1
- msprobe/mindspore/dump/hook_cell/api_registry.py +95 -18
- msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
- msprobe/mindspore/dump/hook_cell/primitive_hooks.py +45 -30
- msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +36 -1
- msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
- msprobe/mindspore/dump/jit_dump.py +17 -5
- msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
- msprobe/mindspore/dump/kernel_graph_dump.py +9 -4
- msprobe/mindspore/dump/kernel_kbyk_dump.py +2 -4
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +140 -0
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +53 -0
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +156 -41
- msprobe/mindspore/free_benchmark/common/handler_params.py +1 -2
- msprobe/mindspore/free_benchmark/common/utils.py +19 -4
- msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -204
- msprobe/mindspore/free_benchmark/handler/base_handler.py +3 -3
- msprobe/mindspore/free_benchmark/handler/check_handler.py +4 -5
- msprobe/mindspore/free_benchmark/handler/fix_handler.py +4 -4
- msprobe/mindspore/free_benchmark/handler/handler_factory.py +4 -4
- msprobe/mindspore/free_benchmark/perturbation/add_noise.py +2 -2
- msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +15 -6
- msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
- msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +2 -2
- msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +13 -6
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +2 -2
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +2 -2
- msprobe/mindspore/grad_probe/global_context.py +28 -8
- msprobe/mindspore/grad_probe/grad_analyzer.py +50 -24
- msprobe/mindspore/grad_probe/grad_monitor.py +16 -1
- msprobe/mindspore/grad_probe/grad_stat_csv.py +33 -5
- msprobe/mindspore/grad_probe/hook.py +35 -12
- msprobe/mindspore/grad_probe/utils.py +18 -5
- msprobe/mindspore/mindtorch/__init__.py +18 -0
- msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
- msprobe/mindspore/ms_config.py +27 -16
- msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +9 -4
- msprobe/mindspore/runtime.py +15 -0
- msprobe/mindspore/service.py +285 -113
- msprobe/mindspore/task_handler_factory.py +15 -0
- msprobe/msprobe.py +48 -10
- msprobe/pytorch/__init__.py +8 -6
- msprobe/pytorch/api_accuracy_checker/common/config.py +62 -0
- msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
- msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +103 -271
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
- msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
- msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +9 -0
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +478 -0
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +365 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +63 -2
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +21 -15
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +54 -22
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +140 -71
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +49 -8
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +9 -24
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +4 -12
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +9 -4
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +3 -11
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +2 -2
- msprobe/pytorch/bench_functions/confusion_transpose.py +5 -1
- msprobe/pytorch/bench_functions/matmul_backward.py +12 -0
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +142 -16
- msprobe/pytorch/bench_functions/rotary_mul.py +4 -0
- msprobe/pytorch/bench_functions/swiglu.py +10 -2
- msprobe/pytorch/common/parse_json.py +7 -6
- msprobe/pytorch/common/utils.py +101 -7
- msprobe/pytorch/compare/distributed_compare.py +17 -30
- msprobe/pytorch/compare/pt_compare.py +44 -22
- msprobe/pytorch/debugger/debugger_config.py +46 -27
- msprobe/pytorch/debugger/precision_debugger.py +42 -12
- msprobe/pytorch/dump/kernel_dump/kernel_config.py +33 -0
- msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
- msprobe/pytorch/{module_processer.py → dump/module_dump/module_processer.py} +81 -10
- msprobe/pytorch/free_benchmark/common/constant.py +15 -0
- msprobe/pytorch/free_benchmark/common/counter.py +15 -0
- msprobe/pytorch/free_benchmark/common/enums.py +15 -0
- msprobe/pytorch/free_benchmark/common/params.py +10 -2
- msprobe/pytorch/free_benchmark/common/utils.py +29 -4
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +20 -5
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +2 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -1
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +6 -4
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +2 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +4 -0
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +41 -47
- msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +6 -5
- msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
- msprobe/pytorch/grad_probe/grad_monitor.py +23 -6
- msprobe/pytorch/grad_probe/grad_stat_csv.py +40 -10
- msprobe/pytorch/hook_module/__init__.py +1 -1
- msprobe/pytorch/hook_module/hook_module.py +14 -11
- msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +35 -0
- msprobe/pytorch/hook_module/wrap_distributed.py +6 -8
- msprobe/pytorch/hook_module/wrap_functional.py +0 -38
- msprobe/pytorch/monitor/__init__.py +0 -0
- msprobe/pytorch/monitor/anomaly_analyse.py +201 -0
- msprobe/pytorch/monitor/anomaly_detect.py +425 -0
- msprobe/pytorch/monitor/csv2tb.py +166 -0
- msprobe/pytorch/monitor/distributed/__init__.py +0 -0
- msprobe/pytorch/monitor/distributed/distributed_ops.yaml +19 -0
- msprobe/pytorch/monitor/distributed/stack_blacklist.yaml +5 -0
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +283 -0
- msprobe/pytorch/monitor/features.py +108 -0
- msprobe/pytorch/monitor/module_hook.py +1076 -0
- msprobe/pytorch/monitor/module_metric.py +172 -0
- msprobe/pytorch/monitor/module_spec_verifier.py +95 -0
- msprobe/pytorch/monitor/optimizer_collect.py +333 -0
- msprobe/pytorch/monitor/unittest/__init__.py +0 -0
- msprobe/pytorch/monitor/unittest/test_monitor.py +160 -0
- msprobe/pytorch/monitor/utils.py +321 -0
- msprobe/pytorch/monitor/visualizer.py +59 -0
- msprobe/pytorch/online_dispatch/__init__.py +2 -3
- msprobe/pytorch/online_dispatch/compare.py +29 -38
- msprobe/pytorch/online_dispatch/dispatch.py +58 -27
- msprobe/pytorch/online_dispatch/dump_compare.py +21 -9
- msprobe/pytorch/online_dispatch/single_compare.py +53 -32
- msprobe/pytorch/online_dispatch/torch_ops_config.yaml +1 -1
- msprobe/pytorch/online_dispatch/utils.py +49 -21
- msprobe/pytorch/parse_tool/lib/compare.py +21 -27
- msprobe/pytorch/parse_tool/lib/config.py +6 -8
- msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
- msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
- msprobe/pytorch/parse_tool/lib/parse_tool.py +12 -12
- msprobe/pytorch/parse_tool/lib/utils.py +33 -53
- msprobe/pytorch/parse_tool/lib/visualization.py +11 -10
- msprobe/pytorch/pt_config.py +31 -8
- msprobe/pytorch/service.py +188 -108
- msprobe/visualization/__init__.py +14 -0
- msprobe/visualization/builder/__init__.py +14 -0
- msprobe/visualization/builder/graph_builder.py +222 -0
- msprobe/visualization/builder/msprobe_adapter.py +227 -0
- msprobe/visualization/compare/__init__.py +14 -0
- msprobe/visualization/compare/graph_comparator.py +180 -0
- msprobe/visualization/compare/mode_adapter.py +197 -0
- msprobe/visualization/graph/__init__.py +14 -0
- msprobe/visualization/graph/base_node.py +119 -0
- msprobe/visualization/graph/distributed_analyzer.py +318 -0
- msprobe/visualization/graph/graph.py +209 -0
- msprobe/visualization/graph/node_colors.py +95 -0
- msprobe/visualization/graph/node_op.py +39 -0
- msprobe/visualization/graph_service.py +288 -0
- msprobe/visualization/utils.py +217 -0
- mindstudio_probe-1.1.0.dist-info/RECORD +0 -287
- msprobe/docs/04.acl_config_examples.md +0 -78
- msprobe/mindspore/compare/layer_mapping.py +0 -146
- msprobe/mindspore/compare/modify_mapping.py +0 -107
- msprobe/mindspore/free_benchmark/decorator/dec_forward.py +0 -57
- msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +0 -122
- msprobe/pytorch/functional/module_dump.py +0 -84
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/LICENSE +0 -0
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/top_level.txt +0 -0
- /msprobe/mindspore/{free_benchmark/decorator → code_mapping}/__init__.py +0 -0
- /msprobe/pytorch/{functional → dump/module_dump}/__init__.py +0 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import os
|
|
18
|
+
import re
|
|
19
|
+
from glob import glob
|
|
20
|
+
|
|
21
|
+
import pandas as pd
|
|
22
|
+
|
|
23
|
+
from msprobe.pytorch.common.log import logger
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parse_logfile(logfile):
    """Extract the grad-norm series from a training log.

    Scans every line containing 'consumed samples' and pulls the float that
    follows 'grad norm: '. Lines where the pattern is missing or empty are
    skipped instead of raising IndexError/ValueError.

    Args:
        logfile: path to the training log file.

    Returns:
        list[float]: one grad-norm value per matching log line.
    """
    # Raw string avoids invalid-escape warnings; compile once, outside the loop.
    pattern = re.compile(r'(?<=grad norm\: )[\d\.]*')
    grad_norm = []
    with open(logfile) as f:
        for line in f:
            if 'consumed samples' not in line:
                continue
            found = pattern.findall(line)
            # Guard: the original indexed [0] unconditionally and would crash
            # on a 'consumed samples' line without a grad norm.
            if found and found[0]:
                grad_norm.append(float(found[0]))
    return grad_norm
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def parse_monitor_output(output_dir):
    """Load monitor CSV outputs grouped by rank.

    Globs directories matching ``output_dir + '*'`` (each expected to contain
    'rank<N>' in its name) and reads every CSV inside, splitting the tables
    into reduced and unreduced gradient groups by filename.

    Args:
        output_dir: path prefix of the monitor output directories,
            e.g. 'monitor_output/Aug12_07-16'.

    Returns:
        tuple[dict, dict]: (reduced, unreduced), each mapping rank -> list of
        pandas DataFrames.
        NOTE(review): list order follows os.listdir and is filesystem
        dependent; callers index these lists by step — confirm the file
        naming guarantees the expected order.
    """
    # Raw string for the lookbehind; compiled once instead of per directory.
    rank_pattern = re.compile(r'(?<=rank)[\d]*')
    reduced = {}
    unreduced = {}
    for directory in glob(output_dir + '*'):
        rank = int(rank_pattern.findall(directory)[0])
        unreduced[rank] = []
        reduced[rank] = []
        for file in os.listdir(directory):
            df = pd.read_csv(os.path.join(directory, file))
            if '_unreduced_' in file:
                unreduced[rank].append(df)
            elif '_reduced_' in file:
                reduced[rank].append(df)
            else:
                logger.info(f'unexpected file {file} in {directory}')
    return reduced, unreduced
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def valid_reduce(reduced, unreduced, tp_size, dp_size, sequence_parallel):
    """Check that reduced gradients equal the dp-mean of unreduced gradients.

    For each parameter (taken from rank 0's first unreduced table), sums the
    per-rank 'mean' columns of the reduced and unreduced tables, normalizes
    the unreduced sum by dp_size (and by tp_size for parameters duplicated
    across the tp group), and compares the two with assert_equal.
    Mismatches are collected and logged instead of raised.

    Args:
        reduced: dict rank -> list of per-step DataFrames of reduced grads.
        unreduced: dict rank -> list of per-step DataFrames of unreduced grads.
        tp_size: tensor-parallel world size.
        dp_size: data-parallel world size.
        sequence_parallel: whether sequence parallelism is enabled.
    """
    world_size = len(reduced)
    errors = []
    for _, row in unreduced[0][0].iterrows():
        param = row['param_name']
        is_tp_duplicate = False
        # NOTE(review): only the first two steps are checked; confirm this is
        # intentional rather than iterating every monitored step.
        for step in range(2):
            # Sum the reduced means over ranks that monitored this step.
            reduced_mean = 0.
            for rank in range(world_size):
                if len(reduced[rank]) == 0:
                    continue
                df = reduced[rank][step]
                value = list(df[df['param_name'] == param]['mean'])
                if not value:
                    # A param missing on this rank at step 0 means it is
                    # duplicated across the tp group.
                    if step == 0:
                        is_tp_duplicate = True
                    continue
                reduced_mean += value[0]

            # Sum the unreduced means over all ranks.
            unreduced_mean = 0.
            for rank in range(world_size):
                df = unreduced[rank][step]
                value = list(df[df['param_name'] == param]['mean'])
                if not value:
                    continue
                # Reuse the value already extracted above instead of
                # re-filtering the DataFrame a second time.
                unreduced_mean += value[0]

            unreduced_mean /= dp_size
            if is_tp_duplicate and (not sequence_parallel or 'embedding' in param):
                unreduced_mean /= tp_size
            try:
                assert_equal(unreduced_mean, reduced_mean)
            except AssertionError as e:
                errors.append([param, step, e, is_tp_duplicate])
    if errors:
        logger.info(errors)
    else:
        logger.info('grad mean is consistent between unreduced grad and reduced grad monitored.')
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def assert_equal(a, b):
    """Assert that two scalars agree to within 1% relative difference.

    A pair of exact zeros is accepted. If only one side is zero, the other
    side's magnitude is used as the relative difference. (The original early
    return triggered when *either* operand was zero, which made the two
    zero-handling branches below unreachable and silently accepted real
    mismatches such as (0, 5).)

    Raises:
        AssertionError: if the relative difference is >= 0.01.
    """
    if a == 0 and b == 0:
        return
    if b == 0:
        rel_diff = a
    elif a == 0:
        rel_diff = b
    else:
        rel_diff = abs(a / b - 1)
    assert rel_diff < 0.01, f'{a}, {b}, {rel_diff}'
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def valid_total_norm(total_norm, reduced, duplicate_embedding):
    """Check the training-log grad norm against monitored reduced gradients.

    For each step, accumulates sum(norm**2) over every rank's reduced table
    (skipping 'word_embedding' rows when embeddings are duplicated across pp
    stages) and compares the square root with the grad norm parsed from the
    training log. Mismatches are collected and logged rather than raised.

    Args:
        total_norm: list of per-step grad norms from the training log.
        reduced: dict rank -> list of per-step DataFrames with a 'norm' column.
        duplicate_embedding: True when embeddings are tied and pp_size > 1, in
            which case embedding rows are excluded to avoid double counting.
    """
    steps = len(total_norm)
    world_size = len(reduced)
    errors = []
    for step in range(steps):
        calculated_norm = 0.
        for rank in range(world_size):
            if len(reduced[rank]) == 0:
                if step == 0:
                    logger.info(f'rank {rank} is duplicated in dp group')
                continue
            for _, row in reduced[rank][step].iterrows():
                if duplicate_embedding and 'word_embedding' in row['param_name']:
                    continue
                calculated_norm += row['norm'] ** 2
        try:
            assert_equal(calculated_norm ** 0.5, total_norm[step])
        except AssertionError as e:
            errors.append([step, e])
    if errors:
        # Single-message f-string, consistent with every other logger call in
        # this file (the original passed `errors` as a stray second argument).
        logger.info(f'total norm errors: {errors}')
    else:
        logger.info('grad norm is consistent between training log and reduced gradients monitored')
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--monitor_output', '-m', type=str, required=True,
                        help='path prefix to the output of monitor e.g. monitor_output/Aug12_07-16')
    parser.add_argument('--logfile', '-l', type=str, required=True, help='path to the training log file')
    parser.add_argument('--tp_size', '-t', type=int, required=True, help='tp parallel size')
    parser.add_argument('--dp_size', '-d', type=int, required=True, help='dp parallel size')
    parser.add_argument('--pp_size', '-p', type=int, required=True, help='pp parallel size')
    parser.add_argument('--untie_embeddings_and_output_weights', '-u', action="store_true", default=False,
                        help='whether untie_embeddings_and_output_weights in pp parallel')
    parser.add_argument('--sequence_parallel', '-s', action="store_true", default=False,
                        help='whether sequence parallel is enabled. Add -s to store true')

    args = parser.parse_args()

    # parser.error exits with a usage message and, unlike `assert`, is not
    # stripped when running under `python -O`. The original asserts also
    # copy-pasted 'tp' into the dp/pp messages.
    if args.tp_size <= 0:
        parser.error('if tp not enabled, set tp_size = 1')
    if args.dp_size <= 0:
        parser.error('if dp not enabled, set dp_size = 1')
    if args.pp_size <= 0:
        parser.error('if pp not enabled, set pp_size = 1')

    total_norm = parse_logfile(args.logfile)
    reduced, unreduced = parse_monitor_output(args.monitor_output)

    # Tied embeddings with pp > 1 appear on both the first and last stage.
    duplicate_embedding = not args.untie_embeddings_and_output_weights and args.pp_size > 1

    valid_total_norm(total_norm, reduced, duplicate_embedding)
    valid_reduce(reduced, unreduced, args.tp_size, args.dp_size, args.sequence_parallel)
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
import inspect
|
|
16
|
+
from collections import namedtuple
|
|
17
|
+
from datetime import timezone, timedelta
|
|
18
|
+
from functools import wraps
|
|
19
|
+
from datetime import datetime
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
|
|
23
|
+
import torch
|
|
24
|
+
|
|
25
|
+
from msprobe.core.common.const import MonitorConst, Const
|
|
26
|
+
from msprobe.pytorch.common.log import logger
|
|
27
|
+
from msprobe.core.common.utils import is_int
|
|
28
|
+
from msprobe.core.common.file_utils import check_file_or_directory_path
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Select the compute backend: Ascend NPU if torch_npu is importable,
# otherwise CUDA if available, otherwise CPU.
device = "cpu"
try:
    import torch_npu
    device = "npu"
except ImportError:
    if torch.cuda.is_available():
        device = "cuda"

# Reusable NaN scalar allocated once on the selected device.
NAN_TENSOR_ON_DEVICE = torch.tensor(torch.nan, device=device)
FILE_MAX_SIZE = 10 * 1024 * 1024 * 1024  # 10 GiB size cap
FILE_NAME_MAX_LENGTH = 255  # common filesystem limit for a file name
DIRECTORY_MAX_LENGTH = 4096  # common filesystem limit for a full path

# UTC+8 timezone used for timestamping monitor output.
beijing_tz = timezone(timedelta(hours=8))
# Optimizer momentum/variance statistics bundles; MVGradResult additionally
# carries the raw gradient.
MVResult = namedtuple('MVResult', ("exp_avg", "exp_avg_sq", "update", "ratio"))
MVGradResult = namedtuple('MVGradResult', ("exp_avg", "exp_avg_sq", "update", "ratio", "grad"))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class MsgConst:
    """
    Class for log messages const
    """
    # Characters that must not appear verbatim in log output: control
    # characters plus their percent-encoded forms; each is replaced by '_'
    # by filter_special_chars before a message is logged.
    SPECIAL_CHAR = ["\n", "\r", "\u007F", "\b", "\f", "\t", "\u000B", "%08", "%0a", "%0b", "%0c", "%0d", "%7f"]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def get_output_base_dir():
    """Return the monitor output base directory.

    Reads the environment variable named by MonitorConst.MONITOR_OUTPUT_DIR,
    falling back to MonitorConst.DEFAULT_MONITOR_OUTPUT_DIR when unset.
    """
    env_key = MonitorConst.MONITOR_OUTPUT_DIR
    fallback = MonitorConst.DEFAULT_MONITOR_OUTPUT_DIR
    return os.getenv(env_key, fallback)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def filter_special_chars(func):
    """Decorator: replace every MsgConst.SPECIAL_CHAR occurrence in the
    message with '_' before passing it on to the wrapped callable."""
    @wraps(func)
    def sanitized(msg):
        # Strip control characters and their encoded forms one by one.
        for bad in MsgConst.SPECIAL_CHAR:
            msg = msg.replace(bad, '_')
        return func(msg)

    return sanitized
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def get_param_struct(param):
    """Describe the structure of a hooked parameter.

    Returns a dict with a 'config' entry naming the container kind plus,
    for tensors, a 'tensor' entry and, for tuples/lists, one entry per
    element keyed by its index. Unsupported types are reported with a
    warning and described only by their type.
    """
    def _describe(item):
        # Tensors are summarized by shape/dtype; anything else by its type.
        if torch.is_tensor(item):
            return f'size={tuple(item.shape)}, dtype={item.dtype}'
        return f'{type(item)}'

    if isinstance(param, (tuple, list)):
        struct = {'config': f'{type(param).__name__}[{len(param)}]'}
        for idx, item in enumerate(param):
            struct[idx] = _describe(item)
        return struct
    if torch.is_tensor(param):
        return {'config': 'tensor', 'tensor': _describe(param)}
    logger.warning(f'Not support type({type(param)}) now, please check the type of param {param}')
    return {'config': f'{type(param)}'}
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def is_recomputation():
    """Check if the current operation is in the re-computation phase.

    This function inspects the current call stack to indicate whether the current operation is in the
    re-computation phase. We use a blacklist mechanism, now supported megatron and mindspeed framework.
    megatron: The 'backward' function is called by the 'torch/autograd/function.py' file.
    mindspeed: The 'checkpoint_function_backward' function is called by the 'torch/autograd/function.py'
    file or the custom module(use CheckpointWithoutOutput) with the 'backward' function is executed within the
    'torch/_tensor.py' file.

    Returns:
        bool: True if in the re-computation phase, False otherwise.
    """
    backward_function_indices = []
    call_stack = inspect.stack()

    # Identify the function 'backward' is being executed within the 'torch/_tensor.py' file.
    for frame_info in call_stack:
        if frame_info.function == Const.BACKWARD and frame_info.filename.endswith('torch/_tensor.py'):
            # Stack frames hold references to local variables; drop the list
            # eagerly before returning to avoid keeping them alive.
            del call_stack
            return True

    # Identify indices in the call stack where the specific function is being executed
    for idx, frame_info in enumerate(call_stack):
        if frame_info.function == Const.BACKWARD or frame_info.function == 'checkpoint_function_backward':
            backward_function_indices.append(idx)

    # Check if the execution is within 'torch/autograd/function.py' file
    for idx in backward_function_indices:
        # The Megatron and MindSpeed L0&L1 scenes
        if idx + 1 < len(call_stack) and call_stack[idx + 1].filename.endswith('torch/autograd/function.py'):
            del call_stack
            return True
        # The latest MindSpeed L2 and ModelLink scenes
        if idx + 2 < len(call_stack) and call_stack[idx + 2].filename.endswith('torch/autograd/function.py'):
            del call_stack
            return True

    del call_stack
    return False
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def validate_ops(ops):
    """Filter the configured statistics operators down to the supported set.

    Unsupported entries are dropped with a warning; when nothing survives,
    the first operator in MonitorConst.OP_LIST is used as the default.

    Raises:
        TypeError: if ops is not a list.

    Returns:
        list: the supported operators (never empty).
    """
    if not isinstance(ops, list):
        raise TypeError("ops should be a list")
    valid_ops = []
    for op in ops:
        if op in MonitorConst.OP_LIST:
            valid_ops.append(op)
        else:
            logger.warning(f"op {op} is not supported. Optional ops: {MonitorConst.OP_LIST}")
    if not valid_ops:
        default_op = MonitorConst.OP_LIST[0]
        valid_ops.append(default_op)
        logger.info_on_rank_0(f"There is no valid ops, default op {default_op} is used")
    return valid_ops
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def validate_ranks(ranks):
    """Validate the module_ranks config entry.

    Every element must be a plain int; bool is rejected explicitly because
    isinstance(True, int) is True in Python.

    Raises:
        TypeError: if ranks is not a list or any element is not an int.
    """
    if not isinstance(ranks, list):
        raise TypeError("module_ranks should be a list")
    for rank in ranks:
        if not isinstance(rank, int) or isinstance(rank, bool):
            # Message grammar fixed: "a int" -> "an int".
            raise TypeError(f"element in module_ranks should be an int, get {type(rank)}")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def validate_targets(targets):
    """Validate the 'targets' section of config.json.

    Expects a dict mapping module names (str) to a dict of monitored fields,
    e.g. {"input": "tensor"}.

    Raises:
        TypeError: if the mapping or any key/value has the wrong type.
    """
    if not isinstance(targets, dict):
        raise TypeError('targets in config.json should be a dict')
    for name, fields in targets.items():
        if not isinstance(name, str):
            raise TypeError('key of targets should be module_name[str] in config.json')
        if not isinstance(fields, dict):
            raise TypeError('values of targets should be cared filed e.g. {"input": "tensor"} in config.json')
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def validate_print_struct(print_struct):
    """Ensure the print_struct config flag is a bool.

    Raises:
        TypeError: if print_struct is not a bool.
    """
    if isinstance(print_struct, bool):
        return
    raise TypeError("print_struct should be a bool")
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def validate_ur_distribution(ur_distribution):
    """Raise TypeError unless ``ur_distribution`` is a bool."""
    if isinstance(ur_distribution, bool):
        return
    raise TypeError('ur_distribution should be a bool')
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def validate_xy_distribution(xy_distribution):
    """Raise TypeError unless ``xy_distribution`` is a bool."""
    if isinstance(xy_distribution, bool):
        return
    raise TypeError('xy_distribution should be a bool')
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def validate_wg_distribution(wg_distribution):
    """Raise TypeError unless ``wg_distribution`` is a bool."""
    if isinstance(wg_distribution, bool):
        return
    raise TypeError('wg_distribution should be a bool')
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def validate_mg_distribution(mg_distribution):
    """Raise TypeError unless ``mg_distribution`` is a bool."""
    if isinstance(mg_distribution, bool):
        return
    raise TypeError('mg_distribution should be a bool')
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def validate_param_distribution(param_distribution):
    """Raise TypeError unless ``param_distribution`` is a bool."""
    if isinstance(param_distribution, bool):
        return
    raise TypeError('param_distribution should be a bool')
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def validate_cc_distribution(cc_distribution):
    """Validate the ``cc_distribution`` config section key by key.

    Raises TypeError on a non-dict argument, an unsupported key, or a
    value of the wrong type for its key.
    """
    if not isinstance(cc_distribution, dict):
        raise TypeError('cc_distribution should be a dictionary')
    # Allowed keys -> (expected type, error message on mismatch).
    expected = {
        'enable': (bool, 'cc_distribution enable should be a bool'),
        'cc_codeline': (list, 'cc_distribution cc_codeline should be a list'),
        'cc_pre_hook': (bool, 'cc_distribution cc_pre_hook should be a bool'),
        'cc_log_only': (bool, 'cc_distribution cc_log_only should be a bool'),
    }
    for key, value in cc_distribution.items():
        if key not in expected:
            raise TypeError(f'{key} of cc_distribution is not supported.')
        required_type, message = expected[key]
        if not isinstance(value, required_type):
            raise TypeError(message)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def validate_squash_name(squash_name):
    """Raise TypeError unless ``squash_name`` is a bool."""
    if isinstance(squash_name, bool):
        return
    raise TypeError('squash_name should be a bool')
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def validate_alert(alert):
    """Validate the ``alert`` config section.

    Checks that every rule name is known, that each rule's threshold (when
    args are given) is a non-negative float, and that ``dump`` is a bool
    when present. Raises TypeError on any violation. Sections that are
    absent or malformed in a falsy way are silently skipped, matching the
    lenient contract of the original implementation.
    """
    if not isinstance(alert, dict):
        raise TypeError('alert should be a dictionary')
    rules = alert.get('rules')
    rule_list = rules if rules and isinstance(rules, list) else []
    for rule in rule_list:
        name = rule.get("rule_name")
        if name and name not in MonitorConst.RULE_NAME:
            raise TypeError(f"{name} is not supported")
        args = rule.get("args")
        if not (args and isinstance(args, dict)):
            continue
        threshold = args.get("threshold")
        if not isinstance(threshold, float) or threshold < 0:
            raise TypeError('threshold must be float and not less than 0')
    dump = alert.get('dump')
    if dump and not isinstance(dump, bool):
        raise TypeError('dump must be bool.')
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def validate_step_count_per_record(step_count_per_record):
    """Validate that ``step_count_per_record`` is an int in [1, 1e6].

    Raises TypeError for non-int input (per the project ``is_int`` helper)
    and ValueError when the value is out of range.
    """
    if not is_int(step_count_per_record):
        raise TypeError('step_count_per_record must be int.')
    too_small = step_count_per_record < 1
    too_large = step_count_per_record > 1e6
    if too_small:
        raise ValueError("step_count_per_record must greater than 0")
    if too_large:
        raise ValueError("step_count_per_record must smaller than 1e6")
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def validate_config(config):
    """Validate the monitor configuration dict, mutating it in place.

    Normalizes config['ops'] via validate_ops and runs the per-section
    validators, each of which raises TypeError/ValueError on malformed
    input. When no targets are configured, installs a catch-all target
    (and enables 'all_xy' if xy_distribution is on).
    """
    # validate_ops both filters and defaults, so its result is written back.
    config['ops'] = validate_ops(config.get('ops', []))

    eps = config.get('eps', 1e-8)
    if not isinstance(eps, float):
        raise TypeError("eps should be a float")

    ranks = config.get("module_ranks", [])
    validate_ranks(ranks)

    targets = config.get("targets", {})
    validate_targets(targets)

    print_struct = config.get('print_struct', False)
    validate_print_struct(print_struct)

    ur_distribution = config.get('ur_distribution', False)
    validate_ur_distribution(ur_distribution)

    xy_distribution = config.get('xy_distribution', False)
    validate_xy_distribution(xy_distribution)

    wg_distribution = config.get('wg_distribution', False)
    validate_wg_distribution(wg_distribution)

    mg_distribution = config.get('mg_distribution', False)
    validate_mg_distribution(mg_distribution)

    param_distribution = config.get('param_distribution', False)
    validate_param_distribution(param_distribution)

    cc_distribution = config.get('cc_distribution', {})
    validate_cc_distribution(cc_distribution)

    alert = config.get('alert', {})
    validate_alert(alert)

    step_count_per_record = config.get('step_count_per_record', 1)
    validate_step_count_per_record(step_count_per_record)

    squash_name = config.get('squash_name', True)
    validate_squash_name(squash_name)

    if not targets:
        if xy_distribution:
            config["all_xy"] = True
        # NOTE(review): indentation is ambiguous in this view; the catch-all
        # target is assumed to apply whenever targets is empty (not only when
        # xy_distribution is set) -- confirm against the upstream source.
        config["targets"] = {"": {}}
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def time_str2time_digit(time_str):
    """Parse a 'Dec03_21-34-40'-style time tag into a datetime.

    strptime supplies the default year (1900) since the tag carries none.
    Raises RuntimeError (chained to the parse error) on malformed input.
    """
    try:
        return datetime.strptime(time_str, '%b%d_%H-%M-%S')
    except Exception as e:
        raise RuntimeError(f"illegal timestamp: {time_str}, timestamp should be prefix \
of existing output dirpath, like 'Dec03_21-34-40'.") from e
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def get_target_output_dir(monitor_path, time_start, time_end):
    """Map rank -> output directory for dirs whose time tag is in range.

    Scans ``monitor_path`` for directories matching
    MonitorConst.OUTPUT_DIR_PATTERN and keeps those whose embedded time
    tag lies within [time_start, time_end]; a None bound is open-ended.
    Raises ValueError when the parsed start is after the parsed end.
    """
    check_file_or_directory_path(monitor_path, isdir=True)
    time_start = time_str2time_digit(time_start) if time_start is not None else time_start
    time_end = time_str2time_digit(time_end) if time_end is not None else time_end
    if time_start and time_end and time_start > time_end:
        raise ValueError(f"time_start({time_start}) greater than time_end({time_end})")
    result = {}
    for dirname in os.listdir(monitor_path):
        match = re.match(MonitorConst.OUTPUT_DIR_PATTERN, dirname)
        if match is None:
            continue
        time_tag, rank = match.group(1), match.group(2)
        tag_time = time_str2time_digit(time_tag)
        after_start = time_start is None or tag_time >= time_start
        before_end = time_end is None or tag_time <= time_end
        if after_start and before_end:
            result[rank] = os.path.join(monitor_path, dirname)
    return result
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import torch
|
|
17
|
+
import numpy as np
|
|
18
|
+
import matplotlib.pyplot as plt
|
|
19
|
+
from msprobe.pytorch.monitor.features import cal_histc
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class HeatmapVisualizer:
    """Accumulates per-step histograms of a tensor and renders them as a
    step-vs-value-range heatmap image on a TensorBoard summary writer.

    Usage: call pre_cal(tensor) each step, then visualize(...) to append
    the cached histogram as a new column and emit the updated heatmap.
    """

    def __init__(self) -> None:
        # Fixed histogram layout: 30 bins over [-1, 1].
        self.histogram_bins_num = 30
        self.min_val = -1
        self.max_val = 1
        # Accumulated histograms; matrix shape is [bins_num * total_step].
        self.histogram_sum_data_np = None
        # Histogram of the most recent tensor passed to pre_cal().
        self.cur_step_histogram_data = None
        # Fix: the original assigned histogram_edges = None and immediately
        # overwrote it; the dead assignment has been removed.
        self.histogram_edges = torch.linspace(self.min_val, self.max_val, self.histogram_bins_num)

    def pre_cal(self, tensor):
        """Compute and cache the histogram of *tensor* for the current step."""
        self.cur_step_histogram_data = cal_histc(tensor_cal=tensor, bins_total=self.histogram_bins_num,
                                                 min_val=self.min_val, max_val=self.max_val)

    def visualize(self, tag_name: str, step, summary_writer):
        """Append the cached histogram and emit the heatmap image.

        pre_cal() must have been called before each visualize() call,
        otherwise cur_step_histogram_data is None and this raises.
        """
        if self.histogram_sum_data_np is None or self.histogram_sum_data_np.size == 0:
            self.histogram_sum_data_np = np.expand_dims(self.cur_step_histogram_data.cpu(), 0).T
        else:
            # add new data along a different axis because we transposed early
            # matrix shape is [bins_num * total_step]
            self.histogram_sum_data_np = np.concatenate((self.histogram_sum_data_np, np.expand_dims(
                self.cur_step_histogram_data.cpu(), 1)), axis=1)

        fig, ax = plt.subplots()
        cax = ax.matshow(self.histogram_sum_data_np, cmap='hot', aspect='auto')
        fig.colorbar(cax)

        # Label each row (bin) with its lower edge value.
        lbs = [f'{self.histogram_edges[i]:.2f}' for i in range(self.histogram_bins_num)]
        plt.yticks(ticks=range(self.histogram_bins_num), labels=lbs)
        ax.set_xlabel('Step')
        ax.set_ylabel('Value Range')
        plt.title(f'Total Step: {step}')

        # Convert matplotlib figure to an image format suitable for TensorBoard
        fig.canvas.draw()
        image = torch.from_numpy(np.array(fig.canvas.renderer.buffer_rgba()))
        # Close the figure to avoid leaking matplotlib state across steps.
        plt.close(fig)
        summary_writer.add_image(tag_name, image.permute(2, 0, 1), global_step=step, dataformats='CHW')
|
|
@@ -12,9 +12,8 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
__all__ = ["PtdbgDispatch"]
|
|
16
|
+
|
|
15
17
|
from signal import signal, SIGPIPE, SIG_DFL
|
|
16
18
|
from .dispatch import PtdbgDispatch
|
|
17
19
|
signal(SIGPIPE, SIG_DFL)
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
__all__ = ["PtdbgDispatch"]
|
|
@@ -1,16 +1,30 @@
|
|
|
1
|
-
#
|
|
2
|
-
|
|
3
|
-
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
4
16
|
import csv
|
|
5
17
|
import json
|
|
18
|
+
import os
|
|
19
|
+
import sys
|
|
6
20
|
from collections import namedtuple
|
|
7
|
-
|
|
8
|
-
from rich.console import Console
|
|
21
|
+
|
|
9
22
|
from msprobe.core.common.const import CompareConst, FileCheckConst
|
|
10
|
-
from msprobe.core.common.file_utils import
|
|
23
|
+
from msprobe.core.common.file_utils import read_csv, get_json_contents, write_csv
|
|
24
|
+
from msprobe.core.common.utils import check_op_str_pattern_valid
|
|
11
25
|
from msprobe.pytorch.online_dispatch.single_compare import single_benchmark_compare_wrap
|
|
12
|
-
from
|
|
13
|
-
from
|
|
26
|
+
from rich.console import Console
|
|
27
|
+
from rich.table import Table
|
|
14
28
|
|
|
15
29
|
ELEMENT_NUM_THRESHOLD = 100
|
|
16
30
|
ZERO_NUM_THRESHOLD = 0.1
|
|
@@ -19,30 +33,6 @@ FLOAT_PRECISION = 14
|
|
|
19
33
|
ResultInfo = namedtuple('ResultInfo', ['api_name', 'is_fwd_success', 'is_bwd_success',
|
|
20
34
|
'fwd_compare_alg_results', 'bwd_compare_alg_results'])
|
|
21
35
|
|
|
22
|
-
def get_file_content_bytes(file):
|
|
23
|
-
with FileOpen(file, 'rb') as file_handle:
|
|
24
|
-
return file_handle.read()
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def get_json_contents(file_path):
|
|
28
|
-
ops = get_file_content_bytes(file_path)
|
|
29
|
-
try:
|
|
30
|
-
json_obj = json.loads(ops)
|
|
31
|
-
except ValueError as error:
|
|
32
|
-
logger.error('Failed to load "%s". %s' % (file_path, str(error)))
|
|
33
|
-
raise CompareException(CompareException.INVALID_FILE_ERROR) from error
|
|
34
|
-
if not isinstance(json_obj, dict):
|
|
35
|
-
logger.error('Json file %s, content is not a dictionary!' % file_path)
|
|
36
|
-
raise CompareException(CompareException.INVALID_FILE_ERROR)
|
|
37
|
-
return json_obj
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def write_csv(data, filepath):
|
|
41
|
-
with FileOpen(filepath, 'a', encoding='utf-8-sig') as f:
|
|
42
|
-
writer = csv.writer(f)
|
|
43
|
-
writer.writerows(data)
|
|
44
|
-
change_mode(filepath, FileCheckConst.DATA_FILE_AUTHORITY)
|
|
45
|
-
|
|
46
36
|
|
|
47
37
|
class Saver:
|
|
48
38
|
# consts for result csv
|
|
@@ -62,14 +52,15 @@ class Saver:
|
|
|
62
52
|
}
|
|
63
53
|
|
|
64
54
|
def write_csv_title(self):
|
|
65
|
-
summary_test_rows = [
|
|
55
|
+
summary_test_rows = [
|
|
56
|
+
[self.COLUMN_API_NAME, self.COLUMN_FORWARD_SUCCESS, self.COLUMN_BACKWARD_SUCCESS, "Message"]]
|
|
66
57
|
write_csv(summary_test_rows, self.save_path)
|
|
67
58
|
|
|
68
59
|
detail_test_rows = [[
|
|
69
60
|
"Npu Name", "Bench Dtype", "NPU Dtype", "Shape",
|
|
70
61
|
"error_balance", "max_abs_diff", "max_abs_idx",
|
|
71
62
|
"max_rel_diff", "max_rel_idx", "eb_thd",
|
|
72
|
-
"error_thd", "Status","Message"
|
|
63
|
+
"error_thd", "Status", "Message"
|
|
73
64
|
]]
|
|
74
65
|
write_csv(detail_test_rows, self.detail_save_path)
|
|
75
66
|
|
|
@@ -106,7 +97,7 @@ class Saver:
|
|
|
106
97
|
console.print(table_detail)
|
|
107
98
|
|
|
108
99
|
def get_statistics_from_result_csv(self):
|
|
109
|
-
checklist = [CompareConst.TRUE, CompareConst.FALSE, CompareConst.
|
|
100
|
+
checklist = [CompareConst.TRUE, CompareConst.FALSE, CompareConst.N_A, CompareConst.SKIP]
|
|
110
101
|
data = read_csv(self.save_path)
|
|
111
102
|
result_csv_name = os.path.basename(self.save_path)
|
|
112
103
|
for _, row in data.iterrows():
|
|
@@ -121,7 +112,7 @@ class Saver:
|
|
|
121
112
|
if column1 == CompareConst.SKIP:
|
|
122
113
|
continue
|
|
123
114
|
self.test_result_cnt["total_num"] += 1
|
|
124
|
-
if column1 == CompareConst.TRUE and column2 in [CompareConst.TRUE,
|
|
115
|
+
if column1 == CompareConst.TRUE and column2 in [CompareConst.TRUE, CompareConst.N_A]:
|
|
125
116
|
self.test_result_cnt['success_num'] += 1
|
|
126
117
|
elif column1 == CompareConst.FALSE and column2 == CompareConst.FALSE:
|
|
127
118
|
self.test_result_cnt['forward_and_backward_fail_num'] += 1
|
|
@@ -228,8 +219,8 @@ class Comparator:
|
|
|
228
219
|
is_bwd_success, bwd_compare_alg_results = True, None
|
|
229
220
|
if is_bwd_success and bwd_compare_alg_results is None:
|
|
230
221
|
self.saver.record_results(ResultInfo(api_name, is_fwd_success, CompareConst.NAN, fwd_compare_alg_results,
|
|
231
|
-
|
|
222
|
+
bwd_compare_alg_results))
|
|
232
223
|
else:
|
|
233
224
|
self.saver.record_results(ResultInfo(api_name, is_fwd_success, is_bwd_success, fwd_compare_alg_results,
|
|
234
|
-
|
|
225
|
+
bwd_compare_alg_results))
|
|
235
226
|
return is_fwd_success, is_bwd_success
|