mindstudio-probe 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/METADATA +7 -6
- mindstudio_probe-1.2.1.dist-info/RECORD +396 -0
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/WHEEL +1 -1
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/entry_points.txt +0 -1
- msprobe/CMakeLists.txt +5 -0
- msprobe/README.md +51 -20
- msprobe/config.json +2 -3
- msprobe/core/advisor/advisor.py +8 -3
- msprobe/core/common/const.py +264 -15
- msprobe/core/common/exceptions.py +27 -3
- msprobe/core/common/file_utils.py +176 -26
- msprobe/core/common/inplace_op_checker.py +15 -0
- msprobe/core/common/inplace_ops.yaml +3 -0
- msprobe/core/common/log.py +27 -9
- msprobe/core/common/utils.py +204 -77
- msprobe/core/common_config.py +49 -14
- msprobe/core/compare/acc_compare.py +274 -198
- msprobe/core/compare/check.py +32 -33
- msprobe/core/compare/compare_cli.py +32 -14
- msprobe/core/compare/highlight.py +283 -127
- msprobe/core/compare/layer_mapping/__init__.py +19 -0
- msprobe/core/compare/layer_mapping/data_scope_parser.py +246 -0
- msprobe/core/compare/layer_mapping/layer_mapping.py +249 -0
- msprobe/core/compare/layer_mapping/postprocess_pass.py +95 -0
- msprobe/core/compare/merge_result/merge_result.py +380 -0
- msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
- msprobe/core/compare/multiprocessing_compute.py +2 -2
- msprobe/core/compare/npy_compare.py +135 -144
- msprobe/core/compare/utils.py +419 -274
- msprobe/core/data_dump/data_collector.py +60 -28
- msprobe/core/data_dump/data_processor/base.py +84 -36
- msprobe/core/data_dump/data_processor/factory.py +5 -3
- msprobe/core/data_dump/data_processor/mindspore_processor.py +152 -18
- msprobe/core/data_dump/data_processor/pytorch_processor.py +267 -110
- msprobe/core/data_dump/json_writer.py +29 -1
- msprobe/core/data_dump/scope.py +119 -39
- msprobe/core/grad_probe/constant.py +27 -13
- msprobe/core/grad_probe/grad_compare.py +18 -1
- msprobe/core/grad_probe/utils.py +30 -2
- msprobe/core/overflow_check/abnormal_scene.py +189 -0
- msprobe/core/overflow_check/api_info.py +55 -0
- msprobe/core/overflow_check/checker.py +138 -0
- msprobe/core/overflow_check/filter.py +157 -0
- msprobe/core/overflow_check/ignore_rules.yaml +55 -0
- msprobe/core/overflow_check/level.py +22 -0
- msprobe/core/overflow_check/utils.py +28 -0
- msprobe/docs/01.installation.md +96 -7
- msprobe/docs/02.config_introduction.md +50 -23
- msprobe/docs/03.config_examples.md +2 -9
- msprobe/docs/04.kernel_dump_PyTorch.md +73 -0
- msprobe/docs/05.data_dump_PyTorch.md +93 -61
- msprobe/docs/06.data_dump_MindSpore.md +200 -95
- msprobe/docs/07.accuracy_checker_PyTorch.md +28 -28
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +1 -6
- msprobe/docs/09.accuracy_checker_MindSpore.md +44 -8
- msprobe/docs/10.accuracy_compare_PyTorch.md +114 -50
- msprobe/docs/11.accuracy_compare_MindSpore.md +340 -48
- msprobe/docs/12.overflow_check_PyTorch.md +2 -2
- msprobe/docs/13.overflow_check_MindSpore.md +6 -6
- msprobe/docs/15.free_benchmarking_PyTorch.md +4 -5
- msprobe/docs/16.free_benchmarking_MindSpore.md +56 -37
- msprobe/docs/17.grad_probe.md +5 -6
- msprobe/docs/19.monitor.md +561 -0
- msprobe/docs/20.monitor_performance_baseline.md +52 -0
- msprobe/docs/21.visualization_PyTorch.md +466 -0
- msprobe/docs/22.visualization_MindSpore.md +481 -0
- msprobe/docs/23.generate_operator_PyTorch.md +107 -0
- msprobe/docs/24.code_mapping_Mindspore.md +28 -0
- msprobe/docs/25.tool_function_introduction.md +29 -0
- msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
- msprobe/docs/27.dump_json_instruction.md +521 -0
- msprobe/docs/FAQ.md +29 -2
- msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +211 -0
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/monitor/cpu_info.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
- msprobe/docs/img/visualization/tensorboard_1.png +0 -0
- msprobe/docs/img/visualization/tensorboard_2.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_browser_2.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/docs/visualization/GPTModel.png +0 -0
- msprobe/docs/visualization/ParallelMLP.png +0 -0
- msprobe/docs/visualization/layer_mapping_example.md +132 -0
- msprobe/docs/visualization/mapping.png +0 -0
- msprobe/docs/visualization/mapping1.png +0 -0
- msprobe/docs/visualization/module_name.png +0 -0
- msprobe/docs/visualization/module_name1.png +0 -0
- msprobe/docs/visualization/no_mapping.png +0 -0
- msprobe/docs/visualization/no_mapping1.png +0 -0
- msprobe/docs/visualization/no_mapping_analyze.png +0 -0
- msprobe/docs/visualization/top_layer.png +0 -0
- msprobe/mindspore/__init__.py +25 -0
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -151
- msprobe/mindspore/api_accuracy_checker/api_info.py +21 -6
- msprobe/mindspore/api_accuracy_checker/api_runner.py +43 -18
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +21 -7
- msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +77 -0
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +64 -1
- msprobe/mindspore/api_accuracy_checker/compute_element.py +64 -31
- msprobe/mindspore/api_accuracy_checker/data_manager.py +301 -0
- msprobe/mindspore/api_accuracy_checker/main.py +28 -3
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +212 -0
- msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +60 -0
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +22 -5
- msprobe/mindspore/api_accuracy_checker/utils.py +34 -17
- msprobe/mindspore/cell_processor.py +33 -12
- msprobe/mindspore/code_mapping/bind.py +264 -0
- msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
- msprobe/mindspore/code_mapping/graph.py +49 -0
- msprobe/mindspore/code_mapping/graph_parser.py +226 -0
- msprobe/mindspore/code_mapping/main.py +24 -0
- msprobe/mindspore/code_mapping/processor.py +34 -0
- msprobe/mindspore/common/const.py +35 -13
- msprobe/mindspore/common/log.py +5 -9
- msprobe/mindspore/common/utils.py +88 -4
- msprobe/mindspore/compare/distributed_compare.py +22 -24
- msprobe/mindspore/compare/ms_compare.py +333 -268
- msprobe/mindspore/compare/ms_graph_compare.py +95 -52
- msprobe/mindspore/debugger/debugger_config.py +7 -1
- msprobe/mindspore/debugger/precision_debugger.py +87 -12
- msprobe/mindspore/dump/dump_tool_factory.py +3 -1
- msprobe/mindspore/dump/hook_cell/api_registry.py +95 -18
- msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
- msprobe/mindspore/dump/hook_cell/primitive_hooks.py +45 -30
- msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +36 -1
- msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
- msprobe/mindspore/dump/jit_dump.py +17 -5
- msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
- msprobe/mindspore/dump/kernel_graph_dump.py +9 -4
- msprobe/mindspore/dump/kernel_kbyk_dump.py +2 -4
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +140 -0
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +53 -0
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +156 -41
- msprobe/mindspore/free_benchmark/common/handler_params.py +1 -2
- msprobe/mindspore/free_benchmark/common/utils.py +19 -4
- msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -204
- msprobe/mindspore/free_benchmark/handler/base_handler.py +3 -3
- msprobe/mindspore/free_benchmark/handler/check_handler.py +4 -5
- msprobe/mindspore/free_benchmark/handler/fix_handler.py +4 -4
- msprobe/mindspore/free_benchmark/handler/handler_factory.py +4 -4
- msprobe/mindspore/free_benchmark/perturbation/add_noise.py +2 -2
- msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +15 -6
- msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
- msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +2 -2
- msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +13 -6
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +2 -2
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +2 -2
- msprobe/mindspore/grad_probe/global_context.py +28 -8
- msprobe/mindspore/grad_probe/grad_analyzer.py +50 -24
- msprobe/mindspore/grad_probe/grad_monitor.py +16 -1
- msprobe/mindspore/grad_probe/grad_stat_csv.py +33 -5
- msprobe/mindspore/grad_probe/hook.py +35 -12
- msprobe/mindspore/grad_probe/utils.py +18 -5
- msprobe/mindspore/mindtorch/__init__.py +18 -0
- msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
- msprobe/mindspore/ms_config.py +27 -16
- msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +9 -4
- msprobe/mindspore/runtime.py +15 -0
- msprobe/mindspore/service.py +285 -113
- msprobe/mindspore/task_handler_factory.py +15 -0
- msprobe/msprobe.py +48 -10
- msprobe/pytorch/__init__.py +8 -6
- msprobe/pytorch/api_accuracy_checker/common/config.py +62 -0
- msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
- msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +103 -271
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
- msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
- msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +9 -0
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +478 -0
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +365 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +63 -2
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +21 -15
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +54 -22
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +140 -71
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +49 -8
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +9 -24
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +4 -12
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +9 -4
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +3 -11
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +2 -2
- msprobe/pytorch/bench_functions/confusion_transpose.py +5 -1
- msprobe/pytorch/bench_functions/matmul_backward.py +12 -0
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +142 -16
- msprobe/pytorch/bench_functions/rotary_mul.py +4 -0
- msprobe/pytorch/bench_functions/swiglu.py +10 -2
- msprobe/pytorch/common/parse_json.py +7 -6
- msprobe/pytorch/common/utils.py +101 -7
- msprobe/pytorch/compare/distributed_compare.py +17 -30
- msprobe/pytorch/compare/pt_compare.py +44 -22
- msprobe/pytorch/debugger/debugger_config.py +46 -27
- msprobe/pytorch/debugger/precision_debugger.py +42 -12
- msprobe/pytorch/dump/kernel_dump/kernel_config.py +33 -0
- msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
- msprobe/pytorch/{module_processer.py → dump/module_dump/module_processer.py} +81 -10
- msprobe/pytorch/free_benchmark/common/constant.py +15 -0
- msprobe/pytorch/free_benchmark/common/counter.py +15 -0
- msprobe/pytorch/free_benchmark/common/enums.py +15 -0
- msprobe/pytorch/free_benchmark/common/params.py +10 -2
- msprobe/pytorch/free_benchmark/common/utils.py +29 -4
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +20 -5
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +2 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -1
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +6 -4
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +2 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +4 -0
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +41 -47
- msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +6 -5
- msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
- msprobe/pytorch/grad_probe/grad_monitor.py +23 -6
- msprobe/pytorch/grad_probe/grad_stat_csv.py +40 -10
- msprobe/pytorch/hook_module/__init__.py +1 -1
- msprobe/pytorch/hook_module/hook_module.py +14 -11
- msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +35 -0
- msprobe/pytorch/hook_module/wrap_distributed.py +6 -8
- msprobe/pytorch/hook_module/wrap_functional.py +0 -38
- msprobe/pytorch/monitor/__init__.py +0 -0
- msprobe/pytorch/monitor/anomaly_analyse.py +201 -0
- msprobe/pytorch/monitor/anomaly_detect.py +425 -0
- msprobe/pytorch/monitor/csv2tb.py +166 -0
- msprobe/pytorch/monitor/distributed/__init__.py +0 -0
- msprobe/pytorch/monitor/distributed/distributed_ops.yaml +19 -0
- msprobe/pytorch/monitor/distributed/stack_blacklist.yaml +5 -0
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +283 -0
- msprobe/pytorch/monitor/features.py +108 -0
- msprobe/pytorch/monitor/module_hook.py +1076 -0
- msprobe/pytorch/monitor/module_metric.py +172 -0
- msprobe/pytorch/monitor/module_spec_verifier.py +95 -0
- msprobe/pytorch/monitor/optimizer_collect.py +333 -0
- msprobe/pytorch/monitor/unittest/__init__.py +0 -0
- msprobe/pytorch/monitor/unittest/test_monitor.py +160 -0
- msprobe/pytorch/monitor/utils.py +321 -0
- msprobe/pytorch/monitor/visualizer.py +59 -0
- msprobe/pytorch/online_dispatch/__init__.py +2 -3
- msprobe/pytorch/online_dispatch/compare.py +29 -38
- msprobe/pytorch/online_dispatch/dispatch.py +58 -27
- msprobe/pytorch/online_dispatch/dump_compare.py +21 -9
- msprobe/pytorch/online_dispatch/single_compare.py +53 -32
- msprobe/pytorch/online_dispatch/torch_ops_config.yaml +1 -1
- msprobe/pytorch/online_dispatch/utils.py +49 -21
- msprobe/pytorch/parse_tool/lib/compare.py +21 -27
- msprobe/pytorch/parse_tool/lib/config.py +6 -8
- msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
- msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
- msprobe/pytorch/parse_tool/lib/parse_tool.py +12 -12
- msprobe/pytorch/parse_tool/lib/utils.py +33 -53
- msprobe/pytorch/parse_tool/lib/visualization.py +11 -10
- msprobe/pytorch/pt_config.py +31 -8
- msprobe/pytorch/service.py +188 -108
- msprobe/visualization/__init__.py +14 -0
- msprobe/visualization/builder/__init__.py +14 -0
- msprobe/visualization/builder/graph_builder.py +222 -0
- msprobe/visualization/builder/msprobe_adapter.py +227 -0
- msprobe/visualization/compare/__init__.py +14 -0
- msprobe/visualization/compare/graph_comparator.py +180 -0
- msprobe/visualization/compare/mode_adapter.py +197 -0
- msprobe/visualization/graph/__init__.py +14 -0
- msprobe/visualization/graph/base_node.py +119 -0
- msprobe/visualization/graph/distributed_analyzer.py +318 -0
- msprobe/visualization/graph/graph.py +209 -0
- msprobe/visualization/graph/node_colors.py +95 -0
- msprobe/visualization/graph/node_op.py +39 -0
- msprobe/visualization/graph_service.py +288 -0
- msprobe/visualization/utils.py +217 -0
- mindstudio_probe-1.1.0.dist-info/RECORD +0 -287
- msprobe/docs/04.acl_config_examples.md +0 -78
- msprobe/mindspore/compare/layer_mapping.py +0 -146
- msprobe/mindspore/compare/modify_mapping.py +0 -107
- msprobe/mindspore/free_benchmark/decorator/dec_forward.py +0 -57
- msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +0 -122
- msprobe/pytorch/functional/module_dump.py +0 -84
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/LICENSE +0 -0
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/top_level.txt +0 -0
- /msprobe/mindspore/{free_benchmark/decorator → code_mapping}/__init__.py +0 -0
- /msprobe/pytorch/{functional → dump/module_dump}/__init__.py +0 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import os
|
|
18
|
+
import re
|
|
19
|
+
from glob import glob
|
|
20
|
+
|
|
21
|
+
import pandas as pd
|
|
22
|
+
|
|
23
|
+
from msprobe.pytorch.common.log import logger
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parse_logfile(logfile):
    """Extract the grad-norm series from a training log.

    Scans every line containing 'consumed samples' and pulls the float that
    follows 'grad norm: '. Lines where the pattern is missing or empty are
    skipped instead of raising IndexError/ValueError.

    Args:
        logfile: path to the training log file.

    Returns:
        list[float]: one grad-norm value per matching log line.
    """
    # Raw string avoids invalid-escape warnings; compile once, outside the loop.
    pattern = re.compile(r'(?<=grad norm\: )[\d\.]*')
    grad_norm = []
    with open(logfile) as f:
        for line in f:
            if 'consumed samples' not in line:
                continue
            found = pattern.findall(line)
            # Guard: the original indexed [0] unconditionally and would crash
            # on a 'consumed samples' line without a grad norm.
            if found and found[0]:
                grad_norm.append(float(found[0]))
    return grad_norm
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def parse_monitor_output(output_dir):
    """Load monitor CSV outputs grouped by rank.

    Globs directories matching ``output_dir + '*'`` (each expected to contain
    'rank<N>' in its name) and reads every CSV inside, splitting the tables
    into reduced and unreduced gradient groups by filename.

    Args:
        output_dir: path prefix of the monitor output directories,
            e.g. 'monitor_output/Aug12_07-16'.

    Returns:
        tuple[dict, dict]: (reduced, unreduced), each mapping rank -> list of
        pandas DataFrames.
        NOTE(review): list order follows os.listdir and is filesystem
        dependent; callers index these lists by step — confirm the file
        naming guarantees the expected order.
    """
    # Raw string for the lookbehind; compiled once instead of per directory.
    rank_pattern = re.compile(r'(?<=rank)[\d]*')
    reduced = {}
    unreduced = {}
    for directory in glob(output_dir + '*'):
        rank = int(rank_pattern.findall(directory)[0])
        unreduced[rank] = []
        reduced[rank] = []
        for file in os.listdir(directory):
            df = pd.read_csv(os.path.join(directory, file))
            if '_unreduced_' in file:
                unreduced[rank].append(df)
            elif '_reduced_' in file:
                reduced[rank].append(df)
            else:
                logger.info(f'unexpected file {file} in {directory}')
    return reduced, unreduced
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def valid_reduce(reduced, unreduced, tp_size, dp_size, sequence_parallel):
    """Check that reduced gradients equal the dp-mean of unreduced gradients.

    For each parameter (taken from rank 0's first unreduced table), sums the
    per-rank 'mean' columns of the reduced and unreduced tables, normalizes
    the unreduced sum by dp_size (and by tp_size for parameters duplicated
    across the tp group), and compares the two with assert_equal.
    Mismatches are collected and logged instead of raised.

    Args:
        reduced: dict rank -> list of per-step DataFrames of reduced grads.
        unreduced: dict rank -> list of per-step DataFrames of unreduced grads.
        tp_size: tensor-parallel world size.
        dp_size: data-parallel world size.
        sequence_parallel: whether sequence parallelism is enabled.
    """
    world_size = len(reduced)
    errors = []
    for _, row in unreduced[0][0].iterrows():
        param = row['param_name']
        is_tp_duplicate = False
        # NOTE(review): only the first two steps are checked; confirm this is
        # intentional rather than iterating every monitored step.
        for step in range(2):
            # Sum the reduced means over ranks that monitored this step.
            reduced_mean = 0.
            for rank in range(world_size):
                if len(reduced[rank]) == 0:
                    continue
                df = reduced[rank][step]
                value = list(df[df['param_name'] == param]['mean'])
                if not value:
                    # A param missing on this rank at step 0 means it is
                    # duplicated across the tp group.
                    if step == 0:
                        is_tp_duplicate = True
                    continue
                reduced_mean += value[0]

            # Sum the unreduced means over all ranks.
            unreduced_mean = 0.
            for rank in range(world_size):
                df = unreduced[rank][step]
                value = list(df[df['param_name'] == param]['mean'])
                if not value:
                    continue
                # Reuse the value already extracted above instead of
                # re-filtering the DataFrame a second time.
                unreduced_mean += value[0]

            unreduced_mean /= dp_size
            if is_tp_duplicate and (not sequence_parallel or 'embedding' in param):
                unreduced_mean /= tp_size
            try:
                assert_equal(unreduced_mean, reduced_mean)
            except AssertionError as e:
                errors.append([param, step, e, is_tp_duplicate])
    if errors:
        logger.info(errors)
    else:
        logger.info('grad mean is consistent between unreduced grad and reduced grad monitored.')
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def assert_equal(a, b):
    """Assert that two scalars agree to within 1% relative difference.

    A pair of exact zeros is accepted. If only one side is zero, the other
    side's magnitude is used as the relative difference. (The original early
    return triggered when *either* operand was zero, which made the two
    zero-handling branches below unreachable and silently accepted real
    mismatches such as (0, 5).)

    Raises:
        AssertionError: if the relative difference is >= 0.01.
    """
    if a == 0 and b == 0:
        return
    if b == 0:
        rel_diff = a
    elif a == 0:
        rel_diff = b
    else:
        rel_diff = abs(a / b - 1)
    assert rel_diff < 0.01, f'{a}, {b}, {rel_diff}'
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def valid_total_norm(total_norm, reduced, duplicate_embedding):
    """Check the training-log grad norm against monitored reduced gradients.

    For each step, accumulates sum(norm**2) over every rank's reduced table
    (skipping 'word_embedding' rows when embeddings are duplicated across pp
    stages) and compares the square root with the grad norm parsed from the
    training log. Mismatches are collected and logged rather than raised.

    Args:
        total_norm: list of per-step grad norms from the training log.
        reduced: dict rank -> list of per-step DataFrames with a 'norm' column.
        duplicate_embedding: True when embeddings are tied and pp_size > 1, in
            which case embedding rows are excluded to avoid double counting.
    """
    steps = len(total_norm)
    world_size = len(reduced)
    errors = []
    for step in range(steps):
        calculated_norm = 0.
        for rank in range(world_size):
            if len(reduced[rank]) == 0:
                if step == 0:
                    logger.info(f'rank {rank} is duplicated in dp group')
                continue
            for _, row in reduced[rank][step].iterrows():
                if duplicate_embedding and 'word_embedding' in row['param_name']:
                    continue
                calculated_norm += row['norm'] ** 2
        try:
            assert_equal(calculated_norm ** 0.5, total_norm[step])
        except AssertionError as e:
            errors.append([step, e])
    if errors:
        # Single-message f-string, consistent with every other logger call in
        # this file (the original passed `errors` as a stray second argument).
        logger.info(f'total norm errors: {errors}')
    else:
        logger.info('grad norm is consistent between training log and reduced gradients monitored')
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--monitor_output', '-m', type=str, required=True,
                        help='path prefix to the output of monitor e.g. monitor_output/Aug12_07-16')
    parser.add_argument('--logfile', '-l', type=str, required=True, help='path to the training log file')
    parser.add_argument('--tp_size', '-t', type=int, required=True, help='tp parallel size')
    parser.add_argument('--dp_size', '-d', type=int, required=True, help='dp parallel size')
    parser.add_argument('--pp_size', '-p', type=int, required=True, help='pp parallel size')
    parser.add_argument('--untie_embeddings_and_output_weights', '-u', action="store_true", default=False,
                        help='whether untie_embeddings_and_output_weights in pp parallel')
    parser.add_argument('--sequence_parallel', '-s', action="store_true", default=False,
                        help='whether sequence parallel is enabled. Add -s to store true')

    args = parser.parse_args()

    # parser.error exits with a usage message and, unlike `assert`, is not
    # stripped when running under `python -O`. The original asserts also
    # copy-pasted 'tp' into the dp/pp messages.
    if args.tp_size <= 0:
        parser.error('if tp not enabled, set tp_size = 1')
    if args.dp_size <= 0:
        parser.error('if dp not enabled, set dp_size = 1')
    if args.pp_size <= 0:
        parser.error('if pp not enabled, set pp_size = 1')

    total_norm = parse_logfile(args.logfile)
    reduced, unreduced = parse_monitor_output(args.monitor_output)

    # Tied embeddings with pp > 1 appear on both the first and last stage.
    duplicate_embedding = not args.untie_embeddings_and_output_weights and args.pp_size > 1

    valid_total_norm(total_norm, reduced, duplicate_embedding)
    valid_reduce(reduced, unreduced, args.tp_size, args.dp_size, args.sequence_parallel)
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
import inspect
|
|
16
|
+
from collections import namedtuple
|
|
17
|
+
from datetime import timezone, timedelta
|
|
18
|
+
from functools import wraps
|
|
19
|
+
from datetime import datetime
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
|
|
23
|
+
import torch
|
|
24
|
+
|
|
25
|
+
from msprobe.core.common.const import MonitorConst, Const
|
|
26
|
+
from msprobe.pytorch.common.log import logger
|
|
27
|
+
from msprobe.core.common.utils import is_int
|
|
28
|
+
from msprobe.core.common.file_utils import check_file_or_directory_path
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Select the compute backend: Ascend NPU if torch_npu is importable,
# otherwise CUDA if available, otherwise CPU.
device = "cpu"
try:
    import torch_npu
    device = "npu"
except ImportError:
    if torch.cuda.is_available():
        device = "cuda"

# Reusable NaN scalar allocated once on the selected device.
NAN_TENSOR_ON_DEVICE = torch.tensor(torch.nan, device=device)
FILE_MAX_SIZE = 10 * 1024 * 1024 * 1024  # 10 GiB size cap
FILE_NAME_MAX_LENGTH = 255  # common filesystem limit for a file name
DIRECTORY_MAX_LENGTH = 4096  # common filesystem limit for a full path

# UTC+8 timezone used for timestamping monitor output.
beijing_tz = timezone(timedelta(hours=8))
# Optimizer momentum/variance statistics bundles; MVGradResult additionally
# carries the raw gradient.
MVResult = namedtuple('MVResult', ("exp_avg", "exp_avg_sq", "update", "ratio"))
MVGradResult = namedtuple('MVGradResult', ("exp_avg", "exp_avg_sq", "update", "ratio", "grad"))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class MsgConst:
    """
    Class for log messages const
    """
    # Characters that must not appear verbatim in log output: control
    # characters plus their percent-encoded forms; each is replaced by '_'
    # by filter_special_chars before a message is logged.
    SPECIAL_CHAR = ["\n", "\r", "\u007F", "\b", "\f", "\t", "\u000B", "%08", "%0a", "%0b", "%0c", "%0d", "%7f"]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def get_output_base_dir():
    """Return the monitor output base directory.

    Reads the environment variable named by MonitorConst.MONITOR_OUTPUT_DIR,
    falling back to MonitorConst.DEFAULT_MONITOR_OUTPUT_DIR when unset.
    """
    env_key = MonitorConst.MONITOR_OUTPUT_DIR
    fallback = MonitorConst.DEFAULT_MONITOR_OUTPUT_DIR
    return os.getenv(env_key, fallback)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def filter_special_chars(func):
    """Decorator: replace every MsgConst.SPECIAL_CHAR occurrence in the
    message with '_' before passing it on to the wrapped callable."""
    @wraps(func)
    def sanitized(msg):
        # Strip control characters and their encoded forms one by one.
        for bad in MsgConst.SPECIAL_CHAR:
            msg = msg.replace(bad, '_')
        return func(msg)

    return sanitized
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def get_param_struct(param):
    """Describe the structure of a hooked parameter.

    Returns a dict with a 'config' entry naming the container kind plus,
    for tensors, a 'tensor' entry and, for tuples/lists, one entry per
    element keyed by its index. Unsupported types are reported with a
    warning and described only by their type.
    """
    def _describe(item):
        # Tensors are summarized by shape/dtype; anything else by its type.
        if torch.is_tensor(item):
            return f'size={tuple(item.shape)}, dtype={item.dtype}'
        return f'{type(item)}'

    if isinstance(param, (tuple, list)):
        struct = {'config': f'{type(param).__name__}[{len(param)}]'}
        for idx, item in enumerate(param):
            struct[idx] = _describe(item)
        return struct
    if torch.is_tensor(param):
        return {'config': 'tensor', 'tensor': _describe(param)}
    logger.warning(f'Not support type({type(param)}) now, please check the type of param {param}')
    return {'config': f'{type(param)}'}
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def is_recomputation():
    """Check if the current operation is in the re-computation phase.

    This function inspects the current call stack to indicate whether the current operation is in the
    re-computation phase. We use a blacklist mechanism, now supported megatron and mindspeed framework.
    megatron: The 'backward' function is called by the 'torch/autograd/function.py' file.
    mindspeed: The 'checkpoint_function_backward' function is called by the 'torch/autograd/function.py'
    file or the custom module(use CheckpointWithoutOutput) with the 'backward' function is executed within the
    'torch/_tensor.py' file.

    Returns:
        bool: True if in the re-computation phase, False otherwise.
    """
    backward_function_indices = []
    call_stack = inspect.stack()

    # Identify the function 'backward' is being executed within the 'torch/_tensor.py' file.
    for frame_info in call_stack:
        if frame_info.function == Const.BACKWARD and frame_info.filename.endswith('torch/_tensor.py'):
            # Stack frames hold references to local variables; drop the list
            # eagerly before returning to avoid keeping them alive.
            del call_stack
            return True

    # Identify indices in the call stack where the specific function is being executed
    for idx, frame_info in enumerate(call_stack):
        if frame_info.function == Const.BACKWARD or frame_info.function == 'checkpoint_function_backward':
            backward_function_indices.append(idx)

    # Check if the execution is within 'torch/autograd/function.py' file
    for idx in backward_function_indices:
        # The Megatron and MindSpeed L0&L1 scenes
        if idx + 1 < len(call_stack) and call_stack[idx + 1].filename.endswith('torch/autograd/function.py'):
            del call_stack
            return True
        # The latest MindSpeed L2 and ModelLink scenes
        if idx + 2 < len(call_stack) and call_stack[idx + 2].filename.endswith('torch/autograd/function.py'):
            del call_stack
            return True

    del call_stack
    return False
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def validate_ops(ops):
    """Filter the configured statistics operators down to the supported set.

    Unsupported entries are dropped with a warning; when nothing survives,
    the first operator in MonitorConst.OP_LIST is used as the default.

    Raises:
        TypeError: if ops is not a list.

    Returns:
        list: the supported operators (never empty).
    """
    if not isinstance(ops, list):
        raise TypeError("ops should be a list")
    valid_ops = []
    for op in ops:
        if op in MonitorConst.OP_LIST:
            valid_ops.append(op)
        else:
            logger.warning(f"op {op} is not supported. Optional ops: {MonitorConst.OP_LIST}")
    if not valid_ops:
        default_op = MonitorConst.OP_LIST[0]
        valid_ops.append(default_op)
        logger.info_on_rank_0(f"There is no valid ops, default op {default_op} is used")
    return valid_ops
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def validate_ranks(ranks):
    """Validate the module_ranks config entry.

    Every element must be a plain int; bool is rejected explicitly because
    isinstance(True, int) is True in Python.

    Raises:
        TypeError: if ranks is not a list or any element is not an int.
    """
    if not isinstance(ranks, list):
        raise TypeError("module_ranks should be a list")
    for rank in ranks:
        if not isinstance(rank, int) or isinstance(rank, bool):
            # Message grammar fixed: "a int" -> "an int".
            raise TypeError(f"element in module_ranks should be an int, get {type(rank)}")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def validate_targets(targets):
    """Validate the 'targets' section of config.json.

    Expects a dict mapping module names (str) to a dict of monitored fields,
    e.g. {"input": "tensor"}.

    Raises:
        TypeError: if the mapping or any key/value has the wrong type.
    """
    if not isinstance(targets, dict):
        raise TypeError('targets in config.json should be a dict')
    for name, fields in targets.items():
        if not isinstance(name, str):
            raise TypeError('key of targets should be module_name[str] in config.json')
        if not isinstance(fields, dict):
            raise TypeError('values of targets should be cared filed e.g. {"input": "tensor"} in config.json')
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def validate_print_struct(print_struct):
    """Ensure the print_struct config flag is a bool.

    Raises:
        TypeError: if print_struct is not a bool.
    """
    if isinstance(print_struct, bool):
        return
    raise TypeError("print_struct should be a bool")
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def validate_ur_distribution(ur_distribution):
    """Raise TypeError unless ``ur_distribution`` is a bool."""
    if isinstance(ur_distribution, bool):
        return
    raise TypeError('ur_distribution should be a bool')
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def validate_xy_distribution(xy_distribution):
    """Raise TypeError unless ``xy_distribution`` is a bool."""
    if isinstance(xy_distribution, bool):
        return
    raise TypeError('xy_distribution should be a bool')
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def validate_wg_distribution(wg_distribution):
    """Raise TypeError unless ``wg_distribution`` is a bool."""
    if isinstance(wg_distribution, bool):
        return
    raise TypeError('wg_distribution should be a bool')
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def validate_mg_distribution(mg_distribution):
    """Raise TypeError unless ``mg_distribution`` is a bool."""
    if isinstance(mg_distribution, bool):
        return
    raise TypeError('mg_distribution should be a bool')
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def validate_param_distribution(param_distribution):
    """Raise TypeError unless ``param_distribution`` is a bool."""
    if isinstance(param_distribution, bool):
        return
    raise TypeError('param_distribution should be a bool')
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def validate_cc_distribution(cc_distribution):
    """Validate the ``cc_distribution`` config section key by key.

    Raises TypeError on a non-dict argument, an unsupported key, or a
    value of the wrong type for its key.
    """
    if not isinstance(cc_distribution, dict):
        raise TypeError('cc_distribution should be a dictionary')
    # Allowed keys -> (expected type, error message on mismatch).
    expected = {
        'enable': (bool, 'cc_distribution enable should be a bool'),
        'cc_codeline': (list, 'cc_distribution cc_codeline should be a list'),
        'cc_pre_hook': (bool, 'cc_distribution cc_pre_hook should be a bool'),
        'cc_log_only': (bool, 'cc_distribution cc_log_only should be a bool'),
    }
    for key, value in cc_distribution.items():
        if key not in expected:
            raise TypeError(f'{key} of cc_distribution is not supported.')
        required_type, message = expected[key]
        if not isinstance(value, required_type):
            raise TypeError(message)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def validate_squash_name(squash_name):
    """Raise TypeError unless ``squash_name`` is a bool."""
    if isinstance(squash_name, bool):
        return
    raise TypeError('squash_name should be a bool')
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def validate_alert(alert):
    """Validate the ``alert`` config section.

    Checks that every rule name is known, that each rule's threshold (when
    args are given) is a non-negative float, and that ``dump`` is a bool
    when present. Raises TypeError on any violation. Sections that are
    absent or malformed in a falsy way are silently skipped, matching the
    lenient contract of the original implementation.
    """
    if not isinstance(alert, dict):
        raise TypeError('alert should be a dictionary')
    rules = alert.get('rules')
    rule_list = rules if rules and isinstance(rules, list) else []
    for rule in rule_list:
        name = rule.get("rule_name")
        if name and name not in MonitorConst.RULE_NAME:
            raise TypeError(f"{name} is not supported")
        args = rule.get("args")
        if not (args and isinstance(args, dict)):
            continue
        threshold = args.get("threshold")
        if not isinstance(threshold, float) or threshold < 0:
            raise TypeError('threshold must be float and not less than 0')
    dump = alert.get('dump')
    if dump and not isinstance(dump, bool):
        raise TypeError('dump must be bool.')
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def validate_step_count_per_record(step_count_per_record):
    """Validate that ``step_count_per_record`` is an int in [1, 1e6].

    Raises TypeError for non-int input (per the project ``is_int`` helper)
    and ValueError when the value is out of range.
    """
    if not is_int(step_count_per_record):
        raise TypeError('step_count_per_record must be int.')
    too_small = step_count_per_record < 1
    too_large = step_count_per_record > 1e6
    if too_small:
        raise ValueError("step_count_per_record must greater than 0")
    if too_large:
        raise ValueError("step_count_per_record must smaller than 1e6")
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def validate_config(config):
    """Validate the monitor configuration dict, mutating it in place.

    Normalizes config['ops'] via validate_ops and runs the per-section
    validators, each of which raises TypeError/ValueError on malformed
    input. When no targets are configured, installs a catch-all target
    (and enables 'all_xy' if xy_distribution is on).
    """
    # validate_ops both filters and defaults, so its result is written back.
    config['ops'] = validate_ops(config.get('ops', []))

    eps = config.get('eps', 1e-8)
    if not isinstance(eps, float):
        raise TypeError("eps should be a float")

    ranks = config.get("module_ranks", [])
    validate_ranks(ranks)

    targets = config.get("targets", {})
    validate_targets(targets)

    print_struct = config.get('print_struct', False)
    validate_print_struct(print_struct)

    ur_distribution = config.get('ur_distribution', False)
    validate_ur_distribution(ur_distribution)

    xy_distribution = config.get('xy_distribution', False)
    validate_xy_distribution(xy_distribution)

    wg_distribution = config.get('wg_distribution', False)
    validate_wg_distribution(wg_distribution)

    mg_distribution = config.get('mg_distribution', False)
    validate_mg_distribution(mg_distribution)

    param_distribution = config.get('param_distribution', False)
    validate_param_distribution(param_distribution)

    cc_distribution = config.get('cc_distribution', {})
    validate_cc_distribution(cc_distribution)

    alert = config.get('alert', {})
    validate_alert(alert)

    step_count_per_record = config.get('step_count_per_record', 1)
    validate_step_count_per_record(step_count_per_record)

    squash_name = config.get('squash_name', True)
    validate_squash_name(squash_name)

    if not targets:
        if xy_distribution:
            config["all_xy"] = True
        # NOTE(review): indentation is ambiguous in this view; the catch-all
        # target is assumed to apply whenever targets is empty (not only when
        # xy_distribution is set) -- confirm against the upstream source.
        config["targets"] = {"": {}}
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def time_str2time_digit(time_str):
    """Parse a 'Dec03_21-34-40'-style time tag into a datetime.

    strptime supplies the default year (1900) since the tag carries none.
    Raises RuntimeError (chained to the parse error) on malformed input.
    """
    try:
        return datetime.strptime(time_str, '%b%d_%H-%M-%S')
    except Exception as e:
        raise RuntimeError(f"illegal timestamp: {time_str}, timestamp should be prefix \
of existing output dirpath, like 'Dec03_21-34-40'.") from e
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def get_target_output_dir(monitor_path, time_start, time_end):
    """Map rank -> output directory for dirs whose time tag is in range.

    Scans ``monitor_path`` for directories matching
    MonitorConst.OUTPUT_DIR_PATTERN and keeps those whose embedded time
    tag lies within [time_start, time_end]; a None bound is open-ended.
    Raises ValueError when the parsed start is after the parsed end.
    """
    check_file_or_directory_path(monitor_path, isdir=True)
    time_start = time_str2time_digit(time_start) if time_start is not None else time_start
    time_end = time_str2time_digit(time_end) if time_end is not None else time_end
    if time_start and time_end and time_start > time_end:
        raise ValueError(f"time_start({time_start}) greater than time_end({time_end})")
    result = {}
    for dirname in os.listdir(monitor_path):
        match = re.match(MonitorConst.OUTPUT_DIR_PATTERN, dirname)
        if match is None:
            continue
        time_tag, rank = match.group(1), match.group(2)
        tag_time = time_str2time_digit(time_tag)
        after_start = time_start is None or tag_time >= time_start
        before_end = time_end is None or tag_time <= time_end
        if after_start and before_end:
            result[rank] = os.path.join(monitor_path, dirname)
    return result
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import torch
|
|
17
|
+
import numpy as np
|
|
18
|
+
import matplotlib.pyplot as plt
|
|
19
|
+
from msprobe.pytorch.monitor.features import cal_histc
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class HeatmapVisualizer:
    """Accumulates per-step histograms of a tensor and renders them as a
    step-vs-value-range heatmap image on a TensorBoard summary writer.

    Usage: call pre_cal(tensor) each step, then visualize(...) to append
    the cached histogram as a new column and emit the updated heatmap.
    """

    def __init__(self) -> None:
        # Fixed histogram layout: 30 bins over [-1, 1].
        self.histogram_bins_num = 30
        self.min_val = -1
        self.max_val = 1
        # Accumulated histograms; matrix shape is [bins_num * total_step].
        self.histogram_sum_data_np = None
        # Histogram of the most recent tensor passed to pre_cal().
        self.cur_step_histogram_data = None
        # Fix: the original assigned histogram_edges = None and immediately
        # overwrote it; the dead assignment has been removed.
        self.histogram_edges = torch.linspace(self.min_val, self.max_val, self.histogram_bins_num)

    def pre_cal(self, tensor):
        """Compute and cache the histogram of *tensor* for the current step."""
        self.cur_step_histogram_data = cal_histc(tensor_cal=tensor, bins_total=self.histogram_bins_num,
                                                 min_val=self.min_val, max_val=self.max_val)

    def visualize(self, tag_name: str, step, summary_writer):
        """Append the cached histogram and emit the heatmap image.

        pre_cal() must have been called before each visualize() call,
        otherwise cur_step_histogram_data is None and this raises.
        """
        if self.histogram_sum_data_np is None or self.histogram_sum_data_np.size == 0:
            self.histogram_sum_data_np = np.expand_dims(self.cur_step_histogram_data.cpu(), 0).T
        else:
            # add new data along a different axis because we transposed early
            # matrix shape is [bins_num * total_step]
            self.histogram_sum_data_np = np.concatenate((self.histogram_sum_data_np, np.expand_dims(
                self.cur_step_histogram_data.cpu(), 1)), axis=1)

        fig, ax = plt.subplots()
        cax = ax.matshow(self.histogram_sum_data_np, cmap='hot', aspect='auto')
        fig.colorbar(cax)

        # Label each row (bin) with its lower edge value.
        lbs = [f'{self.histogram_edges[i]:.2f}' for i in range(self.histogram_bins_num)]
        plt.yticks(ticks=range(self.histogram_bins_num), labels=lbs)
        ax.set_xlabel('Step')
        ax.set_ylabel('Value Range')
        plt.title(f'Total Step: {step}')

        # Convert matplotlib figure to an image format suitable for TensorBoard
        fig.canvas.draw()
        image = torch.from_numpy(np.array(fig.canvas.renderer.buffer_rgba()))
        # Close the figure to avoid leaking matplotlib state across steps.
        plt.close(fig)
        summary_writer.add_image(tag_name, image.permute(2, 0, 1), global_step=step, dataformats='CHW')
|
|
@@ -12,9 +12,8 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
__all__ = ["PtdbgDispatch"]
|
|
16
|
+
|
|
15
17
|
from signal import signal, SIGPIPE, SIG_DFL
|
|
16
18
|
from .dispatch import PtdbgDispatch
|
|
17
19
|
signal(SIGPIPE, SIG_DFL)
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
__all__ = ["PtdbgDispatch"]
|
|
@@ -1,16 +1,30 @@
|
|
|
1
|
-
#
|
|
2
|
-
|
|
3
|
-
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
4
16
|
import csv
|
|
5
17
|
import json
|
|
18
|
+
import os
|
|
19
|
+
import sys
|
|
6
20
|
from collections import namedtuple
|
|
7
|
-
|
|
8
|
-
from rich.console import Console
|
|
21
|
+
|
|
9
22
|
from msprobe.core.common.const import CompareConst, FileCheckConst
|
|
10
|
-
from msprobe.core.common.file_utils import
|
|
23
|
+
from msprobe.core.common.file_utils import read_csv, get_json_contents, write_csv
|
|
24
|
+
from msprobe.core.common.utils import check_op_str_pattern_valid
|
|
11
25
|
from msprobe.pytorch.online_dispatch.single_compare import single_benchmark_compare_wrap
|
|
12
|
-
from
|
|
13
|
-
from
|
|
26
|
+
from rich.console import Console
|
|
27
|
+
from rich.table import Table
|
|
14
28
|
|
|
15
29
|
ELEMENT_NUM_THRESHOLD = 100
|
|
16
30
|
ZERO_NUM_THRESHOLD = 0.1
|
|
@@ -19,30 +33,6 @@ FLOAT_PRECISION = 14
|
|
|
19
33
|
ResultInfo = namedtuple('ResultInfo', ['api_name', 'is_fwd_success', 'is_bwd_success',
|
|
20
34
|
'fwd_compare_alg_results', 'bwd_compare_alg_results'])
|
|
21
35
|
|
|
22
|
-
def get_file_content_bytes(file):
|
|
23
|
-
with FileOpen(file, 'rb') as file_handle:
|
|
24
|
-
return file_handle.read()
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def get_json_contents(file_path):
|
|
28
|
-
ops = get_file_content_bytes(file_path)
|
|
29
|
-
try:
|
|
30
|
-
json_obj = json.loads(ops)
|
|
31
|
-
except ValueError as error:
|
|
32
|
-
logger.error('Failed to load "%s". %s' % (file_path, str(error)))
|
|
33
|
-
raise CompareException(CompareException.INVALID_FILE_ERROR) from error
|
|
34
|
-
if not isinstance(json_obj, dict):
|
|
35
|
-
logger.error('Json file %s, content is not a dictionary!' % file_path)
|
|
36
|
-
raise CompareException(CompareException.INVALID_FILE_ERROR)
|
|
37
|
-
return json_obj
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def write_csv(data, filepath):
|
|
41
|
-
with FileOpen(filepath, 'a', encoding='utf-8-sig') as f:
|
|
42
|
-
writer = csv.writer(f)
|
|
43
|
-
writer.writerows(data)
|
|
44
|
-
change_mode(filepath, FileCheckConst.DATA_FILE_AUTHORITY)
|
|
45
|
-
|
|
46
36
|
|
|
47
37
|
class Saver:
|
|
48
38
|
# consts for result csv
|
|
@@ -62,14 +52,15 @@ class Saver:
|
|
|
62
52
|
}
|
|
63
53
|
|
|
64
54
|
def write_csv_title(self):
|
|
65
|
-
summary_test_rows = [
|
|
55
|
+
summary_test_rows = [
|
|
56
|
+
[self.COLUMN_API_NAME, self.COLUMN_FORWARD_SUCCESS, self.COLUMN_BACKWARD_SUCCESS, "Message"]]
|
|
66
57
|
write_csv(summary_test_rows, self.save_path)
|
|
67
58
|
|
|
68
59
|
detail_test_rows = [[
|
|
69
60
|
"Npu Name", "Bench Dtype", "NPU Dtype", "Shape",
|
|
70
61
|
"error_balance", "max_abs_diff", "max_abs_idx",
|
|
71
62
|
"max_rel_diff", "max_rel_idx", "eb_thd",
|
|
72
|
-
"error_thd", "Status","Message"
|
|
63
|
+
"error_thd", "Status", "Message"
|
|
73
64
|
]]
|
|
74
65
|
write_csv(detail_test_rows, self.detail_save_path)
|
|
75
66
|
|
|
@@ -106,7 +97,7 @@ class Saver:
|
|
|
106
97
|
console.print(table_detail)
|
|
107
98
|
|
|
108
99
|
def get_statistics_from_result_csv(self):
|
|
109
|
-
checklist = [CompareConst.TRUE, CompareConst.FALSE, CompareConst.
|
|
100
|
+
checklist = [CompareConst.TRUE, CompareConst.FALSE, CompareConst.N_A, CompareConst.SKIP]
|
|
110
101
|
data = read_csv(self.save_path)
|
|
111
102
|
result_csv_name = os.path.basename(self.save_path)
|
|
112
103
|
for _, row in data.iterrows():
|
|
@@ -121,7 +112,7 @@ class Saver:
|
|
|
121
112
|
if column1 == CompareConst.SKIP:
|
|
122
113
|
continue
|
|
123
114
|
self.test_result_cnt["total_num"] += 1
|
|
124
|
-
if column1 == CompareConst.TRUE and column2 in [CompareConst.TRUE,
|
|
115
|
+
if column1 == CompareConst.TRUE and column2 in [CompareConst.TRUE, CompareConst.N_A]:
|
|
125
116
|
self.test_result_cnt['success_num'] += 1
|
|
126
117
|
elif column1 == CompareConst.FALSE and column2 == CompareConst.FALSE:
|
|
127
118
|
self.test_result_cnt['forward_and_backward_fail_num'] += 1
|
|
@@ -228,8 +219,8 @@ class Comparator:
|
|
|
228
219
|
is_bwd_success, bwd_compare_alg_results = True, None
|
|
229
220
|
if is_bwd_success and bwd_compare_alg_results is None:
|
|
230
221
|
self.saver.record_results(ResultInfo(api_name, is_fwd_success, CompareConst.NAN, fwd_compare_alg_results,
|
|
231
|
-
|
|
222
|
+
bwd_compare_alg_results))
|
|
232
223
|
else:
|
|
233
224
|
self.saver.record_results(ResultInfo(api_name, is_fwd_success, is_bwd_success, fwd_compare_alg_results,
|
|
234
|
-
|
|
225
|
+
bwd_compare_alg_results))
|
|
235
226
|
return is_fwd_success, is_bwd_success
|