mindstudio-probe 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.1.1.dist-info}/METADATA +5 -5
  2. mindstudio_probe-1.1.1.dist-info/RECORD +341 -0
  3. {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.1.1.dist-info}/WHEEL +1 -1
  4. {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.1.1.dist-info}/entry_points.txt +0 -1
  5. msprobe/README.md +39 -3
  6. msprobe/config.json +1 -3
  7. msprobe/core/advisor/advisor.py +8 -3
  8. msprobe/core/common/const.py +113 -13
  9. msprobe/core/common/exceptions.py +25 -3
  10. msprobe/core/common/file_utils.py +150 -26
  11. msprobe/core/common/inplace_op_checker.py +15 -0
  12. msprobe/core/common/log.py +27 -9
  13. msprobe/core/common/utils.py +182 -69
  14. msprobe/core/common_config.py +44 -15
  15. msprobe/core/compare/acc_compare.py +207 -142
  16. msprobe/core/compare/check.py +2 -5
  17. msprobe/core/compare/compare_cli.py +21 -4
  18. msprobe/core/compare/highlight.py +124 -55
  19. msprobe/core/compare/layer_mapping/__init__.py +19 -0
  20. msprobe/core/compare/layer_mapping/data_scope_parser.py +235 -0
  21. msprobe/core/compare/layer_mapping/layer_mapping.py +242 -0
  22. msprobe/core/compare/layer_mapping/postprocess_pass.py +94 -0
  23. msprobe/core/compare/npy_compare.py +52 -23
  24. msprobe/core/compare/utils.py +272 -247
  25. msprobe/core/data_dump/data_collector.py +13 -11
  26. msprobe/core/data_dump/data_processor/base.py +46 -16
  27. msprobe/core/data_dump/data_processor/mindspore_processor.py +4 -4
  28. msprobe/core/data_dump/data_processor/pytorch_processor.py +156 -59
  29. msprobe/core/data_dump/scope.py +113 -34
  30. msprobe/core/grad_probe/constant.py +27 -13
  31. msprobe/core/grad_probe/grad_compare.py +18 -1
  32. msprobe/core/grad_probe/utils.py +30 -2
  33. msprobe/core/overflow_check/abnormal_scene.py +185 -0
  34. msprobe/core/overflow_check/api_info.py +55 -0
  35. msprobe/core/overflow_check/checker.py +138 -0
  36. msprobe/core/overflow_check/filter.py +157 -0
  37. msprobe/core/overflow_check/ignore_rules.yaml +55 -0
  38. msprobe/core/overflow_check/level.py +22 -0
  39. msprobe/core/overflow_check/utils.py +28 -0
  40. msprobe/docs/01.installation.md +10 -0
  41. msprobe/docs/02.config_introduction.md +49 -22
  42. msprobe/docs/03.config_examples.md +2 -9
  43. msprobe/docs/04.kernel_dump_PyTorch.md +73 -0
  44. msprobe/docs/05.data_dump_PyTorch.md +3 -1
  45. msprobe/docs/06.data_dump_MindSpore.md +157 -90
  46. msprobe/docs/07.accuracy_checker_PyTorch.md +12 -12
  47. msprobe/docs/08.accuracy_checker_online_PyTorch.md +1 -6
  48. msprobe/docs/09.accuracy_checker_MindSpore.md +44 -8
  49. msprobe/docs/10.accuracy_compare_PyTorch.md +19 -13
  50. msprobe/docs/11.accuracy_compare_MindSpore.md +104 -13
  51. msprobe/docs/12.overflow_check_PyTorch.md +1 -1
  52. msprobe/docs/13.overflow_check_MindSpore.md +6 -6
  53. msprobe/docs/15.free_benchmarking_PyTorch.md +4 -5
  54. msprobe/docs/16.free_benchmarking_MindSpore.md +56 -37
  55. msprobe/docs/17.grad_probe.md +5 -6
  56. msprobe/docs/19.monitor.md +468 -0
  57. msprobe/docs/20.monitor_performance_baseline.md +52 -0
  58. msprobe/docs/21.visualization_PyTorch.md +386 -0
  59. msprobe/docs/22.visualization_MindSpore.md +384 -0
  60. msprobe/docs/23.tool_function_introduction.md +28 -0
  61. msprobe/docs/FAQ.md +3 -0
  62. msprobe/docs/data_dump_Mindspore/dynamic_graph_quick_start_example.md +211 -0
  63. msprobe/docs/img/compare_result.png +0 -0
  64. msprobe/docs/img/monitor/cpu_info.png +0 -0
  65. msprobe/mindspore/__init__.py +15 -0
  66. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +113 -145
  67. msprobe/mindspore/api_accuracy_checker/api_info.py +21 -6
  68. msprobe/mindspore/api_accuracy_checker/api_runner.py +43 -18
  69. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +21 -7
  70. msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +77 -0
  71. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +63 -1
  72. msprobe/mindspore/api_accuracy_checker/compute_element.py +59 -24
  73. msprobe/mindspore/api_accuracy_checker/data_manager.py +264 -0
  74. msprobe/mindspore/api_accuracy_checker/main.py +27 -3
  75. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +206 -0
  76. msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +58 -0
  77. msprobe/mindspore/api_accuracy_checker/type_mapping.py +22 -5
  78. msprobe/mindspore/api_accuracy_checker/utils.py +34 -17
  79. msprobe/mindspore/cell_processor.py +33 -12
  80. msprobe/mindspore/common/const.py +33 -13
  81. msprobe/mindspore/common/log.py +5 -9
  82. msprobe/mindspore/common/utils.py +43 -4
  83. msprobe/mindspore/compare/distributed_compare.py +22 -22
  84. msprobe/mindspore/compare/ms_compare.py +271 -248
  85. msprobe/mindspore/compare/ms_graph_compare.py +81 -47
  86. msprobe/mindspore/debugger/debugger_config.py +4 -1
  87. msprobe/mindspore/debugger/precision_debugger.py +7 -1
  88. msprobe/mindspore/dump/dump_tool_factory.py +3 -1
  89. msprobe/mindspore/dump/hook_cell/api_registry.py +12 -2
  90. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +13 -16
  91. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +25 -0
  92. msprobe/mindspore/dump/jit_dump.py +17 -5
  93. msprobe/mindspore/dump/kernel_graph_dump.py +2 -4
  94. msprobe/mindspore/dump/kernel_kbyk_dump.py +2 -4
  95. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +140 -0
  96. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +53 -0
  97. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +145 -39
  98. msprobe/mindspore/free_benchmark/common/handler_params.py +1 -2
  99. msprobe/mindspore/free_benchmark/common/utils.py +19 -4
  100. msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -204
  101. msprobe/mindspore/free_benchmark/handler/base_handler.py +3 -3
  102. msprobe/mindspore/free_benchmark/handler/check_handler.py +4 -5
  103. msprobe/mindspore/free_benchmark/handler/fix_handler.py +4 -4
  104. msprobe/mindspore/free_benchmark/handler/handler_factory.py +4 -4
  105. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +2 -2
  106. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +15 -6
  107. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +4 -4
  108. msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +2 -2
  109. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +13 -6
  110. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +2 -2
  111. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +2 -2
  112. msprobe/mindspore/grad_probe/global_context.py +28 -8
  113. msprobe/mindspore/grad_probe/grad_analyzer.py +27 -13
  114. msprobe/mindspore/grad_probe/grad_monitor.py +16 -1
  115. msprobe/mindspore/grad_probe/grad_stat_csv.py +33 -5
  116. msprobe/mindspore/grad_probe/hook.py +24 -10
  117. msprobe/mindspore/grad_probe/utils.py +18 -5
  118. msprobe/mindspore/ms_config.py +22 -15
  119. msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +2 -4
  120. msprobe/mindspore/runtime.py +15 -0
  121. msprobe/mindspore/service.py +36 -30
  122. msprobe/mindspore/task_handler_factory.py +15 -0
  123. msprobe/msprobe.py +24 -7
  124. msprobe/pytorch/__init__.py +3 -2
  125. msprobe/pytorch/api_accuracy_checker/common/config.py +62 -0
  126. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -4
  127. msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +9 -0
  128. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +454 -0
  129. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +365 -0
  130. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +6 -1
  131. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +19 -14
  132. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +13 -9
  133. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +77 -53
  134. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +15 -4
  135. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +9 -24
  136. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +4 -12
  137. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +9 -4
  138. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +3 -11
  139. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +2 -2
  140. msprobe/pytorch/bench_functions/confusion_transpose.py +5 -1
  141. msprobe/pytorch/bench_functions/matmul_backward.py +12 -0
  142. msprobe/pytorch/bench_functions/npu_fusion_attention.py +100 -6
  143. msprobe/pytorch/bench_functions/rotary_mul.py +4 -0
  144. msprobe/pytorch/bench_functions/swiglu.py +10 -2
  145. msprobe/pytorch/common/parse_json.py +6 -6
  146. msprobe/pytorch/common/utils.py +56 -5
  147. msprobe/pytorch/compare/distributed_compare.py +8 -9
  148. msprobe/pytorch/compare/pt_compare.py +8 -6
  149. msprobe/pytorch/debugger/debugger_config.py +19 -15
  150. msprobe/pytorch/dump/kernel_dump/kernel_config.py +33 -0
  151. msprobe/pytorch/free_benchmark/common/constant.py +15 -0
  152. msprobe/pytorch/free_benchmark/common/counter.py +15 -0
  153. msprobe/pytorch/free_benchmark/common/enums.py +15 -0
  154. msprobe/pytorch/free_benchmark/common/params.py +8 -1
  155. msprobe/pytorch/free_benchmark/common/utils.py +26 -4
  156. msprobe/pytorch/free_benchmark/compare/grad_saver.py +20 -3
  157. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +2 -0
  158. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -1
  159. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +6 -4
  160. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +2 -0
  161. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +4 -0
  162. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +10 -0
  163. msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +6 -5
  164. msprobe/pytorch/grad_probe/grad_monitor.py +23 -6
  165. msprobe/pytorch/grad_probe/grad_stat_csv.py +40 -10
  166. msprobe/pytorch/hook_module/support_wrap_ops.yaml +1 -0
  167. msprobe/pytorch/hook_module/wrap_functional.py +14 -12
  168. msprobe/pytorch/module_processer.py +2 -5
  169. msprobe/pytorch/monitor/anomaly_analyse.py +201 -0
  170. msprobe/pytorch/monitor/anomaly_detect.py +340 -0
  171. msprobe/pytorch/monitor/distributed/__init__.py +0 -0
  172. msprobe/pytorch/monitor/distributed/distributed_ops.yaml +19 -0
  173. msprobe/pytorch/monitor/distributed/stack_blacklist.yaml +5 -0
  174. msprobe/pytorch/monitor/distributed/wrap_distributed.py +272 -0
  175. msprobe/pytorch/monitor/features.py +108 -0
  176. msprobe/pytorch/monitor/module_hook.py +870 -0
  177. msprobe/pytorch/monitor/module_metric.py +193 -0
  178. msprobe/pytorch/monitor/module_spec_verifier.py +93 -0
  179. msprobe/pytorch/monitor/optimizer_collect.py +295 -0
  180. msprobe/pytorch/monitor/unittest/__init__.py +0 -0
  181. msprobe/pytorch/monitor/unittest/test_monitor.py +145 -0
  182. msprobe/pytorch/monitor/utils.py +250 -0
  183. msprobe/pytorch/monitor/visualizer.py +59 -0
  184. msprobe/pytorch/online_dispatch/__init__.py +2 -3
  185. msprobe/pytorch/online_dispatch/compare.py +29 -38
  186. msprobe/pytorch/online_dispatch/dispatch.py +50 -25
  187. msprobe/pytorch/online_dispatch/dump_compare.py +21 -9
  188. msprobe/pytorch/online_dispatch/single_compare.py +53 -32
  189. msprobe/pytorch/online_dispatch/torch_ops_config.yaml +1 -1
  190. msprobe/pytorch/online_dispatch/utils.py +49 -21
  191. msprobe/pytorch/parse_tool/lib/compare.py +12 -18
  192. msprobe/pytorch/parse_tool/lib/config.py +1 -1
  193. msprobe/pytorch/parse_tool/lib/parse_tool.py +1 -2
  194. msprobe/pytorch/parse_tool/lib/utils.py +16 -35
  195. msprobe/pytorch/parse_tool/lib/visualization.py +2 -0
  196. msprobe/pytorch/pt_config.py +31 -8
  197. msprobe/pytorch/service.py +15 -5
  198. msprobe/visualization/__init__.py +14 -0
  199. msprobe/visualization/builder/__init__.py +14 -0
  200. msprobe/visualization/builder/graph_builder.py +165 -0
  201. msprobe/visualization/builder/msprobe_adapter.py +205 -0
  202. msprobe/visualization/compare/__init__.py +14 -0
  203. msprobe/visualization/compare/graph_comparator.py +130 -0
  204. msprobe/visualization/compare/mode_adapter.py +211 -0
  205. msprobe/visualization/graph/__init__.py +14 -0
  206. msprobe/visualization/graph/base_node.py +124 -0
  207. msprobe/visualization/graph/graph.py +200 -0
  208. msprobe/visualization/graph/node_colors.py +95 -0
  209. msprobe/visualization/graph/node_op.py +39 -0
  210. msprobe/visualization/graph_service.py +214 -0
  211. msprobe/visualization/utils.py +232 -0
  212. mindstudio_probe-1.1.0.dist-info/RECORD +0 -287
  213. msprobe/docs/04.acl_config_examples.md +0 -78
  214. msprobe/mindspore/compare/layer_mapping.py +0 -146
  215. msprobe/mindspore/compare/modify_mapping.py +0 -107
  216. msprobe/mindspore/free_benchmark/decorator/dec_forward.py +0 -57
  217. msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +0 -122
  218. {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.1.1.dist-info}/LICENSE +0 -0
  219. {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.1.1.dist-info}/top_level.txt +0 -0
  220. /msprobe/{mindspore/free_benchmark/decorator → pytorch/monitor}/__init__.py +0 -0
@@ -0,0 +1,145 @@
+ import os
+ import re
+ import argparse
+ from glob import glob
+
+ import pandas as pd
+
+ from msprobe.core.common.log import logger
+
+
+ def parse_logfile(logfile):
+     grad_norm = []
+     step = []
+     with open(logfile) as f:
+         for line in f.readlines():
+             if 'consumed samples' in line:
+                 grad_norm.append(float(re.findall('(?<=grad norm\: )[\d\.]*', line)[0]))
+     return grad_norm
+
+
+ def parse_monitor_output(output_dir):
+     reduced = {}
+     unreduced = {}
+     for dir in glob(output_dir + '*'):
+         rank = int(re.findall('(?<=rank)[\d]*', dir)[0])
+         unreduced[rank] = []
+         reduced[rank] = []
+         for file in os.listdir(dir):
+             df = pd.read_csv(os.path.join(dir, file))
+             if '_unreduced_' in file:
+                 unreduced[rank].append(df)
+                 pass
+             elif '_reduced_' in file:
+                 reduced[rank].append(df)
+             else:
+                 logger.info(f'unexpected file {file} in {dir}')
+     return reduced, unreduced
+
+
+ def valid_reduce(reduced, unreduced, tp_size, dp_size, sequence_parallel):
+     steps = len(reduced[0])
+     world_size = len(reduced)
+     errors = []
+     for index, row in unreduced[0][0].iterrows():
+         param = row['param_name']
+         is_tp_duplicate = False
+         for step in range(2):
+             # sum reduced
+             reduced_mean = 0.
+             for rank in range(world_size):
+                 if len(reduced[rank]) == 0:
+                     continue
+                 df = reduced[rank][step]
+                 value = list(df[df['param_name'] == param]['mean'])
+                 if not value:
+                     if step == 0:
+                         is_tp_duplicate = True
+                     continue
+                 reduced_mean += value[0]
+
+             # sum unreduced
+             unreduced_mean = 0.
+             for rank in range(world_size):
+                 df = unreduced[rank][step]
+                 value = list(df[df['param_name'] == param]['mean'])
+                 if not value:
+                     continue
+                 unreduced_mean += list(df[df['param_name'] == param]['mean'])[0]
+
+             unreduced_mean /= dp_size
+             if is_tp_duplicate and (not sequence_parallel or 'embedding' in param):
+                 unreduced_mean /= tp_size
+             try:
+                 assert_equal(unreduced_mean, reduced_mean)
+             except AssertionError as e:
+                 errors.append([param, step, e, is_tp_duplicate])
+     if errors:
+         logger.info(errors)
+     else:
+         logger.info(f'grad mean is in consist between unreduced grad and reduced grad monitord.')
+
+
+ def assert_equal(a, b):
+     if b == 0 or a == 0:
+         return
+     if b == 0:
+         rel_diff = a
+     elif a == 0:
+         rel_diff = b
+     else:
+         rel_diff = abs(a / b - 1)
+     assert rel_diff < 0.01, f'{a}, {b}, {rel_diff}'
+
+
+ def valid_total_norm(total_norm, reduced, duplicate_embedding):
+     steps = len(total_norm)
+     world_size = len(reduced)
+     errors = []
+     for step in range(steps):
+         calculated_norm = 0.
+         for rank in range(world_size):
+             if len(reduced[rank]) == 0:
+                 if step == 0:
+                     logger.info(f'rank {rank} is duplicated in dp group')
+                 continue
+             for index, row in reduced[rank][step].iterrows():
+                 if duplicate_embedding and 'word_embedding' in row['param_name']:
+                     continue
+                 calculated_norm += row['norm'] ** 2
+         try:
+             assert_equal(calculated_norm ** 0.5, total_norm[step])
+         except AssertionError as e:
+             errors.append([step, e])
+     if errors:
+         logger.info('total norm errors: ', errors)
+     else:
+         logger.info('grad norm in consist between training log and reduced gradients monitored')
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--monitor_output', '-m', type=str, required=True,
+                         help='path prefix to the output of monitor e.g. monitor_output/Aug12_07-16')
+     parser.add_argument('--logfile', '-l', type=str, required=True, help='path to the training log file')
+     parser.add_argument('--tp_size', '-t', type=int, required=True, help='tp parallel size')
+     parser.add_argument('--dp_size', '-d', type=int, required=True, help='dp parallel size')
+     parser.add_argument('--pp_size', '-p', type=int, required=True, help='pp parallel size')
+     parser.add_argument('--untie_embeddings_and_output_weights', '-u', action="store_true", default=False,
+                         help='whether untie_embeddings_and_output_weights in pp parallel')
+     parser.add_argument('--sequence_parallel', '-s', action="store_true", default=False,
+                         help='whether sequence parallel is enabled. Add -s to store true')
+
+     args = parser.parse_args()
+
+     assert args.tp_size > 0, 'if tp not enabled, set tp_size = 1'
+     assert args.dp_size > 0, 'if tp not enabled, set dp_size = 1'
+     assert args.pp_size > 0, 'if tp not enabled, set pp_size = 1'
+
+     total_norm = parse_logfile(args.logfile)
+     reduced, unreduced = parse_monitor_output(args.monitor_output)
+
+     duplicate_embedding = not args.untie_embeddings_and_output_weights and args.pp_size > 1
+
+     valid_total_norm(total_norm, reduced, duplicate_embedding)
+     valid_reduce(reduced, unreduced, args.tp_size, args.dp_size, args.sequence_parallel)
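
Judging by its size (+145 new lines), the hunk above appears to correspond to msprobe/pytorch/monitor/unittest/test_monitor.py from the file list: it cross-checks the monitor's per-rank gradient CSVs against the grad norms printed in the training log. Below is a minimal, self-contained sketch (not part of the package) of the same consistency check that valid_total_norm performs; the parameter names and values are hypothetical and only pandas is required.

import pandas as pd

# Hypothetical monitor output for two ranks and a single step; the column names
# mirror the per-parameter CSV schema read by parse_monitor_output above.
reduced = {
    0: [pd.DataFrame({'param_name': ['layer.weight'], 'norm': [3.0]})],
    1: [pd.DataFrame({'param_name': ['layer.bias'], 'norm': [4.0]})],
}
logged_total_norm = 5.0  # the value parse_logfile would extract from the training log

# Combine the per-rank norms as sqrt(sum(norm ** 2)) and compare against the
# logged value with the same 1% relative tolerance used by assert_equal.
calculated = sum(
    row['norm'] ** 2
    for dfs in reduced.values()
    for _, row in dfs[0].iterrows()
) ** 0.5
rel_diff = abs(calculated / logged_total_norm - 1)
assert rel_diff < 0.01, f'{calculated}, {logged_total_norm}, {rel_diff}'
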
@@ -0,0 +1,250 @@
+ # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+ # All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import inspect
+ from collections import namedtuple
+ from datetime import timezone, timedelta
+ from functools import wraps
+
+ import torch
+
+ from msprobe.core.common.const import MonitorConst, Const
+ from msprobe.core.common.log import logger
+ from msprobe.core.common.utils import is_int
+
+ FILE_MAX_SIZE = 10 * 1024 * 1024 * 1024
+ FILE_NAME_MAX_LENGTH = 255
+ DIRECTORY_MAX_LENGTH = 4096
+
+ beijing_tz = timezone(timedelta(hours=8))
+ MVResult = namedtuple('MVResult', ("exp_avg", "exp_avg_sq", "update", "ratio"))
+ MVGradResult = namedtuple('MVGradResult', ("exp_avg", "exp_avg_sq", "update", "ratio", "grad"))
+
+
+ class MsgConst:
+     """
+     Class for log messages const
+     """
+     SPECIAL_CHAR = ["\n", "\r", "\u007F", "\b", "\f", "\t", "\u000B", "%08", "%0a", "%0b", "%0c", "%0d", "%7f"]
+
+
+ def filter_special_chars(func):
+     @wraps(func)
+     def func_level(msg):
+         for char in MsgConst.SPECIAL_CHAR:
+             msg = msg.replace(char, '_')
+         return func(msg)
+
+     return func_level
+
+
+ def get_param_struct(param):
+     res = {}
+     if isinstance(param, (tuple, list)):
+         res['config'] = f'{type(param).__name__}[{len(param)}]'
+         for i, x in enumerate(param):
+             res[i] = f'size={tuple(x.shape)}, dtype={x.dtype}' if torch.is_tensor(x) else f'{type(x)}'
+     elif torch.is_tensor(param):
+         res['config'] = 'tensor'
+         res['tensor'] = f'size={tuple(param.shape)}, dtype={param.dtype}'
+     else:
+         res['config'] = f'{type(param)}'
+         logger.warning(f'Not support type({type(param)}) now, please check the type of param {param}')
+     return res
+
+
+ def is_recomputation():
+     """Check if the current operation is in the re-computation phase.
+
+     This function inspects the current call stack to indicate whether the current operation is in the
+     re-computation phase. We use a blacklist mechanism, now supported megatron and mindspeed framework.
+     megatron: The 'backward' function is called by the 'torch/autograd/function.py' file.
+     mindspeed: The 'checkpoint_function_backward' function is called by the 'torch/autograd/function.py'
+     file or the custom module(use CheckpointWithoutOutput) with the 'backward' function is executed within the
+     'torch/_tensor.py' file.
+
+     Returns:
+         bool: True if in the re-computation phase, False otherwise.
+     """
+     backward_function_indices = []
+     call_stack = inspect.stack()
+
+     # Identify the function 'backward' is being executed within the 'torch/_tensor.py' file.
+     for frame_info in call_stack:
+         if frame_info.function == Const.BACKWARD and frame_info.filename.endswith('torch/_tensor.py'):
+             del call_stack
+             return True
+
+     # Identify indices in the call stack where the specific function is being executed
+     for idx, frame_info in enumerate(call_stack):
+         if frame_info.function == Const.BACKWARD or frame_info.function == 'checkpoint_function_backward':
+             backward_function_indices.append(idx)
+
+     # Check if the execution is within 'torch/autograd/function.py' file
+     for idx in backward_function_indices:
+         # The Megatron and MindSpeed L0&L1 scenes
+         if idx + 1 < len(call_stack) and call_stack[idx + 1].filename.endswith('torch/autograd/function.py'):
+             del call_stack
+             return True
+         # The latest MindSpeed L2 and ModelLink scenes
+         if idx + 2 < len(call_stack) and call_stack[idx + 2].filename.endswith('torch/autograd/function.py'):
+             del call_stack
+             return True
+
+     del call_stack
+     return False
+
+
+ def validate_ops(ops):
+     if not isinstance(ops, list):
+         raise TypeError("ops should be a list")
+     if not ops:
+         raise TypeError(f"specify ops to calculate metrics. Optional ops: {MonitorConst.OP_LIST}")
+
+     valid_ops = []
+     for op in ops:
+         if op not in MonitorConst.OP_LIST:
+             logger.warning(f"op {op} is not supported. Optional ops: {MonitorConst.OP_LIST}")
+         else:
+             valid_ops.append(op)
+     return valid_ops
+
+
+ def validate_ranks(ranks):
+     if not isinstance(ranks, list):
+         raise TypeError("module_ranks should be a list")
+     for rank in ranks:
+         if not isinstance(rank, int) or isinstance(rank, bool):
+             raise TypeError(f"element in module_ranks should be a int, get {type(rank)}")
+
+
+ def validate_targets(targets):
+     if not isinstance(targets, dict):
+         raise TypeError('targets in config.json should be a dict')
+     for module_name, field in targets.items():
+         if not isinstance(module_name, str):
+             raise TypeError('key of targets should be module_name[str] in config.json')
+         if not isinstance(field, dict):
+             raise TypeError('values of targets should be cared filed e.g. {"input": "tensor"} in config.json')
+
+
+ def validate_print_struct(print_struct):
+     if not isinstance(print_struct, bool):
+         raise TypeError("print_struct should be a bool")
+
+
+ def validate_ur_distribution(ur_distribution):
+     if not isinstance(ur_distribution, bool):
+         raise TypeError('ur_distribution should be a bool')
+
+
+ def validate_xy_distribution(xy_distribution):
+     if not isinstance(xy_distribution, bool):
+         raise TypeError('xy_distribution should be a bool')
+
+
+ def validate_wg_distribution(wg_distribution):
+     if not isinstance(wg_distribution, bool):
+         raise TypeError('wg_distribution should be a bool')
+
+
+ def validate_mg_distribution(mg_distribution):
+     if not isinstance(mg_distribution, bool):
+         raise TypeError('mg_distribution should be a bool')
+
+
+ def validate_cc_distribution(cc_distribution):
+     if not isinstance(cc_distribution, dict):
+         raise TypeError('cc_distribution should be a dictionary')
+     for key, value in cc_distribution.items():
+         if key == 'enable':
+             if not isinstance(value, bool):
+                 raise TypeError('cc_distribution enable should be a bool')
+         elif key == 'cc_codeline':
+             if not isinstance(value, list):
+                 raise TypeError('cc_distribution cc_codeline should be a list')
+         elif key == 'cc_pre_hook':
+             if not isinstance(value, bool):
+                 raise TypeError('cc_distribution cc_pre_hook should be a bool')
+         elif key == 'cc_log_only':
+             if not isinstance(value, bool):
+                 raise TypeError('cc_distribution cc_log_only should be a bool')
+         else:
+             raise TypeError(f'{key} of cc_distribution is not supported.')
+
+
+ def validate_alert(alert):
+     if not isinstance(alert, dict):
+         raise TypeError('alert should be a dictionary')
+     rules = alert.get('rules')
+     if rules and isinstance(rules, list):
+         for rule in rules:
+             rule_name = rule.get("rule_name")
+             if rule_name and rule_name not in MonitorConst.RULE_NAME:
+                 raise TypeError(f"{rule_name} is not supported")
+             args = rule.get("args")
+             if args and isinstance(args, dict):
+                 threshold = args.get("threshold")
+                 if not isinstance(threshold, float) or threshold < 0:
+                     raise TypeError('threshold must be float and not less than 0')
+     dump = alert.get('dump')
+     if dump and not isinstance(dump, bool):
+         raise TypeError('dump must be bool.')
+
+
+ def validate_step_count_per_record(step_count_per_record):
+     if not is_int(step_count_per_record):
+         raise TypeError('step_count_per_record must be int.')
+     if step_count_per_record < 1:
+         raise ValueError("step_count_per_record must greater than 0")
+     if step_count_per_record > 1e6:
+         raise ValueError("step_count_per_record must smaller than 1e6")
+
+
+ def validate_config(config):
+     config['ops'] = validate_ops(config.get('ops', []))
+
+     eps = config.get('eps', 1e-8)
+     if not isinstance(eps, float):
+         raise TypeError("eps should be a float")
+
+     ranks = config.get("module_ranks", [])
+     validate_ranks(ranks)
+
+     targets = config.get("targets", {})
+     validate_targets(targets)
+
+     print_struct = config.get('print_struct', False)
+     validate_print_struct(print_struct)
+
+     ur_distribution = config.get('ur_distribution', False)
+     validate_ur_distribution(ur_distribution)
+
+     xy_distribution = config.get('xy_distribution', False)
+     validate_xy_distribution(xy_distribution)
+
+     wg_distribution = config.get('wg_distribution', False)
+     validate_wg_distribution(wg_distribution)
+
+     mg_distribution = config.get('mg_distribution', False)
+     validate_mg_distribution(mg_distribution)
+
+     cc_distribution = config.get('cc_distribution', {})
+     validate_cc_distribution(cc_distribution)
+
+     alert = config.get('alert', {})
+     validate_alert(alert)
+
+     step_count_per_record = config.get('step_count_per_record', 1)
+     validate_step_count_per_record(step_count_per_record)
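
By line count (+250 -0) this hunk matches msprobe/pytorch/monitor/utils.py in the file list; validate_config walks a monitor config dict and type-checks every field. As a hedged illustration (not taken from the package docs), a config along the following lines should pass these validators, assuming the module is importable as msprobe.pytorch.monitor.utils and that the chosen op names exist in MonitorConst.OP_LIST (unsupported names are only warned about and filtered out).

from msprobe.pytorch.monitor.utils import validate_config

example_config = {
    "ops": ["norm", "min", "max"],   # filtered against MonitorConst.OP_LIST; assumed valid names
    "eps": 1e-8,                     # must be a float
    "module_ranks": [0, 1],          # ints only; bools are rejected
    "targets": {"module.layers.0": {"input": "tensor"}},  # hypothetical module name
    "print_struct": False,
    "ur_distribution": False,
    "xy_distribution": True,
    "wg_distribution": False,
    "mg_distribution": False,
    "cc_distribution": {"enable": False, "cc_codeline": []},
    "alert": {"dump": True},
    "step_count_per_record": 1,
}

validate_config(example_config)  # raises TypeError/ValueError on the first invalid field
# note: validate_config rewrites example_config['ops'] in place with the validated subset
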
@@ -0,0 +1,59 @@
+ # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+ # All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import torch
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from msprobe.pytorch.monitor.features import cal_histc
+
+
+ class HeatmapVisualizer:
+     def __init__(self) -> None:
+         self.histogram_bins_num = 30
+         self.min_val = -1
+         self.max_val = 1
+         self.histogram_edges = None
+         self.histogram_sum_data_np = None  # matrix shape is [bins_num * total_step]
+         self.cur_step_histogram_data = None
+         self.histogram_edges = torch.linspace(self.min_val, self.max_val, self.histogram_bins_num)
+
+     def pre_cal(self, tensor):
+         self.cur_step_histogram_data = cal_histc(tensor_cal=tensor, bins_total=self.histogram_bins_num,
+                                                  min_val=self.min_val, max_val=self.max_val)
+
+     def visualize(self, tag_name: str, step, summary_writer):
+         if self.histogram_sum_data_np is None or self.histogram_sum_data_np.size == 0:
+             self.histogram_sum_data_np = np.expand_dims(self.cur_step_histogram_data.cpu(), 0).T
+         else:
+             # add new data along a different axis because we transposed early
+             # matrix shape is [bins_num * total_step]
+             self.histogram_sum_data_np = np.concatenate((self.histogram_sum_data_np, np.expand_dims(
+                 self.cur_step_histogram_data.cpu(), 1)), axis=1)
+
+         fig, ax = plt.subplots()
+         cax = ax.matshow(self.histogram_sum_data_np, cmap='hot', aspect='auto')
+         fig.colorbar(cax)
+
+         lbs = [f'{self.histogram_edges[i]:.2f}' for i in range(self.histogram_bins_num)]
+         plt.yticks(ticks=range(self.histogram_bins_num), labels=lbs)
+         ax.set_xlabel('Step')
+         ax.set_ylabel('Value Range')
+         plt.title(f'Total Step: {step}')
+
+         # Convert matplotlib figure to an image format suitable for TensorBoard
+         fig.canvas.draw()
+         image = torch.from_numpy(np.array(fig.canvas.renderer.buffer_rgba()))
+         plt.close(fig)
+         summary_writer.add_image(tag_name, image.permute(2, 0, 1), global_step=step, dataformats='CHW')
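
The hunk above lines up with msprobe/pytorch/monitor/visualizer.py (+59 -0): HeatmapVisualizer accumulates a 30-bin histogram of a tensor per step and renders the growing [bins x steps] matrix as a heatmap image for TensorBoard. A hedged usage sketch (not from the package docs), assuming the class is importable from that path and TensorBoard's SummaryWriter is available:

import torch
from torch.utils.tensorboard import SummaryWriter
from msprobe.pytorch.monitor.visualizer import HeatmapVisualizer

writer = SummaryWriter(log_dir='./runs/heatmap_demo')  # hypothetical log dir
viz = HeatmapVisualizer()

for step in range(3):
    monitored = torch.randn(1024) * 0.1   # stand-in for a monitored tensor (e.g. a gradient)
    viz.pre_cal(monitored)                # bucket the values into 30 bins over [-1, 1]
    viz.visualize('demo/heatmap', step, writer)  # appends one column and logs the image

writer.close()
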
@@ -12,9 +12,8 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
 
+ __all__ = ["PtdbgDispatch"]
+
  from signal import signal, SIGPIPE, SIG_DFL
  from .dispatch import PtdbgDispatch
  signal(SIGPIPE, SIG_DFL)
-
-
- __all__ = ["PtdbgDispatch"]
@@ -1,16 +1,30 @@
- # 进行比对及结果展示
- import os
- import sys
+ # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+ # All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
  import csv
  import json
+ import os
+ import sys
  from collections import namedtuple
- from rich.table import Table
- from rich.console import Console
+
  from msprobe.core.common.const import CompareConst, FileCheckConst
- from msprobe.core.common.file_utils import FileOpen, change_mode, read_csv
+ from msprobe.core.common.file_utils import read_csv, get_json_contents, write_csv
+ from msprobe.core.common.utils import check_op_str_pattern_valid
  from msprobe.pytorch.online_dispatch.single_compare import single_benchmark_compare_wrap
- from msprobe.pytorch.common.log import logger
- from msprobe.core.common.utils import CompareException, check_op_str_pattern_valid
+ from rich.console import Console
+ from rich.table import Table
 
  ELEMENT_NUM_THRESHOLD = 100
  ZERO_NUM_THRESHOLD = 0.1
@@ -19,30 +33,6 @@ FLOAT_PRECISION = 14
  ResultInfo = namedtuple('ResultInfo', ['api_name', 'is_fwd_success', 'is_bwd_success',
                                         'fwd_compare_alg_results', 'bwd_compare_alg_results'])
 
- def get_file_content_bytes(file):
-     with FileOpen(file, 'rb') as file_handle:
-         return file_handle.read()
-
-
- def get_json_contents(file_path):
-     ops = get_file_content_bytes(file_path)
-     try:
-         json_obj = json.loads(ops)
-     except ValueError as error:
-         logger.error('Failed to load "%s". %s' % (file_path, str(error)))
-         raise CompareException(CompareException.INVALID_FILE_ERROR) from error
-     if not isinstance(json_obj, dict):
-         logger.error('Json file %s, content is not a dictionary!' % file_path)
-         raise CompareException(CompareException.INVALID_FILE_ERROR)
-     return json_obj
-
-
- def write_csv(data, filepath):
-     with FileOpen(filepath, 'a', encoding='utf-8-sig') as f:
-         writer = csv.writer(f)
-         writer.writerows(data)
-     change_mode(filepath, FileCheckConst.DATA_FILE_AUTHORITY)
-
 
  class Saver:
      # consts for result csv
@@ -62,14 +52,15 @@ class Saver:
      }
 
      def write_csv_title(self):
-         summary_test_rows = [[self.COLUMN_API_NAME, self.COLUMN_FORWARD_SUCCESS, self.COLUMN_BACKWARD_SUCCESS, "Message"]]
+         summary_test_rows = [
+             [self.COLUMN_API_NAME, self.COLUMN_FORWARD_SUCCESS, self.COLUMN_BACKWARD_SUCCESS, "Message"]]
          write_csv(summary_test_rows, self.save_path)
 
          detail_test_rows = [[
              "Npu Name", "Bench Dtype", "NPU Dtype", "Shape",
              "error_balance", "max_abs_diff", "max_abs_idx",
              "max_rel_diff", "max_rel_idx", "eb_thd",
-             "error_thd", "Status","Message"
+             "error_thd", "Status", "Message"
          ]]
          write_csv(detail_test_rows, self.detail_save_path)
 
@@ -106,7 +97,7 @@
          console.print(table_detail)
 
      def get_statistics_from_result_csv(self):
-         checklist = [CompareConst.TRUE, CompareConst.FALSE, CompareConst.NA, CompareConst.SKIP]
+         checklist = [CompareConst.TRUE, CompareConst.FALSE, CompareConst.N_A, CompareConst.SKIP]
          data = read_csv(self.save_path)
          result_csv_name = os.path.basename(self.save_path)
          for _, row in data.iterrows():
@@ -121,7 +112,7 @@
              if column1 == CompareConst.SKIP:
                  continue
              self.test_result_cnt["total_num"] += 1
-             if column1 == CompareConst.TRUE and column2 in [CompareConst.TRUE, 'N/A']:
+             if column1 == CompareConst.TRUE and column2 in [CompareConst.TRUE, CompareConst.N_A]:
                  self.test_result_cnt['success_num'] += 1
              elif column1 == CompareConst.FALSE and column2 == CompareConst.FALSE:
                  self.test_result_cnt['forward_and_backward_fail_num'] += 1
@@ -228,8 +219,8 @@ class Comparator:
          is_bwd_success, bwd_compare_alg_results = True, None
          if is_bwd_success and bwd_compare_alg_results is None:
              self.saver.record_results(ResultInfo(api_name, is_fwd_success, CompareConst.NAN, fwd_compare_alg_results,
-                                          bwd_compare_alg_results))
+                                                  bwd_compare_alg_results))
          else:
              self.saver.record_results(ResultInfo(api_name, is_fwd_success, is_bwd_success, fwd_compare_alg_results,
-                                          bwd_compare_alg_results))
+                                                  bwd_compare_alg_results))
          return is_fwd_success, is_bwd_success
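
The compare.py hunks above are mostly a cleanup: the file-local get_json_contents/write_csv helpers are dropped in favour of the shared ones in msprobe.core.common.file_utils, imports are regrouped, and the literal 'N/A' / CompareConst.NA checks are unified on CompareConst.N_A. A hedged sketch (not from the package docs) of the shared helpers the refactor switches to, assuming they behave like the removed local versions; the paths and rows are hypothetical:

from msprobe.core.common.file_utils import get_json_contents, write_csv

api_info = get_json_contents('dump/api_info.json')  # hypothetical path; expects a JSON dict
write_csv([['API', 'Forward Test Success', 'Backward Test Success', 'Message']],
          'compare_result.csv')                     # hypothetical path; rows are appended
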