PyPI - mindstudio-probe - Versions diffs - 1.1.1__py3-none-any.whl → 1.2.2__py3-none-any.whl - Mend

mindstudio-probe 1.1.1py3-none-any.whl → 1.2.2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (226) hide show

{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/METADATA +3 -2
mindstudio_probe-1.2.2.dist-info/RECORD +415 -0
msprobe/CMakeLists.txt +5 -0
msprobe/README.md +16 -21
msprobe/config.json +1 -0
msprobe/core/common/const.py +185 -11
msprobe/core/common/exceptions.py +3 -1
msprobe/core/common/file_utils.py +33 -7
msprobe/core/common/inplace_ops.yaml +4 -0
msprobe/core/common/utils.py +42 -14
msprobe/core/common_config.py +6 -0
msprobe/core/compare/acc_compare.py +139 -128
msprobe/core/compare/check.py +31 -29
msprobe/core/compare/compare_cli.py +17 -16
msprobe/core/compare/highlight.py +186 -99
msprobe/core/compare/layer_mapping/data_scope_parser.py +19 -8
msprobe/core/compare/layer_mapping/layer_mapping.py +21 -14
msprobe/core/compare/layer_mapping/postprocess_pass.py +4 -3
msprobe/core/compare/merge_result/merge_result.py +381 -0
msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
msprobe/core/compare/merge_result/utils.py +81 -0
msprobe/core/compare/multiprocessing_compute.py +2 -2
msprobe/core/compare/npy_compare.py +109 -147
msprobe/core/compare/utils.py +199 -69
msprobe/core/data_dump/data_collector.py +100 -25
msprobe/core/data_dump/data_processor/base.py +130 -28
msprobe/core/data_dump/data_processor/factory.py +8 -3
msprobe/core/data_dump/data_processor/mindspore_processor.py +170 -23
msprobe/core/data_dump/data_processor/pytorch_processor.py +175 -64
msprobe/core/data_dump/json_writer.py +54 -8
msprobe/core/data_dump/scope.py +19 -18
msprobe/core/overflow_check/abnormal_scene.py +9 -5
msprobe/core/overflow_check/checker.py +1 -1
msprobe/core/overflow_check/utils.py +1 -1
msprobe/docs/01.installation.md +121 -17
msprobe/docs/02.config_introduction.md +18 -16
msprobe/docs/03.config_examples.md +24 -0
msprobe/docs/05.data_dump_PyTorch.md +107 -58
msprobe/docs/06.data_dump_MindSpore.md +95 -34
msprobe/docs/07.accuracy_checker_PyTorch.md +18 -18
msprobe/docs/09.accuracy_checker_MindSpore.md +8 -6
msprobe/docs/10.accuracy_compare_PyTorch.md +99 -41
msprobe/docs/11.accuracy_compare_MindSpore.md +249 -48
msprobe/docs/12.overflow_check_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +310 -220
msprobe/docs/21.visualization_PyTorch.md +125 -35
msprobe/docs/22.visualization_MindSpore.md +149 -41
msprobe/docs/23.generate_operator_PyTorch.md +107 -0
msprobe/docs/24.code_mapping_Mindspore.md +28 -0
msprobe/docs/{23.tool_function_introduction.md → 25.tool_function_introduction.md} +1 -0
msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
msprobe/docs/27.dump_json_instruction.md +525 -0
msprobe/docs/28.debugger_save_instruction.md +94 -0
msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
msprobe/docs/FAQ.md +26 -2
msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/monitor/step_count_per_record.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
msprobe/docs/img/visualization/tensorboard_1.png +0 -0
msprobe/docs/img/visualization/tensorboard_2.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_browser_2.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/GPTModel.png +0 -0
msprobe/docs/visualization/ParallelMLP.png +0 -0
msprobe/docs/visualization/layer_mapping_example.md +132 -0
msprobe/docs/visualization/mapping.png +0 -0
msprobe/docs/visualization/mapping1.png +0 -0
msprobe/docs/visualization/module_name.png +0 -0
msprobe/docs/visualization/module_name1.png +0 -0
msprobe/docs/visualization/no_mapping.png +0 -0
msprobe/docs/visualization/no_mapping1.png +0 -0
msprobe/docs/visualization/no_mapping_analyze.png +0 -0
msprobe/docs/visualization/top_layer.png +0 -0
msprobe/mindspore/__init__.py +11 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +80 -28
msprobe/mindspore/api_accuracy_checker/api_runner.py +54 -16
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +2 -1
msprobe/mindspore/api_accuracy_checker/compute_element.py +52 -8
msprobe/mindspore/api_accuracy_checker/data_manager.py +37 -0
msprobe/mindspore/api_accuracy_checker/main.py +1 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +12 -6
msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +3 -1
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +129 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
msprobe/mindspore/code_mapping/bind.py +264 -0
msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
msprobe/mindspore/code_mapping/graph.py +49 -0
msprobe/mindspore/code_mapping/graph_parser.py +226 -0
msprobe/mindspore/code_mapping/main.py +24 -0
msprobe/mindspore/code_mapping/processor.py +34 -0
msprobe/mindspore/common/const.py +3 -1
msprobe/mindspore/common/utils.py +68 -5
msprobe/mindspore/compare/distributed_compare.py +0 -2
msprobe/mindspore/compare/ms_compare.py +105 -63
msprobe/mindspore/compare/ms_graph_compare.py +14 -5
msprobe/mindspore/debugger/debugger_config.py +28 -2
msprobe/mindspore/debugger/precision_debugger.py +100 -12
msprobe/mindspore/dump/hook_cell/api_registry.py +85 -16
msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +33 -15
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +11 -1
msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
msprobe/mindspore/dump/jit_dump.py +7 -6
msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
msprobe/mindspore/dump/kernel_graph_dump.py +7 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +13 -4
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
msprobe/mindspore/grad_probe/grad_analyzer.py +24 -12
msprobe/mindspore/grad_probe/hook.py +13 -4
msprobe/mindspore/mindtorch/__init__.py +18 -0
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
msprobe/mindspore/monitor/anomaly_detect.py +404 -0
msprobe/mindspore/monitor/distributed/__init__.py +0 -0
msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
msprobe/mindspore/monitor/features.py +63 -0
msprobe/mindspore/monitor/module_hook.py +821 -0
msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
msprobe/mindspore/monitor/utils.py +267 -0
msprobe/mindspore/ms_config.py +13 -3
msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +7 -0
msprobe/mindspore/service.py +347 -107
msprobe/msprobe.py +24 -3
msprobe/pytorch/__init__.py +7 -7
msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +100 -267
msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +55 -31
msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +57 -1
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -1
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +42 -14
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +64 -19
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +34 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
msprobe/pytorch/bench_functions/apply_adam.py +215 -0
msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
msprobe/pytorch/bench_functions/mish.py +21 -0
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +44 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +42 -10
msprobe/pytorch/bench_functions/sort_v2.py +21 -0
msprobe/pytorch/common/parse_json.py +2 -1
msprobe/pytorch/common/utils.py +116 -2
msprobe/pytorch/compare/distributed_compare.py +17 -29
msprobe/pytorch/compare/pt_compare.py +40 -20
msprobe/pytorch/debugger/debugger_config.py +42 -17
msprobe/pytorch/debugger/precision_debugger.py +56 -12
msprobe/pytorch/dump/module_dump/__init__.py +0 -0
msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
msprobe/pytorch/dump/module_dump/module_processer.py +204 -0
msprobe/pytorch/free_benchmark/common/params.py +2 -1
msprobe/pytorch/free_benchmark/common/utils.py +3 -0
msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -2
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +31 -47
msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
msprobe/pytorch/function_factory.py +7 -1
msprobe/pytorch/hook_module/__init__.py +1 -1
msprobe/pytorch/hook_module/hook_module.py +14 -11
msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +36 -1
msprobe/pytorch/hook_module/wrap_distributed.py +10 -8
msprobe/pytorch/hook_module/wrap_functional.py +0 -40
msprobe/pytorch/monitor/anomaly_analyse.py +1 -1
msprobe/pytorch/monitor/anomaly_detect.py +98 -28
msprobe/pytorch/monitor/csv2tb.py +164 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +25 -14
msprobe/pytorch/monitor/features.py +3 -3
msprobe/pytorch/monitor/module_hook.py +543 -318
msprobe/pytorch/monitor/module_metric.py +27 -48
msprobe/pytorch/monitor/module_spec_verifier.py +3 -1
msprobe/pytorch/monitor/optimizer_collect.py +76 -56
msprobe/pytorch/monitor/unittest/test_monitor.py +24 -9
msprobe/pytorch/monitor/utils.py +84 -48
msprobe/pytorch/online_dispatch/dispatch.py +8 -2
msprobe/pytorch/parse_tool/lib/compare.py +10 -10
msprobe/pytorch/parse_tool/lib/config.py +5 -7
msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
msprobe/pytorch/parse_tool/lib/parse_tool.py +11 -10
msprobe/pytorch/parse_tool/lib/utils.py +18 -19
msprobe/pytorch/parse_tool/lib/visualization.py +9 -10
msprobe/pytorch/pt_config.py +19 -22
msprobe/pytorch/service.py +264 -115
msprobe/visualization/builder/graph_builder.py +93 -10
msprobe/visualization/builder/msprobe_adapter.py +30 -6
msprobe/visualization/compare/graph_comparator.py +64 -14
msprobe/visualization/compare/mode_adapter.py +1 -15
msprobe/visualization/graph/base_node.py +15 -19
msprobe/visualization/graph/distributed_analyzer.py +395 -0
msprobe/visualization/graph/graph.py +9 -0
msprobe/visualization/graph/node_op.py +4 -2
msprobe/visualization/graph_service.py +100 -27
msprobe/visualization/utils.py +24 -31
mindstudio_probe-1.1.1.dist-info/RECORD +0 -341
msprobe/pytorch/functional/module_dump.py +0 -84
msprobe/pytorch/module_processer.py +0 -150
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/top_level.txt +0 -0
/msprobe/docs/{data_dump_Mindspore → data_dump_MindSpore}/dynamic_graph_quick_start_example.md +0 -0
/msprobe/{pytorch/functional → mindspore/code_mapping}/__init__.py +0 -0

msprobe/pytorch/monitor/module_metric.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -12,16 +12,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import itertools
-import math
 import re
-import statistics
 import torch
-from msprobe.core.common.const import MonitorConst
-from msprobe.pytorch.monitor.features import square_sum, get_max, get_min, get_zeros, get_nans, get_norm, get_mean
-from msprobe.core.common.log import logger
+from msprobe.pytorch.monitor.features import get_max, get_min, get_zeros, get_nans, get_norm, get_mean
+from msprobe.pytorch.monitor.utils import get_nan_tensor
 def get_summary_writer_tag_name(module_or_param_name: str, tag: str, rank):
@@ -31,7 +27,9 @@ def get_summary_writer_tag_name(module_or_param_name: str, tag: str, rank):
         return f"{module_or_param_name}/rank{rank}/{tag}"
-def squash_param_name(param_name):
+def squash_param_name(param_name, enable=True):
+    if not enable:
+        return param_name
     name = ''
     for pattern in ['layers?\.(.*)', 'embeddings?\.(.*)', 'final.*', 'output.*', 'norm.*']:
         match = re.findall(pattern, param_name)
@@ -63,7 +61,7 @@ class TensorMetrics:
         self.metrics = {}  # tensor_tag --> []
         self.cur_idx = {}
-    def stat_insert(self, tensor, stat_ops, module_name, tensor_name, rank, eps=1e-8):
+    def stat_insert(self, tensor, stat_ops, module_name, tensor_name, rank):
         """get stats and insert into metrics dictionary"""
         prefix = get_summary_writer_tag_name(module_name, tensor_name, rank)
         for stat_op in stat_ops:
@@ -120,14 +118,14 @@ class NormMetric(Metric):
     @staticmethod
     def get_metric_value(tensor, eps):
         return get_norm(tensor)
 @register_config_metric("zeros")
 class ZerosMetric(Metric):
     @staticmethod
     def get_metric_value(tensor, eps):
         return get_zeros(tensor, eps)
 @register_config_metric("nans")
 class NaNsMetric(Metric):
@@ -146,48 +144,29 @@ class IdentMetric(Metric):
 def get_metrics(ops, tag2tensor, eps, out_dict=None):
+    """
+    :param ops: ["op1", "op2"]
+    :param tag2tensor: {
+    '0:fc.input:0/actv': torch.randn([3, 4]),
+    '0:fc.output:0/actv': torch.randn([3, 3])
+    }
+    :param eps: float 1e-8
+    :param out_dict:{
+    '0:fc.input:0/actv': {"op1": op1(torch.randn([3, 4])), "op2": op2(torch.randn([3, 4]))}
+    '0:fc.output:0/actv': {"op1": op1(torch.randn([3, 3])), "op2": op2(torch.randn([3, 3]))}
+    }
+    :return: out_dict
+    """
     if out_dict is None:
         out_dict = {}
     for tag, tensor in tag2tensor.items():
         if tag not in out_dict:
             out_dict[tag] = {}
-        for metric_name in ops:
+        if not torch.is_tensor(tensor):
+            # Non-tensor in/output filled with nan.
+            out_dict[tag].update({metric_name: get_nan_tensor() for metric_name in ops})
+            continue
+        for metric_name in ops:
             fun_metric = config_metric_registry.get(metric_name)
             out_dict[tag][metric_name] = fun_metric.get_metric(tensor, eps)
     return out_dict
-def write_metrics_base(ops, summary_writer, metric_value, step, prefix=''):
-    if not metric_value:
-        return
-    tensors = []
-    tags = list(itertools.product(metric_value.keys(), ops))
-    for op2tensor in metric_value.values():
-        tensors.extend(op2tensor.values())
-    with torch.no_grad():
-        metric_list = torch.stack(tensors).cpu()
-    for tag, metric in zip(tags, metric_list):
-        summary_writer.add_scalar(tag, metric, step)
-def write_metrics_csv(ops, summary_writer, metric_value, step, prefix=''):
-    write_metrics_base(ops, summary_writer, metric_value, step, prefix='')
-    if not summary_writer.header:
-        # 前向的norm用input.ops_和output.ops_，反向的用input_grad.ops_和output_grad.ops_
-        if prefix in {"actv", "actv_grad"}:
-            if prefix == "actv":
-                input_and_output = [MonitorConst.ACTV_IN, MonitorConst.ACTV_OUT]
-            else:
-                input_and_output = [MonitorConst.ACTVGRAD_IN, MonitorConst.ACTVGRAD_OUT]
-            ops_ = [MonitorConst.DOT.join(i[::-1]) for i in itertools.product(ops, input_and_output)]
-            summary_writer.header = ["module_name", "step", *ops_]
-        else:
-            summary_writer.header = ["param_name", "step", *ops]
-        for key in metric_value.keys():
-            if MonitorConst.VPP_SEP in key:
-                summary_writer.header.insert(0, 'vpp_stage')
-            break
-    summary_writer.write_csv(prefix, step)
-    summary_writer.header = []

msprobe/pytorch/monitor/module_spec_verifier.py CHANGED Viewed

@@ -17,7 +17,7 @@ import re
 import abc
 import torch
-from msprobe.core.common.log import logger
+from msprobe.pytorch.common.log import logger
 # 用于存储所有validator实现类的注册表
 config_validator_registry = {}
@@ -79,6 +79,8 @@ class TupleValidator(ConfigValidator):
 def validate_config_spec(config_spec: str, actual_data, module_name: str, data_type: str):
     focused_col = None
+    if not config_spec or not isinstance(config_spec, str):
+        return focused_col
     for _, validator_cls in config_validator_registry.items():
         config_validator = validator_cls()
         pattern_match = config_validator.check_pattern_match(config_spec)

msprobe/pytorch/monitor/optimizer_collect.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -13,27 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from abc import ABC, abstractmethod
 from collections import defaultdict
 import torch
 import torch.distributed as dist
-from msprobe.core.common.log import logger
+from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.monitor.utils import MVResult, MVGradResult
 class OptimizerMon(object):
-    wrapped_optimizer = None
     def __init__(self) -> None:
         self.fp16_to_fp32_param = {}
         self.is_stage3 = False
-    @classmethod
-    def set_wrapped_optimizer(cls, wrapped_optimizer):
-        cls.wrapped_optimizer = wrapped_optimizer
     def fetch_mv(self, monitor, torch_opt, params2name):
         pass
@@ -83,11 +76,10 @@ class OptimizerMon(object):
         ratio_dict = defaultdict()
         param2name = defaultdict()
         fp32_partitioned_groups_flat_grad = defaultdict()
-        mix_prec_opt = OptimizerMon.wrapped_optimizer
         partition_id = dist.get_rank()
         def get_flatten_grad(self, optimizer, group_idx):
-            if  fp32_partitioned_groups_flat[group_idx].grad is None:
+            if fp32_partitioned_groups_flat[group_idx].grad is None:
                 if partition_id == dist.get_world_size() - 1 and not self.is_stage3:
                     fp32_partitioned_groups_flat_grad = optimizer.flatten_dense_tensors_aligned(
                         optimizer.averaged_gradients[group_idx],
@@ -102,7 +94,7 @@ class OptimizerMon(object):
                 return fp32_partitioned_groups_flat[group_idx].grad
         for group_idx in range(len(fp32_partitioned_groups_flat)):
-            fp32_partitioned_groups_flat_grad[group_idx] = get_flatten_grad(self, mix_prec_opt, group_idx)
+            fp32_partitioned_groups_flat_grad[group_idx] = get_flatten_grad(self, torch_opt, group_idx)
         for name in params2name.values():
             start_idx, end_idx, group_idx, group_with_rank = name2indices[name]
@@ -111,9 +103,9 @@ class OptimizerMon(object):
             fp32_param = fp32_partitioned_groups_flat[group_idx][start_idx: end_idx]
             fp32_param.grad = fp32_partitioned_groups_flat_grad[group_idx][start_idx: end_idx]
             param2name[fp32_param] = name
-            if not mix_prec_opt.state:
+            if not torch_opt.state:
                 continue
-            state_param = list(mix_prec_opt.state.values())[group_idx]
+            state_param = list(torch_opt.state.values())[group_idx]
             exp_avg = state_param.get("exp_avg", None)
             exp_avg_sq = state_param.get("exp_avg_sq", None)
             if exp_avg is None or exp_avg_sq is None:
@@ -151,29 +143,33 @@ class MixPrecisionOptimizerMon(OptimizerMon):
     混合精度训练通过适当降低某些计算的精度来加速训练过程并减少内存消耗。
     """
+    def map_fp16_tp_fp32_param(self, torch_opt):
+        for fp16_group, fp32_group in zip(torch_opt.float16_groups, torch_opt.fp32_from_float16_groups):
+            for fp16_param, fp32_param in zip(fp16_group, fp32_group):
+                self.fp16_to_fp32_param[fp16_param] = fp32_param
     def fetch_mv(self, monitor, torch_opt, params2name):
-        mix_prec_opt = self.wrapped_optimizer
+        if not self.fp16_to_fp32_param and torch_opt is not None:
+            self.map_fp16_tp_fp32_param(torch_opt)
-        if not self.fp16_to_fp32_param and mix_prec_opt is not None:
-            for fp16_group, fp32_group in zip(mix_prec_opt.float16_groups, mix_prec_opt.fp32_from_float16_groups):
-                for fp16_param, fp32_param in zip(fp16_group, fp32_group):
-                    self.fp16_to_fp32_param[fp16_param] = fp32_param
         return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
 class MegatronDistributedOptimizerMon(OptimizerMon):
-    def fetch_mv(self, monitor, torch_opt, params2name):
-        mix_prec_opt = self.wrapped_optimizer
-        if not (hasattr(mix_prec_opt, "model_float16_groups") and
-                hasattr(mix_prec_opt, "shard_fp32_from_float16_groups")):
+    def map_fp16_tp_fp32_param(self, torch_opt):
+        if not (hasattr(torch_opt, "model_float16_groups") and
+                hasattr(torch_opt, "shard_fp32_from_float16_groups")):
             raise Exception(
                 "megatron distributed optimizer should have model_float16_groups and shard_fp32_from_float16_groups, "
                 "if not, please check megatron-lm version")
-        if not self.fp16_to_fp32_param and mix_prec_opt is not None:
-            for fp16_group, shard_fp32_group in zip(mix_prec_opt.model_float16_groups,
-                                                    mix_prec_opt.shard_fp32_from_float16_groups):
-                for fp16_param, shard_fp32_param in zip(fp16_group, shard_fp32_group):
-                    self.fp16_to_fp32_param[fp16_param] = shard_fp32_param
+        for fp16_group, shard_fp32_group in zip(torch_opt.model_float16_groups,
+                                                torch_opt.shard_fp32_from_float16_groups):
+            for fp16_param, shard_fp32_param in zip(fp16_group, shard_fp32_group):
+                self.fp16_to_fp32_param[fp16_param] = shard_fp32_param
+    def fetch_mv(self, monitor, torch_opt, params2name):
+        if not self.fp16_to_fp32_param and torch_opt is not None:
+            self.map_fp16_tp_fp32_param(torch_opt)
         return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
@@ -183,15 +179,40 @@ class MegatronFP32OptimizerMon(OptimizerMon):
         return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
+class MegatronChainedDistributedOptimizerMon(MegatronDistributedOptimizerMon):
+    def fetch_mv(self, monitor, torch_opt, params2name):
+        if not self.fp16_to_fp32_param and torch_opt is not None:
+            for opt in torch_opt.chained_optimizers:
+                self.map_fp16_tp_fp32_param(opt)
+        if not isinstance(torch_opt, torch.optim.Optimizer):
+            torch_opt.state = {}
+            for opt in torch_opt.chained_optimizers:
+                torch_opt.state.update(opt.optimizer.state)
+        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
+class MegatronChainedMixPrecisionOptimizerMon(MixPrecisionOptimizerMon):
+    def fetch_mv(self, monitor, torch_opt, params2name):
+        if not self.fp16_to_fp32_param and torch_opt is not None:
+            for opt in torch_opt.chained_optimizers:
+                self.map_fp16_tp_fp32_param(opt)
+        if not isinstance(torch_opt, torch.optim.Optimizer):
+            torch_opt.state = {}
+            for opt in torch_opt.chained_optimizers:
+                torch_opt.state.update(opt.optimizer.state)
+        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
 class DeepSpeedZeroOptimizerStage0Mon(OptimizerMon):
     def fetch_mv(self, monitor, torch_opt, params2name):
         return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
 class DeepSpeedZeroOptimizerStage3Mon(OptimizerMon):
-    def get_param_index(self, params2name, name2index):
-        mix_prec_opt = OptimizerMon.wrapped_optimizer
-        fp16_groups = mix_prec_opt.fp16_partitioned_groups
+    def get_param_index(self, params2name, name2index, torch_opt):
+        fp16_groups = torch_opt.fp16_partitioned_groups
         name2indices = defaultdict()
         index_length = defaultdict()
         index = 0
@@ -210,13 +231,11 @@ class DeepSpeedZeroOptimizerStage3Mon(OptimizerMon):
     def fetch_mv(self, monitor, torch_opt, params2name, name2indices=None):
         self.is_stage3 = True
-        mix_prec_opt = OptimizerMon.wrapped_optimizer
-        fp32_partitioned_groups_flat = mix_prec_opt.fp32_partitioned_groups_flat
+        fp32_partitioned_groups_flat = torch_opt.fp32_partitioned_groups_flat
         return self._fetch_mv_grad_in_adam(monitor, torch_opt, params2name, name2indices, fp32_partitioned_groups_flat)
 class DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon):
     @staticmethod
     def get_group_index(fp32_length, world_size, index):
         for i in range(len(fp32_length) - 1):
@@ -229,12 +248,11 @@ class DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon):
                 return sub_interval_start, min(sub_index, world_size - 1)
         return fp32_length[-1], 0
-    def get_param_index(self, params2name, name2index):
-        mix_prec_opt = OptimizerMon.wrapped_optimizer
-        padding = mix_prec_opt.groups_padding
+    def get_param_index(self, params2name, name2index, torch_opt):
+        padding = torch_opt.groups_padding
         world_size = dist.get_world_size()
         fp32_length = [0]
-        for fp32_group_index, single_partition_of_fp32_group in enumerate(mix_prec_opt.single_partition_of_fp32_groups):
+        for fp32_group_index, single_partition_of_fp32_group in enumerate(torch_opt.single_partition_of_fp32_groups):
             fp32_length.append(len(single_partition_of_fp32_group) * world_size + fp32_length[fp32_group_index])
         bf16_groups = []
@@ -242,7 +260,7 @@ class DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon):
         index_length = defaultdict()
         index = 0
         idx = 0
-        for group_idx, bf16_group in enumerate(mix_prec_opt.bit16_groups):
+        for group_idx, bf16_group in enumerate(torch_opt.bit16_groups):
             bf16_groups.extend(bf16_group)
             for param in bf16_group:
                 param_length = len(param.flatten())
@@ -250,7 +268,7 @@ class DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon):
                 index_length[idx] = (index, index + param_length, group_idx, group_index, group_with_rank)
                 index += param_length
                 idx += 1
-        group_length = len(bf16_groups) / len(mix_prec_opt.bit16_groups)
+        group_length = len(bf16_groups) / len(torch_opt.bit16_groups)
         for _, name in params2name.items():
             name_index = name2index[name]
             start_idx, end_idx, group_idx, group_index, group_with_rank = index_length[name_index]
@@ -264,32 +282,34 @@ class DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon):
         return name2indices
     def fetch_mv(self, monitor, torch_opt, params2name, name2indices=None):
-        mix_prec_opt = OptimizerMon.wrapped_optimizer
-        fp32_partitioned_groups_flat = mix_prec_opt.single_partition_of_fp32_groups
+        fp32_partitioned_groups_flat = torch_opt.single_partition_of_fp32_groups
         return self._fetch_mv_grad_in_adam(monitor, torch_opt, params2name, name2indices, fp32_partitioned_groups_flat)
 class DummyOptimizerMon(OptimizerMon):
     def fetch_mv(self, monitor, torch_opt, params2name):
-        return MVResult(exp_avg=None, exp_avg_sq=None, update=None, ratio=None)
+        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
 class OptimizerMonFactory:
     _optimizer_mon_map = {
-        "Megatron_Float16OptimizerWithFloat16Params": MixPrecisionOptimizerMon,
-        "Megatron_DistributedOptimizer": MegatronDistributedOptimizerMon,
-        "Megatron_FP32Optimizer": MegatronFP32OptimizerMon,
-        "DeepSpeedZeroOptimizer_Stage0": DeepSpeedZeroOptimizerStage0Mon,
-        "DeepSpeedZeroOptimizer_Stage1_or_2": DeepSpeedZeroOptimizerStage1or2Mon,
+        "FP32Optimizer": MegatronFP32OptimizerMon,
+        "Float16OptimizerWithFloat16Params": MixPrecisionOptimizerMon,
+        "DistributedOptimizer": MegatronDistributedOptimizerMon,
+        "ChainedDistributedOptimizer": MegatronChainedDistributedOptimizerMon,
+        "ChainedFloat16OptimizerWithFloat16Params": MegatronChainedMixPrecisionOptimizerMon,
+        "BF16_Optimizer": DeepSpeedZeroOptimizerStage0Mon,
+        "DeepSpeedZeroOptimizer": DeepSpeedZeroOptimizerStage1or2Mon,
         "DeepSpeedZeroOptimizer_Stage3": DeepSpeedZeroOptimizerStage3Mon,
-        "unknown": DummyOptimizerMon
+        "Adam": DummyOptimizerMon
     }
     @staticmethod
-    def create_optimizer_mon(opt_ty: str):
-        if not opt_ty:
-            return DummyOptimizerMon()
-        optimizer_mon_class = OptimizerMonFactory._optimizer_mon_map.get(opt_ty)
-        if not optimizer_mon_class:
-            raise Exception("opt_ty should be one of: " + ", ".join(OptimizerMonFactory._optimizer_mon_map.keys()))
-        return optimizer_mon_class()
+    def create_optimizer_mon(optimizer):
+        # auto replace opt_ty
+        optimizer_class = optimizer.__class__.__name__
+        if optimizer_class == "ChainedOptimizer":
+            optimizer_class = "Chained" + optimizer.chained_optimizers[0].__class__.__name__
+        optimizer_mon_class = OptimizerMonFactory._optimizer_mon_map.get(optimizer_class, DummyOptimizerMon)
+        return optimizer_mon_class(), optimizer_class

msprobe/pytorch/monitor/unittest/test_monitor.py CHANGED Viewed

@@ -1,11 +1,26 @@
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
 import os
 import re
-import argparse
 from glob import glob
 import pandas as pd
-from msprobe.core.common.log import logger
+from msprobe.pytorch.common.log import logger
 def parse_logfile(logfile):
@@ -21,19 +36,19 @@ def parse_logfile(logfile):
 def parse_monitor_output(output_dir):
     reduced = {}
     unreduced = {}
-    for dir in glob(output_dir + '*'):
-        rank = int(re.findall('(?<=rank)[\d]*', dir)[0])
+    for directory in glob(output_dir + '*'):
+        rank = int(re.findall('(?<=rank)[\d]*', directory)[0])
         unreduced[rank] = []
         reduced[rank] = []
-        for file in os.listdir(dir):
-            df = pd.read_csv(os.path.join(dir, file))
+        for file in os.listdir(directory):
+            df = pd.read_csv(os.path.join(directory, file))
             if '_unreduced_' in file:
                 unreduced[rank].append(df)
                 pass
             elif '_reduced_' in file:
                 reduced[rank].append(df)
             else:
-                logger.info(f'unexpected file {file} in {dir}')
+                logger.info(f'unexpected file {file} in {directory}')
     return reduced, unreduced
@@ -41,7 +56,7 @@ def valid_reduce(reduced, unreduced, tp_size, dp_size, sequence_parallel):
     steps = len(reduced[0])
     world_size = len(reduced)
     errors = []
-    for index, row in unreduced[0][0].iterrows():
+    for _, row in unreduced[0][0].iterrows():
         param = row['param_name']
         is_tp_duplicate = False
         for step in range(2):
@@ -103,7 +118,7 @@ def valid_total_norm(total_norm, reduced, duplicate_embedding):
                 if step == 0:
                     logger.info(f'rank {rank} is duplicated in dp group')
                 continue
-            for index, row in reduced[rank][step].iterrows():
+            for _, row in reduced[rank][step].iterrows():
                 if duplicate_embedding and 'word_embedding' in row['param_name']:
                     continue
                 calculated_norm += row['norm'] ** 2

msprobe/pytorch/monitor/utils.py CHANGED Viewed

@@ -16,13 +16,27 @@ import inspect
 from collections import namedtuple
 from datetime import timezone, timedelta
 from functools import wraps
+from datetime import datetime
+import os
+import re
 import torch
 from msprobe.core.common.const import MonitorConst, Const
-from msprobe.core.common.log import logger
+from msprobe.pytorch.common.log import logger
 from msprobe.core.common.utils import is_int
+from msprobe.core.common.file_utils import check_file_or_directory_path
+device = "cpu"
+try:
+    import torch_npu
+    device = "npu"
+except ImportError:
+    if torch.cuda.is_available():
+        device = "cuda"
+NAN_TENSOR_ON_DEVICE = None
 FILE_MAX_SIZE = 10 * 1024 * 1024 * 1024
 FILE_NAME_MAX_LENGTH = 255
 DIRECTORY_MAX_LENGTH = 4096
@@ -39,6 +53,17 @@ class MsgConst:
     SPECIAL_CHAR = ["\n", "\r", "\u007F", "\b", "\f", "\t", "\u000B", "%08", "%0a", "%0b", "%0c", "%0d", "%7f"]
+def get_output_base_dir():
+    return os.getenv(MonitorConst.MONITOR_OUTPUT_DIR, MonitorConst.DEFAULT_MONITOR_OUTPUT_DIR)
+def get_nan_tensor():
+    global NAN_TENSOR_ON_DEVICE
+    if not NAN_TENSOR_ON_DEVICE:
+        NAN_TENSOR_ON_DEVICE = torch.tensor(torch.nan, device=device)
+    return NAN_TENSOR_ON_DEVICE
 def filter_special_chars(func):
     @wraps(func)
     def func_level(msg):
@@ -64,60 +89,19 @@ def get_param_struct(param):
     return res
-def is_recomputation():
-    """Check if the current operation is in the re-computation phase.
-    This function inspects the current call stack to indicate whether the current operation is in the
-    re-computation phase. We use a blacklist mechanism, now supported megatron and mindspeed framework.
-    megatron: The 'backward' function is called by the 'torch/autograd/function.py' file.
-    mindspeed: The 'checkpoint_function_backward' function is called by the 'torch/autograd/function.py'
-    file or the custom module(use CheckpointWithoutOutput) with the 'backward' function is executed within the
-    'torch/_tensor.py' file.
-    Returns:
-        bool: True if in the re-computation phase, False otherwise.
-    """
-    backward_function_indices = []
-    call_stack = inspect.stack()
-    # Identify the function 'backward' is being executed within the 'torch/_tensor.py' file.
-    for frame_info in call_stack:
-        if frame_info.function == Const.BACKWARD and frame_info.filename.endswith('torch/_tensor.py'):
-            del call_stack
-            return True
-    # Identify indices in the call stack where the specific function is being executed
-    for idx, frame_info in enumerate(call_stack):
-        if frame_info.function == Const.BACKWARD or frame_info.function == 'checkpoint_function_backward':
-            backward_function_indices.append(idx)
-    # Check if the execution is within 'torch/autograd/function.py' file
-    for idx in backward_function_indices:
-        # The Megatron and MindSpeed L0&L1 scenes
-        if idx + 1 < len(call_stack) and call_stack[idx + 1].filename.endswith('torch/autograd/function.py'):
-            del call_stack
-            return True
-        # The latest MindSpeed L2 and ModelLink scenes
-        if idx + 2 < len(call_stack) and call_stack[idx + 2].filename.endswith('torch/autograd/function.py'):
-            del call_stack
-            return True
-    del call_stack
-    return False
 def validate_ops(ops):
     if not isinstance(ops, list):
         raise TypeError("ops should be a list")
-    if not ops:
-        raise TypeError(f"specify ops to calculate metrics. Optional ops: {MonitorConst.OP_LIST}")
     valid_ops = []
     for op in ops:
         if op not in MonitorConst.OP_LIST:
             logger.warning(f"op {op} is not supported. Optional ops: {MonitorConst.OP_LIST}")
-        else:
-            valid_ops.append(op)
+            continue
+        valid_ops.append(op)
+    if not valid_ops:
+        default_op = MonitorConst.OP_LIST[0]
+        valid_ops.append(default_op)
+        logger.info_on_rank_0(f"There is no valid ops, default op {default_op} is used")
     return valid_ops
@@ -164,6 +148,11 @@ def validate_mg_distribution(mg_distribution):
         raise TypeError('mg_distribution should be a bool')
+def validate_param_distribution(param_distribution):
+    if not isinstance(param_distribution, bool):
+        raise TypeError('param_distribution should be a bool')
 def validate_cc_distribution(cc_distribution):
     if not isinstance(cc_distribution, dict):
         raise TypeError('cc_distribution should be a dictionary')
@@ -184,6 +173,11 @@ def validate_cc_distribution(cc_distribution):
             raise TypeError(f'{key} of cc_distribution is not supported.')
+def validate_squash_name(squash_name):
+    if not isinstance(squash_name, bool):
+        raise TypeError('squash_name should be a bool')
 def validate_alert(alert):
     if not isinstance(alert, dict):
         raise TypeError('alert should be a dictionary')
@@ -240,6 +234,9 @@ def validate_config(config):
     mg_distribution = config.get('mg_distribution', False)
     validate_mg_distribution(mg_distribution)
+    param_distribution = config.get('param_distribution', False)
+    validate_param_distribution(param_distribution)
     cc_distribution = config.get('cc_distribution', {})
     validate_cc_distribution(cc_distribution)
@@ -248,3 +245,42 @@ def validate_config(config):
     step_count_per_record = config.get('step_count_per_record', 1)
     validate_step_count_per_record(step_count_per_record)
+    squash_name = config.get('squash_name', True)
+    validate_squash_name(squash_name)
+    if not targets:
+        if xy_distribution:
+            config["all_xy"] = True
+        config["targets"] = {"": {}}
+def time_str2time_digit(time_str):
+    time_format = '%b%d_%H-%M-%S'
+    try:
+        time_digit = datetime.strptime(time_str, time_format)
+    except Exception as e:
+        raise RuntimeError(f"illegal timestamp: {time_str}, timestamp should be prefix \
+                           of existing output dirpath, like 'Dec03_21-34-40'.") from e
+    return time_digit
+def get_target_output_dir(monitor_path, time_start, time_end):
+    check_file_or_directory_path(monitor_path, isdir=True)
+    time_start = time_str2time_digit(time_start) if time_start is not None else time_start
+    time_end = time_str2time_digit(time_end) if time_end is not None else time_end
+    if time_start and time_end and time_start > time_end:
+        raise ValueError(f"time_start({time_start}) greater than time_end({time_end})")
+    result = {}
+    for dirname in os.listdir(monitor_path):
+        match = re.match(MonitorConst.OUTPUT_DIR_PATTERN, dirname)
+        if not match:
+            continue
+        time_tag = match.group(1)
+        rank = match.group(2)
+        target_time = time_str2time_digit(time_tag)
+        start_ok = time_start is None or target_time >= time_start
+        end_ok = time_end is None or target_time <= time_end
+        if start_ok and end_ok:
+            result[rank] = os.path.join(monitor_path, dirname)
+    return result

mindstudio-probe 1.1.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

mindstudio-probe 1.1.1__py3-none-any.whl → 1.2.2__py3-none-any.whl