mindstudio-probe 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177)
  1. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/METADATA +3 -3
  2. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/RECORD +168 -150
  3. msprobe/README.md +27 -22
  4. msprobe/core/common/const.py +129 -60
  5. msprobe/core/common/decorator.py +50 -0
  6. msprobe/core/common/exceptions.py +3 -1
  7. msprobe/core/common/file_utils.py +25 -2
  8. msprobe/core/common/inplace_ops.yaml +1 -0
  9. msprobe/core/common/utils.py +43 -33
  10. msprobe/core/compare/acc_compare.py +43 -74
  11. msprobe/core/compare/check.py +2 -6
  12. msprobe/core/compare/highlight.py +2 -0
  13. msprobe/core/compare/layer_mapping/data_scope_parser.py +1 -1
  14. msprobe/core/compare/layer_mapping/layer_mapping.py +2 -1
  15. msprobe/core/compare/merge_result/merge_result.py +16 -9
  16. msprobe/core/compare/merge_result/utils.py +81 -0
  17. msprobe/core/compare/multiprocessing_compute.py +19 -12
  18. msprobe/core/compare/npy_compare.py +30 -12
  19. msprobe/core/compare/utils.py +30 -10
  20. msprobe/core/data_dump/api_registry.py +176 -0
  21. msprobe/core/data_dump/data_collector.py +58 -13
  22. msprobe/core/data_dump/data_processor/base.py +94 -10
  23. msprobe/core/data_dump/data_processor/factory.py +3 -0
  24. msprobe/core/data_dump/data_processor/mindspore_processor.py +33 -33
  25. msprobe/core/data_dump/data_processor/pytorch_processor.py +99 -18
  26. msprobe/core/data_dump/json_writer.py +61 -40
  27. msprobe/core/grad_probe/constant.py +1 -0
  28. msprobe/core/grad_probe/grad_compare.py +1 -1
  29. msprobe/core/overflow_check/abnormal_scene.py +2 -0
  30. msprobe/docs/01.installation.md +27 -1
  31. msprobe/docs/02.config_introduction.md +27 -23
  32. msprobe/docs/03.config_examples.md +24 -0
  33. msprobe/docs/05.data_dump_PyTorch.md +103 -16
  34. msprobe/docs/06.data_dump_MindSpore.md +76 -32
  35. msprobe/docs/07.accuracy_checker_PyTorch.md +11 -1
  36. msprobe/docs/08.accuracy_checker_online_PyTorch.md +3 -1
  37. msprobe/docs/09.accuracy_checker_MindSpore.md +5 -3
  38. msprobe/docs/10.accuracy_compare_PyTorch.md +59 -33
  39. msprobe/docs/11.accuracy_compare_MindSpore.md +40 -16
  40. msprobe/docs/12.overflow_check_PyTorch.md +3 -1
  41. msprobe/docs/13.overflow_check_MindSpore.md +4 -2
  42. msprobe/docs/14.data_parse_PyTorch.md +1 -7
  43. msprobe/docs/18.online_dispatch.md +1 -1
  44. msprobe/docs/19.monitor.md +332 -273
  45. msprobe/docs/21.visualization_PyTorch.md +42 -13
  46. msprobe/docs/22.visualization_MindSpore.md +43 -13
  47. msprobe/docs/23.generate_operator_PyTorch.md +9 -9
  48. msprobe/docs/27.dump_json_instruction.md +301 -27
  49. msprobe/docs/28.debugger_save_instruction.md +94 -0
  50. msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
  51. msprobe/docs/29.data_dump_MSAdapter.md +229 -0
  52. msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
  53. msprobe/docs/FAQ.md +3 -11
  54. msprobe/docs/img/compare_result.png +0 -0
  55. msprobe/docs/img/merge_result.png +0 -0
  56. msprobe/docs/img/monitor/step_count_per_record.png +0 -0
  57. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  58. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  59. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  60. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  61. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  62. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  63. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  64. msprobe/mindspore/__init__.py +4 -2
  65. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +32 -7
  66. msprobe/mindspore/api_accuracy_checker/api_runner.py +70 -22
  67. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
  68. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +602 -0
  69. msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
  70. msprobe/mindspore/api_accuracy_checker/compute_element.py +47 -1
  71. msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -1
  72. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +2 -1
  73. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +130 -0
  74. msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
  75. msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
  76. msprobe/mindspore/common/const.py +61 -0
  77. msprobe/mindspore/common/utils.py +48 -18
  78. msprobe/mindspore/compare/ms_compare.py +27 -19
  79. msprobe/mindspore/compare/ms_graph_compare.py +6 -5
  80. msprobe/mindspore/debugger/debugger_config.py +31 -6
  81. msprobe/mindspore/debugger/precision_debugger.py +45 -14
  82. msprobe/mindspore/dump/dump_tool_factory.py +5 -3
  83. msprobe/mindspore/dump/hook_cell/api_register.py +142 -0
  84. msprobe/mindspore/dump/hook_cell/hook_cell.py +9 -10
  85. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +24 -26
  86. msprobe/mindspore/dump/jit_dump.py +21 -15
  87. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +22 -56
  88. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -1
  89. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +10 -6
  90. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
  91. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
  92. msprobe/mindspore/grad_probe/global_context.py +2 -0
  93. msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
  94. msprobe/mindspore/grad_probe/hook.py +2 -4
  95. msprobe/mindspore/monitor/anomaly_detect.py +404 -0
  96. msprobe/mindspore/monitor/distributed/__init__.py +0 -0
  97. msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
  98. msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
  99. msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
  100. msprobe/mindspore/monitor/features.py +63 -0
  101. msprobe/mindspore/monitor/module_hook.py +873 -0
  102. msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
  103. msprobe/mindspore/monitor/utils.py +309 -0
  104. msprobe/mindspore/ms_config.py +8 -2
  105. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
  106. msprobe/mindspore/service.py +114 -34
  107. msprobe/pytorch/__init__.py +0 -1
  108. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
  109. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +12 -7
  110. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +2 -2
  111. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +4 -5
  112. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +5 -5
  113. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +25 -6
  114. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -19
  115. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
  116. msprobe/pytorch/bench_functions/apply_adam.py +215 -0
  117. msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
  118. msprobe/pytorch/{parse.py → bench_functions/mish.py} +6 -4
  119. msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +50 -0
  120. msprobe/pytorch/bench_functions/sort_v2.py +21 -0
  121. msprobe/pytorch/common/utils.py +97 -4
  122. msprobe/pytorch/debugger/debugger_config.py +19 -9
  123. msprobe/pytorch/debugger/precision_debugger.py +24 -1
  124. msprobe/pytorch/dump/module_dump/module_dump.py +4 -3
  125. msprobe/pytorch/dump/module_dump/module_processer.py +21 -35
  126. msprobe/pytorch/free_benchmark/common/utils.py +1 -1
  127. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
  128. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
  129. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
  130. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
  131. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
  132. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
  133. msprobe/pytorch/function_factory.py +8 -2
  134. msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
  135. msprobe/pytorch/hook_module/api_register.py +131 -0
  136. msprobe/pytorch/hook_module/hook_module.py +19 -14
  137. msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
  138. msprobe/pytorch/hook_module/support_wrap_ops.yaml +173 -75
  139. msprobe/pytorch/monitor/anomaly_detect.py +14 -29
  140. msprobe/pytorch/monitor/csv2tb.py +18 -14
  141. msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
  142. msprobe/pytorch/monitor/module_hook.py +238 -193
  143. msprobe/pytorch/monitor/module_metric.py +9 -6
  144. msprobe/pytorch/monitor/optimizer_collect.py +100 -67
  145. msprobe/pytorch/monitor/unittest/test_monitor.py +1 -1
  146. msprobe/pytorch/monitor/utils.py +76 -44
  147. msprobe/pytorch/online_dispatch/compare.py +0 -2
  148. msprobe/pytorch/online_dispatch/dispatch.py +9 -0
  149. msprobe/pytorch/online_dispatch/dump_compare.py +3 -0
  150. msprobe/pytorch/online_dispatch/utils.py +3 -0
  151. msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
  152. msprobe/pytorch/parse_tool/lib/utils.py +2 -1
  153. msprobe/pytorch/pt_config.py +30 -29
  154. msprobe/pytorch/service.py +114 -32
  155. msprobe/visualization/builder/graph_builder.py +75 -10
  156. msprobe/visualization/builder/msprobe_adapter.py +7 -6
  157. msprobe/visualization/compare/graph_comparator.py +42 -38
  158. msprobe/visualization/compare/mode_adapter.py +0 -19
  159. msprobe/visualization/graph/base_node.py +11 -3
  160. msprobe/visualization/graph/distributed_analyzer.py +71 -3
  161. msprobe/visualization/graph/graph.py +0 -11
  162. msprobe/visualization/graph/node_op.py +4 -3
  163. msprobe/visualization/graph_service.py +4 -5
  164. msprobe/visualization/utils.py +12 -35
  165. msprobe/mindspore/dump/hook_cell/api_registry.py +0 -205
  166. msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
  167. msprobe/pytorch/hook_module/api_registry.py +0 -166
  168. msprobe/pytorch/hook_module/wrap_distributed.py +0 -75
  169. msprobe/pytorch/hook_module/wrap_functional.py +0 -66
  170. msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
  171. msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
  172. msprobe/pytorch/hook_module/wrap_torch.py +0 -84
  173. msprobe/pytorch/hook_module/wrap_vf.py +0 -60
  174. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/LICENSE +0 -0
  175. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/WHEEL +0 -0
  176. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/entry_points.txt +0 -0
  177. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/top_level.txt +0 -0
@@ -23,10 +23,10 @@ import numpy as np
23
23
  from msprobe.core.common.const import Const
24
24
  from msprobe.core.data_dump.data_processor.base import (BaseDataProcessor, TensorStatInfo,
25
25
  ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs)
26
- from msprobe.core.common.file_utils import path_len_exceeds_limit
26
+ from msprobe.core.common.file_utils import path_len_exceeds_limit, save_npy
27
27
  from msprobe.mindspore.common.utils import convert_bf16_to_fp32, save_tensor_as_npy
28
28
  from msprobe.mindspore.common.log import logger
29
- from msprobe.mindspore.dump.hook_cell.api_registry import api_register
29
+ from msprobe.mindspore.dump.hook_cell.api_register import get_api_register
30
30
 
31
31
  has_adump = True
32
32
  try:
@@ -44,6 +44,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
44
44
  "dtype": self.analyze_dtype_in_kwargs
45
45
  }
46
46
  self._async_dump_cache = {}
47
+ self.api_register = get_api_register()
47
48
 
48
49
  @staticmethod
49
50
  def get_md5_for_tensor(x):
@@ -74,61 +75,51 @@ class MindsporeDataProcessor(BaseDataProcessor):
74
75
  else:
75
76
  if not ops.is_floating_point(data) or data.dtype == ms.float64:
76
77
  data = data.to(ms.float32)
77
- api_register.norm_inner_op_set_ori_func()
78
- get_max_value = api_register.mint_ops_ori_attr.get("max", mint.max)
79
- get_min_value = api_register.mint_ops_ori_attr.get("min", mint.min)
80
- get_mean_value = api_register.mint_ops_ori_attr.get("mean", mint.mean)
81
- if hasattr(mint, "norm"):
82
- get_norm_value = api_register.mint_ops_ori_attr.get("norm", mint.norm)
83
- else:
84
- get_norm_value = api_register.functional_ori_attr.get("norm", ops.norm)
85
- tensor_stat.max = get_max_value(data).item()
86
- tensor_stat.min = get_min_value(data).item()
87
- tensor_stat.mean = get_mean_value(data).item()
78
+ get_norm_value = mint.norm if hasattr(mint, "norm") else ops.norm
79
+ tensor_stat.max = mint.max(data).item()
80
+ tensor_stat.min = mint.min(data).item()
81
+ tensor_stat.mean = mint.mean(data).item()
88
82
  tensor_stat.norm = get_norm_value(data).item()
89
- api_register.norm_inner_op_set_hook_func()
90
83
  return tensor_stat
91
84
 
92
85
  @staticmethod
93
86
  def get_stat_info_async(data):
94
87
  tensor_stat = TensorStatInfo()
95
- stack_method = api_register.functional_ori_attr.get("stack", ms.ops.stack)
96
88
  if data.dtype == ms.complex64 or data.dtype == ms.complex128:
97
89
  logger.warning("Async dump do not support complex data!")
98
90
  return tensor_stat
99
91
  elif data.dtype == ms.bool_:
100
- tensor_stat.stack_tensor_stat = (["Max", "Min"], stack_method([data.any(), data.all()]))
92
+ tensor_stat.stack_tensor_stat = (["Max", "Min"], ops.stack([data.any(), data.all()]))
101
93
  elif not data.shape:
102
- tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], stack_method([data, data, data, data]))
94
+ tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], ops.stack([data, data, data, data]))
103
95
  else:
104
96
  if not ops.is_floating_point(data) or data.dtype == ms.float64:
105
97
  data = data.to(ms.float32)
106
- api_register.norm_inner_op_set_ori_func()
107
- get_max_value = api_register.mint_ops_ori_attr.get("max", mint.max)
108
- get_min_value = api_register.mint_ops_ori_attr.get("min", mint.min)
109
- get_mean_value = api_register.mint_ops_ori_attr.get("mean", mint.mean)
110
- if hasattr(mint, "norm"):
111
- get_norm_value = api_register.mint_ops_ori_attr.get("norm", mint.norm)
112
- else:
113
- get_norm_value = api_register.functional_ori_attr.get("norm", ops.norm)
114
- tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], stack_method(
115
- [get_max_value(data), get_min_value(data), get_mean_value(data), get_norm_value(data)]))
116
- api_register.norm_inner_op_set_hook_func()
98
+ get_norm_value = mint.norm if hasattr(mint, "norm") else ops.norm
99
+ tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], ops.stack(
100
+ [mint.max(data), mint.min(data), mint.mean(data), get_norm_value(data)]))
117
101
  return tensor_stat
118
102
 
103
+ @staticmethod
104
+ def is_hookable_element(element):
105
+ return hasattr(element, "register_hook") and callable(element.register_hook)
106
+
119
107
  @classmethod
120
108
  def get_special_types(cls):
121
109
  return super().get_special_types() + cls.mindspore_special_type
122
110
 
123
111
  def get_stat_info(self, data):
112
+ self.api_register.restore_inner_used_api()
124
113
  tensor_stat = TensorStatInfo()
125
114
  if data.numel() == 0:
126
- return tensor_stat
115
+ stat_info = tensor_stat
127
116
  else:
128
117
  if self.config.async_dump:
129
- return MindsporeDataProcessor.get_stat_info_async(data)
118
+ stat_info = MindsporeDataProcessor.get_stat_info_async(data)
130
119
  else:
131
- return MindsporeDataProcessor.get_stat_info_sync(data)
120
+ stat_info = MindsporeDataProcessor.get_stat_info_sync(data)
121
+ self.api_register.register_inner_used_api()
122
+ return stat_info
132
123
 
133
124
  def analyze_single_element(self, element, suffix_stack):
134
125
  if suffix_stack and suffix_stack[-1] in self.mindspore_object_key:
@@ -136,11 +127,13 @@ class MindsporeDataProcessor(BaseDataProcessor):
136
127
 
137
128
  converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
138
129
  if converted_numpy is not element:
139
- return self._analyze_numpy(converted_numpy, numpy_type)
130
+ return {"type": numpy_type, "value": converted_numpy}
140
131
  if isinstance(element, Number):
141
132
  return self.analyze_dtype_in_kwargs(element)
142
133
  if isinstance(element, ms.Tensor):
143
- return self._analyze_tensor(element, Const.SEP.join(suffix_stack))
134
+ return self._analyze_tensor(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
135
+ if isinstance(element, np.ndarray):
136
+ return self._analyze_numpy(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
144
137
  if isinstance(element, (bool, int, float, str, slice, type(Ellipsis))):
145
138
  return self._analyze_builtin(element)
146
139
  return {}
@@ -186,6 +179,13 @@ class TensorDataProcessor(MindsporeDataProcessor):
186
179
  save_tensor_as_npy(tensor, file_path)
187
180
  return single_arg
188
181
 
182
+ def _analyze_numpy(self, ndarray, suffix):
183
+ dump_data_name, file_path = self.get_save_file_path(suffix)
184
+ save_npy(ndarray, file_path)
185
+ ndarray_json = super()._analyze_numpy(ndarray, suffix)
186
+ ndarray_json.update({"data_name": dump_data_name})
187
+ return ndarray_json
188
+
189
189
 
190
190
  class OverflowCheckDataProcessor(MindsporeDataProcessor):
191
191
  __slots__ = ["cached_tensors_and_file_paths"]
@@ -1,4 +1,4 @@
1
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
1
+ # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
2
2
  # All rights reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,16 +21,18 @@ from typing import List
21
21
  import numpy as np
22
22
  import torch
23
23
  from torch import distributed as dist
24
+ from torch.distributed.distributed_c10d import _get_default_group
24
25
 
25
26
  from msprobe.core.common.const import Const
27
+ from msprobe.core.common.exceptions import MsprobeException
26
28
  from msprobe.core.common.file_utils import path_len_exceeds_limit
27
29
  from msprobe.core.common.log import logger
28
30
  from msprobe.core.common.utils import convert_tuple
31
+ from msprobe.core.common.decorator import recursion_depth_decorator
29
32
  from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \
30
33
  ModuleForwardInputsOutputs, TensorStatInfo
31
- from msprobe.pytorch.common.utils import save_pt, load_pt
34
+ from msprobe.pytorch.common.utils import Const as PtConst, save_pt, is_hifloat8_tensor, is_float8_tensor
32
35
  from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow
33
- from msprobe.core.common.utils import recursion_depth_decorator
34
36
 
35
37
  is_gpu = False
36
38
  try:
@@ -40,7 +42,16 @@ except ImportError:
40
42
 
41
43
 
42
44
  class PytorchDataProcessor(BaseDataProcessor):
43
- pytorch_special_type = (torch.device, torch.dtype, torch.Size, torch.Tensor, torch.memory_format, dist.ProcessGroup)
45
+ pytorch_special_type = (
46
+ torch.device,
47
+ torch.dtype,
48
+ torch.Size,
49
+ torch.Tensor,
50
+ torch.memory_format,
51
+ dist.ProcessGroup,
52
+ dist.P2POp,
53
+ dist.ReduceOp
54
+ )
44
55
  memory_format = {
45
56
  torch.contiguous_format: "contiguous_format",
46
57
  torch.channels_last: "channels_last",
@@ -68,14 +79,16 @@ class PytorchDataProcessor(BaseDataProcessor):
68
79
  def analyze_device_in_kwargs(element):
69
80
  single_arg = {}
70
81
  single_arg.update({'type': "torch.device"})
71
- if not isinstance(element, str):
82
+ if isinstance(element, (int, str)):
83
+ single_arg.update({"value": element})
84
+ elif isinstance(element, torch.device):
72
85
  if hasattr(element, "index"):
73
86
  device_value = element.type + ":" + str(element.index)
74
87
  else:
75
88
  device_value = element.type
76
89
  single_arg.update({"value": device_value})
77
90
  else:
78
- single_arg.update({"value": element})
91
+ logger.debug(f"Device type {type(element)} is not supported.")
79
92
  return single_arg
80
93
 
81
94
  @staticmethod
@@ -133,7 +146,7 @@ class PytorchDataProcessor(BaseDataProcessor):
133
146
  if data.is_meta:
134
147
  return tensor_stat
135
148
  data_clone = data.detach()
136
- if data_clone.numel() == 0:
149
+ if not data_clone.numel() or not data_clone.data_ptr():
137
150
  return tensor_stat
138
151
  else:
139
152
  if data_clone.device.type == Const.CPU_LOWERCASE or not async_dump:
@@ -168,6 +181,11 @@ class PytorchDataProcessor(BaseDataProcessor):
168
181
  def is_distributed_op(module):
169
182
  return getattr(module, "op_is_distributed", False)
170
183
 
184
+ @staticmethod
185
+ def is_hookable_element(element):
186
+ return (hasattr(element, "register_hook") and callable(element.register_hook)) and \
187
+ (hasattr(element, "requires_grad") and element.requires_grad)
188
+
171
189
  @staticmethod
172
190
  def _analyze_torch_size(arg):
173
191
  return {"type": "torch.Size", "value": list(arg)}
@@ -176,7 +194,6 @@ class PytorchDataProcessor(BaseDataProcessor):
176
194
  def _analyze_memory_format(arg):
177
195
  # 获取内存格式
178
196
  format_type = PytorchDataProcessor.memory_format.get(arg)
179
-
180
197
  return {"type": "torch.memory_format", "format": format_type}
181
198
 
182
199
  @staticmethod
@@ -188,9 +205,30 @@ class PytorchDataProcessor(BaseDataProcessor):
188
205
  group_id = PytorchDataProcessor.process_group_hash(arg)
189
206
  group_info.update({"group_id": group_id})
190
207
  except Exception as e:
191
- logger.warning(f"Failed to get process group(id: {group_id}) ranks info with error info: {e}.")
208
+ logger.warning(f"Failed to get process group ranks info with error info: {e}.")
192
209
  return group_info
193
210
 
211
+ @staticmethod
212
+ def _analyze_reduce_op(arg):
213
+ op_type = None
214
+ try:
215
+ op_type = str(arg)
216
+ except Exception as e:
217
+ logger.warning(f"Failed to get value of torch.distributed.ReduceOp with error info: {e}.")
218
+ return {"type": "torch.distributed.ReduceOp", "value": op_type}
219
+
220
+ @staticmethod
221
+ def _cast_to_float_if_fp8(tensor):
222
+ dtype = str(tensor.dtype)
223
+ if is_float8_tensor(tensor):
224
+ dtype = PtConst.HIFLOAT8_TYPE if is_hifloat8_tensor(tensor) else dtype
225
+ logger.debug(
226
+ f"The {dtype} tensor analyzing/saving is unsupported in dump function."
227
+ f"Casting to float for processing."
228
+ )
229
+ tensor = tensor.float()
230
+ return tensor, dtype
231
+
194
232
  @classmethod
195
233
  def get_special_types(cls):
196
234
  return super().get_special_types() + cls.pytorch_special_type
@@ -204,11 +242,17 @@ class PytorchDataProcessor(BaseDataProcessor):
204
242
  return self._analyze_memory_format(element)
205
243
  if isinstance(element, dist.ProcessGroup):
206
244
  return self._analyze_process_group(element)
245
+ if isinstance(element, dist.P2POp):
246
+ return self._analyze_p2pop(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
247
+ if isinstance(element, dist.ReduceOp):
248
+ return self._analyze_reduce_op(element)
207
249
  converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
208
250
  if converted_numpy is not element:
209
- return self._analyze_numpy(converted_numpy, numpy_type)
251
+ return {"type": numpy_type, "value": converted_numpy}
210
252
  if isinstance(element, torch.Tensor):
211
- return self._analyze_tensor(element, Const.SEP.join(suffix_stack))
253
+ return self._analyze_tensor(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
254
+ if isinstance(element, np.ndarray):
255
+ return self._analyze_numpy(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
212
256
  if isinstance(element, (bool, int, float, str, slice, type(Ellipsis))):
213
257
  return self._analyze_builtin(element)
214
258
  return {}
@@ -218,11 +262,27 @@ class PytorchDataProcessor(BaseDataProcessor):
218
262
  module_input_output.update_output_with_args_and_kwargs()
219
263
  return super().analyze_forward_output(name, module, module_input_output)
220
264
 
265
+ def _analyze_p2pop(self, arg, suffix):
266
+ p2pop_info = {"class_type": "torch.distributed.P2POp"}
267
+ try:
268
+ tensor_info = self._analyze_tensor(arg.tensor, suffix)
269
+ p2pop_info.update({"tensor": tensor_info})
270
+ p2pop_info.update({"op": arg.op.__name__})
271
+ p2pop_info.update({"peer": arg.peer})
272
+ p2pop_info.update({"tag": arg.tag})
273
+ group_id = PytorchDataProcessor.process_group_hash(
274
+ arg.group) if arg.group else PytorchDataProcessor.process_group_hash(_get_default_group())
275
+ p2pop_info.update({"group_id": group_id})
276
+ except Exception as e:
277
+ logger.warning(f"Failed to parse the P2POp content with error info: {e}.")
278
+ return p2pop_info
279
+
221
280
  def _analyze_tensor(self, tensor, suffix):
281
+ tensor, dtype = self._cast_to_float_if_fp8(tensor)
222
282
  tensor_stat = self.get_stat_info(tensor, self.config.async_dump)
223
283
  tensor_json = {}
224
284
  tensor_json.update({'type': 'torch.Tensor'})
225
- tensor_json.update({'dtype': str(tensor.dtype)})
285
+ tensor_json.update({'dtype': dtype})
226
286
  tensor_json.update({"shape": tensor.shape})
227
287
  if tensor_stat.stack_tensor_stat is None:
228
288
  tensor_json.update({"Max": tensor_stat.max})
@@ -261,6 +321,7 @@ class TensorDataProcessor(PytorchDataProcessor):
261
321
  dump_data_name, file_path = self.get_save_file_path(suffix)
262
322
  single_arg = super()._analyze_tensor(tensor, suffix)
263
323
  single_arg.update({"data_name": dump_data_name})
324
+ tensor, _ = self._cast_to_float_if_fp8(tensor)
264
325
  if self.config.async_dump:
265
326
  self._async_dump_cache[file_path] = tensor.clone().detach()
266
327
  else:
@@ -268,6 +329,13 @@ class TensorDataProcessor(PytorchDataProcessor):
268
329
  save_pt(saved_tensor, file_path)
269
330
  return single_arg
270
331
 
332
+ def _analyze_numpy(self, ndarray, suffix):
333
+ dump_data_name, file_path = self.get_save_file_path(suffix)
334
+ save_pt(torch.tensor(ndarray), file_path)
335
+ ndarray_json = super()._analyze_numpy(ndarray, suffix)
336
+ ndarray_json.update({"data_name": dump_data_name})
337
+ return ndarray_json
338
+
271
339
 
272
340
  class OverflowCheckDataProcessor(PytorchDataProcessor):
273
341
  __slots__ = ["cached_tensors_and_file_paths"]
@@ -319,7 +387,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
319
387
  api_info_struct = super().analyze_backward(name, module, module_input_output)
320
388
  self.handle_overflow()
321
389
  return api_info_struct if self.has_overflow else None
322
-
390
+
323
391
  def analyze_params(self, name, param_name, grad):
324
392
  self.has_overflow = False
325
393
  self._is_support_inf_nan()
@@ -332,7 +400,8 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
332
400
  self._analyze_maybe_overflow_flag()
333
401
  if self.has_overflow:
334
402
  for file_path, tensor in self.cached_tensors_and_file_paths.items():
335
- save_pt(tensor, file_path)
403
+ tensor, _ = self._cast_to_float_if_fp8(tensor)
404
+ save_pt(tensor.clone().contiguous().detach(), file_path)
336
405
  self.real_overflow_nums += 1
337
406
  if self.overflow_nums != -1 and self.real_overflow_nums >= self.overflow_nums:
338
407
  logger.info(f"[{Const.TOOL_NAME}] Reached the preset overflow times, "
@@ -457,11 +526,13 @@ class KernelDumpDataProcessor(PytorchDataProcessor):
457
526
  return
458
527
 
459
528
  if self.config.is_backward_kernel_dump:
460
- self.forward_args = self.clone_and_detach_tensor(module_input_output.args)
461
- self.forward_kwargs = self.clone_and_detach_tensor(module_input_output.kwargs)
462
529
  try:
530
+ self.forward_args = self.clone_and_detach_tensor(module_input_output.args)
531
+ self.forward_kwargs = self.clone_and_detach_tensor(module_input_output.kwargs)
463
532
  output = module.forward(*self.forward_args, **self.forward_kwargs)
464
- except Exception:
533
+ except Exception as e:
534
+ if isinstance(e, MsprobeException):
535
+ logger.warning(str(e))
465
536
  self._print_unsupported_log(name)
466
537
  self.enable_kernel_dump = False
467
538
  return
@@ -503,9 +574,17 @@ class KernelDumpDataProcessor(PytorchDataProcessor):
503
574
  self.stop_kernel_dump()
504
575
  logger.info(f"The kernel data of {name} is dumped successfully.")
505
576
 
506
- @recursion_depth_decorator("KernelDump: KernelDumpDataProcessor.clone_and_detach_tensor")
577
+ @recursion_depth_decorator(
578
+ "KernelDump: KernelDumpDataProcessor.clone_and_detach_tensor",
579
+ max_depth=Const.DUMP_MAX_DEPTH
580
+ )
507
581
  def clone_and_detach_tensor(self, input_params):
508
582
  if isinstance(input_params, torch.Tensor):
583
+ if is_float8_tensor(input_params):
584
+ raise MsprobeException(
585
+ MsprobeException.UNSUPPORTED_TYPE_ERROR,
586
+ f"L2 backward dump does not support float8 type."
587
+ )
509
588
  if input_params.requires_grad:
510
589
  return input_params.clone().detach().requires_grad_()
511
590
  return input_params.clone()
@@ -520,6 +599,8 @@ class KernelDumpDataProcessor(PytorchDataProcessor):
520
599
 
521
600
  def analyze_single_element(self, element, suffix_stack):
522
601
  if isinstance(element, torch.Tensor):
602
+ if is_float8_tensor(element):
603
+ return {}
523
604
  if not self.is_found_output_tensor:
524
605
  if element.requires_grad:
525
606
  self.forward_output_tensor = element
@@ -1,4 +1,4 @@
1
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
1
+ # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
2
2
  # All rights reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,12 +15,15 @@
15
15
 
16
16
  import csv
17
17
  import os
18
- import numpy as np
18
+ import copy
19
+ import threading
19
20
 
20
21
  from msprobe.core.common.const import Const, FileCheckConst
21
22
  from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, load_json
22
23
  from msprobe.core.common.log import logger
23
- from msprobe.core.common.exceptions import MsprobeException
24
+ from msprobe.core.common.decorator import recursion_depth_decorator
25
+
26
+ lock = threading.Lock()
24
27
 
25
28
 
26
29
  class DataWriter:
@@ -31,10 +34,12 @@ class DataWriter:
31
34
  self.construct_file_path = None
32
35
  self.free_benchmark_file_path = None
33
36
  self.dump_tensor_data_dir = None
37
+ self.debug_file_path = None
34
38
  self.flush_size = 1000
35
39
  self.cache_data = {}
36
40
  self.cache_stack = {}
37
41
  self.cache_construct = {}
42
+ self.cache_debug = {}
38
43
 
39
44
  @staticmethod
40
45
  def write_data_to_csv(result: list, result_header: tuple, file_path: str):
@@ -57,6 +62,13 @@ class DataWriter:
57
62
  self.cache_construct = {}
58
63
 
59
64
  def initialize_json_file(self, **kwargs):
65
+ if self.debug_file_path and not self.cache_debug:
66
+ # debug level case only create debug.json
67
+ debug_dict = copy.deepcopy(kwargs)
68
+ debug_dict.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}})
69
+ self.cache_debug = debug_dict
70
+ save_json(self.debug_file_path, self.cache_debug, indent=1)
71
+ return
60
72
  if not self.cache_data:
61
73
  kwargs.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}})
62
74
  self.cache_data = kwargs
@@ -66,13 +78,13 @@ class DataWriter:
66
78
  if not self.cache_construct:
67
79
  save_json(self.construct_file_path, self.cache_construct, indent=1)
68
80
 
69
- def update_dump_paths(self, dump_file_path, stack_file_path, construct_file_path, dump_data_dir,
70
- free_benchmark_file_path):
71
- self.dump_file_path = dump_file_path
72
- self.stack_file_path = stack_file_path
73
- self.construct_file_path = construct_file_path
74
- self.dump_tensor_data_dir = dump_data_dir
75
- self.free_benchmark_file_path = free_benchmark_file_path
81
+ def update_dump_paths(self, dump_path_aggregation):
82
+ self.dump_file_path = dump_path_aggregation.dump_file_path
83
+ self.stack_file_path = dump_path_aggregation.stack_file_path
84
+ self.construct_file_path = dump_path_aggregation.construct_file_path
85
+ self.dump_tensor_data_dir = dump_path_aggregation.dump_tensor_data_dir
86
+ self.free_benchmark_file_path = dump_path_aggregation.free_benchmark_file_path
87
+ self.debug_file_path = dump_path_aggregation.debug_file_path
76
88
 
77
89
  def flush_data_periodically(self):
78
90
  dump_data = self.cache_data.get(Const.DATA)
@@ -80,25 +92,32 @@ class DataWriter:
80
92
  self.write_json()
81
93
 
82
94
  def update_data(self, new_data):
83
- if not isinstance(new_data, dict) or len(new_data.keys()) != 1:
84
- logger.warning(f"The data info({new_data}) should be a dict with only one outer key.")
85
- return
86
- dump_data = self.cache_data.get(Const.DATA)
87
- if not isinstance(dump_data, dict):
88
- logger.warning(f"The dump data({dump_data}) should be a dict.")
89
- return
90
-
91
- key = next(iter(new_data.keys()))
92
- if key in dump_data:
93
- dump_data.get(key).update(new_data.get(key))
94
- else:
95
- dump_data.update(new_data)
95
+ with lock:
96
+ if not isinstance(new_data, dict) or len(new_data.keys()) != 1:
97
+ logger.warning(f"The data info({new_data}) should be a dict with only one outer key.")
98
+ return
99
+ dump_data = self.cache_data.get(Const.DATA)
100
+ if not isinstance(dump_data, dict):
101
+ logger.warning(f"The dump data({dump_data}) should be a dict.")
102
+ return
103
+
104
+ key = next(iter(new_data.keys()))
105
+ if key in dump_data:
106
+ dump_data.get(key).update(new_data.get(key))
107
+ else:
108
+ dump_data.update(new_data)
96
109
 
97
110
  def update_stack(self, new_data):
98
- self.cache_stack.update(new_data)
111
+ with lock:
112
+ self.cache_stack.update(new_data)
99
113
 
100
114
  def update_construct(self, new_data):
101
- self.cache_construct.update(new_data)
115
+ with lock:
116
+ self.cache_construct.update(new_data)
117
+
118
+ def update_debug(self, new_data):
119
+ with lock:
120
+ self.cache_debug['data'].update(new_data)
102
121
 
103
122
  def write_data_json(self, file_path):
104
123
  logger.info(f"dump.json is at {os.path.dirname(os.path.dirname(file_path))}. ")
@@ -110,21 +129,25 @@ class DataWriter:
110
129
  def write_construct_info_json(self, file_path):
111
130
  save_json(file_path, self.cache_construct, indent=1)
112
131
 
132
+ def write_debug_info_json(self, file_path):
133
+ save_json(file_path, self.cache_debug, indent=1)
134
+
113
135
  def write_json(self):
114
- if self.cache_data:
115
- self.write_data_json(self.dump_file_path)
116
- if self.cache_stack:
117
- self.write_stack_info_json(self.stack_file_path)
118
- if self.cache_construct:
119
- self.write_construct_info_json(self.construct_file_path)
136
+ with lock:
137
+ if self.cache_data:
138
+ self.write_data_json(self.dump_file_path)
139
+ if self.cache_stack:
140
+ self.write_stack_info_json(self.stack_file_path)
141
+ if self.cache_construct:
142
+ self.write_construct_info_json(self.construct_file_path)
143
+ if self.cache_debug:
144
+ self.write_debug_info_json(self.debug_file_path)
120
145
 
121
146
  def fill_stack_tensor_data(self):
122
147
  self.process_stat_data_recursive(self.cache_data)
123
148
 
124
- def process_stat_data_recursive(self, data, depth=0):
125
- if depth > Const.MAX_DEPTH:
126
- logger.error(f"The maximum depth of recursive process stat data, {Const.MAX_DEPTH} is reached.")
127
- raise MsprobeException(MsprobeException.RECURSION_LIMIT_ERROR)
149
+ @recursion_depth_decorator("AsyncDump: DataWriter.process_stat_data_recursive", max_depth=Const.DUMP_MAX_DEPTH)
150
+ def process_stat_data_recursive(self, data):
128
151
  if isinstance(data, dict):
129
152
  if "tensor_stat" in data.keys():
130
153
  tensor_stat = data["tensor_stat"]
@@ -132,14 +155,12 @@ class DataWriter:
132
155
  logger.warning("Some bad data in async dump")
133
156
  else:
134
157
  tensor_stat_index, tensor_stat_data = tensor_stat[0], tensor_stat[1]
135
- if hasattr(tensor_stat_data, "device") and tensor_stat_data.device != Const.CPU_LOWERCASE:
136
- tensor_stat_data = tensor_stat_data.cpu()
137
158
  for index, stat in zip(tensor_stat_index, tensor_stat_data):
138
- data.update({index, stat.item()})
159
+ data.update({index: stat.item()})
139
160
  del data["tensor_stat"]
140
161
  else:
141
162
  for key in data.keys():
142
- self.process_stat_data_recursive(data[key], depth + 1)
163
+ self.process_stat_data_recursive(data[key])
143
164
  elif isinstance(data, (list, tuple)):
144
165
  for i in data:
145
- self.process_stat_data_recursive(i, depth + 1)
166
+ self.process_stat_data_recursive(i)
@@ -31,6 +31,7 @@ class GradConst:
31
31
  STEP = "step"
32
32
  BOUNDS = "bounds"
33
33
  OUTPUT_PATH = "output_path"
34
+ TIME_STAMP = "time_stamp"
34
35
 
35
36
  # level const
36
37
  LEVEL = "level"
@@ -112,7 +112,7 @@ class GradComparator:
112
112
  result.append([key] + value)
113
113
  result_csv_path = os.path.join(output_dir, "similarities.csv")
114
114
  if os.path.exists(result_csv_path):
115
- logger.warning(f"{result_csv_path} will be recoverd")
115
+ logger.warning(f"{result_csv_path} will be deleted")
116
116
  remove_path(result_csv_path)
117
117
  write_csv(result, result_csv_path)
118
118
 
@@ -20,6 +20,7 @@ import numpy as np
20
20
  from msprobe.core.overflow_check.api_info import APIInfo
21
21
  from msprobe.core.overflow_check.level import OverflowLevel
22
22
  from msprobe.core.overflow_check.utils import has_nan_inf
23
+ from msprobe.core.common.decorator import recursion_depth_decorator
23
24
 
24
25
 
25
26
  class AnomalyScene:
@@ -35,6 +36,7 @@ class AnomalyScene:
35
36
  raise NotImplementedError
36
37
 
37
38
  @staticmethod
39
+ @recursion_depth_decorator("AbnormalScene: AnomalyScene._has_anomaly")
38
40
  def _has_anomaly(data: Union[Dict, Any]) -> bool:
39
41
  """检查张量是否包含异常值"""
40
42
  if isinstance(data, dict):
@@ -16,6 +16,8 @@ pip install mindstudio-probe
16
16
 
17
17
  |版本|发布日期|支持 PyTorch 版本|支持 MindSpore 版本|下载链接|校验码|
18
18
  |:--:|:--:|:--:|:--:|:--:|:--:|
19
+ |1.2.2|2025.3.03|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.2-py3-none-any.whl)|961411bb460d327ea51d6ca4d0c8e8c5565f07c0852d7b8592b781ca35b87212|
20
+ |1.2.1|2025.2.07|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.1-py3-none-any.whl)|b64b342118558e0339b39237f88a49b93fd24551b0cb202c872fbfef4260c86b|
19
21
  |1.2.0|2025.1.13|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.0-py3-none-any.whl)|1e3aeea1706112f6ee52fd1165037936bb209138f0b9ec42ea21e2c1c8942cdc|
20
22
  |1.1.1|2024.12.09|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.1.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.1/mindstudio_probe-1.1.1-py3-none-any.whl)|577b597555dc155b76ba1a62d575c3546004644e140a456c3ba0824d46283735|
21
23
  |1.1.0|2024.10.14|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.1/mindstudio_probe-1.1.0-py3-none-any.whl)|83a5a9b7c65a357639f8c9636d88c693b4cf0eb590d4f8f5cb56395ba69b1f6d|
@@ -50,10 +52,34 @@ pip install ./mindstudio_probe*.whl
50
52
 
51
53
  |参数|说明|是否必选|
52
54
  |--|--|:--:|
53
- |--include-mod|指定可选模块,可取值`adump`,表示在编whl包时加入adump模块。默认未配置该参数,表示编基础包。<br>&#8226; adump模块用于MindSpore静态图场景L2级别的dump。<br>&#8226; 仅MindSpore 2.5.0及以上版本支持adump模块。<br>&#8226; 若使用源码安装,编译环境需支持GCC 7或以上版本,和CMAKE 3.14或以上版本。<br>&#8226; 生成的whl包仅限编译时使用的python版本和处理器架构可用。|否|
55
+ |--include-mod|指定可选模块,可取值`adump`,表示在编whl包时加入adump模块。默认未配置该参数,表示编基础包。<br>&#8226; adump模块用于MindSpore静态图场景L2级别的dump。<br>&#8226; 仅MindSpore 2.5.0及以上版本支持adump模块。<br>&#8226; 若使用源码安装,编译环境需支持GCC 7.5或以上版本,和CMAKE 3.14或以上版本。<br>&#8226; 生成的whl包仅限编译时使用的python版本和处理器架构可用。|否|
54
56
 
55
57
  # 特性变更说明
56
58
 
59
+ ## 1.2.0
60
+
61
+ 【数据采集】
62
+
63
+ - 模块级dump支持采集权重及权重梯度
64
+ - 修复原地覆盖类API前向输入数据采集不正确的问题
65
+ - seed_all接口支持控制dropout失效功能
66
+
67
+ 【精度预检】
68
+
69
+ - MindSpore场景新增支持Tensor类的mint API的预检
70
+
71
+ 【训练状态监控】
72
+
73
+ - 支持FSDP和ZeRO-0
74
+ - 异常排序支持前向激活值和反向梯度
75
+
76
+ 【分级可视化构图比对】
77
+
78
+ - 支持graph结构分页展示,支持graph批量构建和比对
79
+ - 支持溢出检测模式
80
+
81
+ ## 1.1.1
82
+
57
83
  ## 1.1.1
58
84
 
59
85
  【数据采集】