mindstudio-probe: mindstudio_probe-1.0.4-py3-none-any.whl → mindstudio_probe-1.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278)
  1. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/METADATA +5 -5
  2. mindstudio_probe-1.1.1.dist-info/RECORD +341 -0
  3. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/WHEEL +1 -1
  4. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/entry_points.txt +0 -1
  5. msprobe/README.md +84 -18
  6. msprobe/__init__.py +16 -1
  7. msprobe/config.json +1 -5
  8. msprobe/core/advisor/advisor.py +16 -11
  9. msprobe/core/advisor/advisor_const.py +6 -7
  10. msprobe/core/advisor/advisor_result.py +12 -12
  11. msprobe/core/common/const.py +164 -3
  12. msprobe/core/common/exceptions.py +26 -4
  13. msprobe/core/common/file_utils.py +196 -27
  14. msprobe/core/common/inplace_op_checker.py +53 -0
  15. msprobe/core/common/inplace_ops.yaml +251 -0
  16. msprobe/core/common/log.py +46 -18
  17. msprobe/core/common/utils.py +308 -209
  18. msprobe/core/common_config.py +60 -38
  19. msprobe/core/compare/acc_compare.py +332 -94
  20. msprobe/core/compare/check.py +104 -22
  21. msprobe/core/compare/compare_cli.py +42 -5
  22. msprobe/core/compare/highlight.py +162 -57
  23. msprobe/core/compare/layer_mapping/__init__.py +19 -0
  24. msprobe/core/compare/layer_mapping/data_scope_parser.py +235 -0
  25. msprobe/core/compare/layer_mapping/layer_mapping.py +242 -0
  26. msprobe/core/compare/layer_mapping/postprocess_pass.py +94 -0
  27. msprobe/core/compare/multiprocessing_compute.py +33 -8
  28. msprobe/core/compare/npy_compare.py +73 -29
  29. msprobe/core/compare/utils.py +306 -247
  30. msprobe/core/data_dump/data_collector.py +44 -43
  31. msprobe/core/data_dump/data_processor/base.py +88 -35
  32. msprobe/core/data_dump/data_processor/factory.py +20 -3
  33. msprobe/core/data_dump/data_processor/mindspore_processor.py +14 -8
  34. msprobe/core/data_dump/data_processor/pytorch_processor.py +180 -66
  35. msprobe/core/data_dump/json_writer.py +63 -42
  36. msprobe/core/data_dump/scope.py +143 -48
  37. msprobe/core/grad_probe/constant.py +31 -13
  38. msprobe/core/grad_probe/grad_compare.py +20 -4
  39. msprobe/core/grad_probe/utils.py +44 -3
  40. msprobe/core/overflow_check/abnormal_scene.py +185 -0
  41. msprobe/core/overflow_check/api_info.py +55 -0
  42. msprobe/core/overflow_check/checker.py +138 -0
  43. msprobe/core/overflow_check/filter.py +157 -0
  44. msprobe/core/overflow_check/ignore_rules.yaml +55 -0
  45. msprobe/core/overflow_check/level.py +22 -0
  46. msprobe/core/overflow_check/utils.py +28 -0
  47. msprobe/docs/01.installation.md +29 -9
  48. msprobe/docs/02.config_introduction.md +83 -84
  49. msprobe/docs/03.config_examples.md +3 -20
  50. msprobe/docs/04.kernel_dump_PyTorch.md +73 -0
  51. msprobe/docs/05.data_dump_PyTorch.md +143 -13
  52. msprobe/docs/06.data_dump_MindSpore.md +197 -88
  53. msprobe/docs/07.accuracy_checker_PyTorch.md +69 -46
  54. msprobe/docs/08.accuracy_checker_online_PyTorch.md +52 -17
  55. msprobe/docs/09.accuracy_checker_MindSpore.md +51 -15
  56. msprobe/docs/10.accuracy_compare_PyTorch.md +187 -99
  57. msprobe/docs/11.accuracy_compare_MindSpore.md +253 -31
  58. msprobe/docs/12.overflow_check_PyTorch.md +1 -1
  59. msprobe/docs/13.overflow_check_MindSpore.md +6 -6
  60. msprobe/docs/15.free_benchmarking_PyTorch.md +60 -55
  61. msprobe/docs/16.free_benchmarking_MindSpore.md +159 -0
  62. msprobe/docs/17.grad_probe.md +19 -22
  63. msprobe/docs/18.online_dispatch.md +89 -0
  64. msprobe/docs/19.monitor.md +468 -0
  65. msprobe/docs/20.monitor_performance_baseline.md +52 -0
  66. msprobe/docs/21.visualization_PyTorch.md +386 -0
  67. msprobe/docs/22.visualization_MindSpore.md +384 -0
  68. msprobe/docs/23.tool_function_introduction.md +28 -0
  69. msprobe/docs/{FAQ_PyTorch.md → FAQ.md} +25 -10
  70. msprobe/docs/data_dump_Mindspore/dynamic_graph_quick_start_example.md +211 -0
  71. msprobe/docs/img/compare_result.png +0 -0
  72. msprobe/docs/img/monitor/cpu_info.png +0 -0
  73. msprobe/docs/img/ms_dump.png +0 -0
  74. msprobe/docs/img/ms_layer.png +0 -0
  75. msprobe/docs/img/pt_dump.png +0 -0
  76. msprobe/mindspore/__init__.py +16 -0
  77. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +130 -138
  78. msprobe/mindspore/api_accuracy_checker/api_info.py +27 -5
  79. msprobe/mindspore/api_accuracy_checker/api_runner.py +43 -18
  80. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +21 -7
  81. msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +77 -0
  82. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +63 -1
  83. msprobe/mindspore/api_accuracy_checker/compute_element.py +59 -24
  84. msprobe/mindspore/api_accuracy_checker/data_manager.py +264 -0
  85. msprobe/mindspore/api_accuracy_checker/main.py +27 -3
  86. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +206 -0
  87. msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +58 -0
  88. msprobe/mindspore/api_accuracy_checker/type_mapping.py +22 -5
  89. msprobe/mindspore/api_accuracy_checker/utils.py +34 -17
  90. msprobe/mindspore/cell_processor.py +58 -13
  91. msprobe/mindspore/common/const.py +35 -13
  92. msprobe/mindspore/common/log.py +5 -9
  93. msprobe/mindspore/common/utils.py +60 -5
  94. msprobe/mindspore/compare/distributed_compare.py +15 -28
  95. msprobe/mindspore/compare/ms_compare.py +319 -158
  96. msprobe/mindspore/compare/ms_graph_compare.py +99 -49
  97. msprobe/mindspore/debugger/debugger_config.py +20 -14
  98. msprobe/mindspore/debugger/precision_debugger.py +43 -13
  99. msprobe/mindspore/dump/dump_tool_factory.py +18 -1
  100. msprobe/mindspore/dump/hook_cell/api_registry.py +23 -3
  101. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +203 -0
  102. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +107 -10
  103. msprobe/mindspore/dump/hook_cell/wrap_api.py +21 -13
  104. msprobe/mindspore/dump/jit_dump.py +56 -20
  105. msprobe/mindspore/dump/kernel_graph_dump.py +19 -5
  106. msprobe/mindspore/dump/kernel_kbyk_dump.py +19 -6
  107. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +140 -0
  108. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +53 -0
  109. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +162 -41
  110. msprobe/mindspore/free_benchmark/common/config.py +15 -0
  111. msprobe/mindspore/free_benchmark/common/handler_params.py +15 -1
  112. msprobe/mindspore/free_benchmark/common/utils.py +37 -8
  113. msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -204
  114. msprobe/mindspore/free_benchmark/handler/base_handler.py +20 -5
  115. msprobe/mindspore/free_benchmark/handler/check_handler.py +21 -7
  116. msprobe/mindspore/free_benchmark/handler/fix_handler.py +18 -3
  117. msprobe/mindspore/free_benchmark/handler/handler_factory.py +21 -6
  118. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +23 -8
  119. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +29 -5
  120. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +25 -10
  121. msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +45 -19
  122. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +29 -8
  123. msprobe/mindspore/free_benchmark/perturbation/no_change.py +16 -1
  124. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +22 -7
  125. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +17 -2
  126. msprobe/mindspore/grad_probe/global_context.py +44 -14
  127. msprobe/mindspore/grad_probe/grad_analyzer.py +27 -13
  128. msprobe/mindspore/grad_probe/grad_monitor.py +16 -1
  129. msprobe/mindspore/grad_probe/grad_stat_csv.py +33 -5
  130. msprobe/mindspore/grad_probe/hook.py +24 -10
  131. msprobe/mindspore/grad_probe/utils.py +18 -5
  132. msprobe/mindspore/ms_config.py +22 -15
  133. msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +20 -6
  134. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +15 -0
  135. msprobe/mindspore/runtime.py +15 -0
  136. msprobe/mindspore/service.py +75 -150
  137. msprobe/mindspore/task_handler_factory.py +15 -0
  138. msprobe/msprobe.py +24 -7
  139. msprobe/pytorch/__init__.py +23 -3
  140. msprobe/pytorch/api_accuracy_checker/common/config.py +81 -2
  141. msprobe/pytorch/api_accuracy_checker/common/utils.py +53 -21
  142. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +19 -2
  143. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +50 -25
  144. msprobe/pytorch/api_accuracy_checker/compare/compare.py +51 -21
  145. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +23 -6
  146. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +28 -8
  147. msprobe/pytorch/api_accuracy_checker/config.yaml +1 -1
  148. msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +9 -0
  149. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +454 -0
  150. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +365 -0
  151. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +73 -33
  152. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +44 -18
  153. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +32 -11
  154. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +122 -172
  155. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +158 -4
  156. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +30 -24
  157. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +68 -31
  158. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +27 -4
  159. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +115 -0
  160. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +26 -9
  161. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
  162. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +44 -0
  163. msprobe/pytorch/bench_functions/__init__.py +18 -3
  164. msprobe/pytorch/bench_functions/apply_adam_w.py +15 -0
  165. msprobe/pytorch/bench_functions/confusion_transpose.py +20 -1
  166. msprobe/pytorch/bench_functions/fast_gelu.py +15 -0
  167. msprobe/pytorch/bench_functions/layer_norm_eval.py +15 -0
  168. msprobe/pytorch/bench_functions/linear.py +15 -0
  169. msprobe/pytorch/bench_functions/matmul_backward.py +33 -6
  170. msprobe/pytorch/bench_functions/npu_fusion_attention.py +280 -157
  171. msprobe/pytorch/bench_functions/rms_norm.py +15 -0
  172. msprobe/pytorch/bench_functions/rotary_mul.py +32 -9
  173. msprobe/pytorch/bench_functions/scaled_mask_softmax.py +15 -0
  174. msprobe/pytorch/bench_functions/swiglu.py +29 -6
  175. msprobe/pytorch/common/__init__.py +15 -0
  176. msprobe/pytorch/common/log.py +18 -6
  177. msprobe/pytorch/common/parse_json.py +31 -16
  178. msprobe/pytorch/common/utils.py +96 -40
  179. msprobe/pytorch/compare/distributed_compare.py +13 -14
  180. msprobe/pytorch/compare/match.py +15 -0
  181. msprobe/pytorch/compare/pt_compare.py +44 -10
  182. msprobe/pytorch/debugger/debugger_config.py +69 -52
  183. msprobe/pytorch/debugger/precision_debugger.py +72 -24
  184. msprobe/pytorch/dump/kernel_dump/kernel_config.py +33 -0
  185. msprobe/pytorch/free_benchmark/__init__.py +20 -5
  186. msprobe/pytorch/free_benchmark/common/constant.py +15 -0
  187. msprobe/pytorch/free_benchmark/common/counter.py +15 -0
  188. msprobe/pytorch/free_benchmark/common/enums.py +43 -0
  189. msprobe/pytorch/free_benchmark/common/params.py +23 -1
  190. msprobe/pytorch/free_benchmark/common/utils.py +43 -5
  191. msprobe/pytorch/free_benchmark/compare/grad_saver.py +47 -9
  192. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +17 -0
  193. msprobe/pytorch/free_benchmark/main.py +19 -4
  194. msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +15 -0
  195. msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +19 -4
  196. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +18 -1
  197. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +21 -4
  198. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +28 -2
  199. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +19 -0
  200. msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +15 -0
  201. msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +15 -0
  202. msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +15 -0
  203. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +65 -16
  204. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +15 -0
  205. msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +21 -5
  206. msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +15 -0
  207. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +19 -4
  208. msprobe/pytorch/function_factory.py +17 -2
  209. msprobe/pytorch/functional/module_dump.py +84 -0
  210. msprobe/pytorch/grad_probe/grad_monitor.py +23 -6
  211. msprobe/pytorch/grad_probe/grad_stat_csv.py +40 -10
  212. msprobe/pytorch/hook_module/__init__.py +16 -1
  213. msprobe/pytorch/hook_module/api_registry.py +13 -8
  214. msprobe/pytorch/hook_module/hook_module.py +17 -19
  215. msprobe/pytorch/hook_module/support_wrap_ops.yaml +1 -0
  216. msprobe/pytorch/hook_module/utils.py +4 -6
  217. msprobe/pytorch/hook_module/wrap_aten.py +12 -11
  218. msprobe/pytorch/hook_module/wrap_distributed.py +6 -7
  219. msprobe/pytorch/hook_module/wrap_functional.py +21 -20
  220. msprobe/pytorch/hook_module/wrap_npu_custom.py +9 -17
  221. msprobe/pytorch/hook_module/wrap_tensor.py +4 -6
  222. msprobe/pytorch/hook_module/wrap_torch.py +4 -6
  223. msprobe/pytorch/hook_module/wrap_vf.py +4 -6
  224. msprobe/pytorch/module_processer.py +18 -6
  225. msprobe/pytorch/monitor/anomaly_analyse.py +201 -0
  226. msprobe/pytorch/monitor/anomaly_detect.py +340 -0
  227. msprobe/pytorch/monitor/distributed/distributed_ops.yaml +19 -0
  228. msprobe/pytorch/monitor/distributed/stack_blacklist.yaml +5 -0
  229. msprobe/pytorch/monitor/distributed/wrap_distributed.py +272 -0
  230. msprobe/pytorch/monitor/features.py +108 -0
  231. msprobe/pytorch/monitor/module_hook.py +870 -0
  232. msprobe/pytorch/monitor/module_metric.py +193 -0
  233. msprobe/pytorch/monitor/module_spec_verifier.py +93 -0
  234. msprobe/pytorch/monitor/optimizer_collect.py +295 -0
  235. msprobe/pytorch/monitor/unittest/__init__.py +0 -0
  236. msprobe/pytorch/monitor/unittest/test_monitor.py +145 -0
  237. msprobe/pytorch/monitor/utils.py +250 -0
  238. msprobe/pytorch/monitor/visualizer.py +59 -0
  239. msprobe/pytorch/online_dispatch/__init__.py +2 -3
  240. msprobe/pytorch/online_dispatch/compare.py +38 -48
  241. msprobe/pytorch/online_dispatch/dispatch.py +50 -25
  242. msprobe/pytorch/online_dispatch/dump_compare.py +21 -9
  243. msprobe/pytorch/online_dispatch/single_compare.py +60 -39
  244. msprobe/pytorch/online_dispatch/torch_ops_config.yaml +9 -1
  245. msprobe/pytorch/online_dispatch/utils.py +48 -23
  246. msprobe/pytorch/parse.py +15 -0
  247. msprobe/pytorch/parse_tool/cli.py +5 -6
  248. msprobe/pytorch/parse_tool/lib/compare.py +19 -26
  249. msprobe/pytorch/parse_tool/lib/config.py +1 -1
  250. msprobe/pytorch/parse_tool/lib/parse_tool.py +4 -2
  251. msprobe/pytorch/parse_tool/lib/utils.py +40 -55
  252. msprobe/pytorch/parse_tool/lib/visualization.py +3 -1
  253. msprobe/pytorch/pt_config.py +192 -40
  254. msprobe/pytorch/service.py +110 -35
  255. msprobe/visualization/__init__.py +14 -0
  256. msprobe/visualization/builder/__init__.py +14 -0
  257. msprobe/visualization/builder/graph_builder.py +165 -0
  258. msprobe/visualization/builder/msprobe_adapter.py +205 -0
  259. msprobe/visualization/compare/__init__.py +14 -0
  260. msprobe/visualization/compare/graph_comparator.py +130 -0
  261. msprobe/visualization/compare/mode_adapter.py +211 -0
  262. msprobe/visualization/graph/__init__.py +14 -0
  263. msprobe/visualization/graph/base_node.py +124 -0
  264. msprobe/visualization/graph/graph.py +200 -0
  265. msprobe/visualization/graph/node_colors.py +95 -0
  266. msprobe/visualization/graph/node_op.py +39 -0
  267. msprobe/visualization/graph_service.py +214 -0
  268. msprobe/visualization/utils.py +232 -0
  269. mindstudio_probe-1.0.4.dist-info/RECORD +0 -276
  270. msprobe/docs/04.acl_config_examples.md +0 -76
  271. msprobe/mindspore/free_benchmark/decorator/dec_forward.py +0 -43
  272. msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +0 -107
  273. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +0 -10
  274. msprobe/pytorch/functional/dump_module.py +0 -39
  275. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/LICENSE +0 -0
  276. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/top_level.txt +0 -0
  277. /msprobe/{mindspore/free_benchmark/decorator → pytorch/monitor}/__init__.py +0 -0
  278. /msprobe/pytorch/{functional/data_processor.py → monitor/distributed/__init__.py} +0 -0
@@ -1,8 +1,7 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- # Copyright (C) 2022-2023. Huawei Technologies Co., Ltd. All rights reserved.
5
- # Licensed under the Apache License, Version 2.0 (the "License");
1
+ # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
5
  # you may not use this file except in compliance with the License.
7
6
  # You may obtain a copy of the License at
8
7
  #
@@ -13,7 +12,6 @@
13
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
13
  # See the License for the specific language governing permissions and
15
14
  # limitations under the License.
16
- """
17
15
 
18
16
  import os
19
17
  from functools import wraps
@@ -23,6 +21,7 @@ from msprobe.pytorch.hook_module.hook_module import HOOKModule
23
21
  from msprobe.pytorch.common.utils import torch_device_guard
24
22
  from msprobe.core.common.const import Const
25
23
  from msprobe.core.common.file_utils import load_yaml
24
+ from msprobe.core.common.inplace_op_checker import InplaceOpChecker
26
25
 
27
26
 
28
27
  cur_path = os.path.dirname(os.path.realpath(__file__))
@@ -50,7 +49,7 @@ class DistributedOPTemplate(HOOKModule):
50
49
  self.op_name_ = op_name
51
50
  self.prefix_op_name_ = "Distributed" + Const.SEP + str(op_name) + Const.SEP
52
51
  super().__init__(build_hook)
53
- if not self.stop_hook and self.op_name_ in Const.INPLACE_LIST:
52
+ if not self.stop_hook and InplaceOpChecker.check(self.op_name_, InplaceOpChecker.OP_DISTRIBUTED):
54
53
  self.op_is_inplace = True
55
54
 
56
55
  @torch_device_guard
@@ -1,8 +1,7 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- # Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved.
5
- # Licensed under the Apache License, Version 2.0 (the "License");
1
+ # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
5
  # you may not use this file except in compliance with the License.
7
6
  # You may obtain a copy of the License at
8
7
  #
@@ -13,7 +12,6 @@
13
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
13
  # See the License for the specific language governing permissions and
15
14
  # limitations under the License.
16
- """
17
15
 
18
16
  import os
19
17
  import torch
@@ -32,31 +30,34 @@ def remove_dropout():
32
30
  from torch import _VF
33
31
  from torch.overrides import has_torch_function_unary, handle_torch_function
34
32
 
35
- def function_dropout(input: torch.Tensor, p: float = 0.5, training: bool = True,
33
+ def function_dropout(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
36
34
  inplace: bool = False) -> torch.Tensor:
37
- if has_torch_function_unary(input):
38
- return handle_torch_function(function_dropout, (input,), input, p=0., training=training, inplace=inplace)
35
+ if has_torch_function_unary(input_tensor):
36
+ return handle_torch_function(
37
+ function_dropout, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
39
38
  if p < 0.0 or p > 1.0:
40
39
  raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
41
- return _VF.dropout_(input, 0., training) if inplace else _VF.dropout(input, 0., training)
42
-
40
+ return _VF.dropout_(input_tensor, 0., training) if inplace else _VF.dropout(input_tensor, 0., training)
43
41
 
44
- def function_dropout2d(input: torch.Tensor, p: float = 0.5, training: bool = True,
42
+ def function_dropout2d(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
45
43
  inplace: bool = False) -> torch.Tensor:
46
- if has_torch_function_unary(input):
47
- return handle_torch_function(function_dropout2d, (input,), input, p=0., training=training, inplace=inplace)
44
+ if has_torch_function_unary(input_tensor):
45
+ return handle_torch_function(
46
+ function_dropout2d, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
48
47
  if p < 0.0 or p > 1.0:
49
48
  raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
50
- return _VF.feature_dropout_(input, 0., training) if inplace else _VF.feature_dropout(input, 0., training)
51
-
49
+ return _VF.feature_dropout_(input_tensor, 0., training) if inplace else _VF.feature_dropout(input_tensor,
50
+ 0., training)
52
51
 
53
- def function_dropout3d(input: torch.Tensor, p: float = 0.5, training: bool = True,
52
+ def function_dropout3d(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
54
53
  inplace: bool = False) -> torch.Tensor:
55
- if has_torch_function_unary(input):
56
- return handle_torch_function(function_dropout3d, (input,), input, p=0., training=training, inplace=inplace)
54
+ if has_torch_function_unary(input_tensor):
55
+ return handle_torch_function(
56
+ function_dropout3d, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
57
57
  if p < 0.0 or p > 1.0:
58
58
  raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
59
- return _VF.feature_dropout_(input, 0., training) if inplace else _VF.feature_dropout(input, 0., training)
59
+ return _VF.feature_dropout_(input_tensor, 0., training) if inplace else _VF.feature_dropout(input_tensor,
60
+ 0., training)
60
61
 
61
62
  F.dropout = function_dropout
62
63
  F.dropout2d = function_dropout2d
@@ -1,8 +1,7 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- # Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved.
5
- # Licensed under the Apache License, Version 2.0 (the "License");
1
+ # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
5
  # you may not use this file except in compliance with the License.
7
6
  # You may obtain a copy of the License at
8
7
  #
@@ -13,7 +12,6 @@
13
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
13
  # See the License for the specific language governing permissions and
15
14
  # limitations under the License.
16
- """
17
15
 
18
16
  import os
19
17
  import torch
@@ -21,24 +19,19 @@ import torch
21
19
  from msprobe.pytorch.hook_module.hook_module import HOOKModule
22
20
  from msprobe.pytorch.common.utils import torch_device_guard, torch_without_guard_version
23
21
  from msprobe.core.common.const import Const
22
+ from msprobe.core.common.log import logger
24
23
  from msprobe.core.common.file_utils import load_yaml
25
24
  from msprobe.pytorch.function_factory import npu_custom_functions
26
25
 
27
- cur_path = os.path.dirname(os.path.realpath(__file__))
28
- yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
29
-
30
-
31
26
  try:
32
27
  import torch_npu
33
28
  except ImportError:
34
- is_gpu = True
35
- else:
36
- is_gpu = False
29
+ logger.info("Failing to import torch_npu.")
37
30
 
38
31
 
39
- cuda_func_mapping = {
40
- "npu_fusion_attention" : "gpu_fusion_attention"
41
- }
32
+ cur_path = os.path.dirname(os.path.realpath(__file__))
33
+ yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
34
+ cuda_func_mapping = {"npu_fusion_attention" : "gpu_fusion_attention"}
42
35
 
43
36
 
44
37
  def get_npu_ops():
@@ -83,7 +76,6 @@ class NpuOPTemplate(HOOKModule):
83
76
  def wrap_npu_op(op_name, hook):
84
77
  def npu_op_template(*args, **kwargs):
85
78
  return NpuOPTemplate(op_name, hook)(*args, **kwargs)
86
-
87
79
  return npu_op_template
88
80
 
89
81
 
@@ -1,8 +1,7 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- # Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved.
5
- # Licensed under the Apache License, Version 2.0 (the "License");
1
+ # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
5
  # you may not use this file except in compliance with the License.
7
6
  # You may obtain a copy of the License at
8
7
  #
@@ -13,7 +12,6 @@
13
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
13
  # See the License for the specific language governing permissions and
15
14
  # limitations under the License.
16
- """
17
15
 
18
16
  import os
19
17
 
@@ -1,8 +1,7 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- # Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved.
5
- # Licensed under the Apache License, Version 2.0 (the "License");
1
+ # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
5
  # you may not use this file except in compliance with the License.
7
6
  # You may obtain a copy of the License at
8
7
  #
@@ -13,7 +12,6 @@
13
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
13
  # See the License for the specific language governing permissions and
15
14
  # limitations under the License.
16
- """
17
15
 
18
16
  import os
19
17
  import torch
@@ -1,8 +1,7 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- # Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved.
5
- # Licensed under the Apache License, Version 2.0 (the "License");
1
+ # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
5
  # you may not use this file except in compliance with the License.
7
6
  # You may obtain a copy of the License at
8
7
  #
@@ -13,7 +12,6 @@
13
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
13
  # See the License for the specific language governing permissions and
15
14
  # limitations under the License.
16
- """
17
15
 
18
16
  import os
19
17
  import torch
@@ -1,10 +1,25 @@
1
+ # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
1
16
  from functools import wraps
2
17
 
3
18
  import torch
19
+ from msprobe.core.common.const import Const
20
+ from msprobe.core.data_dump.scope import ModuleRangeScope, MixRangeScope
4
21
  from torch.utils.hooks import BackwardHook
5
22
 
6
- from msprobe.core.common.const import Const
7
- from msprobe.core.data_dump.scope import ModuleRangeScope
8
23
  torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
9
24
 
10
25
 
@@ -15,10 +30,7 @@ class ModuleProcesser:
15
30
  module_node = {}
16
31
 
17
32
  def __init__(self, scope):
18
- if isinstance(scope, ModuleRangeScope):
19
- self.scope = scope
20
- else:
21
- self.scope = None
33
+ self.scope = scope if isinstance(scope, (ModuleRangeScope, MixRangeScope)) else None
22
34
  BackwardHook.setup_input_hook = ModuleProcesser.clone_return_value(BackwardHook.setup_input_hook)
23
35
  BackwardHook.setup_output_hook = ModuleProcesser.clone_return_value(BackwardHook.setup_output_hook)
24
36
  BackwardHook.setup_output_hook = ModuleProcesser.filter_tensor_and_tuple(BackwardHook.setup_output_hook)
@@ -0,0 +1,201 @@
1
+ # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ import sys
18
+ import argparse
19
+ import ast
20
+ import heapq
21
+
22
+ from msprobe.core.common.log import logger
23
+ from msprobe.core.common.const import MonitorConst
24
+ from msprobe.core.common.file_utils import check_path_before_create, save_json, create_directory, remove_path, \
25
+ check_file_or_directory_path, load_json
26
+ from msprobe.pytorch.monitor.anomaly_detect import GradAnomalyData
27
+
28
+
29
class AnomalyDataWriter:
    """
    Writer that persists anomaly records to a per-rank JSON file.

    The target file is ``<dump_path>/rank<rank>/<MonitorConst.ANOMALY_JSON>``.
    """

    def __init__(self, dump_path, rank) -> None:
        self.dump_path = dump_path
        self.dump_rank_dir = os.path.join(self.dump_path, f"rank{rank}")
        self.json_path = os.path.join(self.dump_rank_dir, MonitorConst.ANOMALY_JSON)

    @staticmethod
    def get_anomaly_dict(anomalies):
        """
        Convert an iterable of anomaly objects into a plain dict.

        Each object contributes one entry mapping ``get_key()`` to
        ``to_dict()``; when two objects share a key, the later one wins.
        """
        return {item.get_key(): item.to_dict() for item in anomalies}

    def init_detected_json(self):
        """Create the dump directories if needed and reset the anomaly file to ``{}``."""
        check_path_before_create(self.dump_path)
        # Parent first, then the rank-specific sub-directory.
        for directory in (self.dump_path, self.dump_rank_dir):
            if not os.path.exists(directory):
                create_directory(directory)

        # A file left over from an earlier run is validated, reported and removed.
        if os.path.exists(self.json_path):
            check_file_or_directory_path(self.json_path, isdir=False)
            logger.warning(f"The existing file will be deleted: {self.json_path}.")
            remove_path(self.json_path)
        save_json(self.json_path, {}, indent=1)

    def write_detected_json(self, anomalies):
        """
        Merge the given anomalies into the anomaly file on disk.

        Args:
            anomalies: list of GradAnomalyData objects.
        """
        new_entries = self.get_anomaly_dict(anomalies)
        logger.info(f"{MonitorConst.ANOMALY_JSON} is at {self.dump_rank_dir}.")

        # Start from what is already on disk so earlier records are kept.
        if os.path.exists(self.json_path):
            merged = load_json(self.json_path)
        else:
            merged = {}
        merged.update(new_entries)
        save_json(self.json_path, merged, indent=1)
74
+
75
+
76
class AnomalyDataLoader:
    """Loader that reads anomaly records back from per-rank JSON files."""

    def __init__(self, data_path) -> None:
        self.data_path = data_path

    @staticmethod
    def create_instances_from_dict(anomalies_dict: dict):
        """
        Build one GradAnomalyData per value of ``anomalies_dict``.

        Each value is unpacked as keyword arguments. Entries that fail to
        construct are skipped with a warning instead of aborting the load.
        """
        built = []
        for fields in anomalies_dict.values():
            try:
                record = GradAnomalyData(**fields)
            except KeyError as e:
                logger.warning(f"Missing key in anomaly data: {e}.")
            except Exception as e:
                logger.warning(f"Value error when creating a GradAnomalyData instance: {e}.")
            else:
                built.append(record)
        return built

    def get_anomalies_from_jsons(self):
        """
        Walk ``self.data_path`` and read anomalies from each sub-directory's anomaly file.

        Returns:
            list of GradAnomalyData objects gathered from every sub-directory
            that contains ``MonitorConst.ANOMALY_JSON``.
        """
        collected = []
        check_file_or_directory_path(self.data_path, isdir=True)
        for entry in os.listdir(self.data_path):
            candidate_dir = os.path.join(self.data_path, entry)
            # Plain files next to the rank directories are ignored.
            if os.path.isdir(candidate_dir):
                json_path = os.path.join(candidate_dir, MonitorConst.ANOMALY_JSON)
                if os.path.exists(json_path):
                    raw_records = load_json(json_path)
                    collected.extend(self.create_instances_from_dict(raw_records))
        return collected
109
+
110
+
111
class AnomalyAnalyse:
    """Select the smallest anomalies (by their own ordering) and write them out."""

    def __init__(self) -> None:
        self.sorted_anomalies = []

    def get_range_top_k(self, topk, step_list, anomalies):
        """
        Return the first ``topk`` anomalies, in ascending order, within ``step_list``.

        An empty ``step_list`` disables filtering. The result is also stored
        in ``self.sorted_anomalies``.
        """
        if step_list:
            candidates = [item for item in anomalies if item.step in step_list]
        else:
            candidates = anomalies

        if topk < len(candidates):
            # Only a prefix is wanted, so a heap selection avoids a full sort.
            self.sorted_anomalies = list(heapq.nsmallest(topk, candidates))
        else:
            self.sorted_anomalies = sorted(candidates)
        return self.sorted_anomalies

    def rewrite_sorted_anomalies(self, output_path):
        """Write the sorted anomalies to ``MonitorConst.ANALYSE_JSON`` under ``output_path``."""
        check_file_or_directory_path(output_path, isdir=True)

        payload = AnomalyDataWriter.get_anomaly_dict(self.sorted_anomalies)
        logger.info(f"{MonitorConst.ANALYSE_JSON} is at {output_path}.")
        json_path = os.path.join(output_path, MonitorConst.ANALYSE_JSON)
        # An earlier result is removed first so the file holds only this run's data.
        if os.path.exists(json_path):
            logger.warning(f"The existing file will be deleted: {json_path}.")
            remove_path(json_path)
        save_json(json_path, payload, indent=1)
146
+
147
+
148
def _get_parse_args(argv=None):
    """
    Parse the command-line options of the anomaly analyse tool.

    Args:
        argv: optional list of argument strings to parse. When omitted,
            ``sys.argv[1:]`` is used, as before.

    Returns:
        argparse.Namespace with ``data_path_dir``, ``out_path``,
        ``top_k_number`` and ``step_list`` (the raw string, not yet parsed).
    """
    parser = argparse.ArgumentParser()
    # No default here: the option is required, so a default could never be used.
    parser.add_argument("-d", "--data_path", dest="data_path_dir", type=str,
                        help="<Required> The anomaly detect result directory: generated by the monitor tool.",
                        required=True,
                        )
    parser.add_argument("-o", "--out_path", dest="out_path", default="", type=str,
                        help="<optional> The analyse task result out path.",
                        required=False,
                        )
    parser.add_argument("-k", "--topk", dest="top_k_number", default=8, type=int,
                        help="<optional> Top K number of earliest anomalies.",
                        required=False,
                        )
    parser.add_argument("-s", "--step", dest="step_list", default="[]", type=str,
                        help="<optional> Analyse which steps.",
                        required=False,
                        )
    return parser.parse_args(sys.argv[1:] if argv is None else argv)
167
+
168
+
169
def _get_step_and_stop(args):
    """
    Validate and extract the step list and top-k number from parsed arguments.

    Args:
        args: object exposing ``step_list`` (a string holding a Python list
            literal) and ``top_k_number`` (an int).

    Returns:
        Tuple ``(step_list, top_k_number)``, where ``step_list`` is the list
        parsed from the string.

    Raises:
        Exception: if ``step_list`` is not a parsable list literal, or if
            ``top_k_number`` is not greater than 0.
    """
    try:
        step_list = ast.literal_eval(args.step_list)
        if not isinstance(step_list, list):
            raise ValueError(f"{args.step_list} is not a list.")
    # literal_eval can also raise TypeError (e.g. an unhashable dict key) and
    # MemoryError on malformed input; report them with the same message.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
        raise Exception("The step list must be a resolvable list type.") from e
    if args.top_k_number <= 0:
        raise Exception("The top k number must be greater than 0.")
    return step_list, args.top_k_number
179
+
180
+
181
def _anomaly_analyse():
    """
    Run the whole analysis: parse arguments, load anomalies, select the top k,
    write the result file and log each selected anomaly's message.
    """
    cli_args = _get_parse_args()
    steps, top_k_number = _get_step_and_stop(cli_args)

    collected = AnomalyDataLoader(cli_args.data_path_dir).get_anomalies_from_jsons()

    analyser = AnomalyAnalyse()
    top_anomalies = analyser.get_range_top_k(top_k_number, steps, collected)

    # Without an explicit output path, the result goes next to the input data.
    destination = cli_args.out_path or cli_args.data_path_dir
    analyser.rewrite_sorted_anomalies(destination)

    logger.info(f"Top {top_k_number} anomalies are listed as follows:")
    for index, anomaly in enumerate(top_anomalies):
        logger.info(f"{index}: {anomaly.message}")
197
+
198
+
199
# Script entry point: run the analysis only when executed directly, not on import.
if __name__ == "__main__":
    _anomaly_analyse()
    logger.info("Analyse task completed.")