mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/METADATA +4 -3
  2. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/RECORD +243 -191
  3. msprobe/README.md +57 -21
  4. msprobe/core/__init__.py +17 -0
  5. msprobe/core/common/const.py +224 -82
  6. msprobe/core/common/decorator.py +50 -0
  7. msprobe/core/common/exceptions.py +5 -3
  8. msprobe/core/common/file_utils.py +274 -40
  9. msprobe/core/common/framework_adapter.py +169 -0
  10. msprobe/core/common/global_lock.py +86 -0
  11. msprobe/core/common/runtime.py +25 -0
  12. msprobe/core/common/utils.py +148 -72
  13. msprobe/core/common_config.py +7 -0
  14. msprobe/core/compare/acc_compare.py +640 -462
  15. msprobe/core/compare/check.py +36 -107
  16. msprobe/core/compare/compare_cli.py +4 -0
  17. msprobe/core/compare/config.py +72 -0
  18. msprobe/core/compare/highlight.py +217 -215
  19. msprobe/core/compare/layer_mapping/layer_mapping.py +4 -1
  20. msprobe/core/compare/merge_result/merge_result.py +12 -6
  21. msprobe/core/compare/multiprocessing_compute.py +227 -107
  22. msprobe/core/compare/npy_compare.py +32 -16
  23. msprobe/core/compare/utils.py +218 -244
  24. msprobe/{mindspore/runtime.py → core/config_check/__init__.py} +2 -4
  25. msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
  26. msprobe/core/config_check/checkers/base_checker.py +60 -0
  27. msprobe/core/config_check/checkers/dataset_checker.py +138 -0
  28. msprobe/core/config_check/checkers/env_args_checker.py +96 -0
  29. msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
  30. msprobe/core/config_check/checkers/pip_checker.py +90 -0
  31. msprobe/core/config_check/checkers/random_checker.py +367 -0
  32. msprobe/core/config_check/checkers/weights_checker.py +147 -0
  33. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
  34. msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
  35. msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
  36. msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
  37. msprobe/core/config_check/config_check_cli.py +51 -0
  38. msprobe/core/config_check/config_checker.py +100 -0
  39. msprobe/{pytorch/parse.py → core/config_check/resource/dependency.yaml} +7 -4
  40. msprobe/core/config_check/resource/env.yaml +57 -0
  41. msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
  42. msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
  43. msprobe/core/config_check/utils/utils.py +107 -0
  44. msprobe/core/data_dump/api_registry.py +239 -0
  45. msprobe/core/data_dump/data_collector.py +36 -9
  46. msprobe/core/data_dump/data_processor/base.py +74 -53
  47. msprobe/core/data_dump/data_processor/mindspore_processor.py +119 -78
  48. msprobe/core/data_dump/data_processor/pytorch_processor.py +134 -96
  49. msprobe/core/data_dump/json_writer.py +146 -57
  50. msprobe/core/debugger/precision_debugger.py +143 -0
  51. msprobe/core/grad_probe/constant.py +2 -1
  52. msprobe/core/grad_probe/grad_compare.py +2 -2
  53. msprobe/core/grad_probe/utils.py +1 -1
  54. msprobe/core/hook_manager.py +242 -0
  55. msprobe/core/monitor/anomaly_processor.py +384 -0
  56. msprobe/core/overflow_check/abnormal_scene.py +2 -0
  57. msprobe/core/service.py +356 -0
  58. msprobe/core/single_save/__init__.py +0 -0
  59. msprobe/core/single_save/single_comparator.py +243 -0
  60. msprobe/core/single_save/single_saver.py +157 -0
  61. msprobe/docs/01.installation.md +6 -5
  62. msprobe/docs/02.config_introduction.md +89 -30
  63. msprobe/docs/03.config_examples.md +1 -0
  64. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  65. msprobe/docs/05.data_dump_PyTorch.md +184 -50
  66. msprobe/docs/06.data_dump_MindSpore.md +193 -28
  67. msprobe/docs/07.accuracy_checker_PyTorch.md +13 -3
  68. msprobe/docs/08.accuracy_checker_online_PyTorch.md +72 -10
  69. msprobe/docs/09.accuracy_checker_MindSpore.md +19 -7
  70. msprobe/docs/10.accuracy_compare_PyTorch.md +266 -102
  71. msprobe/docs/11.accuracy_compare_MindSpore.md +117 -43
  72. msprobe/docs/12.overflow_check_PyTorch.md +5 -3
  73. msprobe/docs/13.overflow_check_MindSpore.md +6 -4
  74. msprobe/docs/14.data_parse_PyTorch.md +4 -10
  75. msprobe/docs/17.grad_probe.md +2 -1
  76. msprobe/docs/18.online_dispatch.md +3 -3
  77. msprobe/docs/19.monitor.md +211 -103
  78. msprobe/docs/21.visualization_PyTorch.md +100 -28
  79. msprobe/docs/22.visualization_MindSpore.md +103 -31
  80. msprobe/docs/23.generate_operator_PyTorch.md +9 -9
  81. msprobe/docs/25.tool_function_introduction.md +23 -22
  82. msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
  83. msprobe/docs/27.dump_json_instruction.md +278 -8
  84. msprobe/docs/28.debugger_save_instruction.md +111 -20
  85. msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
  86. msprobe/docs/29.data_dump_MSAdapter.md +229 -0
  87. msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
  88. msprobe/docs/31.config_check.md +95 -0
  89. msprobe/docs/32.ckpt_compare.md +69 -0
  90. msprobe/docs/33.generate_operator_MindSpore.md +190 -0
  91. msprobe/docs/34.RL_collect.md +92 -0
  92. msprobe/docs/35.nan_analyze.md +72 -0
  93. msprobe/docs/FAQ.md +3 -11
  94. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
  95. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
  96. msprobe/docs/img/compare_result.png +0 -0
  97. msprobe/docs/img/merge_result.png +0 -0
  98. msprobe/docs/img/save_compare_result_sample.png +0 -0
  99. msprobe/docs/img/visualization/proxy.png +0 -0
  100. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  101. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  102. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  103. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  104. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  105. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  106. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  107. msprobe/mindspore/__init__.py +3 -3
  108. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -55
  109. msprobe/mindspore/api_accuracy_checker/api_runner.py +25 -11
  110. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
  111. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +580 -0
  112. msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
  113. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
  114. msprobe/mindspore/api_accuracy_checker/data_manager.py +4 -3
  115. msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +9 -0
  116. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +451 -0
  117. msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
  118. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +11 -1
  119. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
  120. msprobe/mindspore/cell_processor.py +204 -33
  121. msprobe/mindspore/code_mapping/graph_parser.py +4 -21
  122. msprobe/mindspore/common/const.py +73 -2
  123. msprobe/mindspore/common/utils.py +157 -29
  124. msprobe/mindspore/compare/common_dir_compare.py +382 -0
  125. msprobe/mindspore/compare/distributed_compare.py +2 -26
  126. msprobe/mindspore/compare/ms_compare.py +18 -398
  127. msprobe/mindspore/compare/ms_graph_compare.py +20 -10
  128. msprobe/mindspore/compare/utils.py +37 -0
  129. msprobe/mindspore/debugger/debugger_config.py +59 -7
  130. msprobe/mindspore/debugger/precision_debugger.py +83 -90
  131. msprobe/mindspore/dump/cell_dump_process.py +902 -0
  132. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +889 -0
  133. msprobe/mindspore/dump/dump_tool_factory.py +18 -8
  134. msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
  135. msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
  136. msprobe/mindspore/dump/hook_cell/api_register.py +176 -0
  137. msprobe/mindspore/dump/hook_cell/hook_cell.py +22 -12
  138. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
  139. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
  140. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +42 -26
  141. msprobe/mindspore/dump/jit_dump.py +35 -27
  142. msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
  143. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
  144. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -16
  145. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +22 -12
  146. msprobe/mindspore/free_benchmark/common/utils.py +1 -1
  147. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
  148. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
  149. msprobe/mindspore/grad_probe/global_context.py +9 -2
  150. msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
  151. msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
  152. msprobe/mindspore/grad_probe/hook.py +2 -4
  153. msprobe/mindspore/mindspore_service.py +111 -0
  154. msprobe/mindspore/monitor/common_func.py +52 -0
  155. msprobe/mindspore/monitor/data_writers.py +237 -0
  156. msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
  157. msprobe/mindspore/monitor/features.py +13 -1
  158. msprobe/mindspore/monitor/module_hook.py +568 -444
  159. msprobe/mindspore/monitor/optimizer_collect.py +331 -0
  160. msprobe/mindspore/monitor/utils.py +71 -9
  161. msprobe/mindspore/ms_config.py +16 -15
  162. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
  163. msprobe/mindspore/task_handler_factory.py +5 -2
  164. msprobe/msprobe.py +19 -0
  165. msprobe/nan_analyze/__init__.py +14 -0
  166. msprobe/nan_analyze/analyzer.py +255 -0
  167. msprobe/nan_analyze/graph.py +189 -0
  168. msprobe/nan_analyze/utils.py +211 -0
  169. msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
  170. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
  171. msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
  172. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +15 -13
  173. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +206 -4
  174. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +9 -9
  175. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +6 -5
  176. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +31 -9
  177. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -20
  178. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
  179. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
  180. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
  181. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
  182. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +154 -0
  183. msprobe/pytorch/attl_manager.py +65 -0
  184. msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
  185. msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
  186. msprobe/pytorch/common/utils.py +53 -19
  187. msprobe/pytorch/compare/distributed_compare.py +4 -36
  188. msprobe/pytorch/compare/pt_compare.py +13 -84
  189. msprobe/pytorch/compare/utils.py +47 -0
  190. msprobe/pytorch/debugger/debugger_config.py +34 -17
  191. msprobe/pytorch/debugger/precision_debugger.py +50 -96
  192. msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
  193. msprobe/pytorch/dump/module_dump/module_dump.py +15 -61
  194. msprobe/pytorch/dump/module_dump/module_processer.py +150 -114
  195. msprobe/pytorch/free_benchmark/common/utils.py +1 -1
  196. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
  197. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
  198. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
  199. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
  200. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
  201. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
  202. msprobe/pytorch/function_factory.py +1 -1
  203. msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
  204. msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
  205. msprobe/pytorch/hook_module/api_register.py +155 -0
  206. msprobe/pytorch/hook_module/hook_module.py +18 -22
  207. msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
  208. msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
  209. msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
  210. msprobe/pytorch/hook_module/support_wrap_ops.yaml +193 -75
  211. msprobe/pytorch/hook_module/utils.py +28 -2
  212. msprobe/pytorch/monitor/csv2tb.py +14 -4
  213. msprobe/pytorch/monitor/data_writers.py +259 -0
  214. msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
  215. msprobe/pytorch/monitor/module_hook.py +336 -241
  216. msprobe/pytorch/monitor/module_metric.py +17 -0
  217. msprobe/pytorch/monitor/optimizer_collect.py +244 -224
  218. msprobe/pytorch/monitor/utils.py +84 -4
  219. msprobe/pytorch/online_dispatch/compare.py +0 -2
  220. msprobe/pytorch/online_dispatch/dispatch.py +13 -2
  221. msprobe/pytorch/online_dispatch/dump_compare.py +8 -2
  222. msprobe/pytorch/online_dispatch/utils.py +3 -0
  223. msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
  224. msprobe/pytorch/parse_tool/lib/utils.py +5 -4
  225. msprobe/pytorch/pt_config.py +16 -11
  226. msprobe/pytorch/pytorch_service.py +70 -0
  227. msprobe/visualization/builder/graph_builder.py +69 -10
  228. msprobe/visualization/builder/msprobe_adapter.py +24 -12
  229. msprobe/visualization/compare/graph_comparator.py +63 -51
  230. msprobe/visualization/compare/mode_adapter.py +22 -20
  231. msprobe/visualization/graph/base_node.py +11 -4
  232. msprobe/visualization/graph/distributed_analyzer.py +1 -10
  233. msprobe/visualization/graph/graph.py +2 -13
  234. msprobe/visualization/graph/node_op.py +1 -2
  235. msprobe/visualization/graph_service.py +251 -104
  236. msprobe/visualization/utils.py +26 -44
  237. msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
  238. msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
  239. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -140
  240. msprobe/mindspore/monitor/anomaly_detect.py +0 -404
  241. msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
  242. msprobe/mindspore/service.py +0 -543
  243. msprobe/pytorch/hook_module/api_registry.py +0 -166
  244. msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
  245. msprobe/pytorch/hook_module/wrap_functional.py +0 -66
  246. msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
  247. msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
  248. msprobe/pytorch/hook_module/wrap_torch.py +0 -84
  249. msprobe/pytorch/hook_module/wrap_vf.py +0 -60
  250. msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
  251. msprobe/pytorch/monitor/anomaly_detect.py +0 -410
  252. msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
  253. msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
  254. msprobe/pytorch/service.py +0 -470
  255. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/LICENSE +0 -0
  256. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/WHEEL +0 -0
  257. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/entry_points.txt +0 -0
  258. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/top_level.txt +0 -0
  259. /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
  260. /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
  261. /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
msprobe/pytorch/hook_module/api_registry.py
@@ -1,166 +0,0 @@
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
- # All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import torch
- import torch.distributed as dist
-
- from msprobe.pytorch.hook_module import wrap_torch, wrap_functional, wrap_tensor, wrap_vf, wrap_distributed, wrap_aten
- from msprobe.pytorch.hook_module.wrap_aten import get_aten_ops
- from msprobe.pytorch.hook_module.wrap_distributed import get_distributed_ops
- from msprobe.pytorch.hook_module.wrap_functional import get_functional_ops
- from msprobe.pytorch.hook_module.wrap_tensor import get_tensor_ops
- from msprobe.pytorch.hook_module.wrap_torch import get_torch_ops
- from msprobe.pytorch.hook_module.wrap_vf import get_vf_ops
- from msprobe.pytorch.common.utils import torch_without_guard_version, npu_distributed_api, is_gpu
- from msprobe.core.common.const import Const
-
- torch_version_above_2 = torch.__version__.split('+')[0] > '2.0'
-
- if not is_gpu:
-     import torch_npu
-     from . import wrap_npu_custom
-     from .wrap_npu_custom import get_npu_ops
-
-
- class ApiRegistry:
-     def __init__(self):
-         self.tensor_ori_attr = {}
-         self.torch_ori_attr = {}
-         self.functional_ori_attr = {}
-         self.distributed_ori_attr = {}
-         self.npu_distributed_ori_attr = {}
-         self.vf_ori_attr = {}
-         self.aten_ori_attr = {}
-         self.torch_npu_ori_attr = {}
-
-         self.tensor_hook_attr = {}
-         self.torch_hook_attr = {}
-         self.functional_hook_attr = {}
-         self.distributed_hook_attr = {}
-         self.npu_distributed_hook_attr = {}
-         self.vf_hook_attr = {}
-         self.aten_hook_attr = {}
-         self.torch_npu_hook_attr = {}
-
-     @staticmethod
-     def store_ori_attr(ori_api_group, api_list, api_ori_attr):
-         for api in api_list:
-             if '.' in api:
-                 sub_module_name, sub_op = api.rsplit('.', 1)
-                 sub_module = getattr(ori_api_group, sub_module_name)
-                 api_ori_attr[api] = getattr(sub_module, sub_op)
-             else:
-                 api_ori_attr[api] = getattr(ori_api_group, api)
-
-     @staticmethod
-     def set_api_attr(api_group, attr_dict):
-         for api, api_attr in attr_dict.items():
-             if '.' in api:
-                 sub_module_name, sub_op = api.rsplit('.', 1)
-                 sub_module = getattr(api_group, sub_module_name, None)
-                 if sub_module is not None:
-                     setattr(sub_module, sub_op, api_attr)
-             else:
-                 setattr(api_group, api, api_attr)
-
-     def api_modularity(self):
-         self.set_api_attr(torch.Tensor, self.tensor_hook_attr)
-         self.set_api_attr(torch, self.torch_hook_attr)
-         self.set_api_attr(torch.nn.functional, self.functional_hook_attr)
-         self.set_api_attr(dist, self.distributed_hook_attr)
-         self.set_api_attr(dist.distributed_c10d, self.distributed_hook_attr)
-         if not is_gpu and not torch_without_guard_version:
-             self.set_api_attr(torch_npu.distributed, self.npu_distributed_hook_attr)
-             self.set_api_attr(torch_npu.distributed.distributed_c10d, self.npu_distributed_hook_attr)
-         if torch_version_above_2:
-             self.set_api_attr(torch.ops.aten, self.aten_hook_attr)
-         self.set_api_attr(torch._VF, self.vf_hook_attr)
-         if not is_gpu:
-             self.set_api_attr(torch_npu, self.torch_npu_hook_attr)
-
-     def api_originality(self):
-         self.set_api_attr(torch.Tensor, self.tensor_ori_attr)
-         self.set_api_attr(torch, self.torch_ori_attr)
-         self.set_api_attr(torch.nn.functional, self.functional_ori_attr)
-         self.set_api_attr(dist, self.distributed_ori_attr)
-         self.set_api_attr(dist.distributed_c10d, self.distributed_ori_attr)
-         if not is_gpu and not torch_without_guard_version:
-             self.set_api_attr(torch_npu.distributed, self.npu_distributed_ori_attr)
-             self.set_api_attr(torch_npu.distributed.distributed_c10d, self.npu_distributed_ori_attr)
-         if torch_version_above_2:
-             self.set_api_attr(torch.ops.aten, self.aten_ori_attr)
-         self.set_api_attr(torch._VF, self.vf_ori_attr)
-         if not is_gpu:
-             self.set_api_attr(torch_npu, self.torch_npu_ori_attr)
-
-     def initialize_hook(self, hook, online_run_ut=False):
-         """
-         initialize_hook
-         Args:
-             hook (_type_): initialize_hook
-             online_run_ut (bool): default False, whether online run_ut or not.
-             If online_run_ut is True, the hook will not wrap the aten ops.
-         """
-         self.store_ori_attr(torch.Tensor, get_tensor_ops(), self.tensor_ori_attr)
-         wrap_tensor.wrap_tensor_ops_and_bind(hook)
-         for attr_name in dir(wrap_tensor.HOOKTensor):
-             if attr_name.startswith(Const.ATTR_NAME_PREFIX):
-                 self.tensor_hook_attr[attr_name[5:]] = getattr(wrap_tensor.HOOKTensor, attr_name)
-
-         self.store_ori_attr(torch, get_torch_ops(), self.torch_ori_attr)
-         wrap_torch.wrap_torch_ops_and_bind(hook)
-         for attr_name in dir(wrap_torch.HOOKTorchOP):
-             if attr_name.startswith(Const.ATTR_NAME_PREFIX):
-                 self.torch_hook_attr[attr_name[5:]] = getattr(wrap_torch.HOOKTorchOP, attr_name)
-
-         self.store_ori_attr(torch.nn.functional, get_functional_ops(), self.functional_ori_attr)
-         wrap_functional.wrap_functional_ops_and_bind(hook)
-         for attr_name in dir(wrap_functional.HOOKFunctionalOP):
-             if attr_name.startswith(Const.ATTR_NAME_PREFIX):
-                 self.functional_hook_attr[attr_name[5:]] = getattr(wrap_functional.HOOKFunctionalOP, attr_name)
-
-         self.store_ori_attr(dist, get_distributed_ops(), self.distributed_ori_attr)
-         wrap_distributed.wrap_distributed_ops_and_bind(hook)
-         if not is_gpu and not torch_without_guard_version:
-             self.store_ori_attr(torch_npu.distributed, npu_distributed_api, self.npu_distributed_ori_attr)
-         for attr_name in dir(wrap_distributed.HOOKDistributedOP):
-             if attr_name.startswith(Const.ATTR_NAME_PREFIX):
-                 self.distributed_hook_attr[attr_name[5:]] = getattr(wrap_distributed.HOOKDistributedOP, attr_name)
-                 if not is_gpu and not torch_without_guard_version and attr_name[5:] in npu_distributed_api:
-                     self.npu_distributed_hook_attr[attr_name[5:]] = getattr(wrap_distributed.HOOKDistributedOP,
-                                                                             attr_name)
-
-         if torch_version_above_2 and not online_run_ut:
-             self.store_ori_attr(torch.ops.aten, get_aten_ops(), self.aten_ori_attr)
-             wrap_aten.wrap_aten_ops_and_bind(hook)
-             for attr_name in dir(wrap_aten.HOOKAtenOP):
-                 if attr_name.startswith(Const.ATTR_NAME_PREFIX):
-                     self.aten_hook_attr[attr_name[5:]] = getattr(wrap_aten.HOOKAtenOP, attr_name)
-
-         self.store_ori_attr(torch._VF, get_vf_ops(), self.vf_ori_attr)
-         wrap_vf.wrap_vf_ops_and_bind(hook)
-         for attr_name in dir(wrap_vf.HOOKVfOP):
-             if attr_name.startswith(Const.ATTR_NAME_PREFIX):
-                 self.vf_hook_attr[attr_name[5:]] = getattr(wrap_vf.HOOKVfOP, attr_name)
-
-         if not is_gpu:
-             self.store_ori_attr(torch_npu, get_npu_ops(), self.torch_npu_ori_attr)
-             wrap_npu_custom.wrap_npu_ops_and_bind(hook)
-             for attr_name in dir(wrap_npu_custom.HOOKNpuOP):
-                 if attr_name.startswith(Const.ATTR_NAME_PREFIX):
-                     self.torch_npu_hook_attr[attr_name[5:]] = getattr(wrap_npu_custom.HOOKNpuOP, attr_name)
-
-
- api_register = ApiRegistry()
msprobe/pytorch/hook_module/wrap_distributed.py
@@ -1,79 +0,0 @@
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
- # All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import os
- from functools import wraps
- import torch.distributed as dist
-
- from msprobe.pytorch.hook_module.hook_module import HOOKModule
- from msprobe.pytorch.common.utils import torch_device_guard
- from msprobe.core.common.const import Const
- from msprobe.core.common.file_utils import load_yaml
-
-
- cur_path = os.path.dirname(os.path.realpath(__file__))
- yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
-
-
- distributed_func = {}
- for f in dir(dist):
-     distributed_func[f] = getattr(dist, f)
-
-
- def get_distributed_ops():
-     _all_distributed_ops = dir(dist)
-     yaml_data = load_yaml(yaml_path)
-     wrap_distributed_ops = yaml_data.get('distributed')
-     return set(wrap_distributed_ops) & set(_all_distributed_ops)
-
-
- class HOOKDistributedOP(object):
-     pass
-
-
- class DistributedOPTemplate(HOOKModule):
-     def __init__(self, op_name, build_hook):
-         self.op_name_ = op_name
-         self.prefix_op_name_ = "Distributed" + Const.SEP + str(op_name) + Const.SEP
-         super().__init__(build_hook)
-         if not self.stop_hook:
-             self.op_is_distributed = True
-
-     @torch_device_guard
-     def forward(self, *args, **kwargs):
-         handle = distributed_func.get(self.op_name_)(*args, **kwargs)
-         if kwargs.get("async_op") or self.op_name_ in ["isend", "irecv"]:
-             if handle and hasattr(handle, 'wait'):
-                 handle.wait()
-         if self.op_name_ == "batch_isend_irecv":
-             if isinstance(handle, list):
-                 for req in handle:
-                     req.wait()
-         return handle
-
-
- def wrap_distributed_op(op_name, hook):
-     @wraps(DistributedOPTemplate)
-     def distributed_op_template(*args, **kwargs):
-         return DistributedOPTemplate(op_name, hook)(*args, **kwargs)
-
-     distributed_op_template.__name__ = op_name
-     return distributed_op_template
-
-
- def wrap_distributed_ops_and_bind(hook):
-     _distributed_ops = get_distributed_ops()
-     for op_name in _distributed_ops:
-         setattr(HOOKDistributedOP, "wrap_" + str(op_name), wrap_distributed_op(op_name, hook))
msprobe/pytorch/hook_module/wrap_functional.py
@@ -1,66 +0,0 @@
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
- # All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import os
- import torch
-
- from msprobe.pytorch.hook_module.hook_module import HOOKModule
- from msprobe.pytorch.common.utils import torch_device_guard
- from msprobe.core.common.const import Const
- from msprobe.pytorch.common.log import logger
- from msprobe.core.common.file_utils import load_yaml
-
-
- cur_path = os.path.dirname(os.path.realpath(__file__))
- yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
-
-
- def get_functional_ops():
-     yaml_data = load_yaml(yaml_path)
-     wrap_functional_ops = yaml_data.get('functional')
-     _all_functional_ops = dir(torch.nn.functional)
-     return set(wrap_functional_ops) & set(_all_functional_ops)
-
-
- TorchFunctions = {func: getattr(torch.nn.functional, func) for func in get_functional_ops()}
-
-
- class HOOKFunctionalOP(object):
-     pass
-
-
- class FunctionalOPTemplate(HOOKModule):
-     def __init__(self, op_name, hook, need_hook=True):
-         self.op_name_ = op_name
-         self.prefix_op_name_ = "Functional" + Const.SEP + str(op_name) + Const.SEP
-         if need_hook:
-             super().__init__(hook)
-
-     @torch_device_guard
-     def forward(self, *args, **kwargs):
-         return TorchFunctions[str(self.op_name_)](*args, **kwargs)
-
-
- def wrap_functional_op(op_name, hook):
-     def functional_op_template(*args, **kwargs):
-         return FunctionalOPTemplate(op_name, hook)(*args, **kwargs)
-
-     return functional_op_template
-
-
- def wrap_functional_ops_and_bind(hook):
-     _functional_ops = get_functional_ops()
-     for op_name in _functional_ops:
-         setattr(HOOKFunctionalOP, "wrap_" + op_name, wrap_functional_op(op_name, hook))
msprobe/pytorch/hook_module/wrap_npu_custom.py
@@ -1,85 +0,0 @@
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
- # All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import os
- import torch
-
- from msprobe.pytorch.hook_module.hook_module import HOOKModule
- from msprobe.pytorch.common.utils import torch_device_guard, torch_without_guard_version
- from msprobe.core.common.const import Const
- from msprobe.core.common.log import logger
- from msprobe.core.common.file_utils import load_yaml
- from msprobe.pytorch.function_factory import npu_custom_functions
-
- try:
-     import torch_npu
- except ImportError:
-     logger.info("Failing to import torch_npu.")
-
-
- cur_path = os.path.dirname(os.path.realpath(__file__))
- yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
- cuda_func_mapping = {"npu_fusion_attention" : "gpu_fusion_attention"}
-
-
- def get_npu_ops():
-     if torch_without_guard_version:
-         _npu_ops = dir(torch.ops.npu)
-     else:
-         _npu_ops = dir(torch_npu._C._VariableFunctionsClass)
-     yaml_data = load_yaml(yaml_path)
-     wrap_npu_ops = yaml_data.get('torch_npu')
-     return set(wrap_npu_ops) & set(_npu_ops)
-
-
- class HOOKNpuOP(object):
-     pass
-
-
- class NpuOPTemplate(HOOKModule):
-
-     def __init__(self, op_name, hook, need_hook=True, device=Const.CPU_LOWERCASE):
-         self.op_name_ = op_name
-         self.prefix_op_name_ = "NPU" + Const.SEP + str(op_name) + Const.SEP
-         self.need_hook = need_hook
-         self.device = device
-         if need_hook:
-             super().__init__(hook)
-
-     @torch_device_guard
-     def forward(self, *args, **kwargs):
-         if not self.need_hook:
-             if self.op_name_ not in npu_custom_functions:
-                 raise Exception(f'There is not bench function {self.op_name_}')
-             if self.device == Const.CUDA_LOWERCASE:
-                 self.op_name_ = cuda_func_mapping.get(self.op_name_, self.op_name_)
-             if self.device in [Const.CUDA_LOWERCASE, Const.CPU_LOWERCASE]:
-                 return npu_custom_functions[self.op_name_](*args, **kwargs)
-         if torch_without_guard_version:
-             return getattr(torch.ops.npu, str(self.op_name_))(*args, **kwargs)
-         else:
-             return getattr(torch_npu._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs)
-
-
- def wrap_npu_op(op_name, hook):
-     def npu_op_template(*args, **kwargs):
-         return NpuOPTemplate(op_name, hook)(*args, **kwargs)
-     return npu_op_template
-
-
- def wrap_npu_ops_and_bind(hook):
-     _npu_ops = get_npu_ops()
-     for op_name in _npu_ops:
-         setattr(HOOKNpuOP, "wrap_" + str(op_name), wrap_npu_op(op_name, hook))
msprobe/pytorch/hook_module/wrap_tensor.py
@@ -1,69 +0,0 @@
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
- # All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import os
-
- import torch
-
- from msprobe.pytorch.hook_module.hook_module import HOOKModule
- from msprobe.pytorch.common.utils import torch_device_guard, parameter_adapter
- from msprobe.core.common.const import Const
- from msprobe.core.common.file_utils import load_yaml
-
-
- cur_path = os.path.dirname(os.path.realpath(__file__))
- yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
-
-
- def get_tensor_ops():
-     _tensor_ops = dir(torch.Tensor)
-     yaml_data = load_yaml(yaml_path)
-     wrap_tensor_ops = yaml_data.get('tensor')
-     return set(wrap_tensor_ops) & set(_tensor_ops)
-
-
- TensorOps = {op: getattr(torch.Tensor, op) for op in get_tensor_ops()}
-
-
- class HOOKTensor(object):
-     pass
-
-
- class TensorOPTemplate(HOOKModule):
-
-     def __init__(self, op_name, hook, need_hook=True):
-         self.op_name_ = op_name
-         self.prefix_op_name_ = "Tensor" + Const.SEP + str(op_name) + Const.SEP
-         if need_hook:
-             super().__init__(hook)
-
-     @torch_device_guard
-     @parameter_adapter
-     def forward(self, *args, **kwargs):
-         return TensorOps[str(self.op_name_)](*args, **kwargs)
-
-
- def wrap_tensor_op(op_name, hook):
-
-     def tensor_op_template(*args, **kwargs):
-         return TensorOPTemplate(op_name, hook)(*args, **kwargs)
-
-     return tensor_op_template
-
-
- def wrap_tensor_ops_and_bind(hook):
-     _tensor_ops = get_tensor_ops()
-     for op_name in _tensor_ops:
-         setattr(HOOKTensor, "wrap_" + str(op_name), wrap_tensor_op(op_name, hook))
msprobe/pytorch/hook_module/wrap_torch.py
@@ -1,84 +0,0 @@
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
- # All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import os
- import torch
-
- from msprobe.pytorch.hook_module.hook_module import HOOKModule
- from msprobe.pytorch.common.utils import torch_device_guard
- from msprobe.core.common.const import Const
- from msprobe.core.common.file_utils import load_yaml
-
-
- cur_path = os.path.dirname(os.path.realpath(__file__))
- yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
-
-
- def get_torch_ops():
-     _torch_ops = []
-     yaml_data = load_yaml(yaml_path)
-     wrap_torch_ops = yaml_data.get('torch')
-     for operation in wrap_torch_ops:
-         if '.' in operation:
-             operation_sub_module_name, operation_sub_op = operation.rsplit('.', 1)
-             operation_sub_module = getattr(torch, operation_sub_module_name)
-             if operation_sub_op in dir(operation_sub_module):
-                 _torch_ops.append(operation)
-         else:
-             if hasattr(torch, operation):
-                 _torch_ops.append(operation)
-     return set(_torch_ops)
-
-
- TorchOps = {}
- for op in get_torch_ops():
-     if '.' in op:
-         sub_module_name, sub_op = op.rsplit('.', 1)
-         sub_module = getattr(torch, sub_module_name)
-         TorchOps[op] = getattr(sub_module, sub_op)
-     else:
-         TorchOps[op] = getattr(torch, op)
-
-
-
- class HOOKTorchOP(object):
-     pass
-
-
- class TorchOPTemplate(HOOKModule):
-
-     def __init__(self, op_name, hook, need_hook=True):
-         self.op_name_ = op_name
-         self.prefix_op_name_ = "Torch" + Const.SEP + str(op_name) + Const.SEP
-         if need_hook:
-             super().__init__(hook)
-
-     @torch_device_guard
-     def forward(self, *args, **kwargs):
-         return TorchOps[str(self.op_name_)](*args, **kwargs)
-
-
- def wrap_torch_op(op_name, hook):
-
-     def torch_op_template(*args, **kwargs):
-         return TorchOPTemplate(op_name, hook)(*args, **kwargs)
-
-     return torch_op_template
-
-
- def wrap_torch_ops_and_bind(hook):
-     _torch_ops = get_torch_ops()
-     for op_name in _torch_ops:
-         setattr(HOOKTorchOP, "wrap_" + op_name, wrap_torch_op(op_name, hook))
msprobe/pytorch/hook_module/wrap_vf.py
@@ -1,60 +0,0 @@
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
- # All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import os
- import torch
-
- from msprobe.core.common.const import Const
- from msprobe.core.common.file_utils import load_yaml
- from msprobe.pytorch.hook_module.hook_module import HOOKModule
- from msprobe.pytorch.common.utils import torch_device_guard
-
-
- cur_path = os.path.dirname(os.path.realpath(__file__))
- yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
-
-
- def get_vf_ops():
-     yaml_data = load_yaml(yaml_path)
-     wrap_vf_ops = yaml_data.get('_VF')
-     return wrap_vf_ops
-
-
- class HOOKVfOP(object):
-     pass
-
-
- class VfOPTemplate(HOOKModule):
-     def __init__(self, op_name, hook):
-         self.op_name_ = op_name
-         self.prefix_op_name_ = "VF" + Const.SEP + str(op_name) + Const.SEP
-         super().__init__(hook)
-
-     @torch_device_guard
-     def forward(self, *args, **kwargs):
-         return getattr(torch._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs)
-
-
- def wrap_vf_op(op_name, hook):
-     def vf_op_template(*args, **kwargs):
-         return VfOPTemplate(op_name, hook)(*args, **kwargs)
-
-     return vf_op_template
-
-
- def wrap_vf_ops_and_bind(hook):
-     _vf_ops = get_vf_ops()
-     for op_name in _vf_ops:
-         setattr(HOOKVfOP, "wrap_" + op_name, wrap_vf_op(op_name, hook))