mindstudio-probe 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/METADATA +2 -2
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/RECORD +90 -79
- msprobe/README.md +7 -5
- msprobe/core/common/const.py +6 -0
- msprobe/core/common/db_manager.py +35 -4
- msprobe/core/common/file_utils.py +105 -27
- msprobe/core/common/framework_adapter.py +7 -6
- msprobe/core/common/megatron_utils.py +59 -0
- msprobe/core/common/utils.py +14 -3
- msprobe/core/compare/find_first/analyzer.py +8 -7
- msprobe/core/compare/find_first/graph.py +11 -3
- msprobe/core/compare/find_first/utils.py +2 -1
- msprobe/core/compare/highlight.py +13 -6
- msprobe/core/compare/multiprocessing_compute.py +17 -10
- msprobe/core/compare/utils.py +14 -5
- msprobe/core/data_dump/data_collector.py +18 -21
- msprobe/core/data_dump/data_processor/pytorch_processor.py +43 -20
- msprobe/core/data_dump/json_writer.py +18 -8
- msprobe/core/data_dump/scope.py +4 -6
- msprobe/core/hook_manager.py +37 -3
- msprobe/core/service.py +18 -5
- msprobe/core/single_save/single_comparator.py +16 -3
- msprobe/docs/01.installation.md +7 -5
- msprobe/docs/02.config_introduction.md +14 -1
- msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
- msprobe/docs/06.data_dump_MindSpore.md +1 -1
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +295 -0
- msprobe/docs/10.accuracy_compare_PyTorch.md +46 -5
- msprobe/docs/14.data_parse_PyTorch.md +1 -1
- msprobe/docs/15.free_benchmarking_PyTorch.md +1 -1
- msprobe/docs/19.monitor.md +2 -0
- msprobe/docs/21.visualization_PyTorch.md +15 -80
- msprobe/docs/22.visualization_MindSpore.md +20 -104
- msprobe/docs/23.generate_operator_PyTorch.md +1 -1
- msprobe/docs/25.tool_function_introduction.md +1 -0
- msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
- msprobe/mindspore/cell_processor.py +33 -5
- msprobe/mindspore/compare/common_dir_compare.py +22 -26
- msprobe/mindspore/compare/utils.py +1 -2
- msprobe/mindspore/debugger/precision_debugger.py +1 -1
- msprobe/mindspore/dump/cell_dump_process.py +73 -62
- msprobe/mindspore/dump/graph_mode_cell_dump.py +21 -10
- msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +2 -0
- msprobe/msprobe.py +6 -4
- msprobe/pytorch/api_accuracy_checker/common/config.py +36 -3
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +24 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +12 -2
- msprobe/pytorch/api_accuracy_checker/config.yaml +6 -1
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +132 -12
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +205 -0
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +378 -0
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +239 -0
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +115 -0
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +250 -0
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +198 -0
- msprobe/pytorch/attl_manager.py +65 -0
- msprobe/pytorch/common/utils.py +22 -2
- msprobe/pytorch/compare/utils.py +3 -3
- msprobe/pytorch/debugger/debugger_config.py +10 -0
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +34 -7
- msprobe/pytorch/dump/module_dump/module_processer.py +23 -10
- msprobe/pytorch/hook_module/api_register.py +6 -1
- msprobe/pytorch/monitor/module_hook.py +28 -9
- msprobe/pytorch/online_dispatch/dispatch.py +42 -24
- msprobe/pytorch/pt_config.py +57 -2
- msprobe/pytorch/pytorch_service.py +11 -2
- msprobe/visualization/builder/graph_builder.py +170 -64
- msprobe/visualization/builder/graph_merger.py +0 -1
- msprobe/visualization/builder/msprobe_adapter.py +1 -1
- msprobe/visualization/db_utils.py +25 -2
- msprobe/visualization/graph/base_node.py +0 -24
- msprobe/visualization/graph/graph.py +5 -14
- msprobe/visualization/graph_service.py +29 -53
- msprobe/visualization/utils.py +11 -1
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/LICENSE +0 -0
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/WHEEL +0 -0
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/top_level.txt +0 -0
|
@@ -12,29 +12,31 @@
|
|
|
12
12
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
13
|
# See the License for the specific language governing permissions and
|
|
14
14
|
# limitations under the License.
|
|
15
|
+
|
|
15
16
|
import atexit
|
|
16
17
|
import csv
|
|
17
18
|
import fcntl
|
|
18
19
|
import io
|
|
20
|
+
import json
|
|
21
|
+
import multiprocessing
|
|
19
22
|
import os
|
|
20
23
|
import pickle
|
|
21
|
-
from multiprocessing import shared_memory
|
|
22
|
-
import stat
|
|
23
|
-
import json
|
|
24
24
|
import re
|
|
25
25
|
import shutil
|
|
26
|
+
import stat
|
|
26
27
|
import sys
|
|
27
28
|
import zipfile
|
|
28
|
-
import
|
|
29
|
-
|
|
29
|
+
from multiprocessing import shared_memory
|
|
30
|
+
|
|
30
31
|
import numpy as np
|
|
31
32
|
import pandas as pd
|
|
33
|
+
import yaml
|
|
32
34
|
|
|
35
|
+
from msprobe.core.common.const import FileCheckConst, CompareConst, Const
|
|
33
36
|
from msprobe.core.common.decorator import recursion_depth_decorator
|
|
34
|
-
from msprobe.core.common.log import logger
|
|
35
37
|
from msprobe.core.common.exceptions import FileCheckException
|
|
36
|
-
from msprobe.core.common.const import FileCheckConst, CompareConst
|
|
37
38
|
from msprobe.core.common.global_lock import global_lock, is_main_process
|
|
39
|
+
from msprobe.core.common.log import logger
|
|
38
40
|
|
|
39
41
|
proc_lock = multiprocessing.Lock()
|
|
40
42
|
|
|
@@ -46,16 +48,15 @@ class FileChecker:
|
|
|
46
48
|
Attributes:
|
|
47
49
|
file_path: The file or dictionary path to be verified.
|
|
48
50
|
path_type: file or dictionary
|
|
49
|
-
ability(str):
|
|
51
|
+
ability(str): one of [FileCheckConst.READ_ABLE, FileCheckConst.WRITE_ABLE, FileCheckConst.READ_WRITE_ABLE]
|
|
50
52
|
file_type(str): The correct file type for file
|
|
51
53
|
"""
|
|
52
54
|
|
|
53
|
-
def __init__(self, file_path, path_type, ability=None, file_type=None
|
|
55
|
+
def __init__(self, file_path, path_type, ability=None, file_type=None):
|
|
54
56
|
self.file_path = file_path
|
|
55
57
|
self.path_type = self._check_path_type(path_type)
|
|
56
|
-
self.ability = ability
|
|
58
|
+
self.ability = self._check_ability_type(ability)
|
|
57
59
|
self.file_type = file_type
|
|
58
|
-
self.is_script = is_script
|
|
59
60
|
|
|
60
61
|
@staticmethod
|
|
61
62
|
def _check_path_type(path_type):
|
|
@@ -64,9 +65,17 @@ class FileChecker:
|
|
|
64
65
|
raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR)
|
|
65
66
|
return path_type
|
|
66
67
|
|
|
68
|
+
@staticmethod
|
|
69
|
+
def _check_ability_type(ability):
|
|
70
|
+
ability_list = [FileCheckConst.READ_ABLE, FileCheckConst.WRITE_ABLE, FileCheckConst.READ_WRITE_ABLE]
|
|
71
|
+
if ability and ability not in ability_list:
|
|
72
|
+
logger.error(f'The ability must be one of {ability_list}.')
|
|
73
|
+
raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR)
|
|
74
|
+
return ability
|
|
75
|
+
|
|
67
76
|
def common_check(self):
|
|
68
77
|
"""
|
|
69
|
-
|
|
78
|
+
功能:基本文件权限校验,包括文件存在性、软连接、文件长度、文件类型、文件读写权限、文件属组、文件路径特殊字符、文件后缀等
|
|
70
79
|
注意:文件后缀的合法性,非通用操作,可使用其他独立接口实现
|
|
71
80
|
"""
|
|
72
81
|
check_path_exists(self.file_path)
|
|
@@ -75,13 +84,13 @@ class FileChecker:
|
|
|
75
84
|
check_path_length(self.file_path)
|
|
76
85
|
check_path_type(self.file_path, self.path_type)
|
|
77
86
|
self.check_path_ability()
|
|
78
|
-
|
|
79
|
-
check_path_owner_consistent(self.file_path)
|
|
87
|
+
check_path_owner_consistent(self.file_path)
|
|
80
88
|
check_path_pattern_valid(self.file_path)
|
|
81
89
|
check_common_file_size(self.file_path)
|
|
82
90
|
check_file_suffix(self.file_path, self.file_type)
|
|
91
|
+
check_path_no_others_write(self.file_path)
|
|
83
92
|
if self.path_type == FileCheckConst.FILE:
|
|
84
|
-
|
|
93
|
+
check_dirpath_permission(self.file_path)
|
|
85
94
|
return self.file_path
|
|
86
95
|
|
|
87
96
|
def check_path_ability(self):
|
|
@@ -137,7 +146,8 @@ class FileOpen:
|
|
|
137
146
|
check_path_pattern_valid(self.file_path)
|
|
138
147
|
if os.path.exists(self.file_path):
|
|
139
148
|
check_common_file_size(self.file_path)
|
|
140
|
-
|
|
149
|
+
check_path_no_others_write(self.file_path)
|
|
150
|
+
check_dirpath_permission(self.file_path)
|
|
141
151
|
|
|
142
152
|
def check_ability_and_owner(self):
|
|
143
153
|
if self.mode in self.SUPPORT_READ_MODE:
|
|
@@ -172,7 +182,7 @@ def check_path_exists(path):
|
|
|
172
182
|
if not os.path.exists(path):
|
|
173
183
|
logger.error('The file path %s does not exist.' % path)
|
|
174
184
|
raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR)
|
|
175
|
-
|
|
185
|
+
|
|
176
186
|
|
|
177
187
|
def check_path_not_exists(path):
|
|
178
188
|
if os.path.exists(path):
|
|
@@ -256,12 +266,15 @@ def check_path_type(file_path, file_type):
|
|
|
256
266
|
raise FileCheckException(FileCheckException.INVALID_FILE_ERROR)
|
|
257
267
|
|
|
258
268
|
|
|
259
|
-
def
|
|
260
|
-
|
|
261
|
-
is_writable = (
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
269
|
+
def check_group_writable(file_path):
|
|
270
|
+
path_stat = os.stat(file_path)
|
|
271
|
+
is_writable = bool(path_stat.st_mode & stat.S_IWGRP)
|
|
272
|
+
return is_writable
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def check_others_writable(file_path):
|
|
276
|
+
path_stat = os.stat(file_path)
|
|
277
|
+
is_writable = bool(path_stat.st_mode & stat.S_IWOTH)
|
|
265
278
|
return is_writable
|
|
266
279
|
|
|
267
280
|
|
|
@@ -309,7 +322,7 @@ def check_path_before_create(path):
|
|
|
309
322
|
'The file path {} contains special characters.'.format(path))
|
|
310
323
|
|
|
311
324
|
|
|
312
|
-
def
|
|
325
|
+
def check_dirpath_permission(path):
|
|
313
326
|
path = os.path.realpath(path)
|
|
314
327
|
dirpath = os.path.dirname(path)
|
|
315
328
|
if dedup_log('check_dirpath_before_read', dirpath):
|
|
@@ -319,15 +332,16 @@ def check_dirpath_before_read(path):
|
|
|
319
332
|
check_path_owner_consistent(dirpath)
|
|
320
333
|
except FileCheckException:
|
|
321
334
|
logger.warning(f"The directory {dirpath} is not yours.")
|
|
322
|
-
|
|
323
335
|
|
|
324
|
-
|
|
336
|
+
|
|
337
|
+
def check_file_or_directory_path(path, isdir=False, is_strict=False):
|
|
325
338
|
"""
|
|
326
339
|
Function Description:
|
|
327
340
|
check whether the path is valid
|
|
328
341
|
Parameter:
|
|
329
342
|
path: the path to check
|
|
330
343
|
isdir: the path is dir or file
|
|
344
|
+
is_strict: whether to perform stricter validation (e.g., verify group cannot write to path)
|
|
331
345
|
Exception Description:
|
|
332
346
|
when invalid data throw exception
|
|
333
347
|
"""
|
|
@@ -337,6 +351,33 @@ def check_file_or_directory_path(path, isdir=False):
|
|
|
337
351
|
path_checker = FileChecker(path, FileCheckConst.FILE, FileCheckConst.READ_ABLE)
|
|
338
352
|
path_checker.common_check()
|
|
339
353
|
|
|
354
|
+
if is_strict:
|
|
355
|
+
if check_group_writable(path):
|
|
356
|
+
raise FileCheckException(
|
|
357
|
+
FileCheckException.FILE_PERMISSION_ERROR,
|
|
358
|
+
f"The directory/file must not allow write access to group. Directory/File path: {path}"
|
|
359
|
+
)
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def check_path_no_others_write(file_path):
|
|
363
|
+
if dedup_log('check_path_no_others_write', file_path):
|
|
364
|
+
if check_group_writable(file_path):
|
|
365
|
+
logger.warning(f"The directory/file path is writable by group: {file_path}.")
|
|
366
|
+
|
|
367
|
+
if check_others_writable(file_path):
|
|
368
|
+
raise FileCheckException(
|
|
369
|
+
FileCheckException.FILE_PERMISSION_ERROR,
|
|
370
|
+
f"The directory/file must not allow write access to others. Directory/File path: {file_path}"
|
|
371
|
+
)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def check_path_no_group_others_write(file_path):
|
|
375
|
+
if check_group_writable(file_path) or check_others_writable(file_path):
|
|
376
|
+
raise FileCheckException(
|
|
377
|
+
FileCheckException.FILE_PERMISSION_ERROR,
|
|
378
|
+
f"The directory/file must not allow write access to group or others. Directory/File path: {file_path}"
|
|
379
|
+
)
|
|
380
|
+
|
|
340
381
|
|
|
341
382
|
def change_mode(path, mode):
|
|
342
383
|
if not os.path.exists(path) or os.path.islink(path):
|
|
@@ -388,6 +429,14 @@ def check_file_type(path):
|
|
|
388
429
|
raise FileCheckException(FileCheckException.INVALID_FILE_ERROR)
|
|
389
430
|
|
|
390
431
|
|
|
432
|
+
def root_privilege_warning():
|
|
433
|
+
if os.getuid() == 0:
|
|
434
|
+
logger.warning(
|
|
435
|
+
"msprobe is being run as root. "
|
|
436
|
+
"To avoid security risks, it is recommended to switch to a regular user to run it."
|
|
437
|
+
)
|
|
438
|
+
|
|
439
|
+
|
|
391
440
|
def load_yaml(yaml_path):
|
|
392
441
|
path_checker = FileChecker(yaml_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.YAML_SUFFIX)
|
|
393
442
|
checked_path = path_checker.common_check()
|
|
@@ -422,6 +471,26 @@ def load_json(json_path):
|
|
|
422
471
|
return data
|
|
423
472
|
|
|
424
473
|
|
|
474
|
+
def load_construct_json(json_path):
|
|
475
|
+
construct_dict_o = load_json(json_path)
|
|
476
|
+
if Const.MEGATRON_MICRO_STEP_NUMBER in construct_dict_o:
|
|
477
|
+
construct_dict = {}
|
|
478
|
+
micro_step_dict = {Const.MEGATRON_MICRO_STEP_NUMBER: construct_dict_o.get(Const.MEGATRON_MICRO_STEP_NUMBER)}
|
|
479
|
+
del construct_dict_o[Const.MEGATRON_MICRO_STEP_NUMBER]
|
|
480
|
+
for key, value in construct_dict_o.items():
|
|
481
|
+
if isinstance(value, list):
|
|
482
|
+
if len(value) != 2:
|
|
483
|
+
logger.error(f'Parse construct json file "{os.path.basename(json_path)}" failed.')
|
|
484
|
+
raise RuntimeError()
|
|
485
|
+
construct_dict[key] = value[0]
|
|
486
|
+
micro_step_dict[key] = value[1]
|
|
487
|
+
else:
|
|
488
|
+
construct_dict[key] = value
|
|
489
|
+
micro_step_dict[key] = 0
|
|
490
|
+
return construct_dict, micro_step_dict
|
|
491
|
+
return construct_dict_o, {}
|
|
492
|
+
|
|
493
|
+
|
|
425
494
|
def save_json(json_path, data, indent=None, mode="w"):
|
|
426
495
|
check_path_before_create(json_path)
|
|
427
496
|
json_path = os.path.realpath(json_path)
|
|
@@ -520,6 +589,9 @@ def move_directory(src_path, dst_path):
|
|
|
520
589
|
check_file_or_directory_path(src_path, isdir=True)
|
|
521
590
|
check_path_before_create(dst_path)
|
|
522
591
|
try:
|
|
592
|
+
if os.path.exists(dst_path):
|
|
593
|
+
logger.warning(f"The destination directory {dst_path} already exists, it will be removed.")
|
|
594
|
+
shutil.rmtree(dst_path)
|
|
523
595
|
shutil.move(src_path, dst_path)
|
|
524
596
|
except Exception as e:
|
|
525
597
|
logger.error(f"move directory {src_path} to {dst_path} failed")
|
|
@@ -945,7 +1017,13 @@ class SharedDict:
|
|
|
945
1017
|
def _safe_load(self):
|
|
946
1018
|
with io.BytesIO(self._shm.buf[:]) as buff:
|
|
947
1019
|
try:
|
|
948
|
-
|
|
1020
|
+
data = SafeUnpickler(buff).load()
|
|
1021
|
+
if not isinstance(data, dict):
|
|
1022
|
+
logger.debug(f"Data from shared memory is '{type(data)}' type, expected 'dict'.")
|
|
1023
|
+
self._dict = {}
|
|
1024
|
+
self._changed = True
|
|
1025
|
+
else:
|
|
1026
|
+
self._dict = data
|
|
949
1027
|
except Exception as e:
|
|
950
1028
|
logger.debug(f'shared dict is unreadable, reason: {e}, create new dict.')
|
|
951
1029
|
self._dict = {}
|
|
@@ -12,10 +12,11 @@
|
|
|
12
12
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
13
|
# See the License for the specific language governing permissions and
|
|
14
14
|
# limitations under the License.import functools
|
|
15
|
+
|
|
15
16
|
import functools
|
|
17
|
+
|
|
16
18
|
from msprobe.core.common.const import Const
|
|
17
|
-
from msprobe.core.common.file_utils import check_file_or_directory_path
|
|
18
|
-
from msprobe.core.common.file_utils import save_npy
|
|
19
|
+
from msprobe.core.common.file_utils import check_file_or_directory_path, save_npy
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class FrameworkDescriptor:
|
|
@@ -103,7 +104,7 @@ class FmkAdp:
|
|
|
103
104
|
@classmethod
|
|
104
105
|
def tensor_norm(cls, tensor):
|
|
105
106
|
return cls.process_tensor(tensor, lambda x: x.norm())
|
|
106
|
-
|
|
107
|
+
|
|
107
108
|
@classmethod
|
|
108
109
|
def save_tensor(cls, tensor, filepath):
|
|
109
110
|
if cls.fmk == Const.PT_FRAMEWORK:
|
|
@@ -151,7 +152,7 @@ class FmkAdp:
|
|
|
151
152
|
|
|
152
153
|
@classmethod
|
|
153
154
|
def load_checkpoint(cls, path, to_cpu=True, weights_only=True):
|
|
154
|
-
check_file_or_directory_path(path)
|
|
155
|
+
check_file_or_directory_path(path, is_strict=not weights_only)
|
|
155
156
|
if cls.fmk == Const.PT_FRAMEWORK:
|
|
156
157
|
try:
|
|
157
158
|
if to_cpu:
|
|
@@ -161,9 +162,9 @@ class FmkAdp:
|
|
|
161
162
|
except Exception as e:
|
|
162
163
|
raise RuntimeError(f"load pt file {path} failed: {e}") from e
|
|
163
164
|
return mindspore.load_checkpoint(path)
|
|
164
|
-
|
|
165
|
+
|
|
165
166
|
@classmethod
|
|
166
167
|
def asnumpy(cls, tensor):
|
|
167
168
|
if cls.fmk == Const.PT_FRAMEWORK:
|
|
168
169
|
return tensor.float().numpy()
|
|
169
|
-
return tensor.float().asnumpy()
|
|
170
|
+
return tensor.float().asnumpy()
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
from functools import wraps
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class MegatronStepInfo:
|
|
20
|
+
is_megatron = False
|
|
21
|
+
is_forward = False
|
|
22
|
+
is_backward = False
|
|
23
|
+
forward_micro_step = -1
|
|
24
|
+
backward_micro_step = -1
|
|
25
|
+
|
|
26
|
+
@classmethod
|
|
27
|
+
def reset(cls):
|
|
28
|
+
"""重置所有类属性到初始状态"""
|
|
29
|
+
cls.is_megatron = False
|
|
30
|
+
cls.is_forward = False
|
|
31
|
+
cls.is_backward = False
|
|
32
|
+
cls.forward_micro_step = -1
|
|
33
|
+
cls.backward_micro_step = -1
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def wrap_megatron_step(func, is_forward=True):
|
|
37
|
+
@wraps(func)
|
|
38
|
+
def wrapped_func(*args, **kwargs):
|
|
39
|
+
if not MegatronStepInfo.is_megatron:
|
|
40
|
+
MegatronStepInfo.is_megatron = True
|
|
41
|
+
if is_forward:
|
|
42
|
+
MegatronStepInfo.is_forward = True
|
|
43
|
+
MegatronStepInfo.is_backward = False
|
|
44
|
+
MegatronStepInfo.forward_micro_step += 1
|
|
45
|
+
else:
|
|
46
|
+
MegatronStepInfo.is_forward = False
|
|
47
|
+
MegatronStepInfo.is_backward = True
|
|
48
|
+
MegatronStepInfo.backward_micro_step += 1
|
|
49
|
+
return func(*args, **kwargs)
|
|
50
|
+
|
|
51
|
+
return wrapped_func
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def get_micro_step():
|
|
55
|
+
return MegatronStepInfo.forward_micro_step if MegatronStepInfo.is_forward else MegatronStepInfo.backward_micro_step
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def is_megatron():
|
|
59
|
+
return MegatronStepInfo.is_megatron
|
msprobe/core/common/utils.py
CHANGED
|
@@ -28,7 +28,7 @@ import numpy as np
|
|
|
28
28
|
from msprobe.core.common.const import Const, CompareConst
|
|
29
29
|
from msprobe.core.common.decorator import recursion_depth_decorator
|
|
30
30
|
from msprobe.core.common.exceptions import MsprobeException
|
|
31
|
-
from msprobe.core.common.file_utils import (FileOpen, check_file_or_directory_path, load_json)
|
|
31
|
+
from msprobe.core.common.file_utils import (FileOpen, check_file_or_directory_path, load_json, load_construct_json)
|
|
32
32
|
from msprobe.core.common.log import logger
|
|
33
33
|
|
|
34
34
|
device = collections.namedtuple('device', ['type', 'index'])
|
|
@@ -83,7 +83,8 @@ class MsprobeBaseException(Exception):
|
|
|
83
83
|
INVALID_API_NAME_ERROR = 36
|
|
84
84
|
CROSS_FRAME_ERROR = 37
|
|
85
85
|
MISSING_THRESHOLD_ERROR = 38
|
|
86
|
-
WRONG_THRESHOLD_ERROR =
|
|
86
|
+
WRONG_THRESHOLD_ERROR = 39
|
|
87
|
+
MULTIPROCESS_ERROR = 40
|
|
87
88
|
|
|
88
89
|
def __init__(self, code, error_info: str = ""):
|
|
89
90
|
super(MsprobeBaseException, self).__init__()
|
|
@@ -348,8 +349,18 @@ def get_stack_construct_by_dump_json_path(dump_json_path):
|
|
|
348
349
|
stack_json = os.path.join(directory, "stack.json")
|
|
349
350
|
construct_json = os.path.join(directory, "construct.json")
|
|
350
351
|
|
|
352
|
+
stack_json_exist = os.path.exists(stack_json)
|
|
353
|
+
construct_json_exist = os.path.exists(construct_json)
|
|
354
|
+
|
|
355
|
+
if not stack_json_exist and not construct_json_exist:
|
|
356
|
+
logger.info("stack.json and construct.json not found")
|
|
357
|
+
return {}, {}
|
|
358
|
+
if not stack_json_exist or not construct_json_exist:
|
|
359
|
+
logger.error("stack.json or construct.json not found, please check.")
|
|
360
|
+
raise CompareException(CompareException.INVALID_PATH_ERROR)
|
|
361
|
+
|
|
351
362
|
stack = load_json(stack_json)
|
|
352
|
-
construct =
|
|
363
|
+
construct, _ = load_construct_json(construct_json)
|
|
353
364
|
return stack, construct
|
|
354
365
|
|
|
355
366
|
|
|
@@ -47,7 +47,6 @@ class DiffAnalyzer:
|
|
|
47
47
|
analyze_func()
|
|
48
48
|
if self._diff_nodes:
|
|
49
49
|
self._gen_analyze_info()
|
|
50
|
-
self._post_process()
|
|
51
50
|
return
|
|
52
51
|
logger.info('Cannot find any diff node, no need to generate analyze file.')
|
|
53
52
|
|
|
@@ -56,12 +55,6 @@ class DiffAnalyzer:
|
|
|
56
55
|
self._resolve_input_path(self._output_path)
|
|
57
56
|
logger.info("Pre Process completed.")
|
|
58
57
|
|
|
59
|
-
def _post_process(self):
|
|
60
|
-
for rank_path in self._paths.values():
|
|
61
|
-
dump_path = rank_path.dump_path
|
|
62
|
-
logger.debug(f"Remove {dump_path} success")
|
|
63
|
-
logger.info("Post Process completed.")
|
|
64
|
-
|
|
65
58
|
"""
|
|
66
59
|
这里需要生成stack,但是直接用dict中自带就行,在op_items.NPU_Stack_Info中
|
|
67
60
|
"""
|
|
@@ -105,6 +98,8 @@ class DiffAnalyzer:
|
|
|
105
98
|
logger.warning(f'Rank {path.rank} has no dump data!')
|
|
106
99
|
continue
|
|
107
100
|
for op_name, op_data in dump_data.items():
|
|
101
|
+
if is_ignore_op(op_name):
|
|
102
|
+
continue
|
|
108
103
|
if is_communication_op(op_name):
|
|
109
104
|
self._first_comm_nodes[path.rank] = op_name
|
|
110
105
|
break
|
|
@@ -131,10 +126,16 @@ class DiffAnalyzer:
|
|
|
131
126
|
for rank, nodes in list(self._rank_comm_nodes_dict.items())[:-1]:
|
|
132
127
|
searched_ranks.add(rank)
|
|
133
128
|
seen_nodes = set()
|
|
129
|
+
last_node = None
|
|
134
130
|
for cur_node in nodes.values():
|
|
131
|
+
is_overflow = last_node and hasattr(last_node, 'layer') and hasattr(cur_node, 'layer') and \
|
|
132
|
+
last_node.layer >= cur_node.layer
|
|
133
|
+
if is_overflow:
|
|
134
|
+
cur_node.layer = last_node.layer + 1
|
|
135
135
|
conn_info = cur_node.find_connected_nodes()
|
|
136
136
|
if not conn_info.get('ranks'):
|
|
137
137
|
conn_info['ranks'] = self._rank_comm_nodes_dict.keys()
|
|
138
|
+
last_node = cur_node
|
|
138
139
|
if not self._find_connection(conn_info, cur_node, searched_ranks, seen_nodes):
|
|
139
140
|
logger.debug(f'Cannot find connected communication node for "{cur_node.node_id}".')
|
|
140
141
|
|
|
@@ -52,19 +52,25 @@ class DataNode:
|
|
|
52
52
|
metrics = {}
|
|
53
53
|
for cmp_data in self.op_data:
|
|
54
54
|
name = cmp_data.get(CompareConst.NPU_NAME)
|
|
55
|
+
# 构建度量指标字典
|
|
56
|
+
metrics = {}
|
|
57
|
+
|
|
55
58
|
if CompareConst.NPU_MAX in cmp_data:
|
|
56
59
|
metrics = {CompareConst.NPU_MAX: cmp_data.get(CompareConst.NPU_MAX),
|
|
57
60
|
CompareConst.NPU_MIN: cmp_data.get(CompareConst.NPU_MIN),
|
|
58
61
|
CompareConst.NPU_MEAN: cmp_data.get(CompareConst.NPU_MEAN),
|
|
59
62
|
CompareConst.NPU_NORM: cmp_data.get(CompareConst.NPU_NORM)}
|
|
60
63
|
elif CompareConst.NPU_MD5 in cmp_data:
|
|
61
|
-
metrics
|
|
64
|
+
metrics[CompareConst.NPU_MD5] = cmp_data.get(CompareConst.NPU_MD5)
|
|
65
|
+
|
|
66
|
+
if CompareConst.NPU_P2POP_PEER in cmp_data:
|
|
67
|
+
metrics[CompareConst.NPU_P2POP_PEER] = cmp_data.get(CompareConst.NPU_P2POP_PEER)
|
|
62
68
|
|
|
63
69
|
if cmp_data.get(CompareConst.STACK) != CompareConst.N_A and not self.stack:
|
|
64
70
|
self.stack = cmp_data.get(CompareConst.STACK)
|
|
65
|
-
if
|
|
71
|
+
if cmp_data.get('state') == "input":
|
|
66
72
|
self.inputs[name] = metrics
|
|
67
|
-
elif
|
|
73
|
+
elif cmp_data.get('state') == "output":
|
|
68
74
|
self.outputs[name] = metrics
|
|
69
75
|
|
|
70
76
|
def gen_node_info(self, path: RankPath):
|
|
@@ -161,6 +167,8 @@ class CommunicationNode:
|
|
|
161
167
|
if val and val.startswith('[') and val.endswith(']'):
|
|
162
168
|
val = [int(part) for part in val.strip('[]').split(',')]
|
|
163
169
|
ranks.update(val)
|
|
170
|
+
elif v.get(CompareConst.NPU_P2POP_PEER) != "None":
|
|
171
|
+
ranks.add(v.get(CompareConst.NPU_P2POP_PEER))
|
|
164
172
|
|
|
165
173
|
return {'ranks': ranks, 'api': f'Distributed.{tar_api}',
|
|
166
174
|
'type': DiffAnalyseConst.OPPOSITE_DIR.get(self.type, DiffAnalyseConst.LINK)}
|
|
@@ -26,7 +26,7 @@ from tqdm import tqdm
|
|
|
26
26
|
from msprobe.core.common.const import CompareConst, Const
|
|
27
27
|
from msprobe.core.common.file_utils import save_workbook
|
|
28
28
|
from msprobe.core.common.log import logger
|
|
29
|
-
from msprobe.core.common.utils import get_header_index
|
|
29
|
+
from msprobe.core.common.utils import get_header_index, CompareException
|
|
30
30
|
from msprobe.core.compare.utils import table_value_is_valid, gen_api_batches
|
|
31
31
|
from msprobe.core.compare.config import ModeConfig
|
|
32
32
|
|
|
@@ -359,18 +359,25 @@ class HighLight:
|
|
|
359
359
|
|
|
360
360
|
def err_call(args):
|
|
361
361
|
logger.error("Multiprocessing malicious value check failed! Reason: {}".format(args))
|
|
362
|
-
try:
|
|
363
|
-
pool.close()
|
|
364
|
-
except OSError:
|
|
365
|
-
logger.error("Pool terminate failed")
|
|
366
362
|
|
|
367
363
|
result_df_columns = result_df.columns.tolist()
|
|
368
364
|
for column in result_df_columns:
|
|
369
365
|
self.value_check(column)
|
|
366
|
+
async_results = []
|
|
370
367
|
for df_chunk in chunks:
|
|
371
|
-
pool.apply_async(func, args=(df_chunk, result_df_columns,), error_callback=err_call)
|
|
368
|
+
result = pool.apply_async(func, args=(df_chunk, result_df_columns,), error_callback=err_call)
|
|
369
|
+
async_results.append(result)
|
|
372
370
|
|
|
373
371
|
pool.close()
|
|
372
|
+
|
|
373
|
+
for ar in async_results:
|
|
374
|
+
try:
|
|
375
|
+
ar.get(timeout=3600)
|
|
376
|
+
except Exception as e:
|
|
377
|
+
logger.error(f"Task failed with exception: {e}")
|
|
378
|
+
pool.terminate()
|
|
379
|
+
raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
|
|
380
|
+
|
|
374
381
|
pool.join()
|
|
375
382
|
|
|
376
383
|
def df_malicious_value_check(self, result_df):
|
|
@@ -52,16 +52,20 @@ def _ms_graph_handle_multi_process(func, result_df, mode):
|
|
|
52
52
|
|
|
53
53
|
def err_call(args):
|
|
54
54
|
logger.error('multiprocess compare failed! Reason: {}'.format(args))
|
|
55
|
-
try:
|
|
56
|
-
pool.close()
|
|
57
|
-
except OSError as e:
|
|
58
|
-
logger.error(f'pool terminate failed: {str(e)}')
|
|
59
55
|
|
|
60
56
|
for df_chunk in df_chunks:
|
|
61
57
|
result = pool.apply_async(func, args=(df_chunk, mode), error_callback=err_call)
|
|
62
58
|
results.append(result)
|
|
63
|
-
|
|
59
|
+
|
|
64
60
|
pool.close()
|
|
61
|
+
|
|
62
|
+
try:
|
|
63
|
+
final_results = [r.get(timeout=3600) for r in results]
|
|
64
|
+
except Exception as e:
|
|
65
|
+
logger.error(f"Task failed with exception: {e}")
|
|
66
|
+
pool.terminate()
|
|
67
|
+
raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
|
|
68
|
+
|
|
65
69
|
pool.join()
|
|
66
70
|
return pd.concat(final_results, ignore_index=True)
|
|
67
71
|
|
|
@@ -277,10 +281,6 @@ class CompareRealData:
|
|
|
277
281
|
|
|
278
282
|
def err_call(args):
|
|
279
283
|
logger.error('multiprocess compare failed! Reason: {}'.format(args))
|
|
280
|
-
try:
|
|
281
|
-
pool.close()
|
|
282
|
-
except OSError:
|
|
283
|
-
logger.error("pool terminate failed")
|
|
284
284
|
|
|
285
285
|
progress_bar = tqdm(total=len(result_df), desc="API/Module Item Compare Process", unit="row", ncols=100)
|
|
286
286
|
|
|
@@ -298,7 +298,14 @@ class CompareRealData:
|
|
|
298
298
|
)
|
|
299
299
|
results.append(result)
|
|
300
300
|
|
|
301
|
-
final_results = [r.get() for r in results]
|
|
302
301
|
pool.close()
|
|
302
|
+
|
|
303
|
+
try:
|
|
304
|
+
final_results = [r.get(timeout=3600) for r in results]
|
|
305
|
+
except Exception as e:
|
|
306
|
+
logger.error(f"Task failed with exception: {e}")
|
|
307
|
+
pool.terminate()
|
|
308
|
+
raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
|
|
309
|
+
|
|
303
310
|
pool.join()
|
|
304
311
|
return pd.concat(final_results, ignore_index=True)
|
msprobe/core/compare/utils.py
CHANGED
|
@@ -695,10 +695,6 @@ def get_sorted_ranks(npu_dump_dir, bench_dump_dir):
|
|
|
695
695
|
def multi_statistics_compare(func, func_args):
|
|
696
696
|
def err_call(args):
|
|
697
697
|
logger.error(f'Multiprocess statistics compare failed! Reason: {args}')
|
|
698
|
-
try:
|
|
699
|
-
pool.close()
|
|
700
|
-
except OSError:
|
|
701
|
-
logger.error("Pool terminate failed")
|
|
702
698
|
|
|
703
699
|
compare_func, input_param_nr_list, output_path, kwargs = func_args
|
|
704
700
|
|
|
@@ -715,9 +711,22 @@ def multi_statistics_compare(func, func_args):
|
|
|
715
711
|
chunks[i].append(input_param_nr_list[param_num - remainder + i])
|
|
716
712
|
|
|
717
713
|
pool = multiprocessing.Pool(process_num)
|
|
714
|
+
|
|
715
|
+
async_results = []
|
|
718
716
|
for chunk in chunks:
|
|
719
|
-
pool.apply_async(func, args=(compare_func, chunk, output_path, kwargs), error_callback=err_call)
|
|
717
|
+
result = pool.apply_async(func, args=(compare_func, chunk, output_path, kwargs), error_callback=err_call)
|
|
718
|
+
async_results.append(result)
|
|
719
|
+
|
|
720
720
|
pool.close()
|
|
721
|
+
|
|
722
|
+
for ar in async_results:
|
|
723
|
+
try:
|
|
724
|
+
ar.get(timeout=3600)
|
|
725
|
+
except Exception as e:
|
|
726
|
+
logger.error(f"Task failed with exception: {e}")
|
|
727
|
+
pool.terminate()
|
|
728
|
+
raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
|
|
729
|
+
|
|
721
730
|
pool.join()
|
|
722
731
|
|
|
723
732
|
|