mindstudio-probe 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/METADATA +2 -2
  2. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/RECORD +90 -79
  3. msprobe/README.md +7 -5
  4. msprobe/core/common/const.py +6 -0
  5. msprobe/core/common/db_manager.py +35 -4
  6. msprobe/core/common/file_utils.py +105 -27
  7. msprobe/core/common/framework_adapter.py +7 -6
  8. msprobe/core/common/megatron_utils.py +59 -0
  9. msprobe/core/common/utils.py +14 -3
  10. msprobe/core/compare/find_first/analyzer.py +8 -7
  11. msprobe/core/compare/find_first/graph.py +11 -3
  12. msprobe/core/compare/find_first/utils.py +2 -1
  13. msprobe/core/compare/highlight.py +13 -6
  14. msprobe/core/compare/multiprocessing_compute.py +17 -10
  15. msprobe/core/compare/utils.py +14 -5
  16. msprobe/core/data_dump/data_collector.py +18 -21
  17. msprobe/core/data_dump/data_processor/pytorch_processor.py +43 -20
  18. msprobe/core/data_dump/json_writer.py +18 -8
  19. msprobe/core/data_dump/scope.py +4 -6
  20. msprobe/core/hook_manager.py +37 -3
  21. msprobe/core/service.py +18 -5
  22. msprobe/core/single_save/single_comparator.py +16 -3
  23. msprobe/docs/01.installation.md +7 -5
  24. msprobe/docs/02.config_introduction.md +14 -1
  25. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  26. msprobe/docs/06.data_dump_MindSpore.md +1 -1
  27. msprobe/docs/08.accuracy_checker_online_PyTorch.md +295 -0
  28. msprobe/docs/10.accuracy_compare_PyTorch.md +46 -5
  29. msprobe/docs/14.data_parse_PyTorch.md +1 -1
  30. msprobe/docs/15.free_benchmarking_PyTorch.md +1 -1
  31. msprobe/docs/19.monitor.md +2 -0
  32. msprobe/docs/21.visualization_PyTorch.md +15 -80
  33. msprobe/docs/22.visualization_MindSpore.md +20 -104
  34. msprobe/docs/23.generate_operator_PyTorch.md +1 -1
  35. msprobe/docs/25.tool_function_introduction.md +1 -0
  36. msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
  37. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  38. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  39. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  40. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  41. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  42. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  43. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  44. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
  45. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
  46. msprobe/mindspore/cell_processor.py +33 -5
  47. msprobe/mindspore/compare/common_dir_compare.py +22 -26
  48. msprobe/mindspore/compare/utils.py +1 -2
  49. msprobe/mindspore/debugger/precision_debugger.py +1 -1
  50. msprobe/mindspore/dump/cell_dump_process.py +73 -62
  51. msprobe/mindspore/dump/graph_mode_cell_dump.py +21 -10
  52. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +2 -0
  53. msprobe/msprobe.py +6 -4
  54. msprobe/pytorch/api_accuracy_checker/common/config.py +36 -3
  55. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +24 -0
  56. msprobe/pytorch/api_accuracy_checker/compare/compare.py +12 -2
  57. msprobe/pytorch/api_accuracy_checker/config.yaml +6 -1
  58. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
  59. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +132 -12
  60. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
  61. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +205 -0
  62. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +378 -0
  63. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +239 -0
  64. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +115 -0
  65. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +250 -0
  66. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
  67. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +198 -0
  68. msprobe/pytorch/attl_manager.py +65 -0
  69. msprobe/pytorch/common/utils.py +22 -2
  70. msprobe/pytorch/compare/utils.py +3 -3
  71. msprobe/pytorch/debugger/debugger_config.py +10 -0
  72. msprobe/pytorch/dump/module_dump/hook_wrapper.py +34 -7
  73. msprobe/pytorch/dump/module_dump/module_processer.py +23 -10
  74. msprobe/pytorch/hook_module/api_register.py +6 -1
  75. msprobe/pytorch/monitor/module_hook.py +28 -9
  76. msprobe/pytorch/online_dispatch/dispatch.py +42 -24
  77. msprobe/pytorch/pt_config.py +57 -2
  78. msprobe/pytorch/pytorch_service.py +11 -2
  79. msprobe/visualization/builder/graph_builder.py +170 -64
  80. msprobe/visualization/builder/graph_merger.py +0 -1
  81. msprobe/visualization/builder/msprobe_adapter.py +1 -1
  82. msprobe/visualization/db_utils.py +25 -2
  83. msprobe/visualization/graph/base_node.py +0 -24
  84. msprobe/visualization/graph/graph.py +5 -14
  85. msprobe/visualization/graph_service.py +29 -53
  86. msprobe/visualization/utils.py +11 -1
  87. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/LICENSE +0 -0
  88. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/WHEEL +0 -0
  89. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/entry_points.txt +0 -0
  90. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/top_level.txt +0 -0
@@ -12,29 +12,31 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+
  import atexit
  import csv
  import fcntl
  import io
+ import json
+ import multiprocessing
  import os
  import pickle
- from multiprocessing import shared_memory
- import stat
- import json
  import re
  import shutil
+ import stat
  import sys
  import zipfile
- import multiprocessing
- import yaml
+ from multiprocessing import shared_memory
+
  import numpy as np
  import pandas as pd
+ import yaml

+ from msprobe.core.common.const import FileCheckConst, CompareConst, Const
  from msprobe.core.common.decorator import recursion_depth_decorator
- from msprobe.core.common.log import logger
  from msprobe.core.common.exceptions import FileCheckException
- from msprobe.core.common.const import FileCheckConst, CompareConst
  from msprobe.core.common.global_lock import global_lock, is_main_process
+ from msprobe.core.common.log import logger

  proc_lock = multiprocessing.Lock()

@@ -46,16 +48,15 @@ class FileChecker:
  Attributes:
  file_path: The file or dictionary path to be verified.
  path_type: file or dictionary
- ability(str): FileCheckConst.WRITE_ABLE or FileCheckConst.READ_ABLE to set file has writability or readability
+ ability(str): one of [FileCheckConst.READ_ABLE, FileCheckConst.WRITE_ABLE, FileCheckConst.READ_WRITE_ABLE]
  file_type(str): The correct file type for file
  """

- def __init__(self, file_path, path_type, ability=None, file_type=None, is_script=True):
+ def __init__(self, file_path, path_type, ability=None, file_type=None):
  self.file_path = file_path
  self.path_type = self._check_path_type(path_type)
- self.ability = ability
+ self.ability = self._check_ability_type(ability)
  self.file_type = file_type
- self.is_script = is_script

  @staticmethod
  def _check_path_type(path_type):
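In 8.3.0 the `ability` argument is validated up front and the old `is_script` flag is gone. A minimal usage sketch, assuming the package is installed; the file name below is a placeholder:

```python
# Hypothetical usage of the tightened FileChecker API (path is a placeholder).
from msprobe.core.common.const import FileCheckConst
from msprobe.core.common.exceptions import FileCheckException
from msprobe.core.common.file_utils import FileChecker

# Valid: ability comes from the allowlist checked by _check_ability_type.
checker = FileChecker("./dump.json", FileCheckConst.FILE, FileCheckConst.READ_ABLE)
checked_path = checker.common_check()

# Invalid: any other ability string now raises ILLEGAL_PARAM_ERROR at construction time.
try:
    FileChecker("./dump.json", FileCheckConst.FILE, "executable")
except FileCheckException:
    pass
```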
@@ -64,9 +65,17 @@ class FileChecker:
  raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR)
  return path_type

+ @staticmethod
+ def _check_ability_type(ability):
+ ability_list = [FileCheckConst.READ_ABLE, FileCheckConst.WRITE_ABLE, FileCheckConst.READ_WRITE_ABLE]
+ if ability and ability not in ability_list:
+ logger.error(f'The ability must be one of {ability_list}.')
+ raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR)
+ return ability
+
  def common_check(self):
  """
- 功能:用户校验基本文件权限:软连接、文件长度、是否存在、读写权限、文件属组、文件特殊字符
+ 功能:基本文件权限校验,包括文件存在性、软连接、文件长度、文件类型、文件读写权限、文件属组、文件路径特殊字符、文件后缀等
  注意:文件后缀的合法性,非通用操作,可使用其他独立接口实现
  """
  check_path_exists(self.file_path)
@@ -75,13 +84,13 @@ class FileChecker:
  check_path_length(self.file_path)
  check_path_type(self.file_path, self.path_type)
  self.check_path_ability()
- if self.is_script:
- check_path_owner_consistent(self.file_path)
+ check_path_owner_consistent(self.file_path)
  check_path_pattern_valid(self.file_path)
  check_common_file_size(self.file_path)
  check_file_suffix(self.file_path, self.file_type)
+ check_path_no_others_write(self.file_path)
  if self.path_type == FileCheckConst.FILE:
- check_dirpath_before_read(self.file_path)
+ check_dirpath_permission(self.file_path)
  return self.file_path

  def check_path_ability(self):
@@ -137,7 +146,8 @@ class FileOpen:
  check_path_pattern_valid(self.file_path)
  if os.path.exists(self.file_path):
  check_common_file_size(self.file_path)
- check_dirpath_before_read(self.file_path)
+ check_path_no_others_write(self.file_path)
+ check_dirpath_permission(self.file_path)

  def check_ability_and_owner(self):
  if self.mode in self.SUPPORT_READ_MODE:
@@ -172,7 +182,7 @@ def check_path_exists(path):
  if not os.path.exists(path):
  logger.error('The file path %s does not exist.' % path)
  raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR)
-
+

  def check_path_not_exists(path):
  if os.path.exists(path):
@@ -256,12 +266,15 @@ def check_path_type(file_path, file_type):
  raise FileCheckException(FileCheckException.INVALID_FILE_ERROR)


- def check_others_writable(directory):
- dir_stat = os.stat(directory)
- is_writable = (
- bool(dir_stat.st_mode & stat.S_IWGRP) or # 组可写
- bool(dir_stat.st_mode & stat.S_IWOTH) # 其他用户可写
- )
+ def check_group_writable(file_path):
+ path_stat = os.stat(file_path)
+ is_writable = bool(path_stat.st_mode & stat.S_IWGRP)
+ return is_writable
+
+
+ def check_others_writable(file_path):
+ path_stat = os.stat(file_path)
+ is_writable = bool(path_stat.st_mode & stat.S_IWOTH)
  return is_writable


@@ -309,7 +322,7 @@ def check_path_before_create(path):
  'The file path {} contains special characters.'.format(path))


- def check_dirpath_before_read(path):
+ def check_dirpath_permission(path):
  path = os.path.realpath(path)
  dirpath = os.path.dirname(path)
  if dedup_log('check_dirpath_before_read', dirpath):
@@ -319,15 +332,16 @@ def check_dirpath_before_read(path):
  check_path_owner_consistent(dirpath)
  except FileCheckException:
  logger.warning(f"The directory {dirpath} is not yours.")
-

- def check_file_or_directory_path(path, isdir=False):
+
+ def check_file_or_directory_path(path, isdir=False, is_strict=False):
  """
  Function Description:
  check whether the path is valid
  Parameter:
  path: the path to check
  isdir: the path is dir or file
+ is_strict: whether to perform stricter validation (e.g., verify group cannot write to path)
  Exception Description:
  when invalid data throw exception
  """
@@ -337,6 +351,33 @@ def check_file_or_directory_path(path, isdir=False):
  path_checker = FileChecker(path, FileCheckConst.FILE, FileCheckConst.READ_ABLE)
  path_checker.common_check()

+ if is_strict:
+ if check_group_writable(path):
+ raise FileCheckException(
+ FileCheckException.FILE_PERMISSION_ERROR,
+ f"The directory/file must not allow write access to group. Directory/File path: {path}"
+ )
+
+
+ def check_path_no_others_write(file_path):
+ if dedup_log('check_path_no_others_write', file_path):
+ if check_group_writable(file_path):
+ logger.warning(f"The directory/file path is writable by group: {file_path}.")
+
+ if check_others_writable(file_path):
+ raise FileCheckException(
+ FileCheckException.FILE_PERMISSION_ERROR,
+ f"The directory/file must not allow write access to others. Directory/File path: {file_path}"
+ )
+
+
+ def check_path_no_group_others_write(file_path):
+ if check_group_writable(file_path) or check_others_writable(file_path):
+ raise FileCheckException(
+ FileCheckException.FILE_PERMISSION_ERROR,
+ f"The directory/file must not allow write access to group or others. Directory/File path: {file_path}"
+ )
+

  def change_mode(path, mode):
  if not os.path.exists(path) or os.path.islink(path):
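To show what the new write-permission helpers enforce, here is a small self-contained sketch; the temporary file and modes are illustrative, not taken from msprobe itself:

```python
# Illustrative sketch of the new group/other write checks (file and modes are made up).
import os
import stat
import tempfile

from msprobe.core.common.exceptions import FileCheckException
from msprobe.core.common.file_utils import check_path_no_others_write, check_path_no_group_others_write

fd, path = tempfile.mkstemp()
os.close(fd)
try:
    os.chmod(path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IWOTH)  # world-writable
    try:
        check_path_no_others_write(path)        # raises FILE_PERMISSION_ERROR for other-write
    except FileCheckException as err:
        print(err)

    os.chmod(path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IWGRP)  # group-writable only
    check_path_no_others_write(path)            # only warns about group write
    try:
        check_path_no_group_others_write(path)  # stricter variant rejects group write too
    except FileCheckException as err:
        print(err)
finally:
    os.remove(path)
```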
@@ -388,6 +429,14 @@ def check_file_type(path):
  raise FileCheckException(FileCheckException.INVALID_FILE_ERROR)


+ def root_privilege_warning():
+ if os.getuid() == 0:
+ logger.warning(
+ "msprobe is being run as root. "
+ "To avoid security risks, it is recommended to switch to a regular user to run it."
+ )
+
+
  def load_yaml(yaml_path):
  path_checker = FileChecker(yaml_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.YAML_SUFFIX)
  checked_path = path_checker.common_check()
@@ -422,6 +471,26 @@ def load_json(json_path):
  return data


+ def load_construct_json(json_path):
+ construct_dict_o = load_json(json_path)
+ if Const.MEGATRON_MICRO_STEP_NUMBER in construct_dict_o:
+ construct_dict = {}
+ micro_step_dict = {Const.MEGATRON_MICRO_STEP_NUMBER: construct_dict_o.get(Const.MEGATRON_MICRO_STEP_NUMBER)}
+ del construct_dict_o[Const.MEGATRON_MICRO_STEP_NUMBER]
+ for key, value in construct_dict_o.items():
+ if isinstance(value, list):
+ if len(value) != 2:
+ logger.error(f'Parse construct json file "{os.path.basename(json_path)}" failed.')
+ raise RuntimeError()
+ construct_dict[key] = value[0]
+ micro_step_dict[key] = value[1]
+ else:
+ construct_dict[key] = value
+ micro_step_dict[key] = 0
+ return construct_dict, micro_step_dict
+ return construct_dict_o, {}
+
+
  def save_json(json_path, data, indent=None, mode="w"):
  check_path_before_create(json_path)
  json_path = os.path.realpath(json_path)
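`load_construct_json` now understands a Megatron-aware construct.json in which each node maps to a `[parent, micro_step]` pair alongside a micro-step counter key; a sketch of the two shapes and the split it produces (node names and the counter key below are invented examples standing in for the real `Const.MEGATRON_MICRO_STEP_NUMBER` key):

```python
# Hypothetical construct.json contents, showing what load_construct_json returns.

# Legacy shape: node -> parent. Returned unchanged, with an empty micro-step dict.
legacy = {"Module.layer1.forward.0": "Module.model.forward.0"}
# -> (legacy, {})

# Megatron-aware shape: node -> [parent, micro_step], plus a micro-step counter entry.
megatron = {
    "micro_step_number": 2,  # stand-in for the Const.MEGATRON_MICRO_STEP_NUMBER key
    "Module.layer1.forward.0": ["Module.model.forward.0", 0],
    "Module.layer1.forward.1": ["Module.model.forward.1", 1],
}
# -> construct_dict: node -> parent
#    {"Module.layer1.forward.0": "Module.model.forward.0",
#     "Module.layer1.forward.1": "Module.model.forward.1"}
# -> micro_step_dict: node -> micro step (counter entry preserved)
#    {"micro_step_number": 2,
#     "Module.layer1.forward.0": 0,
#     "Module.layer1.forward.1": 1}
```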
@@ -520,6 +589,9 @@ def move_directory(src_path, dst_path):
  check_file_or_directory_path(src_path, isdir=True)
  check_path_before_create(dst_path)
  try:
+ if os.path.exists(dst_path):
+ logger.warning(f"The destination directory {dst_path} already exists, it will be removed.")
+ shutil.rmtree(dst_path)
  shutil.move(src_path, dst_path)
  except Exception as e:
  logger.error(f"move directory {src_path} to {dst_path} failed")
@@ -945,7 +1017,13 @@ class SharedDict:
  def _safe_load(self):
  with io.BytesIO(self._shm.buf[:]) as buff:
  try:
- self._dict = SafeUnpickler(buff).load()
+ data = SafeUnpickler(buff).load()
+ if not isinstance(data, dict):
+ logger.debug(f"Data from shared memory is '{type(data)}' type, expected 'dict'.")
+ self._dict = {}
+ self._changed = True
+ else:
+ self._dict = data
  except Exception as e:
  logger.debug(f'shared dict is unreadable, reason: {e}, create new dict.')
  self._dict = {}
@@ -12,10 +12,11 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.import functools
+
  import functools
+
  from msprobe.core.common.const import Const
- from msprobe.core.common.file_utils import check_file_or_directory_path
- from msprobe.core.common.file_utils import save_npy
+ from msprobe.core.common.file_utils import check_file_or_directory_path, save_npy


  class FrameworkDescriptor:
@@ -103,7 +104,7 @@ class FmkAdp:
  @classmethod
  def tensor_norm(cls, tensor):
  return cls.process_tensor(tensor, lambda x: x.norm())
-
+
  @classmethod
  def save_tensor(cls, tensor, filepath):
  if cls.fmk == Const.PT_FRAMEWORK:
@@ -151,7 +152,7 @@ class FmkAdp:

  @classmethod
  def load_checkpoint(cls, path, to_cpu=True, weights_only=True):
- check_file_or_directory_path(path)
+ check_file_or_directory_path(path, is_strict=not weights_only)
  if cls.fmk == Const.PT_FRAMEWORK:
  try:
  if to_cpu:
@@ -161,9 +162,9 @@ class FmkAdp:
  except Exception as e:
  raise RuntimeError(f"load pt file {path} failed: {e}") from e
  return mindspore.load_checkpoint(path)
-
+
  @classmethod
  def asnumpy(cls, tensor):
  if cls.fmk == Const.PT_FRAMEWORK:
  return tensor.float().numpy()
- return tensor.float().asnumpy()
+ return tensor.float().asnumpy()
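The `load_checkpoint` change above means that loading a full checkpoint (`weights_only=False`) now also subjects the path to the stricter group-writability check; a hedged sketch with placeholder checkpoint paths:

```python
# Sketch only: the checkpoint paths below are placeholders.
from msprobe.core.common.framework_adapter import FmkAdp

# Default: weights_only=True, so check_file_or_directory_path runs without is_strict.
weights = FmkAdp.load_checkpoint("./model_weights.pt")

# Full checkpoint: is_strict=not weights_only, so a group-writable checkpoint file
# is rejected with FILE_PERMISSION_ERROR before the checkpoint is deserialized.
full_state = FmkAdp.load_checkpoint("./model_full.pt", weights_only=False)
```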
@@ -0,0 +1,59 @@
+ # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
+ # All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from functools import wraps
+
+
+ class MegatronStepInfo:
+ is_megatron = False
+ is_forward = False
+ is_backward = False
+ forward_micro_step = -1
+ backward_micro_step = -1
+
+ @classmethod
+ def reset(cls):
+ """重置所有类属性到初始状态"""
+ cls.is_megatron = False
+ cls.is_forward = False
+ cls.is_backward = False
+ cls.forward_micro_step = -1
+ cls.backward_micro_step = -1
+
+
+ def wrap_megatron_step(func, is_forward=True):
+ @wraps(func)
+ def wrapped_func(*args, **kwargs):
+ if not MegatronStepInfo.is_megatron:
+ MegatronStepInfo.is_megatron = True
+ if is_forward:
+ MegatronStepInfo.is_forward = True
+ MegatronStepInfo.is_backward = False
+ MegatronStepInfo.forward_micro_step += 1
+ else:
+ MegatronStepInfo.is_forward = False
+ MegatronStepInfo.is_backward = True
+ MegatronStepInfo.backward_micro_step += 1
+ return func(*args, **kwargs)
+
+ return wrapped_func
+
+
+ def get_micro_step():
+ return MegatronStepInfo.forward_micro_step if MegatronStepInfo.is_forward else MegatronStepInfo.backward_micro_step
+
+
+ def is_megatron():
+ return MegatronStepInfo.is_megatron
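A minimal sketch of how these new helpers are meant to be used: the Megatron forward/backward step functions get wrapped so each call advances the corresponding micro-step counter. The step functions below are stand-ins, not the real Megatron APIs:

```python
# Illustrative only: wrap placeholder step functions the way msprobe would wrap
# Megatron's forward/backward step callables.
from msprobe.core.common.megatron_utils import (
    MegatronStepInfo,
    wrap_megatron_step,
    get_micro_step,
    is_megatron,
)


def fake_forward_step():   # placeholder for Megatron's forward step
    return "fwd"


def fake_backward_step():  # placeholder for Megatron's backward step
    return "bwd"


forward_step = wrap_megatron_step(fake_forward_step, is_forward=True)
backward_step = wrap_megatron_step(fake_backward_step, is_forward=False)

forward_step()                # forward_micro_step: -1 -> 0
forward_step()                # forward_micro_step: 0 -> 1
backward_step()               # backward_micro_step: -1 -> 0

assert is_megatron()
assert get_micro_step() == 0  # last phase was backward, so the backward counter is returned
MegatronStepInfo.reset()
```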
@@ -28,7 +28,7 @@ import numpy as np
  from msprobe.core.common.const import Const, CompareConst
  from msprobe.core.common.decorator import recursion_depth_decorator
  from msprobe.core.common.exceptions import MsprobeException
- from msprobe.core.common.file_utils import (FileOpen, check_file_or_directory_path, load_json)
+ from msprobe.core.common.file_utils import (FileOpen, check_file_or_directory_path, load_json, load_construct_json)
  from msprobe.core.common.log import logger

  device = collections.namedtuple('device', ['type', 'index'])
@@ -83,7 +83,8 @@ class MsprobeBaseException(Exception):
  INVALID_API_NAME_ERROR = 36
  CROSS_FRAME_ERROR = 37
  MISSING_THRESHOLD_ERROR = 38
- WRONG_THRESHOLD_ERROR = 38
+ WRONG_THRESHOLD_ERROR = 39
+ MULTIPROCESS_ERROR = 40

  def __init__(self, code, error_info: str = ""):
  super(MsprobeBaseException, self).__init__()
@@ -348,8 +349,18 @@ def get_stack_construct_by_dump_json_path(dump_json_path):
  stack_json = os.path.join(directory, "stack.json")
  construct_json = os.path.join(directory, "construct.json")

+ stack_json_exist = os.path.exists(stack_json)
+ construct_json_exist = os.path.exists(construct_json)
+
+ if not stack_json_exist and not construct_json_exist:
+ logger.info("stack.json and construct.json not found")
+ return {}, {}
+ if not stack_json_exist or not construct_json_exist:
+ logger.error("stack.json or construct.json not found, please check.")
+ raise CompareException(CompareException.INVALID_PATH_ERROR)
+
  stack = load_json(stack_json)
- construct = load_json(construct_json)
+ construct, _ = load_construct_json(construct_json)
  return stack, construct


@@ -47,7 +47,6 @@ class DiffAnalyzer:
  analyze_func()
  if self._diff_nodes:
  self._gen_analyze_info()
- self._post_process()
  return
  logger.info('Cannot find any diff node, no need to generate analyze file.')

@@ -56,12 +55,6 @@
  self._resolve_input_path(self._output_path)
  logger.info("Pre Process completed.")

- def _post_process(self):
- for rank_path in self._paths.values():
- dump_path = rank_path.dump_path
- logger.debug(f"Remove {dump_path} success")
- logger.info("Post Process completed.")
-

  """
  这里需要生成stack,但是直接用dict中自带就行,在op_items.NPU_Stack_Info中
  """
@@ -105,6 +98,8 @@
  logger.warning(f'Rank {path.rank} has no dump data!')
  continue
  for op_name, op_data in dump_data.items():
+ if is_ignore_op(op_name):
+ continue
  if is_communication_op(op_name):
  self._first_comm_nodes[path.rank] = op_name
  break
@@ -131,10 +126,16 @@
  for rank, nodes in list(self._rank_comm_nodes_dict.items())[:-1]:
  searched_ranks.add(rank)
  seen_nodes = set()
+ last_node = None
  for cur_node in nodes.values():
+ is_overflow = last_node and hasattr(last_node, 'layer') and hasattr(cur_node, 'layer') and \
+ last_node.layer >= cur_node.layer
+ if is_overflow:
+ cur_node.layer = last_node.layer + 1
  conn_info = cur_node.find_connected_nodes()
  if not conn_info.get('ranks'):
  conn_info['ranks'] = self._rank_comm_nodes_dict.keys()
+ last_node = cur_node
  if not self._find_connection(conn_info, cur_node, searched_ranks, seen_nodes):
  logger.debug(f'Cannot find connected communication node for "{cur_node.node_id}".')

@@ -52,19 +52,25 @@ class DataNode:
  metrics = {}
  for cmp_data in self.op_data:
  name = cmp_data.get(CompareConst.NPU_NAME)
+ # 构建度量指标字典
+ metrics = {}
+
  if CompareConst.NPU_MAX in cmp_data:
  metrics = {CompareConst.NPU_MAX: cmp_data.get(CompareConst.NPU_MAX),
  CompareConst.NPU_MIN: cmp_data.get(CompareConst.NPU_MIN),
  CompareConst.NPU_MEAN: cmp_data.get(CompareConst.NPU_MEAN),
  CompareConst.NPU_NORM: cmp_data.get(CompareConst.NPU_NORM)}
  elif CompareConst.NPU_MD5 in cmp_data:
- metrics = {CompareConst.NPU_MD5: cmp_data.get(CompareConst.NPU_MD5)}
+ metrics[CompareConst.NPU_MD5] = cmp_data.get(CompareConst.NPU_MD5)
+
+ if CompareConst.NPU_P2POP_PEER in cmp_data:
+ metrics[CompareConst.NPU_P2POP_PEER] = cmp_data.get(CompareConst.NPU_P2POP_PEER)

  if cmp_data.get(CompareConst.STACK) != CompareConst.N_A and not self.stack:
  self.stack = cmp_data.get(CompareConst.STACK)
- if Const.INPUT in name:
+ if cmp_data.get('state') == "input":
  self.inputs[name] = metrics
- elif Const.OUTPUT in name:
+ elif cmp_data.get('state') == "output":
  self.outputs[name] = metrics

  def gen_node_info(self, path: RankPath):
@@ -161,6 +167,8 @@ class CommunicationNode:
  if val and val.startswith('[') and val.endswith(']'):
  val = [int(part) for part in val.strip('[]').split(',')]
  ranks.update(val)
+ elif v.get(CompareConst.NPU_P2POP_PEER) != "None":
+ ranks.add(v.get(CompareConst.NPU_P2POP_PEER))

  return {'ranks': ranks, 'api': f'Distributed.{tar_api}',
  'type': DiffAnalyseConst.OPPOSITE_DIR.get(self.type, DiffAnalyseConst.LINK)}
@@ -120,7 +120,8 @@ def is_communication_op(op_name):
  def is_ignore_op(op_name):
  ignore_keywords = [
  'Torch.empty',
- 'Torch.fill'
+ 'Torch.fill',
+ 'Tensor.__setitem__'
  ]
  return any(keyword in op_name for keyword in ignore_keywords)

@@ -26,7 +26,7 @@ from tqdm import tqdm
  from msprobe.core.common.const import CompareConst, Const
  from msprobe.core.common.file_utils import save_workbook
  from msprobe.core.common.log import logger
- from msprobe.core.common.utils import get_header_index
+ from msprobe.core.common.utils import get_header_index, CompareException
  from msprobe.core.compare.utils import table_value_is_valid, gen_api_batches
  from msprobe.core.compare.config import ModeConfig

@@ -359,18 +359,25 @@ class HighLight:

  def err_call(args):
  logger.error("Multiprocessing malicious value check failed! Reason: {}".format(args))
- try:
- pool.close()
- except OSError:
- logger.error("Pool terminate failed")

  result_df_columns = result_df.columns.tolist()
  for column in result_df_columns:
  self.value_check(column)
+ async_results = []
  for df_chunk in chunks:
- pool.apply_async(func, args=(df_chunk, result_df_columns,), error_callback=err_call)
+ result = pool.apply_async(func, args=(df_chunk, result_df_columns,), error_callback=err_call)
+ async_results.append(result)

  pool.close()
+
+ for ar in async_results:
+ try:
+ ar.get(timeout=3600)
+ except Exception as e:
+ logger.error(f"Task failed with exception: {e}")
+ pool.terminate()
+ raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
+
  pool.join()

  def df_malicious_value_check(self, result_df):
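The same hardening pattern recurs in this and the following multiprocessing hunks: every `apply_async` result is kept, fetched with a timeout after `pool.close()`, and a failure terminates the pool instead of being silently dropped. A generic, standalone sketch of the pattern; the worker function and timeout value are illustrative, not msprobe code:

```python
# Generic sketch of the collect-then-get(timeout) pattern used in these hunks.
import multiprocessing


def square(x):                      # stand-in for the real compare worker
    return x * x


def err_call(args):
    print(f"worker failed: {args}")


if __name__ == "__main__":
    pool = multiprocessing.Pool(4)
    async_results = [pool.apply_async(square, args=(i,), error_callback=err_call)
                     for i in range(8)]
    pool.close()
    try:
        results = [ar.get(timeout=3600) for ar in async_results]
    except Exception:
        pool.terminate()            # msprobe raises CompareException(MULTIPROCESS_ERROR) here
        raise
    pool.join()
    print(results)
```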
@@ -52,16 +52,20 @@ def _ms_graph_handle_multi_process(func, result_df, mode):

  def err_call(args):
  logger.error('multiprocess compare failed! Reason: {}'.format(args))
- try:
- pool.close()
- except OSError as e:
- logger.error(f'pool terminate failed: {str(e)}')

  for df_chunk in df_chunks:
  result = pool.apply_async(func, args=(df_chunk, mode), error_callback=err_call)
  results.append(result)
- final_results = [r.get() for r in results]
+
  pool.close()
+
+ try:
+ final_results = [r.get(timeout=3600) for r in results]
+ except Exception as e:
+ logger.error(f"Task failed with exception: {e}")
+ pool.terminate()
+ raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
+
  pool.join()
  return pd.concat(final_results, ignore_index=True)

@@ -277,10 +281,6 @@ class CompareRealData:

  def err_call(args):
  logger.error('multiprocess compare failed! Reason: {}'.format(args))
- try:
- pool.close()
- except OSError:
- logger.error("pool terminate failed")

  progress_bar = tqdm(total=len(result_df), desc="API/Module Item Compare Process", unit="row", ncols=100)

@@ -298,7 +298,14 @@
  )
  results.append(result)

- final_results = [r.get() for r in results]
  pool.close()
+
+ try:
+ final_results = [r.get(timeout=3600) for r in results]
+ except Exception as e:
+ logger.error(f"Task failed with exception: {e}")
+ pool.terminate()
+ raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
+
  pool.join()
  return pd.concat(final_results, ignore_index=True)
@@ -695,10 +695,6 @@ def get_sorted_ranks(npu_dump_dir, bench_dump_dir):
  def multi_statistics_compare(func, func_args):
  def err_call(args):
  logger.error(f'Multiprocess statistics compare failed! Reason: {args}')
- try:
- pool.close()
- except OSError:
- logger.error("Pool terminate failed")

  compare_func, input_param_nr_list, output_path, kwargs = func_args

@@ -715,9 +711,22 @@ def multi_statistics_compare(func, func_args):
  chunks[i].append(input_param_nr_list[param_num - remainder + i])

  pool = multiprocessing.Pool(process_num)
+
+ async_results = []
  for chunk in chunks:
- pool.apply_async(func, args=(compare_func, chunk, output_path, kwargs), error_callback=err_call)
+ result = pool.apply_async(func, args=(compare_func, chunk, output_path, kwargs), error_callback=err_call)
+ async_results.append(result)
+
  pool.close()
+
+ for ar in async_results:
+ try:
+ ar.get(timeout=3600)
+ except Exception as e:
+ logger.error(f"Task failed with exception: {e}")
+ pool.terminate()
+ raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
+
  pool.join()
