PyPI - mindstudio-probe - Versions diffs - 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl - Mend

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (181) hide show

{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
msprobe/README.md +6 -6
msprobe/core/common/const.py +98 -41
msprobe/core/common/db_manager.py +256 -0
msprobe/core/common/file_utils.py +28 -5
msprobe/core/common/log.py +7 -0
msprobe/core/common/megatron_utils.py +59 -0
msprobe/core/common/parallel_state.py +193 -0
msprobe/core/common/utils.py +20 -13
msprobe/core/common_config.py +5 -0
msprobe/core/compare/acc_compare.py +140 -93
msprobe/core/compare/check.py +13 -0
msprobe/core/compare/compare_cli.py +64 -6
msprobe/core/compare/config.py +10 -8
msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
msprobe/core/compare/find_first/__init__.py +0 -0
msprobe/core/compare/find_first/analyzer.py +282 -0
msprobe/core/compare/find_first/data_processor.py +35 -0
msprobe/core/compare/find_first/graph.py +188 -0
msprobe/core/compare/find_first/utils.py +189 -0
msprobe/core/compare/highlight.py +74 -101
msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
msprobe/core/compare/merge_result/merge_result.py +2 -2
msprobe/core/compare/multiprocessing_compute.py +45 -28
msprobe/core/compare/npy_compare.py +7 -10
msprobe/core/compare/utils.py +338 -130
msprobe/core/config_check/checkers/dataset_checker.py +2 -1
msprobe/core/config_check/checkers/env_args_checker.py +5 -5
msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
msprobe/core/config_check/checkers/pip_checker.py +4 -3
msprobe/core/config_check/checkers/random_checker.py +3 -3
msprobe/core/config_check/checkers/weights_checker.py +2 -1
msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
msprobe/core/config_check/utils/utils.py +10 -0
msprobe/core/data_dump/api_registry.py +49 -30
msprobe/core/data_dump/data_collector.py +71 -29
msprobe/core/data_dump/data_processor/base.py +2 -0
msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
msprobe/core/data_dump/json_writer.py +81 -7
msprobe/core/data_dump/scope.py +4 -6
msprobe/core/hook_manager.py +129 -70
msprobe/core/monitor/csv2db.py +361 -0
msprobe/core/monitor/db_utils.py +278 -0
msprobe/core/monitor/utils.py +35 -1
msprobe/core/service.py +31 -39
msprobe/core/single_save/single_comparator.py +16 -3
msprobe/docs/01.installation.md +51 -19
msprobe/docs/02.config_introduction.md +16 -20
msprobe/docs/03.config_examples.md +26 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +6 -2
msprobe/docs/06.data_dump_MindSpore.md +44 -7
msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
msprobe/docs/14.data_parse_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +94 -7
msprobe/docs/21.visualization_PyTorch.md +71 -101
msprobe/docs/22.visualization_MindSpore.md +69 -119
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/25.tool_function_introduction.md +0 -1
msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
msprobe/docs/28.debugger_save_instruction.md +184 -81
msprobe/docs/29.data_dump_MSAdapter.md +6 -0
msprobe/docs/31.config_check.md +4 -2
msprobe/docs/36.calculation_result_change.md +75 -0
msprobe/docs/FAQ.md +22 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
msprobe/mindspore/__init__.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
msprobe/mindspore/cell_processor.py +64 -25
msprobe/mindspore/common/utils.py +51 -7
msprobe/mindspore/compare/common_dir_compare.py +45 -37
msprobe/mindspore/compare/ms_compare.py +10 -2
msprobe/mindspore/compare/ms_graph_compare.py +47 -52
msprobe/mindspore/debugger/debugger_config.py +18 -7
msprobe/mindspore/debugger/precision_debugger.py +16 -12
msprobe/mindspore/dump/cell_dump_process.py +130 -68
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
msprobe/mindspore/exception_dump/__init__.py +0 -0
msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
msprobe/mindspore/mindspore_service.py +2 -2
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
msprobe/mindspore/monitor/features.py +82 -0
msprobe/mindspore/monitor/module_hook.py +168 -10
msprobe/mindspore/monitor/utils.py +27 -1
msprobe/mindspore/ms_config.py +12 -4
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
msprobe/mindspore/task_handler_factory.py +3 -1
msprobe/nan_analyze/graph.py +1 -1
msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
msprobe/pytorch/common/utils.py +1 -21
msprobe/pytorch/compare/pt_compare.py +10 -2
msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
msprobe/pytorch/compare/utils.py +2 -1
msprobe/pytorch/debugger/debugger_config.py +18 -23
msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
msprobe/pytorch/free_benchmark/main.py +7 -4
msprobe/pytorch/hook_module/api_register.py +62 -24
msprobe/pytorch/hook_module/hook_module.py +9 -29
msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
msprobe/pytorch/hook_module/script_wrapper.py +140 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
msprobe/pytorch/monitor/csv2tb.py +1 -1
msprobe/pytorch/monitor/features.py +94 -0
msprobe/pytorch/monitor/module_hook.py +221 -81
msprobe/pytorch/monitor/module_metric.py +27 -1
msprobe/pytorch/monitor/optimizer_collect.py +109 -4
msprobe/pytorch/online_dispatch/dispatch.py +42 -24
msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
msprobe/pytorch/pt_config.py +2 -51
msprobe/pytorch/pytorch_service.py +7 -14
msprobe/visualization/builder/graph_builder.py +192 -63
msprobe/visualization/builder/graph_merger.py +986 -0
msprobe/visualization/builder/msprobe_adapter.py +17 -15
msprobe/visualization/compare/graph_comparator.py +26 -16
msprobe/visualization/db_utils.py +252 -0
msprobe/visualization/graph/base_node.py +2 -22
msprobe/visualization/graph/distributed_analyzer.py +12 -12
msprobe/visualization/graph/graph.py +44 -16
msprobe/visualization/graph_service.py +143 -59
msprobe/visualization/utils.py +103 -4
msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
msprobe/pytorch/attl_manager.py +0 -65
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
/msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0

msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py CHANGED Viewed

@@ -51,8 +51,6 @@ from msprobe.pytorch.pt_config import parse_json_config
 from msprobe.core.common.const import Const, FileCheckConst, CompareConst
 from msprobe.core.common.utils import safe_get_value, CompareException, is_int, check_op_str_pattern_valid
 from msprobe.pytorch.common.utils import seed_all
-from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.attl import ATTL, ATTLConfig, move2device_exec
-from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.device_dispatch import ConsumerDispatcher
 from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import generate_cpu_params, generate_device_params, \
     ExecParams
@@ -90,27 +88,22 @@ seed_all()
 def run_ut(config):
     logger.info("start UT test")
-    if config.online_config.is_online:
-        logger.info(f"UT task result will be saved in {config.result_csv_path}".replace(".csv", "_rank*.csv"))
-        logger.info(f"UT task details will be saved in {config.details_csv_path}".replace(".csv", "_rank*.csv"))
-    else:
-        logger.info(f"UT task result will be saved in {config.result_csv_path}")
-        logger.info(f"UT task details will be saved in {config.details_csv_path}")
+    logger.info(f"UT task result will be saved in {config.result_csv_path}")
+    logger.info(f"UT task details will be saved in {config.details_csv_path}")
     if config.save_error_data:
         logger.info(f"UT task error_data will be saved in {config.error_data_path}")
     compare = Comparator(config.result_csv_path, config.details_csv_path, config.is_continue_run_ut, config=config)
-    if config.online_config.is_online:
-        run_api_online(config, compare)
-    else:
-        csv_df = read_csv(config.result_csv_path)
-        try:
-            api_name_set = {row[0] for row in csv_df.itertuples(index=False, name=None)}
-        except IndexError:
-            logger.error(f"Read {config.result_csv_path} error, api_name_set is empty.")
-            api_name_set = set()
-        run_api_offline(config, compare, api_name_set)
+    csv_df = read_csv(config.result_csv_path)
+    try:
+        api_name_set = {row[0] for row in csv_df.itertuples(index=False, name=None)}
+    except IndexError:
+        logger.error(f"Read {config.result_csv_path} error, api_name_set is empty.")
+        api_name_set = set()
+    run_api_offline(config, compare, api_name_set)
     for result_csv_path, details_csv_path in zip(compare.save_path_list, compare.detail_save_path_list):
         change_mode(result_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
         change_mode(details_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
@@ -164,60 +157,6 @@ def run_api_offline(config, compare, api_name_set):
             gc.collect()
-def run_api_online(config, compare):
-    attl = init_attl(config.online_config)
-    dispatcher = ConsumerDispatcher(compare=compare)
-    dispatcher.start(handle_func=run_torch_api_online, config=config)
-    def tcp_communication_flow():
-        while True:
-            api_data = attl.recv()
-            if api_data == 'STOP_':
-                continue
-            if api_data == 'KILL_':
-                time.sleep(1)
-                logger.info("==========接收到STOP信号==========")
-                dispatcher.stop()
-                attl.stop_serve()
-                time.sleep(1)
-                break
-            if not isinstance(api_data, ApiData):
-                continue
-            api_full_name = api_data.name
-            _, api_name = extract_basic_api_segments(api_full_name)
-            if blacklist_and_whitelist_filter(api_name, config.black_list, config.white_list):
-                continue
-            if api_data.rank in config.online_config.rank_list:
-                dispatcher.update_consume_queue(api_data)
-    def shared_storage_communication_flow():
-        flag_num = -1
-        while True:
-            api_data = attl.download()
-            if api_data == "start":
-                if flag_num == -1:
-                    flag_num += 1
-                flag_num += 1
-            if api_data == "end":
-                flag_num -= 1
-            if flag_num == 0:
-                dispatcher.stop()
-                break
-            if not isinstance(api_data, ApiData):
-                continue
-            api_full_name = api_data.name
-            _, api_name = extract_basic_api_segments(api_full_name)
-            if blacklist_and_whitelist_filter(api_name, config.black_list, config.white_list):
-                continue
-            if api_data.rank in config.online_config.rank_list:
-                dispatcher.update_consume_queue(api_data)
-    if config.online_config.nfs_path:
-        shared_storage_communication_flow()
-    else:
-        tcp_communication_flow()
 def blacklist_and_whitelist_filter(api_name, black_list, white_list):
     """
     run api(api_name) if api_name not in black_list and in white_list.
@@ -315,21 +254,6 @@ def run_torch_api(api_full_name, real_data_path, backward_content, api_info_dict
     return UtDataInfo(bench_grad_out, device_grad_out, device_out, out, bench_grad, in_fwd_data_list, backward_message)
-def run_torch_api_online(api_full_name, api_data, backward_content):
-    in_fwd_data_list = []
-    api_type, api_name = extract_basic_api_segments(api_full_name)
-    args, kwargs, out = api_data.args, api_data.kwargs, api_data.result
-    in_fwd_data_list.append(args)
-    in_fwd_data_list.append(kwargs)
-    if kwargs.get("device"):
-        del kwargs["device"]
-    device_exec_params = ExecParams(api_type, api_name, current_device, args, kwargs, False, None)
-    device_out = exec_api(device_exec_params)
-    device_out = move2device_exec(device_out, "cpu")
-    return UtDataInfo(None, None, out, device_out, None, in_fwd_data_list, None, rank=api_data.rank)
 def check_need_grad(api_info_dict):
     need_grad = True
     if api_info_dict.get(Const.INPUT_KWARGS) and "out" in api_info_dict.get(Const.INPUT_KWARGS):
@@ -389,16 +313,6 @@ def initialize_save_error_data(error_data_path):
     return error_data_path
-def init_attl(config):
-    """config: OnlineConfig"""
-    attl = ATTL('gpu', ATTLConfig(is_benchmark_device=True,
-                                  connect_ip=config.host,
-                                  connect_port=config.port,
-                                  nfs_path=config.nfs_path,
-                                  tls_path=config.tls_path))
-    return attl
 def _run_ut_parser(parser):
     parser.add_argument("-api_info", "--api_info_file", dest="api_info_file", default="", type=str,
                         help="<Optional> The api param tool result file: generate from api param tool, "
@@ -481,38 +395,6 @@ def _run_ut(parser=None):
     _run_ut_parser(parser)
     args = parser.parse_args(sys.argv[1:])
     run_ut_command(args)
-def checked_online_config(online_config):
-    if not online_config.is_online:
-        return
-    if not isinstance(online_config.is_online, bool):
-        raise ValueError("is_online must be bool type")
-    # rank_list
-    if not isinstance(online_config.rank_list, list):
-        raise ValueError("rank_list must be a list")
-    if online_config.rank_list and not all(isinstance(rank, int) for rank in online_config.rank_list):
-        raise ValueError("All elements in rank_list must be integers")
-    # nfs_path
-    if online_config.nfs_path:
-        check_file_or_directory_path(online_config.nfs_path, isdir=True)
-        return
-    # tls_path
-    if online_config.tls_path:
-        check_file_or_directory_path(online_config.tls_path, isdir=True)
-        check_file_or_directory_path(os.path.join(online_config.tls_path, "server.key"))
-        check_file_or_directory_path(os.path.join(online_config.tls_path, "server.crt"))
-        check_file_or_directory_path(os.path.join(online_config.tls_path, "ca.crt"))
-        crl_path = os.path.join(online_config.tls_path, "crl.pem")
-        if os.path.exists(crl_path):
-            check_file_or_directory_path(crl_path)
-    # host and port
-    if not isinstance(online_config.host, str) or not re.match(Const.ipv4_pattern, online_config.host):
-        raise Exception(f"host: {online_config.host} is invalid.")
-    if not isinstance(online_config.port, int) or not (0 < online_config.port <= 65535):
-        raise Exception(f"port: {online_config.port} is invalid, port range 0-65535.")
 def run_ut_command(args):
@@ -525,7 +407,7 @@ def run_ut_command(args):
     else:
         checker_config = CheckerConfig()
-    if not checker_config.is_online and not args.api_info_file:
+    if not args.api_info_file:
         logger.error("Please provide api_info_file for offline run ut.")
         raise Exception("Please provide api_info_file for offline run ut.")
@@ -588,8 +470,6 @@ def run_ut_command(args):
             global UT_ERROR_DATA_DIR
             UT_ERROR_DATA_DIR = 'ut_error_data' + time_info
         error_data_path = initialize_save_error_data(error_data_path)
-    online_config = checker_config.get_online_config()
-    checked_online_config(online_config)
     config_params = {
         'forward_content': forward_content,
         'backward_content': backward_content,

msprobe/pytorch/common/utils.py CHANGED Viewed

@@ -150,7 +150,7 @@ def remove_dropout():
         F.dropout3d = function_dropout3d
-def seed_all(seed=1234, mode=False, rm_dropout=True):
+def seed_all(seed=1234, mode=False, rm_dropout=False):
     check_seed_all(seed, mode, rm_dropout)
     try:
         random.seed(seed)
@@ -388,26 +388,6 @@ def load_pkl(pt_path):
     return pt
-def save_api_data(api_data):
-    """Save data to io stream"""
-    try:
-        io_buff = io.BytesIO()
-        torch.save(api_data, io_buff)
-    except Exception as e:
-        raise RuntimeError("save api_data to io_buff failed") from e
-    return io_buff
-def load_api_data(api_data_bytes):
-    """Load data from bytes stream"""
-    try:
-        buffer = io.BytesIO(api_data_bytes)
-        buffer = torch.load(buffer, map_location="cpu", weights_only=False)
-    except Exception as e:
-        raise RuntimeError("load api_data from bytes failed") from e
-    return buffer
 def is_recomputation():
     """Check if the current operation is in the re-computation phase.

msprobe/pytorch/compare/pt_compare.py CHANGED Viewed

@@ -31,8 +31,16 @@ def compare(input_param, output_path, **kwargs):
         raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR)
     config = setup_comparison(input_param, output_path, **kwargs)
-    mode_config = ModeConfig(config.stack_mode, config.auto_analyze, config.fuzzy_match,
-                             config.dump_mode, config.compared_file_type)
+    config_dict = {
+        'stack_mode': config.stack_mode,
+        'auto_analyze': config.auto_analyze,
+        'fuzzy_match': config.fuzzy_match,
+        'highlight': config.highlight,
+        'dump_mode': config.dump_mode,
+        'first_diff_analyze': config.first_diff_analyze,
+        'compared_file_type': config.compared_file_type
+    }
+    mode_config = ModeConfig(**config_dict)
     mapping_config = MappingConfig(data_mapping=config.data_mapping)
     pt_comparator = Comparator(read_real_data, mode_config, mapping_config)
     pt_comparator.compare_core(input_param, output_path, suffix=config.suffix)

msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} RENAMED Viewed

@@ -13,21 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import torch
-from msprobe.pytorch.hook_module.api_register import get_api_register
+from msprobe.pytorch.compare.distributed_compare import compare_distributed
-def wrap_jit_script_func():
-    def patched_script(*args, **kwargs):
-        all_api_registered = api_register.all_api_registered
-        if all_api_registered:
-            api_register.restore_all_api()
-        result = original_script(*args, **kwargs)
-        if all_api_registered:
-            api_register.register_all_api()
-        return result
-    original_script = torch.jit.script
-    api_register = get_api_register()
-    torch.jit.script = patched_script
+def pt_diff_analyze(npu_dump_dir, bench_dump_dir, output_path, first_diff_analyze):
+    compare_distributed(npu_dump_dir, bench_dump_dir, output_path, first_diff_analyze=first_diff_analyze)

msprobe/pytorch/compare/utils.py CHANGED Viewed

@@ -35,7 +35,8 @@ def read_pt_data(dir_path, file_name):
         data_value = load_pt(data_path, to_cpu=True).detach()
     except RuntimeError as e:
         # 这里捕获 load_pt 中抛出的异常
-        logger.error(f"Failed to load the .pt file at {data_path}.")
+        data_path_file_name = os.path.basename(data_path)
+        logger.error(f"Failed to load the .pt file at {data_path_file_name}.")
         raise CompareException(CompareException.INVALID_FILE_ERROR) from e
     except AttributeError as e:
         # 这里捕获 detach 方法抛出的异常

msprobe/pytorch/debugger/debugger_config.py CHANGED Viewed

@@ -34,6 +34,7 @@ class DebuggerConfig:
         self.overflow_nums = task_config.overflow_nums if task_config.overflow_nums else 1
         self.framework = Const.PT_FRAMEWORK
         self.async_dump = common_config.async_dump if common_config.async_dump else False
+        self.precision = common_config.precision if common_config.precision else Const.DUMP_PRECISION_LOW
         if self.task == Const.FREE_BENCHMARK:
             self.fuzz_device = task_config.fuzz_device
@@ -47,16 +48,6 @@ class DebuggerConfig:
                 "max_sample": task_config.max_sample
             }
-        self.online_run_ut = False
-        if self.task == Const.TENSOR:
-            # dump api tensor and collaborate with online run_ut
-            self.online_run_ut = task_config.online_run_ut if task_config.online_run_ut else False
-            self.nfs_path = task_config.nfs_path if task_config.nfs_path else ""
-            self.tls_path = task_config.tls_path if task_config.tls_path else ""
-            self.host = task_config.host if task_config.host else ""
-            self.port = task_config.port if task_config.port else -1
-            self.online_run_ut_recompute = task_config.online_run_ut_recompute \
-                if isinstance(task_config.online_run_ut_recompute, bool) else False
         self.check()
         self._check_statistics_config(task_config)
@@ -65,7 +56,7 @@ class DebuggerConfig:
             self.is_backward_kernel_dump = False
             self._check_and_adjust_config_with_l2()
-    def check_kwargs(self):
+    def check(self):
         if self.task and self.task not in Const.TASK_LIST:
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                    f"The task <{self.task}> is not in the {Const.TASK_LIST}.")
@@ -78,22 +69,26 @@ class DebuggerConfig:
         if not isinstance(self.async_dump, bool):
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                    f"The parameters async_dump should be bool.")
-        if self.async_dump and self.task == Const.TENSOR:
-            if self.level == Const.LEVEL_DEBUG:
-                self.list = [] # async_dump + debug level case ignore list
-            if not self.list and self.level != Const.LEVEL_DEBUG:
-                raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
-                                    f"The parameters async_dump is true in tensor task, the parameters list cannot be "
-                                    f"empty.")
         if self.task == Const.STRUCTURE and self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
             logger.warning_on_rank_0(
                 f"When the task is set to structure, the level should be one of {[Const.LEVEL_L0, Const.LEVEL_MIX]}. "
                 f"If not, the default level is {Const.LEVEL_MIX}."
             )
             self.level = Const.LEVEL_MIX
-    def check(self):
-        self.check_kwargs()
+        if self.async_dump:
+            if self.task == Const.TENSOR:
+                if self.level == Const.LEVEL_DEBUG:
+                    self.list = []  # async_dump + debug level case ignore list
+                if not self.list and self.level != Const.LEVEL_DEBUG:
+                    raise MsprobeException(
+                        MsprobeException.INVALID_PARAM_ERROR,
+                        f"The parameters async_dump is true in tensor task, the parameters list cannot be empty."
+                    )
+            if self.summary_mode == Const.MD5:
+                raise MsprobeException(
+                    MsprobeException.INVALID_PARAM_ERROR,
+                    f"The parameters async_dump is true, the parameters summary_mode cannot be md5."
+                )
         return True
     def check_model(self, instance, start_model, token_range=None):
@@ -102,7 +97,7 @@ class DebuggerConfig:
         if token_range and not instance.model:
             error_info = "The 'model' parameter must be provided when token_range is not None"
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, error_info)
         if self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX] and token_range is None:
             return
@@ -123,7 +118,7 @@ class DebuggerConfig:
                     break
             if error_model is not None:
                 error_info = (f"The 'model' parameter must be a torch.nn.Module or list[torch.nn.Module] "
-                            f"type, currently there is an unsupported {type(error_model)} type.")
+                              f"type, currently there is an unsupported {type(error_model)} type.")
                 raise MsprobeException(
                     MsprobeException.INVALID_PARAM_ERROR, error_info)
         else:

msprobe/pytorch/dump/module_dump/hook_wrapper.py CHANGED Viewed

@@ -24,8 +24,11 @@ from msprobe.pytorch.common.log import logger
 def wrap_setup_backward_hook(func):
-    def requires_clone(tensor):
-        return isinstance(tensor, torch.Tensor) and tensor.requires_grad and torch.is_grad_enabled()
+    def requires_clone(tensor, need_check_leaf=False):
+        need_clone = isinstance(tensor, torch.Tensor) and tensor.requires_grad and torch.is_grad_enabled()
+        if need_check_leaf:
+            need_clone &= tensor.grad_fn is not None
+        return need_clone
     @recursion_depth_decorator("Dump: wrap_setup_backward_hook.parse_tensor", max_depth=Const.DUMP_MAX_DEPTH)
     def parse_tensor(item, tensor_list):
@@ -39,20 +42,20 @@ def wrap_setup_backward_hook(func):
                 parse_tensor(value, tensor_list)
     @recursion_depth_decorator("Dump: wrap_setup_backward_hook.rebuild_args", max_depth=Const.DUMP_MAX_DEPTH)
-    def rebuild_args(item, tensor_iter):
-        if requires_clone(item):
+    def rebuild_args(item, tensor_iter, need_check_leaf=False):
+        if requires_clone(item, need_check_leaf):
             result = next(tensor_iter)
             if hasattr(result, "_base") and result._base is not None:
                 if torch._C._autograd._get_creation_meta(result) != torch._C._autograd.CreationMeta(0):
                     torch._C._autograd._set_creation_meta(result, torch._C._autograd.CreationMeta(0))
-            return result
+            return result
         if isinstance(item, list):
             for index, value in enumerate(item):
-                item[index] = rebuild_args(value, tensor_iter)
+                item[index] = rebuild_args(value, tensor_iter, need_check_leaf=True)
             return item
         if isinstance(item, dict):
             for key, value in item.items():
-                item[key] = rebuild_args(value, tensor_iter)
+                item[key] = rebuild_args(value, tensor_iter, need_check_leaf=True)
             return item
         if isinstance(item, tuple):
             if hasattr(item, '_fields'):

msprobe/pytorch/dump/module_dump/module_processer.py CHANGED Viewed

@@ -21,25 +21,18 @@ import torch
 from torch.utils.hooks import BackwardHook, RemovableHandle
 from msprobe.core.common.const import Const
+from msprobe.core.common.runtime import Runtime
 from msprobe.core.common.utils import ModuleQueue, ThreadSafe
+from msprobe.core.common.megatron_utils import wrap_megatron_step, get_micro_step, is_megatron
 from msprobe.core.data_dump.scope import BaseScope, ModuleRangeScope, MixRangeScope
 from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.common.utils import is_torch_nn_module, register_forward_pre_hook
 from msprobe.pytorch.dump.module_dump.hook_wrapper import wrap_setup_input_output_hook
 torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
-if torch_version_above_or_equal_2:
-    from torch.utils.checkpoint import checkpoint as origin_checkpoint, set_checkpoint_early_stop
-def checkpoint_without_early_stop(*args, **kwargs):
-    with set_checkpoint_early_stop(False):
-        return origin_checkpoint(*args, **kwargs)
-def replace_checkpoint():
-    if torch_version_above_or_equal_2:
-        torch.utils.checkpoint.checkpoint = checkpoint_without_early_stop
+torch_version_above_or_equal_21 = torch.__version__.split('+')[0] >= '2.1'
+if torch_version_above_or_equal_21:
+    from torch.utils.checkpoint import _StopRecomputationError
 def wrap_megatron_deallocate(func):
@@ -53,6 +46,27 @@ def wrap_megatron_deallocate(func):
     return wrapper_func
+def wrap_forward_with_hook_safety(module):
+    """
+    Wrap the module's forward method so that a forward hook still runs when
+    forward is interrupted by ``_StopRecomputationError``.
+    ``module.forward`` is replaced only when ``torch_version_above_or_equal_21``
+    is true, the same flag that guards the import of ``_StopRecomputationError``;
+    otherwise the module is left untouched.
+    """
+    original_forward = module.forward
+    def wrapped_forward(*args, **kwargs):
+        try:
+            output = original_forward(*args, **kwargs)
+            return output
+        except _StopRecomputationError as e:
+            # forward did not finish, so the hook is handed None as the output.
+            exception_output = None
+            if len(module._forward_hooks.values()) > 0:
+                # msprobe's forward_hook is expected to be the first one registered; only that first hook is executed.
+                hook_fn = list(module._forward_hooks.values())[0]
+                hook_fn(module, args, kwargs, exception_output)
+            # Re-raise so the caller still sees the original exception.
+            raise e
+    if torch_version_above_or_equal_21:
+        module.forward = wrapped_forward
 class ModuleProcesser:
     module_queue = ModuleQueue()
     module_count = {}
@@ -66,11 +80,12 @@ class ModuleProcesser:
     def __init__(self, scope):
         self.scope = scope if isinstance(scope, (ModuleRangeScope, MixRangeScope)) else None
         wrap_setup_input_output_hook()
-        replace_checkpoint()
         try:
             from megatron.core.pipeline_parallel import schedules
             origin_func_id = id(schedules.deallocate_output_tensor)
             schedules.deallocate_output_tensor = wrap_megatron_deallocate(schedules.deallocate_output_tensor)
+            schedules.forward_step = wrap_megatron_step(schedules.forward_step)
+            schedules.backward_step = wrap_megatron_step(schedules.backward_step, is_forward=False)
             for module in list(sys.modules.values()):
                 if module.__name__ == 'schedules':
                     continue
@@ -155,6 +170,7 @@ class ModuleProcesser:
                         f"which may cause abnormal data dump. The backward data dump for this module will be skipped."
                     )
                     ModuleProcesser.module_with_backward_hook[prefix_name] = True
+                wrap_forward_with_hook_safety(module)
                 register_forward_pre_hook(module, forward_pre_hook)
     def build_module_hook(self, module_name, build_data_hook):
@@ -163,6 +179,9 @@ class ModuleProcesser:
             if kwargs is None:
                 kwargs = {}
+            if not Runtime.is_running:
+                return (args, kwargs) if torch_version_above_or_equal_2 else args
             if hasattr(module, 'msprobe_module_dump') and not self.enable_module_dump:
                 return (args, kwargs) if torch_version_above_or_equal_2 else args
@@ -243,14 +262,16 @@ class ModuleProcesser:
             ModuleProcesser.module_stack[tid] = []
         if self.module_stack[tid]:
-            ModuleProcesser.module_node[full_name] = self.module_stack[tid][-1]
+            ModuleProcesser.module_node[full_name] = self.module_stack[tid][-1] if not is_megatron() \
+                else [self.module_stack[tid][-1], get_micro_step()]
         else:
             parent_name = ModuleProcesser.module_queue.find_last(full_name)
-            ModuleProcesser.module_node[full_name] = parent_name
+            ModuleProcesser.module_node[full_name] = parent_name if not is_megatron() \
+                else [parent_name, get_micro_step()]
         ModuleProcesser.module_queue.add_name(full_name)
         ModuleProcesser.module_stack[tid].append(full_name)
-        ModuleProcesser.api_parent_node[tid] = full_name
+        ModuleProcesser.api_parent_node[tid] = full_name if not is_megatron() else [full_name, get_micro_step()]
         if self.scope:
             self.scope.begin_module(full_name)
@@ -258,14 +279,15 @@ class ModuleProcesser:
         tid = threading.get_ident()
         if torch_version_above_or_equal_2 or is_forward:
             ModuleProcesser.module_queue.remove_name(full_name)
-            ModuleProcesser.api_parent_node[tid] = None
+            ModuleProcesser.api_parent_node[tid] = None if not is_megatron() else [None, get_micro_step()]
             if self.module_stack.get(tid):
                 ModuleProcesser.module_stack[tid].pop()
             if self.module_stack.get(tid):
-                ModuleProcesser.api_parent_node[tid] = ModuleProcesser.module_stack[tid][-1]
+                ModuleProcesser.api_parent_node[tid] = ModuleProcesser.module_stack[tid][-1] if not is_megatron() \
+                    else [ModuleProcesser.module_stack[tid][-1], get_micro_step()]
             if self.scope:
                 self.scope.end_module(full_name)
         else:
             if self.scope:
                 self.scope.begin_module(full_name)
-            ModuleProcesser.api_parent_node[tid] = full_name
+            ModuleProcesser.api_parent_node[tid] = full_name if not is_megatron() else [full_name, get_micro_step()]

msprobe/pytorch/free_benchmark/main.py CHANGED Viewed

@@ -17,8 +17,8 @@ from abc import ABC
 import torch
 from msprobe.core.common.const import Const
+from msprobe.core.common.utils import replace_last_occurrence
 from msprobe.pytorch.free_benchmark import logger
-from msprobe.pytorch.free_benchmark.common.constant import CommonField
 from msprobe.pytorch.free_benchmark.common.enums import (
     DeviceType,
     FuzzLevel,
@@ -37,6 +37,7 @@ from msprobe.pytorch.free_benchmark.result_handlers.handler_factory import (
 class FreeBenchmarkCheck(ABC):
+    grad_saver_dict = {}
     def __init__(self, config) -> None:
         super().__init__()
@@ -68,7 +69,9 @@ class FreeBenchmarkCheck(ABC):
         grad_saver.kwargs = kwargs
         grad_saver.register_compare_func_for_inputs(args, data_processor)
         grad_saver.cache_backward_input(args)
-        setattr(module, CommonField.GRADSAVER, grad_saver)
+        backward_name = replace_last_occurrence(name, Const.FORWARD, Const.BACKWARD)
+        FreeBenchmarkCheck.grad_saver_dict[backward_name] = grad_saver
     def forward(self, name, module, args, kwargs, output):
         if not self.config.fuzz_stage == Const.FORWARD:
@@ -92,16 +95,16 @@ class FreeBenchmarkCheck(ABC):
         return perturbed_output, handler.get_unequal_rows()
     def backward(self, name, module, grad_output):
         if not self.config.fuzz_stage == Const.BACKWARD:
             return
         try:
-            grad_saver = getattr(module, CommonField.GRADSAVER)
+            grad_saver = FreeBenchmarkCheck.grad_saver_dict[name]
         except AttributeError:
             logger.warning_on_rank_0(
                 f"[msprobe] Free benchmark:  get grad saver failed. api_name:{name}"
             )
             return
+        del FreeBenchmarkCheck.grad_saver_dict[name]
         _new_grad_output = grad_output
         try:

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

mindstudio-probe 8.1.2py3-none-any.whl → 8.2.1py3-none-any.whl