PyPI - mindstudio-probe - Versions diffs - 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl - Mend

mindstudio-probe 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/METADATA +2 -2
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/RECORD +90 -79
msprobe/README.md +7 -5
msprobe/core/common/const.py +6 -0
msprobe/core/common/db_manager.py +35 -4
msprobe/core/common/file_utils.py +105 -27
msprobe/core/common/framework_adapter.py +7 -6
msprobe/core/common/megatron_utils.py +59 -0
msprobe/core/common/utils.py +14 -3
msprobe/core/compare/find_first/analyzer.py +8 -7
msprobe/core/compare/find_first/graph.py +11 -3
msprobe/core/compare/find_first/utils.py +2 -1
msprobe/core/compare/highlight.py +13 -6
msprobe/core/compare/multiprocessing_compute.py +17 -10
msprobe/core/compare/utils.py +14 -5
msprobe/core/data_dump/data_collector.py +18 -21
msprobe/core/data_dump/data_processor/pytorch_processor.py +43 -20
msprobe/core/data_dump/json_writer.py +18 -8
msprobe/core/data_dump/scope.py +4 -6
msprobe/core/hook_manager.py +37 -3
msprobe/core/service.py +18 -5
msprobe/core/single_save/single_comparator.py +16 -3
msprobe/docs/01.installation.md +7 -5
msprobe/docs/02.config_introduction.md +14 -1
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/06.data_dump_MindSpore.md +1 -1
msprobe/docs/08.accuracy_checker_online_PyTorch.md +295 -0
msprobe/docs/10.accuracy_compare_PyTorch.md +46 -5
msprobe/docs/14.data_parse_PyTorch.md +1 -1
msprobe/docs/15.free_benchmarking_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +2 -0
msprobe/docs/21.visualization_PyTorch.md +15 -80
msprobe/docs/22.visualization_MindSpore.md +20 -104
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/25.tool_function_introduction.md +1 -0
msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
msprobe/mindspore/cell_processor.py +33 -5
msprobe/mindspore/compare/common_dir_compare.py +22 -26
msprobe/mindspore/compare/utils.py +1 -2
msprobe/mindspore/debugger/precision_debugger.py +1 -1
msprobe/mindspore/dump/cell_dump_process.py +73 -62
msprobe/mindspore/dump/graph_mode_cell_dump.py +21 -10
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +2 -0
msprobe/msprobe.py +6 -4
msprobe/pytorch/api_accuracy_checker/common/config.py +36 -3
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +24 -0
msprobe/pytorch/api_accuracy_checker/compare/compare.py +12 -2
msprobe/pytorch/api_accuracy_checker/config.yaml +6 -1
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +132 -12
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +205 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +378 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +239 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +115 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +250 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +198 -0
msprobe/pytorch/attl_manager.py +65 -0
msprobe/pytorch/common/utils.py +22 -2
msprobe/pytorch/compare/utils.py +3 -3
msprobe/pytorch/debugger/debugger_config.py +10 -0
msprobe/pytorch/dump/module_dump/hook_wrapper.py +34 -7
msprobe/pytorch/dump/module_dump/module_processer.py +23 -10
msprobe/pytorch/hook_module/api_register.py +6 -1
msprobe/pytorch/monitor/module_hook.py +28 -9
msprobe/pytorch/online_dispatch/dispatch.py +42 -24
msprobe/pytorch/pt_config.py +57 -2
msprobe/pytorch/pytorch_service.py +11 -2
msprobe/visualization/builder/graph_builder.py +170 -64
msprobe/visualization/builder/graph_merger.py +0 -1
msprobe/visualization/builder/msprobe_adapter.py +1 -1
msprobe/visualization/db_utils.py +25 -2
msprobe/visualization/graph/base_node.py +0 -24
msprobe/visualization/graph/graph.py +5 -14
msprobe/visualization/graph_service.py +29 -53
msprobe/visualization/utils.py +11 -1
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/top_level.txt +0 -0

msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py CHANGED Viewed

@@ -51,6 +51,8 @@ from msprobe.pytorch.pt_config import parse_json_config
 from msprobe.core.common.const import Const, FileCheckConst, CompareConst
 from msprobe.core.common.utils import safe_get_value, CompareException, is_int, check_op_str_pattern_valid
 from msprobe.pytorch.common.utils import seed_all
+from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.attl import ATTL, ATTLConfig, move2device_exec
+from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.device_dispatch import ConsumerDispatcher
 from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import generate_cpu_params, generate_device_params, \
     ExecParams
@@ -88,22 +90,27 @@ seed_all()
 def run_ut(config):
     logger.info("start UT test")
-    logger.info(f"UT task result will be saved in {config.result_csv_path}")
-    logger.info(f"UT task details will be saved in {config.details_csv_path}")
+    if config.online_config.is_online:
+        logger.info(f"UT task result will be saved in {config.result_csv_path}".replace(".csv", "_rank*.csv"))
+        logger.info(f"UT task details will be saved in {config.details_csv_path}".replace(".csv", "_rank*.csv"))
+    else:
+        logger.info(f"UT task result will be saved in {config.result_csv_path}")
+        logger.info(f"UT task details will be saved in {config.details_csv_path}")
     if config.save_error_data:
         logger.info(f"UT task error_data will be saved in {config.error_data_path}")
     compare = Comparator(config.result_csv_path, config.details_csv_path, config.is_continue_run_ut, config=config)
-    csv_df = read_csv(config.result_csv_path)
-    try:
-        api_name_set = {row[0] for row in csv_df.itertuples(index=False, name=None)}
-    except IndexError:
-        logger.error(f"Read {config.result_csv_path} error, api_name_set is empty.")
-        api_name_set = set()
-    run_api_offline(config, compare, api_name_set)
+    if config.online_config.is_online:
+        run_api_online(config, compare)
+    else:
+        csv_df = read_csv(config.result_csv_path)
+        try:
+            api_name_set = {row[0] for row in csv_df.itertuples(index=False, name=None)}
+        except IndexError:
+            logger.error(f"Read {config.result_csv_path} error, api_name_set is empty.")
+            api_name_set = set()
+        run_api_offline(config, compare, api_name_set)
     for result_csv_path, details_csv_path in zip(compare.save_path_list, compare.detail_save_path_list):
         change_mode(result_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
         change_mode(details_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
@@ -157,6 +164,60 @@ def run_api_offline(config, compare, api_name_set):
             gc.collect()
+def run_api_online(config, compare):
+    attl = init_attl(config.online_config)
+    dispatcher = ConsumerDispatcher(compare=compare)
+    dispatcher.start(handle_func=run_torch_api_online, config=config)
+    def tcp_communication_flow():
+        while True:
+            api_data = attl.recv()
+            if api_data == 'STOP_':
+                continue
+            if api_data == 'KILL_':
+                time.sleep(1)
+                logger.info("==========接收到STOP信号==========")
+                dispatcher.stop()
+                attl.stop_serve()
+                time.sleep(1)
+                break
+            if not isinstance(api_data, ApiData):
+                continue
+            api_full_name = api_data.name
+            _, api_name = extract_basic_api_segments(api_full_name)
+            if blacklist_and_whitelist_filter(api_name, config.black_list, config.white_list):
+                continue
+            if api_data.rank in config.online_config.rank_list:
+                dispatcher.update_consume_queue(api_data)
+    def shared_storage_communication_flow():
+        flag_num = -1
+        while True:
+            api_data = attl.download()
+            if api_data == "start":
+                if flag_num == -1:
+                    flag_num += 1
+                flag_num += 1
+            if api_data == "end":
+                flag_num -= 1
+            if flag_num == 0:
+                dispatcher.stop()
+                break
+            if not isinstance(api_data, ApiData):
+                continue
+            api_full_name = api_data.name
+            _, api_name = extract_basic_api_segments(api_full_name)
+            if blacklist_and_whitelist_filter(api_name, config.black_list, config.white_list):
+                continue
+            if api_data.rank in config.online_config.rank_list:
+                dispatcher.update_consume_queue(api_data)
+    if config.online_config.nfs_path:
+        shared_storage_communication_flow()
+    else:
+        tcp_communication_flow()
 def blacklist_and_whitelist_filter(api_name, black_list, white_list):
     """
     run api(api_name) if api_name not in black_list and in white_list.
@@ -254,6 +315,21 @@ def run_torch_api(api_full_name, real_data_path, backward_content, api_info_dict
     return UtDataInfo(bench_grad_out, device_grad_out, device_out, out, bench_grad, in_fwd_data_list, backward_message)
+def run_torch_api_online(api_full_name, api_data, backward_content):
+    in_fwd_data_list = []
+    api_type, api_name = extract_basic_api_segments(api_full_name)
+    args, kwargs, out = api_data.args, api_data.kwargs, api_data.result
+    in_fwd_data_list.append(args)
+    in_fwd_data_list.append(kwargs)
+    if kwargs.get("device"):
+        del kwargs["device"]
+    device_exec_params = ExecParams(api_type, api_name, current_device, args, kwargs, False, None)
+    device_out = exec_api(device_exec_params)
+    device_out = move2device_exec(device_out, "cpu")
+    return UtDataInfo(None, None, out, device_out, None, in_fwd_data_list, None, rank=api_data.rank)
 def check_need_grad(api_info_dict):
     need_grad = True
     if api_info_dict.get(Const.INPUT_KWARGS) and "out" in api_info_dict.get(Const.INPUT_KWARGS):
@@ -313,6 +389,16 @@ def initialize_save_error_data(error_data_path):
     return error_data_path
+def init_attl(config):
+    """config: OnlineConfig"""
+    attl = ATTL('gpu', ATTLConfig(is_benchmark_device=True,
+                                  connect_ip=config.host,
+                                  connect_port=config.port,
+                                  nfs_path=config.nfs_path,
+                                  tls_path=config.tls_path))
+    return attl
 def _run_ut_parser(parser):
     parser.add_argument("-api_info", "--api_info_file", dest="api_info_file", default="", type=str,
                         help="<Optional> The api param tool result file: generate from api param tool, "
@@ -395,6 +481,38 @@ def _run_ut(parser=None):
     _run_ut_parser(parser)
     args = parser.parse_args(sys.argv[1:])
     run_ut_command(args)
+def checked_online_config(online_config):
+    if not online_config.is_online:
+        return
+    if not isinstance(online_config.is_online, bool):
+        raise ValueError("is_online must be bool type")
+    # rank_list
+    if not isinstance(online_config.rank_list, list):
+        raise ValueError("rank_list must be a list")
+    if online_config.rank_list and not all(isinstance(rank, int) for rank in online_config.rank_list):
+        raise ValueError("All elements in rank_list must be integers")
+    # nfs_path
+    if online_config.nfs_path:
+        check_file_or_directory_path(online_config.nfs_path, isdir=True)
+        return
+    # tls_path
+    if online_config.tls_path:
+        check_file_or_directory_path(online_config.tls_path, isdir=True)
+        check_file_or_directory_path(os.path.join(online_config.tls_path, "server.key"))
+        check_file_or_directory_path(os.path.join(online_config.tls_path, "server.crt"))
+        check_file_or_directory_path(os.path.join(online_config.tls_path, "ca.crt"))
+        crl_path = os.path.join(online_config.tls_path, "crl.pem")
+        if os.path.exists(crl_path):
+            check_file_or_directory_path(crl_path)
+    # host and port
+    if not isinstance(online_config.host, str) or not re.match(Const.ipv4_pattern, online_config.host):
+        raise Exception(f"host: {online_config.host} is invalid.")
+    if not isinstance(online_config.port, int) or not (0 < online_config.port <= 65535):
+        raise Exception(f"port: {online_config.port} is invalid, port range 0-65535.")
 def run_ut_command(args):
@@ -407,7 +525,7 @@ def run_ut_command(args):
     else:
         checker_config = CheckerConfig()
-    if not args.api_info_file:
+    if not checker_config.is_online and not args.api_info_file:
         logger.error("Please provide api_info_file for offline run ut.")
         raise Exception("Please provide api_info_file for offline run ut.")
@@ -470,6 +588,8 @@ def run_ut_command(args):
             global UT_ERROR_DATA_DIR
             UT_ERROR_DATA_DIR = 'ut_error_data' + time_info
         error_data_path = initialize_save_error_data(error_data_path)
+    online_config = checker_config.get_online_config()
+    checked_online_config(online_config)
     config_params = {
         'forward_content': forward_content,
         'backward_content': backward_content,

msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py ADDED Viewed

File without changes

msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py ADDED Viewed

@@ -0,0 +1,205 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import glob
+import os.path
+import time
+from multiprocessing import Queue
+from typing import Optional, Union, Dict, Any
+from dataclasses import dataclass
+import torch
+from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData
+from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.client import TCPClient
+from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.server import TCPServer
+from msprobe.core.common.file_utils import remove_path
+from msprobe.pytorch.common.utils import logger, save_api_data, load_api_data, save_pkl, load_pkl
+from msprobe.core.common.decorator import recursion_depth_decorator
+BufferType = Union[ApiData, Dict[str, Any], str]  # Union[Tensor, Tuple[Optional[Tensor]]]
+@dataclass
+class ATTLConfig:
+    is_benchmark_device: bool
+    connect_ip: str
+    connect_port: int
+    # storage_config
+    nfs_path: str = None
+    tls_path: str = None
+    check_sum: bool = True
+    queue_size: int = 50
+class ATTL:
+    def __init__(self, session_id: str, session_config: ATTLConfig, need_dump=True) -> None:
+        self.session_id = session_id
+        self.session_config = session_config
+        self.logger = logger
+        self.socket_manager = None
+        self.data_queue = Queue(maxsize=50)
+        self.dequeue_list = []
+        self.message_end = False
+        self.kill_progress = False
+        self.nfs_path = None
+        if self.session_config.nfs_path:
+            self.nfs_path = self.session_config.nfs_path
+        elif self.session_config.is_benchmark_device:
+            self.socket_manager = TCPServer(self.session_config.connect_port,
+                                            self.data_queue,
+                                            self.session_config.check_sum,
+                                            self.session_config.tls_path)
+            self.socket_manager.start()
+        elif need_dump:
+            self.socket_manager = TCPClient(self.session_config.connect_ip,
+                                            self.session_config.connect_port,
+                                            self.session_config.check_sum,
+                                            self.session_config.tls_path)
+            self.socket_manager.start()
+    def stop_serve(self):
+        if isinstance(self.socket_manager, TCPServer):
+            self.socket_manager.stop()
+    def send(self, buffer: BufferType) -> None:
+        """
+        npu major in 'send' (client)
+        """
+        # if tcp connection lost,
+        if self.socket_manager.signal_exit:
+            raise ConnectionError(f"Failed to connect to {self.session_config.connect_ip}.")
+        # know receiver receive and go next
+        if isinstance(buffer, ApiData):
+            buffer = move2target_device(buffer, torch.device('cpu'))
+        if 'device' in buffer.kwargs:
+            buffer.kwargs.pop('device')
+        rank = buffer.rank if hasattr(buffer, "rank") and buffer.rank is not None else 0
+        step = buffer.step if hasattr(buffer, "step") else 0
+        try:
+            io_buff = save_api_data(buffer)
+        except Exception as e:
+            self.logger.info(f"{buffer.name} can not be saved, skip: {e}")
+            return
+        data = io_buff.getvalue()
+        self.socket_manager.add_to_sending_queue(data, rank=rank, step=step)
+    def recv(self, timeout_ms=0) -> Optional[BufferType]:
+        buffer = ''
+        while not buffer:
+            if timeout_ms > 0:
+                time.sleep(timeout_ms / 1000.0)
+            if not buffer and not self.data_queue.empty():
+                buffer = self.data_queue.get()
+                break
+            if not buffer and timeout_ms > 0:  # timeout is the only case we give up and return None
+                break
+            if self.message_end and self.data_queue.empty():
+                buffer = b"KILL_CONFIRM"
+                self.kill_progress = True
+                break
+            time.sleep(0.1)  # waiting outside the lock before next attempt
+        if not buffer:
+            # this is a result of a timeout
+            self.logger.info(f"RECEIVE API DATA TIMED OUT")
+        else:
+            if buffer == b"STOP_":
+                return "STOP_"
+            if buffer == b"KILL_":
+                self.message_end = True
+                return "STOP_"
+            if buffer == b"KILL_CONFIRM":
+                self.kill_progress = True
+                return "KILL_"
+            try:
+                buffer = load_api_data(buffer)
+            except Exception as e:
+                self.logger.warning("there is something error. please check it. %s", e)
+            if isinstance(buffer, bytes):
+                return ''
+            if isinstance(buffer, str):
+                return buffer
+        return buffer
+    def upload(self, buffer: BufferType):
+        if isinstance(buffer, ApiData):
+            buffer = move2target_device(buffer, torch.device('cpu'))
+            file_path = os.path.join(self.session_config.nfs_path, buffer.name + ".pt")
+        else:
+            file_path = os.path.join(self.session_config.nfs_path, buffer + f"_{int(time.time())}")
+        try:
+            save_pkl(buffer, file_path)
+        except Exception as e:
+            self.logger.warning("there is something error in save_pt. please check it. %s", e)
+    def download(self):
+        buffer = None
+        cur_file = None
+        for file_type in ("start*", "*.pt", "end*"):
+            pattern = os.path.join(self.nfs_path, file_type)
+            files = glob.glob(pattern)
+            if len(files) > 0:
+                cur_file = files[0]
+                break
+        if cur_file is not None:
+            try:
+                buffer = load_pkl(cur_file)
+            except Exception as e:
+                self.logger.warning("there is something error. please check it. %s", e)
+            remove_path(cur_file)
+        return buffer
+@recursion_depth_decorator("move2device_exec")
+def move2device_exec(obj, device):
+    if isinstance(obj, (tuple, list)):
+        data_list = [move2device_exec(val, device) for val in obj]
+        return data_list if isinstance(obj, list) else tuple(data_list)
+    if isinstance(obj, dict):
+        return {key: move2device_exec(val, device) for key, val in obj.items()}
+    elif isinstance(obj, torch.Tensor):
+        obj = obj.detach()
+        if obj.device.type != device:
+            obj = obj.to(device)
+        return obj
+    elif "return_types" in str(type(obj)):
+        return move2device_exec(tuple(obj), device)
+    elif isinstance(obj, torch._C.device):
+        return torch.device(device)
+    else:
+        return obj
+def move2target_device(buffer: ApiData, target_device):
+    # handle args
+    new_args = move2device_exec(buffer.args, target_device)
+    # handle kwargs
+    new_kwargs = move2device_exec(buffer.kwargs, target_device)
+    # handle result
+    new_results = move2device_exec(buffer.result, target_device)
+    if target_device == torch.device('cpu') or target_device == "cpu":
+        return ApiData(buffer.name, tuple(new_args), new_kwargs, new_results, buffer.step, buffer.rank)
+    else:
+        return ApiData(buffer.name, tuple(new_args), new_kwargs, buffer.result, buffer.step, buffer.rank)

mindstudio-probe 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl

mindstudio-probe 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl