PyPI - mindstudio-probe - Versions diffs - 8.3.0__py3-none-any.whl → 8.3.1__py3-none-any.whl - Mend

mindstudio-probe 8.3.0__py3-none-any.whl → 8.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py CHANGED Viewed

@@ -51,8 +51,6 @@ from msprobe.pytorch.pt_config import parse_json_config
 from msprobe.core.common.const import Const, FileCheckConst, CompareConst
 from msprobe.core.common.utils import safe_get_value, CompareException, is_int, check_op_str_pattern_valid
 from msprobe.pytorch.common.utils import seed_all
-from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.attl import ATTL, ATTLConfig, move2device_exec
-from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.device_dispatch import ConsumerDispatcher
 from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import generate_cpu_params, generate_device_params, \
     ExecParams
@@ -90,27 +88,22 @@ seed_all()
 def run_ut(config):
     logger.info("start UT test")
-    if config.online_config.is_online:
-        logger.info(f"UT task result will be saved in {config.result_csv_path}".replace(".csv", "_rank*.csv"))
-        logger.info(f"UT task details will be saved in {config.details_csv_path}".replace(".csv", "_rank*.csv"))
-    else:
-        logger.info(f"UT task result will be saved in {config.result_csv_path}")
-        logger.info(f"UT task details will be saved in {config.details_csv_path}")
+    logger.info(f"UT task result will be saved in {config.result_csv_path}")
+    logger.info(f"UT task details will be saved in {config.details_csv_path}")
     if config.save_error_data:
         logger.info(f"UT task error_data will be saved in {config.error_data_path}")
     compare = Comparator(config.result_csv_path, config.details_csv_path, config.is_continue_run_ut, config=config)
-    if config.online_config.is_online:
-        run_api_online(config, compare)
-    else:
-        csv_df = read_csv(config.result_csv_path)
-        try:
-            api_name_set = {row[0] for row in csv_df.itertuples(index=False, name=None)}
-        except IndexError:
-            logger.error(f"Read {config.result_csv_path} error, api_name_set is empty.")
-            api_name_set = set()
-        run_api_offline(config, compare, api_name_set)
+    csv_df = read_csv(config.result_csv_path)
+    try:
+        api_name_set = {row[0] for row in csv_df.itertuples(index=False, name=None)}
+    except IndexError:
+        logger.error(f"Read {config.result_csv_path} error, api_name_set is empty.")
+        api_name_set = set()
+    run_api_offline(config, compare, api_name_set)
     for result_csv_path, details_csv_path in zip(compare.save_path_list, compare.detail_save_path_list):
         change_mode(result_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
         change_mode(details_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
@@ -164,60 +157,6 @@ def run_api_offline(config, compare, api_name_set):
             gc.collect()
-def run_api_online(config, compare):
-    attl = init_attl(config.online_config)
-    dispatcher = ConsumerDispatcher(compare=compare)
-    dispatcher.start(handle_func=run_torch_api_online, config=config)
-    def tcp_communication_flow():
-        while True:
-            api_data = attl.recv()
-            if api_data == 'STOP_':
-                continue
-            if api_data == 'KILL_':
-                time.sleep(1)
-                logger.info("==========接收到STOP信号==========")
-                dispatcher.stop()
-                attl.stop_serve()
-                time.sleep(1)
-                break
-            if not isinstance(api_data, ApiData):
-                continue
-            api_full_name = api_data.name
-            _, api_name = extract_basic_api_segments(api_full_name)
-            if blacklist_and_whitelist_filter(api_name, config.black_list, config.white_list):
-                continue
-            if api_data.rank in config.online_config.rank_list:
-                dispatcher.update_consume_queue(api_data)
-    def shared_storage_communication_flow():
-        flag_num = -1
-        while True:
-            api_data = attl.download()
-            if api_data == "start":
-                if flag_num == -1:
-                    flag_num += 1
-                flag_num += 1
-            if api_data == "end":
-                flag_num -= 1
-            if flag_num == 0:
-                dispatcher.stop()
-                break
-            if not isinstance(api_data, ApiData):
-                continue
-            api_full_name = api_data.name
-            _, api_name = extract_basic_api_segments(api_full_name)
-            if blacklist_and_whitelist_filter(api_name, config.black_list, config.white_list):
-                continue
-            if api_data.rank in config.online_config.rank_list:
-                dispatcher.update_consume_queue(api_data)
-    if config.online_config.nfs_path:
-        shared_storage_communication_flow()
-    else:
-        tcp_communication_flow()
 def blacklist_and_whitelist_filter(api_name, black_list, white_list):
     """
     run api(api_name) if api_name not in black_list and in white_list.
@@ -315,21 +254,6 @@ def run_torch_api(api_full_name, real_data_path, backward_content, api_info_dict
     return UtDataInfo(bench_grad_out, device_grad_out, device_out, out, bench_grad, in_fwd_data_list, backward_message)
-def run_torch_api_online(api_full_name, api_data, backward_content):
-    in_fwd_data_list = []
-    api_type, api_name = extract_basic_api_segments(api_full_name)
-    args, kwargs, out = api_data.args, api_data.kwargs, api_data.result
-    in_fwd_data_list.append(args)
-    in_fwd_data_list.append(kwargs)
-    if kwargs.get("device"):
-        del kwargs["device"]
-    device_exec_params = ExecParams(api_type, api_name, current_device, args, kwargs, False, None)
-    device_out = exec_api(device_exec_params)
-    device_out = move2device_exec(device_out, "cpu")
-    return UtDataInfo(None, None, out, device_out, None, in_fwd_data_list, None, rank=api_data.rank)
 def check_need_grad(api_info_dict):
     need_grad = True
     if api_info_dict.get(Const.INPUT_KWARGS) and "out" in api_info_dict.get(Const.INPUT_KWARGS):
@@ -389,16 +313,6 @@ def initialize_save_error_data(error_data_path):
     return error_data_path
-def init_attl(config):
-    """config: OnlineConfig"""
-    attl = ATTL('gpu', ATTLConfig(is_benchmark_device=True,
-                                  connect_ip=config.host,
-                                  connect_port=config.port,
-                                  nfs_path=config.nfs_path,
-                                  tls_path=config.tls_path))
-    return attl
 def _run_ut_parser(parser):
     parser.add_argument("-api_info", "--api_info_file", dest="api_info_file", default="", type=str,
                         help="<Optional> The api param tool result file: generate from api param tool, "
@@ -481,38 +395,6 @@ def _run_ut(parser=None):
     _run_ut_parser(parser)
     args = parser.parse_args(sys.argv[1:])
     run_ut_command(args)
-def checked_online_config(online_config):
-    if not online_config.is_online:
-        return
-    if not isinstance(online_config.is_online, bool):
-        raise ValueError("is_online must be bool type")
-    # rank_list
-    if not isinstance(online_config.rank_list, list):
-        raise ValueError("rank_list must be a list")
-    if online_config.rank_list and not all(isinstance(rank, int) for rank in online_config.rank_list):
-        raise ValueError("All elements in rank_list must be integers")
-    # nfs_path
-    if online_config.nfs_path:
-        check_file_or_directory_path(online_config.nfs_path, isdir=True)
-        return
-    # tls_path
-    if online_config.tls_path:
-        check_file_or_directory_path(online_config.tls_path, isdir=True)
-        check_file_or_directory_path(os.path.join(online_config.tls_path, "server.key"))
-        check_file_or_directory_path(os.path.join(online_config.tls_path, "server.crt"))
-        check_file_or_directory_path(os.path.join(online_config.tls_path, "ca.crt"))
-        crl_path = os.path.join(online_config.tls_path, "crl.pem")
-        if os.path.exists(crl_path):
-            check_file_or_directory_path(crl_path)
-    # host and port
-    if not isinstance(online_config.host, str) or not re.match(Const.ipv4_pattern, online_config.host):
-        raise Exception(f"host: {online_config.host} is invalid.")
-    if not isinstance(online_config.port, int) or not (0 < online_config.port <= 65535):
-        raise Exception(f"port: {online_config.port} is invalid, port range 0-65535.")
 def run_ut_command(args):
@@ -525,7 +407,7 @@ def run_ut_command(args):
     else:
         checker_config = CheckerConfig()
-    if not checker_config.is_online and not args.api_info_file:
+    if not args.api_info_file:
         logger.error("Please provide api_info_file for offline run ut.")
         raise Exception("Please provide api_info_file for offline run ut.")
@@ -588,8 +470,6 @@ def run_ut_command(args):
             global UT_ERROR_DATA_DIR
             UT_ERROR_DATA_DIR = 'ut_error_data' + time_info
         error_data_path = initialize_save_error_data(error_data_path)
-    online_config = checker_config.get_online_config()
-    checked_online_config(online_config)
     config_params = {
         'forward_content': forward_content,
         'backward_content': backward_content,

msprobe/pytorch/common/utils.py CHANGED Viewed

@@ -337,56 +337,6 @@ def save_pt(tensor, filepath):
     change_mode(filepath, FileCheckConst.DATA_FILE_AUTHORITY)
-class TypeCheckingUnpickler(pickle.Unpickler):
-    """
-    This class is a subclass of pickle.Unpickler, which is used to unpickle pickled objects.
-    It overrides the find_class method to add type checking functionality.
-    """
-    allowed_types = [
-        "str",
-        "ApiData",
-        "OrderedDict",
-        "_rebuild_tensor_v2",  # from torch.utils
-        "_load_from_bytes"  # from torch.storage
-    ]
-    def find_class(self, module, name):
-        """
-        Method to find the class of the object to be unpickled.
-        Throws pickle.UnpicklingError If the object type is not in the allowed types list.
-        """
-        if name in self.allowed_types:
-            return super().find_class(module, name)
-        raise pickle.UnpicklingError("Unsupported object type: {}.{}".format(module, name))
-def save_pkl(tensor, filepath):
-    """Save ApiData or str objection by pickle"""
-    check_path_before_create(filepath)
-    filepath = os.path.realpath(filepath)
-    try:
-        with FileOpen(filepath, 'wb') as f:
-            pickle.dump(tensor, f)
-    except Exception as e:
-        logger.error("Save pt file failed, please check according possible error causes: "
-                     "1. out of disk space or disk error, "
-                     "2. no permission to write files, etc.")
-        raise RuntimeError(f"save pt file {filepath} failed") from e
-    change_mode(filepath, FileCheckConst.DATA_FILE_AUTHORITY)
-def load_pkl(pt_path):
-    """Load ApiData or str objection by pickle for accuracy_checker_online"""
-    check_file_or_directory_path(pt_path)
-    pt_path = os.path.realpath(pt_path)
-    try:
-        with FileOpen(pt_path, 'rb') as f:
-            pt = TypeCheckingUnpickler(f).load()
-    except Exception as e:
-        raise RuntimeError(f"load pt file {pt_path} failed: {e}") from e
-    return pt
 def is_recomputation():
     """Check if the current operation is in the re-computation phase.
@@ -471,23 +421,3 @@ def register_forward_hook(module, forward_hook):
         module.register_forward_hook(forward_hook, with_kwargs=True)
     else:
         module.register_forward_hook(forward_hook)
-def save_api_data(api_data):
-    """Save data to io stream"""
-    try:
-        io_buff = io.BytesIO()
-        torch.save(api_data, io_buff)
-    except Exception as e:
-        raise RuntimeError(f"save api_data to io_buff failed") from e
-    return io_buff
-def load_api_data(api_data_bytes):
-    """Load data from bytes stream"""
-    try:
-        buffer = io.BytesIO(api_data_bytes)
-        buffer = torch.load(buffer, map_location="cpu")
-    except Exception as e:
-        raise RuntimeError(f"load api_data from bytes failed") from e
-    return buffer

msprobe/pytorch/debugger/debugger_config.py CHANGED Viewed

@@ -48,16 +48,6 @@ class DebuggerConfig:
                 "max_sample": task_config.max_sample
             }
-        self.online_run_ut = False
-        if self.task == Const.TENSOR:
-            # dump api tensor and collaborate with online run_ut
-            self.online_run_ut = task_config.online_run_ut if task_config.online_run_ut else False
-            self.nfs_path = task_config.nfs_path if task_config.nfs_path else ""
-            self.tls_path = task_config.tls_path if task_config.tls_path else ""
-            self.host = task_config.host if task_config.host else ""
-            self.port = task_config.port if task_config.port else -1
-            self.online_run_ut_recompute = task_config.online_run_ut_recompute \
-                if isinstance(task_config.online_run_ut_recompute, bool) else False
         self.check()
         self._check_statistics_config(task_config)

msprobe/pytorch/dump/module_dump/module_processer.py CHANGED Viewed

@@ -63,9 +63,11 @@ def wrap_forward_with_hook_safety(module):
         except _StopRecomputationError as e:
             exception_output = None
             if len(module._forward_hooks.values()) > 0:
-                # msprobe的forward_hook会出现在第一个，仅执行msprobe的forward_hook
-                hook_fn = list(module._forward_hooks.values())[0]
-                hook_fn(module, args, kwargs, exception_output)
+                # 仅执行msprobe的forward_hook, hook名称必然包含'ModuleProcesser.'
+                for hook_fn in module._forward_hooks.values():
+                    if 'ModuleProcesser' in str(hook_fn):
+                        hook_fn(module, args, kwargs, exception_output)
+                        break
             raise e
     if torch_version_above_or_equal_21:
@@ -152,7 +154,13 @@ class ModuleProcesser:
         modules_and_names_with_index = self.get_modules_and_names(models, recursive, module_names)
         for index, modules_and_names in modules_and_names_with_index.items():
             model = models if index == "-1" else models[int(index)]
+            model_list = []
             for name, module in modules_and_names:
+                model_list.append((name, module))
+            is_verl = "verl" in sys.modules
+            for idx, (name, module) in enumerate(model_list):
                 if recursive and module == model:
                     continue
                 if not is_torch_nn_module(module):
@@ -163,6 +171,13 @@ class ModuleProcesser:
                     continue
                 if module.__class__.__name__ == "FullyShardedDataParallel":
                     continue
+                # verl 场景下跳过第一层和最后一层
+                if is_verl and (idx == 1 or idx == len(model_list) - 1):
+                    logger.warning(f"The module {name} is the first or last layer in verl scenario, "
+                                   f"the data dump for this module will be skipped.")
+                    continue
                 setattr(module, 'msprobe_hook', True)
                 module_index = (index + Const.SEP) if index != "-1" else ""
                 prefix_name = f'{BaseScope.Module_Type_Module}{Const.SEP}{module_index}{name}{Const.SEP}' + \

msprobe/pytorch/hook_module/api_register.py CHANGED Viewed

@@ -135,13 +135,17 @@ def redirect_wait():
                 store_func = dist_data_collect_func.pop(args[0])
                 store_func()
                 return
+            remove_value = None
             for value in dist_batch_data_collect_func:
                 if args[0] in value[0]:
                     value[0].remove(args[0])
                     if len(value[0]) == 0:
                         store_func = value[1]
                         store_func()
-                    return
+                        remove_value = value
+                    break
+            if remove_value:
+                dist_batch_data_collect_func.remove(remove_value)
         return wrapped_wait

msprobe/pytorch/monitor/module_hook.py CHANGED Viewed

@@ -48,12 +48,10 @@ from msprobe.pytorch.monitor.module_metric import get_metrics, get_summary_write
 from msprobe.pytorch.monitor.optimizer_collect import OptimizerMonFactory
 from msprobe.pytorch.monitor.visualizer import HeatmapVisualizer
 torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
 if not torch_version_above_or_equal_2:
     raise ValueError("monitor require torch>=2.0")
 FORMAT_MAPPING = {
     MonitorConst.TENSORBOARD: SummaryWriterWithAD,
     MonitorConst.CSV: CSVWriterWithAD,
@@ -150,15 +148,11 @@ class GradContext:
     def __init__(self) -> None:
         self.pre = {}
         self.post = {}
-        self.acc_metric = {}
-        self.acc = {}
         self.actv = {}
     def reset(self):
         self.pre.clear()
         self.post.clear()
-        self.acc_metric.clear()
-        self.acc.clear()
         self.actv.clear()
@@ -510,18 +504,8 @@ class TrainerMon:
         if not self.wg_distribution:
             return {}, {}
-        if self.weight_hooked:
-            get_metrics(self.ops, self.grad_context.acc, self.eps, self.grad_context.acc_metric)
         get_metrics(self.ops, post_grad_dict, self.eps, self.grad_context.post)
-        reduced_grad = self.grad_context.post
-        if self.weight_hooked:
-            unreduced_grad = self.grad_context.acc_metric
-        else:
-            unreduced_grad = self.grad_context.pre
-        return reduced_grad, unreduced_grad
+        return self.grad_context.post, self.grad_context.pre
     def generate_xy_metrics(self):
         actv = {}
@@ -529,7 +513,6 @@ class TrainerMon:
             actv.update(fwd_context.actv)
         actv_grad = self.grad_context.actv
         return actv, actv_grad
     def reload_xy(self, xy_distribution=False):
@@ -607,11 +590,8 @@ class TrainerMon:
         if not self.wg_distribution:
             return
-        if self.weight_hooked:
-            self.summary_writer.write_metrics(self.ops, self.grad_context.acc_metric, step, 'grad_unreduced',
-                                              use_micro_step=self.monitor_mbs_grad)
-        else:
-            self.summary_writer.write_metrics(self.ops, self.grad_context.pre, step, 'grad_unreduced')
+        self.summary_writer.write_metrics(self.ops, self.grad_context.pre, step, 'grad_unreduced',
+                                          use_micro_step=self.monitor_mbs_grad)
         self.summary_writer.write_metrics(self.ops, self.grad_context.post, step, 'grad_reduced')
     def hook_optimizer(self, optimizer):
@@ -732,9 +712,9 @@ class TrainerMon:
             # 静态在第0步就可以保存, 动态在第0步不可以, 因为动态设计的就是重置后下一步开启, 第0步的self.monitoring还是False
             if self.monitoring:
                 module_rank_valid = not self.module_rank_list or (
-                            dist.is_initialized() and dist.get_rank() in self.module_rank_list)
+                        dist.is_initialized() and dist.get_rank() in self.module_rank_list)
                 step_condition = (context.step >= self.start_step and (
-                            context.step - self.start_step) % self.step_interval == 0)
+                        context.step - self.start_step) % self.step_interval == 0)
                 if module_rank_valid and step_condition:
                     self.has_collect_times += 1
@@ -791,6 +771,7 @@ class TrainerMon:
                     hook(optimizer, args, kwargs)
                 step_final_hook(optimizer, args, kwargs)
                 return out
             return wrapper
         optimizer.__class__.step = patch_step(optimizer.__class__.step, optimizer)
@@ -1013,11 +994,11 @@ class TrainerMon:
                 vpp_stage + module_name,
             ]:
                 if pattern in l2_targets:
-                    return pattern
+                    return pattern
         elif hook_name in ["linear_hook"]:
             return vpp_stage + squash_param_name(module_name, self.squash_name)
         return ""
     def _hook_module(self, target_names, l2_target_names, module: torch.nn.Module, vpp_stage=''):
         if '_modules' not in module.__dict__:
             # nothing to hook
@@ -1151,7 +1132,7 @@ class TrainerMon:
                 context.micro_step = 0
                 context.step += 1
             return
         def stack_hook(module, args, kwargs, module_output, name):
             if module not in self.module_fwd_hook_context_by_module:
                 self.module_fwd_hook_context_by_module[module] = ModuleHookContext(name)
@@ -1221,7 +1202,7 @@ class TrainerMon:
         if self.monitor_mbs_grad:
             self._hook_weights()
             return
         self.optimizer_mon.patch_grad_sync(self)
         if self.enable_megatron or self.enable_deepspeed:
@@ -1281,6 +1262,7 @@ class TrainerMon:
                 get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
                 out = foreach_reduce(fsdp_params, unsharded_grads, *unused)
                 return out
             return wrapper
         logger.info("Patch fsdp2 foreach_reduce, collect pre_grad metrics.")
@@ -1294,10 +1276,9 @@ class TrainerMon:
         """
         遍历参数的梯度生成函数（grad_acc），并挂载hook，以便在该参数所有梯度计算后，采集通信聚合前梯度数据。
         """
-        context = self.grad_context
         @torch.no_grad
-        def param_hook(*args, context_dict, param, name):
+        def param_hook(*args, param, name):
             key = name
             if self.monitor_mbs_grad:
                 key += f'{MonitorConst.NAME_SEP}{param.micro_step}'
@@ -1305,14 +1286,15 @@ class TrainerMon:
             key = get_summary_writer_tag_name(key, 'acc_grad', self.rank)
             self.register_param_call_id("param_hook", key)
             param.micro_step += 1
+            grad_dict = {}
             if self.monitor_mbs_grad or (param.micro_step == self.micro_batch_number):
                 if self.params_have_main_grad:
                     grad = param.main_grad
                 else:
                     grad = param.grad
-                context_dict[key] = grad.clone()
+                grad_dict[key] = grad.clone()
+            get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
             if param.micro_step == self.micro_batch_number:
                 param.micro_step = 0
@@ -1322,7 +1304,7 @@ class TrainerMon:
             param_tmp = param.expand_as(param)
             grad_acc = param_tmp.grad_fn.next_functions[0][0]
             handle = grad_acc.register_hook(
-                partial(param_hook, context_dict=context.acc, param=param, name=name))
+                partial(param_hook, param=param, name=name))
             self.grad_accs.append(grad_acc)
             self.handles['wgrads'].append(handle)

msprobe/pytorch/pt_config.py CHANGED Viewed

@@ -35,48 +35,15 @@ from msprobe.pytorch.hook_module.utils import get_ops
 class TensorConfig(BaseConfig):
     def __init__(self, json_config):
         super().__init__(json_config)
-        self.online_run_ut = json_config.get("online_run_ut", False)
-        self.nfs_path = json_config.get("nfs_path", "")
-        self.host = json_config.get("host", "")
-        self.port = json_config.get("port", -1)
-        self.tls_path = json_config.get("tls_path", "./")
-        self.online_run_ut_recompute = json_config.get("online_run_ut_recompute", False)
         self.check_config()
         self._check_summary_mode()
         self._check_file_format()
-        if self.online_run_ut:
-            self._check_online_run_ut()
     def _check_file_format(self):
         if self.file_format is not None and self.file_format not in ["npy", "bin"]:
             raise Exception("file_format is invalid")
-    def _check_online_run_ut(self):
-        if not isinstance(self.online_run_ut, bool):
-            raise Exception(f"online_run_ut: {self.online_run_ut} is invalid.")
-        if not isinstance(self.online_run_ut_recompute, bool):
-            raise Exception(f"online_run_ut_recompute: {self.online_run_ut_recompute} is invalid.")
-        if self.nfs_path:
-            check_file_or_directory_path(self.nfs_path, isdir=True)
-            return
-        if self.tls_path:
-            check_file_or_directory_path(self.tls_path, isdir=True)
-            check_file_or_directory_path(os.path.join(self.tls_path, "client.key"))
-            check_file_or_directory_path(os.path.join(self.tls_path, "client.crt"))
-            check_file_or_directory_path(os.path.join(self.tls_path, "ca.crt"))
-            crl_path = os.path.join(self.tls_path, "crl.pem")
-            if os.path.exists(crl_path):
-                check_file_or_directory_path(crl_path)
-        if not isinstance(self.host, str) or not re.match(Const.ipv4_pattern, self.host):
-            raise Exception(f"host: {self.host} is invalid.")
-        if not isinstance(self.port, int) or not (0 < self.port <= 65535):
-            raise Exception(f"port: {self.port} is invalid, port range 0-65535.")
 class StatisticsConfig(BaseConfig):
     def __init__(self, json_config):
@@ -257,12 +224,7 @@ class RunUTConfig(BaseConfig):
         self.white_list = json_config.get("white_list", Const.DEFAULT_LIST)
         self.black_list = json_config.get("black_list", Const.DEFAULT_LIST)
         self.error_data_path = json_config.get("error_data_path", Const.DEFAULT_PATH)
-        self.is_online = json_config.get("is_online", False)
-        self.nfs_path = json_config.get("nfs_path", "")
-        self.host = json_config.get("host", "")
-        self.port = json_config.get("port", -1)
-        self.rank_list = json_config.get("rank_list", Const.DEFAULT_LIST)
-        self.tls_path = json_config.get("tls_path", "./")
         self.check_run_ut_config()
     @classmethod
@@ -280,22 +242,11 @@ class RunUTConfig(BaseConfig):
         if not os.path.exists(error_data_path):
             raise Exception("error_data_path: %s does not exist" % error_data_path)
-    @classmethod
-    def check_nfs_path_config(cls, nfs_path):
-        if nfs_path:
-            FileChecker(nfs_path, FileCheckConst.DIR, FileCheckConst.READ_ABLE).common_check()
-    @classmethod
-    def check_tls_path_config(cls, tls_path):
-        if tls_path:
-            FileChecker(tls_path, FileCheckConst.DIR, FileCheckConst.READ_ABLE).common_check()
     def check_run_ut_config(self):
         RunUTConfig.check_filter_list_config(Const.WHITE_LIST, self.white_list)
         RunUTConfig.check_filter_list_config(Const.BLACK_LIST, self.black_list)
         RunUTConfig.check_error_data_path_config(self.error_data_path)
-        RunUTConfig.check_nfs_path_config(self.nfs_path)
-        RunUTConfig.check_tls_path_config(self.tls_path)
 class GradToolConfig(BaseConfig):

msprobe/pytorch/pytorch_service.py CHANGED Viewed

@@ -15,9 +15,8 @@
 from msprobe.core.common.utils import Const
 from msprobe.core.service import BaseService
-from msprobe.pytorch.attl_manager import ATTLManager
 from msprobe.pytorch.common.log import logger
-from msprobe.pytorch.common.utils import get_rank_if_initialized, torch_version_above_or_equal_2
+from msprobe.pytorch.common.utils import get_rank_if_initialized
 from msprobe.pytorch.dump.module_dump.module_processer import ModuleProcesser
 from msprobe.pytorch.hook_module.api_register import get_api_register, ApiTemplate, redirect_wait
 from msprobe.pytorch.hook_module.hook_module import HOOKModule
@@ -25,9 +24,6 @@ from msprobe.pytorch.hook_module.pt_hook_manager import PytorchHookManager
 from msprobe.pytorch.hook_module.register_optimizer_hook import register_optimizer_hook
 from msprobe.pytorch.hook_module.script_wrapper import wrap_script_func, preprocess_func
-if torch_version_above_or_equal_2:
-    from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.dump_dispatch import run_ut_dispatch
 class PytorchService(BaseService):
     @property
@@ -45,12 +41,10 @@ class PytorchService(BaseService):
         self.logger = logger
         self.api_register = get_api_register()
         self.module_processor = ModuleProcesser(self.data_collector.scope)
-        self.attl_manager = ATTLManager(self.config)
-        self.hook_manager = PytorchHookManager(self.data_collector, self.config, self.attl_manager)
+        self.hook_manager = PytorchHookManager(self.data_collector, self.config)
         self.api_template = ApiTemplate
     def _register_hook(self):
-        self.attl_manager.attl_init()
         if self._is_mix_level:
             register_optimizer_hook(self.data_collector)
@@ -65,9 +59,6 @@ class PytorchService(BaseService):
         self.module_processor.register_module_hook(self.model, self.build_hook)
         self.logger.info(f"The module {self.config.task} hook function is successfully mounted to the model.")
-    def _run_ut_dispatch(self, status):
-        if torch_version_above_or_equal_2:
-            run_ut_dispatch(self.attl_manager.attl, status, self.config.online_run_ut_recompute)
     def _reset_status(self):
         super()._reset_status()

msprobe/visualization/builder/graph_builder.py CHANGED Viewed

@@ -298,8 +298,8 @@ class GraphBuilder:
         no_recompute_map = GraphBuilder._get_no_recompute_map(graph, id_prefixes)
         if not no_recompute_map:
             return
-        # 深拷贝非重计算节点字典用于反向模式
-        no_recompute_ids_b = copy.deepcopy(no_recompute_map)
+        # 拷贝非重计算节点字典用于反向模式
+        no_recompute_ids_b = {node_id: list(node_list) for node_id, node_list in no_recompute_map.items()}
         del_indexes = []
         for node_id, id_prefix in recompute_map.items():

mindstudio-probe 8.3.0__py3-none-any.whl → 8.3.1__py3-none-any.whl

mindstudio-probe 8.3.0__py3-none-any.whl → 8.3.1__py3-none-any.whl