mindstudio-probe: mindstudio_probe-8.1.1-py3-none-any.whl → mindstudio_probe-8.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.1.1.dist-info → mindstudio_probe-8.1.2.dist-info}/METADATA +1 -1
- {mindstudio_probe-8.1.1.dist-info → mindstudio_probe-8.1.2.dist-info}/RECORD +95 -94
- msprobe/core/common/const.py +3 -0
- msprobe/core/common/file_utils.py +45 -5
- msprobe/core/common/utils.py +117 -13
- msprobe/core/common_config.py +15 -1
- msprobe/core/compare/acc_compare.py +21 -9
- msprobe/core/compare/compare_cli.py +10 -2
- msprobe/core/compare/merge_result/merge_result.py +1 -1
- msprobe/core/compare/utils.py +8 -2
- msprobe/core/config_check/checkers/base_checker.py +2 -0
- msprobe/core/config_check/checkers/hyperparameter_checker.py +5 -4
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +4 -1
- msprobe/core/config_check/config_check_cli.py +1 -1
- msprobe/core/config_check/config_checker.py +1 -2
- msprobe/core/data_dump/data_collector.py +4 -1
- msprobe/core/data_dump/data_processor/mindspore_processor.py +23 -1
- msprobe/core/data_dump/data_processor/pytorch_processor.py +3 -25
- msprobe/core/debugger/precision_debugger.py +13 -8
- msprobe/core/hook_manager.py +112 -82
- msprobe/core/monitor/utils.py +338 -0
- msprobe/core/service.py +2 -1
- msprobe/core/single_save/single_comparator.py +5 -3
- msprobe/docs/01.installation.md +1 -0
- msprobe/docs/05.data_dump_PyTorch.md +4 -4
- msprobe/docs/07.accuracy_checker_PyTorch.md +14 -11
- msprobe/docs/09.accuracy_checker_MindSpore.md +13 -11
- msprobe/docs/10.accuracy_compare_PyTorch.md +3 -1
- msprobe/docs/11.accuracy_compare_MindSpore.md +4 -2
- msprobe/docs/12.overflow_check_PyTorch.md +3 -2
- msprobe/docs/13.overflow_check_MindSpore.md +1 -1
- msprobe/docs/14.data_parse_PyTorch.md +35 -32
- msprobe/docs/21.visualization_PyTorch.md +9 -8
- msprobe/docs/22.visualization_MindSpore.md +1 -0
- msprobe/docs/23.generate_operator_PyTorch.md +1 -1
- msprobe/docs/24.code_mapping_Mindspore.md +6 -5
- msprobe/docs/31.config_check.md +15 -5
- msprobe/docs/33.generate_operator_MindSpore.md +2 -2
- msprobe/docs/34.RL_collect.md +18 -9
- msprobe/docs/35.nan_analyze.md +4 -3
- msprobe/docs/FAQ.md +3 -0
- msprobe/docs/img/ms_layer.png +0 -0
- msprobe/mindspore/api_accuracy_checker/api_runner.py +29 -1
- msprobe/mindspore/cell_processor.py +35 -14
- msprobe/mindspore/code_mapping/bind.py +23 -4
- msprobe/mindspore/code_mapping/graph_parser.py +6 -4
- msprobe/mindspore/common/utils.py +3 -0
- msprobe/mindspore/compare/common_dir_compare.py +32 -12
- msprobe/mindspore/compare/ms_graph_compare.py +7 -2
- msprobe/mindspore/compare/utils.py +9 -1
- msprobe/mindspore/debugger/debugger_config.py +13 -11
- msprobe/mindspore/debugger/precision_debugger.py +67 -45
- msprobe/mindspore/dump/dump_tool_factory.py +2 -0
- msprobe/mindspore/dump/hook_cell/hook_cell.py +14 -9
- msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +12 -7
- msprobe/mindspore/dump/hook_cell/primitive_hooks.py +27 -13
- msprobe/mindspore/dump/jit_dump.py +6 -3
- msprobe/mindspore/dump/kernel_kbyk_dump.py +13 -6
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +6 -5
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +2 -2
- msprobe/mindspore/grad_probe/grad_analyzer.py +2 -0
- msprobe/mindspore/mindspore_service.py +2 -2
- msprobe/mindspore/monitor/common_func.py +1 -1
- msprobe/mindspore/monitor/module_hook.py +3 -3
- msprobe/mindspore/monitor/utils.py +0 -252
- msprobe/mindspore/ms_config.py +0 -1
- msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
- msprobe/nan_analyze/graph.py +4 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +15 -6
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +1 -1
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +1 -1
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +2 -4
- msprobe/pytorch/common/utils.py +0 -16
- msprobe/pytorch/compare/pt_compare.py +5 -0
- msprobe/pytorch/debugger/debugger_config.py +12 -5
- msprobe/pytorch/debugger/precision_debugger.py +8 -1
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +1 -3
- msprobe/pytorch/dump/module_dump/module_processer.py +44 -13
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +2 -0
- msprobe/pytorch/hook_module/hook_module.py +9 -9
- msprobe/pytorch/hook_module/pt_hook_manager.py +7 -7
- msprobe/pytorch/monitor/csv2tb.py +3 -10
- msprobe/pytorch/monitor/features.py +5 -0
- msprobe/pytorch/monitor/module_hook.py +6 -7
- msprobe/pytorch/monitor/module_metric.py +0 -3
- msprobe/pytorch/monitor/optimizer_collect.py +1 -1
- msprobe/pytorch/monitor/utils.py +1 -317
- msprobe/pytorch/online_dispatch/dispatch.py +1 -1
- msprobe/pytorch/online_dispatch/dump_compare.py +7 -1
- msprobe/pytorch/parse_tool/lib/utils.py +2 -4
- msprobe/visualization/graph_service.py +1 -1
- {mindstudio_probe-8.1.1.dist-info → mindstudio_probe-8.1.2.dist-info}/LICENSE +0 -0
- {mindstudio_probe-8.1.1.dist-info → mindstudio_probe-8.1.2.dist-info}/WHEEL +0 -0
- {mindstudio_probe-8.1.1.dist-info → mindstudio_probe-8.1.2.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-8.1.1.dist-info → mindstudio_probe-8.1.2.dist-info}/top_level.txt +0 -0
msprobe/pytorch/dump/module_dump/module_processer.py:

@@ -13,12 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import threading
+import sys
 from collections import OrderedDict
 
 import torch
 from torch.utils.hooks import BackwardHook, RemovableHandle
 
 from msprobe.core.common.const import Const
+from msprobe.core.common.utils import ModuleQueue, ThreadSafe
 from msprobe.core.data_dump.scope import BaseScope, ModuleRangeScope, MixRangeScope
 from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.common.utils import is_torch_nn_module, register_forward_pre_hook
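`ModuleQueue` and `ThreadSafe` are new helpers exported by `msprobe/core/common/utils.py` (whose +117 lines are not expanded in this diff). Judging only from the call sites below — `ThreadSafe.synchronized` as a hook decorator, `ThreadSafe.acquire()` in `HOOKModule.__init__`, and `ModuleQueue.add_name`/`remove_name`/`find_last` for cross-thread parent lookup — a minimal sketch could look like this; every name and behavior beyond those call sites is an assumption, not the shipped implementation:

```python
import threading
from functools import wraps


class ThreadSafe:
    """Sketch: one shared re-entrant lock guarding hook bookkeeping."""
    _lock = threading.RLock()

    @classmethod
    def acquire(cls):
        cls._lock.acquire()

    @classmethod
    def release(cls):
        cls._lock.release()

    @classmethod
    def synchronized(cls, func):
        # Decorator form used on the forward/backward hooks below.
        @wraps(func)
        def wrapper(*args, **kwargs):
            with cls._lock:
                return func(*args, **kwargs)
        return wrapper


class ModuleQueue:
    """Sketch: ordered record of module names currently in flight, used to
    guess a parent when the current thread's own stack is empty."""

    def __init__(self):
        self._names = []
        self._lock = threading.Lock()

    def add_name(self, name):
        with self._lock:
            self._names.append(name)

    def remove_name(self, name):
        with self._lock:
            if name in self._names:
                self._names.remove(name)

    def find_last(self, name):
        # Most recently added name other than `name`, or None.
        with self._lock:
            for candidate in reversed(self._names):
                if candidate != name:
                    return candidate
        return None
```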
@@ -46,13 +49,15 @@ def wrap_megatron_deallocate(func):
             out.data = torch.empty((1,), device=out.device, dtype=out.dtype, )
             return func(out_clone, deallocate_pipeline_outputs)
         return func(out, deallocate_pipeline_outputs)
+
     return wrapper_func
 
 
 class ModuleProcesser:
+    module_queue = ModuleQueue()
     module_count = {}
-    module_stack =
-    api_parent_node =
+    module_stack = {}
+    api_parent_node = {}
     module_node = {}
     module_bw_hook_kernels = {}
     module_with_backward_hook = {}
@@ -64,7 +69,15 @@ class ModuleProcesser:
         replace_checkpoint()
         try:
             from megatron.core.pipeline_parallel import schedules
+            origin_func_id = id(schedules.deallocate_output_tensor)
             schedules.deallocate_output_tensor = wrap_megatron_deallocate(schedules.deallocate_output_tensor)
+            for module in list(sys.modules.values()):
+                if module.__name__ == 'schedules':
+                    continue
+                for func in module.__dict__:
+                    if id(module.__dict__[func]) == origin_func_id:
+                        module.__setattr__(func, schedules.deallocate_output_tensor)
+                        logger.debug(f'patch {module.__name__}.{func}.')
             logger.info_on_rank_0("Patch megatron method success.")
         except ImportError:
             logger.info_on_rank_0("No megatron find.")
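Capturing `id(schedules.deallocate_output_tensor)` before wrapping lets the new loop also rebind modules that imported the function by value (`from ... import deallocate_output_tensor`); reassigning the attribute on `schedules` alone would leave such aliases calling the unwrapped original. A self-contained illustration of that pitfall, using hypothetical module names:

```python
import sys
import types

# Hypothetical stand-ins for megatron's `schedules` and a module that
# imported the function by value rather than via the module attribute.
schedules = types.ModuleType('schedules')
schedules.deallocate_output_tensor = lambda out: 'original'

consumer = types.ModuleType('consumer')
consumer.deallocate_output_tensor = schedules.deallocate_output_tensor
sys.modules['consumer'] = consumer

origin_func_id = id(schedules.deallocate_output_tensor)
schedules.deallocate_output_tensor = lambda out: 'wrapped'

# The by-value alias still points at the original function...
assert consumer.deallocate_output_tensor(None) == 'original'

# ...so the patch sweeps sys.modules and rebinds anything with the old id.
for module in list(sys.modules.values()):
    if module is None:
        continue
    for name, value in list(getattr(module, '__dict__', {}).items()):
        if id(value) == origin_func_id:
            setattr(module, name, schedules.deallocate_output_tensor)

assert consumer.deallocate_output_tensor(None) == 'wrapped'
sys.modules.pop('consumer')  # clean up the demo entry
```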
@@ -103,9 +116,10 @@ class ModuleProcesser:
 
     @classmethod
     def reset_module_stats(cls):
+        cls.module_queue = ModuleQueue()
         cls.module_count = {}
-        cls.module_stack =
-        cls.api_parent_node =
+        cls.module_stack = {}
+        cls.api_parent_node = {}
         cls.module_node = {}
         cls.module_bw_hook_kernels = {}
         cls.enable_module_dump = False
@@ -144,6 +158,7 @@ class ModuleProcesser:
         register_forward_pre_hook(module, forward_pre_hook)
 
     def build_module_hook(self, module_name, build_data_hook):
+        @ThreadSafe.synchronized
         def forward_pre_hook(module, args, kwargs=None):
             if kwargs is None:
                 kwargs = {}
@@ -171,15 +186,19 @@ class ModuleProcesser:
         hook_set = build_data_hook(BaseScope.Module_Type_Module, full_forward_name)
 
         def get_backward_pre_hook(full_backward_name):
+            @ThreadSafe.synchronized
             def backward_pre_hook_fn(module, grad_output):
                 self.set_construct_info_in_pre_hook(full_backward_name)
+
             return backward_pre_hook_fn
 
         def get_backward_hook(backward_data_hook, full_backward_name):
+            @ThreadSafe.synchronized
             def backward_hook_fn(module, grad_input, grad_output):
                 new_output = backward_data_hook(module, grad_input, grad_output)
                 self.set_construct_info_in_hook(full_backward_name, is_forward=False)
                 return new_output
+
             return backward_hook_fn
 
         if not ModuleProcesser.module_with_backward_hook.get(module_name):
@@ -193,6 +212,7 @@ class ModuleProcesser:
                 args = bw_hook.setup_input_hook(args)
                 return (args, kwargs) if torch_version_above_or_equal_2 else args
 
+        @ThreadSafe.synchronized
        def forward_hook(module, args, kwargs_or_output, output_or_kwargs=None):
             if hasattr(module, 'msprobe_module_dump') and not self.enable_module_dump:
                 return output_or_kwargs if torch_version_above_or_equal_2 else kwargs_or_output
@@ -218,23 +238,34 @@ class ModuleProcesser:
         return forward_pre_hook
 
     def set_construct_info_in_pre_hook(self, full_name):
-
-
+        tid = threading.get_ident()
+        if tid not in self.module_stack:
+            ModuleProcesser.module_stack[tid] = []
+
+        if self.module_stack[tid]:
+            ModuleProcesser.module_node[full_name] = self.module_stack[tid][-1]
         else:
-            ModuleProcesser.
-
-
+            parent_name = ModuleProcesser.module_queue.find_last(full_name)
+            ModuleProcesser.module_node[full_name] = parent_name
+
+        ModuleProcesser.module_queue.add_name(full_name)
+        ModuleProcesser.module_stack[tid].append(full_name)
+        ModuleProcesser.api_parent_node[tid] = full_name
         if self.scope:
             self.scope.begin_module(full_name)
 
     def set_construct_info_in_hook(self, full_name, is_forward=True):
+        tid = threading.get_ident()
         if torch_version_above_or_equal_2 or is_forward:
-
-
-
+            ModuleProcesser.module_queue.remove_name(full_name)
+            ModuleProcesser.api_parent_node[tid] = None
+            if self.module_stack.get(tid):
+                ModuleProcesser.module_stack[tid].pop()
+            if self.module_stack.get(tid):
+                ModuleProcesser.api_parent_node[tid] = ModuleProcesser.module_stack[tid][-1]
             if self.scope:
                 self.scope.end_module(full_name)
         else:
             if self.scope:
                 self.scope.begin_module(full_name)
-            ModuleProcesser.api_parent_node = full_name
+            ModuleProcesser.api_parent_node[tid] = full_name
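The bookkeeping above keys `module_stack` and `api_parent_node` by `threading.get_ident()`, so concurrent forward/backward passes no longer pop entries that belong to another thread, and `module_queue` supplies a parent guess when a hook fires on a thread whose own stack is empty. The per-thread-stack pattern in isolation (a minimal sketch, independent of msprobe):

```python
import threading

module_stack = {}  # tid -> names of modules currently open on that thread


def push(full_name):
    tid = threading.get_ident()
    module_stack.setdefault(tid, []).append(full_name)


def pop():
    tid = threading.get_ident()
    stack = module_stack.get(tid)
    if stack:
        stack.pop()
    return stack[-1] if stack else None  # parent after unwinding


def worker(prefix):
    push(f"{prefix}.layer0")             # nested module calls...
    push(f"{prefix}.layer0.linear")
    pop()                                # ...unwind in LIFO order
    pop()


threads = [threading.Thread(target=worker, args=(f"rank{i}",)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
assert all(not stack for stack in module_stack.values())  # every stack unwound
```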
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py:

@@ -186,6 +186,8 @@ class FuzzHandler(ABC):
         ratio = self.ratio_calculate(
             origin_output, perturbed_output, norm_type=NormType.ENDLESS_NORM
         )
+        if threshold == 0:
+            raise ValueError("Threshold cannot be zero. Check `get_threshold` implementation.")
         if ratio == ThresholdConfig.SYMBOL_FLIPPING:
             is_consistent = False
         else:
msprobe/pytorch/hook_module/hook_module.py:

@@ -22,20 +22,19 @@ import torch.nn as nn
 import torch.utils.hooks as full_hooks
 
 from msprobe.core.common.runtime import Runtime
-from msprobe.
+from msprobe.core.common.utils import ThreadSafe
+from msprobe.pytorch.common.utils import register_forward_pre_hook, register_forward_hook
 
 
 class HOOKModule(nn.Module):
     module_count = defaultdict(int)
-    inner_stop_hook =
+    inner_stop_hook = defaultdict(bool)
 
     def __init__(self, hook_build_func) -> None:
         super(HOOKModule, self).__init__()
         self.has_overflow = False
-        self.
-
-        HOOKModule.inner_stop_hook[self.current_thread] = False
-        self.stop_hook = HOOKModule.inner_stop_hook.get(self.current_thread, False)
+        self.tid = threading.get_ident()
+        self.stop_hook = HOOKModule.inner_stop_hook.get(self.tid, False)
 
         if not self.stop_hook:
             self.forward_data_collected = False
@@ -43,6 +42,7 @@ class HOOKModule(nn.Module):
         if not Runtime.is_running:
             return
         prefix = self.prefix_api_name if hasattr(self, "prefix_api_name") else ""
+        ThreadSafe.acquire()
         if callable(hook_build_func):
             hook_set = hook_build_func(prefix)
             register_forward_pre_hook(self, hook_set.forward_pre_hook)
@@ -52,11 +52,11 @@ class HOOKModule(nn.Module):
     def __call__(self, *args, **kwargs):
         changed = False
         if not self.stop_hook:
-            HOOKModule.inner_stop_hook[self.
+            HOOKModule.inner_stop_hook[self.tid] = True
             changed = True
         result = self._call_func(*args, **kwargs)
         if changed:
-            HOOKModule.inner_stop_hook[self.
+            HOOKModule.inner_stop_hook[self.tid] = False
         return result
 
     @staticmethod
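With `inner_stop_hook` now a `defaultdict(bool)` keyed by thread id, `__init__` no longer needs to pre-seed the current thread's entry: the outermost hooked call on each thread raises the flag, nested calls on that same thread skip collection, and other threads are unaffected. The guard reduced to a sketch:

```python
import threading
from collections import defaultdict

inner_stop_hook = defaultdict(bool)  # tid -> "already inside a hooked call"


def hooked_call(depth=0):
    tid = threading.get_ident()
    changed = False
    if not inner_stop_hook[tid]:
        inner_stop_hook[tid] = True   # outermost call on this thread
        changed = True
    collected = changed               # only the outermost call collects data
    if depth < 2:
        hooked_call(depth + 1)        # nested calls see the flag and skip
    if changed:
        inner_stop_hook[tid] = False  # re-arm for the next top-level call
    return collected


assert hooked_call() is True
```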
@@ -104,7 +104,7 @@ class HOOKModule(nn.Module):
         else:
             return result
 
-        if
+        if not (var.requires_grad and torch.is_grad_enabled()):
             return result
 
         grad_fn = var.grad_fn
msprobe/pytorch/hook_module/pt_hook_manager.py:

@@ -23,7 +23,7 @@ from msprobe.pytorch.common.utils import is_recomputation, torch_version_above_or_equal_2
 from msprobe.pytorch.hook_module.hook_module import HOOKModule
 
 
-class PytorchHookManager(BaseHookManager):
+class PytorchHookManager(BaseHookManager):
     @property
     def _is_recompute(self):
         return is_recomputation()
@@ -41,7 +41,7 @@ class PytorchHookManager(BaseHookManager):
         kwargs = kwargs_or_output if torch_version_above_or_equal_2 else {}
         output = output_or_kwargs if torch_version_above_or_equal_2 else kwargs_or_output
         return kwargs, output
-
+
     def build_hook(self, hook_type, name):
         if hook_type == Const.API:
             full_forward_name = name + str(HOOKModule.get_module_count(name)) + Const.SEP + Const.FORWARD
@@ -51,10 +51,10 @@ class PytorchHookManager(BaseHookManager):
             hookset = HookSet(
                 forward_hook=self._build_forward_hook(hook_type, full_forward_name),
                 forward_pre_hook=self._build_forward_pre_hook(hook_type, full_forward_name, name),
-                backward_hook=self._build_backward_hook(hook_type, full_backward_name)
+                backward_hook=self._build_backward_hook(hook_type, full_backward_name)
             )
             return hookset
-
+
     def _need_exchange(self, module):
         return True
 
@@ -62,7 +62,7 @@ class PytorchHookManager(BaseHookManager):
         params_dict = {}
         if self.config.task != Const.STRUCTURE:
             params_dict = {
-
-
-
+                key.split(Const.SEP)[-1]: value
+                for key, value in module.named_parameters(recurse=False)
+            }
         return params_dict
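`_get_params_dict` collects only the module's own parameters (`recurse=False`) and keeps just the last component of each key. Assuming `Const.SEP` is the usual `"."` separator, the comprehension behaves like this:

```python
import torch.nn as nn

module = nn.Linear(4, 2)
params_dict = {
    key.split(".")[-1]: value  # assumes Const.SEP == "."
    for key, value in module.named_parameters(recurse=False)
}
print(sorted(params_dict))  # ['bias', 'weight']
```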
msprobe/pytorch/monitor/csv2tb.py:

@@ -23,17 +23,17 @@ from tqdm import tqdm
 
 from msprobe.core.common.const import MonitorConst
 from msprobe.core.common.file_utils import read_csv, create_directory, remove_path, recursive_chmod
-from msprobe.core.common.utils import
+from msprobe.core.common.utils import check_process_num
 from msprobe.core.common.decorator import recursion_depth_decorator
+from msprobe.core.monitor.utils import get_target_output_dir
 from msprobe.pytorch.common.log import logger
-
+
 
 all_data_type_list = [
     "actv", "actv_grad", "exp_avg", "exp_avg_sq",
     "grad_unreduced", "grad_reduced", "param_origin", "param_updated"
 ]
 CSV_FILE_SUFFIX = r"_\d+-\d+\.csv"
-MAX_PROCESS_NUM = 128
 
 
 def parse_step_line(line, ops):
@@ -119,13 +119,6 @@ def csv2tb_by_step_work(target_output_dirs, output_dirpath, data_type_list):
             write_step(output_dirpath, all_step_result, rank, data_type)
 
 
-def check_process_num(process_num):
-    if not is_int(process_num) or process_num <= 0:
-        raise ValueError(f"process_num({process_num}) is not a positive integer")
-    if process_num > MAX_PROCESS_NUM:
-        raise ValueError(f"The maximum supported process_num is {MAX_PROCESS_NUM}, current value: {process_num}.")
-
-
 def check_data_type_list(data_type_list):
     if data_type_list is None:
         logger.info(f"data_type_list is None, use default all_data_type_list: {all_data_type_list}")
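`check_process_num` is not gone: the import hunk above shows it now comes from `msprobe.core.common.utils` (part of that module's +117 lines in the summary). Assuming it moved over unchanged, callers keep the same validation behavior:

```python
from msprobe.core.common.utils import check_process_num

check_process_num(8)  # a positive integer within the limit passes silently
try:
    check_process_num(0)
except ValueError as err:
    print(err)  # e.g. "process_num(0) is not a positive integer"
```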
msprobe/pytorch/monitor/features.py:

@@ -45,13 +45,18 @@ def get_max(x: torch.tensor):
 
 @torch.no_grad()
 def get_zeros(x: torch.tensor, eps: float):
+    if x.numel() == 0:
+        return torch.tensor(float('nan'))
     return torch.sum(torch.abs(x) < eps) / x.numel()
 
 
 @torch.no_grad()
 def get_sign_matches(x: torch.tensor, y: torch.tensor):
+    if y.numel() == 0:
+        return torch.tensor(1.)
     xs = x.sign()
     ys = y.sign()
+
     try:
         same_direction_ratio = ((xs * ys).sum() / ys.numel() + 1) / 2
     except RuntimeError as e:
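Both guards cover empty tensors, where `x.numel()` is zero and the old expressions reduced to a 0/0: `get_zeros` now reports NaN explicitly, and `get_sign_matches` treats an empty comparison as a full match. For example:

```python
import torch


@torch.no_grad()
def get_zeros(x: torch.Tensor, eps: float):
    if x.numel() == 0:
        return torch.tensor(float('nan'))
    return torch.sum(torch.abs(x) < eps) / x.numel()


print(get_zeros(torch.empty(0), 1e-8))  # tensor(nan): explicit, no 0/0
print(get_zeros(torch.zeros(4), 1e-8))  # tensor(1.): every element is a "zero"
print(get_zeros(torch.ones(4), 1e-8))   # tensor(0.)
```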
msprobe/pytorch/monitor/module_hook.py:

@@ -31,8 +31,11 @@ from msprobe.core.common.decorator import recursion_depth_decorator
 from msprobe.core.monitor.anomaly_processor import AnomalyScanner, AnomalyDataFactory, AnomalyDataWriter
 from msprobe.core.common.file_utils import write_df_to_csv
 from msprobe.core.common.utils import analyze_api_call_stack
+from msprobe.core.monitor.utils import validate_config, validate_ops, \
+    get_output_base_dir, get_target_output_dir, chmod_tensorboard_dir, validate_set_monitor
 from msprobe.pytorch.common.log import logger
-from msprobe.pytorch.common.utils import is_recomputation
+from msprobe.pytorch.common.utils import is_recomputation
+from msprobe.pytorch.monitor.utils import get_param_struct
 from msprobe.pytorch.monitor.data_writers import SummaryWriterWithAD, CSVWriterWithAD, BaseWriterWithAD, WriterInput
 from msprobe.pytorch.monitor.distributed.wrap_distributed import api_register, create_hooks, op_aggregate, \
     get_process_group
@@ -40,8 +43,6 @@ from msprobe.pytorch.monitor.features import get_sign_matches
 from msprobe.pytorch.monitor.module_metric import get_metrics, get_summary_writer_tag_name, \
     TensorMetrics, squash_param_name
 from msprobe.pytorch.monitor.optimizer_collect import OptimizerMonFactory
-from msprobe.pytorch.monitor.utils import get_param_struct, validate_config, validate_ops, \
-    get_output_base_dir, get_target_output_dir, chmod_tensorboard_dir, validate_set_monitor
 from msprobe.pytorch.monitor.visualizer import HeatmapVisualizer
 
 
@@ -592,7 +593,7 @@ class TrainerMon:
             context.param_adam_update = mv_result.update
             context.param_adam_ratio = mv_result.ratio
 
-        self.generate_wgrad_metrics(grad_dict)
+        _, _ = self.generate_wgrad_metrics(grad_dict)
         self.generate_mv_metrics(context)
         self.generate_param_metrics(context, MonitorConst.PRE_PARAM)
 
@@ -763,7 +764,7 @@ class TrainerMon:
         def clone_if_tensor(args):
             if isinstance(args, tuple):
                 return tuple([clone_if_tensor(arg) for arg in args])
-            elif isinstance(args, torch.Tensor)
+            elif isinstance(args, torch.Tensor):
                 return args.clone()
             else:
                 return args
@@ -1170,8 +1171,6 @@ class TrainerMon:
                 grad = param.main_grad
             else:
                 grad = param.grad
-            if is_float8_tensor(grad):
-                grad = grad.float()
             context_dict[key] = grad.clone()
 
         if param.micro_step == self.micro_batch_number:
msprobe/pytorch/monitor/module_metric.py:

@@ -16,7 +16,6 @@ import re
 
 import torch
 
-from msprobe.pytorch.common.utils import is_float8_tensor
 from msprobe.pytorch.monitor.features import get_max, get_min, get_zeros, get_nans, get_norm, get_mean
 from msprobe.pytorch.monitor.utils import get_nan_tensor
 
@@ -181,8 +180,6 @@ def get_metrics(ops, tag2tensor, eps, out_dict=None):
             # Non-tensor in/output filled with nan.
             out_dict[tag].update({metric_name: get_nan_tensor() for metric_name in ops})
             continue
-        if is_float8_tensor(tensor):
-            tensor = tensor.float()
         for metric_name in ops:
             fun_metric = config_metric_registry.get(metric_name)
             out_dict[tag][metric_name] = fun_metric.get_metric(tensor, eps)
msprobe/pytorch/monitor/optimizer_collect.py:

@@ -17,7 +17,7 @@ from abc import abstractmethod
 import torch
 
 from msprobe.pytorch.common.log import logger
-from msprobe.
+from msprobe.core.monitor.utils import MVResult
 from msprobe.core.common.const import MonitorConst
 
 
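`MVResult` moves with the other shared monitor helpers into `msprobe/core/monitor/utils.py` (the new +338-line module in the summary). Only two of its fields are visible in this diff (`mv_result.update` and `mv_result.ratio` in `module_hook.py`), so the shape below is purely illustrative, not the shipped definition:

```python
from dataclasses import dataclass, field
from typing import Any, Dict


@dataclass
class MVResult:
    """Illustrative only: `update` and `ratio` are the fields this diff uses;
    the exp_avg/exp_avg_sq fields are guesses from the optimizer context."""
    exp_avg: Dict[str, Any] = field(default_factory=dict)
    exp_avg_sq: Dict[str, Any] = field(default_factory=dict)
    update: Dict[str, Any] = field(default_factory=dict)
    ratio: Dict[str, Any] = field(default_factory=dict)
```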