PyPI - mindstudio-probe - Versions diffs - 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

mindstudio-probe: 1.0.4 (py3-none-any.whl) → 1.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (194)

{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/METADATA +1 -1
mindstudio_probe-1.1.0.dist-info/RECORD +287 -0
msprobe/README.md +46 -16
msprobe/__init__.py +16 -1
msprobe/config.json +0 -2
msprobe/core/advisor/advisor.py +8 -8
msprobe/core/advisor/advisor_const.py +6 -7
msprobe/core/advisor/advisor_result.py +12 -12
msprobe/core/common/const.py +64 -3
msprobe/core/common/exceptions.py +2 -2
msprobe/core/common/file_utils.py +54 -9
msprobe/core/common/inplace_op_checker.py +38 -0
msprobe/core/common/inplace_ops.yaml +251 -0
msprobe/core/common/log.py +21 -11
msprobe/core/common/utils.py +153 -167
msprobe/core/common_config.py +18 -25
msprobe/core/compare/acc_compare.py +209 -36
msprobe/core/compare/check.py +102 -17
msprobe/core/compare/compare_cli.py +21 -1
msprobe/core/compare/highlight.py +41 -5
msprobe/core/compare/multiprocessing_compute.py +33 -8
msprobe/core/compare/npy_compare.py +21 -6
msprobe/core/compare/utils.py +82 -48
msprobe/core/data_dump/data_collector.py +31 -32
msprobe/core/data_dump/data_processor/base.py +45 -22
msprobe/core/data_dump/data_processor/factory.py +20 -3
msprobe/core/data_dump/data_processor/mindspore_processor.py +11 -5
msprobe/core/data_dump/data_processor/pytorch_processor.py +24 -7
msprobe/core/data_dump/json_writer.py +63 -42
msprobe/core/data_dump/scope.py +32 -16
msprobe/core/grad_probe/constant.py +4 -0
msprobe/core/grad_probe/grad_compare.py +2 -3
msprobe/core/grad_probe/utils.py +16 -3
msprobe/docs/01.installation.md +19 -9
msprobe/docs/02.config_introduction.md +52 -80
msprobe/docs/03.config_examples.md +3 -13
msprobe/docs/04.acl_config_examples.md +11 -9
msprobe/docs/05.data_dump_PyTorch.md +140 -12
msprobe/docs/06.data_dump_MindSpore.md +47 -5
msprobe/docs/07.accuracy_checker_PyTorch.md +57 -34
msprobe/docs/08.accuracy_checker_online_PyTorch.md +51 -11
msprobe/docs/09.accuracy_checker_MindSpore.md +8 -8
msprobe/docs/10.accuracy_compare_PyTorch.md +181 -99
msprobe/docs/11.accuracy_compare_MindSpore.md +162 -31
msprobe/docs/13.overflow_check_MindSpore.md +1 -1
msprobe/docs/15.free_benchmarking_PyTorch.md +59 -53
msprobe/docs/16.free_benchmarking_MindSpore.md +140 -0
msprobe/docs/17.grad_probe.md +14 -16
msprobe/docs/18.online_dispatch.md +89 -0
msprobe/docs/{FAQ_PyTorch.md → FAQ.md} +22 -10
msprobe/docs/img/ms_dump.png +0 -0
msprobe/docs/img/ms_layer.png +0 -0
msprobe/docs/img/pt_dump.png +0 -0
msprobe/mindspore/__init__.py +1 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +35 -11
msprobe/mindspore/api_accuracy_checker/api_info.py +7 -0
msprobe/mindspore/cell_processor.py +27 -3
msprobe/mindspore/common/const.py +2 -0
msprobe/mindspore/common/utils.py +18 -2
msprobe/mindspore/compare/distributed_compare.py +9 -22
msprobe/mindspore/compare/layer_mapping.py +146 -0
msprobe/mindspore/compare/modify_mapping.py +107 -0
msprobe/mindspore/compare/ms_compare.py +173 -35
msprobe/mindspore/compare/ms_graph_compare.py +27 -11
msprobe/mindspore/debugger/debugger_config.py +16 -13
msprobe/mindspore/debugger/precision_debugger.py +37 -13
msprobe/mindspore/dump/dump_tool_factory.py +16 -1
msprobe/mindspore/dump/hook_cell/api_registry.py +11 -1
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +206 -0
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +82 -10
msprobe/mindspore/dump/hook_cell/wrap_api.py +21 -13
msprobe/mindspore/dump/jit_dump.py +41 -17
msprobe/mindspore/dump/kernel_graph_dump.py +19 -3
msprobe/mindspore/dump/kernel_kbyk_dump.py +19 -4
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +19 -4
msprobe/mindspore/free_benchmark/common/config.py +15 -0
msprobe/mindspore/free_benchmark/common/handler_params.py +15 -0
msprobe/mindspore/free_benchmark/common/utils.py +19 -5
msprobe/mindspore/free_benchmark/decorator/dec_forward.py +16 -2
msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +18 -3
msprobe/mindspore/free_benchmark/handler/base_handler.py +18 -3
msprobe/mindspore/free_benchmark/handler/check_handler.py +18 -3
msprobe/mindspore/free_benchmark/handler/fix_handler.py +15 -0
msprobe/mindspore/free_benchmark/handler/handler_factory.py +18 -3
msprobe/mindspore/free_benchmark/perturbation/add_noise.py +22 -7
msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +15 -0
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +22 -7
msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +44 -18
msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +18 -4
msprobe/mindspore/free_benchmark/perturbation/no_change.py +16 -1
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +20 -5
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +15 -0
msprobe/mindspore/grad_probe/global_context.py +18 -8
msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +20 -4
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +15 -0
msprobe/mindspore/service.py +42 -123
msprobe/pytorch/__init__.py +20 -1
msprobe/pytorch/api_accuracy_checker/common/config.py +19 -2
msprobe/pytorch/api_accuracy_checker/common/utils.py +53 -21
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +19 -2
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +47 -21
msprobe/pytorch/api_accuracy_checker/compare/compare.py +51 -21
msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +23 -6
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +28 -8
msprobe/pytorch/api_accuracy_checker/config.yaml +1 -1
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +67 -32
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +26 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +19 -2
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +51 -125
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +146 -3
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +21 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +78 -33
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +27 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +110 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +36 -11
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +44 -0
msprobe/pytorch/bench_functions/__init__.py +18 -3
msprobe/pytorch/bench_functions/apply_adam_w.py +15 -0
msprobe/pytorch/bench_functions/confusion_transpose.py +15 -0
msprobe/pytorch/bench_functions/fast_gelu.py +15 -0
msprobe/pytorch/bench_functions/layer_norm_eval.py +15 -0
msprobe/pytorch/bench_functions/linear.py +15 -0
msprobe/pytorch/bench_functions/matmul_backward.py +21 -6
msprobe/pytorch/bench_functions/npu_fusion_attention.py +180 -151
msprobe/pytorch/bench_functions/rms_norm.py +15 -0
msprobe/pytorch/bench_functions/rotary_mul.py +28 -9
msprobe/pytorch/bench_functions/scaled_mask_softmax.py +15 -0
msprobe/pytorch/bench_functions/swiglu.py +20 -5
msprobe/pytorch/common/__init__.py +15 -0
msprobe/pytorch/common/log.py +18 -6
msprobe/pytorch/common/parse_json.py +26 -11
msprobe/pytorch/common/utils.py +40 -35
msprobe/pytorch/compare/distributed_compare.py +11 -11
msprobe/pytorch/compare/match.py +15 -0
msprobe/pytorch/compare/pt_compare.py +38 -6
msprobe/pytorch/debugger/debugger_config.py +52 -39
msprobe/pytorch/debugger/precision_debugger.py +72 -24
msprobe/pytorch/free_benchmark/__init__.py +20 -5
msprobe/pytorch/free_benchmark/common/enums.py +28 -0
msprobe/pytorch/free_benchmark/common/params.py +15 -0
msprobe/pytorch/free_benchmark/common/utils.py +17 -1
msprobe/pytorch/free_benchmark/compare/grad_saver.py +28 -7
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +15 -0
msprobe/pytorch/free_benchmark/main.py +19 -4
msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +19 -4
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +26 -2
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +55 -16
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +19 -4
msprobe/pytorch/function_factory.py +17 -2
msprobe/pytorch/functional/module_dump.py +84 -0
msprobe/pytorch/grad_probe/grad_stat_csv.py +2 -2
msprobe/pytorch/hook_module/__init__.py +16 -1
msprobe/pytorch/hook_module/api_registry.py +13 -8
msprobe/pytorch/hook_module/hook_module.py +17 -19
msprobe/pytorch/hook_module/utils.py +4 -6
msprobe/pytorch/hook_module/wrap_aten.py +12 -11
msprobe/pytorch/hook_module/wrap_distributed.py +6 -7
msprobe/pytorch/hook_module/wrap_functional.py +10 -11
msprobe/pytorch/hook_module/wrap_npu_custom.py +9 -17
msprobe/pytorch/hook_module/wrap_tensor.py +4 -6
msprobe/pytorch/hook_module/wrap_torch.py +4 -6
msprobe/pytorch/hook_module/wrap_vf.py +4 -6
msprobe/pytorch/module_processer.py +17 -2
msprobe/pytorch/online_dispatch/compare.py +11 -12
msprobe/pytorch/online_dispatch/single_compare.py +7 -7
msprobe/pytorch/online_dispatch/torch_ops_config.yaml +8 -0
msprobe/pytorch/online_dispatch/utils.py +1 -4
msprobe/pytorch/parse.py +15 -0
msprobe/pytorch/parse_tool/cli.py +5 -6
msprobe/pytorch/parse_tool/lib/compare.py +9 -10
msprobe/pytorch/parse_tool/lib/parse_tool.py +3 -0
msprobe/pytorch/parse_tool/lib/utils.py +28 -24
msprobe/pytorch/parse_tool/lib/visualization.py +1 -1
msprobe/pytorch/pt_config.py +167 -38
msprobe/pytorch/service.py +97 -32
mindstudio_probe-1.0.4.dist-info/RECORD +0 -276
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +0 -10
msprobe/pytorch/functional/data_processor.py +0 -0
msprobe/pytorch/functional/dump_module.py +0 -39
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/top_level.txt +0 -0

msprobe/mindspore/debugger/debugger_config.py CHANGED Viewed

@@ -1,9 +1,24 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 from msprobe.core.common.const import Const
+from msprobe.core.common.file_utils import create_directory
 from msprobe.mindspore.common.const import Const as MsConst
 from msprobe.mindspore.common.const import FreeBenchmarkConst
-from msprobe.core.common.file_utils import create_directory
 class DebuggerConfig:
@@ -51,16 +66,4 @@ class DebuggerConfig:
             self.file_format = "npy"
         if not self.check_mode:
             self.check_mode = "all"
-        self._check_rank()
-        self._check_step()
         return True
-    def _check_rank(self):
-        for rank_id in self.rank:
-            if not isinstance(rank_id, int) or rank_id < 0:
-                raise ValueError(f"rank {self.rank} must be a positive integer.")
-    def _check_step(self):
-        for s in self.step:
-            if not isinstance(s, int) or s < 0:
-                raise ValueError(f"step element {s} must be a positive integer.")

msprobe/mindspore/debugger/precision_debugger.py CHANGED Viewed

@@ -1,17 +1,31 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 import mindspore as ms
 from mindspore._c_expression import MSContext
-from msprobe.mindspore.service import Service
-from msprobe.mindspore.ms_config import parse_json_config
-from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
-from msprobe.mindspore.task_handler_factory import TaskHandlerFactory
-from msprobe.core.common.const import Const
+from msprobe.core.common.const import Const, MsgConst
 from msprobe.mindspore.common.const import Const as MsConst
-from msprobe.mindspore.runtime import Runtime
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
 from msprobe.mindspore.grad_probe.grad_monitor import GradientMonitor
+from msprobe.mindspore.ms_config import parse_json_config
+from msprobe.mindspore.runtime import Runtime
+from msprobe.mindspore.service import Service
+from msprobe.mindspore.task_handler_factory import TaskHandlerFactory
 class PrecisionDebugger:
@@ -65,11 +79,11 @@ class PrecisionDebugger:
     def start(cls, model=None):
         instance = cls._instance
         if not instance:
-            raise Exception("No instance of PrecisionDebugger found.")
+            raise Exception(MsgConst.NOT_CREATED_INSTANCE)
         if instance.task in PrecisionDebugger.task_not_need_service:
             return
-        instance.config.execution_mode = instance._get_execution_mode()
+        instance.config.execution_mode = cls._get_execution_mode()
         if cls._need_service():
             if not instance.service:
                 instance.service = Service(instance.config)
@@ -82,11 +96,21 @@ class PrecisionDebugger:
         instance.first_start = True
         Runtime.is_running = True
+    @classmethod
+    def forward_backward_dump_end(cls):
+        instance = cls._instance
+        if not instance:
+            raise Exception(MsgConst.NOT_CREATED_INSTANCE)
+        if instance.task in PrecisionDebugger.task_not_need_service:
+            return
+        if instance.service:
+            instance.service.forward_backward_dump_end()
     @classmethod
     def stop(cls):
         instance = cls._instance
         if not instance:
-            raise Exception("PrecisionDebugger instance is not created.")
+            raise Exception(MsgConst.NOT_CREATED_INSTANCE)
         if instance.task == Const.GRAD_PROBE:
             instance.gm.stop()
         if instance.task in PrecisionDebugger.task_not_need_service:
@@ -99,7 +123,7 @@ class PrecisionDebugger:
     def step(cls):
         instance = cls._instance
         if not instance:
-            raise Exception("PrecisionDebugger instance is not created.")
+            raise Exception(MsgConst.NOT_CREATED_INSTANCE)
         if instance.task in PrecisionDebugger.task_not_need_service:
             return
         if instance.service:
@@ -110,7 +134,7 @@ class PrecisionDebugger:
     def monitor(cls, opt):
         instance = cls._instance
         if not instance:
-            raise Exception("PrecisionDebugger instance is not created.")
+            raise Exception(MsgConst.NOT_CREATED_INSTANCE)
         if instance.task != Const.GRAD_PROBE:
             return
         instance.gm.monitor(opt)
@@ -119,7 +143,7 @@ class PrecisionDebugger:
     def _need_service(cls):
         instance = cls._instance
         if not instance:
-            raise Exception("No instance of PrecisionDebugger found.")
+            raise Exception(MsgConst.NOT_CREATED_INSTANCE)
         if instance.config.execution_mode != MsConst.PYNATIVE_MODE:
             return False
         else:

msprobe/mindspore/dump/dump_tool_factory.py CHANGED Viewed

@@ -1,7 +1,22 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from msprobe.mindspore.common.const import Const
 from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
-from msprobe.mindspore.dump.kernel_kbyk_dump import KernelKbykDump
 from msprobe.mindspore.dump.kernel_graph_dump import KernelGraphDump
+from msprobe.mindspore.dump.kernel_kbyk_dump import KernelKbykDump
 class DumpToolFactory:

msprobe/mindspore/dump/hook_cell/api_registry.py CHANGED Viewed

@@ -16,9 +16,10 @@
 from mindspore import Tensor, ops, mint
 from mindspore.mint.nn import functional
 from mindspore.common._stub_tensor import StubTensor
+from mindspore.communication import comm_func
 from msprobe.mindspore.dump.hook_cell.wrap_api import (HOOKTensor, HOOKStubTensor, HOOKFunctionalOP,
-                                                       HOOKMintOP, HOOKMintNNFunctionalOP,
+                                                       HOOKMintOP, HOOKMintNNFunctionalOP, HOOKDistributedOP,
                                                        get_wrap_api_list, setup_hooks)
 from msprobe.core.common.utils import Const
@@ -30,6 +31,7 @@ class ApiRegistry:
         self.functional_ori_attr = {}
         self.mint_ops_ori_attr = {}
         self.mint_func_ops_ori_attr = {}
+        self.distributed_ori_attr = {}
         self.norm_inner_ops_ori_attr = {}
         self.tensor_hook_attr = {}
@@ -37,6 +39,7 @@ class ApiRegistry:
         self.functional_hook_attr = {}
         self.mint_ops_hook_attr = {}
         self.mint_func_ops_hook_attr = {}
+        self.distibuted_hook_attr = {}
         self.norm_inner_ops_hook_attr = {}
         self.norm_inner_ops = ["norm", "square", "sqrt", "is_complex"]
@@ -74,6 +77,7 @@ class ApiRegistry:
         self.set_api_attr(ops, self.functional_hook_attr)
         self.set_api_attr(mint, self.mint_ops_hook_attr)
         self.set_api_attr(functional, self.mint_func_ops_hook_attr)
+        self.set_api_attr(comm_func, self.distibuted_hook_attr)
     def api_set_ori_func(self):
         self.set_api_attr(Tensor, self.tensor_ori_attr)
@@ -81,6 +85,7 @@ class ApiRegistry:
         self.set_api_attr(ops, self.functional_ori_attr)
         self.set_api_attr(mint, self.mint_ops_ori_attr)
         self.set_api_attr(functional, self.mint_func_ops_ori_attr)
+        self.set_api_attr(comm_func, self.distributed_ori_attr)
     def initialize_hook(self, hook):
         wrap_api_name = get_wrap_api_list()
@@ -89,6 +94,7 @@ class ApiRegistry:
         self.store_ori_attr(ops, wrap_api_name.ops_api_names, self.functional_ori_attr)
         self.store_ori_attr(mint, wrap_api_name.mint_api_names, self.mint_ops_ori_attr)
         self.store_ori_attr(functional, wrap_api_name.mint_nn_func_api_names, self.mint_func_ops_ori_attr)
+        self.store_ori_attr(comm_func, wrap_api_name.distributed_api_names, self.distributed_ori_attr)
         self.store_ori_attr(ops, self.norm_inner_ops, self.norm_inner_ops_ori_attr)
         setup_hooks(hook)
         for attr_name in dir(HOOKTensor):
@@ -113,6 +119,10 @@ class ApiRegistry:
             if attr_name.startswith(Const.ATTR_NAME_PREFIX):
                 api_name = attr_name[Const.ATTR_NAME_PREFIX_LEN:]
                 self.mint_func_ops_hook_attr[api_name] = getattr(HOOKMintNNFunctionalOP, attr_name)
+        for attr_name in dir(HOOKDistributedOP):
+            if attr_name.startswith(Const.ATTR_NAME_PREFIX):
+                api_name = attr_name[Const.ATTR_NAME_PREFIX_LEN:]
+                self.distibuted_hook_attr[api_name] = getattr(HOOKDistributedOP, attr_name)
 api_register = ApiRegistry()

msprobe/mindspore/dump/hook_cell/primitive_hooks.py ADDED Viewed

@@ -0,0 +1,206 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import os
+import mindspore as ms
+from mindspore.common.tensor import Tensor
+from mindspore import ops
+from msprobe.mindspore.common.log import logger
+from msprobe.core.common.utils import Const, DumpException
+from msprobe.core.data_dump.data_processor.base import ModuleBackwardInputsOutputs, ModuleForwardInputsOutputs, \
+    ModuleBackwardInputs, ModuleBackwardOutputs
+class PrimitiveHookService:
+    def __init__(self, service_instance):
+        self.primitive_counters = {}
+        self.service_instance = service_instance
+    def wrap_primitive(self, origin_func, primitive_name):
+        """
+        包装原始的 primitive 函数，添加输入和输出的 hook 以捕获前向和反向数据。
+        Args:
+            origin_func (callable): 原始 的 primitive 函数。
+            primitive_name (str): 原始的 primitive 名称。
+        Returns:
+            callable: 包装后的 primitive 函数。
+        """
+        def create_backward_hook(captured_grads, num_tensors, updated_primitive_name, hook_type):
+            """
+            创建反向 hook 函数，用于捕获梯度。
+            Args:
+                captured_grads (list): 用于保存捕获的梯度。
+                num_tensors (int): 张量数量。
+                updated_primitive_name (str): 更新后的 primitive 名称。
+                hook_type (str): hook 类型 (输入/输出)。
+            Returns:
+                callable: 反向 hook 函数。
+            """
+            def backward_hook(grad):
+                captured_grads.append(grad)
+                backward_primitive_name = f"{updated_primitive_name}{Const.SEP}{Const.BACKWARD}"
+                try:
+                    if len(captured_grads) == num_tensors and hook_type == Const.INPUT:
+                        self.service_instance.data_collector.update_api_or_module_name(backward_primitive_name)
+                        new_module_input_output = ModuleBackwardOutputs(grad_output=tuple(captured_grads))
+                        self.service_instance.data_collector.backward_output_data_collect(
+                            backward_primitive_name, self, os.getpid(), new_module_input_output
+                        )
+                        captured_grads.clear()
+                    elif len(captured_grads) == num_tensors and hook_type == Const.OUTPUT:
+                        self.service_instance.data_collector.update_api_or_module_name(backward_primitive_name)
+                        new_module_input_output = ModuleBackwardInputs(grad_input=tuple(captured_grads))
+                        self.service_instance.data_collector.backward_input_data_collect(
+                            backward_primitive_name, self, os.getpid(), new_module_input_output
+                        )
+                        captured_grads.clear()
+                except Exception as exception:
+                    logger.error(f"This is a primitive op {hook_type}_backward dump error: {exception}, "
+                                 f"updated_primitive_name: {updated_primitive_name}")
+                    raise DumpException(DumpException.BACKWARD_DATA_COLLECTION_ERROR) from exception
+            return backward_hook
+        def hook_primitive_inputs(args, captured_grads_input, updated_primitive_name):
+            """
+            针对前向输入添加 hook。
+            Args:
+                args (tuple): primitive 输入参数。
+                captured_grads_input (list): 捕获的输入梯度。
+                updated_primitive_name (str): 更新后的 primitive 名称。
+            Returns:
+                list: 添加了 hook 的输入。
+            """
+            hooked_inputs = []
+            num_tensors = sum(isinstance(arg, Tensor) for arg in args)
+            input_backward_hook = create_backward_hook(captured_grads_input, num_tensors, updated_primitive_name,
+                                                       Const.INPUT)
+            for arg in args:
+                if isinstance(arg, Tensor):
+                    arg_hooked = ops.HookBackward(input_backward_hook)(arg)
+                    hooked_inputs.append(arg_hooked)
+                else:
+                    hooked_inputs.append(arg)
+            return hooked_inputs
+        def hook_primitive_outputs(out, captured_grads_output, updated_primitive_name):
+            """
+            针对前向输出添加 hook。
+            Args:
+                out (Tensor/tuple): primitive 输出。
+                captured_grads_output (list): 捕获的输出梯度。
+                updated_primitive_name (str): 更新后的 primitive 名称。
+            Returns:
+                Tensor/tuple: 添加了 hook 的输出。
+            """
+            if isinstance(out, tuple):
+                num_output_tensors = sum(isinstance(tensor, Tensor) for tensor in out)
+            else:
+                num_output_tensors = 1
+            output_backward_hook = create_backward_hook(captured_grads_output, num_output_tensors,
+                                                        updated_primitive_name, Const.OUTPUT)
+            if isinstance(out, Tensor):
+                return ops.HookBackward(output_backward_hook)(out)
+            elif isinstance(out, tuple):
+                hooked_outputs = []
+                for tensor in out:
+                    if isinstance(tensor, Tensor):
+                        hooked_outputs.append(ops.HookBackward(output_backward_hook)(tensor))
+                    else:
+                        hooked_outputs.append(tensor)
+                return tuple(hooked_outputs)
+            return out
+        def wrapped_primitive_call(instance_self, *args, **kwargs):
+            """
+            包装后的 primitive 调用函数，添加输入和输出的 hook。
+            Args:
+                instance_self (object): primitive 的实例。
+                *args: primitive 输入参数。
+                **kwargs: primitive 关键字参数。
+            Returns:
+                Tensor/tuple: primitive 的返回值。
+            """
+            self.update_primitive_counters(primitive_name)
+            current_count = self.primitive_counters.get(primitive_name, 0)
+            updated_primitive_name = f"{Const.PRIMITIVE_PREFIX}{Const.SEP}{primitive_name}{Const.SEP}{current_count}"
+            if not self.service_instance.primitive_switch:
+                return origin_func(*args, **kwargs)
+            captured_grads_input, captured_grads_output = [], []
+            try:
+                hooked_inputs = hook_primitive_inputs(args, captured_grads_input, updated_primitive_name)
+            except Exception as exception:
+                logger.error(f"This is a primitive op dump error during input hooking: {exception}, "
+                             f"primitive_name: {primitive_name}")
+                raise DumpException(DumpException.INPUT_HOOK_ERROR) from exception
+            try:
+                out = origin_func(*hooked_inputs, **kwargs)
+            except Exception as exception:
+                logger.error(f"This is a primitive op dump error during function call: {exception}, "
+                             f"primitive_name: {primitive_name}")
+                raise DumpException(DumpException.FUNCTION_CALL_ERROR) from exception
+            forward_primitive_name = f"{updated_primitive_name}{Const.SEP}{Const.FORWARD}"
+            self.service_instance.data_collector.update_api_or_module_name(forward_primitive_name)
+            if self.service_instance.data_collector:
+                module_input_output = ModuleForwardInputsOutputs(args=hooked_inputs, kwargs=kwargs, output=out)
+                try:
+                    self.service_instance.data_collector.forward_data_collect(forward_primitive_name, instance_self,
+                                                             os.getpid(), module_input_output)
+                except Exception as exception:
+                    logger.error(f"This is a primitive op dump error during forward data collection: {exception}, "
+                                 f"primitive_name: {primitive_name}")
+                    raise DumpException(DumpException.FORWARD_DATA_COLLECTION_ERROR) from exception
+                if self.service_instance.data_collector.if_return_forward_new_output():
+                    out = self.service_instance.data_collector.get_forward_new_output()
+            try:
+                out = hook_primitive_outputs(out, captured_grads_output, updated_primitive_name)
+            except Exception as exception:
+                logger.error(f"This is a primitive op dump error during output hooking: {exception}, "
+                             f"primitive_name: {primitive_name}")
+                raise DumpException(DumpException.OUTPUT_HOOK_ERROR) from exception
+            return out
+        return wrapped_primitive_call
+    def update_primitive_counters(self, primitive_name):
+        if primitive_name not in self.primitive_counters:
+            self.primitive_counters[primitive_name] = 0
+        else:
+            self.primitive_counters[primitive_name] += 1

msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml CHANGED Viewed

@@ -185,6 +185,7 @@ ops:
   - float_power
   - fmod
   - frac
+  - flash_attention_score
   - gcd
   - hypot
   - igamma
@@ -876,16 +877,60 @@ mint.ops:
   - zeros
   - zeros_ex
   - zeros_like
-mint.nn:
-  - Dropout
-  - Embedding
-  - Fold
-  - LayerNorm
-  - Linear
-  - MaxPool2d
-  - Unfold
-  - Upsample
+  - inverse
+  - select
+  - item
+  - unsqueeze
+  - median
+  - floor
+  - histc
+  - special
+  - arctan2
+  - sign
+  - concat
+  - atanh
+  - greater_equal
+  - eye
+  - fix
+  - argmin
+  - asinh
+  - atan
+  - nan_to_num
+  - tan
+  - round
+  - cosh
+  - norm
+  - roll
+  - log1p
+  - reshape
+  - arccos
+  - outer
+  - arcsin
+  - rand_like
+  - acosh
+  - multinomial
+  - logical_xor
+  - acos
+  - linalg
+  - sinc
+  - arcsinh
+  - asin
+  - narrow
+  - arctanh
+  - trace
+  - erfc
+  - bernoulli
+  - expm1
+  - logaddexp
+  - sinh
+  - arccosh
+  - atan2
+  - rand
+  - arange
+  - trunc
+  - arctan
+  - swapaxes
+  - transpose
 mint.nn.functional:
   - absolute_import
@@ -920,3 +965,30 @@ mint.nn.functional:
   - softplus
   - tanh
   - unfold
+  - mse_loss
+  - adaptive_avg_pool1d
+  - binary_cross_entropy
+  - adaptive_avg_pool2d
+  - hardsigmoid
+  - selu
+  - softshrink
+  - prelu
+  - logsigmoid
+  - hardswish
+  - mish
+  - log_softmax
+  - hardshrink
+  - l1_loss
+  - elu
+communication.comm_func:
+  - all_reduce
+  - all_gather_into_tensor
+  - reduce
+  - reduce_scatter_tensor
+  - all_to_all_single_with_output_shape
+  - all_to_all_with_output_shape
+  - batch_isend_irecv
+  - broadcast
+  - gather_into_tensor
+  - scatter_tensor

msprobe/mindspore/dump/hook_cell/wrap_api.py CHANGED Viewed

@@ -1,8 +1,7 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
@@ -13,19 +12,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
 import os
-from mindspore import Tensor, ops, mint
-from mindspore.mint.nn import functional
+from mindspore import Tensor, mint, ops
 from mindspore.common._stub_tensor import StubTensor
+from mindspore.communication import comm_func
+from mindspore.mint.nn import functional
-from msprobe.mindspore.dump.hook_cell.hook_cell import HOOKCell
 from msprobe.core.common.const import Const
-from msprobe.mindspore.common.const import Const as MsConst
 from msprobe.core.common.file_utils import load_yaml
+from msprobe.mindspore.common.const import Const as MsConst
+from msprobe.mindspore.dump.hook_cell.hook_cell import HOOKCell
 cur_path = os.path.dirname(os.path.realpath(__file__))
 yaml_path = os.path.join(cur_path, MsConst.SUPPORTED_API_LIST_FILE)
@@ -51,6 +49,10 @@ class HOOKMintNNFunctionalOP(object):
     pass
+# Empty holder class that setup_hooks() passes to wrap_api_func_and_bind()
+# together with the mindspore.communication.comm_func API names.
+# NOTE(review): presumably the wrapped APIs are bound onto this class as
+# attributes, like the other HOOK*OP classes -- confirm in
+# wrap_api_func_and_bind.
+class HOOKDistributedOP(object):
+    pass
 class ApiTemplate(HOOKCell):
     def __init__(self, api_name, api_dict, prefix, hook):
         self.api_name = api_name
@@ -65,12 +67,14 @@ class ApiTemplate(HOOKCell):
 class WrapApiName:
+    # Plain container for the API-name collections of each hooked namespace.
+    # get_wrap_api_list() fills it with sets of names for Tensor, StubTensor,
+    # ops, mint, mint.nn.functional and comm_func (distributed_api_names).
-    def __init__(self, tensor_api_names, stub_tensor_api_names, ops_api_names, mint_api_names, mint_nn_func_api_names):
+    def __init__(self, tensor_api_names, stub_tensor_api_names, ops_api_names, mint_api_names, mint_nn_func_api_names,
+                 distributed_api_names):
         self.tensor_api_names = tensor_api_names
         self.stub_tensor_api_names = stub_tensor_api_names
         self.ops_api_names = ops_api_names
         self.mint_api_names = mint_api_names
         self.mint_nn_func_api_names = mint_nn_func_api_names
+        self.distributed_api_names = distributed_api_names
 def get_wrap_api_list():
@@ -79,11 +83,13 @@ def get_wrap_api_list():
     ops_api = api_list.get(MsConst.SUPPORTED_OPS_LIST_KEY)
     mint_api = api_list.get(MsConst.SUPPORTED_MINT_LIST_KEY)
     mint_nn_func_api = api_list.get(MsConst.SUPPORTED__MINT_NN_FUNC_LIST_KEY)
+    distributed_api = api_list.get(MsConst.SUPPORTED_COMM_LIST_KEY)
     wrap_api_name = WrapApiName(set(tensor_api) & set(dir(Tensor)),
                                 set(tensor_api) & set(dir(StubTensor)),
                                 set(ops_api) & set(dir(ops)),
                                 set(mint_api) & set(dir(mint)),
-                                set(mint_nn_func_api) & set(dir(functional)))
+                                set(mint_nn_func_api) & set(dir(functional)),
+                                set(distributed_api) & set(dir(comm_func)))
     return wrap_api_name
@@ -111,3 +117,5 @@ def setup_hooks(hook):
                            MsConst.MINT_DATA_PREFIX, hook, HOOKMintOP)
     wrap_api_func_and_bind(wrap_api_name.mint_nn_func_api_names, {f: getattr(functional, f) for f in dir(functional)},
                            MsConst.MINT_NN_FUNC_DATA_PREFIX, hook, HOOKMintNNFunctionalOP)
+    wrap_api_func_and_bind(wrap_api_name.distributed_api_names, {f: getattr(comm_func, f) for f in dir(comm_func)},
+                           MsConst.DISTRIBUTED_DATA_PREFIX, hook, HOOKDistributedOP)

mindstudio-probe 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

mindstudio-probe 1.0.4py3-none-any.whl → 1.1.0py3-none-any.whl