mindstudio-probe 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/METADATA +1 -1
  2. mindstudio_probe-1.1.0.dist-info/RECORD +287 -0
  3. msprobe/README.md +46 -16
  4. msprobe/__init__.py +16 -1
  5. msprobe/config.json +0 -2
  6. msprobe/core/advisor/advisor.py +8 -8
  7. msprobe/core/advisor/advisor_const.py +6 -7
  8. msprobe/core/advisor/advisor_result.py +12 -12
  9. msprobe/core/common/const.py +64 -3
  10. msprobe/core/common/exceptions.py +2 -2
  11. msprobe/core/common/file_utils.py +54 -9
  12. msprobe/core/common/inplace_op_checker.py +38 -0
  13. msprobe/core/common/inplace_ops.yaml +251 -0
  14. msprobe/core/common/log.py +21 -11
  15. msprobe/core/common/utils.py +153 -167
  16. msprobe/core/common_config.py +18 -25
  17. msprobe/core/compare/acc_compare.py +209 -36
  18. msprobe/core/compare/check.py +102 -17
  19. msprobe/core/compare/compare_cli.py +21 -1
  20. msprobe/core/compare/highlight.py +41 -5
  21. msprobe/core/compare/multiprocessing_compute.py +33 -8
  22. msprobe/core/compare/npy_compare.py +21 -6
  23. msprobe/core/compare/utils.py +82 -48
  24. msprobe/core/data_dump/data_collector.py +31 -32
  25. msprobe/core/data_dump/data_processor/base.py +45 -22
  26. msprobe/core/data_dump/data_processor/factory.py +20 -3
  27. msprobe/core/data_dump/data_processor/mindspore_processor.py +11 -5
  28. msprobe/core/data_dump/data_processor/pytorch_processor.py +24 -7
  29. msprobe/core/data_dump/json_writer.py +63 -42
  30. msprobe/core/data_dump/scope.py +32 -16
  31. msprobe/core/grad_probe/constant.py +4 -0
  32. msprobe/core/grad_probe/grad_compare.py +2 -3
  33. msprobe/core/grad_probe/utils.py +16 -3
  34. msprobe/docs/01.installation.md +19 -9
  35. msprobe/docs/02.config_introduction.md +52 -80
  36. msprobe/docs/03.config_examples.md +3 -13
  37. msprobe/docs/04.acl_config_examples.md +11 -9
  38. msprobe/docs/05.data_dump_PyTorch.md +140 -12
  39. msprobe/docs/06.data_dump_MindSpore.md +47 -5
  40. msprobe/docs/07.accuracy_checker_PyTorch.md +57 -34
  41. msprobe/docs/08.accuracy_checker_online_PyTorch.md +51 -11
  42. msprobe/docs/09.accuracy_checker_MindSpore.md +8 -8
  43. msprobe/docs/10.accuracy_compare_PyTorch.md +181 -99
  44. msprobe/docs/11.accuracy_compare_MindSpore.md +162 -31
  45. msprobe/docs/13.overflow_check_MindSpore.md +1 -1
  46. msprobe/docs/15.free_benchmarking_PyTorch.md +59 -53
  47. msprobe/docs/16.free_benchmarking_MindSpore.md +140 -0
  48. msprobe/docs/17.grad_probe.md +14 -16
  49. msprobe/docs/18.online_dispatch.md +89 -0
  50. msprobe/docs/{FAQ_PyTorch.md → FAQ.md} +22 -10
  51. msprobe/docs/img/ms_dump.png +0 -0
  52. msprobe/docs/img/ms_layer.png +0 -0
  53. msprobe/docs/img/pt_dump.png +0 -0
  54. msprobe/mindspore/__init__.py +1 -0
  55. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +35 -11
  56. msprobe/mindspore/api_accuracy_checker/api_info.py +7 -0
  57. msprobe/mindspore/cell_processor.py +27 -3
  58. msprobe/mindspore/common/const.py +2 -0
  59. msprobe/mindspore/common/utils.py +18 -2
  60. msprobe/mindspore/compare/distributed_compare.py +9 -22
  61. msprobe/mindspore/compare/layer_mapping.py +146 -0
  62. msprobe/mindspore/compare/modify_mapping.py +107 -0
  63. msprobe/mindspore/compare/ms_compare.py +173 -35
  64. msprobe/mindspore/compare/ms_graph_compare.py +27 -11
  65. msprobe/mindspore/debugger/debugger_config.py +16 -13
  66. msprobe/mindspore/debugger/precision_debugger.py +37 -13
  67. msprobe/mindspore/dump/dump_tool_factory.py +16 -1
  68. msprobe/mindspore/dump/hook_cell/api_registry.py +11 -1
  69. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +206 -0
  70. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +82 -10
  71. msprobe/mindspore/dump/hook_cell/wrap_api.py +21 -13
  72. msprobe/mindspore/dump/jit_dump.py +41 -17
  73. msprobe/mindspore/dump/kernel_graph_dump.py +19 -3
  74. msprobe/mindspore/dump/kernel_kbyk_dump.py +19 -4
  75. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +19 -4
  76. msprobe/mindspore/free_benchmark/common/config.py +15 -0
  77. msprobe/mindspore/free_benchmark/common/handler_params.py +15 -0
  78. msprobe/mindspore/free_benchmark/common/utils.py +19 -5
  79. msprobe/mindspore/free_benchmark/decorator/dec_forward.py +16 -2
  80. msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +18 -3
  81. msprobe/mindspore/free_benchmark/handler/base_handler.py +18 -3
  82. msprobe/mindspore/free_benchmark/handler/check_handler.py +18 -3
  83. msprobe/mindspore/free_benchmark/handler/fix_handler.py +15 -0
  84. msprobe/mindspore/free_benchmark/handler/handler_factory.py +18 -3
  85. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +22 -7
  86. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +15 -0
  87. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +22 -7
  88. msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +44 -18
  89. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +18 -4
  90. msprobe/mindspore/free_benchmark/perturbation/no_change.py +16 -1
  91. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +20 -5
  92. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +15 -0
  93. msprobe/mindspore/grad_probe/global_context.py +18 -8
  94. msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +20 -4
  95. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +15 -0
  96. msprobe/mindspore/service.py +42 -123
  97. msprobe/pytorch/__init__.py +20 -1
  98. msprobe/pytorch/api_accuracy_checker/common/config.py +19 -2
  99. msprobe/pytorch/api_accuracy_checker/common/utils.py +53 -21
  100. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +19 -2
  101. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +47 -21
  102. msprobe/pytorch/api_accuracy_checker/compare/compare.py +51 -21
  103. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +23 -6
  104. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +28 -8
  105. msprobe/pytorch/api_accuracy_checker/config.yaml +1 -1
  106. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +67 -32
  107. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +26 -5
  108. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +19 -2
  109. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +51 -125
  110. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +146 -3
  111. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +21 -0
  112. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +78 -33
  113. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +27 -4
  114. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +110 -0
  115. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +36 -11
  116. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
  117. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +44 -0
  118. msprobe/pytorch/bench_functions/__init__.py +18 -3
  119. msprobe/pytorch/bench_functions/apply_adam_w.py +15 -0
  120. msprobe/pytorch/bench_functions/confusion_transpose.py +15 -0
  121. msprobe/pytorch/bench_functions/fast_gelu.py +15 -0
  122. msprobe/pytorch/bench_functions/layer_norm_eval.py +15 -0
  123. msprobe/pytorch/bench_functions/linear.py +15 -0
  124. msprobe/pytorch/bench_functions/matmul_backward.py +21 -6
  125. msprobe/pytorch/bench_functions/npu_fusion_attention.py +180 -151
  126. msprobe/pytorch/bench_functions/rms_norm.py +15 -0
  127. msprobe/pytorch/bench_functions/rotary_mul.py +28 -9
  128. msprobe/pytorch/bench_functions/scaled_mask_softmax.py +15 -0
  129. msprobe/pytorch/bench_functions/swiglu.py +20 -5
  130. msprobe/pytorch/common/__init__.py +15 -0
  131. msprobe/pytorch/common/log.py +18 -6
  132. msprobe/pytorch/common/parse_json.py +26 -11
  133. msprobe/pytorch/common/utils.py +40 -35
  134. msprobe/pytorch/compare/distributed_compare.py +11 -11
  135. msprobe/pytorch/compare/match.py +15 -0
  136. msprobe/pytorch/compare/pt_compare.py +38 -6
  137. msprobe/pytorch/debugger/debugger_config.py +52 -39
  138. msprobe/pytorch/debugger/precision_debugger.py +72 -24
  139. msprobe/pytorch/free_benchmark/__init__.py +20 -5
  140. msprobe/pytorch/free_benchmark/common/enums.py +28 -0
  141. msprobe/pytorch/free_benchmark/common/params.py +15 -0
  142. msprobe/pytorch/free_benchmark/common/utils.py +17 -1
  143. msprobe/pytorch/free_benchmark/compare/grad_saver.py +28 -7
  144. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +15 -0
  145. msprobe/pytorch/free_benchmark/main.py +19 -4
  146. msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +15 -0
  147. msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +19 -4
  148. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +15 -0
  149. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +15 -0
  150. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +26 -2
  151. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +15 -0
  152. msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +15 -0
  153. msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +15 -0
  154. msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +15 -0
  155. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +55 -16
  156. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +15 -0
  157. msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +15 -0
  158. msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +15 -0
  159. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +19 -4
  160. msprobe/pytorch/function_factory.py +17 -2
  161. msprobe/pytorch/functional/module_dump.py +84 -0
  162. msprobe/pytorch/grad_probe/grad_stat_csv.py +2 -2
  163. msprobe/pytorch/hook_module/__init__.py +16 -1
  164. msprobe/pytorch/hook_module/api_registry.py +13 -8
  165. msprobe/pytorch/hook_module/hook_module.py +17 -19
  166. msprobe/pytorch/hook_module/utils.py +4 -6
  167. msprobe/pytorch/hook_module/wrap_aten.py +12 -11
  168. msprobe/pytorch/hook_module/wrap_distributed.py +6 -7
  169. msprobe/pytorch/hook_module/wrap_functional.py +10 -11
  170. msprobe/pytorch/hook_module/wrap_npu_custom.py +9 -17
  171. msprobe/pytorch/hook_module/wrap_tensor.py +4 -6
  172. msprobe/pytorch/hook_module/wrap_torch.py +4 -6
  173. msprobe/pytorch/hook_module/wrap_vf.py +4 -6
  174. msprobe/pytorch/module_processer.py +17 -2
  175. msprobe/pytorch/online_dispatch/compare.py +11 -12
  176. msprobe/pytorch/online_dispatch/single_compare.py +7 -7
  177. msprobe/pytorch/online_dispatch/torch_ops_config.yaml +8 -0
  178. msprobe/pytorch/online_dispatch/utils.py +1 -4
  179. msprobe/pytorch/parse.py +15 -0
  180. msprobe/pytorch/parse_tool/cli.py +5 -6
  181. msprobe/pytorch/parse_tool/lib/compare.py +9 -10
  182. msprobe/pytorch/parse_tool/lib/parse_tool.py +3 -0
  183. msprobe/pytorch/parse_tool/lib/utils.py +28 -24
  184. msprobe/pytorch/parse_tool/lib/visualization.py +1 -1
  185. msprobe/pytorch/pt_config.py +167 -38
  186. msprobe/pytorch/service.py +97 -32
  187. mindstudio_probe-1.0.4.dist-info/RECORD +0 -276
  188. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +0 -10
  189. msprobe/pytorch/functional/data_processor.py +0 -0
  190. msprobe/pytorch/functional/dump_module.py +0 -39
  191. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/LICENSE +0 -0
  192. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/WHEEL +0 -0
  193. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/entry_points.txt +0 -0
  194. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/top_level.txt +0 -0
@@ -21,9 +21,9 @@
21
21
  ```json
22
22
  {
23
23
  "task": "grad_probe",
24
- "dump_path": "./dump_path",
25
- "rank": [],
26
- "step": [],
24
+ "dump_path": "./dump_path",
25
+ "rank": [],
26
+ "step": [],
27
27
  "grad_probe": {
28
28
  "grad_level": "L1",
29
29
  "param_list": [],
@@ -43,7 +43,7 @@
43
43
  | step | step列表,表示需要导出数据的step列表。列表为空就表示导出所有step的数据。默认为空。(MindSpore静态图模式下,当前暂不支持指定step功能) | List[int] | 否 |
44
44
  | grad_level | 输出级别。决定导出数据的详细程度,级别越大导出数据越详细。可取值:L0, L1, L2。默认L1。|str | 否 |
45
45
  | param_list | 权重名称列表,表示需要监控的权重。列表为空就表示监控所有权重。默认为空。 | List[str] | 否 |
46
- | bounds | 区间列表,用来划分区间以统计数值的分布。需要保证由数据小到大排列。可以使用默认值[-1, 0, 1]。 | List[float, int] | 否 |
46
+ | bounds | 区间列表,用来划分区间以统计数值的分布。需要保证由数据小到大排列,并且列表中的元素需要在int64取值范围内。可以使用默认值[-1, 0, 1]。 | List[float, int] | 否 |
47
47
 
48
48
  **不同级别的level的导出数据**
49
49
 
@@ -53,29 +53,29 @@
53
53
  | L0 | ("param_name", "MD5", "max", "min", "norm", "shape") | 否 |
54
54
  | L1 | ("param_name", "max", "min", "norm", "shape") | 是 |
55
55
  | L2 | ("param_name", *intervals, "=0", "max", "min", "norm", "shape") | 是 |
56
-
56
+
57
57
  intervals就是根据值分布bounds划分出的区间。
58
58
  MindSpore静态图模式下,L0级别中暂不支持"MD5"
59
-
59
+
60
60
  **方向数据解释**
61
-
61
+
62
62
  因为模型的参数往往非常大,所以存储真实数据是不可接受的,这里折衷一下,只存储梯度数据的正负号(一个布尔值),也就是方向。
63
-
63
+
64
64
  **bounds和值分布解释**
65
-
65
+
66
66
  + 值分布:梯度数据落在各个区间的元素个数占总元素个数的比例。
67
- + bounds:一个列表,用来划分出区间以统计值分布。例如传入bounds = [-10, 0, 10],此时有一个 grad_value: Tensor = [9.3 , 5.4, -1.0, -12.3],依据 bounds 划分出 (-inf, -10]、(-10, 0]、(0, 10]、(10, inf) 四个区间,然后统计grad_value里的数据落在每个区间内的个数,得到 1、1、2、0。如下图所示:
67
+ + bounds:一个列表,用来划分出区间以统计值分布。例如传入bounds = [-10, 0, 10],此时有一个 grad_value: Tensor = [9.3 , 5.4, -1.0, -12.3],依据 bounds 划分出 (-inf, -10]、(-10, 0]、(0, 10]、(10, inf) 四个区间,然后统计grad_value里的数据落在每个区间内的个数,得到 1、1、2、0。如下图所示:
68
68
  ![Alt text](./img/grad_probe_image-1.png)
69
69
 
70
70
  2. 插入代码。示例代码如下:
71
71
 
72
- - PyTorch框架:模型构造完成后,传入config.json的路径实例化一个GradientMonitor对象,然后调用gm.monitor并将`模型`作为参数传入。
72
+ - PyTorch框架:模型构造完成后,传入config.json的路径实例化一个PrecisionDebugger对象,然后调用debugger.monitor并将`模型`作为参数传入。
73
73
  ```python
74
74
  from msprobe.pytorch import PrecisionDebugger
75
75
  debugger = PrecisionDebugger("config_json_path")
76
76
  debugger.monitor(model)
77
77
  ```
78
- - MindSpore框架:优化器构造完成后,传入config.json的路径实例化一个GradientMonitor对象,然后调用gm.monitor并将`优化器`作为参数传入。
78
+ - MindSpore框架:优化器构造完成后,传入config.json的路径实例化一个PrecisionDebugger对象,然后调用debugger.monitor并将`优化器`作为参数传入。
79
79
  ```python
80
80
  from msprobe.mindspore import PrecisionDebugger
81
81
  debugger = PrecisionDebugger("config_json_path")
@@ -143,7 +143,7 @@ GradComparator.compare_distributed("配置文件里写的dump_path",
143
143
  "配置文件里写的dump_path",
144
144
  "比对结果输出目录")
145
145
  ```
146
-
146
+
147
147
 
148
148
  ### 比对结果
149
149
 
@@ -190,6 +190,7 @@ PrecisionDebugger.monitor(module)
190
190
  | ----- | -------------------- | -------- |
191
191
  | module |Pytorch框架下传入模型,必须是torch.nn.Module;MindSpore框架下传入优化器。 | 是 |
192
192
 
193
+ Pytorch场景,传入的模型不能被torch.jit.trace修饰。MindSpore动态图场景,传入的优化器不能被mindspore.jit修饰。
193
194
 
194
195
  **接口说明**
195
196
 
@@ -202,6 +203,3 @@ GradComparator.compare_distributed(dump_path1, dump_path2, output_path)
202
203
  | dump_path1 |需要比对的其中一个dump目录,也就是配置文件里写的dump_path。 | str | 是 |
203
204
  | dump_path2 |需要比对的其中一个dump目录,也就是配置文件里写的dump_path,与dump_path1可以互换。 | str | 是 |
204
205
  | output_path |输出结果目录,不存在会新建。 | str | 是 |
205
-
206
-
207
- # FAQ
@@ -0,0 +1,89 @@
1
+ # **PyTorch NPU在线精度比对工具使用指南**
2
+
3
+ PyTorch NPU在线精度比对是msprobe工具实现在PyTorch训练过程中直接完成精度比对并输出比对结果的功能。
4
+
5
+ 在线精度比对实现的是NPU与CPU之间的精度比对。
6
+
7
+ ## PyTorch NPU在线精度比对总体流程
8
+
9
+ 1. 准备NPU训练工程。
10
+
11
+ 2. 在NPU环境下安装msprobe工具。
12
+
13
+ 3. 在训练脚本内插入msprobe工具在线精度比对接口。
14
+
15
+ 4. 执行训练并获取在线精度比对NPU和CPU分别执行后的精度比对结果。
16
+
17
+ 5. 比对结果分析。
18
+
19
+ ## PyTorch NPU在线精度比对
20
+ ### 总体说明
21
+ - 本节主要介绍NPU精度比对所需要的函数以及示例。
22
+ - 在线精度比对工具通过截获PyTorch框架中部分Aten Ir及其输入输出,并将输入数据转到CPU执行,最后将NPU和CPU的执行结果进行精度比对得到比对结果。
23
+
24
+ ### 约束
25
+
26
+ - Pytorch 只支持2.0及其以上版本。
27
+ - 只支持Aten Ir级在线精度比对,所有Aten Ir可以通过dir(torch.ops.aten)查看,其中部分IR不支持在线比对:Aten Ir无对应CPU实现、NPU和CPU同AtenIR实现逻辑不一致,导致同输入不同输出。
28
+ - 正反向不支持同时在线精度比对,不支持跨step在线精度比对。
29
+
30
+
31
+ ### 场景示例
32
+ 1. 在NPU训练脚本中添加在线精度比对接口,示例如下:
33
+
34
+ ```python
35
+ from msprobe.pytorch.common import seed_all
36
+ from msprobe.pytorch.online_dispatch import PtdbgDispatch
37
+
38
+ # 在main函数开始前固定随机数
39
+ seed_all()
40
+
41
+
42
+ ...
43
+
44
+ # 在需要调试精度的正向或反向代码前设置
45
+ # 正向示例
46
+ with PtdbgDispatch(dump_mode="auto", dump_path="/home/dump"):
47
+ output = model_cpu(inputs)
48
+ # 反向示例
49
+ with PtdbgDispatch(dump_mode="auto", dump_path="/home/dump"):
50
+ loss.backward()
51
+ ```
52
+
53
+ 2. 执行训练。
54
+
55
+ 3. 找出精度不达标的Aten IR。
56
+
57
+ 执行过程中会打屏Failed,Failed在比对结果csv中的Accuracy Reached or Not列标记为No,并在Dump目录下存盘精度不达标Aten IR的输入输出。
58
+
59
+ ### 计算精度评价指标
60
+
61
+ 1. Cosine < 0.99 且 MaxAbsError > 0.001时,精度不达标;
62
+ 2. Cosine < 0.9,精度不达标;
63
+ 3. MaxAbsError > 1,精度不达标。
64
+
65
+ ### 在线精度比对参数设置说明
66
+
67
+ | 参数名称 | 说明 | 是否必选 |
68
+ | -------- |-------------------------------------------------------------------------------------------------| -------- |
69
+ | dump_mode| dump模式,可取值"all"、"list"、"auto"、"OFF",默认值为OFF(表示不Dump数据)。 | 否 |
70
+ | api_list | dump范围,dump_mode="list"时设置,需要Dump Aten Ir API名称,默认为None,Aten Ir API名称可以通过dir(torch.ops.aten)查看。 | 否 |
71
+ | dump_path| dump文件生成的路径。 | 是 |
72
+ | tag | 传入tag字符串,成为dump文件夹名一部分,默认为None。 | 否 |
73
+ | process_num | 多进程并发数,默认为0。 | 否 |
74
+ | debug | debug信息打印,默认为False。 | 否 |
75
+ ### dump数据存盘说明
76
+ dump数据存盘目录名格式:`atat_tag_rankid_{timestamp}`。
77
+
78
+ 子目录下包含1个比对结果csv文件、cpu和npudump数据目录,npu目录下包含Aten IR在NPU上的输入输出的dump数据,由于CPU的输入是直接使用NPU的输入执行,因此cpu目录下只包含执行输出的dump数据。
79
+
80
+ ```bash
81
+ atat_rank4_20230911170521
82
+ ├── compare_result_rank4_20230911170521.csv
83
+ ├── cpu
84
+ │ ├── native_batch_norm_backward_10_output.0.npy
85
+ │ ............
86
+ └── npu
87
+ ├── native_batch_norm_backward_10_input.0.npy
88
+ ............
89
+ ```
@@ -1,4 +1,16 @@
1
- # 1 精度预检工具
1
+
2
+
3
+ # 1 数据采集
4
+
5
+ 1. dump.json中API或Module统计信息里出现null或None值的原因是什么?
6
+
7
+ dump.json里出现null或None值的可能性较多,常见的场景有:
8
+
9
+ - 输入或者输出参数本身是一个None值。
10
+ - 输入参数或输出参数类型当前工具不支持,会有日志打印提醒。
11
+ - 输入或者输出tensor的dtype为bool时,Mean和Norm等字段为null。
12
+
13
+ # 2 精度预检(PyTorch)
2
14
 
3
15
  1. 预检工具在 dump 和 run_ut 的过程中,是否需要同时开启或关闭 jit 编译(jit_compile)?
4
16
 
@@ -52,20 +64,20 @@
52
64
  | `__matmul__` | 矩阵乘法 |
53
65
  | `__mod__` | % |
54
66
  | `__mul__` | * |
55
- | `__nonzero__` | 同`__bool__` |
67
+ | `__nonzero__` | 同 `__bool__` |
56
68
  | `__or__` | \| |
57
69
  | `__radd__` | +(反向) |
58
70
  | `__rmul__` | *(反向) |
59
71
  | `__rshift__` | >> |
60
72
  | `__sub__` | - |
61
- | `__truediv__` | 同`__div__` |
73
+ | `__truediv__` | 同 `__div__` |
62
74
  | `__xor__` | ^ |
63
75
 
64
- # 2 精度比对工具
76
+ # 3 精度比对(PyTorch)
65
77
 
66
- ## 2.1 工具使用
78
+ ## 3.1 工具使用
67
79
 
68
- ### 2.1.1 dump 指定融合算子
80
+ ### 3.1.1 dump 指定融合算子
69
81
 
70
82
  数据采集当前支持融合算子的输入输出,需要在 `mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml` 中添加,比如以下代码段调用的 softmax 融合算子。
71
83
 
@@ -83,7 +95,7 @@ def npu_forward_fused_softmax(self, input_, mask):
83
95
 
84
96
  (npu_scaled_masked_softmax 融合算子工具已支持 dump,本例仅供参考)。
85
97
 
86
- ## 2.2 常见问题
98
+ ## 3.2 常见问题
87
99
 
88
100
  1. 在同一个目录多次执行 dump 会冲突吗?
89
101
 
@@ -97,7 +109,7 @@ def npu_forward_fused_softmax(self, input_, mask):
97
109
 
98
110
  答:torch 版本和硬件差异属于正常情况。
99
111
 
100
- ## 2.3 异常情况
112
+ ## 3.3 异常情况
101
113
 
102
114
  1. HCCL 报错: error code: EI0006。
103
115
 
@@ -168,9 +180,9 @@ def npu_forward_fused_softmax(self, input_, mask):
168
180
 
169
181
  答:注释工具目录 `mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml` 文件中 `Tensor: ` 下的 `- __getitem__`,工具会跳过采集该 API。如果是需要采集关键位置 API 也可以考虑根据报错堆栈信息注释引发报错的类型检查。
170
182
 
171
- 11. 添加 msprobe 工具后 F.gelu 触发 ValueError 报错:`activation_func must be F.gelu`等。
183
+ 11. 添加 msprobe 工具后 F.gelu 触发 ValueError 报错:`activation_func must be F.gelu` 等。以及采集 Megatron 数据时报错:`ValueError(Only support fusion of gelu and swiglu)`。
172
184
 
173
- 答:注释工具目录 `mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml` 文件中 `functional: ` 下的 `-gelu`,工具会跳过采集该 API。如果需要采集关键位置 api 也可以考虑根据报错堆栈信息注释引发报错的类型检查。
185
+ 答:这一类问题是因为工具本身封装了 torch 算子,所以校验算子名时会报错。注释 `mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml` 文件中的 `-gelu` 或者 `-silu`,工具会跳过采集该 API。如果需要采集关键位置 API 也可以考虑根据报错堆栈信息注释引发报错的类型检查。
174
186
 
175
187
  12. 添加 msprobe 工具后触发与 AsStrided 算子相关、或者编译相关的报错,如:`Failed to compile Op [AsStrided]`。
176
188
 
Binary file
Binary file
Binary file
@@ -1 +1,2 @@
1
1
  from msprobe.mindspore.debugger.precision_debugger import PrecisionDebugger
2
+ from msprobe.mindspore.common.utils import seed_all
@@ -92,6 +92,23 @@ class ApiAccuracyChecker:
92
92
  output_list.append(tuple([api_name_str, forward_or_backward, basic_info_status, compare_result_dict]))
93
93
  return output_list
94
94
 
95
+ @staticmethod
96
+ def prepare_api_input_aggregation(api_info, forward_or_backward=Const.FORWARD):
97
+ '''
98
+ Args:
99
+ api_info: ApiInfo
100
+ forward_or_backward: str
101
+ Returns:
102
+ ApiInputAggregation
103
+ '''
104
+ forward_inputs = api_info.get_compute_element_list(Const.FORWARD, Const.INPUT)
105
+ kwargs = api_info.get_kwargs()
106
+ if forward_or_backward == Const.FORWARD:
107
+ gradient_inputs = None
108
+ else:
109
+ gradient_inputs = api_info.get_compute_element_list(Const.BACKWARD, Const.INPUT)
110
+ return ApiInputAggregation(forward_inputs, kwargs, gradient_inputs)
111
+
95
112
  def parse(self, api_info_path):
96
113
  with FileOpen(api_info_path, "r") as f:
97
114
  api_info_dict = json.load(f)
@@ -131,32 +148,39 @@ class ApiAccuracyChecker:
131
148
  def run_and_compare(self):
132
149
  for api_name_str, api_info in self.api_infos.items():
133
150
  if not api_info.check_forward_info():
134
- logger.warning(f"api: {api_name_str} is lack of forward infomation, skip forward and backward check")
151
+ logger.warning(f"api: {api_name_str} is lack of forward infomation, skip forward and backward check.")
152
+ continue
153
+ try:
154
+ forward_inputs_aggregation = self.prepare_api_input_aggregation(api_info, Const.FORWARD)
155
+ except Exception as e:
156
+ logger.warning(f"exception occurs when getting inputs for {api_name_str} forward api. "
157
+ f"skip forward and backward check. detailed exception information: {e}.")
135
158
  continue
136
- forward_inputs = api_info.get_compute_element_list(Const.FORWARD, Const.INPUT)
137
- kwargs = api_info.get_kwargs()
138
- forward_inputs_aggregation = ApiInputAggregation(forward_inputs, kwargs, None)
139
159
  forward_output_list = None
140
160
  try:
141
161
  forward_output_list = \
142
162
  self.run_and_compare_helper(api_info, api_name_str, forward_inputs_aggregation, Const.FORWARD)
143
163
  except Exception as e:
144
- logger.warning(f"exception occurs when running and comparing {api_name_str} forward api"
145
- f"detailed exception information: {e}")
164
+ logger.warning(f"exception occurs when running and comparing {api_name_str} forward api. "
165
+ f"detailed exception information: {e}.")
146
166
  self.record(forward_output_list)
147
167
 
148
168
  if not api_info.check_backward_info():
149
- logger.warning(f"api: {api_name_str} is lack of backward infomation, skip backward check")
169
+ logger.warning(f"api: {api_name_str} is lack of backward infomation, skip backward check.")
170
+ continue
171
+ try:
172
+ backward_inputs_aggregation = self.prepare_api_input_aggregation(api_info, Const.BACKWARD)
173
+ except Exception as e:
174
+ logger.warning(f"exception occurs when getting inputs for {api_name_str} backward api. "
175
+ f"skip backward check. detailed exception information: {e}.")
150
176
  continue
151
- gradient_inputs = api_info.get_compute_element_list(Const.BACKWARD, Const.INPUT)
152
- backward_inputs_aggregation = ApiInputAggregation(forward_inputs, kwargs, gradient_inputs)
153
177
  backward_output_list = None
154
178
  try:
155
179
  backward_output_list = \
156
180
  self.run_and_compare_helper(api_info, api_name_str, backward_inputs_aggregation, Const.BACKWARD)
157
181
  except Exception as e:
158
- logger.warning(f"exception occurs when running and comparing {api_name_str} backward api"
159
- f"detailed exception information: {e}")
182
+ logger.warning(f"exception occurs when running and comparing {api_name_str} backward api. "
183
+ f"detailed exception information: {e}.")
160
184
  self.record(backward_output_list)
161
185
 
162
186
  def record(self, output_list):
@@ -3,9 +3,16 @@ from msprobe.core.common.const import Const
3
3
  from msprobe.mindspore.api_accuracy_checker.utils import check_and_get_from_json_dict
4
4
  from msprobe.core.common.exceptions import ApiAccuracyCheckerException
5
5
  from msprobe.mindspore.common.log import logger
6
+ from msprobe.core.common.utils import is_invalid_pattern
6
7
 
7
8
  class ApiInfo:
8
9
  def __init__(self, api_name):
10
+ if not isinstance(api_name, str):
11
+ err_msg = "ApiInfo.__init__ failed: api_name is not a string"
12
+ logger.error_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.ParseJsonFailed))
13
+ if is_invalid_pattern(api_name):
14
+ err_msg = "ApiInfo.__init__ failed: api_name contain illegal character"
15
+ logger.error_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.ParseJsonFailed))
9
16
  self.api_name = api_name
10
17
  self.forward_info = None
11
18
  self.backward_info = None
@@ -1,17 +1,19 @@
1
1
  from msprobe.core.data_dump.scope import ModuleRangeScope
2
2
  from msprobe.core.common.const import Const
3
- from msprobe.mindspore.common.log import logger
4
3
 
5
4
 
6
5
  class CellProcessor:
7
6
  cell_count = {}
7
+ cell_stack = []
8
+ api_parent_node = ""
9
+ module_node = {}
8
10
 
9
11
  def __init__(self, scope):
10
12
  if isinstance(scope, ModuleRangeScope):
11
13
  self.scope = scope
12
14
  else:
13
15
  self.scope = None
14
-
16
+
15
17
  @staticmethod
16
18
  def set_cell_count(cell_name):
17
19
  if cell_name not in CellProcessor.cell_count:
@@ -20,14 +22,36 @@ class CellProcessor:
20
22
  CellProcessor.cell_count[cell_name] += 1
21
23
  return CellProcessor.cell_count[cell_name]
22
24
 
25
+ @classmethod
26
+ def reset_cell_stats(cls):
27
+ cls.cell_count = {}
28
+ cls.cell_stack = []
29
+ cls.api_parent_node = ""
30
+ cls.module_node = {}
31
+
23
32
  def node_hook(self, name_prefix, start_or_stop, **kwargs):
24
33
  def begin_hook(cell, input):
25
34
  index = self.set_cell_count(name_prefix)
26
35
  cell.mindstudio_reserved_name = full_name = name_prefix + Const.SEP + str(index)
36
+ if CellProcessor.cell_stack:
37
+ CellProcessor.module_node[full_name] = CellProcessor.cell_stack[-1]
38
+ else:
39
+ CellProcessor.module_node[full_name] = None
40
+
41
+ CellProcessor.cell_stack.append(full_name)
42
+ CellProcessor.api_parent_node = full_name
43
+
27
44
  if self.scope:
28
45
  self.scope.begin_module(full_name)
29
-
46
+
30
47
  def end_hook(cell, input, output):
48
+ if CellProcessor.cell_stack:
49
+ CellProcessor.cell_stack.pop()
50
+ if CellProcessor.cell_stack:
51
+ CellProcessor.api_parent_node = CellProcessor.cell_stack[-1]
52
+ else:
53
+ CellProcessor.api_parent_node = None
54
+
31
55
  if self.scope:
32
56
  self.scope.end_module(cell.mindstudio_reserved_name)
33
57
 
@@ -39,12 +39,14 @@ class Const:
39
39
  OPS_DATA_PREFIX = "Functional."
40
40
  MINT_DATA_PREFIX = "Mint."
41
41
  MINT_NN_FUNC_DATA_PREFIX = "MintFunctional."
42
+ DISTRIBUTED_DATA_PREFIX = "Distributed."
42
43
 
43
44
  SUPPORTED_API_LIST_FILE = "support_wrap_ops.yaml"
44
45
  SUPPORTED_TENSOR_LIST_KEY = "tensor"
45
46
  SUPPORTED_OPS_LIST_KEY = "ops"
46
47
  SUPPORTED_MINT_LIST_KEY = "mint.ops"
47
48
  SUPPORTED__MINT_NN_FUNC_LIST_KEY = "mint.nn.functional"
49
+ SUPPORTED_COMM_LIST_KEY = "communication.comm_func"
48
50
 
49
51
  DROPOUT_API_NAME_PREFIX = "dropout"
50
52
 
@@ -12,12 +12,17 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  # ============================================================================
15
+
15
16
  import os
17
+ import random
18
+
16
19
  import mindspore as ms
17
20
 
18
21
  from msprobe.core.common.exceptions import DistributedNotInitializedError
19
22
  from msprobe.core.common.file_utils import path_len_exceeds_limit, check_path_exists, save_npy
20
23
  from msprobe.core.common.log import logger
24
+ from msprobe.core.common.const import Const
25
+ from msprobe.core.common.utils import CompareException, check_seed_all
21
26
 
22
27
 
23
28
  def get_rank_if_initialized():
@@ -53,12 +58,15 @@ def list_lowest_level_directories(root_dir):
53
58
  check_path_exists(root_dir)
54
59
  lowest_level_dirs = []
55
60
 
56
- def recurse_dirs(current_dir):
61
+ def recurse_dirs(current_dir, depth=0):
62
+ if depth > Const.MAX_DEPTH:
63
+ logger.error(f'The directory {current_dir} has more than {Const.MAX_DEPTH} levels.')
64
+ raise CompareException(CompareException.RECURSION_LIMIT_ERROR)
57
65
  for entry in os.listdir(current_dir):
58
66
  full_path = os.path.join(current_dir, entry)
59
67
  if os.path.isdir(full_path):
60
68
  if any(os.path.isdir(os.path.join(full_path, subentry)) for subentry in os.listdir(full_path)):
61
- recurse_dirs(full_path)
69
+ recurse_dirs(full_path, depth=depth+1)
62
70
  else:
63
71
  lowest_level_dirs.append(full_path)
64
72
 
@@ -66,6 +74,14 @@ def list_lowest_level_directories(root_dir):
66
74
  return lowest_level_dirs
67
75
 
68
76
 
77
+ def seed_all(seed=1234, mode=False):
78
+ check_seed_all(seed, mode)
79
+ os.environ['PYTHONHASHSEED'] = str(seed)
80
+ ms.set_seed(seed)
81
+ random.seed(seed)
82
+ ms.set_context(deterministic="ON" if mode else "OFF")
83
+ os.environ['HCCL_DETERMINISTIC'] = str(mode)
84
+
69
85
 
70
86
  class MsprobeStep(ms.train.Callback):
71
87
 
@@ -1,19 +1,3 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- # Copyright (C) 2019-2024. Huawei Technologies Co., Ltd. All rights reserved.
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- """
17
1
  import os
18
2
  from msprobe.core.common.utils import CompareException, check_compare_param, \
19
3
  check_configuration_param, task_dumppath_get
@@ -24,6 +8,7 @@ from msprobe.mindspore.compare.ms_compare import MSComparator
24
8
  from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json
25
9
  from msprobe.mindspore.compare.ms_graph_compare import GraphMSComparator
26
10
 
11
+
27
12
  def ms_compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs):
28
13
  if kwargs.get('suffix'):
29
14
  logger.error("Argument 'suffix' is not supported for compare_distributed.")
@@ -54,15 +39,17 @@ def ms_compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs):
54
39
  }
55
40
  try:
56
41
  summary_compare, md5_compare = task_dumppath_get(dump_result_param)
57
- check_configuration_param(stack_mode, auto_analyze, fuzzy_match)
42
+ check_configuration_param(stack_mode, auto_analyze, fuzzy_match,
43
+ dump_result_param.get('is_print_compare_log', True))
58
44
  create_directory(output_path)
59
- check_compare_param(dump_result_param, output_path, summary_compare=summary_compare, md5_compare=md5_compare)
45
+ check_compare_param(dump_result_param, output_path,
46
+ summary_compare=summary_compare, md5_compare=md5_compare)
60
47
  except (CompareException, FileCheckException) as error:
61
48
  logger.error('Compare failed. Please check the arguments and do it again!')
62
49
  raise CompareException(error.code) from error
63
50
  ms_comparator = MSComparator()
64
- ms_comparator.compare_core(dump_result_param, output_path, suffix=f'_{nr}-{br}', summary_compare=summary_compare,
65
- md5_compare=md5_compare, **kwargs)
51
+ ms_comparator.compare_core(dump_result_param, output_path, suffix=f'_{nr}-{br}',
52
+ summary_compare=summary_compare, md5_compare=md5_compare, **kwargs)
66
53
 
67
54
 
68
55
  def ms_graph_compare(inputs, outputs):
@@ -71,5 +58,5 @@ def ms_graph_compare(inputs, outputs):
71
58
  except (CompareException, FileCheckException) as error:
72
59
  logger.error('Compare failed. Please check the arguments and do it again!')
73
60
  return
74
- msComparator = GraphMSComparator(inputs, outputs)
75
- msComparator.compare_core()
61
+ ms_comparator = GraphMSComparator(inputs, outputs)
62
+ ms_comparator.compare_core()