PyPI - mindstudio-probe - Versions diffs - 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl - Mend

mindstudio-probe 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/METADATA +1 -1
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/RECORD +85 -66
msprobe/README.md +2 -2
msprobe/core/common/const.py +34 -9
msprobe/core/common/inplace_ops.yaml +1 -0
msprobe/core/common/utils.py +14 -0
msprobe/core/compare/layer_mapping/data_scope_parser.py +1 -1
msprobe/core/compare/merge_result/merge_result.py +8 -7
msprobe/core/compare/merge_result/utils.py +81 -0
msprobe/core/compare/utils.py +10 -0
msprobe/core/data_dump/data_collector.py +58 -13
msprobe/core/data_dump/data_processor/base.py +92 -8
msprobe/core/data_dump/data_processor/factory.py +3 -0
msprobe/core/data_dump/data_processor/mindspore_processor.py +17 -4
msprobe/core/data_dump/data_processor/pytorch_processor.py +58 -7
msprobe/core/data_dump/json_writer.py +26 -8
msprobe/docs/01.installation.md +25 -0
msprobe/docs/02.config_introduction.md +14 -12
msprobe/docs/03.config_examples.md +24 -0
msprobe/docs/05.data_dump_PyTorch.md +34 -15
msprobe/docs/06.data_dump_MindSpore.md +45 -22
msprobe/docs/09.accuracy_checker_MindSpore.md +4 -2
msprobe/docs/19.monitor.md +257 -260
msprobe/docs/21.visualization_PyTorch.md +10 -0
msprobe/docs/22.visualization_MindSpore.md +11 -0
msprobe/docs/27.dump_json_instruction.md +24 -20
msprobe/docs/28.debugger_save_instruction.md +94 -0
msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
msprobe/docs/img/monitor/step_count_per_record.png +0 -0
msprobe/mindspore/__init__.py +1 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +26 -6
msprobe/mindspore/api_accuracy_checker/api_runner.py +54 -16
msprobe/mindspore/api_accuracy_checker/compute_element.py +47 -1
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +129 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
msprobe/mindspore/common/utils.py +20 -2
msprobe/mindspore/debugger/debugger_config.py +25 -2
msprobe/mindspore/debugger/precision_debugger.py +25 -6
msprobe/mindspore/dump/hook_cell/api_registry.py +2 -0
msprobe/mindspore/dump/jit_dump.py +7 -6
msprobe/mindspore/monitor/anomaly_detect.py +404 -0
msprobe/mindspore/monitor/distributed/__init__.py +0 -0
msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
msprobe/mindspore/monitor/features.py +63 -0
msprobe/mindspore/monitor/module_hook.py +821 -0
msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
msprobe/mindspore/monitor/utils.py +267 -0
msprobe/mindspore/ms_config.py +8 -2
msprobe/mindspore/service.py +95 -21
msprobe/pytorch/__init__.py +0 -1
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
msprobe/pytorch/bench_functions/apply_adam.py +215 -0
msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
msprobe/pytorch/bench_functions/mish.py +21 -0
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +44 -0
msprobe/pytorch/bench_functions/sort_v2.py +21 -0
msprobe/pytorch/common/utils.py +71 -0
msprobe/pytorch/debugger/debugger_config.py +19 -9
msprobe/pytorch/debugger/precision_debugger.py +14 -0
msprobe/pytorch/dump/module_dump/module_processer.py +10 -30
msprobe/pytorch/function_factory.py +7 -1
msprobe/pytorch/hook_module/support_wrap_ops.yaml +2 -1
msprobe/pytorch/hook_module/wrap_distributed.py +4 -0
msprobe/pytorch/monitor/anomaly_detect.py +14 -29
msprobe/pytorch/monitor/csv2tb.py +10 -12
msprobe/pytorch/monitor/module_hook.py +123 -104
msprobe/pytorch/monitor/module_metric.py +6 -6
msprobe/pytorch/monitor/optimizer_collect.py +45 -63
msprobe/pytorch/monitor/utils.py +8 -43
msprobe/pytorch/pt_config.py +19 -22
msprobe/pytorch/service.py +103 -24
msprobe/visualization/builder/graph_builder.py +31 -5
msprobe/visualization/builder/msprobe_adapter.py +7 -5
msprobe/visualization/graph/base_node.py +3 -2
msprobe/visualization/graph/distributed_analyzer.py +80 -3
msprobe/visualization/graph/node_op.py +4 -2
msprobe/visualization/graph_service.py +3 -4
msprobe/visualization/utils.py +10 -2
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/top_level.txt +0 -0

msprobe/docs/21.visualization_PyTorch.md CHANGED Viewed

@@ -302,6 +302,16 @@ msprobe -f pytorch graph -i ./compare.json -o ./output
 ├── compare_stepn_rankn_{timestamp}.vis
 ```
+#### 3.2.4 仅模型结构比对
+适用场景：**主要关注模型结构而非训练过程数据**。例如，在模型迁移过程中，确保迁移前后模型结构的一致性，或在排查精度差异时，判断是否由模型结构差异所引起。
+使用msprobe工具对模型数据进行采集时，**可选择仅采集模型结构（task配置为structure）**，此配置将避免采集模型训练过程的数据，从而显著减少采集所需的时间。
+dump配置请参考[dump配置示例](./03.config_examples.md#16-task-配置为-structure)
+得到dump数据后，若需比较特定两个rank之间的数据，请参考[3.2.2 双图比对](#322-双图比对)；若需进行多个rank或多个step的数据批量比对，请参考[3.2.3 批量构建或比对](#323-批量构建或比对)。
 ## 4.启动tensorboard
 ### 4.1 可直连的服务器

msprobe/docs/22.visualization_MindSpore.md CHANGED Viewed

@@ -303,6 +303,17 @@ msprobe -f mindspore graph -i ./compare.json -o ./output
 ├── compare_stepn_rankn_{timestamp}.vis
 ```
+#### 3.2.4 仅模型结构比对
+适用场景：**主要关注模型结构而非训练过程数据**。例如，在模型迁移过程中，确保迁移前后模型结构的一致性，或在排查精度差异时，判断是否由模型结构差异所引起。
+使用msprobe工具对模型数据进行采集时，**可选择仅采集模型结构（task配置为structure）**，此配置将避免采集模型训练过程的数据，从而显著减少采集所需的时间。
+dump配置请参考[dump配置示例](./03.config_examples.md#35-task-配置为-structure)
+得到dump数据后，若需比较特定两个rank之间的数据，请参考[3.2.2 双图比对](#322-双图比对)；若需进行多个rank或多个step的数据批量比对，请参考[3.2.3 批量构建或比对](#323-批量构建或比对)。
 ## 4.启动tensorboard
 ### 4.1 可直连的服务器

msprobe/docs/27.dump_json_instruction.md CHANGED Viewed

@@ -1,16 +1,18 @@
 # dump.json文件说明及示例
-## 1. dump.json文件介绍（Pytorch）
+## 1. dump.json文件示例（PyTorch）
 ### 1.1 L0级别
-L0级别的dump.json文件包括模块的前反向的输入输出，以及模块的参数和参数梯度。以Pytorch的Conv2d模块为例，网络中模块调用代码为:
-`output = torch.nn.Conv2d(64, 128, 5, padding=2, bias=True)(input)`
+L0级别的dump.json文件包括模块的前反向的输入输出，以及模块的参数和参数梯度。以PyTorch的Conv2d模块为例，网络中模块调用代码为:
+`output = self.conv2(input) # self.conv2 = torch.nn.Conv2d(64, 128, 5, padding=2, bias=True)`
-dump.json文件中包含以下字段：
+dump.json文件中包含以下数据名称：
-1. `Module.conv2.Conv2d.forward.0`为模块的前向数据，其中input_args为模块的输入数据（位置参数），input_kwargs为模块的输入数据（关键字参数），output为模块的输出数据，parameters为模块的参数数据，包括权重（weight）和偏置（bias）。
-2. `Module.conv2.Conv2d.parameters_grad`为模块的参数梯度数据，包括权重（weight）和偏置（bias）的梯度。
-3. `Module.conv2.Conv2d.backward.0`为模块的反向数据，其中input为模块反向的输入梯度（对应前向输出的梯度），output为模块的反向输出梯度（对应前向输入的梯度）。
+- `Module.conv2.Conv2d.forward.0`：模块的前向数据，其中input_args为模块的输入数据（位置参数），input_kwargs为模块的输入数据（关键字参数），output为模块的输出数据，parameters为模块的参数数据，包括权重（weight）和偏置（bias）。
+- `Module.conv2.Conv2d.parameters_grad`：模块的参数梯度数据，包括权重（weight）和偏置（bias）的梯度。
+- `Module.conv2.Conv2d.backward.0`：模块的反向数据，其中input为模块反向的输入梯度（对应前向输出的梯度），output为模块的反向输出梯度（对应前向输入的梯度）。
+**说明**：当dump时传入的model参数为List[torch.nn.Module]或Tuple[torch.nn.Module]时，模块级数据的命名中包含该模块在列表中的索引index，命名格式为`{Module}.{index}.*`，*表示以上三种模块级数据的命名格式，例如：`Module.0.conv1.Conv2d.forward.0`。
 ```json
 {
@@ -167,12 +169,12 @@ dump.json文件中包含以下字段：
 ```
 ### 1.2 L1级别
-L1级别的dump.json文件包括API的前反向的输入输出。以Pytorch的relu函数为例，网络中API调用代码为:
- `output = torch.nn.functional.relu(input)`
+L1级别的dump.json文件包括API的前反向的输入输出。以PyTorch的relu函数为例，网络中API调用代码为:
+`output = torch.nn.functional.relu(input)`
-dump.json文件中包含以下字段：
-1. `Functional.relu.0.forward`为API的前向数据，其中input_args为API的输入数据（位置参数），input_kwargs为API的输入数据（关键字参数），output为API的输出数据。
-2. `Functional.relu.0.backward`为API的反向数据，其中input为API的反向输入梯度（对应前向输出的梯度），output为API的反向输出梯度（对应前向输入的梯度）。
+dump.json文件中包含以下数据名称：
+- `Functional.relu.0.forward`：API的前向数据，其中input_args为API的输入数据（位置参数），input_kwargs为API的输入数据（关键字参数），output为API的输出数据。
+- `Functional.relu.0.backward`：API的反向数据，其中input为API的反向输入梯度（对应前向输出的梯度），output为API的反向输出梯度（对应前向输入的梯度）。
 ```json
 {
@@ -272,12 +274,14 @@ mix级别的dump.json文件同时包括L0和L1级别的dump数据，文件格式
 L0级别的dump.json文件包括模块的前反向的输入输出，以及模块的参数和参数梯度。
 以MindSpore的Conv2d模块为例，dump.json文件中使用的模块调用代码为:
-`output = mindspore.nn.Conv2d(64, 128, 5, pad_mode='same', has_bias=True)(input)`
+`output = self.conv2(input) # self.conv2 = mindspore.nn.Conv2d(64, 128, 5, pad_mode='same', has_bias=True)`
+dump.json文件中包含以下数据名称：
+- `Cell.conv2.Conv2d.forward.0`：模块的前向数据，其中input_args为模块的输入数据（位置参数），input_kwargs为模块的输入数据（关键字参数），output为模块的输出数据，parameters为模块的参数数据，包括权重（weight）和偏置（bias）。
+- `Cell.conv2.Conv2d.parameters_grad`：模块的参数梯度数据，包括权重（weight）和偏置（bias）的梯度。
+- `Cell.conv2.Conv2d.backward.0`：模块的反向数据，其中input为模块反向的输入梯度（对应前向输出的梯度），output为模块的反向输出梯度（对应前向输入的梯度）。
-dump.json文件中包含以下字段：
-1. `Cell.conv2.Conv2d.forward.0`为模块的前向数据，其中input_args为模块的输入数据（位置参数），input_kwargs为模块的输入数据（关键字参数），output为模块的输出数据，parameters为模块的参数数据，包括权重（weight）和偏置（bias）。
-2. `Cell.conv2.Conv2d.parameters_grad`为模块的参数梯度数据，包括权重（weight）和偏置（bias）的梯度。
-3. `Cell.conv2.Conv2d.backward.0`为模块的反向数据，其中input为模块反向的输入梯度（对应前向输出的梯度），output为模块的反向输出梯度（对应前向输入的梯度）。
+**说明**：当dump时传入的model参数为List[mindspore.nn.Cell]或Tuple[mindspore.nn.Cell]时，模块级数据的命名中包含该模块在列表中的索引index，命名格式为`{Cell}.{index}.*`，*表示以上三种模块级数据的命名格式，例如：`Cell.0.conv2.Conv2d.forward.0`。
 ```json
 {
@@ -429,9 +433,9 @@ dump.json文件中包含以下字段：
 L1级别的dump.json文件包括API的前反向的输入输出，以MindSpore的relu函数为例，网络中API调用代码为:
  `output = mindspore.ops.relu(input)`
- dump.json文件中包含以下字段：
-1. `Functional.relu.0.forward`为API的前向数据，其中input_args为API的输入数据（位置参数），input_kwargs为API的输入数据（关键字参数），output为API的输出数据。
-2. `Functional.relu.0.backward`为API的反向数据，其中input为API的反向输入梯度（对应前向输出的梯度），output为API的反向输出梯度（对应前向输入的梯度）。
+ dump.json文件中包含以下数据名称：
+- `Functional.relu.0.forward`：API的前向数据，其中input_args为API的输入数据（位置参数），input_kwargs为API的输入数据（关键字参数），output为API的输出数据。
+- `Functional.relu.0.backward`：API的反向数据，其中input为API的反向输入梯度（对应前向输出的梯度），output为API的反向输出梯度（对应前向输入的梯度）。
 ```json
 {

msprobe/docs/28.debugger_save_instruction.md ADDED Viewed

@@ -0,0 +1,94 @@
+# 单点保存工具 README
+## 简介
+L0, L1, mix dump存在盲区，网络中的非api/module的输入输出不会被批量dump下来。单点保存提供类似np.save和print的功能和使用体验，可以保存指定的变量。同时针对大模型场景进行了增强，具备以下特性：
+- 可保存变量的反向梯度结果。
+- 能直接保存嵌套结构数据（如 list、dict），无需手动遍历。
+- 自动分 rank 保存。
+- 多次调用时会自动计数。
+- 可配置保存统计值或者张量。
+## 支持场景
+仅支持 PyTorch 与 MindSpore 的动态图场景。
+## 使能方式
+### 配置文件说明
+通用配置：
+| 参数     | 解释                                       | 是否必选 |
+| -------- |-------------------------------------------| -------- |
+| task     | dump 的任务类型，str 类型。 单点保存场景仅支持传入"statistics", "tensor"。    |  是     |
+| level    | dump 级别，str 类型，根据不同级别采集不同数据。单点保存场景传入"debug"。  | 是  |
+| dump_path  | 设置 dump 数据目录路径，str 类型。细节详见[通用配置说明](./02.config_introduction.md#11-通用配置)  | 是       |
+| rank        | 指定对某张卡上的数据进行采集，list[Union[int, str]] 类型。细节详见[通用配置说明](./02.config_introduction.md#11-通用配置)  | 否       |
+"statistics" 任务子配置项：
+| 参数     | 解释                                       | 是否必选 |
+| -------- |-------------------------------------------| -------- |
+| summary_mode     | 控制 dump 文件输出的模式，str 类型。支持传入"statistics", "md5"。 细节详见[statistics任务子配置项说明](./02.config_introduction.md#12-task-配置为-statistics)    |  否     |
+"tensor" 任务无子配置项。
+### 接口调用说明
+调用PrecisionDebugger.save，传入需要保存的变量，指定变量名称以及是否需要保存反向数据。接口入参说明详见[pytorch单点保存接口](./05.data_dump_PyTorch.md#19-save)，[mindspore单点保存接口](./06.data_dump_MindSpore.md#615-save)
+### 实例（以pytorch场景为例）
+配置文件
+```json
+{
+    "task": "statistics",
+    "dump_path": "./dump_path",
+    "rank": [],
+    "level": "debug",
+    "statistics": {
+        "summary_mode": "statistics"
+    }
+}
+```
+初始化
+```python
+# 训练启动py脚本
+from msprobe.pytorch import PrecisionDebugger
+debugger = PrecisionDebugger("./config.json")
+for data, label in data_loader:
+    # 执行模型训练
+    train(data, label)
+```
+初始化（无配置文件）
+```python
+# 训练启动py脚本
+from msprobe.pytorch import PrecisionDebugger
+debugger = PrecisionDebugger(dump_path="dump_path", level="debug")
+for data, label in data_loader:
+    # 执行模型训练
+    train(data, label)
+```
+调用保存接口
+```python
+# 训练过程中被调用py文件
+from msprobe.pytorch import PrecisionDebugger
+dict_variable = {"key1": "value1", "key2": [1, 2]}
+PrecisionDebugger.save(dict_variable, "dict_variable", save_backward=False)
+```
+## 输出结果
+  * **"task" 配置为 "statistics" 场景** ：在 dump 目录下会生成包含变量统计值信息的 `debug.json` 文件。
+  * **"task" 配置为 "tensor" 场景** ：除了在 dump 目录下生成包含变量统计值信息的 `debug.json` 文件外，还会在 dump 子目录 `dump_tensor_data` 中保存张量二进制文件，文件名称格式为 `{variable_name}{grad_flag}.{count}.tensor.{indexes}.{file_suffix}`。
+    - variable_name： 传入save接口的变量名称。
+    - grad_flag： 反向数据标识，反向数据为"_grad"，正向数据为""。
+    - count： 调用计数，多次以相同变量名称调用时的计数。
+    - indexes： 索引，在保存嵌套结构数据时的索引。例如：嵌套结构为`{"key1": "value1", "key2": ["value2", "value3"]}`，"value2"的索引为"key2.0"
+    - file_suffix：文件后缀，pytorch场景为"pt"，mindspore场景为"npy"

msprobe/docs/28.kernel_dump_MindSpore.md ADDED Viewed

@@ -0,0 +1,69 @@
+# MindSpore 场景的 kernel dump 说明
+当使用 msprobe 数据采集功能时，level 配置为 "L2" 表示采集 kernel 层级的算子数据，仅支持昇腾 NPU 平台。
+本文主要介绍 kernel dump 的配置示例和采集结果介绍， msprobe 数据采集功能的详细使用参考 《[MindSpore 场景的精度数据采集](./06.data_dump_MindSpore.md)》。
+## 1 kernel dump 配置示例
+使用 kernel dump 时，list 必须要填一个 API 名称，kernel dump 目前每个 step 只支持采集一个 API 的数据。
+API 名称填写参考 L1 dump 结果文件 dump.json 中的API名称，命名格式为：`{api_type}.{api_name}.{API调用次数}.{forward/backward}`。
+```json
+{
+    "task": "tensor",
+    "dump_path": "/home/data_dump",
+    "level": "L2",
+    "rank": [],
+    "step": [],
+    "tensor": {
+        "scope": [],
+        "list": ["Functional.linear.0.backward"]
+    }
+}
+```
+## 2 结果文件介绍
+### 2.1 采集结果说明
+如果 API kernel 级数据采集成功，会打印以下信息：
+```bash
+The kernel data of {api_name} is dumped successfully.
+```
+注意：如果打印该信息后，没有数据生成，参考**常见问题3.1**进行排查。
+如果 kernel dump 遇到不支持的 API， 会打印以下信息：
+```bash
+The kernel dump does not support the {api_name} API.
+```
+其中 {api_name} 是对应的不支持 kernel dump 的 API 名称。
+### 2.2 输出文件说明
+kernel dump 采集成功后，会在指定的 dump_path 目录下生成如下文件：
+```
+├── /home/data_dump/
+│   ├── step0
+│   │   ├── 20241201103000    # 日期时间格式，表示2024-12-01 10:30:00
+│   │   │   ├── 0             # 表示 device id
+│   │   │   │   ├──{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}    # kernel 层算子数据
+│   │   │  ...
+│   │   ├── kernel_config_{device_id}.json    # kernel dump 在接口调用过程中生成的中间文件，一般情况下无需关注
+│   │  ...
+│   ├── step1
+│  ...
+```
+成功采集到数据后，可以使用 msprobe 工具提供的《[PyTorch 场景的数据解析](./14.data_parse_PyTorch.md)》功能分析数据。
+## 3 常见问题
+### 3.1 采集结果文件为空，有可能是什么原因？
+1. 首先需要确认工具使用方式、配置文件内容、list 填写的 API 名称格式是否都正确无误。
+2. 其次需要确认 API 是否运行在昇腾 NPU 上，如果是运行在其他设备上则不会存在 kernel 级数据。

msprobe/docs/img/monitor/step_count_per_record.png ADDED Viewed

Binary file

msprobe/mindspore/__init__.py CHANGED Viewed

@@ -25,3 +25,4 @@ except ImportError:
 from msprobe.mindspore.debugger.precision_debugger import PrecisionDebugger
 from msprobe.mindspore.common.utils import seed_all
+from msprobe.mindspore.monitor.module_hook import TrainerMon

msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py CHANGED Viewed

@@ -26,6 +26,7 @@ from msprobe.mindspore.api_accuracy_checker.data_manager import DataManager
 from msprobe.mindspore.api_accuracy_checker.utils import (check_and_get_from_json_dict, global_context,
                                                           trim_output_compute_element_list)
 from msprobe.mindspore.common.log import logger
+from msprobe.mindspore.api_accuracy_checker import torch_mindtorch_importer
 cur_path = os.path.dirname(os.path.realpath(__file__))
 yaml_path = os.path.join(cur_path, MsCompareConst.SUPPORTED_API_LIST_FILE)
@@ -82,9 +83,11 @@ class ApiAccuracyChecker:
         # get output
         if global_context.get_is_constructed():
             # constructed situation, need use constructed input to run mindspore api getting tested_output
-            tested_outputs = api_runner(api_input_aggregation, api_name_str, forward_or_backward, Const.MS_FRAMEWORK)
+            tested_outputs = api_runner(api_input_aggregation, api_name_str,
+                                        forward_or_backward, global_context.get_framework())
         else:
             tested_outputs = api_info.get_compute_element_list(forward_or_backward, Const.OUTPUT)
         bench_outputs = api_runner(api_input_aggregation, api_name_str, forward_or_backward, Const.PT_FRAMEWORK)
         tested_outputs = trim_output_compute_element_list(tested_outputs, forward_or_backward)
         bench_outputs = trim_output_compute_element_list(bench_outputs, forward_or_backward)
@@ -153,13 +156,19 @@ class ApiAccuracyChecker:
         real_api_str = Const.SEP.join(api_name_str_list[1:-2])
         api_list = load_yaml(yaml_path)
         supported_tensor_api_list = api_list.get(MsCompareConst.SUPPORTED_TENSOR_LIST_KEY)
-        if api_type_str in (MsCompareConst.MINT, MsCompareConst.MINT_FUNCTIONAL):
+        if api_type_str in (MsCompareConst.MINT, MsCompareConst.MINT_FUNCTIONAL) \
+                and global_context.get_framework() == Const.MS_FRAMEWORK:
+            return True
+        if api_type_str in MsCompareConst.MT_VALID_API_TYPES \
+                and global_context.get_framework() == Const.MT_FRAMEWORK:
             return True
-        if api_type_str == MsCompareConst.TENSOR_API and real_api_str in supported_tensor_api_list:
+        if api_type_str == MsCompareConst.TENSOR_API and real_api_str in supported_tensor_api_list \
+                and global_context.get_framework() == Const.MS_FRAMEWORK:
             return True
         return False
     def parse(self, api_info_path):
         api_info_dict = load_json(api_info_path)
         # init global context
@@ -167,14 +176,25 @@ class ApiAccuracyChecker:
                                             "task field in api_info.json", accepted_type=str,
                                             accepted_value=(MsCompareConst.STATISTICS_TASK,
                                                             MsCompareConst.TENSOR_TASK))
+        try:
+            framework = check_and_get_from_json_dict(api_info_dict, MsCompareConst.FRAMEWORK,
+                                                "framework field in api_info.json", accepted_type=str,
+                                                accepted_value=(Const.MS_FRAMEWORK,
+                                                                Const.MT_FRAMEWORK))
+        except Exception as e:
+            framework = Const.MS_FRAMEWORK
+            logger.warning(f"JSON parsing error in framework field: {e}")
+        if framework == Const.MT_FRAMEWORK and not torch_mindtorch_importer.is_valid_pt_mt_env:
+            raise Exception(f"Please check if you have a valid PyTorch and MindTorch environment")
         is_constructed = task == MsCompareConst.STATISTICS_TASK
         if not is_constructed:
             dump_data_dir = check_and_get_from_json_dict(api_info_dict, MsCompareConst.DUMP_DATA_DIR_FIELD,
-                                                         "dump_data_dir field in api_info.json",
-                                                         accepted_type=str)
+                                                         "dump_data_dir field in api_info.json", accepted_type=str)
         else:
             dump_data_dir = ""
-        global_context.init(is_constructed, dump_data_dir)
+        global_context.init(is_constructed, dump_data_dir, framework)
         api_info_data = check_and_get_from_json_dict(api_info_dict, MsCompareConst.DATA_FIELD,
                                                      "data field in api_info.json", accepted_type=dict)

msprobe/mindspore/api_accuracy_checker/api_runner.py CHANGED Viewed

@@ -14,7 +14,6 @@
 # limitations under the License.
 import mindspore
-import torch
 from mindspore import ops
 from msprobe.core.common.const import Const, MsCompareConst
 from msprobe.core.common.exceptions import ApiAccuracyCheckerException
@@ -24,14 +23,28 @@ from msprobe.mindspore.api_accuracy_checker.utils import convert_to_tuple
 from msprobe.mindspore.common.log import logger
+from msprobe.mindspore.api_accuracy_checker import torch_mindtorch_importer
+from msprobe.mindspore.api_accuracy_checker.torch_mindtorch_importer import mindtorch
+from msprobe.mindspore.api_accuracy_checker.torch_mindtorch_importer import mindtorch_tensor
+from msprobe.mindspore.api_accuracy_checker.torch_mindtorch_importer import mindtorch_func
+from msprobe.mindspore.api_accuracy_checker.torch_mindtorch_importer import mindtorch_dist
+if torch_mindtorch_importer.is_valid_pt_mt_env:
+    from msprobe.mindspore.api_accuracy_checker.torch_mindtorch_importer import torch
+else:
+    import torch
 class ApiInputAggregation:
     def __init__(self, inputs, kwargs, gradient_inputs) -> None:
-        '''
+        """
         Args:
             inputs: List[ComputeElement]
             kwargs: dict{str: ComputeElement}
             gradient_inputs: Union[List[ComputeElement], None]
-        '''
+        """
         self.inputs = inputs
         self.kwargs = kwargs
         self.gradient_inputs = gradient_inputs
@@ -43,16 +56,34 @@ api_parent_module_mapping = {
     (MsCompareConst.MINT_FUNCTIONAL, Const.MS_FRAMEWORK): mindspore.mint.nn.functional,
     (MsCompareConst.MINT_FUNCTIONAL, Const.PT_FRAMEWORK): torch.nn.functional,
     (MsCompareConst.TENSOR_API, Const.MS_FRAMEWORK): mindspore.Tensor,
-    (MsCompareConst.TENSOR_API, Const.PT_FRAMEWORK): torch.Tensor
+    (MsCompareConst.TENSOR_API, Const.PT_FRAMEWORK): torch.Tensor,
+    (MsCompareConst.MINDTORCH_TENSOR, Const.MT_FRAMEWORK): mindtorch_tensor,
+    (MsCompareConst.MINDTORCH_TENSOR, Const.PT_FRAMEWORK): torch.Tensor,
+    (MsCompareConst.MINDTORCH, Const.MT_FRAMEWORK): mindtorch,
+    (MsCompareConst.MINDTORCH, Const.PT_FRAMEWORK): torch,
+    (MsCompareConst.MINDTORCH_FUNC, Const.MT_FRAMEWORK): mindtorch_func,
+    (MsCompareConst.MINDTORCH_FUNC, Const.PT_FRAMEWORK): torch.nn.functional,
+    (MsCompareConst.MINDTORCH_DIST, Const.MT_FRAMEWORK): mindtorch_dist,
+    (MsCompareConst.MINDTORCH_DIST, Const.PT_FRAMEWORK): torch.distributed
 }
 api_parent_module_str_mapping = {
     (MsCompareConst.MINT, Const.MS_FRAMEWORK): "mindspore.mint",
     (MsCompareConst.MINT, Const.PT_FRAMEWORK): "torch",
     (MsCompareConst.MINT_FUNCTIONAL, Const.MS_FRAMEWORK): "mindspore.mint.nn.functional",
     (MsCompareConst.MINT_FUNCTIONAL, Const.PT_FRAMEWORK): "torch.nn.functional",
     (MsCompareConst.TENSOR_API, Const.MS_FRAMEWORK): "mindspore.Tensor",
-    (MsCompareConst.TENSOR_API, Const.PT_FRAMEWORK): "torch.Tensor"
+    (MsCompareConst.TENSOR_API, Const.PT_FRAMEWORK): "torch.Tensor",
+    (MsCompareConst.MINDTORCH_TENSOR, Const.MT_FRAMEWORK): "mindtorch_tensor",
+    (MsCompareConst.MINDTORCH_TENSOR, Const.PT_FRAMEWORK): "torch.Tensor",
+    (MsCompareConst.MINDTORCH, Const.MT_FRAMEWORK): "mindtorch",
+    (MsCompareConst.MINDTORCH, Const.PT_FRAMEWORK): "torch",
+    (MsCompareConst.MINDTORCH_FUNC, Const.MT_FRAMEWORK): "mindtorch_func",
+    (MsCompareConst.MINDTORCH_FUNC, Const.PT_FRAMEWORK): "torch.nn.functional",
+    (MsCompareConst.MINDTORCH_DIST, Const.MT_FRAMEWORK): "mindtorch_dist",
+    (MsCompareConst.MINDTORCH_DIST, Const.PT_FRAMEWORK): "torch.distributed"
 }
@@ -64,7 +95,7 @@ class ApiRunner:
             api_input_aggregation: ApiInputAggregation
             api_name_str: str, e.g. "MintFunctional.relu.0"
             forward_or_backward: str, Union["forward", "backward"]
-            api_platform: str, Union["mindspore", "torch"]
+            api_platform: str, Union["mindspore", "torch", "mindtorch"]
         Return:
             outputs: list[ComputeElement]
@@ -72,35 +103,41 @@ class ApiRunner:
         Description:
             run mindspore.mint/torch api
         '''
-        api_type_str, api_sub_name = self.get_info_from_name(api_name_str)
+        api_type_str, api_sub_name = self.get_info_from_name(api_name_str, api_platform)
         api_instance = self.get_api_instance(api_type_str, api_sub_name, api_platform)
         return self.run_api(api_instance, api_input_aggregation, forward_or_backward, api_platform)
     @staticmethod
-    def get_info_from_name(api_name_str):
-        '''
+    def get_info_from_name(api_name_str, api_platform=Const.MS_FRAMEWORK):
+        """
         Args:
             api_name_str: str, the trimmed key of data dict in api_info.json. e.g. "MintFunctional.relu.0"
+            api_platform: str, the platform for the API, which can be either "mindspore" or "mindtorch".
+                      It specifies which framework is being used. Default is "mindspore".
         Return:
-            api_type_str: str, Union["MintFunctional", "Mint", "Tensor"]
+            api_type_str: str, Union["MintFunctional", "Mint", "Tensor", "Torch", "Functional"]
             api_sub_name: str, e.g. "relu"
-        '''
+        """
         api_name_list = api_name_str.split(Const.SEP)
         if len(api_name_list) != 3:
             err_msg = f"ApiRunner.get_info_from_name failed: api_name_str: {api_name_str} is not in defined format"
             logger.error_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.WrongValue))
         api_type_str, api_sub_name = api_name_list[0], api_name_list[1]
-        if api_type_str not in [MsCompareConst.MINT, MsCompareConst.MINT_FUNCTIONAL, MsCompareConst.TENSOR_API]:
+        if api_type_str not in [MsCompareConst.MINT, MsCompareConst.MINT_FUNCTIONAL, MsCompareConst.TENSOR_API] \
+                and api_platform == Const.MS_FRAMEWORK:
             err_msg = f"ApiRunner.get_info_from_name failed: not mint, mint.nn.functional or Tensor api"
             logger.error_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.WrongValue))
+        if api_type_str not in MsCompareConst.MT_VALID_API_TYPES and api_platform == Const.MT_FRAMEWORK:
+            err_msg = f"ApiRunner.get_info_from_name failed: not torch, functional or Tensor api"
+            logger.error_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.WrongValue))
         return api_type_str, api_sub_name
     @staticmethod
     def get_api_instance(api_type_str, api_sub_name, api_platform):
-        '''
+        """
         Args:
             api_type_str: str, Union["MintFunctional", "Mint", "Tensor"]
             api_sub_name: str, e.g. "relu"
@@ -113,11 +150,12 @@ class ApiRunner:
             get mindspore.mint/torch api fucntion
             mindspore.mint.{api_sub_name} <--> torch.{api_sub_name}
             mindspore.mint.nn.functional.{api_sub_name} <--> torch.nn.functional.{api_sub_name}
-        '''
+        """
         api_parent_module = api_parent_module_mapping.get((api_type_str, api_platform))
         api_parent_module_str = api_parent_module_str_mapping.get((api_type_str, api_platform))
         full_api_name = api_parent_module_str + Const.SEP + api_sub_name
         if not hasattr(api_parent_module, api_sub_name):
             err_msg = f"ApiRunner.get_api_instance failed: {full_api_name} is not found"
             logger.error_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.ApiWrong))
@@ -147,7 +185,7 @@ class ApiRunner:
                 logger.error_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.WrongValue))
             gradient_inputs = tuple(compute_element.get_parameter(get_origin=False, tensor_platform=api_platform)
                                     for compute_element in gradient_inputs)
-            if api_platform == Const.MS_FRAMEWORK:
+            if api_platform == Const.MS_FRAMEWORK or api_platform == Const.MT_FRAMEWORK:
                 if len(gradient_inputs) == 1:
                     gradient_inputs = gradient_inputs[0]

msprobe/mindspore/api_accuracy_checker/compute_element.py CHANGED Viewed

@@ -25,6 +25,7 @@ from msprobe.core.common.file_utils import load_npy
 from msprobe.mindspore.api_accuracy_checker.type_mapping import (api_info_type_str_to_type,
                                                                  ms_dtype_to_dtype_str, torch_dtype_to_dtype_str,
                                                                  dtype_str_to_ms_dtype, dtype_str_to_np_dtype,
+                                                                 dtype_str_to_mindtorch_dtype,
                                                                  dtype_str_to_torch_dtype, type_to_api_info_type_str,
                                                                  DEFAULT_CONSTRUCT_NP_FLOAT_DTYPE, TUPLE_TYPE_STR,
                                                                  MINDSPORE_TENSOR_TYPE_STR, MINDSPORE_DTYPE_TYPE_STR,
@@ -33,6 +34,15 @@ from msprobe.mindspore.api_accuracy_checker.type_mapping import (api_info_type_s
 from msprobe.mindspore.api_accuracy_checker.utils import check_and_get_from_json_dict, global_context
 from msprobe.mindspore.common.log import logger
+import msprobe.mindspore.api_accuracy_checker.torch_mindtorch_importer as env_module
+if env_module.is_valid_pt_mt_env:
+    from msprobe.mindspore.api_accuracy_checker.torch_mindtorch_importer import mindtorch
+    from msprobe.mindspore.api_accuracy_checker.torch_mindtorch_importer import torch
+else:
+    import torch
 class MstensorMetaData:
     def __init__(self, dtype_str, npy_path, maximum, minimum, shape) -> None:
@@ -86,6 +96,37 @@ class ComputeElement:
         torch_tensor = torch.from_numpy(np_ndarray).to(torch_dtype)
         return torch_tensor
+    @staticmethod
+    def transfer_to_mindtorch_tensor(ms_tensor):
+        """
+        Args:
+            ms_tensor: mindspore.Tensor
+        Return:
+            mindtorch_tensor: mindtorch.Tensor
+        """
+        ms_dtype = ms_tensor.dtype
+        dtype_str = ms_dtype_to_dtype_str.get(ms_dtype)
+        if dtype_str not in dtype_str_to_mindtorch_dtype:
+            err_msg = f"ComputeElement.transfer_to_mindtorch_tensor failed: no matching mindtorch dtype for {dtype_str}"
+            logger.error_log_with_exp(err_msg,
+                                      ApiAccuracyCheckerException(ApiAccuracyCheckerException.UnsupportType))
+        else:
+            mindtorch_dtype = dtype_str_to_mindtorch_dtype.get(dtype_str)
+        if dtype_str in int_dtype_str_list:
+            middle_dtype = mindspore.int64
+        else:
+            middle_dtype = mindspore.float64
+        np_ndarray = ms_tensor.astype(middle_dtype).numpy()
+        mindtorch_tensor = mindtorch.from_numpy(np_ndarray).to(ms_dtype)
+        return mindtorch_tensor
     @staticmethod
     def transfer_to_mindspore_tensor(torch_tensor):
         '''
@@ -141,8 +182,11 @@ class ComputeElement:
         elif isinstance(self.parameter, DtypeMetaData):
             if tensor_platform == Const.MS_FRAMEWORK:
                 parameter_tmp = dtype_str_to_ms_dtype.get(self.parameter.dtype_str)
-            else:
+            elif tensor_platform == Const.PT_FRAMEWORK:
                 parameter_tmp = dtype_str_to_torch_dtype.get(self.parameter.dtype_str)
+            elif tensor_platform == Const.MT_FRAMEWORK:
+                parameter_tmp = dtype_str_to_mindtorch_dtype.get(self.parameter.dtype_str)
         elif isinstance(self.parameter, MstensorMetaData):
             mstensor_meta_data = self.parameter
             ms_dtype = dtype_str_to_ms_dtype.get(mstensor_meta_data.dtype_str)
@@ -161,6 +205,8 @@ class ComputeElement:
         # if necessary, do transfer
         if not get_origin and isinstance(parameter_tmp, mindspore.Tensor) and tensor_platform == Const.PT_FRAMEWORK:
             parameter = self.transfer_to_torch_tensor(parameter_tmp)
+        elif not get_origin and isinstance(parameter_tmp, mindspore.Tensor) and tensor_platform == Const.MT_FRAMEWORK:
+            parameter = self.transfer_to_mindtorch_tensor(parameter_tmp)
         elif not get_origin and isinstance(parameter_tmp, torch.Tensor) and tensor_platform == Const.MS_FRAMEWORK:
             parameter = self.transfer_to_mindspore_tensor(parameter_tmp)
         else:

mindstudio-probe 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

mindstudio-probe 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl