mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
- {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
- msprobe/README.md +6 -6
- msprobe/core/common/const.py +98 -41
- msprobe/core/common/db_manager.py +256 -0
- msprobe/core/common/file_utils.py +28 -5
- msprobe/core/common/log.py +7 -0
- msprobe/core/common/megatron_utils.py +59 -0
- msprobe/core/common/parallel_state.py +193 -0
- msprobe/core/common/utils.py +20 -13
- msprobe/core/common_config.py +5 -0
- msprobe/core/compare/acc_compare.py +140 -93
- msprobe/core/compare/check.py +13 -0
- msprobe/core/compare/compare_cli.py +64 -6
- msprobe/core/compare/config.py +10 -8
- msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
- msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
- msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
- msprobe/core/compare/find_first/__init__.py +0 -0
- msprobe/core/compare/find_first/analyzer.py +282 -0
- msprobe/core/compare/find_first/data_processor.py +35 -0
- msprobe/core/compare/find_first/graph.py +188 -0
- msprobe/core/compare/find_first/utils.py +189 -0
- msprobe/core/compare/highlight.py +74 -101
- msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
- msprobe/core/compare/merge_result/merge_result.py +2 -2
- msprobe/core/compare/multiprocessing_compute.py +45 -28
- msprobe/core/compare/npy_compare.py +7 -10
- msprobe/core/compare/utils.py +338 -130
- msprobe/core/config_check/checkers/dataset_checker.py +2 -1
- msprobe/core/config_check/checkers/env_args_checker.py +5 -5
- msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
- msprobe/core/config_check/checkers/pip_checker.py +4 -3
- msprobe/core/config_check/checkers/random_checker.py +3 -3
- msprobe/core/config_check/checkers/weights_checker.py +2 -1
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
- msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
- msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
- msprobe/core/config_check/utils/utils.py +10 -0
- msprobe/core/data_dump/api_registry.py +49 -30
- msprobe/core/data_dump/data_collector.py +71 -29
- msprobe/core/data_dump/data_processor/base.py +2 -0
- msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
- msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
- msprobe/core/data_dump/json_writer.py +81 -7
- msprobe/core/data_dump/scope.py +4 -6
- msprobe/core/hook_manager.py +129 -70
- msprobe/core/monitor/csv2db.py +361 -0
- msprobe/core/monitor/db_utils.py +278 -0
- msprobe/core/monitor/utils.py +35 -1
- msprobe/core/service.py +31 -39
- msprobe/core/single_save/single_comparator.py +16 -3
- msprobe/docs/01.installation.md +51 -19
- msprobe/docs/02.config_introduction.md +16 -20
- msprobe/docs/03.config_examples.md +26 -0
- msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
- msprobe/docs/05.data_dump_PyTorch.md +6 -2
- msprobe/docs/06.data_dump_MindSpore.md +44 -7
- msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
- msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
- msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
- msprobe/docs/14.data_parse_PyTorch.md +1 -1
- msprobe/docs/19.monitor.md +94 -7
- msprobe/docs/21.visualization_PyTorch.md +71 -101
- msprobe/docs/22.visualization_MindSpore.md +69 -119
- msprobe/docs/23.generate_operator_PyTorch.md +1 -1
- msprobe/docs/25.tool_function_introduction.md +0 -1
- msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
- msprobe/docs/28.debugger_save_instruction.md +184 -81
- msprobe/docs/29.data_dump_MSAdapter.md +6 -0
- msprobe/docs/31.config_check.md +4 -2
- msprobe/docs/36.calculation_result_change.md +75 -0
- msprobe/docs/FAQ.md +22 -1
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
- msprobe/mindspore/__init__.py +1 -1
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
- msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
- msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
- msprobe/mindspore/cell_processor.py +64 -25
- msprobe/mindspore/common/utils.py +51 -7
- msprobe/mindspore/compare/common_dir_compare.py +45 -37
- msprobe/mindspore/compare/ms_compare.py +10 -2
- msprobe/mindspore/compare/ms_graph_compare.py +47 -52
- msprobe/mindspore/debugger/debugger_config.py +18 -7
- msprobe/mindspore/debugger/precision_debugger.py +16 -12
- msprobe/mindspore/dump/cell_dump_process.py +130 -68
- msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
- msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
- msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
- msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
- msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
- msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
- msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
- msprobe/mindspore/exception_dump/__init__.py +0 -0
- msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
- msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
- msprobe/mindspore/mindspore_service.py +2 -2
- msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
- msprobe/mindspore/monitor/features.py +82 -0
- msprobe/mindspore/monitor/module_hook.py +168 -10
- msprobe/mindspore/monitor/utils.py +27 -1
- msprobe/mindspore/ms_config.py +12 -4
- msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
- msprobe/mindspore/task_handler_factory.py +3 -1
- msprobe/nan_analyze/graph.py +1 -1
- msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
- msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
- msprobe/pytorch/common/utils.py +1 -21
- msprobe/pytorch/compare/pt_compare.py +10 -2
- msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
- msprobe/pytorch/compare/utils.py +2 -1
- msprobe/pytorch/debugger/debugger_config.py +18 -23
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
- msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
- msprobe/pytorch/free_benchmark/main.py +7 -4
- msprobe/pytorch/hook_module/api_register.py +62 -24
- msprobe/pytorch/hook_module/hook_module.py +9 -29
- msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
- msprobe/pytorch/hook_module/script_wrapper.py +140 -0
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
- msprobe/pytorch/monitor/csv2tb.py +1 -1
- msprobe/pytorch/monitor/features.py +94 -0
- msprobe/pytorch/monitor/module_hook.py +221 -81
- msprobe/pytorch/monitor/module_metric.py +27 -1
- msprobe/pytorch/monitor/optimizer_collect.py +109 -4
- msprobe/pytorch/online_dispatch/dispatch.py +42 -24
- msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
- msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
- msprobe/pytorch/pt_config.py +2 -51
- msprobe/pytorch/pytorch_service.py +7 -14
- msprobe/visualization/builder/graph_builder.py +192 -63
- msprobe/visualization/builder/graph_merger.py +986 -0
- msprobe/visualization/builder/msprobe_adapter.py +17 -15
- msprobe/visualization/compare/graph_comparator.py +26 -16
- msprobe/visualization/db_utils.py +252 -0
- msprobe/visualization/graph/base_node.py +2 -22
- msprobe/visualization/graph/distributed_analyzer.py +12 -12
- msprobe/visualization/graph/graph.py +44 -16
- msprobe/visualization/graph_service.py +143 -59
- msprobe/visualization/utils.py +103 -4
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
- msprobe/pytorch/attl_manager.py +0 -65
- {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
- {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
- {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
- /msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt
ADDED

@@ -0,0 +1,80 @@
[DistributedDataParallel(
  (module): Float16Module(
    (module): VLMModel(
      (image_encoder): VisionModel(
        (encoder): Qwen2VLViT(
          (patch_embed): PatchEmbed(
            (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
          )
          (rotary_pos_emb): VisionRotaryEmbedding()
          (blocks): Qwen2VLVisionTransformerBlock(
            (layers): ModuleList(
              (0-15): 16 x TransformerLayer(
                (input_layernorm): RMSNorm()
                (self_attention): Qwen2vlVitSelfAttention(
                  (core_attention): DotProductAttention(
                    (scale_mask_softmax): FusedScaleMaskSoftmax()
                    (attention_dropout): Dropout(p=0.0, inplace=False)
                  )
                  (linear_proj): RowParallelLinear()
                  (linear_qkv): ColumnParallelLinear()
                  (q_layernorm): IdentityOp()
                  (k_layernorm): IdentityOp()
                )
                (pre_cross_attn_layernorm): IdentityOp()
                (cross_attention): IdentityOp()
                (cross_attn_bda): IdentityFuncOp()
                (pre_mlp_layernorm): RMSNorm()
                (mlp): MLP(
                  (linear_fc1): ColumnParallelLinear()
                  (linear_fc2): RowParallelLinear()
                )
              )
            )
          )
        )
        (projector): MultimodalProjector(
          (layernorm): RMSNorm()
          (encoder): MLP(
            (linear_fc1): ColumnParallelLinear()
            (linear_fc2): RowParallelLinear()
          )
        )
      )
      (text_decoder): MMGPTModel(
        (embedding): LanguageModelEmbedding(
          (word_embeddings): VocabParallelEmbedding()
          (embedding_dropout): Dropout(p=0.0, inplace=False)
        )
        (rotary_pos_emb): Qwen2VLRotaryEmbedding_llm()
        (decoder): TransformerBlock(
          (layers): ModuleList(
            (0-7): 8 x TransformerLayer(
              (input_layernorm): RMSNorm()
              (self_attention): Qwen2vlSelfAttention(
                (core_attention): DotProductAttention(
                  (scale_mask_softmax): FusedScaleMaskSoftmax()
                  (attention_dropout): Dropout(p=0.0, inplace=False)
                )
                (linear_proj): RowParallelLinear()
                (linear_qkv): ColumnParallelLinear()
                (q_layernorm): IdentityOp()
                (k_layernorm): IdentityOp()
              )
              (pre_cross_attn_layernorm): IdentityOp()
              (cross_attention): IdentityOp()
              (cross_attn_bda): IdentityFuncOp()
              (pre_mlp_layernorm): RMSNorm()
              (mlp): MLP(
                (linear_fc1): ColumnParallelLinear()
                (linear_fc2): RowParallelLinear()
              )
            )
          )
          (final_layernorm): RMSNorm()
        )
        (output_layer): ColumnParallelLinear()
      )
    )
  )
)]
msprobe/docs/visualization/mindspeed_llamafactory_mapping.md
ADDED

@@ -0,0 +1,330 @@
# MindSpeed & LLamaFactory Data Collection and Automated Comparison

## 0. Use Case

For the same model implemented on both the MindSpeed and LLamaFactory frameworks, precision differences can appear during training even when the model hyperparameters, environment variables, initial weights, and training data are identical. A **full-network comparison** is then needed to locate the point where precision diverges.

This document uses the Qwen2.5vl and Qwen2.5 models to show how to collect data from MindSpeed and LLamaFactory and compare the two automatically.

## 1. Data Collection

### 1.1 Prepare the data-collection configuration file

Before collecting data, prepare a json file (named config.json in this walkthrough) containing the collection settings.

The configuration used here is shown below. For more examples see the [config.json examples](../03.config_examples.md); for a detailed description of each field see the [configuration introduction](../02.config_introduction.md).

```json
{
    "task": "statistics",
    "dump_path": "/home/data_dump",
    "rank": [],
    "step": [0],
    "level": "mix",
    "async_dump": false,

    "statistics": {
        "scope": [],
        "list": [],
        "tensor_list": [],
        "data_mode": ["all"],
        "summary_mode": "statistics"
    }
}
```

Note that hierarchical model visualization comparison runs after data collection, so `level` must be set to `L0` (module data) or `mix` (module + API data).

### 1.2 Add the msprobe collection interfaces

The interface usage in this walkthrough is shown below; for more options and interface details see [PyTorch precision data collection](../05.data_dump_PyTorch.md).

#### 1.2.1 LLamaFactory data collection

LLamaFactory builds on Transformers, so the msprobe collection hooks are added inside Transformers.

Taking Transformers 4.49.0 as an example, run `pip3 show Transformers` to get the `Location` path, then open `Location/transformers/trainer.py`.

1. In trainer.py, add the tool interfaces that initialize the collection configuration and fix the random seeds:

   ![screenshot](./mindspeed_llamafactoary_img/llamafactory1.png)

2. In trainer.py, at the **training-loop logic**, add the tool interfaces that start and stop collection and advance the step counter:

   ![screenshot](./mindspeed_llamafactoary_img/llamafactory2.png)

3. That completes the setup; launch the training script and the data is collected automatically. For the on-disk data format see [dump result files](../05.data_dump_PyTorch.md#3-dump-结果文件介绍).
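
The exact lines to add are shown in the screenshots above. As a rough guide, the instrumentation follows the pattern in this minimal sketch; the `PrecisionDebugger`, `seed_all`, `start`, `stop`, and `step` interfaces are the documented msprobe ones, while the surrounding training loop is illustrative rather than the actual Trainer source:

```python
# Minimal sketch of the trainer.py instrumentation; the training loop below is
# a stand-in for the real Trainer code, not a copy of it.
from msprobe.pytorch import PrecisionDebugger, seed_all

seed_all(1234)                                             # fix random seeds
debugger = PrecisionDebugger(config_path="./config.json")  # config from section 1.1

def train_loop(model, dataloader, optimizer):
    for batch in dataloader:
        debugger.start(model=model)  # begin collection; model is needed for L0/mix module dump
        loss = model(**batch).loss   # illustrative forward/backward step
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        debugger.stop()              # end collection for this training step
        debugger.step()              # advance msprobe's step counter
```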

#### 1.2.2 MindSpeed data collection

Open training.py: the MindSpeed-MM path is `mindspeed_mm/training.py`, and the MindSpeed-LLM path is `mindspeed_llm/training/training.py`.

1. In training.py, add the tool interfaces that initialize the collection configuration and fix the random seeds:

   ![screenshot](./mindspeed_llamafactoary_img/mindspeed1.png)

2. In training.py, at the **training-loop logic**, add the tool interfaces that start and stop collection and advance the step counter, following the same pattern as the sketch in 1.2.1:

   ![screenshot](./mindspeed_llamafactoary_img/mindspeed2.png)

3. That completes the setup; launch the training script and the data is collected automatically. For the on-disk data format see [dump result files](../05.data_dump_PyTorch.md#3-dump-结果文件介绍).

## 2. Automated Comparison

### 2.1 Hierarchical model visualization comparison

This feature parses the precision data dumped by msprobe and reconstructs the model graph, enabling precision comparison at every level of the model; it helps users understand the model structure and analyze precision issues.

We will run the comparison with the following command line:

```
msprobe -f pytorch graph -i ./compare.json -o ./output -lm ./layer_mapping.yaml
```

For the parameter details see the [graph-build command-line reference](../21.visualization_PyTorch.md#31-构图命令行说明).
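
For reference, the `-i` argument takes a json file; the sketch below assumes the `npu_path`/`bench_path` input layout described in the visualization guide, with illustrative paths pointing at the step-level dump directories produced in section 1:

```json
{
    "npu_path": "/home/data_dump_mindspeed/step0",
    "bench_path": "/home/data_dump_llamafactory/step0",
    "is_print_compare_log": true
}
```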

When comparing models across the MindSpeed and LLamaFactory frameworks, **the -lm parameter is mandatory**; how to write the layer_mapping.yaml it requires is covered in the sections below.

Once the comparison has finished, you can serve the results on a port with tensorboard (requires the [tb_graph_ascend plugin](../21.visualization_PyTorch.md#1依赖安装)) and inspect the model structure and precision comparison results in a browser; see [launching tensorboard](../21.visualization_PyTorch.md#4启动tensorboard) and [viewing in the browser](../21.visualization_PyTorch.md#5浏览器查看).
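
For instance (a sketch: `./output` is the `-o` directory from the build command above, and the port is illustrative):

```
tensorboard --logdir ./output --port 6006
```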

### 2.2 layer_mapping configuration

msprobe's comparison matches data whose dump names are identical on both sides. Because the MindSpeed and LLamaFactory implementations of the model differ, some layer levels and layer names do not line up and cannot be matched; a layer-name mapping is required before they can be compared.

#### 2.2.1 layer_mapping templates

Templates for the Qwen2.5vl and Qwen2.5 models are provided here and can be used directly. **If you use another model, or have customized the MindSpeed or LLamaFactory framework source code, these templates may no longer apply; follow the steps below to adapt them.**

Each model has two templates: one with MindSpeed on the NPU side and LLamaFactory on the Bench side, and one with LLamaFactory on the NPU side and MindSpeed on the Bench side; their mapping contents differ.

File name format: \*.yaml, where * is a custom file name. This document uses layer_mapping.yaml.

**Qwen2.5vl**

```yaml
# NPU side: Mindspeed-MM; Bench side: LLamaFactory
TopLayer:
  0.module: module

Float16Module:
  module.image_encoder: visual
  module.text_decoder: model

VisionModel:
  encoder.patch_embed: patch_embed
  encoder.rotary_pos_emb: rotary_pos_emb
  encoder.blocks.layers: blocks
  projector: merger

TransformerLayer:
  input_layernorm: norm1
  self_attention: attn
  pre_mlp_layernorm: norm2

Qwen2vlVitSelfAttention:
  linear_qkv: qkv
  linear_proj: proj

MLP:
  linear_fc1: up_proj
  linear_fc2: down_proj

MultimodalProjector:
  layernorm: ln_q
  encoder: mlp
  encoder.linear_fc1: mlp.0
  encoder.linear_fc2: mlp.2

MMGPTModel:
  embedding.word_embeddings: embed_tokens
  rotary_pos_emb: rotary_emb
  decoder.layers: layers
  decoder.final_layernorm: norm
  output_layer: lm_head
```
```yaml
# NPU side: LLamaFactory; Bench side: Mindspeed-MM
TopLayer:
  module: 0.module

Qwen2_5_VLForConditionalGeneration:
  visual: module.image_encoder
  model: module.text_decoder
  lm_head: module.text_decoder.output_layer

Qwen2_5_VisionTransformerPretrainedModel:
  patch_embed: encoder.patch_embed
  rotary_pos_emb: encoder.rotary_pos_emb
  blocks: encoder.blocks.layers
  merger: projector

Qwen2_5_VLVisionBlock:
  norm1: input_layernorm
  attn: self_attention
  norm2: pre_mlp_layernorm

Qwen2_5_VLVisionSdpaAttention:
  qkv: linear_qkv
  proj: linear_proj

Qwen2_5_VLMLP:
  up_proj: linear_fc1
  down_proj: linear_fc2

Qwen2_5_VLPatchMerger:
  ln_q: layernorm
  mlp: encoder
  mlp.0: encoder.linear_fc1
  mlp.2: encoder.linear_fc2

Qwen2_5_VLModel:
  embed_tokens: embedding.word_embeddings
  rotary_emb: rotary_pos_emb
  layers: decoder.layers
  norm: decoder.final_layernorm

Qwen2_5_VLDecoderLayer:
  self_attn: self_attention
  self_attn.o_proj: self_attention.linear_proj
  post_attention_layernorm: pre_mlp_layernorm
```

**Qwen2.5**

```yaml
# NPU side: Mindspeed-LLM; Bench side: LLamaFactory
TopLayer:
  0.module: module

Float16Module:
  module: model
  module.output_layer: lm_head

GPTModel:
  embedding.word_embeddings: embed_tokens
  decoder.layers: layers
  decoder.final_layernorm: norm

TransformerLayer:
  self_attention: self_attn
  pre_mlp_layernorm: post_attention_layernorm

SelfAttention:
  linear_proj: o_proj

MLP:
  linear_fc1: up_proj
  linear_fc2: down_proj
```
```yaml
# NPU side: LLamaFactory; Bench side: Mindspeed-LLM
TopLayer:
  module: 0.module

Qwen2ForCausalLM:
  model: module
  lm_head: module.output_layer

Qwen2Model:
  embed_tokens: embedding.word_embeddings
  layers: decoder.layers
  norm: decoder.final_layernorm

Qwen2DecoderLayer:
  self_attn: self_attention
  post_attention_layernorm: pre_mlp_layernorm

Qwen2Attention:
  o_proj: linear_proj

Qwen2MLP:
  up_proj: linear_fc1
  down_proj: linear_fc2
```

#### 2.2.2 Building a layer_mapping file

Take the Qwen2.5vl model with MindSpeed on the NPU side and LLamaFactory on the Bench side as an example.

1. Print the model structure

   As described in [adding the msprobe collection interfaces](#12-添加msprobe工具采集接口), the instrumentation adds `debugger.start(model=model)`; calling `print(model)` on the `model` passed to the `start` interface prints the model structure.

   Printed structures: [mindspeed-mm-qwen25vl.txt](./mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt), [llamafactory-qwen25vl.txt](./mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt)

2. Configure the layer mapping from the outside in, following the model structure

   - Structure 1

     ![screenshot](./mindspeed_llamafactoary_img/1.png)

     ```yaml
     TopLayer: # the top level of the model
       0.module: module # MindSpeed's model is a list, so msprobe prefixes dumped names with the model's index in that list; hence the 0.module -> module mapping
     Float16Module: # MindSpeed's Float16Module sits at the same level as LLamaFactory's Qwen2_5_VLForConditionalGeneration; map their sublayers
       module.image_encoder: visual # MindSpeed's Float16Module has an extra sublayer `module`; levels are joined with ".", hence module.image_encoder
       module.text_decoder: model
     ```
   - Structure 2

     ![screenshot](./mindspeed_llamafactoary_img/2.png)

     ```yaml
     VisionModel: # MindSpeed's VisionModel sits at the same level as LLamaFactory's Qwen2_5_VisionTransformerPretrainedModel; map their sublayers
       encoder.patch_embed: patch_embed
       encoder.rotary_pos_emb: rotary_pos_emb
       encoder.blocks.layers: blocks
       projector: merger
     ```
   - Structure 3

     ![screenshot](./mindspeed_llamafactoary_img/3.png)

     ```yaml
     TransformerLayer: # MindSpeed's TransformerLayer sits at the same level as LLamaFactory's Qwen2_5_VLVisionBlock; map their sublayers
       input_layernorm: norm1
       self_attention: attn
       pre_mlp_layernorm: norm2
     ```
   - Structure 4

     ![screenshot](./mindspeed_llamafactoary_img/4.png)

     ```yaml
     Qwen2vlVitSelfAttention: # MindSpeed's Qwen2vlVitSelfAttention sits at the same level as LLamaFactory's Qwen2_5_VLVisionSdpaAttention; map their sublayers
       linear_qkv: qkv
       linear_proj: proj

     MLP: # MindSpeed's MLP sits at the same level as LLamaFactory's Qwen2_5_VLMLP; map their sublayers
       linear_fc1: up_proj
       linear_fc2: down_proj
     ```
   - Structure 5

     ![screenshot](./mindspeed_llamafactoary_img/5.png)

     ```yaml
     MultimodalProjector: # MindSpeed's MultimodalProjector sits at the same level as LLamaFactory's Qwen2_5_VLPatchMerger; map their sublayers
       layernorm: ln_q
       encoder: mlp
       encoder.linear_fc1: mlp.0
       encoder.linear_fc2: mlp.2
     ```
   - Structure 6

     ![screenshot](./mindspeed_llamafactoary_img/6.png)

     ```yaml
     MMGPTModel: # MindSpeed's MMGPTModel sits at the same level as LLamaFactory's Qwen2_5_VLModel; map their sublayers
       embedding.word_embeddings: embed_tokens
       rotary_pos_emb: rotary_emb
       decoder.layers: layers
       decoder.final_layernorm: norm
       output_layer: lm_head
     ```
   - Structure 7

     ![screenshot](./mindspeed_llamafactoary_img/7.png)

     The TransformerLayer and MLP levels are already configured and cannot be configured twice, so these remaining nodes can be matched via [manual node matching](#23-手动选择节点匹配).

### 2.3 Manual node matching

If some nodes remain unmatched after the layer_mapping configuration, you can pair them in the browser by selecting the two grey nodes to be matched with the mouse.

See [manual node matching in the visualization guide](../21.visualization_PyTorch.md#56-手动选择节点匹配).
msprobe/mindspore/__init__.py
CHANGED

@@ -25,4 +25,4 @@ except ImportError:
 from msprobe.mindspore.debugger.precision_debugger import PrecisionDebugger
 from msprobe.mindspore.common.utils import seed_all, MsprobeStep, MsprobeInitStep
 from msprobe.mindspore.monitor.module_hook import TrainerMon
-from msprobe.mindspore.dump.graph_tensor_dump import save, save_grad
+from msprobe.mindspore.dump.graph_tensor_dump import save, save_grad, step
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py
CHANGED

@@ -17,7 +17,7 @@ import os
 from dataclasses import dataclass
 from typing import Any, Optional
 from tqdm import tqdm
-
+
 from msprobe.core.common.const import Const, CompareConst
 from msprobe.core.common.file_utils import FileOpen, create_directory, write_csv, load_json, load_yaml
 from msprobe.core.common.utils import add_time_as_suffix
msprobe/mindspore/api_accuracy_checker/api_runner.py
CHANGED

@@ -152,18 +152,21 @@ class ApiRunner:
         """
         api_name_list = api_name_str.split(Const.SEP)
         if len(api_name_list) != 3:
-            err_msg = f"ApiRunner.get_info_from_name failed: api_name_str: {api_name_str} is not in defined format"
-            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.WrongValue))
+            err_msg = f"ApiRunner.get_info_from_name failed: api_name_str: {api_name_str} is not in defined format." \
+                      f" Exception has been raised and will be captured/logged externally."
+            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.WrongValue))
         api_type_str, api_sub_name = api_name_list[0], api_name_list[1]
         if api_type_str not in [MsCompareConst.MINT, MsCompareConst.MINT_FUNCTIONAL, MsCompareConst.TENSOR_API,
                                 MsCompareConst.FUNCTIONAL_API] \
                 and api_platform == Const.MS_FRAMEWORK:
-            err_msg = f"ApiRunner.get_info_from_name failed: not mint, mint.nn.functional or Tensor api"
-            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.WrongValue))
+            err_msg = f"ApiRunner.get_info_from_name failed: not mint, mint.nn.functional or Tensor api," \
+                      f" api_name={api_name_str}. Exception has been raised and will be captured/logged externally."
+            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.WrongValue))

         if api_type_str not in MsCompareConst.MT_VALID_API_TYPES and api_platform == Const.MT_FRAMEWORK:
-            err_msg = f"ApiRunner.get_info_from_name failed: not torch, functional or Tensor api"
-            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.WrongValue))
+            err_msg = f"ApiRunner.get_info_from_name failed: not torch, functional or Tensor api," \
+                      f" api_name={api_name_str}. Exception has been raised and will be captured/logged externally."
+            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.WrongValue))
         return api_type_str, api_sub_name

     @staticmethod
msprobe/mindspore/api_accuracy_checker/compute_element.py
CHANGED

@@ -67,8 +67,9 @@ class ComputeElement:
         elif compute_element_info is None:
             self._init_from_null_compute_element_info()
         else:
-            logger.warning_log_with_exp(
-                "ComputeElement.__init__ failed: not init with parameter or compute_element info is not (list, dict)",
+            logger.warning_log_with_exp(
+                "ComputeElement.__init__ failed: not init with parameter or compute_element info is not (list, dict)."
+                " Exception has been raised and will be captured/logged externally.",
                 ApiAccuracyCheckerException(ApiAccuracyCheckerException.UnsupportType))

     @staticmethod

@@ -82,8 +83,9 @@ class ComputeElement:
         ms_dtype = ms_tensor.dtype
         dtype_str = ms_dtype_to_dtype_str.get(ms_dtype)
         if dtype_str not in dtype_str_to_torch_dtype:
-            err_msg = f"ComputeElement.transfer_to_torch_tensor failed: no matching torch dtype for {dtype_str}"
-            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.UnsupportType))
+            err_msg = f"ComputeElement.transfer_to_torch_tensor failed: no matching torch dtype" \
+                      f" for {dtype_str}. Exception has been raised and will be captured/logged externally."
+            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.UnsupportType))
         else:
             torch_dtype = dtype_str_to_torch_dtype.get(dtype_str)

@@ -109,8 +111,9 @@ class ComputeElement:
         dtype_str = ms_dtype_to_dtype_str.get(ms_dtype)

         if dtype_str not in dtype_str_to_mindtorch_dtype:
-            err_msg = f"ComputeElement.transfer_to_mindtorch_tensor failed: no matching mindtorch dtype for {dtype_str}"
-            logger.warning_log_with_exp(err_msg,
+            err_msg = f"ComputeElement.transfer_to_mindtorch_tensor failed: no matching mindtorch dtype for" \
+                      f" {dtype_str}. Exception has been raised and will be captured/logged externally."
+            logger.warning_log_with_exp(err_msg,
                                         ApiAccuracyCheckerException(ApiAccuracyCheckerException.UnsupportType))
         else:
             mindtorch_dtype = dtype_str_to_mindtorch_dtype.get(dtype_str)

@@ -139,8 +142,9 @@ class ComputeElement:
         dtype_str = torch_dtype_to_dtype_str.get(torch_dtype)
         if dtype_str not in dtype_str_to_ms_dtype:
             err_msg = \
-                f"ComputeElement._transfer_to_mindspore_tensor failed: no matching mindspore dtype for {dtype_str}"
-            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.UnsupportType))
+                f"ComputeElement._transfer_to_mindspore_tensor failed: no matching mindspore dtype for {dtype_str}. " \
+                f"Exception has been raised and will be captured/logged externally."
+            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.UnsupportType))
         else:
             ms_dtype = dtype_str_to_ms_dtype.get(dtype_str)

@@ -198,8 +202,9 @@ class ComputeElement:
             parameter_tmp = mindspore.Tensor(ndarray, dtype=ms_dtype)
         else:
             err_msg = "ComputeElement.get_parameter failed: self.parameter type is not in " \
-                      "(int, float, str, slice, bool, torch.Tensor, mindspore.Tensor, MstensorMetaData)"
-            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.UnsupportType))
+                      "(int, float, str, slice, bool, torch.Tensor, mindspore.Tensor, MstensorMetaData)." \
+                      "Exception has been raised and will be captured/logged externally."
+            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.UnsupportType))

         # if necessary, do transfer
         if not get_origin and isinstance(parameter_tmp, mindspore.Tensor) and tensor_platform == Const.PT_FRAMEWORK:

@@ -296,8 +301,9 @@ class ComputeElement:
         self.shape = tuple()
         if not isinstance(parameter, self.supported_parameter_type):
             err_msg = "ComputeElement._init_with_parameter failed: " \
-                      "parameter type is not in (int, float, str, slice, bool, torch.Tensor, mindspore.Tensor)"
-            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.UnsupportType))
+                      "parameter type is not in (int, float, str, slice, bool, torch.Tensor, mindspore.Tensor)." \
+                      "Exception has been raised and will be captured/logged externally."
+            logger.warning_log_with_exp(err_msg, ApiAccuracyCheckerException(ApiAccuracyCheckerException.UnsupportType))
         if isinstance(parameter, mindspore.Tensor):
             self.shape = tuple(parameter.shape)
             self.dtype_str = ms_dtype_to_dtype_str.get(parameter.dtype)