mindstudio-probe 8.2.0__py3-none-any.whl → 8.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +63 -61
- msprobe/README.md +4 -4
- msprobe/core/common/const.py +6 -0
- msprobe/core/common/db_manager.py +35 -4
- msprobe/core/common/file_utils.py +28 -5
- msprobe/core/common/megatron_utils.py +59 -0
- msprobe/core/common/utils.py +14 -3
- msprobe/core/compare/diff_analyze/first_diff_analyze.py +16 -4
- msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
- msprobe/core/compare/find_first/analyzer.py +8 -7
- msprobe/core/compare/find_first/graph.py +11 -3
- msprobe/core/compare/find_first/utils.py +3 -2
- msprobe/core/compare/highlight.py +13 -6
- msprobe/core/compare/multiprocessing_compute.py +17 -10
- msprobe/core/compare/utils.py +14 -5
- msprobe/core/data_dump/data_collector.py +18 -21
- msprobe/core/data_dump/data_processor/pytorch_processor.py +43 -20
- msprobe/core/data_dump/json_writer.py +18 -8
- msprobe/core/data_dump/scope.py +4 -6
- msprobe/core/hook_manager.py +21 -0
- msprobe/core/service.py +2 -0
- msprobe/core/single_save/single_comparator.py +16 -3
- msprobe/docs/01.installation.md +7 -5
- msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
- msprobe/docs/06.data_dump_MindSpore.md +1 -1
- msprobe/docs/10.accuracy_compare_PyTorch.md +46 -5
- msprobe/docs/14.data_parse_PyTorch.md +1 -1
- msprobe/docs/19.monitor.md +2 -0
- msprobe/docs/21.visualization_PyTorch.md +15 -80
- msprobe/docs/22.visualization_MindSpore.md +20 -104
- msprobe/docs/23.generate_operator_PyTorch.md +1 -1
- msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
- msprobe/mindspore/cell_processor.py +33 -5
- msprobe/mindspore/compare/common_dir_compare.py +22 -26
- msprobe/mindspore/debugger/precision_debugger.py +1 -1
- msprobe/mindspore/dump/cell_dump_process.py +73 -62
- msprobe/mindspore/dump/graph_mode_cell_dump.py +21 -10
- msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +2 -0
- msprobe/pytorch/compare/utils.py +2 -1
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
- msprobe/pytorch/dump/module_dump/module_processer.py +15 -8
- msprobe/pytorch/monitor/module_hook.py +28 -9
- msprobe/pytorch/online_dispatch/dispatch.py +42 -24
- msprobe/visualization/builder/graph_builder.py +169 -64
- msprobe/visualization/builder/graph_merger.py +0 -1
- msprobe/visualization/builder/msprobe_adapter.py +1 -1
- msprobe/visualization/db_utils.py +25 -2
- msprobe/visualization/graph/base_node.py +0 -24
- msprobe/visualization/graph/graph.py +5 -14
- msprobe/visualization/graph_service.py +29 -53
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
|
@@ -21,40 +21,37 @@
|
|
|
21
21
|
|
|
22
22
|
### 1.1 安装msprobe工具
|
|
23
23
|
|
|
24
|
-
[msprobe工具安装](
|
|
24
|
+
[msprobe工具安装](./01.installation.md)
|
|
25
25
|
|
|
26
26
|
### 1.2 安装tb_graph_ascend
|
|
27
27
|
|
|
28
28
|
**请安装tb_graph_ascend,否则无法解析构图结果。**
|
|
29
29
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
如需升级工具,请先``pip3 uninstall tb-graph-ascend``再``pip3 install tb-graph-ascend``即可。
|
|
30
|
+
[tb_graph_ascend安装](../../../../plugins/tensorboard-plugins/tb_graph_ascend#2-安装方式)
|
|
33
31
|
|
|
34
32
|
## 2.模型结构数据采集
|
|
35
|
-
[
|
|
33
|
+
[MindSpore场景的数据采集](./06.data_dump_MindSpore.md)
|
|
36
34
|
|
|
37
35
|
**仅支持动态图场景,需要选择level为L0(cell信息)或者mix(cell信息+api信息),才能采集到模型结构数据,即采集结果件construct.json内容不为空**。
|
|
38
36
|
|
|
39
37
|
## 3.生成图结构文件
|
|
40
38
|
|
|
41
39
|
### 3.1 构图命令行说明
|
|
42
|
-
|
|
40
|
+
|
|
43
41
|
**命令示例如下**:
|
|
44
42
|
```
|
|
45
43
|
msprobe -f mindspore graph -i ./compare.json -o ./output
|
|
46
44
|
```
|
|
47
45
|
**命令行参数说明**:
|
|
48
46
|
|
|
49
|
-
| 参数名 | 说明
|
|
50
|
-
|
|
51
|
-
| -f 或 --framework | 指定训练框架。mindspore。
|
|
52
|
-
| -i 或 --input_path | 指定比对文件,参考[比对文件说明](#313-比对文件说明)
|
|
53
|
-
| -o 或 --output_path | 配置比对结果文件存盘目录,str 类型。文件名称基于时间戳自动生成,格式为:`compare_{timestamp}.vis或build_{timestamp}.vis`。
|
|
47
|
+
| 参数名 | 说明 | 是否必选 |
|
|
48
|
+
|-------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- |
|
|
49
|
+
| -f 或 --framework | 指定训练框架。mindspore。 | 是 |
|
|
50
|
+
| -i 或 --input_path | 指定比对文件,参考[比对文件说明](#313-比对文件说明) | 是 |
|
|
51
|
+
| -o 或 --output_path | 配置比对结果文件存盘目录,str 类型。文件名称基于时间戳自动生成,格式为:`compare_{timestamp}.vis.db或build_{timestamp}.vis.db`。 | 是 |
|
|
54
52
|
| -lm 或 --layer_mapping| 跨框架比对,MindSpore和PyTorch的比对场景。配置该参数时表示开启跨框架Layer层的比对功能,指定模型代码中的Layer层后,可以识别对应dump数据中的模块或API。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件(Layer)](#71-自定义映射文件layer), 如何配置自定义映射文件请参考[模型分级可视化如何配置layer mapping映射文件](./visualization/layer_mapping_example.md)。配置该参数后,将仅按节点名称进行比对,忽略节点的 type 和 shape。如果调试侧和标杆侧有名称不同的节点,则需要配置自定义映射文件,-lm参数传入自定义映射文件路径;如果调试侧和标杆侧节点名称相同,则仅指定-lm即可。 | 否 |
|
|
55
|
-
| -oc 或 --overflow_check | 是否开启溢出检测模式,开启后会在输出
|
|
56
|
-
| -f 或 --fuzzy_match | 是否开启模糊匹配,bool类型。模糊匹配说明参考[匹配说明](#311-匹配说明)
|
|
57
|
-
| -cs 或 --complete_stack | 是否使用完整的堆栈信息,bool类型。默认使用精简的堆栈信息,数据量小有助于增加流畅度。完整堆栈和精简堆栈信息参考[堆栈信息说明](#72-堆栈信息说明) | 否 |
|
|
53
|
+
| -oc 或 --overflow_check | 是否开启溢出检测模式,开启后会在输出db文件中(`compare_{timestamp}.vis.db或build_{timestamp}.vis.db`)对每个溢出节点进行标记溢出等级,溢出等级说明参考[溢出等级说明](#312-溢出等级说明) | 否 |
|
|
54
|
+
| -f 或 --fuzzy_match | 是否开启模糊匹配,bool类型。模糊匹配说明参考[匹配说明](#311-匹配说明) | 否 |
|
|
58
55
|
|
|
59
56
|
#### 3.1.1 匹配说明
|
|
60
57
|
|
|
@@ -140,7 +137,7 @@ msprobe -f mindspore graph -i ./compare.json -o ./output
|
|
|
140
137
|
|
|
141
138
|
3.md5:dump了API和Module的输入输出数据统计信息和md5信息。
|
|
142
139
|
|
|
143
|
-
dump类型如何配置见[数据采集配置文件介绍](https://
|
|
140
|
+
dump类型如何配置见[数据采集配置文件介绍](https://gitcode.com/Ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/02.config_introduction.md)
|
|
144
141
|
|
|
145
142
|
**1. 准备比对文件**:
|
|
146
143
|
|
|
@@ -213,25 +210,15 @@ npu_path或bench_path格式:必须只包含rank+数字格式的文件夹,且
|
|
|
213
210
|
```
|
|
214
211
|
msprobe -f mindspore graph -i ./compare.json -o ./output
|
|
215
212
|
```
|
|
216
|
-
比对完成后将在**output**下生成
|
|
213
|
+
比对完成后将在**output**下生成1个**vis.db后缀文件**。
|
|
217
214
|
|
|
218
215
|
图构建:
|
|
219
216
|
```
|
|
220
|
-
├──
|
|
221
|
-
├── build_rank1_{timestamp}.vis
|
|
222
|
-
├── build_rank2_{timestamp}.vis
|
|
223
|
-
├── build_rank3_{timestamp}.vis
|
|
224
|
-
├── ...
|
|
225
|
-
├── build_rankn_{timestamp}.vis
|
|
217
|
+
├── build_{timestamp}.vis.db
|
|
226
218
|
```
|
|
227
219
|
图比对:
|
|
228
220
|
```
|
|
229
|
-
├──
|
|
230
|
-
├── compare_rank1_{timestamp}.vis
|
|
231
|
-
├── compare_rank2_{timestamp}.vis
|
|
232
|
-
├── compare_rank3_{timestamp}.vis
|
|
233
|
-
├── ...
|
|
234
|
-
├── compare_rankn_{timestamp}.vis
|
|
221
|
+
├── compare_{timestamp}.vis.db
|
|
235
222
|
```
|
|
236
223
|
##### 3.2.3.2 多step批量构建或比对
|
|
237
224
|
批量构建或比对多个step下的所有rank的数据
|
|
@@ -278,33 +265,15 @@ npu_path或bench_path格式:必须只包含step+数字格式的文件夹,且
|
|
|
278
265
|
```
|
|
279
266
|
msprobe -f mindspore graph -i ./compare.json -o ./output
|
|
280
267
|
```
|
|
281
|
-
比对完成后将在**output
|
|
268
|
+
比对完成后将在**output**下生成1个**vis.db后缀文件**。
|
|
282
269
|
|
|
283
270
|
图构建:
|
|
284
271
|
```
|
|
285
|
-
├──
|
|
286
|
-
├── build_step0_rank1_{timestamp}.vis
|
|
287
|
-
├── build_step0_rank2_{timestamp}.vis
|
|
288
|
-
├── build_step0_rank3_{timestamp}.vis
|
|
289
|
-
├── build_step1_rank0_{timestamp}.vis
|
|
290
|
-
├── build_step1_rank1_{timestamp}.vis
|
|
291
|
-
├── build_step1_rank2_{timestamp}.vis
|
|
292
|
-
├── build_step1_rank3_{timestamp}.vis
|
|
293
|
-
├── ...
|
|
294
|
-
├── build_stepn_rankn_{timestamp}.vis
|
|
272
|
+
├── build_{timestamp}.vis.db
|
|
295
273
|
```
|
|
296
274
|
图比对:
|
|
297
275
|
```
|
|
298
|
-
├──
|
|
299
|
-
├── compare_step0_rank1_{timestamp}.vis
|
|
300
|
-
├── compare_step0_rank2_{timestamp}.vis
|
|
301
|
-
├── compare_step0_rank3_{timestamp}.vis
|
|
302
|
-
├── compare_step1_rank0_{timestamp}.vis
|
|
303
|
-
├── compare_step1_rank1_{timestamp}.vis
|
|
304
|
-
├── compare_step1_rank2_{timestamp}.vis
|
|
305
|
-
├── compare_step1_rank3_{timestamp}.vis
|
|
306
|
-
├── ...
|
|
307
|
-
├── compare_stepn_rankn_{timestamp}.vis
|
|
276
|
+
├── compare_{timestamp}.vis.db
|
|
308
277
|
```
|
|
309
278
|
|
|
310
279
|
#### 3.2.4 仅模型结构比对
|
|
@@ -413,9 +382,11 @@ tensorboard --logdir out_path
|
|
|
413
382
|
|
|
414
383
|
### 5.1 浏览器打开图
|
|
415
384
|
推荐使用谷歌浏览器,在浏览器中输入机器地址+端口号回车,出现TensorBoard页面,其中/#graph_ascend会自动拼接。
|
|
385
|
+
|
|
416
386
|

|
|
417
387
|
|
|
418
388
|
如果您切换了TensorBoard的其他功能,此时想回到模型分级可视化页面,可以点击左上方的**GRAPH_ASCEND**
|
|
389
|
+
|
|
419
390
|

|
|
420
391
|
|
|
421
392
|
### 5.2 查看图
|
|
@@ -530,61 +501,6 @@ yaml文件中只需配置MindSpore与PyTorch模型代码中功能一致但名称
|
|
|
530
501
|
|
|
531
502
|

|
|
532
503
|
|
|
533
|
-
### 7.2 堆栈信息说明
|
|
534
|
-
|
|
535
|
-
**精简堆栈**
|
|
536
|
-
|
|
537
|
-
保留一条当前模块或api的调用信息
|
|
538
|
-
|
|
539
|
-
```json
|
|
540
|
-
{
|
|
541
|
-
"Cell.model.language_model.embedding.word_embeddings.reduce_scatter_to_sp_region.ReduceScatterToSequenceParallelRegion.forward.0": [
|
|
542
|
-
"File /home/mindformers/experimental/distri_cores/tensor_parallel/layers.py, line 770, in construct, \n output = self.reduce_scatter_to_sp_region(output_parallel)"
|
|
543
|
-
]
|
|
544
|
-
}
|
|
545
|
-
```
|
|
546
|
-
|
|
547
|
-
**完整堆栈**
|
|
548
|
-
|
|
549
|
-
当前模块或api完整的调用信息
|
|
550
|
-
|
|
551
|
-
```json
|
|
552
|
-
{
|
|
553
|
-
"Cell.model.language_model.embedding.word_embeddings.reduce_scatter_to_sp_region.ReduceScatterToSequenceParallelRegion.forward.0": [
|
|
554
|
-
"File /home/mindspore/nn/cell.py, line 507, in _run_construct, \n output = self._run_forward_hook(inputs, output)",
|
|
555
|
-
"File /home/mindspore/nn/cell.py, line 759, in _complex_call, \n output = self._run_construct(*args, **kwargs)",
|
|
556
|
-
"File /home/mindspore/nn/cell.py, line 747, in __call__, \n return self._complex_call(*args, **kwargs)",
|
|
557
|
-
"File /home/mindformers/experimental/distri_cores/tensor_parallel/layers.py, line 770, in construct, \n output = self.reduce_scatter_to_sp_region(output_parallel)",
|
|
558
|
-
"File /home/mindspore/nn/cell.py, line 2462, in _backward_hook_construct, \n outputs = self.construct(outputs, **kwargs)",
|
|
559
|
-
"File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)",
|
|
560
|
-
"File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)",
|
|
561
|
-
"File /home/mindformers/experimental/distri_cores/transformer/language_model.py, line 151, in construct, \n embeddings = self.word_embeddings(input_ids)",
|
|
562
|
-
"File /home/mindspore/nn/cell.py, line 2460, in _backward_hook_construct, \n outputs = self.construct(*outputs, **kwargs)",
|
|
563
|
-
"File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)",
|
|
564
|
-
"File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)",
|
|
565
|
-
"File /home/mindformers/experimental/distri_cores/transformer/language_model.py, line 391, in construct, \n text_embedding_out = self.embedding(enc_input_ids, enc_position_ids,",
|
|
566
|
-
"File /home/mindspore/nn/cell.py, line 2460, in _backward_hook_construct, \n outputs = self.construct(*outputs, **kwargs)",
|
|
567
|
-
"File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)",
|
|
568
|
-
"File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)",
|
|
569
|
-
"File /home/model/gpt_model.py, line 104, in construct, \n lm_output = self.language_model(tokens,",
|
|
570
|
-
"File /home/mindspore/nn/cell.py, line 2460, in _backward_hook_construct, \n outputs = self.construct(*outputs, **kwargs)",
|
|
571
|
-
"File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)",
|
|
572
|
-
"File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)",
|
|
573
|
-
"File /home/mindformers/experimental/distri_cores/pipeline_parallel/pipeline_cell.py, line 429, in construct, \n return self.model(*inputs)",
|
|
574
|
-
"File /home/mindspore/nn/cell.py, line 757, in _complex_call, \n output = self.construct(*args, **kwargs)",
|
|
575
|
-
"File /home/mindspore/nn/cell.py, line 747, in __call__, \n return self._complex_call(*args, **kwargs)",
|
|
576
|
-
"File /home/mindformers/experimental/distri_cores/pipeline_parallel/schedules.py, line 121, in run_forward, \n output_tensor = model(*input_data, recv_data=None)",
|
|
577
|
-
"File /home/mindformers/experimental/distri_cores/pipeline_parallel/schedules.py, line 735, in forward_backward_pipelining_without_interleaving, \n micro_input_data = run_forward(*micro_input_data,",
|
|
578
|
-
"File /home/mindformers/experimental/distri_cores/training.py, line 409, in forward_backward_with_pipelining, \n loss, logits, grads = forward_backward_pipelining_without_interleaving(",
|
|
579
|
-
"File /home/mindformers/experimental/distri_cores/training.py, line 533, in construct, \n (loss, _), grads = self.forward_backward_func(*inputs_tuple, loss_scale=current_step_loss_scale, **inputs_dict)",
|
|
580
|
-
"File /home/mindspore/nn/cell.py, line 757, in _complex_call, \n output = self.construct(*args, **kwargs)",
|
|
581
|
-
"File /home/mindspore/nn/cell.py, line 747, in __call__, \n return self._complex_call(*args, **kwargs)",
|
|
582
|
-
"File /home/mindformers/experimental/distri_cores/training.py, line 655, in train, \n loss, is_finite, loss_scale, learning_rate = train_one_step_cell(**data)",
|
|
583
|
-
"File /home/model/pretrain_gpt.py, line 303, in main, \n train(",
|
|
584
|
-
"File /home/model/pretrain_gpt.py, line 316, in <module>, \n main()"
|
|
585
|
-
]
|
|
586
|
-
}
|
|
587
|
-
```
|
|
588
504
|
# FAQ
|
|
589
505
|
1. 图比对场景,节点呈现灰色,且没有精度比对数据,怎么处理?
|
|
590
506
|
|
|
@@ -13,7 +13,7 @@ b. 在生成单API脚本时可以选择由工具构造随机数获得 dump 数
|
|
|
13
13
|
### 前提
|
|
14
14
|
1. 安装 msprobe。详见[ msprobe 安装](./01.installation.md)章节。
|
|
15
15
|
2. 已完成对训练过程的dump,获得dump.json文件。
|
|
16
|
-
[PyTorch场景的数据采集](https://
|
|
16
|
+
[PyTorch场景的数据采集](https://gitcode.com/Ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/05.data_dump_PyTorch.md)
|
|
17
17
|
|
|
18
18
|
**目前仅支持复现API级的数据,故dump时level可选择L1(API信息)或者mix(module信息+API信息)。如需复现真实数据场景的API脚本,dump时task应选择tensor,如需复现随机数据场景的API脚本,dump时task选择statistics**。
|
|
19
19
|
3. 发现某个算子疑似存在精度问题,并得知算子名,如Functional.softmax.3、Tensor.add.0、Torch.matmul.5等
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
# PyTorch 场景的精度数据采集基线
|
|
2
2
|
|
|
3
|
-
## "statistics"
|
|
3
|
+
## "statistics"模式采集时间膨胀参考基线
|
|
4
4
|
|
|
5
|
-
该基线为PyTorch框架下,使用"statistics"
|
|
5
|
+
该基线为PyTorch框架下,使用"statistics"模式采集数据性能膨胀的参考基线。本基线测试了单层 DeepSeek 大模型在不同采集模式8卡下的时间膨胀。
|
|
6
6
|
|
|
7
|
-
| 采集模式 | 无工具 (耗时) | 加工具但未使能 Dump (耗时) |
|
|
8
|
-
|
|
9
|
-
| L0 | ≈
|
|
10
|
-
| L1 | ≈
|
|
11
|
-
| mix | ≈
|
|
7
|
+
| 采集模式 | 无工具 (耗时) | 加工具但未使能 Dump (耗时) | 加工具并使能 Dump (耗时) | 加工具并使能 Md5 Dump (耗时) |
|
|
8
|
+
|:--------:|:--------:|:-------------------:|:--------------------:|:--------------------:|
|
|
9
|
+
| L0 | ≈95.1 ms | ≈95.5 ms (无膨胀) | ≈420.0 ms (膨胀4.5倍) | ≈1011.3 ms (膨胀10倍) |
|
|
10
|
+
| L1 | ≈95.1 ms | ≈115.8 ms (膨胀1.2倍) | ≈2469.0 ms (膨胀26倍) | ≈8636.0 ms (膨胀90倍) |
|
|
11
|
+
| mix | ≈95.1 ms | ≈117.8 ms (膨胀1.2倍) | ≈3635.4 ms (膨胀38倍) | ≈10698.3 ms (膨胀112倍) |
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
## "tensor"模式采集数据量参考基线
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -17,7 +17,7 @@ import os
|
|
|
17
17
|
from dataclasses import dataclass
|
|
18
18
|
from typing import Any, Optional
|
|
19
19
|
from tqdm import tqdm
|
|
20
|
-
|
|
20
|
+
|
|
21
21
|
from msprobe.core.common.const import Const, CompareConst
|
|
22
22
|
from msprobe.core.common.file_utils import FileOpen, create_directory, write_csv, load_json, load_yaml
|
|
23
23
|
from msprobe.core.common.utils import add_time_as_suffix
|
|
@@ -25,6 +25,7 @@ from msprobe.core.common.exceptions import MsprobeException
|
|
|
25
25
|
from msprobe.core.common.runtime import Runtime
|
|
26
26
|
from msprobe.core.common.utils import ModuleQueue, ThreadSafe
|
|
27
27
|
from msprobe.core.data_dump.scope import ModuleRangeScope, MixRangeScope, BaseScope
|
|
28
|
+
from msprobe.core.common.megatron_utils import wrap_megatron_step, get_micro_step, is_megatron
|
|
28
29
|
from msprobe.mindspore.common.const import Const as MsConst
|
|
29
30
|
from msprobe.mindspore.common.log import logger
|
|
30
31
|
from msprobe.mindspore.common.utils import (
|
|
@@ -47,6 +48,28 @@ def get_cell_construct(construct):
|
|
|
47
48
|
return _construct
|
|
48
49
|
|
|
49
50
|
|
|
51
|
+
def patch_schedules_step():
    """Wrap the pipeline-parallel ``forward_step``/``backward_step`` with ``wrap_megatron_step``.

    Two scheduler modules are attempted independently, in this order:
    ``mindspeed.mindspore.core.pipeline_parallel.schedules`` and then
    ``megatron.core.pipeline_parallel.schedules``. A module that cannot be
    imported is skipped with an info log; any other exception raised while
    patching is logged and swallowed, so this function never raises to its caller.
    """

    def _wrap_steps(target):
        # Forward is wrapped before backward; if wrapping backward raises,
        # the already-assigned forward wrapper stays in place.
        target.forward_step = wrap_megatron_step(target.forward_step)
        target.backward_step = wrap_megatron_step(target.backward_step, is_forward=False)

    # Attempt 1: the MindSpeed (MindSpore) scheduler.
    try:
        from mindspeed.mindspore.core.pipeline_parallel import schedules as mindspeed_schedules
        _wrap_steps(mindspeed_schedules)
        logger.info_on_rank_0("Patch mindspeed.mindspore method success.")
    except ImportError:
        logger.info_on_rank_0("No mindspeed.mindspore find.")
    except Exception as err:
        logger.info_on_rank_0(f"Patch mindspeed.mindspore method failed, detail:{str(err)}")

    # Attempt 2: the Megatron scheduler, tried regardless of the outcome above.
    try:
        from megatron.core.pipeline_parallel import schedules as megatron_schedules
        _wrap_steps(megatron_schedules)
        logger.info_on_rank_0("Patch megatron method success.")
    except ImportError:
        logger.info_on_rank_0("No megatron find.")
    except Exception as err:
        logger.info_on_rank_0(f"Patch megatron method failed, detail:{str(err)}")
|
|
71
|
+
|
|
72
|
+
|
|
50
73
|
class CellProcessor:
|
|
51
74
|
cell_queue = ModuleQueue()
|
|
52
75
|
cell_count = {}
|
|
@@ -84,6 +107,8 @@ class CellProcessor:
|
|
|
84
107
|
raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
|
|
85
108
|
'The model cannot be None, when level is "L0" or "mix"')
|
|
86
109
|
|
|
110
|
+
patch_schedules_step()
|
|
111
|
+
|
|
87
112
|
is_registered = False
|
|
88
113
|
model_type = Const.MODULE if is_mindtorch() else Const.CELL
|
|
89
114
|
cells_with_index_in_pynative_mode, cells_with_index_in_graph_mode = get_cells_and_names_with_index(models)
|
|
@@ -127,6 +152,7 @@ class CellProcessor:
|
|
|
127
152
|
Runtime.run_mode = MsConst.PYNATIVE_GRAPH_MODE
|
|
128
153
|
GraphModeCellDump(config, cells_and_names_in_graph_mode, strict=False).handle()
|
|
129
154
|
|
|
155
|
+
|
|
130
156
|
def build_cell_hook(self, cell_name, build_data_hook):
|
|
131
157
|
@ThreadSafe.synchronized
|
|
132
158
|
def forward_pre_hook(cell, args):
|
|
@@ -259,24 +285,26 @@ class CellProcessor:
|
|
|
259
285
|
CellProcessor.cell_stack[tid] = []
|
|
260
286
|
|
|
261
287
|
if self.cell_stack[tid]:
|
|
262
|
-
CellProcessor.module_node[full_name] = self.cell_stack[tid][-1]
|
|
288
|
+
CellProcessor.module_node[full_name] = self.cell_stack[tid][-1] if not is_megatron() \
|
|
289
|
+
else [self.cell_stack[tid][-1], get_micro_step()]
|
|
263
290
|
else:
|
|
264
291
|
parent_name = CellProcessor.cell_queue.find_last(full_name)
|
|
265
|
-
CellProcessor.module_node[full_name] = parent_name
|
|
292
|
+
CellProcessor.module_node[full_name] = parent_name if not is_megatron() else [parent_name, get_micro_step()]
|
|
266
293
|
|
|
267
294
|
CellProcessor.cell_queue.add_name(full_name)
|
|
268
295
|
CellProcessor.cell_stack[tid].append(full_name)
|
|
269
|
-
CellProcessor.api_parent_node[tid] = full_name
|
|
296
|
+
CellProcessor.api_parent_node[tid] = full_name if not is_megatron() else [full_name, get_micro_step()]
|
|
270
297
|
if self.scope:
|
|
271
298
|
self.scope.begin_module(full_name)
|
|
272
299
|
|
|
273
300
|
def set_construct_info_in_hook(self, full_name):
|
|
274
301
|
tid = threading.get_ident()
|
|
275
302
|
CellProcessor.cell_queue.remove_name(full_name)
|
|
276
|
-
CellProcessor.api_parent_node[tid] = None
|
|
303
|
+
CellProcessor.api_parent_node[tid] = None if not is_megatron() else [None, get_micro_step()]
|
|
277
304
|
if self.cell_stack.get(tid):
|
|
278
305
|
CellProcessor.cell_stack[tid].pop()
|
|
279
306
|
if self.cell_stack.get(tid):
|
|
280
|
-
CellProcessor.api_parent_node[tid] = CellProcessor.cell_stack[tid][-1]
|
|
307
|
+
CellProcessor.api_parent_node[tid] = CellProcessor.cell_stack[tid][-1] if not is_megatron() \
|
|
308
|
+
else [CellProcessor.cell_stack[tid][-1], get_micro_step()]
|
|
281
309
|
if self.scope:
|
|
282
310
|
self.scope.end_module(full_name)
|
|
@@ -212,14 +212,14 @@ def do_multi_process(func, map_dict):
|
|
|
212
212
|
df_chunks = [result_df]
|
|
213
213
|
process_num = 1
|
|
214
214
|
logger.info(f"Using {process_num} processes with chunk size {df_chunk_size}")
|
|
215
|
-
|
|
215
|
+
|
|
216
216
|
# 分割字典
|
|
217
217
|
map_chunks = split_dict(map_dict, df_chunk_size)
|
|
218
|
-
|
|
218
|
+
|
|
219
219
|
# 创建结果列表和进程池
|
|
220
220
|
results = []
|
|
221
221
|
pool = multiprocessing.Pool(process_num)
|
|
222
|
-
|
|
222
|
+
|
|
223
223
|
progress_bar = tqdm(total=len(result_df), desc="API/Module Item Compare Process", unit="row", ncols=100)
|
|
224
224
|
|
|
225
225
|
def update_progress(size, progress_lock, extra_param=None):
|
|
@@ -228,34 +228,30 @@ def do_multi_process(func, map_dict):
|
|
|
228
228
|
|
|
229
229
|
def err_call(args):
|
|
230
230
|
logger.error('multiprocess compare failed! Reason: {}'.format(args))
|
|
231
|
-
|
|
232
|
-
pool.close()
|
|
233
|
-
except OSError as e:
|
|
234
|
-
logger.error(f'pool terminate failed: {str(e)}')
|
|
231
|
+
|
|
235
232
|
results = []
|
|
233
|
+
|
|
234
|
+
# 提交任务到进程池
|
|
235
|
+
for process_idx, (df_chunk, map_chunk) in enumerate(zip(df_chunks, map_chunks)):
|
|
236
|
+
start_idx = df_chunk_size * process_idx
|
|
237
|
+
result = pool.apply_async(
|
|
238
|
+
func,
|
|
239
|
+
args=(df_chunk, start_idx, map_chunk, lock),
|
|
240
|
+
error_callback=err_call,
|
|
241
|
+
callback=partial(update_progress, len(map_chunk), lock)
|
|
242
|
+
)
|
|
243
|
+
results.append(result)
|
|
244
|
+
pool.close()
|
|
245
|
+
|
|
236
246
|
try:
|
|
237
|
-
|
|
238
|
-
for process_idx, (df_chunk, map_chunk) in enumerate(zip(df_chunks, map_chunks)):
|
|
239
|
-
start_idx = df_chunk_size * process_idx
|
|
240
|
-
result = pool.apply_async(
|
|
241
|
-
func,
|
|
242
|
-
args=(df_chunk, start_idx, map_chunk, lock),
|
|
243
|
-
error_callback=err_call,
|
|
244
|
-
callback=partial(update_progress, len(map_chunk), lock)
|
|
245
|
-
)
|
|
246
|
-
results.append(result)
|
|
247
|
-
|
|
248
|
-
final_results = [r.get() for r in results]
|
|
249
|
-
# 等待所有任务完成
|
|
250
|
-
pool.close()
|
|
251
|
-
pool.join()
|
|
252
|
-
return pd.concat(final_results, ignore_index=True)
|
|
247
|
+
final_results = [r.get(timeout=3600) for r in results]
|
|
253
248
|
except Exception as e:
|
|
254
|
-
logger.error(f"
|
|
249
|
+
logger.error(f"Task failed with exception: {e}")
|
|
255
250
|
pool.terminate()
|
|
256
251
|
return pd.DataFrame({})
|
|
257
|
-
|
|
258
|
-
|
|
252
|
+
# 等待所有任务完成
|
|
253
|
+
pool.join()
|
|
254
|
+
return pd.concat(final_results, ignore_index=True)
|
|
259
255
|
|
|
260
256
|
|
|
261
257
|
def initialize_result_df(total_size):
|
|
@@ -182,7 +182,7 @@ class PrecisionDebugger(BasePrecisionDebugger):
|
|
|
182
182
|
with ThreadSafe():
|
|
183
183
|
instance.service.step()
|
|
184
184
|
if is_graph_mode_cell_dump_allowed(instance.config):
|
|
185
|
-
GraphModeCellDump.step()
|
|
185
|
+
GraphModeCellDump.step(instance.config.dump_path, instance.config.step, instance.config.task)
|
|
186
186
|
if enable_dynamic_kbyk_dump and instance.config.level_ori == Const.LEVEL_L2:
|
|
187
187
|
_dump_step(1)
|
|
188
188
|
if cls._is_kernel_dump() and _msprobe_c:
|
|
@@ -46,9 +46,11 @@ KEY_FORWARD = CoreConst.FORWARD
|
|
|
46
46
|
KEY_BACKWARD = CoreConst.BACKWARD
|
|
47
47
|
KEY_INPUT = CoreConst.INPUT
|
|
48
48
|
KEY_OUTPUT = CoreConst.OUTPUT
|
|
49
|
-
KEY_DUMP_TENSOR_DATA = "
|
|
49
|
+
KEY_DUMP_TENSOR_DATA = "dump_tensor_data/"
|
|
50
50
|
KEY_STATISTIC_CSV = "statistic.csv"
|
|
51
51
|
KEY_TD_FLAG = "td_flag"
|
|
52
|
+
# 设置落盘文件检测超时时间
|
|
53
|
+
TIMEOUT = 600
|
|
52
54
|
td = ops.TensorDump()
|
|
53
55
|
if (ms.__version__ >= "2.5.0"):
|
|
54
56
|
td_in = ops.TensorDump("in")
|
|
@@ -574,28 +576,33 @@ def generate_stack_info(path):
|
|
|
574
576
|
logger.info(f"Stack data saved to {json_path}")
|
|
575
577
|
|
|
576
578
|
|
|
577
|
-
def is_download_finished(directory,
|
|
579
|
+
def is_download_finished(directory, save_flag):
    """
    Report whether a completion flag file is present in the given directory.

    :param directory: path of the directory to inspect
    :param save_flag: name prefix of the flag file written once the data has been saved to disk
    :return: True if any entry name in the directory starts with ``save_flag``;
             False if there is no such entry or the directory does not exist
    """
    # Pause on every call so that a polling caller does not hit the disk too often.
    time.sleep(0.5)
    logger.info("Waiting for download...")

    # A missing directory is reported and treated as "not finished".
    if not os.path.exists(directory):
        logger.warning(f"The specified directory {directory} does not exist.")
        return False

    # NOTE(review): this is a prefix match, so a flag such as "step_1" would also
    # match an entry named "step_10..." - confirm the flag-file naming rules this out.
    return any(entry_name.startswith(save_flag) for entry_name in os.listdir(directory))
|
|
600
|
+
|
|
596
601
|
|
|
602
|
+
def process_step(dump_path, flag_path, step, step_list):
|
|
603
|
+
if step not in step_list:
|
|
604
|
+
return
|
|
597
605
|
|
|
598
|
-
def process(dump_path):
|
|
599
606
|
if not os.path.exists(dump_path):
|
|
600
607
|
logger.warning('No grap cell data is dumped.')
|
|
601
608
|
create_directory(dump_path)
|
|
@@ -606,32 +613,38 @@ def process(dump_path):
|
|
|
606
613
|
if rank_id is not None:
|
|
607
614
|
rank_dir = CoreConst.RANK + str(rank_id)
|
|
608
615
|
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
616
|
+
step_dir = CoreConst.STEP + str(step)
|
|
617
|
+
|
|
618
|
+
step_path = os.path.join(dump_path, step_dir)
|
|
619
|
+
rank_path = os.path.join(step_path, rank_dir)
|
|
620
|
+
npy_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA)
|
|
621
|
+
save_finish_flag = f"step_{step}"
|
|
622
|
+
start_time = time.time()
|
|
623
|
+
while True:
|
|
624
|
+
is_finished = is_download_finished(flag_path, save_finish_flag)
|
|
625
|
+
if not is_finished:
|
|
626
|
+
logger.info("There is data being downloaded in the specified directory, continue checking...")
|
|
627
|
+
else:
|
|
628
|
+
logger.info("There is no data being downloaded in the specified directory, Stop checking.")
|
|
629
|
+
break
|
|
630
|
+
elapsed_time = time.time() - start_time
|
|
631
|
+
if elapsed_time > TIMEOUT:
|
|
632
|
+
logger.error(f"Check timed out after {TIMEOUT} seconds. Exiting.")
|
|
633
|
+
return
|
|
634
|
+
logger.info(f"==========Start processing step_{step}'s data that has already been stored on the disk!==========")
|
|
635
|
+
rename_filename(path=npy_path)
|
|
636
|
+
generate_construct(npy_path)
|
|
637
|
+
generate_dump_info(npy_path)
|
|
638
|
+
generate_stack_info(npy_path)
|
|
639
|
+
# 单卡场景,rank目录名称为rank
|
|
640
|
+
if rank_id is None:
|
|
641
|
+
new_rank_path = os.path.join(step_path, CoreConst.RANK)
|
|
642
|
+
try:
|
|
643
|
+
move_directory(rank_path, new_rank_path)
|
|
644
|
+
logger.info(f"Directory was successfully renamed to: {new_rank_path}")
|
|
645
|
+
except Exception as e:
|
|
646
|
+
logger.warning(f"Failed to renamed to {new_rank_path}: {e}")
|
|
647
|
+
logger.info(f"==========Step_{step}'s JSON file generation completed!==========")
|
|
635
648
|
|
|
636
649
|
|
|
637
650
|
# 删除csv文件中每行数据最后面的逗号
|
|
@@ -689,7 +702,10 @@ def merge_file(dump_path, rank_dir, file_dict):
|
|
|
689
702
|
" and the index is out of bounds.")
|
|
690
703
|
|
|
691
704
|
|
|
692
|
-
def
|
|
705
|
+
def process_statistics_step(dump_path, step, step_list):
|
|
706
|
+
if step_list and step not in step_list:
|
|
707
|
+
return
|
|
708
|
+
|
|
693
709
|
if not os.path.exists(dump_path):
|
|
694
710
|
logger.warning('No grap cell data is dumped.')
|
|
695
711
|
create_directory(dump_path)
|
|
@@ -723,25 +739,24 @@ def process_statistics(dump_path):
|
|
|
723
739
|
|
|
724
740
|
rank_dir = rank_dir_kbk.replace(CoreConst.REPLACEMENT_CHARACTER, '')
|
|
725
741
|
dir_list = os.listdir(dump_path)
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
logger.info("==========JSON file generation completed!==========")
|
|
742
|
+
step_dir = CoreConst.STEP + str(step)
|
|
743
|
+
step_path = os.path.join(dump_path, step_dir)
|
|
744
|
+
rank_path = os.path.join(step_path, rank_dir)
|
|
745
|
+
csv_path = os.path.join(rank_path, KEY_STATISTIC_CSV)
|
|
746
|
+
logger.info("==========Start processing data csv!==========")
|
|
747
|
+
generate_construct(csv_path)
|
|
748
|
+
generate_dump_info(csv_path)
|
|
749
|
+
generate_stack_info(csv_path)
|
|
750
|
+
remove_path(rank_path_kbk)
|
|
751
|
+
# 单卡场景,rank目录名称为rank
|
|
752
|
+
if rank_id is None:
|
|
753
|
+
new_rank_path = os.path.join(step_path, CoreConst.RANK)
|
|
754
|
+
try:
|
|
755
|
+
move_directory(rank_path, new_rank_path)
|
|
756
|
+
logger.info(f"Directory was successfully renamed to: {new_rank_path}")
|
|
757
|
+
except Exception as e:
|
|
758
|
+
logger.warning(f"Failed to renamed to {new_rank_path}: {e}")
|
|
759
|
+
logger.info("==========JSON file generation completed!==========")
|
|
745
760
|
|
|
746
761
|
|
|
747
762
|
def get_yaml_keys(yaml_data):
|
|
@@ -922,7 +937,3 @@ def start(config: CellDumpConfig):
|
|
|
922
937
|
cell.data_mode = data_mode
|
|
923
938
|
|
|
924
939
|
logger.info("==========The cell_dump_process_start phase is Finished!==========")
|
|
925
|
-
if dump_task == CoreConst.TENSOR:
|
|
926
|
-
atexit.register(process, dump_path=dump_path)
|
|
927
|
-
if dump_task == CoreConst.STATISTICS:
|
|
928
|
-
atexit.register(process_statistics, dump_path=dump_path)
|