mindstudio-probe 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/METADATA +2 -2
  2. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/RECORD +90 -79
  3. msprobe/README.md +7 -5
  4. msprobe/core/common/const.py +6 -0
  5. msprobe/core/common/db_manager.py +35 -4
  6. msprobe/core/common/file_utils.py +105 -27
  7. msprobe/core/common/framework_adapter.py +7 -6
  8. msprobe/core/common/megatron_utils.py +59 -0
  9. msprobe/core/common/utils.py +14 -3
  10. msprobe/core/compare/find_first/analyzer.py +8 -7
  11. msprobe/core/compare/find_first/graph.py +11 -3
  12. msprobe/core/compare/find_first/utils.py +2 -1
  13. msprobe/core/compare/highlight.py +13 -6
  14. msprobe/core/compare/multiprocessing_compute.py +17 -10
  15. msprobe/core/compare/utils.py +14 -5
  16. msprobe/core/data_dump/data_collector.py +18 -21
  17. msprobe/core/data_dump/data_processor/pytorch_processor.py +43 -20
  18. msprobe/core/data_dump/json_writer.py +18 -8
  19. msprobe/core/data_dump/scope.py +4 -6
  20. msprobe/core/hook_manager.py +37 -3
  21. msprobe/core/service.py +18 -5
  22. msprobe/core/single_save/single_comparator.py +16 -3
  23. msprobe/docs/01.installation.md +7 -5
  24. msprobe/docs/02.config_introduction.md +14 -1
  25. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  26. msprobe/docs/06.data_dump_MindSpore.md +1 -1
  27. msprobe/docs/08.accuracy_checker_online_PyTorch.md +295 -0
  28. msprobe/docs/10.accuracy_compare_PyTorch.md +46 -5
  29. msprobe/docs/14.data_parse_PyTorch.md +1 -1
  30. msprobe/docs/15.free_benchmarking_PyTorch.md +1 -1
  31. msprobe/docs/19.monitor.md +2 -0
  32. msprobe/docs/21.visualization_PyTorch.md +15 -80
  33. msprobe/docs/22.visualization_MindSpore.md +20 -104
  34. msprobe/docs/23.generate_operator_PyTorch.md +1 -1
  35. msprobe/docs/25.tool_function_introduction.md +1 -0
  36. msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
  37. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  38. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  39. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  40. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  41. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  42. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  43. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  44. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
  45. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
  46. msprobe/mindspore/cell_processor.py +33 -5
  47. msprobe/mindspore/compare/common_dir_compare.py +22 -26
  48. msprobe/mindspore/compare/utils.py +1 -2
  49. msprobe/mindspore/debugger/precision_debugger.py +1 -1
  50. msprobe/mindspore/dump/cell_dump_process.py +73 -62
  51. msprobe/mindspore/dump/graph_mode_cell_dump.py +21 -10
  52. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +2 -0
  53. msprobe/msprobe.py +6 -4
  54. msprobe/pytorch/api_accuracy_checker/common/config.py +36 -3
  55. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +24 -0
  56. msprobe/pytorch/api_accuracy_checker/compare/compare.py +12 -2
  57. msprobe/pytorch/api_accuracy_checker/config.yaml +6 -1
  58. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
  59. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +132 -12
  60. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
  61. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +205 -0
  62. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +378 -0
  63. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +239 -0
  64. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +115 -0
  65. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +250 -0
  66. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
  67. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +198 -0
  68. msprobe/pytorch/attl_manager.py +65 -0
  69. msprobe/pytorch/common/utils.py +22 -2
  70. msprobe/pytorch/compare/utils.py +3 -3
  71. msprobe/pytorch/debugger/debugger_config.py +10 -0
  72. msprobe/pytorch/dump/module_dump/hook_wrapper.py +34 -7
  73. msprobe/pytorch/dump/module_dump/module_processer.py +23 -10
  74. msprobe/pytorch/hook_module/api_register.py +6 -1
  75. msprobe/pytorch/monitor/module_hook.py +28 -9
  76. msprobe/pytorch/online_dispatch/dispatch.py +42 -24
  77. msprobe/pytorch/pt_config.py +57 -2
  78. msprobe/pytorch/pytorch_service.py +11 -2
  79. msprobe/visualization/builder/graph_builder.py +170 -64
  80. msprobe/visualization/builder/graph_merger.py +0 -1
  81. msprobe/visualization/builder/msprobe_adapter.py +1 -1
  82. msprobe/visualization/db_utils.py +25 -2
  83. msprobe/visualization/graph/base_node.py +0 -24
  84. msprobe/visualization/graph/graph.py +5 -14
  85. msprobe/visualization/graph_service.py +29 -53
  86. msprobe/visualization/utils.py +11 -1
  87. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/LICENSE +0 -0
  88. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/WHEEL +0 -0
  89. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/entry_points.txt +0 -0
  90. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/top_level.txt +0 -0
@@ -21,18 +21,16 @@
21
21
 
22
22
  ### 1.1 安装msprobe工具
23
23
 
24
- [msprobe工具安装](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/01.installation.md)
24
+ [msprobe工具安装](./01.installation.md)
25
25
 
26
26
  ### 1.2 安装tb_graph_ascend
27
27
 
28
28
  **请安装tb_graph_ascend,否则无法解析构图结果。**
29
29
 
30
- ``pip3 install tb-graph-ascend``即可。
31
-
32
- 如需升级工具,请先``pip3 uninstall tb-graph-ascend``再``pip3 install tb-graph-ascend``即可。
30
+ [tb_graph_ascend安装](../../../../plugins/tensorboard-plugins/tb_graph_ascend#2-安装方式)
33
31
 
34
32
  ## 2.模型结构数据采集
35
- [PyTorch场景的数据采集](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/05.data_dump_PyTorch.md)
33
+ [PyTorch场景的数据采集](./05.data_dump_PyTorch.md)
36
34
 
37
35
  **需要选择level为L0(module信息)或者mix(module信息+api信息),才能采集到模型结构数据,即采集结果件construct.json内容不为空**。
38
36
 
@@ -50,11 +48,10 @@ msprobe -f pytorch graph -i ./compare.json -o ./output
50
48
  |------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------|
51
49
  | -f 或 --framework | 指定训练框架。pytorch。 | 是 |
52
50
  | -i 或 --input_path | 指定比对文件,参考[比对文件说明](#313-比对文件说明) | 是 |
53
- | -o 或 --output_path | 配置比对结果文件存盘目录,str 类型。文件名称基于时间戳自动生成,格式为:`compare_{timestamp}.vis或build_{timestamp}.vis`。 | 是 |
51
+ | -o 或 --output_path | 配置比对结果文件存盘目录,str 类型。文件名称基于时间戳自动生成,格式为:`compare_{timestamp}.vis.db或build_{timestamp}.vis.db`。 | 是 |
54
52
  | -lm 或 --layer_mapping | 跨套件比对,例如同一个模型分别使用了DeepSpeed和Megatron套件的比对场景。配置该参数时表示开启跨套件Layer层的比对功能,指定模型代码中的Layer层后,可以识别对应dump数据中的模块或API。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件(Layer)](#71-自定义映射文件layer),如何配置自定义映射文件请参考[模型分级可视化如何配置layer mapping映射文件](./visualization/layer_mapping_example.md)。 配置该参数后,将仅按节点名称进行比对,忽略节点的 type 和 shape。如果调试侧和标杆侧有名称不同的节点,则需要配置自定义映射文件,-lm参数传入自定义映射文件路径;如果调试侧和标杆侧节点名称相同,则仅指定-lm即可。<br/><br/>可参考的实际案例:[MindSpeed&LLamaFactory数据采集和自动比对](./visualization/mindspeed_llamafactory_mapping.md) | 否 |
55
- | -oc 或 --overflow_check | 是否开启溢出检测模式,开启后会在输出vis文件中(`compare_{timestamp}.vis或build_{timestamp}.vis`)对每个溢出节点进行标记溢出等级,溢出等级说明参考[溢出等级说明](#312-溢出等级说明) | 否 |
53
+ | -oc 或 --overflow_check | 是否开启溢出检测模式,开启后会在输出db文件中(`compare_{timestamp}.vis.db或build_{timestamp}.vis.db`)对每个溢出节点进行标记溢出等级,溢出等级说明参考[溢出等级说明](#312-溢出等级说明) | 否 |
56
54
  | -f 或 --fuzzy_match | 是否开启模糊匹配,bool类型。模糊匹配说明参考[匹配说明](#311-匹配说明) | 否 |
57
- | -cs 或 --complete_stack | 是否使用完整的堆栈信息,bool类型。默认使用精简的堆栈信息,数据量小有助于增加流畅度。完整堆栈和精简堆栈信息参考[堆栈信息说明](#72-堆栈信息说明) | 否 |
58
55
 
59
56
  #### 3.1.1 匹配说明
60
57
 
@@ -139,7 +136,7 @@ msprobe -f pytorch graph -i ./compare.json -o ./output
139
136
 
140
137
  3.md5:dump了API和Module的输入输出数据统计信息和md5信息。
141
138
 
142
- dump类型如何配置见[数据采集配置文件介绍](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/02.config_introduction.md)
139
+ dump类型如何配置见[数据采集配置文件介绍](https://gitcode.com/Ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/02.config_introduction.md)
143
140
 
144
141
  **1. 准备比对文件**:
145
142
 
@@ -168,7 +165,7 @@ npu_path或bench_path格式:必须包含dump.json、stack.json和construct.jso
168
165
  msprobe -f pytorch graph -i ./compare.json -o ./output
169
166
  ```
170
167
 
171
- 比对完成后将在**output**下生成一个**vis后缀文件**。
168
+ 比对完成后将在**output**下生成一个**vis.db后缀文件**。
172
169
 
173
170
  #### 3.2.3 批量构建或比对
174
171
  ##### 3.2.3.1 多rank批量构建或比对
@@ -212,25 +209,15 @@ npu_path或bench_path格式:必须只包含rank+数字格式的文件夹,且
212
209
  ```
213
210
  msprobe -f pytorch graph -i ./compare.json -o ./output
214
211
  ```
215
- 比对完成后将在**output**下生成n个**vis后缀文件**。
212
+ 比对完成后将在**output**下生成1个**vis.db后缀文件**。
216
213
 
217
214
  图构建:
218
215
  ```
219
- ├── build_rank0_{timestamp}.vis
220
- ├── build_rank1_{timestamp}.vis
221
- ├── build_rank2_{timestamp}.vis
222
- ├── build_rank3_{timestamp}.vis
223
- ├── ...
224
- ├── build_rankn_{timestamp}.vis
216
+ ├── build_{timestamp}.vis.db
225
217
  ```
226
218
  图比对:
227
219
  ```
228
- ├── compare_rank0_{timestamp}.vis
229
- ├── compare_rank1_{timestamp}.vis
230
- ├── compare_rank2_{timestamp}.vis
231
- ├── compare_rank3_{timestamp}.vis
232
- ├── ...
233
- ├── compare_rankn_{timestamp}.vis
220
+ ├── compare_{timestamp}.vis.db
234
221
  ```
235
222
  ##### 3.2.3.2 多step批量构建或比对
236
223
  批量构建或比对多个step下的所有rank的数据
@@ -277,33 +264,15 @@ npu_path或bench_path格式:必须只包含step+数字格式的文件夹,且
277
264
  ```
278
265
  msprobe -f pytorch graph -i ./compare.json -o ./output
279
266
  ```
280
- 比对完成后将在**output**下生成若干个**vis后缀文件**。
267
+ 比对完成后将在**output**下生成1个**vis.db后缀文件**。
281
268
 
282
269
  图构建:
283
270
  ```
284
- ├── build_step0_rank0_{timestamp}.vis
285
- ├── build_step0_rank1_{timestamp}.vis
286
- ├── build_step0_rank2_{timestamp}.vis
287
- ├── build_step0_rank3_{timestamp}.vis
288
- ├── build_step1_rank0_{timestamp}.vis
289
- ├── build_step1_rank1_{timestamp}.vis
290
- ├── build_step1_rank2_{timestamp}.vis
291
- ├── build_step1_rank3_{timestamp}.vis
292
- ├── ...
293
- ├── build_stepn_rankn_{timestamp}.vis
271
+ ├── build_{timestamp}.vis.db
294
272
  ```
295
273
  图比对:
296
274
  ```
297
- ├── compare_step0_rank0_{timestamp}.vis
298
- ├── compare_step0_rank1_{timestamp}.vis
299
- ├── compare_step0_rank2_{timestamp}.vis
300
- ├── compare_step0_rank3_{timestamp}.vis
301
- ├── compare_step1_rank0_{timestamp}.vis
302
- ├── compare_step1_rank1_{timestamp}.vis
303
- ├── compare_step1_rank2_{timestamp}.vis
304
- ├── compare_step1_rank3_{timestamp}.vis
305
- ├── ...
306
- ├── compare_stepn_rankn_{timestamp}.vis
275
+ ├── compare_{timestamp}.vis.db
307
276
  ```
308
277
 
309
278
  #### 3.2.4 仅模型结构比对
@@ -412,9 +381,11 @@ tensorboard --logdir out_path
412
381
 
413
382
  ### 5.1 浏览器打开图
414
383
  推荐使用谷歌浏览器,在浏览器中输入机器地址+端口号回车,出现TensorBoard页面,其中/#graph_ascend会自动拼接。
384
+
415
385
  ![vis_browser_1](./img/visualization/vis_browser_1.png)
416
386
 
417
387
  如果您切换了TensorBoard的其他功能,此时想回到模型分级可视化页面,可以点击左上方的**GRAPH_ASCEND**
388
+
418
389
  ![vis_browser_2](./img/visualization/vis_browser_2.png)
419
390
 
420
391
  ### 5.2 查看图
@@ -534,42 +505,6 @@ yaml文件中只需配置待调试侧与标杆侧模型代码中功能一致但
534
505
 
535
506
  ![ms_dump](./img/ms_layer.png)
536
507
 
537
- ### 7.2 堆栈信息说明
538
-
539
- **精简堆栈**
540
-
541
- 保留一条当前模块或api的调用信息
542
-
543
- ```json
544
- {
545
- "Module.layer1.0.bn1.BatchNorm2d.forward.0": [
546
- "File /home/torchvision/models/resnet.py, line 93, in forward, \n out = self.bn1(out)"
547
- ]
548
- }
549
- ```
550
-
551
- **完整堆栈**
552
-
553
- 当前模块或api完整的调用信息
554
-
555
- ```json
556
- {
557
- "Module.layer1.0.bn1.BatchNorm2d.forward.0": [
558
- "File /home/torchvision/models/resnet.py, line 93, in forward, \n out = self.bn1(out)",
559
- "File /home/torch/nn/modules/module.py, line 1568, in _call_impl, \n result = forward_call(*args, **kwargs)",
560
- "File /home/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)",
561
- "File /home/torch/nn/modules/container.py, line 215, in forward, \n input = module(input)",
562
- "File /home/torch/nn/modules/module.py, line 1568, in _call_impl, \n result = forward_call(*args, **kwargs)",
563
- "File /home/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)",
564
- "File /home/torchvision/models/resnet.py, line 273, in _forward_impl, \n x = self.layer1(x)",
565
- "File /home/torchvision/models/resnet.py, line 285, in forward, \n return self._forward_impl(x)",
566
- "File /home/torch/nn/modules/module.py, line 1527, in _call_impl, \n return forward_call(*args, **kwargs)",
567
- "File /home/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)",
568
- "File /home/visualization/resnet18.py, line 40, in <module>, \n outputs = model(inputs)"
569
- ]
570
- }
571
-
572
- ```
573
508
  # FAQ
574
509
  1. 图比对场景,节点呈现灰色,且没有精度比对数据,怎么处理?
575
510
 
@@ -21,40 +21,37 @@
21
21
 
22
22
  ### 1.1 安装msprobe工具
23
23
 
24
- [msprobe工具安装](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/01.installation.md)
24
+ [msprobe工具安装](./01.installation.md)
25
25
 
26
26
  ### 1.2 安装tb_graph_ascend
27
27
 
28
28
  **请安装tb_graph_ascend,否则无法解析构图结果。**
29
29
 
30
- ``pip3 install tb-graph-ascend``即可。
31
-
32
- 如需升级工具,请先``pip3 uninstall tb-graph-ascend``再``pip3 install tb-graph-ascend``即可。
30
+ [tb_graph_ascend安装](../../../../plugins/tensorboard-plugins/tb_graph_ascend#2-安装方式)
33
31
 
34
32
  ## 2.模型结构数据采集
35
- [MindSpore场景的精度数据采集](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md)
33
+ [MindSpore场景的数据采集](./06.data_dump_MindSpore.md)
36
34
 
37
35
  **仅支持动态图场景,需要选择level为L0(cell信息)或者mix(cell信息+api信息),才能采集到模型结构数据,即采集结果件construct.json内容不为空**。
38
36
 
39
37
  ## 3.生成图结构文件
40
38
 
41
39
  ### 3.1 构图命令行说明
42
-
40
+
43
41
  **命令示例如下**:
44
42
  ```
45
43
  msprobe -f mindspore graph -i ./compare.json -o ./output
46
44
  ```
47
45
  **命令行参数说明**:
48
46
 
49
- | 参数名 | 说明 | 是否必选 |
50
- |-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- |
51
- | -f 或 --framework | 指定训练框架。mindspore。 | 是 |
52
- | -i 或 --input_path | 指定比对文件,参考[比对文件说明](#313-比对文件说明) | 是 |
53
- | -o 或 --output_path | 配置比对结果文件存盘目录,str 类型。文件名称基于时间戳自动生成,格式为:`compare_{timestamp}.vis或build_{timestamp}.vis`。 | 是 |
47
+ | 参数名 | 说明 | 是否必选 |
48
+ |-------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- |
49
+ | -f 或 --framework | 指定训练框架。mindspore。 | 是 |
50
+ | -i 或 --input_path | 指定比对文件,参考[比对文件说明](#313-比对文件说明) | 是 |
51
+ | -o 或 --output_path | 配置比对结果文件存盘目录,str 类型。文件名称基于时间戳自动生成,格式为:`compare_{timestamp}.vis.db或build_{timestamp}.vis.db`。 | 是 |
54
52
  | -lm 或 --layer_mapping| 跨框架比对,MindSpore和PyTorch的比对场景。配置该参数时表示开启跨框架Layer层的比对功能,指定模型代码中的Layer层后,可以识别对应dump数据中的模块或API。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件(Layer)](#71-自定义映射文件layer), 如何配置自定义映射文件请参考[模型分级可视化如何配置layer mapping映射文件](./visualization/layer_mapping_example.md)。配置该参数后,将仅按节点名称进行比对,忽略节点的 type 和 shape。如果调试侧和标杆侧有名称不同的节点,则需要配置自定义映射文件,-lm参数传入自定义映射文件路径;如果调试侧和标杆侧节点名称相同,则仅指定-lm即可。 | 否 |
55
- | -oc 或 --overflow_check | 是否开启溢出检测模式,开启后会在输出vis文件中(`compare_{timestamp}.vis或build_{timestamp}.vis`)对每个溢出节点进行标记溢出等级,溢出等级说明参考[溢出等级说明](#312-溢出等级说明) | 否 |
56
- | -f 或 --fuzzy_match | 是否开启模糊匹配,bool类型。模糊匹配说明参考[匹配说明](#311-匹配说明) | 否 |
57
- | -cs 或 --complete_stack | 是否使用完整的堆栈信息,bool类型。默认使用精简的堆栈信息,数据量小有助于增加流畅度。完整堆栈和精简堆栈信息参考[堆栈信息说明](#72-堆栈信息说明) | 否 |
53
+ | -oc 或 --overflow_check | 是否开启溢出检测模式,开启后会在输出db文件中(`compare_{timestamp}.vis.db或build_{timestamp}.vis.db`)对每个溢出节点进行标记溢出等级,溢出等级说明参考[溢出等级说明](#312-溢出等级说明) | 否 |
54
+ | -f 或 --fuzzy_match | 是否开启模糊匹配,bool类型。模糊匹配说明参考[匹配说明](#311-匹配说明) | 否 |
58
55
 
59
56
  #### 3.1.1 匹配说明
60
57
 
@@ -140,7 +137,7 @@ msprobe -f mindspore graph -i ./compare.json -o ./output
140
137
 
141
138
  3.md5:dump了API和Module的输入输出数据统计信息和md5信息。
142
139
 
143
- dump类型如何配置见[数据采集配置文件介绍](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/02.config_introduction.md)
140
+ dump类型如何配置见[数据采集配置文件介绍](https://gitcode.com/Ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/02.config_introduction.md)
144
141
 
145
142
  **1. 准备比对文件**:
146
143
 
@@ -213,25 +210,15 @@ npu_path或bench_path格式:必须只包含rank+数字格式的文件夹,且
213
210
  ```
214
211
  msprobe -f mindspore graph -i ./compare.json -o ./output
215
212
  ```
216
- 比对完成后将在**output**下生成n个**vis后缀文件**。
213
+ 比对完成后将在**output**下生成1个**vis.db后缀文件**。
217
214
 
218
215
  图构建:
219
216
  ```
220
- ├── build_rank0_{timestamp}.vis
221
- ├── build_rank1_{timestamp}.vis
222
- ├── build_rank2_{timestamp}.vis
223
- ├── build_rank3_{timestamp}.vis
224
- ├── ...
225
- ├── build_rankn_{timestamp}.vis
217
+ ├── build_{timestamp}.vis.db
226
218
  ```
227
219
  图比对:
228
220
  ```
229
- ├── compare_rank0_{timestamp}.vis
230
- ├── compare_rank1_{timestamp}.vis
231
- ├── compare_rank2_{timestamp}.vis
232
- ├── compare_rank3_{timestamp}.vis
233
- ├── ...
234
- ├── compare_rankn_{timestamp}.vis
221
+ ├── compare_{timestamp}.vis.db
235
222
  ```
236
223
  ##### 3.2.3.2 多step批量构建或比对
237
224
  批量构建或比对多个step下的所有rank的数据
@@ -278,33 +265,15 @@ npu_path或bench_path格式:必须只包含step+数字格式的文件夹,且
278
265
  ```
279
266
  msprobe -f mindspore graph -i ./compare.json -o ./output
280
267
  ```
281
- 比对完成后将在**output**下生成若干个**vis后缀文件**。
268
+ 比对完成后将在**output**下生成1个**vis.db后缀文件**。
282
269
 
283
270
  图构建:
284
271
  ```
285
- ├── build_step0_rank0_{timestamp}.vis
286
- ├── build_step0_rank1_{timestamp}.vis
287
- ├── build_step0_rank2_{timestamp}.vis
288
- ├── build_step0_rank3_{timestamp}.vis
289
- ├── build_step1_rank0_{timestamp}.vis
290
- ├── build_step1_rank1_{timestamp}.vis
291
- ├── build_step1_rank2_{timestamp}.vis
292
- ├── build_step1_rank3_{timestamp}.vis
293
- ├── ...
294
- ├── build_stepn_rankn_{timestamp}.vis
272
+ ├── build_{timestamp}.vis.db
295
273
  ```
296
274
  图比对:
297
275
  ```
298
- ├── compare_step0_rank0_{timestamp}.vis
299
- ├── compare_step0_rank1_{timestamp}.vis
300
- ├── compare_step0_rank2_{timestamp}.vis
301
- ├── compare_step0_rank3_{timestamp}.vis
302
- ├── compare_step1_rank0_{timestamp}.vis
303
- ├── compare_step1_rank1_{timestamp}.vis
304
- ├── compare_step1_rank2_{timestamp}.vis
305
- ├── compare_step1_rank3_{timestamp}.vis
306
- ├── ...
307
- ├── compare_stepn_rankn_{timestamp}.vis
276
+ ├── compare_{timestamp}.vis.db
308
277
  ```
309
278
 
310
279
  #### 3.2.4 仅模型结构比对
@@ -413,9 +382,11 @@ tensorboard --logdir out_path
413
382
 
414
383
  ### 5.1 浏览器打开图
415
384
  推荐使用谷歌浏览器,在浏览器中输入机器地址+端口号回车,出现TensorBoard页面,其中/#graph_ascend会自动拼接。
385
+
416
386
  ![vis_browser_1](./img/visualization/vis_browser_1.png)
417
387
 
418
388
  如果您切换了TensorBoard的其他功能,此时想回到模型分级可视化页面,可以点击左上方的**GRAPH_ASCEND**
389
+
419
390
  ![vis_browser_2](./img/visualization/vis_browser_2.png)
420
391
 
421
392
  ### 5.2 查看图
@@ -530,61 +501,6 @@ yaml文件中只需配置MindSpore与PyTorch模型代码中功能一致但名称
530
501
 
531
502
  ![ms_dump](./img/ms_layer.png)
532
503
 
533
- ### 7.2 堆栈信息说明
534
-
535
- **精简堆栈**
536
-
537
- 保留一条当前模块或api的调用信息
538
-
539
- ```json
540
- {
541
- "Cell.model.language_model.embedding.word_embeddings.reduce_scatter_to_sp_region.ReduceScatterToSequenceParallelRegion.forward.0": [
542
- "File /home/mindformers/experimental/distri_cores/tensor_parallel/layers.py, line 770, in construct, \n output = self.reduce_scatter_to_sp_region(output_parallel)"
543
- ]
544
- }
545
- ```
546
-
547
- **完整堆栈**
548
-
549
- 当前模块或api完整的调用信息
550
-
551
- ```json
552
- {
553
- "Cell.model.language_model.embedding.word_embeddings.reduce_scatter_to_sp_region.ReduceScatterToSequenceParallelRegion.forward.0": [
554
- "File /home/mindspore/nn/cell.py, line 507, in _run_construct, \n output = self._run_forward_hook(inputs, output)",
555
- "File /home/mindspore/nn/cell.py, line 759, in _complex_call, \n output = self._run_construct(*args, **kwargs)",
556
- "File /home/mindspore/nn/cell.py, line 747, in __call__, \n return self._complex_call(*args, **kwargs)",
557
- "File /home/mindformers/experimental/distri_cores/tensor_parallel/layers.py, line 770, in construct, \n output = self.reduce_scatter_to_sp_region(output_parallel)",
558
- "File /home/mindspore/nn/cell.py, line 2462, in _backward_hook_construct, \n outputs = self.construct(outputs, **kwargs)",
559
- "File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)",
560
- "File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)",
561
- "File /home/mindformers/experimental/distri_cores/transformer/language_model.py, line 151, in construct, \n embeddings = self.word_embeddings(input_ids)",
562
- "File /home/mindspore/nn/cell.py, line 2460, in _backward_hook_construct, \n outputs = self.construct(*outputs, **kwargs)",
563
- "File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)",
564
- "File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)",
565
- "File /home/mindformers/experimental/distri_cores/transformer/language_model.py, line 391, in construct, \n text_embedding_out = self.embedding(enc_input_ids, enc_position_ids,",
566
- "File /home/mindspore/nn/cell.py, line 2460, in _backward_hook_construct, \n outputs = self.construct(*outputs, **kwargs)",
567
- "File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)",
568
- "File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)",
569
- "File /home/model/gpt_model.py, line 104, in construct, \n lm_output = self.language_model(tokens,",
570
- "File /home/mindspore/nn/cell.py, line 2460, in _backward_hook_construct, \n outputs = self.construct(*outputs, **kwargs)",
571
- "File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)",
572
- "File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)",
573
- "File /home/mindformers/experimental/distri_cores/pipeline_parallel/pipeline_cell.py, line 429, in construct, \n return self.model(*inputs)",
574
- "File /home/mindspore/nn/cell.py, line 757, in _complex_call, \n output = self.construct(*args, **kwargs)",
575
- "File /home/mindspore/nn/cell.py, line 747, in __call__, \n return self._complex_call(*args, **kwargs)",
576
- "File /home/mindformers/experimental/distri_cores/pipeline_parallel/schedules.py, line 121, in run_forward, \n output_tensor = model(*input_data, recv_data=None)",
577
- "File /home/mindformers/experimental/distri_cores/pipeline_parallel/schedules.py, line 735, in forward_backward_pipelining_without_interleaving, \n micro_input_data = run_forward(*micro_input_data,",
578
- "File /home/mindformers/experimental/distri_cores/training.py, line 409, in forward_backward_with_pipelining, \n loss, logits, grads = forward_backward_pipelining_without_interleaving(",
579
- "File /home/mindformers/experimental/distri_cores/training.py, line 533, in construct, \n (loss, _), grads = self.forward_backward_func(*inputs_tuple, loss_scale=current_step_loss_scale, **inputs_dict)",
580
- "File /home/mindspore/nn/cell.py, line 757, in _complex_call, \n output = self.construct(*args, **kwargs)",
581
- "File /home/mindspore/nn/cell.py, line 747, in __call__, \n return self._complex_call(*args, **kwargs)",
582
- "File /home/mindformers/experimental/distri_cores/training.py, line 655, in train, \n loss, is_finite, loss_scale, learning_rate = train_one_step_cell(**data)",
583
- "File /home/model/pretrain_gpt.py, line 303, in main, \n train(",
584
- "File /home/model/pretrain_gpt.py, line 316, in <module>, \n main()"
585
- ]
586
- }
587
- ```
588
504
  # FAQ
589
505
  1. 图比对场景,节点呈现灰色,且没有精度比对数据,怎么处理?
590
506
 
@@ -13,7 +13,7 @@ b. 在生成单API脚本时可以选择由工具构造随机数获得 dump 数
13
13
  ### 前提
14
14
  1. 安装 msprobe。详见[ msprobe 安装](./01.installation.md)章节。
15
15
  2. 已完成对训练过程的dump,获得dump.json文件。
16
- [PyTorch场景的数据采集](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/05.data_dump_PyTorch.md)
16
+ [PyTorch场景的数据采集](https://gitcode.com/Ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/05.data_dump_PyTorch.md)
17
17
 
18
18
  **目前仅支持复现API级的数据,故dump时level可选择L1(API信息)或者mix(module信息+API信息)。如需复现真实数据场景的API脚本,dump时task应选择tensor,如需复现随机数据场景的API脚本,dump时task选择statistics**。
19
19
  3. 发现某个算子疑似存在精度问题,并得知算子名,如Functional.softmax.3、Tensor.add.0、Torch.matmul.5等
@@ -7,6 +7,7 @@
7
7
  | [数据采集<br>(dump)](./05.data_dump_PyTorch.md) | 采集模型训练过程中的API或Module层级的前反向输入输出数据,包括层次关系、统计值信息、真实数据和调用栈等。 | 1、将模型中训练的API或Module的前反向输入输出数据保存下来分析<br> 2、模型出现溢出时,可用于查看哪些API或Module出现了溢出 | 1、API级数据采集仅支持白名单列表上的API<br>2、工具会做一些同步操作,引入工具可能会导致一些同步问题消失<br>3、当前对inplace操作API或Module的支持度有限<br>4、暂不支持参数及参数梯度的采集 |
8
8
  | [离线预检<br>(api_accuracy_checker)](./07.accuracy_checker_PyTorch.md) | 为网络中每个API创建用例,检验其精度,并根据不同比对算法综合判定API在NPU上的精度是否达标,快速找出精度差异API。 | 1、对模型中所有的API做精度初步排查<br>2、精度排查不受模型累计误差影响 | 1、依赖GPU环境<br>2、不支持通信算子<br>3、仅支持部分融合算子 |
9
9
  | [整网比对<br>(compare)](./10.accuracy_compare_PyTorch.md) | 计算模型整网NPU和标杆设备的精度误差指标,标记精度异常API或Module,助力快速定位精度问题根因。 | 1、整网比对定位精度可疑算子 | 1、由于使用整网dump数据,定位的可疑算子受累计误差影响<br>2、当模型规模较大时,比对所需时间较长 |
10
+ | [在线预检<br>(online_api_accuracy_checker)](./08.accuracy_checker_online_PyTorch.md) | 通过TCP通信或共享存储空间的方式,进行在线精度预检,解决离线预检大数据量落盘、传输困难痛点。 | 1、使用离线预检,数据量较大落盘困难或传输耗时长时,可通过在线预检进行精度排查 | 1、依赖GPU环境,NPU和GPU能够通信<br>2、重计算模式下,不支持反向aten算子预检 |
10
11
  | [溢出检查<br>(overflow_checker)](./12.overflow_check_PyTorch.md) | 检测模型计算过程的输入输出,并在溢出时落盘数据,助力用户快速定位溢出位置。 | 1、当模型出现溢出时,用于快速定位最先溢出的API或Module<br>2、相比数据采集,性能更优,磁盘压力更小 | 1、局限性同数据采集 |
11
12
  | [数据解析<br>(parse_tool)](./14.data_parse_PyTorch.md) | 交互式界面处理解析kernel层级dump数据,便于查看分析。 | 1、比对kernel层级dump数据的一致性 | 1、仅限于NPU |
12
13
  | [无标杆比对<br>(free_benchmark)](./15.free_benchmarking_PyTorch.md) | 不依赖标杆数据,通过对算子输入增加微小扰动,计算扰动后输出与原始输出的相对误差,识别有精度风险算子。 | 1、无标杆数据场景下的算子精度排查<br>2、对个别算子进行升精度、“to cpu”等操作,以验证其对模型loss的影响 | 1、由于需要拷贝输入进行二次执行,所以在遇到大张量的输入时容易发生显存OOM的问题, 特别是反向比对过程。建议结合白名单使用<br>2、比对会延长训练时间,整网比对可能会造成严重的耗时膨胀,建议结合白名单使用 |
@@ -1,14 +1,14 @@
1
1
  # PyTorch 场景的精度数据采集基线
2
2
 
3
- ## "statistics"模式(未开启md5)采集时间膨胀参考基线
3
+ ## "statistics"模式采集时间膨胀参考基线
4
4
 
5
- 该基线为PyTorch框架下,使用"statistics"模式采集数据性能膨胀的参考基线。本基线测试了LLAMA2-7B语言大模型在不同采集模式8卡下的时间膨胀。
5
+ 该基线为PyTorch框架下,使用"statistics"模式采集数据性能膨胀的参考基线。本基线测试了单层 DeepSeek 大模型在不同采集模式8卡下的时间膨胀。
6
6
 
7
- | 采集模式 | 无工具 (耗时) | 加工具但未使能 Dump (耗时) | 加工具并使能 Dump (耗时) |
8
- |:--------:|:--------:|:--------------------:|:------------------:|
9
- | L0 | ≈17.4 s | 17.4 s (无膨胀) | ≈78.4 s (膨胀4.5倍) |
10
- | L1 | ≈17.4 s | 20.7 s (膨胀1.2倍) | ≈353 s (膨胀20倍) |
11
- | mix | ≈17.4 s | ≈20.7 s (膨胀1.2倍) | ≈430 s (膨胀24.7 倍) |
7
+ | 采集模式 | 无工具 (耗时) | 加工具但未使能 Dump (耗时) | 加工具并使能 Dump (耗时) | 加工具并使能 Md5 Dump (耗时) |
8
+ |:--------:|:--------:|:-------------------:|:--------------------:|:--------------------:|
9
+ | L0 | ≈95.1 ms | 95.5 ms (无膨胀) | ≈420.0 ms (膨胀4.5倍) | ≈1011.3 ms (膨胀10倍) |
10
+ | L1 | ≈95.1 ms | 115.8 ms (膨胀1.2倍) | ≈2469.0 ms (膨胀26倍) | 8636.0 ms (膨胀90倍) |
11
+ | mix | ≈95.1 ms | ≈117.8 ms (膨胀1.2倍) | ≈3635.4 ms (膨胀38 倍) | ≈10698.3 ms (膨胀112倍) |
12
12
 
13
13
 
14
14
  ## "tensor"模式采集数据量参考基线
@@ -17,7 +17,7 @@ import os
17
17
  from dataclasses import dataclass
18
18
  from typing import Any, Optional
19
19
  from tqdm import tqdm
20
- import numpy as np
20
+
21
21
  from msprobe.core.common.const import Const, CompareConst
22
22
  from msprobe.core.common.file_utils import FileOpen, create_directory, write_csv, load_json, load_yaml
23
23
  from msprobe.core.common.utils import add_time_as_suffix
@@ -45,7 +45,7 @@ API_INFO = 2
45
45
  FOUR_SEGMENT = 4
46
46
  FIVE_SEGMENT = 5
47
47
  DATA_NAME = "data_name"
48
- API_MAX_LENGTH = 30
48
+ API_MAX_LENGTH = 300
49
49
  PROPAGATION_LIST = [Const.FORWARD, Const.BACKWARD]
50
50
  DATAMODE_LIST = ["random_data", "real_data"]
51
51
  ITER_MAX_TIMES = 1000
@@ -25,6 +25,7 @@ from msprobe.core.common.exceptions import MsprobeException
25
25
  from msprobe.core.common.runtime import Runtime
26
26
  from msprobe.core.common.utils import ModuleQueue, ThreadSafe
27
27
  from msprobe.core.data_dump.scope import ModuleRangeScope, MixRangeScope, BaseScope
28
+ from msprobe.core.common.megatron_utils import wrap_megatron_step, get_micro_step, is_megatron
28
29
  from msprobe.mindspore.common.const import Const as MsConst
29
30
  from msprobe.mindspore.common.log import logger
30
31
  from msprobe.mindspore.common.utils import (
@@ -47,6 +48,28 @@ def get_cell_construct(construct):
47
48
  return _construct
48
49
 
49
50
 
51
+ def patch_schedules_step():
52
+ try:
53
+ from mindspeed.mindspore.core.pipeline_parallel import schedules
54
+ schedules.forward_step = wrap_megatron_step(schedules.forward_step)
55
+ schedules.backward_step = wrap_megatron_step(schedules.backward_step, is_forward=False)
56
+ logger.info_on_rank_0("Patch mindspeed.mindspore method success.")
57
+ except ImportError:
58
+ logger.info_on_rank_0("No mindspeed.mindspore find.")
59
+ except Exception as e:
60
+ logger.info_on_rank_0(f"Patch mindspeed.mindspore method failed, detail:{str(e)}")
61
+
62
+ try:
63
+ from megatron.core.pipeline_parallel import schedules
64
+ schedules.forward_step = wrap_megatron_step(schedules.forward_step)
65
+ schedules.backward_step = wrap_megatron_step(schedules.backward_step, is_forward=False)
66
+ logger.info_on_rank_0("Patch megatron method success.")
67
+ except ImportError:
68
+ logger.info_on_rank_0("No megatron find.")
69
+ except Exception as e:
70
+ logger.info_on_rank_0(f"Patch megatron method failed, detail:{str(e)}")
71
+
72
+
50
73
  class CellProcessor:
51
74
  cell_queue = ModuleQueue()
52
75
  cell_count = {}
@@ -84,6 +107,8 @@ class CellProcessor:
84
107
  raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
85
108
  'The model cannot be None, when level is "L0" or "mix"')
86
109
 
110
+ patch_schedules_step()
111
+
87
112
  is_registered = False
88
113
  model_type = Const.MODULE if is_mindtorch() else Const.CELL
89
114
  cells_with_index_in_pynative_mode, cells_with_index_in_graph_mode = get_cells_and_names_with_index(models)
@@ -127,6 +152,7 @@ class CellProcessor:
127
152
  Runtime.run_mode = MsConst.PYNATIVE_GRAPH_MODE
128
153
  GraphModeCellDump(config, cells_and_names_in_graph_mode, strict=False).handle()
129
154
 
155
+
130
156
  def build_cell_hook(self, cell_name, build_data_hook):
131
157
  @ThreadSafe.synchronized
132
158
  def forward_pre_hook(cell, args):
@@ -259,24 +285,26 @@ class CellProcessor:
259
285
  CellProcessor.cell_stack[tid] = []
260
286
 
261
287
  if self.cell_stack[tid]:
262
- CellProcessor.module_node[full_name] = self.cell_stack[tid][-1]
288
+ CellProcessor.module_node[full_name] = self.cell_stack[tid][-1] if not is_megatron() \
289
+ else [self.cell_stack[tid][-1], get_micro_step()]
263
290
  else:
264
291
  parent_name = CellProcessor.cell_queue.find_last(full_name)
265
- CellProcessor.module_node[full_name] = parent_name
292
+ CellProcessor.module_node[full_name] = parent_name if not is_megatron() else [parent_name, get_micro_step()]
266
293
 
267
294
  CellProcessor.cell_queue.add_name(full_name)
268
295
  CellProcessor.cell_stack[tid].append(full_name)
269
- CellProcessor.api_parent_node[tid] = full_name
296
+ CellProcessor.api_parent_node[tid] = full_name if not is_megatron() else [full_name, get_micro_step()]
270
297
  if self.scope:
271
298
  self.scope.begin_module(full_name)
272
299
 
273
300
  def set_construct_info_in_hook(self, full_name):
274
301
  tid = threading.get_ident()
275
302
  CellProcessor.cell_queue.remove_name(full_name)
276
- CellProcessor.api_parent_node[tid] = None
303
+ CellProcessor.api_parent_node[tid] = None if not is_megatron() else [None, get_micro_step()]
277
304
  if self.cell_stack.get(tid):
278
305
  CellProcessor.cell_stack[tid].pop()
279
306
  if self.cell_stack.get(tid):
280
- CellProcessor.api_parent_node[tid] = CellProcessor.cell_stack[tid][-1]
307
+ CellProcessor.api_parent_node[tid] = CellProcessor.cell_stack[tid][-1] if not is_megatron() \
308
+ else [CellProcessor.cell_stack[tid][-1], get_micro_step()]
281
309
  if self.scope:
282
310
  self.scope.end_module(full_name)
@@ -212,14 +212,14 @@ def do_multi_process(func, map_dict):
212
212
  df_chunks = [result_df]
213
213
  process_num = 1
214
214
  logger.info(f"Using {process_num} processes with chunk size {df_chunk_size}")
215
-
215
+
216
216
  # 分割字典
217
217
  map_chunks = split_dict(map_dict, df_chunk_size)
218
-
218
+
219
219
  # 创建结果列表和进程池
220
220
  results = []
221
221
  pool = multiprocessing.Pool(process_num)
222
-
222
+
223
223
  progress_bar = tqdm(total=len(result_df), desc="API/Module Item Compare Process", unit="row", ncols=100)
224
224
 
225
225
  def update_progress(size, progress_lock, extra_param=None):
@@ -228,34 +228,30 @@ def do_multi_process(func, map_dict):
228
228
 
229
229
  def err_call(args):
230
230
  logger.error('multiprocess compare failed! Reason: {}'.format(args))
231
- try:
232
- pool.close()
233
- except OSError as e:
234
- logger.error(f'pool terminate failed: {str(e)}')
231
+
235
232
  results = []
233
+
234
+ # 提交任务到进程池
235
+ for process_idx, (df_chunk, map_chunk) in enumerate(zip(df_chunks, map_chunks)):
236
+ start_idx = df_chunk_size * process_idx
237
+ result = pool.apply_async(
238
+ func,
239
+ args=(df_chunk, start_idx, map_chunk, lock),
240
+ error_callback=err_call,
241
+ callback=partial(update_progress, len(map_chunk), lock)
242
+ )
243
+ results.append(result)
244
+ pool.close()
245
+
236
246
  try:
237
- # 提交任务到进程池
238
- for process_idx, (df_chunk, map_chunk) in enumerate(zip(df_chunks, map_chunks)):
239
- start_idx = df_chunk_size * process_idx
240
- result = pool.apply_async(
241
- func,
242
- args=(df_chunk, start_idx, map_chunk, lock),
243
- error_callback=err_call,
244
- callback=partial(update_progress, len(map_chunk), lock)
245
- )
246
- results.append(result)
247
-
248
- final_results = [r.get() for r in results]
249
- # 等待所有任务完成
250
- pool.close()
251
- pool.join()
252
- return pd.concat(final_results, ignore_index=True)
247
+ final_results = [r.get(timeout=3600) for r in results]
253
248
  except Exception as e:
254
- logger.error(f"\nMain process error: {str(e)}")
249
+ logger.error(f"Task failed with exception: {e}")
255
250
  pool.terminate()
256
251
  return pd.DataFrame({})
257
- finally:
258
- pool.close()
252
+ # 等待所有任务完成
253
+ pool.join()
254
+ return pd.concat(final_results, ignore_index=True)
259
255
 
260
256
 
261
257
  def initialize_result_df(total_size):