mindspore 2.6.0__cp310-cp310-win_amd64.whl → 2.7.0__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mindspore might be problematic. Click here for more details.

Files changed (455) hide show
  1. mindspore/.commit_id +1 -1
  2. mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
  3. mindspore/Newtonsoft.Json.dll +0 -0
  4. mindspore/__init__.py +2 -2
  5. mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
  6. mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
  7. mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
  8. mindspore/_checkparam.py +42 -11
  9. mindspore/_extends/builtin_operations.py +3 -3
  10. mindspore/{_deprecated → _extends/optimize}/__init__.py +9 -3
  11. mindspore/_extends/optimize/cell_utils.py +96 -0
  12. mindspore/_extends/parallel_compile/akg_compiler/custom.py +1109 -0
  13. mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
  14. mindspore/_extends/parse/__init__.py +3 -3
  15. mindspore/_extends/parse/compile_config.py +44 -22
  16. mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +1 -2
  17. mindspore/_extends/parse/parser.py +64 -83
  18. mindspore/_extends/parse/resources.py +39 -0
  19. mindspore/_extends/parse/standard_method.py +47 -14
  20. mindspore/_extends/parse/trope.py +8 -1
  21. mindspore/_extends/pijit/__init__.py +1 -2
  22. mindspore/_extends/pijit/pijit_func_white_list.py +2 -5
  23. mindspore/amp.py +4 -22
  24. mindspore/atlprov.dll +0 -0
  25. mindspore/avcodec-59.dll +0 -0
  26. mindspore/avdevice-59.dll +0 -0
  27. mindspore/avfilter-8.dll +0 -0
  28. mindspore/avformat-59.dll +0 -0
  29. mindspore/avutil-57.dll +0 -0
  30. mindspore/boost/adasum.py +1 -1
  31. mindspore/boost/boost_cell_wrapper.py +4 -4
  32. mindspore/c1.dll +0 -0
  33. mindspore/c1xx.dll +0 -0
  34. mindspore/c2.dll +0 -0
  35. mindspore/common/__init__.py +43 -12
  36. mindspore/common/_grad_function.py +2 -1
  37. mindspore/common/_pijit_context.py +28 -7
  38. mindspore/common/_stub_tensor.py +1 -209
  39. mindspore/common/_tensor_cpp_method.py +1 -1
  40. mindspore/common/_tensor_docs.py +177 -52
  41. mindspore/common/_utils.py +9 -1
  42. mindspore/common/api.py +338 -208
  43. mindspore/common/dtype.py +108 -57
  44. mindspore/common/dump.py +11 -16
  45. mindspore/common/dynamic_shape/__init__.py +0 -0
  46. mindspore/common/{auto_dynamic_shape.py → dynamic_shape/auto_dynamic_shape.py} +17 -23
  47. mindspore/common/dynamic_shape/enable_dynamic.py +197 -0
  48. mindspore/common/file_system.py +59 -9
  49. mindspore/common/generator.py +2 -3
  50. mindspore/common/hook_handle.py +33 -5
  51. mindspore/common/jit_config.py +1 -1
  52. mindspore/common/jit_trace.py +84 -105
  53. mindspore/common/np_dtype.py +3 -3
  54. mindspore/common/parameter.py +27 -29
  55. mindspore/common/recompute.py +5 -7
  56. mindspore/common/sparse_tensor.py +0 -3
  57. mindspore/common/symbol.py +0 -1
  58. mindspore/common/tensor.py +84 -133
  59. mindspore/communication/_comm_helper.py +46 -4
  60. mindspore/communication/management.py +79 -7
  61. mindspore/context.py +47 -38
  62. mindspore/dataset/__init__.py +1 -1
  63. mindspore/dataset/audio/transforms.py +1 -1
  64. mindspore/dataset/core/config.py +38 -4
  65. mindspore/dataset/engine/datasets.py +350 -322
  66. mindspore/dataset/engine/datasets_user_defined.py +69 -23
  67. mindspore/dataset/engine/iterators.py +2 -2
  68. mindspore/dataset/engine/obs/config_loader.py +2 -2
  69. mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +8 -0
  70. mindspore/dataset/transforms/c_transforms.py +2 -2
  71. mindspore/dataset/transforms/py_transforms.py +7 -3
  72. mindspore/dataset/transforms/transforms.py +10 -6
  73. mindspore/dataset/vision/__init__.py +1 -1
  74. mindspore/dataset/vision/py_transforms.py +8 -8
  75. mindspore/dataset/vision/transforms.py +17 -5
  76. mindspore/dataset/vision/utils.py +632 -21
  77. mindspore/dataset/vision/validators.py +1 -0
  78. mindspore/device_context/ascend/device.py +1 -1
  79. mindspore/device_context/ascend/op_tuning.py +35 -1
  80. mindspore/device_context/gpu/__init__.py +2 -2
  81. mindspore/device_context/gpu/device.py +1 -1
  82. mindspore/device_context/gpu/op_precision.py +4 -2
  83. mindspore/device_context/gpu/op_tuning.py +6 -3
  84. mindspore/device_manager.py +16 -9
  85. mindspore/dnnl.dll +0 -0
  86. mindspore/dpcmi.dll +0 -0
  87. mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +5 -4
  88. mindspore/experimental/llm_boost/atb/boost_base.py +2 -3
  89. mindspore/experimental/optim/adadelta.py +13 -20
  90. mindspore/experimental/optim/adagrad.py +15 -22
  91. mindspore/experimental/optim/adam.py +17 -24
  92. mindspore/experimental/optim/adamax.py +14 -22
  93. mindspore/experimental/optim/adamw.py +28 -34
  94. mindspore/experimental/optim/asgd.py +15 -25
  95. mindspore/experimental/optim/lr_scheduler.py +27 -45
  96. mindspore/experimental/optim/nadam.py +14 -24
  97. mindspore/experimental/optim/optimizer.py +13 -23
  98. mindspore/experimental/optim/radam.py +18 -24
  99. mindspore/experimental/optim/rmsprop.py +14 -25
  100. mindspore/experimental/optim/rprop.py +15 -26
  101. mindspore/experimental/optim/sgd.py +9 -19
  102. mindspore/hal/__init__.py +4 -4
  103. mindspore/hal/contiguous_tensors_handle.py +2 -2
  104. mindspore/hal/memory.py +1 -0
  105. mindspore/include/api/cell.h +65 -5
  106. mindspore/include/api/cfg.h +24 -7
  107. mindspore/include/api/context.h +1 -0
  108. mindspore/include/api/delegate.h +10 -2
  109. mindspore/include/api/dual_abi_helper.h +100 -19
  110. mindspore/include/api/graph.h +14 -1
  111. mindspore/include/api/kernel.h +16 -3
  112. mindspore/include/api/kernel_api.h +9 -1
  113. mindspore/include/api/metrics/accuracy.h +9 -0
  114. mindspore/include/api/model.h +8 -1
  115. mindspore/include/api/model_group.h +4 -0
  116. mindspore/include/api/model_parallel_runner.h +2 -0
  117. mindspore/include/api/status.h +48 -10
  118. mindspore/include/api/types.h +8 -3
  119. mindspore/include/c_api/model_c.h +0 -58
  120. mindspore/include/c_api/tensor_c.h +0 -26
  121. mindspore/include/dataset/constants.h +9 -0
  122. mindspore/include/dataset/vision_ascend.h +1 -1
  123. mindspore/jpeg62.dll +0 -0
  124. mindspore/mindrecord/tools/cifar10.py +61 -11
  125. mindspore/mindrecord/tools/cifar10_to_mr.py +5 -0
  126. mindspore/mindspore_backend_common.dll +0 -0
  127. mindspore/mindspore_backend_manager.dll +0 -0
  128. mindspore/mindspore_common.dll +0 -0
  129. mindspore/mindspore_core.dll +0 -0
  130. mindspore/mindspore_cpu_res_manager.dll +0 -0
  131. mindspore/mindspore_dump.dll +0 -0
  132. mindspore/mindspore_frontend.dll +0 -0
  133. mindspore/mindspore_glog.dll +0 -0
  134. mindspore/mindspore_memory_pool.dll +0 -0
  135. mindspore/mindspore_ms_backend.dll +0 -0
  136. mindspore/mindspore_ops.dll +0 -0
  137. mindspore/mindspore_ops_host.dll +0 -0
  138. mindspore/mindspore_ops_kernel_common.dll +0 -0
  139. mindspore/mindspore_profiler.dll +0 -0
  140. mindspore/mindspore_pyboost.dll +0 -0
  141. mindspore/mindspore_pynative.dll +0 -0
  142. mindspore/mindspore_res_manager.dll +0 -0
  143. mindspore/mindspore_runtime_pipeline.dll +0 -0
  144. mindspore/mint/__init__.py +4 -44
  145. mindspore/mint/distributed/__init__.py +5 -0
  146. mindspore/mint/distributed/distributed.py +425 -19
  147. mindspore/mint/nn/__init__.py +1 -1
  148. mindspore/mint/nn/functional.py +53 -6
  149. mindspore/mint/nn/layer/_functions.py +163 -294
  150. mindspore/mint/nn/layer/activation.py +8 -6
  151. mindspore/mint/nn/layer/conv.py +125 -101
  152. mindspore/mint/nn/layer/normalization.py +11 -25
  153. mindspore/mint/optim/adam.py +19 -18
  154. mindspore/mint/optim/adamw.py +14 -8
  155. mindspore/mint/optim/sgd.py +5 -5
  156. mindspore/msobj140.dll +0 -0
  157. mindspore/mspdb140.dll +0 -0
  158. mindspore/mspdbcore.dll +0 -0
  159. mindspore/mspdbst.dll +0 -0
  160. mindspore/mspft140.dll +0 -0
  161. mindspore/msvcdis140.dll +0 -0
  162. mindspore/msvcp140_1.dll +0 -0
  163. mindspore/msvcp140_2.dll +0 -0
  164. mindspore/msvcp140_atomic_wait.dll +0 -0
  165. mindspore/msvcp140_codecvt_ids.dll +0 -0
  166. mindspore/nn/cell.py +488 -620
  167. mindspore/nn/grad/cell_grad.py +11 -12
  168. mindspore/nn/layer/activation.py +36 -36
  169. mindspore/nn/layer/basic.py +74 -77
  170. mindspore/nn/layer/channel_shuffle.py +4 -4
  171. mindspore/nn/layer/combined.py +4 -2
  172. mindspore/nn/layer/conv.py +86 -85
  173. mindspore/nn/layer/dense.py +9 -7
  174. mindspore/nn/layer/embedding.py +50 -52
  175. mindspore/nn/layer/image.py +38 -40
  176. mindspore/nn/layer/math.py +111 -112
  177. mindspore/nn/layer/normalization.py +56 -44
  178. mindspore/nn/layer/pooling.py +58 -63
  179. mindspore/nn/layer/rnn_cells.py +33 -33
  180. mindspore/nn/layer/rnns.py +56 -56
  181. mindspore/nn/layer/thor_layer.py +74 -73
  182. mindspore/nn/layer/transformer.py +11 -1
  183. mindspore/nn/learning_rate_schedule.py +20 -20
  184. mindspore/nn/loss/loss.py +79 -81
  185. mindspore/nn/optim/adam.py +2 -4
  186. mindspore/nn/optim/adasum.py +2 -2
  187. mindspore/nn/optim/lamb.py +1 -3
  188. mindspore/nn/optim/optimizer.py +1 -1
  189. mindspore/nn/optim/tft_wrapper.py +2 -3
  190. mindspore/nn/optim/thor.py +2 -2
  191. mindspore/nn/probability/distribution/_utils/utils.py +2 -2
  192. mindspore/nn/probability/distribution/exponential.py +2 -1
  193. mindspore/nn/probability/distribution/poisson.py +2 -1
  194. mindspore/nn/sparse/sparse.py +3 -3
  195. mindspore/nn/wrap/cell_wrapper.py +73 -42
  196. mindspore/nn/wrap/grad_reducer.py +37 -52
  197. mindspore/nn/wrap/loss_scale.py +72 -74
  198. mindspore/numpy/array_creations.py +7 -7
  199. mindspore/numpy/fft.py +1 -1
  200. mindspore/numpy/math_ops.py +1 -1
  201. mindspore/numpy/utils_const.py +1 -1
  202. mindspore/opencv_core452.dll +0 -0
  203. mindspore/opencv_imgcodecs452.dll +0 -0
  204. mindspore/opencv_imgproc452.dll +0 -0
  205. mindspore/ops/_grad_experimental/grad_comm_ops.py +51 -13
  206. mindspore/ops/_grad_experimental/grad_debug_ops.py +14 -0
  207. mindspore/ops/_grad_experimental/grad_inner_ops.py +0 -9
  208. mindspore/ops/_op_impl/cpu/__init__.py +1 -0
  209. mindspore/{experimental/es/__init__.py → ops/_op_impl/cpu/joinedstr_op.py} +12 -6
  210. mindspore/ops/_vmap/vmap_array_ops.py +6 -13
  211. mindspore/ops/_vmap/vmap_nn_ops.py +8 -16
  212. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +29 -10
  213. mindspore/ops/auto_generate/gen_extend_func.py +5 -55
  214. mindspore/ops/auto_generate/gen_ops_def.py +753 -273
  215. mindspore/ops/auto_generate/gen_ops_prim.py +1687 -958
  216. mindspore/ops/auto_generate/pyboost_inner_prim.py +31 -1
  217. mindspore/ops/composite/__init__.py +10 -0
  218. mindspore/ops/composite/base.py +9 -5
  219. mindspore/ops/composite/multitype_ops/__init__.py +12 -1
  220. mindspore/ops/composite/multitype_ops/_compile_utils.py +132 -108
  221. mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
  222. mindspore/ops/composite/multitype_ops/add_impl.py +70 -2
  223. mindspore/ops/composite/multitype_ops/div_impl.py +49 -0
  224. mindspore/ops/composite/multitype_ops/floordiv_impl.py +29 -0
  225. mindspore/ops/composite/multitype_ops/getitem_impl.py +11 -0
  226. mindspore/ops/composite/multitype_ops/mod_impl.py +5 -3
  227. mindspore/ops/composite/multitype_ops/mul_impl.py +49 -0
  228. mindspore/ops/composite/multitype_ops/setitem_impl.py +57 -0
  229. mindspore/ops/composite/multitype_ops/sub_impl.py +34 -0
  230. mindspore/ops/composite/multitype_ops/zeros_like_impl.py +14 -0
  231. mindspore/ops/function/__init__.py +4 -1
  232. mindspore/ops/function/_add_attr_func.py +11 -6
  233. mindspore/ops/function/array_func.py +17 -100
  234. mindspore/ops/function/debug_func.py +8 -5
  235. mindspore/ops/function/grad/grad_func.py +5 -13
  236. mindspore/ops/function/math_func.py +65 -399
  237. mindspore/ops/function/nn_func.py +44 -61
  238. mindspore/ops/function/other_func.py +4 -1
  239. mindspore/ops/function/random_func.py +31 -4
  240. mindspore/ops/functional.py +2 -3
  241. mindspore/ops/functional_overload.py +486 -18
  242. mindspore/ops/op_info_register.py +21 -0
  243. mindspore/ops/operations/__init__.py +5 -2
  244. mindspore/ops/operations/_custom_ops_utils.py +675 -8
  245. mindspore/ops/operations/_inner_ops.py +14 -18
  246. mindspore/ops/operations/_sequence_ops.py +1 -1
  247. mindspore/ops/operations/array_ops.py +4 -50
  248. mindspore/ops/operations/comm_ops.py +186 -41
  249. mindspore/ops/operations/custom_ops.py +244 -175
  250. mindspore/ops/operations/debug_ops.py +55 -4
  251. mindspore/ops/operations/image_ops.py +13 -13
  252. mindspore/ops/operations/manually_defined/ops_def.py +27 -28
  253. mindspore/ops/operations/math_ops.py +8 -9
  254. mindspore/ops/operations/nn_ops.py +6 -7
  255. mindspore/ops/primitive.py +9 -20
  256. mindspore/ops/tensor_method.py +52 -11
  257. mindspore/ops_generate/api/cpp_create_prim_instance_helper_generator.py +1 -1
  258. mindspore/ops_generate/api/functional_map_cpp_generator.py +10 -9
  259. mindspore/ops_generate/api/functions_cc_generator.py +58 -10
  260. mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +1 -1
  261. mindspore/ops_generate/common/base_generator.py +14 -0
  262. mindspore/ops_generate/common/gen_constants.py +7 -2
  263. mindspore/ops_generate/common/gen_utils.py +0 -19
  264. mindspore/ops_generate/common/op_proto.py +11 -4
  265. mindspore/ops_generate/common/template.py +88 -11
  266. mindspore/ops_generate/gen_ops.py +1 -1
  267. mindspore/ops_generate/op_def/lite_ops_cpp_generator.py +4 -4
  268. mindspore/ops_generate/op_def/ops_name_h_generator.py +0 -3
  269. mindspore/ops_generate/op_def/ops_primitive_h_generator.py +0 -4
  270. mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -2
  271. mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +49 -8
  272. mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +2 -2
  273. mindspore/ops_generate/pyboost/gen_pyboost_func.py +31 -16
  274. mindspore/ops_generate/pyboost/op_template_parser.py +98 -72
  275. mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +70 -273
  276. mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +14 -6
  277. mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +316 -0
  278. mindspore/ops_generate/pyboost/pyboost_functions_py_generator.py +1 -1
  279. mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +5 -3
  280. mindspore/ops_generate/pyboost/pyboost_inner_prim_generator.py +1 -1
  281. mindspore/ops_generate/pyboost/pyboost_internal_functions_cpp_generator.py +76 -0
  282. mindspore/ops_generate/pyboost/pyboost_internal_functions_h_generator.py +76 -0
  283. mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +125 -0
  284. mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +4 -3
  285. mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +348 -61
  286. mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +1 -1
  287. mindspore/ops_generate/pyboost/pyboost_utils.py +118 -9
  288. mindspore/ops_generate/tensor_py_cc_generator.py +1 -24
  289. mindspore/parallel/_auto_parallel_context.py +9 -17
  290. mindspore/parallel/_cell_wrapper.py +106 -40
  291. mindspore/parallel/_parallel_serialization.py +4 -3
  292. mindspore/parallel/_ps_context.py +4 -6
  293. mindspore/parallel/_tensor.py +167 -12
  294. mindspore/parallel/_transformer/moe.py +1 -1
  295. mindspore/parallel/_transformer/transformer.py +17 -12
  296. mindspore/parallel/_utils.py +5 -11
  297. mindspore/parallel/auto_parallel.py +33 -12
  298. mindspore/parallel/checkpoint_convert.py +3 -3
  299. mindspore/parallel/checkpoint_transform.py +5 -1
  300. mindspore/parallel/cluster/process_entity/_api.py +88 -49
  301. mindspore/parallel/cluster/process_entity/_utils.py +95 -7
  302. mindspore/parallel/cluster/run.py +48 -7
  303. mindspore/parallel/function/__init__.py +8 -1
  304. mindspore/parallel/function/reshard_func.py +7 -6
  305. mindspore/parallel/nn/__init__.py +15 -2
  306. mindspore/parallel/nn/parallel_cell_wrapper.py +50 -14
  307. mindspore/parallel/nn/parallel_grad_reducer.py +7 -14
  308. mindspore/parallel/shard.py +9 -23
  309. mindspore/parallel/transform_safetensors.py +468 -174
  310. mindspore/pgodb140.dll +0 -0
  311. mindspore/pgort140.dll +0 -0
  312. mindspore/profiler/__init__.py +2 -1
  313. mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +7 -7
  314. mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +3 -0
  315. mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +3 -0
  316. mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +3 -3
  317. mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +3 -3
  318. mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +4 -4
  319. mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +3 -3
  320. mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +4 -1
  321. mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +2 -1
  322. mindspore/profiler/analysis/task_manager.py +1 -1
  323. mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +5 -1
  324. mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +2 -1
  325. mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +10 -9
  326. mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +43 -23
  327. mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +3 -2
  328. mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +9 -5
  329. mindspore/profiler/analysis/viewer/ms_operator_details_viewer.py +132 -0
  330. mindspore/profiler/common/constant.py +16 -0
  331. mindspore/profiler/common/msprof_cmd_tool.py +2 -2
  332. mindspore/profiler/common/path_manager.py +9 -0
  333. mindspore/profiler/common/profiler_context.py +50 -29
  334. mindspore/profiler/common/profiler_info.py +0 -16
  335. mindspore/profiler/common/profiler_meta_data.py +1 -0
  336. mindspore/profiler/common/profiler_op_analyse.py +239 -0
  337. mindspore/profiler/common/profiler_output_path.py +23 -8
  338. mindspore/profiler/common/profiler_parameters.py +128 -35
  339. mindspore/profiler/dynamic_profile/__init__.py +0 -0
  340. mindspore/profiler/dynamic_profile/dynamic_monitor_proxy.py +39 -0
  341. mindspore/profiler/dynamic_profile/dynamic_profiler_config_context.py +666 -0
  342. mindspore/profiler/dynamic_profile/dynamic_profiler_utils.py +62 -0
  343. mindspore/profiler/dynamic_profiler.py +374 -338
  344. mindspore/profiler/envprofiler.py +42 -12
  345. mindspore/profiler/experimental_config.py +112 -7
  346. mindspore/profiler/mstx.py +33 -12
  347. mindspore/profiler/platform/__init__.py +2 -3
  348. mindspore/profiler/platform/cpu_profiler.py +10 -4
  349. mindspore/profiler/platform/npu_profiler.py +30 -20
  350. mindspore/profiler/profiler.py +218 -154
  351. mindspore/profiler/profiler_action_controller.py +65 -77
  352. mindspore/profiler/profiler_interface.py +2 -2
  353. mindspore/profiler/schedule.py +10 -4
  354. mindspore/rewrite/common/config.py +1 -0
  355. mindspore/rewrite/common/namer.py +1 -0
  356. mindspore/rewrite/common/namespace.py +1 -0
  357. mindspore/rewrite/node/node.py +31 -11
  358. mindspore/rewrite/parsers/assign_parser.py +1 -1
  359. mindspore/rewrite/symbol_tree/symbol_tree.py +2 -2
  360. mindspore/run_check/_check_version.py +7 -10
  361. mindspore/runtime/__init__.py +8 -6
  362. mindspore/runtime/event.py +10 -4
  363. mindspore/runtime/executor.py +87 -45
  364. mindspore/runtime/memory.py +22 -30
  365. mindspore/runtime/thread_bind_core.py +299 -165
  366. mindspore/safeguard/rewrite_obfuscation.py +12 -13
  367. mindspore/swresample-4.dll +0 -0
  368. mindspore/swscale-6.dll +0 -0
  369. mindspore/tbbmalloc.dll +0 -0
  370. mindspore/tinyxml2.dll +0 -0
  371. mindspore/train/_utils.py +9 -5
  372. mindspore/train/amp.py +43 -23
  373. mindspore/train/callback/__init__.py +5 -5
  374. mindspore/train/callback/_callback.py +2 -1
  375. mindspore/train/callback/_checkpoint.py +4 -14
  376. mindspore/train/callback/_flops_collector.py +11 -7
  377. mindspore/train/callback/_landscape.py +0 -1
  378. mindspore/train/callback/_train_fault_tolerance.py +72 -18
  379. mindspore/train/data_sink.py +15 -6
  380. mindspore/train/dataset_helper.py +14 -5
  381. mindspore/train/model.py +49 -47
  382. mindspore/train/serialization.py +168 -126
  383. mindspore/train/summary/summary_record.py +13 -2
  384. mindspore/train/train_thor/model_thor.py +2 -2
  385. mindspore/turbojpeg.dll +0 -0
  386. mindspore/utils/__init__.py +3 -2
  387. mindspore/utils/dryrun.py +0 -6
  388. mindspore/utils/runtime_execution_order_check.py +162 -78
  389. mindspore/utils/sdc_detect.py +68 -0
  390. mindspore/utils/utils.py +14 -17
  391. mindspore/vcmeta.dll +0 -0
  392. mindspore/vcruntime140.dll +0 -0
  393. mindspore/vcruntime140_1.dll +0 -0
  394. mindspore/version.py +1 -1
  395. {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/METADATA +5 -4
  396. {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/RECORD +400 -439
  397. mindspore/_deprecated/jit.py +0 -198
  398. mindspore/_extends/remote/kernel_build_server_ascend.py +0 -75
  399. mindspore/communication/_hccl_management.py +0 -297
  400. mindspore/experimental/es/embedding_service.py +0 -891
  401. mindspore/experimental/es/embedding_service_layer.py +0 -581
  402. mindspore/profiler/common/validator/__init__.py +0 -14
  403. mindspore/profiler/common/validator/validate_path.py +0 -84
  404. mindspore/profiler/parser/__init__.py +0 -14
  405. mindspore/profiler/parser/aicpu_data_parser.py +0 -272
  406. mindspore/profiler/parser/ascend_analysis/__init__.py +0 -14
  407. mindspore/profiler/parser/ascend_analysis/constant.py +0 -71
  408. mindspore/profiler/parser/ascend_analysis/file_manager.py +0 -180
  409. mindspore/profiler/parser/ascend_analysis/function_event.py +0 -185
  410. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +0 -136
  411. mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +0 -131
  412. mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +0 -104
  413. mindspore/profiler/parser/ascend_analysis/path_manager.py +0 -313
  414. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +0 -123
  415. mindspore/profiler/parser/ascend_analysis/tlv_decoder.py +0 -86
  416. mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +0 -75
  417. mindspore/profiler/parser/ascend_cluster_generator.py +0 -116
  418. mindspore/profiler/parser/ascend_communicate_generator.py +0 -314
  419. mindspore/profiler/parser/ascend_flops_generator.py +0 -116
  420. mindspore/profiler/parser/ascend_fpbp_generator.py +0 -82
  421. mindspore/profiler/parser/ascend_hccl_generator.py +0 -271
  422. mindspore/profiler/parser/ascend_integrate_generator.py +0 -42
  423. mindspore/profiler/parser/ascend_memory_generator.py +0 -185
  424. mindspore/profiler/parser/ascend_msprof_exporter.py +0 -282
  425. mindspore/profiler/parser/ascend_msprof_generator.py +0 -187
  426. mindspore/profiler/parser/ascend_op_generator.py +0 -334
  427. mindspore/profiler/parser/ascend_steptrace_generator.py +0 -94
  428. mindspore/profiler/parser/ascend_timeline_generator.py +0 -545
  429. mindspore/profiler/parser/base_timeline_generator.py +0 -483
  430. mindspore/profiler/parser/container.py +0 -229
  431. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +0 -697
  432. mindspore/profiler/parser/flops_parser.py +0 -531
  433. mindspore/profiler/parser/framework_enum.py +0 -111
  434. mindspore/profiler/parser/framework_parser.py +0 -464
  435. mindspore/profiler/parser/framework_struct.py +0 -61
  436. mindspore/profiler/parser/gpu_analysis/__init__.py +0 -14
  437. mindspore/profiler/parser/gpu_analysis/function_event.py +0 -44
  438. mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +0 -89
  439. mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +0 -72
  440. mindspore/profiler/parser/hccl_parser.py +0 -573
  441. mindspore/profiler/parser/hwts_log_parser.py +0 -122
  442. mindspore/profiler/parser/integrator.py +0 -526
  443. mindspore/profiler/parser/memory_usage_parser.py +0 -277
  444. mindspore/profiler/parser/minddata_analyzer.py +0 -800
  445. mindspore/profiler/parser/minddata_parser.py +0 -186
  446. mindspore/profiler/parser/minddata_pipeline_parser.py +0 -299
  447. mindspore/profiler/parser/op_intermediate_parser.py +0 -149
  448. mindspore/profiler/parser/optime_parser.py +0 -250
  449. mindspore/profiler/parser/profiler_info.py +0 -213
  450. mindspore/profiler/parser/step_trace_parser.py +0 -666
  451. mindspore/utils/hooks.py +0 -81
  452. /mindspore/common/{_auto_dynamic.py → dynamic_shape/_auto_dynamic.py} +0 -0
  453. {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/WHEEL +0 -0
  454. {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/entry_points.txt +0 -0
  455. {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/top_level.txt +0 -0
@@ -15,36 +15,25 @@
15
15
  """Dynamic Profile Monitor"""
16
16
  import os
17
17
  import sys
18
+ import json
18
19
  import time
19
20
  import stat
20
- import json
21
21
  import atexit
22
- import struct
23
22
  import random
24
23
  import multiprocessing
25
24
 
26
25
  from mindspore import log as logger
27
26
  from mindspore.train import Callback
28
- from mindspore.profiler import Profiler, tensorboard_trace_handler, schedule
29
- from mindspore.communication import get_rank
30
- from mindspore.profiler.parser.ascend_analysis.file_manager import FileManager
31
- from mindspore.profiler.parser.ascend_analysis.path_manager import PathManager
32
- from mindspore.profiler.profiler_interface import ProfilerInterface
33
- from mindspore.profiler.common.constant import (
34
- ProfilerActivity,
35
- ProfilerLevel,
36
- AicoreMetrics,
37
- ExportType,
38
- )
27
+ from mindspore.profiler import tensorboard_trace_handler, schedule
28
+ from mindspore.profiler.profiler import Profile
29
+ from mindspore.profiler.experimental_config import _ExperimentalConfig
30
+ from mindspore.profiler.common.file_manager import FileManager
31
+ from mindspore.profiler.common.path_manager import PathManager
32
+ from mindspore.profiler.dynamic_profile.dynamic_profiler_config_context import DynamicProfilerConfigContext
33
+ from mindspore.profiler.dynamic_profile.dynamic_monitor_proxy import MsDynamicMonitorProxySingleton
34
+ from mindspore.profiler.dynamic_profile.dynamic_profiler_utils import DynamicProfilerUtils
39
35
  from mindspore.profiler.common.util import no_exception_func
40
-
41
-
42
- def get_real_rank():
43
- """get rank id"""
44
- try:
45
- return get_rank()
46
- except RuntimeError:
47
- return int(os.getenv("RANK_ID", "0"))
36
+ from mindspore.profiler.profiler_interface import ProfilerInterface
48
37
 
49
38
 
50
39
  def print_msg(msg):
@@ -52,210 +41,21 @@ def print_msg(msg):
52
41
  print("[Dynamic Profiler] " + msg, flush=True)
53
42
 
54
43
 
55
- class DynamicProfilerArgs:
56
- """
57
- Data class for dynamic profile config.
58
- """
59
- FMT = "i" * 7 + "?" * 6
60
- SIZE = struct.calcsize(FMT)
61
-
62
- def __init__(self,
63
- start_step: int = -1,
64
- stop_step: int = -1,
65
- aic_metrics: int = -1,
66
- profiler_level: int = 0,
67
- analyse_mode: int = -1,
68
- activities: int = 0,
69
- export_type: int = 0,
70
- profile_memory: bool = False,
71
- mstx: bool = False,
72
- parallel_strategy: bool = False,
73
- with_stack: bool = False,
74
- data_simplification: bool = True,
75
- is_valid: bool = False,
76
- **kwargs):
77
- self._start_step = start_step
78
- self._stop_step = stop_step
79
- self._aic_metrics = aic_metrics
80
- self._profiler_level = profiler_level
81
- self._analyse_mode = analyse_mode
82
- self._activities = activities
83
- self._export_type = export_type
84
- self._profile_memory = profile_memory
85
- self._mstx = mstx
86
- self._parallel_strategy = parallel_strategy
87
- self._with_stack = with_stack
88
- self._data_simplification = data_simplification
89
- self._is_valid = is_valid
90
- self._check_params_type()
91
-
92
- def _check_params_type(self):
93
- """Check and enforce parameter types with lower complexity."""
94
- # Define a parameter check rule. {Parameter name: (expected type, default value)}
95
- param_rules = {
96
- '_start_step': (int, -1),
97
- '_stop_step': (int, -1),
98
- '_aic_metrics': (int, -1),
99
- '_profiler_level': (int, 0),
100
- '_analyse_mode': (int, -1),
101
- '_activities': (int, 0),
102
- '_export_type': (int, 0),
103
- '_profile_memory': (bool, False),
104
- '_mstx': (bool, False),
105
- '_parallel_strategy': (bool, False),
106
- '_with_stack': (bool, False),
107
- '_data_simplification': (bool, True),
108
- '_is_valid': (bool, False)
109
- }
110
-
111
- def _is_valid_type(value, expected_type):
112
- """Helper method for type checking."""
113
- if expected_type is int and isinstance(value, bool):
114
- return False
115
- return isinstance(value, expected_type)
116
-
117
- for param, (expected_type, default) in param_rules.items():
118
- value = getattr(self, param)
119
- if not _is_valid_type(value, expected_type):
120
- logger.warning(
121
- f"{param[1:]} should be {expected_type.__name__} type, "
122
- f"will be reset to {default}."
123
- )
124
- setattr(self, param, default)
125
-
126
- @property
127
- def start_step(self):
128
- """ get start step value."""
129
- return self._start_step
130
-
131
- @property
132
- def stop_step(self):
133
- """ get stop step value."""
134
- return self._stop_step
135
-
136
- @property
137
- def is_valid(self):
138
- """ get json valid value."""
139
- return self._is_valid
140
-
141
- @is_valid.setter
142
- def is_valid(self, value):
143
- """ set json valid value."""
144
- self._is_valid = value
145
-
146
- @property
147
- def analyse_mode(self):
148
- """ get analyse mode value."""
149
- return self._convert_analyse_mode(self._analyse_mode)
150
-
151
- @property
152
- def vars(self):
153
- """ get all values in DynamicProfilerArgs."""
154
- not_supported_args = ['_is_valid']
155
- res = {}
156
- for key, value in self.__dict__.items():
157
- if key not in not_supported_args:
158
- res[key.replace('_', '', 1)] = value
159
- return res
160
-
161
- @property
162
- def args(self):
163
- """ get all args in DynamicProfilerArgs."""
164
- self._profiler_level = self._convert_profiler_level(self._profiler_level)
165
- self._activities = self._convert_activities(self._activities)
166
- self._aic_metrics = self._convert_aic_metrics(self._aic_metrics)
167
- self._export_type = self._convert_export_type(self._export_type)
168
- not_supported_args = ['_start_step', '_stop_step', '_analyse_mode', '_is_valid']
169
- res = {}
170
- for key, value in self.__dict__.items():
171
- if key not in not_supported_args:
172
- res[key.replace('_', '', 1)] = value
173
- return res
174
-
175
- @classmethod
176
- def from_bytes(cls, byte_data):
177
- """ unpack bytes to DynamicProfilerArgs."""
178
- unpacked = struct.unpack(cls.FMT, byte_data)
179
- return cls(*unpacked)
180
-
181
- def to_bytes(self):
182
- """ pack DynamicProfilerArgs to bytes."""
183
- instance_vars = tuple(self.__dict__.values())
184
- if len(instance_vars) != len(self.FMT):
185
- raise ValueError("Number of variables does not match format string.")
186
- return struct.pack(DynamicProfilerArgs.FMT, *instance_vars)
187
-
188
- def _convert_analyse_mode(self, analyse_mode: int) -> str:
189
- """ convert analyse_mode to real args in Profiler."""
190
- if analyse_mode == 0:
191
- return 'sync'
192
- if analyse_mode == 1:
193
- return 'async'
194
- return None
195
-
196
- def _convert_profiler_level(self, profiler_level: int) -> ProfilerLevel:
197
- """ convert profiler_level to real args in Profiler."""
198
- if profiler_level == -1:
199
- return ProfilerLevel.LevelNone
200
- if profiler_level == 0:
201
- return ProfilerLevel.Level0
202
- if profiler_level == 1:
203
- return ProfilerLevel.Level1
204
- if profiler_level == 2:
205
- return ProfilerLevel.Level2
206
- return ProfilerLevel.Level0
207
-
208
- def _convert_activities(self, activities: int) -> ProfilerLevel:
209
- """ convert activities to real args in Profiler."""
210
- if activities == 0:
211
- return [ProfilerActivity.CPU, ProfilerActivity.NPU]
212
- if activities == 1:
213
- return [ProfilerActivity.CPU]
214
- if activities == 2:
215
- return [ProfilerActivity.NPU]
216
- return [ProfilerActivity.CPU, ProfilerActivity.NPU]
217
-
218
- def _convert_aic_metrics(self, aic_metrics: int) -> AicoreMetrics:
219
- """ convert aic_metrics to real args in Profiler."""
220
- if aic_metrics == -1:
221
- return AicoreMetrics.AiCoreNone
222
- if aic_metrics == 0:
223
- return AicoreMetrics.PipeUtilization
224
- if aic_metrics == 1:
225
- return AicoreMetrics.ArithmeticUtilization
226
- if aic_metrics == 2:
227
- return AicoreMetrics.Memory
228
- if aic_metrics == 3:
229
- return AicoreMetrics.MemoryL0
230
- if aic_metrics == 4:
231
- return AicoreMetrics.MemoryUB
232
- if aic_metrics == 5:
233
- return AicoreMetrics.ResourceConflictRatio
234
- if aic_metrics == 6:
235
- return AicoreMetrics.L2Cache
236
- if aic_metrics == 7:
237
- return AicoreMetrics.MemoryAccess
238
- return AicoreMetrics.AiCoreNone
239
-
240
- def _convert_export_type(self, export_type: int) -> ExportType:
241
- """ convert export_type to real args in Profiler."""
242
- if export_type == 0:
243
- return [ExportType.Text]
244
- if export_type == 1:
245
- return [ExportType.Db]
246
- if export_type == 2:
247
- return [ExportType.Text, ExportType.Db]
248
- return [ExportType.Text]
249
-
250
44
  class DynamicProfilerMonitorBase(Callback):
251
45
  """
252
- Dynamic profile callback base class implementing the dynamic profile functionality.
46
+ Dynamic profiler callback base class implementing the dynamic profiler functionality.
253
47
  """
254
48
 
255
- def __init__(self, cfg_path, output_path=None, poll_interval=2, **kwargs):
256
- self._cfg_path = cfg_path
257
- self._cfg_json_path = os.path.join(self._cfg_path, "profiler_config.json")
258
- self._cfg_json_path = os.path.realpath(self._cfg_json_path)
49
+ NPU_MONITOR_START = "NPU_MONITOR_START"
50
+
51
+ def __init__(self, cfg_path=None, output_path=None, poll_interval=2, **kwargs):
52
+ self._is_dyno = DynamicProfilerUtils.is_dyno_mode()
53
+ self._rank_id = DynamicProfilerUtils.get_real_rank()
54
+ if not self._is_dyno:
55
+ self._cfg_path = cfg_path
56
+ self._cfg_json_path = os.path.join(self._cfg_path, "profiler_config.json")
57
+ self._cfg_json_path = os.path.realpath(self._cfg_json_path)
58
+ self._init_cfg_json()
259
59
  self._output_path = "dyn_profile_data" if output_path is None else output_path
260
60
  self._poll_interval = poll_interval
261
61
  if not isinstance(self._poll_interval, int):
@@ -268,7 +68,6 @@ class DynamicProfilerMonitorBase(Callback):
268
68
 
269
69
  self._kwargs = kwargs
270
70
  self._shm_name = time.strftime("DynamicProfileShm%Y%m%d%H", time.localtime())
271
- self._rank_id = get_real_rank()
272
71
  self._shared_loop_flag = multiprocessing.Value('b', True)
273
72
  self._shm = None
274
73
  self._process = None
@@ -282,26 +81,35 @@ class DynamicProfilerMonitorBase(Callback):
282
81
  self._step_num = 0
283
82
 
284
83
  self._check_shm_for_killed()
285
- self._init_cfg_json()
286
84
  self._create_shm()
287
85
  self._create_process()
288
86
  atexit.register(self._clean_resource)
87
+ if self._is_dyno:
88
+ atexit.register(self._finalize_dynolog)
289
89
 
290
90
  @no_exception_func()
291
91
  def step_begin(self, run_context):
292
92
  """
293
- Start profile at the begin of step.
93
+ Start profiler at the beginning of the step.
294
94
 
295
95
  Args:
296
96
  run_context (RunContext): Context of the train running.
297
97
  """
298
- prof_args = self._get_prof_args()
98
+ prof_json = self._get_prof_args()
99
+ if not prof_json:
100
+ return
101
+ if self._is_dyno:
102
+ # Dyno monitor process
103
+ if self.NPU_MONITOR_START in prof_json:
104
+ self._call_dyno_monitor(prof_json)
105
+ return
299
106
 
107
+ prof_args = DynamicProfilerConfigContext(prof_json)
300
108
  if not prof_args.is_valid:
301
- logger.error("Dynamic profile json is not valid, please check the json file.")
109
+ logger.error("Dynamic profiler json is not valid, please check the json file.")
302
110
  return
303
111
 
304
- if prof_args.start_step == -1 or prof_args.start_step == self._last_start_step:
112
+ if prof_args.start_step in (-1, self._last_start_step):
305
113
  return
306
114
 
307
115
  cb_params = run_context.original_args()
@@ -311,16 +119,22 @@ class DynamicProfilerMonitorBase(Callback):
311
119
  # Prevent repeated calls of the start function within a complete interval
312
120
  if step_num == start_step:
313
121
  if self._is_started:
314
- logger.error("Dynamic profile is already started at step %d, "
315
- "please wait the first profile finished at step %d.",
122
+ logger.error("Dynamic profiler is already started at step %d, "
123
+ "please wait the first profiler finished at step %d.",
316
124
  self._last_start_step, self._last_stop_step)
317
125
  return
318
126
 
319
127
  if self._profiler is None:
320
- prof_path = os.path.join(self._output_path, f"rank{self._rank_id}_start{start_step}_stop{stop_step}")
128
+ output_path = prof_args.prof_path if prof_args.prof_path != "./" else self._output_path
129
+ prof_path = os.path.join(
130
+ output_path,
131
+ f"rank{self._rank_id}_start{start_step}_stop{stop_step}"
132
+ )
321
133
  PathManager.check_input_directory_path(prof_path)
322
- self._profiler = Profiler(on_trace_ready=tensorboard_trace_handler(dir_name=prof_path),
323
- start_profile=False, **prof_args.args)
134
+ profiler_config = self._get_prof_config(prof_args, prof_path, start_step, stop_step,
135
+ start_profile=False,
136
+ skip_first=0)
137
+ self._profiler = Profile(**profiler_config)
324
138
  print_msg(f"Rank {self._rank_id} create output path {prof_path}")
325
139
 
326
140
  self._profiler.start()
@@ -330,37 +144,82 @@ class DynamicProfilerMonitorBase(Callback):
330
144
  print_msg(f"Rank {self._rank_id} Dynamic profiler start at step {start_step}, "
331
145
  f"will stop at step {stop_step}")
332
146
 
147
+ @staticmethod
148
+ def _get_prof_config(prof_args, prof_path, start_step, stop_step, start_profile, skip_first):
149
+ """
150
+ Get profiler config.
151
+
152
+ Args:
153
+ prof_args: Profiler config.
154
+ prof_path: Profiler output path.
155
+ start_step: Start step.
156
+ stop_step: Stop step.
157
+ start_profile: enable start_profile.
158
+ skip_first: skip first step.
159
+ """
160
+ profiler_config = {
161
+ "activities": prof_args.args.get("activities"),
162
+ "with_stack": prof_args.args.get("with_stack"),
163
+ "profile_memory": prof_args.args.get("profile_memory"),
164
+ "parallel_strategy": prof_args.args.get("parallel_strategy"),
165
+ "start_profile": start_profile,
166
+ "record_shapes": prof_args.args.get("record_shapes"),
167
+ "schedule": schedule(
168
+ wait=0,
169
+ warmup=0,
170
+ active=stop_step - start_step + 1,
171
+ repeat=1,
172
+ skip_first=skip_first
173
+ ),
174
+ "on_trace_ready": tensorboard_trace_handler(
175
+ dir_name=prof_path,
176
+ analyse_flag=prof_args.analyse,
177
+ async_mode=prof_args.analyse_mode == "async",
178
+ ),
179
+ "experimental_config": _ExperimentalConfig(
180
+ profiler_level=prof_args.args.get("profiler_level"),
181
+ aic_metrics=prof_args.args.get("aic_metrics"),
182
+ l2_cache=prof_args.args.get("l2_cache"),
183
+ mstx=prof_args.args.get("mstx"),
184
+ data_simplification=prof_args.args.get("data_simplification"),
185
+ export_type=prof_args.args.get("export_type"),
186
+ mstx_domain_include=prof_args.args.get("mstx_domain_include"),
187
+ mstx_domain_exclude=prof_args.args.get("mstx_domain_exclude"),
188
+ sys_io=prof_args.args.get("sys_io"),
189
+ sys_interconnection=prof_args.args.get("sys_interconnection"),
190
+ host_sys=prof_args.args.get("host_sys")
191
+ )
192
+ }
193
+ return profiler_config
194
+
333
195
  @no_exception_func()
334
196
  def step_end(self, run_context):
335
197
  """
336
- Stop profile at the end of step.
198
+ Stop profiler at the end of step.
337
199
 
338
200
  Args:
339
201
  run_context (RunContext): Context of the train running.
340
202
  """
341
- prof_args = self._get_prof_args()
203
+ prof_json = self._get_prof_args()
204
+ prof_args = DynamicProfilerConfigContext(prof_json)
342
205
 
343
206
  if not prof_args.is_valid:
344
- logger.error("Dynamic profile json is not valid, please check the json file.")
207
+ logger.error("Dynamic profiler json is not valid, please check the json file.")
345
208
  return
346
209
 
347
210
  if prof_args.stop_step == -1:
348
211
  return
349
212
 
213
+ if self._profiler:
214
+ self._profiler.step()
215
+
350
216
  cb_params = run_context.original_args()
351
217
  step_num = cb_params.cur_step_num
352
218
 
353
219
  if step_num == self._last_stop_step and self._is_started:
354
- if self._profiler:
355
- self._profiler.stop()
356
- if prof_args.analyse_mode:
357
- self._profiler.analyse(mode=prof_args.analyse_mode)
358
- else:
359
- ProfilerInterface.finalize()
360
- ProfilerInterface.clear()
361
- self._profiler = None
362
- self._is_started = False
363
- print_msg(f"Rank {self._rank_id} Dynamic profiler stop at step {step_num}")
220
+ self._profiler = None
221
+ self._is_started = False
222
+ print_msg(f"Rank {self._rank_id} Dynamic profiler stop at step {step_num}")
364
223
 
365
224
  @no_exception_func()
366
225
  def step(self):
@@ -415,19 +274,28 @@ class DynamicProfilerMonitorBase(Callback):
415
274
  ... context.set_context(mode=mindspore.PYNATIVE_MODE)
416
275
  ... mindspore.set_device("Ascend")
417
276
  ... data_cfg = {
418
- ... "start_step": 2,
419
- ... "stop_step": 5,
420
- ... "aic_metrics": -1,
421
- ... "profiler_level": 0,
422
- ... "activities": 0,
423
- ... "export_type": 0,
424
- ... "profile_memory": False,
425
- ... "mstx": False,
426
- ... "analyse_mode": 0,
427
- ... "parallel_strategy": False,
428
- ... "with_stack": False,
429
- ... "data_simplification": True,
430
- ... }
277
+ ... "start_step": 2,
278
+ ... "stop_step": 5,
279
+ ... "aic_metrics": "AiCoreNone",
280
+ ... "profiler_level": "Level0",
281
+ ... "analyse_mode": 0,
282
+ ... "activities": ["CPU", "NPU"],
283
+ ... "export_type": ["text"],
284
+ ... "profile_memory": False,
285
+ ... "mstx": False,
286
+ ... "parallel_strategy": False,
287
+ ... "with_stack": False,
288
+ ... "data_simplification": True,
289
+ ... "l2_cache": False,
290
+ ... "analyse": True,
291
+ ... "record_shapes": False,
292
+ ... "prof_path": "./data",
293
+ ... "mstx_domain_include": [],
294
+ ... "mstx_domain_exclude": [],
295
+ ... "host_sys": [],
296
+ ... "sys_io": False,
297
+ ... "sys_interconnection": False
298
+ ... }
431
299
  ... output_path = "./cfg_path"
432
300
  ... cfg_path = os.path.join(output_path, "profiler_config.json")
433
301
  ... os.makedirs(output_path, exist_ok=True)
@@ -442,7 +310,8 @@ class DynamicProfilerMonitorBase(Callback):
442
310
  ... for i in range(STEP_NUM):
443
311
  ... print(f"step {i}")
444
312
  ... train(net)
445
- ... # Modify the configuration file after step 7. For example, change start_step to 8 and stop_step to 10
313
+ ... # Modify the configuration file after step 7
314
+ ... # For example, change start_step to 8 and stop_step to 10
446
315
  ... if i == 5:
447
316
  ... # Modify parameters in the JSON file
448
317
  ... change_cfg_json(os.path.join(output_path, "profiler_config.json"))
@@ -451,43 +320,81 @@ class DynamicProfilerMonitorBase(Callback):
451
320
  """
452
321
 
453
322
  self._step_num += 1
454
- prof_args = self._get_prof_args()
323
+ prof_json = self._get_prof_args()
324
+ if not prof_json:
325
+ return
326
+ if self._is_dyno:
327
+ # Dyno monitor process
328
+ if self.NPU_MONITOR_START in prof_json:
329
+ self._call_dyno_monitor(prof_json)
330
+ return
455
331
 
332
+ prof_args = DynamicProfilerConfigContext(prof_json)
456
333
  if not prof_args.is_valid:
457
- logger.error("Dynamic profile json is not valid, please check the json file.")
334
+ logger.error("Dynamic profiler config is not valid, please check the json or dyno config.")
458
335
  return
336
+ self._handle_profiler_setup(prof_args)
459
337
 
460
- if prof_args.start_step == -1 or prof_args.stop_step == -1:
461
- return
338
+ if self._profiler:
339
+ self._profiler.step()
462
340
 
463
- # Skips the number of steps less than start_step
464
- if self._step_num < prof_args.start_step:
341
+ def _handle_profiler_setup(self, args):
342
+ """Common handler for profiler setup logic shared between dyno and non-dyno paths."""
343
+ start_step = args.start_step
344
+ stop_step = args.stop_step
345
+
346
+ if not self._is_valid_start_stop_step(self._step_num, start_step, stop_step):
465
347
  return
466
348
 
467
- if self._start_step != prof_args.start_step or self._stop_step != prof_args.stop_step:
468
- # Update new start_step and stop_step
469
- self._start_step = prof_args.start_step
470
- self._stop_step = prof_args.stop_step
471
- if self._start_step >= 0 and 0 <= self._start_step <= self._stop_step:
472
- prof_path = os.path.join(self._output_path,
473
- f"rank{self._rank_id}_start{self._start_step}_stop{self._stop_step}")
474
- print_msg(f"Rank {self._rank_id} create output path {prof_path}")
475
- print_msg(f"Rank {self._rank_id} Dynamic profile start at step {self._start_step}, "
476
- f"will stop at step {self._stop_step}")
477
- self._profiler = Profiler(schedule=schedule(wait=0, warmup=0,
478
- active=self._stop_step - self._start_step + 1,
479
- repeat=1,
480
- skip_first=1),
481
- on_trace_ready=tensorboard_trace_handler(dir_name=prof_path),
482
- **prof_args.args)
483
- else:
349
+ if self._start_step != start_step or self._stop_step != stop_step:
350
+ self._start_step = start_step
351
+ self._stop_step = stop_step
352
+
353
+ if not (start_step >= 0 and 0 <= start_step <= stop_step):
484
354
  self._profiler = None
485
- logger.error("Rank %d Dynamic profile start at step %d and stop at step %d in config_json must be "
486
- "greater than or equal to 0, and stop step should not be less than start step",
487
- self._rank_id, self._start_step, self._stop_step)
355
+ logger.error(
356
+ "Rank %d Dynamic profiler start at step %d and stop at step %d must be "
357
+ "greater than or equal to 0, and stop step should not be less than start step",
358
+ self._rank_id, start_step, stop_step
359
+ )
360
+ return
488
361
 
489
- if self._profiler:
490
- self._profiler.step()
362
+ # Setup profiler configuration
363
+ output_path = args.prof_path if args.prof_path != "./" else self._output_path
364
+ prof_path = os.path.join(
365
+ output_path,
366
+ f"rank{self._rank_id}_start{start_step}_stop{stop_step}"
367
+ )
368
+ print_msg(f"Rank {self._rank_id} create output path {prof_path}")
369
+ print_msg(
370
+ f"Rank {self._rank_id} Dynamic profiler start at step {start_step}, "
371
+ f"will stop at step {stop_step}"
372
+ )
373
+ profiler_config = self._get_prof_config(args, prof_path, start_step, stop_step, start_profile=True,
374
+ skip_first=1)
375
+ self._profiler = Profile(**profiler_config)
376
+
377
+ def _is_valid_start_stop_step(self, step_num, start_step, stop_step):
378
+ """Verify whether start_step and stop_step are valid parameters."""
379
+ if start_step < 0 or stop_step < 0:
380
+ return False
381
+
382
+ if step_num < start_step:
383
+ return False
384
+
385
+ if step_num > stop_step != self._stop_step:
386
+ logger.warning("stop_step must be greater than step_num, "
387
+ "but get start_step = %d, stop_step = %d, step_num = %d", start_step, stop_step, step_num)
388
+ return False
389
+
390
+ return True
391
+
392
+ @no_exception_func()
393
+ def _call_dyno_monitor(self, dyno_args):
394
+ if "is_valid" in dyno_args:
395
+ del dyno_args["is_valid"]
396
+ dyno_monitor_proxy = MsDynamicMonitorProxySingleton().get_proxy()
397
+ dyno_monitor_proxy.enable_dyno_npu_monitor(dyno_args)
491
398
 
492
399
  @no_exception_func()
493
400
  def on_train_end(self, run_context):
@@ -502,12 +409,16 @@ class DynamicProfilerMonitorBase(Callback):
502
409
  def _get_prof_args(self):
503
410
  """ Get prof_args """
504
411
  logger.error("Dynamic profiler _get_prof_args is not implemented")
505
- return DynamicProfilerArgs()
412
+ return {}
506
413
 
507
414
  def _clean_resource(self):
508
415
  """Clean resource"""
509
416
  logger.error("Dynamic profiler _clean_resource is not implemented")
510
417
 
418
+ def _finalize_dynolog(self):
419
+ """finalize dynolog"""
420
+ logger.error("Dynolog monitor _finalize_dynolog is not implemented")
421
+
511
422
  def _check_step(self, start_step, stop_step, step_num):
512
423
  """Check step valid"""
513
424
  if start_step <= 0 or stop_step <= 0:
@@ -535,9 +446,11 @@ class DynamicProfilerMonitorBase(Callback):
535
446
  """Init config json file"""
536
447
  if self._rank_id == 0:
537
448
  if not os.path.exists(self._cfg_json_path):
538
- logger.warning("cfg_path is not exist, create default cfg json")
539
- FileManager.create_json_file(self._cfg_path, DynamicProfilerArgs().vars,
540
- "profiler_config.json", indent=4)
449
+ logger.info("cfg_path is not exist, create default cfg json")
450
+ default_dy_config_context = DynamicProfilerConfigContext({})
451
+ PathManager.make_dir_safety(self._cfg_path)
452
+ config_file_path = os.path.join(self._cfg_path, "profiler_config.json")
453
+ FileManager.create_json_file(config_file_path, default_dy_config_context.vars, indent=4)
541
454
  else:
542
455
  logger.info("rank_id is not 0, skip init cfg json")
543
456
  print_msg(f"Init config json file: {self._cfg_json_path}")
@@ -550,10 +463,12 @@ class DynamicProfilerMonitorBase(Callback):
550
463
  def _create_process(self):
551
464
  """Create json monitor process, one process will be created at one worker"""
552
465
  if self._is_create_process:
466
+ args = [self._shared_loop_flag, self._poll_interval, self._shm, self._rank_id] if self._is_dyno else \
467
+ [self._shared_loop_flag, self._poll_interval, self._shm, self._cfg_json_path]
553
468
  # daemon need to be set to True, otherwise the process will not be killed when the main process exits.
554
- self._process = multiprocessing.Process(target=worker_func, daemon=True,
555
- args=(self._shared_loop_flag, self._poll_interval,
556
- self._shm, self._cfg_json_path))
469
+ self._process = multiprocessing.Process(target=worker_dyno_func if self._is_dyno else worker_func,
470
+ daemon=True,
471
+ args=args)
557
472
  self._process.start()
558
473
  logger.info("Config monitor process has been created by rank %d.", self._rank_id)
559
474
  else:
@@ -573,7 +488,7 @@ class DynamicProfilerMonitorBase(Callback):
573
488
  if not os.path.exists(shm_path):
574
489
  return
575
490
 
576
- MAX_TIME_DIFF = 30 # seconds
491
+ MAX_TIME_DIFF = 60 # seconds
577
492
  time_shm = os.stat(shm_path).st_ctime
578
493
  cur_proc_time = self._get_pid_st_ctime(os.getpid())
579
494
 
@@ -584,7 +499,7 @@ class DynamicProfilerMonitorBase(Callback):
584
499
  def _get_pid_st_ctime(self, pid):
585
500
  """Get pid st_ctime"""
586
501
  try:
587
- fd = os.open("/proc/" + str(pid), os.O_RDONLY, stat.S_IRUSR | stat.S_IRGRP)
502
+ fd = os.open(os.path.join('/proc', str(pid)), os.O_RDONLY, stat.S_IRUSR | stat.S_IRGRP)
588
503
  stat_ino = os.fstat(fd)
589
504
  os.close(fd)
590
505
  create_time = stat_ino.st_ctime
@@ -593,7 +508,7 @@ class DynamicProfilerMonitorBase(Callback):
593
508
  logger.error("Process with PID %d does not exist.", pid)
594
509
  except PermissionError:
595
510
  logger.error("Permission denied when accessing PID %d.", pid)
596
- except Exception as ex: # pylint: disable=W0703
511
+ except Exception as ex: # pylint: disable=W0703
597
512
  logger.error("An error occurred while getting creation time for PID %d: %s", pid, str(ex))
598
513
 
599
514
 
@@ -601,7 +516,8 @@ if sys.version_info >= (3, 8):
601
516
  @no_exception_func()
602
517
  def write_bytes(shm, byte_data):
603
518
  """Write bytes to shared memory"""
604
- shm.buf[:DynamicProfilerArgs.SIZE] = byte_data
519
+ shm.buf[:] = b'\x00' * len(shm.buf)
520
+ shm.buf[:len(byte_data)] = byte_data
605
521
  else:
606
522
  @no_exception_func()
607
523
  def write_bytes(shm, byte_data):
@@ -624,15 +540,13 @@ def worker_func(loop_flag, poll_interval, shm, cfg_path):
624
540
  with open(cfg_path, 'r') as f:
625
541
  data = json.load(f)
626
542
 
627
- # convert json to DynamicProfilerArgs
628
- prof_args = DynamicProfilerArgs(**data)
629
- prof_args.is_valid = True
543
+ data['is_valid'] = True
630
544
  logger.info("Dynamic profiler process load json success")
631
545
  except json.JSONDecodeError as e:
632
- prof_args = DynamicProfilerArgs()
633
- prof_args.is_valid = False
546
+ data = {'is_valid': False}
634
547
  logger.error("Dynamic profiler process load json failed: %s", e)
635
- byte_data = prof_args.to_bytes()
548
+ # convert json to bytes
549
+ byte_data = DynamicProfilerConfigContext.json_to_bytes(data)
636
550
  write_bytes(shm, byte_data)
637
551
  else:
638
552
  logger.error("Dynamic profiler cfg json not exists")
@@ -640,6 +554,36 @@ def worker_func(loop_flag, poll_interval, shm, cfg_path):
640
554
  logger.info("Dynamic profiler process done")
641
555
 
642
556
 
557
+ @no_exception_func()
558
+ def worker_dyno_func(loop_flag, poll_interval, shm, rank_id):
559
+ """ dyno monitor process worker function python version >= 3.8"""
560
+ proxy = MsDynamicMonitorProxySingleton().get_proxy()
561
+ ret = proxy.init_dyno(rank_id)
562
+
563
+ if not ret:
564
+ logger.warning("Rank %d init dynolog failed !")
565
+ return
566
+ print_msg("Init dynolog success !")
567
+
568
+ while loop_flag.value:
569
+ try:
570
+ res = proxy.poll_dyno()
571
+ if not res:
572
+ continue
573
+ data = DynamicProfilerUtils.dyno_str_to_dict(res)
574
+ except Exception as e: # pylint: disable=broad-except
575
+ data = {'is_valid': False}
576
+ logger.error("Dynolog process load config failed: %s", e)
577
+ else:
578
+ data['is_valid'] = True
579
+
580
+ # convert dyno config json to bytes
581
+ byte_data = DynamicProfilerConfigContext.json_to_bytes(data)
582
+ write_bytes(shm, byte_data)
583
+ time.sleep(poll_interval)
584
+ logger.info("Dynolog process done")
585
+
586
+
643
587
  if sys.version_info >= (3, 8):
644
588
  from multiprocessing import shared_memory
645
589
  from unittest.mock import patch
@@ -647,10 +591,10 @@ if sys.version_info >= (3, 8):
647
591
 
648
592
  class DynamicProfilerMonitor(DynamicProfilerMonitorBase):
649
593
  r"""
650
- This class to enable the dynamic profile monitoring of MindSpore neural networks.
594
+ This class enables dynamic profiler monitoring of MindSpore neural networks.
651
595
 
652
596
  Args:
653
- cfg_path (str): (Ascend only) Dynamic profile json config file directory. The requirement is a shared path
597
+ cfg_path (str): (Ascend only) Dynamic profiler json config file directory. The requirement is a shared path
654
598
  that can be accessed by all nodes. The parameters of the json configuration file are as follows:
655
599
 
656
600
  - start_step (int, required) - Sets the step number at which the Profiler starts collecting data.
@@ -660,27 +604,46 @@ if sys.version_info >= (3, 8):
660
604
  a relative value, with the first step of training being 1. The stop_step must be greater than or
661
605
  equal to start_step. The default value is -1, indicating that data collection will not start during
662
606
  the entire training process.
663
- - aic_metrics (int, optional) - The range of values corresponds to the Profiler. The default value -1
664
- indicates that AI Core utilization is not collected, and 0 indicates PipeUtilization, 1 indicates
665
- ArithmeticUtilization, 2 stands for Memory, 3 stands for MemoryL0, 4 stands for MemoryUB, 5 indicates
666
- ResourceConflictRatio, 6 indicates L2Cache, 7 indicates MemoryAccess.
667
- - profiler_level (int, optional) - Sets the level of performance data collection, where -1 represents
668
- ProfilerLevel.LevelNone, 0 represents ProfilerLevel.Level0, 1 represents ProfilerLevel.Level1, and
669
- 2 represents ProfilerLevel.Level2. The default value is 0, indicating the ProfilerLevel.Level0
670
- collection level.
671
- - activities (int, optional) - Sets the devices for performance data collection, where 0 represents
672
- CPU+NPU, 1 represents CPU, and 2 represents NPU. The default value is 0, indicating the collection
673
- of CPU+NPU performance data.
674
- - export_type (int, optional) - Sets the data type to export, where 0 represents text, 1 represents db,
675
- and 2 represents text and db. The default value is 0, indicating only export text type data.
607
+ - aic_metrics (int/str, optional) - Set the collection of AI Core metric data. The current version can
608
+ pass in either type int or str. Later, it will be updated to only pass in the str type.
609
+ Here, ``0`` and ``"PipeUtilization"`` represent PipeUtilization; ``1`` and ``"ArithmeticUtilization"``
610
+ represent ArithmeticUtilization; ``2`` and ``"Memory"`` represent Memory; ``3`` and ``"MemoryL0"``
611
+ represent MemoryL0; ``4`` and ``"MemoryUB"`` stand for MemoryUB; ``5`` and ``"ResourceConflictRatio"``
612
+ represent ResourceConflictRatio; ``6`` and ``"L2Cache"`` represent L2Cache; ``7`` and
613
+ ``"MemoryAccess"`` stand for MemoryAccess. The default value ``"AiCoreNone"`` indicates that the
614
+ AI Core metric is not collected.
615
+ - profiler_level (int/str, optional) - Set the level for collecting performance data. The current
616
+ version can pass in either type int or str, and it will be updated to only pass in str type
617
+ in the future. Among them, ``-1`` and ``"LevelNone"`` represent ProfilerLevel.LevelNone, ``0``
618
+ and ``"Level0"`` represent ProfilerLevel.Level0, and ``1`` and ``"Level1"`` represent
619
+ ProfilerLevel.Level1. ``2`` and ``"Level2"`` stand for ProfilerLevel.Level2.
620
+ The default value ``"Level0"`` indicates the collection level of ProfilerLevel.Level0.
621
+ - activities (int/list, optional) - Set the device for collecting performance data.
622
+ The current version can pass in either type int or list. Later, it will be updated to only
623
+ pass in the list type. Among them, ``0`` and ``["CPU","NPU"]`` represent CPU+NPU, ``1`` and
624
+ ``["CPU"]`` represent CPU, and ``2`` and ``["NPU"]`` represent NPU. The default values
625
+ ``["CPU","NPU"]`` indicate the collection of performance data of CPU+NPU.
626
+ - export_type (int/list, optional) - Set the type of the exported performance data.
627
+ The current version can pass in either type int or list, and it will be updated later
628
+ to only pass in the list type. Among them, ``0`` and ``["text"]`` represent text, ``1`` and ``["db"]``
629
+ represent db, and ``2`` and ``["text","db"]`` represent text and db respectively. The default value
630
+ ``["text"]`` indicates that only performance data of the text type is exported.
676
631
  - profile_memory (bool, optional) - Set whether to collect memory performance data, true indicates that
677
632
  memory performance data is collected, false indicates that memory performance data is not collected.
678
633
  The default value is false, indicating that memory performance data is not collected.
679
634
  - mstx (bool, optional) - Set whether to enable mstx, true indicates that mstx is enabled, false
680
635
  indicates that mstx is disabled. The default value is false, indicating that mstx is not enabled.
681
- - analyse_mode (int, optional) - Sets the mode for online analysis, corresponding to the analyse_mode
682
- parameter of the mindspore.Profiler.analyse interface, where 0 represents "sync" and 1 represents
683
- "async". The default value is -1, indicating that online analysis is not used.
636
+ - analyse (bool, optional) - Set whether to enable online analysis. True indicates that online analysis
637
+ is enabled, while false indicates that online analysis is disabled. The default value is false,
638
+ indicating that online analysis is not enabled. This parameter has a higher priority than the
639
+ `analyse_mode` parameter. When this parameter is set to false, the setting of the `analyse_mode`
640
+ parameter does not take effect. When this parameter is set to true,
641
+ setting the `analyse_mode` parameter to -1 does not take effect.
642
+ - analyse_mode (int, optional) - Sets the mode for online analysis,
643
+ where 0 represents "sync" and 1 represents "async". The default value is -1,
644
+ indicating that online analysis is not used. This parameter has a lower priority than the `analyse`
645
+ parameter. When the `analyse` parameter is set to false, the setting of this parameter does not take
646
+ effect. When the `analyse` parameter is set to true, setting it to -1 does not take effect.
684
647
  - parallel_strategy (bool, optional) - Sets whether to collect parallel strategy performance data,
685
648
  where true means to collect and false means not to collect. The default value is false, indicating
686
649
  that parallel strategy performance data is not collected.
@@ -690,6 +653,44 @@ if sys.version_info >= (3, 8):
690
653
  - data_simplification (bool, optional) - Sets whether to enable data simplification, where true means
691
654
  to enable and false means not to enable. The default value is true, indicating that data
692
655
  simplification is enabled.
656
+ - record_shapes (bool, optional) - Sets whether to collect operator input tensor shapes data, where true
657
+ means that the shape data is collected and false means that the shape data is not collected. The
658
+ default value is false, indicating that input tensor shapes data is not collected.
659
+ - mstx_domain_include (list, optional) - Set the set of enabled domain names when the mstx switch
660
+ is turned on. The name must be of str type. Default value: ``[]``, indicating that this parameter
661
+ is not used to control the domain. This parameter is mutually exclusive with the mstx_domain_exclude
662
+ parameter and cannot be set simultaneously. If both are set, only the mstx_domain_include parameter
663
+ takes effect.
664
+ - mstx_domain_exclude (list, optional) - Set the set of domain names that are not enabled when the
665
+ mstx switch is turned on. The name must be of str type. Default value: ``[]``, indicating that this
666
+ parameter is not used to control the domain.
667
+ - prof_path (str, optional) - Output data path of the dynamic profiler. It is the same as the interface
668
+ parameter `output_path`. When both are set, `prof_path` takes effect. Default value:
669
+ ``"./"`` .
670
+ - sys_io (bool, optional) - Set whether to collect NIC and RoCE data. Default value: ``False`` ,
671
+ indicating that these data are not collected.
672
+ - sys_interconnection (bool, optional) - Set whether to collect system interconnection data,
673
+ including aggregate collective communication statistics (HCCS), PCIe data, and inter-chip transmission
674
+ bandwidth information. Default value: ``False`` , indicating that these data are not collected.
675
+ - host_sys (list, optional) - Collect the data of system class calls, storage classes and cpu usage
676
+ rate on the host side, and pass in the list type. It supports passing in one or more of ``"cpu"``,
677
+ ``"mem"``, ``"disk"``, ``"network"`` and ``"osrt"``. Among them, ``"cpu"`` represents the cpu
678
+ utilization at the process level, ``"mem"`` represents the memory utilization at the process level,
679
+ ``"disk"`` represents the disk I/O utilization at the process level, and ``"network"`` represents the
680
+ network I/O utilization at the system level. ``"osrt"`` represents system-level syscall and
681
+ pthreadcall. Default value: ``[]``, indicating that system class data on the host side is
682
+ not collected. When collecting DISK or OSRT data, it is necessary to install the iotop, perf,
683
+ and ltrace third-party tools in advance. For detailed steps, please refer to
684
+ `Installing Third-party Tools <https://www.hiascend.com/document/detail/zh/mindstudio/80RC1/T&ITools/
685
+ Profiling/atlasprofiling_16_0136.html>`_ .
686
+ After the third-party tool is successfully installed, user permissions need to be configured. For
687
+ detailed steps, please refer to `Configure User Permissions <https://www.hiascend.com/document/
688
+ detail/zh/mindstudio/80RC1/T&ITools/Profiling/atlasprofiling_16_0137.
689
+ html>`_ .
690
+ Note that in step 3 of configuring user permissions, the content in the msprof_data_collection.sh
691
+ script needs to be replaced with `msprof_data_collection.sh
692
+ <https://gitee.com/mindspore/mindspore/blob/master/docs/api/api_python/mindspore/script/
693
+ msprof_data_collection.sh>`_.
693
694
 
694
695
  output_path (str, optional): (Ascend only) Output data path. Default: ``"./dyn_profile_data"`` .
695
696
  poll_interval (int, optional): (Ascend only) The polling period of the monitoring process, in seconds.
@@ -729,9 +730,13 @@ if sys.version_info >= (3, 8):
729
730
  ... model.train(10, data, callbacks=[dynprof_cb])
730
731
  """
731
732
 
732
- def __init__(self, cfg_path, output_path="./dyn_profile_data", poll_interval=2, **kwargs):
733
- if not isinstance(cfg_path, str):
734
- raise TypeError("The cfg_path must be a string.")
733
+ def __init__(self, cfg_path=None, output_path="./dyn_profile_data", poll_interval=2, **kwargs):
734
+ if DynamicProfilerUtils.is_dyno_mode() and cfg_path is not None:
735
+ logger.warning("If you export 'MSMONITOR_USE_DAEMON=1', your 'cfg_path' parameter will be invalid!")
736
+ cfg_path = None
737
+
738
+ if not DynamicProfilerUtils.is_dyno_mode() and not isinstance(cfg_path, str):
739
+ raise TypeError("If you set 'MSMONITOR_USE_DAEMON' to not 1, The cfg_path must be a string.")
735
740
  if not isinstance(output_path, str):
736
741
  logger.warning(f"The output_path must be a string, "
737
742
  f"but got type {type(output_path)}, it will be set to './dyn_profile_data'.")
@@ -740,7 +745,21 @@ if sys.version_info >= (3, 8):
740
745
 
741
746
  def _get_prof_args(self):
742
747
  """ Get prof_args py38"""
743
- return DynamicProfilerArgs.from_bytes(self._shm.buf[:DynamicProfilerArgs.SIZE])
748
+ byte_length = self._get_shm_byte_length()
749
+
750
+ if byte_length == 0:
751
+ return {}
752
+
753
+ valid_bytes = self._shm.buf[:byte_length]
754
+ return DynamicProfilerConfigContext.bytes_to_json(bytes(valid_bytes))
755
+
756
+ def _get_shm_byte_length(self):
757
+ byte_length = 0
758
+ for i, byte in enumerate(self._shm.buf):
759
+ if byte == 0:
760
+ byte_length = i
761
+ break
762
+ return byte_length
744
763
 
745
764
  @no_exception_func()
746
765
  def _clean_resource(self):
@@ -770,6 +789,12 @@ if sys.version_info >= (3, 8):
770
789
  logger.warning("Rank %s unlink shm failed, may be removed", self._rank_id)
771
790
  self._shm = None
772
791
 
792
+ @no_exception_func()
793
+ def _finalize_dynolog(self):
794
+ dyno_monitor_proxy = MsDynamicMonitorProxySingleton().get_proxy()
795
+ dyno_monitor_proxy.finalize_dyno()
796
+ logger.info("Rank %d finalize dynolog success !", self._rank_id)
797
+
773
798
  @no_exception_func()
774
799
  def _create_shm(self):
775
800
  """Create a json monitor process based on whether the SharedMemory is successfully created py38"""
@@ -789,7 +814,7 @@ if sys.version_info >= (3, 8):
789
814
  try:
790
815
  # Step 2: only one process can create shm successfully.
791
816
  self._shm = shared_memory.SharedMemory(name=self._shm_name,
792
- create=True, size=DynamicProfilerArgs.SIZE)
817
+ create=True, size=DynamicProfilerUtils.CFG_BUFFER_SIZE)
793
818
  self._is_create_process = True
794
819
  logger.info("Rank %d shared memory is created.", self._rank_id)
795
820
  break
@@ -799,7 +824,7 @@ if sys.version_info >= (3, 8):
799
824
  logger.warning("Rank %d shared memory create failed, "
800
825
  "retry times = %d.", self._rank_id, try_times)
801
826
  time.sleep(random.uniform(0, 0.02)) # sleep 0 ~ 20 ms
802
- except Exception as e: # pylint: disable=W0703
827
+ except Exception as e: # pylint: disable=W0703
803
828
  # shm open failed because of other process create shm not finished
804
829
  try_times -= 1
805
830
  logger.warning("Rank %d shared memory open failed, error: %s, retry times = %d",
@@ -815,10 +840,10 @@ else:
815
840
 
816
841
  class DynamicProfilerMonitor(DynamicProfilerMonitorBase):
817
842
  r"""
818
- This class to enable the dynamic profile monitoring of MindSpore neural networks.
843
+ This class to enable the dynamic profiler monitoring of MindSpore neural networks.
819
844
 
820
845
  Args:
821
- cfg_path (str): Dynamic profile json config file directory. The requirement is a shared path
846
+ cfg_path (str): Dynamic profiler json config file directory. The requirement is a shared path
822
847
  that can be accessed by all nodes.
823
848
  output_path (str, optional): Output data path. Default: ``"./dyn_profile_data"`` .
824
849
  poll_interval (int, optional): The polling period of the monitoring process, in seconds.
@@ -858,16 +883,24 @@ else:
858
883
  ... model.train(10, data, callbacks=[dynprof_cb])
859
884
  """
860
885
 
861
- def __init__(self, cfg_path, output_path="./dyn_profile_data", poll_interval=2, **kwargs):
862
- if not isinstance(cfg_path, str):
863
- raise TypeError("The cfg_path must be a string.")
886
+ def __init__(self, cfg_path=None, output_path="./dyn_profile_data", poll_interval=2, **kwargs):
887
+ if DynamicProfilerUtils.is_dyno_mode() and cfg_path is not None:
888
+ logger.warning("If you export 'MSMONITOR_USE_DAEMON=1', your 'cfg_path' parameter will be invalid!")
889
+ cfg_path = None
890
+
891
+ if not DynamicProfilerUtils.is_dyno_mode() and not isinstance(cfg_path, str):
892
+ raise TypeError("If you set 'MSMONITOR_USE_DAEMON' to not 1, The cfg_path must be a string.")
893
+
864
894
  if not isinstance(output_path, str):
865
895
  logger.warning(f"The output_path must be a string, "
866
896
  f"but got type {type(output_path)}, it will be set to './dyn_profile_data'.")
867
897
  output_path = "./dyn_profile_data"
868
898
  self._cfg_path = cfg_path
869
899
  self._shm_name = time.strftime("DynamicProfileShm%Y%m%d%H", time.localtime())
870
- self._shm_dir = os.path.join(self._cfg_path, "shm")
900
+ self._shm_dir = (
901
+ "/dev/shm" if DynamicProfilerUtils.is_dyno_mode()
902
+ else os.path.join(self._cfg_path, "shm")
903
+ )
871
904
  PathManager.make_dir_safety(self._shm_dir)
872
905
  self._shm_path = os.path.realpath(os.path.join(self._shm_dir, self._shm_name))
873
906
 
@@ -878,7 +911,8 @@ else:
878
911
  def _get_prof_args(self):
879
912
  """ Get prof_args py37"""
880
913
  self._shm.seek(0)
881
- return DynamicProfilerArgs.from_bytes(self._shm.read(DynamicProfilerArgs.SIZE))
914
+ return DynamicProfilerConfigContext.bytes_to_json(
915
+ bytes(self._shm.read(DynamicProfilerUtils.CFG_BUFFER_SIZE)))
882
916
 
883
917
  @no_exception_func()
884
918
  def _clean_resource(self):
@@ -923,7 +957,8 @@ else:
923
957
  self.fd = os.open(self._shm_path, os.O_EXCL | os.O_RDWR,
924
958
  stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP)
925
959
  self._memory_mapped_file = os.fdopen(self.fd, 'rb')
926
- self._shm = mmap.mmap(self._memory_mapped_file.fileno(), length=DynamicProfilerArgs.SIZE)
960
+ self._shm = mmap.mmap(self._memory_mapped_file.fileno(),
961
+ length=DynamicProfilerUtils.CFG_BUFFER_SIZE)
927
962
  self._is_create_process = False
928
963
  logger.info("Rank %d shared memory is connected.", self._rank_id)
929
964
  break
@@ -937,7 +972,7 @@ else:
937
972
 
938
973
  # Init mmap file need to write data
939
974
  with os.fdopen(fd, 'wb') as f:
940
- data_instance = DynamicProfilerArgs()
975
+ data_instance = DynamicProfilerConfigContext({})
941
976
  byte_data = data_instance.to_bytes()
942
977
  f.write(byte_data)
943
978
 
@@ -945,7 +980,8 @@ else:
945
980
  self.fd = os.open(self._shm_path, os.O_EXCL | os.O_RDWR,
946
981
  stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP)
947
982
  self._memory_mapped_file = os.fdopen(self.fd, 'rb')
948
- self._shm = mmap.mmap(self._memory_mapped_file.fileno(), length=DynamicProfilerArgs.SIZE)
983
+ self._shm = mmap.mmap(self._memory_mapped_file.fileno(), length=DynamicProfilerUtils.
984
+ CFG_BUFFER_SIZE)
949
985
  self._is_create_process = True
950
986
  logger.info("Rank %d shared memory is created.", self._rank_id)
951
987
  break