mindspore-2.6.0rc1-cp311-cp311-win_amd64.whl → mindspore-2.7.0-cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mindspore might be problematic (see the registry's advisory for details).
Files changed (458)
  1. mindspore/.commit_id +1 -1
  2. mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
  3. mindspore/Newtonsoft.Json.dll +0 -0
  4. mindspore/__init__.py +2 -2
  5. mindspore/_c_dataengine.cp311-win_amd64.pyd +0 -0
  6. mindspore/_c_expression.cp311-win_amd64.pyd +0 -0
  7. mindspore/_c_mindrecord.cp311-win_amd64.pyd +0 -0
  8. mindspore/_checkparam.py +42 -11
  9. mindspore/_extends/builtin_operations.py +3 -3
  10. mindspore/{_deprecated → _extends/optimize}/__init__.py +9 -3
  11. mindspore/_extends/optimize/cell_utils.py +96 -0
  12. mindspore/_extends/parallel_compile/akg_compiler/custom.py +1109 -0
  13. mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
  14. mindspore/_extends/parse/__init__.py +3 -3
  15. mindspore/_extends/parse/compile_config.py +44 -22
  16. mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +1 -2
  17. mindspore/_extends/parse/parser.py +65 -84
  18. mindspore/_extends/parse/resources.py +39 -0
  19. mindspore/_extends/parse/standard_method.py +58 -14
  20. mindspore/_extends/parse/trope.py +8 -1
  21. mindspore/_extends/pijit/__init__.py +1 -2
  22. mindspore/_extends/pijit/pijit_func_white_list.py +2 -5
  23. mindspore/amp.py +4 -22
  24. mindspore/atlprov.dll +0 -0
  25. mindspore/avcodec-59.dll +0 -0
  26. mindspore/avdevice-59.dll +0 -0
  27. mindspore/avfilter-8.dll +0 -0
  28. mindspore/avformat-59.dll +0 -0
  29. mindspore/avutil-57.dll +0 -0
  30. mindspore/boost/adasum.py +1 -1
  31. mindspore/boost/boost_cell_wrapper.py +4 -4
  32. mindspore/c1.dll +0 -0
  33. mindspore/c1xx.dll +0 -0
  34. mindspore/c2.dll +0 -0
  35. mindspore/common/__init__.py +43 -12
  36. mindspore/common/_grad_function.py +2 -1
  37. mindspore/common/_pijit_context.py +28 -7
  38. mindspore/common/_stub_tensor.py +1 -209
  39. mindspore/common/_tensor_cpp_method.py +1 -1
  40. mindspore/common/_tensor_docs.py +178 -53
  41. mindspore/common/_utils.py +9 -1
  42. mindspore/common/api.py +377 -203
  43. mindspore/common/dtype.py +108 -57
  44. mindspore/common/dump.py +11 -16
  45. mindspore/common/dynamic_shape/__init__.py +0 -0
  46. mindspore/common/{auto_dynamic_shape.py → dynamic_shape/auto_dynamic_shape.py} +17 -23
  47. mindspore/common/dynamic_shape/enable_dynamic.py +197 -0
  48. mindspore/common/file_system.py +59 -9
  49. mindspore/common/generator.py +5 -3
  50. mindspore/common/hook_handle.py +33 -5
  51. mindspore/common/jit_config.py +1 -1
  52. mindspore/common/jit_trace.py +84 -105
  53. mindspore/common/np_dtype.py +3 -3
  54. mindspore/common/parameter.py +27 -29
  55. mindspore/common/recompute.py +5 -7
  56. mindspore/common/sparse_tensor.py +0 -3
  57. mindspore/common/symbol.py +0 -1
  58. mindspore/common/tensor.py +117 -131
  59. mindspore/communication/_comm_helper.py +46 -4
  60. mindspore/communication/management.py +79 -7
  61. mindspore/context.py +67 -55
  62. mindspore/dataset/__init__.py +1 -1
  63. mindspore/dataset/audio/transforms.py +1 -1
  64. mindspore/dataset/core/config.py +38 -4
  65. mindspore/dataset/engine/datasets.py +350 -322
  66. mindspore/dataset/engine/datasets_user_defined.py +70 -24
  67. mindspore/dataset/engine/iterators.py +2 -2
  68. mindspore/dataset/engine/obs/config_loader.py +2 -2
  69. mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +8 -0
  70. mindspore/dataset/transforms/c_transforms.py +2 -2
  71. mindspore/dataset/transforms/py_transforms.py +7 -3
  72. mindspore/dataset/transforms/transforms.py +10 -6
  73. mindspore/dataset/vision/__init__.py +1 -1
  74. mindspore/dataset/vision/py_transforms.py +8 -8
  75. mindspore/dataset/vision/transforms.py +17 -5
  76. mindspore/dataset/vision/utils.py +632 -21
  77. mindspore/dataset/vision/validators.py +1 -0
  78. mindspore/device_context/ascend/device.py +1 -1
  79. mindspore/device_context/ascend/op_tuning.py +35 -1
  80. mindspore/device_context/gpu/__init__.py +2 -2
  81. mindspore/device_context/gpu/device.py +1 -1
  82. mindspore/device_context/gpu/op_precision.py +4 -2
  83. mindspore/device_context/gpu/op_tuning.py +6 -3
  84. mindspore/device_manager.py +16 -9
  85. mindspore/dnnl.dll +0 -0
  86. mindspore/dpcmi.dll +0 -0
  87. mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +3 -4
  88. mindspore/experimental/llm_boost/atb/boost_base.py +2 -3
  89. mindspore/experimental/optim/adadelta.py +13 -20
  90. mindspore/experimental/optim/adagrad.py +15 -22
  91. mindspore/experimental/optim/adam.py +17 -24
  92. mindspore/experimental/optim/adamax.py +14 -22
  93. mindspore/experimental/optim/adamw.py +28 -34
  94. mindspore/experimental/optim/asgd.py +15 -25
  95. mindspore/experimental/optim/lr_scheduler.py +27 -45
  96. mindspore/experimental/optim/nadam.py +14 -24
  97. mindspore/experimental/optim/optimizer.py +13 -23
  98. mindspore/experimental/optim/radam.py +18 -24
  99. mindspore/experimental/optim/rmsprop.py +14 -25
  100. mindspore/experimental/optim/rprop.py +15 -26
  101. mindspore/experimental/optim/sgd.py +9 -19
  102. mindspore/hal/__init__.py +4 -4
  103. mindspore/hal/contiguous_tensors_handle.py +2 -2
  104. mindspore/hal/memory.py +27 -7
  105. mindspore/include/api/cell.h +65 -5
  106. mindspore/include/api/cfg.h +24 -7
  107. mindspore/include/api/context.h +1 -0
  108. mindspore/include/api/delegate.h +10 -2
  109. mindspore/include/api/dual_abi_helper.h +100 -19
  110. mindspore/include/api/graph.h +14 -1
  111. mindspore/include/api/kernel.h +16 -3
  112. mindspore/include/api/kernel_api.h +9 -1
  113. mindspore/include/api/metrics/accuracy.h +9 -0
  114. mindspore/include/api/model.h +8 -1
  115. mindspore/include/api/model_group.h +4 -0
  116. mindspore/include/api/model_parallel_runner.h +2 -0
  117. mindspore/include/api/status.h +48 -10
  118. mindspore/include/api/types.h +8 -3
  119. mindspore/include/c_api/model_c.h +0 -58
  120. mindspore/include/c_api/tensor_c.h +0 -26
  121. mindspore/include/dataset/constants.h +9 -0
  122. mindspore/include/dataset/vision_ascend.h +1 -1
  123. mindspore/jpeg62.dll +0 -0
  124. mindspore/mindrecord/tools/cifar10.py +61 -11
  125. mindspore/mindrecord/tools/cifar10_to_mr.py +5 -0
  126. mindspore/mindspore_backend_common.dll +0 -0
  127. mindspore/mindspore_backend_manager.dll +0 -0
  128. mindspore/mindspore_common.dll +0 -0
  129. mindspore/mindspore_core.dll +0 -0
  130. mindspore/mindspore_cpu_res_manager.dll +0 -0
  131. mindspore/mindspore_dump.dll +0 -0
  132. mindspore/mindspore_frontend.dll +0 -0
  133. mindspore/mindspore_glog.dll +0 -0
  134. mindspore/mindspore_memory_pool.dll +0 -0
  135. mindspore/mindspore_ms_backend.dll +0 -0
  136. mindspore/mindspore_ops.dll +0 -0
  137. mindspore/mindspore_ops_host.dll +0 -0
  138. mindspore/mindspore_ops_kernel_common.dll +0 -0
  139. mindspore/mindspore_profiler.dll +0 -0
  140. mindspore/mindspore_pyboost.dll +0 -0
  141. mindspore/mindspore_pynative.dll +0 -0
  142. mindspore/mindspore_res_manager.dll +0 -0
  143. mindspore/mindspore_runtime_pipeline.dll +0 -0
  144. mindspore/mint/__init__.py +6 -46
  145. mindspore/mint/distributed/__init__.py +5 -0
  146. mindspore/mint/distributed/distributed.py +429 -23
  147. mindspore/mint/nn/__init__.py +1 -1
  148. mindspore/mint/nn/functional.py +53 -6
  149. mindspore/mint/nn/layer/_functions.py +163 -294
  150. mindspore/mint/nn/layer/activation.py +8 -6
  151. mindspore/mint/nn/layer/conv.py +140 -104
  152. mindspore/mint/nn/layer/normalization.py +11 -25
  153. mindspore/mint/optim/adam.py +19 -18
  154. mindspore/mint/optim/adamw.py +14 -8
  155. mindspore/mint/optim/sgd.py +5 -5
  156. mindspore/msobj140.dll +0 -0
  157. mindspore/mspdb140.dll +0 -0
  158. mindspore/mspdbcore.dll +0 -0
  159. mindspore/mspdbst.dll +0 -0
  160. mindspore/mspft140.dll +0 -0
  161. mindspore/msvcdis140.dll +0 -0
  162. mindspore/msvcp140_1.dll +0 -0
  163. mindspore/msvcp140_2.dll +0 -0
  164. mindspore/msvcp140_atomic_wait.dll +0 -0
  165. mindspore/msvcp140_codecvt_ids.dll +0 -0
  166. mindspore/nn/cell.py +491 -623
  167. mindspore/nn/grad/cell_grad.py +11 -12
  168. mindspore/nn/layer/activation.py +36 -36
  169. mindspore/nn/layer/basic.py +74 -77
  170. mindspore/nn/layer/channel_shuffle.py +4 -4
  171. mindspore/nn/layer/combined.py +4 -2
  172. mindspore/nn/layer/conv.py +117 -110
  173. mindspore/nn/layer/dense.py +9 -7
  174. mindspore/nn/layer/embedding.py +50 -52
  175. mindspore/nn/layer/image.py +38 -40
  176. mindspore/nn/layer/math.py +111 -112
  177. mindspore/nn/layer/normalization.py +56 -44
  178. mindspore/nn/layer/pooling.py +58 -63
  179. mindspore/nn/layer/rnn_cells.py +33 -33
  180. mindspore/nn/layer/rnns.py +56 -56
  181. mindspore/nn/layer/thor_layer.py +74 -73
  182. mindspore/nn/layer/transformer.py +11 -1
  183. mindspore/nn/learning_rate_schedule.py +20 -20
  184. mindspore/nn/loss/loss.py +79 -81
  185. mindspore/nn/optim/adam.py +4 -6
  186. mindspore/nn/optim/adasum.py +2 -2
  187. mindspore/nn/optim/asgd.py +2 -0
  188. mindspore/nn/optim/lamb.py +1 -3
  189. mindspore/nn/optim/optimizer.py +1 -1
  190. mindspore/nn/optim/tft_wrapper.py +2 -3
  191. mindspore/nn/optim/thor.py +2 -2
  192. mindspore/nn/probability/distribution/_utils/utils.py +2 -2
  193. mindspore/nn/probability/distribution/exponential.py +2 -1
  194. mindspore/nn/probability/distribution/poisson.py +2 -1
  195. mindspore/nn/sparse/sparse.py +3 -3
  196. mindspore/nn/wrap/cell_wrapper.py +73 -42
  197. mindspore/nn/wrap/grad_reducer.py +37 -52
  198. mindspore/nn/wrap/loss_scale.py +72 -74
  199. mindspore/numpy/array_creations.py +7 -7
  200. mindspore/numpy/fft.py +1 -1
  201. mindspore/numpy/math_ops.py +5 -5
  202. mindspore/numpy/utils_const.py +1 -1
  203. mindspore/opencv_core452.dll +0 -0
  204. mindspore/opencv_imgcodecs452.dll +0 -0
  205. mindspore/opencv_imgproc452.dll +0 -0
  206. mindspore/ops/_grad_experimental/grad_comm_ops.py +51 -13
  207. mindspore/ops/_grad_experimental/grad_debug_ops.py +14 -0
  208. mindspore/ops/_grad_experimental/grad_inner_ops.py +0 -9
  209. mindspore/ops/_op_impl/cpu/__init__.py +1 -0
  210. mindspore/{experimental/es/__init__.py → ops/_op_impl/cpu/joinedstr_op.py} +12 -6
  211. mindspore/ops/_vmap/vmap_array_ops.py +31 -13
  212. mindspore/ops/_vmap/vmap_nn_ops.py +8 -16
  213. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +54 -13
  214. mindspore/ops/auto_generate/gen_extend_func.py +27 -145
  215. mindspore/ops/auto_generate/gen_ops_def.py +1027 -347
  216. mindspore/ops/auto_generate/gen_ops_prim.py +2341 -1117
  217. mindspore/ops/auto_generate/pyboost_inner_prim.py +31 -1
  218. mindspore/ops/composite/__init__.py +10 -0
  219. mindspore/ops/composite/base.py +9 -5
  220. mindspore/ops/composite/multitype_ops/__init__.py +12 -1
  221. mindspore/ops/composite/multitype_ops/_compile_utils.py +133 -109
  222. mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
  223. mindspore/ops/composite/multitype_ops/add_impl.py +70 -2
  224. mindspore/ops/composite/multitype_ops/div_impl.py +49 -0
  225. mindspore/ops/composite/multitype_ops/floordiv_impl.py +29 -0
  226. mindspore/ops/composite/multitype_ops/getitem_impl.py +11 -0
  227. mindspore/ops/composite/multitype_ops/mod_impl.py +5 -3
  228. mindspore/ops/composite/multitype_ops/mul_impl.py +49 -0
  229. mindspore/ops/composite/multitype_ops/setitem_impl.py +57 -0
  230. mindspore/ops/composite/multitype_ops/sub_impl.py +34 -0
  231. mindspore/ops/composite/multitype_ops/zeros_like_impl.py +14 -0
  232. mindspore/ops/function/__init__.py +4 -1
  233. mindspore/ops/function/_add_attr_func.py +11 -6
  234. mindspore/ops/function/array_func.py +19 -102
  235. mindspore/ops/function/debug_func.py +8 -5
  236. mindspore/ops/function/grad/grad_func.py +5 -13
  237. mindspore/ops/function/math_func.py +77 -572
  238. mindspore/ops/function/nn_func.py +46 -94
  239. mindspore/ops/function/other_func.py +4 -1
  240. mindspore/ops/function/random_func.py +44 -5
  241. mindspore/ops/function/vmap_func.py +2 -1
  242. mindspore/ops/functional.py +4 -4
  243. mindspore/ops/functional_overload.py +594 -18
  244. mindspore/ops/op_info_register.py +21 -0
  245. mindspore/ops/operations/__init__.py +16 -11
  246. mindspore/ops/operations/_custom_ops_utils.py +689 -34
  247. mindspore/ops/operations/_inner_ops.py +14 -18
  248. mindspore/ops/operations/_sequence_ops.py +1 -1
  249. mindspore/ops/operations/array_ops.py +5 -51
  250. mindspore/ops/operations/comm_ops.py +186 -41
  251. mindspore/ops/operations/custom_ops.py +303 -177
  252. mindspore/ops/operations/debug_ops.py +59 -4
  253. mindspore/ops/operations/image_ops.py +13 -13
  254. mindspore/ops/operations/manually_defined/ops_def.py +27 -28
  255. mindspore/ops/operations/math_ops.py +8 -9
  256. mindspore/ops/operations/nn_ops.py +8 -40
  257. mindspore/ops/primitive.py +9 -20
  258. mindspore/ops/tensor_method.py +63 -15
  259. mindspore/ops_generate/api/cpp_create_prim_instance_helper_generator.py +1 -1
  260. mindspore/ops_generate/api/functional_map_cpp_generator.py +10 -9
  261. mindspore/ops_generate/api/functions_cc_generator.py +58 -10
  262. mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +1 -1
  263. mindspore/ops_generate/common/base_generator.py +14 -0
  264. mindspore/ops_generate/common/gen_constants.py +8 -3
  265. mindspore/ops_generate/common/gen_utils.py +0 -19
  266. mindspore/ops_generate/common/op_proto.py +11 -4
  267. mindspore/ops_generate/common/template.py +88 -11
  268. mindspore/ops_generate/gen_ops.py +1 -1
  269. mindspore/ops_generate/op_def/lite_ops_cpp_generator.py +4 -4
  270. mindspore/ops_generate/op_def/ops_def_cc_generator.py +0 -3
  271. mindspore/ops_generate/op_def/ops_name_h_generator.py +0 -3
  272. mindspore/ops_generate/op_def/ops_primitive_h_generator.py +0 -4
  273. mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -2
  274. mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +49 -8
  275. mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +2 -2
  276. mindspore/ops_generate/pyboost/gen_pyboost_func.py +31 -16
  277. mindspore/ops_generate/pyboost/op_template_parser.py +98 -72
  278. mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +70 -273
  279. mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +14 -6
  280. mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +316 -0
  281. mindspore/ops_generate/pyboost/pyboost_functions_py_generator.py +1 -1
  282. mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +5 -3
  283. mindspore/ops_generate/pyboost/pyboost_inner_prim_generator.py +1 -1
  284. mindspore/ops_generate/pyboost/pyboost_internal_functions_cpp_generator.py +76 -0
  285. mindspore/ops_generate/pyboost/pyboost_internal_functions_h_generator.py +76 -0
  286. mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +125 -0
  287. mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +4 -3
  288. mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +348 -61
  289. mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +1 -1
  290. mindspore/ops_generate/pyboost/pyboost_utils.py +118 -9
  291. mindspore/ops_generate/tensor_py_cc_generator.py +1 -24
  292. mindspore/parallel/_auto_parallel_context.py +16 -23
  293. mindspore/parallel/_cell_wrapper.py +113 -45
  294. mindspore/parallel/_parallel_serialization.py +4 -3
  295. mindspore/parallel/_ps_context.py +4 -6
  296. mindspore/parallel/_tensor.py +167 -12
  297. mindspore/parallel/_transformer/moe.py +1 -1
  298. mindspore/parallel/_transformer/transformer.py +17 -12
  299. mindspore/parallel/_utils.py +5 -11
  300. mindspore/parallel/auto_parallel.py +35 -14
  301. mindspore/parallel/checkpoint_convert.py +3 -3
  302. mindspore/parallel/checkpoint_transform.py +13 -7
  303. mindspore/parallel/cluster/process_entity/_api.py +88 -49
  304. mindspore/parallel/cluster/process_entity/_utils.py +95 -7
  305. mindspore/parallel/cluster/run.py +48 -7
  306. mindspore/parallel/function/__init__.py +8 -1
  307. mindspore/parallel/function/reshard_func.py +12 -12
  308. mindspore/parallel/nn/__init__.py +15 -2
  309. mindspore/parallel/nn/parallel_cell_wrapper.py +50 -14
  310. mindspore/parallel/nn/parallel_grad_reducer.py +7 -14
  311. mindspore/parallel/shard.py +10 -25
  312. mindspore/parallel/transform_safetensors.py +469 -174
  313. mindspore/pgodb140.dll +0 -0
  314. mindspore/pgort140.dll +0 -0
  315. mindspore/profiler/__init__.py +2 -1
  316. mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +7 -7
  317. mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +3 -0
  318. mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +12 -6
  319. mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +3 -3
  320. mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +3 -3
  321. mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +4 -4
  322. mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +3 -3
  323. mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +4 -1
  324. mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +2 -1
  325. mindspore/profiler/analysis/task_manager.py +1 -1
  326. mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +5 -1
  327. mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +2 -1
  328. mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +10 -9
  329. mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +43 -23
  330. mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +3 -2
  331. mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +9 -5
  332. mindspore/profiler/analysis/viewer/ms_operator_details_viewer.py +132 -0
  333. mindspore/profiler/common/constant.py +16 -0
  334. mindspore/profiler/common/msprof_cmd_tool.py +2 -2
  335. mindspore/profiler/common/path_manager.py +9 -0
  336. mindspore/profiler/common/profiler_context.py +50 -29
  337. mindspore/profiler/common/profiler_info.py +0 -16
  338. mindspore/profiler/common/profiler_meta_data.py +1 -0
  339. mindspore/profiler/common/profiler_op_analyse.py +239 -0
  340. mindspore/profiler/common/profiler_output_path.py +23 -8
  341. mindspore/profiler/common/profiler_parameters.py +128 -35
  342. mindspore/profiler/dynamic_profile/__init__.py +0 -0
  343. mindspore/profiler/dynamic_profile/dynamic_monitor_proxy.py +39 -0
  344. mindspore/profiler/dynamic_profile/dynamic_profiler_config_context.py +666 -0
  345. mindspore/profiler/dynamic_profile/dynamic_profiler_utils.py +62 -0
  346. mindspore/profiler/dynamic_profiler.py +374 -338
  347. mindspore/profiler/envprofiler.py +42 -12
  348. mindspore/profiler/experimental_config.py +112 -7
  349. mindspore/profiler/mstx.py +33 -12
  350. mindspore/profiler/platform/__init__.py +2 -3
  351. mindspore/profiler/platform/cpu_profiler.py +10 -4
  352. mindspore/profiler/platform/npu_profiler.py +30 -20
  353. mindspore/profiler/profiler.py +218 -154
  354. mindspore/profiler/profiler_action_controller.py +65 -77
  355. mindspore/profiler/profiler_interface.py +2 -2
  356. mindspore/profiler/schedule.py +10 -4
  357. mindspore/rewrite/common/config.py +1 -0
  358. mindspore/rewrite/common/namer.py +1 -0
  359. mindspore/rewrite/common/namespace.py +1 -0
  360. mindspore/rewrite/node/node.py +31 -11
  361. mindspore/rewrite/parsers/assign_parser.py +1 -1
  362. mindspore/rewrite/symbol_tree/symbol_tree.py +2 -2
  363. mindspore/run_check/_check_version.py +7 -10
  364. mindspore/runtime/__init__.py +8 -6
  365. mindspore/runtime/event.py +10 -4
  366. mindspore/runtime/executor.py +87 -45
  367. mindspore/runtime/memory.py +31 -32
  368. mindspore/runtime/thread_bind_core.py +299 -165
  369. mindspore/safeguard/rewrite_obfuscation.py +12 -13
  370. mindspore/swresample-4.dll +0 -0
  371. mindspore/swscale-6.dll +0 -0
  372. mindspore/tbbmalloc.dll +0 -0
  373. mindspore/tinyxml2.dll +0 -0
  374. mindspore/train/_utils.py +17 -7
  375. mindspore/train/amp.py +43 -23
  376. mindspore/train/callback/__init__.py +5 -5
  377. mindspore/train/callback/_callback.py +2 -1
  378. mindspore/train/callback/_checkpoint.py +4 -14
  379. mindspore/train/callback/_flops_collector.py +11 -7
  380. mindspore/train/callback/_landscape.py +0 -1
  381. mindspore/train/callback/_train_fault_tolerance.py +98 -21
  382. mindspore/train/data_sink.py +15 -6
  383. mindspore/train/dataset_helper.py +14 -5
  384. mindspore/train/model.py +133 -69
  385. mindspore/train/serialization.py +168 -126
  386. mindspore/train/summary/summary_record.py +13 -2
  387. mindspore/train/train_thor/model_thor.py +2 -2
  388. mindspore/turbojpeg.dll +0 -0
  389. mindspore/utils/__init__.py +3 -2
  390. mindspore/utils/dryrun.py +0 -6
  391. mindspore/utils/runtime_execution_order_check.py +163 -77
  392. mindspore/utils/sdc_detect.py +68 -0
  393. mindspore/utils/utils.py +14 -17
  394. mindspore/vcmeta.dll +0 -0
  395. mindspore/vcruntime140.dll +0 -0
  396. mindspore/vcruntime140_1.dll +0 -0
  397. mindspore/version.py +1 -1
  398. {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0.dist-info}/METADATA +5 -4
  399. {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0.dist-info}/RECORD +403 -442
  400. mindspore/_deprecated/jit.py +0 -198
  401. mindspore/_extends/remote/kernel_build_server_ascend.py +0 -75
  402. mindspore/communication/_hccl_management.py +0 -297
  403. mindspore/experimental/es/embedding_service.py +0 -891
  404. mindspore/experimental/es/embedding_service_layer.py +0 -581
  405. mindspore/profiler/common/validator/__init__.py +0 -14
  406. mindspore/profiler/common/validator/validate_path.py +0 -84
  407. mindspore/profiler/parser/__init__.py +0 -14
  408. mindspore/profiler/parser/aicpu_data_parser.py +0 -272
  409. mindspore/profiler/parser/ascend_analysis/__init__.py +0 -14
  410. mindspore/profiler/parser/ascend_analysis/constant.py +0 -71
  411. mindspore/profiler/parser/ascend_analysis/file_manager.py +0 -180
  412. mindspore/profiler/parser/ascend_analysis/function_event.py +0 -185
  413. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +0 -136
  414. mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +0 -131
  415. mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +0 -104
  416. mindspore/profiler/parser/ascend_analysis/path_manager.py +0 -313
  417. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +0 -123
  418. mindspore/profiler/parser/ascend_analysis/tlv_decoder.py +0 -86
  419. mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +0 -75
  420. mindspore/profiler/parser/ascend_cluster_generator.py +0 -116
  421. mindspore/profiler/parser/ascend_communicate_generator.py +0 -314
  422. mindspore/profiler/parser/ascend_flops_generator.py +0 -116
  423. mindspore/profiler/parser/ascend_fpbp_generator.py +0 -82
  424. mindspore/profiler/parser/ascend_hccl_generator.py +0 -271
  425. mindspore/profiler/parser/ascend_integrate_generator.py +0 -42
  426. mindspore/profiler/parser/ascend_memory_generator.py +0 -185
  427. mindspore/profiler/parser/ascend_msprof_exporter.py +0 -282
  428. mindspore/profiler/parser/ascend_msprof_generator.py +0 -187
  429. mindspore/profiler/parser/ascend_op_generator.py +0 -334
  430. mindspore/profiler/parser/ascend_steptrace_generator.py +0 -94
  431. mindspore/profiler/parser/ascend_timeline_generator.py +0 -545
  432. mindspore/profiler/parser/base_timeline_generator.py +0 -483
  433. mindspore/profiler/parser/container.py +0 -229
  434. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +0 -697
  435. mindspore/profiler/parser/flops_parser.py +0 -531
  436. mindspore/profiler/parser/framework_enum.py +0 -111
  437. mindspore/profiler/parser/framework_parser.py +0 -464
  438. mindspore/profiler/parser/framework_struct.py +0 -61
  439. mindspore/profiler/parser/gpu_analysis/__init__.py +0 -14
  440. mindspore/profiler/parser/gpu_analysis/function_event.py +0 -44
  441. mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +0 -89
  442. mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +0 -72
  443. mindspore/profiler/parser/hccl_parser.py +0 -573
  444. mindspore/profiler/parser/hwts_log_parser.py +0 -122
  445. mindspore/profiler/parser/integrator.py +0 -526
  446. mindspore/profiler/parser/memory_usage_parser.py +0 -277
  447. mindspore/profiler/parser/minddata_analyzer.py +0 -800
  448. mindspore/profiler/parser/minddata_parser.py +0 -186
  449. mindspore/profiler/parser/minddata_pipeline_parser.py +0 -299
  450. mindspore/profiler/parser/op_intermediate_parser.py +0 -149
  451. mindspore/profiler/parser/optime_parser.py +0 -250
  452. mindspore/profiler/parser/profiler_info.py +0 -213
  453. mindspore/profiler/parser/step_trace_parser.py +0 -666
  454. mindspore/utils/hooks.py +0 -81
  455. /mindspore/common/{_auto_dynamic.py → dynamic_shape/_auto_dynamic.py} +0 -0
  456. {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0.dist-info}/WHEEL +0 -0
  457. {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0.dist-info}/entry_points.txt +0 -0
  458. {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0.dist-info}/top_level.txt +0 -0
@@ -582,6 +582,8 @@ def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
         The number of multiprocess settings is related to the size of the host, and it is not recommended to set it
         too large, otherwise it may cause freezing.
 
+        This function does not support converting remove_redundancy's checkpoint file.
+
     Args:
         src_checkpoints_dir (str): The source checkpoints directory.
         dst_checkpoints_dir (str): The destination checkpoints directory to save the converted checkpoints.
@@ -924,8 +926,8 @@ def set_op_strategy_config(mode="SAVE", path=""):
 
 def build_searched_strategy(strategy_filename):
     """
-    Extract the sharding strategy for each parameter in the network
-    from the strategy file for distributed inference scenarios.
+    Extract the sharding strategy for each parameter in the network from the strategy file
+    for distributed inference scenarios.
 
     Args:
         strategy_filename (str): Name of strategy file.
@@ -1025,8 +1027,10 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
         >>> from mindspore.parallel.auto_parallel import AutoParallel
         >>> from mindspore.nn.utils import no_init_parameters
         >>> from mindspore.common.initializer import initializer, One
+        >>> from mindspore.communication.management import get_group_size
         >>>
         >>> step_per_epoch = 4
+        >>> device_num = get_group_size()
         >>>
         >>> # Define the network structure.
         >>> class Net(nn.Cell):
@@ -1070,7 +1074,7 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
         ...     network = AutoParallel(network, parallel_mode="semi_auto")
         ...     network.save_param_strategy_file(file_path="./train_strategy.ckpt")
         ...     model = ms.Model(network=network, loss_fn=net_loss, optimizer=net_opt)
-        ...     ckpt_config = train.CheckpointConfig(keep_checkpoint_max=1, integrated_save=False)
+        ...     ckpt_config = train.CheckpointConfig(keep_checkpoint_max=1, integrated_save=True)
         ...     global_rank_id = int(os.getenv("RANK_ID"))
         ...     ckpt_path = "./rank_{}_ckpt".format(global_rank_id)
         ...     ckpt_callback = train.ModelCheckpoint(prefix="parallel", directory=ckpt_path, config=ckpt_config)
@@ -1096,10 +1100,10 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
         >>>
         >>> train_net()
         >>> load_model()
-        [[-7.3259363 -7.497216 -7.398196 ... -7.374962 -7.204874 -7.234935 ]
-        [ 3.362938 3.3535435 3.3832688 ... 3.4263954 3.279045 3.3202887]
+        [[-9.62929535e+00, -9.76258755e+00, -9.70192051e+00 ... -9.67151260e+00, -9.71998310e+00, -9.64571190e+00],
+        [-4.63218540e-01, -4.07317460e-01, -3.78161550e-01 ... -3.95918339e-01, -2.87363172e-01, -3.48693460e-01],
         ...
-        [ 1.6067538 1.6244187 1.5384722 ... 1.5449994 1.6195512 1.6176052]]
+        [-4.28075647e+00, -4.36630344e+00, -4.25664043e+00 ... -4.32012939e+00, -4.30337954e+00, -4.27571440e+00]]
     """
     if format not in ['safetensors', 'ckpt'] or output_format not in ['safetensors', 'ckpt']:
         raise ValueError(
@@ -1161,6 +1165,8 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
         train_strategy_filename = ms.context.get_auto_parallel_context("strategy_ckpt_load_file")
 
     _train_strategy = build_searched_strategy(train_strategy_filename)
+    if not _train_strategy:
+        return True
     train_strategy = _convert_to_list(_train_strategy)
 
     train_dev_count = 1
@@ -1185,7 +1191,7 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
     param_not_in_ckpt = []
     for _, param in network.parameters_and_names():
         sliced_params = []
-        if param.name not in rank_list.keys():
+        if param.name not in rank_list:
            param_not_in_strategy.append(param.name)
            continue
        if param.name not in param_total_dict:
@@ -22,7 +22,8 @@ import socket
 import psutil
 import mindspore.log as logger
 from ._utils import _generate_cmd_args_list, _generate_cmd_args_list_with_core, _generate_url, \
-    _is_local_ip, _convert_addr_to_ip, _send_scale_num, _get_local_ip
+    _is_local_ip, _convert_addr_to_ip, _send_scale_num, _get_local_ip, _generate_auto_bind_core_strategy, \
+    _generate_bind_core_strategy
 
 
 class _Node:
@@ -79,11 +80,12 @@ class _ComputeGraphNode(_Node):
     Worker node for dynamic networking. Inherits from the Node class.
     """
 
-    def __init__(self, worker_num, sched_host, sched_port, timeout, node_id, args_list, output_file,
+    def __init__(self, worker_num, sched_host, sched_port, timeout, node_id, node_rank, args_list, output_file,
                  tail_worker_log, join, is_simulation):
         super().__init__(worker_num, sched_host, sched_port, timeout, args_list, output_file,
                          tail_worker_log, join, is_simulation)
         self.node_id = node_id
+        self.node_rank = node_rank
 
     def run(self):
         """
@@ -95,6 +97,8 @@ class _ComputeGraphNode(_Node):
         super().run()
         if self.node_id is not None:
             os.environ["MS_NODE_ID"] = str(self.node_id)
+        if self.node_rank is not None:
+            os.environ["MS_NODE_RANK"] = str(self.node_rank)
         # If simulation level is set, environment variable 'MS_ROLE' will not be set.
         if not self.is_simulation:
             os.environ["MS_ROLE"] = "MS_WORKER"
@@ -119,6 +123,9 @@ class _ComputeGraphNode(_Node):
         return subprocess.Popen(['/usr/bin/tail', '-f', self.output_file])
 
     def enable_tail_worker_log(self):
+        """
+        Get valid rank ID for tailing the corresponding worker log.
+        """
         tail_worker_log_list = []
         if self.tail_worker_log != "-1":
             tail_worker_log_list.extend([int(num) for num in self.tail_worker_log.split(',')])
@@ -169,12 +176,15 @@ class _ProcessManager:
 
         self.sim_level = args.sim_level
         self.sim_rank_id = args.sim_rank_id
-        self.is_simulation = (self.sim_level != -1)
+        self.is_simulation = self.sim_level != -1
         if self.is_simulation:
             os.environ["MS_SIMULATION_LEVEL"] = str(self.sim_level)
+            if self.sim_rank_id == -1:
+                self.sim_rank_id = int(os.getenv("RANK_ID", "-1"))
         elif os.getenv("MS_SIMULATION_LEVEL"):
             self.is_simulation = True
-            self.sim_rank_id = int(os.getenv("RANK_ID", "-1"))
+            if self.sim_rank_id == -1:
+                self.sim_rank_id = int(os.getenv("RANK_ID", "-1"))
         if os.getenv("RANK_SIZE"):
             self.exported_rank_size = os.getenv("RANK_SIZE")
         # If sim_rank_id is set, single worker can be started.
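The two added branches implement one rule: an explicit --sim_rank_id always wins, and the RANK_ID environment variable is consulted only when the argument is left at its default of -1, whichever way simulation mode was enabled. A minimal standalone sketch of that precedence (the helper name is ours, for illustration only):

    import os

    def resolve_sim_rank_id(cli_sim_rank_id):
        # Mirrors the fallback above: an explicit --sim_rank_id wins;
        # RANK_ID from the environment is used only when it is still -1.
        if cli_sim_rank_id != -1:
            return cli_sim_rank_id
        return int(os.getenv("RANK_ID", "-1"))

    print(resolve_sim_rank_id(3))   # explicit argument wins -> 3
    os.environ["RANK_ID"] = "7"
    print(resolve_sim_rank_id(-1))  # falls back to the environment -> 7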
@@ -205,15 +215,24 @@ class _ProcessManager:
         finally:
             os.umask(origin_mask)
 
+        self.device_to_cpu_map = {}
+        if self.bind_core is True:
+            self.device_to_cpu_map = _generate_auto_bind_core_strategy(self.local_worker_num)
+
         self.proc_rank_map = {}
         self.enable_mindx = False
+        self._check_taskd()
+
+    def _check_taskd(self):
+        """check if enable taskd."""
         tft_env = os.getenv("MS_ENABLE_TFT", "")
-        if ("TTP:1" in tft_env) or ("UCE:1" in tft_env) or ("ARF:1" in tft_env):
+        if any(v in tft_env for v in ('TTP:1', 'UCE:1', 'ARF:1', 'TSP:1', 'RSC:1', 'HCCE:1')):
             try:
                 from taskd.python.framework.agent.ms_mgr.msrun_plugin import MSRunPlugin
                 self.msmgr = MSRunPlugin()
                 self.msmgr.register_callbacks("KILL_WORKER", self.kill_workers)
                 self.msmgr.register_callbacks("START_ALL_WORKER", self.start_all_workers)
+                self.msmgr.register_callbacks("START_WORKER_LIST", self.start_worker_list)
                 self.msmgr.register_callbacks("MONITOR", self.monitor_rank_status)
                 self.enable_mindx = True
                 os.environ["MS_ENABLE_RECOVERY"] = str(1)
@@ -261,6 +280,45 @@ class _ProcessManager:
                               self.is_simulation)
         self.msn_process = msn.run()
 
+    def _start_single_worker(self, local_rank):
+        """
+        Start worker processor
+
+        Args:
+            local_rank: local rank id.
+        """
+        os.environ["DEVICE_ID"] = str(local_rank)
+        node_id, log_name = self._get_node_id_and_log_path(local_rank)
+        if node_id is None:
+            logger.warning(f"Rank ids will be assigned automatically, "
+                           "please use 'grep -rn 'rank id:' command to check each worker log's rank id.")
+        else:
+            # If node_id is generated in '_get_node_id_and_log_path' method, export 'RANK_ID' environment variable.
+            # This is for rank_table method's compatibility consideration.
+            os.environ["RANK_ID"] = str(node_id)
+            print(f"Start worker process with rank id:{node_id}, log file:{log_name}. "
+                  f"Environment variable [RANK_ID={node_id}] is exported.", flush=True)
+        if self.is_simulation and (self.sim_rank_id != -1):
+            # Reset RANK_ID env to sim_rank_id if sim_rank_id is set.
+            os.environ["RANK_ID"] = str(self.sim_rank_id)
+            logger.warning(f"In dryrun case, RANK_ID is assigned to {self.sim_rank_id}.")
+
+        if self.bind_core:
+            affinity_cpu_str = _generate_bind_core_strategy(local_rank, self.device_to_cpu_map, self.bind_core)
+            if affinity_cpu_str is not None:
+                cmd = _generate_cmd_args_list_with_core(self.cmd, self.cmd_args, affinity_cpu_str)
+            else:
+                cmd = _generate_cmd_args_list(self.cmd, self.cmd_args)
+        else:
+            cmd = _generate_cmd_args_list(self.cmd, self.cmd_args)
+        cgn = _ComputeGraphNode(self.worker_num, self.master_addr, self.master_port, self.cluster_time_out,
+                                node_id, self.node_rank, cmd, log_name, self.tail_worker_log, self.join,
+                                self.is_simulation)
+        process, tail_process = cgn.run()
+        self.cgn_processes.append(process)
+        self.tail_cgn_processes.append(tail_process)
+        self.proc_rank_map[local_rank] = process
+
     def start_workers(self):
         """
         Starts the worker nodes.
@@ -275,40 +333,8 @@ class _ProcessManager:
                            "'rank_id' of each process will be assigned after cluster is successfully built.\n"
                            "You can access 'RANK_ID' environment variable after calling "
                            "'mindspore.communication.init()'")
-
         for i in range(self.local_worker_num):
-            os.environ["DEVICE_ID"] = str(i)
-            node_id, log_name = self._get_node_id_and_log_path(i)
-            if node_id is None:
-                logger.warning(f"Rank ids will be assigned automatically, "
-                               "please use 'grep -rn 'rank id:' command to check each worker log's rank id.")
-            else:
-                # If node_id is generated in '_get_node_id_and_log_path' method, export 'RANK_ID' environment variable.
-                # This is for rank_table method's compatibility consideration.
-                os.environ["RANK_ID"] = str(node_id)
-                print(f"Start worker process with rank id:{node_id}, log file:{log_name}. "
-                      f"Environment variable [RANK_ID={node_id}] is exported.", flush=True)
-            if self.is_simulation and (self.sim_rank_id != -1):
-                # Reset RANK_ID env to sim_rank_id if sim_rank_id is set.
-                os.environ["RANK_ID"] = str(self.sim_rank_id)
-                logger.warning(f"In dryrun case, RANK_ID is assigned to {self.sim_rank_id}.")
-
-            if self.bind_core:
-                cpu_num = subprocess.getoutput("cat /proc/cpuinfo|grep processor|wc -l")
-                if not cpu_num.isdigit():
-                    raise RuntimeError(f"Got cpu number from '/proc/cpuinfo' is {cpu_num}, failed to bind core.")
-                avg = int(cpu_num) // self.local_worker_num
-                cpu_start = avg * i
-                cpu_end = cpu_start + avg - 1
-                cmd = _generate_cmd_args_list_with_core(self.cmd, self.cmd_args, cpu_start, cpu_end)
-            else:
-                cmd = _generate_cmd_args_list(self.cmd, self.cmd_args)
-            cgn = _ComputeGraphNode(self.worker_num, self.master_addr, self.master_port, self.cluster_time_out,
-                                    node_id, cmd, log_name, self.tail_worker_log, self.join, self.is_simulation)
-            process, tail_process = cgn.run()
-            self.cgn_processes.append(process)
-            self.tail_cgn_processes.append(tail_process)
-            self.proc_rank_map[i] = process
+            self._start_single_worker(i)
 
     def join_processes(self):
         """
@@ -334,7 +360,7 @@ class _ProcessManager:
                     continue
                 elif ret_code != 0:
                     has_exception = True
-                    logger.error(f"Worker process {p.pid} exit with exception.")
+                    logger.error(f"Worker process {p.pid} exit with exception. Error code: {ret_code}.")
                     break
             else:
                 success_cgn_processes.add(p)
@@ -420,14 +446,9 @@ class _ProcessManager:
         Args:
             NA.
         """
-        for p in self.cgn_processes:
-            if p.poll() is None:
-                p.kill()
+        self.kill_worker_processes()
+        self.kill_tail_log_processes()
         self.cgn_processes.clear()
-
-        for p in self.tail_cgn_processes:
-            if p is not None:
-                p.kill()
         self.tail_cgn_processes.clear()
 
     def kill_single_worker(self, pid):
@@ -441,7 +462,7 @@ class _ProcessManager:
         for i in range(len(self.cgn_processes)):
             p = self.cgn_processes[i]
             if p.pid == pid and p.poll() is None:
-                p.kill()
+                os.killpg(os.getpgid(p.pid), signal.SIGKILL)
                 del self.cgn_processes[i]
                 tail_p = self.tail_cgn_processes[i]
                 if tail_p is not None:
@@ -499,7 +520,8 @@ class _ProcessManager:
             p_status = p.poll()
             if (not psutil.pid_exists(p.pid)) and (p_status != 0):
                 p_status = 300
-            return {"pid": p.pid, "status": p_status, "global_rank": global_rank_id}
+            return {"pid": p.pid, "status": p_status, "global_rank": global_rank_id, "local_rank": rank_id,
+                    "node_id": self.node_rank}
         except KeyError:
             logger.info(f"Process rank {rank_id} has not been initialized.")
             return {"pid": None, "status": 200, "global_rank": global_rank_id}
@@ -519,7 +541,24 @@ class _ProcessManager:
         self.start_workers()
         worker_status = self.monitor_rank_status([-1])
         for i in range(self.local_worker_num):
-            if worker_status[i]["status"] != None: # pylint: disable=singleton-comparison
+            if worker_status[i]["status"] is not None:
+                return 1
+        return 0
+
+    def start_worker_list(self, rank_ids):
+        """
+        Start worker processor by rank list.
+
+        Args:
+            rank_ids: worker process's local rank list, which is also device_id.
+        """
+        if not isinstance(rank_ids, list):
+            raise TypeError(f"The type of 'rank_ids' must be a list, but got:{rank_ids}")
+        for idx in rank_ids:
+            self._start_single_worker(idx)
+        worker_status = self.monitor_rank_status(rank_ids)
+        for i in rank_ids:
+            if worker_status[i]["status"] is not None:
                 return 1
         return 0
 
@@ -18,6 +18,8 @@ import json
 import socket
 import ipaddress
 import mindspore.log as logger
+from mindspore.runtime.thread_bind_core import _get_physical_device_id, _get_cpu_available, \
+    _auto_generate_strategy, _equal_distribution_strategy
 
 CURRENT_IP = None
 
@@ -45,19 +47,19 @@ def _generate_cmd_args_list(cmd, cmd_args):
     return [cmd] + cmd_args
 
 
-def _generate_cmd_args_list_with_core(cmd, cmd_args, cpu_start, cpu_end):
+def _generate_cmd_args_list_with_core(cmd, cmd_args, affinity_cpu_str):
     """
     Generates arguments list for 'Popen'. It consists of a binary file name and subsequential arguments.
     """
     # Bind cpu cores to this process.
-    taskset_args = ['taskset'] + ['-c'] + [str(cpu_start) + '-' + str(cpu_end)]
+    taskset_args = ['taskset'] + ['-c'] + [affinity_cpu_str]
     final_cmd = []
     if cmd not in ['python', 'pytest', 'python3']:
         # If user don't set binary file name, defaulty use 'python' to launch the job.
         final_cmd = taskset_args + ['python'] + [cmd] + cmd_args
     else:
         final_cmd = taskset_args + [cmd] + cmd_args
-    logger.info(f"Launch process with command: {' '.join(final_cmd)}")
+    logger.warning(f"Launch process with command: {' '.join(final_cmd)}")
     return final_cmd
 
 
@@ -83,8 +85,8 @@ def _get_local_ip(ip_address):
         CURRENT_IP = s.getsockname()[0]
         s.close()
     except Exception as e:
-        raise RuntimeError(f"Get local ip failed: {e}. Please check whether an accessible address "
-                           "is input by '--master_address'.")
+        raise RuntimeError("Get local ip has failed. Please verify that the accessible address has been "
+                           "specified in the '--master_address' parameter") from e
     return CURRENT_IP
 
 
@@ -124,8 +126,8 @@ def _convert_addr_to_ip(master_addr):
         logger.info(f"Convert input host name:{master_addr} to ip address:{ip_address}.")
         return ip_address
     except socket.gaierror as e:
-        raise RuntimeError(f"DNS resolution failed: {e}. Please check whether a correct host name "
-                           "is input by '--master_address'.")
+        raise RuntimeError("DNS resolution has failed. Please verify that the correct hostname has been "
+                           "specified in the '--master_address' parameter") from e
 
 
 def _send_scale_num(url, scale_num):
@@ -134,3 +136,89 @@ def _send_scale_num(url, scale_num):
 
     """
     return ""
+
+
+def _parse_global_device_to_cpu_map(local_rank_id, physical_device_id, device_to_cpu_map):
+    """
+    Parse the global device_to_cpu_map and return a cpu list for assigned local_rank_id.
+
+    """
+    input_device_id = int(list(device_to_cpu_map.keys())[local_rank_id].replace("device", ""))
+    if physical_device_id != input_device_id:
+        return ""
+    affinity_cpu_list = list(device_to_cpu_map.values())[local_rank_id]
+    affinity_cpu_str = ",".join(affinity_cpu_list)
+    return affinity_cpu_str
+
+
+def _generate_auto_bind_core_strategy(local_worker_num):
+    """
+    Get device to core range assigned for the all processes.
+
+    """
+    simulation_level = os.getenv("MS_SIMULATION_LEVEL", "").strip()
+
+    try:
+        available_cpus = _get_cpu_available()
+    except RuntimeError as e:
+        logger.warning(f"Failed to acquire available cpu info, error: {e} Will not launch process with taskset.")
+        return {}
+
+    if not simulation_level:
+        device_to_cpu_map = _auto_generate_strategy(local_worker_num, available_cpus)
+    else:
+        device_to_cpu_map = _equal_distribution_strategy(local_worker_num, available_cpus)
+
+    return device_to_cpu_map
+
+
+def ranges_to_str(num_list):
+    """
+    Convert a num list to a range string.
+
+    """
+    ranges = []
+    start = num_list[0]
+    for i in range(1, len(num_list)):
+        if num_list[i] != num_list[i-1] + 1:
+            ranges.append((start, num_list[i-1]))
+            start = num_list[i]
+    ranges.append((start, num_list[-1]))
+
+    parts = []
+    for start, end in ranges:
+        if start == end:
+            parts.append(str(start))
+        else:
+            parts.append(f"{start}-{end}")
+    return ",".join(parts)
+
+
+def _generate_bind_core_strategy(local_rank_id, device_to_cpu_map, arg_bind_core):
+    """
+    Get device to core range assigned for the all processes.
+
+    """
+    affinity_cpu_str = ""
+    cpu_list_for_device = []
+    simulation_level = os.getenv("MS_SIMULATION_LEVEL", "").strip()
+
+    try:
+        physical_device_id = _get_physical_device_id(local_rank_id, simulation_level)
+    except RuntimeError as e:
+        logger.warning(f"Failed to acquire device id, error: {e} Will not launch process with taskset.")
+        return None
+
+    if isinstance(arg_bind_core, dict):
+        affinity_cpu_str = _parse_global_device_to_cpu_map(local_rank_id, physical_device_id, arg_bind_core)
+        if not affinity_cpu_str:
+            logger.warning(f"Failed to find physical_device_id[{physical_device_id}] for "
+                           f"process[{local_rank_id}]. Will not launch process with taskset.")
+            return None
+    elif arg_bind_core is True:
+        cpu_list_for_device = device_to_cpu_map.get(physical_device_id, [])
+        if not cpu_list_for_device:
+            return None
+        os.environ["MSRUN_CPU_LIST"] = str(cpu_list_for_device)
+        affinity_cpu_str = ranges_to_str(cpu_list_for_device)
+    return affinity_cpu_str
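Two details of the new helpers are worth seeing end to end: ranges_to_str collapses a sorted CPU-id list into the compact syntax that taskset -c accepts, and the resulting affinity string is what _generate_cmd_args_list_with_core (above) splices into the launch command. A self-contained sketch of both steps, using made-up CPU ids and a placeholder script name:

    def ranges_to_str(num_list):
        # Collapse consecutive ids into (start, end) pairs, then render
        # singletons as 'n' and spans as 'a-b', comma-joined.
        ranges = []
        start = num_list[0]
        for i in range(1, len(num_list)):
            if num_list[i] != num_list[i - 1] + 1:
                ranges.append((start, num_list[i - 1]))
                start = num_list[i]
        ranges.append((start, num_list[-1]))
        return ",".join(str(s) if s == e else f"{s}-{e}" for s, e in ranges)

    affinity = ranges_to_str([0, 1, 2, 3, 8, 9, 15])
    print(affinity)                                     # -> 0-3,8-9,15
    print(['taskset', '-c', affinity, 'python', 'train.py'])
    # -> ['taskset', '-c', '0-3,8-9,15', 'python', 'train.py']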
@@ -14,9 +14,47 @@
 # ============================================================================
 """Entrypoint of ms_run"""
 import ast
-from argparse import REMAINDER, ArgumentParser
+import re
+import json
+from argparse import REMAINDER, ArgumentParser, ArgumentTypeError
 from .process_entity import _ProcessManager
 
+
+def parse_and_validate_bind_core(value):
+    """
+    Parse input argument of --bind_core.
+
+    """
+    if value.lower() == "true":
+        return True
+    if value.lower() == "false":
+        return False
+
+    try:
+        value_dict = json.loads(value)
+    except json.JSONDecodeError as e:
+        raise ArgumentTypeError("Failed to parse JSON into a dictionary") from e
+
+    if isinstance(value_dict, dict):
+        range_pattern = re.compile(r'^\d+-\d+$')
+        for device_id, affinity_cpu_list in value_dict.items():
+            if not re.fullmatch(r"device\d+", device_id):
+                raise ArgumentTypeError(f"Key '{device_id}' must be in format 'deviceX' (X ≥ 0).")
+            if not isinstance(affinity_cpu_list, list):
+                raise ArgumentTypeError(f"Value for '{device_id}':{affinity_cpu_list} should be a list, "
+                                        f"but got {type(affinity_cpu_list)}.")
+
+            for cpu_range in affinity_cpu_list:
+                if not isinstance(cpu_range, str):
+                    raise ArgumentTypeError(f"CPU range '{cpu_range}' in '{affinity_cpu_list}' should be a string.")
+                if not range_pattern.match(cpu_range):
+                    raise ArgumentTypeError(f"CPU range '{cpu_range}' in '{affinity_cpu_list}' should be "
+                                            "in format 'cpuidX-cpuidY'.")
+        return value_dict
+
+    raise ArgumentTypeError(f"Type of {value} should be bool or dict, but got {type(value)}.")
+
+
 def get_args():
     """
     Parses and retrieves command-line arguments.
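Per the validation above, --bind_core now accepts either a boolean literal or a JSON object mapping "deviceX" keys to lists of "start-end" CPU-range strings. A small self-contained check of both accepted forms (re-implementing only the validation rules, for illustration):

    import json
    import re

    RANGE = re.compile(r'^\d+-\d+$')

    def accepts(value):
        # Boolean literals pass straight through; anything else must be a
        # JSON dict of 'deviceX' -> ['start-end', ...], as enforced above.
        if value.lower() in ("true", "false"):
            return value.lower() == "true"
        value_dict = json.loads(value)
        for device_id, cpu_ranges in value_dict.items():
            assert re.fullmatch(r"device\d+", device_id), device_id
            assert all(isinstance(r, str) and RANGE.match(r) for r in cpu_ranges)
        return value_dict

    print(accepts("True"))                                        # -> True
    print(accepts('{"device0": ["0-7"], "device1": ["8-15"]}'))
    # -> {'device0': ['0-7'], 'device1': ['8-15']}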
@@ -77,23 +115,26 @@ def get_args():
    parser.add_argument(
        "--bind_core",
        default=False,
-        type=ast.literal_eval,
-        choices=[True, False],
-        help="specifies whether msrun should bind cpu cores to spawned processes."
+        type=parse_and_validate_bind_core,
+        help="specifies whether msrun should bind CPU cores to spawned processes. "
+             "If set to True, msrun will bind core based on the environment automatically, "
+             "and if passed a dict, msrun will bind core based on this dict information."
    )
    parser.add_argument(
        "--sim_level",
        default=-1,
        type=int,
        choices=[0, 1, 2, 3],
-        help="specifies simulation level. When this argument is set, msrun only spawns one process "
-             "but export RANK_SIZE with value worker_num and RANK_ID with value sim_rank_id."
+        help="specifies simulation level. This argument activates dryrun mode, functioning "
+             "equivalently to environment variable 'MS_SIMULATION_LEVEL' while having higher priority."
    )
    parser.add_argument(
        "--sim_rank_id",
        default=-1,
        type=int,
-        help="specifies simulation process's rank id. Only one process is spawned in simulation scenario."
+        help="specifies simulation process's rank id. When this argument is set, only one process "
+             "is spawned on dryrun mode, functioning equivalently to environment variable 'RANK_ID' "
+             "while having higher priority."
    )
    parser.add_argument(
        "--rank_table_file",
@@ -16,8 +16,15 @@
 """
 Parallel function operator
 """
+from __future__ import absolute_import
 
-from mindspore.parallel.function.reshard_func import reshard
+from . import (
+    reshard_func
+)
+
+from .reshard_func import (
+    reshard
+)
 
 __all__ = []
 __all__.extend(reshard_func.__all__)
@@ -13,7 +13,6 @@
 # limitations under the License.
 # ============================================================================
 """Defines parameter operators with functional form."""
-import mindspore as ms
 from mindspore import context, ops
 from mindspore import log as logger
 from mindspore.ops import operations as P
@@ -43,11 +42,12 @@ def reshard(tensor, layout):
     can check :class:`mindspore.parallel.Layout` for reference.
 
     Note:
-        - In the Graph mode, this function can set the sharding propagation strategy of a tensor.
-          For those tensor do not manually be set, their strategies are decided by the sharding
-          strategy propagation algorithm automatically.
-        - In PyNative mode, you can use this method to arrange tensors in a cell (that is, cells
-          that use Cell.shard/F.shard in PyNative mode) that is executed in parallel in graph mode.
+        In the Graph mode, this function can set the sharding propagation strategy of a tensor.
+        For those tensor do not manually be set, their strategies are decided by the sharding
+        strategy propagation algorithm automatically.
+
+    .. warning::
+        The method is currently not supported in PyNative mode.
 
     Args:
         tensor (Tensor): The tensor to be set the sharding strategy.
@@ -59,8 +59,8 @@ def reshard(tensor, layout):
         Tensor. The mathematically equivalent of the input tensor.
 
     Raises:
-        TypeError: Reshard takes in Tensor type as the first input param, but got: `type(tensor)`.
-        TypeError: Reshard only support type mindspore.parallel.Layout but got: `type(layout)`.
+        TypeError: If the type of input param `tensor` is not mindspore.Tensor.
+        TypeError: If the type of input param `layout` is not mindspore.parallel.Layout.
 
     Supported Platforms:
         ``Ascend``
@@ -220,11 +220,11 @@ def _redistribute(tensor, dst_dtensor_info):
         if not comm_tensor_data_func._current_rank_has_data:
             new_tensor_shape = tuple([tensor_data.shape[i] // tensor._dtensor_info.sharding_strategy[i]
                                       for i in range(len(tensor.shape))])
-            tensor_data = comm_tensor_data_func.comm_data(ops.zeros(new_tensor_shape, tensor.dtype))
+            tensor_data = ops.zeros(new_tensor_shape, tensor.dtype)
+            _ = comm_tensor_data_func.comm_data(tensor_data)
         else:
-            tensor_data = comm_tensor_data_func.comm_data(tensor)
+            _ = comm_tensor_data_func.comm_data(tensor_data)
         all_reduce_data = True
-        ms.communication.comm_func.barrier()
     if src_layout_info['device_matrix'] == dst_layout_info['device_matrix'] and src_layout_info['tensor_map'] == \
             dst_layout_info['tensor_map']:
         return tensor_data
@@ -236,7 +236,7 @@ def _redistribute(tensor, dst_dtensor_info):
     global REDIST_CELL_CACHE
     redist_cache_key = (f"{src_layout_info['device_matrix']}, {src_layout_info['tensor_map']} -> "
                         f"{dst_layout_info['device_matrix']}, {dst_layout_info['tensor_map']}")
-    if redist_cache_key in REDIST_CELL_CACHE.keys():
+    if redist_cache_key in REDIST_CELL_CACHE:
         logger.debug(f"redist_cache_key is {redist_cache_key}, match cache")
         redist_func = REDIST_CELL_CACHE[redist_cache_key]
     else:
@@ -17,8 +17,21 @@ Interfaces for parallel-related functionality
 """
 from __future__ import absolute_import
 
-from mindspore.parallel.nn.parallel_grad_reducer import PipelineGradReducer
-from mindspore.parallel.nn.parallel_cell_wrapper import PipelineCell, Pipeline, MicroBatchInterleaved, GradAccumulation
+from . import (
+    parallel_grad_reducer,
+    parallel_cell_wrapper
+)
+
+from .parallel_grad_reducer import (
+    PipelineGradReducer
+)
+
+from .parallel_cell_wrapper import (
+    PipelineCell,
+    Pipeline,
+    MicroBatchInterleaved,
+    GradAccumulation
+)
 
 __all__ = []
 __all__.extend(parallel_grad_reducer.__all__)
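Both __init__.py rewrites (here and in mindspore/parallel/function above) follow the same pattern: importing the submodule itself, not just its names, keeps the module object in scope so its __all__ can be re-exported. A tiny self-contained illustration of why that matters, using a stand-in module:

    import types

    # Stand-in for 'from . import parallel_cell_wrapper': the module
    # object itself carries __all__, which the package then extends.
    parallel_cell_wrapper = types.ModuleType("parallel_cell_wrapper")
    parallel_cell_wrapper.__all__ = ["PipelineCell", "Pipeline"]

    __all__ = []
    __all__.extend(parallel_cell_wrapper.__all__)
    print(__all__)  # ['PipelineCell', 'Pipeline']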