mindspore 2.4.1__cp39-cp39-win_amd64.whl → 2.5.0__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mindspore might be problematic. Click here for more details.

Files changed (395)
  1. mindspore/.commit_id +1 -1
  2. mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
  3. mindspore/Newtonsoft.Json.dll +0 -0
  4. mindspore/__init__.py +8 -3
  5. mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
  6. mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
  7. mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
  8. mindspore/_checkparam.py +0 -5
  9. mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
  10. mindspore/_extends/parse/compile_config.py +64 -0
  11. mindspore/_extends/parse/deprecated/__init__.py +0 -0
  12. mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +375 -0
  13. mindspore/_extends/parse/parser.py +23 -5
  14. mindspore/_extends/parse/standard_method.py +123 -27
  15. mindspore/_extends/pijit/pijit_func_white_list.py +1 -1
  16. mindspore/amp.py +7 -1
  17. mindspore/atlprov.dll +0 -0
  18. mindspore/avcodec-59.dll +0 -0
  19. mindspore/avdevice-59.dll +0 -0
  20. mindspore/avfilter-8.dll +0 -0
  21. mindspore/avformat-59.dll +0 -0
  22. mindspore/avutil-57.dll +0 -0
  23. mindspore/boost/boost_cell_wrapper.py +136 -41
  24. mindspore/c1.dll +0 -0
  25. mindspore/c1xx.dll +0 -0
  26. mindspore/c2.dll +0 -0
  27. mindspore/common/__init__.py +3 -1
  28. mindspore/common/_register_for_tensor.py +0 -1
  29. mindspore/common/_stub_tensor.py +25 -4
  30. mindspore/common/_tensor_cpp_method.py +17 -0
  31. mindspore/common/_tensor_docs.py +6132 -0
  32. mindspore/common/api.py +99 -25
  33. mindspore/common/dtype.py +34 -34
  34. mindspore/common/dump.py +2 -1
  35. mindspore/common/file_system.py +8 -1
  36. mindspore/common/generator.py +2 -0
  37. mindspore/common/hook_handle.py +3 -1
  38. mindspore/common/initializer.py +3 -4
  39. mindspore/common/lazy_inline.py +8 -2
  40. mindspore/common/mindir_util.py +10 -2
  41. mindspore/common/parameter.py +30 -27
  42. mindspore/common/tensor.py +713 -1337
  43. mindspore/communication/__init__.py +1 -1
  44. mindspore/communication/_comm_helper.py +10 -0
  45. mindspore/communication/comm_func.py +215 -173
  46. mindspore/communication/management.py +23 -20
  47. mindspore/context.py +292 -193
  48. mindspore/dataset/__init__.py +23 -19
  49. mindspore/dataset/callback/ds_callback.py +2 -1
  50. mindspore/dataset/core/config.py +84 -3
  51. mindspore/dataset/engine/cache_admin.py +3 -3
  52. mindspore/dataset/engine/cache_client.py +5 -4
  53. mindspore/dataset/engine/datasets.py +192 -149
  54. mindspore/dataset/engine/datasets_audio.py +14 -0
  55. mindspore/dataset/engine/datasets_standard_format.py +28 -11
  56. mindspore/dataset/engine/datasets_text.py +38 -1
  57. mindspore/dataset/engine/datasets_user_defined.py +125 -65
  58. mindspore/dataset/engine/datasets_vision.py +81 -8
  59. mindspore/dataset/engine/iterators.py +281 -63
  60. mindspore/dataset/engine/obs/util.py +8 -0
  61. mindspore/dataset/engine/queue.py +40 -0
  62. mindspore/dataset/engine/samplers.py +26 -2
  63. mindspore/dataset/engine/serializer_deserializer.py +1 -1
  64. mindspore/dataset/engine/validators.py +43 -11
  65. mindspore/dataset/transforms/py_transforms_util.py +17 -0
  66. mindspore/dataset/transforms/transforms.py +29 -12
  67. mindspore/dataset/vision/validators.py +1 -2
  68. mindspore/device_context/__init__.py +21 -0
  69. mindspore/device_context/ascend/__init__.py +25 -0
  70. mindspore/device_context/ascend/device.py +72 -0
  71. mindspore/device_context/ascend/op_debug.py +94 -0
  72. mindspore/device_context/ascend/op_precision.py +193 -0
  73. mindspore/device_context/ascend/op_tuning.py +127 -0
  74. mindspore/device_context/cpu/__init__.py +25 -0
  75. mindspore/device_context/cpu/device.py +62 -0
  76. mindspore/device_context/cpu/op_tuning.py +43 -0
  77. mindspore/device_context/gpu/__init__.py +21 -0
  78. mindspore/device_context/gpu/device.py +70 -0
  79. mindspore/device_context/gpu/op_precision.py +67 -0
  80. mindspore/device_context/gpu/op_tuning.py +175 -0
  81. mindspore/device_manager.py +134 -0
  82. mindspore/dnnl.dll +0 -0
  83. mindspore/dpcmi.dll +0 -0
  84. mindspore/experimental/llm_boost/__init__.py +3 -2
  85. mindspore/experimental/llm_boost/ascend_native/__init__.py +22 -0
  86. mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +211 -0
  87. mindspore/experimental/llm_boost/ascend_native/llm_boost.py +52 -0
  88. mindspore/experimental/llm_boost/atb/boost_base.py +239 -64
  89. mindspore/experimental/llm_boost/atb/llama_boost.py +52 -30
  90. mindspore/experimental/llm_boost/atb/qwen_boost.py +47 -24
  91. mindspore/experimental/llm_boost/register.py +1 -0
  92. mindspore/experimental/optim/adadelta.py +26 -22
  93. mindspore/experimental/optim/adam.py +3 -0
  94. mindspore/experimental/optim/lr_scheduler.py +33 -24
  95. mindspore/experimental/optim/radam.py +33 -30
  96. mindspore/hal/device.py +28 -0
  97. mindspore/hal/event.py +17 -0
  98. mindspore/hal/memory.py +94 -3
  99. mindspore/hal/stream.py +91 -6
  100. mindspore/include/api/context.h +1 -2
  101. mindspore/include/dataset/constants.h +2 -2
  102. mindspore/jpeg62.dll +0 -0
  103. mindspore/log.py +12 -0
  104. mindspore/mindrecord/__init__.py +1 -1
  105. mindspore/mindrecord/config.py +17 -316
  106. mindspore/mindrecord/filereader.py +1 -9
  107. mindspore/mindrecord/filewriter.py +5 -15
  108. mindspore/mindrecord/mindpage.py +1 -9
  109. mindspore/mindspore_backend.dll +0 -0
  110. mindspore/mindspore_common.dll +0 -0
  111. mindspore/mindspore_core.dll +0 -0
  112. mindspore/mindspore_glog.dll +0 -0
  113. mindspore/mindspore_ops.dll +0 -0
  114. mindspore/mint/__init__.py +824 -218
  115. mindspore/mint/distributed/__init__.py +66 -4
  116. mindspore/mint/distributed/distributed.py +2594 -44
  117. mindspore/mint/linalg/__init__.py +6 -0
  118. mindspore/mint/nn/__init__.py +473 -14
  119. mindspore/mint/nn/functional.py +486 -11
  120. mindspore/mint/nn/layer/__init__.py +17 -4
  121. mindspore/mint/nn/layer/_functions.py +330 -0
  122. mindspore/mint/nn/layer/activation.py +169 -1
  123. mindspore/mint/nn/layer/basic.py +123 -0
  124. mindspore/mint/nn/layer/conv.py +727 -0
  125. mindspore/mint/nn/layer/normalization.py +215 -19
  126. mindspore/mint/nn/layer/padding.py +797 -0
  127. mindspore/mint/nn/layer/pooling.py +170 -0
  128. mindspore/mint/optim/__init__.py +2 -1
  129. mindspore/mint/optim/adam.py +223 -0
  130. mindspore/mint/optim/adamw.py +26 -19
  131. mindspore/mint/special/__init__.py +2 -1
  132. mindspore/msobj140.dll +0 -0
  133. mindspore/mspdb140.dll +0 -0
  134. mindspore/mspdbcore.dll +0 -0
  135. mindspore/mspdbst.dll +0 -0
  136. mindspore/mspft140.dll +0 -0
  137. mindspore/msvcdis140.dll +0 -0
  138. mindspore/msvcp140_1.dll +0 -0
  139. mindspore/msvcp140_2.dll +0 -0
  140. mindspore/msvcp140_atomic_wait.dll +0 -0
  141. mindspore/msvcp140_codecvt_ids.dll +0 -0
  142. mindspore/multiprocessing/__init__.py +5 -0
  143. mindspore/nn/__init__.py +2 -0
  144. mindspore/nn/cell.py +142 -21
  145. mindspore/nn/dynamic_lr.py +2 -1
  146. mindspore/nn/layer/activation.py +6 -6
  147. mindspore/nn/layer/basic.py +35 -25
  148. mindspore/nn/layer/channel_shuffle.py +3 -3
  149. mindspore/nn/layer/conv.py +3 -0
  150. mindspore/nn/layer/embedding.py +3 -3
  151. mindspore/nn/layer/normalization.py +8 -7
  152. mindspore/nn/layer/padding.py +4 -3
  153. mindspore/nn/layer/pooling.py +55 -23
  154. mindspore/nn/layer/rnn_cells.py +1 -1
  155. mindspore/nn/layer/rnns.py +2 -1
  156. mindspore/nn/layer/timedistributed.py +5 -5
  157. mindspore/nn/layer/transformer.py +48 -26
  158. mindspore/nn/learning_rate_schedule.py +5 -3
  159. mindspore/nn/loss/loss.py +31 -36
  160. mindspore/nn/optim/ada_grad.py +1 -0
  161. mindspore/nn/optim/adadelta.py +2 -2
  162. mindspore/nn/optim/adam.py +1 -1
  163. mindspore/nn/optim/lars.py +1 -4
  164. mindspore/nn/optim/optimizer.py +1 -1
  165. mindspore/nn/optim/rprop.py +2 -2
  166. mindspore/nn/optim/thor.py +2 -1
  167. mindspore/nn/utils/__init__.py +22 -0
  168. mindspore/nn/utils/init.py +73 -0
  169. mindspore/nn/wrap/cell_wrapper.py +4 -6
  170. mindspore/nn/wrap/loss_scale.py +3 -4
  171. mindspore/numpy/array_creations.py +60 -62
  172. mindspore/numpy/array_ops.py +148 -143
  173. mindspore/numpy/logic_ops.py +41 -42
  174. mindspore/numpy/math_ops.py +361 -359
  175. mindspore/numpy/utils.py +16 -16
  176. mindspore/numpy/utils_const.py +4 -4
  177. mindspore/opencv_core452.dll +0 -0
  178. mindspore/opencv_imgcodecs452.dll +0 -0
  179. mindspore/opencv_imgproc452.dll +0 -0
  180. mindspore/ops/__init__.py +2 -1
  181. mindspore/ops/_grad_experimental/grad_comm_ops.py +107 -8
  182. mindspore/ops/_grad_experimental/grad_debug_ops.py +6 -1
  183. mindspore/ops/_grad_experimental/grad_inner_ops.py +9 -0
  184. mindspore/ops/_grad_experimental/grad_math_ops.py +2 -1
  185. mindspore/ops/_op_impl/cpu/__init__.py +1 -0
  186. mindspore/ops/_op_impl/cpu/raise_op.py +28 -0
  187. mindspore/ops/_vmap/vmap_array_ops.py +20 -19
  188. mindspore/ops/_vmap/vmap_base.py +0 -2
  189. mindspore/ops/_vmap/vmap_grad_nn_ops.py +19 -13
  190. mindspore/ops/_vmap/vmap_math_ops.py +11 -9
  191. mindspore/ops/_vmap/vmap_nn_ops.py +20 -34
  192. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +149 -12
  193. mindspore/ops/auto_generate/gen_arg_handler.py +0 -61
  194. mindspore/ops/auto_generate/gen_extend_func.py +554 -60
  195. mindspore/ops/auto_generate/gen_ops_def.py +1621 -115
  196. mindspore/ops/auto_generate/gen_ops_prim.py +8027 -3411
  197. mindspore/ops/auto_generate/pyboost_inner_prim.py +183 -79
  198. mindspore/ops/composite/base.py +1 -1
  199. mindspore/ops/composite/multitype_ops/_compile_utils.py +229 -30
  200. mindspore/ops/composite/multitype_ops/pow_impl.py +0 -29
  201. mindspore/ops/function/__init__.py +12 -0
  202. mindspore/ops/function/array_func.py +561 -159
  203. mindspore/ops/function/clip_func.py +64 -0
  204. mindspore/ops/function/debug_func.py +28 -20
  205. mindspore/ops/function/image_func.py +1 -1
  206. mindspore/ops/function/linalg_func.py +5 -4
  207. mindspore/ops/function/math_func.py +1664 -294
  208. mindspore/ops/function/nn_func.py +988 -317
  209. mindspore/ops/function/parameter_func.py +3 -56
  210. mindspore/ops/function/random_func.py +243 -33
  211. mindspore/ops/function/sparse_unary_func.py +1 -1
  212. mindspore/ops/functional.py +18 -5
  213. mindspore/ops/functional_overload.py +897 -0
  214. mindspore/ops/operations/__init__.py +3 -2
  215. mindspore/ops/operations/_embedding_cache_ops.py +4 -4
  216. mindspore/ops/operations/_grad_ops.py +2 -34
  217. mindspore/ops/operations/_infer_ops.py +2 -1
  218. mindspore/ops/operations/_inner_ops.py +38 -8
  219. mindspore/ops/operations/array_ops.py +45 -303
  220. mindspore/ops/operations/comm_ops.py +23 -17
  221. mindspore/ops/operations/custom_ops.py +7 -49
  222. mindspore/ops/operations/debug_ops.py +42 -47
  223. mindspore/ops/operations/inner_ops.py +6 -4
  224. mindspore/ops/operations/linalg_ops.py +3 -2
  225. mindspore/ops/operations/manually_defined/ops_def.py +185 -104
  226. mindspore/ops/operations/math_ops.py +11 -216
  227. mindspore/ops/operations/nn_ops.py +153 -310
  228. mindspore/ops/primitive.py +23 -21
  229. mindspore/ops/tensor_method.py +1669 -0
  230. mindspore/ops_generate/aclnn_kernel_register_auto_cc_generator.py +110 -0
  231. mindspore/ops_generate/add_tensor_docs_generator.py +54 -0
  232. mindspore/ops_generate/arg_handler.py +0 -61
  233. mindspore/ops_generate/auto_grad_impl_cc_generator.py +135 -0
  234. mindspore/ops_generate/auto_grad_reg_cc_generator.py +93 -0
  235. mindspore/ops_generate/base_generator.py +11 -0
  236. mindspore/ops_generate/cpp_create_prim_instance_helper_generator.py +108 -0
  237. mindspore/ops_generate/functional_map_cpp_generator.py +491 -0
  238. mindspore/ops_generate/functional_overload_py_generator.py +110 -0
  239. mindspore/ops_generate/functions_cc_generator.py +233 -0
  240. mindspore/ops_generate/gen_aclnn_implement.py +110 -114
  241. mindspore/ops_generate/gen_constants.py +157 -3
  242. mindspore/ops_generate/gen_ops.py +245 -990
  243. mindspore/ops_generate/gen_pyboost_func.py +97 -998
  244. mindspore/ops_generate/gen_utils.py +119 -33
  245. mindspore/ops_generate/lite_ops_cpp_generator.py +155 -0
  246. mindspore/ops_generate/op_api_proto.py +206 -0
  247. mindspore/ops_generate/op_def_py_generator.py +131 -0
  248. mindspore/ops_generate/op_prim_py_generator.py +480 -0
  249. mindspore/ops_generate/op_proto.py +373 -108
  250. mindspore/ops_generate/op_template_parser.py +436 -0
  251. mindspore/ops_generate/ops_def_cc_generator.py +288 -0
  252. mindspore/ops_generate/ops_def_h_generator.py +74 -0
  253. mindspore/ops_generate/ops_name_h_generator.py +68 -0
  254. mindspore/ops_generate/ops_primitive_h_generator.py +81 -0
  255. mindspore/ops_generate/pyboost_functions_cpp_generator.py +370 -0
  256. mindspore/ops_generate/pyboost_functions_h_generator.py +68 -0
  257. mindspore/ops_generate/pyboost_functions_py_generator.py +148 -0
  258. mindspore/ops_generate/pyboost_grad_function_cpp_generator.py +154 -0
  259. mindspore/ops_generate/pyboost_inner_prim_generator.py +131 -0
  260. mindspore/ops_generate/pyboost_native_grad_functions_generator.py +268 -0
  261. mindspore/ops_generate/pyboost_op_cpp_code_generator.py +851 -0
  262. mindspore/ops_generate/pyboost_overload_functions_cpp_generator.py +344 -0
  263. mindspore/ops_generate/pyboost_utils.py +92 -33
  264. mindspore/ops_generate/template.py +294 -44
  265. mindspore/ops_generate/tensor_func_reg_cpp_generator.py +422 -0
  266. mindspore/parallel/__init__.py +3 -3
  267. mindspore/parallel/_auto_parallel_context.py +44 -34
  268. mindspore/parallel/_cell_wrapper.py +22 -3
  269. mindspore/parallel/_parallel_serialization.py +13 -2
  270. mindspore/parallel/_utils.py +4 -2
  271. mindspore/parallel/algo_parameter_config.py +1 -1
  272. mindspore/parallel/checkpoint_transform.py +44 -0
  273. mindspore/parallel/cluster/process_entity/_api.py +131 -37
  274. mindspore/parallel/cluster/process_entity/_utils.py +41 -6
  275. mindspore/parallel/cluster/run.py +20 -3
  276. mindspore/parallel/parameter_broadcast.py +1 -1
  277. mindspore/parallel/shard.py +3 -0
  278. mindspore/parallel/transform_safetensors.py +119 -253
  279. mindspore/pgodb140.dll +0 -0
  280. mindspore/pgort140.dll +0 -0
  281. mindspore/profiler/__init__.py +17 -4
  282. mindspore/profiler/analysis/__init__.py +0 -0
  283. mindspore/profiler/analysis/parser/__init__.py +0 -0
  284. mindspore/profiler/analysis/parser/ascend_cann_parser.py +166 -0
  285. mindspore/profiler/analysis/parser/base_parser.py +158 -0
  286. mindspore/profiler/analysis/parser/framework_cann_relation_parser.py +45 -0
  287. mindspore/profiler/analysis/parser/ms_framework_parser.py +142 -0
  288. mindspore/profiler/analysis/parser/ms_minddata_parser.py +145 -0
  289. mindspore/profiler/analysis/parser/timeline_assembly_factory/__init__.py +0 -0
  290. mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +261 -0
  291. mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +40 -0
  292. mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +84 -0
  293. mindspore/profiler/analysis/parser/timeline_creator/__init__.py +0 -0
  294. mindspore/profiler/analysis/parser/timeline_creator/base_timeline_creator.py +44 -0
  295. mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +90 -0
  296. mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +76 -0
  297. mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +103 -0
  298. mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +134 -0
  299. mindspore/profiler/analysis/parser/timeline_event/__init__.py +0 -0
  300. mindspore/profiler/analysis/parser/timeline_event/base_event.py +233 -0
  301. mindspore/profiler/analysis/parser/timeline_event/cpu_op_event.py +47 -0
  302. mindspore/profiler/analysis/parser/timeline_event/flow_event.py +36 -0
  303. mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +260 -0
  304. mindspore/profiler/analysis/parser/timeline_event/msprof_event.py +73 -0
  305. mindspore/profiler/analysis/parser/timeline_event/scope_layer_event.py +53 -0
  306. mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +146 -0
  307. mindspore/profiler/analysis/task_manager.py +131 -0
  308. mindspore/profiler/analysis/time_converter.py +84 -0
  309. mindspore/profiler/analysis/viewer/__init__.py +0 -0
  310. mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +333 -0
  311. mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +87 -0
  312. mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +252 -0
  313. mindspore/profiler/analysis/viewer/ascend_memory_viewer.py +313 -0
  314. mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +322 -0
  315. mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +265 -0
  316. mindspore/profiler/analysis/viewer/ascend_timeline_viewer.py +58 -0
  317. mindspore/profiler/analysis/viewer/base_viewer.py +26 -0
  318. mindspore/profiler/analysis/viewer/ms_dataset_viewer.py +97 -0
  319. mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +581 -0
  320. mindspore/profiler/analysis/work_flow.py +73 -0
  321. mindspore/profiler/common/ascend_msprof_exporter.py +138 -0
  322. mindspore/profiler/common/command_executor.py +90 -0
  323. mindspore/profiler/common/constant.py +174 -3
  324. mindspore/profiler/common/file_manager.py +208 -0
  325. mindspore/profiler/common/log.py +130 -0
  326. mindspore/profiler/common/msprof_cmd_tool.py +202 -0
  327. mindspore/profiler/common/path_manager.py +371 -0
  328. mindspore/profiler/common/process_bar.py +168 -0
  329. mindspore/profiler/common/process_pool.py +9 -3
  330. mindspore/profiler/common/profiler_context.py +476 -0
  331. mindspore/profiler/common/profiler_info.py +304 -0
  332. mindspore/profiler/common/profiler_output_path.py +284 -0
  333. mindspore/profiler/common/profiler_parameters.py +210 -0
  334. mindspore/profiler/common/profiler_path_manager.py +120 -0
  335. mindspore/profiler/common/record_function.py +76 -0
  336. mindspore/profiler/common/tlv_decoder.py +76 -0
  337. mindspore/profiler/common/util.py +75 -2
  338. mindspore/profiler/dynamic_profiler.py +270 -37
  339. mindspore/profiler/envprofiler.py +138 -0
  340. mindspore/profiler/mstx.py +199 -0
  341. mindspore/profiler/platform/__init__.py +21 -0
  342. mindspore/profiler/platform/base_profiler.py +40 -0
  343. mindspore/profiler/platform/cpu_profiler.py +124 -0
  344. mindspore/profiler/platform/gpu_profiler.py +74 -0
  345. mindspore/profiler/platform/npu_profiler.py +309 -0
  346. mindspore/profiler/profiler.py +580 -93
  347. mindspore/profiler/profiler_action_controller.py +187 -0
  348. mindspore/profiler/profiler_interface.py +114 -0
  349. mindspore/profiler/schedule.py +208 -0
  350. mindspore/rewrite/api/symbol_tree.py +1 -2
  351. mindspore/run_check/_check_version.py +18 -13
  352. mindspore/runtime/__init__.py +37 -0
  353. mindspore/runtime/device.py +27 -0
  354. mindspore/runtime/event.py +209 -0
  355. mindspore/runtime/executor.py +148 -0
  356. mindspore/runtime/memory.py +392 -0
  357. mindspore/runtime/stream.py +460 -0
  358. mindspore/runtime/thread_bind_core.py +401 -0
  359. mindspore/swresample-4.dll +0 -0
  360. mindspore/swscale-6.dll +0 -0
  361. mindspore/tbbmalloc.dll +0 -0
  362. mindspore/tinyxml2.dll +0 -0
  363. mindspore/train/__init__.py +2 -2
  364. mindspore/train/_utils.py +53 -18
  365. mindspore/train/amp.py +8 -4
  366. mindspore/train/callback/_checkpoint.py +32 -18
  367. mindspore/train/callback/_early_stop.py +1 -1
  368. mindspore/train/callback/_flops_collector.py +105 -69
  369. mindspore/train/callback/_history.py +1 -1
  370. mindspore/train/callback/_summary_collector.py +44 -6
  371. mindspore/train/callback/_tft_register.py +37 -15
  372. mindspore/train/dataset_helper.py +11 -11
  373. mindspore/train/metrics/precision.py +4 -5
  374. mindspore/train/mind_ir_pb2.py +167 -46
  375. mindspore/train/model.py +13 -14
  376. mindspore/train/serialization.py +461 -72
  377. mindspore/train/summary/summary_record.py +1 -2
  378. mindspore/train/train_thor/model_thor.py +1 -1
  379. mindspore/turbojpeg.dll +0 -0
  380. mindspore/utils/__init__.py +4 -2
  381. mindspore/utils/dryrun.py +138 -0
  382. mindspore/utils/runtime_execution_order_check.py +550 -0
  383. mindspore/vcmeta.dll +0 -0
  384. mindspore/vcruntime140.dll +0 -0
  385. mindspore/vcruntime140_1.dll +0 -0
  386. mindspore/version.py +1 -1
  387. {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/METADATA +3 -4
  388. {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/RECORD +391 -265
  389. {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/entry_points.txt +1 -1
  390. mindspore/common/_tensor_overload.py +0 -139
  391. mindspore/mindspore_np_dtype.dll +0 -0
  392. mindspore/profiler/envprofiling.py +0 -254
  393. mindspore/profiler/profiling.py +0 -1926
  394. {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/WHEEL +0 -0
  395. {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/top_level.txt +0 -0
@@ -13,9 +13,107 @@
13
13
  # limitations under the License.
14
14
  # ============================================================================
15
15
  """Communication management API"""
16
+ from __future__ import absolute_import
17
+ import hashlib
18
+ import builtins
19
+ import io
20
+ import pickle
21
+ import numpy as np
16
22
  from mindspore import log as logger
17
- from mindspore.communication._comm_helper import _destroy_group_helper, GlobalComm, _get_rank_helper, _get_size_helper
18
- from mindspore.communication import init, release, get_group_size
23
+ from mindspore.common import dtype as mstype
24
+ from mindspore.ops import ReduceOp, cat
25
+ from mindspore.common.tensor import Tensor
26
+ from mindspore._c_expression import Tensor as Tensor_
27
+ from mindspore.ops.primitive import _primexpr
28
+ from mindspore.communication._comm_helper import (
29
+ _destroy_group_helper,
30
+ _get_rank_helper,
31
+ _get_size_helper,
32
+ _get_backend,
33
+ _get_group_ranks,
34
+ )
35
+ from mindspore.communication import (
36
+ init,
37
+ release,
38
+ get_group_size,
39
+ get_world_rank_from_group_rank,
40
+ create_group,
41
+ GlobalComm,
42
+ get_group_rank_from_world_rank,
43
+ )
44
+ from mindspore.communication.comm_func import (
45
+ _deal_comm_outputs,
46
+ _check_all_tensors,
47
+ _contiguous,
48
+ _check_all_tensor_same_dtype,
49
+ _is_split_sizes_empty,
50
+ _get_size,
51
+ _get_group_rank_from_world_rank_from_cache_helper,
52
+ )
53
+ from mindspore.ops.auto_generate.gen_ops_prim import (
54
+ dist_comm_all_gather_op,
55
+ dist_comm_all_reduce_op,
56
+ dist_comm_reduce_scatter_op,
57
+ dist_comm_isend_op,
58
+ dist_comm_all_to_all_v_op,
59
+ dist_comm_reduce_scatter_tensor_op,
60
+ dist_comm_all_to_all_v_single_op,
61
+ dist_comm_broadcast_op,
62
+ dist_comm_all_gather_into_tensor_op,
63
+ dist_comm_irecv_op,
64
+ dist_comm_scatter_tensor_op,
65
+ dist_comm_gather_into_tensor_op,
66
+ dist_comm_gather_op,
67
+ dist_comm_reduce_op,
68
+ dist_comm_scatter_op,
69
+ dist_comm_barrier_op,
70
+ dist_comm_batch_isend_irecv_op,
71
+ )
72
+
73
+ _pickler = pickle.Pickler
74
+ _unpickler = pickle.Unpickler
75
+ BACKEND_HCCL = "hccl"
76
+ BACKEND_MCCL = "mccl"
77
+
78
+ safe_builtins = {
79
+ 'range',
80
+ 'complex',
81
+ 'set',
82
+ 'frozenset',
83
+ 'slice',
84
+ }
85
+
86
+
87
+ class RestrictedUnpickler(pickle.Unpickler):
88
+ # Override find_class method.
89
+ def find_class(self, module, name):
90
+ # Only allow safe classes from builtins.
91
+ if module == "builtins" and name in safe_builtins:
92
+ return getattr(builtins, name)
93
+ # Forbid everything else.
94
+ raise pickle.UnpicklingError("global '%s.%s' is forbidden" %
95
+ (module, name))
96
+
97
+
98
+ def restricted_loads(s):
99
+ """Helper function analogous to pickle.loads()."""
100
+ return RestrictedUnpickler(io.BytesIO(s)).load()
101
+
102
+
103
+ def _object_to_tensor(obj, size=0):
104
+ f = io.BytesIO()
105
+ _pickler(f).dump(obj)
106
+ buf = np.frombuffer(f.getvalue(), dtype=np.int8)
107
+ tensor_size = buf.size
108
+ if size > tensor_size:
109
+ buf = np.resize(buf, size)
110
+ tensor_size = size
111
+ return Tensor(buf), tensor_size
112
+
113
+
114
+ def _tensor_to_object(tensor, tensor_size):
115
+ buf = tensor.asnumpy().tobytes()[:tensor_size]
116
+ return restricted_loads(buf)
19
117
 
20
118
 
21
119
  def init_process_group(backend="hccl",
@@ -37,28 +135,27 @@ def init_process_group(backend="hccl",
37
135
  Args:
38
136
  backend (str, optional): The backend to ues. default is hccl and now only support hccl.
39
137
  init_method (str, invalid): URL specifying how to init collective communication group. Provides parameters
40
- consistent with pytorch, but is not currently support, setting is invalid.
138
+ consistent with pytorch, but is not currently support, setting is invalid.
41
139
  timeout (timedelta, invalid): Timeout for API executed. Provides parameters consistent with pytorch, but is not
42
- currently support, setting is invalid.
140
+ currently support, setting is invalid.
43
141
  world_size (int, optional): Number of the processes participating in the job.
44
142
  rank (int, invalid): Rank of the current process. Provides parameters consistent with pytorch, but is not
45
- currently support, setting is invalid.
143
+ currently support, setting is invalid.
46
144
  store (Store, invalid): Key/Value store accessible to all workers, used to exchange connection/address
47
- information. Provides parameters consistent with pytorch, but is not currently support,
48
- setting is invalid.
145
+ information. Provides parameters consistent with pytorch, but is not currently support,
146
+ setting is invalid.
49
147
  pg_options (ProcessGroupOptions, invalid): process group options specifying what additional options need to be
50
- passed in during the construction of specific process group. Provides
51
- parameters consistent with pytorch, but is not currently support,
52
- setting is invalid.
148
+ passed in during the construction of specific process group. Provides parameters consistent with pytorch,
149
+ but is not currently support, setting is invalid.
53
150
  device_id (int, invalid): the device id to exeute. Provides parameters consistent with pytorch, but is not
54
- currently support, setting is invalid.
151
+ currently support, setting is invalid.
55
152
 
56
153
  Raises:
57
154
  ValueError: If `backend` is not hccl.
58
155
  ValueError: If `world_size` is not equal to -1 or process group number.
59
156
  RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails,
60
- or the environment variables RANK_ID/MINDSPORE_HCCL_CONFIG_PATH
61
- have not been exported when backend is HCCL.
157
+ or the environment variables RANK_ID/MINDSPORE_HCCL_CONFIG_PATH
158
+ have not been exported when backend is HCCL.
62
159
 
63
160
  Supported Platforms:
64
161
  ``Ascend``
@@ -70,13 +167,12 @@ def init_process_group(backend="hccl",
70
167
  For Ascend devices, it is recommended to use the msrun startup method
71
168
  without any third-party or configuration file dependencies.
72
169
  Please see the `msrun start up
73
- <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
170
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
74
171
  for more details.
75
172
 
76
173
  >>> import mindspore as ms
77
- >>> from mindspore import set_context
78
174
  >>> from mindspore.mint.distributed import init_process_group, destroy_process_group
79
- >>> set_context(device_target="Ascend")
175
+ >>> ms.set_device(device_target="Ascend")
80
176
  >>> init_process_group()
81
177
  >>> destroy_process_group()
82
178
  """
@@ -93,13 +189,18 @@ def init_process_group(backend="hccl",
93
189
  if rank != -1:
94
190
  logger.warning("rank is ignored, setting is invalid")
95
191
  if backend != "hccl":
96
- raise ValueError("Only support hccl now, please setting backend to hccl or using default value")
192
+ raise ValueError(
193
+ "Only support hccl now, please setting backend to hccl or using default value"
194
+ )
97
195
 
98
- #init hccl & create world group
196
+ # init hccl & create world group
99
197
  init(backend)
100
198
 
101
199
  if world_size != -1 and world_size != get_group_size():
102
- raise ValueError("world_size is wrong, please using default value or setting: ", get_group_size())
200
+ raise ValueError(
201
+ "world_size is wrong, please using default value or setting: ",
202
+ get_group_size(),
203
+ )
103
204
 
104
205
 
105
206
  def destroy_process_group(group=None):
@@ -108,11 +209,13 @@ def destroy_process_group(group=None):
108
209
  If group is None or "hccl_world_group", Destroy all group and release collective communication lib.
109
210
 
110
211
  Note:
111
- This method isn't supported in GPU and CPU versions of MindSpore.
112
- This method should be used after init_process_group().
212
+ - This method isn't supported in GPU and CPU versions of MindSpore.
213
+ - This method should be used after :func:`mindspore.mint.distributed.init_process_group`.
113
214
 
114
215
  Args:
115
- group (str): The communication group to destroy, the group should be created by init_process_group or new_group.
216
+ group (str, optional): The communication group to work on. Normally, the group should be created by
217
+ :func:`mindspore.mint.distributed.new_group`. If ``None``, which means ``"hccl_world_group"`` in Ascend.
218
+ Default: ``None``.
116
219
 
117
220
  Raises:
118
221
  TypeError: If group is not a string.
@@ -128,13 +231,12 @@ def destroy_process_group(group=None):
128
231
  For Ascend devices, it is recommended to use the msrun startup method
129
232
  without any third-party or configuration file dependencies.
130
233
  Please see the `msrun start up
131
- <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
234
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
132
235
  for more details.
133
236
 
134
237
  >>> import mindspore as ms
135
- >>> from mindspore import set_context
136
238
  >>> from mindspore.mint.distributed import init_process_group, destroy_process_group
137
- >>> set_context(device_target="Ascend")
239
+ >>> ms.set_device(device_target="Ascend")
138
240
  >>> init_process_group()
139
241
  >>> destroy_process_group()
140
242
  """
@@ -142,8 +244,10 @@ def destroy_process_group(group=None):
142
244
  if group == GlobalComm.WORLD_COMM_GROUP or group is None:
143
245
  release()
144
246
  elif not isinstance(group, str):
145
- raise TypeError("For 'destroy_group', the argument 'group' must be type of string or None, "
146
- "but got 'group' type : {}.".format(type(group)))
247
+ raise TypeError(
248
+ "For 'destroy_group', the argument 'group' must be type of string or None, "
249
+ "but got 'group' type : {}.".format(type(group))
250
+ )
147
251
  else:
148
252
  _destroy_group_helper(group)
149
253
 
@@ -153,11 +257,12 @@ def get_rank(group=None):
153
257
  Get the rank ID for the current device in the specified collective communication group.
154
258
 
155
259
  Note:
156
- This method should be used after init().
260
+ This method should be used after :func:`mindspore.mint.distributed.init_process_group`.
157
261
 
158
262
  Args:
159
- group (str): The communication group to work on. Normally, the group should be created by create_group,
160
- otherwise, using the default group. If None, ``GlobalComm.WORLD_COMM_GROUP`` will be used.
263
+ group (str, optional): The communication group to work on. Normally, the group should be created by
264
+ :func:`mindspore.mint.distributed.new_group`. If ``None``, which means ``"hccl_world_group"`` in Ascend.
265
+ Default: ``None``.
161
266
 
162
267
  Returns:
163
268
  int, the rank ID of the calling process within the group.
@@ -167,7 +272,7 @@ def get_rank(group=None):
167
272
  TypeError: If group is not a string.
168
273
 
169
274
  Supported Platforms:
170
- ``Ascend``
275
+ ``Ascend`` ``CPU``
171
276
 
172
277
  Examples:
173
278
  .. note::
@@ -176,12 +281,12 @@ def get_rank(group=None):
176
281
  For Ascend devices, it is recommended to use the msrun startup method
177
282
  without any third-party or configuration file dependencies.
178
283
  Please see the `msrun start up
179
- <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
284
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
180
285
  for more details.
181
286
 
182
- >>> from mindspore import set_context
287
+ >>> import mindspore as ms
183
288
  >>> from mindspore.mint.distributed import init_process_group, get_rank
184
- >>> set_context(device_target="Ascend")
289
+ >>> ms.set_device(device_target="Ascend")
185
290
  >>> init_process_group()
186
291
  >>> rank_id = get_rank()
187
292
  >>> print(rank_id)
@@ -190,8 +295,10 @@ def get_rank(group=None):
190
295
  if group is None:
191
296
  group = GlobalComm.WORLD_COMM_GROUP
192
297
  if not isinstance(group, str):
193
- raise TypeError("For 'get_rank', the argument 'group' must be type of string, "
194
- "but got 'group' type : {}.".format(type(group)))
298
+ raise TypeError(
299
+ "For 'get_rank', the argument 'group' must be type of string, "
300
+ "but got 'group' type : {}.".format(type(group))
301
+ )
195
302
  try:
196
303
  ret = _get_rank_helper(group)
197
304
  except RuntimeError as e:
@@ -205,11 +312,12 @@ def get_world_size(group=None):
205
312
  Get the rank size of the specified collective communication group.
206
313
 
207
314
  Note:
208
- This method should be used after init().
315
+ This method should be used after :func:`mindspore.mint.distributed.init_process_group`.
209
316
 
210
317
  Args:
211
- group (str): The communication group to work on. Normally, the group should be created by create_group,
212
- otherwise, using the default group. If None, ``GlobalComm.WORLD_COMM_GROUP`` will be used.
318
+ group (str, optional): The communication group to work on. Normally, the group should be created by
319
+ :func:`mindspore.mint.distributed.new_group`. If ``None``, which means ``"hccl_world_group"`` in Ascend.
320
+ Default: ``None``.
213
321
 
214
322
  Returns:
215
323
  int, the rank size of the group.
@@ -228,13 +336,12 @@ def get_world_size(group=None):
228
336
  For Ascend devices, it is recommended to use the msrun startup method
229
337
  without any third-party or configuration file dependencies.
230
338
  Please see the `msrun start up
231
- <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
339
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
232
340
  for more details.
233
341
 
234
342
  >>> import mindspore as ms
235
- >>> from mindspore import set_context
236
343
  >>> from mindspore.mint.distributed import init_process_group, get_world_size
237
- >>> set_context(device_target="Ascend")
344
+ >>> ms.set_device(device_target="Ascend")
238
345
  >>> init_process_group()
239
346
  >>> group_size = get_world_size()
240
347
  >>> print("group_size is: ", group_size)
@@ -244,11 +351,2454 @@ def get_world_size(group=None):
244
351
  if group is None:
245
352
  group = GlobalComm.WORLD_COMM_GROUP
246
353
  if not isinstance(group, str):
247
- raise TypeError("For 'get_group_size', the argument 'group' must be type of string, "
248
- "but got 'group' type : {}.".format(type(group)))
354
+ raise TypeError(
355
+ "For 'get_world_size', the argument 'group' must be type of string, "
356
+ "but got 'group' type : {}.".format(type(group))
357
+ )
249
358
  try:
250
359
  ret = _get_size_helper(group)
251
360
  except RuntimeError as e:
252
361
  logger.warning(e)
253
362
  ret = -1
254
363
  return ret
364
+
365
+
366
+ def new_group(ranks=None,
367
+ timeout=None,
368
+ backend=None,
369
+ pg_options=None,
370
+ use_local_synchronization=False,
371
+ group_desc=None):
372
+ """
373
+ Create a new distributed group.
374
+
375
+ Note:
376
+ This method should be used after :func:`mindspore.mint.distributed.init_process_group`.
377
+
378
+ Args:
379
+ ranks (list[int], optional): List of ranks of group members. If ``None``,
380
+ the world group will be used. Default is ``None``.
381
+ timeout (int, invalid): Currently it is a reserved parameter.
382
+ backend (str, invalid): Support backend Library, Currently support ``"hccl"`` and ``"mccl"``.
383
+ when backend is ``"hccl"`` will use Huawei Collective Communication Library(HCCL).
384
+ when backend is ``"mccl"`` will use MindSpore Collective Communication Library(MCCL).
385
+ If ``None``, which means ``"hccl"`` in Ascend. Default is ``None``.
386
+ pg_options (str, invalid): Currently it is a reserved parameter.
387
+ use_local_synchronization (bool, invalid): Currently it is a reserved parameter.
388
+ group_desc (str, invalid): Currently it is a reserved parameter.
389
+
390
+ Returns:
391
+ A string with group name. Return "" in the abnormal scenarios.
392
+
393
+ Raises:
394
+ TypeError: If list ranks in Group has duplicate rank id.
395
+
396
+ Supported Platforms:
397
+ ``Ascend`` ``CPU``
398
+
399
+ Examples:
400
+ .. note::
401
+ Before running the following examples, you need to configure the communication environment variables.
402
+ For Ascend devices, it is recommended to use the msrun startup method
403
+ without any third-party or configuration file dependencies.
404
+ Please see the `msrun start up
405
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
406
+ for more details.
407
+
408
+ >>> import mindspore as ms
409
+ >>> from mindspore.mint.distributed import init_process_group, new_group
410
+ >>> ms.set_device(device_target="Ascend")
411
+ >>> init_process_group()
412
+ >>> group = new_group()
413
+ >>> print("group is: ", group)
414
+ group is: hccl_world_group
415
+ """
416
+ if ranks is not None:
417
+ if not isinstance(ranks, list):
418
+ raise TypeError("ranks must be list, but got {}".format(type(ranks)))
419
+ ranks = sorted(ranks)
420
+ else:
421
+ return GlobalComm.WORLD_COMM_GROUP
422
+ if backend is None:
423
+ backend = "hccl"
424
+ if not isinstance(backend, str) or backend not in ("hccl", "mccl"):
425
+ raise TypeError(f"the input backend must be hccl or mccl, but got {backend}")
426
+ group = backend + "_" + str(len(ranks)) + "_" + hashlib.sha1(bytes("_".join(map(str, ranks)), "utf-8")).hexdigest()
427
+ try:
428
+ create_group(group, ranks)
429
+ except RuntimeError as e:
430
+ logger.warning(e)
431
+ group = ""
432
+ return group
433
+
434
+
435
+ def get_backend(group=None):
436
+ """
437
+ Get the backend of communication process groups.
438
+
439
+ Note:
440
+ Only one communication backend is supported by MindSpore for each process.
441
+ It should be one of `hccl`/`nccl`/`mccl`. Currently only hccl and mccl are supported.
442
+
443
+ Args:
444
+ group (str, optional): The communication group to work on.
445
+ Normally, the group should be created by :func:`mindspore.mint.distributed.new_group`. If ``None``,
446
+ which means ``"hccl_world_group"`` in Ascend. Default: ``None``.
447
+
448
+ Returns:
449
+ string, the backend of the group.
450
+
451
+ Raises:
452
+ TypeError: If the `group` is not a str.
453
+
454
+ Supported Platforms:
455
+ ``Ascend`` ``CPU``
456
+
457
+ Examples:
458
+ .. note::
459
+ Before running the following examples, you need to configure the communication environment variables.
460
+ For Ascend devices, it is recommended to use the msrun startup method
461
+ without any third-party or configuration file dependencies.
462
+ Please see the `msrun start up
463
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
464
+ for more details.
465
+
466
+ >>> import mindspore as ms
467
+ >>> from mindspore.mint.distributed import init_process_group, get_backend
468
+ >>> ms.set_device(device_target="Ascend")
469
+ >>> init_process_group()
470
+ >>> backend = get_backend()
471
+ >>> print("backend is: ", backend)
472
+ backend is: hccl
473
+ """
474
+ if group is None:
475
+ return BACKEND_HCCL
476
+ if not isinstance(group, str):
477
+ raise TypeError(
478
+ "For 'get_backend', the argument 'group' must be type of string or None, "
479
+ "but got 'group' type : {}.".format(type(group))
480
+ )
481
+ if BACKEND_HCCL in group:
482
+ return BACKEND_HCCL
483
+ if BACKEND_MCCL in group:
484
+ return BACKEND_MCCL
485
+ return _get_backend()
486
+
487
+
488
+ def get_global_rank(group, group_rank):
489
+ """
490
+ A function that returns the rank id in the world group corresponding to the
491
+ rank which id is 'group_rank' in the user group.
492
+
493
+ Note:
494
+ This method should be used after :func:`mindspore.mint.distributed.init_process_group`.
495
+
496
+ Args:
497
+ group (str): The communication group to work on. Normally, the group should
498
+ be created by :func:`mindspore.mint.distributed.new_group`. If ``None``, which
499
+ means ``"hccl_world_group"`` in Ascend.
500
+ group_rank (int): Group rank to query.
501
+
502
+ Returns:
503
+ An integer scalar with the rank id in the world group.
504
+
505
+ Raises:
506
+ TypeError: If the `group` is not a str.
507
+ TypeError: If the `group_rank` is not an integer.
508
+ RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
509
+
510
+ Supported Platforms:
511
+ ``Ascend``
512
+
513
+ Examples:
514
+ .. note::
515
+ Before running the following examples, you need to configure the communication environment variables.
516
+
517
+ For Ascend devices, it is recommended to use the msrun startup method
518
+ without any third-party or configuration file dependencies.
519
+
520
+ Please see the `msrun start up
521
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
522
+ for more details.
523
+
524
+ This example should be run with 8 devices.
525
+
526
+ >>> import mindspore as ms
527
+ >>> from mindspore.mint.distributed import init_process_group, get_global_rank, new_group, get_rank
528
+ >>> ms.set_device(device_target="Ascend")
529
+ >>> # Launch 8 processes.
530
+ >>> init_process_group()
531
+ >>> rank_ids = [0,4]
532
+ >>> if get_rank() in rank_ids:
533
+ ... group = new_group(rank_ids)
534
+ ... world_rank_id = get_global_rank(group, 1)
535
+ ... print("world_rank_id is: ", world_rank_id)
536
+ #rank 0 and 4:
537
+ world_rank_id is: 4
538
+ """
539
+ if not isinstance(group_rank, int):
540
+ raise TypeError(
541
+ f"The group_rank argument must be integer, but got {type(group_rank)}."
542
+ )
543
+
544
+ if group is None or group is GlobalComm.WORLD_COMM_GROUP:
545
+ return group_rank
546
+
547
+ if not isinstance(group, str):
548
+ raise TypeError(
549
+ "For 'get_global_rank', the argument 'group' must be type of string or None, "
550
+ "but got 'group' type : {}.".format(type(group))
551
+ )
552
+ return get_world_rank_from_group_rank(group, group_rank)
553
+
554
+
555
+ def get_group_rank(group, global_rank):
556
+ """
557
+ Get the rank ID in the specified user communication group corresponding to
558
+ the rank ID in the world communication group.
559
+
560
+ Note:
561
+ This method should be used after :func:`mindspore.mint.distributed.init_process_group`.
562
+
563
+ Args:
564
+ group (str): The communication group to work on. Normally, the group should be
565
+ created by :func:`mindspore.mint.distributed.new_group`. If ``None``, which means
566
+ ``"hccl_world_group"`` in Ascend.
567
+ global_rank (int): A rank ID in the world communication group.
568
+
569
+ Returns:
570
+ int, the rank ID in the user communication group.
571
+
572
+ Raises:
573
+ TypeError: If global_rank is not an integer or the group is not a string.
574
+ RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
575
+
576
+ Supported Platforms:
577
+ ``Ascend``
578
+
579
+ Examples:
580
+ .. note::
581
+ Before running the following examples, you need to configure the communication environment variables.
582
+
583
+ For Ascend devices, it is recommended to use the msrun startup method
584
+ without any third-party or configuration file dependencies.
585
+ Please see the `msrun start up
586
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
587
+ for more details.
588
+
589
+ This example should be run with 8 devices.
590
+
591
+ >>> import mindspore as ms
592
+ >>> from mindspore.mint.distributed import init_process_group, new_group, get_group_rank, get_rank
593
+ >>> ms.set_device(device_target="Ascend")
594
+ >>> # Launch 8 processes.
595
+ >>> init_process_group()
596
+ >>> rank_ids = [0,4]
597
+ >>> if get_rank() in rank_ids:
598
+ ... group = new_group(rank_ids)
599
+ ... group_rank_id = get_group_rank(group, 4)
600
+ ... print("group_rank_id is: ", group_rank_id)
601
+ #rank 0 and 4:
602
+ group_rank_id is: 1
603
+ """
604
+ if not isinstance(global_rank, int):
605
+ raise TypeError(
606
+ f"The global_rank argument must be integer, but got {type(global_rank)}."
607
+ )
608
+ if group is None:
609
+ group = GlobalComm.WORLD_COMM_GROUP
610
+ if not isinstance(group, str):
611
+ raise TypeError(
612
+ "For 'get_group_rank_from_world_rank', the argument 'group' must be type of string, "
613
+ "but got 'group' type : {}.".format(type(group))
614
+ )
615
+ return _get_group_rank_from_world_rank_from_cache_helper(
616
+ world_rank_id=global_rank, group=group
617
+ )
618
+
619
+
620
+ def get_process_group_ranks(group=None):
621
+ """
622
+ Gets the ranks of the specific group and returns the process ranks in the communication group as a list.
623
+
624
+ Args:
625
+ group (str, optional): The communication group to work on. Normally, the group should be created by
626
+ :func:`mindspore.mint.distributed.new_group`. If ``None``, which means ``"hccl_world_group"`` in Ascend.
627
+ Default: ``None``.
628
+
629
+ Returns:
630
+ List (List[int]), List of process ranks in the specified communication group.
631
+
632
+ Raises:
633
+ TypeError: If the `group` is not a str.
634
+ RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
635
+
636
+ Supported Platforms:
637
+ ``Ascend`` ``CPU``
638
+
639
+ Examples:
640
+ .. note::
641
+ Before running the following examples, you need to configure the communication environment variables.
642
+
643
+ For Ascend devices, it is recommended to use the msrun startup method
644
+ without any third-party or configuration file dependencies.
645
+
646
+ Please see the `msrun start up
647
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
648
+ for more details.
649
+
650
+ This example should be run with 4 devices.
651
+
652
+ >>> import mindspore as ms
653
+ >>> from mindspore.mint.distributed import init_process_group, get_process_group_ranks
654
+ >>> # Launch 4 processes.
655
+ >>> ms.set_device(device_target="Ascend")
656
+ >>> init_process_group()
657
+ >>> output = get_process_group_ranks()
658
+ >>> print(output)
659
+ [0, 1, 2, 3]
660
+
661
+ """
662
+ if group is None:
663
+ group = GlobalComm.WORLD_COMM_GROUP
664
+
665
+ if not isinstance(group, str):
666
+ raise TypeError(
667
+ "For 'get_process_group_ranks', the argument 'group' must be type of string or None, "
668
+ "but got 'group' type : {}.".format(type(group))
669
+ )
670
+ return _get_group_ranks(group)
671
+
672
+
673
+ @_primexpr
674
+ def _check_all_tensor_same_dtype_and_shape(*tensor_lists):
675
+ """check all the input tensor has same dtype and shape"""
676
+ consistent_dtype = None
677
+ consistent_shape = None
678
+ for list_ in tensor_lists:
679
+ if not isinstance(list_, (list, tuple)):
680
+ list_ = [list_]
681
+ for tensor_ in list_:
682
+ if not isinstance(tensor_, Tensor):
683
+ continue
684
+ dtype = tensor_.dtype
685
+ shape = tensor_.shape
686
+ if consistent_dtype is None:
687
+ consistent_dtype = dtype
688
+ consistent_shape = shape
689
+ else:
690
+ if dtype != consistent_dtype:
691
+ raise TypeError(
692
+ "tensor_lists dtype must be the same, "
693
+ f"but got {consistent_dtype} and {dtype}."
694
+ )
695
+ if shape != consistent_shape:
696
+ raise TypeError(
697
+ "tensor_lists shape must be the same, "
698
+ f"but got {consistent_shape} and {shape}."
699
+ )
700
+
701
+
702
+ def all_reduce(tensor, op=ReduceOp.SUM, group=None, async_op=False):
703
+ """
704
+ Reduce tensors across all devices in such a way that all devices will get the same final result,
705
+ returns the tensor which is all reduced.
706
+
707
+ Note:
708
+ The tensors must have the same shape and format in all processes of the collection.
709
+
710
+ Args:
711
+ tensor (Tensor): The input and output tensor of collective. The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
712
+ The function operates in-place.
713
+ op (str, optional): Specifies an operation used for element-wise reductions, like sum, prod, max, and min.
714
+ Default: ``ReduceOp.SUM`` .
715
+ group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
716
+ Ascend. Default: ``None``.
717
+ async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
718
+
719
+ Returns:
720
+ CommHandle, CommHandle is an async work handle, if `async_op` is set to True. CommHandle will be None,
721
+ when `async_op` is False.
722
+
723
+ Raises:
724
+ TypeError: If the type of the first input parameter is not Tensor, or any of `op` and `group` is not a str,
725
+ `op` range is illegal or async_op is not bool.
726
+ RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
727
+
728
+ Supported Platforms:
729
+ ``Ascend``
730
+
731
+ Examples:
732
+ .. note::
733
+ Before running the following examples, you need to configure the communication environment variables.
734
+
735
+ For Ascend devices, it is recommended to use the msrun startup method
736
+ without any third-party or configuration file dependencies.
737
+ Please see the `msrun start up
738
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
739
+ for more details.
740
+
741
+ This example should be run with 2 devices.
742
+
743
+ >>> import numpy as np
744
+ >>> from mindspore.mint.distributed import init_process_group
745
+ >>> from mindspore.mint.distributed import all_reduce
746
+ >>> from mindspore import Tensor
747
+ >>>
748
+ >>> init_process_group()
749
+ >>> tensor = Tensor(np.ones([2, 8]).astype(np.float32))
750
+ >>> output = all_reduce(tensor)
751
+ >>> print(tensor)
752
+ [[2. 2. 2. 2. 2. 2. 2. 2.]
753
+ [2. 2. 2. 2. 2. 2. 2. 2.]]
754
+
755
+ """
756
+ if not isinstance(tensor, (Tensor, Tensor_)):
757
+ raise TypeError("For all_reduce, the input tensor must be tensor")
758
+ if not isinstance(op, str):
759
+ raise TypeError("For all_reduce, the input op type must be str")
760
+ if op not in ("sum", "prod", "min", "max"):
761
+ raise TypeError(
762
+ "For all_reduce, the input op value must be one of sum, prod, min, max"
763
+ )
764
+
765
+ if group is None:
766
+ group = GlobalComm.WORLD_COMM_GROUP
767
+
768
+ if not isinstance(group, str):
769
+ raise TypeError(
770
+ "The argument 'group' must be type of string, "
771
+ "but got 'group' type : {}.".format(type(group))
772
+ )
773
+ if not isinstance(async_op, bool):
774
+ raise TypeError(
775
+ f"The argument 'async_op' must be a bool, but got {type(async_op)}."
776
+ )
777
+
778
+ output = dist_comm_all_reduce_op(tensor, op, group)
779
+ _, handle = _deal_comm_outputs(output, async_op)
780
+ return handle
781
+
782
+
783
+ def all_gather_into_tensor(output_tensor, input_tensor, group=None, async_op=False):
784
+ """
785
+ Gathers tensors from the specified communication group and returns the tensor which is all gathered.
786
+
787
+ Note:
788
+ The tensors must have the same shape and format in all processes of the collection.
789
+
790
+ Args:
791
+ output_tensor (Tensor): The output tensor to be all gathered into tensor. If the number of devices
792
+ in the group is N, then the shape of output tensor is :math:`(N*x_1, x_2, ..., x_R)`.
793
+ input_tensor (Tensor): The input tensor to be all gathered into tensor.
794
+ The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
795
+ group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
796
+ Ascend. Default: ``None``.
797
+ async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
798
+
799
+ Returns:
800
+ CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
801
+ CommHandle will be None, when `async_op` is False.
802
+
803
+ Raises:
804
+ TypeError: If the type of the input_tensor or output_tensor parameter is not Tensor,
805
+ `group` is not a str, or async_op is not bool.
806
+ RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
807
+
808
+ Supported Platforms:
809
+ ``Ascend``
810
+
811
+ Examples:
812
+ .. note::
813
+ Before running the following examples, you need to configure the communication environment variables.
814
+
815
+ For Ascend devices, it is recommended to use the msrun startup method
816
+ without any third-party or configuration file dependencies.
817
+ Please see the `msrun start up
818
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
819
+ for more details.
820
+
821
+ This example should be run with 2 devices.
822
+
823
+ >>> import numpy as np
824
+ >>> import mindspore as ms
825
+ >>> from mindspore import ops
826
+ >>> from mindspore.mint.distributed import init_process_group
827
+ >>> from mindspore.mint.distributed import all_gather_into_tensor
828
+ >>> from mindspore import Tensor
829
+ >>>
830
+ >>> ms.set_device(device_target="Ascend")
831
+ >>> init_process_group()
832
+ >>> input_tensor = Tensor(np.ones([2, 8]).astype(np.float32))
833
+ >>> out_tensor = Tensor(np.zeros([4, 8]).astype(np.float32))
834
+ >>> output = all_gather_into_tensor(out_tensor, input_tensor)
835
+ >>> print(out_tensor)
836
+ [[1. 1. 1. 1. 1. 1. 1. 1.]
837
+ [1. 1. 1. 1. 1. 1. 1. 1.]
838
+ [1. 1. 1. 1. 1. 1. 1. 1.]
839
+ [1. 1. 1. 1. 1. 1. 1. 1.]]
840
+
841
+ """
842
+
843
+ if not isinstance(input_tensor, (Tensor, Tensor_)):
844
+ raise TypeError("For all_gather_into_tensor, the input tensor must be tensor")
845
+ if not isinstance(output_tensor, (Tensor, Tensor_)):
846
+ raise TypeError("For all_gather_into_tensor, the output tensor must be tensor")
847
+ if group is None:
848
+ group = GlobalComm.WORLD_COMM_GROUP
849
+ if not isinstance(group, str):
850
+ raise TypeError(
851
+ "The argument 'group' must be type of string, "
852
+ "but got 'group' type : {}.".format(type(group))
853
+ )
854
+ if not isinstance(async_op, bool):
855
+ raise TypeError(
856
+ f"The argument 'async_op' must be a bool, but got {type(async_op)}."
857
+ )
858
+ group_size = get_group_size(group)
859
+ result = dist_comm_all_gather_into_tensor_op(
860
+ output_tensor, input_tensor, group_size, group
861
+ )
862
+ _, handle = _deal_comm_outputs(result, async_op)
863
+ return handle
864
+
865
+
866
+ def reduce_scatter_tensor(output, input, op=ReduceOp.SUM, group=None, async_op=False):
867
+ r"""
868
+ Reduces and scatters tensors from the specified communication group and
869
+ returns the tensor which is reduced and scattered.
870
+
871
+ Note:
872
+ The tensors must have the same shape and format in all processes of the collection.
873
+
874
+ Args:
875
+ output(Tensor): the output tensor has the same dtype as `input` with a shape of :math:`(N/rank\_size, *)`.
876
+ input(Tensor): The input tensor to be reduced and scattered, suppose it has a shape :math:`(N, *)`, where `*`
877
+ means any number of additional dimensions. N must be divisible by rank_size.
878
+ rank_size refers to the number of cards in the communication group.
879
+ op (str, optional): Specifies an operation used for element-wise reductions,
880
+ like SUM and MAX. Default: ``ReduceOp.SUM`` .
881
+ group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
882
+ Ascend. Default: ``None``.
883
+ async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
884
+
885
+ Returns:
886
+ CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
887
+ CommHandle will be None, when `async_op` is False.
888
+
889
+ Raises:
890
+ TypeError: If the type of the input and output parameter is not Tensor, any of `op` and `group` is not a str.
891
+ async_op is not bool or 'op' is invalid.
892
+ ValueError: If the first dimension of the input cannot be divided by the rank_size.
893
+ RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
894
+
895
+ Supported Platforms:
896
+ ``Ascend``
897
+
898
+ Examples:
899
+ .. note::
900
+ Before running the following examples, you need to configure the communication environment variables.
901
+
902
+ For Ascend devices, it is recommended to use the msrun startup method
903
+ without any third-party or configuration file dependencies.
904
+ Please see the `msrun start up
905
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
906
+ for more details.
907
+
908
+ This example should be run with 2 devices.
909
+
910
+ >>> import mindspore as ms
911
+ >>> from mindspore import Tensor
912
+ >>> from mindspore.mint.distributed import init_process_group
913
+ >>> from mindspore.mint.distributed import reduce_scatter_tensor
914
+ >>> import numpy as np
915
+ >>>
916
+ >>> ms.set_device(device_target="Ascend")
917
+ >>> init_process_group()
918
+ >>> input_tensor = Tensor(np.ones([8, 8]).astype(np.float32))
919
+ >>> output_tensor = Tensor(np.ones([4, 8]).astype(np.float32))
920
+ >>> output = reduce_scatter_tensor(output_tensor ,input_tensor)
921
+ >>> print(output_tensor)
922
+ [[2. 2. 2. 2. 2. 2. 2. 2.]
923
+ [2. 2. 2. 2. 2. 2. 2. 2.]
924
+ [2. 2. 2. 2. 2. 2. 2. 2.]
925
+ [2. 2. 2. 2. 2. 2. 2. 2.]]
926
+
927
+ """
928
+
929
+ if not isinstance(input, (Tensor, Tensor_)):
930
+ raise TypeError("For reduce_scatter_tensor, the input tensor must be tensor")
931
+ if not isinstance(output, (Tensor, Tensor_)):
932
+ raise TypeError("For reduce_scatter_tensor, the output tensor must be tensor")
933
+ if not isinstance(op, str):
934
+ raise TypeError("For reduce_scatter_tensor, the input op type must be str")
935
+ if op not in ("sum", "prod", "min", "max"):
936
+ raise TypeError(
937
+ "For reduce_scatter_tensor, the input op value must be one of sum, prod, min, max"
938
+ )
939
+ if group is None:
940
+ group = GlobalComm.WORLD_COMM_GROUP
941
+ if not isinstance(group, str):
942
+ raise TypeError(
943
+ "The argument 'group' must be type of string, "
944
+ "but got 'group' type : {}.".format(type(group))
945
+ )
946
+ if not isinstance(async_op, bool):
947
+ raise TypeError(
948
+ f"The argument 'async_op' must be a bool, but got {type(async_op)}."
949
+ )
950
+ rank_size = get_group_size(group)
951
+ result = dist_comm_reduce_scatter_tensor_op(output, input, rank_size, op, group)
952
+ _, handle = _deal_comm_outputs(result, async_op)
953
+ return handle
954
+
955
+
956
+ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, async_op=False):
957
+ """
958
+ Reduces tensors across the processes in the specified communication group, sends the result
959
+ to the target dst(global rank), and returns the tensor which is sent to the target process.
960
+
961
+ Note:
962
+ - Only process with destination rank receives the reduced output.
963
+ - Only support PyNative mode, Graph mode is not currently supported.
964
+ - Other processes only get a tensor with shape [1], which has no mathematical meaning.
965
+
966
+ Args:
967
+ tensor (Tensor): Input and output of the collective. The function operates in-place.
968
+ dst (int): The target rank of the process(global rank) that receives the reduced output.
969
+ op (str, optional): Specifies an operation used for element-wise reductions, like sum, prod, max, and min.
970
+ Default: ``ReduceOp.SUM`` .
971
+ group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
972
+ Ascend. Default: ``None``.
973
+ async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
974
+
975
+ Returns:
976
+ CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
977
+ CommHandle will be None, when `async_op` is False.
978
+
979
+ Raises:
980
+ TypeError: If the type of `tensor` is not Tensor, any of `op` and `group` is not a str.
981
+ async_op is not bool or 'op' is invalid.
982
+ RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
983
+
984
+ Supported Platforms:
985
+ ``Ascend``
986
+
987
+ Examples:
988
+ .. note::
989
+ Before running the following examples, you need to configure the communication environment variables.
990
+
991
+ For Ascend devices, it is recommended to use the msrun startup method
992
+ without any third-party or configuration file dependencies.
993
+
994
+ Please see the `msrun start up
995
+ <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
996
+ for more details.
997
+
998
+ This example should be run with 2 devices.
999
+
1000
+ >>> from mindspore import ops
1001
+ >>> import mindspore.nn as nn
1002
+ >>> from mindspore.mint.distributed import init_process_group, reduce
1003
+ >>> from mindspore import Tensor
1004
+ >>> import numpy as np
1005
+ >>> # Launch 2 processes.
1006
+ >>> init_process_group()
1007
+ >>> dest_rank=1
1008
+ >>> input_tensor = Tensor(np.ones([2, 8]).astype(np.float32))
1009
+ >>> output = reduce(input_tensor, dest_rank)
1010
+ >>> print(input_tensor)
1011
+ Process with rank 0: [[1. 1. 1. 1. 1. 1. 1. 1.]
1012
+ [1. 1. 1. 1. 1. 1. 1. 1.]],
1013
+ Process with rank 1: [[2. 2. 2. 2. 2. 2. 2. 2.]
1014
+ [2. 2. 2. 2. 2. 2. 2. 2.]],
1015
+ """
1016
+
1017
+ if not isinstance(tensor, (Tensor, Tensor_)):
1018
+ raise TypeError("For reduce, the input tensor must be tensor")
1019
+ if not isinstance(dst, int):
1020
+ raise TypeError("For reduce, the dst must be int")
1021
+ if not isinstance(op, str):
1022
+ raise TypeError("For reduce, the input op type must be str")
1023
+ if op not in ("sum", "prod", "min", "max"):
1024
+ raise TypeError(
1025
+ "For reduce, the input op value must be one of sum, prod, min, max"
1026
+ )
1027
+ if group is None:
1028
+ group = GlobalComm.WORLD_COMM_GROUP
1029
+ if not isinstance(group, str):
1030
+ raise TypeError(
1031
+ "The argument 'group' must be type of string, "
1032
+ "but got 'group' type : {}.".format(type(group))
1033
+ )
1034
+ if not isinstance(async_op, bool):
1035
+ raise TypeError(
1036
+ f"The argument 'async_op' must be a bool, but got {type(async_op)}."
1037
+ )
1038
+ result = dist_comm_reduce_op(tensor, op, dst, group)
1039
+ _, handle = _deal_comm_outputs(result, async_op)
1040
+ return handle
1041
+
1042
+
1043
class P2POp:
    """
    Object for `batch_isend_irecv` input, to store information of ``"isend"`` and ``"irecv"``.

    Note:
        `tensor` will be modified in-place by final result when `op` is ``"irecv"``.

    Args:
        op (Union[str, function]): Either the string ``"isend"`` / ``"irecv"``, or the function
            ``distributed.isend`` / ``distributed.irecv``.
        tensor (Tensor): Tensor for sending/receiving.
        peer (int): Remote global rank for send/receive.
        group (str, optional): The communication group to work on. If ``None``, which means
            ``"hccl_world_group"`` in Ascend. Default: ``None``.
        tag (int, optional): Currently not supported yet. Default: ``0``.

    Returns:
        P2POp Object.

    Raises:
        TypeError: when `op` is not string or function of 'isend' and 'irecv'.
        TypeError: when `tensor` is not type of Tensor or 'peer' is not int.
        NotImplementedError: when `tag` is not 0.

    Supported Platforms:
        ``Ascend``

    Examples:
        >>> import numpy as np
        >>> import mindspore
        >>> from mindspore.mint.distributed import P2POp, isend, irecv
        >>> from mindspore import Tensor
        >>> # Launch 2 processes.
        >>> send_tensor = Tensor(1.)
        >>> send_op = P2POp('isend', send_tensor, 1)
        >>> send_op = P2POp(isend, send_tensor, 1)
        >>> recv_tensor = Tensor(0.)
        >>> recv_op = P2POp('irecv', recv_tensor, 0)
        >>> recv_op = P2POp(irecv, recv_tensor, 0)
    """

    def __init__(self, op, tensor, peer, group=None, tag=0):
        # No validation here: __new__ runs first and has already rejected bad arguments.
        self.op = op
        self.tensor = tensor
        self.peer = peer
        self.group = group
        self.tag = tag

    def __new__(cls, op, tensor, peer, group=None, tag=0):
        if isinstance(op, str):
            if op not in ("isend", "irecv"):
                raise TypeError(
                    f"Expected op to be of type isend or irecv, but got {op}"
                )
        # Identity comparison instead of `op not in [isend, irecv]`: list membership falls back
        # to `==`, which array-like objects may overload and turn into a non-TypeError failure.
        elif op is not isend and op is not irecv:
            raise TypeError(
                f"Expected op to be of type isend or irecv, but got {op}"
            )

        if not isinstance(tensor, (Tensor, Tensor_)):
            raise TypeError(
                f"Expected tensor to be Tensor, but got {type(tensor)}."
            )
        if not isinstance(peer, int):
            raise TypeError("For P2POp, the peer must be int")
        if tag != 0:
            raise NotImplementedError("tag is not support yet.")
        return object.__new__(cls)
1114
+
1115
+
1116
# Integer op-type codes that `batch_isend_irecv` appends to its `op_types` list
# for each "isend" / "irecv" entry.
TYPE_ISEND = 0
TYPE_IRECV = 1
1118
+
1119
+
1120
def batch_isend_irecv(p2p_op_list):
    """
    Batch send and recv tensors asynchronously.

    Note:
        - The 'isend' and 'irecv' of `P2POp` in `p2p_op_list` between ranks need to match each other.
        - `P2POp` in `p2p_op_list` can only use the same communication group.
        - `tag` of `P2POp` in `p2p_op_list` is not supported yet.
        - `tensor` of `P2POp` in `p2p_op_list` will not be modified by result inplace.
        - Only support PyNative mode, Graph mode is not currently supported.

    Args:
        p2p_op_list (list[P2POp]): list contains `P2POp`. `P2POp` is type of
            :class:`mindspore.mint.distributed.P2POp`.

    Returns:
        list[CommHandle], CommHandle is an async work handle. Currently only one packaging handle is supported.

    Raises:
        TypeError: If `p2p_op_list` is empty or `p2p_op_list` are not all type of `P2POp`.
        TypeError: The group name in `p2p_op_list` are not consistent.
        TypeError: The `tensor` in `p2p_op_list` are not Tensor.
        TypeError: The `op` in `p2p_op_list` are not isend or irecv.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> import numpy as np
        >>> import mindspore
        >>> from mindspore.mint.distributed import init_process_group, get_rank, get_world_size
        >>> from mindspore.mint.distributed import batch_isend_irecv, P2POp
        >>> from mindspore import Tensor
        >>>
        >>> init_process_group()
        >>> this_rank = get_rank()
        >>> world_size = get_world_size()
        >>> next_rank = (this_rank + 1) % world_size
        >>> prev_rank = (this_rank + world_size - 1) % world_size
        >>>
        >>> send_tensor = Tensor(this_rank + 1, dtype=mindspore.float32)
        >>> recv_tensor = Tensor(0., dtype=mindspore.float32)
        >>>
        >>> send_op = P2POp('isend', send_tensor, next_rank)
        >>> recv_op = P2POp('irecv', recv_tensor, prev_rank)
        >>>
        >>> p2p_op_list = [send_op, recv_op]
        >>> output = batch_isend_irecv(p2p_op_list)
        >>> print(recv_tensor)
        rank 0:
        2.0
        rank 1:
        1.0
    """
    # NOTE(review): the docstring note says tensors are not modified in place, yet the example
    # prints `recv_tensor` holding received data and `P2POp` documents an in-place fill for
    # "irecv" -- confirm which statement is correct and align the note.
    if not p2p_op_list:
        raise TypeError("p2p_op_list can not be empty list.")
    # Check every element's type first so the attribute accesses below are safe.
    for p2p_op in p2p_op_list:
        if not isinstance(p2p_op, P2POp):
            raise TypeError("The elements in p2p_op_list must be type of P2POp.")
    group = p2p_op_list[0].group

    tensors = []
    op_types = []
    remotes_ranks = []
    for p2p_op in p2p_op_list:
        if group != p2p_op.group:
            raise TypeError("The group name in p2p_op_list must be consistent.")
        # `op` is either the op name itself or the isend/irecv function.
        op_name = p2p_op.op if isinstance(p2p_op.op, str) else p2p_op.op.__name__
        # `peer` is a global rank; translate it only when an explicit group is given.
        if p2p_op.group is None:
            remote_rank = p2p_op.peer
        else:
            remote_rank = get_group_rank_from_world_rank(p2p_op.peer, p2p_op.group)
        remotes_ranks.append(remote_rank)

        if op_name == "isend":
            op_types.append(TYPE_ISEND)
        elif op_name == "irecv":
            if not isinstance(p2p_op.tensor, Tensor):
                raise TypeError("p2p_op.tensor must be tensor")
            op_types.append(TYPE_IRECV)
        else:
            raise TypeError("p2p_op.op must be isend or irecv")
        tensors.append(p2p_op.tensor)

    if group is None:
        group = GlobalComm.WORLD_COMM_GROUP
    output = dist_comm_batch_isend_irecv_op(tensors, group, op_types, remotes_ranks)
    _, handle = _deal_comm_outputs(output, True)
    return [handle]
1227
+
1228
+
1229
def scatter_tensor(output_tensor, input_tensor, src=0, group=None, async_op=False):
    r"""
    Scatter tensor evenly across the processes in the specified communication group.

    Note:
        - The interface only supports Tensor input and even scattering, which
          is different from PyTorch's `torch.distributed.scatter`.
        - Only the tensor in process `src` (global rank) will do scatter.
        - Only support PyNative mode, Graph mode is not currently supported.

    Args:
        output_tensor (Tensor): Output tensor. It should have the same size across all ranks.
        input_tensor (Tensor): The input tensor to be scattered. The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
        src (int, optional): Specifies the rank (global rank) of the process that sends the tensor.
            Only process `src` will send the tensor. Default: ``0``.
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

    Returns:
        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
        CommHandle will be None, when `async_op` is False.

    Raises:
        TypeError: If `input_tensor` or `output_tensor` is not a Tensor, `src` is not an int,
            `group` is not a str or `async_op` is not a bool.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> import mindspore as ms
        >>> from mindspore.mint.distributed import init_process_group
        >>> from mindspore.communication.comm_func import scatter_tensor
        >>> import numpy as np
        >>> # Launch 2 processes.
        >>>
        >>> init_process_group()
        >>> input = ms.Tensor(np.arange(8).reshape([4, 2]).astype(np.float32))
        >>> output = ms.Tensor(np.zeros([2, 2]).astype(np.float32))
        >>> out = scatter_tensor(output, input, src=0)
        >>> print(output)
        # rank_0
        [[0. 1.]
         [2. 3.]]
        # rank_1
        [[4. 5.]
         [6. 7.]]
    """
    # Input is validated before output, matching the order callers see errors in.
    for candidate, message in (
        (input_tensor, "For scatter_tensor, the input tensor must be tensor"),
        (output_tensor, "For scatter_tensor, the output tensor must be tensor"),
    ):
        if not isinstance(candidate, (Tensor, Tensor_)):
            raise TypeError(message)
    if not isinstance(src, int):
        raise TypeError("For scatter_tensor, the src must be int")

    comm_group = GlobalComm.WORLD_COMM_GROUP if group is None else group
    if not isinstance(comm_group, str):
        raise TypeError(
            f"The argument 'group' must be type of string, but got 'group' type : {type(comm_group)}."
        )
    if not isinstance(async_op, bool):
        raise TypeError(
            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
        )

    # `src` arrives as a global rank and is converted to a rank inside the group.
    src_in_group = get_group_rank_from_world_rank(src, comm_group)
    rank_size = get_group_size(comm_group)
    rank_id = get_rank(comm_group)
    _, handle = _deal_comm_outputs(
        dist_comm_scatter_tensor_op(
            output_tensor, input_tensor, rank_size, src_in_group, rank_id, comm_group
        ),
        async_op,
    )
    return handle
1314
+
1315
+
1316
def gather_into_tensor(output_tensor, input_tensor, dst=0, group=None, async_op=False):
    r"""
    Gathers tensors from the specified communication group. The operation will gather the tensor
    from processes according to dimension 0.

    Note:
        - Only the tensor in process `dst` (global rank) will keep the gathered tensor. The other process
          will keep a tensor with shape [1], which has no mathematical meaning.
        - Only support PyNative mode, Graph mode is not currently supported.

    Args:
        output_tensor (Tensor): Output tensor to accommodate tensor elements from all ranks.
        input_tensor (Tensor): The tensor to be gathered. The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
            The input tensors in this API must have the same size across all ranks.
        dst (int, optional): Specifies the rank (global rank) of the process that receives the tensor.
            Only process `dst` will receive the gathered tensor. Default: ``0``.
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

    Returns:
        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
        CommHandle will be None, when `async_op` is False.

    Raises:
        TypeError: If `input_tensor` or `output_tensor` is not a Tensor, `dst` is not an int,
            `group` is not a str or `async_op` is not a bool.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> import numpy as np
        >>> import mindspore as ms
        >>> import mindspore.nn as nn
        >>> from mindspore.mint.distributed import init_process_group
        >>> from mindspore import Tensor
        >>> from mindspore.communication.comm_func import gather_into_tensor
        >>> # Launch 2 processes.
        >>>
        >>> init_process_group()
        >>> input = Tensor(np.arange(4).reshape([2, 2]).astype(np.float32))
        >>> output = Tensor(np.zeros([4, 2]).astype(np.float32))
        >>> handle = gather_into_tensor(output, input, dst=0)
        >>> print(output)
        Process with rank 0: [[0. 1.],
                              [2. 3.],
                              [0. 1.],
                              [2. 3.]]
        Process with rank 1: [[0. 0.],
                              [0. 0.],
                              [0. 0.],
                              [0. 0.]]
    """
    # Input is validated before output, matching the order callers see errors in.
    for candidate, message in (
        (input_tensor, "For gather_into_tensor, the input tensor must be tensor"),
        (output_tensor, "For gather_into_tensor, the output tensor must be tensor"),
    ):
        if not isinstance(candidate, (Tensor, Tensor_)):
            raise TypeError(message)
    if not isinstance(dst, int):
        raise TypeError("For gather_into_tensor, the dst must be int")

    comm_group = GlobalComm.WORLD_COMM_GROUP if group is None else group
    if not isinstance(comm_group, str):
        raise TypeError(
            f"The argument 'group' must be type of string, but got 'group' type : {type(comm_group)}."
        )
    if not isinstance(async_op, bool):
        raise TypeError(
            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
        )

    group_size = get_group_size(comm_group)
    # `dst` arrives as a global rank and is converted to a rank inside the group.
    dst_in_group = get_group_rank_from_world_rank(dst, comm_group)
    rank_id = get_rank(comm_group)
    _, handle = _deal_comm_outputs(
        dist_comm_gather_into_tensor_op(
            output_tensor, input_tensor, group_size, dst_in_group, rank_id, comm_group
        ),
        async_op,
    )
    return handle
1407
+
1408
+
1409
def broadcast(tensor, src, group=None, async_op=False):
    """
    Broadcasts the tensor to the whole group.

    Note:
        - The tensors must have the same shape and format in all processes of the collection.
        - Only support PyNative mode, Graph mode is not currently supported.

    Args:
        tensor (Tensor): Data to be sent if `src` is the rank of the current process,
            and tensor used to save the received data otherwise.
        src (int): Specifies the rank (global rank) of the process that broadcasts the tensor.
            Only process `src` will broadcast the tensor.
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

    Returns:
        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
        CommHandle will be None, when `async_op` is False.

    Raises:
        TypeError: If the type of the `tensor` parameter is not Tensor, `src` is not an integer,
            `group` is not a string or `async_op` is not bool.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

    Supported Platforms:
        ``Ascend`` ``CPU``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> import mindspore as ms
        >>> from mindspore.mint.distributed import init_process_group, broadcast
        >>> import numpy as np
        >>> # Launch 2 processes.
        >>>
        >>> init_process_group()
        >>> data = ms.Tensor(np.arange(8).reshape([2, 4]).astype(np.float32))
        >>> handle = broadcast(tensor=data, src=0)
        >>> print(data)
        [[0. 1. 2. 3.]
         [4. 5. 6. 7.]]
    """
    if not isinstance(tensor, (Tensor, Tensor_)):
        raise TypeError("For broadcast, the input tensor must be tensor")
    if not isinstance(src, int):
        raise TypeError("For broadcast, the src must be int")

    comm_group = GlobalComm.WORLD_COMM_GROUP if group is None else group
    if not isinstance(comm_group, str):
        raise TypeError(
            f"The argument 'group' must be type of string, but got 'group' type : {type(comm_group)}."
        )
    if not isinstance(async_op, bool):
        raise TypeError(
            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
        )

    # `src` arrives as a global rank and is converted to a rank inside the group.
    src_in_group = get_group_rank_from_world_rank(src, comm_group)
    _, handle = _deal_comm_outputs(
        dist_comm_broadcast_op(tensor, src_in_group, comm_group), async_op
    )
    return handle
1481
+
1482
+
1483
def barrier(group=None, async_op=False, device_ids=None):
    """
    Synchronizes all processes in the specified group. Once a process calls this operation, it is blocked until
    all processes have called it. After that, the blocked processes are woken and continue their task.

    Args:
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
        device_ids (list[int], optional): Currently it is a reserved parameter. Default: ``None``.

    Returns:
        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
        CommHandle will be None, when `async_op` is False.

    Raises:
        TypeError: `group` is not a str or `async_op` is not a bool.
        RuntimeError: If backend is invalid, or distributed initialization fails.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> from mindspore.mint.distributed import init_process_group
        >>> from mindspore.communication.comm_func import barrier
        >>> # Launch 2 processes.
        >>> init_process_group()
        >>> barrier()
    """
    comm_group = GlobalComm.WORLD_COMM_GROUP if group is None else group
    if not isinstance(comm_group, str):
        raise TypeError(
            f"The argument 'group' must be type of string, but got 'group' type : {type(comm_group)}."
        )
    if not isinstance(async_op, bool):
        raise TypeError(
            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
        )
    # `device_ids` is accepted but not used by this implementation.
    _, handle = _deal_comm_outputs(dist_comm_barrier_op(comm_group), async_op, True)
    return handle
1538
+
1539
+
1540
def send(tensor, dst=0, group=None, tag=0):
    """
    Send tensors to the specified dest_rank.

    Note:
        Only support PyNative mode, Graph mode is not currently supported.

    Args:
        tensor (Tensor): Tensor to send.
        dst (int, optional): A required integer identifying the destination rank (global rank). Default: ``0``.
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        tag (int, optional): A required integer identifying the send/recv message tag. The message will
            be received by the Receive op with the same "tag". Default: ``0``. It is a reserved parameter currently.

    Raises:
        TypeError: If the `tensor` is not Tensor, `dst` is not an int or `group` is not a str.
        ValueError: If the `dst` process rank id is same as the current process.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> from mindspore.mint.distributed import init_process_group
        >>> from mindspore.mint.distributed import send, recv, get_rank
        >>> from mindspore import Tensor
        >>> import numpy as np
        >>>
        >>> # Launch 2 processes.
        >>> init_process_group()
        >>> this_rank = get_rank()
        >>> # Process 0 send the array to Process 1
        >>> if this_rank == 0:
        ...     input_ = Tensor(np.ones([2, 8]).astype(np.float32))
        ...     send(input_, 1)
        >>> if this_rank == 1:
        ...     x = Tensor(np.zeros([2, 8]).astype(np.float32))
        ...     out = recv(x, src=0)
        ...     print(x)
        rank 1:
        [[1. 1. 1. 1. 1. 1. 1. 1.]
         [1. 1. 1. 1. 1. 1. 1. 1.]]
    """
    if not isinstance(tensor, (Tensor, Tensor_)):
        raise TypeError("For send, the input tensor must be tensor")
    if not isinstance(dst, int):
        raise TypeError("For send, the dst must be int")

    comm_group = GlobalComm.WORLD_COMM_GROUP if group is None else group
    if not isinstance(comm_group, str):
        raise TypeError(
            f"The argument 'group' must be type of string, but got 'group' type : {type(comm_group)}."
        )
    # `dst` is compared against this process's rank before being translated into the group.
    if get_rank() == dst:
        raise ValueError(
            "Invalid destination rank: destination rank should not be the same as the rank of the current process."
        )

    dst_in_group = _get_group_rank_from_world_rank_from_cache_helper(dst, comm_group)
    # async flag is False here; the result of _deal_comm_outputs is discarded.
    _deal_comm_outputs(dist_comm_isend_op(tensor, dst_in_group, comm_group, tag), False)
1613
+
1614
+
1615
+
1616
def recv(tensor, src=0, group=None, tag=0):
    """
    Receive tensors from src.

    Note:
        Only support PyNative mode, Graph mode is not currently supported.

    Args:
        tensor (Tensor): Tensor to fill with received data.
        src (int, optional): A required integer identifying the source rank (global rank). Default: ``0``.
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        tag (int, optional): A required integer identifying the send/recv message tag. The message will
            be received by the Send op with the same "tag". Default: ``0``. It is a reserved parameter currently.

    Returns:
        int, If success, return ``0``.

    Raises:
        TypeError: If the `tensor` is not Tensor, `src` is not an int or `group` is not a str.
        ValueError: If the rank ID of the process is greater than the rank size of the communication group.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> from mindspore.mint.distributed import init_process_group
        >>> from mindspore.mint.distributed import send, recv, get_rank
        >>> from mindspore import Tensor
        >>> import numpy as np
        >>>
        >>> # Launch 2 processes.
        >>> init_process_group()
        >>> this_rank = get_rank()
        >>> # Process 0 send the array to Process 1
        >>> if this_rank == 0:
        ...     input_ = Tensor(np.ones([2, 8]).astype(np.float32))
        ...     send(input_, 1)
        >>> if this_rank == 1:
        ...     x = Tensor(np.zeros([2, 8]).astype(np.float32))
        ...     out = recv(x, src=0)
        ...     print(x)
        rank 1:
        [[1. 1. 1. 1. 1. 1. 1. 1.]
         [1. 1. 1. 1. 1. 1. 1. 1.]]
    """
    if not isinstance(tensor, (Tensor, Tensor_)):
        raise TypeError("For recv, the input tensor must be tensor")
    if not isinstance(src, int):
        raise TypeError("For recv, the src must be int")

    comm_group = GlobalComm.WORLD_COMM_GROUP if group is None else group
    if not isinstance(comm_group, str):
        raise TypeError(
            f"The argument 'group' must be type of string, but got 'group' type : {type(comm_group)}."
        )

    src_in_group = _get_group_rank_from_world_rank_from_cache_helper(src, comm_group)
    received = dist_comm_irecv_op(tensor, tag, src_in_group, comm_group)
    # async flag is False here; the result of _deal_comm_outputs is discarded.
    _deal_comm_outputs(received, False)
    return 0
1689
+
1690
+
1691
def isend(tensor, dst=0, group=None, tag=0):
    """
    Send tensors to the specified dest_rank asynchronously.

    Note:
        Only support PyNative mode, Graph mode is not currently supported.

    Args:
        tensor (Tensor): Tensor to send.
        dst (int, optional): A required integer identifying the destination rank (global rank). Default: ``0``.
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        tag (int, optional): A required integer identifying the send/recv message tag. The message will
            be received by the Receive op with the same "tag". Default: ``0``. It is a reserved parameter currently.

    Returns:
        CommHandle, it is an async work handle.

    Raises:
        TypeError: If the `tensor` is not Tensor, `dst` is not an int or `group` is not a str.
        ValueError: If the `dst` process rank id is same as the current process.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> from mindspore.mint.distributed import init_process_group
        >>> from mindspore.mint.distributed import isend, irecv, get_rank
        >>> from mindspore import Tensor
        >>> import numpy as np
        >>>
        >>> # Launch 2 processes.
        >>> init_process_group()
        >>> this_rank = get_rank()
        >>> # Process 0 send the array to Process 1
        >>> if this_rank == 0:
        ...     input_ = Tensor(np.ones([2, 8]).astype(np.float32))
        ...     handle = isend(input_, 1)
        ...     handle.wait()
        >>> if this_rank == 1:
        ...     x = Tensor(np.zeros([2, 8]).astype(np.float32))
        ...     handle = irecv(x, src=0)
        ...     handle.wait()
        ...     print(x)
        rank 1:
        [[1. 1. 1. 1. 1. 1. 1. 1.]
         [1. 1. 1. 1. 1. 1. 1. 1.]]
    """
    if not isinstance(tensor, (Tensor, Tensor_)):
        raise TypeError("For isend, the input tensor must be tensor")
    if not isinstance(dst, int):
        raise TypeError("For isend, the dst must be int")

    comm_group = GlobalComm.WORLD_COMM_GROUP if group is None else group
    if not isinstance(comm_group, str):
        raise TypeError(
            f"The argument 'group' must be type of string, but got 'group' type : {type(comm_group)}."
        )
    # `dst` is compared against this process's rank before being translated into the group.
    if get_rank() == dst:
        raise ValueError(
            "Invalid destination rank: destination rank should not be the same as the rank of the current process."
        )

    dst_in_group = _get_group_rank_from_world_rank_from_cache_helper(dst, comm_group)
    # The async flag is always True here, and the resulting handle is returned.
    _, handle = _deal_comm_outputs(
        dist_comm_isend_op(tensor, dst_in_group, comm_group, tag), True
    )
    return handle
1770
+
1771
+
1772
def irecv(tensor, src=0, group=None, tag=0):
    """
    Receive tensors from src asynchronously.

    Note:
        Only support PyNative mode, Graph mode is not currently supported.

    Args:
        tensor (Tensor): Tensor to fill with received data.
        src (int, optional): A required integer identifying the source rank (global rank). Default: ``0``.
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        tag (int, optional): A required integer identifying the send/recv message tag. The message will
            be received by the Send op with the same "tag". Default: ``0``. It is a reserved parameter currently.

    Returns:
        CommHandle, it is an async work handle.

    Raises:
        TypeError: If the type of `tensor` is not Tensor, if `src` is not an int or `group` is not a str.
        ValueError: If the rank ID of the process is greater than the rank size of the communication group.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> from mindspore.mint.distributed import init_process_group
        >>> from mindspore.mint.distributed import isend, irecv, get_rank
        >>> from mindspore import Tensor
        >>> import numpy as np
        >>>
        >>> # Launch 2 processes.
        >>> init_process_group()
        >>> this_rank = get_rank()
        >>> # Process 0 send the array to Process 1
        >>> if this_rank == 0:
        ...     input_ = Tensor(np.ones([2, 8]).astype(np.float32))
        ...     handle = isend(input_, 1)
        ...     handle.wait()
        >>> if this_rank == 1:
        ...     x = Tensor(np.zeros([2, 8]).astype(np.float32))
        ...     handle = irecv(x, src=0)
        ...     handle.wait()
        ...     print(x)
        rank 1:
        [[1. 1. 1. 1. 1. 1. 1. 1.]
         [1. 1. 1. 1. 1. 1. 1. 1.]]
    """
    if not isinstance(tensor, (Tensor, Tensor_)):
        raise TypeError("For irecv, the input tensor must be tensor")

    # The group is validated before `src` in this function.
    comm_group = GlobalComm.WORLD_COMM_GROUP if group is None else group
    if not isinstance(comm_group, str):
        raise TypeError(
            f"The argument 'group' must be type of string, but got 'group' type : {type(comm_group)}."
        )
    if not isinstance(src, int):
        raise TypeError("For irecv, the src must be int")

    src_in_group = _get_group_rank_from_world_rank_from_cache_helper(src, comm_group)
    # The async flag is always True here, and the resulting handle is returned.
    _, handle = _deal_comm_outputs(
        dist_comm_irecv_op(tensor, tag, src_in_group, comm_group), True
    )
    return handle
1847
+
1848
+
1849
def all_to_all(output_tensor_list, input_tensor_list, group=None, async_op=False):
    """
    scatter and gather list of tensor to/from all rank according to input/output tensor list.

    Note:
        - tensor shape in `output_tensor_list` and `input_tensor_list` should be match across ranks.
        - Only support PyNative mode, Graph mode is not currently supported.

    Args:
        output_tensor_list (List[Tensor]): List of tensors that indicate the gathered from remote ranks.
        input_tensor_list (List[Tensor]): List of tensors to scatter to the remote rank.
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

    Returns:
        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
        CommHandle will be None, when `async_op` is False.

    Raises:
        TypeError: If not all elements in `input_tensor_list` or `output_tensor_list` are Tensor.
        TypeError: If tensors in `input_tensor_list` or `output_tensor_list` are not the same type.
        TypeError: If `group` is not str or `async_op` is not bool.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> import mindspore as ms
        >>> from mindspore.mint.distributed import init_process_group, get_rank
        >>> from mindspore.mint.distributed import all_to_all
        >>> from mindspore import Tensor
        >>>
        >>> init_process_group()
        >>> this_rank = get_rank()
        >>> if this_rank == 0:
        ...     send_tensor_list = [Tensor(1.), Tensor([[2, 3], [4, 5.]])]
        ...     recv_tensor_list = [Tensor((0), dtype=ms.float32), Tensor([0, 0.])]
        >>> if this_rank == 1:
        ...     send_tensor_list = [Tensor([2, 2.]), Tensor([4, 5, 6, 7.])]
        ...     recv_tensor_list = [Tensor([[0, 0.],[0, 0]]), Tensor([0, 0, 0, 0.])]
        >>> handle = all_to_all(recv_tensor_list, send_tensor_list)
        >>> print(recv_tensor_list)
        rank 0:
        (Tensor(shape=[], dtype=Float32, value= 1),
         Tensor(shape=[2], dtype=Float32, value= [2.00000000e+00, 2.00000000e+00]))
        rank 1:
        (Tensor(shape=[2, 2], dtype=Float32, value=
        [[2.00000000e+00, 3.00000000e+00],
         [4.00000000e+00, 5.00000000e+00]]),
         Tensor(shape=[4], dtype=Float32, value=[4.00000000e+00, 5.00000000e+00, 6.00000000e+00, 7.00000000e+00]))

    """
    if group is None:
        group = GlobalComm.WORLD_COMM_GROUP
    if not isinstance(group, str):
        raise TypeError(
            "The argument 'group' must be type of string, "
            "but got 'group' type : {}.".format(type(group))
        )
    if not isinstance(async_op, bool):
        raise TypeError(
            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
        )

    _check_all_tensors(input_tensor_list)
    _check_all_tensors(output_tensor_list)
    _check_all_tensor_same_dtype(input_tensor_list)
    _check_all_tensor_same_dtype(output_tensor_list)

    # Element counts per list entry; these are handed to the op as the send/recv split sizes.
    send_numel_list = [tensor.size for tensor in input_tensor_list]
    recv_numel_list = [tensor.size for tensor in output_tensor_list]

    # All inputs are flattened and concatenated into a single send buffer.
    send_flatten_tensor = cat([tensor.reshape(-1) for tensor in input_tensor_list])
    send_flatten_tensor = _contiguous(send_flatten_tensor)

    rank_size = get_group_size(group)
    output = dist_comm_all_to_all_v_op(
        output_tensor_list,
        send_flatten_tensor,
        group,
        send_numel_list,
        recv_numel_list,
        rank_size,
    )
    _, handle = _deal_comm_outputs(output, async_op)
    return handle
1955
+
1956
+
1957
def _get_all_to_all_single_numel_list(tensor, output, output_split_sizes,
                                      input_split_sizes, group):
    """
    Compute per-rank element counts for all_to_all_single.

    When a split-size argument is reported empty by `_is_split_sizes_empty`, dim 0 of the
    matching tensor is divided evenly by the size of `group`.

    Returns:
        tuple, (send_numel_list, recv_numel_list, output.shape[1:]).

    Raises:
        ValueError: If an even split is required and dim 0 is not a multiple of the group size.
    """
    if _is_split_sizes_empty(input_split_sizes):
        world = get_group_size(group)
        in_rows = tensor.shape[0]
        rows_per_rank, leftover = divmod(in_rows, world)
        if leftover != 0:
            raise ValueError(
                f"input shape at dim 0 must be divided by world_size, but got {in_rows} and {world}."
            )
        input_split_sizes = (rows_per_rank,) * world

    if _is_split_sizes_empty(output_split_sizes):
        world = get_group_size(group)
        out_rows = output.shape[0]
        rows_per_rank, leftover = divmod(out_rows, world)
        if leftover != 0:
            raise ValueError(
                f"output shape at dim 0 must be divided by world_size, but got {out_rows} and {world}."
            )
        output_split_sizes = (rows_per_rank,) * world

    # Each split size counts slices along dim 0; scale by the size of one slice to get elements.
    send_row_numel = _get_size(tensor.shape[1:])
    send_numel_list = [rows * send_row_numel for rows in input_split_sizes]

    recv_tail_shape = output.shape[1:]
    recv_row_numel = _get_size(recv_tail_shape)
    recv_numel_list = [rows * recv_row_numel for rows in output_split_sizes]
    return send_numel_list, recv_numel_list, recv_tail_shape
1990
+
1991
+
1992
def all_to_all_single(output,
                      input,
                      output_split_sizes=None,
                      input_split_sizes=None,
                      group=None,
                      async_op=False):
    """
    scatter and gather input with split size to/from all rank, and return result in a single tensor.

    Note:
        - Only support PyNative mode, Graph mode is not currently supported.

    Args:
        output (Tensor): the output tensor is gathered concatenated from remote ranks.
        input (Tensor): tensor to be scattered to remote rank.
        output_split_sizes (Union(Tuple(int), List(int)), optional): output split size at dim 0. If set to None,
            it means equally split by ``world_size``. Default: ``None``.
        input_split_sizes (Union(Tuple(int), List(int)), optional): input split size at dim 0. If set to None,
            it means equally split by ``world_size``. Default: ``None``.
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

    Returns:
        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
        CommHandle will be None, when `async_op` is False.

    Raises:
        TypeError: If `input` or `output` is not tensor. `group` is not a str, or async_op is not bool.
        ValueError: When `input_split_sizes` is empty, input dim 0 can not be divided by ``world_size``.
        ValueError: When `output_split_sizes` is empty, output dim 0 can not be divided by ``world_size``.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> import numpy as np
        >>> import mindspore
        >>> from mindspore.mint.distributed import init_process_group, get_rank
        >>> from mindspore.mint.distributed import all_to_all_single
        >>> from mindspore import Tensor
        >>> from mindspore.ops import zeros
        >>>
        >>> init_process_group()
        >>> this_rank = get_rank()
        >>> if this_rank == 0:
        ...     output = Tensor(np.zeros([3, 3]).astype(np.float32))
        ...     tensor = Tensor([[0, 1, 2.], [3, 4, 5], [6, 7, 8]])
        ...     result = all_to_all_single(output, tensor, [2, 1], [2, 1])
        ...     print(output)
        >>> if this_rank == 1:
        ...     output = Tensor(np.zeros([2, 3]).astype(np.float32))
        ...     tensor = Tensor([[9, 10., 11], [12, 13, 14]])
        ...     result = all_to_all_single(output, tensor, [1, 1], [1, 1])
        ...     print(output)
        rank 0:
        [[ 0.  1.  2.]
         [ 3.  4.  5.]
         [ 9. 10. 11.]]
        rank 1:
        [[ 6.  7.  8.]
         [12. 13. 14.]]

    """
    _check_all_tensors([input])
    _check_all_tensors([output])
    group = GlobalComm.WORLD_COMM_GROUP if group is None else group
    if not isinstance(group, str):
        raise TypeError(
            f"The argument 'group' must be type of string, but got 'group' type : {type(group)}."
        )
    if not isinstance(async_op, bool):
        raise TypeError(
            "The argument 'async_op' must be a bool, but got {}.".format(type(async_op))
        )

    # The flag is computed from the caller's arguments and forwarded to the op as-is.
    no_explicit_splits = _is_split_sizes_empty(output_split_sizes) and _is_split_sizes_empty(input_split_sizes)
    numel_info = _get_all_to_all_single_numel_list(input, output, output_split_sizes, input_split_sizes, group)
    send_counts, recv_counts, _ = numel_info

    flat_input = input.reshape(-1)
    world_size = get_group_size(group)
    op_result = dist_comm_all_to_all_v_single_op(
        output, flat_input, group, send_counts, recv_counts, world_size, no_explicit_splits,
    )
    _, work_handle = _deal_comm_outputs(op_result, async_op)
    return work_handle
2097
+
2098
+
2099
+ def _check_tensor_list(tensor_list, tensor, group_size):
2100
+ """check all elements in tensor_list are type of Tensor or tuple or list"""
2101
+ if not tensor_list or len(tensor_list) != group_size:
2102
+ raise TypeError(
2103
+ f"The argument list tensor len must be equal to group rank size, but got {len(tensor_list)}."
2104
+ )
2105
+ if tensor.dtype != tensor_list[0].dtype:
2106
+ raise TypeError(
2107
+ f"The argument list tensor type must be equal to tensor type, but got {tensor_list[0].dtype}."
2108
+ )
2109
+ if tensor.shape != tensor_list[0].shape:
2110
+ raise TypeError(
2111
+ f"The argument list tensor shape must be equal to tensor shape, but got {tensor_list[0].shape}."
2112
+ )
2113
+
2114
+
2115
def all_gather(tensor_list, tensor, group=None, async_op=False):
    """
    Gathers tensors from the specified communication group and returns the tensor list which is all gathered.

    Note:
        The tensors must have the same shape and format in all processes of the collection.

    Args:
        tensor_list (list[Tensor]): Output list.
        tensor (Tensor): The input tensor to be all gathered into tensor.
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

    Returns:
        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
        CommHandle will be None, when `async_op` is False.

    Raises:
        TypeError: If the type of input `tensor` is not Tensor, `tensor_list` is not Tensor List,
            `group` is not a str or async_op is not bool.
        TypeError: If size of `tensor_list` is not equal to group size.
        TypeError: If the type or shape of `tensor` not equal to the member of `tensor_list`.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

    Supported Platforms:
        ``Ascend`` ``CPU``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> import numpy as np
        >>> import mindspore as ms
        >>> from mindspore.mint.distributed import init_process_group
        >>> from mindspore.mint.distributed import all_gather
        >>> from mindspore import Tensor
        >>>
        >>> init_process_group()
        >>> input_tensor = Tensor(np.ones([2, 8]).astype(np.float32))
        >>> out_tensors = [Tensor(np.zeros([2, 8]).astype(np.float32)), Tensor(np.zeros([2, 8]).astype(np.float32))]
        >>> output = all_gather(out_tensors, input_tensor)
        >>> print(out_tensors)
        [Tensor(shape=[2, 8], dtype=Float32, value=
        [[ 1.00000000e+00, 1.00000000e+00, 1.00000000e+00 ... 1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
         [ 1.00000000e+00, 1.00000000e+00, 1.00000000e+00 ... 1.00000000e+00, 1.00000000e+00, 1.00000000e+00]]),
         Tensor(shape=[2, 8], dtype=Float32, value=
        [[ 1.00000000e+00, 1.00000000e+00, 1.00000000e+00 ... 1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
         [ 1.00000000e+00, 1.00000000e+00, 1.00000000e+00 ... 1.00000000e+00, 1.00000000e+00, 1.00000000e+00]])]

    """
    _check_all_tensors(tensor_list)
    _check_all_tensor_same_dtype_and_shape(tensor_list)
    if not isinstance(tensor, (Tensor, Tensor_)):
        # The message names this API; it previously referred to all_gather_into_tensor.
        raise TypeError("For all_gather, the input tensor must be tensor")
    if group is None:
        group = GlobalComm.WORLD_COMM_GROUP
    if not isinstance(group, str):
        raise TypeError(
            "The argument 'group' must be type of string, "
            "but got 'group' type : {}.".format(type(group))
        )
    if not isinstance(async_op, bool):
        raise TypeError(
            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
        )
    group_size = get_group_size(group)
    # The output list needs one slot per rank, each matching the input's dtype and shape.
    _check_tensor_list(tensor_list, tensor, group_size)
    result = dist_comm_all_gather_op(tensor_list, tensor, group_size, group)
    _, handle = _deal_comm_outputs(result, async_op)
    return handle
2195
+
2196
+
2197
def reduce_scatter(output, input_list, op=ReduceOp.SUM, group=None, async_op=False):
    r"""
    Reduces and scatters tensors from the specified communication group and
    returns the tensor which is reduced and scattered.

    Note:
        The tensors must have the same shape and format in all processes of the collection.

    Args:
        output (Tensor): the output tensor.
        input_list (list[Tensor]): List of tensors to reduce and scatter.
        op (str, optional): Specifies an operation used for element-wise reductions,
            like SUM and MAX. Default: ``ReduceOp.SUM`` .
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

    Returns:
        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
        CommHandle will be None, when `async_op` is False.

    Raises:
        TypeError: If the type of `output` parameter is not Tensor, `input_list` is not Tensor List.
        TypeError: If any of `op` and `group` is not a str. async_op is not bool or 'op' is invalid.
        TypeError: If size of `input_list` is not equal to group size.
        TypeError: If the type or shape of `output` not equal to the member of `input_list`.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> from mindspore import Tensor
        >>> from mindspore.mint.distributed import init_process_group
        >>> from mindspore.mint.distributed import reduce_scatter
        >>> import numpy as np
        >>>
        >>> init_process_group()
        >>> input_tensors = [Tensor(np.ones([4, 8]).astype(np.float32)), Tensor(np.ones([4, 8]).astype(np.float32))]
        >>> output_tensor = Tensor(np.zeros([4, 8]).astype(np.float32))
        >>> output = reduce_scatter(output_tensor, input_tensors)
        >>> print(output_tensor)
        [[2. 2. 2. 2. 2. 2. 2. 2.]
         [2. 2. 2. 2. 2. 2. 2. 2.]
         [2. 2. 2. 2. 2. 2. 2. 2.]
         [2. 2. 2. 2. 2. 2. 2. 2.]]

    """
    _check_all_tensors(input_list)
    _check_all_tensor_same_dtype_and_shape(input_list)
    if not isinstance(output, (Tensor, Tensor_)):
        raise TypeError("For reduce_scatter, the output tensor must be tensor")
    group = GlobalComm.WORLD_COMM_GROUP if group is None else group
    if not isinstance(group, str):
        raise TypeError(
            f"The argument 'group' must be type of string, but got 'group' type : {type(group)}."
        )
    if not isinstance(async_op, bool):
        raise TypeError(
            "The argument 'async_op' must be a bool, but got {}.".format(type(async_op))
        )
    if not isinstance(op, str):
        raise TypeError("For reduce_scatter, the input op type must be str")
    # Only these four reduction names are accepted.
    supported_ops = ("sum", "prod", "min", "max")
    if op not in supported_ops:
        raise TypeError(
            "For reduce_scatter, the input op value must be one of sum, prod, min, max"
        )
    world_size = get_group_size(group)
    _check_tensor_list(input_list, output, world_size)
    op_result = dist_comm_reduce_scatter_op(output, input_list, world_size, op, group)
    _, work_handle = _deal_comm_outputs(op_result, async_op)
    return work_handle
2283
+
2284
+
2285
def scatter(tensor, scatter_list, src=0, group=None, async_op=False):
    r"""
    Scatter tensor evenly across the processes in the specified communication group.

    Note:
        - The interface behavior only support Tensor List input and scatter evenly.
        - Only the tensor in process `src` (global rank) will do scatter.
        - Only support PyNative mode, Graph mode is not currently supported.

    Args:
        tensor (Tensor): the output tensor.
        scatter_list (list[Tensor]): List of same-sized tensors to scatter.
            On the source rank its length, dtype and shape are checked against the group size and `tensor`.
        src (int, optional): Specifies the rank(global rank) of the process that send the tensor.
            And only process `src` will send the tensor. Default: ``0`` .
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

    Returns:
        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
        CommHandle will be None, when `async_op` is False.

    Raises:
        TypeError: If the type of `tensor` parameter is not Tensor, `scatter_list` is not Tensor List.
        TypeError: If `src` is not an int, `group` is not a str or `async_op` is not bool.
        TypeError: If size of `scatter_list` is not equal to group size.
        TypeError: If the type or shape of `tensor` not equal to the member of `scatter_list`.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

    Supported Platforms:
        ``Ascend`` ``CPU``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> from mindspore import Tensor
        >>> from mindspore.mint.distributed import init_process_group, scatter
        >>> import numpy as np
        >>> # Launch 2 processes.
        >>>
        >>> init_process_group()
        >>> inputs = [Tensor(np.ones([2, 2]).astype(np.float32)), Tensor(np.ones([2, 2]).astype(np.float32))]
        >>> output = Tensor(np.zeros([2, 2]).astype(np.float32))
        >>> scatter(output, inputs, src=0)
        >>> print(output)
        # rank_0
        [[1. 1.]
         [1. 1.]]
        # rank_1
        [[1. 1.]
         [1. 1.]]
    """
    _check_all_tensors(scatter_list)
    _check_all_tensor_same_dtype_and_shape(scatter_list)
    # The messages below name this API; they previously referred to scatter_tensor.
    if not isinstance(tensor, (Tensor, Tensor_)):
        raise TypeError("For scatter, the output tensor must be tensor")
    if not isinstance(src, int):
        raise TypeError("For scatter, the src must be int")
    if group is None:
        group = GlobalComm.WORLD_COMM_GROUP
    if not isinstance(group, str):
        raise TypeError(
            "The argument 'group' must be type of string, "
            "but got 'group' type : {}.".format(type(group))
        )
    if not isinstance(async_op, bool):
        raise TypeError(
            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
        )
    # `src` arrives as a global rank and is translated to its rank inside `group`.
    src = get_group_rank_from_world_rank(src, group)
    rank_size = get_group_size(group)
    rank_id = get_rank(group)
    # The length/dtype/shape check of the list is only applied on the sending rank.
    if src == rank_id:
        _check_tensor_list(scatter_list, tensor, rank_size)
    output = dist_comm_scatter_op(tensor, scatter_list, rank_size, src, rank_id, group)
    _, handle = _deal_comm_outputs(output, async_op)
    return handle
2372
+
2373
+
2374
def gather(tensor, gather_list, dst=0, group=None, async_op=False):
    r"""
    Gathers tensors from the specified communication group. The operation will gather the tensor
    from processes according to dimension 0.

    Note:
        - Only the tensor in process `dst` (global rank) will keep the gathered tensor. The other process
          will keep a tensor list which has no mathematical meaning.
        - The tensors must have the same shape and format in all processes of the collection.
        - Only support PyNative mode, Graph mode is not currently supported.

    Args:
        tensor (Tensor): The tensor to be gathered.
        gather_list (list[Tensor]): List of same-sized tensors to use for gathered data.
        dst (int, optional): Specifies the rank(global rank) of the process that receive the tensor.
            And only process `dst` will receive the gathered tensor. Default: ``0`` .
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

    Returns:
        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
        CommHandle will be None, when `async_op` is False.

    Raises:
        TypeError: If the type of input tensor is not Tensor, or gather_list is not Tensor list.
        TypeError: If dst is not an integer, group is not a string or async_op is not bool.
        TypeError: If size of `gather_list` is not equal to group size.
        TypeError: If the type or shape of `tensor` not equal to the member of `gather_list`.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

    Supported Platforms:
        ``Ascend`` ``CPU``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> import numpy as np
        >>> import mindspore as ms
        >>> import mindspore.nn as nn
        >>> from mindspore.mint.distributed import init_process_group, gather
        >>> from mindspore import Tensor
        >>> # Launch 2 processes.
        >>> init_process_group()
        >>> input = Tensor(np.arange(4).reshape([2, 2]).astype(np.float32))
        >>> outputs = [Tensor(np.zeros([2, 2]).astype(np.float32)),Tensor(np.zeros([2, 2]).astype(np.float32))]
        >>> gather(input, outputs, dst=0)
        >>> print(outputs)
        # rank_0
        [Tensor(shape=[2, 2], dtype=Float32, value=
        [[ 0.00000000e+00, 1.00000000e+00],
         [ 2.00000000e+00, 3.00000000e+00]]), Tensor(shape=[2, 2], dtype=Float32, value=
        [[ 0.00000000e+00, 1.00000000e+00], [ 2.00000000e+00, 3.00000000e+00]])]
        # rank_1
        [Tensor(shape=[2, 2], dtype=Float32, value=
        [[ 0.00000000e+00, 0.00000000e+00],
         [ 0.00000000e+00, 0.00000000e+00]]), Tensor(shape=[2, 2], dtype=Float32, value=
        [[ 0.00000000e+00, 0.00000000e+00], [ 0.00000000e+00, 0.00000000e+00]])]
    """
    if not isinstance(tensor, (Tensor, Tensor_)):
        raise TypeError("For gather, the input tensor must be tensor")
    _check_all_tensors(gather_list)
    _check_all_tensor_same_dtype_and_shape(gather_list)
    if not isinstance(dst, int):
        raise TypeError("For gather, the dst must be int")
    group = GlobalComm.WORLD_COMM_GROUP if group is None else group
    if not isinstance(group, str):
        raise TypeError(
            f"The argument 'group' must be type of string, but got 'group' type : {type(group)}."
        )
    if not isinstance(async_op, bool):
        raise TypeError(
            "The argument 'async_op' must be a bool, but got {}.".format(type(async_op))
        )
    world_size = get_group_size(group)
    # `dst` arrives as a global rank and is translated to its rank inside `group`.
    dst = get_group_rank_from_world_rank(dst, group)
    local_rank = get_rank(group)
    # The length/dtype/shape check of the list is only applied on the receiving rank.
    if dst == local_rank:
        _check_tensor_list(gather_list, tensor, world_size)
    op_result = dist_comm_gather_op(tensor, gather_list, world_size, dst, local_rank, group)
    _, work_handle = _deal_comm_outputs(op_result, async_op)
    return work_handle
2473
+
2474
+
2475
def scatter_object_list(scatter_object_output_list, scatter_object_input_list, src=0, group=None):
    r"""
    Scatters picklable objects in scatter_object_input_list to the whole group.

    Note:
        - Similar to :func:`mindspore.mint.distributed.scatter`, but Python objects can be passed in.
        - Only the objects in process `src` (global rank) will do scatter.
        - Only support PyNative mode, Graph mode is not currently supported.

    Args:
        scatter_object_output_list (list[Any]): Non-empty list whose first element
            will store the object scattered to this rank.
        scatter_object_input_list (list[Any]): List of python objects to scatter.
            it must be specified on the source rank.
        src (int, optional): Specifies the rank(global rank) of the process that send the tensor.
            And only process `src` will send the tensor. Default: ``0`` .
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.

    Raises:
        TypeError: If `group` is not a str or `src` is not an integer.
        TypeError: If `scatter_object_output_list` is not a non-empty list.
        TypeError: If `scatter_object_input_list` on the source rank is not a list,
            or its size is not equal to group size.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> from mindspore.mint.distributed import init_process_group, scatter_object_list
        >>> init_process_group()
        >>> obj = ["test", {1: 2}]
        >>> scatter_object_output_list=[None]
        >>> scatter_object_list(scatter_object_output_list, obj)
        >>> print(scatter_object_output_list)
        # rank_0
        ['test']
        # rank_1
        [{1: 2}]
    """
    if group is None:
        group = GlobalComm.WORLD_COMM_GROUP
    if not isinstance(group, str):
        raise TypeError(
            "For 'scatter_object_list', the argument 'group' must be type of string, "
            "but got 'group' type : {}.".format(type(group))
        )
    if not isinstance(scatter_object_output_list, list) or not scatter_object_output_list:
        raise TypeError("The scatter_object_output_list can not be empty.")
    if not isinstance(src, int):
        raise TypeError("For scatter_object_list, the src must be int")
    group_size = get_group_size(group)
    rank_id = get_rank()
    tensor_sizes = []
    tensor_list = []
    if rank_id == src:
        # The type is checked separately so that formatting the length below can never
        # fail on a non-list argument.
        if not isinstance(scatter_object_input_list, list):
            raise TypeError(
                "The scatter_object_input_list must be a list whose len is equal to group rank size, "
                f"but got type {type(scatter_object_input_list)}."
            )
        if len(scatter_object_input_list) != group_size:
            raise TypeError(
                "The len of scatter_object_input_list must be equal to group rank size, "
                f"but got {len(scatter_object_input_list)}."
            )
        # First pass records each object's size; the largest one is passed to the second
        # pass so that every object is converted with the same size argument.
        for obj in scatter_object_input_list:
            _, size = _object_to_tensor(obj)
            tensor_sizes.append(Tensor([size], dtype=mstype.int32))
        max_size = int(max(tensor_sizes).item())
        for obj in scatter_object_input_list:
            tensor, _ = _object_to_tensor(obj, max_size)
            tensor_list.append(tensor)
    else:
        # Other ranks start from zero sizes; the broadcast from `src` below is expected to supply the real ones.
        tensor_sizes = [Tensor([0], dtype=mstype.int32) for _ in range(group_size)]

    object_size = cat(tensor_sizes)
    broadcast(object_size, src, group)
    max_object_size = int(max(object_size).item())
    data = np.zeros((max_object_size)).astype(np.int8)
    if rank_id != src:
        # Non-source ranks still pass a list of buffers to scatter.
        tensor_list = [Tensor(data) for _ in range(group_size)]
    out_tensor = Tensor(data)
    scatter(out_tensor, tensor_list, src, group)
    group_id = get_group_rank_from_world_rank(rank_id, group)
    # NOTE(review): the objects are documented as picklable, so _tensor_to_object presumably
    # unpickles the received bytes - only use with trusted peers; confirm against the helper.
    scatter_object_output_list[0] = _tensor_to_object(out_tensor, object_size[group_id])
2566
+
2567
+
2568
def gather_object(obj, object_gather_list=None, dst=0, group=None):
    r"""
    Gathers python objects from the whole group in a single process.

    Note:
        - Similar to :func:`mindspore.mint.distributed.gather`, but Python objects can be passed in.
        - Only support PyNative mode, Graph mode is not currently supported.

    Args:
        obj (Any): The python objects to be gathered.
        object_gather_list (list[Any], optional): List that receives the gathered Python objects.
            On the ``dst`` rank, it must be a list whose length equals the size of the group for this
            collective and will contain the output. It is not read on other ranks. Default: ``None``.
        dst (int, optional): Specifies the rank(global rank) of the process that receive the tensor.
            And only process `dst` will receive the gathered tensor. Default: ``0`` .
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.

    Raises:
        TypeError: If dst is not an integer, or group is not a string.
        TypeError: If `object_gather_list` is not a list on the `dst` rank.
        TypeError: If size of `object_gather_list` is not equal to group size.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> from mindspore.mint.distributed import init_process_group, gather_object, get_rank
        >>> init_process_group()
        >>> rank = get_rank()
        >>> obj = ["test", {1: 2}]
        >>> object_gather_list=[None, None]
        >>> gather_object(obj[rank], object_gather_list)
        >>> print(object_gather_list)
        # rank_0
        ['test', {1: 2}]
    """
    if group is None:
        group = GlobalComm.WORLD_COMM_GROUP
    if not isinstance(group, str):
        raise TypeError(
            "For 'gather_object', the argument 'group' must be type of string, "
            "but got 'group' type : {}.".format(type(group))
        )
    if not isinstance(dst, int):
        raise TypeError("For gather_object, the dst must be int")
    group_size = get_group_size(group)
    rank_id = get_rank()
    if rank_id == dst:
        # Check the type before the length so that a non-list argument (including the
        # default None) gets a clear message instead of failing inside len().
        if not isinstance(object_gather_list, list):
            raise TypeError(
                "For 'gather_object', the argument 'object_gather_list' must be a list on the dst rank, "
                "but got type : {}.".format(type(object_gather_list))
            )
        if len(object_gather_list) != group_size:
            raise TypeError(
                f"The len of object_gather_list must be equal to group rank size, but got {len(object_gather_list)}."
            )

    # Step 1: every rank publishes the serialized size of its own object.
    _, local_size = _object_to_tensor(obj)
    local_size_tensor = Tensor([local_size], dtype=mstype.int32)
    object_size_list = [Tensor([0], dtype=mstype.int32) for _ in range(group_size)]
    all_gather(object_size_list, local_size_tensor, group=group)

    # Step 2: re-serialize against the largest size so every rank sends an equally
    # sized buffer (presumably the helper pads up to the requested size - confirm
    # against _object_to_tensor).
    max_object_size = int(max(object_size_list).item())
    in_tensor, buffer_size = _object_to_tensor(obj, max_object_size)
    data = np.zeros((buffer_size)).astype(np.int8)
    object_tensor_list = [Tensor(data) for _ in range(group_size)]
    gather(in_tensor, object_tensor_list, dst, group)

    # Only the destination rank decodes the gathered buffers.
    if rank_id != dst:
        return
    for i, item in enumerate(object_size_list):
        tensor_size = int(item.item())
        object_gather_list[i] = _tensor_to_object(object_tensor_list[i], tensor_size)
2647
+
2648
+
2649
def broadcast_object_list(object_list, src=0, group=None, device=None):
    """
    Broadcasts the entire group of input Python objects.

    Note:
        - Similar to :func:`mindspore.mint.distributed.broadcast`, but Python objects can be passed in.
        - Only support PyNative mode, Graph mode is not currently supported.

    Args:
        object_list (list[Any]): list of input to be sent if src is the rank of current process,
            and list to be used to save received data otherwise. Must be a non-empty list.
        src (int, optional): Specifies the rank(global rank) of the process that broadcast the Python objects.
            And only process `src` will broadcast the Python objects. Default: ``0`` .
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.
        device (str, optional): Currently it is a reserved parameter. Default: ``None``.

    Raises:
        TypeError: If `src` is not an integer or `group` is not a string.
        TypeError: If `object_list` is not a list or is empty.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> from mindspore.mint.distributed import init_process_group, broadcast_object_list, get_rank
        >>> init_process_group()
        >>> rank = get_rank()
        >>> obj = ["test", 12, {1: 2}]
        >>> if rank == 1:
        ...     obj = [None, None, None]
        >>> broadcast_object_list(obj)
        >>> print(obj)
        ['test', 12, {1: 2}]
    """
    if group is None:
        group = GlobalComm.WORLD_COMM_GROUP
    if not isinstance(group, str):
        raise TypeError(
            "For 'broadcast_object_list', the argument 'group' must be type of string, "
            "but got 'group' type : {}.".format(type(group))
        )
    if not isinstance(src, int):
        raise TypeError("For broadcast_object_list, the src must be int")
    if not isinstance(object_list, list) or not object_list:
        raise TypeError("The object_list can not be empty.")
    rank_id = get_rank()
    is_src = rank_id == src

    if is_src:
        # Serialize every object; keep the per-object sizes so receivers can split
        # the concatenated buffer again.
        tensor_list, tensor_sizes = zip(*[_object_to_tensor(obj) for obj in object_list])
        object_size_list = [Tensor([size], dtype=mstype.int32) for size in tensor_sizes]
        object_tensor = cat(tensor_list)
    else:
        # Placeholder sizes, overwritten by the first broadcast below.
        object_size_list = [Tensor([0], dtype=mstype.int32) for _ in object_list]

    # Step 1: broadcast the per-object sizes.
    object_size = cat(object_size_list)
    broadcast(object_size, src, group)

    # Step 2: broadcast the concatenated payload; receivers allocate a buffer of
    # the total size first.
    if not is_src:
        total_size = int(sum(object_size).item())
        object_tensor = Tensor(np.zeros((total_size)).astype(np.int8))
    broadcast(object_tensor, src, group)

    # The source rank already holds the objects; only receivers decode.
    if is_src:
        return
    offset = 0
    for i, item in enumerate(object_size):
        # Use plain Python ints for slicing/offsets, consistent with the other
        # *_object helpers in this module.
        obj_size = int(item.item())
        obj_view = object_tensor[offset:offset + obj_size]
        offset += obj_size
        object_list[i] = _tensor_to_object(obj_view, obj_size)
2731
+
2732
+
2733
def all_gather_object(object_list, obj, group=None):
    """
    Aggregates Python objects in a specified communication group.

    Note:
        Similar to :func:`mindspore.mint.distributed.all_gather`, but Python objects can be passed in.

    Args:
        object_list (list[Any]): Output Python object list. Its length must equal the group size.
        obj (Any): Python object contributed by the current process.
        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
            Ascend. Default: ``None``.

    Raises:
        TypeError: `group` is not a str.
        TypeError: If `object_list` is not a list.
        TypeError: If size of `object_list` is not equal to group size.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

    Supported Platforms:
        ``Ascend``

    Examples:
        .. note::
            Before running the following examples, you need to configure the communication environment variables.

            For Ascend devices, it is recommended to use the msrun startup method
            without any third-party or configuration file dependencies.
            Please see the `msrun start up
            <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
            for more details.

            This example should be run with 2 devices.

        >>> from mindspore.mint.distributed import init_process_group, get_rank
        >>> from mindspore.mint.distributed import all_gather_object
        >>> init_process_group()
        >>> rank = get_rank()
        >>> obj = ["test", {1: 2}]
        >>> object_gather_list=[None, None]
        >>> all_gather_object(object_gather_list, obj[rank])
        >>> print(object_gather_list)
        # rank_0
        ['test', {1: 2}]
        # rank_1
        ['test', {1: 2}]
    """
    if group is None:
        group = GlobalComm.WORLD_COMM_GROUP
    if not isinstance(group, str):
        raise TypeError(
            "For 'all_gather_object', the argument 'group' must be type of string, "
            "but got 'group' type : {}.".format(type(group))
        )
    group_size = get_group_size(group)
    # Check the type before the length so that a non-list argument gets a clear
    # message instead of failing inside len().
    if not isinstance(object_list, list):
        raise TypeError(
            "For 'all_gather_object', the argument 'object_list' must be a list, "
            "but got type : {}.".format(type(object_list))
        )
    if len(object_list) != group_size:
        raise TypeError(
            f"The len of argument object_list must be equal to group rank size, but got {len(object_list)}."
        )

    # Step 1: every rank publishes the serialized size of its own object.
    _, local_size = _object_to_tensor(obj)
    local_size_tensor = Tensor([local_size], dtype=mstype.int32)
    object_size_list = [Tensor([0], dtype=mstype.int32) for _ in range(group_size)]
    all_gather(object_size_list, local_size_tensor, group=group)

    # Step 2: re-serialize against the largest size so all ranks exchange equally
    # sized buffers (presumably the helper pads up to the requested size - confirm
    # against _object_to_tensor).
    max_object_size = int(max(object_size_list).item())
    in_tensor, buffer_size = _object_to_tensor(obj, max_object_size)
    data = np.zeros((buffer_size)).astype(np.int8)
    object_tensor_list = [Tensor(data) for _ in range(group_size)]
    all_gather(object_tensor_list, in_tensor, group=group)

    # Step 3: decode each rank's buffer using that rank's true size.
    for i, item in enumerate(object_size_list):
        tensor_size = int(item.item())
        object_list[i] = _tensor_to_object(object_tensor_list[i], tensor_size)