mindspore 2.7.0rc1-cp311-cp311-win_amd64.whl → 2.7.1-cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of mindspore might be problematic.
Files changed (370)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +5 -2
  3. mindspore/_c_dataengine.cp311-win_amd64.pyd +0 -0
  4. mindspore/_c_expression.cp311-win_amd64.pyd +0 -0
  5. mindspore/_c_mindrecord.cp311-win_amd64.pyd +0 -0
  6. mindspore/_checkparam.py +2 -2
  7. mindspore/_extends/builtin_operations.py +3 -3
  8. mindspore/_extends/parallel_compile/akg_compiler/custom.py +1109 -0
  9. mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
  10. mindspore/_extends/parse/__init__.py +3 -3
  11. mindspore/_extends/parse/compile_config.py +24 -1
  12. mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +6 -3
  13. mindspore/_extends/parse/parser.py +28 -22
  14. mindspore/_extends/parse/resources.py +1 -1
  15. mindspore/_extends/parse/standard_method.py +23 -2
  16. mindspore/_extends/parse/trope.py +2 -1
  17. mindspore/_extends/pijit/pijit_func_white_list.py +9 -27
  18. mindspore/amp.py +0 -18
  19. mindspore/avcodec-59.dll +0 -0
  20. mindspore/avdevice-59.dll +0 -0
  21. mindspore/avfilter-8.dll +0 -0
  22. mindspore/avformat-59.dll +0 -0
  23. mindspore/avutil-57.dll +0 -0
  24. mindspore/boost/base.py +29 -2
  25. mindspore/common/__init__.py +18 -12
  26. mindspore/common/_decorator.py +3 -2
  27. mindspore/common/_grad_function.py +3 -1
  28. mindspore/common/_tensor_cpp_method.py +1 -1
  29. mindspore/common/_tensor_docs.py +371 -96
  30. mindspore/common/_utils.py +7 -43
  31. mindspore/common/api.py +434 -135
  32. mindspore/common/dtype.py +98 -57
  33. mindspore/common/dump.py +7 -108
  34. mindspore/common/dynamic_shape/__init__.py +0 -0
  35. mindspore/common/{auto_dynamic_shape.py → dynamic_shape/auto_dynamic_shape.py} +15 -23
  36. mindspore/common/dynamic_shape/enable_dynamic.py +197 -0
  37. mindspore/common/file_system.py +59 -9
  38. mindspore/common/hook_handle.py +82 -3
  39. mindspore/common/jit_config.py +5 -1
  40. mindspore/common/jit_trace.py +27 -12
  41. mindspore/common/lazy_inline.py +5 -3
  42. mindspore/common/np_dtype.py +3 -3
  43. mindspore/common/parameter.py +17 -127
  44. mindspore/common/recompute.py +4 -13
  45. mindspore/common/tensor.py +50 -217
  46. mindspore/communication/_comm_helper.py +11 -1
  47. mindspore/communication/comm_func.py +138 -4
  48. mindspore/communication/management.py +85 -1
  49. mindspore/config/op_info.config +0 -15
  50. mindspore/context.py +20 -106
  51. mindspore/dataset/__init__.py +1 -1
  52. mindspore/dataset/audio/transforms.py +1 -1
  53. mindspore/dataset/core/config.py +35 -1
  54. mindspore/dataset/engine/datasets.py +338 -319
  55. mindspore/dataset/engine/datasets_user_defined.py +38 -22
  56. mindspore/dataset/engine/datasets_vision.py +1 -1
  57. mindspore/dataset/engine/validators.py +1 -15
  58. mindspore/dataset/transforms/c_transforms.py +2 -2
  59. mindspore/dataset/transforms/transforms.py +3 -3
  60. mindspore/dataset/vision/__init__.py +1 -1
  61. mindspore/dataset/vision/py_transforms.py +8 -8
  62. mindspore/dataset/vision/transforms.py +17 -5
  63. mindspore/dataset/vision/utils.py +632 -21
  64. mindspore/device_context/ascend/op_tuning.py +35 -1
  65. mindspore/dnnl.dll +0 -0
  66. mindspore/{profiler/common/validator → graph}/__init__.py +9 -1
  67. mindspore/graph/custom_pass.py +55 -0
  68. mindspore/include/api/cell.h +28 -4
  69. mindspore/include/api/cfg.h +24 -7
  70. mindspore/include/api/context.h +1 -0
  71. mindspore/include/api/delegate.h +0 -2
  72. mindspore/include/api/dual_abi_helper.h +100 -19
  73. mindspore/include/api/graph.h +14 -1
  74. mindspore/include/api/kernel.h +16 -3
  75. mindspore/include/api/kernel_api.h +9 -1
  76. mindspore/include/api/metrics/accuracy.h +9 -0
  77. mindspore/include/api/model.h +5 -1
  78. mindspore/include/api/model_group.h +4 -0
  79. mindspore/include/api/model_parallel_runner.h +2 -0
  80. mindspore/include/api/status.h +48 -10
  81. mindspore/include/api/types.h +6 -1
  82. mindspore/include/dataset/constants.h +9 -0
  83. mindspore/include/dataset/execute.h +2 -2
  84. mindspore/jpeg62.dll +0 -0
  85. mindspore/mindrecord/__init__.py +3 -3
  86. mindspore/mindrecord/common/exceptions.py +1 -0
  87. mindspore/mindrecord/config.py +1 -1
  88. mindspore/{parallel/mpi → mindrecord/core}/__init__.py +4 -1
  89. mindspore/mindrecord/{shardheader.py → core/shardheader.py} +2 -1
  90. mindspore/mindrecord/{shardindexgenerator.py → core/shardindexgenerator.py} +1 -1
  91. mindspore/mindrecord/{shardreader.py → core/shardreader.py} +2 -1
  92. mindspore/mindrecord/{shardsegment.py → core/shardsegment.py} +2 -2
  93. mindspore/mindrecord/{shardutils.py → core/shardutils.py} +1 -1
  94. mindspore/mindrecord/{shardwriter.py → core/shardwriter.py} +1 -1
  95. mindspore/mindrecord/filereader.py +4 -4
  96. mindspore/mindrecord/filewriter.py +5 -5
  97. mindspore/mindrecord/mindpage.py +2 -2
  98. mindspore/mindrecord/tools/cifar10.py +4 -3
  99. mindspore/mindrecord/tools/cifar100.py +1 -1
  100. mindspore/mindrecord/tools/cifar100_to_mr.py +1 -1
  101. mindspore/mindrecord/tools/cifar10_to_mr.py +6 -6
  102. mindspore/mindrecord/tools/csv_to_mr.py +1 -1
  103. mindspore/mindrecord/tools/imagenet_to_mr.py +1 -1
  104. mindspore/mindrecord/tools/mnist_to_mr.py +1 -1
  105. mindspore/mindrecord/tools/tfrecord_to_mr.py +1 -1
  106. mindspore/mindspore_backend_common.dll +0 -0
  107. mindspore/mindspore_backend_manager.dll +0 -0
  108. mindspore/mindspore_cluster.dll +0 -0
  109. mindspore/mindspore_common.dll +0 -0
  110. mindspore/mindspore_core.dll +0 -0
  111. mindspore/mindspore_cpu.dll +0 -0
  112. mindspore/mindspore_dump.dll +0 -0
  113. mindspore/mindspore_frontend.dll +0 -0
  114. mindspore/mindspore_glog.dll +0 -0
  115. mindspore/mindspore_hardware_abstract.dll +0 -0
  116. mindspore/mindspore_memory_pool.dll +0 -0
  117. mindspore/mindspore_ms_backend.dll +0 -0
  118. mindspore/mindspore_ops.dll +0 -0
  119. mindspore/{mindspore_ops_host.dll → mindspore_ops_cpu.dll} +0 -0
  120. mindspore/mindspore_profiler.dll +0 -0
  121. mindspore/mindspore_pyboost.dll +0 -0
  122. mindspore/mindspore_pynative.dll +0 -0
  123. mindspore/mindspore_runtime_pipeline.dll +0 -0
  124. mindspore/mindspore_runtime_utils.dll +0 -0
  125. mindspore/mindspore_tools.dll +0 -0
  126. mindspore/mint/__init__.py +15 -10
  127. mindspore/mint/distributed/__init__.py +4 -0
  128. mindspore/mint/distributed/distributed.py +392 -69
  129. mindspore/mint/nn/__init__.py +2 -16
  130. mindspore/mint/nn/functional.py +4 -110
  131. mindspore/mint/nn/layer/__init__.py +0 -2
  132. mindspore/mint/nn/layer/_functions.py +1 -2
  133. mindspore/mint/nn/layer/activation.py +0 -6
  134. mindspore/mint/nn/layer/basic.py +0 -47
  135. mindspore/mint/nn/layer/conv.py +10 -10
  136. mindspore/mint/nn/layer/normalization.py +11 -16
  137. mindspore/mint/nn/layer/pooling.py +0 -4
  138. mindspore/nn/__init__.py +1 -3
  139. mindspore/nn/cell.py +231 -239
  140. mindspore/nn/layer/activation.py +4 -2
  141. mindspore/nn/layer/basic.py +56 -14
  142. mindspore/nn/layer/container.py +16 -0
  143. mindspore/nn/layer/embedding.py +4 -169
  144. mindspore/nn/layer/image.py +1 -1
  145. mindspore/nn/layer/normalization.py +2 -1
  146. mindspore/nn/layer/thor_layer.py +4 -85
  147. mindspore/nn/optim/ada_grad.py +0 -1
  148. mindspore/nn/optim/adafactor.py +0 -1
  149. mindspore/nn/optim/adam.py +32 -127
  150. mindspore/nn/optim/adamax.py +0 -1
  151. mindspore/nn/optim/asgd.py +0 -1
  152. mindspore/nn/optim/ftrl.py +8 -102
  153. mindspore/nn/optim/lamb.py +1 -4
  154. mindspore/nn/optim/lars.py +0 -3
  155. mindspore/nn/optim/lazyadam.py +25 -218
  156. mindspore/nn/optim/momentum.py +5 -43
  157. mindspore/nn/optim/optimizer.py +6 -55
  158. mindspore/nn/optim/proximal_ada_grad.py +0 -1
  159. mindspore/nn/optim/rmsprop.py +0 -1
  160. mindspore/nn/optim/rprop.py +0 -1
  161. mindspore/nn/optim/sgd.py +0 -1
  162. mindspore/nn/optim/tft_wrapper.py +2 -4
  163. mindspore/nn/optim/thor.py +0 -2
  164. mindspore/nn/probability/bijector/bijector.py +7 -8
  165. mindspore/nn/probability/bijector/gumbel_cdf.py +2 -2
  166. mindspore/nn/probability/bijector/power_transform.py +20 -21
  167. mindspore/nn/probability/bijector/scalar_affine.py +5 -5
  168. mindspore/nn/probability/bijector/softplus.py +13 -14
  169. mindspore/nn/probability/distribution/_utils/utils.py +2 -2
  170. mindspore/nn/wrap/cell_wrapper.py +39 -5
  171. mindspore/nn/wrap/grad_reducer.py +4 -89
  172. mindspore/numpy/array_creations.py +4 -4
  173. mindspore/numpy/fft.py +9 -9
  174. mindspore/numpy/utils_const.py +1 -1
  175. mindspore/{nn/reinforcement → onnx}/__init__.py +5 -8
  176. mindspore/onnx/onnx_export.py +137 -0
  177. mindspore/opencv_core4110.dll +0 -0
  178. mindspore/opencv_imgcodecs4110.dll +0 -0
  179. mindspore/{opencv_imgproc452.dll → opencv_imgproc4110.dll} +0 -0
  180. mindspore/ops/__init__.py +2 -0
  181. mindspore/ops/_grad_experimental/grad_comm_ops.py +38 -2
  182. mindspore/ops/_grad_experimental/grad_inner_ops.py +0 -9
  183. mindspore/ops/_op_impl/aicpu/__init__.py +0 -10
  184. mindspore/ops/_op_impl/cpu/__init__.py +1 -5
  185. mindspore/ops/_op_impl/cpu/{buffer_append.py → joinedstr_op.py} +8 -8
  186. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +28 -24
  187. mindspore/ops/auto_generate/gen_extend_func.py +6 -11
  188. mindspore/ops/auto_generate/gen_ops_def.py +385 -154
  189. mindspore/ops/auto_generate/gen_ops_prim.py +5676 -5167
  190. mindspore/ops/communication.py +97 -0
  191. mindspore/ops/composite/__init__.py +5 -2
  192. mindspore/ops/composite/base.py +16 -2
  193. mindspore/ops/composite/multitype_ops/__init__.py +3 -1
  194. mindspore/ops/composite/multitype_ops/_compile_utils.py +150 -8
  195. mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
  196. mindspore/ops/composite/multitype_ops/add_impl.py +7 -0
  197. mindspore/ops/composite/multitype_ops/mod_impl.py +27 -0
  198. mindspore/ops/function/__init__.py +2 -0
  199. mindspore/ops/function/array_func.py +24 -18
  200. mindspore/ops/function/comm_func.py +3883 -0
  201. mindspore/ops/function/debug_func.py +7 -6
  202. mindspore/ops/function/grad/grad_func.py +4 -12
  203. mindspore/ops/function/math_func.py +89 -86
  204. mindspore/ops/function/nn_func.py +92 -313
  205. mindspore/ops/function/random_func.py +9 -18
  206. mindspore/ops/functional.py +4 -1
  207. mindspore/ops/functional_overload.py +377 -30
  208. mindspore/ops/operations/__init__.py +2 -5
  209. mindspore/ops/operations/_custom_ops_utils.py +7 -9
  210. mindspore/ops/operations/_inner_ops.py +12 -50
  211. mindspore/ops/operations/_rl_inner_ops.py +0 -933
  212. mindspore/ops/operations/array_ops.py +5 -50
  213. mindspore/ops/operations/comm_ops.py +95 -17
  214. mindspore/ops/operations/custom_ops.py +237 -22
  215. mindspore/ops/operations/debug_ops.py +33 -35
  216. mindspore/ops/operations/manually_defined/ops_def.py +39 -318
  217. mindspore/ops/operations/math_ops.py +5 -5
  218. mindspore/ops/operations/nn_ops.py +3 -3
  219. mindspore/ops/operations/sparse_ops.py +0 -83
  220. mindspore/ops/primitive.py +4 -27
  221. mindspore/ops/tensor_method.py +88 -10
  222. mindspore/ops_generate/aclnn/aclnn_kernel_register_auto_cc_generator.py +5 -5
  223. mindspore/ops_generate/aclnn/gen_aclnn_implement.py +8 -8
  224. mindspore/ops_generate/api/functions_cc_generator.py +53 -4
  225. mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +25 -11
  226. mindspore/ops_generate/common/gen_constants.py +11 -10
  227. mindspore/ops_generate/common/op_proto.py +18 -1
  228. mindspore/ops_generate/common/template.py +102 -245
  229. mindspore/ops_generate/common/template_utils.py +212 -0
  230. mindspore/ops_generate/gen_custom_ops.py +69 -0
  231. mindspore/ops_generate/op_def/ops_def_cc_generator.py +78 -7
  232. mindspore/ops_generate/op_def_py/base_op_prim_py_generator.py +360 -0
  233. mindspore/ops_generate/op_def_py/custom_op_prim_py_generator.py +140 -0
  234. mindspore/ops_generate/op_def_py/op_def_py_generator.py +54 -7
  235. mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -312
  236. mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +74 -17
  237. mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +22 -5
  238. mindspore/ops_generate/pyboost/gen_pyboost_func.py +0 -16
  239. mindspore/ops_generate/pyboost/op_template_parser.py +3 -2
  240. mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +21 -5
  241. mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +2 -2
  242. mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +30 -10
  243. mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +10 -3
  244. mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +1 -1
  245. mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +19 -9
  246. mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +71 -28
  247. mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +10 -9
  248. mindspore/ops_generate/pyboost/pyboost_utils.py +27 -16
  249. mindspore/ops_generate/resources/yaml_loader.py +13 -0
  250. mindspore/ops_generate/tensor_py_cc_generator.py +2 -2
  251. mindspore/parallel/_auto_parallel_context.py +5 -15
  252. mindspore/parallel/_cell_wrapper.py +1 -1
  253. mindspore/parallel/_parallel_serialization.py +4 -6
  254. mindspore/parallel/_ps_context.py +2 -2
  255. mindspore/parallel/_utils.py +34 -17
  256. mindspore/parallel/auto_parallel.py +23 -9
  257. mindspore/parallel/checkpoint_transform.py +20 -2
  258. mindspore/parallel/cluster/process_entity/_api.py +28 -33
  259. mindspore/parallel/cluster/process_entity/_utils.py +9 -5
  260. mindspore/parallel/cluster/run.py +5 -3
  261. mindspore/{experimental/llm_boost/ascend_native → parallel/distributed}/__init__.py +21 -22
  262. mindspore/parallel/distributed/distributed_data_parallel.py +393 -0
  263. mindspore/parallel/distributed/flatten_grad_buffer.py +295 -0
  264. mindspore/parallel/function/reshard_func.py +6 -5
  265. mindspore/parallel/nn/parallel_cell_wrapper.py +40 -3
  266. mindspore/parallel/nn/parallel_grad_reducer.py +0 -8
  267. mindspore/parallel/shard.py +7 -21
  268. mindspore/parallel/strategy.py +336 -0
  269. mindspore/parallel/transform_safetensors.py +127 -20
  270. mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +13 -9
  271. mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +1 -1
  272. mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +1 -1
  273. mindspore/profiler/common/constant.py +5 -0
  274. mindspore/profiler/common/file_manager.py +9 -0
  275. mindspore/profiler/common/msprof_cmd_tool.py +40 -4
  276. mindspore/profiler/common/path_manager.py +65 -24
  277. mindspore/profiler/common/profiler_context.py +27 -14
  278. mindspore/profiler/common/profiler_info.py +3 -3
  279. mindspore/profiler/common/profiler_meta_data.py +1 -0
  280. mindspore/profiler/common/profiler_op_analyse.py +10 -6
  281. mindspore/profiler/common/profiler_path_manager.py +13 -0
  282. mindspore/profiler/common/util.py +30 -3
  283. mindspore/profiler/dynamic_profiler.py +91 -46
  284. mindspore/profiler/envprofiler.py +30 -5
  285. mindspore/profiler/experimental_config.py +18 -2
  286. mindspore/profiler/platform/cpu_profiler.py +10 -4
  287. mindspore/profiler/platform/npu_profiler.py +34 -7
  288. mindspore/profiler/profiler.py +193 -145
  289. mindspore/profiler/profiler_action_controller.py +1 -1
  290. mindspore/profiler/profiler_interface.py +2 -2
  291. mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
  292. mindspore/run_check/_check_version.py +108 -24
  293. mindspore/runtime/__init__.py +9 -6
  294. mindspore/runtime/executor.py +35 -0
  295. mindspore/runtime/memory.py +113 -0
  296. mindspore/runtime/thread_bind_core.py +1 -1
  297. mindspore/swresample-4.dll +0 -0
  298. mindspore/swscale-6.dll +0 -0
  299. mindspore/tinyxml2.dll +0 -0
  300. mindspore/{experimental/llm_boost → tools}/__init__.py +5 -5
  301. mindspore/tools/data_dump.py +130 -0
  302. mindspore/tools/sdc_detect.py +91 -0
  303. mindspore/tools/stress_detect.py +63 -0
  304. mindspore/train/__init__.py +6 -6
  305. mindspore/train/_utils.py +8 -21
  306. mindspore/train/amp.py +6 -7
  307. mindspore/train/callback/_callback.py +2 -1
  308. mindspore/train/callback/_checkpoint.py +1 -17
  309. mindspore/train/callback/_flops_collector.py +10 -6
  310. mindspore/train/callback/_train_fault_tolerance.py +72 -25
  311. mindspore/train/data_sink.py +5 -9
  312. mindspore/train/dataset_helper.py +5 -5
  313. mindspore/train/model.py +41 -230
  314. mindspore/train/serialization.py +160 -401
  315. mindspore/train/train_thor/model_thor.py +2 -2
  316. mindspore/turbojpeg.dll +0 -0
  317. mindspore/utils/__init__.py +6 -3
  318. mindspore/utils/dlpack.py +92 -0
  319. mindspore/utils/dryrun.py +1 -1
  320. mindspore/utils/runtime_execution_order_check.py +10 -0
  321. mindspore/utils/sdc_detect.py +14 -12
  322. mindspore/utils/stress_detect.py +43 -0
  323. mindspore/utils/utils.py +152 -16
  324. mindspore/version.py +1 -1
  325. {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/METADATA +3 -2
  326. {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/RECORD +330 -344
  327. mindspore/_extends/remote/kernel_build_server_ascend.py +0 -75
  328. mindspore/communication/_hccl_management.py +0 -297
  329. mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +0 -207
  330. mindspore/experimental/llm_boost/ascend_native/llm_boost.py +0 -52
  331. mindspore/experimental/llm_boost/atb/__init__.py +0 -23
  332. mindspore/experimental/llm_boost/atb/boost_base.py +0 -385
  333. mindspore/experimental/llm_boost/atb/llama_boost.py +0 -137
  334. mindspore/experimental/llm_boost/atb/qwen_boost.py +0 -124
  335. mindspore/experimental/llm_boost/register.py +0 -130
  336. mindspore/experimental/llm_boost/utils.py +0 -31
  337. mindspore/include/OWNERS +0 -7
  338. mindspore/mindspore_cpu_res_manager.dll +0 -0
  339. mindspore/mindspore_ops_kernel_common.dll +0 -0
  340. mindspore/mindspore_res_manager.dll +0 -0
  341. mindspore/nn/optim/_dist_optimizer_registry.py +0 -111
  342. mindspore/nn/reinforcement/_batch_read_write.py +0 -142
  343. mindspore/nn/reinforcement/_tensors_queue.py +0 -152
  344. mindspore/nn/reinforcement/tensor_array.py +0 -145
  345. mindspore/opencv_core452.dll +0 -0
  346. mindspore/opencv_imgcodecs452.dll +0 -0
  347. mindspore/ops/_op_impl/aicpu/priority_replay_buffer.py +0 -113
  348. mindspore/ops/_op_impl/aicpu/reservoir_replay_buffer.py +0 -96
  349. mindspore/ops/_op_impl/aicpu/sparse_cross.py +0 -42
  350. mindspore/ops/_op_impl/cpu/buffer_get.py +0 -28
  351. mindspore/ops/_op_impl/cpu/buffer_sample.py +0 -28
  352. mindspore/ops/_op_impl/cpu/priority_replay_buffer.py +0 -42
  353. mindspore/ops/operations/_tensor_array.py +0 -359
  354. mindspore/ops/operations/rl_ops.py +0 -288
  355. mindspore/parallel/_offload_context.py +0 -275
  356. mindspore/parallel/_recovery_context.py +0 -115
  357. mindspore/parallel/_transformer/__init__.py +0 -35
  358. mindspore/parallel/_transformer/layers.py +0 -765
  359. mindspore/parallel/_transformer/loss.py +0 -251
  360. mindspore/parallel/_transformer/moe.py +0 -693
  361. mindspore/parallel/_transformer/op_parallel_config.py +0 -222
  362. mindspore/parallel/_transformer/transformer.py +0 -3124
  363. mindspore/parallel/mpi/_mpi_config.py +0 -116
  364. mindspore/profiler/common/validator/validate_path.py +0 -84
  365. mindspore/train/memory_profiling_pb2.py +0 -298
  366. mindspore/utils/hooks.py +0 -81
  367. /mindspore/common/{_auto_dynamic.py → dynamic_shape/_auto_dynamic.py} +0 -0
  368. {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/WHEEL +0 -0
  369. {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/entry_points.txt +0 -0
  370. {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/top_level.txt +0 -0
@@ -78,14 +78,14 @@ def get_convert_type_str(dtype: str, optional, use_basic_type=False):
  'float': 'ToFloat',
  'bool': 'ToBool',
  'number': 'ToScalar',
- 'tuple[int]': 'ToIntList<py::tuple>',
- 'tuple[float]': 'ToFloatList<py::tuple>',
- 'tuple[bool]': 'ToBoolList<py::tuple>',
- 'tuple[tensor]': 'ToTensorList<py::tuple>',
- 'list[int]': 'ToIntList<py::list>',
- 'list[float]': 'ToFloatList<py::list>',
- 'list[bool]': 'ToBoolList<py::list>',
- 'list[tensor]': 'ToTensorList<py::list>',
+ 'tuple[int]': 'ToIntList<CPythonTuple>',
+ 'tuple[float]': 'ToFloatList<CPythonTuple>',
+ 'tuple[bool]': 'ToBoolList<CPythonTuple>',
+ 'tuple[tensor]': 'ToTensorList<CPythonTuple>',
+ 'list[int]': 'ToIntList<CPythonList>',
+ 'list[float]': 'ToFloatList<CPythonList>',
+ 'list[bool]': 'ToBoolList<CPythonList>',
+ 'list[tensor]': 'ToTensorList<CPythonList>',
  'tensor': 'ToTensor',
  'str': 'ToString',
  'type': 'ToDtype',
@@ -97,14 +97,14 @@ def get_convert_type_str(dtype: str, optional, use_basic_type=False):
  'tensor': 'ToTensorOptional',
  'type': 'ToDtypeOptional',
  'str': 'ToStringOptional',
- 'tuple[int]': 'ToIntListOptional<py::tuple>',
- 'tuple[float]': 'ToFloatListOptional<py::tuple>',
- 'tuple[bool]': 'ToBoolListOptional<py::tuple>',
- 'tuple[tensor]': 'ToTensorListOptional<py::tuple>',
- 'list[int]': 'ToIntListOptional<py::list>',
- 'list[float]': 'ToFloatListOptional<py::list>',
- 'list[bool]': 'ToBoolListOptional<py::list>',
- 'list[tensor]': 'ToTensorListOptional<py::list>',
+ 'tuple[int]': 'ToIntListOptional<CPythonTuple>',
+ 'tuple[float]': 'ToFloatListOptional<CPythonTuple>',
+ 'tuple[bool]': 'ToBoolListOptional<CPythonTuple>',
+ 'tuple[tensor]': 'ToTensorListOptional<CPythonTuple>',
+ 'list[int]': 'ToIntListOptional<CPythonList>',
+ 'list[float]': 'ToFloatListOptional<CPythonList>',
+ 'list[bool]': 'ToBoolListOptional<CPythonList>',
+ 'list[tensor]': 'ToTensorListOptional<CPythonList>',
  }
  basic_optional_type_convert = {
  'tuple[int]': "ToBasicIntVectorOptional",
@@ -385,6 +385,17 @@ def get_input_dtype(dtype: str, optional, use_basic_type=False):
  raise TypeError(f"""Unsupported convert type {dtype} for args.""")


+ def get_output_dtype(dtype: str):
+ type_convert = {
+ 'tensor': "mindspore::tensor::TensorPtr",
+ 'tuple[tensor]': "std::vector<mindspore::tensor::TensorPtr>",
+ 'list[tensor]': "std::vector<mindspore::tensor::TensorPtr>",
+ }
+ if dtype in type_convert:
+ return type_convert[dtype]
+ raise TypeError(f"""Unsupported convert type {dtype} for args.""")
+
+
  def is_cube(class_name):
  cube_set = {'Bmm', 'Baddbmm', 'MatMulExt', 'Mv'}
  if class_name in cube_set:
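The new get_output_dtype helper is pure dictionary dispatch over the three supported output kinds; its behavior can be exercised in isolation (entries and the error text are exactly those in the hunk):

print(get_output_dtype('tensor'))         # mindspore::tensor::TensorPtr
print(get_output_dtype('tuple[tensor]'))  # std::vector<mindspore::tensor::TensorPtr>
get_output_dtype('list[int]')             # TypeError: Unsupported convert type list[int] for args.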
@@ -29,6 +29,7 @@ class YamlLoader(ResourceLoader):
  """
  YamlLoader is a utility class for loading yaml files.
  """
+
  def __init__(self, resouce_type: ResourceType, yaml_path: Union[Sequence[str], str]):
  """
  Initialize YamlLoader.
@@ -65,15 +66,26 @@ class OpDocYamlLoader(YamlLoader):
  """
  OpDocYamlLoader is a class for loading op primitive doc yaml files.
  """
+
  def __init__(self):
  op_doc_yaml_path = os.path.join(K.WORK_DIR, K.MS_OP_DEF_YAML_PATH, "doc")
  super().__init__(ResourceType.OP_DOC_YAML, op_doc_yaml_path)


+ class CustomOpDocYamlLoader(YamlLoader):
+ """
+ CustomOpDocYamlLoader is a class for loading op primitive doc yaml files.
+ """
+
+ def __init__(self, doc_yaml_path):
+ super().__init__(ResourceType.OP_DOC_YAML, doc_yaml_path)
+
+
  class TensorMethodDocYamlLoader(YamlLoader):
  """
  TensorMethodDocYamlLoader is a class for loading tensor method doc yaml files.
  """
+
  def __init__(self):
  tensor_method_doc_yaml_path = os.path.join(K.WORK_DIR, K.MS_TENSOR_METHOD_DOC_YAML_PATH)
  super().__init__(ResourceType.TENSOR_METHOD_DOC_YAML, tensor_method_doc_yaml_path)
@@ -83,6 +95,7 @@ class MintFuncDocYamlLoader(YamlLoader):
  """
  MintFuncDocYamlLoader is a class for loading mint func doc yaml files.
  """
+
  def __init__(self):
  mint_func_doc_yaml_path = os.path.join(K.WORK_DIR, K.MS_MINT_FUNC_DOC_YAML_PATH)
  super().__init__(ResourceType.MINT_FUNC_DOC_YAML, mint_func_doc_yaml_path)
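The added CustomOpDocYamlLoader mirrors OpDocYamlLoader but takes the doc directory as a constructor argument instead of deriving it from K.WORK_DIR, so custom-operator packages can point it at their own doc tree. A hypothetical call site (the path is illustrative):

builtin_docs = OpDocYamlLoader()                                # path derived from K.WORK_DIR
custom_docs = CustomOpDocYamlLoader("/path/to/custom_ops/doc")  # caller-supplied, hypothetical path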
@@ -13,7 +13,7 @@
  # limitations under the License.
  # ============================================================================
  """
- Generates mindspore/ccsrc/pybind_api/ir/tensor_py.cc which includes the CPython Tensor APIs.
+ Generates mindspore/ccsrc/pybind_api/ir/tensor/tensor_py.cc which includes the CPython Tensor APIs.
  """

  import os
@@ -26,7 +26,7 @@ from pyboost import pyboost_utils

  class TensorPyCppGenerator(BaseGenerator):
  """
- This class is responsible for generating mindspore/ccsrc/pybind_api/ir/tensor_register/
+ This class is responsible for generating mindspore/ccsrc/pybind_api/ir/tensor/tensor_register/
  auto_generate/tensor_py_gen.cc
  """
  def __init__(self):
@@ -81,6 +81,7 @@ class _PipelineScheduler:
  PIPELINE_SEQPIPE = "seqpipe"
  PIPELINE_SEQVPP = "seqvpp"
  PIPELINE_SEQSMARTVPP = "seqsmartvpp"
+ PIPELINE_ZBV = "zero_bubble_v"


  class _AutoParallelContext:
@@ -434,13 +435,6 @@ class _AutoParallelContext:
  """
  self.check_context_handle()
  run_mode = context.get_context("mode")
- if run_mode == context.PYNATIVE_MODE and parallel_mode not in (
- context.ParallelMode.DATA_PARALLEL, context.ParallelMode.STAND_ALONE,
- context.ParallelMode.AUTO_PARALLEL):
- raise ValueError(f"Pynative only supports STAND_ALONE, DATA_PARALLEL and AUTO_PARALLEL using"
- f" sharding_propagation under shard function"
- f" for ParallelMode, "
- f"but got {parallel_mode.upper()}.")
  ret = self._context_handle.set_parallel_mode(parallel_mode)
  if ret is False:
  raise ValueError("The context configuration parameter 'parallel_mode' only support 'stand_alone', "
@@ -604,9 +598,6 @@ class _AutoParallelContext:
  if not isinstance(dim, int):
  raise TypeError("For 'set_auto_parallel_context', the element of argument "
  "'dataset_strategy' must be int type, but got the type : {} .".format(type(dim)))
- if context.get_context('mode') == context.PYNATIVE_MODE:
- raise ValueError("In PyNative mode, the setting value of 'dataset_strategy' must be either 'full_batch' "
- f"or 'data_parallel', but got {dataset_strategy}.")
  self._dataset_strategy_using_str = False
  self._context_handle.set_dataset_strategy(dataset_strategy)

@@ -646,9 +637,6 @@ class _AutoParallelContext:
  return "full_batch"
  return "data_parallel"
  dataset_strategy = self._context_handle.get_dataset_strategy()
- if context.get_context('mode') == context.PYNATIVE_MODE:
- raise ValueError("In PyNative mode, the value of 'dataset_strategy' must be either 'full_batch' "
- f"or 'data_parallel', but got the setting value is {dataset_strategy}.")
  return dataset_strategy

  def set_grad_accumulation_step(self, grad_accumulation_step):
@@ -662,7 +650,7 @@ class _AutoParallelContext:
  raise ValueError("The interface is deprecated. To use gradient accumulation, "
  "please use GradAccumulationCell in mindspore.nn.wrap.cell_wrapper.")
  self.check_context_handle()
- Validator.check_positive_int(grad_accumulation_step)
+ Validator.check_positive_int(grad_accumulation_step, prim_name='grad_accumulation_step')
  self._context_handle.set_grad_accumulation_step(grad_accumulation_step)

  def get_grad_accumulation_step(self):
@@ -998,6 +986,8 @@ class _AutoParallelContext:
  _PipelineScheduler.PIPELINE_GPIPE,
  _PipelineScheduler.PIPELINE_SEQPIPE,
  _PipelineScheduler.PIPELINE_SEQVPP,
+ _PipelineScheduler.PIPELINE_SEQSMARTVPP,
+ _PipelineScheduler.PIPELINE_ZBV,
  _PipelineScheduler.PIPELINE_SEQSMARTVPP])
  scheduler_val = pipeline_config[pp_scheduler]
  if not pipeline_config[pp_interleave] and scheduler_val != _PipelineScheduler.PIPELINE_1F1B:
@@ -1072,7 +1062,7 @@ class _AutoParallelContext:

  if threshold_name in parallel_optimizer_config:
  Validator.check_non_negative_int(
- parallel_optimizer_config[threshold_name])
+ parallel_optimizer_config[threshold_name], prim_name=threshold_name)
  self._context_handle.set_parallel_optimizer_threshold(
  parallel_optimizer_config[threshold_name])
@@ -263,7 +263,7 @@ def _single_parameter_broadcast(net, layout, param_not_load=None, param_loaded=N
  if not single_params:
  return
  param_redundancy_reversed = _get_param_redundancy_reversed(param_redundancy, cur_rank)
- if not param_redundancy_reversed or cur_rank not in single_params:
+ if not param_redundancy_reversed:
  return
  net_param_dict = net.parameters_dict()
  _chang_parallel_context(origin_dataset_strategy)
@@ -144,8 +144,7 @@ def _build_protobuf_strategy(strategy_filename):
  parallel_strategy_map = _load_protobuf_strategy(strategy_filename)
  layout_items = parallel_strategy_map.parallel_layout_item
  if not layout_items:
- raise ValueError(f"For 'build_searched_strategy', the strategy file {strategy_filename} has no sliced "
- f"parameter, please check whether the 'strategy_filename' is correct.")
+ return {}

  strategy = {}
  for layout_item in layout_items:
@@ -159,6 +158,8 @@ def _build_json_strategy(strategy_filename):
  """build strategy from json file"""
  with open(strategy_filename, 'r') as f:
  json_content = json.load(f)
+ if "parallel_layout_item" not in json_content:
+ return {}
  layout_items = json_content.get("parallel_layout_item")
  strategy = {}
  for parameter_name, layout_item in layout_items.items():
@@ -525,10 +526,7 @@ def _make_dir(path, arg_name):
  else:
  ms.log.debug("The directory(%s) doesn't exist, will create it", path)
  try:
- permissions = os.R_OK | os.W_OK | os.X_OK
- os.umask(permissions << 3 | permissions)
- mode = permissions << 6
- os.makedirs(path, mode=mode, exist_ok=True)
+ os.makedirs(path, mode=0o700, exist_ok=True)
  real_path = path
  except PermissionError as e:
  ms.log.critical("No write permission on the directory(%r), error = %r", path, e)
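The removed _make_dir arithmetic and the new literal produce the same owner-only directory mode; the difference is that the old code also reset the process-wide umask as a side effect. A quick check of the arithmetic (pure Python):

import os

permissions = os.R_OK | os.W_OK | os.X_OK       # 4 | 2 | 1 == 0o7
assert permissions << 3 | permissions == 0o077  # the umask the old code installed
assert permissions << 6 == 0o700                # the mode it passed to os.makedirs
# The replacement passes mode=0o700 directly and leaves the umask untouched.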
@@ -114,8 +114,8 @@ def _set_ps_context(**kwargs):
  Default: ``False``.
  config_file_path (str): Configuration file path used by recovery. Default: ''.
  enable_ssl (bool): Set PS SSL mode enabled or disabled. Default: ``False``.
- There might be risk when this is set to False.
- It is user's responsibility to ensure the network environment is safe.
+ When set to False, users need to review and confirm the security of network environment
+ where the distributed job is located.
  client_password (str): Password to decrypt the secret key stored in the client certificate. Default: ''.
  server_password (str): Password to decrypt the secret key stored in the server certificate. Default: ''.
@@ -14,14 +14,15 @@
  # ============================================================================
  """Utils of auto parallel"""
  import os
+ import re
  from time import perf_counter
  from importlib import import_module
  import numpy as np
  import mindspore as ms
  from mindspore import context, log as logger
- from mindspore._c_expression import reset_op_id, reset_op_id_with_offset
+ from mindspore._c_expression import reset_op_id
  from mindspore.common.tensor import Tensor
- from mindspore.common.dtype import dtype_to_nptype
+ from mindspore.common.dtype import _dtype_to_nptype
  from mindspore.common import dtype as mstype
  from mindspore.communication.management import get_group_size, get_rank
  from mindspore.communication._comm_helper import _is_initialized
@@ -156,7 +157,7 @@ def _is_in_auto_parallel_mode():


  def _is_parallel_mode():
- if not _is_initialized() or context.get_context('mode') == context.PYNATIVE_MODE:
+ if not _is_initialized():
  return False
  if os.getenv("RUN_MODE") != "predict":
  return False
@@ -173,12 +174,6 @@ def _is_in_hybrid_parallel_mode():
  return _get_parallel_mode() == ms.ParallelMode.HYBRID_PARALLEL


- def _is_pynative_parallel():
- parallel_mode = context.get_auto_parallel_context('parallel_mode')
- return context.get_context('mode') == context.PYNATIVE_MODE and parallel_mode in (
- context.ParallelMode.SEMI_AUTO_PARALLEL, context.ParallelMode.AUTO_PARALLEL)
-
-
  def _get_full_batch():
  """Get whether to use full_batch."""
  return auto_parallel_context().get_full_batch()
@@ -452,7 +447,7 @@ def _to_full_tensor(elem, global_device_num, global_rank, scaling_sens=None):
  batchsize_per_device = item
  else:
  new_shape += (item,)
- new_tensor_numpy = np.zeros(new_shape, dtype_to_nptype(type_))
+ new_tensor_numpy = np.zeros(new_shape, _dtype_to_nptype(type_))  # pylint:disable=protected-access
  start = stage_rank * batchsize_per_device
  new_tensor_numpy[start: start + batchsize_per_device] = data.asnumpy()
  else:
@@ -466,7 +461,7 @@ def _to_full_tensor(elem, global_device_num, global_rank, scaling_sens=None):
  end = (stage_rank % dataset_strategy[index][i] + 1) * item
  s = slice(start, end, 1)
  slice_index += (s,)
- new_tensor_numpy = np.zeros(new_shape, dtype_to_nptype(type_))
+ new_tensor_numpy = np.zeros(new_shape, _dtype_to_nptype(type_))  # pylint:disable=protected-access
  new_tensor_numpy[slice_index] = data.asnumpy()
  new_tensor = Tensor(new_tensor_numpy, dtype=type_)
  lst.append(new_tensor)
@@ -590,11 +585,6 @@ def _reset_op_id():
  reset_op_id()


- def _reset_op_id_with_offset():
- """Reset op id with offset."""
- reset_op_id_with_offset()
-
-
  def _parallel_predict_check():
  """validate parallel model prediction"""
  if _is_in_auto_parallel_mode():
@@ -773,7 +763,7 @@ def _grads_divided_by_device_num_if_recomputation(grads):
  """
  If in pynative parallel and full_batch is True, divide grads by device num to ensure that the gradients is correct.
  """
- if not _is_pynative_parallel() or not _get_full_batch():
+ if not _get_full_batch():
  return grads

  device_num = _get_device_num()
@@ -804,3 +794,30 @@ def _check_rank(cur_rank, initial_rank, pipeline_stages):
  raise ValueError(f"For parameter broadcast, the cur_rank: {cur_rank} is wrong.")
  if initial_rank % (get_group_size() / pipeline_stages) != 0:
  raise ValueError(f"For parameter broadcast, the initial_rank: {initial_rank} is wrong.")
+
+
+ def _check_path_safe(path, arg_name):
+ """
+ Check input path string is safe.
+ """
+ illegal_patterns = [
+ r"\.\.",
+ r"//+",
+ r"~",
+ r"^\s*$",
+ r"\./\."
+ ]
+ for pattern in illegal_patterns:
+ if re.search(pattern, path):
+ pattern_info = pattern.replace('\\', '')
+ raise ValueError(f"{arg_name} contains '{pattern_info}' is not safe, please use a safe one.")
+
+
+ def _check_path_writable(path):
+ """
+ Check the write permission of the input path.
+ """
+ if not os.path.exists(path):
+ raise RuntimeError(f"{path} Path does not exist.")
+ if not os.access(path, os.W_OK):
+ raise PermissionError(f"Don't have the write permission on the directory {path}.")
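To make the new validators concrete, their behavior on a few inputs follows directly from the patterns and checks above (the first matching pattern raises):

_check_path_safe("/data/ckpt/rank_0", "dst_checkpoints_dir")  # passes: no illegal pattern
_check_path_safe("../ckpt", "dst_checkpoints_dir")            # ValueError: contains '..'
_check_path_safe("out//ckpt", "dst_checkpoints_dir")          # ValueError: contains '//+'
_check_path_safe("~/ckpt", "dst_checkpoints_dir")             # ValueError: contains '~'
_check_path_safe("   ", "dst_checkpoints_dir")                # ValueError: whitespace-only path

_check_path_writable("/data/ckpt")  # RuntimeError if missing, PermissionError if not writable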
@@ -14,6 +14,7 @@
  # ============================================================================
  """Cell of auto parallel"""
  import os
+ from mindspore import jit
  from mindspore.nn.cell import Cell
  from mindspore.parallel.shard import Layout
  from mindspore.communication.management import get_rank, get_group_size
@@ -281,7 +282,8 @@ class AutoParallel(Cell):
  Note:
  - It only works when `parallel_mode=sharding_propagation`.
  - When performing distributed training, users can first save the strategy using dryrun on a single device
- and then load strategy to perform distributed training.
+ and then load strategy to perform distributed training. Note that only the first device of each node will
+ save the strategy file, so the simulated rank id specified by Dryrun must be divisible by 8.

  Args:
  file_path (str): Path to save parallel strategy json, must be an absolute path.
@@ -511,17 +513,17 @@ class AutoParallel(Cell):
  raise ValueError("For 'AutoParallel.pipeline', the argument 'stages' "
  "must be larger than zero, but got value: {}.".format(stages))
  if not isinstance(output_broadcast, bool):
- raise TypeError("For 'AutoParallel.pipeline', the argument 'stages' "
+ raise TypeError("For 'AutoParallel.pipeline', the argument 'output_broadcast' "
  "must be bool type, but got the type : {}.".format(type(output_broadcast)))
  if not isinstance(interleave, bool):
- raise TypeError("For 'AutoParallel.pipeline', the argument 'stages' "
+ raise TypeError("For 'AutoParallel.pipeline', the argument 'interleave' "
  "must be bool type, but got the type : {}.".format(type(interleave)))
  if not isinstance(scheduler, str):
- raise TypeError("For 'AutoParallel.pipeline', the argument 'stages' "
+ raise TypeError("For 'AutoParallel.pipeline', the argument 'scheduler' "
  "must be str type, but got the type : {}.".format(type(scheduler)))
- if scheduler not in ("1f1b", "gpipe", "seqpipe", "seqvpp", "seqsmartvpp"):
+ if scheduler not in ("1f1b", "gpipe", "seqpipe", "seqvpp", "seqsmartvpp", "zero_bubble_v"):
  raise ValueError("For 'AutoParallel.pipeline', the argument "
- "'scheduler' must be '1f1b'/'gpipe'/'seqpipe'/'seqvpp'/'seqsmartvpp' ," \
+ "'scheduler' must be '1f1b'/'gpipe'/'seqpipe'/'seqvpp'/'seqsmartvpp'/'zero_bubble_v' ," \
  " but got the value : {}."
  .format(scheduler))
  self._pipeline_stages = stages
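With "zero_bubble_v" now accepted by this validation, selecting the new scheduler looks like the following sketch. AutoParallel's constructor arguments are elided, and interleave=True is used because the context check in the @@ -998 hunk above suggests schedulers other than "1f1b" require interleaving:

from mindspore.parallel.auto_parallel import AutoParallel  # import path assumed from this wheel's layout

parallel_net = AutoParallel(net)  # 'net' is the training Cell; other constructor args elided
parallel_net.pipeline(stages=4, output_broadcast=False, interleave=True,
                      scheduler="zero_bubble_v")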
@@ -665,8 +667,11 @@ class AutoParallel(Cell):
  - recomputation_communication_overlap (bool): Enable overlap between recompute ops and communication ops
  if True.
  Default: False.
- grad_matmul_communication_overlap (bool): Enable overlap between dw matmul and
- tensor parallel communication ops if True. Default: False.
+ grad_matmul_communication_overlap (bool, str): When set to ``True``, it indicates that overlap
+ between dw matmul and tensor parallel communication is enabled. When set to ``False``, it indicates
+ that this feature is disabled. When set to str, it only optimizes the specified communication
+ operator types, with operators separated by ``,``. For example, "AlltoAll,AlltoAllV" indicates that
+ only ``AlltoAll`` and ``AlltoAllV`` are optimized. Default: ``False``.
  - grad_fa_allgather_overlap (bool): Enable overlap between duplicated allgather by recomputing
  in sequence parallel and flashattentionscoregrad ops if True. Default: False.
  - enable_communication_fusion (bool): Enable communication fusion to optimize the number of
@@ -681,7 +686,9 @@ class AutoParallel(Cell):
  and optimizer parallel allgather communication if True. Currently, do not support
  `O2 <https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.JitConfig.html>`_
  Default: False.
- - computation_communication_fusion_level (int): Enable the fusion between compute and communicate.
+ - computation_communication_fusion_level (int): Enable the fusion between compute and communicate,
+ which fuses communication tasks and computing tasks, allows for partial pipelining and parallel
+ execution of these tasks during operation, thereby enhancing performance.
  Default: ``0``. Note: This function must be used with Ascend Training Solution 24.0.RC2 or later.
  This is an experimental configuration, may be changed or canceled in the future.

@@ -692,6 +699,12 @@ class AutoParallel(Cell):
  - 2: Apply fusion to backward nodes.

  - 3: Apply fusion to all nodes.
+
+ .. warning::
+ After setting ``export MS_ENABLE_LCCL=on``, the fusion operator based on memory semantics will be
+ used. Please note that this operator is still in an experimental stage and may be changed or
+ removed in the future.
+
  - dataset_broadcast_opt_level (int): Optimize the scenario that the dataset repeated reading. Only
  support O0/O1 jit level. It doesn't work in O2 mode. Default: ``0``.
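As an illustration of the options documented above, a parallel speed-up configuration file might contain the following. Key names are copied from the docstring, but the exact JSON schema is owned by MindSpore's parallel_speed_up_json support (the file path is handed to ascend_config['parallel_speed_up_json_path'] in the next hunk), so treat this as a hedged sketch:

import json

# Hypothetical parallel_speed_up.json contents; keys mirror the documented options.
speed_up_config = {
    "recomputation_communication_overlap": True,
    "grad_matmul_communication_overlap": "AlltoAll,AlltoAllV",  # str form: only these op types
    "computation_communication_fusion_level": 2,                # 2 = fuse backward nodes
    "dataset_broadcast_opt_level": 0,
}
with open("parallel_speed_up.json", "w") as f:
    json.dump(speed_up_config, f, indent=4)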
@@ -735,5 +748,6 @@
  self._transformer_opt_config = file_path
  ctx.ascend_config['parallel_speed_up_json_path'] = file_path

+ @jit
  def construct(self, *args, **kwargs):
  return self.network(*args, **kwargs)
@@ -31,7 +31,7 @@ from mindspore.communication.management import get_rank, get_group_size
  from mindspore.parallel._tensor import _load_tensor, _reshape_param_data, _reshape_param_data_with_weight, \
  _get_tensor_slice_index, _get_tensor_strategy
  from mindspore.parallel._utils import _is_in_auto_parallel_mode, _get_pipeline_stages, _infer_rank_list, \
- _remove_repeated_slices, _get_auto_parallel_net
+ _remove_repeated_slices, _get_auto_parallel_net, _check_path_safe, _check_path_writable
  from mindspore.parallel._parallel_serialization import _rank_list_for_transform_parallel_checkpoint, \
  _transform_parallel_checkpoint, _get_device_num_from_strategy, _make_dir, _build_searched_strategy, \
  _extract_layout_map, _extract_src_dst_layout_map, _parameter_not_in_local_stage, _extract_pipeline_stage_num, \
@@ -69,7 +69,9 @@ def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
  >>> ms.parallel.merge_pipeline_strategys("./src_strategy_dir", "./dst_strategy.ckpt")

  """
- dst_strategy_dir, _ = os.path.split(dst_strategy_file)
+ dst_strategy_file = os.path.normpath(dst_strategy_file)
+ dst_strategy_file = os.path.abspath(dst_strategy_file)
+ dst_strategy_dir = os.path.dirname(dst_strategy_file)
  if not os.path.exists(dst_strategy_dir):
  _make_dir(dst_strategy_dir, "path")
  if not os.path.isdir(src_strategy_dirs):
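The three-step normalization replaces a bare os.path.split so that relative or dot-laden inputs resolve to one canonical directory before it is created; standard-library behavior, for example:

import os

dst = "out/../out/./dst_strategy.ckpt"
dst = os.path.abspath(os.path.normpath(dst))  # e.g. /work/out/dst_strategy.ckpt (depends on cwd)
print(os.path.dirname(dst))                   # /work/out, the directory that gets created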
@@ -495,6 +497,9 @@ def _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckp
  def _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
  dst_strategy_file=None):
  """Transform checkpoints for all stages in src_strategy_file"""
+ _check_path_safe(dst_checkpoints_dir, "dst_checkpoints_dir")
+ dst_checkpoints_dir = os.path.realpath(dst_checkpoints_dir)
+ _check_path_safe(ckpt_prefix, "ckpt_prefix")
  checkpoints_rank_dir_list = os.path.join(src_checkpoints_dir, "rank_[0-9]*")
  all_checkpoint_files_map = {}
  for checkpoint_dir in glob.glob(checkpoints_rank_dir_list):
@@ -563,6 +568,7 @@ def _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix
  save_checkpoint_file_dir = os.path.join(dst_checkpoints_dir, "rank_{}".format(transform_rank))
  if not os.path.exists(save_checkpoint_file_dir):
  _make_dir(save_checkpoint_file_dir, "path")
+ _check_path_writable(save_checkpoint_file_dir)
  save_checkpoint_file_name = os.path.join(save_checkpoint_file_dir, save_checkpoint_file)
  ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)
  del param_total_dict_copy
@@ -913,6 +919,15 @@ def set_op_strategy_config(mode="SAVE", path=""):
  if file_type != ".json":
  raise KeyError("File type must be .json")
  dir_path = os.path.dirname(path)
+
+ normalized_path = os.path.abspath(os.path.realpath(path))
+ dangerous_paths = ['/etc', '/usr', '/bin', '/sbin', '/boot', '/proc', '/sys']
+ for dangerous_path in dangerous_paths:
+ if normalized_path.startswith(dangerous_path):
+ raise PermissionError(
+ f"Writing to system directory '{dangerous_path}' is not allowed"
+ )
+
  if dir_path and not os.path.exists(dir_path):
  os.makedirs(dir_path, mode=0o700, exist_ok=True)
  check_mode_type = ["SAVE", "LOAD"]
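With the new guard, a strategy path that resolves into a system directory now fails before anything is written; a sketch (assuming set_op_strategy_config is importable from mindspore.parallel, which this wheel's file layout suggests but the diff does not show):

from mindspore.parallel import set_op_strategy_config  # assumed import path

try:
    set_op_strategy_config(mode="SAVE", path="/etc/strategy.json")
except PermissionError as err:
    print(err)  # Writing to system directory '/etc' is not allowed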
@@ -1165,6 +1180,8 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
  train_strategy_filename = ms.context.get_auto_parallel_context("strategy_ckpt_load_file")

  _train_strategy = build_searched_strategy(train_strategy_filename)
+ if not _train_strategy:
+ return True
  train_strategy = _convert_to_list(_train_strategy)

  train_dev_count = 1
@@ -1180,6 +1197,7 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra

  param_total_dict = defaultdict(dict)
  for file_index, file_name in enumerate(checkpoint_filenames):
+ file_name = os.path.abspath(file_name)
  ckpt_dict = ms.load_checkpoint(file_name, dec_key=dec_key, dec_mode=dec_mode)
  for param_name, param in ckpt_dict.items():
  param_total_dict[param_name][file_index] = param
@@ -21,6 +21,7 @@ import subprocess
  import socket
  import psutil
  import mindspore.log as logger
+ from mindspore.utils import RSCPluginHandle
  from ._utils import _generate_cmd_args_list, _generate_cmd_args_list_with_core, _generate_url, \
  _is_local_ip, _convert_addr_to_ip, _send_scale_num, _get_local_ip, _generate_auto_bind_core_strategy, \
  _generate_bind_core_strategy
@@ -179,9 +180,12 @@ class _ProcessManager:
  self.is_simulation = self.sim_level != -1
  if self.is_simulation:
  os.environ["MS_SIMULATION_LEVEL"] = str(self.sim_level)
+ if self.sim_rank_id == -1:
+ self.sim_rank_id = int(os.getenv("RANK_ID", "-1"))
  elif os.getenv("MS_SIMULATION_LEVEL"):
  self.is_simulation = True
- self.sim_rank_id = int(os.getenv("RANK_ID", "-1"))
+ if self.sim_rank_id == -1:
+ self.sim_rank_id = int(os.getenv("RANK_ID", "-1"))
  if os.getenv("RANK_SIZE"):
  self.exported_rank_size = os.getenv("RANK_SIZE")
  # If sim_rank_id is set, single worker can be started.
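The net effect of this hunk is that an explicitly passed sim_rank_id now wins over the RANK_ID environment variable, which previously overwrote it in the MS_SIMULATION_LEVEL branch. The fallback logic in isolation (resolve_sim_rank_id is a hypothetical name for illustration):

import os

def resolve_sim_rank_id(sim_rank_id: int) -> int:
    # -1 means "not set by the user"; only then consult RANK_ID.
    if sim_rank_id == -1:
        sim_rank_id = int(os.getenv("RANK_ID", "-1"))
    return sim_rank_id

os.environ["RANK_ID"] = "3"
assert resolve_sim_rank_id(-1) == 3  # unset: falls back to the env var
assert resolve_sim_rank_id(7) == 7   # explicit value is preserved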
@@ -218,23 +222,28 @@ class _ProcessManager:

  self.proc_rank_map = {}
  self.enable_mindx = False
+ self.handler = None
  self._check_taskd()

  def _check_taskd(self):
  """check if enable taskd."""
- tft_env = os.getenv("MS_ENABLE_TFT", "")
- if any(v in tft_env for v in ('TTP:1', 'UCE:1', 'ARF:1', 'TSP:1', 'RSC:1', 'HCCE:1')):
- try:
- from taskd.python.framework.agent.ms_mgr.msrun_plugin import MSRunPlugin
- self.msmgr = MSRunPlugin()
- self.msmgr.register_callbacks("KILL_WORKER", self.kill_workers)
- self.msmgr.register_callbacks("START_ALL_WORKER", self.start_all_workers)
- self.msmgr.register_callbacks("START_WORKER_LIST", self.start_worker_list)
- self.msmgr.register_callbacks("MONITOR", self.monitor_rank_status)
- self.enable_mindx = True
- os.environ["MS_ENABLE_RECOVERY"] = str(1)
- except Exception as e: # pylint: disable=broad-except
- logger.warning(f"mindx is not installed, using original mindspore recovery strategy.: {str(e)}")
+ self.handler = RSCPluginHandle()
+ self.enable_mindx = self.handler.check_enable()
+ if self.enable_mindx is False:
+ self.handler = None
+ return
+ ret = self.handler.register_callback({"KILL_WORKER": self.kill_workers,
+ "START_ALL_WORKER": self.start_all_workers,
+ "START_WORKER_LIST": self.start_worker_list,
+ "MONITOR": self.monitor_rank_status
+ })
+ if not ret:
+ logger.warning(f"Register callback to mindx failed, process controlled by msrun.")
+ self.enable_mindx = False
+ self.handler = None
+ return
+ logger.warning(f"Mindx enabled, process controlled by mindx.")
+ os.environ["MS_ENABLE_RECOVERY"] = str(1)

  def run(self):
  """
@@ -257,7 +266,7 @@ class _ProcessManager:
  if self.is_master and not self.is_simulation:
  self.start_scheduler()
  if self.enable_mindx:
- self.msmgr.start()
+ self.handler.start()
  else:
  self.start_workers()
  if self.join:
@@ -379,8 +388,7 @@ class _ProcessManager:
  logger.error(f"Scheduler process {self.msn_process.pid} exit with exception.")

  if has_exception:
- logger.info("Analyzing exception log...")
- self._analyze_log()
+ self._analyze_sched_log()
  raise RuntimeError("Distributed job exited with exception. Please check logs in "
  f"directory: {self.log_dir}.")
@@ -580,26 +588,13 @@ class _ProcessManager:
  log_name = os.path.join(self.log_dir, formatted_log_name + "_" + str(index) + ".log")
  return node_id, log_name

- def _analyze_log(self):
+ def _analyze_sched_log(self):
  """
- Analyze exception logs.
+ Analyze scheduler log.
  """
  scheduler_log_path = os.path.join(self.log_dir, "scheduler.log")
- time_out_node_ids = []
  if os.path.exists(scheduler_log_path):
- with open(scheduler_log_path, "r") as log:
- scheduler_log = log.read()
- # Filter out abnormal logs.
- time_out_node_log = re.findall(r"node: .* is timed out", scheduler_log)
-
- # Filter out node ids of the processes which exit abnormally.
- def node_id_splitter(node_id):
- return re.split(" is timed out", re.split("node: ", node_id)[1])[0]
- for node_id in time_out_node_log:
- time_out_node_ids.append(node_id_splitter(node_id))
- logger.error(f"Time out nodes are {time_out_node_ids}")
-
- os.system(f"grep -rn -E 'ERROR|CRITICAL|Traceback|Error' -C 5 {self.log_dir}")
+ os.system(f"cat {scheduler_log_path} | grep -E 'ERROR|CRITICAL|Traceback|Error' -C 5")

  def format_worker_log_name(self):
  """