mindspore 2.4.10__cp311-cp311-win_amd64.whl → 2.6.0rc1__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mindspore might be problematic. Click here for more details.

Files changed (602)
  1. mindspore/.commit_id +1 -1
  2. mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
  3. mindspore/Newtonsoft.Json.dll +0 -0
  4. mindspore/__init__.py +13 -6
  5. mindspore/_c_dataengine.cp311-win_amd64.pyd +0 -0
  6. mindspore/_c_expression.cp311-win_amd64.pyd +0 -0
  7. mindspore/_c_mindrecord.cp311-win_amd64.pyd +0 -0
  8. mindspore/_check_jit_forbidden_api.py +3 -0
  9. mindspore/_checkparam.py +3 -38
  10. mindspore/_deprecated/__init__.py +17 -0
  11. mindspore/_deprecated/jit.py +198 -0
  12. mindspore/_extends/builtin_operations.py +1 -1
  13. mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
  14. mindspore/_extends/parse/__init__.py +6 -7
  15. mindspore/_extends/parse/compile_config.py +83 -0
  16. mindspore/_extends/parse/deprecated/__init__.py +0 -0
  17. mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +394 -0
  18. mindspore/_extends/parse/jit_fallback_modules/__init__.py +0 -0
  19. mindspore/_extends/parse/jit_fallback_modules/check_utils.py +123 -0
  20. mindspore/_extends/parse/jit_fallback_modules/third_party_modules.py +50 -0
  21. mindspore/_extends/parse/parser.py +46 -197
  22. mindspore/_extends/parse/resources.py +1 -5
  23. mindspore/_extends/parse/standard_method.py +217 -98
  24. mindspore/_extends/pijit/__init__.py +2 -2
  25. mindspore/_extends/pijit/pijit_func_white_list.py +17 -12
  26. mindspore/_extends/pijit/tensor_func_list.py +27 -0
  27. mindspore/_extends/utils.py +1 -1
  28. mindspore/amp.py +11 -5
  29. mindspore/atlprov.dll +0 -0
  30. mindspore/avcodec-59.dll +0 -0
  31. mindspore/avdevice-59.dll +0 -0
  32. mindspore/avfilter-8.dll +0 -0
  33. mindspore/avformat-59.dll +0 -0
  34. mindspore/avutil-57.dll +0 -0
  35. mindspore/boost/__init__.py +2 -2
  36. mindspore/boost/base.py +3 -7
  37. mindspore/boost/boost_cell_wrapper.py +138 -43
  38. mindspore/c1.dll +0 -0
  39. mindspore/c1xx.dll +0 -0
  40. mindspore/c2.dll +0 -0
  41. mindspore/common/__init__.py +6 -3
  42. mindspore/common/_grad_function.py +56 -0
  43. mindspore/common/_pijit_context.py +14 -5
  44. mindspore/common/_register_for_tensor.py +1 -2
  45. mindspore/common/_stub_tensor.py +30 -14
  46. mindspore/common/_tensor_cpp_method.py +17 -0
  47. mindspore/common/_tensor_docs.py +4760 -0
  48. mindspore/common/api.py +435 -371
  49. mindspore/common/auto_dynamic_shape.py +41 -44
  50. mindspore/common/dtype.py +39 -36
  51. mindspore/common/dump.py +9 -6
  52. mindspore/common/file_system.py +9 -1
  53. mindspore/common/generator.py +2 -0
  54. mindspore/common/hook_handle.py +6 -2
  55. mindspore/common/initializer.py +13 -10
  56. mindspore/common/jit_begin_end.py +94 -0
  57. mindspore/common/jit_config.py +6 -1
  58. mindspore/common/jit_context.py +76 -0
  59. mindspore/common/jit_trace.py +378 -0
  60. mindspore/common/lazy_inline.py +9 -3
  61. mindspore/common/mindir_util.py +10 -2
  62. mindspore/common/mutable.py +5 -4
  63. mindspore/common/parameter.py +135 -52
  64. mindspore/common/seed.py +2 -2
  65. mindspore/common/sparse_tensor.py +23 -17
  66. mindspore/common/tensor.py +951 -1992
  67. mindspore/communication/__init__.py +7 -5
  68. mindspore/communication/_comm_helper.py +52 -2
  69. mindspore/communication/comm_func.py +240 -181
  70. mindspore/communication/management.py +95 -26
  71. mindspore/context.py +314 -566
  72. mindspore/dataset/__init__.py +65 -37
  73. mindspore/dataset/audio/__init__.py +2 -8
  74. mindspore/dataset/audio/transforms.py +3 -17
  75. mindspore/dataset/callback/ds_callback.py +2 -1
  76. mindspore/dataset/core/config.py +87 -6
  77. mindspore/dataset/engine/cache_admin.py +3 -3
  78. mindspore/dataset/engine/cache_client.py +6 -5
  79. mindspore/dataset/engine/datasets.py +292 -267
  80. mindspore/dataset/engine/datasets_audio.py +22 -8
  81. mindspore/dataset/engine/datasets_standard_format.py +46 -27
  82. mindspore/dataset/engine/datasets_text.py +78 -48
  83. mindspore/dataset/engine/datasets_user_defined.py +182 -116
  84. mindspore/dataset/engine/datasets_vision.py +120 -44
  85. mindspore/dataset/engine/iterators.py +283 -63
  86. mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +1 -1
  87. mindspore/dataset/engine/obs/util.py +8 -0
  88. mindspore/dataset/engine/queue.py +40 -0
  89. mindspore/dataset/engine/samplers.py +289 -43
  90. mindspore/dataset/engine/serializer_deserializer.py +3 -2
  91. mindspore/dataset/engine/validators.py +53 -11
  92. mindspore/dataset/text/__init__.py +7 -6
  93. mindspore/dataset/text/transforms.py +6 -5
  94. mindspore/dataset/text/utils.py +3 -3
  95. mindspore/dataset/transforms/__init__.py +0 -9
  96. mindspore/dataset/transforms/py_transforms_util.py +17 -0
  97. mindspore/dataset/transforms/transforms.py +31 -14
  98. mindspore/dataset/utils/browse_dataset.py +1 -1
  99. mindspore/dataset/vision/__init__.py +2 -9
  100. mindspore/dataset/vision/transforms.py +202 -158
  101. mindspore/dataset/vision/utils.py +7 -5
  102. mindspore/dataset/vision/validators.py +1 -2
  103. mindspore/device_context/__init__.py +21 -0
  104. mindspore/device_context/ascend/__init__.py +25 -0
  105. mindspore/device_context/ascend/device.py +72 -0
  106. mindspore/device_context/ascend/op_debug.py +153 -0
  107. mindspore/device_context/ascend/op_precision.py +193 -0
  108. mindspore/device_context/ascend/op_tuning.py +123 -0
  109. mindspore/{ops_generate/gen_constants.py → device_context/cpu/__init__.py} +6 -17
  110. mindspore/device_context/cpu/device.py +62 -0
  111. mindspore/device_context/cpu/op_tuning.py +43 -0
  112. mindspore/device_context/gpu/__init__.py +21 -0
  113. mindspore/device_context/gpu/device.py +70 -0
  114. mindspore/device_context/gpu/op_precision.py +67 -0
  115. mindspore/device_context/gpu/op_tuning.py +175 -0
  116. mindspore/device_manager.py +170 -0
  117. mindspore/dnnl.dll +0 -0
  118. mindspore/dpcmi.dll +0 -0
  119. mindspore/experimental/es/embedding_service.py +35 -27
  120. mindspore/experimental/llm_boost/__init__.py +1 -0
  121. mindspore/experimental/llm_boost/ascend_native/__init__.py +22 -0
  122. mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +211 -0
  123. mindspore/experimental/llm_boost/ascend_native/llm_boost.py +52 -0
  124. mindspore/experimental/llm_boost/atb/boost_base.py +2 -3
  125. mindspore/experimental/llm_boost/atb/llama_boost.py +6 -1
  126. mindspore/experimental/llm_boost/register.py +1 -0
  127. mindspore/experimental/map_parameter.py +4 -4
  128. mindspore/experimental/optim/adadelta.py +6 -6
  129. mindspore/experimental/optim/adagrad.py +4 -4
  130. mindspore/experimental/optim/adam.py +7 -0
  131. mindspore/experimental/optim/adamax.py +4 -4
  132. mindspore/experimental/optim/adamw.py +4 -0
  133. mindspore/experimental/optim/asgd.py +1 -1
  134. mindspore/experimental/optim/lr_scheduler.py +73 -46
  135. mindspore/experimental/optim/radam.py +34 -31
  136. mindspore/experimental/optim/rprop.py +1 -1
  137. mindspore/experimental/optim/sgd.py +1 -1
  138. mindspore/hal/contiguous_tensors_handle.py +6 -10
  139. mindspore/hal/device.py +55 -53
  140. mindspore/hal/event.py +52 -52
  141. mindspore/hal/memory.py +157 -117
  142. mindspore/hal/stream.py +150 -109
  143. mindspore/include/api/context.h +0 -1
  144. mindspore/include/dataset/constants.h +7 -4
  145. mindspore/include/dataset/execute.h +2 -2
  146. mindspore/jpeg62.dll +0 -0
  147. mindspore/log.py +50 -0
  148. mindspore/mindrecord/__init__.py +21 -8
  149. mindspore/mindrecord/config.py +17 -316
  150. mindspore/mindrecord/filereader.py +1 -9
  151. mindspore/mindrecord/filewriter.py +5 -15
  152. mindspore/mindrecord/mindpage.py +1 -9
  153. mindspore/mindspore_backend_common.dll +0 -0
  154. mindspore/mindspore_backend_manager.dll +0 -0
  155. mindspore/mindspore_common.dll +0 -0
  156. mindspore/mindspore_core.dll +0 -0
  157. mindspore/mindspore_dump.dll +0 -0
  158. mindspore/mindspore_frontend.dll +0 -0
  159. mindspore/mindspore_glog.dll +0 -0
  160. mindspore/mindspore_memory_pool.dll +0 -0
  161. mindspore/mindspore_ms_backend.dll +0 -0
  162. mindspore/mindspore_ops.dll +0 -0
  163. mindspore/{mindspore_backend.dll → mindspore_ops_host.dll} +0 -0
  164. mindspore/mindspore_ops_kernel_common.dll +0 -0
  165. mindspore/mindspore_profiler.dll +0 -0
  166. mindspore/mindspore_pyboost.dll +0 -0
  167. mindspore/mindspore_pynative.dll +0 -0
  168. mindspore/mindspore_res_manager.dll +0 -0
  169. mindspore/mindspore_runtime_pipeline.dll +0 -0
  170. mindspore/mint/__init__.py +796 -759
  171. mindspore/mint/distributed/__init__.py +70 -4
  172. mindspore/mint/distributed/distributed.py +2679 -44
  173. mindspore/mint/linalg/__init__.py +8 -0
  174. mindspore/mint/nn/__init__.py +743 -22
  175. mindspore/mint/nn/functional.py +716 -23
  176. mindspore/mint/nn/layer/__init__.py +21 -4
  177. mindspore/mint/nn/layer/_functions.py +334 -0
  178. mindspore/mint/nn/layer/activation.py +276 -1
  179. mindspore/mint/nn/layer/basic.py +123 -0
  180. mindspore/mint/nn/layer/conv.py +921 -0
  181. mindspore/mint/nn/layer/normalization.py +223 -28
  182. mindspore/mint/nn/layer/padding.py +797 -0
  183. mindspore/mint/nn/layer/pooling.py +235 -0
  184. mindspore/mint/optim/__init__.py +3 -1
  185. mindspore/mint/optim/adam.py +223 -0
  186. mindspore/mint/optim/adamw.py +26 -19
  187. mindspore/mint/optim/sgd.py +171 -0
  188. mindspore/mint/special/__init__.py +2 -1
  189. mindspore/msobj140.dll +0 -0
  190. mindspore/mspdb140.dll +0 -0
  191. mindspore/mspdbcore.dll +0 -0
  192. mindspore/mspdbst.dll +0 -0
  193. mindspore/mspft140.dll +0 -0
  194. mindspore/msvcdis140.dll +0 -0
  195. mindspore/msvcp140_1.dll +0 -0
  196. mindspore/msvcp140_2.dll +0 -0
  197. mindspore/msvcp140_atomic_wait.dll +0 -0
  198. mindspore/msvcp140_codecvt_ids.dll +0 -0
  199. mindspore/multiprocessing/__init__.py +5 -0
  200. mindspore/nn/__init__.py +4 -1
  201. mindspore/nn/cell.py +1370 -189
  202. mindspore/nn/dynamic_lr.py +2 -1
  203. mindspore/nn/layer/activation.py +29 -27
  204. mindspore/nn/layer/basic.py +51 -35
  205. mindspore/nn/layer/channel_shuffle.py +3 -3
  206. mindspore/nn/layer/container.py +1 -1
  207. mindspore/nn/layer/conv.py +22 -17
  208. mindspore/nn/layer/embedding.py +12 -11
  209. mindspore/nn/layer/normalization.py +56 -49
  210. mindspore/nn/layer/padding.py +4 -3
  211. mindspore/nn/layer/pooling.py +120 -42
  212. mindspore/nn/layer/rnn_cells.py +1 -1
  213. mindspore/nn/layer/rnns.py +2 -1
  214. mindspore/nn/layer/timedistributed.py +5 -5
  215. mindspore/nn/layer/transformer.py +59 -36
  216. mindspore/nn/learning_rate_schedule.py +8 -4
  217. mindspore/nn/loss/loss.py +58 -55
  218. mindspore/nn/optim/ada_grad.py +7 -5
  219. mindspore/nn/optim/adadelta.py +11 -9
  220. mindspore/nn/optim/adafactor.py +1 -1
  221. mindspore/nn/optim/adam.py +17 -13
  222. mindspore/nn/optim/adamax.py +8 -7
  223. mindspore/nn/optim/adasum.py +5 -5
  224. mindspore/nn/optim/asgd.py +1 -1
  225. mindspore/nn/optim/ftrl.py +11 -9
  226. mindspore/nn/optim/lamb.py +1 -1
  227. mindspore/nn/optim/lars.py +1 -4
  228. mindspore/nn/optim/lazyadam.py +12 -10
  229. mindspore/nn/optim/momentum.py +7 -6
  230. mindspore/nn/optim/optimizer.py +3 -3
  231. mindspore/nn/optim/proximal_ada_grad.py +12 -10
  232. mindspore/nn/optim/rmsprop.py +13 -12
  233. mindspore/nn/optim/rprop.py +11 -9
  234. mindspore/nn/optim/sgd.py +9 -6
  235. mindspore/nn/optim/tft_wrapper.py +5 -2
  236. mindspore/nn/optim/thor.py +2 -1
  237. mindspore/nn/probability/bijector/bijector.py +17 -11
  238. mindspore/nn/probability/bijector/gumbel_cdf.py +5 -5
  239. mindspore/nn/probability/bijector/invert.py +2 -2
  240. mindspore/nn/probability/bijector/scalar_affine.py +3 -3
  241. mindspore/nn/probability/bijector/softplus.py +3 -2
  242. mindspore/nn/probability/distribution/beta.py +3 -3
  243. mindspore/nn/probability/distribution/categorical.py +1 -1
  244. mindspore/nn/probability/distribution/cauchy.py +4 -2
  245. mindspore/nn/probability/distribution/exponential.py +6 -7
  246. mindspore/nn/probability/distribution/gamma.py +2 -2
  247. mindspore/nn/probability/distribution/gumbel.py +2 -2
  248. mindspore/nn/probability/distribution/half_normal.py +5 -3
  249. mindspore/nn/probability/distribution/logistic.py +5 -3
  250. mindspore/nn/probability/distribution/poisson.py +1 -1
  251. mindspore/nn/probability/distribution/uniform.py +5 -3
  252. mindspore/nn/reinforcement/_tensors_queue.py +1 -1
  253. mindspore/nn/reinforcement/tensor_array.py +1 -1
  254. mindspore/nn/utils/init.py +13 -11
  255. mindspore/nn/wrap/__init__.py +6 -6
  256. mindspore/nn/wrap/cell_wrapper.py +181 -122
  257. mindspore/nn/wrap/grad_reducer.py +45 -36
  258. mindspore/nn/wrap/loss_scale.py +6 -7
  259. mindspore/numpy/array_creations.py +63 -65
  260. mindspore/numpy/array_ops.py +149 -144
  261. mindspore/numpy/logic_ops.py +41 -42
  262. mindspore/numpy/math_ops.py +365 -363
  263. mindspore/numpy/utils.py +17 -18
  264. mindspore/numpy/utils_const.py +5 -6
  265. mindspore/opencv_core452.dll +0 -0
  266. mindspore/opencv_imgcodecs452.dll +0 -0
  267. mindspore/opencv_imgproc452.dll +0 -0
  268. mindspore/ops/__init__.py +5 -3
  269. mindspore/ops/_grad_experimental/grad_comm_ops.py +112 -16
  270. mindspore/ops/_grad_experimental/grad_debug_ops.py +14 -2
  271. mindspore/ops/_grad_experimental/grad_inner_ops.py +9 -0
  272. mindspore/ops/_grad_experimental/grad_math_ops.py +2 -1
  273. mindspore/ops/_grad_experimental/taylor_rule.py +29 -0
  274. mindspore/ops/_op_impl/cpu/__init__.py +1 -0
  275. mindspore/ops/_op_impl/cpu/raise_op.py +28 -0
  276. mindspore/ops/_register_for_op.py +0 -11
  277. mindspore/{ops_generate → ops/_utils}/arg_dtype_cast.py +123 -4
  278. mindspore/{ops_generate → ops/_utils}/arg_handler.py +3 -65
  279. mindspore/ops/_vmap/vmap_array_ops.py +27 -25
  280. mindspore/ops/_vmap/vmap_base.py +0 -2
  281. mindspore/ops/_vmap/vmap_grad_nn_ops.py +21 -14
  282. mindspore/ops/_vmap/vmap_math_ops.py +15 -16
  283. mindspore/ops/_vmap/vmap_nn_ops.py +29 -42
  284. mindspore/ops/auto_generate/__init__.py +4 -3
  285. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +236 -46
  286. mindspore/ops/auto_generate/gen_extend_func.py +764 -124
  287. mindspore/ops/auto_generate/gen_ops_def.py +4018 -2264
  288. mindspore/ops/auto_generate/gen_ops_prim.py +15463 -5037
  289. mindspore/ops/auto_generate/pyboost_inner_prim.py +221 -87
  290. mindspore/ops/composite/__init__.py +2 -1
  291. mindspore/ops/composite/base.py +20 -25
  292. mindspore/ops/composite/math_ops.py +6 -16
  293. mindspore/ops/composite/multitype_ops/__init__.py +5 -2
  294. mindspore/ops/composite/multitype_ops/_compile_utils.py +228 -30
  295. mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -2
  296. mindspore/ops/composite/multitype_ops/add_impl.py +2 -1
  297. mindspore/ops/composite/multitype_ops/bitwise_and_impl.py +2 -1
  298. mindspore/ops/composite/multitype_ops/bitwise_or_impl.py +2 -1
  299. mindspore/ops/composite/multitype_ops/bitwise_xor_impl.py +2 -1
  300. mindspore/ops/composite/multitype_ops/div_impl.py +6 -4
  301. mindspore/ops/composite/multitype_ops/equal_impl.py +4 -3
  302. mindspore/ops/composite/multitype_ops/floordiv_impl.py +2 -1
  303. mindspore/ops/composite/multitype_ops/getitem_impl.py +3 -2
  304. mindspore/ops/composite/multitype_ops/greater_equal_impl.py +4 -3
  305. mindspore/ops/composite/multitype_ops/greater_impl.py +4 -3
  306. mindspore/ops/composite/multitype_ops/in_impl.py +2 -1
  307. mindspore/ops/composite/multitype_ops/invert_impl.py +50 -0
  308. mindspore/ops/composite/multitype_ops/left_shift_impl.py +2 -1
  309. mindspore/ops/composite/multitype_ops/less_equal_impl.py +4 -3
  310. mindspore/ops/composite/multitype_ops/less_impl.py +4 -3
  311. mindspore/ops/composite/multitype_ops/logic_not_impl.py +3 -2
  312. mindspore/ops/composite/multitype_ops/logical_and_impl.py +2 -1
  313. mindspore/ops/composite/multitype_ops/logical_or_impl.py +2 -1
  314. mindspore/ops/composite/multitype_ops/mod_impl.py +2 -1
  315. mindspore/ops/composite/multitype_ops/mul_impl.py +3 -2
  316. mindspore/ops/composite/multitype_ops/negative_impl.py +2 -1
  317. mindspore/ops/composite/multitype_ops/not_equal_impl.py +2 -1
  318. mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -1
  319. mindspore/ops/composite/multitype_ops/ones_like_impl.py +18 -0
  320. mindspore/ops/composite/multitype_ops/pow_impl.py +2 -30
  321. mindspore/ops/composite/multitype_ops/right_shift_impl.py +2 -1
  322. mindspore/ops/composite/multitype_ops/setitem_impl.py +2 -1
  323. mindspore/ops/composite/multitype_ops/sub_impl.py +2 -1
  324. mindspore/ops/function/__init__.py +40 -2
  325. mindspore/ops/function/_add_attr_func.py +58 -0
  326. mindspore/ops/function/array_func.py +2089 -2403
  327. mindspore/ops/function/clip_func.py +80 -23
  328. mindspore/ops/function/debug_func.py +57 -57
  329. mindspore/ops/function/grad/__init__.py +1 -0
  330. mindspore/ops/function/grad/grad_func.py +104 -71
  331. mindspore/ops/function/image_func.py +2 -2
  332. mindspore/ops/function/linalg_func.py +47 -78
  333. mindspore/ops/function/math_func.py +4501 -3802
  334. mindspore/ops/function/nn_func.py +1726 -620
  335. mindspore/ops/function/other_func.py +159 -1
  336. mindspore/ops/function/parameter_func.py +18 -84
  337. mindspore/ops/function/random_func.py +440 -387
  338. mindspore/ops/function/reshard_func.py +4 -70
  339. mindspore/ops/function/sparse_func.py +3 -3
  340. mindspore/ops/function/sparse_unary_func.py +6 -6
  341. mindspore/ops/function/spectral_func.py +25 -58
  342. mindspore/ops/function/vmap_func.py +24 -17
  343. mindspore/ops/functional.py +22 -7
  344. mindspore/ops/functional_overload.py +1440 -0
  345. mindspore/ops/op_info_register.py +32 -244
  346. mindspore/ops/operations/__init__.py +13 -7
  347. mindspore/ops/operations/_custom_ops_utils.py +247 -0
  348. mindspore/ops/operations/_embedding_cache_ops.py +4 -4
  349. mindspore/ops/operations/_grad_ops.py +2 -43
  350. mindspore/ops/operations/_infer_ops.py +2 -1
  351. mindspore/ops/operations/_inner_ops.py +43 -84
  352. mindspore/ops/operations/_ms_kernel.py +4 -10
  353. mindspore/ops/operations/_rl_inner_ops.py +1 -1
  354. mindspore/ops/operations/_scalar_ops.py +3 -2
  355. mindspore/ops/operations/_sequence_ops.py +1 -1
  356. mindspore/ops/operations/_tensor_array.py +1 -1
  357. mindspore/ops/operations/array_ops.py +81 -324
  358. mindspore/ops/operations/comm_ops.py +154 -108
  359. mindspore/ops/operations/custom_ops.py +232 -78
  360. mindspore/ops/operations/debug_ops.py +153 -59
  361. mindspore/ops/operations/inner_ops.py +7 -5
  362. mindspore/ops/operations/linalg_ops.py +1 -57
  363. mindspore/ops/operations/manually_defined/_inner.py +1 -1
  364. mindspore/ops/operations/manually_defined/ops_def.py +928 -180
  365. mindspore/ops/operations/math_ops.py +32 -234
  366. mindspore/ops/operations/nn_ops.py +210 -498
  367. mindspore/ops/operations/other_ops.py +62 -9
  368. mindspore/ops/operations/random_ops.py +13 -7
  369. mindspore/ops/operations/reshard_ops.py +1 -1
  370. mindspore/ops/operations/sparse_ops.py +2 -2
  371. mindspore/ops/primitive.py +66 -53
  372. mindspore/ops/tensor_method.py +1888 -0
  373. mindspore/ops_generate/__init__.py +0 -5
  374. mindspore/ops_generate/aclnn/__init__.py +0 -0
  375. mindspore/ops_generate/aclnn/aclnn_kernel_register_auto_cc_generator.py +135 -0
  376. mindspore/ops_generate/aclnn/gen_aclnn_implement.py +257 -0
  377. mindspore/ops_generate/api/__init__.py +0 -0
  378. mindspore/ops_generate/api/add_tensor_docs_generator.py +56 -0
  379. mindspore/ops_generate/api/cpp_create_prim_instance_helper_generator.py +105 -0
  380. mindspore/ops_generate/api/functional_map_cpp_generator.py +504 -0
  381. mindspore/ops_generate/api/functional_overload_py_generator.py +112 -0
  382. mindspore/ops_generate/api/functions_cc_generator.py +237 -0
  383. mindspore/ops_generate/api/gen_api.py +103 -0
  384. mindspore/ops_generate/api/op_api_proto.py +235 -0
  385. mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +461 -0
  386. mindspore/ops_generate/common/__init__.py +0 -0
  387. mindspore/ops_generate/common/base_generator.py +11 -0
  388. mindspore/ops_generate/common/gen_constants.py +91 -0
  389. mindspore/ops_generate/common/gen_utils.py +348 -0
  390. mindspore/ops_generate/common/op_proto.py +473 -0
  391. mindspore/ops_generate/common/template.py +523 -0
  392. mindspore/ops_generate/gen_ops.py +22 -1069
  393. mindspore/ops_generate/op_def/__init__.py +0 -0
  394. mindspore/ops_generate/op_def/gen_op_def.py +90 -0
  395. mindspore/ops_generate/op_def/lite_ops_cpp_generator.py +191 -0
  396. mindspore/ops_generate/op_def/ops_def_cc_generator.py +299 -0
  397. mindspore/ops_generate/op_def/ops_def_h_generator.py +74 -0
  398. mindspore/ops_generate/op_def/ops_name_h_generator.py +83 -0
  399. mindspore/ops_generate/op_def/ops_primitive_h_generator.py +125 -0
  400. mindspore/ops_generate/op_def_py/__init__.py +0 -0
  401. mindspore/ops_generate/op_def_py/gen_op_def_py.py +47 -0
  402. mindspore/ops_generate/op_def_py/op_def_py_generator.py +132 -0
  403. mindspore/ops_generate/op_def_py/op_prim_py_generator.py +489 -0
  404. mindspore/ops_generate/pyboost/__init__.py +0 -0
  405. mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +139 -0
  406. mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +93 -0
  407. mindspore/ops_generate/pyboost/gen_pyboost_func.py +175 -0
  408. mindspore/ops_generate/pyboost/op_template_parser.py +517 -0
  409. mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +407 -0
  410. mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +100 -0
  411. mindspore/ops_generate/pyboost/pyboost_functions_py_generator.py +148 -0
  412. mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +155 -0
  413. mindspore/ops_generate/pyboost/pyboost_inner_prim_generator.py +132 -0
  414. mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +272 -0
  415. mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +938 -0
  416. mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +357 -0
  417. mindspore/ops_generate/{pyboost_utils.py → pyboost/pyboost_utils.py} +179 -36
  418. mindspore/ops_generate/resources/__init__.py +0 -0
  419. mindspore/ops_generate/resources/resource_list.py +30 -0
  420. mindspore/ops_generate/resources/resource_loader.py +36 -0
  421. mindspore/ops_generate/resources/resource_manager.py +64 -0
  422. mindspore/ops_generate/resources/yaml_loader.py +88 -0
  423. mindspore/ops_generate/tensor_py_cc_generator.py +122 -0
  424. mindspore/parallel/__init__.py +7 -3
  425. mindspore/parallel/_auto_parallel_context.py +152 -34
  426. mindspore/parallel/_cell_wrapper.py +130 -15
  427. mindspore/parallel/_parallel_serialization.py +107 -5
  428. mindspore/parallel/_ps_context.py +1 -1
  429. mindspore/parallel/_recovery_context.py +7 -2
  430. mindspore/parallel/_tensor.py +142 -18
  431. mindspore/parallel/_utils.py +199 -23
  432. mindspore/parallel/algo_parameter_config.py +4 -4
  433. mindspore/parallel/auto_parallel.py +732 -0
  434. mindspore/parallel/checkpoint_convert.py +159 -0
  435. mindspore/parallel/checkpoint_transform.py +698 -35
  436. mindspore/parallel/cluster/process_entity/_api.py +276 -50
  437. mindspore/parallel/cluster/process_entity/_utils.py +41 -6
  438. mindspore/parallel/cluster/run.py +21 -4
  439. mindspore/parallel/function/__init__.py +24 -0
  440. mindspore/parallel/function/reshard_func.py +259 -0
  441. mindspore/parallel/nn/__init__.py +25 -0
  442. mindspore/parallel/nn/parallel_cell_wrapper.py +263 -0
  443. mindspore/parallel/nn/parallel_grad_reducer.py +169 -0
  444. mindspore/parallel/parameter_broadcast.py +25 -14
  445. mindspore/parallel/shard.py +137 -58
  446. mindspore/parallel/transform_safetensors.py +363 -305
  447. mindspore/pgodb140.dll +0 -0
  448. mindspore/pgort140.dll +0 -0
  449. mindspore/profiler/__init__.py +22 -5
  450. mindspore/profiler/analysis/__init__.py +0 -0
  451. mindspore/profiler/analysis/parser/__init__.py +0 -0
  452. mindspore/profiler/analysis/parser/ascend_cann_parser.py +170 -0
  453. mindspore/profiler/analysis/parser/base_parser.py +158 -0
  454. mindspore/profiler/analysis/parser/framework_cann_relation_parser.py +45 -0
  455. mindspore/profiler/analysis/parser/ms_framework_parser.py +142 -0
  456. mindspore/profiler/analysis/parser/ms_minddata_parser.py +145 -0
  457. mindspore/profiler/analysis/parser/timeline_assembly_factory/__init__.py +0 -0
  458. mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +264 -0
  459. mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +40 -0
  460. mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +106 -0
  461. mindspore/profiler/analysis/parser/timeline_creator/__init__.py +0 -0
  462. mindspore/profiler/analysis/parser/timeline_creator/base_timeline_creator.py +44 -0
  463. mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +90 -0
  464. mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +76 -0
  465. mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +103 -0
  466. mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +134 -0
  467. mindspore/profiler/analysis/parser/timeline_event/__init__.py +0 -0
  468. mindspore/profiler/analysis/parser/timeline_event/base_event.py +233 -0
  469. mindspore/profiler/analysis/parser/timeline_event/cpu_op_event.py +47 -0
  470. mindspore/profiler/analysis/parser/timeline_event/flow_event.py +36 -0
  471. mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +415 -0
  472. mindspore/profiler/analysis/parser/timeline_event/msprof_event.py +73 -0
  473. mindspore/profiler/analysis/parser/timeline_event/scope_layer_event.py +53 -0
  474. mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +146 -0
  475. mindspore/profiler/analysis/task_manager.py +131 -0
  476. mindspore/profiler/analysis/time_converter.py +84 -0
  477. mindspore/profiler/analysis/viewer/__init__.py +0 -0
  478. mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +372 -0
  479. mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +87 -0
  480. mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +250 -0
  481. mindspore/profiler/analysis/viewer/ascend_memory_viewer.py +320 -0
  482. mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +327 -0
  483. mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +376 -0
  484. mindspore/profiler/analysis/viewer/ascend_timeline_viewer.py +58 -0
  485. mindspore/profiler/analysis/viewer/base_viewer.py +26 -0
  486. mindspore/profiler/analysis/viewer/ms_dataset_viewer.py +96 -0
  487. mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +581 -0
  488. mindspore/profiler/analysis/work_flow.py +73 -0
  489. mindspore/profiler/common/ascend_msprof_exporter.py +139 -0
  490. mindspore/profiler/common/command_executor.py +90 -0
  491. mindspore/profiler/common/constant.py +186 -3
  492. mindspore/profiler/common/file_manager.py +208 -0
  493. mindspore/profiler/common/log.py +130 -0
  494. mindspore/profiler/common/msprof_cmd_tool.py +221 -0
  495. mindspore/profiler/common/path_manager.py +395 -0
  496. mindspore/profiler/common/process_bar.py +168 -0
  497. mindspore/profiler/common/process_pool.py +9 -3
  498. mindspore/profiler/common/profiler_context.py +500 -0
  499. mindspore/profiler/common/profiler_info.py +304 -0
  500. mindspore/profiler/common/profiler_meta_data.py +74 -0
  501. mindspore/profiler/common/profiler_output_path.py +284 -0
  502. mindspore/profiler/common/profiler_parameters.py +251 -0
  503. mindspore/profiler/common/profiler_path_manager.py +179 -0
  504. mindspore/profiler/common/record_function.py +76 -0
  505. mindspore/profiler/common/tlv_decoder.py +76 -0
  506. mindspore/profiler/common/util.py +75 -2
  507. mindspore/profiler/dynamic_profiler.py +341 -75
  508. mindspore/profiler/envprofiler.py +163 -0
  509. mindspore/profiler/experimental_config.py +197 -0
  510. mindspore/profiler/mstx.py +242 -0
  511. mindspore/profiler/platform/__init__.py +21 -0
  512. mindspore/profiler/platform/base_profiler.py +40 -0
  513. mindspore/profiler/platform/cpu_profiler.py +124 -0
  514. mindspore/profiler/platform/gpu_profiler.py +74 -0
  515. mindspore/profiler/platform/npu_profiler.py +335 -0
  516. mindspore/profiler/profiler.py +1073 -90
  517. mindspore/profiler/profiler_action_controller.py +187 -0
  518. mindspore/profiler/profiler_interface.py +118 -0
  519. mindspore/profiler/schedule.py +243 -0
  520. mindspore/rewrite/api/node.py +15 -13
  521. mindspore/rewrite/api/symbol_tree.py +2 -3
  522. mindspore/run_check/_check_version.py +27 -20
  523. mindspore/run_check/run_check.py +1 -1
  524. mindspore/runtime/__init__.py +37 -0
  525. mindspore/runtime/device.py +27 -0
  526. mindspore/runtime/event.py +209 -0
  527. mindspore/runtime/executor.py +177 -0
  528. mindspore/runtime/memory.py +409 -0
  529. mindspore/runtime/stream.py +460 -0
  530. mindspore/runtime/thread_bind_core.py +401 -0
  531. mindspore/safeguard/rewrite_obfuscation.py +12 -9
  532. mindspore/swresample-4.dll +0 -0
  533. mindspore/swscale-6.dll +0 -0
  534. mindspore/tbbmalloc.dll +0 -0
  535. mindspore/tinyxml2.dll +0 -0
  536. mindspore/train/__init__.py +8 -8
  537. mindspore/train/_utils.py +88 -25
  538. mindspore/train/amp.py +9 -5
  539. mindspore/train/callback/__init__.py +2 -2
  540. mindspore/train/callback/_callback.py +2 -16
  541. mindspore/train/callback/_checkpoint.py +53 -55
  542. mindspore/train/callback/_cluster_monitor.py +14 -18
  543. mindspore/train/callback/_early_stop.py +1 -1
  544. mindspore/train/callback/_flops_collector.py +103 -68
  545. mindspore/train/callback/_history.py +8 -5
  546. mindspore/train/callback/_lambda_callback.py +2 -2
  547. mindspore/train/callback/_landscape.py +0 -3
  548. mindspore/train/callback/_loss_monitor.py +2 -1
  549. mindspore/train/callback/_on_request_exit.py +6 -5
  550. mindspore/train/callback/_reduce_lr_on_plateau.py +11 -6
  551. mindspore/train/callback/_summary_collector.py +52 -19
  552. mindspore/train/callback/_time_monitor.py +2 -1
  553. mindspore/train/callback/{_tft_register.py → _train_fault_tolerance.py} +204 -107
  554. mindspore/train/data_sink.py +25 -2
  555. mindspore/train/dataset_helper.py +15 -16
  556. mindspore/train/loss_scale_manager.py +8 -7
  557. mindspore/train/metrics/accuracy.py +3 -3
  558. mindspore/train/metrics/confusion_matrix.py +9 -9
  559. mindspore/train/metrics/error.py +3 -3
  560. mindspore/train/metrics/hausdorff_distance.py +4 -4
  561. mindspore/train/metrics/mean_surface_distance.py +3 -3
  562. mindspore/train/metrics/metric.py +0 -12
  563. mindspore/train/metrics/occlusion_sensitivity.py +4 -2
  564. mindspore/train/metrics/precision.py +11 -10
  565. mindspore/train/metrics/recall.py +9 -9
  566. mindspore/train/metrics/root_mean_square_surface_distance.py +2 -2
  567. mindspore/train/mind_ir_pb2.py +174 -46
  568. mindspore/train/model.py +184 -113
  569. mindspore/train/serialization.py +622 -978
  570. mindspore/train/summary/_summary_adapter.py +2 -2
  571. mindspore/train/summary/summary_record.py +2 -3
  572. mindspore/train/train_thor/model_thor.py +1 -1
  573. mindspore/turbojpeg.dll +0 -0
  574. mindspore/utils/__init__.py +6 -3
  575. mindspore/utils/dryrun.py +140 -0
  576. mindspore/utils/hooks.py +81 -0
  577. mindspore/utils/runtime_execution_order_check.py +550 -0
  578. mindspore/utils/utils.py +138 -4
  579. mindspore/vcmeta.dll +0 -0
  580. mindspore/vcruntime140.dll +0 -0
  581. mindspore/vcruntime140_1.dll +0 -0
  582. mindspore/version.py +1 -1
  583. {mindspore-2.4.10.dist-info → mindspore-2.6.0rc1.dist-info}/METADATA +3 -3
  584. {mindspore-2.4.10.dist-info → mindspore-2.6.0rc1.dist-info}/RECORD +587 -418
  585. {mindspore-2.4.10.dist-info → mindspore-2.6.0rc1.dist-info}/entry_points.txt +1 -1
  586. mindspore/_install_custom.py +0 -43
  587. mindspore/common/_register_for_adapter.py +0 -74
  588. mindspore/common/_tensor_overload.py +0 -139
  589. mindspore/mindspore_np_dtype.dll +0 -0
  590. mindspore/ops/auto_generate/gen_arg_dtype_cast.py +0 -252
  591. mindspore/ops/auto_generate/gen_arg_handler.py +0 -197
  592. mindspore/ops/operations/_opaque_predicate_registry.py +0 -41
  593. mindspore/ops_generate/gen_aclnn_implement.py +0 -263
  594. mindspore/ops_generate/gen_ops_inner_prim.py +0 -131
  595. mindspore/ops_generate/gen_pyboost_func.py +0 -1052
  596. mindspore/ops_generate/gen_utils.py +0 -209
  597. mindspore/ops_generate/op_proto.py +0 -145
  598. mindspore/ops_generate/template.py +0 -261
  599. mindspore/profiler/envprofiling.py +0 -254
  600. mindspore/profiler/profiling.py +0 -1926
  601. {mindspore-2.4.10.dist-info → mindspore-2.6.0rc1.dist-info}/WHEEL +0 -0
  602. {mindspore-2.4.10.dist-info → mindspore-2.6.0rc1.dist-info}/top_level.txt +0 -0
@@ -16,23 +16,32 @@
16
16
  import os
17
17
  import re
18
18
  import sys
19
+ import signal
19
20
  import subprocess
21
+ import socket
22
+ import psutil
20
23
  import mindspore.log as logger
21
- from ._utils import _generate_cmd_args_list, _generate_cmd_args_list_with_core, _generate_url,\
22
- _is_local_ip, _send_scale_num
24
+ from ._utils import _generate_cmd_args_list, _generate_cmd_args_list_with_core, _generate_url, \
25
+ _is_local_ip, _convert_addr_to_ip, _send_scale_num, _get_local_ip
26
+
23
27
 
24
28
  class _Node:
25
29
  """
26
30
  Base class for dynamic networking nodes.
27
31
 
28
32
  """
29
- def __init__(self, worker_num, sched_host, sched_port, timeout, args_list, output_file):
33
+
34
+ def __init__(self, worker_num, sched_host, sched_port, timeout, args_list, output_file, tail_worker_log,
35
+ join, is_simulation):
30
36
  self.worker_num = worker_num
31
37
  self.sched_host = sched_host
32
38
  self.sched_port = sched_port
33
39
  self.args_list = args_list
34
40
  self.output_file = output_file
35
41
  self.timeout = timeout
42
+ self.tail_worker_log = tail_worker_log
43
+ self.join = join
44
+ self.is_simulation = is_simulation
36
45
 
37
46
  def run(self):
38
47
  """
@@ -40,15 +49,20 @@ class _Node:
40
49
 
41
50
  """
42
51
  os.environ["MS_WORKER_NUM"] = str(self.worker_num)
43
- os.environ["MS_SCHED_HOST"] = self.sched_host
44
- os.environ["MS_SCHED_PORT"] = str(self.sched_port)
45
- os.environ["MS_TOPO_TIMEOUT"] = str(self.timeout)
52
+ # If simulation level is set, environment variables for dynamic networking will not be set,
53
+ # and scheduler will not be started.
54
+ if not self.is_simulation:
55
+ os.environ["MS_SCHED_HOST"] = self.sched_host
56
+ os.environ["MS_SCHED_PORT"] = str(self.sched_port)
57
+ os.environ["MS_TOPO_TIMEOUT"] = str(self.timeout)
58
+
46
59
 
47
60
  class _MetaServerNode(_Node):
48
61
  """
49
62
  Scheduler node for dynamic networking. Inherits from the Node class.
50
63
 
51
64
  """
65
+
52
66
  def run(self):
53
67
  """
54
68
  Runs the MetaServerNode by setting environment variables, setting the MS_ROLE variable to
@@ -59,14 +73,17 @@ class _MetaServerNode(_Node):
59
73
  with open(self.output_file, "w") as file_handle:
60
74
  return subprocess.Popen(self.args_list, stdout=file_handle, stderr=subprocess.STDOUT)
61
75
 
76
+
62
77
  class _ComputeGraphNode(_Node):
63
78
  """
64
79
  Worker node for dynamic networking. Inherits from the Node class.
65
80
  """
66
- def __init__(self, worker_num, sched_host, sched_port, timeout, node_id, args_list, output_file):
67
- super().__init__(worker_num, sched_host, sched_port, timeout, args_list, output_file)
68
- self.node_id = node_id
69
81
 
82
+ def __init__(self, worker_num, sched_host, sched_port, timeout, node_id, args_list, output_file,
83
+ tail_worker_log, join, is_simulation):
84
+ super().__init__(worker_num, sched_host, sched_port, timeout, args_list, output_file,
85
+ tail_worker_log, join, is_simulation)
86
+ self.node_id = node_id
70
87
 
71
88
  def run(self):
72
89
  """
@@ -78,9 +95,36 @@ class _ComputeGraphNode(_Node):
78
95
  super().run()
79
96
  if self.node_id is not None:
80
97
  os.environ["MS_NODE_ID"] = str(self.node_id)
81
- os.environ["MS_ROLE"] = "MS_WORKER"
98
+ # If simulation level is set, environment variable 'MS_ROLE' will not be set.
99
+ if not self.is_simulation:
100
+ os.environ["MS_ROLE"] = "MS_WORKER"
101
+ tail_worker_process = None
102
+ is_tail_worker_log = self.enable_tail_worker_log()
103
+ if self.join and not is_tail_worker_log:
104
+ logger.warning(f"The '--tail_worker_log' is:{self.tail_worker_log}, "
105
+ f"which doesn't contain this worker {self.node_id}."
106
+ f" So this worker {self.node_id}'s log will not be output to console. Reset "
107
+ "'--tail_worker_log', if you want to output this worker's log to console.")
82
108
  with open(self.output_file, "w") as file_handle:
83
- return subprocess.Popen(self.args_list, stdout=file_handle, stderr=subprocess.STDOUT)
109
+ worker_process = subprocess.Popen(self.args_list, preexec_fn=os.setsid, stdout=file_handle,
110
+ stderr=subprocess.STDOUT)
111
+ if self.join and is_tail_worker_log:
112
+ tail_worker_process = self.output_to_console()
113
+ return worker_process, tail_worker_process
114
+
115
+ def output_to_console(self):
116
+ """
117
+ Output worker log file to console.
118
+ """
119
+ return subprocess.Popen(['/usr/bin/tail', '-f', self.output_file])
120
+
121
+ def enable_tail_worker_log(self):
122
+ tail_worker_log_list = []
123
+ if self.tail_worker_log != "-1":
124
+ tail_worker_log_list.extend([int(num) for num in self.tail_worker_log.split(',')])
125
+ if self.tail_worker_log != "-1" and self.node_id not in tail_worker_log_list:
126
+ return False
127
+ return True
84
128
 
85
129
 
86
130
  class _ProcessManager:
@@ -89,6 +133,7 @@ class _ProcessManager:
89
133
  training
90
134
 
91
135
  """
136
+
92
137
  def __init__(self, args):
93
138
  """
94
139
  Initializes a ProcessManager object.
@@ -99,13 +144,14 @@ class _ProcessManager:
99
144
  """
100
145
  self.msn_process = None
101
146
  self.cgn_processes = []
147
+ self.tail_cgn_processes = []
102
148
 
103
- """`is_master` flags whether the current node is the master node."""
104
- self.is_master = _is_local_ip(args.master_addr)
105
-
106
- self.master_addr = args.master_addr
149
+ self.master_addr = _convert_addr_to_ip(args.master_addr)
107
150
  self.master_port = args.master_port
108
151
 
152
+ """`is_master` flags whether the current node is the master node."""
153
+ self.is_master = _is_local_ip(self.master_addr)
154
+
109
155
  self.worker_num = args.worker_num
110
156
  if self.worker_num <= 0:
111
157
  raise ValueError(f"worker_num must be greater than 0, but got {self.worker_num}.")
@@ -115,6 +161,8 @@ class _ProcessManager:
115
161
 
116
162
  self.log_dir = args.log_dir
117
163
  self.join = args.join
164
+ self.worker_log_name = args.worker_log_name
165
+ self.tail_worker_log = args.tail_worker_log
118
166
  self.cluster_time_out = args.cluster_time_out
119
167
  self.bind_core = args.bind_core
120
168
  self.rank_table_file = args.rank_table_file
@@ -123,19 +171,21 @@ class _ProcessManager:
123
171
  self.sim_rank_id = args.sim_rank_id
124
172
  self.is_simulation = (self.sim_level != -1)
125
173
  if self.is_simulation:
126
- # If simulation level is set, reset the worker_num and local_worker_num to 1
127
- # so that host cluster could be initialized.
128
- self.worker_num = 1
129
- self.local_worker_num = 1
130
174
  os.environ["MS_SIMULATION_LEVEL"] = str(self.sim_level)
131
175
  elif os.getenv("MS_SIMULATION_LEVEL"):
132
- # If simulation level env is set, load RANK_ID and RANK_SIZE envs.
133
- self.worker_num = 1
134
- self.local_worker_num = 1
135
176
  self.is_simulation = True
136
- self.sim_rank_id = os.getenv("RANK_ID", "0")
177
+ self.sim_rank_id = int(os.getenv("RANK_ID", "-1"))
137
178
  if os.getenv("RANK_SIZE"):
138
179
  self.exported_rank_size = os.getenv("RANK_SIZE")
180
+ # If sim_rank_id is set, single worker can be started.
181
+ if self.is_simulation and (self.sim_rank_id != -1):
182
+ logger.info(f"Simulation rank id is set to {self.sim_rank_id}, will dryrun a single process.")
183
+ self.local_worker_num = 1
184
+ if self.is_simulation and self.local_worker_num > 128:
185
+ self.local_worker_num = 1
186
+ self.sim_rank_id = 0
187
+ logger.warning(f"In dryrun case, local worker num is set to larger than 128. "
188
+ "To avoid a system clash, local worker num is set to 1.")
139
189
 
140
190
  self.cmd = args.task_script
141
191
  self.cmd_args = args.task_script_args
@@ -155,6 +205,21 @@ class _ProcessManager:
155
205
  finally:
156
206
  os.umask(origin_mask)
157
207
 
208
+ self.proc_rank_map = {}
209
+ self.enable_mindx = False
210
+ tft_env = os.getenv("MS_ENABLE_TFT", "")
211
+ if ("TTP:1" in tft_env) or ("UCE:1" in tft_env) or ("ARF:1" in tft_env):
212
+ try:
213
+ from taskd.python.framework.agent.ms_mgr.msrun_plugin import MSRunPlugin
214
+ self.msmgr = MSRunPlugin()
215
+ self.msmgr.register_callbacks("KILL_WORKER", self.kill_workers)
216
+ self.msmgr.register_callbacks("START_ALL_WORKER", self.start_all_workers)
217
+ self.msmgr.register_callbacks("MONITOR", self.monitor_rank_status)
218
+ self.enable_mindx = True
219
+ os.environ["MS_ENABLE_RECOVERY"] = str(1)
220
+ except Exception as e: # pylint: disable=broad-except
221
+ logger.warning(f"mindx is not installed, using original mindspore recovery strategy.: {str(e)}")
222
+
158
223
  def run(self):
159
224
  """
160
225
  Runs the process manager.
@@ -173,13 +238,15 @@ class _ProcessManager:
173
238
  else:
174
239
  sys.exit()
175
240
  else:
176
- if self.is_master:
241
+ if self.is_master and not self.is_simulation:
177
242
  self.start_scheduler()
178
- self.start_workers()
179
-
180
- if self.join:
181
- logger.warning("Distributed job is spawned. Waiting all processes to exit...")
182
- self.join_processes()
243
+ if self.enable_mindx:
244
+ self.msmgr.start()
245
+ else:
246
+ self.start_workers()
247
+ if self.join:
248
+ logger.warning("Distributed job is spawned. Waiting all processes to exit...")
249
+ self.join_processes()
183
250
 
184
251
  def start_scheduler(self):
185
252
  """
@@ -190,7 +257,8 @@ class _ProcessManager:
190
257
  os.environ['RANK_ID'] = str(0)
191
258
  msn = _MetaServerNode(self.worker_num, self.master_addr, self.master_port, self.cluster_time_out,
192
259
  _generate_cmd_args_list(self.cmd, self.cmd_args),
193
- os.path.join(self.log_dir, "scheduler.log"))
260
+ os.path.join(self.log_dir, "scheduler.log"), self.tail_worker_log, self.join,
261
+ self.is_simulation)
194
262
  self.msn_process = msn.run()
195
263
 
196
264
  def start_workers(self):
@@ -208,9 +276,6 @@ class _ProcessManager:
208
276
  "You can access 'RANK_ID' environment variable after calling "
209
277
  "'mindspore.communication.init()'")
210
278
 
211
- if self.is_simulation and self.worker_num != 1:
212
- raise ValueError(f"Simulation level is set, worker_num must be 1, but got {self.worker_num}.")
213
-
214
279
  for i in range(self.local_worker_num):
215
280
  os.environ["DEVICE_ID"] = str(i)
216
281
  node_id, log_name = self._get_node_id_and_log_path(i)
@@ -221,16 +286,17 @@ class _ProcessManager:
221
286
  # If node_id is generated in '_get_node_id_and_log_path' method, export 'RANK_ID' environment variable.
222
287
  # This is for rank_table method's compatibility consideration.
223
288
  os.environ["RANK_ID"] = str(node_id)
224
- logger.warning(f"Start worker process with rank id:{node_id}, log file:{log_name}. "
225
- "Environment variable [RANK_ID] is exported.")
226
- if self.is_simulation:
227
- # Reset RANK_ID env to sim_rank_id.
289
+ print(f"Start worker process with rank id:{node_id}, log file:{log_name}. "
290
+ f"Environment variable [RANK_ID={node_id}] is exported.", flush=True)
291
+ if self.is_simulation and (self.sim_rank_id != -1):
292
+ # Reset RANK_ID env to sim_rank_id if sim_rank_id is set.
228
293
  os.environ["RANK_ID"] = str(self.sim_rank_id)
294
+ logger.warning(f"In dryrun case, RANK_ID is assigned to {self.sim_rank_id}.")
229
295
 
230
- cpu_num = subprocess.getoutput("cat /proc/cpuinfo|grep processor|wc -l")
231
- if not cpu_num.isdigit():
232
- raise RuntimeError("Fail to get cpu number from /proc/cpuinfo.")
233
296
  if self.bind_core:
297
+ cpu_num = subprocess.getoutput("cat /proc/cpuinfo|grep processor|wc -l")
298
+ if not cpu_num.isdigit():
299
+ raise RuntimeError(f"Got cpu number from '/proc/cpuinfo' is {cpu_num}, failed to bind core.")
234
300
  avg = int(cpu_num) // self.local_worker_num
235
301
  cpu_start = avg * i
236
302
  cpu_end = cpu_start + avg - 1
@@ -238,9 +304,11 @@ class _ProcessManager:
238
304
  else:
239
305
  cmd = _generate_cmd_args_list(self.cmd, self.cmd_args)
240
306
  cgn = _ComputeGraphNode(self.worker_num, self.master_addr, self.master_port, self.cluster_time_out,
241
- node_id, cmd, log_name)
242
- process = cgn.run()
307
+ node_id, cmd, log_name, self.tail_worker_log, self.join, self.is_simulation)
308
+ process, tail_process = cgn.run()
243
309
  self.cgn_processes.append(process)
310
+ self.tail_cgn_processes.append(tail_process)
311
+ self.proc_rank_map[i] = process
244
312
 
245
313
  def join_processes(self):
246
314
  """
@@ -248,8 +316,15 @@ class _ProcessManager:
248
316
  If there's any process does not exit normally, logs will be analyzed
249
317
  so that understandable root cause of exception could be returned.
250
318
  """
319
+
320
+ def signal_handler(sig, frame):
321
+ logger.warning("msrun process received SIGNIN (Ctrl+C), terminating all workers.")
322
+ self.kill_all_processes()
323
+ sys.exit(0)
324
+
251
325
  has_exception = False
252
326
  success_cgn_processes = set()
327
+ signal.signal(signal.SIGINT, signal_handler)
253
328
  while True:
254
329
  # Traversal all workers and kill immediately if any exception happens.
255
330
  for p in self.cgn_processes:
@@ -266,15 +341,14 @@ class _ProcessManager:
266
341
 
267
342
  if has_exception:
268
343
  logger.warning("There's worker exits with exception, kill all other workers.")
269
- for p in self.cgn_processes:
270
- if p.poll() is None:
271
- p.kill()
344
+ self.kill_worker_processes()
345
+ self.kill_tail_log_processes()
272
346
  break
273
347
  elif len(success_cgn_processes) == len(self.cgn_processes):
274
348
  logger.info("All workers successfully exit!")
349
+ self.kill_tail_log_processes()
275
350
  break
276
351
 
277
-
278
352
  if self.msn_process:
279
353
  self.msn_process.wait()
280
354
  if self.msn_process.returncode != 0:
@@ -282,11 +356,40 @@ class _ProcessManager:
282
356
  logger.error(f"Scheduler process {self.msn_process.pid} exit with exception.")
283
357
 
284
358
  if has_exception:
285
- logger.warning("Analyzing exception log...")
359
+ logger.info("Analyzing exception log...")
286
360
  self._analyze_log()
287
361
  raise RuntimeError("Distributed job exited with exception. Please check logs in "
288
362
  f"directory: {self.log_dir}.")
289
363
 
364
+ def kill_tail_log_processes(self):
365
+ """
366
+ Kills all tail worker log processes.
367
+
368
+ """
369
+ for p_tail in self.tail_cgn_processes:
370
+ if p_tail is not None:
371
+ logger.debug("Tail worker log process:{p_tail.pid} has been killed!")
372
+ p_tail.kill()
373
+
374
+ def kill_worker_processes(self):
375
+ """
376
+ Kills all worker processes.
377
+
378
+ """
379
+ for p in self.cgn_processes:
380
+ if p.poll() is None:
381
+ os.killpg(os.getpgid(p.pid), signal.SIGKILL)
382
+
383
+ def kill_all_processes(self):
384
+ """
385
+ Kills all running processes, including scheduler, worker and tail log.
386
+
387
+ """
388
+ self.kill_worker_processes()
389
+ self.kill_tail_log_processes()
390
+ if self.msn_process.poll() is None:
391
+ self.msn_process.kill()
392
+
290
393
  def stop_processes(self):
291
394
  """
292
395
  Stops all running processes.
@@ -310,24 +413,135 @@ class _ProcessManager:
310
413
  self.start_scheduler()
311
414
  self.start_workers()
312
415
 
416
+ def kill_all_workers(self):
417
+ """
418
+ Kill all running worker processes.
419
+
420
+ Args:
421
+ NA.
422
+ """
423
+ for p in self.cgn_processes:
424
+ if p.poll() is None:
425
+ p.kill()
426
+ self.cgn_processes.clear()
427
+
428
+ for p in self.tail_cgn_processes:
429
+ if p is not None:
430
+ p.kill()
431
+ self.tail_cgn_processes.clear()
432
+
433
+ def kill_single_worker(self, pid):
434
+ """
435
+ Kill one worker process with specified pid.
436
+
437
+ Args:
438
+ pid: Worker process' pid.
439
+ """
440
+ kill_status = False
441
+ for i in range(len(self.cgn_processes)):
442
+ p = self.cgn_processes[i]
443
+ if p.pid == pid and p.poll() is None:
444
+ p.kill()
445
+ del self.cgn_processes[i]
446
+ tail_p = self.tail_cgn_processes[i]
447
+ if tail_p is not None:
448
+ tail_p.kill()
449
+ del self.tail_cgn_processes[i]
450
+ kill_status = True
451
+ break
452
+ if not kill_status:
453
+ logger.warning(f"There's no active worker with pid: {pid}")
454
+
455
+ def kill_workers(self, pids):
456
+ """
457
+ Kill worker process according to pids. Worker process with pid within pids list will be killed.
458
+
459
+ Args:
460
+ pids(list): a list of worker process pid. When local_ranks pids -1, kill all worker process.
461
+ """
462
+ if -1 in pids:
463
+ self.kill_all_workers()
464
+ else:
465
+ for pid in pids:
466
+ self.kill_single_worker(pid)
467
+ return 0
468
+
469
+ def monitor_rank_status(self, local_ranks):
470
+ """
471
+ Monitor the status of workers whose rank is within local_ranks list.
472
+
473
+ Args:
474
+ local_ranks(list): a list of local worker ranks. When local_ranks contains -1,
475
+ monitor all workers' status.
476
+ """
477
+ rank_status = {}
478
+ if -1 in local_ranks:
479
+ local_ranks = list(range(self.local_worker_num))
480
+ for i in local_ranks:
481
+ single_status = self.monitor_single_rank(i)
482
+ if single_status:
483
+ rank_status[i] = single_status
484
+ return rank_status
485
+
486
+ def monitor_single_rank(self, rank_id):
487
+ """
488
+ Monitor the status of a single worker with rank_id
489
+
490
+ Args:
491
+ rank_id: worker process's local rank, which is also device_id.
492
+ """
493
+ if 0 <= rank_id < self.local_worker_num:
494
+ global_rank_id = rank_id
495
+ if self.node_rank >= 0:
496
+ global_rank_id = self.node_rank * self.local_worker_num + rank_id
497
+ try:
498
+ p = self.proc_rank_map[rank_id]
499
+ p_status = p.poll()
500
+ if (not psutil.pid_exists(p.pid)) and (p_status != 0):
501
+ p_status = 300
502
+ return {"pid": p.pid, "status": p_status, "global_rank": global_rank_id}
503
+ except KeyError:
504
+ logger.info(f"Process rank {rank_id} has not been initialized.")
505
+ return {"pid": None, "status": 200, "global_rank": global_rank_id}
506
+ else:
507
+ logger.warning(f"Invalid rank id!")
508
+ return {}
509
+
510
+ def start_all_workers(self):
511
+ """
512
+ Start all worker processes after killing all workers.
513
+
514
+ Args:
515
+ NA.
516
+ """
517
+ if self.cgn_processes:
518
+ self.kill_all_workers()
519
+ self.start_workers()
520
+ worker_status = self.monitor_rank_status([-1])
521
+ for i in range(self.local_worker_num):
522
+ if worker_status[i]["status"] != None: # pylint: disable=singleton-comparison
523
+ return 1
524
+ return 0
525
+
313
526
  def _get_node_id_and_log_path(self, index):
314
527
  """
315
528
  Generate node id and log path for corresponding process.
316
529
  """
530
+ formatted_log_name = self.format_worker_log_name()
317
531
  if self.local_worker_num > self.worker_num:
318
532
  raise ValueError(f"Total worker number is {self.worker_num}, "
319
533
  f"but got exceeded local worker number: {self.local_worker_num}.")
320
534
  if self.local_worker_num == self.worker_num:
321
- return index, os.path.join(self.log_dir, "worker_" + str(index) + ".log")
535
+ return index, os.path.join(self.log_dir, formatted_log_name + "_" + str(index) + ".log")
322
536
 
323
537
  if self.node_rank >= 0:
324
538
  # We assume that each node has same process number.
325
539
  node_id = self.node_rank * self.local_worker_num + index
326
- log_name = os.path.join(self.log_dir, "worker_" + str(node_id) + ".log")
540
+ log_name = os.path.join(self.log_dir, formatted_log_name + "_" + str(node_id) + ".log")
327
541
  else:
328
542
  # If node_rank is default value -1, let MindSpore assign rank id.
329
543
  node_id = None
330
- log_name = os.path.join(self.log_dir, "worker_" + str(index) + ".log")
544
+ log_name = os.path.join(self.log_dir, formatted_log_name + "_" + str(index) + ".log")
331
545
  return node_id, log_name
332
546
 
333
547
  def _analyze_log(self):
@@ -350,3 +564,15 @@ class _ProcessManager:
350
564
  logger.error(f"Time out nodes are {time_out_node_ids}")
351
565
 
352
566
  os.system(f"grep -rn -E 'ERROR|CRITICAL|Traceback|Error' -C 5 {self.log_dir}")
567
+
568
+ def format_worker_log_name(self):
569
+ """
570
+ Format worker log files' name.
571
+ """
572
+ if not self.worker_log_name:
573
+ formatted_worker_log_name = "worker"
574
+ else:
575
+ current_ip = _get_local_ip(self.master_addr)
576
+ formatted_worker_log_name = re.sub(r'\{ip\}', current_ip, self.worker_log_name)
577
+ formatted_worker_log_name = re.sub(r'\{hostname\}', socket.gethostname(), formatted_worker_log_name)
578
+ return formatted_worker_log_name
@@ -16,8 +16,11 @@
16
16
  import os
17
17
  import json
18
18
  import socket
19
+ import ipaddress
19
20
  import mindspore.log as logger
20
21
 
22
+ CURRENT_IP = None
23
+
21
24
  def _generate_cmd(cmd, cmd_args, output_name):
22
25
  """
23
26
  Generates a command string to execute a Python script in the background, r
@@ -67,6 +70,24 @@ def _generate_url(addr, port):
67
70
  return url
68
71
 
69
72
 
73
+ def _get_local_ip(ip_address):
74
+ """
75
+ Get current IP address.
76
+
77
+ """
78
+ global CURRENT_IP
79
+ if CURRENT_IP is None:
80
+ try:
81
+ s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
82
+ s.connect((ip_address, 0))
83
+ CURRENT_IP = s.getsockname()[0]
84
+ s.close()
85
+ except Exception as e:
86
+ raise RuntimeError(f"Get local ip failed: {e}. Please check whether an accessible address "
87
+ "is input by '--master_address'.")
88
+ return CURRENT_IP
89
+
90
+
70
91
  def _is_local_ip(ip_address):
71
92
  """
72
93
  Check if the current input IP address is a local IP address.
@@ -75,13 +96,8 @@ def _is_local_ip(ip_address):
75
96
  p = os.popen("ip -j addr")
76
97
  addr_info_str = p.read()
77
98
  p.close()
99
+ current_ip = _get_local_ip(ip_address)
78
100
  if not addr_info_str:
79
- # This means this host has no "ip -j addr" command.
80
- # We use socket module to get local ip address.
81
- s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
82
- s.connect((ip_address, 0))
83
- current_ip = s.getsockname()[0]
84
- s.close()
85
101
  return current_ip == ip_address
86
102
 
87
103
  addr_infos = json.loads(addr_info_str)
@@ -93,6 +109,25 @@ def _is_local_ip(ip_address):
93
109
  return False
94
110
 
95
111
 
112
+ def _convert_addr_to_ip(master_addr):
113
+ """
114
+ Check whether the input parameter 'master_addr' is IPv4. If a hostname is inserted, it will be converted
115
+ to IP and then set as master host's IP.
116
+
117
+ """
118
+ try:
119
+ ipaddress.IPv4Address(master_addr)
120
+ return master_addr
121
+ except ipaddress.AddressValueError:
122
+ try:
123
+ ip_address = socket.gethostbyname(master_addr)
124
+ logger.info(f"Convert input host name:{master_addr} to ip address:{ip_address}.")
125
+ return ip_address
126
+ except socket.gaierror as e:
127
+ raise RuntimeError(f"DNS resolution failed: {e}. Please check whether a correct host name "
128
+ "is input by '--master_address'.")
129
+
130
+
96
131
  def _send_scale_num(url, scale_num):
97
132
  """
98
133
  Send an HTTP request to a specified URL, informing scale_num.
@@ -37,8 +37,8 @@ def get_args():
37
37
  parser.add_argument(
38
38
  "--master_addr",
39
39
  default="127.0.0.1", type=str,
40
- help="specifies the IP address of the scheduler and its data type is string."
41
- " Allowed values: valid IP addresses."
40
+ help="specifies the IP address or the host name of the scheduler and its data type is string."
41
+ " Allowed values: valid IP addresses or valid host name."
42
42
  )
43
43
  parser.add_argument(
44
44
  "--master_port", default=8118, type=int,
@@ -85,13 +85,13 @@ def get_args():
85
85
  "--sim_level",
86
86
  default=-1,
87
87
  type=int,
88
- choices=[0, 1],
88
+ choices=[0, 1, 2, 3],
89
89
  help="specifies simulation level. When this argument is set, msrun only spawns one process "
90
90
  "but export RANK_SIZE with value worker_num and RANK_ID with value sim_rank_id."
91
91
  )
92
92
  parser.add_argument(
93
93
  "--sim_rank_id",
94
- default=0,
94
+ default=-1,
95
95
  type=int,
96
96
  help="specifies simulation process's rank id. Only one process is spawned in simulation scenario."
97
97
  )
@@ -102,6 +102,23 @@ def get_args():
102
102
  help="specifies rank table file path. This path is not used to initialize distributed job in "
103
103
  "'rank table file manner' but to help support other features."
104
104
  )
105
+ parser.add_argument(
106
+ "--worker_log_name",
107
+ default="",
108
+ type=str,
109
+ help="Specifies the worker log file name as a string for current node; the default is worker_[rankid]. "
110
+ "Support configuring the current IP address and host name by using {ip} and {hostname} respectively. "
111
+ "e.g. --worker_log_name=worker_{ip}_{hostname}_test, worker [rankid] log name for current node "
112
+ "will be worker_[real IP address]_[real host name]_test_[rankid]."
113
+ )
114
+ parser.add_argument(
115
+ "--tail_worker_log",
116
+ default="-1",
117
+ type=str,
118
+ help="Only tail worker log to console when '--join=True' and the configured value should be within "
119
+ "[0, local_worker_num], otherwise worker log will not be tail. All worker logs will be tail by "
120
+ "default. Support tail the specified worker log (e.g. --tail_log=0 tail the worker 0 log to console)."
121
+ )
105
122
  parser.add_argument(
106
123
  "task_script",
107
124
  type=str,
@@ -0,0 +1,24 @@
1
+ # Copyright 2025 Huawei Technologies Co., Ltd
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ============================================================================
15
+
16
+ """
17
+ Parallel function operator
18
+ """
19
+
20
+ from mindspore.parallel.function.reshard_func import reshard
21
+
22
+ __all__ = []
23
+ __all__.extend(reshard_func.__all__)
24
+ __all__.sort()