mindspore 2.6.0-cp310-cp310-win_amd64.whl → 2.7.0-cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry has flagged this version of mindspore as possibly problematic.

Files changed (455)
  1. mindspore/.commit_id +1 -1
  2. mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
  3. mindspore/Newtonsoft.Json.dll +0 -0
  4. mindspore/__init__.py +2 -2
  5. mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
  6. mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
  7. mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
  8. mindspore/_checkparam.py +42 -11
  9. mindspore/_extends/builtin_operations.py +3 -3
  10. mindspore/{_deprecated → _extends/optimize}/__init__.py +9 -3
  11. mindspore/_extends/optimize/cell_utils.py +96 -0
  12. mindspore/_extends/parallel_compile/akg_compiler/custom.py +1109 -0
  13. mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
  14. mindspore/_extends/parse/__init__.py +3 -3
  15. mindspore/_extends/parse/compile_config.py +44 -22
  16. mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +1 -2
  17. mindspore/_extends/parse/parser.py +64 -83
  18. mindspore/_extends/parse/resources.py +39 -0
  19. mindspore/_extends/parse/standard_method.py +47 -14
  20. mindspore/_extends/parse/trope.py +8 -1
  21. mindspore/_extends/pijit/__init__.py +1 -2
  22. mindspore/_extends/pijit/pijit_func_white_list.py +2 -5
  23. mindspore/amp.py +4 -22
  24. mindspore/atlprov.dll +0 -0
  25. mindspore/avcodec-59.dll +0 -0
  26. mindspore/avdevice-59.dll +0 -0
  27. mindspore/avfilter-8.dll +0 -0
  28. mindspore/avformat-59.dll +0 -0
  29. mindspore/avutil-57.dll +0 -0
  30. mindspore/boost/adasum.py +1 -1
  31. mindspore/boost/boost_cell_wrapper.py +4 -4
  32. mindspore/c1.dll +0 -0
  33. mindspore/c1xx.dll +0 -0
  34. mindspore/c2.dll +0 -0
  35. mindspore/common/__init__.py +43 -12
  36. mindspore/common/_grad_function.py +2 -1
  37. mindspore/common/_pijit_context.py +28 -7
  38. mindspore/common/_stub_tensor.py +1 -209
  39. mindspore/common/_tensor_cpp_method.py +1 -1
  40. mindspore/common/_tensor_docs.py +177 -52
  41. mindspore/common/_utils.py +9 -1
  42. mindspore/common/api.py +338 -208
  43. mindspore/common/dtype.py +108 -57
  44. mindspore/common/dump.py +11 -16
  45. mindspore/common/dynamic_shape/__init__.py +0 -0
  46. mindspore/common/{auto_dynamic_shape.py → dynamic_shape/auto_dynamic_shape.py} +17 -23
  47. mindspore/common/dynamic_shape/enable_dynamic.py +197 -0
  48. mindspore/common/file_system.py +59 -9
  49. mindspore/common/generator.py +2 -3
  50. mindspore/common/hook_handle.py +33 -5
  51. mindspore/common/jit_config.py +1 -1
  52. mindspore/common/jit_trace.py +84 -105
  53. mindspore/common/np_dtype.py +3 -3
  54. mindspore/common/parameter.py +27 -29
  55. mindspore/common/recompute.py +5 -7
  56. mindspore/common/sparse_tensor.py +0 -3
  57. mindspore/common/symbol.py +0 -1
  58. mindspore/common/tensor.py +84 -133
  59. mindspore/communication/_comm_helper.py +46 -4
  60. mindspore/communication/management.py +79 -7
  61. mindspore/context.py +47 -38
  62. mindspore/dataset/__init__.py +1 -1
  63. mindspore/dataset/audio/transforms.py +1 -1
  64. mindspore/dataset/core/config.py +38 -4
  65. mindspore/dataset/engine/datasets.py +350 -322
  66. mindspore/dataset/engine/datasets_user_defined.py +69 -23
  67. mindspore/dataset/engine/iterators.py +2 -2
  68. mindspore/dataset/engine/obs/config_loader.py +2 -2
  69. mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +8 -0
  70. mindspore/dataset/transforms/c_transforms.py +2 -2
  71. mindspore/dataset/transforms/py_transforms.py +7 -3
  72. mindspore/dataset/transforms/transforms.py +10 -6
  73. mindspore/dataset/vision/__init__.py +1 -1
  74. mindspore/dataset/vision/py_transforms.py +8 -8
  75. mindspore/dataset/vision/transforms.py +17 -5
  76. mindspore/dataset/vision/utils.py +632 -21
  77. mindspore/dataset/vision/validators.py +1 -0
  78. mindspore/device_context/ascend/device.py +1 -1
  79. mindspore/device_context/ascend/op_tuning.py +35 -1
  80. mindspore/device_context/gpu/__init__.py +2 -2
  81. mindspore/device_context/gpu/device.py +1 -1
  82. mindspore/device_context/gpu/op_precision.py +4 -2
  83. mindspore/device_context/gpu/op_tuning.py +6 -3
  84. mindspore/device_manager.py +16 -9
  85. mindspore/dnnl.dll +0 -0
  86. mindspore/dpcmi.dll +0 -0
  87. mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +5 -4
  88. mindspore/experimental/llm_boost/atb/boost_base.py +2 -3
  89. mindspore/experimental/optim/adadelta.py +13 -20
  90. mindspore/experimental/optim/adagrad.py +15 -22
  91. mindspore/experimental/optim/adam.py +17 -24
  92. mindspore/experimental/optim/adamax.py +14 -22
  93. mindspore/experimental/optim/adamw.py +28 -34
  94. mindspore/experimental/optim/asgd.py +15 -25
  95. mindspore/experimental/optim/lr_scheduler.py +27 -45
  96. mindspore/experimental/optim/nadam.py +14 -24
  97. mindspore/experimental/optim/optimizer.py +13 -23
  98. mindspore/experimental/optim/radam.py +18 -24
  99. mindspore/experimental/optim/rmsprop.py +14 -25
  100. mindspore/experimental/optim/rprop.py +15 -26
  101. mindspore/experimental/optim/sgd.py +9 -19
  102. mindspore/hal/__init__.py +4 -4
  103. mindspore/hal/contiguous_tensors_handle.py +2 -2
  104. mindspore/hal/memory.py +1 -0
  105. mindspore/include/api/cell.h +65 -5
  106. mindspore/include/api/cfg.h +24 -7
  107. mindspore/include/api/context.h +1 -0
  108. mindspore/include/api/delegate.h +10 -2
  109. mindspore/include/api/dual_abi_helper.h +100 -19
  110. mindspore/include/api/graph.h +14 -1
  111. mindspore/include/api/kernel.h +16 -3
  112. mindspore/include/api/kernel_api.h +9 -1
  113. mindspore/include/api/metrics/accuracy.h +9 -0
  114. mindspore/include/api/model.h +8 -1
  115. mindspore/include/api/model_group.h +4 -0
  116. mindspore/include/api/model_parallel_runner.h +2 -0
  117. mindspore/include/api/status.h +48 -10
  118. mindspore/include/api/types.h +8 -3
  119. mindspore/include/c_api/model_c.h +0 -58
  120. mindspore/include/c_api/tensor_c.h +0 -26
  121. mindspore/include/dataset/constants.h +9 -0
  122. mindspore/include/dataset/vision_ascend.h +1 -1
  123. mindspore/jpeg62.dll +0 -0
  124. mindspore/mindrecord/tools/cifar10.py +61 -11
  125. mindspore/mindrecord/tools/cifar10_to_mr.py +5 -0
  126. mindspore/mindspore_backend_common.dll +0 -0
  127. mindspore/mindspore_backend_manager.dll +0 -0
  128. mindspore/mindspore_common.dll +0 -0
  129. mindspore/mindspore_core.dll +0 -0
  130. mindspore/mindspore_cpu_res_manager.dll +0 -0
  131. mindspore/mindspore_dump.dll +0 -0
  132. mindspore/mindspore_frontend.dll +0 -0
  133. mindspore/mindspore_glog.dll +0 -0
  134. mindspore/mindspore_memory_pool.dll +0 -0
  135. mindspore/mindspore_ms_backend.dll +0 -0
  136. mindspore/mindspore_ops.dll +0 -0
  137. mindspore/mindspore_ops_host.dll +0 -0
  138. mindspore/mindspore_ops_kernel_common.dll +0 -0
  139. mindspore/mindspore_profiler.dll +0 -0
  140. mindspore/mindspore_pyboost.dll +0 -0
  141. mindspore/mindspore_pynative.dll +0 -0
  142. mindspore/mindspore_res_manager.dll +0 -0
  143. mindspore/mindspore_runtime_pipeline.dll +0 -0
  144. mindspore/mint/__init__.py +4 -44
  145. mindspore/mint/distributed/__init__.py +5 -0
  146. mindspore/mint/distributed/distributed.py +425 -19
  147. mindspore/mint/nn/__init__.py +1 -1
  148. mindspore/mint/nn/functional.py +53 -6
  149. mindspore/mint/nn/layer/_functions.py +163 -294
  150. mindspore/mint/nn/layer/activation.py +8 -6
  151. mindspore/mint/nn/layer/conv.py +125 -101
  152. mindspore/mint/nn/layer/normalization.py +11 -25
  153. mindspore/mint/optim/adam.py +19 -18
  154. mindspore/mint/optim/adamw.py +14 -8
  155. mindspore/mint/optim/sgd.py +5 -5
  156. mindspore/msobj140.dll +0 -0
  157. mindspore/mspdb140.dll +0 -0
  158. mindspore/mspdbcore.dll +0 -0
  159. mindspore/mspdbst.dll +0 -0
  160. mindspore/mspft140.dll +0 -0
  161. mindspore/msvcdis140.dll +0 -0
  162. mindspore/msvcp140_1.dll +0 -0
  163. mindspore/msvcp140_2.dll +0 -0
  164. mindspore/msvcp140_atomic_wait.dll +0 -0
  165. mindspore/msvcp140_codecvt_ids.dll +0 -0
  166. mindspore/nn/cell.py +488 -620
  167. mindspore/nn/grad/cell_grad.py +11 -12
  168. mindspore/nn/layer/activation.py +36 -36
  169. mindspore/nn/layer/basic.py +74 -77
  170. mindspore/nn/layer/channel_shuffle.py +4 -4
  171. mindspore/nn/layer/combined.py +4 -2
  172. mindspore/nn/layer/conv.py +86 -85
  173. mindspore/nn/layer/dense.py +9 -7
  174. mindspore/nn/layer/embedding.py +50 -52
  175. mindspore/nn/layer/image.py +38 -40
  176. mindspore/nn/layer/math.py +111 -112
  177. mindspore/nn/layer/normalization.py +56 -44
  178. mindspore/nn/layer/pooling.py +58 -63
  179. mindspore/nn/layer/rnn_cells.py +33 -33
  180. mindspore/nn/layer/rnns.py +56 -56
  181. mindspore/nn/layer/thor_layer.py +74 -73
  182. mindspore/nn/layer/transformer.py +11 -1
  183. mindspore/nn/learning_rate_schedule.py +20 -20
  184. mindspore/nn/loss/loss.py +79 -81
  185. mindspore/nn/optim/adam.py +2 -4
  186. mindspore/nn/optim/adasum.py +2 -2
  187. mindspore/nn/optim/lamb.py +1 -3
  188. mindspore/nn/optim/optimizer.py +1 -1
  189. mindspore/nn/optim/tft_wrapper.py +2 -3
  190. mindspore/nn/optim/thor.py +2 -2
  191. mindspore/nn/probability/distribution/_utils/utils.py +2 -2
  192. mindspore/nn/probability/distribution/exponential.py +2 -1
  193. mindspore/nn/probability/distribution/poisson.py +2 -1
  194. mindspore/nn/sparse/sparse.py +3 -3
  195. mindspore/nn/wrap/cell_wrapper.py +73 -42
  196. mindspore/nn/wrap/grad_reducer.py +37 -52
  197. mindspore/nn/wrap/loss_scale.py +72 -74
  198. mindspore/numpy/array_creations.py +7 -7
  199. mindspore/numpy/fft.py +1 -1
  200. mindspore/numpy/math_ops.py +1 -1
  201. mindspore/numpy/utils_const.py +1 -1
  202. mindspore/opencv_core452.dll +0 -0
  203. mindspore/opencv_imgcodecs452.dll +0 -0
  204. mindspore/opencv_imgproc452.dll +0 -0
  205. mindspore/ops/_grad_experimental/grad_comm_ops.py +51 -13
  206. mindspore/ops/_grad_experimental/grad_debug_ops.py +14 -0
  207. mindspore/ops/_grad_experimental/grad_inner_ops.py +0 -9
  208. mindspore/ops/_op_impl/cpu/__init__.py +1 -0
  209. mindspore/{experimental/es/__init__.py → ops/_op_impl/cpu/joinedstr_op.py} +12 -6
  210. mindspore/ops/_vmap/vmap_array_ops.py +6 -13
  211. mindspore/ops/_vmap/vmap_nn_ops.py +8 -16
  212. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +29 -10
  213. mindspore/ops/auto_generate/gen_extend_func.py +5 -55
  214. mindspore/ops/auto_generate/gen_ops_def.py +753 -273
  215. mindspore/ops/auto_generate/gen_ops_prim.py +1687 -958
  216. mindspore/ops/auto_generate/pyboost_inner_prim.py +31 -1
  217. mindspore/ops/composite/__init__.py +10 -0
  218. mindspore/ops/composite/base.py +9 -5
  219. mindspore/ops/composite/multitype_ops/__init__.py +12 -1
  220. mindspore/ops/composite/multitype_ops/_compile_utils.py +132 -108
  221. mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
  222. mindspore/ops/composite/multitype_ops/add_impl.py +70 -2
  223. mindspore/ops/composite/multitype_ops/div_impl.py +49 -0
  224. mindspore/ops/composite/multitype_ops/floordiv_impl.py +29 -0
  225. mindspore/ops/composite/multitype_ops/getitem_impl.py +11 -0
  226. mindspore/ops/composite/multitype_ops/mod_impl.py +5 -3
  227. mindspore/ops/composite/multitype_ops/mul_impl.py +49 -0
  228. mindspore/ops/composite/multitype_ops/setitem_impl.py +57 -0
  229. mindspore/ops/composite/multitype_ops/sub_impl.py +34 -0
  230. mindspore/ops/composite/multitype_ops/zeros_like_impl.py +14 -0
  231. mindspore/ops/function/__init__.py +4 -1
  232. mindspore/ops/function/_add_attr_func.py +11 -6
  233. mindspore/ops/function/array_func.py +17 -100
  234. mindspore/ops/function/debug_func.py +8 -5
  235. mindspore/ops/function/grad/grad_func.py +5 -13
  236. mindspore/ops/function/math_func.py +65 -399
  237. mindspore/ops/function/nn_func.py +44 -61
  238. mindspore/ops/function/other_func.py +4 -1
  239. mindspore/ops/function/random_func.py +31 -4
  240. mindspore/ops/functional.py +2 -3
  241. mindspore/ops/functional_overload.py +486 -18
  242. mindspore/ops/op_info_register.py +21 -0
  243. mindspore/ops/operations/__init__.py +5 -2
  244. mindspore/ops/operations/_custom_ops_utils.py +675 -8
  245. mindspore/ops/operations/_inner_ops.py +14 -18
  246. mindspore/ops/operations/_sequence_ops.py +1 -1
  247. mindspore/ops/operations/array_ops.py +4 -50
  248. mindspore/ops/operations/comm_ops.py +186 -41
  249. mindspore/ops/operations/custom_ops.py +244 -175
  250. mindspore/ops/operations/debug_ops.py +55 -4
  251. mindspore/ops/operations/image_ops.py +13 -13
  252. mindspore/ops/operations/manually_defined/ops_def.py +27 -28
  253. mindspore/ops/operations/math_ops.py +8 -9
  254. mindspore/ops/operations/nn_ops.py +6 -7
  255. mindspore/ops/primitive.py +9 -20
  256. mindspore/ops/tensor_method.py +52 -11
  257. mindspore/ops_generate/api/cpp_create_prim_instance_helper_generator.py +1 -1
  258. mindspore/ops_generate/api/functional_map_cpp_generator.py +10 -9
  259. mindspore/ops_generate/api/functions_cc_generator.py +58 -10
  260. mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +1 -1
  261. mindspore/ops_generate/common/base_generator.py +14 -0
  262. mindspore/ops_generate/common/gen_constants.py +7 -2
  263. mindspore/ops_generate/common/gen_utils.py +0 -19
  264. mindspore/ops_generate/common/op_proto.py +11 -4
  265. mindspore/ops_generate/common/template.py +88 -11
  266. mindspore/ops_generate/gen_ops.py +1 -1
  267. mindspore/ops_generate/op_def/lite_ops_cpp_generator.py +4 -4
  268. mindspore/ops_generate/op_def/ops_name_h_generator.py +0 -3
  269. mindspore/ops_generate/op_def/ops_primitive_h_generator.py +0 -4
  270. mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -2
  271. mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +49 -8
  272. mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +2 -2
  273. mindspore/ops_generate/pyboost/gen_pyboost_func.py +31 -16
  274. mindspore/ops_generate/pyboost/op_template_parser.py +98 -72
  275. mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +70 -273
  276. mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +14 -6
  277. mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +316 -0
  278. mindspore/ops_generate/pyboost/pyboost_functions_py_generator.py +1 -1
  279. mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +5 -3
  280. mindspore/ops_generate/pyboost/pyboost_inner_prim_generator.py +1 -1
  281. mindspore/ops_generate/pyboost/pyboost_internal_functions_cpp_generator.py +76 -0
  282. mindspore/ops_generate/pyboost/pyboost_internal_functions_h_generator.py +76 -0
  283. mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +125 -0
  284. mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +4 -3
  285. mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +348 -61
  286. mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +1 -1
  287. mindspore/ops_generate/pyboost/pyboost_utils.py +118 -9
  288. mindspore/ops_generate/tensor_py_cc_generator.py +1 -24
  289. mindspore/parallel/_auto_parallel_context.py +9 -17
  290. mindspore/parallel/_cell_wrapper.py +106 -40
  291. mindspore/parallel/_parallel_serialization.py +4 -3
  292. mindspore/parallel/_ps_context.py +4 -6
  293. mindspore/parallel/_tensor.py +167 -12
  294. mindspore/parallel/_transformer/moe.py +1 -1
  295. mindspore/parallel/_transformer/transformer.py +17 -12
  296. mindspore/parallel/_utils.py +5 -11
  297. mindspore/parallel/auto_parallel.py +33 -12
  298. mindspore/parallel/checkpoint_convert.py +3 -3
  299. mindspore/parallel/checkpoint_transform.py +5 -1
  300. mindspore/parallel/cluster/process_entity/_api.py +88 -49
  301. mindspore/parallel/cluster/process_entity/_utils.py +95 -7
  302. mindspore/parallel/cluster/run.py +48 -7
  303. mindspore/parallel/function/__init__.py +8 -1
  304. mindspore/parallel/function/reshard_func.py +7 -6
  305. mindspore/parallel/nn/__init__.py +15 -2
  306. mindspore/parallel/nn/parallel_cell_wrapper.py +50 -14
  307. mindspore/parallel/nn/parallel_grad_reducer.py +7 -14
  308. mindspore/parallel/shard.py +9 -23
  309. mindspore/parallel/transform_safetensors.py +468 -174
  310. mindspore/pgodb140.dll +0 -0
  311. mindspore/pgort140.dll +0 -0
  312. mindspore/profiler/__init__.py +2 -1
  313. mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +7 -7
  314. mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +3 -0
  315. mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +3 -0
  316. mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +3 -3
  317. mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +3 -3
  318. mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +4 -4
  319. mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +3 -3
  320. mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +4 -1
  321. mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +2 -1
  322. mindspore/profiler/analysis/task_manager.py +1 -1
  323. mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +5 -1
  324. mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +2 -1
  325. mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +10 -9
  326. mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +43 -23
  327. mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +3 -2
  328. mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +9 -5
  329. mindspore/profiler/analysis/viewer/ms_operator_details_viewer.py +132 -0
  330. mindspore/profiler/common/constant.py +16 -0
  331. mindspore/profiler/common/msprof_cmd_tool.py +2 -2
  332. mindspore/profiler/common/path_manager.py +9 -0
  333. mindspore/profiler/common/profiler_context.py +50 -29
  334. mindspore/profiler/common/profiler_info.py +0 -16
  335. mindspore/profiler/common/profiler_meta_data.py +1 -0
  336. mindspore/profiler/common/profiler_op_analyse.py +239 -0
  337. mindspore/profiler/common/profiler_output_path.py +23 -8
  338. mindspore/profiler/common/profiler_parameters.py +128 -35
  339. mindspore/profiler/dynamic_profile/__init__.py +0 -0
  340. mindspore/profiler/dynamic_profile/dynamic_monitor_proxy.py +39 -0
  341. mindspore/profiler/dynamic_profile/dynamic_profiler_config_context.py +666 -0
  342. mindspore/profiler/dynamic_profile/dynamic_profiler_utils.py +62 -0
  343. mindspore/profiler/dynamic_profiler.py +374 -338
  344. mindspore/profiler/envprofiler.py +42 -12
  345. mindspore/profiler/experimental_config.py +112 -7
  346. mindspore/profiler/mstx.py +33 -12
  347. mindspore/profiler/platform/__init__.py +2 -3
  348. mindspore/profiler/platform/cpu_profiler.py +10 -4
  349. mindspore/profiler/platform/npu_profiler.py +30 -20
  350. mindspore/profiler/profiler.py +218 -154
  351. mindspore/profiler/profiler_action_controller.py +65 -77
  352. mindspore/profiler/profiler_interface.py +2 -2
  353. mindspore/profiler/schedule.py +10 -4
  354. mindspore/rewrite/common/config.py +1 -0
  355. mindspore/rewrite/common/namer.py +1 -0
  356. mindspore/rewrite/common/namespace.py +1 -0
  357. mindspore/rewrite/node/node.py +31 -11
  358. mindspore/rewrite/parsers/assign_parser.py +1 -1
  359. mindspore/rewrite/symbol_tree/symbol_tree.py +2 -2
  360. mindspore/run_check/_check_version.py +7 -10
  361. mindspore/runtime/__init__.py +8 -6
  362. mindspore/runtime/event.py +10 -4
  363. mindspore/runtime/executor.py +87 -45
  364. mindspore/runtime/memory.py +22 -30
  365. mindspore/runtime/thread_bind_core.py +299 -165
  366. mindspore/safeguard/rewrite_obfuscation.py +12 -13
  367. mindspore/swresample-4.dll +0 -0
  368. mindspore/swscale-6.dll +0 -0
  369. mindspore/tbbmalloc.dll +0 -0
  370. mindspore/tinyxml2.dll +0 -0
  371. mindspore/train/_utils.py +9 -5
  372. mindspore/train/amp.py +43 -23
  373. mindspore/train/callback/__init__.py +5 -5
  374. mindspore/train/callback/_callback.py +2 -1
  375. mindspore/train/callback/_checkpoint.py +4 -14
  376. mindspore/train/callback/_flops_collector.py +11 -7
  377. mindspore/train/callback/_landscape.py +0 -1
  378. mindspore/train/callback/_train_fault_tolerance.py +72 -18
  379. mindspore/train/data_sink.py +15 -6
  380. mindspore/train/dataset_helper.py +14 -5
  381. mindspore/train/model.py +49 -47
  382. mindspore/train/serialization.py +168 -126
  383. mindspore/train/summary/summary_record.py +13 -2
  384. mindspore/train/train_thor/model_thor.py +2 -2
  385. mindspore/turbojpeg.dll +0 -0
  386. mindspore/utils/__init__.py +3 -2
  387. mindspore/utils/dryrun.py +0 -6
  388. mindspore/utils/runtime_execution_order_check.py +162 -78
  389. mindspore/utils/sdc_detect.py +68 -0
  390. mindspore/utils/utils.py +14 -17
  391. mindspore/vcmeta.dll +0 -0
  392. mindspore/vcruntime140.dll +0 -0
  393. mindspore/vcruntime140_1.dll +0 -0
  394. mindspore/version.py +1 -1
  395. {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/METADATA +5 -4
  396. {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/RECORD +400 -439
  397. mindspore/_deprecated/jit.py +0 -198
  398. mindspore/_extends/remote/kernel_build_server_ascend.py +0 -75
  399. mindspore/communication/_hccl_management.py +0 -297
  400. mindspore/experimental/es/embedding_service.py +0 -891
  401. mindspore/experimental/es/embedding_service_layer.py +0 -581
  402. mindspore/profiler/common/validator/__init__.py +0 -14
  403. mindspore/profiler/common/validator/validate_path.py +0 -84
  404. mindspore/profiler/parser/__init__.py +0 -14
  405. mindspore/profiler/parser/aicpu_data_parser.py +0 -272
  406. mindspore/profiler/parser/ascend_analysis/__init__.py +0 -14
  407. mindspore/profiler/parser/ascend_analysis/constant.py +0 -71
  408. mindspore/profiler/parser/ascend_analysis/file_manager.py +0 -180
  409. mindspore/profiler/parser/ascend_analysis/function_event.py +0 -185
  410. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +0 -136
  411. mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +0 -131
  412. mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +0 -104
  413. mindspore/profiler/parser/ascend_analysis/path_manager.py +0 -313
  414. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +0 -123
  415. mindspore/profiler/parser/ascend_analysis/tlv_decoder.py +0 -86
  416. mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +0 -75
  417. mindspore/profiler/parser/ascend_cluster_generator.py +0 -116
  418. mindspore/profiler/parser/ascend_communicate_generator.py +0 -314
  419. mindspore/profiler/parser/ascend_flops_generator.py +0 -116
  420. mindspore/profiler/parser/ascend_fpbp_generator.py +0 -82
  421. mindspore/profiler/parser/ascend_hccl_generator.py +0 -271
  422. mindspore/profiler/parser/ascend_integrate_generator.py +0 -42
  423. mindspore/profiler/parser/ascend_memory_generator.py +0 -185
  424. mindspore/profiler/parser/ascend_msprof_exporter.py +0 -282
  425. mindspore/profiler/parser/ascend_msprof_generator.py +0 -187
  426. mindspore/profiler/parser/ascend_op_generator.py +0 -334
  427. mindspore/profiler/parser/ascend_steptrace_generator.py +0 -94
  428. mindspore/profiler/parser/ascend_timeline_generator.py +0 -545
  429. mindspore/profiler/parser/base_timeline_generator.py +0 -483
  430. mindspore/profiler/parser/container.py +0 -229
  431. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +0 -697
  432. mindspore/profiler/parser/flops_parser.py +0 -531
  433. mindspore/profiler/parser/framework_enum.py +0 -111
  434. mindspore/profiler/parser/framework_parser.py +0 -464
  435. mindspore/profiler/parser/framework_struct.py +0 -61
  436. mindspore/profiler/parser/gpu_analysis/__init__.py +0 -14
  437. mindspore/profiler/parser/gpu_analysis/function_event.py +0 -44
  438. mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +0 -89
  439. mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +0 -72
  440. mindspore/profiler/parser/hccl_parser.py +0 -573
  441. mindspore/profiler/parser/hwts_log_parser.py +0 -122
  442. mindspore/profiler/parser/integrator.py +0 -526
  443. mindspore/profiler/parser/memory_usage_parser.py +0 -277
  444. mindspore/profiler/parser/minddata_analyzer.py +0 -800
  445. mindspore/profiler/parser/minddata_parser.py +0 -186
  446. mindspore/profiler/parser/minddata_pipeline_parser.py +0 -299
  447. mindspore/profiler/parser/op_intermediate_parser.py +0 -149
  448. mindspore/profiler/parser/optime_parser.py +0 -250
  449. mindspore/profiler/parser/profiler_info.py +0 -213
  450. mindspore/profiler/parser/step_trace_parser.py +0 -666
  451. mindspore/utils/hooks.py +0 -81
  452. /mindspore/common/{_auto_dynamic.py → dynamic_shape/_auto_dynamic.py} +0 -0
  453. {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/WHEEL +0 -0
  454. {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/entry_points.txt +0 -0
  455. {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/top_level.txt +0 -0
mindspore/train/callback/_train_fault_tolerance.py CHANGED
@@ -25,8 +25,9 @@ from mindspore.communication import get_rank, get_group_size
 from mindspore import log as logger
 from mindspore.train.serialization import _get_cur_rank_dp
 from mindspore._c_expression import _repair_device, _stop_device, _tft_sem_post, _tft_sem_enable
-from mindspore._c_expression import _rebuild_world_group, _rebuild_sub_group, _finalize_comm
+from mindspore._c_expression import _rebuild_world_group, _rebuild_sub_group, _finalize_comm, _clean_rootinfo
 from mindspore._c_expression import clean_tdt_channel
+from mindspore._c_expression import _pre_launch_send_recv
 from mindspore._c_expression import send_recv, reset_params
 from mindspore._c_expression import CollectiveManager
 from mindspore._c_expression import _get_uce_process_strategy, _get_uce_mem_info
@@ -35,6 +36,7 @@ from mindspore.ops.operations.manually_defined._inner import TensorReport
 import mindspore
 import mindspore.common.dtype as mstype
 from mindspore.parallel._recovery_context import _set_recovery_context
+from mindspore import runtime
 
 
 def _get_ckpt_dir(step, ckpt_save_path, is_tmp_file):
@@ -80,7 +82,7 @@ def _save_checkpoint_on_failure(step, save_info, args, cb_ctx):
         append_dict["loss_scale"] = outputs[2]
 
     ckpt_file = f"ttp_rank_{str(cur_rank)}-{str(cur_epoch_num)}_{str(step_num_in_epoch)}.ckpt"
-    cur_ckpt_dir = _get_ckpt_dir(step, ckpt_save_path, True) + "/rank_" + str(cur_rank)
+    cur_ckpt_dir = os.path.join(_get_ckpt_dir(step, ckpt_save_path, True), "rank_" + str(cur_rank))
     os.makedirs(cur_ckpt_dir, exist_ok=True)
     cur_file = os.path.join(cur_ckpt_dir, ckpt_file)
     save_checkpoint(cb_params.train_network, cur_file,
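
The checkpoint-directory hunk above swaps manual "/" concatenation for os.path.join, which picks the platform's path separator; that is relevant for this win_amd64 build. A minimal sketch with hypothetical values (only the base directory and rank below are made up):

    import os

    # Hypothetical directory and rank, for illustration only.
    tmp_ckpt_dir = "ttp_tmp"
    cur_rank = 0
    cur_ckpt_dir = os.path.join(tmp_ckpt_dir, "rank_" + str(cur_rank))
    print(cur_ckpt_dir)  # prints ttp_tmp\rank_0 on Windows, ttp_tmp/rank_0 elsewhere
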
@@ -110,7 +112,7 @@ def _tft_exit_cb(ctx):
 
 def _tft_repair_callback(step, need_rebuild, error_ranks, repair_info, args, cb_ctx):
     """ Callback used for TFT repair function."""
-    logger.warning("Enter _tft_repair_callback repair type: {}".format(repair_info["repair_type"]))
+    logger.warning(f"Enter _tft_repair_callback repair type: {repair_info['repair_type']}")
     if (repair_info["repair_type"] in (cb_ctx.tft.RepairType.RT_UCE_HIGHLEVEL.value,
                                        cb_ctx.tft.RepairType.RT_UCE_LOWLEVEL.value)):
         logger.warning("Enter _tft_repair_callback uce REPARI_DEVICE device_id : {}".format(cb_ctx.device_id))
@@ -138,7 +140,7 @@ def _tft_repair_callback(step, need_rebuild, error_ranks, repair_info, args, cb_
 
 def _tft_clean_callback(is_uce_error, args, ctx):
     """ Callback used for TFT clean function."""
-    logger.warning("Enter _tft_clean_callback")
+    logger.warning(f"Enter _tft_clean_callback, device id:{ctx.device_id}")
     ret = 0
     if is_uce_error:
         _get_uce_mem_info(ctx.device_id)
@@ -154,12 +156,16 @@ def _tft_clean_callback(is_uce_error, args, ctx):
         logger.warning("Enter _tft_clean_callback resume_hccl_comm")
         CollectiveManager.get_instance().resume_hccl_comm()
     logger.warning("Finish _tft_clean_callback, ret: {}".format(ret))
+    if ctx.tft.tft_get_repair_type() == "recover":
+        logger.warning(f"Destroy hcom")
+        _finalize_comm()
+        logger.warning(f"Destroy hcom end")
     return ret
 
 
 def _tft_stop_callback(args, cb_ctx):
     """ Callback used for TFT stop function."""
-    logger.warning("Enter _tft_stop_callback device_id: {}".format(cb_ctx.device_id))
+    logger.warning(f"Enter _tft_stop_callback device_id: {cb_ctx.device_id}")
     _stop_device(cb_ctx.device_id)
     if (not cb_ctx.is_uce_rank) and (not cb_ctx._is_params_consistent()):  # pylint: disable=W0212
         raise RuntimeError("Can't stop device, because training parameters are left in inconsistent state!")
@@ -167,23 +173,25 @@ def _tft_stop_callback(args, cb_ctx):
     if cb_ctx.tft.tft_get_repair_type() == "recover":
         logger.warning(f"Reset limit step")
         cb_ctx.tft.tft_reset_limit_step()
-    logger.info("Finish _tft_stop_callback")
+    logger.warning("Finish _tft_stop_callback")
 
 
 def _tft_rebuild_sub_groups(fault_ranks, args, ctx):
     """Callback used for TFT Rebuild Group function."""
-    logger.warning(f"Enter _tft_rebuild_sub_groups, device id: ".format(ctx.device_id))
-    _finalize_comm()
+    logger.warning(f"Enter _tft_rebuild_sub_groups, device id: {ctx.device_id}")
     _rebuild_world_group()
     _rebuild_sub_group()
     _set_recovery_context(is_arf=True)
+    logger.warning(f"try to pre launch send recv before real launch")
+    _pre_launch_send_recv(context.get_context('device_id'))
+    logger.warning(f"Pre launch send recv before real launch end")
     logger.warning("Enter _tft_rebuild_sub_groups ok ")
 
 
 class TrainFaultTolerance(Callback):
     """
     This callback is used to enable the TFT feature
-    `MindIO TFT <https://www.hiascend.com/document/detail/zh/mindx-dl/60rc2/mindio/mindiottp/mindiottp001.html>`_
+    `MindIO TFT <https://www.hiascend.com/document/detail/zh/mindx-dl/600/clusterscheduling/ref/mindiottp/mindiotft001.html>`_
     and will execute TFT operations during training process, such as TFT init, report and exception handle.
 
     Note:
@@ -299,6 +307,12 @@ class TrainFaultTolerance(Callback):
 
     def __init__(self, ckpt_save_path=None, **kwargs):
         super(TrainFaultTolerance, self).__init__()
+        logger.info(f"MS_ENABLE_TFT: {os.getenv('MS_ENABLE_TFT', '')}")
+        if self._only_enable_tsp():
+            self.tft = _tft_handler.get_tft()
+            self._check_init()
+            self.tft.tft_register_stream_sync_handler(runtime.synchronize, self)
+            return
         self.save_cb = kwargs.get("ckpt_save_fn", None)
         self.ckpt_save_path = ckpt_save_path
         if self.save_cb is None and self.ckpt_save_path is None:
@@ -308,15 +322,19 @@ class TrainFaultTolerance(Callback):
         self.device_id = context.get_context("device_id")
         self.cur_step_num = 0
         self.cur_epoch_num = 0
+        self.clean_unique_id = False
         # For TREError(Training Result Error) scene, parameter `ckpt_load_fn` must be provided to load checkpoint
         # from file for resuming training, the `ckpt_load_fn` is a function, prototype of which is:
         # `def load_checkpoint() -> tuple(dict, bool)`, the return value is a tuple containing 2 values,
         # i.e. (param_dict, remove_redundancy)
         self.ckpt_load_func = kwargs.get("ckpt_load_fn", None)
-        self.tft = _tft_handler.get_tft()
         if self._only_enable_tre():
             return
+        self.tft = _tft_handler.get_tft()
         self._check_init()
+        if self._only_enable_tre_and_tsp():
+            self.tft.tft_register_stream_sync_handler(runtime.synchronize, self)
+            return
         self.global_step = None
         self.learning_rate = None
         self.has_init_replica = False
@@ -336,6 +354,22 @@ class TrainFaultTolerance(Callback):
             return False
         return "TRE:1" in env_enable
 
+    def _only_enable_tsp(self):
+        """Check if only configured MS_ENABLE_TFT='{TSP:1}'"""
+        env_enable = os.getenv("MS_ENABLE_TFT", "")
+        non_tsp_flags = ["TTP:1", "UCE:1", "ARF:1", "TRE:1"]
+        if any(flag in env_enable for flag in non_tsp_flags):
+            return False
+        return "TSP:1" in env_enable
+
+    def _only_enable_tre_and_tsp(self):
+        """Check if only configured MS_ENABLE_TFT='{TRE:1, TSP:1}'"""
+        env_enable = os.getenv("MS_ENABLE_TFT", "")
+        other_flags = ["TTP:1", "UCE:1", "ARF:1"]
+        if any(flag in env_enable for flag in other_flags):
+            return False
+        return "TRE:1" in env_enable and "TSP:1" in env_enable
+
     def _check_init(self):
         """Check if the mindio-ttp had inited"""
         if self.tft is None:
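
The new _only_enable_tsp and _only_enable_tre_and_tsp helpers above rely on plain substring checks against MS_ENABLE_TFT. A standalone sketch of how a few sample values resolve (illustrative only; the helper name below is made up, the logic mirrors the diff):

    def only_tsp(env_enable):
        # True only when TSP:1 is set and none of the other TFT flags are.
        non_tsp_flags = ["TTP:1", "UCE:1", "ARF:1", "TRE:1"]
        if any(flag in env_enable for flag in non_tsp_flags):
            return False
        return "TSP:1" in env_enable

    print(only_tsp("{TSP:1}"))        # True: __init__ only registers the stream-sync handler
    print(only_tsp("{TRE:1,TSP:1}"))  # False: handled by _only_enable_tre_and_tsp instead
    print(only_tsp("{UCE:1,ARF:1}"))  # False
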
@@ -346,11 +380,9 @@ class TrainFaultTolerance(Callback):
             _tft_handler.init(config=None)
             self.tft = _tft_handler.get_tft()
             logger.warning(f"TFT handle init ok.")
-        mode = context.get_context("mode")
         device_target = context.get_context("device_target")
-        if device_target != "Ascend" or mode != context.GRAPH_MODE:
-            raise ValueError(f"MindIO adataper only support on Ascend device with GRAPH Mode!"
-                             f"device:{device_target}, run mode: {mode}")
+        if device_target != "Ascend":
+            raise ValueError(f"MindIO adataper only support on Ascend device but got device {device_target}!")
 
     def _is_params_consistent(self):
         for key, param in self.cb_params.train_network.parameters_and_names():
@@ -427,6 +459,8 @@ class TrainFaultTolerance(Callback):
         self.tft.tft_register_clean_handler(_tft_clean_callback, self)
         self.tft.tft_register_repair_handler(_tft_repair_callback, self)
         self.tft.tft_register_rebuild_group_handler(_tft_rebuild_sub_groups, self)
+        if "TSP:1" in os.getenv("MS_ENABLE_TFT", ""):
+            self.tft.tft_register_stream_sync_handler(runtime.synchronize, self)
 
     def _reset_acc_grads(self):
         accu_grad_params = map(lambda e: e[1],
@@ -436,6 +470,12 @@ class TrainFaultTolerance(Callback):
         if reset_params(accu_grad_list) != 0:
             raise ValueError("Call reset_params failed.")
 
+    def _clear_unique_id(self):
+        """Clean unique id on first train step end"""
+        if not self.clean_unique_id and ("ARF:1" in os.getenv("MS_ENABLE_TFT", "")):
+            _clean_rootinfo()
+            self.clean_unique_id = True
+
     def on_train_step_end(self, run_context):
         """
         Report status to MindIO TFT after every step finished.
@@ -446,13 +486,19 @@ class TrainFaultTolerance(Callback):
         """
         if self._only_enable_tre():
             return
-        if self.has_init_replica is False:
-            self.has_init_replica = True
-            self._set_tft_optimizer_replica(run_context)
+
         cb_params = run_context.original_args()
         logger.info("START Set optimizer finish step status to TFT. step: {}".format(cb_params.cur_step_num))
         self.cur_step_num = cb_params.cur_step_num
         self.cur_epoch_num = cb_params.cur_epoch_num
+        if self._only_enable_tsp() or self._only_enable_tre_and_tsp():
+            logger.info("Go into tft_pause_train.")
+            self.tft.tft_pause_train(self.cur_step_num)
+            return
+
+        if self.has_init_replica is False:
+            self.has_init_replica = True
+            self._set_tft_optimizer_replica(run_context)
         if cb_params.optimizer is not None:
             self.global_step = cb_params.optimizer.global_step.clone()
             self.assign(cb_params.optimizer.tft_g_one_flag, self.g_one)
@@ -462,7 +508,13 @@ class TrainFaultTolerance(Callback):
         else:
             raise ValueError("TFT feature need optimizer or network's optimizer!")
         self.tft.tft_end_updating_os(cb_params.cur_step_num + self.initial_step)
+        if cb_params.is_arf:
+            self.clean_unique_id = False
+        self._clear_unique_id()
         logger.info("END Set optimizer finish step status to TFT.")
+        if "TSP:1" in os.getenv("MS_ENABLE_TFT", ""):
+            logger.info("Go into tft_pause_train.")
+            self.tft.tft_pause_train(self.cur_step_num)
 
     def on_train_begin(self, run_context):
         """
@@ -472,6 +524,8 @@ class TrainFaultTolerance(Callback):
             run_context (RunContext): Context of the train running. Refer to
                 :class:`mindspore.train.RunContext` for detail.
         """
+        if self._only_enable_tsp():
+            return
         cb_params = run_context.original_args()
         if self._only_enable_tre():
             self.cb_params = cb_params
@@ -491,6 +545,6 @@ class TrainFaultTolerance(Callback):
             run_context (RunContext): Context of the train running. Refer to
                 :class:`mindspore.train.RunContext` for detail.
         """
-        if self._only_enable_tre():
+        if self._only_enable_tre() or self._only_enable_tsp() or self._only_enable_tre_and_tsp():
             return
         _tft_handler.unregister_tft()
mindspore/train/data_sink.py CHANGED
@@ -16,9 +16,9 @@
 from functools import wraps
 import mindspore.ops as ops
 from mindspore import context
-from mindspore.common.dtype import pytype_to_dtype
+from mindspore.common.dtype import _pytype_to_dtype
 from mindspore.common.api import jit
-from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes
+from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes, enable_data_broadcast
 from mindspore.train.dataset_helper import _has_dynamic_shape, _check_inputs
 import mindspore.dataset as ds
 from mindspore._c_expression import _set_dataset_mode_config
@@ -41,6 +41,15 @@ def _init_sink_dataset(dataset, sink_size, input_signature, create_info):
     is_info_queue = (create_info and sink_size == 1 and dataset_size != 1 and
                      input_signature is None and not dynamic_shape and
                      context.get_context('device_target') == 'Ascend')
+
+    # Don't enable dynamic shape(multi-subgraph) feature in pp/data_broadcast mode,
+    # otherwise get_data_info will stuck since some rank do not consume data.
+    use_pipeline_parallel = (context.get_auto_parallel_context("pipeline_stages") > 1)
+    data_broadcast = enable_data_broadcast()
+
+    if use_pipeline_parallel or data_broadcast:
+        is_info_queue = False
+
     transfer_dataset = _exec_datagraph(dataset, sink_size, create_data_info_queue=is_info_queue)
     dataset.__transfer_dataset__ = transfer_dataset
 
@@ -52,7 +61,7 @@
         _check_inputs(input_signature, dataset_shapes, dataset_types)
 
     queue_name = transfer_dataset.queue_name
-    if _need_to_full() and context.get_context('mode') == context.GRAPH_MODE:
+    if _need_to_full():
         device_num = _get_device_num() // _get_pipeline_stages()
         dataset_shapes = _to_full_shapes(dataset_shapes, device_num)
     next_op = ops.GetNext(dataset_types, dataset_shapes, len(dataset_types), queue_name)
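
The pp/data_broadcast guard added to _init_sink_dataset above keys off the auto-parallel context. A rough illustration of the value it reads, assuming the standard mindspore.set_auto_parallel_context / get_auto_parallel_context interface:

    import mindspore as ms

    # Illustrative only: with more than one pipeline stage configured,
    # the data-info queue (multi-subgraph dynamic-shape path) is disabled.
    ms.set_auto_parallel_context(pipeline_stages=2)
    print(ms.get_auto_parallel_context("pipeline_stages") > 1)  # True
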
@@ -85,12 +94,12 @@ def _get_next_op(dataset, ori_next_op, is_info_queue):
 
     queue_name = dataset.__transfer_dataset__.queue_name
     dataset_types, dataset_shapes = dataset.__transfer_dataset__.get_data_info()
-    dataset_types = [pytype_to_dtype(x) for x in dataset_types]
+    dataset_types = [_pytype_to_dtype(x) for x in dataset_types]  # pylint:disable=protected-access
     key = str(dataset_types) + str(dataset_shapes)
     if key in dataset.__sink_aux__.next_ops:
         next_op = dataset.__sink_aux__.next_ops[key]
     else:
-        if _need_to_full() and context.get_context('mode') == context.GRAPH_MODE:
+        if _need_to_full():
             device_num = _get_device_num() // _get_pipeline_stages()
             dataset_shapes = _to_full_shapes(dataset_shapes, device_num)
         next_op = ops.GetNext(dataset_types, dataset_shapes, len(dataset_types), queue_name)
@@ -214,7 +223,7 @@ def data_sink(fn, dataset, sink_size=1, jit_config=None, input_signature=None):
     loop = sink_size
     create_info = True
    if jit_config is None:
-        create_info = (loop == 1)
+        create_info = loop == 1
         loop = 1
     ori_next_op, is_info_queue = _init_sink_dataset(dataset, loop, input_signature, create_info)
 
mindspore/train/dataset_helper.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-2025 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,8 +20,8 @@ import copy
 
 from mindspore import _checkparam as Validator
 from mindspore import log as logger
-from mindspore.common._auto_dynamic import is_auto_dynamic, convert_new_shapes
-from mindspore.common.dtype import pytype_to_dtype
+from mindspore.common.dynamic_shape._auto_dynamic import is_auto_dynamic, convert_new_shapes
+from mindspore.common.dtype import _pytype_to_dtype
 from mindspore.common.api import _cell_graph_executor, _is_args_fullmode, ARG_SPECIFIED
 from mindspore.common._utils import is_shape_unknown
 from mindspore.dataset.core import config as dataset_config
@@ -34,7 +34,7 @@ from mindspore.parallel._utils import _get_device_num, _get_global_rank, _need_t
     _origin_shapes, _dynamic_shape_for_dataset
 from mindspore.parallel._ps_context import _is_role_sched
 from mindspore.ops import operations as P
-from mindspore.common.auto_dynamic_shape import _auto_dynamic_shape
+from mindspore.common.dynamic_shape.auto_dynamic_shape import _auto_dynamic_shape
 
 
 def _send_data(dataset, epoch_num):
@@ -275,7 +275,7 @@ def connect_network_with_dataset(network, dataset_helper):
         # Need to do full_batch for shapes which also do in the _DatasetIterMSLoopSink
         if _need_to_full():
             dataset_shapes = _to_full_shapes(dataset_shapes, _get_device_num() // _get_pipeline_stages())
-        dataset_types = [pytype_to_dtype(x) for x in dataset_types]
+        dataset_types = [_pytype_to_dtype(x) for x in dataset_types]  # pylint:disable=protected-access
         if not is_dynamic:
             dataset_shapes = _auto_dynamic_shape.auto_dynamic_generate_compile_args(dataset_shapes, True)
         key = str(dataset_types) + str(dataset_shapes)
@@ -564,6 +564,15 @@ class _DatasetIter:
             self.sink_size = dataset.__loop_size__
         create_data_info_queue = (
             sink_size == 1 and self.sink_count == 1 and dataset.get_dataset_size() != 1)
+
+        # Don't enable dynamic shape(multi-subgraph) feature in pp/data_broadcast mode,
+        # otherwise get_data_info will stuck since some rank do not consume data.
+        use_pipeline_parallel = (context.get_auto_parallel_context("pipeline_stages") > 1)
+        data_broadcast = enable_data_broadcast()
+
+        if use_pipeline_parallel or data_broadcast:
+            create_data_info_queue = False
+
         dataset.__transfer_dataset__ = _exec_datagraph(dataset, self.sink_size,
                                                        create_data_info_queue=create_data_info_queue)
 
mindspore/train/model.py CHANGED
@@ -57,7 +57,7 @@ from mindspore.dataset.engine.datasets import _set_training_dataset, _reset_trai
 from mindspore.train import amp
 from mindspore._c_expression import _framework_profiler_step_start, _framework_profiler_step_end
 from mindspore._c_expression import _get_optimzer_timestamps
-from mindspore._c_expression import clean_tdt_channel
+from mindspore._c_expression import clean_tdt_channel, _clean_rootinfo
 
 from mindspore.parallel._utils import _init_auto_parallel_context, _clear_auto_parallel_context
 from .serialization import load_param_into_net
@@ -154,6 +154,9 @@ def _handle_exception_info(obj, uce_env, tft, e):
             tft.tft_report_error(tft.ReportState.RS_UNKNOWN.value)
             raise e
         tft.tft_report_error(tft.ReportState.RS_UCE.value)
+    elif "HCCEError" in e_str:
+        logger.warning("uce wrapper caught HCCEError")
+        tft.tft_report_error(tft.ReportState.RS_HCCL_FAILED.value)
     elif "ForceStopError" in e_str:
         logger.warning("uce wrapper caught RuntimeError ForceStopError")
         force_stop_err = tft.ReportState.RS_NORMAL.value
@@ -246,9 +249,8 @@ def _handle_tft(func):
            if isinstance(item, TrainFaultTolerance):
                obj = item
        if obj:
-            tft = obj.tft
            tft_env = os.getenv("MS_ENABLE_TFT", "")
-            uce_env = "UCE:1" in tft_env or "ARF:1" in tft_env
+            uce_env = "UCE:1" in tft_env or "ARF:1" in tft_env or "HCCE:1" in tft_env
            tre_env = "TRE:1" in tft_env
            while True:
                try:
@@ -260,11 +262,11 @@
                        _update_ckpt_callback_info(repair_step, **kwargs)
                        logger.warning(f'Resume training after TREError from step {repair_step}.')
                    else:
-                        _handle_exception_info(obj, uce_env, tft, e)
-                        ret = tft.tft_wait_next_action()
-                        if ret == tft.Action.EXIT.value:
+                        _handle_exception_info(obj, uce_env, obj.tft, e)
+                        ret = obj.tft.tft_wait_next_action()
+                        if ret == obj.tft.Action.EXIT.value:
                            raise e
-                        repair_step = tft.tft_get_repair_step()
+                        repair_step = obj.tft.tft_get_repair_step()
                    logger.warning(
                        "uce wrapper caught repair finish REPAIR STEP: {} batch_num:{}".format(repair_step,
                                                                                               self.batch_num))
@@ -274,6 +276,7 @@ def _handle_tft(func):
                    cb_initial_step = _calc_cb_initial_step(initial_epoch, initial_step, *args, **kwargs)
                    if not self.enable_tre:
                        kwargs["initial_step"] = cb_initial_step
+                        self._initial_step = 0
                    # reset all accu grads to zero
                    obj._reset_acc_grads()
                    logger.warning(
@@ -281,9 +284,9 @@
                            cb_initial_step))
                    continue
                except BaseException as e:
-                    if tft:
+                    if obj.tft:
                        logger.error("uce wrapper caught BaseException error, enter MindIO TTP process.", exc_info=True)
-                        tft.tft_report_error(tft.ReportState.RS_UNKNOWN.value)
+                        obj.tft.tft_report_error(obj.tft.ReportState.RS_UNKNOWN.value)
                    raise e
        else:
            return func(self, *args, **kwargs)
@@ -300,9 +303,6 @@ def _check_tft():
     ascend_target = MSContext.get_instance().get_ascend_soc_version()
     if ascend_target == 'ascend910':
         raise ValueError("TFT is not supported when using ascend910")
-    ms_mode = context.get_context("mode")
-    if ms_mode != mindspore.GRAPH_MODE:
-        raise ValueError("TFT is only supported in GRAPH_MODE")
     jit_level = context.get_context("jit_level")
     if jit_level == "O2" and ("UCE:1" in tft_env or "ARF:1" in tft_env):
         raise ValueError("TFT is not supported when using jit_level == O2")
@@ -443,6 +443,11 @@ def _set_with_processed_inputs(network, inputs):
             "Reset inputs from a process inputs, should be a list/tuple or a dict, but got %s!" % str(inputs))
 
 
+def _check_tft_reset_dataset():
+    env_tft = os.getenv("MS_ENABLE_TFT", "")
+    return any([v in env_tft for v in ["TRE:1", "UCE:1", "HCCE:1", "ARF:1"]])
+
+
 class Model:
     """
     High-Level API for training or inference.
@@ -561,7 +566,9 @@
         self._mindspore_lite_model_group_id = id(self) & 0xFFFF
         self.batch_num = -1
         self.enable_tre = "TRE:1" in os.getenv("MS_ENABLE_TFT", "")
+        self.enable_hcce = "HCCE:1" in os.getenv("MS_ENABLE_TFT", "")
         self._initial_step = None
+        self._need_reset_data = _check_tft_reset_dataset()
         _clear_auto_parallel_context(self._network)
 
     def _check_for_graph_cell(self, kwargs):
@@ -761,7 +768,7 @@
         logger.info("Begin to connect network with dataset.")
         network = connect_network_with_dataset(network, dataset_helper)
 
-        if (_get_recovery_context("enable_recovery") or self.enable_tre) and is_train:
+        if (_get_recovery_context("enable_recovery") or self._need_reset_data) and is_train:
             _set_training_dataset(dataset_helper)
 
         network.set_train(is_train)
@@ -805,7 +812,7 @@
         """
         if os.environ.get("MS_ENABLE_CKPT_D2H_ASYNC") != "1":
             return
-        if (context.get_context("mode") == context.GRAPH_MODE) and (context.get_context("device_target") == "Ascend"):
+        if context.get_context("device_target") == "Ascend":
            cb_params.need_ckpt, cb_params.save_checkpoint_steps, \
                cb_params.last_triggered_step = self._check_need_ckpt(cb_params.list_callback)
            logger.info(f"need_ckpt:{cb_params.need_ckpt},"
@@ -873,8 +880,8 @@
            sink_size (int): Control the amount of data in each sink. Default: -1.
            epoch (int): Total number of iterations on the data. Default: 1.
        """
-        if context.get_context("mode") != context.GRAPH_MODE or context.get_context("device_target") != "Ascend":
-            raise RuntimeError('Pre-init process only supports GRAPH MODE and Ascend target currently.')
+        if context.get_context("device_target") != "Ascend":
+            raise RuntimeError('Pre-init process only supports Ascend target currently.')
 
        if not train_dataset and not valid_dataset:
            raise ValueError("The argument 'train_dataset' and 'valid_dataset' can not both be None or empty.")
@@ -1057,7 +1064,7 @@
            initial_epoch (int): Epoch at which to start train, it used for resuming a previous training run.
                Default: 0.
        """
-        is_graph = (context.get_context("mode") == context.GRAPH_MODE)
+        is_graph = context.get_context("mode") == context.GRAPH_MODE
        dataset_size = train_dataset.get_dataset_size()
        if dataset_size % sink_size != 0:
            logger.info("In dataset_sink mode (dataset_size % sink_size) should equal to 0, "
@@ -1126,6 +1133,7 @@
                if cb_params.is_arf:
                    cb_params.is_arf = False
                    _set_recovery_context(is_arf=False)
+                    _clean_rootinfo()
 
                # Embedding cache server only run one step.
                if is_embedding_cache_server:
@@ -1204,8 +1212,6 @@
        if not enable_recovery:
            self.enable_recovery = False
        else:
-            if context.get_context("mode") != context.GRAPH_MODE:
-                raise RuntimeError("Recovery for training only support graph mode currently.")
            self.enable_recovery = enable_recovery and _is_role_worker()
 
    def _check_need_load_ckpt(self, cb_params, dataset_size, sink_size=-1):
@@ -1340,6 +1346,7 @@
                if cb_params.is_arf:
                    cb_params.is_arf = False
                    _set_recovery_context(is_arf=False)
+                    _clean_rootinfo()
                # Embedding cache server only run one step.
                if is_embedding_cache_server:
                    break
@@ -2182,9 +2189,6 @@
            dataset_sink_mode (bool): Determines whether to pass the data through dataset channel.
            sink_size (int): Control the amount of data in each sink.
        """
-        if context.get_context("mode") != context.GRAPH_MODE:
-            raise RuntimeError("Pre-compile process that generate parameter layout for the train network "
-                               "only supports GRAPH MODE and Ascend target currently.")
        if _get_parallel_mode() not in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
            raise RuntimeError("'infer_train_layout' only supports 'semi_auto_parallel' and 'auto_parallel' "
                               "mode, but got {}.".format(_get_parallel_mode()))
@@ -2303,6 +2307,7 @@
 
        Examples:
            >>> import numpy as np
+            >>> import mindspore as ms
            >>> import mindspore.nn as nn
            >>> from mindspore import Tensor
            >>> from mindspore.train import Model
@@ -2312,28 +2317,28 @@
            >>> from mindspore.parallel.auto_parallel import AutoParallel
            >>>
            >>> class Net(nn.Cell):
-            >>>     def __init__(self):
-            >>>         super(Net, self).__init__()
-            >>>         self.fc1 = nn.Dense(128, 768, activation='relu')
-            >>>         self.fc2 = nn.Dense(128, 768, activation='relu')
-            >>>         self.fc3 = nn.Dense(128, 768, activation='relu')
-            >>>         self.fc4 = nn.Dense(768, 768, activation='relu')
-            >>>         self.relu4 = nn.ReLU()
-            >>>         self.relu5 = nn.ReLU()
-            >>>         self.transpose = P.Transpose()
-            >>>         self.matmul1 = P.MatMul()
-            >>>         self.matmul2 = P.MatMul()
-            >>>
-            >>>     def construct(self, x):
-            >>>         q = self.fc1(x)
-            >>>         k = self.fc2(x)
-            >>>         v = self.fc3(x)
-            >>>         k = self.transpose(k, (1, 0))
-            >>>         c = self.relu4(self.matmul1(q, k))
-            >>>         s = self.relu5(self.matmul2(c, v))
-            >>>         s = self.fc4(s)
-            >>>         return s
-            >>>
+            ...     def __init__(self):
+            ...         super(Net, self).__init__()
+            ...         self.fc1 = nn.Dense(128, 768, activation='relu')
+            ...         self.fc2 = nn.Dense(128, 768, activation='relu')
+            ...         self.fc3 = nn.Dense(128, 768, activation='relu')
+            ...         self.fc4 = nn.Dense(768, 768, activation='relu')
+            ...         self.relu4 = nn.ReLU()
+            ...         self.relu5 = nn.ReLU()
+            ...         self.transpose = P.Transpose()
+            ...         self.matmul1 = P.MatMul()
+            ...         self.matmul2 = P.MatMul()
+            ...
+            ...     def construct(self, x):
+            ...         q = self.fc1(x)
+            ...         k = self.fc2(x)
+            ...         v = self.fc3(x)
+            ...         k = self.transpose(k, (1, 0))
+            ...         c = self.relu4(self.matmul1(q, k))
+            ...         s = self.relu5(self.matmul2(c, v))
+            ...         s = self.fc4(s)
+            ...         return s
+            ...
            >>> ms.set_context(mode=ms.GRAPH_MODE)
            >>> init()
            >>> inputs = Tensor(np.ones([32, 128]).astype(np.float32))
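
The docstring hunk above only changes the example prompts: in doctest-style examples, ">>>" starts a new statement and "..." marks continuation lines of a compound statement, for instance:

    >>> def add(a, b):
    ...     return a + b
    >>> add(1, 2)
    3
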
@@ -2343,9 +2348,6 @@
            >>> predict_map = model.infer_predict_layout(inputs)
        """
        _init_auto_parallel_context(self._network)
-        if context.get_context("mode") != context.GRAPH_MODE:
-            raise RuntimeError("Pre-compile process that generate parameter layout for the predict network "
-                               "only supports GRAPH MODE and Ascend target currently.")
        if _get_parallel_mode() not in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
            raise RuntimeError('Infer predict layout only supports semi auto parallel and auto parallel mode.')
        _parallel_predict_check()