mindspore-2.6.0-cp311-cp311-win_amd64.whl → mindspore-2.7.0rc1-cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mindspore might be problematic.

Files changed (403)
  1. mindspore/.commit_id +1 -1
  2. mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
  3. mindspore/Newtonsoft.Json.dll +0 -0
  4. mindspore/__init__.py +1 -1
  5. mindspore/_c_dataengine.cp311-win_amd64.pyd +0 -0
  6. mindspore/_c_expression.cp311-win_amd64.pyd +0 -0
  7. mindspore/_c_mindrecord.cp311-win_amd64.pyd +0 -0
  8. mindspore/_checkparam.py +40 -9
  9. mindspore/{_deprecated → _extends/optimize}/__init__.py +9 -3
  10. mindspore/_extends/optimize/cell_utils.py +96 -0
  11. mindspore/_extends/parse/__init__.py +2 -2
  12. mindspore/_extends/parse/compile_config.py +44 -22
  13. mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +1 -1
  14. mindspore/_extends/parse/parser.py +36 -61
  15. mindspore/_extends/parse/resources.py +39 -0
  16. mindspore/_extends/parse/standard_method.py +32 -13
  17. mindspore/_extends/parse/trope.py +8 -1
  18. mindspore/_extends/pijit/__init__.py +1 -2
  19. mindspore/amp.py +4 -4
  20. mindspore/atlprov.dll +0 -0
  21. mindspore/avcodec-59.dll +0 -0
  22. mindspore/avdevice-59.dll +0 -0
  23. mindspore/avfilter-8.dll +0 -0
  24. mindspore/avformat-59.dll +0 -0
  25. mindspore/avutil-57.dll +0 -0
  26. mindspore/boost/adasum.py +1 -1
  27. mindspore/boost/boost_cell_wrapper.py +4 -4
  28. mindspore/c1.dll +0 -0
  29. mindspore/c1xx.dll +0 -0
  30. mindspore/c2.dll +0 -0
  31. mindspore/common/__init__.py +27 -2
  32. mindspore/common/_grad_function.py +2 -1
  33. mindspore/common/_pijit_context.py +28 -7
  34. mindspore/common/_stub_tensor.py +1 -209
  35. mindspore/common/_tensor_cpp_method.py +1 -1
  36. mindspore/common/_tensor_docs.py +76 -15
  37. mindspore/common/api.py +193 -112
  38. mindspore/common/dtype.py +21 -11
  39. mindspore/common/dump.py +10 -15
  40. mindspore/common/generator.py +2 -3
  41. mindspore/common/hook_handle.py +11 -2
  42. mindspore/common/jit_config.py +1 -1
  43. mindspore/common/jit_trace.py +84 -105
  44. mindspore/common/parameter.py +26 -12
  45. mindspore/common/recompute.py +3 -3
  46. mindspore/common/sparse_tensor.py +0 -3
  47. mindspore/common/symbol.py +0 -1
  48. mindspore/common/tensor.py +48 -83
  49. mindspore/communication/_comm_helper.py +46 -4
  50. mindspore/communication/management.py +79 -7
  51. mindspore/context.py +38 -23
  52. mindspore/dataset/core/config.py +3 -3
  53. mindspore/dataset/engine/datasets.py +20 -7
  54. mindspore/dataset/engine/datasets_user_defined.py +32 -2
  55. mindspore/dataset/engine/iterators.py +2 -2
  56. mindspore/dataset/engine/obs/config_loader.py +2 -2
  57. mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +8 -0
  58. mindspore/dataset/transforms/py_transforms.py +7 -3
  59. mindspore/dataset/transforms/transforms.py +7 -3
  60. mindspore/dataset/vision/validators.py +1 -0
  61. mindspore/device_context/ascend/device.py +1 -1
  62. mindspore/device_context/gpu/__init__.py +2 -2
  63. mindspore/device_context/gpu/device.py +1 -1
  64. mindspore/device_context/gpu/op_precision.py +4 -2
  65. mindspore/device_context/gpu/op_tuning.py +6 -3
  66. mindspore/device_manager.py +16 -9
  67. mindspore/dnnl.dll +0 -0
  68. mindspore/dpcmi.dll +0 -0
  69. mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +3 -5
  70. mindspore/experimental/llm_boost/atb/boost_base.py +2 -3
  71. mindspore/experimental/optim/adadelta.py +13 -20
  72. mindspore/experimental/optim/adagrad.py +15 -22
  73. mindspore/experimental/optim/adam.py +17 -24
  74. mindspore/experimental/optim/adamax.py +14 -22
  75. mindspore/experimental/optim/adamw.py +28 -34
  76. mindspore/experimental/optim/asgd.py +15 -25
  77. mindspore/experimental/optim/lr_scheduler.py +27 -45
  78. mindspore/experimental/optim/nadam.py +14 -24
  79. mindspore/experimental/optim/optimizer.py +13 -23
  80. mindspore/experimental/optim/radam.py +18 -24
  81. mindspore/experimental/optim/rmsprop.py +14 -25
  82. mindspore/experimental/optim/rprop.py +15 -26
  83. mindspore/experimental/optim/sgd.py +9 -19
  84. mindspore/hal/__init__.py +4 -4
  85. mindspore/hal/contiguous_tensors_handle.py +2 -2
  86. mindspore/hal/memory.py +1 -0
  87. mindspore/include/api/cell.h +37 -1
  88. mindspore/include/api/delegate.h +10 -0
  89. mindspore/include/api/model.h +3 -0
  90. mindspore/include/api/types.h +2 -2
  91. mindspore/include/c_api/model_c.h +0 -58
  92. mindspore/include/c_api/tensor_c.h +0 -26
  93. mindspore/include/dataset/vision_ascend.h +1 -1
  94. mindspore/jpeg62.dll +0 -0
  95. mindspore/mindrecord/tools/cifar10.py +60 -11
  96. mindspore/mindrecord/tools/cifar10_to_mr.py +5 -0
  97. mindspore/mindspore_backend_common.dll +0 -0
  98. mindspore/mindspore_backend_manager.dll +0 -0
  99. mindspore/mindspore_common.dll +0 -0
  100. mindspore/mindspore_core.dll +0 -0
  101. mindspore/mindspore_cpu_res_manager.dll +0 -0
  102. mindspore/mindspore_dump.dll +0 -0
  103. mindspore/mindspore_frontend.dll +0 -0
  104. mindspore/mindspore_glog.dll +0 -0
  105. mindspore/mindspore_memory_pool.dll +0 -0
  106. mindspore/mindspore_ms_backend.dll +0 -0
  107. mindspore/mindspore_ops.dll +0 -0
  108. mindspore/mindspore_ops_host.dll +0 -0
  109. mindspore/mindspore_ops_kernel_common.dll +0 -0
  110. mindspore/mindspore_profiler.dll +0 -0
  111. mindspore/mindspore_pyboost.dll +0 -0
  112. mindspore/mindspore_pynative.dll +0 -0
  113. mindspore/mindspore_res_manager.dll +0 -0
  114. mindspore/mindspore_runtime_pipeline.dll +0 -0
  115. mindspore/mint/__init__.py +4 -44
  116. mindspore/mint/distributed/__init__.py +1 -0
  117. mindspore/mint/distributed/distributed.py +208 -5
  118. mindspore/mint/nn/__init__.py +1 -1
  119. mindspore/mint/nn/functional.py +53 -6
  120. mindspore/mint/nn/layer/_functions.py +164 -294
  121. mindspore/mint/nn/layer/activation.py +8 -6
  122. mindspore/mint/nn/layer/conv.py +122 -98
  123. mindspore/mint/nn/layer/normalization.py +8 -22
  124. mindspore/mint/optim/adam.py +19 -18
  125. mindspore/mint/optim/adamw.py +14 -8
  126. mindspore/mint/optim/sgd.py +5 -5
  127. mindspore/msobj140.dll +0 -0
  128. mindspore/mspdb140.dll +0 -0
  129. mindspore/mspdbcore.dll +0 -0
  130. mindspore/mspdbst.dll +0 -0
  131. mindspore/mspft140.dll +0 -0
  132. mindspore/msvcdis140.dll +0 -0
  133. mindspore/msvcp140_1.dll +0 -0
  134. mindspore/msvcp140_2.dll +0 -0
  135. mindspore/msvcp140_atomic_wait.dll +0 -0
  136. mindspore/msvcp140_codecvt_ids.dll +0 -0
  137. mindspore/nn/cell.py +325 -499
  138. mindspore/nn/grad/cell_grad.py +11 -12
  139. mindspore/nn/layer/activation.py +32 -34
  140. mindspore/nn/layer/basic.py +67 -64
  141. mindspore/nn/layer/channel_shuffle.py +4 -4
  142. mindspore/nn/layer/combined.py +4 -2
  143. mindspore/nn/layer/conv.py +86 -85
  144. mindspore/nn/layer/dense.py +9 -7
  145. mindspore/nn/layer/embedding.py +50 -52
  146. mindspore/nn/layer/image.py +37 -39
  147. mindspore/nn/layer/math.py +111 -112
  148. mindspore/nn/layer/normalization.py +56 -44
  149. mindspore/nn/layer/pooling.py +58 -63
  150. mindspore/nn/layer/rnn_cells.py +33 -33
  151. mindspore/nn/layer/rnns.py +56 -56
  152. mindspore/nn/layer/thor_layer.py +74 -73
  153. mindspore/nn/layer/transformer.py +11 -1
  154. mindspore/nn/learning_rate_schedule.py +20 -20
  155. mindspore/nn/loss/loss.py +79 -81
  156. mindspore/nn/optim/adam.py +1 -1
  157. mindspore/nn/optim/adasum.py +2 -2
  158. mindspore/nn/optim/optimizer.py +1 -1
  159. mindspore/nn/optim/thor.py +2 -2
  160. mindspore/nn/probability/distribution/exponential.py +2 -1
  161. mindspore/nn/probability/distribution/poisson.py +2 -1
  162. mindspore/nn/sparse/sparse.py +3 -3
  163. mindspore/nn/wrap/cell_wrapper.py +34 -37
  164. mindspore/nn/wrap/grad_reducer.py +37 -37
  165. mindspore/nn/wrap/loss_scale.py +72 -74
  166. mindspore/numpy/array_creations.py +5 -5
  167. mindspore/numpy/fft.py +1 -1
  168. mindspore/numpy/math_ops.py +1 -1
  169. mindspore/opencv_core452.dll +0 -0
  170. mindspore/opencv_imgcodecs452.dll +0 -0
  171. mindspore/opencv_imgproc452.dll +0 -0
  172. mindspore/ops/_grad_experimental/grad_comm_ops.py +51 -13
  173. mindspore/ops/_grad_experimental/grad_debug_ops.py +14 -0
  174. mindspore/ops/_vmap/vmap_array_ops.py +6 -13
  175. mindspore/ops/_vmap/vmap_nn_ops.py +8 -16
  176. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +17 -8
  177. mindspore/ops/auto_generate/gen_extend_func.py +1 -51
  178. mindspore/ops/auto_generate/gen_ops_def.py +463 -257
  179. mindspore/ops/auto_generate/gen_ops_prim.py +1127 -885
  180. mindspore/ops/auto_generate/pyboost_inner_prim.py +31 -1
  181. mindspore/ops/composite/__init__.py +10 -0
  182. mindspore/ops/composite/base.py +8 -4
  183. mindspore/ops/composite/multitype_ops/__init__.py +12 -1
  184. mindspore/ops/composite/multitype_ops/_compile_utils.py +132 -108
  185. mindspore/ops/composite/multitype_ops/add_impl.py +70 -2
  186. mindspore/ops/composite/multitype_ops/div_impl.py +49 -0
  187. mindspore/ops/composite/multitype_ops/floordiv_impl.py +29 -0
  188. mindspore/ops/composite/multitype_ops/getitem_impl.py +11 -0
  189. mindspore/ops/composite/multitype_ops/mod_impl.py +5 -3
  190. mindspore/ops/composite/multitype_ops/mul_impl.py +49 -0
  191. mindspore/ops/composite/multitype_ops/setitem_impl.py +57 -0
  192. mindspore/ops/composite/multitype_ops/sub_impl.py +34 -0
  193. mindspore/ops/composite/multitype_ops/zeros_like_impl.py +14 -0
  194. mindspore/ops/function/__init__.py +3 -1
  195. mindspore/ops/function/_add_attr_func.py +11 -6
  196. mindspore/ops/function/array_func.py +7 -94
  197. mindspore/ops/function/debug_func.py +4 -3
  198. mindspore/ops/function/grad/grad_func.py +1 -1
  199. mindspore/ops/function/math_func.py +21 -367
  200. mindspore/ops/function/nn_func.py +26 -41
  201. mindspore/ops/function/other_func.py +4 -1
  202. mindspore/ops/function/random_func.py +31 -4
  203. mindspore/ops/functional.py +0 -2
  204. mindspore/ops/functional_overload.py +463 -6
  205. mindspore/ops/op_info_register.py +21 -0
  206. mindspore/ops/operations/__init__.py +5 -2
  207. mindspore/ops/operations/_custom_ops_utils.py +675 -8
  208. mindspore/ops/operations/_inner_ops.py +3 -6
  209. mindspore/ops/operations/_sequence_ops.py +1 -1
  210. mindspore/ops/operations/comm_ops.py +185 -26
  211. mindspore/ops/operations/custom_ops.py +235 -172
  212. mindspore/ops/operations/debug_ops.py +55 -4
  213. mindspore/ops/operations/image_ops.py +13 -13
  214. mindspore/ops/operations/manually_defined/ops_def.py +15 -16
  215. mindspore/ops/operations/math_ops.py +3 -4
  216. mindspore/ops/operations/nn_ops.py +5 -6
  217. mindspore/ops/primitive.py +6 -10
  218. mindspore/ops/tensor_method.py +36 -4
  219. mindspore/ops_generate/api/cpp_create_prim_instance_helper_generator.py +1 -1
  220. mindspore/ops_generate/api/functional_map_cpp_generator.py +10 -9
  221. mindspore/ops_generate/api/functions_cc_generator.py +58 -10
  222. mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +1 -1
  223. mindspore/ops_generate/common/base_generator.py +14 -0
  224. mindspore/ops_generate/common/gen_constants.py +7 -2
  225. mindspore/ops_generate/common/gen_utils.py +0 -19
  226. mindspore/ops_generate/common/op_proto.py +11 -4
  227. mindspore/ops_generate/common/template.py +88 -11
  228. mindspore/ops_generate/gen_ops.py +1 -1
  229. mindspore/ops_generate/op_def/lite_ops_cpp_generator.py +4 -4
  230. mindspore/ops_generate/op_def/ops_name_h_generator.py +0 -3
  231. mindspore/ops_generate/op_def/ops_primitive_h_generator.py +0 -4
  232. mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -2
  233. mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +49 -8
  234. mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +2 -2
  235. mindspore/ops_generate/pyboost/gen_pyboost_func.py +31 -0
  236. mindspore/ops_generate/pyboost/op_template_parser.py +98 -72
  237. mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +70 -273
  238. mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +14 -6
  239. mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +316 -0
  240. mindspore/ops_generate/pyboost/pyboost_functions_py_generator.py +1 -1
  241. mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +5 -3
  242. mindspore/ops_generate/pyboost/pyboost_inner_prim_generator.py +1 -1
  243. mindspore/ops_generate/pyboost/pyboost_internal_functions_cpp_generator.py +76 -0
  244. mindspore/ops_generate/pyboost/pyboost_internal_functions_h_generator.py +76 -0
  245. mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +125 -0
  246. mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +4 -3
  247. mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +348 -61
  248. mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +1 -1
  249. mindspore/ops_generate/pyboost/pyboost_utils.py +118 -9
  250. mindspore/ops_generate/tensor_py_cc_generator.py +1 -24
  251. mindspore/parallel/_auto_parallel_context.py +4 -2
  252. mindspore/parallel/_cell_wrapper.py +106 -40
  253. mindspore/parallel/_parallel_serialization.py +1 -1
  254. mindspore/parallel/_ps_context.py +4 -6
  255. mindspore/parallel/_tensor.py +167 -12
  256. mindspore/parallel/_transformer/moe.py +1 -1
  257. mindspore/parallel/_transformer/transformer.py +13 -8
  258. mindspore/parallel/auto_parallel.py +12 -5
  259. mindspore/parallel/checkpoint_convert.py +3 -3
  260. mindspore/parallel/checkpoint_transform.py +3 -1
  261. mindspore/parallel/cluster/process_entity/_api.py +84 -48
  262. mindspore/parallel/cluster/process_entity/_utils.py +95 -7
  263. mindspore/parallel/cluster/run.py +43 -4
  264. mindspore/parallel/function/__init__.py +8 -1
  265. mindspore/parallel/function/reshard_func.py +1 -1
  266. mindspore/parallel/nn/__init__.py +15 -2
  267. mindspore/parallel/nn/parallel_cell_wrapper.py +9 -10
  268. mindspore/parallel/nn/parallel_grad_reducer.py +7 -6
  269. mindspore/parallel/shard.py +2 -2
  270. mindspore/parallel/transform_safetensors.py +462 -174
  271. mindspore/pgodb140.dll +0 -0
  272. mindspore/pgort140.dll +0 -0
  273. mindspore/profiler/__init__.py +2 -1
  274. mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +7 -7
  275. mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +3 -0
  276. mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +3 -0
  277. mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +3 -3
  278. mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +3 -3
  279. mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +4 -4
  280. mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +3 -3
  281. mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +4 -1
  282. mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +2 -1
  283. mindspore/profiler/analysis/task_manager.py +1 -1
  284. mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +5 -1
  285. mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +2 -1
  286. mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +42 -22
  287. mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +3 -2
  288. mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +9 -5
  289. mindspore/profiler/analysis/viewer/ms_operator_details_viewer.py +132 -0
  290. mindspore/profiler/common/constant.py +16 -0
  291. mindspore/profiler/common/profiler_context.py +25 -27
  292. mindspore/profiler/common/profiler_info.py +0 -16
  293. mindspore/profiler/common/profiler_op_analyse.py +235 -0
  294. mindspore/profiler/common/profiler_output_path.py +23 -8
  295. mindspore/profiler/common/profiler_parameters.py +128 -35
  296. mindspore/profiler/dynamic_profile/__init__.py +0 -0
  297. mindspore/profiler/dynamic_profile/dynamic_monitor_proxy.py +39 -0
  298. mindspore/profiler/dynamic_profile/dynamic_profiler_config_context.py +666 -0
  299. mindspore/profiler/dynamic_profile/dynamic_profiler_utils.py +62 -0
  300. mindspore/profiler/dynamic_profiler.py +305 -314
  301. mindspore/profiler/envprofiler.py +12 -7
  302. mindspore/profiler/experimental_config.py +96 -6
  303. mindspore/profiler/mstx.py +33 -12
  304. mindspore/profiler/platform/__init__.py +2 -3
  305. mindspore/profiler/platform/npu_profiler.py +29 -19
  306. mindspore/profiler/profiler.py +35 -19
  307. mindspore/profiler/profiler_action_controller.py +64 -76
  308. mindspore/profiler/schedule.py +10 -4
  309. mindspore/rewrite/common/config.py +1 -0
  310. mindspore/rewrite/common/namer.py +1 -0
  311. mindspore/rewrite/common/namespace.py +1 -0
  312. mindspore/rewrite/node/node.py +31 -11
  313. mindspore/rewrite/parsers/assign_parser.py +1 -1
  314. mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
  315. mindspore/run_check/_check_version.py +7 -10
  316. mindspore/runtime/__init__.py +5 -5
  317. mindspore/runtime/event.py +10 -4
  318. mindspore/runtime/executor.py +60 -45
  319. mindspore/runtime/memory.py +21 -30
  320. mindspore/runtime/thread_bind_core.py +298 -164
  321. mindspore/safeguard/rewrite_obfuscation.py +12 -13
  322. mindspore/swresample-4.dll +0 -0
  323. mindspore/swscale-6.dll +0 -0
  324. mindspore/tbbmalloc.dll +0 -0
  325. mindspore/tinyxml2.dll +0 -0
  326. mindspore/train/_utils.py +6 -2
  327. mindspore/train/amp.py +43 -20
  328. mindspore/train/callback/__init__.py +5 -5
  329. mindspore/train/callback/_checkpoint.py +3 -6
  330. mindspore/train/callback/_flops_collector.py +1 -1
  331. mindspore/train/callback/_landscape.py +0 -1
  332. mindspore/train/callback/_train_fault_tolerance.py +71 -13
  333. mindspore/train/data_sink.py +11 -2
  334. mindspore/train/dataset_helper.py +9 -0
  335. mindspore/train/model.py +51 -33
  336. mindspore/train/serialization.py +133 -111
  337. mindspore/train/summary/summary_record.py +13 -2
  338. mindspore/turbojpeg.dll +0 -0
  339. mindspore/utils/__init__.py +3 -2
  340. mindspore/utils/dryrun.py +0 -6
  341. mindspore/utils/runtime_execution_order_check.py +162 -78
  342. mindspore/utils/sdc_detect.py +68 -0
  343. mindspore/utils/utils.py +6 -9
  344. mindspore/vcmeta.dll +0 -0
  345. mindspore/vcruntime140.dll +0 -0
  346. mindspore/vcruntime140_1.dll +0 -0
  347. mindspore/version.py +1 -1
  348. {mindspore-2.6.0.dist-info → mindspore-2.7.0rc1.dist-info}/METADATA +5 -4
  349. {mindspore-2.6.0.dist-info → mindspore-2.7.0rc1.dist-info}/RECORD +352 -390
  350. mindspore/_deprecated/jit.py +0 -198
  351. mindspore/experimental/es/__init__.py +0 -22
  352. mindspore/experimental/es/embedding_service.py +0 -891
  353. mindspore/experimental/es/embedding_service_layer.py +0 -581
  354. mindspore/profiler/parser/__init__.py +0 -14
  355. mindspore/profiler/parser/aicpu_data_parser.py +0 -272
  356. mindspore/profiler/parser/ascend_analysis/__init__.py +0 -14
  357. mindspore/profiler/parser/ascend_analysis/constant.py +0 -71
  358. mindspore/profiler/parser/ascend_analysis/file_manager.py +0 -180
  359. mindspore/profiler/parser/ascend_analysis/function_event.py +0 -185
  360. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +0 -136
  361. mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +0 -131
  362. mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +0 -104
  363. mindspore/profiler/parser/ascend_analysis/path_manager.py +0 -313
  364. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +0 -123
  365. mindspore/profiler/parser/ascend_analysis/tlv_decoder.py +0 -86
  366. mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +0 -75
  367. mindspore/profiler/parser/ascend_cluster_generator.py +0 -116
  368. mindspore/profiler/parser/ascend_communicate_generator.py +0 -314
  369. mindspore/profiler/parser/ascend_flops_generator.py +0 -116
  370. mindspore/profiler/parser/ascend_fpbp_generator.py +0 -82
  371. mindspore/profiler/parser/ascend_hccl_generator.py +0 -271
  372. mindspore/profiler/parser/ascend_integrate_generator.py +0 -42
  373. mindspore/profiler/parser/ascend_memory_generator.py +0 -185
  374. mindspore/profiler/parser/ascend_msprof_exporter.py +0 -282
  375. mindspore/profiler/parser/ascend_msprof_generator.py +0 -187
  376. mindspore/profiler/parser/ascend_op_generator.py +0 -334
  377. mindspore/profiler/parser/ascend_steptrace_generator.py +0 -94
  378. mindspore/profiler/parser/ascend_timeline_generator.py +0 -545
  379. mindspore/profiler/parser/base_timeline_generator.py +0 -483
  380. mindspore/profiler/parser/container.py +0 -229
  381. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +0 -697
  382. mindspore/profiler/parser/flops_parser.py +0 -531
  383. mindspore/profiler/parser/framework_enum.py +0 -111
  384. mindspore/profiler/parser/framework_parser.py +0 -464
  385. mindspore/profiler/parser/framework_struct.py +0 -61
  386. mindspore/profiler/parser/gpu_analysis/__init__.py +0 -14
  387. mindspore/profiler/parser/gpu_analysis/function_event.py +0 -44
  388. mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +0 -89
  389. mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +0 -72
  390. mindspore/profiler/parser/hccl_parser.py +0 -573
  391. mindspore/profiler/parser/hwts_log_parser.py +0 -122
  392. mindspore/profiler/parser/integrator.py +0 -526
  393. mindspore/profiler/parser/memory_usage_parser.py +0 -277
  394. mindspore/profiler/parser/minddata_analyzer.py +0 -800
  395. mindspore/profiler/parser/minddata_parser.py +0 -186
  396. mindspore/profiler/parser/minddata_pipeline_parser.py +0 -299
  397. mindspore/profiler/parser/op_intermediate_parser.py +0 -149
  398. mindspore/profiler/parser/optime_parser.py +0 -250
  399. mindspore/profiler/parser/profiler_info.py +0 -213
  400. mindspore/profiler/parser/step_trace_parser.py +0 -666
  401. {mindspore-2.6.0.dist-info → mindspore-2.7.0rc1.dist-info}/WHEEL +0 -0
  402. {mindspore-2.6.0.dist-info → mindspore-2.7.0rc1.dist-info}/entry_points.txt +0 -0
  403. {mindspore-2.6.0.dist-info → mindspore-2.7.0rc1.dist-info}/top_level.txt +0 -0
mindspore/safeguard/rewrite_obfuscation.py CHANGED
@@ -81,7 +81,7 @@ def _transform_target_modules(target_modules):
     obfuscate_layers = target_modules[2].split(':')
     if obfuscate_layers[1] != 'all':
         max_layers = int(obfuscate_layers[1])
-        layers = [i for i in range(0, max_layers)]
+        layers = list(range(0, max_layers))
     path_new = path.replace("blocks", "blocks/${layer}")
     network_obf_template['insert_ops'][0]['input_y'] = "obf_metadata_${layer}"
     weight_obf_template['weight_obf_ops'][0]['input_y'] = "obf_metadata_${layer}"
@@ -95,8 +95,8 @@ def _transform_target_modules(target_modules):
     obf_config['obf_metadata_config'].append(obf_medatadata)

     for name in target_list:
-        target_weight = path_new + '/' + name + '/weight'
-        target_bias = path_new + '/' + name + '/bias'
+        target_weight = '/'.join([path_new, name, 'weight'])
+        target_bias = '/'.join([path_new, name, 'bias'])
         weight_obf = weight_obf_template.copy()
         weight_obf['target'] = target_weight
         bias_obf = weight_obf_template.copy()
@@ -185,7 +185,7 @@ def obfuscate_ckpt(network, ckpt_files, target_modules=None, obf_config=None, sa
     def _gen_obf_metadata(config):
         name = config.get('name')
         if name is None:
-            return False
+            return
         save_metadata = config.get('save_metadata', False)
         metadata_op_name = config.get('metadata_op')
         layers = config.get('layers')
@@ -213,7 +213,6 @@ def obfuscate_ckpt(network, ckpt_files, target_modules=None, obf_config=None, sa
                 saved_obf_tensor = metadata_op(saved_obf_tensor)
             if saved_obf_tensor is not None:
                 saved_metadata[obf_name] = saved_obf_tensor.asnumpy()
-        return True

     if not isinstance(network, nn.Cell):
         raise TypeError("network must be nn.Cell, but got {}.".format(type(network)))
@@ -283,13 +282,13 @@ def _obfuscate_single_ckpt(ckpt_name, obf_metadata, obf_config, saved_path):
 def _obfuscate_param(param, obf_metadata, obf_ops, layer=0):
     param_dtype = F.dtype(param)
     obf_param = param
-    for i in range(len(obf_ops)):
-        op_name = obf_ops[i].get('name')
+    for obf_op in obf_ops:
+        op_name = obf_op.get('name')
         if not isinstance(op_name, str):
             raise TypeError('{} should be str type, but got {}'.format(op_name, type(op_name)))
         if op_name == 'mul':
             input_x = obf_param
-            input_y_name = _get_op_input_name(obf_ops[i], 'input_y', layer)
+            input_y_name = _get_op_input_name(obf_op, 'input_y', layer)
             input_y = obf_metadata.get(input_y_name)
             if input_x is None or input_y is None:
                 log.error("input_x or input_y is None")
@@ -297,22 +296,22 @@ def _obfuscate_single_ckpt(ckpt_name, obf_metadata, obf_config, saved_path):
             input_y = F.cast(input_y, param_dtype)
             obf_param = ops.mul(input_x, input_y)
         elif op_name == 'permuate':
-            input_x_name = _get_op_input_name(obf_ops[i], 'input_x', layer)
+            input_x_name = _get_op_input_name(obf_op, 'input_x', layer)
             p = obf_metadata.get(input_x_name, None)
             if p is None or obf_param is None:
                 log.error("input_x or param is None")
                 return None
             obf_param = obf_param[p]
         elif op_name == 'matmul':
-            input_x_name = _get_op_input_name(obf_ops[i], 'input_x', layer)
-            input_y_name = _get_op_input_name(obf_ops[i], 'input_y', layer)
+            input_x_name = _get_op_input_name(obf_op, 'input_x', layer)
+            input_y_name = _get_op_input_name(obf_op, 'input_y', layer)
             input_x = _get_op_input(input_x_name, obf_param)
             input_y = _get_op_input(input_y_name, obf_param)
             if input_x is None or input_y is None:
                 log.error("the input_x or input_y of op: {} is None.".format(op_name))
                 return None
-            input_x = ops.transpose(input_x, (1, 0)) if obf_ops[i].get('transpose_a', False) else input_x
-            input_y = ops.transpose(input_y, (1, 0)) if obf_ops[i].get('transpose_b', False) else input_y
+            input_x = ops.transpose(input_x, (1, 0)) if obf_op.get('transpose_a', False) else input_x
+            input_y = ops.transpose(input_y, (1, 0)) if obf_op.get('transpose_b', False) else input_y
             obf_param = ops.matmul(F.cast(input_x, param_dtype), F.cast(input_y, param_dtype))
         else:
             log.error("unsupported op, op must be matmul or permuate or mul, but got {}."
mindspore/swscale-6.dll CHANGED (binary file)
mindspore/tbbmalloc.dll CHANGED (binary file)
mindspore/tinyxml2.dll CHANGED (binary file)
mindspore/train/_utils.py CHANGED
@@ -582,7 +582,8 @@ def _progress_bar(iterable, total=None):
         print_progress_bar(i)


-def _load_and_transform(path, name_map, load_func, transform_func):
+def _load_and_transform(path, name_map, load_func, transform_func=None):
+    """use load_func to load and use transform_func to convert"""
     if load_func is not None:
         param_dict = load_func(path)
     else:
@@ -590,5 +591,8 @@ def _load_and_transform(path, name_map, load_func, transform_func):
     transform_dict = {}
     for k, v in param_dict.items():
         new_name = name_map.get(k, k) if name_map is not None else k
-        transform_dict[new_name] = transform_func(v, new_name)
+        if transform_func is not None:
+            transform_dict[new_name] = transform_func(v, new_name)
+        else:
+            transform_dict[new_name] = v
     return transform_dict
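With `transform_func` now optional, `_load_and_transform` degrades to a plain load-and-rename. A self-contained sketch reconstructed from the hunks above (the loader lambda is a stand-in, and the `load_func is None` branch is elided in the diff so it is omitted here):

    def _load_and_transform(path, name_map, load_func, transform_func=None):
        """use load_func to load and use transform_func to convert"""
        param_dict = load_func(path)
        transform_dict = {}
        for k, v in param_dict.items():
            new_name = name_map.get(k, k) if name_map is not None else k
            if transform_func is not None:
                transform_dict[new_name] = transform_func(v, new_name)
            else:
                transform_dict[new_name] = v
        return transform_dict

    # Stand-in loader; real callers pass a checkpoint-loading function.
    params = _load_and_transform('net.ckpt', {'w': 'weight'},
                                 lambda path: {'w': 1.0, 'b': 0.0})
    print(params)  # {'weight': 1.0, 'b': 0.0} -- values pass through unchanged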
mindspore/train/amp.py CHANGED
@@ -69,6 +69,9 @@ AMP_BLACK_LIST = [
 AMP_AUTO_WHITE_LIST = [
     P.Conv2D,
     P.Conv3D,
+    gen.Conv2DExt,
+    gen.Conv3DExt,
+    gen.ConvTranspose2D,
     P.Conv2DTranspose,
     P.Conv3DTranspose,
     gen.Convolution,
@@ -80,6 +83,10 @@ AMP_AUTO_WHITE_LIST = [
     P.Einsum,
     gen.Dense,
     gen.Addmm,
+    gen.Addbmm,
+    gen.Addmv,
+    gen.Baddbmm,
+    gen.Mv,
 ]

 AMP_AUTO_BLACK_LIST = [
@@ -90,8 +97,10 @@ AMP_AUTO_BLACK_LIST = [
     P.Erfinv,
     P.Exp,
     P.Expm1,
-    P.Log,
-    P.Log1p,
+    gen.Log,
+    gen.Log10,
+    gen.Log1p,
+    gen.Log2,
     P.Reciprocal,
     P.Rsqrt,
     P.Sinh,
@@ -103,6 +112,7 @@ AMP_AUTO_BLACK_LIST = [
     P.BatchNorm,
     gen.BatchNormExt,
     gen.GroupNorm,
+    gen.Norm,
     P.KLDivLoss,
     P.SmoothL1Loss,
     P.MultilabelMarginLoss,
@@ -113,7 +123,19 @@ AMP_AUTO_BLACK_LIST = [
     P.Pdist,
     P.Cdist,
     P.Renorm,
+    gen.ReduceProd,
+    gen.Softmax,
+    gen.LogSoftmax,
+    gen.LogSoftmaxExt,
+    gen.CumProd,
+    gen.CumSum,
+    gen.CumsumExt,
+    gen.ProdExt,
+    gen.SumExt,
+    gen.L1LossExt,
     gen.MSELossExt,
+    gen.NLLLoss,
+    gen.NLLLoss2d,
 ]

 # Indicates which inputs of primitives need to be converted
@@ -358,7 +380,7 @@ def _auto_black_list(network, black_list, dtype):
     return network


-class amp_decorator:
+class AmpDecorator:
     """
     Auto mixed precision decorator.
     Type of lists: List[Tuple[str, List[int]]]
@@ -384,7 +406,7 @@ def _set_amp_decorator(obj, amp_level, amp_dtype, white_list, black_list):
     if inspect.isfunction(obj) or inspect.ismethod(obj):
         @functools.wraps(obj)
         def wrapper(*args, **kwargs):
-            with amp_decorator(amp_level, amp_dtype, white_list, black_list):
+            with AmpDecorator(amp_level, amp_dtype, white_list, black_list):
                 return obj(*args, **kwargs)
         return wrapper
     if isinstance(obj, nn.Cell):
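The rename from `amp_decorator` to `AmpDecorator` brings the class in line with PEP 8 naming; as the second hunk shows, it is used as a context manager inside a `functools.wraps` wrapper. A stripped-down sketch of that pattern, with a stub class body standing in for MindSpore's implementation:

    import functools

    class AmpDecorator:
        """Stub: push/pop an amp strategy around a call (not the real body)."""
        def __init__(self, amp_level, amp_dtype, white_list, black_list):
            self.amp_level = amp_level
        def __enter__(self):
            print(f"push amp strategy {self.amp_level}")
        def __exit__(self, *exc):
            print("pop amp strategy")

    def _set_amp_decorator(obj, amp_level, amp_dtype, white_list, black_list):
        @functools.wraps(obj)
        def wrapper(*args, **kwargs):
            with AmpDecorator(amp_level, amp_dtype, white_list, black_list):
                return obj(*args, **kwargs)
        return wrapper

    forward = _set_amp_decorator(lambda x: x + 1, "O2", None, [], [])
    print(forward(1))  # runs inside the AmpDecorator context, prints 2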
@@ -423,17 +445,18 @@ def auto_mixed_precision(network, amp_level="O0", dtype=mstype.float16):

     Operators in `auto_whitelist` are:

-    ``Conv2D``, ``Conv3D``, ``Conv2DTranspose``, ``Conv3DTranspose``, ``Convolution``, ``MatMul``, ``MatMulExt``,
-    ``BatchMatMul``, ``BatchMatMulExt``, ``PReLU``, ``Einsum``, ``Dense``, ``Addmm``
+    ``Conv2D``, ``Conv2DExt``, ``Conv3D``, ``Conv3DExt``, ``Conv2DTranspose``, ``ConvTranspose2D``,
+    ``Conv3DTranspose``, ``Convolution``, ``MatMul``, ``MatMulExt``, ``BatchMatMul``, ``BatchMatMulExt``, ``PReLU``,
+    ``Einsum``, ``Dense``, ``Addmm``, ``Addbmm``, ``Addmv``, ``Baddbmm``, ``Mv``

     Operators in `auto_blacklist` are:

-    ``Pow``, ``ACos``, ``Asin``, ``Cosh``, ``Erfinv``, ``Exp``, ``Expm1``, ``Log``, ``Log1p``, ``Reciprocal``,
-    ``Rsqrt``, ``Sinh``, ``Tan``, ``Softplus``, ``SoftplusExt``, ``LayerNorm``, ``LayerNormExt``, ``BatchNorm``,
-    ``BatchNormExt``, ``GroupNorm``, ``KLDivLoss``, ``SmoothL1Loss``, ``MultilabelMarginLoss``, ``SoftMarginLoss``,
-    ``TripletMarginLoss``, ``MultiMarginLoss``, ``BCEWithLogitsLoss``, ``Pdist``, ``Cdist``, ``Renorm``,
-    ``ReduceProd``, ``Softmax``, ``LogSoftmax``, ``CumProd``, ``CumSum``, ``CumsumExt``, ``ProdExt``, ``SumExt``,
-    ``Norm``, ``MSELossExt``
+    ``Pow``, ``ACos``, ``Asin``, ``Cosh``, ``Erfinv``, ``Exp``, ``Expm1``, ``Log``, ``Log10``, ``Log1p``, ``Log2``,
+    ``Reciprocal``, ``Rsqrt``, ``Sinh``, ``Tan``, ``Softplus``, ``SoftplusExt``, ``LayerNorm``, ``LayerNormExt``,
+    ``BatchNorm``, ``BatchNormExt``, ``GroupNorm``, ``KLDivLoss``, ``SmoothL1Loss``, ``MultilabelMarginLoss``,
+    ``SoftMarginLoss``, ``TripletMarginLoss``, ``MultiMarginLoss``, ``BCEWithLogitsLoss``, ``Pdist``, ``Cdist``,
+    ``Renorm``, ``ReduceProd``, ``Softmax``, ``LogSoftmax``, ``LogSoftmaxExt``, ``CumProd``, ``CumSum``,
+    ``CumsumExt``, ``ProdExt``, ``SumExt``, ``Norm``, ``L1LossExt``, ``MSELossExt``, ``NLLLoss``, ``NLLLoss2d``

     Operators in `promote_list` are:

@@ -638,7 +661,7 @@ def _add_loss_network(network, loss_fn, cast_model_type):


 def _is_grad_accumulation(mcell):
-    if mcell.cls_name == "GradAccumulationCell" or mcell.cls_name == "GradAccumulation":
+    if mcell.cls_name in {"GradAccumulationCell", "GradAccumulation"}:
         return True
     for cell in mcell.cells():
         if _is_grad_accumulation(cell):
@@ -675,23 +698,23 @@ def build_train_network(network, optimizer, loss_fn=None, level='O0', boost_leve
     Build the mixed precision training cell automatically.

     Note:
-        - After using `custom_mixed_precision` or `auto_mixed_precision` for precision conversion, it is not supported
-          to perform the precision conversion again. If `build_train_network` is used to train a converted network,
-          `level` need to be configured to ``O0`` to avoid the duplicated accuracy conversion.
+        After using `custom_mixed_precision` or `auto_mixed_precision` for precision conversion, it is not supported
+        to perform the precision conversion again. If `build_train_network` is used to train a converted network,
+        `level` need to be configured to ``O0`` to avoid the duplicated accuracy conversion.

     Args:
         network (Cell): Definition of the network.
         optimizer (:class:`mindspore.nn.Optimizer`): Define the optimizer to update the Parameter.
-        loss_fn (Union[None, Cell]): Define the loss function. If None, the `network` should have the loss inside.
-            Default: ``None`` .
-        level (str): Supports ['O0', 'O1', 'O2', 'O3', 'auto']. Default: ``'O0'`` .
+        loss_fn (Union[None, Cell], optional): Define the loss function. If None,
+            the `network` should have the loss inside. Default: ``None`` .
+        level (str, optional): Supports ['O0', 'O1', 'O2', 'O3', 'auto']. Default: ``'O0'`` .

             For details on amp level, refer to :func:`mindspore.amp.auto_mixed_precision`.

             Property of `keep_batchnorm_fp32`, `cast_model_type` and `loss_scale_manager` determined by `level`
             setting may be overwritten by settings in `kwargs`.

-        boost_level (str): Option for argument `level` in `mindspore.boost` , level for boost mode
+        boost_level (str, optional): Option for argument `level` in `mindspore.boost` , level for boost mode
             training. Supports ['O0', 'O1', 'O2']. Default: ``'O0'`` .

             - 'O0': Do not change.
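Per the reworded Note above, a network converted once must not be converted again. A usage sketch of the documented call pattern (layer sizes, optimizer settings, and the loss function are arbitrary choices for illustration):

    import mindspore as ms
    from mindspore import amp, nn

    net = nn.Dense(16, 10)
    opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)

    # Convert precision once ...
    net = amp.auto_mixed_precision(net, amp_level="O2", dtype=ms.float16)
    # ... then keep level='O0' here so the conversion is not applied twice.
    train_net = amp.build_train_network(net, opt, loss_fn=nn.MSELoss(), level="O0")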
mindspore/train/callback/__init__.py CHANGED
@@ -15,6 +15,11 @@
 """Callback related classes and functions."""
 from __future__ import absolute_import

+__all__ = ["Callback", "LossMonitor", "TimeMonitor", "ModelCheckpoint", "FlopsUtilizationCollector",
+           "SummaryCollector", "CheckpointConfig", "RunContext", "LearningRateScheduler", "SummaryLandscape",
+           "History", "LambdaCallback", "ReduceLROnPlateau", "EarlyStopping", "OnRequestExit", "BackupAndRestore",
+           "TrainFaultTolerance"]
+
 from mindspore.train.callback._callback import Callback
 from mindspore.train.callback._callback import CallbackManager as _CallbackManager
 from mindspore.train.callback._callback import InternalCallbackParam as _InternalCallbackParam
@@ -37,8 +42,3 @@ from mindspore.train.callback._on_request_exit import OnRequestExit
 from mindspore.train.callback._backup_and_restore import BackupAndRestore
 from mindspore.train.callback._flops_collector import FlopsUtilizationCollector
 from mindspore.train.callback._train_fault_tolerance import TrainFaultTolerance
-
-__all__ = ["Callback", "LossMonitor", "TimeMonitor", "ModelCheckpoint", "FlopsUtilizationCollector",
-           "SummaryCollector", "CheckpointConfig", "RunContext", "LearningRateScheduler", "SummaryLandscape",
-           "History", "LambdaCallback", "ReduceLROnPlateau", "EarlyStopping", "OnRequestExit", "BackupAndRestore",
-           "TrainFaultTolerance"]
mindspore/train/callback/_checkpoint.py CHANGED
@@ -411,8 +411,6 @@ class CheckpointConfig:
                 handle_append_info["epoch_num"] = 0
             if "step_num" in append_info:
                 handle_append_info["step_num"] = 0
-            if "random_op" in append_info:
-                handle_append_info["random_op"] = 0
         dict_num = 0
         for element in append_info:
             if not isinstance(element, str) and not isinstance(element, dict):
@@ -588,8 +586,6 @@ class ModelCheckpoint(Callback):
         # save graph (only once)
         if not self._graph_saved:
             graph_file_name = os.path.join(self._directory, self._prefix + '-graph.meta')
-            if os.path.isfile(graph_file_name) and context.get_context("mode") == context.GRAPH_MODE:
-                os.remove(graph_file_name)
             _save_graph(cb_params.train_network, graph_file_name)
             self._graph_saved = True
         self._save_ckpt(cb_params)
@@ -713,12 +709,13 @@ class ModelCheckpoint(Callback):
             save_checkpoint(network, cur_file, False, self._config.async_save,
                             self._append_dict, self._config.enc_key, self._config.enc_mode,
                             crc_check=self._config.crc_check, format=self._config.format,
-                            incremental=self._map_param_inc, choice_func=choice_func)
+                            incremental=self._map_param_inc, choice_func=choice_func,
+                            remove_redundancy=self._config.remove_redundancy)
         else:
             save_checkpoint(network, cur_file, self._config.integrated_save, self._config.async_save,
                             self._append_dict, self._config.enc_key, self._config.enc_mode,
                             crc_check=self._config.crc_check, format=self._config.format,
-                            incremental=self._map_param_inc)
+                            incremental=self._map_param_inc, remove_redundancy=self._config.remove_redundancy)

         self._latest_ckpt_file_name = cur_file
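Both `save_checkpoint` call sites now forward `remove_redundancy` from the config object. Assuming `CheckpointConfig` accepts the flag (it is read back as `self._config.remove_redundancy` above), the user-facing wiring would be:

    from mindspore.train import CheckpointConfig, ModelCheckpoint

    # remove_redundancy=True skips parameters duplicated across ranks when saving;
    # the keyword is inferred from self._config.remove_redundancy in the hunks.
    config = CheckpointConfig(save_checkpoint_steps=100, remove_redundancy=True)
    ckpt_cb = ModelCheckpoint(prefix="net", directory="./ckpt", config=config)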
 
mindspore/train/callback/_flops_collector.py CHANGED
@@ -53,7 +53,7 @@ class FlopsUtilizationCollector(Callback):
     The FlopsUtilizationCollector interface counts the model utilization information MFU
     and the hardware utilization information HFU.
     Currently, the API counts only the forward and backward flops of MatMul,
-    BatchMatMul, FlashAttentionScore, and Conv2D operators.
+    BatchMatMul, flash_attention_score, and Conv2D operators.
     Only used in graph mode with static shape.

     Args:
mindspore/train/callback/_landscape.py CHANGED
@@ -404,7 +404,6 @@ class SummaryLandscape:
     def _set_context(device_id):
         """Set context."""
         context.set_context(device_id=device_id)
-        context.set_context(mode=context.GRAPH_MODE)

     def _create_landscape_by_pca(self, epochs, proz, landscape_size, device_ids=None, callback_fn=None, executor=None):
         """Create landscape by PCA."""
mindspore/train/callback/_train_fault_tolerance.py CHANGED
@@ -25,8 +25,9 @@ from mindspore.communication import get_rank, get_group_size
 from mindspore import log as logger
 from mindspore.train.serialization import _get_cur_rank_dp
 from mindspore._c_expression import _repair_device, _stop_device, _tft_sem_post, _tft_sem_enable
-from mindspore._c_expression import _rebuild_world_group, _rebuild_sub_group, _finalize_comm
+from mindspore._c_expression import _rebuild_world_group, _rebuild_sub_group, _finalize_comm, _clean_rootinfo
 from mindspore._c_expression import clean_tdt_channel
+from mindspore._c_expression import _pre_launch_send_recv
 from mindspore._c_expression import send_recv, reset_params
 from mindspore._c_expression import CollectiveManager
 from mindspore._c_expression import _get_uce_process_strategy, _get_uce_mem_info
@@ -35,6 +36,7 @@ from mindspore.ops.operations.manually_defined._inner import TensorReport
 import mindspore
 import mindspore.common.dtype as mstype
 from mindspore.parallel._recovery_context import _set_recovery_context
+from mindspore import runtime


 def _get_ckpt_dir(step, ckpt_save_path, is_tmp_file):
@@ -80,7 +82,7 @@ def _save_checkpoint_on_failure(step, save_info, args, cb_ctx):
         append_dict["loss_scale"] = outputs[2]

     ckpt_file = f"ttp_rank_{str(cur_rank)}-{str(cur_epoch_num)}_{str(step_num_in_epoch)}.ckpt"
-    cur_ckpt_dir = _get_ckpt_dir(step, ckpt_save_path, True) + "/rank_" + str(cur_rank)
+    cur_ckpt_dir = os.path.join(_get_ckpt_dir(step, ckpt_save_path, True), "rank_" + str(cur_rank))
     os.makedirs(cur_ckpt_dir, exist_ok=True)
     cur_file = os.path.join(cur_ckpt_dir, ckpt_file)
     save_checkpoint(cb_params.train_network, cur_file,
@@ -110,7 +112,7 @@ def _tft_exit_cb(ctx):

 def _tft_repair_callback(step, need_rebuild, error_ranks, repair_info, args, cb_ctx):
     """ Callback used for TFT repair function."""
-    logger.warning("Enter _tft_repair_callback repair type: {}".format(repair_info["repair_type"]))
+    logger.warning(f"Enter _tft_repair_callback repair type: {repair_info['repair_type']}")
     if (repair_info["repair_type"] in (cb_ctx.tft.RepairType.RT_UCE_HIGHLEVEL.value,
                                        cb_ctx.tft.RepairType.RT_UCE_LOWLEVEL.value)):
         logger.warning("Enter _tft_repair_callback uce REPARI_DEVICE device_id : {}".format(cb_ctx.device_id))
@@ -138,7 +140,7 @@ def _tft_repair_callback(step, need_rebuild, error_ranks, repair_info, args, cb_

 def _tft_clean_callback(is_uce_error, args, ctx):
     """ Callback used for TFT clean function."""
-    logger.warning("Enter _tft_clean_callback")
+    logger.warning(f"Enter _tft_clean_callback, device id:{ctx.device_id}")
     ret = 0
     if is_uce_error:
         _get_uce_mem_info(ctx.device_id)
@@ -154,29 +156,36 @@ def _tft_clean_callback(is_uce_error, args, ctx):
         logger.warning("Enter _tft_clean_callback resume_hccl_comm")
         CollectiveManager.get_instance().resume_hccl_comm()
     logger.warning("Finish _tft_clean_callback, ret: {}".format(ret))
+    if ctx.tft.tft_get_repair_type() == "recover":
+        logger.warning(f"Destroy hcom")
+        _finalize_comm()
+        logger.warning(f"Destroy hcom end")
     return ret


 def _tft_stop_callback(args, cb_ctx):
     """ Callback used for TFT stop function."""
-    logger.warning("Enter _tft_stop_callback device_id: {}".format(cb_ctx.device_id))
+    logger.warning(f"Enter _tft_stop_callback device_id: {cb_ctx.device_id}")
     _stop_device(cb_ctx.device_id)
+    cb_ctx.stop_been_called = True
     if (not cb_ctx.is_uce_rank) and (not cb_ctx._is_params_consistent()):  # pylint: disable=W0212
         raise RuntimeError("Can't stop device, because training parameters are left in inconsistent state!")
     cb_ctx.is_uce_rank = False
     if cb_ctx.tft.tft_get_repair_type() == "recover":
         logger.warning(f"Reset limit step")
         cb_ctx.tft.tft_reset_limit_step()
-    logger.info("Finish _tft_stop_callback")
+    logger.warning("Finish _tft_stop_callback")


 def _tft_rebuild_sub_groups(fault_ranks, args, ctx):
     """Callback used for TFT Rebuild Group function."""
-    logger.warning(f"Enter _tft_rebuild_sub_groups, device id: ".format(ctx.device_id))
-    _finalize_comm()
+    logger.warning(f"Enter _tft_rebuild_sub_groups, device id: {ctx.device_id}")
     _rebuild_world_group()
     _rebuild_sub_group()
     _set_recovery_context(is_arf=True)
+    logger.warning(f"try to pre launch send recv before real launch")
+    _pre_launch_send_recv(context.get_context('device_id'))
+    logger.warning(f"Pre launch send recv before real launch end")
     logger.warning("Enter _tft_rebuild_sub_groups ok ")


@@ -299,6 +308,12 @@

     def __init__(self, ckpt_save_path=None, **kwargs):
         super(TrainFaultTolerance, self).__init__()
+        logger.info(f"MS_ENABLE_TFT: {os.getenv('MS_ENABLE_TFT', '')}")
+        if self._only_enable_tsp():
+            self.tft = _tft_handler.get_tft()
+            self._check_init()
+            self.tft.tft_register_stream_sync_handler(runtime.synchronize, self)
+            return
         self.save_cb = kwargs.get("ckpt_save_fn", None)
         self.ckpt_save_path = ckpt_save_path
         if self.save_cb is None and self.ckpt_save_path is None:
@@ -308,19 +323,24 @@
         self.device_id = context.get_context("device_id")
         self.cur_step_num = 0
         self.cur_epoch_num = 0
+        self.clean_unique_id = False
         # For TREError(Training Result Error) scene, parameter `ckpt_load_fn` must be provided to load checkpoint
         # from file for resuming training, the `ckpt_load_fn` is a function, prototype of which is:
         # `def load_checkpoint() -> tuple(dict, bool)`, the return value is a tuple containing 2 values,
         # i.e. (param_dict, remove_redundancy)
         self.ckpt_load_func = kwargs.get("ckpt_load_fn", None)
-        self.tft = _tft_handler.get_tft()
         if self._only_enable_tre():
             return
+        self.tft = _tft_handler.get_tft()
         self._check_init()
+        if self._only_enable_tre_and_tsp():
+            self.tft.tft_register_stream_sync_handler(runtime.synchronize, self)
+            return
         self.global_step = None
         self.learning_rate = None
         self.has_init_replica = False
         self.is_uce_rank = False
+        self.stop_been_called = False

         self.assign = mindspore.ops.Assign()
         self.g_one = Parameter(Tensor([1], dtype=mstype.int32))
@@ -336,6 +356,22 @@
             return False
         return "TRE:1" in env_enable

+    def _only_enable_tsp(self):
+        """Check if only configured MS_ENABLE_TFT='{TSP:1}'"""
+        env_enable = os.getenv("MS_ENABLE_TFT", "")
+        non_tsp_flags = ["TTP:1", "UCE:1", "ARF:1", "TRE:1"]
+        if any(flag in env_enable for flag in non_tsp_flags):
+            return False
+        return "TSP:1" in env_enable
+
+    def _only_enable_tre_and_tsp(self):
+        """Check if only configured MS_ENABLE_TFT='{TRE:1, TSP:1}'"""
+        env_enable = os.getenv("MS_ENABLE_TFT", "")
+        other_flags = ["TTP:1", "UCE:1", "ARF:1"]
+        if any(flag in env_enable for flag in other_flags):
+            return False
+        return "TRE:1" in env_enable and "TSP:1" in env_enable
+
     def _check_init(self):
         """Check if the mindio-ttp had inited"""
         if self.tft is None:
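The two new predicates gate behavior on substrings of the `MS_ENABLE_TFT` flag list. A standalone sketch of the same check (flag strings copied from the hunk; the environment value here is just an example):

    import os

    os.environ["MS_ENABLE_TFT"] = "{TRE:1,TSP:1}"

    def _only_enable_tre_and_tsp():
        env_enable = os.getenv("MS_ENABLE_TFT", "")
        other_flags = ["TTP:1", "UCE:1", "ARF:1"]
        if any(flag in env_enable for flag in other_flags):
            return False
        return "TRE:1" in env_enable and "TSP:1" in env_enable

    print(_only_enable_tre_and_tsp())  # True: only TRE and TSP are enabled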
@@ -427,6 +463,8 @@
         self.tft.tft_register_clean_handler(_tft_clean_callback, self)
         self.tft.tft_register_repair_handler(_tft_repair_callback, self)
         self.tft.tft_register_rebuild_group_handler(_tft_rebuild_sub_groups, self)
+        if "TSP:1" in os.getenv("MS_ENABLE_TFT", ""):
+            self.tft.tft_register_stream_sync_handler(runtime.synchronize, self)

     def _reset_acc_grads(self):
         accu_grad_params = map(lambda e: e[1],
@@ -436,6 +474,12 @@
         if reset_params(accu_grad_list) != 0:
             raise ValueError("Call reset_params failed.")

+    def _clear_unique_id(self):
+        """Clean unique id on first train step end"""
+        if not self.clean_unique_id and ("ARF:1" in os.getenv("MS_ENABLE_TFT", "")):
+            _clean_rootinfo()
+            self.clean_unique_id = True
+
     def on_train_step_end(self, run_context):
         """
         Report status to MindIO TFT after every step finished.
@@ -446,13 +490,19 @@
         """
         if self._only_enable_tre():
             return
-        if self.has_init_replica is False:
-            self.has_init_replica = True
-            self._set_tft_optimizer_replica(run_context)
+
         cb_params = run_context.original_args()
         logger.info("START Set optimizer finish step status to TFT. step: {}".format(cb_params.cur_step_num))
         self.cur_step_num = cb_params.cur_step_num
         self.cur_epoch_num = cb_params.cur_epoch_num
+        if self._only_enable_tsp() or self._only_enable_tre_and_tsp():
+            logger.info("Go into tft_pause_train.")
+            self.tft.tft_pause_train(self.cur_step_num)
+            return
+
+        if self.has_init_replica is False:
+            self.has_init_replica = True
+            self._set_tft_optimizer_replica(run_context)
         if cb_params.optimizer is not None:
             self.global_step = cb_params.optimizer.global_step.clone()
             self.assign(cb_params.optimizer.tft_g_one_flag, self.g_one)
@@ -462,7 +512,13 @@
         else:
             raise ValueError("TFT feature need optimizer or network's optimizer!")
         self.tft.tft_end_updating_os(cb_params.cur_step_num + self.initial_step)
+        if cb_params.is_arf:
+            self.clean_unique_id = False
+        self._clear_unique_id()
         logger.info("END Set optimizer finish step status to TFT.")
+        if "TSP:1" in os.getenv("MS_ENABLE_TFT", ""):
+            logger.info("Go into tft_pause_train.")
+            self.tft.tft_pause_train(self.cur_step_num)

     def on_train_begin(self, run_context):
         """
@@ -472,6 +528,8 @@
            run_context (RunContext): Context of the train running. Refer to
                :class:`mindspore.train.RunContext` for detail.
         """
+        if self._only_enable_tsp():
+            return
         cb_params = run_context.original_args()
         if self._only_enable_tre():
             self.cb_params = cb_params
@@ -491,6 +549,6 @@
            run_context (RunContext): Context of the train running. Refer to
                :class:`mindspore.train.RunContext` for detail.
         """
-        if self._only_enable_tre():
+        if self._only_enable_tre() or self._only_enable_tsp() or self._only_enable_tre_and_tsp():
             return
         _tft_handler.unregister_tft()
mindspore/train/data_sink.py CHANGED
@@ -18,7 +18,7 @@ import mindspore.ops as ops
 from mindspore import context
 from mindspore.common.dtype import pytype_to_dtype
 from mindspore.common.api import jit
-from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes
+from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes, enable_data_broadcast
 from mindspore.train.dataset_helper import _has_dynamic_shape, _check_inputs
 import mindspore.dataset as ds
 from mindspore._c_expression import _set_dataset_mode_config
@@ -41,6 +41,15 @@ def _init_sink_dataset(dataset, sink_size, input_signature, create_info):
     is_info_queue = (create_info and sink_size == 1 and dataset_size != 1 and
                      input_signature is None and not dynamic_shape and
                      context.get_context('device_target') == 'Ascend')
+
+    # Don't enable dynamic shape(multi-subgraph) feature in pp/data_broadcast mode,
+    # otherwise get_data_info will stuck since some rank do not consume data.
+    use_pipeline_parallel = (context.get_auto_parallel_context("pipeline_stages") > 1)
+    data_broadcast = enable_data_broadcast()
+
+    if use_pipeline_parallel or data_broadcast:
+        is_info_queue = False
+
     transfer_dataset = _exec_datagraph(dataset, sink_size, create_data_info_queue=is_info_queue)
     dataset.__transfer_dataset__ = transfer_dataset

@@ -214,7 +223,7 @@ def data_sink(fn, dataset, sink_size=1, jit_config=None, input_signature=None):
     loop = sink_size
     create_info = True
     if jit_config is None:
-        create_info = (loop == 1)
+        create_info = loop == 1
         loop = 1
     ori_next_op, is_info_queue = _init_sink_dataset(dataset, loop, input_signature, create_info)

mindspore/train/dataset_helper.py CHANGED
@@ -564,6 +564,15 @@ class _DatasetIter:
         self.sink_size = dataset.__loop_size__
         create_data_info_queue = (
             sink_size == 1 and self.sink_count == 1 and dataset.get_dataset_size() != 1)
+
+        # Don't enable dynamic shape(multi-subgraph) feature in pp/data_broadcast mode,
+        # otherwise get_data_info will stuck since some rank do not consume data.
+        use_pipeline_parallel = (context.get_auto_parallel_context("pipeline_stages") > 1)
+        data_broadcast = enable_data_broadcast()
+
+        if use_pipeline_parallel or data_broadcast:
+            create_data_info_queue = False
+
         dataset.__transfer_dataset__ = _exec_datagraph(dataset, self.sink_size,
                                                        create_data_info_queue=create_data_info_queue)
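The same guard lands in both `data_sink.py` and `dataset_helper.py`: the data-info queue backing the dynamic-shape (multi-subgraph) path is disabled under pipeline parallelism or data broadcast, since ranks that consume no data would hang in `get_data_info`. A condensed sketch of the shared predicate (the helper name `_should_create_info_queue` is invented; `enable_data_broadcast` and the context call are taken from the hunks):

    from mindspore import context
    from mindspore.train._utils import enable_data_broadcast

    def _should_create_info_queue(base_condition):
        """Hypothetical condensation of the gating added in both files."""
        use_pipeline_parallel = context.get_auto_parallel_context("pipeline_stages") > 1
        if use_pipeline_parallel or enable_data_broadcast():
            return False
        return base_condition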