mindspore-2.6.0rc1-cp310-cp310-win_amd64.whl → mindspore-2.7.0rc1-cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mindspore might be problematic.

Files changed (407)
  1. mindspore/.commit_id +1 -1
  2. mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
  3. mindspore/Newtonsoft.Json.dll +0 -0
  4. mindspore/__init__.py +1 -1
  5. mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
  6. mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
  7. mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
  8. mindspore/_checkparam.py +40 -9
  9. mindspore/{_deprecated → _extends/optimize}/__init__.py +9 -3
  10. mindspore/_extends/optimize/cell_utils.py +96 -0
  11. mindspore/_extends/parse/__init__.py +2 -2
  12. mindspore/_extends/parse/compile_config.py +44 -22
  13. mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +1 -1
  14. mindspore/_extends/parse/parser.py +37 -62
  15. mindspore/_extends/parse/resources.py +39 -0
  16. mindspore/_extends/parse/standard_method.py +43 -13
  17. mindspore/_extends/parse/trope.py +8 -1
  18. mindspore/_extends/pijit/__init__.py +1 -2
  19. mindspore/amp.py +4 -4
  20. mindspore/atlprov.dll +0 -0
  21. mindspore/avcodec-59.dll +0 -0
  22. mindspore/avdevice-59.dll +0 -0
  23. mindspore/avfilter-8.dll +0 -0
  24. mindspore/avformat-59.dll +0 -0
  25. mindspore/avutil-57.dll +0 -0
  26. mindspore/boost/adasum.py +1 -1
  27. mindspore/boost/boost_cell_wrapper.py +4 -4
  28. mindspore/c1.dll +0 -0
  29. mindspore/c1xx.dll +0 -0
  30. mindspore/c2.dll +0 -0
  31. mindspore/common/__init__.py +27 -2
  32. mindspore/common/_grad_function.py +2 -1
  33. mindspore/common/_pijit_context.py +28 -7
  34. mindspore/common/_stub_tensor.py +1 -209
  35. mindspore/common/_tensor_cpp_method.py +1 -1
  36. mindspore/common/_tensor_docs.py +77 -16
  37. mindspore/common/api.py +238 -113
  38. mindspore/common/dtype.py +21 -11
  39. mindspore/common/dump.py +10 -15
  40. mindspore/common/generator.py +5 -3
  41. mindspore/common/hook_handle.py +11 -2
  42. mindspore/common/jit_config.py +1 -1
  43. mindspore/common/jit_trace.py +84 -105
  44. mindspore/common/parameter.py +26 -12
  45. mindspore/common/recompute.py +3 -3
  46. mindspore/common/sparse_tensor.py +0 -3
  47. mindspore/common/symbol.py +0 -1
  48. mindspore/common/tensor.py +81 -81
  49. mindspore/communication/_comm_helper.py +46 -4
  50. mindspore/communication/management.py +79 -7
  51. mindspore/context.py +58 -40
  52. mindspore/dataset/core/config.py +3 -3
  53. mindspore/dataset/engine/datasets.py +20 -7
  54. mindspore/dataset/engine/datasets_user_defined.py +33 -3
  55. mindspore/dataset/engine/iterators.py +2 -2
  56. mindspore/dataset/engine/obs/config_loader.py +2 -2
  57. mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +8 -0
  58. mindspore/dataset/transforms/py_transforms.py +7 -3
  59. mindspore/dataset/transforms/transforms.py +7 -3
  60. mindspore/dataset/vision/validators.py +1 -0
  61. mindspore/device_context/ascend/device.py +1 -1
  62. mindspore/device_context/gpu/__init__.py +2 -2
  63. mindspore/device_context/gpu/device.py +1 -1
  64. mindspore/device_context/gpu/op_precision.py +4 -2
  65. mindspore/device_context/gpu/op_tuning.py +6 -3
  66. mindspore/device_manager.py +16 -9
  67. mindspore/dnnl.dll +0 -0
  68. mindspore/dpcmi.dll +0 -0
  69. mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +3 -7
  70. mindspore/experimental/llm_boost/atb/boost_base.py +2 -3
  71. mindspore/experimental/optim/adadelta.py +13 -20
  72. mindspore/experimental/optim/adagrad.py +15 -22
  73. mindspore/experimental/optim/adam.py +17 -24
  74. mindspore/experimental/optim/adamax.py +14 -22
  75. mindspore/experimental/optim/adamw.py +28 -34
  76. mindspore/experimental/optim/asgd.py +15 -25
  77. mindspore/experimental/optim/lr_scheduler.py +27 -45
  78. mindspore/experimental/optim/nadam.py +14 -24
  79. mindspore/experimental/optim/optimizer.py +13 -23
  80. mindspore/experimental/optim/radam.py +18 -24
  81. mindspore/experimental/optim/rmsprop.py +14 -25
  82. mindspore/experimental/optim/rprop.py +15 -26
  83. mindspore/experimental/optim/sgd.py +9 -19
  84. mindspore/hal/__init__.py +4 -4
  85. mindspore/hal/contiguous_tensors_handle.py +2 -2
  86. mindspore/hal/memory.py +27 -7
  87. mindspore/include/api/cell.h +37 -1
  88. mindspore/include/api/delegate.h +10 -0
  89. mindspore/include/api/model.h +3 -0
  90. mindspore/include/api/types.h +2 -2
  91. mindspore/include/c_api/model_c.h +0 -58
  92. mindspore/include/c_api/tensor_c.h +0 -26
  93. mindspore/include/dataset/vision_ascend.h +1 -1
  94. mindspore/jpeg62.dll +0 -0
  95. mindspore/mindrecord/tools/cifar10.py +60 -11
  96. mindspore/mindrecord/tools/cifar10_to_mr.py +5 -0
  97. mindspore/mindspore_backend_common.dll +0 -0
  98. mindspore/mindspore_backend_manager.dll +0 -0
  99. mindspore/mindspore_common.dll +0 -0
  100. mindspore/mindspore_core.dll +0 -0
  101. mindspore/mindspore_cpu_res_manager.dll +0 -0
  102. mindspore/mindspore_dump.dll +0 -0
  103. mindspore/mindspore_frontend.dll +0 -0
  104. mindspore/mindspore_glog.dll +0 -0
  105. mindspore/mindspore_memory_pool.dll +0 -0
  106. mindspore/mindspore_ms_backend.dll +0 -0
  107. mindspore/mindspore_ops.dll +0 -0
  108. mindspore/mindspore_ops_host.dll +0 -0
  109. mindspore/mindspore_ops_kernel_common.dll +0 -0
  110. mindspore/mindspore_profiler.dll +0 -0
  111. mindspore/mindspore_pyboost.dll +0 -0
  112. mindspore/mindspore_pynative.dll +0 -0
  113. mindspore/mindspore_res_manager.dll +0 -0
  114. mindspore/mindspore_runtime_pipeline.dll +0 -0
  115. mindspore/mint/__init__.py +6 -46
  116. mindspore/mint/distributed/__init__.py +1 -0
  117. mindspore/mint/distributed/distributed.py +212 -9
  118. mindspore/mint/nn/__init__.py +1 -1
  119. mindspore/mint/nn/functional.py +53 -6
  120. mindspore/mint/nn/layer/_functions.py +164 -294
  121. mindspore/mint/nn/layer/activation.py +8 -6
  122. mindspore/mint/nn/layer/conv.py +137 -101
  123. mindspore/mint/nn/layer/normalization.py +8 -22
  124. mindspore/mint/optim/adam.py +19 -18
  125. mindspore/mint/optim/adamw.py +14 -8
  126. mindspore/mint/optim/sgd.py +5 -5
  127. mindspore/msobj140.dll +0 -0
  128. mindspore/mspdb140.dll +0 -0
  129. mindspore/mspdbcore.dll +0 -0
  130. mindspore/mspdbst.dll +0 -0
  131. mindspore/mspft140.dll +0 -0
  132. mindspore/msvcdis140.dll +0 -0
  133. mindspore/msvcp140_1.dll +0 -0
  134. mindspore/msvcp140_2.dll +0 -0
  135. mindspore/msvcp140_atomic_wait.dll +0 -0
  136. mindspore/msvcp140_codecvt_ids.dll +0 -0
  137. mindspore/nn/cell.py +328 -502
  138. mindspore/nn/grad/cell_grad.py +11 -12
  139. mindspore/nn/layer/activation.py +32 -34
  140. mindspore/nn/layer/basic.py +67 -64
  141. mindspore/nn/layer/channel_shuffle.py +4 -4
  142. mindspore/nn/layer/combined.py +4 -2
  143. mindspore/nn/layer/conv.py +117 -110
  144. mindspore/nn/layer/dense.py +9 -7
  145. mindspore/nn/layer/embedding.py +50 -52
  146. mindspore/nn/layer/image.py +37 -39
  147. mindspore/nn/layer/math.py +111 -112
  148. mindspore/nn/layer/normalization.py +56 -44
  149. mindspore/nn/layer/pooling.py +58 -63
  150. mindspore/nn/layer/rnn_cells.py +33 -33
  151. mindspore/nn/layer/rnns.py +56 -56
  152. mindspore/nn/layer/thor_layer.py +74 -73
  153. mindspore/nn/layer/transformer.py +11 -1
  154. mindspore/nn/learning_rate_schedule.py +20 -20
  155. mindspore/nn/loss/loss.py +79 -81
  156. mindspore/nn/optim/adam.py +3 -3
  157. mindspore/nn/optim/adasum.py +2 -2
  158. mindspore/nn/optim/asgd.py +2 -0
  159. mindspore/nn/optim/optimizer.py +1 -1
  160. mindspore/nn/optim/thor.py +2 -2
  161. mindspore/nn/probability/distribution/exponential.py +2 -1
  162. mindspore/nn/probability/distribution/poisson.py +2 -1
  163. mindspore/nn/sparse/sparse.py +3 -3
  164. mindspore/nn/wrap/cell_wrapper.py +34 -37
  165. mindspore/nn/wrap/grad_reducer.py +37 -37
  166. mindspore/nn/wrap/loss_scale.py +72 -74
  167. mindspore/numpy/array_creations.py +5 -5
  168. mindspore/numpy/fft.py +1 -1
  169. mindspore/numpy/math_ops.py +5 -5
  170. mindspore/opencv_core452.dll +0 -0
  171. mindspore/opencv_imgcodecs452.dll +0 -0
  172. mindspore/opencv_imgproc452.dll +0 -0
  173. mindspore/ops/_grad_experimental/grad_comm_ops.py +51 -13
  174. mindspore/ops/_grad_experimental/grad_debug_ops.py +14 -0
  175. mindspore/ops/_vmap/vmap_array_ops.py +31 -13
  176. mindspore/ops/_vmap/vmap_nn_ops.py +8 -16
  177. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +42 -11
  178. mindspore/ops/auto_generate/gen_extend_func.py +23 -141
  179. mindspore/ops/auto_generate/gen_ops_def.py +727 -321
  180. mindspore/ops/auto_generate/gen_ops_prim.py +1721 -984
  181. mindspore/ops/auto_generate/pyboost_inner_prim.py +31 -1
  182. mindspore/ops/composite/__init__.py +10 -0
  183. mindspore/ops/composite/base.py +8 -4
  184. mindspore/ops/composite/multitype_ops/__init__.py +12 -1
  185. mindspore/ops/composite/multitype_ops/_compile_utils.py +133 -109
  186. mindspore/ops/composite/multitype_ops/add_impl.py +70 -2
  187. mindspore/ops/composite/multitype_ops/div_impl.py +49 -0
  188. mindspore/ops/composite/multitype_ops/floordiv_impl.py +29 -0
  189. mindspore/ops/composite/multitype_ops/getitem_impl.py +11 -0
  190. mindspore/ops/composite/multitype_ops/mod_impl.py +5 -3
  191. mindspore/ops/composite/multitype_ops/mul_impl.py +49 -0
  192. mindspore/ops/composite/multitype_ops/setitem_impl.py +57 -0
  193. mindspore/ops/composite/multitype_ops/sub_impl.py +34 -0
  194. mindspore/ops/composite/multitype_ops/zeros_like_impl.py +14 -0
  195. mindspore/ops/function/__init__.py +3 -1
  196. mindspore/ops/function/_add_attr_func.py +11 -6
  197. mindspore/ops/function/array_func.py +9 -96
  198. mindspore/ops/function/debug_func.py +4 -3
  199. mindspore/ops/function/grad/grad_func.py +1 -1
  200. mindspore/ops/function/math_func.py +33 -540
  201. mindspore/ops/function/nn_func.py +28 -74
  202. mindspore/ops/function/other_func.py +4 -1
  203. mindspore/ops/function/random_func.py +44 -5
  204. mindspore/ops/function/vmap_func.py +2 -1
  205. mindspore/ops/functional.py +2 -3
  206. mindspore/ops/functional_overload.py +571 -6
  207. mindspore/ops/op_info_register.py +21 -0
  208. mindspore/ops/operations/__init__.py +16 -11
  209. mindspore/ops/operations/_custom_ops_utils.py +689 -34
  210. mindspore/ops/operations/_inner_ops.py +3 -6
  211. mindspore/ops/operations/_sequence_ops.py +1 -1
  212. mindspore/ops/operations/array_ops.py +2 -2
  213. mindspore/ops/operations/comm_ops.py +185 -26
  214. mindspore/ops/operations/custom_ops.py +294 -174
  215. mindspore/ops/operations/debug_ops.py +59 -4
  216. mindspore/ops/operations/image_ops.py +13 -13
  217. mindspore/ops/operations/manually_defined/ops_def.py +15 -16
  218. mindspore/ops/operations/math_ops.py +3 -4
  219. mindspore/ops/operations/nn_ops.py +7 -39
  220. mindspore/ops/primitive.py +6 -10
  221. mindspore/ops/tensor_method.py +47 -8
  222. mindspore/ops_generate/api/cpp_create_prim_instance_helper_generator.py +1 -1
  223. mindspore/ops_generate/api/functional_map_cpp_generator.py +10 -9
  224. mindspore/ops_generate/api/functions_cc_generator.py +58 -10
  225. mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +1 -1
  226. mindspore/ops_generate/common/base_generator.py +14 -0
  227. mindspore/ops_generate/common/gen_constants.py +8 -3
  228. mindspore/ops_generate/common/gen_utils.py +0 -19
  229. mindspore/ops_generate/common/op_proto.py +11 -4
  230. mindspore/ops_generate/common/template.py +88 -11
  231. mindspore/ops_generate/gen_ops.py +1 -1
  232. mindspore/ops_generate/op_def/lite_ops_cpp_generator.py +4 -4
  233. mindspore/ops_generate/op_def/ops_def_cc_generator.py +0 -3
  234. mindspore/ops_generate/op_def/ops_name_h_generator.py +0 -3
  235. mindspore/ops_generate/op_def/ops_primitive_h_generator.py +0 -4
  236. mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -2
  237. mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +49 -8
  238. mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +2 -2
  239. mindspore/ops_generate/pyboost/gen_pyboost_func.py +31 -0
  240. mindspore/ops_generate/pyboost/op_template_parser.py +98 -72
  241. mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +70 -273
  242. mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +14 -6
  243. mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +316 -0
  244. mindspore/ops_generate/pyboost/pyboost_functions_py_generator.py +1 -1
  245. mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +5 -3
  246. mindspore/ops_generate/pyboost/pyboost_inner_prim_generator.py +1 -1
  247. mindspore/ops_generate/pyboost/pyboost_internal_functions_cpp_generator.py +76 -0
  248. mindspore/ops_generate/pyboost/pyboost_internal_functions_h_generator.py +76 -0
  249. mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +125 -0
  250. mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +4 -3
  251. mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +348 -61
  252. mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +1 -1
  253. mindspore/ops_generate/pyboost/pyboost_utils.py +118 -9
  254. mindspore/ops_generate/tensor_py_cc_generator.py +1 -24
  255. mindspore/parallel/_auto_parallel_context.py +11 -8
  256. mindspore/parallel/_cell_wrapper.py +113 -45
  257. mindspore/parallel/_parallel_serialization.py +1 -1
  258. mindspore/parallel/_ps_context.py +4 -6
  259. mindspore/parallel/_tensor.py +167 -12
  260. mindspore/parallel/_transformer/moe.py +1 -1
  261. mindspore/parallel/_transformer/transformer.py +13 -8
  262. mindspore/parallel/auto_parallel.py +14 -7
  263. mindspore/parallel/checkpoint_convert.py +3 -3
  264. mindspore/parallel/checkpoint_transform.py +11 -7
  265. mindspore/parallel/cluster/process_entity/_api.py +84 -48
  266. mindspore/parallel/cluster/process_entity/_utils.py +95 -7
  267. mindspore/parallel/cluster/run.py +43 -4
  268. mindspore/parallel/function/__init__.py +8 -1
  269. mindspore/parallel/function/reshard_func.py +6 -7
  270. mindspore/parallel/nn/__init__.py +15 -2
  271. mindspore/parallel/nn/parallel_cell_wrapper.py +9 -10
  272. mindspore/parallel/nn/parallel_grad_reducer.py +7 -6
  273. mindspore/parallel/shard.py +3 -4
  274. mindspore/parallel/transform_safetensors.py +463 -174
  275. mindspore/pgodb140.dll +0 -0
  276. mindspore/pgort140.dll +0 -0
  277. mindspore/profiler/__init__.py +2 -1
  278. mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +7 -7
  279. mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +3 -0
  280. mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +12 -6
  281. mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +3 -3
  282. mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +3 -3
  283. mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +4 -4
  284. mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +3 -3
  285. mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +4 -1
  286. mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +2 -1
  287. mindspore/profiler/analysis/task_manager.py +1 -1
  288. mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +5 -1
  289. mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +2 -1
  290. mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +42 -22
  291. mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +3 -2
  292. mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +9 -5
  293. mindspore/profiler/analysis/viewer/ms_operator_details_viewer.py +132 -0
  294. mindspore/profiler/common/constant.py +16 -0
  295. mindspore/profiler/common/profiler_context.py +25 -27
  296. mindspore/profiler/common/profiler_info.py +0 -16
  297. mindspore/profiler/common/profiler_op_analyse.py +235 -0
  298. mindspore/profiler/common/profiler_output_path.py +23 -8
  299. mindspore/profiler/common/profiler_parameters.py +128 -35
  300. mindspore/profiler/dynamic_profile/__init__.py +0 -0
  301. mindspore/profiler/dynamic_profile/dynamic_monitor_proxy.py +39 -0
  302. mindspore/profiler/dynamic_profile/dynamic_profiler_config_context.py +666 -0
  303. mindspore/profiler/dynamic_profile/dynamic_profiler_utils.py +62 -0
  304. mindspore/profiler/dynamic_profiler.py +305 -314
  305. mindspore/profiler/envprofiler.py +12 -7
  306. mindspore/profiler/experimental_config.py +96 -6
  307. mindspore/profiler/mstx.py +33 -12
  308. mindspore/profiler/platform/__init__.py +2 -3
  309. mindspore/profiler/platform/npu_profiler.py +29 -19
  310. mindspore/profiler/profiler.py +35 -19
  311. mindspore/profiler/profiler_action_controller.py +64 -76
  312. mindspore/profiler/schedule.py +10 -4
  313. mindspore/rewrite/common/config.py +1 -0
  314. mindspore/rewrite/common/namer.py +1 -0
  315. mindspore/rewrite/common/namespace.py +1 -0
  316. mindspore/rewrite/node/node.py +31 -11
  317. mindspore/rewrite/parsers/assign_parser.py +1 -1
  318. mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
  319. mindspore/run_check/_check_version.py +7 -10
  320. mindspore/runtime/__init__.py +5 -5
  321. mindspore/runtime/event.py +10 -4
  322. mindspore/runtime/executor.py +60 -45
  323. mindspore/runtime/memory.py +30 -32
  324. mindspore/runtime/thread_bind_core.py +298 -164
  325. mindspore/safeguard/rewrite_obfuscation.py +12 -13
  326. mindspore/swresample-4.dll +0 -0
  327. mindspore/swscale-6.dll +0 -0
  328. mindspore/tbbmalloc.dll +0 -0
  329. mindspore/tinyxml2.dll +0 -0
  330. mindspore/train/_utils.py +14 -4
  331. mindspore/train/amp.py +43 -20
  332. mindspore/train/callback/__init__.py +5 -5
  333. mindspore/train/callback/_checkpoint.py +3 -6
  334. mindspore/train/callback/_flops_collector.py +1 -1
  335. mindspore/train/callback/_landscape.py +0 -1
  336. mindspore/train/callback/_train_fault_tolerance.py +97 -16
  337. mindspore/train/data_sink.py +11 -2
  338. mindspore/train/dataset_helper.py +9 -0
  339. mindspore/train/model.py +135 -55
  340. mindspore/train/serialization.py +133 -111
  341. mindspore/train/summary/summary_record.py +13 -2
  342. mindspore/turbojpeg.dll +0 -0
  343. mindspore/utils/__init__.py +3 -2
  344. mindspore/utils/dryrun.py +0 -6
  345. mindspore/utils/runtime_execution_order_check.py +163 -77
  346. mindspore/utils/sdc_detect.py +68 -0
  347. mindspore/utils/utils.py +6 -9
  348. mindspore/vcmeta.dll +0 -0
  349. mindspore/vcruntime140.dll +0 -0
  350. mindspore/vcruntime140_1.dll +0 -0
  351. mindspore/version.py +1 -1
  352. {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0rc1.dist-info}/METADATA +5 -4
  353. {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0rc1.dist-info}/RECORD +356 -394
  354. mindspore/_deprecated/jit.py +0 -198
  355. mindspore/experimental/es/__init__.py +0 -22
  356. mindspore/experimental/es/embedding_service.py +0 -891
  357. mindspore/experimental/es/embedding_service_layer.py +0 -581
  358. mindspore/profiler/parser/__init__.py +0 -14
  359. mindspore/profiler/parser/aicpu_data_parser.py +0 -272
  360. mindspore/profiler/parser/ascend_analysis/__init__.py +0 -14
  361. mindspore/profiler/parser/ascend_analysis/constant.py +0 -71
  362. mindspore/profiler/parser/ascend_analysis/file_manager.py +0 -180
  363. mindspore/profiler/parser/ascend_analysis/function_event.py +0 -185
  364. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +0 -136
  365. mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +0 -131
  366. mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +0 -104
  367. mindspore/profiler/parser/ascend_analysis/path_manager.py +0 -313
  368. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +0 -123
  369. mindspore/profiler/parser/ascend_analysis/tlv_decoder.py +0 -86
  370. mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +0 -75
  371. mindspore/profiler/parser/ascend_cluster_generator.py +0 -116
  372. mindspore/profiler/parser/ascend_communicate_generator.py +0 -314
  373. mindspore/profiler/parser/ascend_flops_generator.py +0 -116
  374. mindspore/profiler/parser/ascend_fpbp_generator.py +0 -82
  375. mindspore/profiler/parser/ascend_hccl_generator.py +0 -271
  376. mindspore/profiler/parser/ascend_integrate_generator.py +0 -42
  377. mindspore/profiler/parser/ascend_memory_generator.py +0 -185
  378. mindspore/profiler/parser/ascend_msprof_exporter.py +0 -282
  379. mindspore/profiler/parser/ascend_msprof_generator.py +0 -187
  380. mindspore/profiler/parser/ascend_op_generator.py +0 -334
  381. mindspore/profiler/parser/ascend_steptrace_generator.py +0 -94
  382. mindspore/profiler/parser/ascend_timeline_generator.py +0 -545
  383. mindspore/profiler/parser/base_timeline_generator.py +0 -483
  384. mindspore/profiler/parser/container.py +0 -229
  385. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +0 -697
  386. mindspore/profiler/parser/flops_parser.py +0 -531
  387. mindspore/profiler/parser/framework_enum.py +0 -111
  388. mindspore/profiler/parser/framework_parser.py +0 -464
  389. mindspore/profiler/parser/framework_struct.py +0 -61
  390. mindspore/profiler/parser/gpu_analysis/__init__.py +0 -14
  391. mindspore/profiler/parser/gpu_analysis/function_event.py +0 -44
  392. mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +0 -89
  393. mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +0 -72
  394. mindspore/profiler/parser/hccl_parser.py +0 -573
  395. mindspore/profiler/parser/hwts_log_parser.py +0 -122
  396. mindspore/profiler/parser/integrator.py +0 -526
  397. mindspore/profiler/parser/memory_usage_parser.py +0 -277
  398. mindspore/profiler/parser/minddata_analyzer.py +0 -800
  399. mindspore/profiler/parser/minddata_parser.py +0 -186
  400. mindspore/profiler/parser/minddata_pipeline_parser.py +0 -299
  401. mindspore/profiler/parser/op_intermediate_parser.py +0 -149
  402. mindspore/profiler/parser/optime_parser.py +0 -250
  403. mindspore/profiler/parser/profiler_info.py +0 -213
  404. mindspore/profiler/parser/step_trace_parser.py +0 -666
  405. {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0rc1.dist-info}/WHEEL +0 -0
  406. {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0rc1.dist-info}/entry_points.txt +0 -0
  407. {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0rc1.dist-info}/top_level.txt +0 -0
@@ -81,7 +81,7 @@ def _transform_target_modules(target_modules):
     obfuscate_layers = target_modules[2].split(':')
     if obfuscate_layers[1] != 'all':
         max_layers = int(obfuscate_layers[1])
-        layers = [i for i in range(0, max_layers)]
+        layers = list(range(0, max_layers))
     path_new = path.replace("blocks", "blocks/${layer}")
     network_obf_template['insert_ops'][0]['input_y'] = "obf_metadata_${layer}"
     weight_obf_template['weight_obf_ops'][0]['input_y'] = "obf_metadata_${layer}"
@@ -95,8 +95,8 @@ def _transform_target_modules(target_modules):
     obf_config['obf_metadata_config'].append(obf_medatadata)

     for name in target_list:
-        target_weight = path_new + '/' + name + '/weight'
-        target_bias = path_new + '/' + name + '/bias'
+        target_weight = '/'.join([path_new, name, 'weight'])
+        target_bias = '/'.join([path_new, name, 'bias'])
         weight_obf = weight_obf_template.copy()
         weight_obf['target'] = target_weight
         bias_obf = weight_obf_template.copy()
@@ -185,7 +185,7 @@ def obfuscate_ckpt(network, ckpt_files, target_modules=None, obf_config=None, sa
     def _gen_obf_metadata(config):
         name = config.get('name')
         if name is None:
-            return False
+            return
         save_metadata = config.get('save_metadata', False)
         metadata_op_name = config.get('metadata_op')
         layers = config.get('layers')
@@ -213,7 +213,6 @@ def obfuscate_ckpt(network, ckpt_files, target_modules=None, obf_config=None, sa
                 saved_obf_tensor = metadata_op(saved_obf_tensor)
             if saved_obf_tensor is not None:
                 saved_metadata[obf_name] = saved_obf_tensor.asnumpy()
-        return True

     if not isinstance(network, nn.Cell):
         raise TypeError("network must be nn.Cell, but got {}.".format(type(network)))
@@ -283,13 +282,13 @@ def _obfuscate_single_ckpt(ckpt_name, obf_metadata, obf_config, saved_path):
 def _obfuscate_param(param, obf_metadata, obf_ops, layer=0):
     param_dtype = F.dtype(param)
     obf_param = param
-    for i in range(len(obf_ops)):
-        op_name = obf_ops[i].get('name')
+    for obf_op in obf_ops:
+        op_name = obf_op.get('name')
         if not isinstance(op_name, str):
             raise TypeError('{} should be str type, but got {}'.format(op_name, type(op_name)))
         if op_name == 'mul':
             input_x = obf_param
-            input_y_name = _get_op_input_name(obf_ops[i], 'input_y', layer)
+            input_y_name = _get_op_input_name(obf_op, 'input_y', layer)
             input_y = obf_metadata.get(input_y_name)
             if input_x is None or input_y is None:
                 log.error("input_x or input_y is None")
@@ -297,22 +296,22 @@ def _obfuscate_single_ckpt(ckpt_name, obf_metadata, obf_config, saved_path):
             input_y = F.cast(input_y, param_dtype)
             obf_param = ops.mul(input_x, input_y)
         elif op_name == 'permuate':
-            input_x_name = _get_op_input_name(obf_ops[i], 'input_x', layer)
+            input_x_name = _get_op_input_name(obf_op, 'input_x', layer)
             p = obf_metadata.get(input_x_name, None)
             if p is None or obf_param is None:
                 log.error("input_x or param is None")
                 return None
             obf_param = obf_param[p]
         elif op_name == 'matmul':
-            input_x_name = _get_op_input_name(obf_ops[i], 'input_x', layer)
-            input_y_name = _get_op_input_name(obf_ops[i], 'input_y', layer)
+            input_x_name = _get_op_input_name(obf_op, 'input_x', layer)
+            input_y_name = _get_op_input_name(obf_op, 'input_y', layer)
             input_x = _get_op_input(input_x_name, obf_param)
             input_y = _get_op_input(input_y_name, obf_param)
             if input_x is None or input_y is None:
                 log.error("the input_x or input_y of op: {} is None.".format(op_name))
                 return None
-            input_x = ops.transpose(input_x, (1, 0)) if obf_ops[i].get('transpose_a', False) else input_x
-            input_y = ops.transpose(input_y, (1, 0)) if obf_ops[i].get('transpose_b', False) else input_y
+            input_x = ops.transpose(input_x, (1, 0)) if obf_op.get('transpose_a', False) else input_x
+            input_y = ops.transpose(input_y, (1, 0)) if obf_op.get('transpose_b', False) else input_y
             obf_param = ops.matmul(F.cast(input_x, param_dtype), F.cast(input_y, param_dtype))
         else:
             log.error("unsupported op, op must be matmul or permuate or mul, but got {}."
Binary file
mindspore/swscale-6.dll CHANGED
Binary file
mindspore/tbbmalloc.dll CHANGED
Binary file
mindspore/tinyxml2.dll CHANGED
Binary file
mindspore/train/_utils.py CHANGED
@@ -323,9 +323,15 @@ def parse_strategy_ckpt(file_name):
 def _get_strategy_opt_shard(param_redundancy_dict, parameter_layout_opt_shard):
     """Strategy ckpt append opt shard."""
     for key, value in parameter_layout_opt_shard.items():
-        if value[1] not in (-1, 0):
-            opt_para_num = value[1]
+        if value[1] != 0:
             param_redundancy_ranks = param_redundancy_dict.get(key)
+            if value[1] != -1:
+                opt_para_num = value[1]
+            elif param_redundancy_ranks:
+                opt_para_num = len(param_redundancy_ranks) * len(param_redundancy_ranks[0]) // value[0]
+            else:
+                raise ValueError(f"For get_parameter_redundancy, the format of the parallel communication domain for "
+                                 f"the optimizer is incorrect.")
             res = []
             for param_ranks in param_redundancy_ranks:
                 if len(param_ranks) % opt_para_num == 0:
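The new `elif` branch derives `opt_para_num` from the redundancy groups when the layout stores `-1`. A quick worked example of that expression, using made-up numbers rather than anything from a real strategy file:

    # Hypothetical inputs, only to illustrate the arithmetic of the new branch.
    param_redundancy_ranks = [list(range(8)), list(range(8, 16))]  # two groups of eight ranks
    value = (4, -1)  # value[1] == -1 triggers the derived path
    opt_para_num = len(param_redundancy_ranks) * len(param_redundancy_ranks[0]) // value[0]
    print(opt_para_num)  # 2 * 8 // 4 = 4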
@@ -576,7 +582,8 @@ def _progress_bar(iterable, total=None):
         print_progress_bar(i)


-def _load_and_transform(path, name_map, load_func, transform_func):
+def _load_and_transform(path, name_map, load_func, transform_func=None):
+    """use load_func to load and use transform_func to convert"""
     if load_func is not None:
         param_dict = load_func(path)
     else:
@@ -584,5 +591,8 @@ def _load_and_transform(path, name_map, load_func, transform_func):
     transform_dict = {}
     for k, v in param_dict.items():
         new_name = name_map.get(k, k) if name_map is not None else k
-        transform_dict[new_name] = transform_func(v, new_name)
+        if transform_func is not None:
+            transform_dict[new_name] = transform_func(v, new_name)
+        else:
+            transform_dict[new_name] = v
     return transform_dict
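With `transform_func` now optional, the helper can act as a plain loader that only remaps names. A minimal sketch under that assumption; the loader and parameter names below are placeholders, not values from this diff:

    from mindspore.train._utils import _load_and_transform

    # Any callable returning a dict of parameters can serve as load_func.
    def fake_load_func(path):
        return {"layer0.weight": 1.0, "layer0.bias": 0.0}  # placeholder values

    # With transform_func left as None, values pass through and only names are remapped.
    params = _load_and_transform("ckpt_0", {"layer0.weight": "blocks.0.weight"}, fake_load_func)
    # -> {"blocks.0.weight": 1.0, "layer0.bias": 0.0}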
mindspore/train/amp.py CHANGED
@@ -69,6 +69,9 @@ AMP_BLACK_LIST = [
 AMP_AUTO_WHITE_LIST = [
     P.Conv2D,
     P.Conv3D,
+    gen.Conv2DExt,
+    gen.Conv3DExt,
+    gen.ConvTranspose2D,
     P.Conv2DTranspose,
     P.Conv3DTranspose,
     gen.Convolution,
@@ -80,6 +83,10 @@ AMP_AUTO_WHITE_LIST = [
     P.Einsum,
     gen.Dense,
     gen.Addmm,
+    gen.Addbmm,
+    gen.Addmv,
+    gen.Baddbmm,
+    gen.Mv,
 ]

 AMP_AUTO_BLACK_LIST = [
@@ -90,8 +97,10 @@ AMP_AUTO_BLACK_LIST = [
     P.Erfinv,
     P.Exp,
     P.Expm1,
-    P.Log,
-    P.Log1p,
+    gen.Log,
+    gen.Log10,
+    gen.Log1p,
+    gen.Log2,
     P.Reciprocal,
     P.Rsqrt,
     P.Sinh,
@@ -103,6 +112,7 @@ AMP_AUTO_BLACK_LIST = [
     P.BatchNorm,
     gen.BatchNormExt,
     gen.GroupNorm,
+    gen.Norm,
     P.KLDivLoss,
     P.SmoothL1Loss,
     P.MultilabelMarginLoss,
@@ -113,7 +123,19 @@ AMP_AUTO_BLACK_LIST = [
     P.Pdist,
     P.Cdist,
     P.Renorm,
+    gen.ReduceProd,
+    gen.Softmax,
+    gen.LogSoftmax,
+    gen.LogSoftmaxExt,
+    gen.CumProd,
+    gen.CumSum,
+    gen.CumsumExt,
+    gen.ProdExt,
+    gen.SumExt,
+    gen.L1LossExt,
     gen.MSELossExt,
+    gen.NLLLoss,
+    gen.NLLLoss2d,
 ]

 # Indicates which inputs of primitives need to be converted
@@ -358,7 +380,7 @@ def _auto_black_list(network, black_list, dtype):
     return network


-class amp_decorator:
+class AmpDecorator:
     """
     Auto mixed precision decorator.
     Type of lists: List[Tuple[str, List[int]]]
@@ -384,7 +406,7 @@ def _set_amp_decorator(obj, amp_level, amp_dtype, white_list, black_list):
     if inspect.isfunction(obj) or inspect.ismethod(obj):
         @functools.wraps(obj)
         def wrapper(*args, **kwargs):
-            with amp_decorator(amp_level, amp_dtype, white_list, black_list):
+            with AmpDecorator(amp_level, amp_dtype, white_list, black_list):
                 return obj(*args, **kwargs)
         return wrapper
     if isinstance(obj, nn.Cell):
@@ -423,17 +445,18 @@ def auto_mixed_precision(network, amp_level="O0", dtype=mstype.float16):

     Operators in `auto_whitelist` are:

-    ``Conv2D``, ``Conv3D``, ``Conv2DTranspose``, ``Conv3DTranspose``, ``Convolution``, ``MatMul``, ``MatMulExt``,
-    ``BatchMatMul``, ``BatchMatMulExt``, ``PReLU``, ``Einsum``, ``Dense``, ``Addmm``
+    ``Conv2D``, ``Conv2DExt``, ``Conv3D``, ``Conv3DExt``, ``Conv2DTranspose``, ``ConvTranspose2D``,
+    ``Conv3DTranspose``, ``Convolution``, ``MatMul``, ``MatMulExt``, ``BatchMatMul``, ``BatchMatMulExt``, ``PReLU``,
+    ``Einsum``, ``Dense``, ``Addmm``, ``Addbmm``, ``Addmv``, ``Baddbmm``, ``Mv``

     Operators in `auto_blacklist` are:

-    ``Pow``, ``ACos``, ``Asin``, ``Cosh``, ``Erfinv``, ``Exp``, ``Expm1``, ``Log``, ``Log1p``, ``Reciprocal``,
-    ``Rsqrt``, ``Sinh``, ``Tan``, ``Softplus``, ``SoftplusExt``, ``LayerNorm``, ``LayerNormExt``, ``BatchNorm``,
-    ``BatchNormExt``, ``GroupNorm``, ``KLDivLoss``, ``SmoothL1Loss``, ``MultilabelMarginLoss``, ``SoftMarginLoss``,
-    ``TripletMarginLoss``, ``MultiMarginLoss``, ``BCEWithLogitsLoss``, ``Pdist``, ``Cdist``, ``Renorm``,
-    ``ReduceProd``, ``Softmax``, ``LogSoftmax``, ``CumProd``, ``CumSum``, ``CumsumExt``, ``ProdExt``, ``SumExt``,
-    ``Norm``, ``MSELossExt``
+    ``Pow``, ``ACos``, ``Asin``, ``Cosh``, ``Erfinv``, ``Exp``, ``Expm1``, ``Log``, ``Log10``, ``Log1p``, ``Log2``,
+    ``Reciprocal``, ``Rsqrt``, ``Sinh``, ``Tan``, ``Softplus``, ``SoftplusExt``, ``LayerNorm``, ``LayerNormExt``,
+    ``BatchNorm``, ``BatchNormExt``, ``GroupNorm``, ``KLDivLoss``, ``SmoothL1Loss``, ``MultilabelMarginLoss``,
+    ``SoftMarginLoss``, ``TripletMarginLoss``, ``MultiMarginLoss``, ``BCEWithLogitsLoss``, ``Pdist``, ``Cdist``,
+    ``Renorm``, ``ReduceProd``, ``Softmax``, ``LogSoftmax``, ``LogSoftmaxExt``, ``CumProd``, ``CumSum``,
+    ``CumsumExt``, ``ProdExt``, ``SumExt``, ``Norm``, ``L1LossExt``, ``MSELossExt``, ``NLLLoss``, ``NLLLoss2d``

     Operators in `promote_list` are:

@@ -638,7 +661,7 @@ def _add_loss_network(network, loss_fn, cast_model_type):


 def _is_grad_accumulation(mcell):
-    if mcell.cls_name == "GradAccumulationCell" or mcell.cls_name == "GradAccumulation":
+    if mcell.cls_name in {"GradAccumulationCell", "GradAccumulation"}:
         return True
     for cell in mcell.cells():
         if _is_grad_accumulation(cell):
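The expanded whitelist/blacklist above feeds the entry point whose signature appears in the `auto_mixed_precision` hunk header; `mindspore.amp` is the public module referenced in this file's docstrings. A minimal usage sketch, with `Net` as a placeholder network:

    import mindspore as ms
    from mindspore import amp, nn

    class Net(nn.Cell):  # placeholder network
        def __init__(self):
            super().__init__()
            self.dense = nn.Dense(4, 2)

        def construct(self, x):
            return self.dense(x)

    # 'O1' casts auto_whitelist ops to float16 while auto_blacklist ops stay in float32.
    net = amp.auto_mixed_precision(Net(), amp_level="O1", dtype=ms.float16)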
@@ -675,23 +698,23 @@ def build_train_network(network, optimizer, loss_fn=None, level='O0', boost_leve
     Build the mixed precision training cell automatically.

     Note:
-        - After using `custom_mixed_precision` or `auto_mixed_precision` for precision conversion, it is not supported
-          to perform the precision conversion again. If `build_train_network` is used to train a converted network,
-          `level` need to be configured to ``O0`` to avoid the duplicated accuracy conversion.
+        After using `custom_mixed_precision` or `auto_mixed_precision` for precision conversion, it is not supported
+        to perform the precision conversion again. If `build_train_network` is used to train a converted network,
+        `level` need to be configured to ``O0`` to avoid the duplicated accuracy conversion.

     Args:
         network (Cell): Definition of the network.
         optimizer (:class:`mindspore.nn.Optimizer`): Define the optimizer to update the Parameter.
-        loss_fn (Union[None, Cell]): Define the loss function. If None, the `network` should have the loss inside.
-            Default: ``None`` .
-        level (str): Supports ['O0', 'O1', 'O2', 'O3', 'auto']. Default: ``'O0'`` .
+        loss_fn (Union[None, Cell], optional): Define the loss function. If None,
+            the `network` should have the loss inside. Default: ``None`` .
+        level (str, optional): Supports ['O0', 'O1', 'O2', 'O3', 'auto']. Default: ``'O0'`` .

             For details on amp level, refer to :func:`mindspore.amp.auto_mixed_precision`.

             Property of `keep_batchnorm_fp32`, `cast_model_type` and `loss_scale_manager` determined by `level`
             setting may be overwritten by settings in `kwargs`.

-        boost_level (str): Option for argument `level` in `mindspore.boost` , level for boost mode
+        boost_level (str, optional): Option for argument `level` in `mindspore.boost` , level for boost mode
             training. Supports ['O0', 'O1', 'O2']. Default: ``'O0'`` .

             - 'O0': Do not change.
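For the reworked docstring, a minimal sketch of how `build_train_network` is typically wired together; the network, loss, and optimizer below are placeholders:

    from mindspore import nn
    from mindspore.amp import build_train_network

    net = nn.Dense(4, 2)   # placeholder network
    loss = nn.MSELoss()    # placeholder loss
    opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)

    # level='O0' leaves dtypes untouched; per the note above, keep 'O0' when the
    # network was already converted by auto_mixed_precision/custom_mixed_precision.
    train_net = build_train_network(net, opt, loss_fn=loss, level='O0')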
@@ -15,6 +15,11 @@
 """Callback related classes and functions."""
 from __future__ import absolute_import

+__all__ = ["Callback", "LossMonitor", "TimeMonitor", "ModelCheckpoint", "FlopsUtilizationCollector",
+           "SummaryCollector", "CheckpointConfig", "RunContext", "LearningRateScheduler", "SummaryLandscape",
+           "History", "LambdaCallback", "ReduceLROnPlateau", "EarlyStopping", "OnRequestExit", "BackupAndRestore",
+           "TrainFaultTolerance"]
+
 from mindspore.train.callback._callback import Callback
 from mindspore.train.callback._callback import CallbackManager as _CallbackManager
 from mindspore.train.callback._callback import InternalCallbackParam as _InternalCallbackParam
@@ -37,8 +42,3 @@ from mindspore.train.callback._on_request_exit import OnRequestExit
 from mindspore.train.callback._backup_and_restore import BackupAndRestore
 from mindspore.train.callback._flops_collector import FlopsUtilizationCollector
 from mindspore.train.callback._train_fault_tolerance import TrainFaultTolerance
-
-__all__ = ["Callback", "LossMonitor", "TimeMonitor", "ModelCheckpoint", "FlopsUtilizationCollector",
-           "SummaryCollector", "CheckpointConfig", "RunContext", "LearningRateScheduler", "SummaryLandscape",
-           "History", "LambdaCallback", "ReduceLROnPlateau", "EarlyStopping", "OnRequestExit", "BackupAndRestore",
-           "TrainFaultTolerance"]
@@ -411,8 +411,6 @@ class CheckpointConfig:
             handle_append_info["epoch_num"] = 0
         if "step_num" in append_info:
             handle_append_info["step_num"] = 0
-        if "random_op" in append_info:
-            handle_append_info["random_op"] = 0
         dict_num = 0
         for element in append_info:
             if not isinstance(element, str) and not isinstance(element, dict):
@@ -588,8 +586,6 @@ class ModelCheckpoint(Callback):
         # save graph (only once)
         if not self._graph_saved:
            graph_file_name = os.path.join(self._directory, self._prefix + '-graph.meta')
-            if os.path.isfile(graph_file_name) and context.get_context("mode") == context.GRAPH_MODE:
-                os.remove(graph_file_name)
             _save_graph(cb_params.train_network, graph_file_name)
             self._graph_saved = True
         self._save_ckpt(cb_params)
@@ -713,12 +709,13 @@ class ModelCheckpoint(Callback):
                 save_checkpoint(network, cur_file, False, self._config.async_save,
                                 self._append_dict, self._config.enc_key, self._config.enc_mode,
                                 crc_check=self._config.crc_check, format=self._config.format,
-                                incremental=self._map_param_inc, choice_func=choice_func)
+                                incremental=self._map_param_inc, choice_func=choice_func,
+                                remove_redundancy=self._config.remove_redundancy)
             else:
                 save_checkpoint(network, cur_file, self._config.integrated_save, self._config.async_save,
                                 self._append_dict, self._config.enc_key, self._config.enc_mode,
                                 crc_check=self._config.crc_check, format=self._config.format,
-                                incremental=self._map_param_inc)
+                                incremental=self._map_param_inc, remove_redundancy=self._config.remove_redundancy)

             self._latest_ckpt_file_name = cur_file

@@ -53,7 +53,7 @@ class FlopsUtilizationCollector(Callback):
     The FlopsUtilizationCollector interface counts the model utilization information MFU
     and the hardware utilization information HFU.
     Currently, the API counts only the forward and backward flops of MatMul,
-    BatchMatMul, FlashAttentionScore, and Conv2D operators.
+    BatchMatMul, flash_attention_score, and Conv2D operators.
     Only used in graph mode with static shape.

     Args:
@@ -404,7 +404,6 @@ class SummaryLandscape:
     def _set_context(device_id):
         """Set context."""
         context.set_context(device_id=device_id)
-        context.set_context(mode=context.GRAPH_MODE)

     def _create_landscape_by_pca(self, epochs, proz, landscape_size, device_ids=None, callback_fn=None, executor=None):
         """Create landscape by PCA."""
@@ -25,8 +25,9 @@ from mindspore.communication import get_rank, get_group_size
 from mindspore import log as logger
 from mindspore.train.serialization import _get_cur_rank_dp
 from mindspore._c_expression import _repair_device, _stop_device, _tft_sem_post, _tft_sem_enable
-from mindspore._c_expression import _rebuild_world_group, _rebuild_sub_group, _finalize_comm
+from mindspore._c_expression import _rebuild_world_group, _rebuild_sub_group, _finalize_comm, _clean_rootinfo
 from mindspore._c_expression import clean_tdt_channel
+from mindspore._c_expression import _pre_launch_send_recv
 from mindspore._c_expression import send_recv, reset_params
 from mindspore._c_expression import CollectiveManager
 from mindspore._c_expression import _get_uce_process_strategy, _get_uce_mem_info
@@ -35,6 +36,7 @@ from mindspore.ops.operations.manually_defined._inner import TensorReport
 import mindspore
 import mindspore.common.dtype as mstype
 from mindspore.parallel._recovery_context import _set_recovery_context
+from mindspore import runtime


 def _get_ckpt_dir(step, ckpt_save_path, is_tmp_file):
@@ -80,7 +82,7 @@ def _save_checkpoint_on_failure(step, save_info, args, cb_ctx):
     append_dict["loss_scale"] = outputs[2]

     ckpt_file = f"ttp_rank_{str(cur_rank)}-{str(cur_epoch_num)}_{str(step_num_in_epoch)}.ckpt"
-    cur_ckpt_dir = _get_ckpt_dir(step, ckpt_save_path, True) + "/rank_" + str(cur_rank)
+    cur_ckpt_dir = os.path.join(_get_ckpt_dir(step, ckpt_save_path, True), "rank_" + str(cur_rank))
     os.makedirs(cur_ckpt_dir, exist_ok=True)
     cur_file = os.path.join(cur_ckpt_dir, ckpt_file)
     save_checkpoint(cb_params.train_network, cur_file,
@@ -110,7 +112,7 @@ def _tft_exit_cb(ctx):

 def _tft_repair_callback(step, need_rebuild, error_ranks, repair_info, args, cb_ctx):
     """ Callback used for TFT repair function."""
-    logger.warning("Enter _tft_repair_callback repair type: {}".format(repair_info["repair_type"]))
+    logger.warning(f"Enter _tft_repair_callback repair type: {repair_info['repair_type']}")
     if (repair_info["repair_type"] in (cb_ctx.tft.RepairType.RT_UCE_HIGHLEVEL.value,
                                        cb_ctx.tft.RepairType.RT_UCE_LOWLEVEL.value)):
         logger.warning("Enter _tft_repair_callback uce REPARI_DEVICE device_id : {}".format(cb_ctx.device_id))
@@ -138,7 +140,7 @@ def _tft_repair_callback(step, need_rebuild, error_ranks, repair_info, args, cb_

 def _tft_clean_callback(is_uce_error, args, ctx):
     """ Callback used for TFT clean function."""
-    logger.warning("Enter _tft_clean_callback")
+    logger.warning(f"Enter _tft_clean_callback, device id:{ctx.device_id}")
     ret = 0
     if is_uce_error:
         _get_uce_mem_info(ctx.device_id)
@@ -154,29 +156,36 @@ def _tft_clean_callback(is_uce_error, args, ctx):
         logger.warning("Enter _tft_clean_callback resume_hccl_comm")
         CollectiveManager.get_instance().resume_hccl_comm()
     logger.warning("Finish _tft_clean_callback, ret: {}".format(ret))
+    if ctx.tft.tft_get_repair_type() == "recover":
+        logger.warning(f"Destroy hcom")
+        _finalize_comm()
+        logger.warning(f"Destroy hcom end")
     return ret


 def _tft_stop_callback(args, cb_ctx):
     """ Callback used for TFT stop function."""
-    logger.warning("Enter _tft_stop_callback device_id: {}".format(cb_ctx.device_id))
+    logger.warning(f"Enter _tft_stop_callback device_id: {cb_ctx.device_id}")
     _stop_device(cb_ctx.device_id)
+    cb_ctx.stop_been_called = True
     if (not cb_ctx.is_uce_rank) and (not cb_ctx._is_params_consistent()): # pylint: disable=W0212
         raise RuntimeError("Can't stop device, because training parameters are left in inconsistent state!")
     cb_ctx.is_uce_rank = False
     if cb_ctx.tft.tft_get_repair_type() == "recover":
         logger.warning(f"Reset limit step")
         cb_ctx.tft.tft_reset_limit_step()
-    logger.info("Finish _tft_stop_callback")
+    logger.warning("Finish _tft_stop_callback")


 def _tft_rebuild_sub_groups(fault_ranks, args, ctx):
     """Callback used for TFT Rebuild Group function."""
-    logger.warning(f"Enter _tft_rebuild_sub_groups, device id: ".format(ctx.device_id))
-    _finalize_comm()
+    logger.warning(f"Enter _tft_rebuild_sub_groups, device id: {ctx.device_id}")
     _rebuild_world_group()
     _rebuild_sub_group()
     _set_recovery_context(is_arf=True)
+    logger.warning(f"try to pre launch send recv before real launch")
+    _pre_launch_send_recv(context.get_context('device_id'))
+    logger.warning(f"Pre launch send recv before real launch end")
     logger.warning("Enter _tft_rebuild_sub_groups ok ")


@@ -299,27 +308,70 @@ class TrainFaultTolerance(Callback):

     def __init__(self, ckpt_save_path=None, **kwargs):
         super(TrainFaultTolerance, self).__init__()
+        logger.info(f"MS_ENABLE_TFT: {os.getenv('MS_ENABLE_TFT', '')}")
+        if self._only_enable_tsp():
+            self.tft = _tft_handler.get_tft()
+            self._check_init()
+            self.tft.tft_register_stream_sync_handler(runtime.synchronize, self)
+            return
         self.save_cb = kwargs.get("ckpt_save_fn", None)
         self.ckpt_save_path = ckpt_save_path
         if self.save_cb is None and self.ckpt_save_path is None:
             raise ValueError("TrainFaultTolerance construct need to set ckpt_save_fn or ckpt_save_path!")
+        self.cb_params = None
+        self.initial_step = kwargs.get("initial_step", 0)
+        self.device_id = context.get_context("device_id")
+        self.cur_step_num = 0
+        self.cur_epoch_num = 0
+        self.clean_unique_id = False
+        # For TREError(Training Result Error) scene, parameter `ckpt_load_fn` must be provided to load checkpoint
+        # from file for resuming training, the `ckpt_load_fn` is a function, prototype of which is:
+        # `def load_checkpoint() -> tuple(dict, bool)`, the return value is a tuple containing 2 values,
+        # i.e. (param_dict, remove_redundancy)
+        self.ckpt_load_func = kwargs.get("ckpt_load_fn", None)
+        if self._only_enable_tre():
+            return
         self.tft = _tft_handler.get_tft()
         self._check_init()
+        if self._only_enable_tre_and_tsp():
+            self.tft.tft_register_stream_sync_handler(runtime.synchronize, self)
+            return
         self.global_step = None
         self.learning_rate = None
         self.has_init_replica = False
         self.is_uce_rank = False
-        self.cb_params = None
-        self.initial_step = kwargs.get("initial_step", 0)
-        self.device_id = context.get_context("device_id")
+        self.stop_been_called = False
+
         self.assign = mindspore.ops.Assign()
         self.g_one = Parameter(Tensor([1], dtype=mstype.int32))
         self.s1 = mindspore.hal.Stream()
-        self.cur_step_num = 0
-        self.cur_epoch_num = 0
         _tft_sem_enable()
         self._tft_register()

+    def _only_enable_tre(self):
+        """Check if only configured MS_ENABLE_TFT='{TRE:1}'"""
+        env_enable = os.getenv("MS_ENABLE_TFT", "")
+        non_tre_flags = ["TTP:1", "UCE:1", "ARF:1"]
+        if any(flag in env_enable for flag in non_tre_flags):
+            return False
+        return "TRE:1" in env_enable
+
+    def _only_enable_tsp(self):
+        """Check if only configured MS_ENABLE_TFT='{TSP:1}'"""
+        env_enable = os.getenv("MS_ENABLE_TFT", "")
+        non_tsp_flags = ["TTP:1", "UCE:1", "ARF:1", "TRE:1"]
+        if any(flag in env_enable for flag in non_tsp_flags):
+            return False
+        return "TSP:1" in env_enable
+
+    def _only_enable_tre_and_tsp(self):
+        """Check if only configured MS_ENABLE_TFT='{TRE:1, TSP:1}'"""
+        env_enable = os.getenv("MS_ENABLE_TFT", "")
+        other_flags = ["TTP:1", "UCE:1", "ARF:1"]
+        if any(flag in env_enable for flag in other_flags):
+            return False
+        return "TRE:1" in env_enable and "TSP:1" in env_enable
+
     def _check_init(self):
         """Check if the mindio-ttp had inited"""
         if self.tft is None:
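The three `_only_enable_*` helpers added above are plain substring checks against `MS_ENABLE_TFT`, whose brace format is quoted in their docstrings. A standalone sketch of the same check, with a hypothetical value:

    import os

    # Hypothetical setting, mirroring the format quoted in the docstrings above.
    os.environ["MS_ENABLE_TFT"] = "{TRE:1,TSP:1}"

    env_enable = os.getenv("MS_ENABLE_TFT", "")
    other_flags = ["TTP:1", "UCE:1", "ARF:1"]
    only_tre_and_tsp = ("TRE:1" in env_enable and "TSP:1" in env_enable
                        and not any(flag in env_enable for flag in other_flags))
    print(only_tre_and_tsp)  # True for the value above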
@@ -411,6 +463,8 @@ class TrainFaultTolerance(Callback):
         self.tft.tft_register_clean_handler(_tft_clean_callback, self)
         self.tft.tft_register_repair_handler(_tft_repair_callback, self)
         self.tft.tft_register_rebuild_group_handler(_tft_rebuild_sub_groups, self)
+        if "TSP:1" in os.getenv("MS_ENABLE_TFT", ""):
+            self.tft.tft_register_stream_sync_handler(runtime.synchronize, self)

     def _reset_acc_grads(self):
         accu_grad_params = map(lambda e: e[1],
@@ -420,6 +474,12 @@ class TrainFaultTolerance(Callback):
         if reset_params(accu_grad_list) != 0:
             raise ValueError("Call reset_params failed.")

+    def _clear_unique_id(self):
+        """Clean unique id on first train step end"""
+        if not self.clean_unique_id and ("ARF:1" in os.getenv("MS_ENABLE_TFT", "")):
+            _clean_rootinfo()
+            self.clean_unique_id = True
+
     def on_train_step_end(self, run_context):
         """
         Report status to MindIO TFT after every step finished.
@@ -428,13 +488,21 @@ class TrainFaultTolerance(Callback):
             run_context (RunContext): Context of the train running. Refer to
                 :class:`mindspore.train.RunContext` for detail.
         """
-        if self.has_init_replica is False:
-            self.has_init_replica = True
-            self._set_tft_optimizer_replica(run_context)
+        if self._only_enable_tre():
+            return
+
         cb_params = run_context.original_args()
         logger.info("START Set optimizer finish step status to TFT. step: {}".format(cb_params.cur_step_num))
         self.cur_step_num = cb_params.cur_step_num
         self.cur_epoch_num = cb_params.cur_epoch_num
+        if self._only_enable_tsp() or self._only_enable_tre_and_tsp():
+            logger.info("Go into tft_pause_train.")
+            self.tft.tft_pause_train(self.cur_step_num)
+            return
+
+        if self.has_init_replica is False:
+            self.has_init_replica = True
+            self._set_tft_optimizer_replica(run_context)
         if cb_params.optimizer is not None:
             self.global_step = cb_params.optimizer.global_step.clone()
             self.assign(cb_params.optimizer.tft_g_one_flag, self.g_one)
@@ -444,7 +512,13 @@ class TrainFaultTolerance(Callback):
         else:
             raise ValueError("TFT feature need optimizer or network's optimizer!")
         self.tft.tft_end_updating_os(cb_params.cur_step_num + self.initial_step)
+        if cb_params.is_arf:
+            self.clean_unique_id = False
+        self._clear_unique_id()
         logger.info("END Set optimizer finish step status to TFT.")
+        if "TSP:1" in os.getenv("MS_ENABLE_TFT", ""):
+            logger.info("Go into tft_pause_train.")
+            self.tft.tft_pause_train(self.cur_step_num)

     def on_train_begin(self, run_context):
         """
@@ -454,7 +528,12 @@ class TrainFaultTolerance(Callback):
             run_context (RunContext): Context of the train running. Refer to
                 :class:`mindspore.train.RunContext` for detail.
         """
+        if self._only_enable_tsp():
+            return
         cb_params = run_context.original_args()
+        if self._only_enable_tre():
+            self.cb_params = cb_params
+            return
         sink_size = cb_params.get("sink_size", 0)
         if sink_size > 1:
             raise ValueError("TFT feature doesn't support sink_size > 1.")
@@ -470,4 +549,6 @@ class TrainFaultTolerance(Callback):
             run_context (RunContext): Context of the train running. Refer to
                 :class:`mindspore.train.RunContext` for detail.
         """
+        if self._only_enable_tre() or self._only_enable_tsp() or self._only_enable_tre_and_tsp():
+            return
         _tft_handler.unregister_tft()
@@ -18,7 +18,7 @@ import mindspore.ops as ops
 from mindspore import context
 from mindspore.common.dtype import pytype_to_dtype
 from mindspore.common.api import jit
-from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes
+from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes, enable_data_broadcast
 from mindspore.train.dataset_helper import _has_dynamic_shape, _check_inputs
 import mindspore.dataset as ds
 from mindspore._c_expression import _set_dataset_mode_config
@@ -41,6 +41,15 @@ def _init_sink_dataset(dataset, sink_size, input_signature, create_info):
     is_info_queue = (create_info and sink_size == 1 and dataset_size != 1 and
                      input_signature is None and not dynamic_shape and
                      context.get_context('device_target') == 'Ascend')
+
+    # Don't enable dynamic shape(multi-subgraph) feature in pp/data_broadcast mode,
+    # otherwise get_data_info will stuck since some rank do not consume data.
+    use_pipeline_parallel = (context.get_auto_parallel_context("pipeline_stages") > 1)
+    data_broadcast = enable_data_broadcast()
+
+    if use_pipeline_parallel or data_broadcast:
+        is_info_queue = False
+
     transfer_dataset = _exec_datagraph(dataset, sink_size, create_data_info_queue=is_info_queue)
     dataset.__transfer_dataset__ = transfer_dataset

@@ -214,7 +223,7 @@ def data_sink(fn, dataset, sink_size=1, jit_config=None, input_signature=None):
     loop = sink_size
     create_info = True
     if jit_config is None:
-        create_info = (loop == 1)
+        create_info = loop == 1
         loop = 1
     ori_next_op, is_info_queue = _init_sink_dataset(dataset, loop, input_signature, create_info)

@@ -564,6 +564,15 @@ class _DatasetIter:
             self.sink_size = dataset.__loop_size__
         create_data_info_queue = (
             sink_size == 1 and self.sink_count == 1 and dataset.get_dataset_size() != 1)
+
+        # Don't enable dynamic shape(multi-subgraph) feature in pp/data_broadcast mode,
+        # otherwise get_data_info will stuck since some rank do not consume data.
+        use_pipeline_parallel = (context.get_auto_parallel_context("pipeline_stages") > 1)
+        data_broadcast = enable_data_broadcast()
+
+        if use_pipeline_parallel or data_broadcast:
+            create_data_info_queue = False
+
         dataset.__transfer_dataset__ = _exec_datagraph(dataset, self.sink_size,
                                                        create_data_info_queue=create_data_info_queue)
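Both sink-mode guards above feed the same user-facing flow. For reference, a hedged, illustrative sketch of `data_sink` (signature taken from the data_sink.py hunk header); the dataset and step function are placeholders, and sinking normally targets a device-side queue rather than a host loop:

    import numpy as np
    import mindspore.dataset as ds
    from mindspore.train.data_sink import data_sink

    def step_fn(x, y):
        # placeholder "training step": any computation over the sunk batch
        return (x * y).sum()

    dataset = ds.NumpySlicesDataset({"x": np.ones((8, 2), np.float32),
                                     "y": np.ones((8, 2), np.float32)},
                                    shuffle=False).batch(4)

    sink_step = data_sink(step_fn, dataset, sink_size=1)
    out = sink_step()  # one call drains sink_size batch(es) from the sink queue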