mindspore 2.5.0-cp311-cp311-win_amd64.whl → 2.6.0-cp311-cp311-win_amd64.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Note: this release of mindspore has been flagged as potentially problematic.
- mindspore/.commit_id +1 -1
- mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
- mindspore/Newtonsoft.Json.dll +0 -0
- mindspore/__init__.py +6 -4
- mindspore/_c_dataengine.cp311-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp311-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp311-win_amd64.pyd +0 -0
- mindspore/_check_jit_forbidden_api.py +3 -0
- mindspore/_checkparam.py +3 -33
- mindspore/_deprecated/__init__.py +17 -0
- mindspore/_deprecated/jit.py +198 -0
- mindspore/_extends/builtin_operations.py +1 -1
- mindspore/_extends/parse/__init__.py +6 -7
- mindspore/_extends/parse/compile_config.py +19 -0
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +22 -3
- mindspore/_extends/parse/jit_fallback_modules/__init__.py +0 -0
- mindspore/_extends/parse/jit_fallback_modules/check_utils.py +123 -0
- mindspore/_extends/parse/jit_fallback_modules/third_party_modules.py +50 -0
- mindspore/_extends/parse/parser.py +25 -194
- mindspore/_extends/parse/resources.py +1 -5
- mindspore/_extends/parse/standard_method.py +109 -75
- mindspore/_extends/pijit/__init__.py +2 -2
- mindspore/_extends/pijit/pijit_func_white_list.py +16 -11
- mindspore/_extends/pijit/tensor_func_list.py +27 -0
- mindspore/_extends/utils.py +1 -1
- mindspore/amp.py +4 -4
- mindspore/atlprov.dll +0 -0
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/__init__.py +2 -2
- mindspore/boost/base.py +3 -7
- mindspore/boost/boost_cell_wrapper.py +2 -2
- mindspore/c1.dll +0 -0
- mindspore/c1xx.dll +0 -0
- mindspore/c2.dll +0 -0
- mindspore/common/__init__.py +4 -3
- mindspore/common/_grad_function.py +56 -0
- mindspore/common/_pijit_context.py +14 -5
- mindspore/common/_register_for_tensor.py +1 -1
- mindspore/common/_stub_tensor.py +5 -10
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +2014 -3386
- mindspore/common/api.py +386 -355
- mindspore/common/auto_dynamic_shape.py +41 -44
- mindspore/common/dtype.py +5 -2
- mindspore/common/dump.py +7 -5
- mindspore/common/file_system.py +3 -0
- mindspore/common/generator.py +3 -0
- mindspore/common/hook_handle.py +5 -3
- mindspore/common/initializer.py +10 -6
- mindspore/common/jit_begin_end.py +94 -0
- mindspore/common/jit_config.py +6 -1
- mindspore/common/jit_context.py +76 -0
- mindspore/common/jit_trace.py +378 -0
- mindspore/common/lazy_inline.py +2 -2
- mindspore/common/mutable.py +5 -4
- mindspore/common/parameter.py +106 -39
- mindspore/common/seed.py +2 -2
- mindspore/common/sparse_tensor.py +23 -17
- mindspore/common/tensor.py +332 -714
- mindspore/communication/__init__.py +7 -5
- mindspore/communication/_comm_helper.py +47 -2
- mindspore/communication/comm_func.py +70 -53
- mindspore/communication/management.py +83 -17
- mindspore/context.py +228 -571
- mindspore/dataset/__init__.py +44 -20
- mindspore/dataset/audio/__init__.py +2 -8
- mindspore/dataset/audio/transforms.py +3 -17
- mindspore/dataset/core/config.py +3 -3
- mindspore/dataset/engine/cache_client.py +1 -1
- mindspore/dataset/engine/datasets.py +102 -120
- mindspore/dataset/engine/datasets_audio.py +22 -22
- mindspore/dataset/engine/datasets_standard_format.py +43 -24
- mindspore/dataset/engine/datasets_text.py +78 -85
- mindspore/dataset/engine/datasets_user_defined.py +109 -77
- mindspore/dataset/engine/datasets_vision.py +111 -108
- mindspore/dataset/engine/iterators.py +5 -3
- mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +1 -1
- mindspore/dataset/engine/samplers.py +279 -57
- mindspore/dataset/engine/serializer_deserializer.py +2 -1
- mindspore/dataset/engine/validators.py +10 -0
- mindspore/dataset/text/__init__.py +7 -6
- mindspore/dataset/text/transforms.py +6 -5
- mindspore/dataset/text/utils.py +3 -3
- mindspore/dataset/transforms/__init__.py +0 -9
- mindspore/dataset/transforms/transforms.py +3 -3
- mindspore/dataset/utils/browse_dataset.py +1 -1
- mindspore/dataset/vision/__init__.py +2 -9
- mindspore/dataset/vision/transforms.py +202 -158
- mindspore/dataset/vision/utils.py +7 -5
- mindspore/device_context/ascend/op_debug.py +60 -1
- mindspore/device_context/ascend/op_tuning.py +0 -4
- mindspore/device_manager.py +39 -3
- mindspore/dnnl.dll +0 -0
- mindspore/dpcmi.dll +0 -0
- mindspore/experimental/es/embedding_service.py +35 -27
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +0 -2
- mindspore/experimental/map_parameter.py +4 -4
- mindspore/experimental/optim/adadelta.py +22 -26
- mindspore/experimental/optim/adagrad.py +4 -4
- mindspore/experimental/optim/adam.py +4 -0
- mindspore/experimental/optim/adamax.py +4 -4
- mindspore/experimental/optim/adamw.py +4 -0
- mindspore/experimental/optim/asgd.py +1 -1
- mindspore/experimental/optim/lr_scheduler.py +40 -22
- mindspore/experimental/optim/radam.py +5 -5
- mindspore/experimental/optim/rprop.py +1 -1
- mindspore/experimental/optim/sgd.py +1 -1
- mindspore/hal/contiguous_tensors_handle.py +6 -10
- mindspore/hal/device.py +55 -81
- mindspore/hal/event.py +38 -55
- mindspore/hal/memory.py +115 -147
- mindspore/hal/stream.py +81 -125
- mindspore/include/dataset/constants.h +7 -4
- mindspore/include/dataset/execute.h +2 -2
- mindspore/jpeg62.dll +0 -0
- mindspore/log.py +40 -2
- mindspore/mindrecord/__init__.py +20 -7
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/{mindspore_backend.dll → mindspore_ops_host.dll} +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mint/__init__.py +133 -702
- mindspore/mint/distributed/__init__.py +5 -1
- mindspore/mint/distributed/distributed.py +198 -113
- mindspore/mint/linalg/__init__.py +2 -0
- mindspore/mint/nn/__init__.py +280 -18
- mindspore/mint/nn/functional.py +282 -64
- mindspore/mint/nn/layer/__init__.py +4 -0
- mindspore/mint/nn/layer/_functions.py +7 -3
- mindspore/mint/nn/layer/activation.py +120 -13
- mindspore/mint/nn/layer/conv.py +234 -28
- mindspore/mint/nn/layer/normalization.py +15 -16
- mindspore/mint/nn/layer/padding.py +1 -1
- mindspore/mint/nn/layer/pooling.py +66 -1
- mindspore/mint/optim/__init__.py +2 -1
- mindspore/mint/optim/sgd.py +171 -0
- mindspore/msobj140.dll +0 -0
- mindspore/mspdb140.dll +0 -0
- mindspore/mspdbcore.dll +0 -0
- mindspore/mspdbst.dll +0 -0
- mindspore/mspft140.dll +0 -0
- mindspore/msvcdis140.dll +0 -0
- mindspore/msvcp140_1.dll +0 -0
- mindspore/msvcp140_2.dll +0 -0
- mindspore/msvcp140_atomic_wait.dll +0 -0
- mindspore/msvcp140_codecvt_ids.dll +0 -0
- mindspore/nn/__init__.py +4 -1
- mindspore/nn/cell.py +1253 -179
- mindspore/nn/layer/activation.py +23 -21
- mindspore/nn/layer/basic.py +22 -16
- mindspore/nn/layer/container.py +1 -1
- mindspore/nn/layer/conv.py +53 -42
- mindspore/nn/layer/embedding.py +9 -8
- mindspore/nn/layer/normalization.py +48 -42
- mindspore/nn/layer/pooling.py +75 -31
- mindspore/nn/layer/transformer.py +11 -10
- mindspore/nn/learning_rate_schedule.py +4 -2
- mindspore/nn/loss/loss.py +27 -19
- mindspore/nn/optim/ada_grad.py +6 -5
- mindspore/nn/optim/adadelta.py +9 -7
- mindspore/nn/optim/adafactor.py +1 -1
- mindspore/nn/optim/adam.py +18 -14
- mindspore/nn/optim/adamax.py +8 -7
- mindspore/nn/optim/adasum.py +5 -5
- mindspore/nn/optim/asgd.py +3 -1
- mindspore/nn/optim/ftrl.py +11 -9
- mindspore/nn/optim/lamb.py +1 -1
- mindspore/nn/optim/lazyadam.py +12 -10
- mindspore/nn/optim/momentum.py +7 -6
- mindspore/nn/optim/optimizer.py +2 -2
- mindspore/nn/optim/proximal_ada_grad.py +12 -10
- mindspore/nn/optim/rmsprop.py +13 -12
- mindspore/nn/optim/rprop.py +9 -7
- mindspore/nn/optim/sgd.py +9 -6
- mindspore/nn/optim/tft_wrapper.py +5 -2
- mindspore/nn/probability/bijector/bijector.py +17 -11
- mindspore/nn/probability/bijector/gumbel_cdf.py +5 -5
- mindspore/nn/probability/bijector/invert.py +2 -2
- mindspore/nn/probability/bijector/scalar_affine.py +3 -3
- mindspore/nn/probability/bijector/softplus.py +3 -2
- mindspore/nn/probability/distribution/beta.py +3 -3
- mindspore/nn/probability/distribution/categorical.py +1 -1
- mindspore/nn/probability/distribution/cauchy.py +4 -2
- mindspore/nn/probability/distribution/exponential.py +6 -7
- mindspore/nn/probability/distribution/gamma.py +2 -2
- mindspore/nn/probability/distribution/gumbel.py +2 -2
- mindspore/nn/probability/distribution/half_normal.py +5 -3
- mindspore/nn/probability/distribution/logistic.py +5 -3
- mindspore/nn/probability/distribution/poisson.py +1 -1
- mindspore/nn/probability/distribution/uniform.py +5 -3
- mindspore/nn/reinforcement/_tensors_queue.py +1 -1
- mindspore/nn/reinforcement/tensor_array.py +1 -1
- mindspore/nn/wrap/__init__.py +6 -6
- mindspore/nn/wrap/cell_wrapper.py +178 -117
- mindspore/nn/wrap/grad_reducer.py +45 -36
- mindspore/nn/wrap/loss_scale.py +3 -3
- mindspore/numpy/array_creations.py +3 -3
- mindspore/numpy/array_ops.py +1 -1
- mindspore/numpy/utils.py +1 -2
- mindspore/numpy/utils_const.py +1 -2
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/__init__.py +3 -2
- mindspore/ops/_grad_experimental/grad_comm_ops.py +18 -3
- mindspore/ops/_grad_experimental/grad_debug_ops.py +8 -1
- mindspore/ops/_grad_experimental/taylor_rule.py +29 -0
- mindspore/ops/_register_for_op.py +0 -11
- mindspore/{ops_generate → ops/_utils}/arg_dtype_cast.py +123 -4
- mindspore/{ops_generate → ops/_utils}/arg_handler.py +3 -4
- mindspore/ops/_vmap/vmap_array_ops.py +32 -6
- mindspore/ops/_vmap/vmap_grad_nn_ops.py +2 -1
- mindspore/ops/_vmap/vmap_math_ops.py +4 -7
- mindspore/ops/_vmap/vmap_nn_ops.py +9 -8
- mindspore/ops/auto_generate/__init__.py +4 -3
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +127 -52
- mindspore/ops/auto_generate/gen_extend_func.py +286 -208
- mindspore/ops/auto_generate/gen_ops_def.py +2783 -2335
- mindspore/ops/auto_generate/gen_ops_prim.py +8992 -2686
- mindspore/ops/auto_generate/pyboost_inner_prim.py +106 -76
- mindspore/ops/composite/__init__.py +2 -1
- mindspore/ops/composite/base.py +19 -24
- mindspore/ops/composite/math_ops.py +6 -16
- mindspore/ops/composite/multitype_ops/__init__.py +5 -2
- mindspore/ops/composite/multitype_ops/_compile_utils.py +4 -5
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -2
- mindspore/ops/composite/multitype_ops/add_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/bitwise_and_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/bitwise_or_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/bitwise_xor_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/div_impl.py +6 -4
- mindspore/ops/composite/multitype_ops/equal_impl.py +4 -3
- mindspore/ops/composite/multitype_ops/floordiv_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/getitem_impl.py +3 -2
- mindspore/ops/composite/multitype_ops/greater_equal_impl.py +4 -3
- mindspore/ops/composite/multitype_ops/greater_impl.py +4 -3
- mindspore/ops/composite/multitype_ops/in_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/invert_impl.py +50 -0
- mindspore/ops/composite/multitype_ops/left_shift_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/less_equal_impl.py +4 -3
- mindspore/ops/composite/multitype_ops/less_impl.py +4 -3
- mindspore/ops/composite/multitype_ops/logic_not_impl.py +3 -2
- mindspore/ops/composite/multitype_ops/logical_and_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/logical_or_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/mod_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/mul_impl.py +3 -2
- mindspore/ops/composite/multitype_ops/negative_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/not_equal_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/ones_like_impl.py +18 -0
- mindspore/ops/composite/multitype_ops/pow_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/right_shift_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/setitem_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/sub_impl.py +2 -1
- mindspore/ops/function/__init__.py +28 -2
- mindspore/ops/function/_add_attr_func.py +58 -0
- mindspore/ops/function/array_func.py +1631 -2347
- mindspore/ops/function/clip_func.py +38 -45
- mindspore/ops/function/debug_func.py +36 -44
- mindspore/ops/function/grad/__init__.py +1 -0
- mindspore/ops/function/grad/grad_func.py +104 -71
- mindspore/ops/function/image_func.py +1 -1
- mindspore/ops/function/linalg_func.py +46 -78
- mindspore/ops/function/math_func.py +3024 -3855
- mindspore/ops/function/nn_func.py +678 -274
- mindspore/ops/function/other_func.py +159 -1
- mindspore/ops/function/parameter_func.py +17 -30
- mindspore/ops/function/random_func.py +216 -361
- mindspore/ops/function/reshard_func.py +4 -70
- mindspore/ops/function/sparse_func.py +3 -3
- mindspore/ops/function/sparse_unary_func.py +5 -5
- mindspore/ops/function/spectral_func.py +25 -58
- mindspore/ops/function/vmap_func.py +26 -18
- mindspore/ops/functional.py +8 -5
- mindspore/ops/functional_overload.py +655 -4
- mindspore/ops/op_info_register.py +32 -244
- mindspore/ops/operations/__init__.py +21 -14
- mindspore/ops/operations/_custom_ops_utils.py +235 -0
- mindspore/ops/operations/_grad_ops.py +1 -10
- mindspore/ops/operations/_inner_ops.py +5 -76
- mindspore/ops/operations/_ms_kernel.py +4 -10
- mindspore/ops/operations/_rl_inner_ops.py +1 -1
- mindspore/ops/operations/_scalar_ops.py +3 -2
- mindspore/ops/operations/_sequence_ops.py +1 -1
- mindspore/ops/operations/_tensor_array.py +1 -1
- mindspore/ops/operations/array_ops.py +39 -24
- mindspore/ops/operations/comm_ops.py +150 -107
- mindspore/ops/operations/custom_ops.py +287 -32
- mindspore/ops/operations/debug_ops.py +119 -16
- mindspore/ops/operations/inner_ops.py +1 -1
- mindspore/ops/operations/linalg_ops.py +1 -58
- mindspore/ops/operations/manually_defined/_inner.py +1 -1
- mindspore/ops/operations/manually_defined/ops_def.py +746 -79
- mindspore/ops/operations/math_ops.py +21 -18
- mindspore/ops/operations/nn_ops.py +67 -224
- mindspore/ops/operations/other_ops.py +62 -9
- mindspore/ops/operations/random_ops.py +13 -7
- mindspore/ops/operations/reshard_ops.py +1 -1
- mindspore/ops/operations/sparse_ops.py +2 -2
- mindspore/ops/primitive.py +43 -32
- mindspore/ops/tensor_method.py +243 -17
- mindspore/ops_generate/__init__.py +0 -5
- mindspore/ops_generate/aclnn/__init__.py +0 -0
- mindspore/ops_generate/{aclnn_kernel_register_auto_cc_generator.py → aclnn/aclnn_kernel_register_auto_cc_generator.py} +43 -18
- mindspore/ops_generate/{gen_aclnn_implement.py → aclnn/gen_aclnn_implement.py} +49 -51
- mindspore/ops_generate/api/__init__.py +0 -0
- mindspore/ops_generate/{add_tensor_docs_generator.py → api/add_tensor_docs_generator.py} +9 -7
- mindspore/ops_generate/{cpp_create_prim_instance_helper_generator.py → api/cpp_create_prim_instance_helper_generator.py} +6 -9
- mindspore/ops_generate/{functional_map_cpp_generator.py → api/functional_map_cpp_generator.py} +25 -12
- mindspore/ops_generate/{functional_overload_py_generator.py → api/functional_overload_py_generator.py} +8 -6
- mindspore/ops_generate/{functions_cc_generator.py → api/functions_cc_generator.py} +14 -10
- mindspore/ops_generate/api/gen_api.py +103 -0
- mindspore/ops_generate/{op_api_proto.py → api/op_api_proto.py} +98 -69
- mindspore/ops_generate/{tensor_func_reg_cpp_generator.py → api/tensor_func_reg_cpp_generator.py} +82 -43
- mindspore/ops_generate/common/__init__.py +0 -0
- mindspore/ops_generate/common/gen_constants.py +91 -0
- mindspore/ops_generate/{gen_utils.py → common/gen_utils.py} +72 -19
- mindspore/ops_generate/{op_proto.py → common/op_proto.py} +64 -1
- mindspore/ops_generate/{template.py → common/template.py} +96 -84
- mindspore/ops_generate/gen_ops.py +23 -325
- mindspore/ops_generate/op_def/__init__.py +0 -0
- mindspore/ops_generate/op_def/gen_op_def.py +90 -0
- mindspore/ops_generate/{lite_ops_cpp_generator.py → op_def/lite_ops_cpp_generator.py} +47 -11
- mindspore/ops_generate/{ops_def_cc_generator.py → op_def/ops_def_cc_generator.py} +18 -10
- mindspore/ops_generate/{ops_def_h_generator.py → op_def/ops_def_h_generator.py} +5 -5
- mindspore/ops_generate/{ops_name_h_generator.py → op_def/ops_name_h_generator.py} +30 -15
- mindspore/ops_generate/op_def/ops_primitive_h_generator.py +125 -0
- mindspore/ops_generate/op_def_py/__init__.py +0 -0
- mindspore/ops_generate/op_def_py/gen_op_def_py.py +47 -0
- mindspore/ops_generate/{op_def_py_generator.py → op_def_py/op_def_py_generator.py} +6 -5
- mindspore/ops_generate/{op_prim_py_generator.py → op_def_py/op_prim_py_generator.py} +24 -15
- mindspore/ops_generate/pyboost/__init__.py +0 -0
- mindspore/ops_generate/{auto_grad_impl_cc_generator.py → pyboost/auto_grad_impl_cc_generator.py} +11 -7
- mindspore/ops_generate/{auto_grad_reg_cc_generator.py → pyboost/auto_grad_reg_cc_generator.py} +7 -7
- mindspore/ops_generate/{gen_pyboost_func.py → pyboost/gen_pyboost_func.py} +40 -16
- mindspore/ops_generate/{op_template_parser.py → pyboost/op_template_parser.py} +105 -24
- mindspore/ops_generate/{pyboost_functions_cpp_generator.py → pyboost/pyboost_functions_cpp_generator.py} +55 -18
- mindspore/ops_generate/{pyboost_functions_h_generator.py → pyboost/pyboost_functions_h_generator.py} +42 -10
- mindspore/ops_generate/{pyboost_functions_py_generator.py → pyboost/pyboost_functions_py_generator.py} +6 -6
- mindspore/ops_generate/{pyboost_grad_function_cpp_generator.py → pyboost/pyboost_grad_function_cpp_generator.py} +11 -10
- mindspore/ops_generate/{pyboost_inner_prim_generator.py → pyboost/pyboost_inner_prim_generator.py} +8 -7
- mindspore/ops_generate/{pyboost_native_grad_functions_generator.py → pyboost/pyboost_native_grad_functions_generator.py} +14 -10
- mindspore/ops_generate/{pyboost_op_cpp_code_generator.py → pyboost/pyboost_op_cpp_code_generator.py} +140 -53
- mindspore/ops_generate/{pyboost_overload_functions_cpp_generator.py → pyboost/pyboost_overload_functions_cpp_generator.py} +28 -15
- mindspore/ops_generate/{pyboost_utils.py → pyboost/pyboost_utils.py} +88 -4
- mindspore/ops_generate/resources/__init__.py +0 -0
- mindspore/ops_generate/resources/resource_list.py +30 -0
- mindspore/ops_generate/resources/resource_loader.py +36 -0
- mindspore/ops_generate/resources/resource_manager.py +64 -0
- mindspore/ops_generate/resources/yaml_loader.py +88 -0
- mindspore/ops_generate/tensor_py_cc_generator.py +122 -0
- mindspore/parallel/__init__.py +6 -2
- mindspore/parallel/_auto_parallel_context.py +140 -12
- mindspore/parallel/_cell_wrapper.py +132 -15
- mindspore/parallel/_parallel_serialization.py +95 -4
- mindspore/parallel/_ps_context.py +1 -1
- mindspore/parallel/_recovery_context.py +7 -2
- mindspore/parallel/_tensor.py +142 -18
- mindspore/parallel/_utils.py +198 -25
- mindspore/parallel/algo_parameter_config.py +3 -3
- mindspore/parallel/auto_parallel.py +732 -0
- mindspore/parallel/checkpoint_convert.py +159 -0
- mindspore/parallel/checkpoint_transform.py +658 -37
- mindspore/parallel/cluster/process_entity/_api.py +151 -19
- mindspore/parallel/cluster/run.py +1 -1
- mindspore/parallel/function/__init__.py +24 -0
- mindspore/parallel/function/reshard_func.py +258 -0
- mindspore/parallel/nn/__init__.py +25 -0
- mindspore/parallel/nn/parallel_cell_wrapper.py +263 -0
- mindspore/parallel/nn/parallel_grad_reducer.py +169 -0
- mindspore/parallel/parameter_broadcast.py +24 -13
- mindspore/parallel/shard.py +137 -62
- mindspore/parallel/transform_safetensors.py +288 -95
- mindspore/pgodb140.dll +0 -0
- mindspore/pgort140.dll +0 -0
- mindspore/profiler/__init__.py +9 -5
- mindspore/profiler/analysis/parser/ascend_cann_parser.py +6 -2
- mindspore/profiler/analysis/parser/ms_framework_parser.py +4 -4
- mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +7 -4
- mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +25 -0
- mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +241 -86
- mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +41 -2
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +33 -35
- mindspore/profiler/analysis/viewer/ascend_memory_viewer.py +7 -0
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +8 -3
- mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +141 -30
- mindspore/profiler/analysis/viewer/ms_dataset_viewer.py +5 -6
- mindspore/profiler/common/ascend_msprof_exporter.py +5 -4
- mindspore/profiler/common/constant.py +12 -0
- mindspore/profiler/common/msprof_cmd_tool.py +42 -23
- mindspore/profiler/common/path_manager.py +24 -0
- mindspore/profiler/common/profiler_context.py +26 -2
- mindspore/profiler/common/profiler_meta_data.py +74 -0
- mindspore/profiler/common/profiler_parameters.py +59 -18
- mindspore/profiler/common/profiler_path_manager.py +66 -7
- mindspore/profiler/dynamic_profiler.py +112 -79
- mindspore/profiler/envprofiler.py +26 -1
- mindspore/profiler/experimental_config.py +197 -0
- mindspore/profiler/mstx.py +57 -14
- mindspore/profiler/platform/npu_profiler.py +33 -7
- mindspore/profiler/profiler.py +541 -45
- mindspore/profiler/profiler_action_controller.py +1 -1
- mindspore/profiler/profiler_interface.py +4 -0
- mindspore/profiler/schedule.py +57 -22
- mindspore/rewrite/api/node.py +15 -13
- mindspore/rewrite/api/symbol_tree.py +1 -1
- mindspore/run_check/_check_version.py +25 -14
- mindspore/run_check/run_check.py +1 -1
- mindspore/runtime/__init__.py +2 -2
- mindspore/runtime/executor.py +40 -11
- mindspore/runtime/memory.py +37 -13
- mindspore/safeguard/rewrite_obfuscation.py +12 -9
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tbbmalloc.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/__init__.py +8 -8
- mindspore/train/_utils.py +43 -9
- mindspore/train/amp.py +1 -1
- mindspore/train/callback/__init__.py +2 -2
- mindspore/train/callback/_callback.py +2 -16
- mindspore/train/callback/_checkpoint.py +24 -40
- mindspore/train/callback/_cluster_monitor.py +14 -18
- mindspore/train/callback/_flops_collector.py +2 -3
- mindspore/train/callback/_history.py +7 -4
- mindspore/train/callback/_lambda_callback.py +2 -2
- mindspore/train/callback/_landscape.py +0 -3
- mindspore/train/callback/_loss_monitor.py +2 -1
- mindspore/train/callback/_on_request_exit.py +6 -5
- mindspore/train/callback/_reduce_lr_on_plateau.py +11 -6
- mindspore/train/callback/_summary_collector.py +8 -13
- mindspore/train/callback/_time_monitor.py +2 -1
- mindspore/train/callback/{_tft_register.py → _train_fault_tolerance.py} +204 -105
- mindspore/train/data_sink.py +25 -2
- mindspore/train/dataset_helper.py +4 -5
- mindspore/train/loss_scale_manager.py +8 -7
- mindspore/train/metrics/accuracy.py +3 -3
- mindspore/train/metrics/confusion_matrix.py +9 -9
- mindspore/train/metrics/error.py +3 -3
- mindspore/train/metrics/hausdorff_distance.py +4 -4
- mindspore/train/metrics/mean_surface_distance.py +3 -3
- mindspore/train/metrics/metric.py +0 -12
- mindspore/train/metrics/occlusion_sensitivity.py +4 -2
- mindspore/train/metrics/precision.py +8 -6
- mindspore/train/metrics/recall.py +9 -9
- mindspore/train/metrics/root_mean_square_surface_distance.py +2 -2
- mindspore/train/mind_ir_pb2.py +19 -12
- mindspore/train/model.py +262 -127
- mindspore/train/serialization.py +246 -988
- mindspore/train/summary/_summary_adapter.py +2 -2
- mindspore/train/summary/summary_record.py +1 -1
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +3 -2
- mindspore/utils/dryrun.py +4 -2
- mindspore/utils/hooks.py +81 -0
- mindspore/utils/runtime_execution_order_check.py +2 -0
- mindspore/utils/utils.py +138 -4
- mindspore/vcmeta.dll +0 -0
- mindspore/vcruntime140.dll +0 -0
- mindspore/vcruntime140_1.dll +0 -0
- mindspore/version.py +1 -1
- {mindspore-2.5.0.dist-info → mindspore-2.6.0.dist-info}/METADATA +2 -1
- {mindspore-2.5.0.dist-info → mindspore-2.6.0.dist-info}/RECORD +485 -440
- mindspore/_install_custom.py +0 -43
- mindspore/common/_register_for_adapter.py +0 -74
- mindspore/ops/auto_generate/gen_arg_dtype_cast.py +0 -252
- mindspore/ops/auto_generate/gen_arg_handler.py +0 -136
- mindspore/ops/operations/_opaque_predicate_registry.py +0 -41
- mindspore/ops_generate/gen_constants.py +0 -190
- mindspore/ops_generate/gen_ops_inner_prim.py +0 -131
- mindspore/ops_generate/ops_primitive_h_generator.py +0 -81
- /mindspore/ops_generate/{base_generator.py → common/base_generator.py} +0 -0
- {mindspore-2.5.0.dist-info → mindspore-2.6.0.dist-info}/WHEEL +0 -0
- {mindspore-2.5.0.dist-info → mindspore-2.6.0.dist-info}/entry_points.txt +0 -0
- {mindspore-2.5.0.dist-info → mindspore-2.6.0.dist-info}/top_level.txt +0 -0
mindspore/parallel/cluster/process_entity/_api.py

@@ -19,15 +19,18 @@ import sys
 import signal
 import subprocess
 import socket
+import psutil
 import mindspore.log as logger
-from ._utils import _generate_cmd_args_list, _generate_cmd_args_list_with_core, _generate_url
-
+from ._utils import _generate_cmd_args_list, _generate_cmd_args_list_with_core, _generate_url, \
+    _is_local_ip, _convert_addr_to_ip, _send_scale_num, _get_local_ip
+

 class _Node:
     """
     Base class for dynamic networking nodes.

     """
+
     def __init__(self, worker_num, sched_host, sched_port, timeout, args_list, output_file, tail_worker_log,
                  join, is_simulation):
         self.worker_num = worker_num
@@ -40,24 +43,26 @@ class _Node:
         self.join = join
         self.is_simulation = is_simulation

-
     def run(self):
         """
         Runs the node by setting environment variables and executing the entrypoint command or script.

         """
         os.environ["MS_WORKER_NUM"] = str(self.worker_num)
-        # If simulation level is set, environment variables for dynamic networking will not be set
+        # If simulation level is set, environment variables for dynamic networking will not be set,
+        # and scheduler will not be started.
         if not self.is_simulation:
             os.environ["MS_SCHED_HOST"] = self.sched_host
             os.environ["MS_SCHED_PORT"] = str(self.sched_port)
             os.environ["MS_TOPO_TIMEOUT"] = str(self.timeout)

+
 class _MetaServerNode(_Node):
     """
     Scheduler node for dynamic networking. Inherits from the Node class.

     """
+
     def run(self):
         """
         Runs the MetaServerNode by setting environment variables, setting the MS_ROLE variable to
@@ -68,17 +73,18 @@ class _MetaServerNode(_Node):
         with open(self.output_file, "w") as file_handle:
             return subprocess.Popen(self.args_list, stdout=file_handle, stderr=subprocess.STDOUT)

+
 class _ComputeGraphNode(_Node):
     """
     Worker node for dynamic networking. Inherits from the Node class.
     """
+
     def __init__(self, worker_num, sched_host, sched_port, timeout, node_id, args_list, output_file,
                  tail_worker_log, join, is_simulation):
         super().__init__(worker_num, sched_host, sched_port, timeout, args_list, output_file,
                          tail_worker_log, join, is_simulation)
         self.node_id = node_id

-
     def run(self):
         """
         Runs the ComputeGraphNode by setting environment variables, setting the MS_NODE_ID variable
@@ -127,6 +133,7 @@ class _ProcessManager:
     training

    """
+
    def __init__(self, args):
        """
        Initializes a ProcessManager object.
@@ -198,6 +205,21 @@ class _ProcessManager:
         finally:
             os.umask(origin_mask)

+        self.proc_rank_map = {}
+        self.enable_mindx = False
+        tft_env = os.getenv("MS_ENABLE_TFT", "")
+        if ("TTP:1" in tft_env) or ("UCE:1" in tft_env) or ("ARF:1" in tft_env):
+            try:
+                from taskd.python.framework.agent.ms_mgr.msrun_plugin import MSRunPlugin
+                self.msmgr = MSRunPlugin()
+                self.msmgr.register_callbacks("KILL_WORKER", self.kill_workers)
+                self.msmgr.register_callbacks("START_ALL_WORKER", self.start_all_workers)
+                self.msmgr.register_callbacks("MONITOR", self.monitor_rank_status)
+                self.enable_mindx = True
+                os.environ["MS_ENABLE_RECOVERY"] = str(1)
+            except Exception as e:  # pylint: disable=broad-except
+                logger.warning(f"mindx is not installed, using original mindspore recovery strategy.: {str(e)}")
+
     def run(self):
         """
         Runs the process manager.
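As the hunk above shows, the launcher only loads the mindx MSRunPlugin when the MS_ENABLE_TFT environment variable contains one of the TTP:1/UCE:1/ARF:1 flags, and it falls back to the built-in recovery path when the taskd package is missing. A minimal sketch of that gate, exercising only the substring check shown above (the value used here is illustrative, not the documented format expected by mindx itself):

```python
import os

# Any value containing "TTP:1", "UCE:1" or "ARF:1" enables the plugin lookup;
# the exact value format expected by mindx is not shown in this diff.
os.environ["MS_ENABLE_TFT"] = "TTP:1,UCE:1"

tft_env = os.getenv("MS_ENABLE_TFT", "")
plugin_wanted = ("TTP:1" in tft_env) or ("UCE:1" in tft_env) or ("ARF:1" in tft_env)
assert plugin_wanted  # _ProcessManager would now try to import taskd's MSRunPlugin
```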
@@ -218,11 +240,13 @@ class _ProcessManager:
         else:
             if self.is_master and not self.is_simulation:
                 self.start_scheduler()
-            self.
-
-
-
-            self.
+            if self.enable_mindx:
+                self.msmgr.start()
+            else:
+                self.start_workers()
+        if self.join:
+            logger.warning("Distributed job is spawned. Waiting all processes to exit...")
+            self.join_processes()

     def start_scheduler(self):
         """
@@ -262,17 +286,17 @@ class _ProcessManager:
         # If node_id is generated in '_get_node_id_and_log_path' method, export 'RANK_ID' environment variable.
         # This is for rank_table method's compatibility consideration.
         os.environ["RANK_ID"] = str(node_id)
-
-
+        print(f"Start worker process with rank id:{node_id}, log file:{log_name}. "
+              f"Environment variable [RANK_ID={node_id}] is exported.", flush=True)
         if self.is_simulation and (self.sim_rank_id != -1):
             # Reset RANK_ID env to sim_rank_id if sim_rank_id is set.
             os.environ["RANK_ID"] = str(self.sim_rank_id)
             logger.warning(f"In dryrun case, RANK_ID is assigned to {self.sim_rank_id}.")

-        cpu_num = subprocess.getoutput("cat /proc/cpuinfo|grep processor|wc -l")
-        if not cpu_num.isdigit():
-            raise RuntimeError("Fail to get cpu number from /proc/cpuinfo.")
         if self.bind_core:
+            cpu_num = subprocess.getoutput("cat /proc/cpuinfo|grep processor|wc -l")
+            if not cpu_num.isdigit():
+                raise RuntimeError(f"Got cpu number from '/proc/cpuinfo' is {cpu_num}, failed to bind core.")
             avg = int(cpu_num) // self.local_worker_num
             cpu_start = avg * i
             cpu_end = cpu_start + avg - 1
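The change above moves the /proc/cpuinfo probe inside the bind_core branch and then carves out a contiguous CPU range per local worker. A worked example of that arithmetic in plain Python (96 cores and 8 local workers are illustrative values, not defaults taken from the diff):

```python
# Per-worker CPU range as computed in the hunk above.
cpu_num, local_worker_num, i = 96, 8, 3   # i is the local worker index
avg = cpu_num // local_worker_num         # 12 cores per worker
cpu_start = avg * i                       # 36
cpu_end = cpu_start + avg - 1             # 47 -> worker 3 is pinned to cores 36..47
assert (cpu_start, cpu_end) == (36, 47)
```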
@@ -284,7 +308,7 @@ class _ProcessManager:
             process, tail_process = cgn.run()
             self.cgn_processes.append(process)
             self.tail_cgn_processes.append(tail_process)
-
+            self.proc_rank_map[i] = process

     def join_processes(self):
         """
@@ -292,6 +316,7 @@
         If there's any process does not exit normally, logs will be analyzed
         so that understandable root cause of exception could be returned.
         """
+
         def signal_handler(sig, frame):
             logger.warning("msrun process received SIGNIN (Ctrl+C), terminating all workers.")
             self.kill_all_processes()
@@ -331,7 +356,7 @@
             logger.error(f"Scheduler process {self.msn_process.pid} exit with exception.")

         if has_exception:
-            logger.
+            logger.info("Analyzing exception log...")
             self._analyze_log()
             raise RuntimeError("Distributed job exited with exception. Please check logs in "
                                f"directory: {self.log_dir}.")
@@ -388,6 +413,115 @@
         self.start_scheduler()
         self.start_workers()

+    def kill_all_workers(self):
+        """
+        Kill all running worker processes.
+
+        Args:
+            NA.
+        """
+        for p in self.cgn_processes:
+            if p.poll() is None:
+                p.kill()
+        self.cgn_processes.clear()
+
+        for p in self.tail_cgn_processes:
+            if p is not None:
+                p.kill()
+        self.tail_cgn_processes.clear()
+
+    def kill_single_worker(self, pid):
+        """
+        Kill one worker process with specified pid.
+
+        Args:
+            pid: Worker process' pid.
+        """
+        kill_status = False
+        for i in range(len(self.cgn_processes)):
+            p = self.cgn_processes[i]
+            if p.pid == pid and p.poll() is None:
+                p.kill()
+                del self.cgn_processes[i]
+                tail_p = self.tail_cgn_processes[i]
+                if tail_p is not None:
+                    tail_p.kill()
+                del self.tail_cgn_processes[i]
+                kill_status = True
+                break
+        if not kill_status:
+            logger.warning(f"There's no active worker with pid: {pid}")
+
+    def kill_workers(self, pids):
+        """
+        Kill worker process according to pids. Worker process with pid within pids list will be killed.
+
+        Args:
+            pids(list): a list of worker process pid. When local_ranks pids -1, kill all worker process.
+        """
+        if -1 in pids:
+            self.kill_all_workers()
+        else:
+            for pid in pids:
+                self.kill_single_worker(pid)
+        return 0
+
+    def monitor_rank_status(self, local_ranks):
+        """
+        Monitor the status of workers whose rank is within local_ranks list.
+
+        Args:
+            local_ranks(list): a list of local worker ranks. When local_ranks contains -1,
+                monitor all workers' status.
+        """
+        rank_status = {}
+        if -1 in local_ranks:
+            local_ranks = list(range(self.local_worker_num))
+        for i in local_ranks:
+            single_status = self.monitor_single_rank(i)
+            if single_status:
+                rank_status[i] = single_status
+        return rank_status
+
+    def monitor_single_rank(self, rank_id):
+        """
+        Monitor the status of a single worker with rank_id
+
+        Args:
+            rank_id: worker process's local rank, which is also device_id.
+        """
+        if 0 <= rank_id < self.local_worker_num:
+            global_rank_id = rank_id
+            if self.node_rank >= 0:
+                global_rank_id = self.node_rank * self.local_worker_num + rank_id
+            try:
+                p = self.proc_rank_map[rank_id]
+                p_status = p.poll()
+                if (not psutil.pid_exists(p.pid)) and (p_status != 0):
+                    p_status = 300
+                return {"pid": p.pid, "status": p_status, "global_rank": global_rank_id}
+            except KeyError:
+                logger.info(f"Process rank {rank_id} has not been initialized.")
+                return {"pid": None, "status": 200, "global_rank": global_rank_id}
+        else:
+            logger.warning(f"Invalid rank id!")
+            return {}
+
+    def start_all_workers(self):
+        """
+        Start all worker processes after killing all workers.
+
+        Args:
+            NA.
+        """
+        if self.cgn_processes:
+            self.kill_all_workers()
+        self.start_workers()
+        worker_status = self.monitor_rank_status([-1])
+        for i in range(self.local_worker_num):
+            if worker_status[i]["status"] != None:  # pylint: disable=singleton-comparison
+                return 1
+        return 0

     def _get_node_id_and_log_path(self, index):
         """
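The new monitor methods above return plain dictionaries keyed by local rank. A hedged sketch of how a caller (for example the MONITOR callback registered with the mindx plugin earlier) might read that payload; the `manager` instance and the concrete values are assumptions for illustration:

```python
# Consuming monitor_rank_status([-1]); `manager` is an assumed _ProcessManager instance.
status = manager.monitor_rank_status([-1])   # -1 means: report every local worker
for rank, info in status.items():
    # From the hunk above: "status" is the Popen return code (None while running),
    # 200 when the worker was never started, 300 when its process vanished abnormally.
    print(rank, info["pid"], info["status"], info["global_rank"])
```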
@@ -410,7 +544,6 @@ class _ProcessManager:
         log_name = os.path.join(self.log_dir, formatted_log_name + "_" + str(index) + ".log")
         return node_id, log_name

-
     def _analyze_log(self):
         """
         Analyze exception logs.
@@ -432,7 +565,6 @@ class _ProcessManager:

         os.system(f"grep -rn -E 'ERROR|CRITICAL|Traceback|Error' -C 5 {self.log_dir}")

-
     def format_worker_log_name(self):
         """
         Format worker log files' name.
mindspore/parallel/cluster/run.py

@@ -85,7 +85,7 @@ def get_args():
         "--sim_level",
         default=-1,
         type=int,
-        choices=[0, 1],
+        choices=[0, 1, 2, 3],
         help="specifies simulation level. When this argument is set, msrun only spawns one process "
              "but export RANK_SIZE with value worker_num and RANK_ID with value sim_rank_id."
    )
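Per the help text above, a simulated (dry-run) launch spawns a single process and exports RANK_SIZE and RANK_ID. A small sketch of what that worker sees, assuming it was started via msrun with one of the now-accepted levels 0-3 (the variable names come from the help string, not from additional documentation):

```python
import os

# Inside the single simulated worker process started by `msrun --sim_level=<0-3> ...`:
rank_size = os.environ.get("RANK_SIZE")   # exported with the value of --worker_num
rank_id = os.environ.get("RANK_ID")       # exported with the value of --sim_rank_id
print(f"dry run as rank {rank_id} of {rank_size}")
```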
mindspore/parallel/function/__init__.py (new file)

@@ -0,0 +1,24 @@
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+Parallel function operator
+"""
+
+from mindspore.parallel.function.reshard_func import reshard
+
+__all__ = []
+__all__.extend(reshard_func.__all__)
+__all__.sort()
mindspore/parallel/function/reshard_func.py (new file)

@@ -0,0 +1,258 @@
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Defines parameter operators with functional form."""
+from mindspore import context, ops
+from mindspore import log as logger
+from mindspore.ops import operations as P
+from mindspore.ops._primitive_cache import _get_cache_prim
+from mindspore.common.tensor import Tensor
+from mindspore.communication.management import get_group_size, get_rank
+from mindspore.parallel.shard import Layout, _DistributedTensorInfo
+from mindspore.parallel._auto_parallel_context import _get_all_auto_parallel_context, _recover_auto_parallel_context
+from mindspore.ops.primitive import constexpr
+
+
+REDIST_CELL_CACHE = {}
+COMM_TENSOR_CELL_CACHE = {}
+
+
+@constexpr
+def group_size():
+    """ Return the device number in the Cell's construct method. """
+    return get_group_size()
+
+
+# pylint: disable=W0212
+def reshard(tensor, layout):
+    r"""
+    Converting a tensor from one distributed arrangement to another distributed arrangement.
+    The given layout must be type mindspore.parallel.Layout,
+    can check :class:`mindspore.parallel.Layout` for reference.
+
+    Note:
+        - In the Graph mode, this function can set the sharding propagation strategy of a tensor.
+          For those tensor do not manually be set, their strategies are decided by the sharding
+          strategy propagation algorithm automatically.
+        - In PyNative mode, you can use this method to arrange tensors in a cell (that is, cells
+          that use Cell.shard/F.shard in PyNative mode) that is executed in parallel in graph mode.
+
+    Args:
+        tensor (Tensor): The tensor to be set the sharding strategy.
+        layout (Layout): The layout to shard the tensor precisely, including the device
+                         arrangement (device_matrix) and the alias for the device matrix
+                         (alias_name).
+
+    Returns:
+        Tensor. The mathematically equivalent of the input tensor.
+
+    Raises:
+        TypeError: If the type of input param `tensor` is not mindspore.Tensor.
+        TypeError: If the type of input param `layout` is not mindspore.parallel.Layout.
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+
+            For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start-up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+
+            This example should be run with 8 devices.
+
+        >>> import numpy as np
+        >>> import mindspore as ms
+        >>> from mindspore import ops, nn, Tensor, context, Layout
+        >>> from mindspore.parallel.function import reshard
+        >>> from mindspore.nn.utils import no_init_parameters
+        >>> from mindspore.parallel.auto_parallel import AutoParallel
+        >>> from mindspore.communication import init
+        >>> context.set_context(mode=ms.GRAPH_MODE)
+        >>> init()
+        >>> class Network(nn.Cell):
+        ...     def __init__(self):
+        ...         super().__init__()
+        ...         self.matmul = ops.MatMul()
+        ...         self.relu = ops.ReLU()
+        ...     def construct(self, x, layout):
+        ...         x = self.relu(x)
+        ...         x_reshard = reshard(x, layout)
+        ...         y = Tensor(np.ones(shape=(128, 128)), dtype=ms.float32)
+        ...         x = self.matmul(x_reshard, y)
+        ...         return x
+        >>> layout = Layout((4, 2), ("dp", "mp"))
+        >>> input_layout = layout("dp", "mp")
+        >>> with no_init_parameters():
+        ...     net = Network()
+        >>> parallel_net = AutoParallel(net, parallel_mode='sharding_propagation')
+        >>> tensor = Tensor(np.ones(shape=(128, 128)), dtype=ms.float32)
+        >>> out = parallel_net(tensor, input_layout)
+    """
+    if group_size() == 1:
+        return tensor
+    if not isinstance(tensor, Tensor):
+        raise TypeError(f"Reshard takes in Tensor type as the first input param, but got: {type(tensor)}.")
+    if not isinstance(layout, Layout):
+        raise TypeError(f"Reshard only support type mindspore.parallel.Layout, but got: {type(layout)}.")
+
+    def layout_to_tuple(layout):
+        layout_dict = layout.to_dict()
+        tensor_map = layout_dict["tensor_map"]
+        device_matrix_rev = layout_dict["device_matrix"][::-1]
+        axis_stgy = ()
+        for ind in tensor_map:
+            if ind == -1:
+                axis_stgy += (1,)
+            else:
+                axis_stgy += (device_matrix_rev[ind],)
+        return axis_stgy
+
+    in_strategy = layout_to_tuple(layout)
+    _reshard = _get_cache_prim(P.Reshard)(in_layout=(layout,), out_layout=(layout,), in_strategy=(in_strategy,))
+    return _reshard(tensor)
+
+
+def _redistribute(tensor, dst_dtensor_info):
+    """
+    Redistribute the tensor from the source sharding strategy to the destination sharding strategy.
+
+    Args:
+        tensor (Tensor): The source tensor.
+        dst_dtensor_info (_DistributedTensorInfo): The destination sharding strategy.
+
+    Returns:
+        Tensor, value is same as the source tensor, but the sharding strategy is the destination sharding strategy.
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+
+            For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+
+            This example should be run with 2 devices.
+
+        >>> import numpy as np
+        >>> from mindspore.communication import init
+        >>> from mindspore import Tensor, Layout, _DistributedTensorInfo
+        >>>
+        >>> init()
+        >>> layout = Layout((2, 1), ("dp", "mp"))
+        >>> src_layout = layout("dp", "mp")
+        >>> distributed_info = _DistributedTensorInfo(src_layout)
+        >>> x = Tensor(np.ones([2, 2]).astype(np.float32))
+        >>> out = x.redistribute(distributed_info)
+        >>> print(out)
+        [[1. 1.]]
+    """
+    from mindspore.parallel._cell_wrapper import RedistributionCell, _insert_virtual_pp_dim
+    if not isinstance(dst_dtensor_info, _DistributedTensorInfo):
+        raise TypeError(
+            "dst_dtensor_info should be _DistributedTensorInfo type, but got {}".format(type(dst_dtensor_info)))
+    run_mode = context.get_context("mode")
+    context.set_context(mode=context.GRAPH_MODE)
+    og_auto_parallel_context, pp_config = _get_all_auto_parallel_context()
+    context.reset_auto_parallel_context()
+    tensor_data = tensor
+    all_reduce_data = False
+    # If src_pp_stages is less than or equal to dst_pp_stages, the parameters of each pp stage of src can be
+    # directly swapped to the corresponding card of dst
+    # rank0   01   11        01
+    # rank1   02   12        02
+    #          pp1 ------> pp2
+    # rank2   03   13        11
+    # rank3   04   14        12
+    # if dtensor info is None, return the all 1 strategy as from dtensor info
+    if tensor._dtensor_info is None:
+        all_dev_num = get_group_size()
+        dev_mat = Layout((all_dev_num,), ("replica",))
+        tensor_map = ["None"] * len(tensor.shape)
+        layout = dev_mat(*tensor_map)
+        tensor._dtensor_info = _DistributedTensorInfo(layout)
+    src_layout_info = tensor._dtensor_info.layout.to_dict()
+    dst_layout_info = dst_dtensor_info.layout.to_dict()
+    if len(tensor._dtensor_info.layout.to_dict()["rank_list"]) < len(dst_dtensor_info.layout.to_dict()["rank_list"]):
+        # If src_pp_stages is greater than dst_pp_stages, the weights of the corresponding cards need to
+        # be communicated via AllReduce to swap. Need to communicate src rank0's 01 to src rank2,
+        # so that rank2 holds param0's data. Similarly, communicate rank1's 02 to rank3
+        # rank0   01        01   11
+        # rank1   02        02   12
+        #          pp2 -------> pp1
+        # rank2   11        03   13
+        # rank3   12        04   14
+        from mindspore.parallel._cell_wrapper import CommTensorDataForPP
+        if get_rank() in dst_dtensor_info.layout.to_dict()["rank_list"]:
+            comm_tensor_cache_key = (
+                f"{src_layout_info['device_matrix']}, {src_layout_info['tensor_map']}, {src_layout_info['rank_list']}"
+                f" -> "
+                f"{dst_layout_info['device_matrix']}, {dst_layout_info['tensor_map']}, {dst_layout_info['rank_list']}")
+            global COMM_TENSOR_CELL_CACHE
+            if comm_tensor_cache_key not in COMM_TENSOR_CELL_CACHE:
+                comm_tensor_data_func = CommTensorDataForPP(tensor._dtensor_info, dst_dtensor_info)
+                COMM_TENSOR_CELL_CACHE[comm_tensor_cache_key] = comm_tensor_data_func
+                logger.debug(f"comm_tensor_cache_key is {comm_tensor_cache_key}, not match cache")
+            else:
+                comm_tensor_data_func = COMM_TENSOR_CELL_CACHE[comm_tensor_cache_key]
+                logger.debug(f"comm_tensor_cache_key is {comm_tensor_cache_key}, match cache")
+            if not comm_tensor_data_func._current_rank_has_data:
+                new_tensor_shape = tuple([tensor_data.shape[i] // tensor._dtensor_info.sharding_strategy[i]
+                                          for i in range(len(tensor.shape))])
+                tensor_data = ops.zeros(new_tensor_shape, tensor.dtype)
+                _ = comm_tensor_data_func.comm_data(tensor_data)
+            else:
+                _ = comm_tensor_data_func.comm_data(tensor_data)
+            all_reduce_data = True
+    if src_layout_info['device_matrix'] == dst_layout_info['device_matrix'] and src_layout_info['tensor_map'] == \
+            dst_layout_info['tensor_map']:
+        return tensor_data
+    dataset_strategy = (_insert_virtual_pp_dim(tensor._dtensor_info.layout),)
+    if get_rank() not in tensor._dtensor_info.layout.to_dict()["rank_list"] and not all_reduce_data:
+        dataset_strategy = "full_batch"
+    context.set_auto_parallel_context(dataset_strategy=dataset_strategy,
+                                      parallel_mode="semi_auto_parallel", device_num=get_group_size())
+    global REDIST_CELL_CACHE
+    redist_cache_key = (f"{src_layout_info['device_matrix']}, {src_layout_info['tensor_map']} -> "
+                        f"{dst_layout_info['device_matrix']}, {dst_layout_info['tensor_map']}")
+    if redist_cache_key in REDIST_CELL_CACHE.keys():
+        logger.debug(f"redist_cache_key is {redist_cache_key}, match cache")
+        redist_func = REDIST_CELL_CACHE[redist_cache_key]
+    else:
+        logger.debug(f"redist_cache_key is {redist_cache_key}, not match cache")
+        redist_func = RedistributionCell(tensor._dtensor_info.layout, dst_dtensor_info.layout)
+        REDIST_CELL_CACHE[redist_cache_key] = redist_func
+    redist_func.set_train(True)
+    redist_tensor_data = redist_func(tensor_data)
+    context.reset_auto_parallel_context()
+    _recover_auto_parallel_context(og_auto_parallel_context, pp_config)
+    context.set_context(mode=run_mode)
+    redist_tensor_data._dtensor_info = dst_dtensor_info
+    return redist_tensor_data
+
+
+__all__ = [
+    'reshard'
+]
+
+__all__.sort()
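The inner layout_to_tuple helper in the new file above turns a Layout into the per-axis split counts passed to P.Reshard as in_strategy. A worked rehearsal of that conversion in plain Python, using the Layout((4, 2), ("dp", "mp")) from the reshard docstring; the tensor_map value (1, 0) is an assumption about what layout("dp", "mp") yields for that device matrix, since Layout.to_dict() itself is not shown in this diff:

```python
# Standalone rehearsal of layout_to_tuple for device_matrix=(4, 2), tensor_map=(1, 0).
device_matrix = (4, 2)
tensor_map = (1, 0)                      # dim 0 -> "dp" axis, dim 1 -> "mp" axis; -1 would mean "not sharded"
device_matrix_rev = device_matrix[::-1]  # (2, 4): tensor_map indexes the device matrix from the right
in_strategy = tuple(1 if ind == -1 else device_matrix_rev[ind] for ind in tensor_map)
assert in_strategy == (4, 2)             # split 4 ways along dim 0 and 2 ways along dim 1
```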
mindspore/parallel/nn/__init__.py (new file)

@@ -0,0 +1,25 @@
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Interfaces for parallel-related functionality
+"""
+from __future__ import absolute_import
+
+from mindspore.parallel.nn.parallel_grad_reducer import PipelineGradReducer
+from mindspore.parallel.nn.parallel_cell_wrapper import PipelineCell, Pipeline, MicroBatchInterleaved, GradAccumulation
+
+__all__ = []
+__all__.extend(parallel_grad_reducer.__all__)
+__all__.extend(parallel_cell_wrapper.__all__)