mindspore 2.5.0__cp39-cp39-win_amd64.whl → 2.6.0rc1__cp39-cp39-win_amd64.whl
- mindspore/.commit_id +1 -1
- mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
- mindspore/Newtonsoft.Json.dll +0 -0
- mindspore/__init__.py +6 -4
- mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
- mindspore/_check_jit_forbidden_api.py +3 -0
- mindspore/_checkparam.py +3 -33
- mindspore/_deprecated/__init__.py +17 -0
- mindspore/_deprecated/jit.py +198 -0
- mindspore/_extends/builtin_operations.py +1 -1
- mindspore/_extends/parse/__init__.py +6 -7
- mindspore/_extends/parse/compile_config.py +19 -0
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +22 -3
- mindspore/_extends/parse/jit_fallback_modules/__init__.py +0 -0
- mindspore/_extends/parse/jit_fallback_modules/check_utils.py +123 -0
- mindspore/_extends/parse/jit_fallback_modules/third_party_modules.py +50 -0
- mindspore/_extends/parse/parser.py +24 -193
- mindspore/_extends/parse/resources.py +1 -5
- mindspore/_extends/parse/standard_method.py +97 -74
- mindspore/_extends/pijit/__init__.py +2 -2
- mindspore/_extends/pijit/pijit_func_white_list.py +16 -11
- mindspore/_extends/pijit/tensor_func_list.py +27 -0
- mindspore/_extends/utils.py +1 -1
- mindspore/amp.py +4 -4
- mindspore/atlprov.dll +0 -0
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/__init__.py +2 -2
- mindspore/boost/base.py +3 -7
- mindspore/boost/boost_cell_wrapper.py +2 -2
- mindspore/c1.dll +0 -0
- mindspore/c1xx.dll +0 -0
- mindspore/c2.dll +0 -0
- mindspore/common/__init__.py +4 -3
- mindspore/common/_grad_function.py +56 -0
- mindspore/common/_pijit_context.py +14 -5
- mindspore/common/_register_for_tensor.py +1 -1
- mindspore/common/_stub_tensor.py +5 -10
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +1915 -3287
- mindspore/common/api.py +341 -354
- mindspore/common/auto_dynamic_shape.py +41 -44
- mindspore/common/dtype.py +5 -2
- mindspore/common/dump.py +7 -5
- mindspore/common/file_system.py +3 -0
- mindspore/common/hook_handle.py +5 -3
- mindspore/common/initializer.py +10 -6
- mindspore/common/jit_begin_end.py +94 -0
- mindspore/common/jit_config.py +6 -1
- mindspore/common/jit_context.py +76 -0
- mindspore/common/jit_trace.py +378 -0
- mindspore/common/lazy_inline.py +2 -2
- mindspore/common/mutable.py +5 -4
- mindspore/common/parameter.py +106 -39
- mindspore/common/seed.py +2 -2
- mindspore/common/sparse_tensor.py +23 -17
- mindspore/common/tensor.py +297 -714
- mindspore/communication/__init__.py +7 -5
- mindspore/communication/_comm_helper.py +47 -2
- mindspore/communication/comm_func.py +70 -53
- mindspore/communication/management.py +83 -17
- mindspore/context.py +214 -560
- mindspore/dataset/__init__.py +44 -20
- mindspore/dataset/audio/__init__.py +2 -8
- mindspore/dataset/audio/transforms.py +3 -17
- mindspore/dataset/core/config.py +3 -3
- mindspore/dataset/engine/cache_client.py +1 -1
- mindspore/dataset/engine/datasets.py +102 -120
- mindspore/dataset/engine/datasets_audio.py +22 -22
- mindspore/dataset/engine/datasets_standard_format.py +43 -24
- mindspore/dataset/engine/datasets_text.py +78 -85
- mindspore/dataset/engine/datasets_user_defined.py +108 -76
- mindspore/dataset/engine/datasets_vision.py +111 -108
- mindspore/dataset/engine/iterators.py +5 -3
- mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +1 -1
- mindspore/dataset/engine/samplers.py +279 -57
- mindspore/dataset/engine/serializer_deserializer.py +2 -1
- mindspore/dataset/engine/validators.py +10 -0
- mindspore/dataset/text/__init__.py +7 -6
- mindspore/dataset/text/transforms.py +6 -5
- mindspore/dataset/text/utils.py +3 -3
- mindspore/dataset/transforms/__init__.py +0 -9
- mindspore/dataset/transforms/transforms.py +3 -3
- mindspore/dataset/utils/browse_dataset.py +1 -1
- mindspore/dataset/vision/__init__.py +2 -9
- mindspore/dataset/vision/transforms.py +202 -158
- mindspore/dataset/vision/utils.py +7 -5
- mindspore/device_context/ascend/op_debug.py +60 -1
- mindspore/device_context/ascend/op_tuning.py +0 -4
- mindspore/device_manager.py +39 -3
- mindspore/dnnl.dll +0 -0
- mindspore/dpcmi.dll +0 -0
- mindspore/experimental/es/embedding_service.py +35 -27
- mindspore/experimental/map_parameter.py +4 -4
- mindspore/experimental/optim/adadelta.py +22 -26
- mindspore/experimental/optim/adagrad.py +4 -4
- mindspore/experimental/optim/adam.py +4 -0
- mindspore/experimental/optim/adamax.py +4 -4
- mindspore/experimental/optim/adamw.py +4 -0
- mindspore/experimental/optim/asgd.py +1 -1
- mindspore/experimental/optim/lr_scheduler.py +40 -22
- mindspore/experimental/optim/radam.py +5 -5
- mindspore/experimental/optim/rprop.py +1 -1
- mindspore/experimental/optim/sgd.py +1 -1
- mindspore/hal/contiguous_tensors_handle.py +6 -10
- mindspore/hal/device.py +55 -81
- mindspore/hal/event.py +38 -55
- mindspore/hal/memory.py +93 -144
- mindspore/hal/stream.py +81 -125
- mindspore/include/dataset/constants.h +7 -4
- mindspore/include/dataset/execute.h +2 -2
- mindspore/jpeg62.dll +0 -0
- mindspore/log.py +40 -2
- mindspore/mindrecord/__init__.py +20 -7
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/{mindspore_backend.dll → mindspore_ops_host.dll} +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mint/__init__.py +131 -700
- mindspore/mint/distributed/__init__.py +5 -1
- mindspore/mint/distributed/distributed.py +194 -109
- mindspore/mint/linalg/__init__.py +2 -0
- mindspore/mint/nn/__init__.py +280 -18
- mindspore/mint/nn/functional.py +282 -64
- mindspore/mint/nn/layer/__init__.py +4 -0
- mindspore/mint/nn/layer/_functions.py +7 -3
- mindspore/mint/nn/layer/activation.py +120 -13
- mindspore/mint/nn/layer/conv.py +218 -24
- mindspore/mint/nn/layer/normalization.py +15 -16
- mindspore/mint/nn/layer/padding.py +1 -1
- mindspore/mint/nn/layer/pooling.py +66 -1
- mindspore/mint/optim/__init__.py +2 -1
- mindspore/mint/optim/sgd.py +171 -0
- mindspore/msobj140.dll +0 -0
- mindspore/mspdb140.dll +0 -0
- mindspore/mspdbcore.dll +0 -0
- mindspore/mspdbst.dll +0 -0
- mindspore/mspft140.dll +0 -0
- mindspore/msvcdis140.dll +0 -0
- mindspore/msvcp140_1.dll +0 -0
- mindspore/msvcp140_2.dll +0 -0
- mindspore/msvcp140_atomic_wait.dll +0 -0
- mindspore/msvcp140_codecvt_ids.dll +0 -0
- mindspore/nn/__init__.py +4 -1
- mindspore/nn/cell.py +1250 -176
- mindspore/nn/layer/activation.py +23 -21
- mindspore/nn/layer/basic.py +22 -16
- mindspore/nn/layer/container.py +1 -1
- mindspore/nn/layer/conv.py +22 -17
- mindspore/nn/layer/embedding.py +9 -8
- mindspore/nn/layer/normalization.py +48 -42
- mindspore/nn/layer/pooling.py +75 -31
- mindspore/nn/layer/transformer.py +11 -10
- mindspore/nn/learning_rate_schedule.py +4 -2
- mindspore/nn/loss/loss.py +27 -19
- mindspore/nn/optim/ada_grad.py +6 -5
- mindspore/nn/optim/adadelta.py +9 -7
- mindspore/nn/optim/adafactor.py +1 -1
- mindspore/nn/optim/adam.py +16 -12
- mindspore/nn/optim/adamax.py +8 -7
- mindspore/nn/optim/adasum.py +5 -5
- mindspore/nn/optim/asgd.py +1 -1
- mindspore/nn/optim/ftrl.py +11 -9
- mindspore/nn/optim/lamb.py +1 -1
- mindspore/nn/optim/lazyadam.py +12 -10
- mindspore/nn/optim/momentum.py +7 -6
- mindspore/nn/optim/optimizer.py +2 -2
- mindspore/nn/optim/proximal_ada_grad.py +12 -10
- mindspore/nn/optim/rmsprop.py +13 -12
- mindspore/nn/optim/rprop.py +9 -7
- mindspore/nn/optim/sgd.py +9 -6
- mindspore/nn/optim/tft_wrapper.py +5 -2
- mindspore/nn/probability/bijector/bijector.py +17 -11
- mindspore/nn/probability/bijector/gumbel_cdf.py +5 -5
- mindspore/nn/probability/bijector/invert.py +2 -2
- mindspore/nn/probability/bijector/scalar_affine.py +3 -3
- mindspore/nn/probability/bijector/softplus.py +3 -2
- mindspore/nn/probability/distribution/beta.py +3 -3
- mindspore/nn/probability/distribution/categorical.py +1 -1
- mindspore/nn/probability/distribution/cauchy.py +4 -2
- mindspore/nn/probability/distribution/exponential.py +6 -7
- mindspore/nn/probability/distribution/gamma.py +2 -2
- mindspore/nn/probability/distribution/gumbel.py +2 -2
- mindspore/nn/probability/distribution/half_normal.py +5 -3
- mindspore/nn/probability/distribution/logistic.py +5 -3
- mindspore/nn/probability/distribution/poisson.py +1 -1
- mindspore/nn/probability/distribution/uniform.py +5 -3
- mindspore/nn/reinforcement/_tensors_queue.py +1 -1
- mindspore/nn/reinforcement/tensor_array.py +1 -1
- mindspore/nn/wrap/__init__.py +6 -6
- mindspore/nn/wrap/cell_wrapper.py +178 -117
- mindspore/nn/wrap/grad_reducer.py +45 -36
- mindspore/nn/wrap/loss_scale.py +3 -3
- mindspore/numpy/array_creations.py +3 -3
- mindspore/numpy/array_ops.py +1 -1
- mindspore/numpy/math_ops.py +4 -4
- mindspore/numpy/utils.py +1 -2
- mindspore/numpy/utils_const.py +1 -2
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/__init__.py +3 -2
- mindspore/ops/_grad_experimental/grad_comm_ops.py +18 -3
- mindspore/ops/_grad_experimental/grad_debug_ops.py +8 -1
- mindspore/ops/_grad_experimental/taylor_rule.py +29 -0
- mindspore/ops/_register_for_op.py +0 -11
- mindspore/{ops_generate → ops/_utils}/arg_dtype_cast.py +123 -4
- mindspore/{ops_generate → ops/_utils}/arg_handler.py +3 -4
- mindspore/ops/_vmap/vmap_array_ops.py +7 -6
- mindspore/ops/_vmap/vmap_grad_nn_ops.py +2 -1
- mindspore/ops/_vmap/vmap_math_ops.py +4 -7
- mindspore/ops/_vmap/vmap_nn_ops.py +9 -8
- mindspore/ops/auto_generate/__init__.py +4 -3
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +102 -49
- mindspore/ops/auto_generate/gen_extend_func.py +281 -135
- mindspore/ops/auto_generate/gen_ops_def.py +2574 -2326
- mindspore/ops/auto_generate/gen_ops_prim.py +8566 -2755
- mindspore/ops/auto_generate/pyboost_inner_prim.py +106 -76
- mindspore/ops/composite/__init__.py +2 -1
- mindspore/ops/composite/base.py +19 -24
- mindspore/ops/composite/math_ops.py +6 -16
- mindspore/ops/composite/multitype_ops/__init__.py +5 -2
- mindspore/ops/composite/multitype_ops/_compile_utils.py +2 -3
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -2
- mindspore/ops/composite/multitype_ops/add_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/bitwise_and_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/bitwise_or_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/bitwise_xor_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/div_impl.py +6 -4
- mindspore/ops/composite/multitype_ops/equal_impl.py +4 -3
- mindspore/ops/composite/multitype_ops/floordiv_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/getitem_impl.py +3 -2
- mindspore/ops/composite/multitype_ops/greater_equal_impl.py +4 -3
- mindspore/ops/composite/multitype_ops/greater_impl.py +4 -3
- mindspore/ops/composite/multitype_ops/in_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/invert_impl.py +50 -0
- mindspore/ops/composite/multitype_ops/left_shift_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/less_equal_impl.py +4 -3
- mindspore/ops/composite/multitype_ops/less_impl.py +4 -3
- mindspore/ops/composite/multitype_ops/logic_not_impl.py +3 -2
- mindspore/ops/composite/multitype_ops/logical_and_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/logical_or_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/mod_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/mul_impl.py +3 -2
- mindspore/ops/composite/multitype_ops/negative_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/not_equal_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/ones_like_impl.py +18 -0
- mindspore/ops/composite/multitype_ops/pow_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/right_shift_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/setitem_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/sub_impl.py +2 -1
- mindspore/ops/function/__init__.py +28 -2
- mindspore/ops/function/_add_attr_func.py +58 -0
- mindspore/ops/function/array_func.py +1629 -2345
- mindspore/ops/function/clip_func.py +38 -45
- mindspore/ops/function/debug_func.py +36 -44
- mindspore/ops/function/grad/__init__.py +1 -0
- mindspore/ops/function/grad/grad_func.py +104 -71
- mindspore/ops/function/image_func.py +1 -1
- mindspore/ops/function/linalg_func.py +46 -78
- mindspore/ops/function/math_func.py +3035 -3705
- mindspore/ops/function/nn_func.py +676 -241
- mindspore/ops/function/other_func.py +159 -1
- mindspore/ops/function/parameter_func.py +17 -30
- mindspore/ops/function/random_func.py +204 -361
- mindspore/ops/function/reshard_func.py +4 -70
- mindspore/ops/function/sparse_func.py +3 -3
- mindspore/ops/function/sparse_unary_func.py +5 -5
- mindspore/ops/function/spectral_func.py +25 -58
- mindspore/ops/function/vmap_func.py +24 -17
- mindspore/ops/functional.py +6 -4
- mindspore/ops/functional_overload.py +547 -4
- mindspore/ops/op_info_register.py +32 -244
- mindspore/ops/operations/__init__.py +10 -5
- mindspore/ops/operations/_custom_ops_utils.py +247 -0
- mindspore/ops/operations/_grad_ops.py +1 -10
- mindspore/ops/operations/_inner_ops.py +5 -76
- mindspore/ops/operations/_ms_kernel.py +4 -10
- mindspore/ops/operations/_rl_inner_ops.py +1 -1
- mindspore/ops/operations/_scalar_ops.py +3 -2
- mindspore/ops/operations/_sequence_ops.py +1 -1
- mindspore/ops/operations/_tensor_array.py +1 -1
- mindspore/ops/operations/array_ops.py +37 -22
- mindspore/ops/operations/comm_ops.py +150 -107
- mindspore/ops/operations/custom_ops.py +221 -23
- mindspore/ops/operations/debug_ops.py +115 -16
- mindspore/ops/operations/inner_ops.py +1 -1
- mindspore/ops/operations/linalg_ops.py +1 -58
- mindspore/ops/operations/manually_defined/_inner.py +1 -1
- mindspore/ops/operations/manually_defined/ops_def.py +746 -79
- mindspore/ops/operations/math_ops.py +21 -18
- mindspore/ops/operations/nn_ops.py +65 -191
- mindspore/ops/operations/other_ops.py +62 -9
- mindspore/ops/operations/random_ops.py +13 -7
- mindspore/ops/operations/reshard_ops.py +1 -1
- mindspore/ops/operations/sparse_ops.py +2 -2
- mindspore/ops/primitive.py +43 -32
- mindspore/ops/tensor_method.py +232 -13
- mindspore/ops_generate/__init__.py +0 -5
- mindspore/ops_generate/aclnn/__init__.py +0 -0
- mindspore/ops_generate/{aclnn_kernel_register_auto_cc_generator.py → aclnn/aclnn_kernel_register_auto_cc_generator.py} +43 -18
- mindspore/ops_generate/{gen_aclnn_implement.py → aclnn/gen_aclnn_implement.py} +49 -51
- mindspore/ops_generate/api/__init__.py +0 -0
- mindspore/ops_generate/{add_tensor_docs_generator.py → api/add_tensor_docs_generator.py} +9 -7
- mindspore/ops_generate/{cpp_create_prim_instance_helper_generator.py → api/cpp_create_prim_instance_helper_generator.py} +6 -9
- mindspore/ops_generate/{functional_map_cpp_generator.py → api/functional_map_cpp_generator.py} +25 -12
- mindspore/ops_generate/{functional_overload_py_generator.py → api/functional_overload_py_generator.py} +8 -6
- mindspore/ops_generate/{functions_cc_generator.py → api/functions_cc_generator.py} +14 -10
- mindspore/ops_generate/api/gen_api.py +103 -0
- mindspore/ops_generate/{op_api_proto.py → api/op_api_proto.py} +98 -69
- mindspore/ops_generate/{tensor_func_reg_cpp_generator.py → api/tensor_func_reg_cpp_generator.py} +82 -43
- mindspore/ops_generate/common/__init__.py +0 -0
- mindspore/ops_generate/common/gen_constants.py +91 -0
- mindspore/ops_generate/{gen_utils.py → common/gen_utils.py} +72 -19
- mindspore/ops_generate/{op_proto.py → common/op_proto.py} +64 -1
- mindspore/ops_generate/{template.py → common/template.py} +96 -84
- mindspore/ops_generate/gen_ops.py +23 -325
- mindspore/ops_generate/op_def/__init__.py +0 -0
- mindspore/ops_generate/op_def/gen_op_def.py +90 -0
- mindspore/ops_generate/{lite_ops_cpp_generator.py → op_def/lite_ops_cpp_generator.py} +47 -11
- mindspore/ops_generate/{ops_def_cc_generator.py → op_def/ops_def_cc_generator.py} +18 -7
- mindspore/ops_generate/{ops_def_h_generator.py → op_def/ops_def_h_generator.py} +5 -5
- mindspore/ops_generate/{ops_name_h_generator.py → op_def/ops_name_h_generator.py} +30 -15
- mindspore/ops_generate/op_def/ops_primitive_h_generator.py +125 -0
- mindspore/ops_generate/op_def_py/__init__.py +0 -0
- mindspore/ops_generate/op_def_py/gen_op_def_py.py +47 -0
- mindspore/ops_generate/{op_def_py_generator.py → op_def_py/op_def_py_generator.py} +6 -5
- mindspore/ops_generate/{op_prim_py_generator.py → op_def_py/op_prim_py_generator.py} +24 -15
- mindspore/ops_generate/pyboost/__init__.py +0 -0
- mindspore/ops_generate/{auto_grad_impl_cc_generator.py → pyboost/auto_grad_impl_cc_generator.py} +11 -7
- mindspore/ops_generate/{auto_grad_reg_cc_generator.py → pyboost/auto_grad_reg_cc_generator.py} +7 -7
- mindspore/ops_generate/{gen_pyboost_func.py → pyboost/gen_pyboost_func.py} +40 -16
- mindspore/ops_generate/{op_template_parser.py → pyboost/op_template_parser.py} +105 -24
- mindspore/ops_generate/{pyboost_functions_cpp_generator.py → pyboost/pyboost_functions_cpp_generator.py} +55 -18
- mindspore/ops_generate/{pyboost_functions_h_generator.py → pyboost/pyboost_functions_h_generator.py} +42 -10
- mindspore/ops_generate/{pyboost_functions_py_generator.py → pyboost/pyboost_functions_py_generator.py} +6 -6
- mindspore/ops_generate/{pyboost_grad_function_cpp_generator.py → pyboost/pyboost_grad_function_cpp_generator.py} +11 -10
- mindspore/ops_generate/{pyboost_inner_prim_generator.py → pyboost/pyboost_inner_prim_generator.py} +8 -7
- mindspore/ops_generate/{pyboost_native_grad_functions_generator.py → pyboost/pyboost_native_grad_functions_generator.py} +14 -10
- mindspore/ops_generate/{pyboost_op_cpp_code_generator.py → pyboost/pyboost_op_cpp_code_generator.py} +140 -53
- mindspore/ops_generate/{pyboost_overload_functions_cpp_generator.py → pyboost/pyboost_overload_functions_cpp_generator.py} +28 -15
- mindspore/ops_generate/{pyboost_utils.py → pyboost/pyboost_utils.py} +88 -4
- mindspore/ops_generate/resources/__init__.py +0 -0
- mindspore/ops_generate/resources/resource_list.py +30 -0
- mindspore/ops_generate/resources/resource_loader.py +36 -0
- mindspore/ops_generate/resources/resource_manager.py +64 -0
- mindspore/ops_generate/resources/yaml_loader.py +88 -0
- mindspore/ops_generate/tensor_py_cc_generator.py +122 -0
- mindspore/parallel/__init__.py +6 -2
- mindspore/parallel/_auto_parallel_context.py +133 -6
- mindspore/parallel/_cell_wrapper.py +130 -15
- mindspore/parallel/_parallel_serialization.py +95 -4
- mindspore/parallel/_ps_context.py +1 -1
- mindspore/parallel/_recovery_context.py +7 -2
- mindspore/parallel/_tensor.py +142 -18
- mindspore/parallel/_utils.py +198 -25
- mindspore/parallel/algo_parameter_config.py +3 -3
- mindspore/parallel/auto_parallel.py +732 -0
- mindspore/parallel/checkpoint_convert.py +159 -0
- mindspore/parallel/checkpoint_transform.py +656 -37
- mindspore/parallel/cluster/process_entity/_api.py +151 -19
- mindspore/parallel/cluster/run.py +1 -1
- mindspore/parallel/function/__init__.py +24 -0
- mindspore/parallel/function/reshard_func.py +259 -0
- mindspore/parallel/nn/__init__.py +25 -0
- mindspore/parallel/nn/parallel_cell_wrapper.py +263 -0
- mindspore/parallel/nn/parallel_grad_reducer.py +169 -0
- mindspore/parallel/parameter_broadcast.py +24 -13
- mindspore/parallel/shard.py +137 -61
- mindspore/parallel/transform_safetensors.py +287 -95
- mindspore/pgodb140.dll +0 -0
- mindspore/pgort140.dll +0 -0
- mindspore/profiler/__init__.py +9 -5
- mindspore/profiler/analysis/parser/ascend_cann_parser.py +6 -2
- mindspore/profiler/analysis/parser/ms_framework_parser.py +4 -4
- mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +7 -4
- mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +22 -0
- mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +241 -86
- mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +41 -2
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +33 -35
- mindspore/profiler/analysis/viewer/ascend_memory_viewer.py +7 -0
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +8 -3
- mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +141 -30
- mindspore/profiler/analysis/viewer/ms_dataset_viewer.py +5 -6
- mindspore/profiler/common/ascend_msprof_exporter.py +5 -4
- mindspore/profiler/common/constant.py +12 -0
- mindspore/profiler/common/msprof_cmd_tool.py +42 -23
- mindspore/profiler/common/path_manager.py +24 -0
- mindspore/profiler/common/profiler_context.py +26 -2
- mindspore/profiler/common/profiler_meta_data.py +74 -0
- mindspore/profiler/common/profiler_parameters.py +59 -18
- mindspore/profiler/common/profiler_path_manager.py +66 -7
- mindspore/profiler/dynamic_profiler.py +112 -79
- mindspore/profiler/envprofiler.py +26 -1
- mindspore/profiler/experimental_config.py +197 -0
- mindspore/profiler/mstx.py +57 -14
- mindspore/profiler/platform/npu_profiler.py +33 -7
- mindspore/profiler/profiler.py +541 -45
- mindspore/profiler/profiler_action_controller.py +1 -1
- mindspore/profiler/profiler_interface.py +4 -0
- mindspore/profiler/schedule.py +57 -22
- mindspore/rewrite/api/node.py +15 -13
- mindspore/rewrite/api/symbol_tree.py +1 -1
- mindspore/run_check/_check_version.py +25 -14
- mindspore/run_check/run_check.py +1 -1
- mindspore/runtime/__init__.py +2 -2
- mindspore/runtime/executor.py +40 -11
- mindspore/runtime/memory.py +25 -8
- mindspore/safeguard/rewrite_obfuscation.py +12 -9
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tbbmalloc.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/__init__.py +8 -8
- mindspore/train/_utils.py +35 -7
- mindspore/train/amp.py +1 -1
- mindspore/train/callback/__init__.py +2 -2
- mindspore/train/callback/_callback.py +2 -16
- mindspore/train/callback/_checkpoint.py +24 -40
- mindspore/train/callback/_cluster_monitor.py +14 -18
- mindspore/train/callback/_flops_collector.py +2 -3
- mindspore/train/callback/_history.py +7 -4
- mindspore/train/callback/_lambda_callback.py +2 -2
- mindspore/train/callback/_landscape.py +0 -3
- mindspore/train/callback/_loss_monitor.py +2 -1
- mindspore/train/callback/_on_request_exit.py +6 -5
- mindspore/train/callback/_reduce_lr_on_plateau.py +11 -6
- mindspore/train/callback/_summary_collector.py +8 -13
- mindspore/train/callback/_time_monitor.py +2 -1
- mindspore/train/callback/{_tft_register.py → _train_fault_tolerance.py} +179 -103
- mindspore/train/data_sink.py +25 -2
- mindspore/train/dataset_helper.py +4 -5
- mindspore/train/loss_scale_manager.py +8 -7
- mindspore/train/metrics/accuracy.py +3 -3
- mindspore/train/metrics/confusion_matrix.py +9 -9
- mindspore/train/metrics/error.py +3 -3
- mindspore/train/metrics/hausdorff_distance.py +4 -4
- mindspore/train/metrics/mean_surface_distance.py +3 -3
- mindspore/train/metrics/metric.py +0 -12
- mindspore/train/metrics/occlusion_sensitivity.py +4 -2
- mindspore/train/metrics/precision.py +8 -6
- mindspore/train/metrics/recall.py +9 -9
- mindspore/train/metrics/root_mean_square_surface_distance.py +2 -2
- mindspore/train/mind_ir_pb2.py +19 -12
- mindspore/train/model.py +176 -103
- mindspore/train/serialization.py +246 -988
- mindspore/train/summary/_summary_adapter.py +2 -2
- mindspore/train/summary/summary_record.py +1 -1
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +3 -2
- mindspore/utils/dryrun.py +4 -2
- mindspore/utils/hooks.py +81 -0
- mindspore/utils/utils.py +138 -4
- mindspore/vcmeta.dll +0 -0
- mindspore/vcruntime140.dll +0 -0
- mindspore/vcruntime140_1.dll +0 -0
- mindspore/version.py +1 -1
- {mindspore-2.5.0.dist-info → mindspore-2.6.0rc1.dist-info}/METADATA +2 -1
- {mindspore-2.5.0.dist-info → mindspore-2.6.0rc1.dist-info}/RECORD +483 -438
- mindspore/_install_custom.py +0 -43
- mindspore/common/_register_for_adapter.py +0 -74
- mindspore/ops/auto_generate/gen_arg_dtype_cast.py +0 -252
- mindspore/ops/auto_generate/gen_arg_handler.py +0 -136
- mindspore/ops/operations/_opaque_predicate_registry.py +0 -41
- mindspore/ops_generate/gen_constants.py +0 -190
- mindspore/ops_generate/gen_ops_inner_prim.py +0 -131
- mindspore/ops_generate/ops_primitive_h_generator.py +0 -81
- /mindspore/ops_generate/{base_generator.py → common/base_generator.py} +0 -0
- {mindspore-2.5.0.dist-info → mindspore-2.6.0rc1.dist-info}/WHEEL +0 -0
- {mindspore-2.5.0.dist-info → mindspore-2.6.0rc1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.5.0.dist-info → mindspore-2.6.0rc1.dist-info}/top_level.txt +0 -0
mindspore/train/_utils.py
CHANGED

@@ -16,6 +16,7 @@
 from __future__ import absolute_import
 
 import os
+import sys
 import json
 from collections.abc import Iterable
 
@@ -23,7 +24,7 @@ import time
 import numpy as np
 
 from mindspore.common.tensor import Tensor
-from mindspore._c_expression import Tensor as Tensor_
+from mindspore._c_expression import TensorPy as Tensor_
 from mindspore._c_expression import MSContext, ms_ctx_param
 from mindspore.common.dtype import dtype_to_nptype, pytype_to_dtype
 from mindspore.common import dtype as mstype
@@ -31,7 +32,7 @@ from mindspore import context
 from mindspore import log as logger
 from mindspore import _checkparam as Validator
 from mindspore.common.api import _cell_graph_executor
-from mindspore.communication import get_group_size
+from mindspore.communication.management import get_rank, get_group_size
 from mindspore.train.mind_ir_pb2 import ModelProto as mindir_model
 from mindspore.train.checkpoint_pb2 import Checkpoint
 from mindspore.train.node_strategy_pb2 import ParallelStrategyMap as ckpt_strategy
@@ -64,6 +65,7 @@ def _get_types_and_shapes(dataset):
     dataset_shapes = dataset.output_shapes()
     return dataset_types, dataset_shapes
 
+
 def enable_data_broadcast():
     """Get status to indicate if enable dataset broadcast."""
     return MSContext.get_instance().get_param(ms_ctx_param.dataset_broadcast_opt_level) > 0
@@ -375,20 +377,40 @@ def _get_parameter_redundancy_without_opt_shard(parameter_layout, param_redundan
         param_redundancy_dict[key] = tuple(redundancy_list)
 
 
-def get_parameter_redundancy(layout_obj, initial_rank=0):
+def _get_initial_rank(parameter_layout):
+    """Get the initial rank of pp."""
+    for k, _ in parameter_layout.items():
+        dev_matrix = parameter_layout[k][0]
+        break
+    dev_num = 1
+    if dev_matrix:
+        for i in dev_matrix:
+            dev_num *= i
+    rank_id = get_rank()
+    initial_rank = (rank_id // dev_num) * dev_num
+    return initial_rank
+
+
+def _get_pp_size_from_redundancy_map(param_redundancy):
+    """Get pp size from redundancy map."""
+    for _, v in param_redundancy.items():
+        return len(v) * len(v[0])
+
+
+def get_parameter_redundancy(layout_obj, initial_rank=None):
     """
     Get parameter redundancy map.
 
     Args:
         layout_obj (Union[str, layout): File name of `strategy.ckpt` or net.parameter_layout_dict.
-        initial_rank (int): Start rank id for each pipeline. Default: ``0``.
+        initial_rank (int): Start rank id for each pipeline. Default: ``None``.
 
     Returns:
         Dict, dict of parameter redundancy info.
 
     Examples:
         >>> from mindspore.train.utils import get_parameter_redundancy
-        >>> param_redundancy_dict = get_parameter_redundancy("/path/to/strategy.ckpt")
+        >>> param_redundancy_dict = get_parameter_redundancy("/path/to/strategy.ckpt", initial_rank=0)
         {'param1': ((0, 1, 2, 3, 4, 5, 6, 7),),
          'param2': ((0, 4, 8, 12), (1, 5, 9, 13), (2, 6, 10, 14), (3, 7, 11, 15)),
          'param3': ((0, 4, 8, 12), (1, 5, 9, 13), (2, 6, 10, 14), (3, 7, 11, 15)),
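The two helpers added above replace auto-parallel-context lookups with arithmetic on the layout itself: the product of one parameter's device matrix gives the number of ranks per pipeline stage, and one redundancy-map entry gives the same figure as group count times group length. A rough standalone sketch of that arithmetic, with a made-up device matrix and rank standing in for the real `parameter_layout` entry and `get_rank()` call:

    # Standalone sketch of _get_initial_rank / _get_pp_size_from_redundancy_map.
    # dev_matrix and rank_id are hypothetical stand-ins for layout data and get_rank().
    dev_matrix = [2, 2, 4]                  # device matrix of one parameter
    dev_num = 1
    for i in dev_matrix:
        dev_num *= i                        # 16 ranks per pipeline stage
    rank_id = 21                            # hypothetical global rank
    initial_rank = (rank_id // dev_num) * dev_num
    print(initial_rank)                     # 16: first rank of this rank's stage

    redundancy = {'param2': ((0, 4, 8, 12), (1, 5, 9, 13), (2, 6, 10, 14), (3, 7, 11, 15))}
    v = next(iter(redundancy.values()))
    print(len(v) * len(v[0]))               # 4 * 4 = 16: same stage width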
@@ -405,7 +427,8 @@ def get_parameter_redundancy(layout_obj, initial_rank=0):
         from mindspore.communication.management import get_process_group_ranks
         groups_ranks = (tuple(get_process_group_ranks()),)
         param_redundancy_dict = {param.name: groups_ranks for _, param in layout_obj.parameters_and_names()}
-        return param_redundancy_dict
+        sorted_param_redundancy_dict = {key: param_redundancy_dict[key] for key in sorted(param_redundancy_dict.keys())}
+        return sorted_param_redundancy_dict
     else:
         parameter_layout = {}
         for k, v in layout_obj.items():
@@ -413,6 +436,9 @@ def get_parameter_redundancy(layout_obj, initial_rank=0):
 
     param_redundancy_dict = {}
 
+    if initial_rank is None:
+        initial_rank = _get_initial_rank(parameter_layout)
+
     _get_parameter_redundancy_without_opt_shard(parameter_layout, param_redundancy_dict, initial_rank)
 
     if isinstance(layout_obj, str):
@@ -420,7 +446,8 @@ def get_parameter_redundancy(layout_obj, initial_rank=0):
     else:
         _get_layout_opt_shard(layout_obj, param_redundancy_dict)
 
-    return param_redundancy_dict
+    sorted_param_redundancy_dict = {key: param_redundancy_dict[key] for key in sorted(param_redundancy_dict.keys())}
+    return sorted_param_redundancy_dict
 
 
 def _collect_settings_by_rank(redundancy_map):
@@ -539,6 +566,7 @@ def _progress_bar(iterable, total=None):
         elapsed_time_str = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
         remaining_time_str = time.strftime("%H:%M:%S", time.gmtime(remaining_time))
 
+        sys.stdout.reconfigure(encoding="utf-8")
         print(f'\r{percent}%|{bar}|[{elapsed_time_str}<{remaining_time_str}]', end='')
         if iteration == total:
             print()
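Two behavioural notes on the hunks above. Rebuilding the result as `{key: d[key] for key in sorted(d.keys())}` pins the iteration order of the redundancy map, so every rank walks the same parameters in the same order regardless of how the dict was populated; and `sys.stdout.reconfigure(encoding="utf-8")` (hence the new `import sys`) keeps the progress bar printable on Windows consoles whose default code page is not UTF-8. A minimal illustration of the ordering point:

    # Sorting keys makes dict iteration order independent of insertion order.
    d1 = {'b': 2, 'a': 1}   # populated in one order on rank 0
    d2 = {'a': 1, 'b': 2}   # populated in another order on rank 1
    s1 = {k: d1[k] for k in sorted(d1.keys())}
    s2 = {k: d2[k] for k in sorted(d2.keys())}
    assert list(s1) == list(s2) == ['a', 'b']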
mindspore/train/amp.py
CHANGED

@@ -638,7 +638,7 @@ def _add_loss_network(network, loss_fn, cast_model_type):
 
 
 def _is_grad_accumulation(mcell):
-    if mcell.cls_name == "GradAccumulationCell":
+    if mcell.cls_name == "GradAccumulationCell" or mcell.cls_name == "GradAccumulation":
         return True
     for cell in mcell.cells():
         if _is_grad_accumulation(cell):
mindspore/train/callback/__init__.py
CHANGED

@@ -36,9 +36,9 @@ from mindspore.train.callback._reduce_lr_on_plateau import ReduceLROnPlateau
 from mindspore.train.callback._on_request_exit import OnRequestExit
 from mindspore.train.callback._backup_and_restore import BackupAndRestore
 from mindspore.train.callback._flops_collector import FlopsUtilizationCollector
-from mindspore.train.callback._tft_register import TFTRegister
+from mindspore.train.callback._train_fault_tolerance import TrainFaultTolerance
 
 __all__ = ["Callback", "LossMonitor", "TimeMonitor", "ModelCheckpoint", "FlopsUtilizationCollector",
            "SummaryCollector", "CheckpointConfig", "RunContext", "LearningRateScheduler", "SummaryLandscape",
            "History", "LambdaCallback", "ReduceLROnPlateau", "EarlyStopping", "OnRequestExit", "BackupAndRestore",
-           "TFTRegister"]
+           "TrainFaultTolerance"]
mindspore/train/callback/_callback.py
CHANGED

@@ -121,10 +121,7 @@ class Callback:
     When creating a custom Callback, model context information can be obtained in Callback
     methods by calling `RunContext.original_args()`, which is a dictionary varivable
     recording current attributes. Users can add custimized attributes to the information.
-    Training process can also be stopped by calling `request_stop` method. For details
-    of custom Callback, please check
-    `Callback tutorial <https://www.mindspore.cn/docs/en/master/model_train/train_process/model/
-    callback.html#customized-callback-mechanism>`_.
+    Training process can also be stopped by calling `request_stop` method.
 
     Examples:
         >>> import numpy as np
@@ -491,9 +488,7 @@ class RunContext:
 
     Callback objects not only can obtain the Model context information by calling by
     `RunContext.original_args()` and add extra attributes to the information, but also can stop the
-    training process by calling `request_stop` method. For details of custom Callback,
-    please check
-    `Callback Mechanism <https://www.mindspore.cn/docs/en/master/model_train/train_process/model/callback.html>`_.
+    training process by calling `request_stop` method.
 
     `RunContext.original_args()` holds the model context information as a dictionary variable, and
     different attributes of the dictionary are stored in training or eval process. Details are as follows:
@@ -572,10 +567,6 @@ class RunContext:
 
         Returns:
             Dict, an object that holds the original arguments of model.
-
-        Tutorial Examples:
-            - `Callback Mechanism - Customized Callback Mechanism
-              <https://mindspore.cn/docs/en/master/model_train/train_process/model/callback.html#customized-callback-mechanism>`_
         """
         return self._original_args
 
@@ -585,11 +576,6 @@ class RunContext:
 
         Callbacks can use this function to request stop of iterations.
         model.train() checks whether this is called or not.
-
-        Tutorial Examples:
-            - `Callback Mechanism - Customized Training Termination Time
-              <https://mindspore.cn/docs/en/master/model_train/train_process/model/callback.html#
-              customized-training-termination-time>`_
         """
         self._stop_requested = True
 
mindspore/train/callback/_checkpoint.py
CHANGED

@@ -28,15 +28,12 @@ from mindspore.train.serialization import save_checkpoint, _save_graph, _wait_as
     _wait_async_thread_save_ckpt, _check_async_save
 from mindspore.parallel._cell_wrapper import destroy_allgather_cell
 from mindspore.parallel._recovery_context import _set_recovery_context, _get_recovery_context
-from mindspore.
-from mindspore.
-from mindspore.
-from mindspore.train._utils import get_parameter_redundancy, remove_param_redundancy
-from mindspore.train.callback._callback import Callback, set_cur_net
+from mindspore.communication.management import get_rank, get_group_size
+from mindspore.train._utils import get_parameter_redundancy, remove_param_redundancy, _get_pp_size_from_redundancy_map
+from mindspore.train.callback._callback import Callback
 from mindspore.common.tensor import Tensor
 from mindspore.common.parameter import Parameter
 from mindspore.common.generator import Generator
-from mindspore.common.api import _cell_graph_executor
 from mindspore._c_expression import collect_host_info, get_clock_syscnt
 
 _cur_dir = os.getcwd()
@@ -87,7 +84,7 @@ def _chg_ckpt_file_name_if_same_exist(directory, prefix, exception=False):
         name_ext = os.path.splitext(filename)
         if exception and filename[-16:] != "_breakpoint.ckpt":
             continue
-        if not exception and (name_ext[-1] != ".ckpt" or filename[-16:] == "_breakpoint.ckpt"):
+        if not exception and (name_ext[-1] not in (".ckpt", ".safetensors") or filename[-16:] == "_breakpoint.ckpt"):
             continue
         # find same prefix file
         if filename.find(prefix) == 0 and not filename[pre_len].isalpha():
@@ -106,10 +103,10 @@ def _chg_ckpt_file_name_if_same_exist(directory, prefix, exception=False):
     return prefix
 
 
-def _check_format_and_other_params(format, enc_key, enc_mode, crc_check=False,
+def _check_format_and_other_params(format, enc_key, enc_mode, crc_check=False, exception_save=False,
                                    map_param_inc=False, global_step_num=None):
-    param_not_default = (enc_key is not None or enc_mode != "AES-GCM" or crc_check or map_param_inc
-                         or global_step_num is not None)
+    param_not_default = (enc_key is not None or enc_mode != "AES-GCM" or crc_check or exception_save or map_param_inc
+                         or global_step_num is not None)
     if format == "safetensors" and param_not_default:
         raise ValueError("For 'save_checkpoint', when format is 'safetensors', other param must be default.")
 
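The reworked check means any non-default option, now including the new `exception_save` flag, is rejected when the target format is safetensors. A quick sketch of the guard in isolation (a hypothetical mirror of the logic above, not the packaged function):

    # Hypothetical mirror of _check_format_and_other_params for illustration.
    def check_format(format, enc_key=None, enc_mode="AES-GCM", crc_check=False,
                     exception_save=False, map_param_inc=False, global_step_num=None):
        param_not_default = (enc_key is not None or enc_mode != "AES-GCM" or crc_check
                             or exception_save or map_param_inc or global_step_num is not None)
        if format == "safetensors" and param_not_default:
            raise ValueError("when format is 'safetensors', other param must be default.")

    check_format("safetensors")                          # passes
    try:
        check_format("safetensors", exception_save=True)
    except ValueError as e:
        print(e)                                         # rejected in 2.6.0rc1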
@@ -139,9 +136,9 @@ class CheckpointConfig:
         integrated_save (bool): Whether to merge and save the split Tensor in the automatic parallel scenario.
             Integrated save function is only supported in automatic parallel scene, not supported
             in manual parallel. Default: ``True`` .
-        async_save (Union[bool, str]):Whether to use asynchronous saving of the checkpoint file, if True,
-            the asynchronous thread is used by default. If the type is string,
-            the method of asynchronous saving, it can be "process" or "thread".
+        async_save (Union[bool, str], optional):Whether to use asynchronous saving of the checkpoint file or
+            safetensors file, if True, the asynchronous thread is used by default. If the type
+            is string, the method of asynchronous saving, it can be "process" or "thread".
             Default: ``False`` .
         saved_network (Cell): Network to be saved in checkpoint file. If the saved_network has no relation
             with the network in training, the initial value of saved_network will be saved. Default: ``None`` .
@@ -261,8 +258,7 @@ class CheckpointConfig:
         self.enable_redundance = kwargs.get('enable_redundance', False)
         self.remove_redundancy = Validator.check_isinstance('remove_redundancy', remove_redundancy, bool)
 
-        _check_format_and_other_params(format, enc_key, enc_mode, crc_check,
-                                       self._map_param_inc)
+        _check_format_and_other_params(format, enc_key, enc_mode, crc_check, exception_save, self._map_param_inc)
 
     @property
     def save_checkpoint_steps(self):
@@ -452,8 +448,9 @@ class ModelCheckpoint(Callback):
     Note:
         In the distributed training scenario, please specify different directories for each training process
         to save the checkpoint file. Otherwise, the training may fail.
-        If this callback is used in the
-
+        If this callback is used in the
+        `Model <https://www.mindspore.cn/docs/en/master/api_python/train/mindspore.train.Model.html>`_ function,
+        the checkpoint file will saved parameters of the optimizer by default.
 
     Args:
         prefix (Union[str, callable object]): The prefix name or callable object to generate name of checkpoint files.
@@ -514,7 +511,7 @@ class ModelCheckpoint(Callback):
         if callable(prefix):
             self._prefix_func = prefix
 
-        if _get_recovery_context("enable_recovery"):
+        if context.get_context("device_target") == "GPU" and _get_recovery_context("enable_recovery"):
             _set_recovery_context(ckpt_path=self._directory)
 
         if config is None:
@@ -556,19 +553,17 @@ class ModelCheckpoint(Callback):
             from aiturbo.checkpoint import aiturbo_mindspore as aiturbo
             ckpt_storage_path = self._directory
             rank_id = get_rank()
-
-            stage_rank_num = _get_device_num() // stage_num
+            device_num = get_group_size()
             param_layout = cb_params.train_network.parameter_layout_dict
             if not param_layout:
-                layout = {"stage_num":
+                layout = {"stage_num": 1, "stage_rank_num": device_num, "stage_layout": None}
                 aiturbo.init(ckpt_storage_path, rank_id, layout, None, False, None)
             else:
-
-
-
-                param_redundancy_dict = get_parameter_redundancy(param_layout, initial_rank)
+                param_redundancy_dict = get_parameter_redundancy(param_layout)
+                pp_size = _get_pp_size_from_redundancy_map(param_redundancy_dict)
+                stage_num = device_num // pp_size
                 dp, _ = _get_dp_tp_from_layout(param_redundancy_dict)
-                layout = {"stage_num": stage_num, "stage_rank_num":
+                layout = {"stage_num": stage_num, "stage_rank_num": pp_size,
                           "stage_layout": param_redundancy_dict}
                 single_params = remove_param_redundancy(param_redundancy_dict)
                 single_params = {device_id: list(params) for device_id, params in single_params.items()}
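Where 2.5.0 pulled the stage count from the auto-parallel context, the AITURBO path now derives it from the redundancy map itself: `pp_size` (from `_get_pp_size_from_redundancy_map`, sketched earlier) is the rank span of one pipeline stage, so the stage layout reduces to integer division. A hypothetical check of the invariant:

    # Hypothetical sizes: stage width must divide the world size evenly.
    device_num, pp_size = 32, 16        # get_group_size() and redundancy-derived span
    stage_num = device_num // pp_size   # 2 pipeline stages
    assert stage_num * pp_size == device_num
    print({"stage_num": stage_num, "stage_rank_num": pp_size})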
@@ -684,12 +679,6 @@ class ModelCheckpoint(Callback):
         self._last_time_for_keep = time.time()
         self._last_triggered_step = cb_params.cur_step_num
 
-        # TODO(MS_DISABLE_REF_MODE): Delete when remove MS_DISABLE_REF_MODE env.
-        if context.get_context("enable_ge") and os.getenv('MS_DISABLE_REF_MODE') \
-                and context.get_context("mode") == context.GRAPH_MODE:
-            set_cur_net(cb_params.train_network)
-            cb_params.train_network.add_flags(ge_sync_data=True)
-            _cell_graph_executor(cb_params.train_network, phase='save')
         self._append_dict_content(cb_params.cur_epoch_num, cb_params.cur_step_num)
         network = self._config.saved_network if self._config.saved_network is not None else cb_params.train_network
         if os.getenv("AITURBO") == "1":
@@ -698,18 +687,13 @@ class ModelCheckpoint(Callback):
                                 crc_check=self._config.crc_check, incremental=self._map_param_inc,
                                 global_step_num=cb_params.cur_step_num)
             elif self._config.remove_redundancy:
-
-                if parallel_mode == "stand_alone":
+                if get_group_size() == 1:
                     raise TypeError(f"The deduplication feature for saving checkpoint can only be used "
-                                    f"in parallel scenarios, but got {parallel_mode}.")
+                                    f"in parallel scenarios, but got 'stand_alone'.")
                 param_layout = network.parameter_layout_dict
                 rank_id = get_rank()
                 if param_layout:
-
-                    stage_num = _get_auto_parallel_context("pipeline_stages")
-                    chunk_size = device_num // stage_num
-                    initial_rank = (rank_id // chunk_size) * chunk_size
-                    param_redundancy_dict = get_parameter_redundancy(param_layout, initial_rank)
+                    param_redundancy_dict = get_parameter_redundancy(param_layout)
                     single_params = remove_param_redundancy(param_redundancy_dict)
                     save_param_names = single_params.get(rank_id)
                     param_layout_set = set(param_layout.keys())
mindspore/train/callback/_cluster_monitor.py
CHANGED

@@ -24,9 +24,8 @@ from threading import RLock
 from mindspore.train.callback._callback import Callback
 from mindspore.communication.management import get_rank, get_local_rank
 from mindspore import log as logger
-from mindspore.parallel._auto_parallel_context import _get_auto_parallel_context
 from mindspore.parallel._utils import _get_device_num
-from mindspore.train._utils import get_parameter_redundancy
+from mindspore.train._utils import get_parameter_redundancy, _get_pp_size_from_redundancy_map
 
 _perf_mutex = RLock()
 
@@ -42,7 +41,7 @@ def _get_dp_tp_from_redundancy(redundancy_tuple):
     return dp, tp
 
 
-def _get_dp_tp_from_layout(parameter_layout_dict, initial_rank=0):
+def _get_dp_tp_from_layout(parameter_layout_dict, initial_rank=None):
     """From layout dict get dp and tp"""
     tp = []
     dp = []
@@ -132,21 +131,9 @@ class ClusterMonitor(Callback):
         self.full_path = self.log_path + self.log_name
 
         self.write_dp_tp_flag = True
-        self.initial_rank = 0
 
     def begin(self, run_context):
         _remove_pre_log()
-        pp_num = _get_auto_parallel_context("pipeline_stages")
-        device_num = _get_device_num()
-
-        original_list = list(range(device_num))
-        chunk_size = device_num // pp_num
-        split_pp_lists = []
-        for i in range(0, device_num, chunk_size):
-            end_index = i + chunk_size if i + chunk_size <= device_num else device_num
-            split_pp_lists.append(original_list[i:end_index])
-
-        self.initial_rank = (self.global_rank // chunk_size) * chunk_size
         with _perf_mutex:
             dir_path = os.path.dirname(self.full_path)
             if not os.path.exists(dir_path):
@@ -157,8 +144,6 @@ class ClusterMonitor(Callback):
             with open(self.full_path, 'w') as file:
                 log_message = f'UUID:{self.uuid_value}\nFRAMEWORK:{self.frame_work}\nGLOBAL RANKID:{self.global_rank}\n'
                 file.write(log_message)
-                for _, split_pp_list in enumerate(split_pp_lists):
-                    file.write(f'PP:{split_pp_list}\n')
             os.chmod(self.full_path, stat.S_IRUSR)
 
     def step_begin(self, run_context):
@@ -183,10 +168,21 @@ class ClusterMonitor(Callback):
         if self.enabled and self.enabled_dtp_group and self.write_dp_tp_flag:
             cb_params = run_context.original_args()
             param_layout_dict = cb_params.train_network.parameter_layout_dict
-
+            device_num = _get_device_num()
+            original_list = list(range(device_num))
+            param_redundancy_dict = get_parameter_redundancy(param_layout_dict)
+            pp_size = _get_pp_size_from_redundancy_map(param_redundancy_dict)
+            split_pp_lists = []
+            for i in range(0, device_num, pp_size):
+                end_index = i + pp_size if i + pp_size <= device_num else device_num
+                split_pp_lists.append(original_list[i:end_index])
+            dp, tp = _get_dp_tp_from_layout(param_layout_dict)
+
             with _perf_mutex:
                 os.chmod(self.full_path, stat.S_IWUSR)
                 with open(self.full_path, 'a') as file:
+                    for _, split_pp_list in enumerate(split_pp_lists):
+                        file.write(f'PP:{split_pp_list}\n')
                     for dp_value in dp:
                         file.write(f'dp:{dp_value}\n')
                     for tp_value in tp:
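The PP-group bookkeeping that `begin` used to compute eagerly from the parallel context now runs lazily in the step hook, once the parameter layout exists, with the redundancy-derived `pp_size` as the chunk width. The chunking itself is plain slicing:

    # Plain rendering of the split_pp_lists loop with made-up sizes.
    device_num, pp_size = 8, 4
    original_list = list(range(device_num))
    split_pp_lists = [original_list[i:min(i + pp_size, device_num)]
                      for i in range(0, device_num, pp_size)]
    print(split_pp_lists)   # [[0, 1, 2, 3], [4, 5, 6, 7]]: one list per pipeline chunk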
mindspore/train/callback/_flops_collector.py
CHANGED

@@ -89,7 +89,7 @@ class FlopsUtilizationCollector(Callback):
         Train per step time: 135.572 ms, mfu:0.47% hfu:0.47%
         Train per step time: 1.317 ms, mfu:48.59% hfu:48.59%
     """
-    def __init__(self, data_size=None, computility=1, full_flops=True, enable_ma_collector=False):
+    def __init__(self, data_size, computility=1, full_flops=True, enable_ma_collector=False):
         super(FlopsUtilizationCollector, self).__init__()
         self.step_time = time.time()
         self.computility = computility
@@ -110,8 +110,7 @@ class FlopsUtilizationCollector(Callback):
         self.batch_step_size = None
         Validator.check_bool(full_flops, "full_flops")
         Validator.check_bool(enable_ma_collector, "enable_ma_collector")
-        if data_size:
-            Validator.check_positive_int(data_size, "data_size")
+        Validator.check_positive_int(data_size, "data_size")
 
     def step_begin(self, run_context):
         """
mindspore/train/callback/_history.py
CHANGED

@@ -25,10 +25,13 @@ class History(Callback):
     """
     Records the network outputs and metrics information into a `History` object.
 
-    The network outputs information will be the loss value if not custimizing the train network or eval network;
-
-
-
+    - The network outputs information will be the loss value if not custimizing the train network or eval network;
+    - If the train network or eval network is custimized:
+
+      - if the custimized network returns a `Tensor` or `numpy.ndarray`, the mean value of network output
+        will be recorded.
+      - if the custimized network returns a `tuple` or `list`, the first element of network
+        outputs will be recorded.
 
     Note:
         Normally used in :func:`mindspore.train.Model.train` or :func:`mindspore.train.Model.fit`.
mindspore/train/callback/_lambda_callback.py
CHANGED

@@ -36,8 +36,8 @@ class LambdaCallback(Callback):
         on_train_step_end (Function): called at each train step end. Default: ``None`` .
         on_train_begin (Function): called at the beginning of model train. Default: ``None`` .
         on_train_end (Function): called at the end of model train. Default: ``None`` .
-        on_eval_epoch_begin (Function): called at eval epoch begin. Default: ``None`` .
-        on_eval_epoch_end (Function): called at eval epoch end. Default: ``None`` .
+        on_eval_epoch_begin (Function): called at each eval epoch begin. Default: ``None`` .
+        on_eval_epoch_end (Function): called at each eval epoch end. Default: ``None`` .
         on_eval_step_begin (Function): called at each eval step begin. Default: ``None`` .
         on_eval_step_end (Function): called at each eval step end. Default: ``None`` .
         on_eval_begin (Function): called at the beginning of model eval. Default: ``None`` .
mindspore/train/callback/_landscape.py
CHANGED

@@ -256,9 +256,6 @@ class SummaryLandscape:
         """
         Clean the checkpoint.
 
-        Tutorial Examples:
-            - `Training Optimization Process Visualization
-              <https://www.mindspore.cn/mindinsight/docs/en/master/landscape.html>`_
         """
         shutil.rmtree(self._ckpt_dir, ignore_errors=True)
 
mindspore/train/callback/_loss_monitor.py
CHANGED

@@ -93,7 +93,8 @@ class LossMonitor(Callback):
 
     def on_train_epoch_end(self, run_context):
         """
-        When LossMonitor used in
+        When LossMonitor used in :func:`mindspore.train.Model.fit`, print eval metrics
+        at the end of epoch if current epoch
         should do evaluation.
 
         Args:
mindspore/train/callback/_on_request_exit.py
CHANGED

@@ -26,6 +26,7 @@ from mindspore.common.tensor import Tensor
 from mindspore.train._utils import _make_directory
 from mindspore import _checkparam as Validator
 from mindspore.train.serialization import load_checkpoint, save_checkpoint, export
+from mindspore.communication.management import get_group_size
 from mindspore.train.callback._callback import Callback
 from mindspore.parallel._utils import _get_parallel_mode
 from mindspore.context import ParallelMode
@@ -37,7 +38,7 @@ class OnRequestExit(Callback):
 
     Register OnRequestExit Callback before training, when the user want to exit the training process
     and save the training data, could send the registered exit signal 'sig' to the training process or modify the
-    'GracefulExit' that a key in the
+    'GracefulExit' that a key in the JSON file specified by the 'config_file' to '1'.
     After the training process executes the current step, saves the current training status,
     including checkpoint and mindir, and then exit the training process.
 
@@ -58,7 +59,7 @@ class OnRequestExit(Callback):
         ValueError: If the 'save_mindir' is not a bool.
         ValueError: If the 'file_name' is not a str.
         ValueError: If the 'directory' is not a str.
-        ValueError: If the 'sig' is not an int or the 'sig' is signal.
+        ValueError: If the 'sig' is not an int or the 'sig' is ``signal.SIGTERM``.
 
     Examples:
         >>> from mindspore import nn
@@ -92,10 +93,8 @@ class OnRequestExit(Callback):
         self.key = "GracefulExit"
         self.remote_config_file = config_file  # used config file to save checkpoint and exit training process
         self.use_graceful = os.environ.get("MS_ENABLE_GRACEFUL_EXIT") == "1"
-        self.is_distributed =
+        self.is_distributed = get_group_size() > 1
         self.integrated_save = True
-        if self.is_distributed:
-            self.integrated_save = _get_parallel_mode() == ParallelMode.AUTO_PARALLEL
         self.stop_train = False
         self.need_do_step_end = False
         if self.save_ckpt or self.save_mindir:
@@ -250,6 +249,8 @@ class OnRequestExit(Callback):
             else:
                 global_step = int(call_params.network.optimizer.global_step.data)
             append_dict["global_step"] = global_step
+            if self.is_distributed:
+                self.integrated_save = _get_parallel_mode() == ParallelMode.AUTO_PARALLEL
             save_checkpoint(net, self.train_name, integrated_save=self.integrated_save,
                             append_dict=append_dict)
         if self.save_mindir:
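For context, the docstring describes two external triggers for a graceful exit: deliver the registered signal to the training process, or set the 'GracefulExit' key in the JSON file named by `config_file` to 1. A hypothetical trigger script (the file path and process id are illustrative, not taken from the package):

    # Hypothetical ways to request a graceful exit from outside the trainer.
    import json

    # 1) Flip the watched JSON key (the path is whatever config_file points at).
    with open("/tmp/graceful_exit.json", "w") as f:
        json.dump({"GracefulExit": 1}, f)

    # 2) Or send the registered signal to the training process, e.g. from Python:
    # import os, signal
    # os.kill(train_pid, signal.SIGTERM)   # train_pid: the trainer's process id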
mindspore/train/callback/_reduce_lr_on_plateau.py
CHANGED

@@ -63,12 +63,17 @@ class ReduceLROnPlateau(Callback):
             will be reduced. Default: ``10`` .
         verbose (bool): If False: quiet, if True: print related information.
             Default: ``False`` .
-        mode (str): one of `{'auto', 'min', 'max'}`.
-
-
-
-
-
+        mode (str): one of `{'auto', 'min', 'max'}`. Default: ``'auto'`` .
+
+            - In ``'min'`` mode,
+              the learning rate will be reduced when the
+              quantity monitored has stopped decreasing.
+            - In ``'max'`` mode it will be
+              reduced when the quantity monitored has stopped increasing.
+            - In ``'auto'``
+              mode, the direction is automatically inferred from the name of the
+              monitored quantity.
+
         min_delta (float): threshold for measuring the new optimum, to only focus on
             significant changes. Default: ``1e-4`` .
         cooldown (int): number of epochs to wait before resuming normal operation after
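The expanded docstring spells out the three modes. In ``'auto'`` the direction is inferred from the monitored quantity's name; a common convention (an assumption here, not quoted from the MindSpore source) is to treat accuracy-like names as 'max' and everything else as 'min':

    # Sketch of a typical 'auto' inference rule; MindSpore's exact rule may differ.
    def infer_mode(monitor, mode="auto"):
        if mode == "auto":
            return "max" if "acc" in monitor else "min"
        return mode

    print(infer_mode("eval_loss"))   # min: reduce LR when loss stops decreasing
    print(infer_mode("accuracy"))    # max: reduce LR when accuracy stops increasing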
mindspore/train/callback/_summary_collector.py
CHANGED

@@ -107,22 +107,18 @@ class SummaryCollector(Callback):
           The first output will be treated as the loss and it will be averaged. Default: ``True`` .
         - collect_graph (bool): Whether to collect the computational graph. Currently, only
           training computational graph is collected. Default: ``True`` .
-        - collect_train_lineage (bool): Whether to collect lineage data for the training phase
-
-
-
-        - collect_eval_lineage (bool): Whether to collect lineage data for the evaluation phase,
-          this field will be displayed on the `lineage page
-          <https://www.mindspore.cn/mindinsight/docs/en/master/lineage_and_scalars_comparison.html>`_
-          of MindInsight. Default: ``True`` .
+        - collect_train_lineage (bool): Whether to collect lineage data for the training phase.
+          Default: ``True`` .
+        - collect_eval_lineage (bool): Whether to collect lineage data for the evaluation phase.
+          Default: ``True`` .
         - collect_input_data (bool): Whether to collect dataset for each training.
           Currently only image data is supported.
           If there are multiple columns of data in the dataset, the first column should be image data.
           Default: ``True`` .
         - collect_dataset_graph (bool): Whether to collect dataset graph for the training phase.
           Default: ``True`` .
-        - histogram_regular (Union[str, None]): Collect weight and bias for parameter distribution page
-
+        - histogram_regular (Union[str, None]): Collect weight and bias for parameter distribution page.
+          This field allows regular strings to control which parameters to collect.
           It is not recommended to collect too many parameters at once, as it can affect performance.
           Note that if you collect too many parameters and run out of memory, the training will fail.
           Default: ``None`` , it means only the first five parameters are collected.
@@ -153,8 +149,7 @@ class SummaryCollector(Callback):
           True: it means that after specified data is set, non-specified data is collected as the default behavior.
          False: it means that after specified data is set, only the specified data is collected,
          and the others are not collected. Default: ``True`` .
-        custom_lineage_data (Union[dict, None]): Allows you to customize the data
-            `lineage page <https://www.mindspore.cn/mindinsight/docs/en/master/lineage_and_scalars_comparison.html>`_ .
+        custom_lineage_data (Union[dict, None]): Allows you to customize the data.
            In the custom data, the type of the key supports str, and the type of value supports str, int
            and float. Default: ``None`` , it means there is no custom data.
        collect_tensor_freq (Optional[int]): The same semantics as the `collect_freq`, but controls TensorSummary only.
@@ -168,7 +163,7 @@ class SummaryCollector(Callback):
           affect the number of steps TensorSummary will be collected.
          Default: ``None`` , which means to follow the behavior as described above.
        max_file_size (Optional[int]): The maximum size in bytes of each file that can be written to the disk.
-            For example, to write not larger than 4GB, specify `max_file_size=4*1024
+            For example, to write not larger than 4GB, specify `max_file_size=4*1024**3`.
            Default: ``None`` , which means no limit.
        export_options (Union[None, dict]): Perform custom operations on the export data.
            Note that the size of export files is not limited by the max_file_size.
mindspore/train/callback/_time_monitor.py
CHANGED

@@ -28,7 +28,8 @@ class TimeMonitor(Callback):
     Args:
         data_size (int): How many steps are the intervals between print information each time.
             if the program get `batch_num` during training, `data_size` will be set to `batch_num`,
-            otherwise `data_size` will be used.
+            otherwise `data_size` will be used. If the program does not get `batch_num` during training,
+            meanwhile `data_size` does not set, the program will report an error. Default: ``None`` .
 
         data_time (bool): Whether to show the average time of fetching data in Host.
             Note that data fetch and network compute are processed sequentially in non dataset sink mode, while
|