mindspore-2.7.0rc1-cp311-cp311-win_amd64.whl → mindspore-2.7.1-cp311-cp311-win_amd64.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Potentially problematic release: this version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +5 -2
- mindspore/_c_dataengine.cp311-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp311-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp311-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +2 -2
- mindspore/_extends/builtin_operations.py +3 -3
- mindspore/_extends/parallel_compile/akg_compiler/custom.py +1109 -0
- mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
- mindspore/_extends/parse/__init__.py +3 -3
- mindspore/_extends/parse/compile_config.py +24 -1
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +6 -3
- mindspore/_extends/parse/parser.py +28 -22
- mindspore/_extends/parse/resources.py +1 -1
- mindspore/_extends/parse/standard_method.py +23 -2
- mindspore/_extends/parse/trope.py +2 -1
- mindspore/_extends/pijit/pijit_func_white_list.py +9 -27
- mindspore/amp.py +0 -18
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/base.py +29 -2
- mindspore/common/__init__.py +18 -12
- mindspore/common/_decorator.py +3 -2
- mindspore/common/_grad_function.py +3 -1
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +371 -96
- mindspore/common/_utils.py +7 -43
- mindspore/common/api.py +434 -135
- mindspore/common/dtype.py +98 -57
- mindspore/common/dump.py +7 -108
- mindspore/common/dynamic_shape/__init__.py +0 -0
- mindspore/common/{auto_dynamic_shape.py → dynamic_shape/auto_dynamic_shape.py} +15 -23
- mindspore/common/dynamic_shape/enable_dynamic.py +197 -0
- mindspore/common/file_system.py +59 -9
- mindspore/common/hook_handle.py +82 -3
- mindspore/common/jit_config.py +5 -1
- mindspore/common/jit_trace.py +27 -12
- mindspore/common/lazy_inline.py +5 -3
- mindspore/common/np_dtype.py +3 -3
- mindspore/common/parameter.py +17 -127
- mindspore/common/recompute.py +4 -13
- mindspore/common/tensor.py +50 -217
- mindspore/communication/_comm_helper.py +11 -1
- mindspore/communication/comm_func.py +138 -4
- mindspore/communication/management.py +85 -1
- mindspore/config/op_info.config +0 -15
- mindspore/context.py +20 -106
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/transforms.py +1 -1
- mindspore/dataset/core/config.py +35 -1
- mindspore/dataset/engine/datasets.py +338 -319
- mindspore/dataset/engine/datasets_user_defined.py +38 -22
- mindspore/dataset/engine/datasets_vision.py +1 -1
- mindspore/dataset/engine/validators.py +1 -15
- mindspore/dataset/transforms/c_transforms.py +2 -2
- mindspore/dataset/transforms/transforms.py +3 -3
- mindspore/dataset/vision/__init__.py +1 -1
- mindspore/dataset/vision/py_transforms.py +8 -8
- mindspore/dataset/vision/transforms.py +17 -5
- mindspore/dataset/vision/utils.py +632 -21
- mindspore/device_context/ascend/op_tuning.py +35 -1
- mindspore/dnnl.dll +0 -0
- mindspore/{profiler/common/validator → graph}/__init__.py +9 -1
- mindspore/graph/custom_pass.py +55 -0
- mindspore/include/api/cell.h +28 -4
- mindspore/include/api/cfg.h +24 -7
- mindspore/include/api/context.h +1 -0
- mindspore/include/api/delegate.h +0 -2
- mindspore/include/api/dual_abi_helper.h +100 -19
- mindspore/include/api/graph.h +14 -1
- mindspore/include/api/kernel.h +16 -3
- mindspore/include/api/kernel_api.h +9 -1
- mindspore/include/api/metrics/accuracy.h +9 -0
- mindspore/include/api/model.h +5 -1
- mindspore/include/api/model_group.h +4 -0
- mindspore/include/api/model_parallel_runner.h +2 -0
- mindspore/include/api/status.h +48 -10
- mindspore/include/api/types.h +6 -1
- mindspore/include/dataset/constants.h +9 -0
- mindspore/include/dataset/execute.h +2 -2
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/__init__.py +3 -3
- mindspore/mindrecord/common/exceptions.py +1 -0
- mindspore/mindrecord/config.py +1 -1
- mindspore/{parallel/mpi → mindrecord/core}/__init__.py +4 -1
- mindspore/mindrecord/{shardheader.py → core/shardheader.py} +2 -1
- mindspore/mindrecord/{shardindexgenerator.py → core/shardindexgenerator.py} +1 -1
- mindspore/mindrecord/{shardreader.py → core/shardreader.py} +2 -1
- mindspore/mindrecord/{shardsegment.py → core/shardsegment.py} +2 -2
- mindspore/mindrecord/{shardutils.py → core/shardutils.py} +1 -1
- mindspore/mindrecord/{shardwriter.py → core/shardwriter.py} +1 -1
- mindspore/mindrecord/filereader.py +4 -4
- mindspore/mindrecord/filewriter.py +5 -5
- mindspore/mindrecord/mindpage.py +2 -2
- mindspore/mindrecord/tools/cifar10.py +4 -3
- mindspore/mindrecord/tools/cifar100.py +1 -1
- mindspore/mindrecord/tools/cifar100_to_mr.py +1 -1
- mindspore/mindrecord/tools/cifar10_to_mr.py +6 -6
- mindspore/mindrecord/tools/csv_to_mr.py +1 -1
- mindspore/mindrecord/tools/imagenet_to_mr.py +1 -1
- mindspore/mindrecord/tools/mnist_to_mr.py +1 -1
- mindspore/mindrecord/tools/tfrecord_to_mr.py +1 -1
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_cluster.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_cpu.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_hardware_abstract.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/{mindspore_ops_host.dll → mindspore_ops_cpu.dll} +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mindspore_runtime_utils.dll +0 -0
- mindspore/mindspore_tools.dll +0 -0
- mindspore/mint/__init__.py +15 -10
- mindspore/mint/distributed/__init__.py +4 -0
- mindspore/mint/distributed/distributed.py +392 -69
- mindspore/mint/nn/__init__.py +2 -16
- mindspore/mint/nn/functional.py +4 -110
- mindspore/mint/nn/layer/__init__.py +0 -2
- mindspore/mint/nn/layer/_functions.py +1 -2
- mindspore/mint/nn/layer/activation.py +0 -6
- mindspore/mint/nn/layer/basic.py +0 -47
- mindspore/mint/nn/layer/conv.py +10 -10
- mindspore/mint/nn/layer/normalization.py +11 -16
- mindspore/mint/nn/layer/pooling.py +0 -4
- mindspore/nn/__init__.py +1 -3
- mindspore/nn/cell.py +231 -239
- mindspore/nn/layer/activation.py +4 -2
- mindspore/nn/layer/basic.py +56 -14
- mindspore/nn/layer/container.py +16 -0
- mindspore/nn/layer/embedding.py +4 -169
- mindspore/nn/layer/image.py +1 -1
- mindspore/nn/layer/normalization.py +2 -1
- mindspore/nn/layer/thor_layer.py +4 -85
- mindspore/nn/optim/ada_grad.py +0 -1
- mindspore/nn/optim/adafactor.py +0 -1
- mindspore/nn/optim/adam.py +32 -127
- mindspore/nn/optim/adamax.py +0 -1
- mindspore/nn/optim/asgd.py +0 -1
- mindspore/nn/optim/ftrl.py +8 -102
- mindspore/nn/optim/lamb.py +1 -4
- mindspore/nn/optim/lars.py +0 -3
- mindspore/nn/optim/lazyadam.py +25 -218
- mindspore/nn/optim/momentum.py +5 -43
- mindspore/nn/optim/optimizer.py +6 -55
- mindspore/nn/optim/proximal_ada_grad.py +0 -1
- mindspore/nn/optim/rmsprop.py +0 -1
- mindspore/nn/optim/rprop.py +0 -1
- mindspore/nn/optim/sgd.py +0 -1
- mindspore/nn/optim/tft_wrapper.py +2 -4
- mindspore/nn/optim/thor.py +0 -2
- mindspore/nn/probability/bijector/bijector.py +7 -8
- mindspore/nn/probability/bijector/gumbel_cdf.py +2 -2
- mindspore/nn/probability/bijector/power_transform.py +20 -21
- mindspore/nn/probability/bijector/scalar_affine.py +5 -5
- mindspore/nn/probability/bijector/softplus.py +13 -14
- mindspore/nn/probability/distribution/_utils/utils.py +2 -2
- mindspore/nn/wrap/cell_wrapper.py +39 -5
- mindspore/nn/wrap/grad_reducer.py +4 -89
- mindspore/numpy/array_creations.py +4 -4
- mindspore/numpy/fft.py +9 -9
- mindspore/numpy/utils_const.py +1 -1
- mindspore/{nn/reinforcement → onnx}/__init__.py +5 -8
- mindspore/onnx/onnx_export.py +137 -0
- mindspore/opencv_core4110.dll +0 -0
- mindspore/opencv_imgcodecs4110.dll +0 -0
- mindspore/{opencv_imgproc452.dll → opencv_imgproc4110.dll} +0 -0
- mindspore/ops/__init__.py +2 -0
- mindspore/ops/_grad_experimental/grad_comm_ops.py +38 -2
- mindspore/ops/_grad_experimental/grad_inner_ops.py +0 -9
- mindspore/ops/_op_impl/aicpu/__init__.py +0 -10
- mindspore/ops/_op_impl/cpu/__init__.py +1 -5
- mindspore/ops/_op_impl/cpu/{buffer_append.py → joinedstr_op.py} +8 -8
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +28 -24
- mindspore/ops/auto_generate/gen_extend_func.py +6 -11
- mindspore/ops/auto_generate/gen_ops_def.py +385 -154
- mindspore/ops/auto_generate/gen_ops_prim.py +5676 -5167
- mindspore/ops/communication.py +97 -0
- mindspore/ops/composite/__init__.py +5 -2
- mindspore/ops/composite/base.py +16 -2
- mindspore/ops/composite/multitype_ops/__init__.py +3 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +150 -8
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
- mindspore/ops/composite/multitype_ops/add_impl.py +7 -0
- mindspore/ops/composite/multitype_ops/mod_impl.py +27 -0
- mindspore/ops/function/__init__.py +2 -0
- mindspore/ops/function/array_func.py +24 -18
- mindspore/ops/function/comm_func.py +3883 -0
- mindspore/ops/function/debug_func.py +7 -6
- mindspore/ops/function/grad/grad_func.py +4 -12
- mindspore/ops/function/math_func.py +89 -86
- mindspore/ops/function/nn_func.py +92 -313
- mindspore/ops/function/random_func.py +9 -18
- mindspore/ops/functional.py +4 -1
- mindspore/ops/functional_overload.py +377 -30
- mindspore/ops/operations/__init__.py +2 -5
- mindspore/ops/operations/_custom_ops_utils.py +7 -9
- mindspore/ops/operations/_inner_ops.py +12 -50
- mindspore/ops/operations/_rl_inner_ops.py +0 -933
- mindspore/ops/operations/array_ops.py +5 -50
- mindspore/ops/operations/comm_ops.py +95 -17
- mindspore/ops/operations/custom_ops.py +237 -22
- mindspore/ops/operations/debug_ops.py +33 -35
- mindspore/ops/operations/manually_defined/ops_def.py +39 -318
- mindspore/ops/operations/math_ops.py +5 -5
- mindspore/ops/operations/nn_ops.py +3 -3
- mindspore/ops/operations/sparse_ops.py +0 -83
- mindspore/ops/primitive.py +4 -27
- mindspore/ops/tensor_method.py +88 -10
- mindspore/ops_generate/aclnn/aclnn_kernel_register_auto_cc_generator.py +5 -5
- mindspore/ops_generate/aclnn/gen_aclnn_implement.py +8 -8
- mindspore/ops_generate/api/functions_cc_generator.py +53 -4
- mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +25 -11
- mindspore/ops_generate/common/gen_constants.py +11 -10
- mindspore/ops_generate/common/op_proto.py +18 -1
- mindspore/ops_generate/common/template.py +102 -245
- mindspore/ops_generate/common/template_utils.py +212 -0
- mindspore/ops_generate/gen_custom_ops.py +69 -0
- mindspore/ops_generate/op_def/ops_def_cc_generator.py +78 -7
- mindspore/ops_generate/op_def_py/base_op_prim_py_generator.py +360 -0
- mindspore/ops_generate/op_def_py/custom_op_prim_py_generator.py +140 -0
- mindspore/ops_generate/op_def_py/op_def_py_generator.py +54 -7
- mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -312
- mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +74 -17
- mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +22 -5
- mindspore/ops_generate/pyboost/gen_pyboost_func.py +0 -16
- mindspore/ops_generate/pyboost/op_template_parser.py +3 -2
- mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +21 -5
- mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +2 -2
- mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +30 -10
- mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +10 -3
- mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +19 -9
- mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +71 -28
- mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +10 -9
- mindspore/ops_generate/pyboost/pyboost_utils.py +27 -16
- mindspore/ops_generate/resources/yaml_loader.py +13 -0
- mindspore/ops_generate/tensor_py_cc_generator.py +2 -2
- mindspore/parallel/_auto_parallel_context.py +5 -15
- mindspore/parallel/_cell_wrapper.py +1 -1
- mindspore/parallel/_parallel_serialization.py +4 -6
- mindspore/parallel/_ps_context.py +2 -2
- mindspore/parallel/_utils.py +34 -17
- mindspore/parallel/auto_parallel.py +23 -9
- mindspore/parallel/checkpoint_transform.py +20 -2
- mindspore/parallel/cluster/process_entity/_api.py +28 -33
- mindspore/parallel/cluster/process_entity/_utils.py +9 -5
- mindspore/parallel/cluster/run.py +5 -3
- mindspore/{experimental/llm_boost/ascend_native → parallel/distributed}/__init__.py +21 -22
- mindspore/parallel/distributed/distributed_data_parallel.py +393 -0
- mindspore/parallel/distributed/flatten_grad_buffer.py +295 -0
- mindspore/parallel/function/reshard_func.py +6 -5
- mindspore/parallel/nn/parallel_cell_wrapper.py +40 -3
- mindspore/parallel/nn/parallel_grad_reducer.py +0 -8
- mindspore/parallel/shard.py +7 -21
- mindspore/parallel/strategy.py +336 -0
- mindspore/parallel/transform_safetensors.py +127 -20
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +13 -9
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +1 -1
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +1 -1
- mindspore/profiler/common/constant.py +5 -0
- mindspore/profiler/common/file_manager.py +9 -0
- mindspore/profiler/common/msprof_cmd_tool.py +40 -4
- mindspore/profiler/common/path_manager.py +65 -24
- mindspore/profiler/common/profiler_context.py +27 -14
- mindspore/profiler/common/profiler_info.py +3 -3
- mindspore/profiler/common/profiler_meta_data.py +1 -0
- mindspore/profiler/common/profiler_op_analyse.py +10 -6
- mindspore/profiler/common/profiler_path_manager.py +13 -0
- mindspore/profiler/common/util.py +30 -3
- mindspore/profiler/dynamic_profiler.py +91 -46
- mindspore/profiler/envprofiler.py +30 -5
- mindspore/profiler/experimental_config.py +18 -2
- mindspore/profiler/platform/cpu_profiler.py +10 -4
- mindspore/profiler/platform/npu_profiler.py +34 -7
- mindspore/profiler/profiler.py +193 -145
- mindspore/profiler/profiler_action_controller.py +1 -1
- mindspore/profiler/profiler_interface.py +2 -2
- mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
- mindspore/run_check/_check_version.py +108 -24
- mindspore/runtime/__init__.py +9 -6
- mindspore/runtime/executor.py +35 -0
- mindspore/runtime/memory.py +113 -0
- mindspore/runtime/thread_bind_core.py +1 -1
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/{experimental/llm_boost → tools}/__init__.py +5 -5
- mindspore/tools/data_dump.py +130 -0
- mindspore/tools/sdc_detect.py +91 -0
- mindspore/tools/stress_detect.py +63 -0
- mindspore/train/__init__.py +6 -6
- mindspore/train/_utils.py +8 -21
- mindspore/train/amp.py +6 -7
- mindspore/train/callback/_callback.py +2 -1
- mindspore/train/callback/_checkpoint.py +1 -17
- mindspore/train/callback/_flops_collector.py +10 -6
- mindspore/train/callback/_train_fault_tolerance.py +72 -25
- mindspore/train/data_sink.py +5 -9
- mindspore/train/dataset_helper.py +5 -5
- mindspore/train/model.py +41 -230
- mindspore/train/serialization.py +160 -401
- mindspore/train/train_thor/model_thor.py +2 -2
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +6 -3
- mindspore/utils/dlpack.py +92 -0
- mindspore/utils/dryrun.py +1 -1
- mindspore/utils/runtime_execution_order_check.py +10 -0
- mindspore/utils/sdc_detect.py +14 -12
- mindspore/utils/stress_detect.py +43 -0
- mindspore/utils/utils.py +152 -16
- mindspore/version.py +1 -1
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/METADATA +3 -2
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/RECORD +330 -344
- mindspore/_extends/remote/kernel_build_server_ascend.py +0 -75
- mindspore/communication/_hccl_management.py +0 -297
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +0 -207
- mindspore/experimental/llm_boost/ascend_native/llm_boost.py +0 -52
- mindspore/experimental/llm_boost/atb/__init__.py +0 -23
- mindspore/experimental/llm_boost/atb/boost_base.py +0 -385
- mindspore/experimental/llm_boost/atb/llama_boost.py +0 -137
- mindspore/experimental/llm_boost/atb/qwen_boost.py +0 -124
- mindspore/experimental/llm_boost/register.py +0 -130
- mindspore/experimental/llm_boost/utils.py +0 -31
- mindspore/include/OWNERS +0 -7
- mindspore/mindspore_cpu_res_manager.dll +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/nn/optim/_dist_optimizer_registry.py +0 -111
- mindspore/nn/reinforcement/_batch_read_write.py +0 -142
- mindspore/nn/reinforcement/_tensors_queue.py +0 -152
- mindspore/nn/reinforcement/tensor_array.py +0 -145
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/ops/_op_impl/aicpu/priority_replay_buffer.py +0 -113
- mindspore/ops/_op_impl/aicpu/reservoir_replay_buffer.py +0 -96
- mindspore/ops/_op_impl/aicpu/sparse_cross.py +0 -42
- mindspore/ops/_op_impl/cpu/buffer_get.py +0 -28
- mindspore/ops/_op_impl/cpu/buffer_sample.py +0 -28
- mindspore/ops/_op_impl/cpu/priority_replay_buffer.py +0 -42
- mindspore/ops/operations/_tensor_array.py +0 -359
- mindspore/ops/operations/rl_ops.py +0 -288
- mindspore/parallel/_offload_context.py +0 -275
- mindspore/parallel/_recovery_context.py +0 -115
- mindspore/parallel/_transformer/__init__.py +0 -35
- mindspore/parallel/_transformer/layers.py +0 -765
- mindspore/parallel/_transformer/loss.py +0 -251
- mindspore/parallel/_transformer/moe.py +0 -693
- mindspore/parallel/_transformer/op_parallel_config.py +0 -222
- mindspore/parallel/_transformer/transformer.py +0 -3124
- mindspore/parallel/mpi/_mpi_config.py +0 -116
- mindspore/profiler/common/validator/validate_path.py +0 -84
- mindspore/train/memory_profiling_pb2.py +0 -298
- mindspore/utils/hooks.py +0 -81
- /mindspore/common/{_auto_dynamic.py → dynamic_shape/_auto_dynamic.py} +0 -0
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/WHEEL +0 -0
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/top_level.txt +0 -0
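The per-file changes below come from comparing the unpacked contents of the two wheels. If you want to reproduce a similar file-level comparison yourself, here is a minimal sketch (illustrative only, not part of the published diff; it assumes both wheels can be downloaded from the public index for the cp311/win_amd64 platform):

```python
# Hypothetical reproduction sketch: download both wheels, unpack them,
# and compare their file lists.
import pathlib
import subprocess
import zipfile

def fetch(version, dest):
    # --platform/--python-version let pip fetch a wheel built for another platform.
    subprocess.run(
        ["pip", "download", f"mindspore=={version}", "--no-deps",
         "--only-binary=:all:", "--python-version", "311",
         "--platform", "win_amd64", "-d", dest],
        check=True,
    )
    return next(pathlib.Path(dest).glob("mindspore-*.whl"))

def file_set(whl):
    with zipfile.ZipFile(whl) as z:
        return set(z.namelist())

old = file_set(fetch("2.7.0rc1", "old"))
new = file_set(fetch("2.7.1", "new"))
print("removed:", sorted(old - new))
print("added:", sorted(new - old))
```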
mindspore/nn/optim/lazyadam.py
CHANGED
@@ -26,34 +26,19 @@ from mindspore.common.tensor import Tensor
 from mindspore import _checkparam as validator
 from mindspore.nn.optim.optimizer import Optimizer
 from mindspore.nn.optim.optimizer import opt_init_args_register
-from mindspore.nn.optim._dist_optimizer_registry import _register_dist_optimizer
 from mindspore.common._decorator import deprecated
 
 _lazy_adam_opt = C.MultitypeFuncGraph("lazy_adam_opt")
 
 
-@_lazy_adam_opt.register("Function", "Function", "
-                         "Tensor", "Tensor", "Tensor", "Tensor", "RowTensor", "Tensor", "Tensor", "Tensor"
-
-
-                              beta1, beta2, eps, lr, gradient, params, m, v, ps_parameter, cache_enable,
-                              distributed_opt, use_flag, distributed_sparse_opt, use_sparse_flag):
+@_lazy_adam_opt.register("Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
+                         "Tensor", "Tensor", "Tensor", "Tensor", "RowTensor", "Tensor", "Tensor", "Tensor")
+def _run_opt_with_sparse(opt, sparse_opt, use_locking, use_nesterov, target, beta1_power, beta2_power,
+                         beta1, beta2, eps, lr, gradient, params, m, v):
     """Apply sparse lazy adam optimizer to the weight parameter when the gradient is sparse."""
     success = True
     indices = gradient.indices
     values = gradient.values
-    if use_sparse_flag:
-        success = F.depend(success, distributed_sparse_opt(params, m, v, beta1_power, beta2_power, lr, beta1, beta2,
-                                                           eps, values, indices))
-        return success
-    if ps_parameter and not cache_enable:
-        op_shape = P.Shape()
-        shapes = (op_shape(params), op_shape(m), op_shape(v),
-                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
-                  op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
-        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2,
-                                               eps, values, indices), shapes), params))
-        return success
 
     if not target:
         success = F.depend(success, sparse_opt(params, m, v, beta1_power, beta2_power, lr, beta1, beta2,
@@ -85,122 +70,10 @@ def _run_opt_with_sparse_dist(opt, sparse_opt, push, pull, use_locking, use_nest
     return success
 
 
-@_lazy_adam_opt.register("Function", "Function", "
-                         "Tensor", "Tensor", "Tensor", "Tensor", "MapTensor", "MapTensor", "MapTensor", "MapTensor"
-
-
-                                    beta2_power, beta1, beta2, eps, lr, gradient, params, m, v,
-                                    ps_parameter, cache_enable, distributed_opt, use_flag, distributed_sparse_opt,
-                                    use_sparse_flag):
-    """Apply sparse lazy adam optimizer to the weight parameter when the gradient is sparse."""
-    success = True
-    indices, values = gradient.get_data()
-    if use_sparse_flag:
-        # PS Mode.
-        success = F.depend(success, distributed_sparse_opt(params, m, v, beta1_power, beta2_power, lr, beta1, beta2,
-                                                           eps, values, indices))
-    else:
-        # PS Cache mode.
-        op_sqrt = P.Sqrt()
-
-        m_slice = m.get(indices)
-        v_slice = v.get(indices)
-
-        next_m = m_slice * beta1 + values * (1 - beta1)
-        next_v = v_slice * beta2 + values * values * (1 - beta2)
-
-        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)
-
-        if use_nesterov:
-            m_temp = beta1 * next_m + values * (1 - beta1)
-            param_update = m_temp / (op_sqrt(next_v) + eps)
-        else:
-            param_update = next_m / (op_sqrt(next_v) + eps)
-
-        params_need_update = params.get(indices)
-        params.put(indices, params_need_update - lr_t * param_update)
-        m.put(indices, next_m)
-        v.put(indices, next_v)
-
-    return success
-
-
-@_lazy_adam_opt.register("Function", "Function", "Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
-                         "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool", "Bool",
-                         "Function", "Bool", "Function", "Bool")
-def _run_opt_with_one_number_dist(opt, sparse_opt, push, pull, use_locking, use_nesterov, target,
-                                  beta1_power, beta2_power,
-                                  beta1, beta2, eps, lr, gradient, params, moment1, moment2, ps_parameter, cache_enable,
-                                  distributed_opt, use_flag, distributed_sparse_opt, use_sparse_flag):
-    """Apply lazy adam optimizer to the weight parameter using Tensor."""
-    success = True
-    if use_flag:
-        success = F.depend(success, distributed_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1,
-                                                    beta2, eps, gradient))
-    elif ps_parameter and not cache_enable:
-        op_shape = P.Shape()
-        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2, eps, gradient),
-                                              (op_shape(params), op_shape(moment1), op_shape(moment2))), params))
-    else:
-        success = F.depend(success, opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2,
-                                        eps, gradient))
-    return success
-
-
-@_lazy_adam_opt.register("Function", "Function", "Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
-                         "Tensor", "Tensor", "Tensor", "Tensor", "RowTensor", "Tensor", "Tensor", "Tensor", "Bool",
-                         "Bool")
-def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power, beta2_power,
-                         beta1, beta2, eps, lr, gradient, params, m, v, ps_parameter, cache_enable):
-    """Apply sparse lazy adam optimizer to the weight parameter when the gradient is sparse."""
-    success = True
-    indices = gradient.indices
-    values = gradient.values
-    if ps_parameter and not cache_enable:
-        op_shape = P.Shape()
-        shapes = (op_shape(params), op_shape(m), op_shape(v),
-                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
-                  op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
-        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2,
-                                               eps, values, indices), shapes), params))
-        return success
-
-    if not target:
-        success = F.depend(success, sparse_opt(params, m, v, beta1_power, beta2_power, lr, beta1, beta2,
-                                               eps, values, indices))
-    else:
-        op_gather = P.Gather()
-        op_sqrt = P.Sqrt()
-        scatter_add = P.ScatterAdd(use_locking)
-        scatter_update = P.ScatterUpdate(use_locking)
-
-        m_slice = op_gather(m, indices, 0)
-        v_slice = op_gather(v, indices, 0)
-
-        next_m = m_slice * beta1 + values * (1 - beta1)
-        next_v = v_slice * beta2 + values * values * (1 - beta2)
-
-        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)
-
-        if use_nesterov:
-            m_temp = beta1 * next_m + values * (1 - beta1)
-            param_update = m_temp / (op_sqrt(next_v) + eps)
-        else:
-            param_update = next_m / (op_sqrt(next_v) + eps)
-
-        success = F.depend(success, scatter_add(params, indices, - lr_t * param_update))
-        success = F.depend(success, scatter_update(m, indices, next_m))
-        success = F.depend(success, scatter_update(v, indices, next_v))
-
-    return success
-
-
-@_lazy_adam_opt.register("Function", "Function", "Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
-                         "Tensor", "Tensor", "Tensor", "Tensor", "MapTensor", "MapTensor", "MapTensor", "MapTensor",
-                         "Bool", "Bool")
-def _run_map_tensor_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power,
-                                    beta2_power, beta1, beta2, eps, lr, gradient, params, m, v, ps_parameter,
-                                    cache_enable):
+@_lazy_adam_opt.register("Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
+                         "Tensor", "Tensor", "Tensor", "Tensor", "MapTensor", "MapTensor", "MapTensor", "MapTensor")
+def _run_map_tensor_opt_with_sparse(opt, sparse_opt, use_locking, use_nesterov, target, beta1_power,
+                                    beta2_power, beta1, beta2, eps, lr, gradient, params, m, v):
     """Apply sparse lazy adam optimizer to the weight parameter when the gradient is sparse(MapTensor)."""
     success = True
     indices, values = gradient.get_data()
@@ -229,19 +102,14 @@ def _run_map_tensor_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, us
     return success
 
 
-@_lazy_adam_opt.register("Function", "Function", "
-                         "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor"
-def _run_opt_with_one_number(opt, sparse_opt,
-                             beta1, beta2, eps, lr, gradient, params, moment1, moment2
+@_lazy_adam_opt.register("Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
+                         "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor")
+def _run_opt_with_one_number(opt, sparse_opt, use_locking, use_nesterov, target, beta1_power, beta2_power,
+                             beta1, beta2, eps, lr, gradient, params, moment1, moment2):
     """Apply lazy adam optimizer to the weight parameter using Tensor."""
     success = True
-
-
-        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2, eps, gradient),
-                                              (op_shape(params), op_shape(moment1), op_shape(moment2))), params))
-    else:
-        success = F.depend(success, opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2,
-                                        eps, gradient))
+    success = F.depend(success, opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2,
+                                    eps, gradient))
     return success
 
 
@@ -436,15 +304,9 @@ class LazyAdam(Optimizer):
         self.opt = P.Adam(use_locking, use_nesterov)
         self.sparse_opt = P.FusedSparseLazyAdam(use_locking, use_nesterov)
         self.sparse_opt.set_device("CPU")
-        self._ps_pull = P.Pull()
-        self._ps_push = P.Push("Adam", [0, 1, 2])
-        self._ps_push.add_prim_attr("use_nesterov", use_nesterov)
-
-        self._init_distributed_opts(use_locking, use_nesterov)
 
     @jit
     def construct(self, gradients):
-        gradients = self.flatten_gradients(gradients)
         gradients = self.decay_weight(gradients)
         gradients = self.gradients_centralization(gradients)
         gradients = self.scale_grad(gradients)
@@ -457,40 +319,18 @@ class LazyAdam(Optimizer):
         beta2_power = self.beta2_power * self.beta2
         self.beta2_power = beta2_power
 
-        if self.
-
-
-
-
-
-                                             lr, gradients, self._parameters, self.moment1, self.moment2,
-                                             self.ps_parameters, self.cache_enable, self.dense_lazyadam_opts,
-                                             self.use_dense_opt_flags, self.sparse_lazyadam_opts,
-                                             self.use_sparse_opt_flags)
-            else:
-                success = self.map_reverse(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt,
-                                                     self._ps_push, self._ps_pull, self.use_locking, self.use_nesterov,
-                                                     self._is_device, beta1_power, beta2_power,
-                                                     self.beta1, self.beta2, self.eps, lr),
-                                           gradients, self._parameters, self.moment1, self.moment2,
-                                           self.ps_parameters, self.cache_enable, self.dense_lazyadam_opts,
-                                           self.use_dense_opt_flags, self.sparse_lazyadam_opts,
-                                           self.use_sparse_opt_flags)
+        if self.is_group_lr:
+            success = self.map_reverse(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt,
+                                                 self.use_locking, self.use_nesterov,
+                                                 self._is_device, beta1_power, beta2_power,
+                                                 self.beta1, self.beta2, self.eps),
+                                       lr, gradients, self._parameters, self.moment1, self.moment2)
         else:
-
-
-
-
-                                             lr, gradients, self._parameters, self.moment1, self.moment2,
-                                             self.ps_parameters, self.cache_enable)
-            else:
-                success = self.map_reverse(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt,
-                                                     self._ps_push, self._ps_pull, self.use_locking, self.use_nesterov,
-                                                     self._is_device, beta1_power, beta2_power,
-                                                     self.beta1, self.beta2, self.eps, lr),
-                                           gradients, self._parameters, self.moment1, self.moment2,
-                                           self.ps_parameters, self.cache_enable)
+            success = self.map_reverse(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt,
+                                                 self.use_locking, self.use_nesterov,
+                                                 self._is_device, beta1_power, beta2_power,
+                                                 self.beta1, self.beta2, self.eps, lr),
+                                       gradients, self._parameters, self.moment1, self.moment2)
         return success
 
     @Optimizer.target.setter
@@ -500,36 +340,3 @@ class LazyAdam(Optimizer):
         optimizer operation.
         """
         self._set_base_target(value)
-
-    def _init_distributed_opts(self, use_locking, use_nesterov):
-        self.use_dist_optimizer = self._use_distibuted_optimizer()
-        self.dense_lazyadam_opts, self.use_dense_opt_flags =\
-            self._get_distributed_optimizer_list("adam", use_locking, use_nesterov)
-        self.sparse_lazyadam_opts, self.use_sparse_opt_flags =\
-            self._get_distributed_optimizer_list("fused_sparse_lazy_adam", use_locking, use_nesterov)
-
-
-def create_distributed_adam(*args, **kwargs):
-    """
-    Create the distributed Adam op.
-    """
-    adam = P.Adam(*args, **kwargs)
-    adam.add_prim_attr("gradient_type", "dense_gradient")
-    adam.add_prim_attr("parameter_input_index", 0)
-    adam.add_prim_attr("gradient_input_index", 9)
-    return adam
-
-
-def create_distributed_fused_sparse_lazy_adam(*args, **kwargs):
-    """
-    Create the distributed FusedSparseLazyAdam op.
-    """
-    sparse_lazy_adam = P.FusedSparseLazyAdam(*args, **kwargs)
-    sparse_lazy_adam.add_prim_attr("gradient_type", "sparse_gradient")
-    sparse_lazy_adam.add_prim_attr("parameter_input_index", 0)
-    sparse_lazy_adam.add_prim_attr("gradient_input_index", 9)
-    sparse_lazy_adam.add_prim_attr("indices_input_index", 10)
-    return sparse_lazy_adam
-
-_register_dist_optimizer("adam", create_distributed_adam)
-_register_dist_optimizer("fused_sparse_lazy_adam", create_distributed_fused_sparse_lazy_adam)
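For context on the pattern this change simplifies: `_lazy_adam_opt` is a `MultitypeFuncGraph`, so the overload that runs is picked by the runtime types of the trailing arguments (dense `Tensor`, sparse `RowTensor`, or `MapTensor` gradients), and the optimizer applies it across its parameter tuples with `F.partial` binding the shared scalars. The sketch below is a simplified, self-contained illustration of that dispatch pattern, not code from the package; the names are invented for the example:

```python
import mindspore as ms
from mindspore import Tensor
from mindspore.ops import composite as C
from mindspore.ops import functional as F

# A toy MultitypeFuncGraph: the registered overload is chosen by argument types.
_demo_opt = C.MultitypeFuncGraph("demo_opt")

@_demo_opt.register("Tensor", "Tensor", "Tensor")
def _dense_update(lr, grad, param):
    """Dense branch: a plain SGD-style update returned as a new tensor."""
    return param - lr * grad

hyper_map = C.HyperMap()
lr = Tensor(0.1, ms.float32)
grads = (Tensor([1.0, 1.0], ms.float32), Tensor([2.0, 2.0], ms.float32))
params = (Tensor([0.5, 0.5], ms.float32), Tensor([1.5, 1.5], ms.float32))

# F.partial binds the shared learning rate; HyperMap then applies the dispatched
# overload element-wise over the (grads, params) tuples, mirroring the
# map_reverse/hyper_map_reverse calls in the diffs above.
updated = hyper_map(F.partial(_demo_opt, lr), grads, params)
print(updated)
```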
mindspore/nn/optim/momentum.py
CHANGED
@@ -23,7 +23,6 @@ import mindspore.common.dtype as mstype
 from mindspore import _checkparam as Validator
 from mindspore.nn.optim.optimizer import Optimizer
 from mindspore.nn.optim.optimizer import opt_init_args_register
-from mindspore.nn.optim._dist_optimizer_registry import _register_dist_optimizer
 
 
 _momentum_opt = C.MultitypeFuncGraph("momentum_opt")
@@ -35,18 +34,6 @@ def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, moment):
     success = F.depend(True, opt(weight, moment, learning_rate, gradient, momentum))
     return success
 
-
-@_momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Function", "Bool")
-def _tensor_run_opt_ext_dist(opt, momentum, learning_rate, gradient, weight, moment,
-                             distributed_opt, use_flag):
-    """Apply momentum optimizer to the weight parameter using Tensor."""
-    if use_flag:
-        success = F.depend(True, distributed_opt(weight, moment, learning_rate, gradient, momentum))
-    else:
-        success = F.depend(True, opt(weight, moment, learning_rate, gradient, momentum))
-    return success
-
-
 class Momentum(Optimizer):
     r"""
     Implements the Momentum algorithm.
@@ -196,45 +183,20 @@ class Momentum(Optimizer):
         self.moments = self.params.clone(prefix="moments", init='zeros')
         self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)
 
-        self.distributed_opts, self.use_distributed_opt_flags =\
-            self._get_distributed_optimizer_list("momentum", use_nesterov=self.use_nesterov)
-        self.use_dist_optimizer = self._use_distibuted_optimizer()
 
     @jit(backend="ms_backend")
     def construct(self, gradients):
         params = self.params
         moments = self.moments
-        gradients = self.flatten_gradients(gradients)
         gradients = self.decay_weight(gradients)
         gradients = self.gradients_centralization(gradients)
         gradients = self.scale_grad(gradients)
         lr = self.get_lr()
         self.assignadd(self.global_step, self.global_step_increase_tensor)
-        if self.
-
-
-                                                 lr, gradients, params, moments,
-                                                 self.distributed_opts, self.use_distributed_opt_flags)
-            else:
-                success = self.hyper_map_reverse(F.partial(_momentum_opt, self.opt, self.momentum, lr),
-                                                 gradients, params, moments,
-                                                 self.distributed_opts, self.use_distributed_opt_flags)
+        if self.is_group_lr:
+            success = self.hyper_map_reverse(F.partial(_momentum_opt, self.opt, self.momentum),
+                                             lr, gradients, params, moments)
         else:
-
-
-                                                 lr, gradients, params, moments)
-            else:
-                success = self.hyper_map_reverse(F.partial(_momentum_opt, self.opt, self.momentum, lr),
-                                                 gradients, params, moments)
+            success = self.hyper_map_reverse(F.partial(_momentum_opt, self.opt, self.momentum, lr),
+                                             gradients, params, moments)
         return success
-
-
-def _create_distributed_momentum(*args, **kwargs):
-    momentum = P.ApplyMomentum(*args, **kwargs)
-    momentum.add_prim_attr("gradient_type", "dense_gradient")
-    momentum.add_prim_attr("parameter_input_index", 0)
-    momentum.add_prim_attr("gradient_input_index", 3)
-    return momentum
-
-
-_register_dist_optimizer("momentum", _create_distributed_momentum)
mindspore/nn/optim/optimizer.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2020-
+# Copyright 2020-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -32,11 +32,9 @@ import mindspore.common.dtype as mstype
 from mindspore import _checkparam as validator
 from mindspore import log as logger
 from mindspore.parallel._utils import _get_global_rank, _get_device_num, _get_parallel_mode
-from mindspore.parallel._ps_context import _is_ps_mode
 from mindspore.context import ParallelMode
 from mindspore import context
 from mindspore.nn.learning_rate_schedule import LearningRateSchedule
-from mindspore.nn.optim._dist_optimizer_registry import generate_dist_optimizer_list
 
 __all__ = ['Optimizer', 'opt_init_args_register']
 
@@ -161,7 +159,6 @@ class Optimizer(Cell):
     ...     def construct(self, gradients):
     ...         params = self.parameters
     ...         lr = self.get_lr()
-    ...         gradients = self.flatten_gradients(gradients)
     ...         gradients = self.decay_weight(gradients)
     ...         gradients = self.gradients_centralization(gradients)
     ...         gradients = self.scale_grad(gradients)
@@ -218,15 +215,12 @@ class Optimizer(Cell):
 
         self._unique = True
         self._target = context.get_context("device_target")
-        self._use_flattened_params = False
-        self._grad_fusion_size = 0
         self.dynamic_lr = False
         self.assignadd = P.AssignAdd()
         self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step')
         self.is_group = False
         self.is_group_lr = False
         self.is_group_params_ordered = False
-        self.use_parallel = False
        learning_rate = self._preprocess_single_lr(learning_rate)
         if isinstance(parameters[0], dict):
             self.is_group = True
@@ -263,11 +257,7 @@ class Optimizer(Cell):
             self.grad_centralization_flags = tuple(self.group_grad_centralization)
         else:
             self.parameters = ParameterTuple(parameters)
-
-            if self._use_flattened_params:
-                self._parameters = ParameterTuple(flat_params)
-            else:
-                self._parameters = self.parameters
+            self._parameters = self.parameters
             decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name
             self.decay_flags = tuple(decay_filter(x) for x in self._parameters)
             self.dynamic_decay_flags = isinstance(weight_decay, Cell)
@@ -281,10 +271,6 @@ class Optimizer(Cell):
         # set user's parameters as local parameters
         for param in self._parameters:
             self._user_parameters.append(param.name)
-        ps_filter = lambda x: x.is_param_ps
-        self.ps_parameters = tuple(ps_filter(x) for x in self._parameters)
-        cache_filter = lambda x: x.cache_enable
-        self.cache_enable = tuple(cache_filter(x) for x in self._parameters)
         self.reciprocal_scale = Tensor(1.0 / self.loss_scale, mstype.float32)
         self.need_scale = self.loss_scale != 1.0
         self.global_step_increase_tensor = Tensor([1], mstype.int32)
@@ -296,28 +282,6 @@ class Optimizer(Cell):
         self._use_parallel_optimizer()
         self.enable_tuple_broaden = True
 
-    def _get_flattened_params(self, parameters):
-        """Get parameters for each contiguous memory chunks used by input parameters if they are flattened."""
-        if self.is_group:
-            # We don't use flattened parameters when parameters are grouped.
-            return parameters
-        # Check whether parameters are flattened.
-        flattened = Tensor._is_flattened(parameters)  # pylint: disable=W0212
-        if not flattened:
-            # Parameters are not flattened.
-            return parameters
-        # Try to get chunk tensors from flattened parameters.
-        chunk_tensors = Tensor._get_flattened_tensors(parameters)  # pylint: disable=W0212
-        if not chunk_tensors:
-            # Failed to get chunk tensors.
-            logger.warning("Parameters are not properly flattened, fallback to not flattened parameters.")
-            return parameters
-        # Convert chunk tensors to parameters.
-        self._use_flattened_params = True
-        self._grad_fusion_size = Tensor._get_fusion_size(chunk_tensors)  # pylint: disable=W0212
-        return [Parameter._from_tensor(t, name='_chunk_param' + str(i) + '_' + str(t.dtype))  # pylint: disable=W0212
-                for i, t in enumerate(chunk_tensors)]
-
     def _use_parallel_optimizer(self):
         """Indicates whether to use automatic parallelism."""
         if context.get_auto_parallel_context("enable_parallel_optimizer"):
@@ -331,7 +295,10 @@ class Optimizer(Cell):
                 raise RuntimeError("For 'Optimizer', parallel optimizer is not supported in {}, you should set "
                                    "parallel mode to 'data_parallel', 'semi_auto_parallel' or 'auto_parallel'."
                                    .format(_get_parallel_mode()))
-
+            else:
+                self.use_parallel = False
+        else:
+            self.use_parallel = False
         if self.use_parallel:
             if not self._support_parallel_optimizer:
                 raise RuntimeError("For 'Optimizer', parallel optimizer only support optimizer 'Lamb' and "
@@ -403,13 +370,6 @@ class Optimizer(Cell):
             raise ValueError(f"For 'Optimizer', the argument {param_info} must not be empty.")
         return parameters
 
-    @staticmethod
-    def _use_distibuted_optimizer():
-        """
-        Whether use distributed optimizers.
-        """
-        return _is_ps_mode()
-
     def flatten_gradients(self, gradients):
         """
         Flatten gradients into several chunk tensors grouped by data type if network parameters are flattened.
@@ -424,9 +384,6 @@ class Optimizer(Cell):
         Returns:
             tuple[Tensor], The gradients after flattened, or the original gradients if parameters are not flattened.
         """
-        if self._use_flattened_params:
-            flatten_concat = inner.FlattenConcat(fusion_size=self._grad_fusion_size)
-            return flatten_concat(gradients)
         return gradients
 
     def decay_weight(self, gradients):
@@ -869,12 +826,6 @@ class Optimizer(Cell):
                     F.assign(param_group[root][i], next_params[i])
         return new_param_group
 
-    def _get_distributed_optimizer_list(self, optimizer_type, *args, **kwargs):
-        """
-        Get the distributed optimizers list in distributed training mode.
-        """
-        return generate_dist_optimizer_list(optimizer_type, self._parameters, *args, **kwargs)
-
     def construct(self, *hyper_params):
         raise NotImplementedError
 
mindspore/nn/optim/proximal_ada_grad.py
CHANGED
@@ -205,7 +205,6 @@ class ProximalAdagrad(Optimizer):
     def construct(self, grads):
         params = self._parameters
         accum = self.accum
-        grads = self.flatten_gradients(grads)
         grads = self.decay_weight(grads)
         grads = self.gradients_centralization(grads)
         grads = self.scale_grad(grads)
mindspore/nn/optim/rmsprop.py
CHANGED
@@ -238,7 +238,6 @@ class RMSProp(Optimizer):
     @jit
     def construct(self, gradients):
         params = self._parameters
-        gradients = self.flatten_gradients(gradients)
         gradients = self.decay_weight(gradients)
         gradients = self.gradients_centralization(gradients)
         gradients = self.scale_grad(gradients)
mindspore/nn/optim/rprop.py
CHANGED
@@ -203,7 +203,6 @@ class Rprop(Optimizer):
 
     @jit(backend="ms_backend")
     def construct(self, gradients):
-        gradients = self.flatten_gradients(gradients)
         gradients = self.decay_weight(gradients)
         gradients = self.gradients_centralization(gradients)
         gradients = self.scale_grad(gradients)
mindspore/nn/optim/sgd.py
CHANGED
@@ -226,7 +226,6 @@ class SGD(Optimizer):
         accum = self.accum
         stat = self.stat
         gradients = self.decay_weight(gradients)
-        gradients = self.flatten_gradients(gradients)
         gradients = self.gradients_centralization(gradients)
         gradients = self.scale_grad(gradients)
         lr = self.get_lr()
mindspore/nn/optim/tft_wrapper.py
CHANGED
@@ -69,10 +69,9 @@ class OptTFTWrapper(Optimizer):
         tft_env = os.getenv("MS_ENABLE_TFT", "")
         if ("TTP:1" not in tft_env) and ("UCE:1" not in tft_env) and ("ARF:1" not in tft_env):
             raise ValueError("MindIO TFT regitster need custom switch on[MS_ENABLE_TFT='{TTP:1,UCE:1,ARF:1}']!")
-        mode = context.get_context("mode")
         device_target = context.get_context("device_target")
-        if device_target != "Ascend"
-            raise ValueError("MindIO adataper only support on Ascend device
+        if device_target != "Ascend":
+            raise ValueError("MindIO adataper only support on Ascend device!")
         self.opt = opt
         self.report = TensorReport()
         self.report_end = TensorReport()
@@ -109,7 +108,6 @@ class OptTFTWrapper(Optimizer):
         self.dynamic_decay_flags = opt.dynamic_decay_flags
         self.weight_decay = opt.weight_decay
         self.exec_weight_decay = opt.exec_weight_decay
-        self.ps_parameters = opt.ps_parameters
         self.cache_enable = opt.cache_enable
         self.reciprocal_scale = opt.reciprocal_scale
         self.need_scale = opt.need_scale
mindspore/nn/optim/thor.py
CHANGED
@@ -585,7 +585,6 @@ class ThorGpu(Optimizer):
     def construct(self, gradients):
         params = self.params
         moments = self.moments
-        gradients = self.flatten_gradients(gradients)
         gradients = self.scale_grad(gradients)
         damping_step = self.gather(self.damping, self.cov_step, self.axis)
         damping_step = self.cast(damping_step, mstype.float32)
@@ -1247,7 +1246,6 @@ class ThorAscend(Optimizer):
     def construct(self, gradients):
         params = self.params
         moments = self.moments
-        gradients = self.flatten_gradients(gradients)
         gradients = self.scale_grad(gradients)
         damping_step = self.gather(self.damping, self.cov_step, self.axis)
         damping_step = self.cast(damping_step, mstype.float32)
mindspore/nn/probability/bijector/bijector.py
CHANGED
@@ -15,8 +15,7 @@
 """Bijector"""
 from mindspore import context
 from mindspore.nn.cell import Cell
-
-from mindspore.ops import functional as F
+import mindspore.ops as ops
 from mindspore.ops.operations import _inner_ops as inner
 from mindspore.common import dtype as mstype
 from mindspore.common.tensor import Tensor
@@ -99,9 +98,9 @@ class Bijector(Cell):
         self.checktensor = CheckTensor()
 
         # ops needed for the base class
-        self.cast_base =
-        self.dtype_base =
-        self.shape_base =
+        self.cast_base = ops.Cast()
+        self.dtype_base = ops.DType()
+        self.shape_base = ops.Shape()
         self.sametypeshape_base = inner.SameTypeShape()
         self.issubclass_base = inner.IsSubClass()
 
@@ -145,13 +144,13 @@ class Bijector(Cell):
             if self.issubclass_base(value_type, mstype.float_):
                 return value
             return raise_type_error('input value of bijector', value_type, mstype.float_)
-        dtype_tensor =
+        dtype_tensor = ops.fill(self.dtype, self.shape_base(value), 0.0)
         self.sametypeshape_base(value, dtype_tensor)
         return value
 
     def _shape_mapping(self, shape):
-        shape_tensor =
-        dist_shape_tensor =
+        shape_tensor = ops.fill(self.parameter_type, shape, 0.0)
+        dist_shape_tensor = ops.fill(
             self.parameter_type, self.batch_shape, 0.0)
         return (shape_tensor + dist_shape_tensor).shape
 
mindspore/nn/probability/bijector/gumbel_cdf.py
CHANGED
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ============================================================================
 """GumbelCDF Bijector"""
-
+import mindspore.ops as ops
 from ..distribution._utils.utils import check_greater_zero
 from ..distribution._utils.custom_ops import exp_generic, log_generic
 from .bijector import Bijector
@@ -86,7 +86,7 @@ class GumbelCDF(Bijector):
         self._scale = self._add_parameter(scale, 'scale')
         check_greater_zero(self._scale, "scale")
 
-        self.cast =
+        self.cast = ops.Cast()
         self.exp = exp_generic
         self.log = log_generic
 
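The two bijector files above now build their operators directly from `mindspore.ops` (`ops.Cast()`, `ops.DType()`, `ops.Shape()`, `ops.fill(...)`). As a quick standalone illustration of how those primitives behave outside the bijector code (not taken from the package):

```python
import mindspore.ops as ops
import mindspore.common.dtype as mstype
from mindspore import Tensor

cast = ops.Cast()
dtype_of = ops.DType()
shape_of = ops.Shape()

x = Tensor([1, 2, 3], mstype.int32)
y = cast(x, mstype.float32)        # dtype conversion, as self.cast_base does
print(dtype_of(y), shape_of(y))    # Float32 (3,)

# ops.fill(type, shape, value): a tensor of the given shape filled with `value`,
# the same call the bijector uses to build a dtype/shape probe tensor.
probe = ops.fill(mstype.float32, shape_of(y), 0.0)
print(probe)
```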