mindspore 2.7.0rc1__cp311-cp311-win_amd64.whl → 2.7.1__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +5 -2
- mindspore/_c_dataengine.cp311-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp311-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp311-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +2 -2
- mindspore/_extends/builtin_operations.py +3 -3
- mindspore/_extends/parallel_compile/akg_compiler/custom.py +1109 -0
- mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
- mindspore/_extends/parse/__init__.py +3 -3
- mindspore/_extends/parse/compile_config.py +24 -1
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +6 -3
- mindspore/_extends/parse/parser.py +28 -22
- mindspore/_extends/parse/resources.py +1 -1
- mindspore/_extends/parse/standard_method.py +23 -2
- mindspore/_extends/parse/trope.py +2 -1
- mindspore/_extends/pijit/pijit_func_white_list.py +9 -27
- mindspore/amp.py +0 -18
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/base.py +29 -2
- mindspore/common/__init__.py +18 -12
- mindspore/common/_decorator.py +3 -2
- mindspore/common/_grad_function.py +3 -1
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +371 -96
- mindspore/common/_utils.py +7 -43
- mindspore/common/api.py +434 -135
- mindspore/common/dtype.py +98 -57
- mindspore/common/dump.py +7 -108
- mindspore/common/dynamic_shape/__init__.py +0 -0
- mindspore/common/{auto_dynamic_shape.py → dynamic_shape/auto_dynamic_shape.py} +15 -23
- mindspore/common/dynamic_shape/enable_dynamic.py +197 -0
- mindspore/common/file_system.py +59 -9
- mindspore/common/hook_handle.py +82 -3
- mindspore/common/jit_config.py +5 -1
- mindspore/common/jit_trace.py +27 -12
- mindspore/common/lazy_inline.py +5 -3
- mindspore/common/np_dtype.py +3 -3
- mindspore/common/parameter.py +17 -127
- mindspore/common/recompute.py +4 -13
- mindspore/common/tensor.py +50 -217
- mindspore/communication/_comm_helper.py +11 -1
- mindspore/communication/comm_func.py +138 -4
- mindspore/communication/management.py +85 -1
- mindspore/config/op_info.config +0 -15
- mindspore/context.py +20 -106
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/transforms.py +1 -1
- mindspore/dataset/core/config.py +35 -1
- mindspore/dataset/engine/datasets.py +338 -319
- mindspore/dataset/engine/datasets_user_defined.py +38 -22
- mindspore/dataset/engine/datasets_vision.py +1 -1
- mindspore/dataset/engine/validators.py +1 -15
- mindspore/dataset/transforms/c_transforms.py +2 -2
- mindspore/dataset/transforms/transforms.py +3 -3
- mindspore/dataset/vision/__init__.py +1 -1
- mindspore/dataset/vision/py_transforms.py +8 -8
- mindspore/dataset/vision/transforms.py +17 -5
- mindspore/dataset/vision/utils.py +632 -21
- mindspore/device_context/ascend/op_tuning.py +35 -1
- mindspore/dnnl.dll +0 -0
- mindspore/{profiler/common/validator → graph}/__init__.py +9 -1
- mindspore/graph/custom_pass.py +55 -0
- mindspore/include/api/cell.h +28 -4
- mindspore/include/api/cfg.h +24 -7
- mindspore/include/api/context.h +1 -0
- mindspore/include/api/delegate.h +0 -2
- mindspore/include/api/dual_abi_helper.h +100 -19
- mindspore/include/api/graph.h +14 -1
- mindspore/include/api/kernel.h +16 -3
- mindspore/include/api/kernel_api.h +9 -1
- mindspore/include/api/metrics/accuracy.h +9 -0
- mindspore/include/api/model.h +5 -1
- mindspore/include/api/model_group.h +4 -0
- mindspore/include/api/model_parallel_runner.h +2 -0
- mindspore/include/api/status.h +48 -10
- mindspore/include/api/types.h +6 -1
- mindspore/include/dataset/constants.h +9 -0
- mindspore/include/dataset/execute.h +2 -2
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/__init__.py +3 -3
- mindspore/mindrecord/common/exceptions.py +1 -0
- mindspore/mindrecord/config.py +1 -1
- mindspore/{parallel/mpi → mindrecord/core}/__init__.py +4 -1
- mindspore/mindrecord/{shardheader.py → core/shardheader.py} +2 -1
- mindspore/mindrecord/{shardindexgenerator.py → core/shardindexgenerator.py} +1 -1
- mindspore/mindrecord/{shardreader.py → core/shardreader.py} +2 -1
- mindspore/mindrecord/{shardsegment.py → core/shardsegment.py} +2 -2
- mindspore/mindrecord/{shardutils.py → core/shardutils.py} +1 -1
- mindspore/mindrecord/{shardwriter.py → core/shardwriter.py} +1 -1
- mindspore/mindrecord/filereader.py +4 -4
- mindspore/mindrecord/filewriter.py +5 -5
- mindspore/mindrecord/mindpage.py +2 -2
- mindspore/mindrecord/tools/cifar10.py +4 -3
- mindspore/mindrecord/tools/cifar100.py +1 -1
- mindspore/mindrecord/tools/cifar100_to_mr.py +1 -1
- mindspore/mindrecord/tools/cifar10_to_mr.py +6 -6
- mindspore/mindrecord/tools/csv_to_mr.py +1 -1
- mindspore/mindrecord/tools/imagenet_to_mr.py +1 -1
- mindspore/mindrecord/tools/mnist_to_mr.py +1 -1
- mindspore/mindrecord/tools/tfrecord_to_mr.py +1 -1
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_cluster.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_cpu.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_hardware_abstract.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/{mindspore_ops_host.dll → mindspore_ops_cpu.dll} +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mindspore_runtime_utils.dll +0 -0
- mindspore/mindspore_tools.dll +0 -0
- mindspore/mint/__init__.py +15 -10
- mindspore/mint/distributed/__init__.py +4 -0
- mindspore/mint/distributed/distributed.py +392 -69
- mindspore/mint/nn/__init__.py +2 -16
- mindspore/mint/nn/functional.py +4 -110
- mindspore/mint/nn/layer/__init__.py +0 -2
- mindspore/mint/nn/layer/_functions.py +1 -2
- mindspore/mint/nn/layer/activation.py +0 -6
- mindspore/mint/nn/layer/basic.py +0 -47
- mindspore/mint/nn/layer/conv.py +10 -10
- mindspore/mint/nn/layer/normalization.py +11 -16
- mindspore/mint/nn/layer/pooling.py +0 -4
- mindspore/nn/__init__.py +1 -3
- mindspore/nn/cell.py +231 -239
- mindspore/nn/layer/activation.py +4 -2
- mindspore/nn/layer/basic.py +56 -14
- mindspore/nn/layer/container.py +16 -0
- mindspore/nn/layer/embedding.py +4 -169
- mindspore/nn/layer/image.py +1 -1
- mindspore/nn/layer/normalization.py +2 -1
- mindspore/nn/layer/thor_layer.py +4 -85
- mindspore/nn/optim/ada_grad.py +0 -1
- mindspore/nn/optim/adafactor.py +0 -1
- mindspore/nn/optim/adam.py +32 -127
- mindspore/nn/optim/adamax.py +0 -1
- mindspore/nn/optim/asgd.py +0 -1
- mindspore/nn/optim/ftrl.py +8 -102
- mindspore/nn/optim/lamb.py +1 -4
- mindspore/nn/optim/lars.py +0 -3
- mindspore/nn/optim/lazyadam.py +25 -218
- mindspore/nn/optim/momentum.py +5 -43
- mindspore/nn/optim/optimizer.py +6 -55
- mindspore/nn/optim/proximal_ada_grad.py +0 -1
- mindspore/nn/optim/rmsprop.py +0 -1
- mindspore/nn/optim/rprop.py +0 -1
- mindspore/nn/optim/sgd.py +0 -1
- mindspore/nn/optim/tft_wrapper.py +2 -4
- mindspore/nn/optim/thor.py +0 -2
- mindspore/nn/probability/bijector/bijector.py +7 -8
- mindspore/nn/probability/bijector/gumbel_cdf.py +2 -2
- mindspore/nn/probability/bijector/power_transform.py +20 -21
- mindspore/nn/probability/bijector/scalar_affine.py +5 -5
- mindspore/nn/probability/bijector/softplus.py +13 -14
- mindspore/nn/probability/distribution/_utils/utils.py +2 -2
- mindspore/nn/wrap/cell_wrapper.py +39 -5
- mindspore/nn/wrap/grad_reducer.py +4 -89
- mindspore/numpy/array_creations.py +4 -4
- mindspore/numpy/fft.py +9 -9
- mindspore/numpy/utils_const.py +1 -1
- mindspore/{nn/reinforcement → onnx}/__init__.py +5 -8
- mindspore/onnx/onnx_export.py +137 -0
- mindspore/opencv_core4110.dll +0 -0
- mindspore/opencv_imgcodecs4110.dll +0 -0
- mindspore/{opencv_imgproc452.dll → opencv_imgproc4110.dll} +0 -0
- mindspore/ops/__init__.py +2 -0
- mindspore/ops/_grad_experimental/grad_comm_ops.py +38 -2
- mindspore/ops/_grad_experimental/grad_inner_ops.py +0 -9
- mindspore/ops/_op_impl/aicpu/__init__.py +0 -10
- mindspore/ops/_op_impl/cpu/__init__.py +1 -5
- mindspore/ops/_op_impl/cpu/{buffer_append.py → joinedstr_op.py} +8 -8
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +28 -24
- mindspore/ops/auto_generate/gen_extend_func.py +6 -11
- mindspore/ops/auto_generate/gen_ops_def.py +385 -154
- mindspore/ops/auto_generate/gen_ops_prim.py +5676 -5167
- mindspore/ops/communication.py +97 -0
- mindspore/ops/composite/__init__.py +5 -2
- mindspore/ops/composite/base.py +16 -2
- mindspore/ops/composite/multitype_ops/__init__.py +3 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +150 -8
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
- mindspore/ops/composite/multitype_ops/add_impl.py +7 -0
- mindspore/ops/composite/multitype_ops/mod_impl.py +27 -0
- mindspore/ops/function/__init__.py +2 -0
- mindspore/ops/function/array_func.py +24 -18
- mindspore/ops/function/comm_func.py +3883 -0
- mindspore/ops/function/debug_func.py +7 -6
- mindspore/ops/function/grad/grad_func.py +4 -12
- mindspore/ops/function/math_func.py +89 -86
- mindspore/ops/function/nn_func.py +92 -313
- mindspore/ops/function/random_func.py +9 -18
- mindspore/ops/functional.py +4 -1
- mindspore/ops/functional_overload.py +377 -30
- mindspore/ops/operations/__init__.py +2 -5
- mindspore/ops/operations/_custom_ops_utils.py +7 -9
- mindspore/ops/operations/_inner_ops.py +12 -50
- mindspore/ops/operations/_rl_inner_ops.py +0 -933
- mindspore/ops/operations/array_ops.py +5 -50
- mindspore/ops/operations/comm_ops.py +95 -17
- mindspore/ops/operations/custom_ops.py +237 -22
- mindspore/ops/operations/debug_ops.py +33 -35
- mindspore/ops/operations/manually_defined/ops_def.py +39 -318
- mindspore/ops/operations/math_ops.py +5 -5
- mindspore/ops/operations/nn_ops.py +3 -3
- mindspore/ops/operations/sparse_ops.py +0 -83
- mindspore/ops/primitive.py +4 -27
- mindspore/ops/tensor_method.py +88 -10
- mindspore/ops_generate/aclnn/aclnn_kernel_register_auto_cc_generator.py +5 -5
- mindspore/ops_generate/aclnn/gen_aclnn_implement.py +8 -8
- mindspore/ops_generate/api/functions_cc_generator.py +53 -4
- mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +25 -11
- mindspore/ops_generate/common/gen_constants.py +11 -10
- mindspore/ops_generate/common/op_proto.py +18 -1
- mindspore/ops_generate/common/template.py +102 -245
- mindspore/ops_generate/common/template_utils.py +212 -0
- mindspore/ops_generate/gen_custom_ops.py +69 -0
- mindspore/ops_generate/op_def/ops_def_cc_generator.py +78 -7
- mindspore/ops_generate/op_def_py/base_op_prim_py_generator.py +360 -0
- mindspore/ops_generate/op_def_py/custom_op_prim_py_generator.py +140 -0
- mindspore/ops_generate/op_def_py/op_def_py_generator.py +54 -7
- mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -312
- mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +74 -17
- mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +22 -5
- mindspore/ops_generate/pyboost/gen_pyboost_func.py +0 -16
- mindspore/ops_generate/pyboost/op_template_parser.py +3 -2
- mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +21 -5
- mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +2 -2
- mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +30 -10
- mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +10 -3
- mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +19 -9
- mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +71 -28
- mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +10 -9
- mindspore/ops_generate/pyboost/pyboost_utils.py +27 -16
- mindspore/ops_generate/resources/yaml_loader.py +13 -0
- mindspore/ops_generate/tensor_py_cc_generator.py +2 -2
- mindspore/parallel/_auto_parallel_context.py +5 -15
- mindspore/parallel/_cell_wrapper.py +1 -1
- mindspore/parallel/_parallel_serialization.py +4 -6
- mindspore/parallel/_ps_context.py +2 -2
- mindspore/parallel/_utils.py +34 -17
- mindspore/parallel/auto_parallel.py +23 -9
- mindspore/parallel/checkpoint_transform.py +20 -2
- mindspore/parallel/cluster/process_entity/_api.py +28 -33
- mindspore/parallel/cluster/process_entity/_utils.py +9 -5
- mindspore/parallel/cluster/run.py +5 -3
- mindspore/{experimental/llm_boost/ascend_native → parallel/distributed}/__init__.py +21 -22
- mindspore/parallel/distributed/distributed_data_parallel.py +393 -0
- mindspore/parallel/distributed/flatten_grad_buffer.py +295 -0
- mindspore/parallel/function/reshard_func.py +6 -5
- mindspore/parallel/nn/parallel_cell_wrapper.py +40 -3
- mindspore/parallel/nn/parallel_grad_reducer.py +0 -8
- mindspore/parallel/shard.py +7 -21
- mindspore/parallel/strategy.py +336 -0
- mindspore/parallel/transform_safetensors.py +127 -20
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +13 -9
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +1 -1
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +1 -1
- mindspore/profiler/common/constant.py +5 -0
- mindspore/profiler/common/file_manager.py +9 -0
- mindspore/profiler/common/msprof_cmd_tool.py +40 -4
- mindspore/profiler/common/path_manager.py +65 -24
- mindspore/profiler/common/profiler_context.py +27 -14
- mindspore/profiler/common/profiler_info.py +3 -3
- mindspore/profiler/common/profiler_meta_data.py +1 -0
- mindspore/profiler/common/profiler_op_analyse.py +10 -6
- mindspore/profiler/common/profiler_path_manager.py +13 -0
- mindspore/profiler/common/util.py +30 -3
- mindspore/profiler/dynamic_profiler.py +91 -46
- mindspore/profiler/envprofiler.py +30 -5
- mindspore/profiler/experimental_config.py +18 -2
- mindspore/profiler/platform/cpu_profiler.py +10 -4
- mindspore/profiler/platform/npu_profiler.py +34 -7
- mindspore/profiler/profiler.py +193 -145
- mindspore/profiler/profiler_action_controller.py +1 -1
- mindspore/profiler/profiler_interface.py +2 -2
- mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
- mindspore/run_check/_check_version.py +108 -24
- mindspore/runtime/__init__.py +9 -6
- mindspore/runtime/executor.py +35 -0
- mindspore/runtime/memory.py +113 -0
- mindspore/runtime/thread_bind_core.py +1 -1
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/{experimental/llm_boost → tools}/__init__.py +5 -5
- mindspore/tools/data_dump.py +130 -0
- mindspore/tools/sdc_detect.py +91 -0
- mindspore/tools/stress_detect.py +63 -0
- mindspore/train/__init__.py +6 -6
- mindspore/train/_utils.py +8 -21
- mindspore/train/amp.py +6 -7
- mindspore/train/callback/_callback.py +2 -1
- mindspore/train/callback/_checkpoint.py +1 -17
- mindspore/train/callback/_flops_collector.py +10 -6
- mindspore/train/callback/_train_fault_tolerance.py +72 -25
- mindspore/train/data_sink.py +5 -9
- mindspore/train/dataset_helper.py +5 -5
- mindspore/train/model.py +41 -230
- mindspore/train/serialization.py +160 -401
- mindspore/train/train_thor/model_thor.py +2 -2
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +6 -3
- mindspore/utils/dlpack.py +92 -0
- mindspore/utils/dryrun.py +1 -1
- mindspore/utils/runtime_execution_order_check.py +10 -0
- mindspore/utils/sdc_detect.py +14 -12
- mindspore/utils/stress_detect.py +43 -0
- mindspore/utils/utils.py +152 -16
- mindspore/version.py +1 -1
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/METADATA +3 -2
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/RECORD +330 -344
- mindspore/_extends/remote/kernel_build_server_ascend.py +0 -75
- mindspore/communication/_hccl_management.py +0 -297
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +0 -207
- mindspore/experimental/llm_boost/ascend_native/llm_boost.py +0 -52
- mindspore/experimental/llm_boost/atb/__init__.py +0 -23
- mindspore/experimental/llm_boost/atb/boost_base.py +0 -385
- mindspore/experimental/llm_boost/atb/llama_boost.py +0 -137
- mindspore/experimental/llm_boost/atb/qwen_boost.py +0 -124
- mindspore/experimental/llm_boost/register.py +0 -130
- mindspore/experimental/llm_boost/utils.py +0 -31
- mindspore/include/OWNERS +0 -7
- mindspore/mindspore_cpu_res_manager.dll +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/nn/optim/_dist_optimizer_registry.py +0 -111
- mindspore/nn/reinforcement/_batch_read_write.py +0 -142
- mindspore/nn/reinforcement/_tensors_queue.py +0 -152
- mindspore/nn/reinforcement/tensor_array.py +0 -145
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/ops/_op_impl/aicpu/priority_replay_buffer.py +0 -113
- mindspore/ops/_op_impl/aicpu/reservoir_replay_buffer.py +0 -96
- mindspore/ops/_op_impl/aicpu/sparse_cross.py +0 -42
- mindspore/ops/_op_impl/cpu/buffer_get.py +0 -28
- mindspore/ops/_op_impl/cpu/buffer_sample.py +0 -28
- mindspore/ops/_op_impl/cpu/priority_replay_buffer.py +0 -42
- mindspore/ops/operations/_tensor_array.py +0 -359
- mindspore/ops/operations/rl_ops.py +0 -288
- mindspore/parallel/_offload_context.py +0 -275
- mindspore/parallel/_recovery_context.py +0 -115
- mindspore/parallel/_transformer/__init__.py +0 -35
- mindspore/parallel/_transformer/layers.py +0 -765
- mindspore/parallel/_transformer/loss.py +0 -251
- mindspore/parallel/_transformer/moe.py +0 -693
- mindspore/parallel/_transformer/op_parallel_config.py +0 -222
- mindspore/parallel/_transformer/transformer.py +0 -3124
- mindspore/parallel/mpi/_mpi_config.py +0 -116
- mindspore/profiler/common/validator/validate_path.py +0 -84
- mindspore/train/memory_profiling_pb2.py +0 -298
- mindspore/utils/hooks.py +0 -81
- /mindspore/common/{_auto_dynamic.py → dynamic_shape/_auto_dynamic.py} +0 -0
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/WHEEL +0 -0
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/top_level.txt +0 -0
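The per-file diffs below are limited to the `mindspore/nn/optim` modules. If you are validating the upgrade locally, a quick check along these lines confirms which wheel is actually being imported (a minimal sketch, assuming a standard `pip install mindspore==2.7.1` into a matching CPython 3.11 environment):

```python
# Sanity check after swapping the 2.7.0rc1 wheel for 2.7.1.
import mindspore as ms

print(ms.__version__)  # expected to print "2.7.1" once the new wheel is active
ms.run_check()         # MindSpore's built-in installation verification
```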
mindspore/nn/optim/adam.py
CHANGED
```diff
@@ -1,4 +1,4 @@
-# Copyright 2020-
+# Copyright 2020-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,7 +28,6 @@ from mindspore.common.tensor import Tensor
 from mindspore import _checkparam as validator
 from mindspore.nn.optim.optimizer import Optimizer
 from mindspore.nn.optim.optimizer import opt_init_args_register
-from mindspore.nn.optim._dist_optimizer_registry import _register_dist_optimizer
 from mindspore.common._decorator import deprecated
 
 _adam_opt = C.MultitypeFuncGraph("adam_opt")
@@ -727,7 +726,6 @@ class Adam(Optimizer):
             self.opt = P.Adam(use_locking, use_nesterov)
             self.sparse_opt = P.FusedSparseLazyAdam(use_locking, use_nesterov)
             self.sparse_opt.set_device("CPU")
-            self._init_distributed_opts(use_locking, use_nesterov)
 
         else:
             self._is_device = True
@@ -737,7 +735,6 @@ class Adam(Optimizer):
             self.opt = P.Adam(use_locking, use_nesterov)
             self.sparse_opt = P.FusedSparseAdam(use_locking, use_nesterov)
             self.sparse_opt.set_device("CPU")
-            self._init_distributed_opts(use_locking, use_nesterov)
 
     def _apply_adam(self, params, beta1_power, beta2_power, moment1, moment2, lr, gradients):
         """Execute Adam optimizer and its variants."""
@@ -750,83 +747,44 @@ class Adam(Optimizer):
                                           self.beta2, self.eps, lr), gradients, params, moment1, moment2)
         # Lazy adam or normal adam
         else:
-            if self.
-                if self.
-
-
-
-
-                    success = self.map_reverse(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt,
-                                               self.use_locking, self.use_nesterov,
-                                               self._is_device, beta1_power, beta2_power,
-                                               self.beta1, self.beta2, self.eps),
-                                               lr, gradients, self._parameters, self.moment1, self.moment2,
-                                               self.dense_lazyadam_opts,
-                                               self.use_dense_opt_flags, self.sparse_lazyadam_opts,
-                                               self.use_sparse_opt_flags)
-                # Normal Adam
-                else:
-                    success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt, self.use_locking,
-                                        self.use_nesterov, self._is_device, beta1_power, beta2_power,
-                                        self.beta1, self.beta2, self.eps),
-                                        lr, gradients, params, moment1, moment2,
-                                        self.dense_adam_opts, self.use_dense_opt_flags,
-                                        self.sparse_adam_opts, self.use_sparse_opt_flags)
+            if self.is_group_lr:
+                if self.use_lazy:
+                    success = self.map_(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt,
+                                                  self.use_locking, self.use_nesterov,
+                                                  self._is_device, beta1_power, beta2_power, self.beta1, self.beta2,
+                                                  self.eps), lr, gradients, params, moment1, moment2)
                 else:
-                    if self.
-                        success = self.
-
-
-
-
-                                           self.dense_lazyadam_opts, self.use_dense_opt_flags,
-                                           self.sparse_lazyadam_opts, self.use_sparse_opt_flags)
+                    if self.use_amsgrad:
+                        success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt,
+                                                      self.use_locking, self.use_nesterov,
+                                                      self._is_device, beta1_power, beta2_power,
+                                                      self.beta1, self.beta2, self.eps), lr, gradients, params,
+                                                      moment1, moment2, self.vhat)
                     else:
                         success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt,
                                                       self.use_locking, self.use_nesterov,
-                                                      self._is_device, beta1_power, beta2_power,
-                                                      self.eps, lr
-
-                                                      self.use_dense_opt_flags, self.sparse_adam_opts, self.use_sparse_opt_flags)
+                                                      self._is_device, beta1_power, beta2_power,
+                                                      self.beta1, self.beta2, self.eps), lr, gradients, params,
+                                                      moment1, moment2)
             else:
-                if self.
-
-
-
-
-                            self.eps), lr, gradients, params, moment1, moment2)
-                else:
-                    if self.use_amsgrad:
-                        success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt,
-                                            self.use_locking, self.use_nesterov,
-                                            self._is_device, beta1_power, beta2_power,
-                                            self.beta1, self.beta2, self.eps), lr, gradients, params,
-                                            moment1, moment2, self.vhat)
-                    else:
-                        success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt,
-                                            self.use_locking, self.use_nesterov,
-                                            self._is_device, beta1_power, beta2_power,
-                                            self.beta1, self.beta2, self.eps), lr, gradients, params,
-                                            moment1, moment2)
+                if self.use_lazy:
+                    success = self.map_(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt,
+                                                  self.use_locking, self.use_nesterov,
+                                                  self._is_device, beta1_power, beta2_power, self.beta1, self.beta2,
+                                                  self.eps, lr), gradients, params, moment1, moment2)
                 else:
-                    if self.
-                        success = self.map_(F.partial(
+                    if self.use_amsgrad:
+                        success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt,
                                                       self.use_locking, self.use_nesterov,
-                                                      self._is_device, beta1_power, beta2_power,
-                                                      self.eps, lr), gradients, params,
+                                                      self._is_device, beta1_power, beta2_power,
+                                                      self.beta1, self.beta2, self.eps, lr), gradients, params,
+                                                      moment1, moment2, self.vhat)
                     else:
-
-
-
-
-
-                        moment1, moment2, self.vhat)
-                    else:
-                        success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt,
-                                            self.use_locking, self.use_nesterov,
-                                            self._is_device, beta1_power, beta2_power,
-                                            self.beta1, self.beta2, self.eps, lr), gradients, params,
-                                            moment1, moment2)
+                        success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt,
+                                                      self.use_locking, self.use_nesterov,
+                                                      self._is_device, beta1_power, beta2_power,
+                                                      self.beta1, self.beta2, self.eps, lr), gradients, params,
+                                                      moment1, moment2)
 
         return success
 
@@ -835,7 +793,6 @@ class Adam(Optimizer):
         params = self._parameters
         moment1 = self.moment1
         moment2 = self.moment2
-        gradients = self.flatten_gradients(gradients)
         gradients = self.decay_weight(gradients)
         if not self.use_offload:
             gradients = self.gradients_centralization(gradients)
@@ -859,13 +816,6 @@ class Adam(Optimizer):
         """
         self._set_base_target(value)
 
-    def _init_distributed_opts(self, use_locking, use_nesterov):
-        self.use_dist_optimizer = self._use_distibuted_optimizer()
-        self.dense_adam_opts, self.use_dense_opt_flags = \
-            self._get_distributed_optimizer_list("adam", use_locking, use_nesterov)
-        self.sparse_adam_opts, self.use_sparse_opt_flags = \
-            self._get_distributed_optimizer_list("fused_sparse_adam", use_locking, use_nesterov)
-
 
 class AdamWeightDecay(Optimizer):
     r"""
@@ -909,9 +859,7 @@ class AdamWeightDecay(Optimizer):
     Note:
         There is usually no connection between a optimizer and mixed precision. But when `FixedLossScaleManager` is used
         and `drop_overflow_update` in `FixedLossScaleManager` is set to False, optimizer needs to set the 'loss_scale'.
-        As this optimizer has no argument of `loss_scale`, so `loss_scale` needs to be processed by other means
-        document `LossScale <https://www.mindspore.cn/tutorials/en/master/beginner/mixed_precision.html>`_ to
-        process `loss_scale` correctly.
+        As this optimizer has no argument of `loss_scale`, so `loss_scale` needs to be processed by other means.
 
         If parameters are not grouped, the `weight_decay` in optimizer will be applied on the network parameters without
         'beta' or 'gamma' in their names. Users can group parameters to change the strategy of decaying weight. When
@@ -1030,11 +978,9 @@ class AdamWeightDecay(Optimizer):
 
     @jit(backend="ms_backend")
     def construct(self, gradients):
-        gradients = self.flatten_gradients(gradients)
         weight_decay = self.get_weight_decay()
         lr = self.get_lr()
         self.assignadd(self.global_step, self.global_step_increase_tensor)
-
         if self.use_fused_opt:
             if self.is_group:
                 if self.is_group_lr:
@@ -1072,19 +1018,6 @@ class AdamWeightDecay(Optimizer):
 
         return optim_result
 
-    @Optimizer.target.setter
-    def target(self, value):
-        """
-        If the input value is set to "CPU", the parameters will be updated on the host using the Fused
-        optimizer operation.
-        """
-        self._set_base_target(value)
-        if value == 'CPU':
-            self.fused_opt.set_device("CPU")
-            self.use_fused_opt = True
-        else:
-            self.use_fused_opt = False
-
 
 class AdamOffload(Optimizer):
     r"""
@@ -1253,7 +1186,6 @@ class AdamOffload(Optimizer):
         params = self._parameters
         moment1 = self.moment1
         moment2 = self.moment2
-        gradients = self.flatten_gradients(gradients)
         gradients = self.decay_weight(gradients)
         gradients = self.scale_grad(gradients)
         lr = self.get_lr()
@@ -1272,30 +1204,3 @@ class AdamOffload(Optimizer):
                                           beta1_power, beta2_power, self.beta1, self.beta2, self.eps, lr),
                             gradients, params, moment1, moment2)
         return success
-
-
-def create_distributed_adam(*args, **kwargs):
-    """
-    Create the distributed Adam op.
-    """
-    adam = P.Adam(*args, **kwargs)
-    adam.add_prim_attr("gradient_type", "dense_gradient")
-    adam.add_prim_attr("parameter_input_index", 0)
-    adam.add_prim_attr("gradient_input_index", 9)
-    return adam
-
-
-def create_distributed_fused_sparse_adam(*args, **kwargs):
-    """
-    Create the distributed FusedSparseAdam op.
-    """
-    sparse_adam = P.FusedSparseAdam(*args, **kwargs)
-    sparse_adam.add_prim_attr("gradient_type", "sparse_gradient")
-    sparse_adam.add_prim_attr("parameter_input_index", 0)
-    sparse_adam.add_prim_attr("gradient_input_index", 9)
-    return sparse_adam
-
-
-_register_dist_optimizer("adam", create_distributed_adam)
-_register_dist_optimizer("fused_sparse_adam", create_distributed_fused_sparse_adam)
```
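The removals above strip the `_dist_optimizer_registry` plumbing from `Adam`, while the dispatch in `_apply_adam` keeps the `is_group_lr` / `use_lazy` / `use_amsgrad` branches. The public constructor is unaffected; a minimal usage sketch (the keyword names are taken from the attributes visible in the diff and should be checked against the 2.7.1 API docs):

```python
from mindspore import nn

net = nn.Dense(4, 2)
# use_lazy selects the _lazy_adam_opt path and use_amsgrad the vhat-carrying path;
# leaving both False falls through to the plain _adam_opt branch shown above.
opt = nn.Adam(net.trainable_params(), learning_rate=1e-3,
              use_lazy=False, use_amsgrad=False)
```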
mindspore/nn/optim/adamax.py
CHANGED
```diff
@@ -202,7 +202,6 @@ class AdaMax(Optimizer):
 
     @jit
     def construct(self, gradients):
-        gradients = self.flatten_gradients(gradients)
         gradients = self.decay_weight(gradients)
         gradients = self.gradients_centralization(gradients)
         gradients = self.scale_grad(gradients)
```
mindspore/nn/optim/asgd.py
CHANGED
```diff
@@ -184,7 +184,6 @@ class ASGD(Optimizer):
 
     @jit(backend="ms_backend")
     def construct(self, gradients):
-        gradients = self.flatten_gradients(gradients)
         gradients = self.decay_weight(gradients)
         gradients = self.gradients_centralization(gradients)
         gradients = self.scale_grad(gradients)
```
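AdaMax and ASGD lose only the leading `flatten_gradients` call; the rest of the shared gradient pipeline (`decay_weight`, `gradients_centralization`, `scale_grad`, `get_lr`) is unchanged. For orientation, a toy optimizer built on the same `nn.Optimizer` base would follow the same shape (an illustrative sketch only, not code from the package):

```python
from mindspore import nn, ops

class ScaledSGD(nn.Optimizer):
    """Toy optimizer illustrating the post-change construct() pipeline."""

    def __init__(self, params, learning_rate=0.01):
        super().__init__(learning_rate, params)
        self.assign_sub = ops.AssignSub()

    def construct(self, gradients):
        # Same preprocessing chain that AdaMax/ASGD now start with directly:
        gradients = self.decay_weight(gradients)
        gradients = self.gradients_centralization(gradients)
        gradients = self.scale_grad(gradients)
        lr = self.get_lr()
        success = True
        for param, grad in zip(self.parameters, gradients):
            self.assign_sub(param, grad * lr)
        return success
```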
mindspore/nn/optim/ftrl.py
CHANGED
```diff
@@ -21,27 +21,10 @@ from mindspore.common.api import jit
 from mindspore import _checkparam as validator
 from mindspore.nn.optim.optimizer import Optimizer
 from mindspore.nn.optim.optimizer import opt_init_args_register
-from mindspore.nn.optim._dist_optimizer_registry import _register_dist_optimizer
 
 _ftrl_opt = C.MultitypeFuncGraph("ftrl_opt")
 
 
-@_ftrl_opt.register("Function", "Function", "Number", "Number", "Number", "Tensor", "Tensor",
-                    "RowTensor", "Tensor", "Tensor", "Bool", "Function", "Bool", "Function", "Bool")
-def _tensor_run_opt_with_sparse_dist(opt, spars_opt, l1, l2, lr_power, learning_rate, linear,
-                                     gradient, weight, moment, cache_enable,
-                                     distributed_opt, use_flag, distributed_sparse_opt, use_sparse_flag):
-    """Apply sparse ftrl optimizer to the weight parameter when the gradient is sparse."""
-    success = True
-    indices = gradient.indices
-    values = gradient.values
-    if use_sparse_flag:
-        success = F.depend(success, distributed_sparse_opt(weight, moment, linear, values, indices))
-    else:
-        success = F.depend(success, spars_opt(weight, moment, linear, values, indices))
-    return success
-
-
 def _apply_map_tensor_ftrl(l1, l2, lr_power, learning_rate, linear, weight, moment, indices, values):
     """Apllpy ftrl optimizer for map parameter"""
     success = True
@@ -78,43 +61,10 @@ def _apply_map_tensor_ftrl(l1, l2, lr_power, learning_rate, linear, weight, mome
     return success
 
 
-@_ftrl_opt.register("Function", "Function", "Number", "Number", "Number", "Tensor", "MapTensor",
-                    "MapTensor", "MapTensor", "MapTensor", "Bool", "Function", "Bool", "Function", "Bool")
-def _run_map_tensor_opt_with_sparse_dist(opt, spars_opt, l1, l2, lr_power, learning_rate, linear,
-                                         gradient, weight, moment, cache_enable,
-                                         distributed_opt, use_flag, distributed_sparse_opt, use_sparse_flag):
-    """Apply sparse ftrl optimizer to the weight parameter when the gradient is sparse."""
-    success = True
-    indices, values = gradient.get_data()
-    if use_sparse_flag:
-        # PS Mode.
-        success = F.depend(success, distributed_sparse_opt(weight, moment, linear, values, indices))
-    elif cache_enable:
-        # PS Cache mode.
-        _apply_map_tensor_ftrl(l1, l2, lr_power, learning_rate, linear, weight, moment, indices, values)
-    else:
-        raise Exception("Unexpected mode for distributed optimizer.")
-    return success
-
-
-@_ftrl_opt.register("Function", "Function", "Number", "Number", "Number", "Tensor", "Tensor",
-                    "Tensor", "Tensor", "Tensor", "Bool", "Function", "Bool", "Function", "Bool")
-def _tensor_run_opt_dist(opt, spars_opt, l1, l2, lr_power, learning_rate, linear,
-                         gradient, weight, moment, cache_enable,
-                         distributed_opt, use_flag, distributed_sparse_opt, use_sparse_flag):
-    """Apply ftrl optimizer to the weight parameter."""
-    success = True
-    if use_flag:
-        success = F.depend(success, distributed_opt(weight, moment, linear, gradient, learning_rate, l1, l2, lr_power))
-    else:
-        success = F.depend(success, opt(weight, moment, linear, gradient, learning_rate, l1, l2, lr_power))
-    return success
-
-
 @_ftrl_opt.register("Function", "Function", "Number", "Number", "Number", "Tensor", "Tensor",
-                    "RowTensor", "Tensor", "Tensor"
+                    "RowTensor", "Tensor", "Tensor")
 def _tensor_run_opt_with_sparse(opt, spars_opt, l1, l2, lr_power, learning_rate, linear,
-                                gradient, weight, moment
+                                gradient, weight, moment):
     """Apply sparse ftrl optimizer to the weight parameter when the gradient is sparse."""
     success = True
     indices = gradient.indices
@@ -124,9 +74,9 @@ def _tensor_run_opt_with_sparse(opt, spars_opt, l1, l2, lr_power, learning_rate,
 
 
 @_ftrl_opt.register("Function", "Function", "Number", "Number", "Number", "Tensor", "MapTensor",
-                    "MapTensor", "MapTensor", "MapTensor"
+                    "MapTensor", "MapTensor", "MapTensor")
 def _run_map_tensor_opt_with_sparse(opt, spars_opt, l1, l2, lr_power, learning_rate, linear,
-                                    gradient, weight, moment
+                                    gradient, weight, moment):
     """Apply sparse ftrl optimizer to the weight parameter when the gradient is sparse."""
     success = True
     indices, values = gradient.get_data()
@@ -135,9 +85,9 @@ def _run_map_tensor_opt_with_sparse(opt, spars_opt, l1, l2, lr_power, learning_r
 
 
 @_ftrl_opt.register("Function", "Function", "Number", "Number", "Number", "Tensor", "Tensor",
-                    "Tensor", "Tensor", "Tensor"
+                    "Tensor", "Tensor", "Tensor")
 def _tensor_run_opt(opt, spars_opt, l1, l2, lr_power, learning_rate, linear,
-                    gradient, weight, moment
+                    gradient, weight, moment):
     """Apply ftrl optimizer to the weight parameter."""
     success = True
     success = F.depend(success, opt(weight, moment, linear, gradient, learning_rate, l1, l2, lr_power))
@@ -320,14 +270,11 @@ class FTRL(Optimizer):
         self.use_locking = use_locking
         self.sparse_opt = P.SparseApplyFtrl(learning_rate, l1, l2, lr_power, use_locking=use_locking)
 
-        self._init_distributed_opts(use_locking, learning_rate, l1, l2, lr_power)
-
     @jit
     def construct(self, grads):
         params = self._parameters
         moments = self.moments
         linear = self.linear
-        grads = self.flatten_gradients(grads)
         grads = self.decay_weight(grads)
         grads = self.gradients_centralization(grads)
         grads = self.scale_grad(grads)
@@ -335,14 +282,8 @@ class FTRL(Optimizer):
         lr = self.get_lr()
         self.assignadd(self.global_step, self.global_step_increase_tensor)
 
-
-
-                                linear, grads, params, moments, self.cache_enable,
-                                self.distributed_opts, self.use_distributed_opt_flags,
-                                self.distributed_sparse_opts, self.use_distributed_sparse_opt_flags)
-        else:
-            success = self.map_(F.partial(_ftrl_opt, self.opt, self.sparse_opt, self.l1, self.l2, self.lr_power, lr),
-                                linear, grads, params, moments, self.cache_enable)
+        success = self.map_(F.partial(_ftrl_opt, self.opt, self.sparse_opt, self.l1, self.l2, self.lr_power, lr),
+                            linear, grads, params, moments)
         return success
 
     @Optimizer.target.setter
@@ -366,38 +307,3 @@ class FTRL(Optimizer):
             self.sparse_opt = P.SparseApplyFtrl(self.lr, self.l1, self.l2, self.lr_power, self.use_locking)
 
         self._target = value
-
-    def _init_distributed_opts(self, use_locking, learning_rate, l1, l2, lr_power):
-        self.use_dist_optimizer = self._use_distibuted_optimizer()
-        self.distributed_opts, self.use_distributed_opt_flags =\
-            self._get_distributed_optimizer_list("ftrl", use_locking=use_locking)
-        self.distributed_sparse_opts, self.use_distributed_sparse_opt_flags =\
-            self._get_distributed_optimizer_list("fused_sparse_ftrl", learning_rate,
-                                                 l1, l2, lr_power, use_locking=use_locking)
-
-
-def create_distributed_ftrl(*args, **kwargs):
-    """
-    Create the distributed ApplyFtrl op.
-    """
-    ftrl = P.ApplyFtrl(*args, **kwargs)
-    ftrl.add_prim_attr("gradient_type", "dense_gradient")
-    ftrl.add_prim_attr("parameter_input_index", 0)
-    ftrl.add_prim_attr("gradient_input_index", 3)
-    return ftrl
-
-
-def create_distributed_fused_sparse_ftrl(*args, **kwargs):
-    """
-    Create the distributed FusedSparseFtrl op.
-    """
-    sparse_ftrl = P.FusedSparseFtrl(*args, **kwargs)
-    sparse_ftrl.add_prim_attr("gradient_type", "sparse_gradient")
-    sparse_ftrl.add_prim_attr("parameter_input_index", 0)
-    sparse_ftrl.add_prim_attr("gradient_input_index", 3)
-    return sparse_ftrl
-
-
-_register_dist_optimizer("ftrl", create_distributed_ftrl)
-_register_dist_optimizer("fused_sparse_ftrl", create_distributed_fused_sparse_ftrl)
```
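The remaining `_ftrl_opt` overloads rely on `MultitypeFuncGraph` dispatch: the string tuple passed to `register` is the type signature that selects an implementation at call time, and it maps one-to-one onto the decorated function's positional parameters, which is why the signatures and parameter lists shrink together in the hunks above. A standalone sketch of the mechanism, independent of the FTRL code itself:

```python
from mindspore import Tensor
from mindspore.ops import MultitypeFuncGraph

demo_op = MultitypeFuncGraph("demo_op")

@demo_op.register("Tensor", "Tensor")
def _demo_tensor_tensor(x, y):
    # Chosen when both arguments are Tensors.
    return x + y

@demo_op.register("Number", "Tensor")
def _demo_number_tensor(x, y):
    # Chosen when the first argument is a scalar.
    return y * x

a = Tensor([1.0, 2.0])
print(demo_op(a, a))   # dispatches to _demo_tensor_tensor
print(demo_op(3, a))   # dispatches to _demo_number_tensor
```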
mindspore/nn/optim/lamb.py
CHANGED
```diff
@@ -131,9 +131,7 @@ class Lamb(Optimizer):
     Note:
         There is usually no connection between a optimizer and mixed precision. But when `FixedLossScaleManager` is used
         and `drop_overflow_update` in `FixedLossScaleManager` is set to False, optimizer needs to set the 'loss_scale'.
-        As this optimizer has no argument of `loss_scale`, so `loss_scale` needs to be processed by other means.
-        document `LossScale <https://www.mindspore.cn/tutorials/en/master/beginner/mixed_precision.html>`_ to
-        process `loss_scale` correctly.
+        As this optimizer has no argument of `loss_scale`, so `loss_scale` needs to be processed by other means.
 
         If parameters are not grouped, the `weight_decay` in optimizer will be applied on the network parameters without
         'beta' or 'gamma' in their names. Users can group parameters to change the strategy of decaying weight. When
@@ -271,7 +269,6 @@ class Lamb(Optimizer):
         lr = self.get_lr()
         self.assignadd(self.global_step, self.global_step_increase_tensor)
         lamb_opt = _lamb_opt
-        gradients = self.flatten_gradients(gradients)
         gradients = self.gradients_centralization(gradients)
         if self.is_group:
             if self.is_group_lr:
```
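The trimmed note still applies: `Lamb` has no `loss_scale` argument, so loss scaling has to be handled outside the optimizer, and weight decay is typically steered through parameter groups. A grouping sketch in the style the surrounding docstring describes (illustrative only; the filter conditions are placeholders):

```python
from mindspore import nn

net = nn.Dense(16, 8)
params = net.trainable_params()
# Apply weight decay only to parameters without 'beta'/'gamma' in their names.
decay_params = [p for p in params if 'beta' not in p.name and 'gamma' not in p.name]
no_decay_params = [p for p in params if 'beta' in p.name or 'gamma' in p.name]
group_params = [{'params': decay_params, 'weight_decay': 0.01},
                {'params': no_decay_params},
                {'order_params': params}]
opt = nn.Lamb(group_params, learning_rate=1e-3)
```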
mindspore/nn/optim/lars.py
CHANGED
```diff
@@ -125,8 +125,6 @@ class LARS(Optimizer):
         self.weight_decay = optimizer.weight_decay
         self.global_step = optimizer.global_step
         self.parameters = optimizer.parameters
-        if optimizer._use_flattened_params:  # pylint: disable=W0212
-            self.opt._use_flattened_params = False  # pylint: disable=W0212
         self._user_parameters += [param.name for param in self.parameters]
         self.use_clip = use_clip
         self.lars_flag = tuple(lars_filter(x) for x in self.parameters)
@@ -173,7 +171,6 @@ class LARS(Optimizer):
     @jit
     def construct(self, gradients):
         params = self.parameters
-        gradients = self.flatten_gradients(gradients)
         if self.use_clip:
             lr = self._get_lr()
         else:
```