mindspore-2.7.0rc1-cp310-cp310-win_amd64.whl → mindspore-2.7.1-cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +5 -2
- mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +2 -2
- mindspore/_extends/builtin_operations.py +3 -3
- mindspore/_extends/parallel_compile/akg_compiler/custom.py +1109 -0
- mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
- mindspore/_extends/parse/__init__.py +3 -3
- mindspore/_extends/parse/compile_config.py +24 -1
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +6 -3
- mindspore/_extends/parse/parser.py +28 -22
- mindspore/_extends/parse/resources.py +1 -1
- mindspore/_extends/parse/standard_method.py +23 -2
- mindspore/_extends/parse/trope.py +2 -1
- mindspore/_extends/pijit/pijit_func_white_list.py +9 -27
- mindspore/amp.py +0 -18
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/base.py +29 -2
- mindspore/common/__init__.py +18 -12
- mindspore/common/_decorator.py +3 -2
- mindspore/common/_grad_function.py +3 -1
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +371 -96
- mindspore/common/_utils.py +7 -43
- mindspore/common/api.py +434 -135
- mindspore/common/dtype.py +98 -57
- mindspore/common/dump.py +7 -108
- mindspore/common/dynamic_shape/__init__.py +0 -0
- mindspore/common/{auto_dynamic_shape.py → dynamic_shape/auto_dynamic_shape.py} +15 -23
- mindspore/common/dynamic_shape/enable_dynamic.py +197 -0
- mindspore/common/file_system.py +59 -9
- mindspore/common/hook_handle.py +82 -3
- mindspore/common/jit_config.py +5 -1
- mindspore/common/jit_trace.py +27 -12
- mindspore/common/lazy_inline.py +5 -3
- mindspore/common/np_dtype.py +3 -3
- mindspore/common/parameter.py +17 -127
- mindspore/common/recompute.py +4 -13
- mindspore/common/tensor.py +50 -217
- mindspore/communication/_comm_helper.py +11 -1
- mindspore/communication/comm_func.py +138 -4
- mindspore/communication/management.py +85 -1
- mindspore/config/op_info.config +0 -15
- mindspore/context.py +20 -106
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/transforms.py +1 -1
- mindspore/dataset/core/config.py +35 -1
- mindspore/dataset/engine/datasets.py +338 -319
- mindspore/dataset/engine/datasets_user_defined.py +38 -22
- mindspore/dataset/engine/datasets_vision.py +1 -1
- mindspore/dataset/engine/validators.py +1 -15
- mindspore/dataset/transforms/c_transforms.py +2 -2
- mindspore/dataset/transforms/transforms.py +3 -3
- mindspore/dataset/vision/__init__.py +1 -1
- mindspore/dataset/vision/py_transforms.py +8 -8
- mindspore/dataset/vision/transforms.py +17 -5
- mindspore/dataset/vision/utils.py +632 -21
- mindspore/device_context/ascend/op_tuning.py +35 -1
- mindspore/dnnl.dll +0 -0
- mindspore/{profiler/common/validator → graph}/__init__.py +9 -1
- mindspore/graph/custom_pass.py +55 -0
- mindspore/include/api/cell.h +28 -4
- mindspore/include/api/cfg.h +24 -7
- mindspore/include/api/context.h +1 -0
- mindspore/include/api/delegate.h +0 -2
- mindspore/include/api/dual_abi_helper.h +100 -19
- mindspore/include/api/graph.h +14 -1
- mindspore/include/api/kernel.h +16 -3
- mindspore/include/api/kernel_api.h +9 -1
- mindspore/include/api/metrics/accuracy.h +9 -0
- mindspore/include/api/model.h +5 -1
- mindspore/include/api/model_group.h +4 -0
- mindspore/include/api/model_parallel_runner.h +2 -0
- mindspore/include/api/status.h +48 -10
- mindspore/include/api/types.h +6 -1
- mindspore/include/dataset/constants.h +9 -0
- mindspore/include/dataset/execute.h +2 -2
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/__init__.py +3 -3
- mindspore/mindrecord/common/exceptions.py +1 -0
- mindspore/mindrecord/config.py +1 -1
- mindspore/{parallel/mpi → mindrecord/core}/__init__.py +4 -1
- mindspore/mindrecord/{shardheader.py → core/shardheader.py} +2 -1
- mindspore/mindrecord/{shardindexgenerator.py → core/shardindexgenerator.py} +1 -1
- mindspore/mindrecord/{shardreader.py → core/shardreader.py} +2 -1
- mindspore/mindrecord/{shardsegment.py → core/shardsegment.py} +2 -2
- mindspore/mindrecord/{shardutils.py → core/shardutils.py} +1 -1
- mindspore/mindrecord/{shardwriter.py → core/shardwriter.py} +1 -1
- mindspore/mindrecord/filereader.py +4 -4
- mindspore/mindrecord/filewriter.py +5 -5
- mindspore/mindrecord/mindpage.py +2 -2
- mindspore/mindrecord/tools/cifar10.py +4 -3
- mindspore/mindrecord/tools/cifar100.py +1 -1
- mindspore/mindrecord/tools/cifar100_to_mr.py +1 -1
- mindspore/mindrecord/tools/cifar10_to_mr.py +6 -6
- mindspore/mindrecord/tools/csv_to_mr.py +1 -1
- mindspore/mindrecord/tools/imagenet_to_mr.py +1 -1
- mindspore/mindrecord/tools/mnist_to_mr.py +1 -1
- mindspore/mindrecord/tools/tfrecord_to_mr.py +1 -1
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_cluster.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_cpu.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_hardware_abstract.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/{mindspore_ops_host.dll → mindspore_ops_cpu.dll} +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mindspore_runtime_utils.dll +0 -0
- mindspore/mindspore_tools.dll +0 -0
- mindspore/mint/__init__.py +15 -10
- mindspore/mint/distributed/__init__.py +4 -0
- mindspore/mint/distributed/distributed.py +392 -69
- mindspore/mint/nn/__init__.py +2 -16
- mindspore/mint/nn/functional.py +4 -110
- mindspore/mint/nn/layer/__init__.py +0 -2
- mindspore/mint/nn/layer/_functions.py +1 -2
- mindspore/mint/nn/layer/activation.py +0 -6
- mindspore/mint/nn/layer/basic.py +0 -47
- mindspore/mint/nn/layer/conv.py +10 -10
- mindspore/mint/nn/layer/normalization.py +11 -16
- mindspore/mint/nn/layer/pooling.py +0 -4
- mindspore/nn/__init__.py +1 -3
- mindspore/nn/cell.py +231 -239
- mindspore/nn/layer/activation.py +4 -2
- mindspore/nn/layer/basic.py +56 -14
- mindspore/nn/layer/container.py +16 -0
- mindspore/nn/layer/embedding.py +4 -169
- mindspore/nn/layer/image.py +1 -1
- mindspore/nn/layer/normalization.py +2 -1
- mindspore/nn/layer/thor_layer.py +4 -85
- mindspore/nn/optim/ada_grad.py +0 -1
- mindspore/nn/optim/adafactor.py +0 -1
- mindspore/nn/optim/adam.py +32 -127
- mindspore/nn/optim/adamax.py +0 -1
- mindspore/nn/optim/asgd.py +0 -1
- mindspore/nn/optim/ftrl.py +8 -102
- mindspore/nn/optim/lamb.py +1 -4
- mindspore/nn/optim/lars.py +0 -3
- mindspore/nn/optim/lazyadam.py +25 -218
- mindspore/nn/optim/momentum.py +5 -43
- mindspore/nn/optim/optimizer.py +6 -55
- mindspore/nn/optim/proximal_ada_grad.py +0 -1
- mindspore/nn/optim/rmsprop.py +0 -1
- mindspore/nn/optim/rprop.py +0 -1
- mindspore/nn/optim/sgd.py +0 -1
- mindspore/nn/optim/tft_wrapper.py +2 -4
- mindspore/nn/optim/thor.py +0 -2
- mindspore/nn/probability/bijector/bijector.py +7 -8
- mindspore/nn/probability/bijector/gumbel_cdf.py +2 -2
- mindspore/nn/probability/bijector/power_transform.py +20 -21
- mindspore/nn/probability/bijector/scalar_affine.py +5 -5
- mindspore/nn/probability/bijector/softplus.py +13 -14
- mindspore/nn/probability/distribution/_utils/utils.py +2 -2
- mindspore/nn/wrap/cell_wrapper.py +39 -5
- mindspore/nn/wrap/grad_reducer.py +4 -89
- mindspore/numpy/array_creations.py +4 -4
- mindspore/numpy/fft.py +9 -9
- mindspore/numpy/utils_const.py +1 -1
- mindspore/{nn/reinforcement → onnx}/__init__.py +5 -8
- mindspore/onnx/onnx_export.py +137 -0
- mindspore/opencv_core4110.dll +0 -0
- mindspore/opencv_imgcodecs4110.dll +0 -0
- mindspore/{opencv_imgproc452.dll → opencv_imgproc4110.dll} +0 -0
- mindspore/ops/__init__.py +2 -0
- mindspore/ops/_grad_experimental/grad_comm_ops.py +38 -2
- mindspore/ops/_grad_experimental/grad_inner_ops.py +0 -9
- mindspore/ops/_op_impl/aicpu/__init__.py +0 -10
- mindspore/ops/_op_impl/cpu/__init__.py +1 -5
- mindspore/ops/_op_impl/cpu/{buffer_append.py → joinedstr_op.py} +8 -8
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +28 -24
- mindspore/ops/auto_generate/gen_extend_func.py +6 -11
- mindspore/ops/auto_generate/gen_ops_def.py +385 -154
- mindspore/ops/auto_generate/gen_ops_prim.py +5676 -5167
- mindspore/ops/communication.py +97 -0
- mindspore/ops/composite/__init__.py +5 -2
- mindspore/ops/composite/base.py +16 -2
- mindspore/ops/composite/multitype_ops/__init__.py +3 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +150 -8
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
- mindspore/ops/composite/multitype_ops/add_impl.py +7 -0
- mindspore/ops/composite/multitype_ops/mod_impl.py +27 -0
- mindspore/ops/function/__init__.py +2 -0
- mindspore/ops/function/array_func.py +24 -18
- mindspore/ops/function/comm_func.py +3883 -0
- mindspore/ops/function/debug_func.py +7 -6
- mindspore/ops/function/grad/grad_func.py +4 -12
- mindspore/ops/function/math_func.py +89 -86
- mindspore/ops/function/nn_func.py +92 -313
- mindspore/ops/function/random_func.py +9 -18
- mindspore/ops/functional.py +4 -1
- mindspore/ops/functional_overload.py +377 -30
- mindspore/ops/operations/__init__.py +2 -5
- mindspore/ops/operations/_custom_ops_utils.py +7 -9
- mindspore/ops/operations/_inner_ops.py +12 -50
- mindspore/ops/operations/_rl_inner_ops.py +0 -933
- mindspore/ops/operations/array_ops.py +5 -50
- mindspore/ops/operations/comm_ops.py +95 -17
- mindspore/ops/operations/custom_ops.py +237 -22
- mindspore/ops/operations/debug_ops.py +33 -35
- mindspore/ops/operations/manually_defined/ops_def.py +39 -318
- mindspore/ops/operations/math_ops.py +5 -5
- mindspore/ops/operations/nn_ops.py +3 -3
- mindspore/ops/operations/sparse_ops.py +0 -83
- mindspore/ops/primitive.py +4 -27
- mindspore/ops/tensor_method.py +88 -10
- mindspore/ops_generate/aclnn/aclnn_kernel_register_auto_cc_generator.py +5 -5
- mindspore/ops_generate/aclnn/gen_aclnn_implement.py +8 -8
- mindspore/ops_generate/api/functions_cc_generator.py +53 -4
- mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +25 -11
- mindspore/ops_generate/common/gen_constants.py +11 -10
- mindspore/ops_generate/common/op_proto.py +18 -1
- mindspore/ops_generate/common/template.py +102 -245
- mindspore/ops_generate/common/template_utils.py +212 -0
- mindspore/ops_generate/gen_custom_ops.py +69 -0
- mindspore/ops_generate/op_def/ops_def_cc_generator.py +78 -7
- mindspore/ops_generate/op_def_py/base_op_prim_py_generator.py +360 -0
- mindspore/ops_generate/op_def_py/custom_op_prim_py_generator.py +140 -0
- mindspore/ops_generate/op_def_py/op_def_py_generator.py +54 -7
- mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -312
- mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +74 -17
- mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +22 -5
- mindspore/ops_generate/pyboost/gen_pyboost_func.py +0 -16
- mindspore/ops_generate/pyboost/op_template_parser.py +3 -2
- mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +21 -5
- mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +2 -2
- mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +30 -10
- mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +10 -3
- mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +19 -9
- mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +71 -28
- mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +10 -9
- mindspore/ops_generate/pyboost/pyboost_utils.py +27 -16
- mindspore/ops_generate/resources/yaml_loader.py +13 -0
- mindspore/ops_generate/tensor_py_cc_generator.py +2 -2
- mindspore/parallel/_auto_parallel_context.py +5 -15
- mindspore/parallel/_cell_wrapper.py +1 -1
- mindspore/parallel/_parallel_serialization.py +4 -6
- mindspore/parallel/_ps_context.py +2 -2
- mindspore/parallel/_utils.py +34 -17
- mindspore/parallel/auto_parallel.py +23 -9
- mindspore/parallel/checkpoint_transform.py +20 -2
- mindspore/parallel/cluster/process_entity/_api.py +28 -33
- mindspore/parallel/cluster/process_entity/_utils.py +9 -5
- mindspore/parallel/cluster/run.py +5 -3
- mindspore/{experimental/llm_boost/ascend_native → parallel/distributed}/__init__.py +21 -22
- mindspore/parallel/distributed/distributed_data_parallel.py +393 -0
- mindspore/parallel/distributed/flatten_grad_buffer.py +295 -0
- mindspore/parallel/function/reshard_func.py +6 -5
- mindspore/parallel/nn/parallel_cell_wrapper.py +40 -3
- mindspore/parallel/nn/parallel_grad_reducer.py +0 -8
- mindspore/parallel/shard.py +7 -21
- mindspore/parallel/strategy.py +336 -0
- mindspore/parallel/transform_safetensors.py +127 -20
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +13 -9
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +1 -1
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +1 -1
- mindspore/profiler/common/constant.py +5 -0
- mindspore/profiler/common/file_manager.py +9 -0
- mindspore/profiler/common/msprof_cmd_tool.py +40 -4
- mindspore/profiler/common/path_manager.py +65 -24
- mindspore/profiler/common/profiler_context.py +27 -14
- mindspore/profiler/common/profiler_info.py +3 -3
- mindspore/profiler/common/profiler_meta_data.py +1 -0
- mindspore/profiler/common/profiler_op_analyse.py +10 -6
- mindspore/profiler/common/profiler_path_manager.py +13 -0
- mindspore/profiler/common/util.py +30 -3
- mindspore/profiler/dynamic_profiler.py +91 -46
- mindspore/profiler/envprofiler.py +30 -5
- mindspore/profiler/experimental_config.py +18 -2
- mindspore/profiler/platform/cpu_profiler.py +10 -4
- mindspore/profiler/platform/npu_profiler.py +34 -7
- mindspore/profiler/profiler.py +193 -145
- mindspore/profiler/profiler_action_controller.py +1 -1
- mindspore/profiler/profiler_interface.py +2 -2
- mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
- mindspore/run_check/_check_version.py +108 -24
- mindspore/runtime/__init__.py +9 -6
- mindspore/runtime/executor.py +35 -0
- mindspore/runtime/memory.py +113 -0
- mindspore/runtime/thread_bind_core.py +1 -1
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/{experimental/llm_boost → tools}/__init__.py +5 -5
- mindspore/tools/data_dump.py +130 -0
- mindspore/tools/sdc_detect.py +91 -0
- mindspore/tools/stress_detect.py +63 -0
- mindspore/train/__init__.py +6 -6
- mindspore/train/_utils.py +8 -21
- mindspore/train/amp.py +6 -7
- mindspore/train/callback/_callback.py +2 -1
- mindspore/train/callback/_checkpoint.py +1 -17
- mindspore/train/callback/_flops_collector.py +10 -6
- mindspore/train/callback/_train_fault_tolerance.py +72 -25
- mindspore/train/data_sink.py +5 -9
- mindspore/train/dataset_helper.py +5 -5
- mindspore/train/model.py +41 -230
- mindspore/train/serialization.py +160 -401
- mindspore/train/train_thor/model_thor.py +2 -2
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +6 -3
- mindspore/utils/dlpack.py +92 -0
- mindspore/utils/dryrun.py +1 -1
- mindspore/utils/runtime_execution_order_check.py +10 -0
- mindspore/utils/sdc_detect.py +14 -12
- mindspore/utils/stress_detect.py +43 -0
- mindspore/utils/utils.py +152 -16
- mindspore/version.py +1 -1
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/METADATA +3 -2
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/RECORD +330 -344
- mindspore/_extends/remote/kernel_build_server_ascend.py +0 -75
- mindspore/communication/_hccl_management.py +0 -297
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +0 -207
- mindspore/experimental/llm_boost/ascend_native/llm_boost.py +0 -52
- mindspore/experimental/llm_boost/atb/__init__.py +0 -23
- mindspore/experimental/llm_boost/atb/boost_base.py +0 -385
- mindspore/experimental/llm_boost/atb/llama_boost.py +0 -137
- mindspore/experimental/llm_boost/atb/qwen_boost.py +0 -124
- mindspore/experimental/llm_boost/register.py +0 -130
- mindspore/experimental/llm_boost/utils.py +0 -31
- mindspore/include/OWNERS +0 -7
- mindspore/mindspore_cpu_res_manager.dll +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/nn/optim/_dist_optimizer_registry.py +0 -111
- mindspore/nn/reinforcement/_batch_read_write.py +0 -142
- mindspore/nn/reinforcement/_tensors_queue.py +0 -152
- mindspore/nn/reinforcement/tensor_array.py +0 -145
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/ops/_op_impl/aicpu/priority_replay_buffer.py +0 -113
- mindspore/ops/_op_impl/aicpu/reservoir_replay_buffer.py +0 -96
- mindspore/ops/_op_impl/aicpu/sparse_cross.py +0 -42
- mindspore/ops/_op_impl/cpu/buffer_get.py +0 -28
- mindspore/ops/_op_impl/cpu/buffer_sample.py +0 -28
- mindspore/ops/_op_impl/cpu/priority_replay_buffer.py +0 -42
- mindspore/ops/operations/_tensor_array.py +0 -359
- mindspore/ops/operations/rl_ops.py +0 -288
- mindspore/parallel/_offload_context.py +0 -275
- mindspore/parallel/_recovery_context.py +0 -115
- mindspore/parallel/_transformer/__init__.py +0 -35
- mindspore/parallel/_transformer/layers.py +0 -765
- mindspore/parallel/_transformer/loss.py +0 -251
- mindspore/parallel/_transformer/moe.py +0 -693
- mindspore/parallel/_transformer/op_parallel_config.py +0 -222
- mindspore/parallel/_transformer/transformer.py +0 -3124
- mindspore/parallel/mpi/_mpi_config.py +0 -116
- mindspore/profiler/common/validator/validate_path.py +0 -84
- mindspore/train/memory_profiling_pb2.py +0 -298
- mindspore/utils/hooks.py +0 -81
- /mindspore/common/{_auto_dynamic.py → dynamic_shape/_auto_dynamic.py} +0 -0
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/WHEEL +0 -0
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/top_level.txt +0 -0

@@ -78,14 +78,14 @@ def get_convert_type_str(dtype: str, optional, use_basic_type=False):
         'float': 'ToFloat',
         'bool': 'ToBool',
         'number': 'ToScalar',
-        'tuple[int]': 'ToIntList<
-        'tuple[float]': 'ToFloatList<
-        'tuple[bool]': 'ToBoolList<
-        'tuple[tensor]': 'ToTensorList<
-        'list[int]': 'ToIntList<
-        'list[float]': 'ToFloatList<
-        'list[bool]': 'ToBoolList<
-        'list[tensor]': 'ToTensorList<
+        'tuple[int]': 'ToIntList<CPythonTuple>',
+        'tuple[float]': 'ToFloatList<CPythonTuple>',
+        'tuple[bool]': 'ToBoolList<CPythonTuple>',
+        'tuple[tensor]': 'ToTensorList<CPythonTuple>',
+        'list[int]': 'ToIntList<CPythonList>',
+        'list[float]': 'ToFloatList<CPythonList>',
+        'list[bool]': 'ToBoolList<CPythonList>',
+        'list[tensor]': 'ToTensorList<CPythonList>',
         'tensor': 'ToTensor',
         'str': 'ToString',
         'type': 'ToDtype',
@@ -97,14 +97,14 @@ def get_convert_type_str(dtype: str, optional, use_basic_type=False):
         'tensor': 'ToTensorOptional',
         'type': 'ToDtypeOptional',
         'str': 'ToStringOptional',
-        'tuple[int]': 'ToIntListOptional<
-        'tuple[float]': 'ToFloatListOptional<
-        'tuple[bool]': 'ToBoolListOptional<
-        'tuple[tensor]': 'ToTensorListOptional<
-        'list[int]': 'ToIntListOptional<
-        'list[float]': 'ToFloatListOptional<
-        'list[bool]': 'ToBoolListOptional<
-        'list[tensor]': 'ToTensorListOptional<
+        'tuple[int]': 'ToIntListOptional<CPythonTuple>',
+        'tuple[float]': 'ToFloatListOptional<CPythonTuple>',
+        'tuple[bool]': 'ToBoolListOptional<CPythonTuple>',
+        'tuple[tensor]': 'ToTensorListOptional<CPythonTuple>',
+        'list[int]': 'ToIntListOptional<CPythonList>',
+        'list[float]': 'ToFloatListOptional<CPythonList>',
+        'list[bool]': 'ToBoolListOptional<CPythonList>',
+        'list[tensor]': 'ToTensorListOptional<CPythonList>',
     }
     basic_optional_type_convert = {
         'tuple[int]': "ToBasicIntVectorOptional",
@@ -385,6 +385,17 @@ def get_input_dtype(dtype: str, optional, use_basic_type=False):
     raise TypeError(f"""Unsupported convert type {dtype} for args.""")
 
 
+def get_output_dtype(dtype: str):
+    type_convert = {
+        'tensor': "mindspore::tensor::TensorPtr",
+        'tuple[tensor]': "std::vector<mindspore::tensor::TensorPtr>",
+        'list[tensor]': "std::vector<mindspore::tensor::TensorPtr>",
+    }
+    if dtype in type_convert:
+        return type_convert[dtype]
+    raise TypeError(f"""Unsupported convert type {dtype} for args.""")
+
+
 def is_cube(class_name):
     cube_set = {'Bmm', 'Baddbmm', 'MatMulExt', 'Mv'}
     if class_name in cube_set:
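The hunks above change two code-generation lookup tables: list and tuple converters are now parameterized on the Python container type (CPythonTuple vs CPythonList), and a new get_output_dtype helper maps YAML output types to C++ return types. A minimal sketch of the resulting lookups, kept standalone for illustration; only the dictionary contents visible in the diff are assumed, the surrounding generator code is not reproduced:

    # Illustrative Python sketch of the two mappings introduced above.
    converter_by_arg_type = {
        'tuple[int]': 'ToIntList<CPythonTuple>',
        'list[int]': 'ToIntList<CPythonList>',
    }
    cpp_return_type_by_output = {
        'tensor': "mindspore::tensor::TensorPtr",
        'tuple[tensor]': "std::vector<mindspore::tensor::TensorPtr>",
    }

    assert converter_by_arg_type['list[int]'].endswith('<CPythonList>')
    assert cpp_return_type_by_output['tensor'] == "mindspore::tensor::TensorPtr"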
@@ -29,6 +29,7 @@ class YamlLoader(ResourceLoader):
     """
     YamlLoader is a utility class for loading yaml files.
     """
+
     def __init__(self, resouce_type: ResourceType, yaml_path: Union[Sequence[str], str]):
         """
         Initialize YamlLoader.
@@ -65,15 +66,26 @@ class OpDocYamlLoader(YamlLoader):
     """
     OpDocYamlLoader is a class for loading op primitive doc yaml files.
     """
+
     def __init__(self):
         op_doc_yaml_path = os.path.join(K.WORK_DIR, K.MS_OP_DEF_YAML_PATH, "doc")
         super().__init__(ResourceType.OP_DOC_YAML, op_doc_yaml_path)
 
 
+class CustomOpDocYamlLoader(YamlLoader):
+    """
+    CustomOpDocYamlLoader is a class for loading op primitive doc yaml files.
+    """
+
+    def __init__(self, doc_yaml_path):
+        super().__init__(ResourceType.OP_DOC_YAML, doc_yaml_path)
+
+
 class TensorMethodDocYamlLoader(YamlLoader):
     """
     TensorMethodDocYamlLoader is a class for loading tensor method doc yaml files.
     """
+
     def __init__(self):
         tensor_method_doc_yaml_path = os.path.join(K.WORK_DIR, K.MS_TENSOR_METHOD_DOC_YAML_PATH)
         super().__init__(ResourceType.TENSOR_METHOD_DOC_YAML, tensor_method_doc_yaml_path)
@@ -83,6 +95,7 @@ class MintFuncDocYamlLoader(YamlLoader):
     """
     MintFuncDocYamlLoader is a class for loading mint func doc yaml files.
     """
+
     def __init__(self):
         mint_func_doc_yaml_path = os.path.join(K.WORK_DIR, K.MS_MINT_FUNC_DOC_YAML_PATH)
         super().__init__(ResourceType.MINT_FUNC_DOC_YAML, mint_func_doc_yaml_path)
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ============================================================================
 """
-Generates mindspore/ccsrc/pybind_api/ir/tensor_py.cc which includes the CPython Tensor APIs.
+Generates mindspore/ccsrc/pybind_api/ir/tensor/tensor_py.cc which includes the CPython Tensor APIs.
 """
 
 import os
@@ -26,7 +26,7 @@ from pyboost import pyboost_utils
 
 class TensorPyCppGenerator(BaseGenerator):
     """
-    This class is responsible for generating mindspore/ccsrc/pybind_api/ir/tensor_register/
+    This class is responsible for generating mindspore/ccsrc/pybind_api/ir/tensor/tensor_register/
     auto_generate/tensor_py_gen.cc
     """
     def __init__(self):
@@ -81,6 +81,7 @@ class _PipelineScheduler:
     PIPELINE_SEQPIPE = "seqpipe"
     PIPELINE_SEQVPP = "seqvpp"
     PIPELINE_SEQSMARTVPP = "seqsmartvpp"
+    PIPELINE_ZBV = "zero_bubble_v"
 
 
 class _AutoParallelContext:
@@ -434,13 +435,6 @@
         """
         self.check_context_handle()
         run_mode = context.get_context("mode")
-        if run_mode == context.PYNATIVE_MODE and parallel_mode not in (
-                context.ParallelMode.DATA_PARALLEL, context.ParallelMode.STAND_ALONE,
-                context.ParallelMode.AUTO_PARALLEL):
-            raise ValueError(f"Pynative only supports STAND_ALONE, DATA_PARALLEL and AUTO_PARALLEL using"
-                             f" sharding_propagation under shard function"
-                             f" for ParallelMode, "
-                             f"but got {parallel_mode.upper()}.")
         ret = self._context_handle.set_parallel_mode(parallel_mode)
         if ret is False:
             raise ValueError("The context configuration parameter 'parallel_mode' only support 'stand_alone', "
@@ -604,9 +598,6 @@
             if not isinstance(dim, int):
                 raise TypeError("For 'set_auto_parallel_context', the element of argument "
                                 "'dataset_strategy' must be int type, but got the type : {} .".format(type(dim)))
-        if context.get_context('mode') == context.PYNATIVE_MODE:
-            raise ValueError("In PyNative mode, the setting value of 'dataset_strategy' must be either 'full_batch' "
-                             f"or 'data_parallel', but got {dataset_strategy}.")
         self._dataset_strategy_using_str = False
         self._context_handle.set_dataset_strategy(dataset_strategy)
 
@@ -646,9 +637,6 @@
                 return "full_batch"
             return "data_parallel"
         dataset_strategy = self._context_handle.get_dataset_strategy()
-        if context.get_context('mode') == context.PYNATIVE_MODE:
-            raise ValueError("In PyNative mode, the value of 'dataset_strategy' must be either 'full_batch' "
-                             f"or 'data_parallel', but got the setting value is {dataset_strategy}.")
         return dataset_strategy
 
     def set_grad_accumulation_step(self, grad_accumulation_step):
@@ -662,7 +650,7 @@
         raise ValueError("The interface is deprecated. To use gradient accumulation, "
                          "please use GradAccumulationCell in mindspore.nn.wrap.cell_wrapper.")
         self.check_context_handle()
-        Validator.check_positive_int(grad_accumulation_step)
+        Validator.check_positive_int(grad_accumulation_step, prim_name='grad_accumulation_step')
        self._context_handle.set_grad_accumulation_step(grad_accumulation_step)
 
     def get_grad_accumulation_step(self):
@@ -998,6 +986,8 @@
                                        _PipelineScheduler.PIPELINE_GPIPE,
                                        _PipelineScheduler.PIPELINE_SEQPIPE,
                                        _PipelineScheduler.PIPELINE_SEQVPP,
+                                       _PipelineScheduler.PIPELINE_SEQSMARTVPP,
+                                       _PipelineScheduler.PIPELINE_ZBV,
                                        _PipelineScheduler.PIPELINE_SEQSMARTVPP])
         scheduler_val = pipeline_config[pp_scheduler]
         if not pipeline_config[pp_interleave] and scheduler_val != _PipelineScheduler.PIPELINE_1F1B:
@@ -1072,7 +1062,7 @@
 
         if threshold_name in parallel_optimizer_config:
             Validator.check_non_negative_int(
-                parallel_optimizer_config[threshold_name])
+                parallel_optimizer_config[threshold_name], prim_name=threshold_name)
             self._context_handle.set_parallel_optimizer_threshold(
                 parallel_optimizer_config[threshold_name])
 
@@ -263,7 +263,7 @@ def _single_parameter_broadcast(net, layout, param_not_load=None, param_loaded=N
     if not single_params:
         return
     param_redundancy_reversed = _get_param_redundancy_reversed(param_redundancy, cur_rank)
-    if not param_redundancy_reversed
+    if not param_redundancy_reversed:
         return
     net_param_dict = net.parameters_dict()
     _chang_parallel_context(origin_dataset_strategy)
@@ -144,8 +144,7 @@ def _build_protobuf_strategy(strategy_filename):
     parallel_strategy_map = _load_protobuf_strategy(strategy_filename)
     layout_items = parallel_strategy_map.parallel_layout_item
     if not layout_items:
-
-                         f"parameter, please check whether the 'strategy_filename' is correct.")
+        return {}
 
     strategy = {}
     for layout_item in layout_items:
@@ -159,6 +158,8 @@ def _build_json_strategy(strategy_filename):
     """build strategy from json file"""
     with open(strategy_filename, 'r') as f:
         json_content = json.load(f)
+    if "parallel_layout_item" not in json_content:
+        return {}
     layout_items = json_content.get("parallel_layout_item")
     strategy = {}
     for parameter_name, layout_item in layout_items.items():
@@ -525,10 +526,7 @@ def _make_dir(path, arg_name):
     else:
         ms.log.debug("The directory(%s) doesn't exist, will create it", path)
         try:
-
-            os.umask(permissions << 3 | permissions)
-            mode = permissions << 6
-            os.makedirs(path, mode=mode, exist_ok=True)
+            os.makedirs(path, mode=0o700, exist_ok=True)
             real_path = path
         except PermissionError as e:
             ms.log.critical("No write permission on the directory(%r), error = %r", path, e)
@@ -114,8 +114,8 @@ def _set_ps_context(**kwargs):
             Default: ``False``.
         config_file_path (str): Configuration file path used by recovery. Default: ''.
         enable_ssl (bool): Set PS SSL mode enabled or disabled. Default: ``False``.
-
-
+            When set to False, users need to review and confirm the security of network environment
+            where the distributed job is located.
         client_password (str): Password to decrypt the secret key stored in the client certificate. Default: ''.
         server_password (str): Password to decrypt the secret key stored in the server certificate. Default: ''.
 
mindspore/parallel/_utils.py CHANGED

@@ -14,14 +14,15 @@
 # ============================================================================
 """Utils of auto parallel"""
 import os
+import re
 from time import perf_counter
 from importlib import import_module
 import numpy as np
 import mindspore as ms
 from mindspore import context, log as logger
-from mindspore._c_expression import reset_op_id
+from mindspore._c_expression import reset_op_id
 from mindspore.common.tensor import Tensor
-from mindspore.common.dtype import
+from mindspore.common.dtype import _dtype_to_nptype
 from mindspore.common import dtype as mstype
 from mindspore.communication.management import get_group_size, get_rank
 from mindspore.communication._comm_helper import _is_initialized
@@ -156,7 +157,7 @@ def _is_in_auto_parallel_mode():
 
 
 def _is_parallel_mode():
-    if not _is_initialized()
+    if not _is_initialized():
         return False
     if os.getenv("RUN_MODE") != "predict":
         return False
@@ -173,12 +174,6 @@ def _is_in_hybrid_parallel_mode():
     return _get_parallel_mode() == ms.ParallelMode.HYBRID_PARALLEL
 
 
-def _is_pynative_parallel():
-    parallel_mode = context.get_auto_parallel_context('parallel_mode')
-    return context.get_context('mode') == context.PYNATIVE_MODE and parallel_mode in (
-        context.ParallelMode.SEMI_AUTO_PARALLEL, context.ParallelMode.AUTO_PARALLEL)
-
-
 def _get_full_batch():
     """Get whether to use full_batch."""
     return auto_parallel_context().get_full_batch()
@@ -452,7 +447,7 @@ def _to_full_tensor(elem, global_device_num, global_rank, scaling_sens=None):
                     batchsize_per_device = item
                 else:
                     new_shape += (item,)
-            new_tensor_numpy = np.zeros(new_shape,
+            new_tensor_numpy = np.zeros(new_shape, _dtype_to_nptype(type_))  # pylint:disable=protected-access
             start = stage_rank * batchsize_per_device
             new_tensor_numpy[start: start + batchsize_per_device] = data.asnumpy()
         else:
@@ -466,7 +461,7 @@ def _to_full_tensor(elem, global_device_num, global_rank, scaling_sens=None):
                 end = (stage_rank % dataset_strategy[index][i] + 1) * item
                 s = slice(start, end, 1)
                 slice_index += (s,)
-            new_tensor_numpy = np.zeros(new_shape,
+            new_tensor_numpy = np.zeros(new_shape, _dtype_to_nptype(type_))  # pylint:disable=protected-access
             new_tensor_numpy[slice_index] = data.asnumpy()
             new_tensor = Tensor(new_tensor_numpy, dtype=type_)
             lst.append(new_tensor)
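The two `_to_full_tensor` hunks above switch the zero-buffer allocation to resolve the NumPy dtype through the private `_dtype_to_nptype` helper now imported at the top of the file. A hedged, standalone sketch of the same pattern; `_dtype_to_nptype` is a private helper rather than public API, and the snippet assumes a MindSpore install:

    import numpy as np
    import mindspore as ms
    from mindspore.common import dtype as mstype
    from mindspore.common.dtype import _dtype_to_nptype  # private helper, as imported in the hunk above

    type_ = mstype.float32
    new_shape = (4, 8)
    buffer = np.zeros(new_shape, _dtype_to_nptype(type_))  # float32-typed zero buffer
    full_tensor = ms.Tensor(buffer, dtype=type_)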
@@ -590,11 +585,6 @@ def _reset_op_id():
     reset_op_id()
 
 
-def _reset_op_id_with_offset():
-    """Reset op id with offset."""
-    reset_op_id_with_offset()
-
-
 def _parallel_predict_check():
     """validate parallel model prediction"""
     if _is_in_auto_parallel_mode():
@@ -773,7 +763,7 @@ def _grads_divided_by_device_num_if_recomputation(grads):
     """
     If in pynative parallel and full_batch is True, divide grads by device num to ensure that the gradients is correct.
     """
-    if not
+    if not _get_full_batch():
         return grads
 
     device_num = _get_device_num()
@@ -804,3 +794,30 @@ def _check_rank(cur_rank, initial_rank, pipeline_stages):
         raise ValueError(f"For parameter broadcast, the cur_rank: {cur_rank} is wrong.")
     if initial_rank % (get_group_size() / pipeline_stages) != 0:
         raise ValueError(f"For parameter broadcast, the initial_rank: {initial_rank} is wrong.")
+
+
+def _check_path_safe(path, arg_name):
+    """
+    Check input path string is safe.
+    """
+    illegal_patterns = [
+        r"\.\.",
+        r"//+",
+        r"~",
+        r"^\s*$",
+        r"\./\."
+    ]
+    for pattern in illegal_patterns:
+        if re.search(pattern, path):
+            pattern_info = pattern.replace('\\', '')
+            raise ValueError(f"{arg_name} contains '{pattern_info}' is not safe, please use a safe one.")
+
+
+def _check_path_writable(path):
+    """
+    Check the write permission of the input path.
+    """
+    if not os.path.exists(path):
+        raise RuntimeError(f"{path} Path does not exist.")
+    if not os.access(path, os.W_OK):
+        raise PermissionError(f"Don't have the write permission on the directory {path}.")
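The two validators added above guard checkpoint-transform paths: `_check_path_safe` rejects path strings containing traversal or empty patterns, and `_check_path_writable` verifies the target exists and is writable. A quick illustration of the behaviour; these are private helpers, shown here only to make the checks concrete:

    import os
    from mindspore.parallel._utils import _check_path_safe, _check_path_writable

    _check_path_safe("checkpoints/rank_0", "dst_checkpoints_dir")   # passes: no illegal pattern
    try:
        _check_path_safe("../checkpoints", "dst_checkpoints_dir")   # ".." matches an illegal pattern
    except ValueError as err:
        print(err)

    # Raises RuntimeError if the path is missing, PermissionError if it is not writable.
    _check_path_writable(os.getcwd())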
@@ -14,6 +14,7 @@
 # ============================================================================
 """Cell of auto parallel"""
 import os
+from mindspore import jit
 from mindspore.nn.cell import Cell
 from mindspore.parallel.shard import Layout
 from mindspore.communication.management import get_rank, get_group_size
@@ -281,7 +282,8 @@ class AutoParallel(Cell):
         Note:
             - It only works when `parallel_mode=sharding_propagation`.
            - When performing distributed training, users can first save the strategy using dryrun on a single device
-              and then load strategy to perform distributed training.
+              and then load strategy to perform distributed training. Note that only the first device of each node will
+              save the strategy file, so the simulated rank id specified by Dryrun must be divisible by 8.
 
         Args:
             file_path (str): Path to save parallel strategy json, must be an absolute path.
@@ -511,17 +513,17 @@
             raise ValueError("For 'AutoParallel.pipeline', the argument 'stages' "
                              "must be larger than zero, but got value: {}.".format(stages))
         if not isinstance(output_broadcast, bool):
-            raise TypeError("For 'AutoParallel.pipeline', the argument '
+            raise TypeError("For 'AutoParallel.pipeline', the argument 'output_broadcast' "
                             "must be bool type, but got the type : {}.".format(type(output_broadcast)))
         if not isinstance(interleave, bool):
-            raise TypeError("For 'AutoParallel.pipeline', the argument '
+            raise TypeError("For 'AutoParallel.pipeline', the argument 'interleave' "
                             "must be bool type, but got the type : {}.".format(type(interleave)))
         if not isinstance(scheduler, str):
-            raise TypeError("For 'AutoParallel.pipeline', the argument '
+            raise TypeError("For 'AutoParallel.pipeline', the argument 'scheduler' "
                             "must be str type, but got the type : {}.".format(type(scheduler)))
-        if scheduler not in ("1f1b", "gpipe", "seqpipe", "seqvpp", "seqsmartvpp"):
+        if scheduler not in ("1f1b", "gpipe", "seqpipe", "seqvpp", "seqsmartvpp", "zero_bubble_v"):
             raise ValueError("For 'AutoParallel.pipeline', the argument "
-                             "'scheduler' must be '1f1b'/'gpipe'/'seqpipe'/'seqvpp'/'seqsmartvpp' ," \
+                             "'scheduler' must be '1f1b'/'gpipe'/'seqpipe'/'seqvpp'/'seqsmartvpp'/'zero_bubble_v' ," \
                              " but got the value : {}."
                              .format(scheduler))
         self._pipeline_stages = stages
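The validation above now accepts "zero_bubble_v" as a pipeline scheduler. A hedged usage sketch that relies only on the argument names visible in the checks above; the rest of the `AutoParallel` API is not reproduced here, and `Net` stands in for any user `mindspore.nn.Cell`:

    import mindspore.nn as nn
    from mindspore.parallel.auto_parallel import AutoParallel

    class Net(nn.Cell):
        def construct(self, x):
            return x

    parallel_net = AutoParallel(Net())
    parallel_net.pipeline(stages=4, scheduler="zero_bubble_v")  # "zero_bubble_v" accepted as of 2.7.1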
@@ -665,8 +667,11 @@
                 - recomputation_communication_overlap (bool): Enable overlap between recompute ops and communication ops
                   if True.
                   Default: False.
-                - grad_matmul_communication_overlap (bool):
-                  tensor parallel communication
+                - grad_matmul_communication_overlap (bool, str): When set to ``True``, it indicates that overlap
+                  between dw matmul and tensor parallel communication is enabled. When set to ``False``, it indicates
+                  that this feature is disabled. When set to str, it only optimizes the specified communication
+                  operator types, with operators separated by ``,``. For example, "AlltoAll,AlltoAllV" indicates that
+                  only ``AlltoAll`` and ``AlltoAllV`` are optimized. Default: ``False``.
                 - grad_fa_allgather_overlap (bool): Enable overlap between duplicated allgather by recomputing
                   in sequence parallel and flashattentionscoregrad ops if True. Default: False.
                 - enable_communication_fusion (bool): Enable communication fusion to optimize the number of
@@ -681,7 +686,9 @@
                   and optimizer parallel allgather communication if True. Currently, do not support
                   `O2 <https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.JitConfig.html>`_
                   Default: False.
-                - computation_communication_fusion_level (int): Enable the fusion between compute and communicate
+                - computation_communication_fusion_level (int): Enable the fusion between compute and communicate,
+                  which fuses communication tasks and computing tasks, allows for partial pipelining and parallel
+                  execution of these tasks during operation, thereby enhancing performance.
                   Default: ``0``. Note: This function must be used with Ascend Training Solution 24.0.RC2 or later.
                   This is an experimental configuration, may be changed or canceled in the future.
 
@@ -692,6 +699,12 @@
                   - 2: Apply fusion to backward nodes.
 
                   - 3: Apply fusion to all nodes.
+
+                  .. warning::
+                      After setting ``export MS_ENABLE_LCCL=on``, the fusion operator based on memory semantics will be
+                      used. Please note that this operator is still in an experimental stage and may be changed or
+                      removed in the future.
+
                 - dataset_broadcast_opt_level (int): Optimize the scenario that the dataset repeated reading. Only
                   support O0/O1 jit level. It doesn't work in O2 mode. Default: ``0``.
 
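The option docs above describe entries of the parallel speed-up JSON whose path is registered a few lines further down (`parallel_speed_up_json_path`). A hedged sketch of writing such a file; it assumes the JSON keys match the documented option names, which the hunk itself does not spell out:

    import json

    speed_up_config = {
        # Assumed key names, taken from the option list above.
        "grad_matmul_communication_overlap": "AlltoAll,AlltoAllV",  # overlap only these comm ops
        "computation_communication_fusion_level": 2,                # fuse backward nodes only
    }
    with open("parallel_speed_up.json", "w") as f:
        json.dump(speed_up_config, f)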
@@ -735,5 +748,6 @@
         self._transformer_opt_config = file_path
         ctx.ascend_config['parallel_speed_up_json_path'] = file_path
 
+    @jit
     def construct(self, *args, **kwargs):
         return self.network(*args, **kwargs)
@@ -31,7 +31,7 @@ from mindspore.communication.management import get_rank, get_group_size
 from mindspore.parallel._tensor import _load_tensor, _reshape_param_data, _reshape_param_data_with_weight, \
     _get_tensor_slice_index, _get_tensor_strategy
 from mindspore.parallel._utils import _is_in_auto_parallel_mode, _get_pipeline_stages, _infer_rank_list, \
-    _remove_repeated_slices, _get_auto_parallel_net
+    _remove_repeated_slices, _get_auto_parallel_net, _check_path_safe, _check_path_writable
 from mindspore.parallel._parallel_serialization import _rank_list_for_transform_parallel_checkpoint, \
     _transform_parallel_checkpoint, _get_device_num_from_strategy, _make_dir, _build_searched_strategy, \
     _extract_layout_map, _extract_src_dst_layout_map, _parameter_not_in_local_stage, _extract_pipeline_stage_num, \
@@ -69,7 +69,9 @@ def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
         >>> ms.parallel.merge_pipeline_strategys("./src_strategy_dir", "./dst_strategy.ckpt")
 
     """
-
+    dst_strategy_file = os.path.normpath(dst_strategy_file)
+    dst_strategy_file = os.path.abspath(dst_strategy_file)
+    dst_strategy_dir = os.path.dirname(dst_strategy_file)
     if not os.path.exists(dst_strategy_dir):
         _make_dir(dst_strategy_dir, "path")
     if not os.path.isdir(src_strategy_dirs):
@@ -495,6 +497,9 @@ def _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckp
 def _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
                            dst_strategy_file=None):
     """Transform checkpoints for all stages in src_strategy_file"""
+    _check_path_safe(dst_checkpoints_dir, "dst_checkpoints_dir")
+    dst_checkpoints_dir = os.path.realpath(dst_checkpoints_dir)
+    _check_path_safe(ckpt_prefix, "ckpt_prefix")
     checkpoints_rank_dir_list = os.path.join(src_checkpoints_dir, "rank_[0-9]*")
     all_checkpoint_files_map = {}
     for checkpoint_dir in glob.glob(checkpoints_rank_dir_list):
@@ -563,6 +568,7 @@
         save_checkpoint_file_dir = os.path.join(dst_checkpoints_dir, "rank_{}".format(transform_rank))
         if not os.path.exists(save_checkpoint_file_dir):
             _make_dir(save_checkpoint_file_dir, "path")
+        _check_path_writable(save_checkpoint_file_dir)
         save_checkpoint_file_name = os.path.join(save_checkpoint_file_dir, save_checkpoint_file)
         ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)
         del param_total_dict_copy
@@ -913,6 +919,15 @@ def set_op_strategy_config(mode="SAVE", path=""):
     if file_type != ".json":
         raise KeyError("File type must be .json")
     dir_path = os.path.dirname(path)
+
+    normalized_path = os.path.abspath(os.path.realpath(path))
+    dangerous_paths = ['/etc', '/usr', '/bin', '/sbin', '/boot', '/proc', '/sys']
+    for dangerous_path in dangerous_paths:
+        if normalized_path.startswith(dangerous_path):
+            raise PermissionError(
+                f"Writing to system directory '{dangerous_path}' is not allowed"
+            )
+
     if dir_path and not os.path.exists(dir_path):
         os.makedirs(dir_path, mode=0o700, exist_ok=True)
     check_mode_type = ["SAVE", "LOAD"]
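The new guard in `set_op_strategy_config` above refuses to write strategy JSON into system directories. The check reduces to a prefix test on the resolved absolute path, illustrated below:

    import os

    DANGEROUS_PATHS = ['/etc', '/usr', '/bin', '/sbin', '/boot', '/proc', '/sys']

    def is_allowed(path):
        # Mirrors the check above: resolve the path, then compare against blocked prefixes.
        normalized = os.path.abspath(os.path.realpath(path))
        return not any(normalized.startswith(p) for p in DANGEROUS_PATHS)

    print(is_allowed("/home/worker/strategies/op_strategy.json"))  # True on POSIX systems
    print(is_allowed("/etc/op_strategy.json"))                     # False: /etc is blocked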
@@ -1165,6 +1180,8 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
         train_strategy_filename = ms.context.get_auto_parallel_context("strategy_ckpt_load_file")
 
     _train_strategy = build_searched_strategy(train_strategy_filename)
+    if not _train_strategy:
+        return True
     train_strategy = _convert_to_list(_train_strategy)
 
     train_dev_count = 1
@@ -1180,6 +1197,7 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
 
     param_total_dict = defaultdict(dict)
     for file_index, file_name in enumerate(checkpoint_filenames):
+        file_name = os.path.abspath(file_name)
         ckpt_dict = ms.load_checkpoint(file_name, dec_key=dec_key, dec_mode=dec_mode)
         for param_name, param in ckpt_dict.items():
             param_total_dict[param_name][file_index] = param
@@ -21,6 +21,7 @@ import subprocess
 import socket
 import psutil
 import mindspore.log as logger
+from mindspore.utils import RSCPluginHandle
 from ._utils import _generate_cmd_args_list, _generate_cmd_args_list_with_core, _generate_url, \
     _is_local_ip, _convert_addr_to_ip, _send_scale_num, _get_local_ip, _generate_auto_bind_core_strategy, \
     _generate_bind_core_strategy
@@ -179,9 +180,12 @@ class _ProcessManager:
         self.is_simulation = self.sim_level != -1
         if self.is_simulation:
             os.environ["MS_SIMULATION_LEVEL"] = str(self.sim_level)
+            if self.sim_rank_id == -1:
+                self.sim_rank_id = int(os.getenv("RANK_ID", "-1"))
         elif os.getenv("MS_SIMULATION_LEVEL"):
             self.is_simulation = True
-            self.sim_rank_id
+            if self.sim_rank_id == -1:
+                self.sim_rank_id = int(os.getenv("RANK_ID", "-1"))
         if os.getenv("RANK_SIZE"):
             self.exported_rank_size = os.getenv("RANK_SIZE")
         # If sim_rank_id is set, single worker can be started.
@@ -218,23 +222,28 @@
 
         self.proc_rank_map = {}
         self.enable_mindx = False
+        self.handler = None
         self._check_taskd()
 
     def _check_taskd(self):
         """check if enable taskd."""
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self.handler = RSCPluginHandle()
+        self.enable_mindx = self.handler.check_enable()
+        if self.enable_mindx is False:
+            self.handler = None
+            return
+        ret = self.handler.register_callback({"KILL_WORKER": self.kill_workers,
+                                              "START_ALL_WORKER": self.start_all_workers,
+                                              "START_WORKER_LIST": self.start_worker_list,
+                                              "MONITOR": self.monitor_rank_status
+                                              })
+        if not ret:
+            logger.warning(f"Register callback to mindx failed, process controlled by msrun.")
+            self.enable_mindx = False
+            self.handler = None
+            return
+        logger.warning(f"Mindx enabled, process controlled by mindx.")
+        os.environ["MS_ENABLE_RECOVERY"] = str(1)
 
     def run(self):
         """
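The rewritten `_check_taskd` above hands worker lifecycle control to an `RSCPluginHandle` imported from `mindspore.utils`. Based only on the calls visible in this diff, the handle is expected to expose roughly the following surface; this is an inferred sketch, not the actual implementation:

    class RSCPluginHandleSketch:
        """Assumed interface, inferred from the calls in the hunks above."""

        def check_enable(self) -> bool:
            # True when the MindX/taskd plugin is present and enabled.
            return False

        def register_callback(self, callbacks: dict) -> bool:
            # Receives msrun callbacks such as KILL_WORKER / START_ALL_WORKER; True on success.
            return bool(callbacks)

        def start(self) -> None:
            # Hands worker start-up over to the external controller.
            pass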
@@ -257,7 +266,7 @@
         if self.is_master and not self.is_simulation:
             self.start_scheduler()
         if self.enable_mindx:
-            self.
+            self.handler.start()
         else:
             self.start_workers()
         if self.join:
@@ -379,8 +388,7 @@
             logger.error(f"Scheduler process {self.msn_process.pid} exit with exception.")
 
         if has_exception:
-
-            self._analyze_log()
+            self._analyze_sched_log()
             raise RuntimeError("Distributed job exited with exception. Please check logs in "
                                f"directory: {self.log_dir}.")
 
@@ -580,26 +588,13 @@
         log_name = os.path.join(self.log_dir, formatted_log_name + "_" + str(index) + ".log")
         return node_id, log_name
 
-    def
+    def _analyze_sched_log(self):
         """
-        Analyze
+        Analyze scheduler log.
         """
         scheduler_log_path = os.path.join(self.log_dir, "scheduler.log")
-        time_out_node_ids = []
         if os.path.exists(scheduler_log_path):
-
-            scheduler_log = log.read()
-            # Filter out abnormal logs.
-            time_out_node_log = re.findall(r"node: .* is timed out", scheduler_log)
-
-            # Filter out node ids of the processes which exit abnormally.
-            def node_id_splitter(node_id):
-                return re.split(" is timed out", re.split("node: ", node_id)[1])[0]
-            for node_id in time_out_node_log:
-                time_out_node_ids.append(node_id_splitter(node_id))
-            logger.error(f"Time out nodes are {time_out_node_ids}")
-
-        os.system(f"grep -rn -E 'ERROR|CRITICAL|Traceback|Error' -C 5 {self.log_dir}")
+            os.system(f"cat {scheduler_log_path} | grep -E 'ERROR|CRITICAL|Traceback|Error' -C 5")
 
     def format_worker_log_name(self):
         """