mindspore 2.6.0rc1__cp311-cp311-win_amd64.whl → 2.7.0__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
- mindspore/Newtonsoft.Json.dll +0 -0
- mindspore/__init__.py +2 -2
- mindspore/_c_dataengine.cp311-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp311-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp311-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +42 -11
- mindspore/_extends/builtin_operations.py +3 -3
- mindspore/{_deprecated → _extends/optimize}/__init__.py +9 -3
- mindspore/_extends/optimize/cell_utils.py +96 -0
- mindspore/_extends/parallel_compile/akg_compiler/custom.py +1109 -0
- mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
- mindspore/_extends/parse/__init__.py +3 -3
- mindspore/_extends/parse/compile_config.py +44 -22
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +1 -2
- mindspore/_extends/parse/parser.py +65 -84
- mindspore/_extends/parse/resources.py +39 -0
- mindspore/_extends/parse/standard_method.py +58 -14
- mindspore/_extends/parse/trope.py +8 -1
- mindspore/_extends/pijit/__init__.py +1 -2
- mindspore/_extends/pijit/pijit_func_white_list.py +2 -5
- mindspore/amp.py +4 -22
- mindspore/atlprov.dll +0 -0
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/adasum.py +1 -1
- mindspore/boost/boost_cell_wrapper.py +4 -4
- mindspore/c1.dll +0 -0
- mindspore/c1xx.dll +0 -0
- mindspore/c2.dll +0 -0
- mindspore/common/__init__.py +43 -12
- mindspore/common/_grad_function.py +2 -1
- mindspore/common/_pijit_context.py +28 -7
- mindspore/common/_stub_tensor.py +1 -209
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +178 -53
- mindspore/common/_utils.py +9 -1
- mindspore/common/api.py +377 -203
- mindspore/common/dtype.py +108 -57
- mindspore/common/dump.py +11 -16
- mindspore/common/dynamic_shape/__init__.py +0 -0
- mindspore/common/{auto_dynamic_shape.py → dynamic_shape/auto_dynamic_shape.py} +17 -23
- mindspore/common/dynamic_shape/enable_dynamic.py +197 -0
- mindspore/common/file_system.py +59 -9
- mindspore/common/generator.py +5 -3
- mindspore/common/hook_handle.py +33 -5
- mindspore/common/jit_config.py +1 -1
- mindspore/common/jit_trace.py +84 -105
- mindspore/common/np_dtype.py +3 -3
- mindspore/common/parameter.py +27 -29
- mindspore/common/recompute.py +5 -7
- mindspore/common/sparse_tensor.py +0 -3
- mindspore/common/symbol.py +0 -1
- mindspore/common/tensor.py +117 -131
- mindspore/communication/_comm_helper.py +46 -4
- mindspore/communication/management.py +79 -7
- mindspore/context.py +67 -55
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/transforms.py +1 -1
- mindspore/dataset/core/config.py +38 -4
- mindspore/dataset/engine/datasets.py +350 -322
- mindspore/dataset/engine/datasets_user_defined.py +70 -24
- mindspore/dataset/engine/iterators.py +2 -2
- mindspore/dataset/engine/obs/config_loader.py +2 -2
- mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +8 -0
- mindspore/dataset/transforms/c_transforms.py +2 -2
- mindspore/dataset/transforms/py_transforms.py +7 -3
- mindspore/dataset/transforms/transforms.py +10 -6
- mindspore/dataset/vision/__init__.py +1 -1
- mindspore/dataset/vision/py_transforms.py +8 -8
- mindspore/dataset/vision/transforms.py +17 -5
- mindspore/dataset/vision/utils.py +632 -21
- mindspore/dataset/vision/validators.py +1 -0
- mindspore/device_context/ascend/device.py +1 -1
- mindspore/device_context/ascend/op_tuning.py +35 -1
- mindspore/device_context/gpu/__init__.py +2 -2
- mindspore/device_context/gpu/device.py +1 -1
- mindspore/device_context/gpu/op_precision.py +4 -2
- mindspore/device_context/gpu/op_tuning.py +6 -3
- mindspore/device_manager.py +16 -9
- mindspore/dnnl.dll +0 -0
- mindspore/dpcmi.dll +0 -0
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +3 -4
- mindspore/experimental/llm_boost/atb/boost_base.py +2 -3
- mindspore/experimental/optim/adadelta.py +13 -20
- mindspore/experimental/optim/adagrad.py +15 -22
- mindspore/experimental/optim/adam.py +17 -24
- mindspore/experimental/optim/adamax.py +14 -22
- mindspore/experimental/optim/adamw.py +28 -34
- mindspore/experimental/optim/asgd.py +15 -25
- mindspore/experimental/optim/lr_scheduler.py +27 -45
- mindspore/experimental/optim/nadam.py +14 -24
- mindspore/experimental/optim/optimizer.py +13 -23
- mindspore/experimental/optim/radam.py +18 -24
- mindspore/experimental/optim/rmsprop.py +14 -25
- mindspore/experimental/optim/rprop.py +15 -26
- mindspore/experimental/optim/sgd.py +9 -19
- mindspore/hal/__init__.py +4 -4
- mindspore/hal/contiguous_tensors_handle.py +2 -2
- mindspore/hal/memory.py +27 -7
- mindspore/include/api/cell.h +65 -5
- mindspore/include/api/cfg.h +24 -7
- mindspore/include/api/context.h +1 -0
- mindspore/include/api/delegate.h +10 -2
- mindspore/include/api/dual_abi_helper.h +100 -19
- mindspore/include/api/graph.h +14 -1
- mindspore/include/api/kernel.h +16 -3
- mindspore/include/api/kernel_api.h +9 -1
- mindspore/include/api/metrics/accuracy.h +9 -0
- mindspore/include/api/model.h +8 -1
- mindspore/include/api/model_group.h +4 -0
- mindspore/include/api/model_parallel_runner.h +2 -0
- mindspore/include/api/status.h +48 -10
- mindspore/include/api/types.h +8 -3
- mindspore/include/c_api/model_c.h +0 -58
- mindspore/include/c_api/tensor_c.h +0 -26
- mindspore/include/dataset/constants.h +9 -0
- mindspore/include/dataset/vision_ascend.h +1 -1
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/tools/cifar10.py +61 -11
- mindspore/mindrecord/tools/cifar10_to_mr.py +5 -0
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_cpu_res_manager.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mindspore_ops_host.dll +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mint/__init__.py +6 -46
- mindspore/mint/distributed/__init__.py +5 -0
- mindspore/mint/distributed/distributed.py +429 -23
- mindspore/mint/nn/__init__.py +1 -1
- mindspore/mint/nn/functional.py +53 -6
- mindspore/mint/nn/layer/_functions.py +163 -294
- mindspore/mint/nn/layer/activation.py +8 -6
- mindspore/mint/nn/layer/conv.py +140 -104
- mindspore/mint/nn/layer/normalization.py +11 -25
- mindspore/mint/optim/adam.py +19 -18
- mindspore/mint/optim/adamw.py +14 -8
- mindspore/mint/optim/sgd.py +5 -5
- mindspore/msobj140.dll +0 -0
- mindspore/mspdb140.dll +0 -0
- mindspore/mspdbcore.dll +0 -0
- mindspore/mspdbst.dll +0 -0
- mindspore/mspft140.dll +0 -0
- mindspore/msvcdis140.dll +0 -0
- mindspore/msvcp140_1.dll +0 -0
- mindspore/msvcp140_2.dll +0 -0
- mindspore/msvcp140_atomic_wait.dll +0 -0
- mindspore/msvcp140_codecvt_ids.dll +0 -0
- mindspore/nn/cell.py +491 -623
- mindspore/nn/grad/cell_grad.py +11 -12
- mindspore/nn/layer/activation.py +36 -36
- mindspore/nn/layer/basic.py +74 -77
- mindspore/nn/layer/channel_shuffle.py +4 -4
- mindspore/nn/layer/combined.py +4 -2
- mindspore/nn/layer/conv.py +117 -110
- mindspore/nn/layer/dense.py +9 -7
- mindspore/nn/layer/embedding.py +50 -52
- mindspore/nn/layer/image.py +38 -40
- mindspore/nn/layer/math.py +111 -112
- mindspore/nn/layer/normalization.py +56 -44
- mindspore/nn/layer/pooling.py +58 -63
- mindspore/nn/layer/rnn_cells.py +33 -33
- mindspore/nn/layer/rnns.py +56 -56
- mindspore/nn/layer/thor_layer.py +74 -73
- mindspore/nn/layer/transformer.py +11 -1
- mindspore/nn/learning_rate_schedule.py +20 -20
- mindspore/nn/loss/loss.py +79 -81
- mindspore/nn/optim/adam.py +4 -6
- mindspore/nn/optim/adasum.py +2 -2
- mindspore/nn/optim/asgd.py +2 -0
- mindspore/nn/optim/lamb.py +1 -3
- mindspore/nn/optim/optimizer.py +1 -1
- mindspore/nn/optim/tft_wrapper.py +2 -3
- mindspore/nn/optim/thor.py +2 -2
- mindspore/nn/probability/distribution/_utils/utils.py +2 -2
- mindspore/nn/probability/distribution/exponential.py +2 -1
- mindspore/nn/probability/distribution/poisson.py +2 -1
- mindspore/nn/sparse/sparse.py +3 -3
- mindspore/nn/wrap/cell_wrapper.py +73 -42
- mindspore/nn/wrap/grad_reducer.py +37 -52
- mindspore/nn/wrap/loss_scale.py +72 -74
- mindspore/numpy/array_creations.py +7 -7
- mindspore/numpy/fft.py +1 -1
- mindspore/numpy/math_ops.py +5 -5
- mindspore/numpy/utils_const.py +1 -1
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/_grad_experimental/grad_comm_ops.py +51 -13
- mindspore/ops/_grad_experimental/grad_debug_ops.py +14 -0
- mindspore/ops/_grad_experimental/grad_inner_ops.py +0 -9
- mindspore/ops/_op_impl/cpu/__init__.py +1 -0
- mindspore/{experimental/es/__init__.py → ops/_op_impl/cpu/joinedstr_op.py} +12 -6
- mindspore/ops/_vmap/vmap_array_ops.py +31 -13
- mindspore/ops/_vmap/vmap_nn_ops.py +8 -16
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +54 -13
- mindspore/ops/auto_generate/gen_extend_func.py +27 -145
- mindspore/ops/auto_generate/gen_ops_def.py +1027 -347
- mindspore/ops/auto_generate/gen_ops_prim.py +2341 -1117
- mindspore/ops/auto_generate/pyboost_inner_prim.py +31 -1
- mindspore/ops/composite/__init__.py +10 -0
- mindspore/ops/composite/base.py +9 -5
- mindspore/ops/composite/multitype_ops/__init__.py +12 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +133 -109
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
- mindspore/ops/composite/multitype_ops/add_impl.py +70 -2
- mindspore/ops/composite/multitype_ops/div_impl.py +49 -0
- mindspore/ops/composite/multitype_ops/floordiv_impl.py +29 -0
- mindspore/ops/composite/multitype_ops/getitem_impl.py +11 -0
- mindspore/ops/composite/multitype_ops/mod_impl.py +5 -3
- mindspore/ops/composite/multitype_ops/mul_impl.py +49 -0
- mindspore/ops/composite/multitype_ops/setitem_impl.py +57 -0
- mindspore/ops/composite/multitype_ops/sub_impl.py +34 -0
- mindspore/ops/composite/multitype_ops/zeros_like_impl.py +14 -0
- mindspore/ops/function/__init__.py +4 -1
- mindspore/ops/function/_add_attr_func.py +11 -6
- mindspore/ops/function/array_func.py +19 -102
- mindspore/ops/function/debug_func.py +8 -5
- mindspore/ops/function/grad/grad_func.py +5 -13
- mindspore/ops/function/math_func.py +77 -572
- mindspore/ops/function/nn_func.py +46 -94
- mindspore/ops/function/other_func.py +4 -1
- mindspore/ops/function/random_func.py +44 -5
- mindspore/ops/function/vmap_func.py +2 -1
- mindspore/ops/functional.py +4 -4
- mindspore/ops/functional_overload.py +594 -18
- mindspore/ops/op_info_register.py +21 -0
- mindspore/ops/operations/__init__.py +16 -11
- mindspore/ops/operations/_custom_ops_utils.py +689 -34
- mindspore/ops/operations/_inner_ops.py +14 -18
- mindspore/ops/operations/_sequence_ops.py +1 -1
- mindspore/ops/operations/array_ops.py +5 -51
- mindspore/ops/operations/comm_ops.py +186 -41
- mindspore/ops/operations/custom_ops.py +303 -177
- mindspore/ops/operations/debug_ops.py +59 -4
- mindspore/ops/operations/image_ops.py +13 -13
- mindspore/ops/operations/manually_defined/ops_def.py +27 -28
- mindspore/ops/operations/math_ops.py +8 -9
- mindspore/ops/operations/nn_ops.py +8 -40
- mindspore/ops/primitive.py +9 -20
- mindspore/ops/tensor_method.py +63 -15
- mindspore/ops_generate/api/cpp_create_prim_instance_helper_generator.py +1 -1
- mindspore/ops_generate/api/functional_map_cpp_generator.py +10 -9
- mindspore/ops_generate/api/functions_cc_generator.py +58 -10
- mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +1 -1
- mindspore/ops_generate/common/base_generator.py +14 -0
- mindspore/ops_generate/common/gen_constants.py +8 -3
- mindspore/ops_generate/common/gen_utils.py +0 -19
- mindspore/ops_generate/common/op_proto.py +11 -4
- mindspore/ops_generate/common/template.py +88 -11
- mindspore/ops_generate/gen_ops.py +1 -1
- mindspore/ops_generate/op_def/lite_ops_cpp_generator.py +4 -4
- mindspore/ops_generate/op_def/ops_def_cc_generator.py +0 -3
- mindspore/ops_generate/op_def/ops_name_h_generator.py +0 -3
- mindspore/ops_generate/op_def/ops_primitive_h_generator.py +0 -4
- mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -2
- mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +49 -8
- mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +2 -2
- mindspore/ops_generate/pyboost/gen_pyboost_func.py +31 -16
- mindspore/ops_generate/pyboost/op_template_parser.py +98 -72
- mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +70 -273
- mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +14 -6
- mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +316 -0
- mindspore/ops_generate/pyboost/pyboost_functions_py_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +5 -3
- mindspore/ops_generate/pyboost/pyboost_inner_prim_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_internal_functions_cpp_generator.py +76 -0
- mindspore/ops_generate/pyboost/pyboost_internal_functions_h_generator.py +76 -0
- mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +125 -0
- mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +4 -3
- mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +348 -61
- mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_utils.py +118 -9
- mindspore/ops_generate/tensor_py_cc_generator.py +1 -24
- mindspore/parallel/_auto_parallel_context.py +16 -23
- mindspore/parallel/_cell_wrapper.py +113 -45
- mindspore/parallel/_parallel_serialization.py +4 -3
- mindspore/parallel/_ps_context.py +4 -6
- mindspore/parallel/_tensor.py +167 -12
- mindspore/parallel/_transformer/moe.py +1 -1
- mindspore/parallel/_transformer/transformer.py +17 -12
- mindspore/parallel/_utils.py +5 -11
- mindspore/parallel/auto_parallel.py +35 -14
- mindspore/parallel/checkpoint_convert.py +3 -3
- mindspore/parallel/checkpoint_transform.py +13 -7
- mindspore/parallel/cluster/process_entity/_api.py +88 -49
- mindspore/parallel/cluster/process_entity/_utils.py +95 -7
- mindspore/parallel/cluster/run.py +48 -7
- mindspore/parallel/function/__init__.py +8 -1
- mindspore/parallel/function/reshard_func.py +12 -12
- mindspore/parallel/nn/__init__.py +15 -2
- mindspore/parallel/nn/parallel_cell_wrapper.py +50 -14
- mindspore/parallel/nn/parallel_grad_reducer.py +7 -14
- mindspore/parallel/shard.py +10 -25
- mindspore/parallel/transform_safetensors.py +469 -174
- mindspore/pgodb140.dll +0 -0
- mindspore/pgort140.dll +0 -0
- mindspore/profiler/__init__.py +2 -1
- mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +7 -7
- mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +3 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +12 -6
- mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +4 -4
- mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +4 -1
- mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +2 -1
- mindspore/profiler/analysis/task_manager.py +1 -1
- mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +5 -1
- mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +2 -1
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +10 -9
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +43 -23
- mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +3 -2
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +9 -5
- mindspore/profiler/analysis/viewer/ms_operator_details_viewer.py +132 -0
- mindspore/profiler/common/constant.py +16 -0
- mindspore/profiler/common/msprof_cmd_tool.py +2 -2
- mindspore/profiler/common/path_manager.py +9 -0
- mindspore/profiler/common/profiler_context.py +50 -29
- mindspore/profiler/common/profiler_info.py +0 -16
- mindspore/profiler/common/profiler_meta_data.py +1 -0
- mindspore/profiler/common/profiler_op_analyse.py +239 -0
- mindspore/profiler/common/profiler_output_path.py +23 -8
- mindspore/profiler/common/profiler_parameters.py +128 -35
- mindspore/profiler/dynamic_profile/__init__.py +0 -0
- mindspore/profiler/dynamic_profile/dynamic_monitor_proxy.py +39 -0
- mindspore/profiler/dynamic_profile/dynamic_profiler_config_context.py +666 -0
- mindspore/profiler/dynamic_profile/dynamic_profiler_utils.py +62 -0
- mindspore/profiler/dynamic_profiler.py +374 -338
- mindspore/profiler/envprofiler.py +42 -12
- mindspore/profiler/experimental_config.py +112 -7
- mindspore/profiler/mstx.py +33 -12
- mindspore/profiler/platform/__init__.py +2 -3
- mindspore/profiler/platform/cpu_profiler.py +10 -4
- mindspore/profiler/platform/npu_profiler.py +30 -20
- mindspore/profiler/profiler.py +218 -154
- mindspore/profiler/profiler_action_controller.py +65 -77
- mindspore/profiler/profiler_interface.py +2 -2
- mindspore/profiler/schedule.py +10 -4
- mindspore/rewrite/common/config.py +1 -0
- mindspore/rewrite/common/namer.py +1 -0
- mindspore/rewrite/common/namespace.py +1 -0
- mindspore/rewrite/node/node.py +31 -11
- mindspore/rewrite/parsers/assign_parser.py +1 -1
- mindspore/rewrite/symbol_tree/symbol_tree.py +2 -2
- mindspore/run_check/_check_version.py +7 -10
- mindspore/runtime/__init__.py +8 -6
- mindspore/runtime/event.py +10 -4
- mindspore/runtime/executor.py +87 -45
- mindspore/runtime/memory.py +31 -32
- mindspore/runtime/thread_bind_core.py +299 -165
- mindspore/safeguard/rewrite_obfuscation.py +12 -13
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tbbmalloc.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/_utils.py +17 -7
- mindspore/train/amp.py +43 -23
- mindspore/train/callback/__init__.py +5 -5
- mindspore/train/callback/_callback.py +2 -1
- mindspore/train/callback/_checkpoint.py +4 -14
- mindspore/train/callback/_flops_collector.py +11 -7
- mindspore/train/callback/_landscape.py +0 -1
- mindspore/train/callback/_train_fault_tolerance.py +98 -21
- mindspore/train/data_sink.py +15 -6
- mindspore/train/dataset_helper.py +14 -5
- mindspore/train/model.py +133 -69
- mindspore/train/serialization.py +168 -126
- mindspore/train/summary/summary_record.py +13 -2
- mindspore/train/train_thor/model_thor.py +2 -2
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +3 -2
- mindspore/utils/dryrun.py +0 -6
- mindspore/utils/runtime_execution_order_check.py +163 -77
- mindspore/utils/sdc_detect.py +68 -0
- mindspore/utils/utils.py +14 -17
- mindspore/vcmeta.dll +0 -0
- mindspore/vcruntime140.dll +0 -0
- mindspore/vcruntime140_1.dll +0 -0
- mindspore/version.py +1 -1
- {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0.dist-info}/METADATA +5 -4
- {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0.dist-info}/RECORD +403 -442
- mindspore/_deprecated/jit.py +0 -198
- mindspore/_extends/remote/kernel_build_server_ascend.py +0 -75
- mindspore/communication/_hccl_management.py +0 -297
- mindspore/experimental/es/embedding_service.py +0 -891
- mindspore/experimental/es/embedding_service_layer.py +0 -581
- mindspore/profiler/common/validator/__init__.py +0 -14
- mindspore/profiler/common/validator/validate_path.py +0 -84
- mindspore/profiler/parser/__init__.py +0 -14
- mindspore/profiler/parser/aicpu_data_parser.py +0 -272
- mindspore/profiler/parser/ascend_analysis/__init__.py +0 -14
- mindspore/profiler/parser/ascend_analysis/constant.py +0 -71
- mindspore/profiler/parser/ascend_analysis/file_manager.py +0 -180
- mindspore/profiler/parser/ascend_analysis/function_event.py +0 -185
- mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +0 -136
- mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +0 -131
- mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +0 -104
- mindspore/profiler/parser/ascend_analysis/path_manager.py +0 -313
- mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +0 -123
- mindspore/profiler/parser/ascend_analysis/tlv_decoder.py +0 -86
- mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +0 -75
- mindspore/profiler/parser/ascend_cluster_generator.py +0 -116
- mindspore/profiler/parser/ascend_communicate_generator.py +0 -314
- mindspore/profiler/parser/ascend_flops_generator.py +0 -116
- mindspore/profiler/parser/ascend_fpbp_generator.py +0 -82
- mindspore/profiler/parser/ascend_hccl_generator.py +0 -271
- mindspore/profiler/parser/ascend_integrate_generator.py +0 -42
- mindspore/profiler/parser/ascend_memory_generator.py +0 -185
- mindspore/profiler/parser/ascend_msprof_exporter.py +0 -282
- mindspore/profiler/parser/ascend_msprof_generator.py +0 -187
- mindspore/profiler/parser/ascend_op_generator.py +0 -334
- mindspore/profiler/parser/ascend_steptrace_generator.py +0 -94
- mindspore/profiler/parser/ascend_timeline_generator.py +0 -545
- mindspore/profiler/parser/base_timeline_generator.py +0 -483
- mindspore/profiler/parser/container.py +0 -229
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +0 -697
- mindspore/profiler/parser/flops_parser.py +0 -531
- mindspore/profiler/parser/framework_enum.py +0 -111
- mindspore/profiler/parser/framework_parser.py +0 -464
- mindspore/profiler/parser/framework_struct.py +0 -61
- mindspore/profiler/parser/gpu_analysis/__init__.py +0 -14
- mindspore/profiler/parser/gpu_analysis/function_event.py +0 -44
- mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +0 -89
- mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +0 -72
- mindspore/profiler/parser/hccl_parser.py +0 -573
- mindspore/profiler/parser/hwts_log_parser.py +0 -122
- mindspore/profiler/parser/integrator.py +0 -526
- mindspore/profiler/parser/memory_usage_parser.py +0 -277
- mindspore/profiler/parser/minddata_analyzer.py +0 -800
- mindspore/profiler/parser/minddata_parser.py +0 -186
- mindspore/profiler/parser/minddata_pipeline_parser.py +0 -299
- mindspore/profiler/parser/op_intermediate_parser.py +0 -149
- mindspore/profiler/parser/optime_parser.py +0 -250
- mindspore/profiler/parser/profiler_info.py +0 -213
- mindspore/profiler/parser/step_trace_parser.py +0 -666
- mindspore/utils/hooks.py +0 -81
- /mindspore/common/{_auto_dynamic.py → dynamic_shape/_auto_dynamic.py} +0 -0
- {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0.dist-info}/WHEEL +0 -0
- {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0.dist-info}/entry_points.txt +0 -0
- {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0.dist-info}/top_level.txt +0 -0
@@ -582,6 +582,8 @@ def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
         The number of multiprocess settings is related to the size of the host, and it is not recommended to set it
         too large, otherwise it may cause freezing.
 
+        This function does not support converting remove_redundancy's checkpoint file.
+
     Args:
         src_checkpoints_dir (str): The source checkpoints directory.
         dst_checkpoints_dir (str): The destination checkpoints directory to save the converted checkpoints.
@@ -924,8 +926,8 @@ def set_op_strategy_config(mode="SAVE", path=""):
 
 def build_searched_strategy(strategy_filename):
     """
-    Extract the sharding strategy for each parameter in the network
-
+    Extract the sharding strategy for each parameter in the network from the strategy file
+    for distributed inference scenarios.
 
     Args:
         strategy_filename (str): Name of strategy file.
@@ -1025,8 +1027,10 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
         >>> from mindspore.parallel.auto_parallel import AutoParallel
         >>> from mindspore.nn.utils import no_init_parameters
         >>> from mindspore.common.initializer import initializer, One
+        >>> from mindspore.communication.management import get_group_size
         >>>
         >>> step_per_epoch = 4
+        >>> device_num = get_group_size()
         >>>
         >>> # Define the network structure.
         >>> class Net(nn.Cell):
@@ -1070,7 +1074,7 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
         ...     network = AutoParallel(network, parallel_mode="semi_auto")
         ...     network.save_param_strategy_file(file_path="./train_strategy.ckpt")
         ...     model = ms.Model(network=network, loss_fn=net_loss, optimizer=net_opt)
-        ...     ckpt_config = train.CheckpointConfig(keep_checkpoint_max=1, integrated_save=
+        ...     ckpt_config = train.CheckpointConfig(keep_checkpoint_max=1, integrated_save=True)
         ...     global_rank_id = int(os.getenv("RANK_ID"))
         ...     ckpt_path = "./rank_{}_ckpt".format(global_rank_id)
         ...     ckpt_callback = train.ModelCheckpoint(prefix="parallel", directory=ckpt_path, config=ckpt_config)
@@ -1096,10 +1100,10 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
         >>>
         >>> train_net()
         >>> load_model()
-        [[-
-        [
+        [[-9.62929535e+00, -9.76258755e+00, -9.70192051e+00 ... -9.67151260e+00, -9.71998310e+00, -9.64571190e+00],
+        [-4.63218540e-01, -4.07317460e-01, -3.78161550e-01 ... -3.95918339e-01, -2.87363172e-01, -3.48693460e-01],
         ...
-        [
+        [-4.28075647e+00, -4.36630344e+00, -4.25664043e+00 ... -4.32012939e+00, -4.30337954e+00, -4.27571440e+00]]
     """
     if format not in ['safetensors', 'ckpt'] or output_format not in ['safetensors', 'ckpt']:
         raise ValueError(
@@ -1161,6 +1165,8 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
     train_strategy_filename = ms.context.get_auto_parallel_context("strategy_ckpt_load_file")
 
     _train_strategy = build_searched_strategy(train_strategy_filename)
+    if not _train_strategy:
+        return True
     train_strategy = _convert_to_list(_train_strategy)
 
     train_dev_count = 1
@@ -1185,7 +1191,7 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
     param_not_in_ckpt = []
     for _, param in network.parameters_and_names():
         sliced_params = []
-        if param.name not in rank_list
+        if param.name not in rank_list:
            param_not_in_strategy.append(param.name)
            continue
        if param.name not in param_total_dict:
@@ -22,7 +22,8 @@ import socket
 import psutil
 import mindspore.log as logger
 from ._utils import _generate_cmd_args_list, _generate_cmd_args_list_with_core, _generate_url, \
-    _is_local_ip, _convert_addr_to_ip, _send_scale_num, _get_local_ip
+    _is_local_ip, _convert_addr_to_ip, _send_scale_num, _get_local_ip, _generate_auto_bind_core_strategy, \
+    _generate_bind_core_strategy
 
 
 class _Node:
@@ -79,11 +80,12 @@ class _ComputeGraphNode(_Node):
     Worker node for dynamic networking. Inherits from the Node class.
     """
 
-    def __init__(self, worker_num, sched_host, sched_port, timeout, node_id, args_list, output_file,
+    def __init__(self, worker_num, sched_host, sched_port, timeout, node_id, node_rank, args_list, output_file,
                  tail_worker_log, join, is_simulation):
         super().__init__(worker_num, sched_host, sched_port, timeout, args_list, output_file,
                          tail_worker_log, join, is_simulation)
         self.node_id = node_id
+        self.node_rank = node_rank
 
     def run(self):
         """
@@ -95,6 +97,8 @@ class _ComputeGraphNode(_Node):
         super().run()
         if self.node_id is not None:
             os.environ["MS_NODE_ID"] = str(self.node_id)
+        if self.node_rank is not None:
+            os.environ["MS_NODE_RANK"] = str(self.node_rank)
         # If simulation level is set, environment variable 'MS_ROLE' will not be set.
         if not self.is_simulation:
             os.environ["MS_ROLE"] = "MS_WORKER"
@@ -119,6 +123,9 @@ class _ComputeGraphNode(_Node):
         return subprocess.Popen(['/usr/bin/tail', '-f', self.output_file])
 
     def enable_tail_worker_log(self):
+        """
+        Get valid rank ID for tailing the corresponding worker log.
+        """
         tail_worker_log_list = []
         if self.tail_worker_log != "-1":
             tail_worker_log_list.extend([int(num) for num in self.tail_worker_log.split(',')])
@@ -169,12 +176,15 @@ class _ProcessManager:
 
         self.sim_level = args.sim_level
         self.sim_rank_id = args.sim_rank_id
-        self.is_simulation =
+        self.is_simulation = self.sim_level != -1
         if self.is_simulation:
             os.environ["MS_SIMULATION_LEVEL"] = str(self.sim_level)
+            if self.sim_rank_id == -1:
+                self.sim_rank_id = int(os.getenv("RANK_ID", "-1"))
         elif os.getenv("MS_SIMULATION_LEVEL"):
             self.is_simulation = True
-            self.sim_rank_id
+            if self.sim_rank_id == -1:
+                self.sim_rank_id = int(os.getenv("RANK_ID", "-1"))
         if os.getenv("RANK_SIZE"):
             self.exported_rank_size = os.getenv("RANK_SIZE")
         # If sim_rank_id is set, single worker can be started.
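The rewritten initialization makes the dryrun configuration explicit: --sim_level switches simulation on, and a missing --sim_rank_id falls back to the exported RANK_ID. A minimal standalone sketch of that precedence (the helper name is illustrative, not msrun internals):

    import os

    def resolve_sim_config(sim_level=-1, sim_rank_id=-1):
        # --sim_level from the CLI wins; otherwise MS_SIMULATION_LEVEL enables dryrun.
        is_simulation = sim_level != -1 or bool(os.getenv("MS_SIMULATION_LEVEL"))
        if is_simulation and sim_rank_id == -1:
            # Fall back to the exported RANK_ID, keeping -1 when neither is set.
            sim_rank_id = int(os.getenv("RANK_ID", "-1"))
        return is_simulation, sim_rank_id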
@@ -205,15 +215,24 @@ class _ProcessManager:
         finally:
             os.umask(origin_mask)
 
+        self.device_to_cpu_map = {}
+        if self.bind_core is True:
+            self.device_to_cpu_map = _generate_auto_bind_core_strategy(self.local_worker_num)
+
         self.proc_rank_map = {}
         self.enable_mindx = False
+        self._check_taskd()
+
+    def _check_taskd(self):
+        """check if enable taskd."""
         tft_env = os.getenv("MS_ENABLE_TFT", "")
-        if (
+        if any(v in tft_env for v in ('TTP:1', 'UCE:1', 'ARF:1', 'TSP:1', 'RSC:1', 'HCCE:1')):
             try:
                 from taskd.python.framework.agent.ms_mgr.msrun_plugin import MSRunPlugin
                 self.msmgr = MSRunPlugin()
                 self.msmgr.register_callbacks("KILL_WORKER", self.kill_workers)
                 self.msmgr.register_callbacks("START_ALL_WORKER", self.start_all_workers)
+                self.msmgr.register_callbacks("START_WORKER_LIST", self.start_worker_list)
                 self.msmgr.register_callbacks("MONITOR", self.monitor_rank_status)
                 self.enable_mindx = True
                 os.environ["MS_ENABLE_RECOVERY"] = str(1)
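The extracted _check_taskd helper replaces an unwieldy inline condition: the taskd plugin loads only when at least one fault-tolerance switch appears in MS_ENABLE_TFT. A standalone sketch of the check (constant and function names are illustrative):

    FLAGS = ('TTP:1', 'UCE:1', 'ARF:1', 'TSP:1', 'RSC:1', 'HCCE:1')

    def taskd_enabled(tft_env: str) -> bool:
        # True when any fault-tolerance switch is turned on.
        return any(v in tft_env for v in FLAGS)

    assert taskd_enabled("TTP:1,UCE:1")
    assert not taskd_enabled("TTP:0,UCE:0")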
@@ -261,6 +280,45 @@ class _ProcessManager:
                                 self.is_simulation)
         self.msn_process = msn.run()
 
+    def _start_single_worker(self, local_rank):
+        """
+        Start worker processor
+
+        Args:
+            local_rank: local rank id.
+        """
+        os.environ["DEVICE_ID"] = str(local_rank)
+        node_id, log_name = self._get_node_id_and_log_path(local_rank)
+        if node_id is None:
+            logger.warning(f"Rank ids will be assigned automatically, "
+                           "please use 'grep -rn 'rank id:' command to check each worker log's rank id.")
+        else:
+            # If node_id is generated in '_get_node_id_and_log_path' method, export 'RANK_ID' environment variable.
+            # This is for rank_table method's compatibility consideration.
+            os.environ["RANK_ID"] = str(node_id)
+            print(f"Start worker process with rank id:{node_id}, log file:{log_name}. "
+                  f"Environment variable [RANK_ID={node_id}] is exported.", flush=True)
+        if self.is_simulation and (self.sim_rank_id != -1):
+            # Reset RANK_ID env to sim_rank_id if sim_rank_id is set.
+            os.environ["RANK_ID"] = str(self.sim_rank_id)
+            logger.warning(f"In dryrun case, RANK_ID is assigned to {self.sim_rank_id}.")
+
+        if self.bind_core:
+            affinity_cpu_str = _generate_bind_core_strategy(local_rank, self.device_to_cpu_map, self.bind_core)
+            if affinity_cpu_str is not None:
+                cmd = _generate_cmd_args_list_with_core(self.cmd, self.cmd_args, affinity_cpu_str)
+            else:
+                cmd = _generate_cmd_args_list(self.cmd, self.cmd_args)
+        else:
+            cmd = _generate_cmd_args_list(self.cmd, self.cmd_args)
+        cgn = _ComputeGraphNode(self.worker_num, self.master_addr, self.master_port, self.cluster_time_out,
+                                node_id, self.node_rank, cmd, log_name, self.tail_worker_log, self.join,
+                                self.is_simulation)
+        process, tail_process = cgn.run()
+        self.cgn_processes.append(process)
+        self.tail_cgn_processes.append(tail_process)
+        self.proc_rank_map[local_rank] = process
+
     def start_workers(self):
         """
         Starts the worker nodes.
@@ -275,40 +333,8 @@ class _ProcessManager:
                            "'rank_id' of each process will be assigned after cluster is successfully built.\n"
                            "You can access 'RANK_ID' environment variable after calling "
                            "'mindspore.communication.init()'")
-
         for i in range(self.local_worker_num):
-
-            node_id, log_name = self._get_node_id_and_log_path(i)
-            if node_id is None:
-                logger.warning(f"Rank ids will be assigned automatically, "
-                               "please use 'grep -rn 'rank id:' command to check each worker log's rank id.")
-            else:
-                # If node_id is generated in '_get_node_id_and_log_path' method, export 'RANK_ID' environment variable.
-                # This is for rank_table method's compatibility consideration.
-                os.environ["RANK_ID"] = str(node_id)
-                print(f"Start worker process with rank id:{node_id}, log file:{log_name}. "
-                      f"Environment variable [RANK_ID={node_id}] is exported.", flush=True)
-            if self.is_simulation and (self.sim_rank_id != -1):
-                # Reset RANK_ID env to sim_rank_id if sim_rank_id is set.
-                os.environ["RANK_ID"] = str(self.sim_rank_id)
-                logger.warning(f"In dryrun case, RANK_ID is assigned to {self.sim_rank_id}.")
-
-            if self.bind_core:
-                cpu_num = subprocess.getoutput("cat /proc/cpuinfo|grep processor|wc -l")
-                if not cpu_num.isdigit():
-                    raise RuntimeError(f"Got cpu number from '/proc/cpuinfo' is {cpu_num}, failed to bind core.")
-                avg = int(cpu_num) // self.local_worker_num
-                cpu_start = avg * i
-                cpu_end = cpu_start + avg - 1
-                cmd = _generate_cmd_args_list_with_core(self.cmd, self.cmd_args, cpu_start, cpu_end)
-            else:
-                cmd = _generate_cmd_args_list(self.cmd, self.cmd_args)
-            cgn = _ComputeGraphNode(self.worker_num, self.master_addr, self.master_port, self.cluster_time_out,
-                                    node_id, cmd, log_name, self.tail_worker_log, self.join, self.is_simulation)
-            process, tail_process = cgn.run()
-            self.cgn_processes.append(process)
-            self.tail_cgn_processes.append(tail_process)
-            self.proc_rank_map[i] = process
+            self._start_single_worker(i)
 
     def join_processes(self):
         """
@@ -334,7 +360,7 @@ class _ProcessManager:
                     continue
                 elif ret_code != 0:
                     has_exception = True
-                    logger.error(f"Worker process {p.pid} exit with exception.")
+                    logger.error(f"Worker process {p.pid} exit with exception. Error code: {ret_code}.")
                     break
             else:
                 success_cgn_processes.add(p)
@@ -420,14 +446,9 @@ class _ProcessManager:
         Args:
             NA.
         """
-
-
-            p.kill()
+        self.kill_worker_processes()
+        self.kill_tail_log_processes()
         self.cgn_processes.clear()
-
-        for p in self.tail_cgn_processes:
-            if p is not None:
-                p.kill()
         self.tail_cgn_processes.clear()
 
     def kill_single_worker(self, pid):
@@ -441,7 +462,7 @@ class _ProcessManager:
         for i in range(len(self.cgn_processes)):
             p = self.cgn_processes[i]
             if p.pid == pid and p.poll() is None:
-                p.
+                os.killpg(os.getpgid(p.pid), signal.SIGKILL)
                 del self.cgn_processes[i]
                 tail_p = self.tail_cgn_processes[i]
                 if tail_p is not None:
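Replacing Popen.kill() with os.killpg(os.getpgid(p.pid), signal.SIGKILL) tears down the worker's whole process group rather than just the group leader, so dataloader workers or other children forked by the training script cannot outlive a targeted kill; this presumes each worker process was launched as its own process-group leader.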
@@ -499,7 +520,8 @@ class _ProcessManager:
             p_status = p.poll()
             if (not psutil.pid_exists(p.pid)) and (p_status != 0):
                 p_status = 300
-            return {"pid": p.pid, "status": p_status, "global_rank": global_rank_id
+            return {"pid": p.pid, "status": p_status, "global_rank": global_rank_id, "local_rank": rank_id,
+                    "node_id": self.node_rank}
         except KeyError:
             logger.info(f"Process rank {rank_id} has not been initialized.")
             return {"pid": None, "status": 200, "global_rank": global_rank_id}
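With the two extra keys, one status entry is enough to locate a worker across the cluster. A healthy, still-running process would report something like (values illustrative):

    {"pid": 12345, "status": None, "global_rank": 11, "local_rank": 3, "node_id": 1}

Here "status" is the raw Popen.poll() result, so None means alive, a non-zero exit code means failure, and 300 is the sentinel set above for a pid that has already vanished.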
@@ -519,7 +541,24 @@ class _ProcessManager:
         self.start_workers()
         worker_status = self.monitor_rank_status([-1])
         for i in range(self.local_worker_num):
-            if worker_status[i]["status"]
+            if worker_status[i]["status"] is not None:
+                return 1
+        return 0
+
+    def start_worker_list(self, rank_ids):
+        """
+        Start worker processor by rank list.
+
+        Args:
+            rank_ids: worker process's local rank list, which is also device_id.
+        """
+        if not isinstance(rank_ids, list):
+            raise TypeError(f"The type of 'rank_ids' must be a list, but got:{rank_ids}")
+        for idx in rank_ids:
+            self._start_single_worker(idx)
+        worker_status = self.monitor_rank_status(rank_ids)
+        for i in rank_ids:
+            if worker_status[i]["status"] is not None:
                 return 1
         return 0
 
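The new start_worker_list method gives the taskd plugin a partial-restart path alongside START_ALL_WORKER: an invocation such as manager.start_worker_list([2, 5]) (illustrative) respawns only local ranks 2 and 5, re-monitors exactly those ranks, and returns 0 while every restarted worker is still alive (poll status None) and 1 otherwise.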
@@ -18,6 +18,8 @@ import json
 import socket
 import ipaddress
 import mindspore.log as logger
+from mindspore.runtime.thread_bind_core import _get_physical_device_id, _get_cpu_available, \
+    _auto_generate_strategy, _equal_distribution_strategy
 
 CURRENT_IP = None
 
@@ -45,19 +47,19 @@ def _generate_cmd_args_list(cmd, cmd_args):
     return [cmd] + cmd_args
 
 
-def _generate_cmd_args_list_with_core(cmd, cmd_args,
+def _generate_cmd_args_list_with_core(cmd, cmd_args, affinity_cpu_str):
     """
     Generates arguments list for 'Popen'. It consists of a binary file name and subsequential arguments.
     """
     # Bind cpu cores to this process.
-    taskset_args = ['taskset'] + ['-c'] + [
+    taskset_args = ['taskset'] + ['-c'] + [affinity_cpu_str]
     final_cmd = []
     if cmd not in ['python', 'pytest', 'python3']:
         # If user don't set binary file name, defaulty use 'python' to launch the job.
         final_cmd = taskset_args + ['python'] + [cmd] + cmd_args
     else:
         final_cmd = taskset_args + [cmd] + cmd_args
-    logger.
+    logger.warning(f"Launch process with command: {' '.join(final_cmd)}")
     return final_cmd
 
 
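With the new signature the caller hands over a preformatted CPU set and the helper merely splices it into a taskset prefix. A sketch of the resulting argument list (script name, arguments and ranges illustrative):

    cmd = _generate_cmd_args_list_with_core("train.py", ["--epochs", "10"], "0-23,48-71")
    # -> ['taskset', '-c', '0-23,48-71', 'python', 'train.py', '--epochs', '10']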
@@ -83,8 +85,8 @@ def _get_local_ip(ip_address):
         CURRENT_IP = s.getsockname()[0]
         s.close()
     except Exception as e:
-        raise RuntimeError(
-            "
+        raise RuntimeError("Get local ip has failed. Please verify that the accessible address has been "
+                           "specified in the '--master_address' parameter") from e
     return CURRENT_IP
 
 
@@ -124,8 +126,8 @@ def _convert_addr_to_ip(master_addr):
         logger.info(f"Convert input host name:{master_addr} to ip address:{ip_address}.")
         return ip_address
     except socket.gaierror as e:
-        raise RuntimeError(
-            "
+        raise RuntimeError("DNS resolution has failed. Please verify that the correct hostname has been "
+                           "specified in the '--master_address' parameter") from e
 
 
 def _send_scale_num(url, scale_num):
@@ -134,3 +136,89 @@ def _send_scale_num(url, scale_num):
 
     """
     return ""
+
+
+def _parse_global_device_to_cpu_map(local_rank_id, physical_device_id, device_to_cpu_map):
+    """
+    Parse the global device_to_cpu_map and return a cpu list for assigned local_rank_id.
+
+    """
+    input_device_id = int(list(device_to_cpu_map.keys())[local_rank_id].replace("device", ""))
+    if physical_device_id != input_device_id:
+        return ""
+    affinity_cpu_list = list(device_to_cpu_map.values())[local_rank_id]
+    affinity_cpu_str = ",".join(affinity_cpu_list)
+    return affinity_cpu_str
+
+
+def _generate_auto_bind_core_strategy(local_worker_num):
+    """
+    Get device to core range assigned for the all processes.
+
+    """
+    simulation_level = os.getenv("MS_SIMULATION_LEVEL", "").strip()
+
+    try:
+        available_cpus = _get_cpu_available()
+    except RuntimeError as e:
+        logger.warning(f"Failed to acquire available cpu info, error: {e} Will not launch process with taskset.")
+        return {}
+
+    if not simulation_level:
+        device_to_cpu_map = _auto_generate_strategy(local_worker_num, available_cpus)
+    else:
+        device_to_cpu_map = _equal_distribution_strategy(local_worker_num, available_cpus)
+
+    return device_to_cpu_map
+
+
+def ranges_to_str(num_list):
+    """
+    Convert a num list to a range string.
+
+    """
+    ranges = []
+    start = num_list[0]
+    for i in range(1, len(num_list)):
+        if num_list[i] != num_list[i-1] + 1:
+            ranges.append((start, num_list[i-1]))
+            start = num_list[i]
+    ranges.append((start, num_list[-1]))
+
+    parts = []
+    for start, end in ranges:
+        if start == end:
+            parts.append(str(start))
+        else:
+            parts.append(f"{start}-{end}")
+    return ",".join(parts)
+
+
+def _generate_bind_core_strategy(local_rank_id, device_to_cpu_map, arg_bind_core):
+    """
+    Get device to core range assigned for the all processes.
+
+    """
+    affinity_cpu_str = ""
+    cpu_list_for_device = []
+    simulation_level = os.getenv("MS_SIMULATION_LEVEL", "").strip()
+
+    try:
+        physical_device_id = _get_physical_device_id(local_rank_id, simulation_level)
+    except RuntimeError as e:
+        logger.warning(f"Failed to acquire device id, error: {e} Will not launch process with taskset.")
+        return None
+
+    if isinstance(arg_bind_core, dict):
+        affinity_cpu_str = _parse_global_device_to_cpu_map(local_rank_id, physical_device_id, arg_bind_core)
+        if not affinity_cpu_str:
+            logger.warning(f"Failed to find physical_device_id[{physical_device_id}] for "
+                           f"process[{local_rank_id}]. Will not launch process with taskset.")
+            return None
+    elif arg_bind_core is True:
+        cpu_list_for_device = device_to_cpu_map.get(physical_device_id, [])
+        if not cpu_list_for_device:
+            return None
+        os.environ["MSRUN_CPU_LIST"] = str(cpu_list_for_device)
+        affinity_cpu_str = ranges_to_str(cpu_list_for_device)
+    return affinity_cpu_str
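ranges_to_str collapses consecutive CPU ids into taskset-style spans, which keeps the generated -c argument compact. A quick check of the behavior (input list illustrative):

    # Contiguous runs collapse to "start-end"; isolated ids stay as-is.
    assert ranges_to_str([0, 1, 2, 3, 8, 9, 11]) == "0-3,8-9,11"

Note the function assumes a non-empty, ascending num_list; an empty list would raise IndexError on num_list[0].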
@@ -14,9 +14,47 @@
 # ============================================================================
 """Entrypoint of ms_run"""
 import ast
-
+import re
+import json
+from argparse import REMAINDER, ArgumentParser, ArgumentTypeError
 from .process_entity import _ProcessManager
 
+
+def parse_and_validate_bind_core(value):
+    """
+    Parse input argument of --bind_core.
+
+    """
+    if value.lower() == "true":
+        return True
+    if value.lower() == "false":
+        return False
+
+    try:
+        value_dict = json.loads(value)
+    except json.JSONDecodeError as e:
+        raise ArgumentTypeError("Failed to parse JSON into a dictionary") from e
+
+    if isinstance(value_dict, dict):
+        range_pattern = re.compile(r'^\d+-\d+$')
+        for device_id, affinity_cpu_list in value_dict.items():
+            if not re.fullmatch(r"device\d+", device_id):
+                raise ArgumentTypeError(f"Key '{device_id}' must be in format 'deviceX' (X ≥ 0).")
+            if not isinstance(affinity_cpu_list, list):
+                raise ArgumentTypeError(f"Value for '{device_id}':{affinity_cpu_list} should be a list, "
+                                        f"but got {type(affinity_cpu_list)}.")
+
+            for cpu_range in affinity_cpu_list:
+                if not isinstance(cpu_range, str):
+                    raise ArgumentTypeError(f"CPU range '{cpu_range}' in '{affinity_cpu_list}' should be a string.")
+                if not range_pattern.match(cpu_range):
+                    raise ArgumentTypeError(f"CPU range '{cpu_range}' in '{affinity_cpu_list}' should be "
+                                            "in format 'cpuidX-cpuidY'.")
+        return value_dict
+
+    raise ArgumentTypeError(f"Type of {value} should be bool or dict, but got {type(value)}.")
+
+
 def get_args():
     """
     Parses and retrieves command-line arguments.
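parse_and_validate_bind_core accepts either a boolean literal or a JSON object keyed by deviceX whose values are lists of CPU-range strings. Two examples of accepted inputs (values illustrative):

    parse_and_validate_bind_core("True")
    # -> True
    parse_and_validate_bind_core('{"device0": ["0-7", "16-23"], "device1": ["8-15"]}')
    # -> {'device0': ['0-7', '16-23'], 'device1': ['8-15']}

Anything else, such as a JSON list or a malformed range like "7-", is rejected with ArgumentTypeError.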
@@ -77,23 +115,26 @@ def get_args():
     parser.add_argument(
         "--bind_core",
         default=False,
-        type=
-
-
+        type=parse_and_validate_bind_core,
+        help="specifies whether msrun should bind CPU cores to spawned processes. "
+             "If set to True, msrun will bind core based on the environment automatically, "
+             "and if passed a dict, msrun will bind core based on this dict information."
     )
     parser.add_argument(
         "--sim_level",
         default=-1,
         type=int,
         choices=[0, 1, 2, 3],
-        help="specifies simulation level.
-             "
+        help="specifies simulation level. This argument activates dryrun mode, functioning "
+             "equivalently to environment variable 'MS_SIMULATION_LEVEL' while having higher priority."
     )
     parser.add_argument(
         "--sim_rank_id",
         default=-1,
         type=int,
-        help="specifies simulation process's rank id.
+        help="specifies simulation process's rank id. When this argument is set, only one process "
+             "is spawned on dryrun mode, functioning equivalently to environment variable 'RANK_ID' "
+             "while having higher priority."
     )
     parser.add_argument(
         "--rank_table_file",
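Taken together, the new help strings document the dryrun workflow: a single simulated rank can be launched with something like msrun --worker_num=8 --local_worker_num=8 --sim_level=0 --sim_rank_id=3 train.py (an illustrative invocation), which behaves like exporting MS_SIMULATION_LEVEL=0 and RANK_ID=3 except that the CLI arguments take priority over the environment variables.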
@@ -16,8 +16,15 @@
 """
 Parallel function operator
 """
+from __future__ import absolute_import
 
-from
+from . import (
+    reshard_func
+)
+
+from .reshard_func import (
+    reshard
+)
 
 __all__ = []
 __all__.extend(reshard_func.__all__)
@@ -13,7 +13,6 @@
 # limitations under the License.
 # ============================================================================
 """Defines parameter operators with functional form."""
-import mindspore as ms
 from mindspore import context, ops
 from mindspore import log as logger
 from mindspore.ops import operations as P
@@ -43,11 +42,12 @@ def reshard(tensor, layout):
         can check :class:`mindspore.parallel.Layout` for reference.
 
     Note:
-
-
-
-
-
+        In the Graph mode, this function can set the sharding propagation strategy of a tensor.
+        For those tensor do not manually be set, their strategies are decided by the sharding
+        strategy propagation algorithm automatically.
+
+        .. warning::
+            The method is currently not supported in PyNative mode.
 
     Args:
         tensor (Tensor): The tensor to be set the sharding strategy.
@@ -59,8 +59,8 @@ def reshard(tensor, layout):
         Tensor. The mathematically equivalent of the input tensor.
 
     Raises:
-        TypeError:
-        TypeError:
+        TypeError: If the type of input param `tensor` is not mindspore.Tensor.
+        TypeError: If the type of input param `layout` is not mindspore.parallel.Layout.
 
     Supported Platforms:
         ``Ascend``
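A short usage sketch in line with the clarified docstring (the 2x4 device matrix, axis names and jit wrapper are illustrative; it presumes an initialized communication group on Ascend and graph-mode execution):

    import mindspore as ms
    from mindspore.parallel import Layout
    from mindspore.parallel.function import reshard

    # Describe a 2x4 device matrix with named axes, then request that the
    # tensor's rows be sharded along "dp" and its columns along "mp".
    layout = Layout((2, 4), ("dp", "mp"))

    @ms.jit
    def shard_fn(x):
        return reshard(x, layout("dp", "mp"))

Passing anything other than a Tensor as x, or anything other than a Layout item as the second argument, raises the TypeErrors documented above.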
@@ -220,11 +220,11 @@ def _redistribute(tensor, dst_dtensor_info):
         if not comm_tensor_data_func._current_rank_has_data:
             new_tensor_shape = tuple([tensor_data.shape[i] // tensor._dtensor_info.sharding_strategy[i]
                                       for i in range(len(tensor.shape))])
-            tensor_data =
+            tensor_data = ops.zeros(new_tensor_shape, tensor.dtype)
+            _ = comm_tensor_data_func.comm_data(tensor_data)
         else:
-
+            _ = comm_tensor_data_func.comm_data(tensor_data)
         all_reduce_data = True
-        ms.communication.comm_func.barrier()
     if src_layout_info['device_matrix'] == dst_layout_info['device_matrix'] and src_layout_info['tensor_map'] == \
             dst_layout_info['tensor_map']:
         return tensor_data
@@ -236,7 +236,7 @@ def _redistribute(tensor, dst_dtensor_info):
     global REDIST_CELL_CACHE
     redist_cache_key = (f"{src_layout_info['device_matrix']}, {src_layout_info['tensor_map']} -> "
                         f"{dst_layout_info['device_matrix']}, {dst_layout_info['tensor_map']}")
-    if redist_cache_key in REDIST_CELL_CACHE
+    if redist_cache_key in REDIST_CELL_CACHE:
         logger.debug(f"redist_cache_key is {redist_cache_key}, match cache")
         redist_func = REDIST_CELL_CACHE[redist_cache_key]
     else:
else:
|
|
@@ -17,8 +17,21 @@ Interfaces for parallel-related functionality
|
|
|
17
17
|
"""
|
|
18
18
|
from __future__ import absolute_import
|
|
19
19
|
|
|
20
|
-
from
|
|
21
|
-
|
|
20
|
+
from . import (
|
|
21
|
+
parallel_grad_reducer,
|
|
22
|
+
parallel_cell_wrapper
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
from .parallel_grad_reducer import (
|
|
26
|
+
PipelineGradReducer
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
from .parallel_cell_wrapper import (
|
|
30
|
+
PipelineCell,
|
|
31
|
+
Pipeline,
|
|
32
|
+
MicroBatchInterleaved,
|
|
33
|
+
GradAccumulation
|
|
34
|
+
)
|
|
22
35
|
|
|
23
36
|
__all__ = []
|
|
24
37
|
__all__.extend(parallel_grad_reducer.__all__)
|